{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9983857949959645, "eval_steps": 500, "global_step": 2476, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004035512510088781, "grad_norm": 14.673717498779297, "learning_rate": 6.623414492114189e-07, "logits/chosen": -6.744257926940918, "logits/rejected": -8.48335075378418, "logps/chosen": -225.5757598876953, "logps/rejected": -121.2612533569336, "loss": 4.0127, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.24817952513694763, "rewards/margins": 0.6194116473197937, "rewards/rejected": -0.3712320923805237, "step": 5 }, { "epoch": 0.008071025020177562, "grad_norm": 14.2029390335083, "learning_rate": 1.4902682607256923e-06, "logits/chosen": -6.97362756729126, "logits/rejected": -8.407316207885742, "logps/chosen": -200.06394958496094, "logps/rejected": -115.20751953125, "loss": 3.8724, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.3888495862483978, "rewards/margins": -0.8909395337104797, "rewards/rejected": 0.5020898580551147, "step": 10 }, { "epoch": 0.012106537530266344, "grad_norm": 16.932567596435547, "learning_rate": 2.318195072239966e-06, "logits/chosen": -7.110917568206787, "logits/rejected": -8.146717071533203, "logps/chosen": -188.72610473632812, "logps/rejected": -127.42155456542969, "loss": 4.1968, "rewards/accuracies": 0.5, "rewards/chosen": -0.45380306243896484, "rewards/margins": 0.7271897196769714, "rewards/rejected": -1.1809927225112915, "step": 15 }, { "epoch": 0.016142050040355124, "grad_norm": 41.62274932861328, "learning_rate": 3.1461218837542393e-06, "logits/chosen": -6.9441728591918945, "logits/rejected": -8.890535354614258, "logps/chosen": -209.0159149169922, "logps/rejected": -134.5095977783203, "loss": 4.4986, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.34606871008872986, "rewards/margins": -0.4484892785549164, "rewards/rejected": 0.7945581078529358, "step": 20 }, { "epoch": 0.020177562550443905, "grad_norm": 12.708757400512695, "learning_rate": 3.974048695268513e-06, "logits/chosen": -6.77667760848999, "logits/rejected": -8.425729751586914, "logps/chosen": -195.30978393554688, "logps/rejected": -130.95492553710938, "loss": 4.0457, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.19692716002464294, "rewards/margins": 0.040506649762392044, "rewards/rejected": -0.2374337911605835, "step": 25 }, { "epoch": 0.024213075060532687, "grad_norm": 24.609724044799805, "learning_rate": 4.801975506782787e-06, "logits/chosen": -6.328831195831299, "logits/rejected": -8.906606674194336, "logps/chosen": -170.4086456298828, "logps/rejected": -141.0073699951172, "loss": 2.4614, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 3.0435898303985596, "rewards/margins": 3.676306962966919, "rewards/rejected": -0.6327171325683594, "step": 30 }, { "epoch": 0.02824858757062147, "grad_norm": 19.817798614501953, "learning_rate": 5.629902318297061e-06, "logits/chosen": -6.3670573234558105, "logits/rejected": -9.06364631652832, "logps/chosen": -192.33477783203125, "logps/rejected": -133.8369903564453, "loss": 2.7064, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.956978976726532, "rewards/margins": 2.165383815765381, "rewards/rejected": -1.208404779434204, "step": 35 }, { "epoch": 0.03228410008071025, "grad_norm": 22.696659088134766, "learning_rate": 5.795458928928562e-06, "logits/chosen": -5.0575947761535645, "logits/rejected": -9.012417793273926, "logps/chosen": -201.5998077392578, "logps/rejected": -140.26181030273438, "loss": 2.9636, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9657144546508789, "rewards/margins": 1.115404725074768, "rewards/rejected": -2.0811190605163574, "step": 40 }, { "epoch": 0.03631961259079903, "grad_norm": 14.558158874511719, "learning_rate": 5.795342126567494e-06, "logits/chosen": -6.1244096755981445, "logits/rejected": -9.38933277130127, "logps/chosen": -167.87869262695312, "logps/rejected": -152.34912109375, "loss": 2.1462, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 2.2052810192108154, "rewards/margins": 3.8512001037597656, "rewards/rejected": -1.6459192037582397, "step": 45 }, { "epoch": 0.04035512510088781, "grad_norm": 7.640690803527832, "learning_rate": 5.795135481362378e-06, "logits/chosen": -5.798432350158691, "logits/rejected": -9.484591484069824, "logps/chosen": -170.1936798095703, "logps/rejected": -157.44212341308594, "loss": 1.9988, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 2.815709352493286, "rewards/margins": 6.181019306182861, "rewards/rejected": -3.3653101921081543, "step": 50 }, { "epoch": 0.04439063761097659, "grad_norm": 5.130834102630615, "learning_rate": 5.794839001856335e-06, "logits/chosen": -6.12804651260376, "logits/rejected": -9.552550315856934, "logps/chosen": -156.65438842773438, "logps/rejected": -185.3906707763672, "loss": 0.7759, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 2.6382603645324707, "rewards/margins": 9.080062866210938, "rewards/rejected": -6.441802978515625, "step": 55 }, { "epoch": 0.048426150121065374, "grad_norm": 16.63554573059082, "learning_rate": 5.794452700306419e-06, "logits/chosen": -6.135761737823486, "logits/rejected": -10.36125659942627, "logps/chosen": -170.6251983642578, "logps/rejected": -221.56149291992188, "loss": 0.1214, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 4.845996856689453, "rewards/margins": 14.91584300994873, "rewards/rejected": -10.069847106933594, "step": 60 }, { "epoch": 0.052461662631154156, "grad_norm": 11.855253219604492, "learning_rate": 5.793976592683102e-06, "logits/chosen": -7.123507022857666, "logits/rejected": -10.608318328857422, "logps/chosen": -174.98123168945312, "logps/rejected": -265.38458251953125, "loss": 0.7598, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 3.166455030441284, "rewards/margins": 15.321142196655273, "rewards/rejected": -12.154687881469727, "step": 65 }, { "epoch": 0.05649717514124294, "grad_norm": 0.005996761843562126, "learning_rate": 5.793410698669617e-06, "logits/chosen": -7.440362453460693, "logits/rejected": -10.746813774108887, "logps/chosen": -183.48101806640625, "logps/rejected": -345.387451171875, "loss": 0.0469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 3.0181121826171875, "rewards/margins": 23.404953002929688, "rewards/rejected": -20.386837005615234, "step": 70 }, { "epoch": 0.06053268765133172, "grad_norm": 0.10333424806594849, "learning_rate": 5.792755041661148e-06, "logits/chosen": -7.130080223083496, "logits/rejected": -11.454127311706543, "logps/chosen": -171.0690155029297, "logps/rejected": -354.4461975097656, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.296330213546753, "rewards/margins": 24.28030014038086, "rewards/rejected": -22.983970642089844, "step": 75 }, { "epoch": 0.0645682001614205, "grad_norm": 0.7864883542060852, "learning_rate": 5.792009648763854e-06, "logits/chosen": -7.139286994934082, "logits/rejected": -11.191341400146484, "logps/chosen": -181.26065063476562, "logps/rejected": -358.15814208984375, "loss": 0.0897, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.704894781112671, "rewards/margins": 24.785791397094727, "rewards/rejected": -23.08089828491211, "step": 80 }, { "epoch": 0.06860371267150928, "grad_norm": 0.39277294278144836, "learning_rate": 5.79117455079376e-06, "logits/chosen": -6.444334506988525, "logits/rejected": -10.354166030883789, "logps/chosen": -210.13134765625, "logps/rejected": -407.18011474609375, "loss": 0.101, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7708677649497986, "rewards/margins": 28.455745697021484, "rewards/rejected": -27.68488121032715, "step": 85 }, { "epoch": 0.07263922518159806, "grad_norm": 13.402353286743164, "learning_rate": 5.790249782275472e-06, "logits/chosen": -7.0808515548706055, "logits/rejected": -10.282732009887695, "logps/chosen": -189.52676391601562, "logps/rejected": -464.342041015625, "loss": 0.0801, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.2694244384765625, "rewards/margins": 34.41953659057617, "rewards/rejected": -32.15011215209961, "step": 90 }, { "epoch": 0.07667473769168684, "grad_norm": 1.9315897588967346e-05, "learning_rate": 5.789235381440756e-06, "logits/chosen": -6.944557189941406, "logits/rejected": -10.625551223754883, "logps/chosen": -174.44134521484375, "logps/rejected": -431.695068359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 0.532370924949646, "rewards/margins": 31.467687606811523, "rewards/rejected": -30.935317993164062, "step": 95 }, { "epoch": 0.08071025020177562, "grad_norm": 7.232487678527832, "learning_rate": 5.788131390226956e-06, "logits/chosen": -6.5953521728515625, "logits/rejected": -9.727926254272461, "logps/chosen": -178.81082153320312, "logps/rejected": -492.8739318847656, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9396479725837708, "rewards/margins": 34.660057067871094, "rewards/rejected": -33.72040557861328, "step": 100 }, { "epoch": 0.0847457627118644, "grad_norm": 0.03762379288673401, "learning_rate": 5.786937854275262e-06, "logits/chosen": -6.660658359527588, "logits/rejected": -9.910313606262207, "logps/chosen": -195.0374298095703, "logps/rejected": -460.4256286621094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.8967695236206055, "rewards/margins": 34.99456787109375, "rewards/rejected": -33.09779739379883, "step": 105 }, { "epoch": 0.08878127522195318, "grad_norm": 6.987350502640766e-07, "learning_rate": 5.785654822928817e-06, "logits/chosen": -6.238831520080566, "logits/rejected": -10.238045692443848, "logps/chosen": -226.43948364257812, "logps/rejected": -505.93011474609375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.7205227613449097, "rewards/margins": 37.881141662597656, "rewards/rejected": -37.160621643066406, "step": 110 }, { "epoch": 0.09281678773204197, "grad_norm": 1.2922990322113037, "learning_rate": 5.7842823492306836e-06, "logits/chosen": -6.823439121246338, "logits/rejected": -9.928242683410645, "logps/chosen": -181.66818237304688, "logps/rejected": -436.5545959472656, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.8081116676330566, "rewards/margins": 34.38478469848633, "rewards/rejected": -31.576675415039062, "step": 115 }, { "epoch": 0.09685230024213075, "grad_norm": 0.04803316295146942, "learning_rate": 5.782820489921651e-06, "logits/chosen": -6.807633399963379, "logits/rejected": -9.851001739501953, "logps/chosen": -180.86163330078125, "logps/rejected": -490.45355224609375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.7332231998443604, "rewards/margins": 38.43529510498047, "rewards/rejected": -35.70207214355469, "step": 120 }, { "epoch": 0.10088781275221953, "grad_norm": 0.6190744638442993, "learning_rate": 5.781269305437881e-06, "logits/chosen": -6.340489387512207, "logits/rejected": -9.130331039428711, "logps/chosen": -187.36895751953125, "logps/rejected": -459.86553955078125, "loss": 0.8067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5271873474121094, "rewards/margins": 33.724124908447266, "rewards/rejected": -33.196937561035156, "step": 125 }, { "epoch": 0.10492332526230831, "grad_norm": 3.028175115105114e-07, "learning_rate": 5.779628859908423e-06, "logits/chosen": -6.4552321434021, "logits/rejected": -8.847993850708008, "logps/chosen": -193.5435028076172, "logps/rejected": -464.19317626953125, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.6207548379898071, "rewards/margins": 35.79097366333008, "rewards/rejected": -35.17021942138672, "step": 130 }, { "epoch": 0.1089588377723971, "grad_norm": 0.000125239253975451, "learning_rate": 5.777899221152549e-06, "logits/chosen": -7.008848667144775, "logits/rejected": -9.154699325561523, "logps/chosen": -179.1580352783203, "logps/rejected": -480.8003845214844, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.13914290070533752, "rewards/margins": 36.69832229614258, "rewards/rejected": -36.83746337890625, "step": 135 }, { "epoch": 0.11299435028248588, "grad_norm": 0.00016098514606710523, "learning_rate": 5.776080460676958e-06, "logits/chosen": -6.9056830406188965, "logits/rejected": -8.463708877563477, "logps/chosen": -205.28890991210938, "logps/rejected": -547.226318359375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.24884264171123505, "rewards/margins": 40.85606384277344, "rewards/rejected": -41.10490798950195, "step": 140 }, { "epoch": 0.11702986279257466, "grad_norm": 0.0029682181775569916, "learning_rate": 5.774172653672819e-06, "logits/chosen": -6.84311056137085, "logits/rejected": -8.932482719421387, "logps/chosen": -169.85238647460938, "logps/rejected": -471.486572265625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.32164981961250305, "rewards/margins": 36.235748291015625, "rewards/rejected": -35.914100646972656, "step": 145 }, { "epoch": 0.12106537530266344, "grad_norm": 2.438822699346588e-09, "learning_rate": 5.772175879012661e-06, "logits/chosen": -6.949581146240234, "logits/rejected": -8.59605884552002, "logps/chosen": -190.7699737548828, "logps/rejected": -651.8414306640625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.2779724597930908, "rewards/margins": 51.624244689941406, "rewards/rejected": -51.9022216796875, "step": 150 }, { "epoch": 0.12510088781275222, "grad_norm": 0.000726940343156457, "learning_rate": 5.77009021924711e-06, "logits/chosen": -6.392744541168213, "logits/rejected": -7.90814208984375, "logps/chosen": -187.94715881347656, "logps/rejected": -569.8294677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.1985471248626709, "rewards/margins": 44.472900390625, "rewards/rejected": -44.67145538330078, "step": 155 }, { "epoch": 0.129136400322841, "grad_norm": 6.066032801754773e-05, "learning_rate": 5.767915760601482e-06, "logits/chosen": -6.010158061981201, "logits/rejected": -6.9548468589782715, "logps/chosen": -190.71681213378906, "logps/rejected": -575.8131103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0250240564346313, "rewards/margins": 45.72368621826172, "rewards/rejected": -44.69865798950195, "step": 160 }, { "epoch": 0.13317191283292978, "grad_norm": 1.9295004676678218e-05, "learning_rate": 5.765652592972214e-06, "logits/chosen": -5.802807807922363, "logits/rejected": -6.968630790710449, "logps/chosen": -197.30953979492188, "logps/rejected": -547.5148315429688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.1490107774734497, "rewards/margins": 42.6492805480957, "rewards/rejected": -42.50027084350586, "step": 165 }, { "epoch": 0.13720742534301855, "grad_norm": 0.0010933686280623078, "learning_rate": 5.763300809923146e-06, "logits/chosen": -5.615510940551758, "logits/rejected": -6.936361789703369, "logps/chosen": -218.2812042236328, "logps/rejected": -580.9514770507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5796593427658081, "rewards/margins": 44.76678466796875, "rewards/rejected": -45.34644317626953, "step": 170 }, { "epoch": 0.14124293785310735, "grad_norm": 0.036393456161022186, "learning_rate": 5.760860508681658e-06, "logits/chosen": -5.778494358062744, "logits/rejected": -7.198030948638916, "logps/chosen": -174.28173828125, "logps/rejected": -542.234375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.23467382788658142, "rewards/margins": 40.992122650146484, "rewards/rejected": -41.2267951965332, "step": 175 }, { "epoch": 0.14527845036319612, "grad_norm": 0.0013515922473743558, "learning_rate": 5.758331790134647e-06, "logits/chosen": -5.442145824432373, "logits/rejected": -7.1225152015686035, "logps/chosen": -179.96202087402344, "logps/rejected": -558.2398071289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6389068365097046, "rewards/margins": 44.37887191772461, "rewards/rejected": -42.739967346191406, "step": 180 }, { "epoch": 0.1493139628732849, "grad_norm": 0.0001895563764264807, "learning_rate": 5.755714758824358e-06, "logits/chosen": -5.537622928619385, "logits/rejected": -7.023104667663574, "logps/chosen": -165.4660186767578, "logps/rejected": -578.6268310546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4900878965854645, "rewards/margins": 45.44189453125, "rewards/rejected": -45.93198013305664, "step": 185 }, { "epoch": 0.15334947538337368, "grad_norm": 7.28354549407959, "learning_rate": 5.753009522944058e-06, "logits/chosen": -5.477646827697754, "logits/rejected": -6.9917120933532715, "logps/chosen": -184.61709594726562, "logps/rejected": -578.7047729492188, "loss": 0.0179, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.895775556564331, "rewards/margins": 43.74489212036133, "rewards/rejected": -45.64066696166992, "step": 190 }, { "epoch": 0.15738498789346247, "grad_norm": 0.0025164347607642412, "learning_rate": 5.75021619433357e-06, "logits/chosen": -5.827167987823486, "logits/rejected": -6.866555213928223, "logps/chosen": -205.6201171875, "logps/rejected": -569.4774169921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.0656617134809494, "rewards/margins": 44.16213607788086, "rewards/rejected": -44.2277946472168, "step": 195 }, { "epoch": 0.16142050040355124, "grad_norm": 8.36625840747729e-05, "learning_rate": 5.747334888474641e-06, "logits/chosen": -5.662519931793213, "logits/rejected": -7.328948020935059, "logps/chosen": -217.7265625, "logps/rejected": -583.5174560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.47370538115501404, "rewards/margins": 44.35625457763672, "rewards/rejected": -43.88254928588867, "step": 200 }, { "epoch": 0.16545601291364004, "grad_norm": 6.310216917881917e-07, "learning_rate": 5.744365724486177e-06, "logits/chosen": -5.578995704650879, "logits/rejected": -7.037115573883057, "logps/chosen": -224.5011444091797, "logps/rejected": -549.6485595703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 1.0947514772415161, "rewards/margins": 42.30724334716797, "rewards/rejected": -41.212486267089844, "step": 205 }, { "epoch": 0.1694915254237288, "grad_norm": 0.0029706968925893307, "learning_rate": 5.741308825119308e-06, "logits/chosen": -5.862391948699951, "logits/rejected": -7.418662071228027, "logps/chosen": -179.6857147216797, "logps/rejected": -551.5291748046875, "loss": 0.0446, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3072229027748108, "rewards/margins": 43.57607650756836, "rewards/rejected": -43.883296966552734, "step": 210 }, { "epoch": 0.1735270379338176, "grad_norm": 0.002366419183090329, "learning_rate": 5.738164316752323e-06, "logits/chosen": -6.236123085021973, "logits/rejected": -7.6675238609313965, "logps/chosen": -188.9443817138672, "logps/rejected": -549.8839111328125, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2124940156936646, "rewards/margins": 41.6048469543457, "rewards/rejected": -42.81734085083008, "step": 215 }, { "epoch": 0.17756255044390637, "grad_norm": 0.010826943442225456, "learning_rate": 5.734932329385438e-06, "logits/chosen": -5.464410781860352, "logits/rejected": -6.6296257972717285, "logps/chosen": -229.86904907226562, "logps/rejected": -617.7103271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.482263445854187, "rewards/margins": 49.28208541870117, "rewards/rejected": -47.799827575683594, "step": 220 }, { "epoch": 0.18159806295399517, "grad_norm": 1.2831571893912042e-06, "learning_rate": 5.731612996635428e-06, "logits/chosen": -5.414681434631348, "logits/rejected": -6.363954544067383, "logps/chosen": -190.43276977539062, "logps/rejected": -582.6544799804688, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.2294385433197021, "rewards/margins": 46.134864807128906, "rewards/rejected": -44.905426025390625, "step": 225 }, { "epoch": 0.18563357546408393, "grad_norm": 3.25995497405529e-05, "learning_rate": 5.728206455730096e-06, "logits/chosen": -5.482069969177246, "logits/rejected": -6.214612007141113, "logps/chosen": -210.40194702148438, "logps/rejected": -621.2275390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.705507755279541, "rewards/margins": 49.22768020629883, "rewards/rejected": -48.52217483520508, "step": 230 }, { "epoch": 0.18966908797417273, "grad_norm": 0.00034202722599729896, "learning_rate": 5.724712847502605e-06, "logits/chosen": -5.495121479034424, "logits/rejected": -6.153656005859375, "logps/chosen": -208.3478546142578, "logps/rejected": -554.0267333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34371915459632874, "rewards/margins": 42.33998107910156, "rewards/rejected": -41.99626541137695, "step": 235 }, { "epoch": 0.1937046004842615, "grad_norm": 0.17204973101615906, "learning_rate": 5.721132316385653e-06, "logits/chosen": -5.564776420593262, "logits/rejected": -6.483262538909912, "logps/chosen": -167.18954467773438, "logps/rejected": -582.14111328125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.559180736541748, "rewards/margins": 43.47228240966797, "rewards/rejected": -45.03145980834961, "step": 240 }, { "epoch": 0.1977401129943503, "grad_norm": 5.5700301260230844e-08, "learning_rate": 5.717465010405504e-06, "logits/chosen": -5.436084747314453, "logits/rejected": -6.044018745422363, "logps/chosen": -207.0467071533203, "logps/rejected": -532.625732421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6226285696029663, "rewards/margins": 43.428916931152344, "rewards/rejected": -41.8062858581543, "step": 245 }, { "epoch": 0.20177562550443906, "grad_norm": 1.8669492192202597e-06, "learning_rate": 5.713711081175866e-06, "logits/chosen": -5.494500160217285, "logits/rejected": -5.98954963684082, "logps/chosen": -204.185791015625, "logps/rejected": -642.2548217773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0250794887542725, "rewards/margins": 50.868370056152344, "rewards/rejected": -51.893455505371094, "step": 250 }, { "epoch": 0.20581113801452786, "grad_norm": 7.218413884402253e-07, "learning_rate": 5.709870683891625e-06, "logits/chosen": -4.952367305755615, "logits/rejected": -5.937825679779053, "logps/chosen": -255.01123046875, "logps/rejected": -659.2103271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.756742477416992, "rewards/margins": 48.82769012451172, "rewards/rejected": -51.584434509277344, "step": 255 }, { "epoch": 0.20984665052461662, "grad_norm": 4.357013494882267e-06, "learning_rate": 5.705943977322427e-06, "logits/chosen": -5.793478965759277, "logits/rejected": -6.125035285949707, "logps/chosen": -190.6591796875, "logps/rejected": -603.3314819335938, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": -2.9713456630706787, "rewards/margins": 44.559974670410156, "rewards/rejected": -47.53131866455078, "step": 260 }, { "epoch": 0.21388216303470542, "grad_norm": 39.15335464477539, "learning_rate": 5.701931123806116e-06, "logits/chosen": -4.789765357971191, "logits/rejected": -5.534176826477051, "logps/chosen": -226.07907104492188, "logps/rejected": -573.4830932617188, "loss": 0.2565, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.387948513031006, "rewards/margins": 41.84444046020508, "rewards/rejected": -44.23238754272461, "step": 265 }, { "epoch": 0.2179176755447942, "grad_norm": 1.974670885829255e-05, "learning_rate": 5.697832289242021e-06, "logits/chosen": -5.39833927154541, "logits/rejected": -5.595280647277832, "logps/chosen": -198.2093963623047, "logps/rejected": -697.5348510742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1948919296264648, "rewards/margins": 55.79084396362305, "rewards/rejected": -56.985740661621094, "step": 270 }, { "epoch": 0.22195318805488298, "grad_norm": 4.396254951188894e-07, "learning_rate": 5.693647643084099e-06, "logits/chosen": -5.110897064208984, "logits/rejected": -5.4116010665893555, "logps/chosen": -207.0631103515625, "logps/rejected": -625.4093627929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.591120183467865, "rewards/margins": 51.06156539916992, "rewards/rejected": -50.4704475402832, "step": 275 }, { "epoch": 0.22598870056497175, "grad_norm": 0.005072440020740032, "learning_rate": 5.689377358333927e-06, "logits/chosen": -5.161062717437744, "logits/rejected": -5.267472743988037, "logps/chosen": -204.6974639892578, "logps/rejected": -566.11572265625, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.25763851404190063, "rewards/margins": 43.52554702758789, "rewards/rejected": -43.267906188964844, "step": 280 }, { "epoch": 0.23002421307506055, "grad_norm": 2.787390656067146e-07, "learning_rate": 5.685021611533554e-06, "logits/chosen": -5.653125286102295, "logits/rejected": -5.518962383270264, "logps/chosen": -220.1473846435547, "logps/rejected": -725.2862548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1357317417860031, "rewards/margins": 58.828521728515625, "rewards/rejected": -58.692787170410156, "step": 285 }, { "epoch": 0.23405972558514931, "grad_norm": 5.707844863422906e-08, "learning_rate": 5.680580582758199e-06, "logits/chosen": -5.020755767822266, "logits/rejected": -5.196411609649658, "logps/chosen": -251.571533203125, "logps/rejected": -686.1497802734375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.9902467727661133, "rewards/margins": 53.397621154785156, "rewards/rejected": -54.38786697387695, "step": 290 }, { "epoch": 0.23809523809523808, "grad_norm": 7.448752057825914e-06, "learning_rate": 5.676054455608805e-06, "logits/chosen": -5.290119171142578, "logits/rejected": -5.161767482757568, "logps/chosen": -192.84292602539062, "logps/rejected": -609.5615844726562, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.3850996494293213, "rewards/margins": 46.675621032714844, "rewards/rejected": -48.06072235107422, "step": 295 }, { "epoch": 0.24213075060532688, "grad_norm": 8.788862970732225e-09, "learning_rate": 5.6714434172044524e-06, "logits/chosen": -5.228567123413086, "logits/rejected": -5.350821495056152, "logps/chosen": -220.0755615234375, "logps/rejected": -698.3884887695312, "loss": 0.054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3978014886379242, "rewards/margins": 55.6969108581543, "rewards/rejected": -55.299110412597656, "step": 300 }, { "epoch": 0.24616626311541565, "grad_norm": 7.401735729217762e-06, "learning_rate": 5.666747658174622e-06, "logits/chosen": -5.45712947845459, "logits/rejected": -5.45941162109375, "logps/chosen": -195.6270294189453, "logps/rejected": -591.5343017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7390839457511902, "rewards/margins": 45.9856071472168, "rewards/rejected": -46.72468948364258, "step": 305 }, { "epoch": 0.25020177562550444, "grad_norm": 7.191949407570064e-05, "learning_rate": 5.661967372651312e-06, "logits/chosen": -5.082695960998535, "logits/rejected": -4.980287075042725, "logps/chosen": -228.07162475585938, "logps/rejected": -646.6930541992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5647106170654297, "rewards/margins": 49.39262008666992, "rewards/rejected": -52.95732879638672, "step": 310 }, { "epoch": 0.2542372881355932, "grad_norm": 0.019374744966626167, "learning_rate": 5.657102758261015e-06, "logits/chosen": -4.977044105529785, "logits/rejected": -5.263559341430664, "logps/chosen": -195.72152709960938, "logps/rejected": -592.3165893554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.038941167294979095, "rewards/margins": 48.49567794799805, "rewards/rejected": -48.45673370361328, "step": 315 }, { "epoch": 0.258272800645682, "grad_norm": 1.7742197087500244e-05, "learning_rate": 5.652154016116545e-06, "logits/chosen": -5.545248985290527, "logits/rejected": -5.140854835510254, "logps/chosen": -201.30615234375, "logps/rejected": -667.4631958007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2748926877975464, "rewards/margins": 53.26398849487305, "rewards/rejected": -54.53887939453125, "step": 320 }, { "epoch": 0.2623083131557708, "grad_norm": 2.952660054234002e-07, "learning_rate": 5.647121350808724e-06, "logits/chosen": -5.486944675445557, "logits/rejected": -5.031026840209961, "logps/chosen": -197.08282470703125, "logps/rejected": -617.19189453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.465013265609741, "rewards/margins": 47.34251022338867, "rewards/rejected": -49.807525634765625, "step": 325 }, { "epoch": 0.26634382566585957, "grad_norm": 9.331162686976313e-07, "learning_rate": 5.642004970397927e-06, "logits/chosen": -5.784380912780762, "logits/rejected": -5.35366678237915, "logps/chosen": -211.95205688476562, "logps/rejected": -729.4708862304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.806584119796753, "rewards/margins": 57.27272415161133, "rewards/rejected": -59.079307556152344, "step": 330 }, { "epoch": 0.27037933817594834, "grad_norm": 0.0007456880412064493, "learning_rate": 5.6368050864054755e-06, "logits/chosen": -5.461863040924072, "logits/rejected": -5.533261775970459, "logps/chosen": -165.5745391845703, "logps/rejected": -578.37353515625, "loss": 0.0871, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.10736255347728729, "rewards/margins": 45.70323181152344, "rewards/rejected": -45.81060028076172, "step": 335 }, { "epoch": 0.2744148506860371, "grad_norm": 3.664925679913722e-05, "learning_rate": 5.6315219138048935e-06, "logits/chosen": -5.822869300842285, "logits/rejected": -5.332520961761475, "logps/chosen": -204.32546997070312, "logps/rejected": -611.2283935546875, "loss": 0.1996, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.39035463333129883, "rewards/margins": 49.15932083129883, "rewards/rejected": -49.5496711730957, "step": 340 }, { "epoch": 0.2784503631961259, "grad_norm": 0.0001088407589122653, "learning_rate": 5.626155671013023e-06, "logits/chosen": -5.600030899047852, "logits/rejected": -5.3980889320373535, "logps/chosen": -226.5461883544922, "logps/rejected": -656.7044677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.44686442613601685, "rewards/margins": 50.97893142700195, "rewards/rejected": -51.4257926940918, "step": 345 }, { "epoch": 0.2824858757062147, "grad_norm": 2.487412757545826e-06, "learning_rate": 5.6207065798809935e-06, "logits/chosen": -5.5403289794921875, "logits/rejected": -6.089693546295166, "logps/chosen": -191.3777313232422, "logps/rejected": -637.6499633789062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3141205310821533, "rewards/margins": 53.1793098449707, "rewards/rejected": -51.86519241333008, "step": 350 }, { "epoch": 0.28652138821630346, "grad_norm": 1.4271338841354009e-05, "learning_rate": 5.615174865685048e-06, "logits/chosen": -5.915181636810303, "logits/rejected": -6.0416340827941895, "logps/chosen": -192.2244110107422, "logps/rejected": -650.7037353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5946803092956543, "rewards/margins": 49.72855758666992, "rewards/rejected": -52.32323455810547, "step": 355 }, { "epoch": 0.29055690072639223, "grad_norm": 4.210760380374268e-06, "learning_rate": 5.6095607571172305e-06, "logits/chosen": -5.622889041900635, "logits/rejected": -6.083870887756348, "logps/chosen": -218.2845916748047, "logps/rejected": -641.5206298828125, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 1.3222715854644775, "rewards/margins": 52.35942459106445, "rewards/rejected": -51.03715133666992, "step": 360 }, { "epoch": 0.29459241323648105, "grad_norm": 9.01486587524414, "learning_rate": 5.603864486275933e-06, "logits/chosen": -5.980809211730957, "logits/rejected": -5.915687561035156, "logps/chosen": -161.8653564453125, "logps/rejected": -579.376220703125, "loss": 0.0762, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07540754973888397, "rewards/margins": 45.42487335205078, "rewards/rejected": -45.500282287597656, "step": 365 }, { "epoch": 0.2986279257465698, "grad_norm": 1.2724981957035197e-07, "learning_rate": 5.598086288656299e-06, "logits/chosen": -6.2588582038879395, "logits/rejected": -6.032774925231934, "logps/chosen": -175.0609130859375, "logps/rejected": -578.6551513671875, "loss": 0.0126, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.44100508093833923, "rewards/margins": 46.375091552734375, "rewards/rejected": -46.81609344482422, "step": 370 }, { "epoch": 0.3026634382566586, "grad_norm": 2.624596762146325e-09, "learning_rate": 5.592226403140486e-06, "logits/chosen": -5.993817329406738, "logits/rejected": -5.717774868011475, "logps/chosen": -205.4371337890625, "logps/rejected": -680.5233764648438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5212082862854004, "rewards/margins": 55.45811080932617, "rewards/rejected": -54.9369010925293, "step": 375 }, { "epoch": 0.30669895076674736, "grad_norm": 3.0500208936246054e-07, "learning_rate": 5.586285071987793e-06, "logits/chosen": -5.604440689086914, "logits/rejected": -5.637472629547119, "logps/chosen": -196.48214721679688, "logps/rejected": -609.2901000976562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6224287748336792, "rewards/margins": 48.38045120239258, "rewards/rejected": -47.758018493652344, "step": 380 }, { "epoch": 0.3107344632768362, "grad_norm": 1.8079782648783294e-07, "learning_rate": 5.580262540824643e-06, "logits/chosen": -6.01360559463501, "logits/rejected": -5.838696479797363, "logps/chosen": -206.0096893310547, "logps/rejected": -612.0291748046875, "loss": 0.0457, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5278552174568176, "rewards/margins": 48.58734893798828, "rewards/rejected": -49.115211486816406, "step": 385 }, { "epoch": 0.31476997578692495, "grad_norm": 1.4297783934580366e-07, "learning_rate": 5.574159058634429e-06, "logits/chosen": -5.916940689086914, "logits/rejected": -5.852144241333008, "logps/chosen": -199.3605499267578, "logps/rejected": -565.8484497070312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.4689599573612213, "rewards/margins": 44.18257522583008, "rewards/rejected": -44.65153884887695, "step": 390 }, { "epoch": 0.3188054882970137, "grad_norm": 1.071068012059584e-09, "learning_rate": 5.567974877747217e-06, "logits/chosen": -5.432553291320801, "logits/rejected": -5.690057754516602, "logps/chosen": -176.4378662109375, "logps/rejected": -621.3287963867188, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.06440804898738861, "rewards/margins": 48.63337326049805, "rewards/rejected": -48.697784423828125, "step": 395 }, { "epoch": 0.3228410008071025, "grad_norm": 2.2283744328888133e-05, "learning_rate": 5.561710253829323e-06, "logits/chosen": -5.052567481994629, "logits/rejected": -5.251601696014404, "logps/chosen": -218.6304168701172, "logps/rejected": -627.0401611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.37047404050827026, "rewards/margins": 49.534812927246094, "rewards/rejected": -49.90529251098633, "step": 400 }, { "epoch": 0.3268765133171913, "grad_norm": 4.2673306666074495e-07, "learning_rate": 5.555365445872731e-06, "logits/chosen": -5.356926441192627, "logits/rejected": -5.504358291625977, "logps/chosen": -193.74072265625, "logps/rejected": -684.0533447265625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.010841893963515759, "rewards/margins": 55.61138916015625, "rewards/rejected": -55.600555419921875, "step": 405 }, { "epoch": 0.3309120258272801, "grad_norm": 1.4931893019820563e-06, "learning_rate": 5.5489407161843985e-06, "logits/chosen": -4.521881580352783, "logits/rejected": -5.312588214874268, "logps/chosen": -170.4561767578125, "logps/rejected": -553.3409423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5783250331878662, "rewards/margins": 41.9833869934082, "rewards/rejected": -43.56171417236328, "step": 410 }, { "epoch": 0.33494753833736884, "grad_norm": 5.23489482873174e-08, "learning_rate": 5.5424363303754016e-06, "logits/chosen": -4.734778881072998, "logits/rejected": -5.203174591064453, "logps/chosen": -212.63583374023438, "logps/rejected": -639.210693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0527421236038208, "rewards/margins": 49.97311782836914, "rewards/rejected": -51.02585983276367, "step": 415 }, { "epoch": 0.3389830508474576, "grad_norm": 9.542733714340557e-09, "learning_rate": 5.535852557349961e-06, "logits/chosen": -5.044532775878906, "logits/rejected": -5.107327461242676, "logps/chosen": -204.3588409423828, "logps/rejected": -590.7885131835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7146708369255066, "rewards/margins": 46.748069763183594, "rewards/rejected": -46.03339767456055, "step": 420 }, { "epoch": 0.34301856335754644, "grad_norm": 9.109339771384839e-08, "learning_rate": 5.529189669294321e-06, "logits/chosen": -5.21028995513916, "logits/rejected": -5.133396148681641, "logps/chosen": -180.69126892089844, "logps/rejected": -593.2762451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6593784093856812, "rewards/margins": 47.91680145263672, "rewards/rejected": -48.57617950439453, "step": 425 }, { "epoch": 0.3470540758676352, "grad_norm": 0.003121189307421446, "learning_rate": 5.522447941665499e-06, "logits/chosen": -4.818991661071777, "logits/rejected": -5.058593273162842, "logps/chosen": -204.40231323242188, "logps/rejected": -636.2479248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4958776533603668, "rewards/margins": 50.53229522705078, "rewards/rejected": -50.036415100097656, "step": 430 }, { "epoch": 0.35108958837772397, "grad_norm": 4.919891230525941e-11, "learning_rate": 5.5156276531798975e-06, "logits/chosen": -4.704854488372803, "logits/rejected": -4.997499942779541, "logps/chosen": -178.4606170654297, "logps/rejected": -594.2947998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3024781942367554, "rewards/margins": 48.864376068115234, "rewards/rejected": -47.56189727783203, "step": 435 }, { "epoch": 0.35512510088781274, "grad_norm": 6.498929724330083e-05, "learning_rate": 5.50872908580178e-06, "logits/chosen": -4.799943447113037, "logits/rejected": -5.2160420417785645, "logps/chosen": -206.88162231445312, "logps/rejected": -581.205810546875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 1.3390350341796875, "rewards/margins": 46.86649703979492, "rewards/rejected": -45.5274658203125, "step": 440 }, { "epoch": 0.3591606133979015, "grad_norm": 7.001431367825717e-05, "learning_rate": 5.501752524731613e-06, "logits/chosen": -5.14467716217041, "logits/rejected": -5.150885581970215, "logps/chosen": -208.15478515625, "logps/rejected": -673.6754150390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6440627574920654, "rewards/margins": 56.38043212890625, "rewards/rejected": -54.736366271972656, "step": 445 }, { "epoch": 0.36319612590799033, "grad_norm": 12.17817211151123, "learning_rate": 5.494698258394281e-06, "logits/chosen": -5.178465843200684, "logits/rejected": -5.559037208557129, "logps/chosen": -200.14974975585938, "logps/rejected": -619.0408325195312, "loss": 0.013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3612689971923828, "rewards/margins": 49.216556549072266, "rewards/rejected": -49.577823638916016, "step": 450 }, { "epoch": 0.3672316384180791, "grad_norm": 2.2588696424463706e-07, "learning_rate": 5.487566578427153e-06, "logits/chosen": -4.518400192260742, "logits/rejected": -5.005860805511475, "logps/chosen": -183.30685424804688, "logps/rejected": -579.0000610351562, "loss": 0.0353, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.009351563639938831, "rewards/margins": 45.234535217285156, "rewards/rejected": -45.2438850402832, "step": 455 }, { "epoch": 0.37126715092816787, "grad_norm": 1.9835477260699008e-08, "learning_rate": 5.480357779668039e-06, "logits/chosen": -4.803116798400879, "logits/rejected": -4.599274635314941, "logps/chosen": -196.0874481201172, "logps/rejected": -615.0345458984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.7047766447067261, "rewards/margins": 49.95933532714844, "rewards/rejected": -49.25455856323242, "step": 460 }, { "epoch": 0.37530266343825663, "grad_norm": 0.04249320551753044, "learning_rate": 5.473072160142985e-06, "logits/chosen": -4.302497863769531, "logits/rejected": -4.2391862869262695, "logps/chosen": -193.98391723632812, "logps/rejected": -574.122802734375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 1.5295803546905518, "rewards/margins": 46.3218879699707, "rewards/rejected": -44.79230499267578, "step": 465 }, { "epoch": 0.37933817594834546, "grad_norm": 1.3640354836752522e-06, "learning_rate": 5.465710021053965e-06, "logits/chosen": -4.223885536193848, "logits/rejected": -4.184884071350098, "logps/chosen": -214.5250701904297, "logps/rejected": -649.9946899414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.26530104875564575, "rewards/margins": 52.98869705200195, "rewards/rejected": -52.72339630126953, "step": 470 }, { "epoch": 0.3833736884584342, "grad_norm": 1.51522408486926e-06, "learning_rate": 5.458271666766421e-06, "logits/chosen": -4.235510349273682, "logits/rejected": -4.147494316101074, "logps/chosen": -169.38424682617188, "logps/rejected": -598.533203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.7080025672912598, "rewards/margins": 48.156715393066406, "rewards/rejected": -47.44871139526367, "step": 475 }, { "epoch": 0.387409200968523, "grad_norm": 4.0114662169798976e-07, "learning_rate": 5.450757404796685e-06, "logits/chosen": -4.068259239196777, "logits/rejected": -3.8137316703796387, "logps/chosen": -173.73312377929688, "logps/rejected": -568.1229858398438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5089066028594971, "rewards/margins": 44.55615234375, "rewards/rejected": -45.065059661865234, "step": 480 }, { "epoch": 0.39144471347861176, "grad_norm": 0.00010334089893149212, "learning_rate": 5.4431675457992604e-06, "logits/chosen": -3.8022620677948, "logits/rejected": -3.916943073272705, "logps/chosen": -203.17031860351562, "logps/rejected": -620.3515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5890091061592102, "rewards/margins": 49.28617858886719, "rewards/rejected": -49.87519073486328, "step": 485 }, { "epoch": 0.3954802259887006, "grad_norm": 4.69732776764431e-06, "learning_rate": 5.435502403553982e-06, "logits/chosen": -4.109463691711426, "logits/rejected": -3.856114149093628, "logps/chosen": -159.9314422607422, "logps/rejected": -594.8986206054688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.8233232498168945, "rewards/margins": 46.235076904296875, "rewards/rejected": -45.4117546081543, "step": 490 }, { "epoch": 0.39951573849878935, "grad_norm": 0.0005119738634675741, "learning_rate": 5.427762294953047e-06, "logits/chosen": -4.303279876708984, "logits/rejected": -4.3433942794799805, "logps/chosen": -168.674072265625, "logps/rejected": -613.7282104492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.3958518505096436, "rewards/margins": 50.94486618041992, "rewards/rejected": -48.549015045166016, "step": 495 }, { "epoch": 0.4035512510088781, "grad_norm": 4.1175621845468413e-07, "learning_rate": 5.419947539987907e-06, "logits/chosen": -4.395185947418213, "logits/rejected": -4.332601070404053, "logps/chosen": -195.73355102539062, "logps/rejected": -567.9427490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7712366580963135, "rewards/margins": 45.928897857666016, "rewards/rejected": -45.15766143798828, "step": 500 }, { "epoch": 0.4035512510088781, "eval_logits/chosen": -4.167891025543213, "eval_logits/rejected": -4.807552337646484, "eval_logps/chosen": -123.61334991455078, "eval_logps/rejected": -621.6817626953125, "eval_loss": 0.0003149278345517814, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.4816978871822357, "eval_rewards/margins": 54.9031982421875, "eval_rewards/rejected": -55.3848991394043, "eval_runtime": 20.0744, "eval_samples_per_second": 9.963, "eval_steps_per_second": 9.963, "step": 500 }, { "epoch": 0.4075867635189669, "grad_norm": 0.00016455356671940535, "learning_rate": 5.412058461736046e-06, "logits/chosen": -4.052435874938965, "logits/rejected": -4.6851277351379395, "logps/chosen": -213.290771484375, "logps/rejected": -625.77490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3994954228401184, "rewards/margins": 49.07280731201172, "rewards/rejected": -49.47230911254883, "step": 505 }, { "epoch": 0.4116222760290557, "grad_norm": 5.402546776167583e-06, "learning_rate": 5.404095386347614e-06, "logits/chosen": -4.575331687927246, "logits/rejected": -4.270782947540283, "logps/chosen": -157.86766052246094, "logps/rejected": -634.0606079101562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.07826220989227295, "rewards/margins": 49.94772720336914, "rewards/rejected": -49.86946105957031, "step": 510 }, { "epoch": 0.4156577885391445, "grad_norm": 0.005546149332076311, "learning_rate": 5.396058643031955e-06, "logits/chosen": -4.225657939910889, "logits/rejected": -4.398144245147705, "logps/chosen": -185.09719848632812, "logps/rejected": -542.6047973632812, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.5985727310180664, "rewards/margins": 41.91501235961914, "rewards/rejected": -42.51358413696289, "step": 515 }, { "epoch": 0.41969330104923325, "grad_norm": 4.697389588415035e-09, "learning_rate": 5.387948564043991e-06, "logits/chosen": -4.278233528137207, "logits/rejected": -4.167605400085449, "logps/chosen": -200.6453094482422, "logps/rejected": -647.5222778320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9711940288543701, "rewards/margins": 53.77836990356445, "rewards/rejected": -51.80717849731445, "step": 520 }, { "epoch": 0.423728813559322, "grad_norm": 4.7003190957184415e-06, "learning_rate": 5.3797654846704845e-06, "logits/chosen": -3.893019914627075, "logits/rejected": -3.8111634254455566, "logps/chosen": -192.12875366210938, "logps/rejected": -630.2559814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8104346990585327, "rewards/margins": 51.10588455200195, "rewards/rejected": -50.295448303222656, "step": 525 }, { "epoch": 0.42776432606941084, "grad_norm": 7.825035027053673e-06, "learning_rate": 5.371509743216178e-06, "logits/chosen": -3.7469964027404785, "logits/rejected": -3.6251449584960938, "logps/chosen": -174.94464111328125, "logps/rejected": -643.8927612304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.11389608681201935, "rewards/margins": 52.19908905029297, "rewards/rejected": -52.08519744873047, "step": 530 }, { "epoch": 0.4317998385794996, "grad_norm": 0.0005877696094103158, "learning_rate": 5.363181680989811e-06, "logits/chosen": -3.4433677196502686, "logits/rejected": -3.4531006813049316, "logps/chosen": -202.9792938232422, "logps/rejected": -599.540771484375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 1.0317554473876953, "rewards/margins": 48.92766571044922, "rewards/rejected": -47.895912170410156, "step": 535 }, { "epoch": 0.4358353510895884, "grad_norm": 6.057070095266681e-06, "learning_rate": 5.3547816422900055e-06, "logits/chosen": -3.7332847118377686, "logits/rejected": -3.420151948928833, "logps/chosen": -192.3090057373047, "logps/rejected": -622.8360595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9278111457824707, "rewards/margins": 49.98357009887695, "rewards/rejected": -49.05575942993164, "step": 540 }, { "epoch": 0.43987086359967714, "grad_norm": 1.941892833201564e-06, "learning_rate": 5.3463099743910335e-06, "logits/chosen": -3.510545015335083, "logits/rejected": -3.4775662422180176, "logps/chosen": -187.3371124267578, "logps/rejected": -635.9449462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.079883337020874, "rewards/margins": 52.63831329345703, "rewards/rejected": -51.55842971801758, "step": 545 }, { "epoch": 0.44390637610976597, "grad_norm": 1.551067470018097e-07, "learning_rate": 5.337767027528465e-06, "logits/chosen": -3.6307311058044434, "logits/rejected": -3.295358657836914, "logps/chosen": -155.5222930908203, "logps/rejected": -526.36376953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5175865888595581, "rewards/margins": 40.378150939941406, "rewards/rejected": -39.86056137084961, "step": 550 }, { "epoch": 0.44794188861985473, "grad_norm": 7.61471255827928e-07, "learning_rate": 5.329153154884676e-06, "logits/chosen": -3.584887742996216, "logits/rejected": -3.3711695671081543, "logps/chosen": -194.7342529296875, "logps/rejected": -551.7994995117188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5104327201843262, "rewards/margins": 41.73994064331055, "rewards/rejected": -43.25037384033203, "step": 555 }, { "epoch": 0.4519774011299435, "grad_norm": 26.125953674316406, "learning_rate": 5.320468712574261e-06, "logits/chosen": -3.9935402870178223, "logits/rejected": -3.6201224327087402, "logps/chosen": -195.44789123535156, "logps/rejected": -583.4216918945312, "loss": 0.0397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.24364593625068665, "rewards/margins": 45.72065353393555, "rewards/rejected": -45.96429443359375, "step": 560 }, { "epoch": 0.45601291364003227, "grad_norm": 0.0010722784791141748, "learning_rate": 5.311714059629304e-06, "logits/chosen": -3.6510071754455566, "logits/rejected": -3.4131195545196533, "logps/chosen": -190.18630981445312, "logps/rejected": -585.0267333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1186213493347168, "rewards/margins": 47.24138641357422, "rewards/rejected": -47.122764587402344, "step": 565 }, { "epoch": 0.4600484261501211, "grad_norm": 3.105681933845972e-09, "learning_rate": 5.302889557984532e-06, "logits/chosen": -3.446031093597412, "logits/rejected": -2.9981565475463867, "logps/chosen": -216.0496368408203, "logps/rejected": -759.55810546875, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.36753106117248535, "rewards/margins": 61.6691780090332, "rewards/rejected": -62.036705017089844, "step": 570 }, { "epoch": 0.46408393866020986, "grad_norm": 0.002022218657657504, "learning_rate": 5.293995572462361e-06, "logits/chosen": -3.622173309326172, "logits/rejected": -3.3048408031463623, "logps/chosen": -177.41781616210938, "logps/rejected": -641.06201171875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816316366195679, "rewards/margins": 51.906837463378906, "rewards/rejected": -50.925201416015625, "step": 575 }, { "epoch": 0.46811945117029863, "grad_norm": 0.001449701958335936, "learning_rate": 5.2850324707578044e-06, "logits/chosen": -3.8222708702087402, "logits/rejected": -3.4854819774627686, "logps/chosen": -183.46389770507812, "logps/rejected": -709.8363647460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.9588214755058289, "rewards/margins": 56.54863357543945, "rewards/rejected": -57.5074577331543, "step": 580 }, { "epoch": 0.4721549636803874, "grad_norm": 3.504745222926431e-07, "learning_rate": 5.2760006234232795e-06, "logits/chosen": -4.350462913513184, "logits/rejected": -3.67132568359375, "logps/chosen": -183.1516571044922, "logps/rejected": -659.9432373046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6819636821746826, "rewards/margins": 54.7717170715332, "rewards/rejected": -53.089759826660156, "step": 585 }, { "epoch": 0.47619047619047616, "grad_norm": 4.356805220595561e-06, "learning_rate": 5.266900403853279e-06, "logits/chosen": -4.081538200378418, "logits/rejected": -3.851404905319214, "logps/chosen": -181.1648712158203, "logps/rejected": -616.5856323242188, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -1.9837591648101807, "rewards/margins": 46.516815185546875, "rewards/rejected": -48.50057601928711, "step": 590 }, { "epoch": 0.480225988700565, "grad_norm": 3.4349441193626262e-06, "learning_rate": 5.257732188268946e-06, "logits/chosen": -4.269750595092773, "logits/rejected": -4.148410320281982, "logps/chosen": -181.13882446289062, "logps/rejected": -601.9407348632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.20981650054454803, "rewards/margins": 48.34564208984375, "rewards/rejected": -48.135826110839844, "step": 595 }, { "epoch": 0.48426150121065376, "grad_norm": 0.0002223353658337146, "learning_rate": 5.248496355702507e-06, "logits/chosen": -4.089084148406982, "logits/rejected": -4.0359673500061035, "logps/chosen": -185.7232208251953, "logps/rejected": -586.719970703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5246174335479736, "rewards/margins": 48.23687744140625, "rewards/rejected": -46.712257385253906, "step": 600 }, { "epoch": 0.4882970137207425, "grad_norm": 0.0023766509257256985, "learning_rate": 5.239193287981618e-06, "logits/chosen": -4.414247989654541, "logits/rejected": -4.1377387046813965, "logps/chosen": -208.94747924804688, "logps/rejected": -674.6076049804688, "loss": 0.0522, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.562401533126831, "rewards/margins": 51.902381896972656, "rewards/rejected": -54.46477508544922, "step": 605 }, { "epoch": 0.4923325262308313, "grad_norm": 5.0129106966778636e-05, "learning_rate": 5.22982336971356e-06, "logits/chosen": -5.071564674377441, "logits/rejected": -4.609591484069824, "logps/chosen": -184.321533203125, "logps/rejected": -701.0501708984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.12223486602306366, "rewards/margins": 56.8878059387207, "rewards/rejected": -56.76557159423828, "step": 610 }, { "epoch": 0.4963680387409201, "grad_norm": 0.8392285704612732, "learning_rate": 5.220386988269352e-06, "logits/chosen": -4.871917724609375, "logits/rejected": -4.130062103271484, "logps/chosen": -156.74441528320312, "logps/rejected": -617.201171875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.254533052444458, "rewards/margins": 49.76227951049805, "rewards/rejected": -49.507747650146484, "step": 615 }, { "epoch": 0.5004035512510089, "grad_norm": 1.558072710849956e-07, "learning_rate": 5.2108845337677355e-06, "logits/chosen": -4.578597068786621, "logits/rejected": -3.849116563796997, "logps/chosen": -184.94943237304688, "logps/rejected": -634.3116455078125, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": -0.6036307215690613, "rewards/margins": 50.691749572753906, "rewards/rejected": -51.295379638671875, "step": 620 }, { "epoch": 0.5044390637610977, "grad_norm": 9.378297249895695e-09, "learning_rate": 5.20131639905904e-06, "logits/chosen": -4.202679634094238, "logits/rejected": -3.378619432449341, "logps/chosen": -212.2616729736328, "logps/rejected": -679.2237548828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.066842794418335, "rewards/margins": 51.69414520263672, "rewards/rejected": -53.76099395751953, "step": 625 }, { "epoch": 0.5084745762711864, "grad_norm": 1.7517296207331112e-12, "learning_rate": 5.191682979708945e-06, "logits/chosen": -3.837179660797119, "logits/rejected": -2.9817659854888916, "logps/chosen": -222.31332397460938, "logps/rejected": -774.2076416015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5249789953231812, "rewards/margins": 63.427825927734375, "rewards/rejected": -63.95280838012695, "step": 630 }, { "epoch": 0.5125100887812752, "grad_norm": 0.0027201841585338116, "learning_rate": 5.181984673982129e-06, "logits/chosen": -3.5101494789123535, "logits/rejected": -2.9132816791534424, "logps/chosen": -184.43637084960938, "logps/rejected": -643.87744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3627457022666931, "rewards/margins": 51.796546936035156, "rewards/rejected": -52.1592903137207, "step": 635 }, { "epoch": 0.516545601291364, "grad_norm": 19.031314849853516, "learning_rate": 5.172221882825801e-06, "logits/chosen": -3.197032928466797, "logits/rejected": -2.6248879432678223, "logps/chosen": -187.95008850097656, "logps/rejected": -679.5035400390625, "loss": 0.2548, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3233927488327026, "rewards/margins": 52.909637451171875, "rewards/rejected": -54.2330322265625, "step": 640 }, { "epoch": 0.5205811138014528, "grad_norm": 6.324626156128943e-06, "learning_rate": 5.162395009853123e-06, "logits/chosen": -3.41083025932312, "logits/rejected": -2.8033792972564697, "logps/chosen": -220.29541015625, "logps/rejected": -719.135009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5389162302017212, "rewards/margins": 57.4514045715332, "rewards/rejected": -57.990325927734375, "step": 645 }, { "epoch": 0.5246166263115416, "grad_norm": 3.3813166737672873e-06, "learning_rate": 5.152504461326533e-06, "logits/chosen": -3.0885791778564453, "logits/rejected": -2.7710540294647217, "logps/chosen": -217.82345581054688, "logps/rejected": -728.4766845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.6404612064361572, "rewards/margins": 58.075401306152344, "rewards/rejected": -59.71586990356445, "step": 650 }, { "epoch": 0.5286521388216303, "grad_norm": 4.533988612820394e-05, "learning_rate": 5.142550646140934e-06, "logits/chosen": -2.7596487998962402, "logits/rejected": -2.7396860122680664, "logps/chosen": -211.9600067138672, "logps/rejected": -700.3244018554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.2655123770236969, "rewards/margins": 55.347137451171875, "rewards/rejected": -55.612648010253906, "step": 655 }, { "epoch": 0.5326876513317191, "grad_norm": 5.40260430170747e-07, "learning_rate": 5.132533975806806e-06, "logits/chosen": -3.378823757171631, "logits/rejected": -2.843888759613037, "logps/chosen": -218.29037475585938, "logps/rejected": -726.20703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.6409857273101807, "rewards/margins": 57.12211990356445, "rewards/rejected": -58.76311111450195, "step": 660 }, { "epoch": 0.536723163841808, "grad_norm": 5.173802719582454e-07, "learning_rate": 5.122454864433184e-06, "logits/chosen": -3.0929930210113525, "logits/rejected": -2.803999662399292, "logps/chosen": -196.25381469726562, "logps/rejected": -619.5491333007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.25847363471984863, "rewards/margins": 50.00075912475586, "rewards/rejected": -49.74229049682617, "step": 665 }, { "epoch": 0.5407586763518967, "grad_norm": 6.059609404474031e-06, "learning_rate": 5.112313728710534e-06, "logits/chosen": -3.0562758445739746, "logits/rejected": -2.748084306716919, "logps/chosen": -191.86276245117188, "logps/rejected": -598.6434326171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.4109610617160797, "rewards/margins": 47.83714294433594, "rewards/rejected": -48.24810791015625, "step": 670 }, { "epoch": 0.5447941888619855, "grad_norm": 4.4000217358508564e-10, "learning_rate": 5.102110987893538e-06, "logits/chosen": -3.081526279449463, "logits/rejected": -2.8767311573028564, "logps/chosen": -182.74066162109375, "logps/rejected": -586.5557861328125, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2247068881988525, "rewards/margins": 46.331939697265625, "rewards/rejected": -47.556644439697266, "step": 675 }, { "epoch": 0.5488297013720742, "grad_norm": 5.9803546719194856e-06, "learning_rate": 5.091847063783754e-06, "logits/chosen": -3.3907852172851562, "logits/rejected": -3.2088146209716797, "logps/chosen": -204.1681671142578, "logps/rejected": -637.3056030273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.6639277935028076, "rewards/margins": 49.85825729370117, "rewards/rejected": -51.522186279296875, "step": 680 }, { "epoch": 0.552865213882163, "grad_norm": 1.386081337928772, "learning_rate": 5.081522380712175e-06, "logits/chosen": -3.2473690509796143, "logits/rejected": -3.1627423763275146, "logps/chosen": -176.0622100830078, "logps/rejected": -558.7054443359375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.03999924659729, "rewards/margins": 43.819122314453125, "rewards/rejected": -44.85912322998047, "step": 685 }, { "epoch": 0.5569007263922519, "grad_norm": 9.425334290824594e-09, "learning_rate": 5.071137365521693e-06, "logits/chosen": -3.298517942428589, "logits/rejected": -3.322131395339966, "logps/chosen": -215.91650390625, "logps/rejected": -655.6781616210938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8203239440917969, "rewards/margins": 54.3050651550293, "rewards/rejected": -52.48474884033203, "step": 690 }, { "epoch": 0.5609362389023406, "grad_norm": 4.014120236206509e-08, "learning_rate": 5.060692447549451e-06, "logits/chosen": -3.7228550910949707, "logits/rejected": -3.572186231613159, "logps/chosen": -191.39697265625, "logps/rejected": -626.5252075195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5162326693534851, "rewards/margins": 49.964805603027344, "rewards/rejected": -50.48103713989258, "step": 695 }, { "epoch": 0.5649717514124294, "grad_norm": 1.0611952347971965e-06, "learning_rate": 5.050188058609086e-06, "logits/chosen": -4.0296220779418945, "logits/rejected": -3.7154762744903564, "logps/chosen": -203.16079711914062, "logps/rejected": -684.3465576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.000720500946045, "rewards/margins": 52.73036575317383, "rewards/rejected": -54.73108673095703, "step": 700 }, { "epoch": 0.5690072639225182, "grad_norm": 9.142602230838293e-09, "learning_rate": 5.039624632972892e-06, "logits/chosen": -3.8378138542175293, "logits/rejected": -3.5750255584716797, "logps/chosen": -194.47052001953125, "logps/rejected": -620.2805786132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8180750608444214, "rewards/margins": 51.04462814331055, "rewards/rejected": -50.22655487060547, "step": 705 }, { "epoch": 0.5730427764326069, "grad_norm": 4.582236456940336e-08, "learning_rate": 5.0290026073538475e-06, "logits/chosen": -3.698603868484497, "logits/rejected": -3.5194180011749268, "logps/chosen": -212.5537872314453, "logps/rejected": -706.3510131835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.08571233600378036, "rewards/margins": 57.29798126220703, "rewards/rejected": -57.2122688293457, "step": 710 }, { "epoch": 0.5770782889426957, "grad_norm": 2.0599900381057523e-06, "learning_rate": 5.018322420887578e-06, "logits/chosen": -3.808070421218872, "logits/rejected": -3.692031145095825, "logps/chosen": -187.86669921875, "logps/rejected": -655.1769409179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925509691238403, "rewards/margins": 53.68902587890625, "rewards/rejected": -52.69647979736328, "step": 715 }, { "epoch": 0.5811138014527845, "grad_norm": 1.9510202946548816e-06, "learning_rate": 5.00758451511419e-06, "logits/chosen": -3.8315036296844482, "logits/rejected": -3.6099681854248047, "logps/chosen": -165.10252380371094, "logps/rejected": -593.412841796875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.174157738685608, "rewards/margins": 48.459686279296875, "rewards/rejected": -47.28553009033203, "step": 720 }, { "epoch": 0.5851493139628733, "grad_norm": 5.612459062831476e-06, "learning_rate": 4.996789333960021e-06, "logits/chosen": -3.8299670219421387, "logits/rejected": -3.661681652069092, "logps/chosen": -206.83255004882812, "logps/rejected": -651.4453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3886255025863647, "rewards/margins": 53.528099060058594, "rewards/rejected": -52.13947677612305, "step": 725 }, { "epoch": 0.5891848264729621, "grad_norm": 8.607041190877851e-10, "learning_rate": 4.9859373237192885e-06, "logits/chosen": -3.7934436798095703, "logits/rejected": -3.587398052215576, "logps/chosen": -196.7765350341797, "logps/rejected": -652.5866088867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.165336012840271, "rewards/margins": 51.675559997558594, "rewards/rejected": -52.84089279174805, "step": 730 }, { "epoch": 0.5932203389830508, "grad_norm": 5.552428516963914e-10, "learning_rate": 4.975028933035635e-06, "logits/chosen": -3.941788911819458, "logits/rejected": -3.607097625732422, "logps/chosen": -183.5889129638672, "logps/rejected": -662.2330932617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7723830938339233, "rewards/margins": 53.89899826049805, "rewards/rejected": -53.126609802246094, "step": 735 }, { "epoch": 0.5972558514931396, "grad_norm": 0.00529184564948082, "learning_rate": 4.964064612883583e-06, "logits/chosen": -3.770343065261841, "logits/rejected": -3.611548662185669, "logps/chosen": -178.23306274414062, "logps/rejected": -604.9061279296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.021260689944028854, "rewards/margins": 48.23130798339844, "rewards/rejected": -48.21004867553711, "step": 740 }, { "epoch": 0.6012913640032284, "grad_norm": 0.00043505587382242084, "learning_rate": 4.953044816549892e-06, "logits/chosen": -3.9126784801483154, "logits/rejected": -3.4190993309020996, "logps/chosen": -209.35379028320312, "logps/rejected": -695.595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3992263376712799, "rewards/margins": 55.706146240234375, "rewards/rejected": -56.105369567871094, "step": 745 }, { "epoch": 0.6053268765133172, "grad_norm": 0.00022037996677681804, "learning_rate": 4.941969999614814e-06, "logits/chosen": -3.9362308979034424, "logits/rejected": -3.6246414184570312, "logps/chosen": -203.05938720703125, "logps/rejected": -682.1746826171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4353013038635254, "rewards/margins": 53.8332405090332, "rewards/rejected": -53.39794158935547, "step": 750 }, { "epoch": 0.609362389023406, "grad_norm": 0.014199169352650642, "learning_rate": 4.930840619933264e-06, "logits/chosen": -3.8848869800567627, "logits/rejected": -3.681208372116089, "logps/chosen": -198.85122680664062, "logps/rejected": -618.7135009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.02768411673605442, "rewards/margins": 49.193695068359375, "rewards/rejected": -49.22138214111328, "step": 755 }, { "epoch": 0.6133979015334947, "grad_norm": 7.011429261183366e-05, "learning_rate": 4.919657137615887e-06, "logits/chosen": -3.7704339027404785, "logits/rejected": -3.3427982330322266, "logps/chosen": -201.8556365966797, "logps/rejected": -644.2147827148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.06345400959253311, "rewards/margins": 50.51708984375, "rewards/rejected": -50.45363235473633, "step": 760 }, { "epoch": 0.6174334140435835, "grad_norm": 9.867144427910812e-12, "learning_rate": 4.908420015010043e-06, "logits/chosen": -3.7089428901672363, "logits/rejected": -3.6501917839050293, "logps/chosen": -192.80807495117188, "logps/rejected": -640.8880615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.8863500356674194, "rewards/margins": 50.76516342163086, "rewards/rejected": -51.651512145996094, "step": 765 }, { "epoch": 0.6214689265536724, "grad_norm": 2.907613634306472e-07, "learning_rate": 4.897129716680683e-06, "logits/chosen": -3.561674118041992, "logits/rejected": -3.4708034992218018, "logps/chosen": -188.24075317382812, "logps/rejected": -586.640625, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.9612159729003906, "rewards/margins": 45.523231506347656, "rewards/rejected": -46.48444366455078, "step": 770 }, { "epoch": 0.6255044390637611, "grad_norm": 0.002174068009480834, "learning_rate": 4.885786709391151e-06, "logits/chosen": -3.647106885910034, "logits/rejected": -3.2635769844055176, "logps/chosen": -204.57762145996094, "logps/rejected": -636.0866088867188, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6810237765312195, "rewards/margins": 49.697898864746094, "rewards/rejected": -50.3789176940918, "step": 775 }, { "epoch": 0.6295399515738499, "grad_norm": 0.0001940367801580578, "learning_rate": 4.874391462083881e-06, "logits/chosen": -3.715165615081787, "logits/rejected": -3.1870551109313965, "logps/chosen": -153.92391967773438, "logps/rejected": -547.9405517578125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.4129582345485687, "rewards/margins": 42.1519660949707, "rewards/rejected": -42.56492233276367, "step": 780 }, { "epoch": 0.6335754640839386, "grad_norm": 1.7206867042318663e-08, "learning_rate": 4.862944445861018e-06, "logits/chosen": -3.1122331619262695, "logits/rejected": -2.8321335315704346, "logps/chosen": -176.03341674804688, "logps/rejected": -625.9478759765625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 2.2548987865448, "rewards/margins": 51.6590690612793, "rewards/rejected": -49.404170989990234, "step": 785 }, { "epoch": 0.6376109765940274, "grad_norm": 6.188603851597918e-09, "learning_rate": 4.85144613396493e-06, "logits/chosen": -2.876060962677002, "logits/rejected": -2.649689197540283, "logps/chosen": -183.8154754638672, "logps/rejected": -595.1080932617188, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.2668537199497223, "rewards/margins": 47.0586051940918, "rewards/rejected": -46.79175567626953, "step": 790 }, { "epoch": 0.6416464891041163, "grad_norm": 8.331250000992441e-07, "learning_rate": 4.839897001758657e-06, "logits/chosen": -3.0971343517303467, "logits/rejected": -2.5379536151885986, "logps/chosen": -167.1718292236328, "logps/rejected": -691.115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.027057696133852005, "rewards/margins": 56.159271240234375, "rewards/rejected": -56.1322135925293, "step": 795 }, { "epoch": 0.645682001614205, "grad_norm": 8.255139505308762e-08, "learning_rate": 4.8282975267062465e-06, "logits/chosen": -2.7153148651123047, "logits/rejected": -2.4867656230926514, "logps/chosen": -183.47657775878906, "logps/rejected": -693.5360717773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.10276589542627335, "rewards/margins": 56.3755989074707, "rewards/rejected": -56.478370666503906, "step": 800 }, { "epoch": 0.6497175141242938, "grad_norm": 4.16763640487261e-07, "learning_rate": 4.816648188353021e-06, "logits/chosen": -2.7407164573669434, "logits/rejected": -2.418300151824951, "logps/chosen": -175.90464782714844, "logps/rejected": -590.6675415039062, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.0449092388153076, "rewards/margins": 46.81920623779297, "rewards/rejected": -45.77429962158203, "step": 805 }, { "epoch": 0.6537530266343826, "grad_norm": 2.927296449684036e-08, "learning_rate": 4.80494946830575e-06, "logits/chosen": -2.9118452072143555, "logits/rejected": -2.656959295272827, "logps/chosen": -194.09654235839844, "logps/rejected": -612.4659423828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2380180358886719, "rewards/margins": 50.537696838378906, "rewards/rejected": -49.29967498779297, "step": 810 }, { "epoch": 0.6577885391444713, "grad_norm": 1.802525275707012e-06, "learning_rate": 4.7932018502127415e-06, "logits/chosen": -2.6927199363708496, "logits/rejected": -2.382631301879883, "logps/chosen": -167.1374969482422, "logps/rejected": -607.7023315429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.121840476989746, "rewards/margins": 50.043373107910156, "rewards/rejected": -48.921531677246094, "step": 815 }, { "epoch": 0.6618240516545602, "grad_norm": 3.575197382588158e-08, "learning_rate": 4.781405819743845e-06, "logits/chosen": -2.5418434143066406, "logits/rejected": -2.201819658279419, "logps/chosen": -173.36370849609375, "logps/rejected": -667.8082275390625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.6706393957138062, "rewards/margins": 52.45208740234375, "rewards/rejected": -53.12272262573242, "step": 820 }, { "epoch": 0.6658595641646489, "grad_norm": 0.0001085257827071473, "learning_rate": 4.769561864570375e-06, "logits/chosen": -2.796977996826172, "logits/rejected": -2.4586143493652344, "logps/chosen": -169.2869873046875, "logps/rejected": -554.4059448242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7596370577812195, "rewards/margins": 44.84055709838867, "rewards/rejected": -44.0809211730957, "step": 825 }, { "epoch": 0.6698950766747377, "grad_norm": 9.224271124708139e-09, "learning_rate": 4.757670474344947e-06, "logits/chosen": -3.1090025901794434, "logits/rejected": -2.546966075897217, "logps/chosen": -202.9197998046875, "logps/rejected": -660.4207153320312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.2798770666122437, "rewards/margins": 51.52782440185547, "rewards/rejected": -52.80769729614258, "step": 830 }, { "epoch": 0.6739305891848265, "grad_norm": 5.559979698865902e-10, "learning_rate": 4.745732140681234e-06, "logits/chosen": -2.7433953285217285, "logits/rejected": -2.7001149654388428, "logps/chosen": -184.8888702392578, "logps/rejected": -696.0082397460938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7940214276313782, "rewards/margins": 55.962562561035156, "rewards/rejected": -56.7565803527832, "step": 835 }, { "epoch": 0.6779661016949152, "grad_norm": 3.247660970373545e-06, "learning_rate": 4.733747357133648e-06, "logits/chosen": -3.103003978729248, "logits/rejected": -2.687114715576172, "logps/chosen": -201.3419647216797, "logps/rejected": -625.7366943359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.3971020579338074, "rewards/margins": 50.13484573364258, "rewards/rejected": -50.53194808959961, "step": 840 }, { "epoch": 0.682001614205004, "grad_norm": 0.23893657326698303, "learning_rate": 4.7217166191769285e-06, "logits/chosen": -2.941079616546631, "logits/rejected": -2.61792254447937, "logps/chosen": -190.81103515625, "logps/rejected": -736.5392456054688, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.32978856563568115, "rewards/margins": 60.59857177734375, "rewards/rejected": -60.92836380004883, "step": 845 }, { "epoch": 0.6860371267150929, "grad_norm": 3.635064194096316e-10, "learning_rate": 4.709640424185663e-06, "logits/chosen": -2.70947527885437, "logits/rejected": -2.5305850505828857, "logps/chosen": -190.23194885253906, "logps/rejected": -712.6015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.968358039855957, "rewards/margins": 58.366455078125, "rewards/rejected": -59.334808349609375, "step": 850 }, { "epoch": 0.6900726392251816, "grad_norm": 1.3918701746717943e-09, "learning_rate": 4.697519271413722e-06, "logits/chosen": -2.937376022338867, "logits/rejected": -2.584773302078247, "logps/chosen": -217.8892059326172, "logps/rejected": -671.4661254882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.7504074573516846, "rewards/margins": 55.942405700683594, "rewards/rejected": -54.19199752807617, "step": 855 }, { "epoch": 0.6941081517352704, "grad_norm": 0.23099367320537567, "learning_rate": 4.685353661973619e-06, "logits/chosen": -2.884357213973999, "logits/rejected": -2.5582494735717773, "logps/chosen": -229.3910675048828, "logps/rejected": -635.4967041015625, "loss": 0.0245, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7188022136688232, "rewards/margins": 51.37095260620117, "rewards/rejected": -50.65215301513672, "step": 860 }, { "epoch": 0.6981436642453591, "grad_norm": 0.0002964086888823658, "learning_rate": 4.673144098815798e-06, "logits/chosen": -2.869894504547119, "logits/rejected": -2.449110746383667, "logps/chosen": -202.23318481445312, "logps/rejected": -649.0850219726562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.9419758319854736, "rewards/margins": 51.27717208862305, "rewards/rejected": -53.219154357910156, "step": 865 }, { "epoch": 0.7021791767554479, "grad_norm": 1.8948995150758208e-10, "learning_rate": 4.660891086707831e-06, "logits/chosen": -2.57234263420105, "logits/rejected": -2.423658847808838, "logps/chosen": -203.21090698242188, "logps/rejected": -633.3655395507812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.0435168743133545, "rewards/margins": 49.001197814941406, "rewards/rejected": -51.04471206665039, "step": 870 }, { "epoch": 0.7062146892655368, "grad_norm": 19.94390106201172, "learning_rate": 4.648595132213563e-06, "logits/chosen": -2.609067440032959, "logits/rejected": -2.2480998039245605, "logps/chosen": -201.28097534179688, "logps/rejected": -686.7906494140625, "loss": 0.1562, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2666256427764893, "rewards/margins": 55.4502067565918, "rewards/rejected": -57.716835021972656, "step": 875 }, { "epoch": 0.7102502017756255, "grad_norm": 8.626292569147154e-09, "learning_rate": 4.636256743672157e-06, "logits/chosen": -2.4962997436523438, "logits/rejected": -2.31247878074646, "logps/chosen": -231.38040161132812, "logps/rejected": -689.7939453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4950392246246338, "rewards/margins": 58.02406692504883, "rewards/rejected": -56.529029846191406, "step": 880 }, { "epoch": 0.7142857142857143, "grad_norm": 4.504759065770969e-11, "learning_rate": 4.623876431177086e-06, "logits/chosen": -2.4891765117645264, "logits/rejected": -2.2994093894958496, "logps/chosen": -203.67324829101562, "logps/rejected": -713.9566040039062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8619958758354187, "rewards/margins": 59.39702224731445, "rewards/rejected": -58.53502655029297, "step": 885 }, { "epoch": 0.718321226795803, "grad_norm": 4.849110268878576e-08, "learning_rate": 4.6114547065550425e-06, "logits/chosen": -2.671452045440674, "logits/rejected": -2.333718776702881, "logps/chosen": -204.56253051757812, "logps/rejected": -669.051513671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7003462314605713, "rewards/margins": 55.4231071472168, "rewards/rejected": -54.7227668762207, "step": 890 }, { "epoch": 0.7223567393058918, "grad_norm": 2.194093212892767e-06, "learning_rate": 4.5989920833447785e-06, "logits/chosen": -2.8131933212280273, "logits/rejected": -2.292783260345459, "logps/chosen": -174.85745239257812, "logps/rejected": -657.0664672851562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.11932434886693954, "rewards/margins": 53.56437301635742, "rewards/rejected": -53.683692932128906, "step": 895 }, { "epoch": 0.7263922518159807, "grad_norm": 3.4506694646552205e-05, "learning_rate": 4.586489076775873e-06, "logits/chosen": -3.1016008853912354, "logits/rejected": -2.4350528717041016, "logps/chosen": -187.2667694091797, "logps/rejected": -675.8737182617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.0020270347595215, "rewards/margins": 56.20416259765625, "rewards/rejected": -54.20212936401367, "step": 900 }, { "epoch": 0.7304277643260694, "grad_norm": 1.0260672752337996e-05, "learning_rate": 4.573946203747435e-06, "logits/chosen": -2.9298782348632812, "logits/rejected": -2.3083486557006836, "logps/chosen": -166.62506103515625, "logps/rejected": -681.3416748046875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.9778293371200562, "rewards/margins": 55.5833740234375, "rewards/rejected": -54.60554122924805, "step": 905 }, { "epoch": 0.7344632768361582, "grad_norm": 2.3111564928512962e-07, "learning_rate": 4.561363982806732e-06, "logits/chosen": -2.90236234664917, "logits/rejected": -2.4039204120635986, "logps/chosen": -189.7134552001953, "logps/rejected": -606.6634521484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.16784986853599548, "rewards/margins": 46.644527435302734, "rewards/rejected": -46.47667694091797, "step": 910 }, { "epoch": 0.738498789346247, "grad_norm": 3.3985789738011363e-09, "learning_rate": 4.5487429341277515e-06, "logits/chosen": -2.9622409343719482, "logits/rejected": -2.5102641582489014, "logps/chosen": -184.97828674316406, "logps/rejected": -589.4735717773438, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 2.828808307647705, "rewards/margins": 50.12920379638672, "rewards/rejected": -47.300392150878906, "step": 915 }, { "epoch": 0.7425343018563357, "grad_norm": 1.0000504516938236e-05, "learning_rate": 4.536083579489699e-06, "logits/chosen": -2.6789345741271973, "logits/rejected": -2.5904479026794434, "logps/chosen": -176.30279541015625, "logps/rejected": -638.8072509765625, "loss": 0.0222, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.22145625948905945, "rewards/margins": 51.11101150512695, "rewards/rejected": -51.33246612548828, "step": 920 }, { "epoch": 0.7465698143664246, "grad_norm": 4.933500719062067e-09, "learning_rate": 4.523386442255421e-06, "logits/chosen": -2.7063870429992676, "logits/rejected": -2.3266043663024902, "logps/chosen": -188.4479217529297, "logps/rejected": -617.3933715820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.45334959030151367, "rewards/margins": 48.02790069580078, "rewards/rejected": -48.48124694824219, "step": 925 }, { "epoch": 0.7506053268765133, "grad_norm": 0.49318745732307434, "learning_rate": 4.510652047349773e-06, "logits/chosen": -2.4405064582824707, "logits/rejected": -2.2240099906921387, "logps/chosen": -162.41676330566406, "logps/rejected": -566.9041137695312, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 1.3360722064971924, "rewards/margins": 44.938411712646484, "rewards/rejected": -43.60233688354492, "step": 930 }, { "epoch": 0.7546408393866021, "grad_norm": 5.484377106768079e-05, "learning_rate": 4.4978809212379175e-06, "logits/chosen": -2.3808765411376953, "logits/rejected": -2.2029104232788086, "logps/chosen": -188.4193878173828, "logps/rejected": -627.7145385742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5347895622253418, "rewards/margins": 51.54859161376953, "rewards/rejected": -50.0137939453125, "step": 935 }, { "epoch": 0.7586763518966909, "grad_norm": 7.028051527413481e-07, "learning_rate": 4.485073591903557e-06, "logits/chosen": -2.585118055343628, "logits/rejected": -2.1795151233673096, "logps/chosen": -182.65345764160156, "logps/rejected": -587.3922729492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.837026596069336, "rewards/margins": 47.53644561767578, "rewards/rejected": -45.69942092895508, "step": 940 }, { "epoch": 0.7627118644067796, "grad_norm": 1.7969907162296295e-07, "learning_rate": 4.472230588827108e-06, "logits/chosen": -2.3025307655334473, "logits/rejected": -2.3886826038360596, "logps/chosen": -187.1461944580078, "logps/rejected": -635.5110473632812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.7026686668395996, "rewards/margins": 54.242637634277344, "rewards/rejected": -51.53997039794922, "step": 945 }, { "epoch": 0.7667473769168685, "grad_norm": 8.575025844947959e-07, "learning_rate": 4.459352442963808e-06, "logits/chosen": -2.5122103691101074, "logits/rejected": -2.4182677268981934, "logps/chosen": -182.31295776367188, "logps/rejected": -614.1731567382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.921123504638672, "rewards/margins": 51.941192626953125, "rewards/rejected": -49.02006149291992, "step": 950 }, { "epoch": 0.7707828894269573, "grad_norm": 0.0031333104707300663, "learning_rate": 4.44643968672177e-06, "logits/chosen": -2.5539796352386475, "logits/rejected": -2.2289395332336426, "logps/chosen": -193.9743194580078, "logps/rejected": -651.2032470703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.838069200515747, "rewards/margins": 52.74513626098633, "rewards/rejected": -50.907066345214844, "step": 955 }, { "epoch": 0.774818401937046, "grad_norm": 4.248950958251953, "learning_rate": 4.433492853939967e-06, "logits/chosen": -2.5147101879119873, "logits/rejected": -2.3647043704986572, "logps/chosen": -165.66586303710938, "logps/rejected": -631.9575805664062, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 1.06210196018219, "rewards/margins": 51.49007034301758, "rewards/rejected": -50.4279670715332, "step": 960 }, { "epoch": 0.7788539144471348, "grad_norm": 2.4190571821236517e-06, "learning_rate": 4.420512479866164e-06, "logits/chosen": -2.546480894088745, "logits/rejected": -2.14469575881958, "logps/chosen": -174.82803344726562, "logps/rejected": -641.9488525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.647111177444458, "rewards/margins": 54.09553146362305, "rewards/rejected": -51.44841766357422, "step": 965 }, { "epoch": 0.7828894269572235, "grad_norm": 3.868479703239558e-10, "learning_rate": 4.40749910113479e-06, "logits/chosen": -2.441559076309204, "logits/rejected": -2.171654224395752, "logps/chosen": -176.8037872314453, "logps/rejected": -580.6808471679688, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 1.0375621318817139, "rewards/margins": 47.11383056640625, "rewards/rejected": -46.076271057128906, "step": 970 }, { "epoch": 0.7869249394673123, "grad_norm": 7.022068757578381e-07, "learning_rate": 4.3944532557447516e-06, "logits/chosen": -2.349679470062256, "logits/rejected": -1.9809576272964478, "logps/chosen": -187.4682159423828, "logps/rejected": -715.4788208007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2204468250274658, "rewards/margins": 60.77362060546875, "rewards/rejected": -59.55317306518555, "step": 975 }, { "epoch": 0.7909604519774012, "grad_norm": 3.107275503566598e-08, "learning_rate": 4.3813754830371926e-06, "logits/chosen": -2.2297985553741455, "logits/rejected": -2.011171817779541, "logps/chosen": -167.58241271972656, "logps/rejected": -619.9654541015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2590758800506592, "rewards/margins": 51.189979553222656, "rewards/rejected": -49.930904388427734, "step": 980 }, { "epoch": 0.7949959644874899, "grad_norm": 1.5470044672838412e-05, "learning_rate": 4.368266323673193e-06, "logits/chosen": -2.0251142978668213, "logits/rejected": -2.029571056365967, "logps/chosen": -184.33131408691406, "logps/rejected": -604.8360595703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.659274697303772, "rewards/margins": 49.11655807495117, "rewards/rejected": -47.4572868347168, "step": 985 }, { "epoch": 0.7990314769975787, "grad_norm": 4.792344157067419e-07, "learning_rate": 4.355126319611424e-06, "logits/chosen": -2.333101749420166, "logits/rejected": -1.8617761135101318, "logps/chosen": -179.00552368164062, "logps/rejected": -635.9749145507812, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 3.2536518573760986, "rewards/margins": 54.14705276489258, "rewards/rejected": -50.893394470214844, "step": 990 }, { "epoch": 0.8030669895076675, "grad_norm": 1.1797761771958903e-06, "learning_rate": 4.341956014085732e-06, "logits/chosen": -2.154024839401245, "logits/rejected": -1.823242425918579, "logps/chosen": -178.6327362060547, "logps/rejected": -662.6701049804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.6403963565826416, "rewards/margins": 54.9346809387207, "rewards/rejected": -52.294281005859375, "step": 995 }, { "epoch": 0.8071025020177562, "grad_norm": 3.290809291733865e-10, "learning_rate": 4.32875595158269e-06, "logits/chosen": -2.140934944152832, "logits/rejected": -1.8849890232086182, "logps/chosen": -182.13467407226562, "logps/rejected": -626.0648193359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6655629873275757, "rewards/margins": 50.04166793823242, "rewards/rejected": -50.70723342895508, "step": 1000 }, { "epoch": 0.8071025020177562, "eval_logits/chosen": -2.379439115524292, "eval_logits/rejected": -1.850855827331543, "eval_logps/chosen": -125.99504089355469, "eval_logps/rejected": -697.2732543945312, "eval_loss": 0.00016668846365064383, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.7198677062988281, "eval_rewards/margins": 62.22418975830078, "eval_rewards/rejected": -62.94405746459961, "eval_runtime": 20.1907, "eval_samples_per_second": 9.906, "eval_steps_per_second": 9.906, "step": 1000 }, { "epoch": 0.8111380145278451, "grad_norm": 3.1391317634188454e-07, "learning_rate": 4.315526677819083e-06, "logits/chosen": -2.1817479133605957, "logits/rejected": -1.9132606983184814, "logps/chosen": -172.30282592773438, "logps/rejected": -711.1854248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.637314558029175, "rewards/margins": 60.403221130371094, "rewards/rejected": -57.765907287597656, "step": 1005 }, { "epoch": 0.8151735270379338, "grad_norm": 1.4970829909088934e-08, "learning_rate": 4.302268739719346e-06, "logits/chosen": -1.9033387899398804, "logits/rejected": -1.9521175622940063, "logps/chosen": -184.00924682617188, "logps/rejected": -605.9544067382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.257529616355896, "rewards/margins": 50.06651306152344, "rewards/rejected": -48.808990478515625, "step": 1010 }, { "epoch": 0.8192090395480226, "grad_norm": 4.846892220911059e-09, "learning_rate": 4.288982685392957e-06, "logits/chosen": -1.7928422689437866, "logits/rejected": -1.8994872570037842, "logps/chosen": -167.9226531982422, "logps/rejected": -612.1998291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.842020034790039, "rewards/margins": 52.71624755859375, "rewards/rejected": -49.87422180175781, "step": 1015 }, { "epoch": 0.8232445520581114, "grad_norm": 2.5761608402063985e-09, "learning_rate": 4.275669064111772e-06, "logits/chosen": -2.256682872772217, "logits/rejected": -1.8658664226531982, "logps/chosen": -169.67526245117188, "logps/rejected": -665.5009765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.51968252658844, "rewards/margins": 55.70459747314453, "rewards/rejected": -54.184913635253906, "step": 1020 }, { "epoch": 0.8272800645682001, "grad_norm": 6.771325899990188e-08, "learning_rate": 4.262328426287321e-06, "logits/chosen": -1.9884824752807617, "logits/rejected": -1.9168665409088135, "logps/chosen": -183.7039337158203, "logps/rejected": -705.42626953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1033751964569092, "rewards/margins": 58.6589241027832, "rewards/rejected": -57.55554962158203, "step": 1025 }, { "epoch": 0.831315577078289, "grad_norm": 3.3713085656472686e-08, "learning_rate": 4.248961323448052e-06, "logits/chosen": -1.743556261062622, "logits/rejected": -1.842688798904419, "logps/chosen": -182.79415893554688, "logps/rejected": -567.0283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6643765568733215, "rewards/margins": 45.839515686035156, "rewards/rejected": -45.17513656616211, "step": 1030 }, { "epoch": 0.8353510895883777, "grad_norm": 1.194185642816592e-05, "learning_rate": 4.235568308216528e-06, "logits/chosen": -2.157599925994873, "logits/rejected": -1.8653596639633179, "logps/chosen": -167.60403442382812, "logps/rejected": -597.2544555664062, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.11773920059204102, "rewards/margins": 47.51897430419922, "rewards/rejected": -47.63671112060547, "step": 1035 }, { "epoch": 0.8393866020984665, "grad_norm": 2.299746029166272e-06, "learning_rate": 4.222149934286583e-06, "logits/chosen": -2.023226737976074, "logits/rejected": -1.9555327892303467, "logps/chosen": -190.4853515625, "logps/rejected": -654.0498046875, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2701537609100342, "rewards/margins": 53.589637756347656, "rewards/rejected": -52.319480895996094, "step": 1040 }, { "epoch": 0.8434221146085553, "grad_norm": 4.4419007139140376e-08, "learning_rate": 4.208706756400428e-06, "logits/chosen": -2.001154661178589, "logits/rejected": -2.0954368114471436, "logps/chosen": -183.1996307373047, "logps/rejected": -638.4387817382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.482452154159546, "rewards/margins": 52.3235969543457, "rewards/rejected": -50.84114456176758, "step": 1045 }, { "epoch": 0.847457627118644, "grad_norm": 5.339713116114808e-10, "learning_rate": 4.19523933032572e-06, "logits/chosen": -2.2489967346191406, "logits/rejected": -2.1265571117401123, "logps/chosen": -166.90223693847656, "logps/rejected": -682.5925903320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.19275008141994476, "rewards/margins": 55.21764373779297, "rewards/rejected": -55.02489471435547, "step": 1050 }, { "epoch": 0.8514931396287329, "grad_norm": 0.0022136280313134193, "learning_rate": 4.181748212832586e-06, "logits/chosen": -2.0977559089660645, "logits/rejected": -2.023172378540039, "logps/chosen": -198.75535583496094, "logps/rejected": -686.7396850585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4131340980529785, "rewards/margins": 55.39250946044922, "rewards/rejected": -54.97937774658203, "step": 1055 }, { "epoch": 0.8555286521388217, "grad_norm": 6.417801845515214e-09, "learning_rate": 4.1682339616706e-06, "logits/chosen": -2.1305503845214844, "logits/rejected": -2.1038641929626465, "logps/chosen": -196.37271118164062, "logps/rejected": -737.64111328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.0135843753814697, "rewards/margins": 59.15848922729492, "rewards/rejected": -60.17206954956055, "step": 1060 }, { "epoch": 0.8595641646489104, "grad_norm": 2.3556165160698583e-06, "learning_rate": 4.1546971355457294e-06, "logits/chosen": -2.192225456237793, "logits/rejected": -2.104820489883423, "logps/chosen": -198.4217529296875, "logps/rejected": -733.4817504882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.7952780723571777, "rewards/margins": 62.16957473754883, "rewards/rejected": -59.374298095703125, "step": 1065 }, { "epoch": 0.8635996771589992, "grad_norm": 7.999670401659387e-07, "learning_rate": 4.141138294097237e-06, "logits/chosen": -2.0236148834228516, "logits/rejected": -2.0797247886657715, "logps/chosen": -184.32025146484375, "logps/rejected": -650.283935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8974075317382812, "rewards/margins": 54.49199295043945, "rewards/rejected": -53.594581604003906, "step": 1070 }, { "epoch": 0.8676351896690879, "grad_norm": 4.578279549605213e-05, "learning_rate": 4.127557997874539e-06, "logits/chosen": -1.9749248027801514, "logits/rejected": -2.0946223735809326, "logps/chosen": -161.22415161132812, "logps/rejected": -631.509521484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 1.3806092739105225, "rewards/margins": 51.45856857299805, "rewards/rejected": -50.077964782714844, "step": 1075 }, { "epoch": 0.8716707021791767, "grad_norm": 1.05284947427009e-10, "learning_rate": 4.1139568083140366e-06, "logits/chosen": -2.2407312393188477, "logits/rejected": -2.142991065979004, "logps/chosen": -188.42117309570312, "logps/rejected": -676.03369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0743017196655273, "rewards/margins": 55.58829879760742, "rewards/rejected": -54.513999938964844, "step": 1080 }, { "epoch": 0.8757062146892656, "grad_norm": 1.772748987605155e-06, "learning_rate": 4.100335287715905e-06, "logits/chosen": -2.0626769065856934, "logits/rejected": -2.1056411266326904, "logps/chosen": -174.00631713867188, "logps/rejected": -678.2622680664062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1672593355178833, "rewards/margins": 56.831214904785156, "rewards/rejected": -55.6639518737793, "step": 1085 }, { "epoch": 0.8797417271993543, "grad_norm": 3.814287993009202e-06, "learning_rate": 4.08669399922084e-06, "logits/chosen": -2.020699977874756, "logits/rejected": -2.086871385574341, "logps/chosen": -209.2863311767578, "logps/rejected": -674.4232177734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2035471200942993, "rewards/margins": 55.49235916137695, "rewards/rejected": -54.2888069152832, "step": 1090 }, { "epoch": 0.8837772397094431, "grad_norm": 0.09039060771465302, "learning_rate": 4.073033506786788e-06, "logits/chosen": -2.1619884967803955, "logits/rejected": -2.1178927421569824, "logps/chosen": -176.5596160888672, "logps/rejected": -659.6011352539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.19374194741249084, "rewards/margins": 54.108497619628906, "rewards/rejected": -53.91475296020508, "step": 1095 }, { "epoch": 0.8878127522195319, "grad_norm": 4.4102868912432314e-08, "learning_rate": 4.059354375165618e-06, "logits/chosen": -2.0466697216033936, "logits/rejected": -2.1500320434570312, "logps/chosen": -172.9442138671875, "logps/rejected": -650.6768798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.564643144607544, "rewards/margins": 55.702110290527344, "rewards/rejected": -53.137474060058594, "step": 1100 }, { "epoch": 0.8918482647296206, "grad_norm": 8.463582901185873e-08, "learning_rate": 4.0456571698797835e-06, "logits/chosen": -2.18721604347229, "logits/rejected": -2.122054100036621, "logps/chosen": -169.60687255859375, "logps/rejected": -638.73291015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3486313819885254, "rewards/margins": 51.44000244140625, "rewards/rejected": -51.78863525390625, "step": 1105 }, { "epoch": 0.8958837772397095, "grad_norm": 1.1218524997858026e-09, "learning_rate": 4.031942457198939e-06, "logits/chosen": -2.301440477371216, "logits/rejected": -2.158604145050049, "logps/chosen": -190.29519653320312, "logps/rejected": -698.674072265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6846076250076294, "rewards/margins": 58.42401885986328, "rewards/rejected": -57.739402770996094, "step": 1110 }, { "epoch": 0.8999192897497982, "grad_norm": 1.2628167400663415e-09, "learning_rate": 4.018210804116528e-06, "logits/chosen": -2.2120747566223145, "logits/rejected": -2.058357000350952, "logps/chosen": -189.54153442382812, "logps/rejected": -742.196044921875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.520849883556366, "rewards/margins": 60.82427215576172, "rewards/rejected": -60.303428649902344, "step": 1115 }, { "epoch": 0.903954802259887, "grad_norm": 5.303474673312714e-12, "learning_rate": 4.0044627783263455e-06, "logits/chosen": -2.1777002811431885, "logits/rejected": -2.0809192657470703, "logps/chosen": -176.7324676513672, "logps/rejected": -746.9542236328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.5138826370239258, "rewards/margins": 63.670921325683594, "rewards/rejected": -62.15704345703125, "step": 1120 }, { "epoch": 0.9079903147699758, "grad_norm": 9.253559483113349e-07, "learning_rate": 3.990698948199065e-06, "logits/chosen": -2.3026418685913086, "logits/rejected": -2.166304349899292, "logps/chosen": -188.59747314453125, "logps/rejected": -738.2992553710938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5834299325942993, "rewards/margins": 59.159645080566406, "rewards/rejected": -60.743072509765625, "step": 1125 }, { "epoch": 0.9120258272800645, "grad_norm": 7.71840238571167, "learning_rate": 3.976919882758746e-06, "logits/chosen": -2.179295778274536, "logits/rejected": -2.163120746612549, "logps/chosen": -199.29782104492188, "logps/rejected": -711.28759765625, "loss": 0.0258, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5043805241584778, "rewards/margins": 59.02302932739258, "rewards/rejected": -59.52741241455078, "step": 1130 }, { "epoch": 0.9160613397901534, "grad_norm": 3.4694976806640625, "learning_rate": 3.9631261516593e-06, "logits/chosen": -2.163170337677002, "logits/rejected": -2.0628504753112793, "logps/chosen": -162.55255126953125, "logps/rejected": -610.28173828125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 1.3925834894180298, "rewards/margins": 51.5882568359375, "rewards/rejected": -50.19567108154297, "step": 1135 }, { "epoch": 0.9200968523002422, "grad_norm": 0.01892303302884102, "learning_rate": 3.949318325160952e-06, "logits/chosen": -2.2075467109680176, "logits/rejected": -2.0947699546813965, "logps/chosen": -183.17608642578125, "logps/rejected": -695.413330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.8907127380371094, "rewards/margins": 60.553489685058594, "rewards/rejected": -57.66277313232422, "step": 1140 }, { "epoch": 0.9241323648103309, "grad_norm": 3.53829118782123e-11, "learning_rate": 3.935496974106658e-06, "logits/chosen": -2.178450345993042, "logits/rejected": -2.00243878364563, "logps/chosen": -177.21621704101562, "logps/rejected": -710.5032348632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.4890907406806946, "rewards/margins": 57.20867919921875, "rewards/rejected": -56.7195930480957, "step": 1145 }, { "epoch": 0.9281678773204197, "grad_norm": 2.4448384289277314e-11, "learning_rate": 3.9216626698985004e-06, "logits/chosen": -2.2597270011901855, "logits/rejected": -2.0258102416992188, "logps/chosen": -196.44766235351562, "logps/rejected": -661.3603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.49641603231430054, "rewards/margins": 53.4591178894043, "rewards/rejected": -52.96270751953125, "step": 1150 }, { "epoch": 0.9322033898305084, "grad_norm": 1.4270384873782405e-10, "learning_rate": 3.907815984474077e-06, "logits/chosen": -2.019181728363037, "logits/rejected": -1.9931190013885498, "logps/chosen": -158.64462280273438, "logps/rejected": -684.2193603515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6224596500396729, "rewards/margins": 57.61418533325195, "rewards/rejected": -55.991722106933594, "step": 1155 }, { "epoch": 0.9362389023405973, "grad_norm": 7.67189067119034e-06, "learning_rate": 3.893957490282847e-06, "logits/chosen": -2.2452640533447266, "logits/rejected": -1.885061264038086, "logps/chosen": -179.76832580566406, "logps/rejected": -712.2828369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5476454496383667, "rewards/margins": 59.20948028564453, "rewards/rejected": -58.661842346191406, "step": 1160 }, { "epoch": 0.9402744148506861, "grad_norm": 2.579972069316483e-10, "learning_rate": 3.880087760262468e-06, "logits/chosen": -2.2836661338806152, "logits/rejected": -1.8482755422592163, "logps/chosen": -178.68569946289062, "logps/rejected": -794.9833984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4355398118495941, "rewards/margins": 65.51766204833984, "rewards/rejected": -65.08211517333984, "step": 1165 }, { "epoch": 0.9443099273607748, "grad_norm": 2.1680438777593736e-08, "learning_rate": 3.8662073678151105e-06, "logits/chosen": -2.3062100410461426, "logits/rejected": -1.8179759979248047, "logps/chosen": -173.17926025390625, "logps/rejected": -740.5164794921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8405473232269287, "rewards/margins": 63.468177795410156, "rewards/rejected": -61.62762451171875, "step": 1170 }, { "epoch": 0.9483454398708636, "grad_norm": 1.0691678445362562e-11, "learning_rate": 3.852316886783747e-06, "logits/chosen": -2.209916353225708, "logits/rejected": -1.870365858078003, "logps/chosen": -214.1045684814453, "logps/rejected": -717.9239501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5090707540512085, "rewards/margins": 58.77640914916992, "rewards/rejected": -59.285484313964844, "step": 1175 }, { "epoch": 0.9523809523809523, "grad_norm": 2.7414895006905304e-11, "learning_rate": 3.838416891428434e-06, "logits/chosen": -2.3161187171936035, "logits/rejected": -1.9169588088989258, "logps/chosen": -203.76986694335938, "logps/rejected": -825.1571044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5773431658744812, "rewards/margins": 70.10179138183594, "rewards/rejected": -69.52444458007812, "step": 1180 }, { "epoch": 0.9564164648910412, "grad_norm": 7.717453898159476e-10, "learning_rate": 3.824507956402571e-06, "logits/chosen": -2.3269741535186768, "logits/rejected": -1.9442075490951538, "logps/chosen": -207.26571655273438, "logps/rejected": -768.75, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7107122540473938, "rewards/margins": 65.07723236083984, "rewards/rejected": -64.36652374267578, "step": 1185 }, { "epoch": 0.96045197740113, "grad_norm": 0.0001168127273558639, "learning_rate": 3.810590656729139e-06, "logits/chosen": -2.1608777046203613, "logits/rejected": -1.803821325302124, "logps/chosen": -160.6416778564453, "logps/rejected": -666.4202270507812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.943156361579895, "rewards/margins": 55.78478240966797, "rewards/rejected": -54.84162521362305, "step": 1190 }, { "epoch": 0.9644874899112187, "grad_norm": 1.654362237479834e-11, "learning_rate": 3.796665567776931e-06, "logits/chosen": -2.272247314453125, "logits/rejected": -1.8698394298553467, "logps/chosen": -183.6510009765625, "logps/rejected": -720.2318115234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8692010641098022, "rewards/margins": 61.564170837402344, "rewards/rejected": -59.694969177246094, "step": 1195 }, { "epoch": 0.9685230024213075, "grad_norm": 5.109391167934518e-06, "learning_rate": 3.7827332652367644e-06, "logits/chosen": -2.2764999866485596, "logits/rejected": -1.8040711879730225, "logps/chosen": -179.1069793701172, "logps/rejected": -694.538330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7285881042480469, "rewards/margins": 59.34184646606445, "rewards/rejected": -58.613258361816406, "step": 1200 }, { "epoch": 0.9725585149313963, "grad_norm": 1.2832400798797607, "learning_rate": 3.7687943250976793e-06, "logits/chosen": -2.106998920440674, "logits/rejected": -1.879958152770996, "logps/chosen": -198.06640625, "logps/rejected": -683.0198974609375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.4126468598842621, "rewards/margins": 55.87993621826172, "rewards/rejected": -55.46729278564453, "step": 1205 }, { "epoch": 0.976594027441485, "grad_norm": 1.8256288925044828e-08, "learning_rate": 3.7548493236231294e-06, "logits/chosen": -2.120365858078003, "logits/rejected": -1.8410284519195557, "logps/chosen": -182.54891967773438, "logps/rejected": -713.6632080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.14251920580863953, "rewards/margins": 58.67591094970703, "rewards/rejected": -58.81842803955078, "step": 1210 }, { "epoch": 0.9806295399515739, "grad_norm": 3.6671007896948993e-10, "learning_rate": 3.7408988373271533e-06, "logits/chosen": -2.147479295730591, "logits/rejected": -1.9329942464828491, "logps/chosen": -196.52110290527344, "logps/rejected": -720.7496337890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.416882276535034, "rewards/margins": 61.195594787597656, "rewards/rejected": -58.778709411621094, "step": 1215 }, { "epoch": 0.9846650524616626, "grad_norm": 2.1985749754094286e-07, "learning_rate": 3.7269434429505464e-06, "logits/chosen": -2.446967363357544, "logits/rejected": -1.85018789768219, "logps/chosen": -194.4342041015625, "logps/rejected": -732.8348999023438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.70732581615448, "rewards/margins": 60.27684783935547, "rewards/rejected": -60.98417282104492, "step": 1220 }, { "epoch": 0.9887005649717514, "grad_norm": 1.7377091126036248e-06, "learning_rate": 3.7129837174370105e-06, "logits/chosen": -2.106842517852783, "logits/rejected": -1.8992770910263062, "logps/chosen": -168.92874145507812, "logps/rejected": -728.6680908203125, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8658353090286255, "rewards/margins": 61.28117752075195, "rewards/rejected": -60.41533660888672, "step": 1225 }, { "epoch": 0.9927360774818402, "grad_norm": 1.408772959621274e-06, "learning_rate": 3.6990202379093078e-06, "logits/chosen": -2.2912869453430176, "logits/rejected": -1.9618747234344482, "logps/chosen": -176.91891479492188, "logps/rejected": -719.5164794921875, "loss": 0.1174, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.864270806312561, "rewards/margins": 61.191001892089844, "rewards/rejected": -59.32673263549805, "step": 1230 }, { "epoch": 0.9967715899919289, "grad_norm": 3.4731894443806866e-11, "learning_rate": 3.685053581645398e-06, "logits/chosen": -2.216810703277588, "logits/rejected": -1.87371027469635, "logps/chosen": -206.3554229736328, "logps/rejected": -722.0701904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.018198013305664062, "rewards/margins": 60.195518493652344, "rewards/rejected": -60.17732620239258, "step": 1235 }, { "epoch": 0.9991928974979822, "eval_logits/chosen": -2.209681510925293, "eval_logits/rejected": -1.4488617181777954, "eval_logps/chosen": -130.8629913330078, "eval_logps/rejected": -836.0968627929688, "eval_loss": 5.865772254765034e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.2066632509231567, "eval_rewards/margins": 75.6197509765625, "eval_rewards/rejected": -76.826416015625, "eval_runtime": 20.1088, "eval_samples_per_second": 9.946, "eval_steps_per_second": 9.946, "step": 1238 }, { "epoch": 1.0008071025020178, "grad_norm": 2.323751857602474e-09, "learning_rate": 3.6710843260545743e-06, "logits/chosen": -2.278907299041748, "logits/rejected": -1.7310543060302734, "logps/chosen": -201.2460479736328, "logps/rejected": -800.8663940429688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1261701583862305, "rewards/margins": 66.39605712890625, "rewards/rejected": -67.522216796875, "step": 1240 }, { "epoch": 1.0048426150121066, "grad_norm": 5.779334969702177e-05, "learning_rate": 3.6571130486535876e-06, "logits/chosen": -2.162686824798584, "logits/rejected": -1.8121849298477173, "logps/chosen": -228.5120391845703, "logps/rejected": -759.7088623046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.17988328635692596, "rewards/margins": 62.44136428833008, "rewards/rejected": -62.621253967285156, "step": 1245 }, { "epoch": 1.0088781275221954, "grad_norm": 1.5120324860617984e-05, "learning_rate": 3.6431403270427783e-06, "logits/chosen": -2.159062623977661, "logits/rejected": -1.780882477760315, "logps/chosen": -191.50076293945312, "logps/rejected": -664.8995361328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1759032011032104, "rewards/margins": 53.6419677734375, "rewards/rejected": -54.81787872314453, "step": 1250 }, { "epoch": 1.012913640032284, "grad_norm": 4.127455599522989e-10, "learning_rate": 3.6291667388821926e-06, "logits/chosen": -2.2117929458618164, "logits/rejected": -1.7112318277359009, "logps/chosen": -216.09521484375, "logps/rejected": -849.525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.589473009109497, "rewards/margins": 66.4390869140625, "rewards/rejected": -70.02855682373047, "step": 1255 }, { "epoch": 1.0169491525423728, "grad_norm": 1.227740104676675e-13, "learning_rate": 3.6151928618677018e-06, "logits/chosen": -2.2690131664276123, "logits/rejected": -1.681115746498108, "logps/chosen": -236.20126342773438, "logps/rejected": -822.6422119140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.6942756175994873, "rewards/margins": 66.62228393554688, "rewards/rejected": -69.31656646728516, "step": 1260 }, { "epoch": 1.0209846650524617, "grad_norm": 7.971179966093089e-11, "learning_rate": 3.6012192737071157e-06, "logits/chosen": -2.1275594234466553, "logits/rejected": -1.6766866445541382, "logps/chosen": -205.51708984375, "logps/rejected": -749.0113525390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2373658418655396, "rewards/margins": 60.467674255371094, "rewards/rejected": -61.705039978027344, "step": 1265 }, { "epoch": 1.0250201775625505, "grad_norm": 7.237382221303434e-11, "learning_rate": 3.5872465520963073e-06, "logits/chosen": -2.261181354522705, "logits/rejected": -1.7118221521377563, "logps/chosen": -197.03875732421875, "logps/rejected": -745.578369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.016669988632202, "rewards/margins": 61.487953186035156, "rewards/rejected": -63.50461959838867, "step": 1270 }, { "epoch": 1.0290556900726393, "grad_norm": 3.946234450324937e-09, "learning_rate": 3.5732752746953205e-06, "logits/chosen": -2.1948084831237793, "logits/rejected": -1.6964082717895508, "logps/chosen": -194.67660522460938, "logps/rejected": -795.627685546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.5028276443481445, "rewards/margins": 64.4461441040039, "rewards/rejected": -66.948974609375, "step": 1275 }, { "epoch": 1.033091202582728, "grad_norm": 1.507763636166004e-10, "learning_rate": 3.559306019104496e-06, "logits/chosen": -2.266756534576416, "logits/rejected": -1.7250194549560547, "logps/chosen": -272.2687072753906, "logps/rejected": -894.3087158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -5.041467189788818, "rewards/margins": 69.62022399902344, "rewards/rejected": -74.66169738769531, "step": 1280 }, { "epoch": 1.0371267150928167, "grad_norm": 4.517583462870789e-08, "learning_rate": 3.545339362840586e-06, "logits/chosen": -2.2050442695617676, "logits/rejected": -1.744755744934082, "logps/chosen": -229.52999877929688, "logps/rejected": -892.4937744140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.112619400024414, "rewards/margins": 73.20848083496094, "rewards/rejected": -76.32109069824219, "step": 1285 }, { "epoch": 1.0411622276029056, "grad_norm": 2.1512201783480123e-05, "learning_rate": 3.531375883312884e-06, "logits/chosen": -2.1128950119018555, "logits/rejected": -1.6664345264434814, "logps/chosen": -194.02040100097656, "logps/rejected": -813.6095581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.945126533508301, "rewards/margins": 66.22413635253906, "rewards/rejected": -69.16926574707031, "step": 1290 }, { "epoch": 1.0451977401129944, "grad_norm": 3.2355671475414738e-09, "learning_rate": 3.5174161577993484e-06, "logits/chosen": -2.1505820751190186, "logits/rejected": -1.7269680500030518, "logps/chosen": -202.79611206054688, "logps/rejected": -729.53662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.0591306686401367, "rewards/margins": 58.82953643798828, "rewards/rejected": -60.88866424560547, "step": 1295 }, { "epoch": 1.0492332526230832, "grad_norm": 6.8008164204325094e-12, "learning_rate": 3.5034607634227415e-06, "logits/chosen": -2.138249397277832, "logits/rejected": -1.6513065099716187, "logps/chosen": -201.3883819580078, "logps/rejected": -711.9617919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.5593791007995605, "rewards/margins": 58.1724853515625, "rewards/rejected": -59.73186492919922, "step": 1300 }, { "epoch": 1.053268765133172, "grad_norm": 1.2778273994484834e-09, "learning_rate": 3.489510277126766e-06, "logits/chosen": -2.17864990234375, "logits/rejected": -1.7530752420425415, "logps/chosen": -221.6562042236328, "logps/rejected": -764.3087158203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.643620252609253, "rewards/margins": 62.18964385986328, "rewards/rejected": -64.83326721191406, "step": 1305 }, { "epoch": 1.0573042776432606, "grad_norm": 1.4308298545984144e-07, "learning_rate": 3.4755652756522155e-06, "logits/chosen": -2.199171543121338, "logits/rejected": -1.765489935874939, "logps/chosen": -238.99496459960938, "logps/rejected": -883.7185668945312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9525084495544434, "rewards/margins": 70.1186752319336, "rewards/rejected": -74.07118225097656, "step": 1310 }, { "epoch": 1.0613397901533494, "grad_norm": 1.7411944282230696e-12, "learning_rate": 3.4616263355131304e-06, "logits/chosen": -2.1380667686462402, "logits/rejected": -1.6777088642120361, "logps/chosen": -203.59722900390625, "logps/rejected": -849.4735107421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.685452938079834, "rewards/margins": 67.57633972167969, "rewards/rejected": -71.26179504394531, "step": 1315 }, { "epoch": 1.0653753026634383, "grad_norm": 8.175649782060646e-06, "learning_rate": 3.447694032972964e-06, "logits/chosen": -2.1039626598358154, "logits/rejected": -1.660135269165039, "logps/chosen": -187.55615234375, "logps/rejected": -729.5601806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.342456579208374, "rewards/margins": 60.368690490722656, "rewards/rejected": -61.711143493652344, "step": 1320 }, { "epoch": 1.069410815173527, "grad_norm": 1.437656327318848e-09, "learning_rate": 3.4337689440207558e-06, "logits/chosen": -2.1757853031158447, "logits/rejected": -1.6718976497650146, "logps/chosen": -214.58181762695312, "logps/rejected": -868.5672607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.56650972366333, "rewards/margins": 70.9485855102539, "rewards/rejected": -73.51509094238281, "step": 1325 }, { "epoch": 1.073446327683616, "grad_norm": 2.5345187060210606e-10, "learning_rate": 3.4198516443473225e-06, "logits/chosen": -2.0571486949920654, "logits/rejected": -1.8012806177139282, "logps/chosen": -210.75216674804688, "logps/rejected": -836.03515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.150726556777954, "rewards/margins": 69.8203353881836, "rewards/rejected": -70.9710693359375, "step": 1330 }, { "epoch": 1.0774818401937045, "grad_norm": 1.144857863025206e-12, "learning_rate": 3.40594270932146e-06, "logits/chosen": -2.1810765266418457, "logits/rejected": -1.7420415878295898, "logps/chosen": -241.93310546875, "logps/rejected": -828.6292724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.558713436126709, "rewards/margins": 66.0389633178711, "rewards/rejected": -69.59767150878906, "step": 1335 }, { "epoch": 1.0815173527037933, "grad_norm": 3.5607028935590035e-13, "learning_rate": 3.3920427139661475e-06, "logits/chosen": -2.1211180686950684, "logits/rejected": -1.7247521877288818, "logps/chosen": -221.8562469482422, "logps/rejected": -776.215087890625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.5397307872772217, "rewards/margins": 62.453857421875, "rewards/rejected": -64.99358367919922, "step": 1340 }, { "epoch": 1.0855528652138822, "grad_norm": 1.3657011285750364e-11, "learning_rate": 3.378152232934784e-06, "logits/chosen": -2.1988587379455566, "logits/rejected": -1.6753028631210327, "logps/chosen": -175.64633178710938, "logps/rejected": -709.5958251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.2454771101474762, "rewards/margins": 59.23875045776367, "rewards/rejected": -59.484230041503906, "step": 1345 }, { "epoch": 1.089588377723971, "grad_norm": 8.035852872012583e-09, "learning_rate": 3.3642718404874256e-06, "logits/chosen": -2.156851291656494, "logits/rejected": -1.7729225158691406, "logps/chosen": -225.20248413085938, "logps/rejected": -882.4959106445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3392541408538818, "rewards/margins": 73.0105972290039, "rewards/rejected": -74.349853515625, "step": 1350 }, { "epoch": 1.0936238902340598, "grad_norm": 0.2016749531030655, "learning_rate": 3.3504021104670467e-06, "logits/chosen": -2.0888161659240723, "logits/rejected": -1.6832849979400635, "logps/chosen": -202.3789825439453, "logps/rejected": -748.2049560546875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.08698773384094238, "rewards/margins": 63.5738639831543, "rewards/rejected": -63.48687744140625, "step": 1355 }, { "epoch": 1.0976594027441484, "grad_norm": 0.0002923495776485652, "learning_rate": 3.336543616275817e-06, "logits/chosen": -2.1031010150909424, "logits/rejected": -1.7796859741210938, "logps/chosen": -224.6013641357422, "logps/rejected": -855.5428466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.2459685802459717, "rewards/margins": 68.72005462646484, "rewards/rejected": -70.96601867675781, "step": 1360 }, { "epoch": 1.1016949152542372, "grad_norm": 1.1111069703195753e-10, "learning_rate": 3.322696930851394e-06, "logits/chosen": -2.19992995262146, "logits/rejected": -1.6886310577392578, "logps/chosen": -204.8900146484375, "logps/rejected": -803.8555908203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.0974869504570961, "rewards/margins": 68.17869567871094, "rewards/rejected": -68.08121490478516, "step": 1365 }, { "epoch": 1.105730427764326, "grad_norm": 7.602197273385777e-14, "learning_rate": 3.308862626643237e-06, "logits/chosen": -2.0778870582580566, "logits/rejected": -1.747521162033081, "logps/chosen": -217.87808227539062, "logps/rejected": -815.7610473632812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.6138692498207092, "rewards/margins": 66.84326934814453, "rewards/rejected": -67.45713806152344, "step": 1370 }, { "epoch": 1.1097659402744149, "grad_norm": 6.315608516160864e-07, "learning_rate": 3.2950412755889417e-06, "logits/chosen": -2.1990807056427, "logits/rejected": -1.758458137512207, "logps/chosen": -217.1395721435547, "logps/rejected": -814.7703857421875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.1943022012710571, "rewards/margins": 66.58109283447266, "rewards/rejected": -67.77540588378906, "step": 1375 }, { "epoch": 1.1138014527845037, "grad_norm": 0.0003513034898787737, "learning_rate": 3.281233449090594e-06, "logits/chosen": -2.1407878398895264, "logits/rejected": -1.6745860576629639, "logps/chosen": -195.0934295654297, "logps/rejected": -757.1023559570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.3091113567352295, "rewards/margins": 61.68535232543945, "rewards/rejected": -63.99445724487305, "step": 1380 }, { "epoch": 1.1178369652945923, "grad_norm": 1.1653614705176008e-11, "learning_rate": 3.267439717991149e-06, "logits/chosen": -2.158762216567993, "logits/rejected": -1.6578963994979858, "logps/chosen": -214.9634552001953, "logps/rejected": -761.684814453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7052465677261353, "rewards/margins": 64.6738510131836, "rewards/rejected": -63.968605041503906, "step": 1385 }, { "epoch": 1.1218724778046811, "grad_norm": 2.7098473204950624e-13, "learning_rate": 3.253660652550829e-06, "logits/chosen": -2.163377285003662, "logits/rejected": -1.6357390880584717, "logps/chosen": -206.15908813476562, "logps/rejected": -732.7711181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3394851684570312, "rewards/margins": 59.24883270263672, "rewards/rejected": -60.58831787109375, "step": 1390 }, { "epoch": 1.12590799031477, "grad_norm": 5.773073041837051e-08, "learning_rate": 3.239896822423549e-06, "logits/chosen": -1.9345413446426392, "logits/rejected": -1.6593739986419678, "logps/chosen": -192.53176879882812, "logps/rejected": -735.7793579101562, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.2152098417282104, "rewards/margins": 60.189491271972656, "rewards/rejected": -61.404701232910156, "step": 1395 }, { "epoch": 1.1299435028248588, "grad_norm": 2.9957369918065524e-08, "learning_rate": 3.2261487966333667e-06, "logits/chosen": -2.1983189582824707, "logits/rejected": -1.6325523853302002, "logps/chosen": -193.6102294921875, "logps/rejected": -812.3824462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6520646810531616, "rewards/margins": 69.5002670288086, "rewards/rejected": -68.84821319580078, "step": 1400 }, { "epoch": 1.1339790153349476, "grad_norm": 1.2821706196763927e-10, "learning_rate": 3.2124171435509555e-06, "logits/chosen": -2.055478811264038, "logits/rejected": -1.6176866292953491, "logps/chosen": -214.99960327148438, "logps/rejected": -765.0739135742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1195714473724365, "rewards/margins": 62.28435134887695, "rewards/rejected": -63.40392303466797, "step": 1405 }, { "epoch": 1.1380145278450362, "grad_norm": 6.468743762866325e-09, "learning_rate": 3.1987024308701113e-06, "logits/chosen": -1.9125087261199951, "logits/rejected": -1.5537140369415283, "logps/chosen": -198.95980834960938, "logps/rejected": -752.4078979492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.06110353395342827, "rewards/margins": 61.86179733276367, "rewards/rejected": -61.800697326660156, "step": 1410 }, { "epoch": 1.142050040355125, "grad_norm": 4.491868423883716e-07, "learning_rate": 3.1850052255842763e-06, "logits/chosen": -2.0721840858459473, "logits/rejected": -1.6219673156738281, "logps/chosen": -171.20423889160156, "logps/rejected": -761.6956176757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.022668171674013138, "rewards/margins": 63.488075256347656, "rewards/rejected": -63.46540069580078, "step": 1415 }, { "epoch": 1.1460855528652139, "grad_norm": 7.394192480525419e-10, "learning_rate": 3.1713260939631067e-06, "logits/chosen": -2.0364065170288086, "logits/rejected": -1.6523240804672241, "logps/chosen": -226.2953338623047, "logps/rejected": -873.6077880859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3698582649230957, "rewards/margins": 72.21980285644531, "rewards/rejected": -73.58966064453125, "step": 1420 }, { "epoch": 1.1501210653753027, "grad_norm": 1.311306959905778e-06, "learning_rate": 3.157665601529054e-06, "logits/chosen": -2.0571930408477783, "logits/rejected": -1.6559698581695557, "logps/chosen": -222.9697723388672, "logps/rejected": -764.7931518554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.691524028778076, "rewards/margins": 66.43643188476562, "rewards/rejected": -63.744903564453125, "step": 1425 }, { "epoch": 1.1541565778853915, "grad_norm": 4.2617276108103397e-07, "learning_rate": 3.14402431303399e-06, "logits/chosen": -1.9636306762695312, "logits/rejected": -1.644447684288025, "logps/chosen": -194.5983428955078, "logps/rejected": -738.1820068359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1158192157745361, "rewards/margins": 60.06477737426758, "rewards/rejected": -61.18059539794922, "step": 1430 }, { "epoch": 1.1581920903954803, "grad_norm": 1.912082471522414e-11, "learning_rate": 3.130402792435858e-06, "logits/chosen": -2.0383236408233643, "logits/rejected": -1.5814425945281982, "logps/chosen": -213.10678100585938, "logps/rejected": -719.7506103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34371060132980347, "rewards/margins": 59.15305709838867, "rewards/rejected": -58.809349060058594, "step": 1435 }, { "epoch": 1.162227602905569, "grad_norm": 2.267983489900871e-07, "learning_rate": 3.116801602875356e-06, "logits/chosen": -2.047435998916626, "logits/rejected": -1.6156845092773438, "logps/chosen": -192.9968719482422, "logps/rejected": -799.4122314453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5533458590507507, "rewards/margins": 66.81327819824219, "rewards/rejected": -67.36662292480469, "step": 1440 }, { "epoch": 1.1662631154156577, "grad_norm": 8.826805242279079e-08, "learning_rate": 3.103221306652658e-06, "logits/chosen": -2.0213122367858887, "logits/rejected": -1.6199147701263428, "logps/chosen": -201.2223358154297, "logps/rejected": -782.5101318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0487488508224487, "rewards/margins": 67.213623046875, "rewards/rejected": -66.16487121582031, "step": 1445 }, { "epoch": 1.1702986279257466, "grad_norm": 2.4668447971343994, "learning_rate": 3.089662465204165e-06, "logits/chosen": -1.9387918710708618, "logits/rejected": -1.5541040897369385, "logps/chosen": -218.3408203125, "logps/rejected": -766.6030883789062, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 0.6600626111030579, "rewards/margins": 63.663543701171875, "rewards/rejected": -63.00347900390625, "step": 1450 }, { "epoch": 1.1743341404358354, "grad_norm": 2.533456608944107e-07, "learning_rate": 3.0761256390792946e-06, "logits/chosen": -1.9399452209472656, "logits/rejected": -1.5922213792800903, "logps/chosen": -177.9163360595703, "logps/rejected": -750.4415283203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.23844614624977112, "rewards/margins": 63.09550857543945, "rewards/rejected": -62.85706329345703, "step": 1455 }, { "epoch": 1.1783696529459242, "grad_norm": 6.994316237296516e-11, "learning_rate": 3.062611387917309e-06, "logits/chosen": -1.8519392013549805, "logits/rejected": -1.5559769868850708, "logps/chosen": -194.449951171875, "logps/rejected": -731.7125854492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0375251770019531, "rewards/margins": 62.02744674682617, "rewards/rejected": -60.98992156982422, "step": 1460 }, { "epoch": 1.1824051654560128, "grad_norm": 6.968992494194026e-08, "learning_rate": 3.049120270424174e-06, "logits/chosen": -1.781243085861206, "logits/rejected": -1.5598220825195312, "logps/chosen": -200.5774688720703, "logps/rejected": -780.6895141601562, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.2801803648471832, "rewards/margins": 65.78279876708984, "rewards/rejected": -65.50260925292969, "step": 1465 }, { "epoch": 1.1864406779661016, "grad_norm": 1.6041022732338206e-08, "learning_rate": 3.0356528443494664e-06, "logits/chosen": -1.813118577003479, "logits/rejected": -1.5580575466156006, "logps/chosen": -180.57241821289062, "logps/rejected": -761.672607421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7057215571403503, "rewards/margins": 65.47200775146484, "rewards/rejected": -64.76628112792969, "step": 1470 }, { "epoch": 1.1904761904761905, "grad_norm": 1.1479943662351388e-09, "learning_rate": 3.022209666463311e-06, "logits/chosen": -1.7811399698257446, "logits/rejected": -1.5352016687393188, "logps/chosen": -187.00917053222656, "logps/rejected": -716.4349365234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9479063153266907, "rewards/margins": 60.103614807128906, "rewards/rejected": -59.15570831298828, "step": 1475 }, { "epoch": 1.1945117029862793, "grad_norm": 6.494347104535336e-08, "learning_rate": 3.0087912925333665e-06, "logits/chosen": -1.669329285621643, "logits/rejected": -1.5829856395721436, "logps/chosen": -203.6710205078125, "logps/rejected": -762.7025146484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7758517861366272, "rewards/margins": 63.24425506591797, "rewards/rejected": -62.46840286254883, "step": 1480 }, { "epoch": 1.1985472154963681, "grad_norm": 1.8413942598272115e-05, "learning_rate": 2.9953982773018425e-06, "logits/chosen": -1.8058834075927734, "logits/rejected": -1.5755075216293335, "logps/chosen": -199.8990020751953, "logps/rejected": -785.3792724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.25892242789268494, "rewards/margins": 65.12300109863281, "rewards/rejected": -64.86407470703125, "step": 1485 }, { "epoch": 1.202582728006457, "grad_norm": 4.552346055675116e-08, "learning_rate": 2.982031174462573e-06, "logits/chosen": -1.802890419960022, "logits/rejected": -1.4651122093200684, "logps/chosen": -183.75181579589844, "logps/rejected": -784.5320434570312, "loss": 0.0499, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9203557968139648, "rewards/margins": 67.39515686035156, "rewards/rejected": -65.47479248046875, "step": 1490 }, { "epoch": 1.2066182405165455, "grad_norm": 4.448557147651577e-11, "learning_rate": 2.9686905366381225e-06, "logits/chosen": -1.7196903228759766, "logits/rejected": -1.4197640419006348, "logps/chosen": -202.1754608154297, "logps/rejected": -804.455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9048557281494141, "rewards/margins": 69.1134262084961, "rewards/rejected": -68.20857238769531, "step": 1495 }, { "epoch": 1.2106537530266344, "grad_norm": 2.487283712326871e-09, "learning_rate": 2.9553769153569375e-06, "logits/chosen": -1.6166270971298218, "logits/rejected": -1.4305615425109863, "logps/chosen": -203.8953857421875, "logps/rejected": -809.4393920898438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.055772304534912, "rewards/margins": 64.39080810546875, "rewards/rejected": -66.44657897949219, "step": 1500 }, { "epoch": 1.2106537530266344, "eval_logits/chosen": -1.61436128616333, "eval_logits/rejected": -1.1442073583602905, "eval_logps/chosen": -130.0927734375, "eval_logps/rejected": -817.969482421875, "eval_loss": 6.29299902357161e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.1296401023864746, "eval_rewards/margins": 73.884033203125, "eval_rewards/rejected": -75.01367950439453, "eval_runtime": 20.0429, "eval_samples_per_second": 9.979, "eval_steps_per_second": 9.979, "step": 1500 }, { "epoch": 1.2146892655367232, "grad_norm": 3.8053379891600514e-11, "learning_rate": 2.942090861030548e-06, "logits/chosen": -1.5321364402770996, "logits/rejected": -1.3989564180374146, "logps/chosen": -203.12643432617188, "logps/rejected": -696.5699462890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.46843022108078003, "rewards/margins": 57.94854736328125, "rewards/rejected": -58.4169807434082, "step": 1505 }, { "epoch": 1.218724778046812, "grad_norm": 5.584149462278809e-12, "learning_rate": 2.928832922930812e-06, "logits/chosen": -1.7552200555801392, "logits/rejected": -1.3492562770843506, "logps/chosen": -187.18692016601562, "logps/rejected": -862.4119262695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1959584951400757, "rewards/margins": 71.46920776367188, "rewards/rejected": -72.66517639160156, "step": 1510 }, { "epoch": 1.2227602905569008, "grad_norm": 4.37933378449884e-09, "learning_rate": 2.9156036491672044e-06, "logits/chosen": -1.4652454853057861, "logits/rejected": -1.4120115041732788, "logps/chosen": -179.9890899658203, "logps/rejected": -730.4194946289062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.333310127258301, "rewards/margins": 62.300567626953125, "rewards/rejected": -59.967262268066406, "step": 1515 }, { "epoch": 1.2267958030669894, "grad_norm": 1.05670423522497e-08, "learning_rate": 2.9024035866641624e-06, "logits/chosen": -1.5189591646194458, "logits/rejected": -1.338213324546814, "logps/chosen": -220.7394256591797, "logps/rejected": -785.5709838867188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.1234169006347656, "rewards/margins": 62.928794860839844, "rewards/rejected": -66.05220794677734, "step": 1520 }, { "epoch": 1.2308313155770783, "grad_norm": 7.764414666766584e-10, "learning_rate": 2.8892332811384705e-06, "logits/chosen": -1.6207401752471924, "logits/rejected": -1.3721296787261963, "logps/chosen": -190.64202880859375, "logps/rejected": -731.6209716796875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.02732701227068901, "rewards/margins": 60.5052604675293, "rewards/rejected": -60.5325927734375, "step": 1525 }, { "epoch": 1.234866828087167, "grad_norm": 2.1133704508624618e-10, "learning_rate": 2.876093277076701e-06, "logits/chosen": -1.438172698020935, "logits/rejected": -1.3710715770721436, "logps/chosen": -181.1151885986328, "logps/rejected": -707.9722900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4808969497680664, "rewards/margins": 60.14398956298828, "rewards/rejected": -58.6630859375, "step": 1530 }, { "epoch": 1.238902340597256, "grad_norm": 5.809496113340362e-12, "learning_rate": 2.862984117712702e-06, "logits/chosen": -1.4556537866592407, "logits/rejected": -1.3690211772918701, "logps/chosen": -192.5563507080078, "logps/rejected": -705.3499145507812, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.05943598598241806, "rewards/margins": 58.401268005371094, "rewards/rejected": -58.341835021972656, "step": 1535 }, { "epoch": 1.2429378531073447, "grad_norm": 2.483193650704152e-10, "learning_rate": 2.849906345005143e-06, "logits/chosen": -1.5469274520874023, "logits/rejected": -1.4111659526824951, "logps/chosen": -185.5227813720703, "logps/rejected": -732.8416748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6678502559661865, "rewards/margins": 62.031410217285156, "rewards/rejected": -61.363555908203125, "step": 1540 }, { "epoch": 1.2469733656174333, "grad_norm": 7.466328497685026e-07, "learning_rate": 2.836860499615105e-06, "logits/chosen": -1.4998805522918701, "logits/rejected": -1.2548482418060303, "logps/chosen": -187.68788146972656, "logps/rejected": -727.1448974609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.137568473815918, "rewards/margins": 61.056373596191406, "rewards/rejected": -59.9188117980957, "step": 1545 }, { "epoch": 1.2510088781275222, "grad_norm": 0.19030039012432098, "learning_rate": 2.823847120883731e-06, "logits/chosen": -1.4331947565078735, "logits/rejected": -1.4037396907806396, "logps/chosen": -198.8579864501953, "logps/rejected": -671.0245361328125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.4905354976654053, "rewards/margins": 55.53157424926758, "rewards/rejected": -54.041038513183594, "step": 1550 }, { "epoch": 1.255044390637611, "grad_norm": 1.2241576641258689e-08, "learning_rate": 2.8108667468099277e-06, "logits/chosen": -1.2984718084335327, "logits/rejected": -1.3773456811904907, "logps/chosen": -187.63363647460938, "logps/rejected": -655.7364501953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.8486075401306152, "rewards/margins": 52.68672561645508, "rewards/rejected": -53.53533935546875, "step": 1555 }, { "epoch": 1.2590799031476998, "grad_norm": 0.05933011695742607, "learning_rate": 2.7979199140281244e-06, "logits/chosen": -1.528843879699707, "logits/rejected": -1.364166498184204, "logps/chosen": -202.91004943847656, "logps/rejected": -743.5071411132812, "loss": 0.1632, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.8074308633804321, "rewards/margins": 59.9074821472168, "rewards/rejected": -60.71491622924805, "step": 1560 }, { "epoch": 1.2631154156577886, "grad_norm": 0.004316593520343304, "learning_rate": 2.7850071577860864e-06, "logits/chosen": -1.4637105464935303, "logits/rejected": -1.3463037014007568, "logps/chosen": -183.74136352539062, "logps/rejected": -679.9913330078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.044971466064453, "rewards/margins": 57.666412353515625, "rewards/rejected": -55.62144088745117, "step": 1565 }, { "epoch": 1.2671509281678772, "grad_norm": 5.578904918872481e-14, "learning_rate": 2.772129011922787e-06, "logits/chosen": -1.4470124244689941, "logits/rejected": -1.3792212009429932, "logps/chosen": -213.13546752929688, "logps/rejected": -706.20556640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3146504163742065, "rewards/margins": 56.900108337402344, "rewards/rejected": -55.5854606628418, "step": 1570 }, { "epoch": 1.271186440677966, "grad_norm": 0.00011116755922557786, "learning_rate": 2.7592860088463376e-06, "logits/chosen": -1.6292082071304321, "logits/rejected": -1.2898311614990234, "logps/chosen": -177.99327087402344, "logps/rejected": -659.3099975585938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5574749708175659, "rewards/margins": 54.58333206176758, "rewards/rejected": -54.025856018066406, "step": 1575 }, { "epoch": 1.2752219531880549, "grad_norm": 8.302888931943642e-10, "learning_rate": 2.7464786795119765e-06, "logits/chosen": -1.3152077198028564, "logits/rejected": -1.4069987535476685, "logps/chosen": -198.95028686523438, "logps/rejected": -720.6678466796875, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.195016860961914, "rewards/margins": 62.487098693847656, "rewards/rejected": -59.292083740234375, "step": 1580 }, { "epoch": 1.2792574656981437, "grad_norm": 3.773255666250641e-11, "learning_rate": 2.733707553400122e-06, "logits/chosen": -1.5914888381958008, "logits/rejected": -1.517871618270874, "logps/chosen": -211.7425994873047, "logps/rejected": -731.9110107421875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8162578344345093, "rewards/margins": 62.33928298950195, "rewards/rejected": -60.52301788330078, "step": 1585 }, { "epoch": 1.2832929782082325, "grad_norm": 4.5216779653856065e-07, "learning_rate": 2.7209731584944742e-06, "logits/chosen": -1.609816312789917, "logits/rejected": -1.524383783340454, "logps/chosen": -186.00634765625, "logps/rejected": -620.8109130859375, "loss": 0.017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.65635746717453, "rewards/margins": 50.01836013793945, "rewards/rejected": -49.36199951171875, "step": 1590 }, { "epoch": 1.2873284907183211, "grad_norm": 4.205317427619093e-09, "learning_rate": 2.708276021260196e-06, "logits/chosen": -1.7833576202392578, "logits/rejected": -1.5492188930511475, "logps/chosen": -184.90185546875, "logps/rejected": -692.207275390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6446915864944458, "rewards/margins": 58.4537467956543, "rewards/rejected": -57.809059143066406, "step": 1595 }, { "epoch": 1.29136400322841, "grad_norm": 0.038646064698696136, "learning_rate": 2.6956166666221425e-06, "logits/chosen": -1.621161699295044, "logits/rejected": -1.4047126770019531, "logps/chosen": -183.34237670898438, "logps/rejected": -727.6070556640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 1.5672276020050049, "rewards/margins": 61.051048278808594, "rewards/rejected": -59.48381805419922, "step": 1600 }, { "epoch": 1.2953995157384988, "grad_norm": 4.8013754430642486e-12, "learning_rate": 2.6829956179431624e-06, "logits/chosen": -2.009873867034912, "logits/rejected": -1.5046837329864502, "logps/chosen": -178.76443481445312, "logps/rejected": -768.0751342773438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.6955463886260986, "rewards/margins": 67.1954345703125, "rewards/rejected": -64.49989318847656, "step": 1605 }, { "epoch": 1.2994350282485876, "grad_norm": 2.052781230589551e-13, "learning_rate": 2.67041339700246e-06, "logits/chosen": -2.0387442111968994, "logits/rejected": -1.5052274465560913, "logps/chosen": -178.11502075195312, "logps/rejected": -809.9860229492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1237400770187378, "rewards/margins": 65.8537826538086, "rewards/rejected": -66.97752380371094, "step": 1610 }, { "epoch": 1.3034705407586764, "grad_norm": 2.8542695984512534e-10, "learning_rate": 2.6578705239740217e-06, "logits/chosen": -1.955378532409668, "logits/rejected": -1.5775176286697388, "logps/chosen": -162.94400024414062, "logps/rejected": -763.7339477539062, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.6734461784362793, "rewards/margins": 65.5910415649414, "rewards/rejected": -62.91759490966797, "step": 1615 }, { "epoch": 1.307506053268765, "grad_norm": 2.7598356136223148e-14, "learning_rate": 2.6453675174051163e-06, "logits/chosen": -1.8466222286224365, "logits/rejected": -1.533707857131958, "logps/chosen": -175.60665893554688, "logps/rejected": -750.2123413085938, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.41148805618286133, "rewards/margins": 62.96259689331055, "rewards/rejected": -62.55109786987305, "step": 1620 }, { "epoch": 1.3115415657788538, "grad_norm": 2.211695351872229e-14, "learning_rate": 2.632904894194851e-06, "logits/chosen": -1.9749736785888672, "logits/rejected": -1.5663533210754395, "logps/chosen": -162.9367218017578, "logps/rejected": -728.7584838867188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 3.2406387329101562, "rewards/margins": 64.27747344970703, "rewards/rejected": -61.036842346191406, "step": 1625 }, { "epoch": 1.3155770782889427, "grad_norm": 1.670774851803003e-09, "learning_rate": 2.620483169572808e-06, "logits/chosen": -2.0658795833587646, "logits/rejected": -1.6234089136123657, "logps/chosen": -193.14317321777344, "logps/rejected": -796.3792724609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3619210720062256, "rewards/margins": 68.16824340820312, "rewards/rejected": -66.80632019042969, "step": 1630 }, { "epoch": 1.3196125907990315, "grad_norm": 2.051085301602029e-09, "learning_rate": 2.608102857077737e-06, "logits/chosen": -2.0360915660858154, "logits/rejected": -1.5871403217315674, "logps/chosen": -204.52206420898438, "logps/rejected": -765.5242919921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.9443421363830566, "rewards/margins": 66.50706481933594, "rewards/rejected": -63.5627326965332, "step": 1635 }, { "epoch": 1.3236481033091203, "grad_norm": 16.524887084960938, "learning_rate": 2.5957644685363316e-06, "logits/chosen": -2.177934169769287, "logits/rejected": -1.6973788738250732, "logps/chosen": -168.85757446289062, "logps/rejected": -727.5289306640625, "loss": 0.0333, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3378555178642273, "rewards/margins": 60.453147888183594, "rewards/rejected": -60.11530303955078, "step": 1640 }, { "epoch": 1.327683615819209, "grad_norm": 1.534827163141017e-07, "learning_rate": 2.5834685140420635e-06, "logits/chosen": -2.096437931060791, "logits/rejected": -1.6312873363494873, "logps/chosen": -174.17112731933594, "logps/rejected": -759.9822998046875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.7924470901489258, "rewards/margins": 64.60475158691406, "rewards/rejected": -63.81230926513672, "step": 1645 }, { "epoch": 1.331719128329298, "grad_norm": 7.918947275697974e-09, "learning_rate": 2.5712155019340976e-06, "logits/chosen": -2.255504846572876, "logits/rejected": -1.6200635433197021, "logps/chosen": -187.43124389648438, "logps/rejected": -807.0477905273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.335760235786438, "rewards/margins": 69.53099060058594, "rewards/rejected": -68.19523620605469, "step": 1650 }, { "epoch": 1.3357546408393866, "grad_norm": 2.1409116257586902e-08, "learning_rate": 2.559005938776276e-06, "logits/chosen": -2.0866482257843018, "logits/rejected": -1.6288915872573853, "logps/chosen": -169.92587280273438, "logps/rejected": -740.3541259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9804185628890991, "rewards/margins": 64.2018814086914, "rewards/rejected": -62.221466064453125, "step": 1655 }, { "epoch": 1.3397901533494754, "grad_norm": 8.355719816142276e-14, "learning_rate": 2.546840329336173e-06, "logits/chosen": -2.1005685329437256, "logits/rejected": -1.7085624933242798, "logps/chosen": -172.1204071044922, "logps/rejected": -825.3946533203125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.7942426204681396, "rewards/margins": 69.8453369140625, "rewards/rejected": -69.05110168457031, "step": 1660 }, { "epoch": 1.3438256658595642, "grad_norm": 3.983503793752313e-13, "learning_rate": 2.5347191765642317e-06, "logits/chosen": -2.1800079345703125, "logits/rejected": -1.7360271215438843, "logps/chosen": -178.7080841064453, "logps/rejected": -767.6512451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3949414789676666, "rewards/margins": 65.77651977539062, "rewards/rejected": -65.38157653808594, "step": 1665 }, { "epoch": 1.347861178369653, "grad_norm": 3.119226543724096e-10, "learning_rate": 2.522642981572965e-06, "logits/chosen": -2.2382187843322754, "logits/rejected": -1.81443190574646, "logps/chosen": -184.4077606201172, "logps/rejected": -816.0771484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7507683634757996, "rewards/margins": 68.29084777832031, "rewards/rejected": -67.54008483886719, "step": 1670 }, { "epoch": 1.3518966908797418, "grad_norm": 3.948374516227204e-09, "learning_rate": 2.510612243616246e-06, "logits/chosen": -2.396482467651367, "logits/rejected": -1.7877686023712158, "logps/chosen": -187.24847412109375, "logps/rejected": -729.6419677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.6970505714416504, "rewards/margins": 63.26873016357422, "rewards/rejected": -60.571678161621094, "step": 1675 }, { "epoch": 1.3559322033898304, "grad_norm": 3.426130559813778e-11, "learning_rate": 2.49862746006866e-06, "logits/chosen": -2.413379192352295, "logits/rejected": -1.799852967262268, "logps/chosen": -182.4119873046875, "logps/rejected": -812.221923828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.938901424407959, "rewards/margins": 72.41883850097656, "rewards/rejected": -69.47993469238281, "step": 1680 }, { "epoch": 1.3599677158999193, "grad_norm": 0.037203673273324966, "learning_rate": 2.486689126404948e-06, "logits/chosen": -2.2255234718322754, "logits/rejected": -1.7428064346313477, "logps/chosen": -174.45114135742188, "logps/rejected": -802.6512451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3830649852752686, "rewards/margins": 68.44318389892578, "rewards/rejected": -67.06011962890625, "step": 1685 }, { "epoch": 1.364003228410008, "grad_norm": 1.5179419676769612e-07, "learning_rate": 2.4747977361795196e-06, "logits/chosen": -2.430809497833252, "logits/rejected": -1.7539234161376953, "logps/chosen": -173.1271209716797, "logps/rejected": -831.7330932617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.7332366704940796, "rewards/margins": 70.12152099609375, "rewards/rejected": -69.38829040527344, "step": 1690 }, { "epoch": 1.368038740920097, "grad_norm": 0.0002964167215395719, "learning_rate": 2.462953781006049e-06, "logits/chosen": -2.331538438796997, "logits/rejected": -1.8691266775131226, "logps/chosen": -179.97695922851562, "logps/rejected": -831.1298828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8194652795791626, "rewards/margins": 71.95481872558594, "rewards/rejected": -70.13536071777344, "step": 1695 }, { "epoch": 1.3720742534301857, "grad_norm": 3.909887311692728e-07, "learning_rate": 2.4511577505371538e-06, "logits/chosen": -2.2468013763427734, "logits/rejected": -1.821124792098999, "logps/chosen": -174.49826049804688, "logps/rejected": -832.2234497070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.2093558311462402, "rewards/margins": 71.55606842041016, "rewards/rejected": -69.34671783447266, "step": 1700 }, { "epoch": 1.3761097659402743, "grad_norm": 7.648792894627415e-14, "learning_rate": 2.439410132444145e-06, "logits/chosen": -2.270819664001465, "logits/rejected": -1.7612590789794922, "logps/chosen": -170.6118927001953, "logps/rejected": -905.99609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.766690492630005, "rewards/margins": 80.04303741455078, "rewards/rejected": -77.2763442993164, "step": 1705 }, { "epoch": 1.3801452784503632, "grad_norm": 0.11734246462583542, "learning_rate": 2.4277114123968747e-06, "logits/chosen": -2.4150424003601074, "logits/rejected": -1.7157480716705322, "logps/chosen": -155.3899383544922, "logps/rejected": -757.6393432617188, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 0.9298669099807739, "rewards/margins": 63.9151725769043, "rewards/rejected": -62.98530197143555, "step": 1710 }, { "epoch": 1.384180790960452, "grad_norm": 2.6616627304965057e-11, "learning_rate": 2.4160620740436484e-06, "logits/chosen": -2.3023900985717773, "logits/rejected": -1.7809321880340576, "logps/chosen": -172.132568359375, "logps/rejected": -779.4400024414062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.194301962852478, "rewards/margins": 67.26702880859375, "rewards/rejected": -66.0727310180664, "step": 1715 }, { "epoch": 1.3882163034705408, "grad_norm": 1.0691862434471233e-13, "learning_rate": 2.404462598991238e-06, "logits/chosen": -2.314537525177002, "logits/rejected": -1.871187448501587, "logps/chosen": -176.63394165039062, "logps/rejected": -764.6021118164062, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0624892711639404, "rewards/margins": 64.38102722167969, "rewards/rejected": -63.31854248046875, "step": 1720 }, { "epoch": 1.3922518159806296, "grad_norm": 1.1766523085737779e-12, "learning_rate": 2.392913466784964e-06, "logits/chosen": -2.1949894428253174, "logits/rejected": -1.7672621011734009, "logps/chosen": -210.588623046875, "logps/rejected": -835.0763549804688, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 2.437427520751953, "rewards/margins": 72.63179016113281, "rewards/rejected": -70.19436645507812, "step": 1725 }, { "epoch": 1.3962873284907182, "grad_norm": 2.1470858424521566e-10, "learning_rate": 2.381415154888877e-06, "logits/chosen": -2.1774909496307373, "logits/rejected": -1.8509496450424194, "logps/chosen": -188.58285522460938, "logps/rejected": -779.4468994140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.199852705001831, "rewards/margins": 66.55570983886719, "rewards/rejected": -65.35586547851562, "step": 1730 }, { "epoch": 1.400322841000807, "grad_norm": 8.875439581413139e-14, "learning_rate": 2.3699681386660127e-06, "logits/chosen": -2.370748281478882, "logits/rejected": -1.691887617111206, "logps/chosen": -184.4141387939453, "logps/rejected": -857.23828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.469444990158081, "rewards/margins": 73.85873413085938, "rewards/rejected": -72.3892822265625, "step": 1735 }, { "epoch": 1.4043583535108959, "grad_norm": 2.7090867743551073e-11, "learning_rate": 2.3585728913587428e-06, "logits/chosen": -2.455437183380127, "logits/rejected": -1.747497797012329, "logps/chosen": -197.980712890625, "logps/rejected": -807.9566650390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.562467336654663, "rewards/margins": 70.05950927734375, "rewards/rejected": -68.4970474243164, "step": 1740 }, { "epoch": 1.4083938660209847, "grad_norm": 1.1996800800088145e-15, "learning_rate": 2.3472298840692108e-06, "logits/chosen": -2.2053990364074707, "logits/rejected": -1.7653766870498657, "logps/chosen": -180.4335479736328, "logps/rejected": -768.0950927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.245727777481079, "rewards/margins": 65.86540222167969, "rewards/rejected": -64.61967468261719, "step": 1745 }, { "epoch": 1.4124293785310735, "grad_norm": 4.732628785392157e-12, "learning_rate": 2.3359395857398505e-06, "logits/chosen": -2.4713072776794434, "logits/rejected": -1.7393347024917603, "logps/chosen": -181.605224609375, "logps/rejected": -837.1864013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.8240095376968384, "rewards/margins": 73.47965240478516, "rewards/rejected": -71.6556396484375, "step": 1750 }, { "epoch": 1.4164648910411621, "grad_norm": 1.2502595403418582e-08, "learning_rate": 2.3247024631340066e-06, "logits/chosen": -2.3331973552703857, "logits/rejected": -1.8218599557876587, "logps/chosen": -194.18751525878906, "logps/rejected": -794.9191284179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.3759591579437256, "rewards/margins": 70.21507263183594, "rewards/rejected": -67.83912658691406, "step": 1755 }, { "epoch": 1.420500403551251, "grad_norm": 5.877570652401687e-11, "learning_rate": 2.3135189808166306e-06, "logits/chosen": -2.2937490940093994, "logits/rejected": -1.9143552780151367, "logps/chosen": -194.3721466064453, "logps/rejected": -837.5227661132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6765773296356201, "rewards/margins": 72.47981262207031, "rewards/rejected": -70.80323028564453, "step": 1760 }, { "epoch": 1.4245359160613398, "grad_norm": 4.08352344993812e-12, "learning_rate": 2.30238960113508e-06, "logits/chosen": -2.4119582176208496, "logits/rejected": -1.7352384328842163, "logps/chosen": -167.0661163330078, "logps/rejected": -829.92431640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1172966957092285, "rewards/margins": 71.37763977050781, "rewards/rejected": -70.26034545898438, "step": 1765 }, { "epoch": 1.4285714285714286, "grad_norm": 4.962503652450323e-08, "learning_rate": 2.291314784200002e-06, "logits/chosen": -2.4528346061706543, "logits/rejected": -1.768619179725647, "logps/chosen": -190.2008819580078, "logps/rejected": -830.40478515625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.780856728553772, "rewards/margins": 70.84291076660156, "rewards/rejected": -70.06204986572266, "step": 1770 }, { "epoch": 1.4326069410815174, "grad_norm": 8.153387687226399e-12, "learning_rate": 2.280294987866311e-06, "logits/chosen": -2.3447072505950928, "logits/rejected": -1.7695497274398804, "logps/chosen": -167.91183471679688, "logps/rejected": -746.8505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9097827672958374, "rewards/margins": 64.11859893798828, "rewards/rejected": -62.20882034301758, "step": 1775 }, { "epoch": 1.436642453591606, "grad_norm": 9.12277631909264e-12, "learning_rate": 2.26933066771426e-06, "logits/chosen": -2.3243443965911865, "logits/rejected": -1.7731202840805054, "logps/chosen": -176.44168090820312, "logps/rejected": -748.0114135742188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9347507357597351, "rewards/margins": 64.25198364257812, "rewards/rejected": -63.317237854003906, "step": 1780 }, { "epoch": 1.4406779661016949, "grad_norm": 2.987074780233523e-12, "learning_rate": 2.2584222770306055e-06, "logits/chosen": -2.354372501373291, "logits/rejected": -1.672562599182129, "logps/chosen": -204.61460876464844, "logps/rejected": -856.75048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4970417022705078, "rewards/margins": 73.69001770019531, "rewards/rejected": -72.19297790527344, "step": 1785 }, { "epoch": 1.4447134786117837, "grad_norm": 1.1456736004333834e-08, "learning_rate": 2.2475702667898733e-06, "logits/chosen": -2.1627533435821533, "logits/rejected": -1.7284364700317383, "logps/chosen": -188.8264617919922, "logps/rejected": -800.7828369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.151843786239624, "rewards/margins": 68.44834899902344, "rewards/rejected": -67.29650115966797, "step": 1790 }, { "epoch": 1.4487489911218725, "grad_norm": 7.901027054835197e-11, "learning_rate": 2.2367750856357042e-06, "logits/chosen": -2.4985828399658203, "logits/rejected": -1.7672927379608154, "logps/chosen": -207.21142578125, "logps/rejected": -824.0856323242188, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 0.8256248235702515, "rewards/margins": 69.46866607666016, "rewards/rejected": -68.64303588867188, "step": 1795 }, { "epoch": 1.4527845036319613, "grad_norm": 2.0653991754729373e-13, "learning_rate": 2.2260371798623166e-06, "logits/chosen": -2.4005846977233887, "logits/rejected": -1.6849313974380493, "logps/chosen": -206.4866943359375, "logps/rejected": -868.9378662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.0821311473846436, "rewards/margins": 75.61005401611328, "rewards/rejected": -73.5279312133789, "step": 1800 }, { "epoch": 1.45682001614205, "grad_norm": 1.5511922944931263e-15, "learning_rate": 2.2153569933960465e-06, "logits/chosen": -2.2854561805725098, "logits/rejected": -1.62441885471344, "logps/chosen": -202.88528442382812, "logps/rejected": -828.8380737304688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8949993252754211, "rewards/margins": 70.76665496826172, "rewards/rejected": -69.87165832519531, "step": 1805 }, { "epoch": 1.4608555286521387, "grad_norm": 2.049617187083186e-08, "learning_rate": 2.204734967777003e-06, "logits/chosen": -2.2413315773010254, "logits/rejected": -1.625156044960022, "logps/chosen": -176.7400665283203, "logps/rejected": -860.0853271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2557581663131714, "rewards/margins": 74.38679504394531, "rewards/rejected": -73.13102722167969, "step": 1810 }, { "epoch": 1.4648910411622276, "grad_norm": 0.03431355208158493, "learning_rate": 2.194171542140807e-06, "logits/chosen": -2.341762065887451, "logits/rejected": -1.7039142847061157, "logps/chosen": -184.9700927734375, "logps/rejected": -853.7747802734375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.5445622205734253, "rewards/margins": 74.40602111816406, "rewards/rejected": -73.8614501953125, "step": 1815 }, { "epoch": 1.4689265536723164, "grad_norm": 2.0087455800837407e-14, "learning_rate": 2.183667153200444e-06, "logits/chosen": -2.257087469100952, "logits/rejected": -1.5907630920410156, "logps/chosen": -178.4795684814453, "logps/rejected": -766.4191284179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5699442028999329, "rewards/margins": 65.43973541259766, "rewards/rejected": -64.86978912353516, "step": 1820 }, { "epoch": 1.4729620661824052, "grad_norm": 4.802135222803372e-08, "learning_rate": 2.1732222352282018e-06, "logits/chosen": -2.3497507572174072, "logits/rejected": -1.752854347229004, "logps/chosen": -221.79385375976562, "logps/rejected": -860.7669677734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.16548208892345428, "rewards/margins": 72.14274597167969, "rewards/rejected": -72.30823516845703, "step": 1825 }, { "epoch": 1.4769975786924938, "grad_norm": 1.0894300430663861e-06, "learning_rate": 2.16283722003772e-06, "logits/chosen": -2.3829636573791504, "logits/rejected": -1.6289317607879639, "logps/chosen": -178.19253540039062, "logps/rejected": -863.5916748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4172009527683258, "rewards/margins": 74.2694091796875, "rewards/rejected": -73.85221099853516, "step": 1830 }, { "epoch": 1.4810330912025829, "grad_norm": 8.519249194982592e-10, "learning_rate": 2.1525125369661413e-06, "logits/chosen": -2.210036277770996, "logits/rejected": -1.5888164043426514, "logps/chosen": -240.2311553955078, "logps/rejected": -922.6525268554688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.13096031546592712, "rewards/margins": 78.05513763427734, "rewards/rejected": -78.18611145019531, "step": 1835 }, { "epoch": 1.4850686037126715, "grad_norm": 1.4540746211266775e-11, "learning_rate": 2.1422486128563556e-06, "logits/chosen": -2.296048641204834, "logits/rejected": -1.6735785007476807, "logps/chosen": -199.35543823242188, "logps/rejected": -815.7294921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 2.3111767768859863, "rewards/margins": 71.4309310913086, "rewards/rejected": -69.11975860595703, "step": 1840 }, { "epoch": 1.4891041162227603, "grad_norm": 2.9199483542879934e-13, "learning_rate": 2.1320458720393604e-06, "logits/chosen": -2.160212278366089, "logits/rejected": -1.6510088443756104, "logps/chosen": -194.401611328125, "logps/rejected": -864.7791748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.9292609691619873, "rewards/margins": 76.22747802734375, "rewards/rejected": -73.2982177734375, "step": 1845 }, { "epoch": 1.4931396287328491, "grad_norm": 7.134872120201363e-13, "learning_rate": 2.121904736316711e-06, "logits/chosen": -2.2305092811584473, "logits/rejected": -1.5933868885040283, "logps/chosen": -213.48681640625, "logps/rejected": -863.0465698242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1499013453722, "rewards/margins": 75.4010009765625, "rewards/rejected": -75.2510986328125, "step": 1850 }, { "epoch": 1.497175141242938, "grad_norm": 1.6194068858783016e-14, "learning_rate": 2.111825624943088e-06, "logits/chosen": -2.3740592002868652, "logits/rejected": -1.580862283706665, "logps/chosen": -183.22999572753906, "logps/rejected": -838.4013671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6093696355819702, "rewards/margins": 71.96637725830078, "rewards/rejected": -70.35700225830078, "step": 1855 }, { "epoch": 1.5012106537530268, "grad_norm": 0.0008141635335050523, "learning_rate": 2.10180895460896e-06, "logits/chosen": -2.3779282569885254, "logits/rejected": -1.6679363250732422, "logps/chosen": -199.64443969726562, "logps/rejected": -882.0906982421875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.7547755241394043, "rewards/margins": 72.85345458984375, "rewards/rejected": -75.60823059082031, "step": 1860 }, { "epoch": 1.5052461662631154, "grad_norm": 2.6895682590652925e-15, "learning_rate": 2.091855139423362e-06, "logits/chosen": -2.3399815559387207, "logits/rejected": -1.56537663936615, "logps/chosen": -220.5291290283203, "logps/rejected": -1033.9248046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.086347222328186, "rewards/margins": 89.32056427001953, "rewards/rejected": -88.23421478271484, "step": 1865 }, { "epoch": 1.5092816787732042, "grad_norm": 1.0685262570586662e-11, "learning_rate": 2.0819645908967705e-06, "logits/chosen": -2.3006718158721924, "logits/rejected": -1.73403799533844, "logps/chosen": -214.35733032226562, "logps/rejected": -952.1671752929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.38021600246429443, "rewards/margins": 81.61865234375, "rewards/rejected": -81.99886322021484, "step": 1870 }, { "epoch": 1.513317191283293, "grad_norm": 8.177987300454512e-17, "learning_rate": 2.0721377179240934e-06, "logits/chosen": -2.277712345123291, "logits/rejected": -1.6226837635040283, "logps/chosen": -179.32369995117188, "logps/rejected": -809.5697631835938, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1507253646850586, "rewards/margins": 68.95421600341797, "rewards/rejected": -68.80349731445312, "step": 1875 }, { "epoch": 1.5173527037933816, "grad_norm": 6.53324779258807e-14, "learning_rate": 2.0623749267677653e-06, "logits/chosen": -2.315620183944702, "logits/rejected": -1.5618131160736084, "logps/chosen": -206.01513671875, "logps/rejected": -964.9822998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.0216317176818848, "rewards/margins": 81.12474060058594, "rewards/rejected": -83.1463623046875, "step": 1880 }, { "epoch": 1.5213882163034707, "grad_norm": 1.943762512057745e-11, "learning_rate": 2.0526766210409486e-06, "logits/chosen": -2.31911563873291, "logits/rejected": -1.5914510488510132, "logps/chosen": -183.07559204101562, "logps/rejected": -811.8446655273438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5357041954994202, "rewards/margins": 68.63307189941406, "rewards/rejected": -69.16877746582031, "step": 1885 }, { "epoch": 1.5254237288135593, "grad_norm": 8.360675707308474e-09, "learning_rate": 2.0430432016908546e-06, "logits/chosen": -2.351818084716797, "logits/rejected": -1.5602799654006958, "logps/chosen": -180.8989715576172, "logps/rejected": -879.6181640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8545559644699097, "rewards/margins": 75.22038269042969, "rewards/rejected": -74.3658218383789, "step": 1890 }, { "epoch": 1.529459241323648, "grad_norm": 1.2710861009568308e-11, "learning_rate": 2.033475066982158e-06, "logits/chosen": -2.329619884490967, "logits/rejected": -1.7010787725448608, "logps/chosen": -187.9939727783203, "logps/rejected": -872.2916259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.832417368888855, "rewards/margins": 75.39540100097656, "rewards/rejected": -74.56298828125, "step": 1895 }, { "epoch": 1.533494753833737, "grad_norm": 1.3158930797814433e-16, "learning_rate": 2.023972612480542e-06, "logits/chosen": -2.3244824409484863, "logits/rejected": -1.5877481698989868, "logps/chosen": -192.69796752929688, "logps/rejected": -854.6046142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.07289906591176987, "rewards/margins": 72.5516357421875, "rewards/rejected": -72.47874450683594, "step": 1900 }, { "epoch": 1.5375302663438255, "grad_norm": 1.4230346935539817e-11, "learning_rate": 2.0145362310363345e-06, "logits/chosen": -2.259767532348633, "logits/rejected": -1.625695824623108, "logps/chosen": -191.95870971679688, "logps/rejected": -884.4142456054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.0062112570740282536, "rewards/margins": 76.26468658447266, "rewards/rejected": -76.25846862792969, "step": 1905 }, { "epoch": 1.5415657788539145, "grad_norm": 1.088810920715332, "learning_rate": 2.0051663127682763e-06, "logits/chosen": -2.403512716293335, "logits/rejected": -1.5855791568756104, "logps/chosen": -188.89625549316406, "logps/rejected": -788.9591064453125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.2140169143676758, "rewards/margins": 65.71537017822266, "rewards/rejected": -66.92938995361328, "step": 1910 }, { "epoch": 1.5456012913640031, "grad_norm": 1.9978227514157176e-14, "learning_rate": 1.995863245047386e-06, "logits/chosen": -2.270082473754883, "logits/rejected": -1.6904840469360352, "logps/chosen": -170.1380157470703, "logps/rejected": -890.0345458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.09062206745147705, "rewards/margins": 75.6651611328125, "rewards/rejected": -75.57453918457031, "step": 1915 }, { "epoch": 1.549636803874092, "grad_norm": 4.887734661451759e-10, "learning_rate": 1.986627412480949e-06, "logits/chosen": -2.312997341156006, "logits/rejected": -1.7650508880615234, "logps/chosen": -187.4536590576172, "logps/rejected": -795.8170776367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.48981624841690063, "rewards/margins": 67.354248046875, "rewards/rejected": -66.86442565917969, "step": 1920 }, { "epoch": 1.5536723163841808, "grad_norm": 4.0495269786333665e-05, "learning_rate": 1.9774591968966156e-06, "logits/chosen": -2.402712821960449, "logits/rejected": -1.7034728527069092, "logps/chosen": -176.81275939941406, "logps/rejected": -758.0675048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2504297494888306, "rewards/margins": 61.4433479309082, "rewards/rejected": -62.69377899169922, "step": 1925 }, { "epoch": 1.5577078288942696, "grad_norm": 4.932055631956533e-12, "learning_rate": 1.9683589773266157e-06, "logits/chosen": -2.296865940093994, "logits/rejected": -1.7235033512115479, "logps/chosen": -181.05572509765625, "logps/rejected": -810.8214111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0652954578399658, "rewards/margins": 70.22538757324219, "rewards/rejected": -69.16008758544922, "step": 1930 }, { "epoch": 1.5617433414043584, "grad_norm": 2.854477591796023e-11, "learning_rate": 1.95932712999209e-06, "logits/chosen": -2.3155179023742676, "logits/rejected": -1.7059166431427002, "logps/chosen": -203.16989135742188, "logps/rejected": -853.52294921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7545735836029053, "rewards/margins": 70.26029968261719, "rewards/rejected": -72.0148696899414, "step": 1935 }, { "epoch": 1.565778853914447, "grad_norm": 3.569977868878027e-09, "learning_rate": 1.9503640282875333e-06, "logits/chosen": -2.185150623321533, "logits/rejected": -1.8300927877426147, "logps/chosen": -200.48281860351562, "logps/rejected": -818.4951171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.211698055267334, "rewards/margins": 71.8260498046875, "rewards/rejected": -68.61434936523438, "step": 1940 }, { "epoch": 1.5698143664245359, "grad_norm": 5.879570164069037e-10, "learning_rate": 1.941470042765362e-06, "logits/chosen": -2.3706514835357666, "logits/rejected": -1.709673285484314, "logps/chosen": -171.584716796875, "logps/rejected": -802.6095581054688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9472419619560242, "rewards/margins": 69.22999572753906, "rewards/rejected": -68.28275299072266, "step": 1945 }, { "epoch": 1.5738498789346247, "grad_norm": 2.399892196081055e-07, "learning_rate": 1.9326455411205902e-06, "logits/chosen": -2.425222873687744, "logits/rejected": -1.737007737159729, "logps/chosen": -176.29135131835938, "logps/rejected": -762.3245849609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.9605649709701538, "rewards/margins": 64.93984985351562, "rewards/rejected": -63.97929000854492, "step": 1950 }, { "epoch": 1.5778853914447135, "grad_norm": 5.022648519314998e-09, "learning_rate": 1.9238908881756326e-06, "logits/chosen": -2.3650848865509033, "logits/rejected": -1.7317321300506592, "logps/chosen": -201.64279174804688, "logps/rejected": -897.0486450195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.0620818138122559, "rewards/margins": 77.2507095336914, "rewards/rejected": -76.1886215209961, "step": 1955 }, { "epoch": 1.5819209039548023, "grad_norm": 3.377210891584298e-13, "learning_rate": 1.9152064458652186e-06, "logits/chosen": -2.4109702110290527, "logits/rejected": -1.7987076044082642, "logps/chosen": -178.39866638183594, "logps/rejected": -710.159912109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5675937533378601, "rewards/margins": 58.662818908691406, "rewards/rejected": -59.23040771484375, "step": 1960 }, { "epoch": 1.585956416464891, "grad_norm": 6.796816442999898e-10, "learning_rate": 1.9065925732214298e-06, "logits/chosen": -2.236110210418701, "logits/rejected": -1.7171659469604492, "logps/chosen": -158.8496551513672, "logps/rejected": -784.4271240234375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 3.2434284687042236, "rewards/margins": 68.64083099365234, "rewards/rejected": -65.39739990234375, "step": 1965 }, { "epoch": 1.58999192897498, "grad_norm": 1.2242643565585354e-09, "learning_rate": 1.8980496263588606e-06, "logits/chosen": -2.099177122116089, "logits/rejected": -1.7306063175201416, "logps/chosen": -165.57896423339844, "logps/rejected": -777.022705078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 1.243430256843567, "rewards/margins": 65.51875305175781, "rewards/rejected": -64.27531433105469, "step": 1970 }, { "epoch": 1.5940274414850686, "grad_norm": 1.2411593454386693e-11, "learning_rate": 1.8895779584598897e-06, "logits/chosen": -2.4255688190460205, "logits/rejected": -1.696668267250061, "logps/chosen": -185.4091033935547, "logps/rejected": -790.3414306640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6319304704666138, "rewards/margins": 65.39628601074219, "rewards/rejected": -66.0282211303711, "step": 1975 }, { "epoch": 1.5980629539951574, "grad_norm": 6.257682343857596e-06, "learning_rate": 1.8811779197600843e-06, "logits/chosen": -2.2648909091949463, "logits/rejected": -1.8240740299224854, "logps/chosen": -172.35232543945312, "logps/rejected": -820.5059814453125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 0.7634139657020569, "rewards/margins": 70.0427017211914, "rewards/rejected": -69.27928161621094, "step": 1980 }, { "epoch": 1.6020984665052462, "grad_norm": 1.2090616507975938e-07, "learning_rate": 1.872849857533717e-06, "logits/chosen": -2.3356215953826904, "logits/rejected": -1.581761360168457, "logps/chosen": -188.34323120117188, "logps/rejected": -875.1851806640625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2022818326950073, "rewards/margins": 75.31105041503906, "rewards/rejected": -74.10877227783203, "step": 1985 }, { "epoch": 1.6061339790153348, "grad_norm": 1.4635671147233964e-12, "learning_rate": 1.8645941160794103e-06, "logits/chosen": -2.078589916229248, "logits/rejected": -1.6479928493499756, "logps/chosen": -201.7108612060547, "logps/rejected": -820.6966552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3580913543701172, "rewards/margins": 69.18218994140625, "rewards/rejected": -68.8240966796875, "step": 1990 }, { "epoch": 1.6101694915254239, "grad_norm": 2.583169589343015e-06, "learning_rate": 1.8564110367059028e-06, "logits/chosen": -2.286557674407959, "logits/rejected": -1.5544240474700928, "logps/chosen": -194.52139282226562, "logps/rejected": -891.4095458984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5269076228141785, "rewards/margins": 75.14198303222656, "rewards/rejected": -75.66889190673828, "step": 1995 }, { "epoch": 1.6142050040355125, "grad_norm": 1.3802330112167027e-10, "learning_rate": 1.8483009577179387e-06, "logits/chosen": -2.1604092121124268, "logits/rejected": -1.5595988035202026, "logps/chosen": -182.5557861328125, "logps/rejected": -878.3699340820312, "loss": 0.0097, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8031524419784546, "rewards/margins": 76.5200424194336, "rewards/rejected": -75.71688842773438, "step": 2000 }, { "epoch": 1.6142050040355125, "eval_logits/chosen": -2.1726837158203125, "eval_logits/rejected": -1.1106358766555786, "eval_logps/chosen": -125.35917663574219, "eval_logps/rejected": -950.3605346679688, "eval_loss": 9.387592581333593e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -0.6562811136245728, "eval_rewards/margins": 87.59650421142578, "eval_rewards/rejected": -88.25279235839844, "eval_runtime": 20.2686, "eval_samples_per_second": 9.867, "eval_steps_per_second": 9.867, "step": 2000 }, { "epoch": 1.6182405165456013, "grad_norm": 4.1508867454354e-09, "learning_rate": 1.8402642144022803e-06, "logits/chosen": -2.19663667678833, "logits/rejected": -1.598111867904663, "logps/chosen": -177.40621948242188, "logps/rejected": -850.9366455078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3505223989486694, "rewards/margins": 72.97136688232422, "rewards/rejected": -71.620849609375, "step": 2005 }, { "epoch": 1.6222760290556901, "grad_norm": 1.0019052983632239e-14, "learning_rate": 1.8323011390138482e-06, "logits/chosen": -2.3754477500915527, "logits/rejected": -1.5484435558319092, "logps/chosen": -183.5631103515625, "logps/rejected": -897.6380004882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3359715938568115, "rewards/margins": 78.36924743652344, "rewards/rejected": -77.03328704833984, "step": 2010 }, { "epoch": 1.6263115415657787, "grad_norm": 3.822767449812581e-11, "learning_rate": 1.8244120607619862e-06, "logits/chosen": -2.374082088470459, "logits/rejected": -1.5431396961212158, "logps/chosen": -188.85418701171875, "logps/rejected": -958.5436401367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.1649308204650879, "rewards/margins": 81.46302795410156, "rewards/rejected": -81.29808807373047, "step": 2015 }, { "epoch": 1.6303470540758678, "grad_norm": 3.277273730173391e-10, "learning_rate": 1.8165973057968464e-06, "logits/chosen": -2.097764492034912, "logits/rejected": -1.4683208465576172, "logps/chosen": -203.26809692382812, "logps/rejected": -837.5242309570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.5845251083374023, "rewards/margins": 73.96414184570312, "rewards/rejected": -71.3796157836914, "step": 2020 }, { "epoch": 1.6343825665859564, "grad_norm": 9.566671456753828e-13, "learning_rate": 1.8088571971959117e-06, "logits/chosen": -2.3980629444122314, "logits/rejected": -1.6122541427612305, "logps/chosen": -189.101806640625, "logps/rejected": -912.7693481445312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6044571995735168, "rewards/margins": 78.82121276855469, "rewards/rejected": -78.21675872802734, "step": 2025 }, { "epoch": 1.6384180790960452, "grad_norm": 3.871112347233856e-14, "learning_rate": 1.8011920549506342e-06, "logits/chosen": -2.343370199203491, "logits/rejected": -1.666820764541626, "logps/chosen": -211.6288299560547, "logps/rejected": -889.9730224609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.307486057281494, "rewards/margins": 77.56246948242188, "rewards/rejected": -75.2549819946289, "step": 2030 }, { "epoch": 1.642453591606134, "grad_norm": 1.4433953706739777e-13, "learning_rate": 1.793602195953209e-06, "logits/chosen": -2.2745273113250732, "logits/rejected": -1.518019437789917, "logps/chosen": -165.61166381835938, "logps/rejected": -736.9768676757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.811711311340332, "rewards/margins": 63.61808395385742, "rewards/rejected": -62.80637741088867, "step": 2035 }, { "epoch": 1.6464891041162226, "grad_norm": 3.211163279726037e-11, "learning_rate": 1.7860879339834725e-06, "logits/chosen": -2.3197433948516846, "logits/rejected": -1.5143916606903076, "logps/chosen": -180.4705352783203, "logps/rejected": -819.6495361328125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 1.3869092464447021, "rewards/margins": 71.40988159179688, "rewards/rejected": -70.0229721069336, "step": 2040 }, { "epoch": 1.6505246166263117, "grad_norm": 2.753719456904946e-07, "learning_rate": 1.7786495796959286e-06, "logits/chosen": -2.3563010692596436, "logits/rejected": -1.5893948078155518, "logps/chosen": -164.78579711914062, "logps/rejected": -772.463134765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6052733659744263, "rewards/margins": 65.36254119873047, "rewards/rejected": -64.75726318359375, "step": 2045 }, { "epoch": 1.6545601291364003, "grad_norm": 2.1331257073209375e-12, "learning_rate": 1.7712874406069087e-06, "logits/chosen": -2.394469738006592, "logits/rejected": -1.557729721069336, "logps/chosen": -171.2790985107422, "logps/rejected": -783.5185546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.1942310333251953, "rewards/margins": 67.44457244873047, "rewards/rejected": -65.2503433227539, "step": 2050 }, { "epoch": 1.658595641646489, "grad_norm": 3.5769844863864364e-09, "learning_rate": 1.764001821081855e-06, "logits/chosen": -2.4040169715881348, "logits/rejected": -1.7718921899795532, "logps/chosen": -173.5054931640625, "logps/rejected": -806.1981201171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.4838249683380127, "rewards/margins": 70.11324310302734, "rewards/rejected": -68.6294174194336, "step": 2055 }, { "epoch": 1.662631154156578, "grad_norm": 0.08302672952413559, "learning_rate": 1.7567930223227407e-06, "logits/chosen": -2.433271646499634, "logits/rejected": -1.640721082687378, "logps/chosen": -184.3240203857422, "logps/rejected": -811.218505859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.6623468399047852, "rewards/margins": 69.88880920410156, "rewards/rejected": -68.22645568847656, "step": 2060 }, { "epoch": 1.6666666666666665, "grad_norm": 1.1935749263947137e-11, "learning_rate": 1.7496613423556142e-06, "logits/chosen": -2.218108654022217, "logits/rejected": -1.6626625061035156, "logps/chosen": -165.69412231445312, "logps/rejected": -758.7576904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9105110168457031, "rewards/margins": 66.62940216064453, "rewards/rejected": -64.71888732910156, "step": 2065 }, { "epoch": 1.6707021791767556, "grad_norm": 8.292745690141601e-08, "learning_rate": 1.7426070760182814e-06, "logits/chosen": -2.325223445892334, "logits/rejected": -1.6296716928482056, "logps/chosen": -167.69943237304688, "logps/rejected": -806.0886840820312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.7861131429672241, "rewards/margins": 68.74629974365234, "rewards/rejected": -67.96018981933594, "step": 2070 }, { "epoch": 1.6747376916868442, "grad_norm": 0.5463241934776306, "learning_rate": 1.7356305149481144e-06, "logits/chosen": -2.2677998542785645, "logits/rejected": -1.6080982685089111, "logps/chosen": -195.4797821044922, "logps/rejected": -839.1272583007812, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.05782616138458252, "rewards/margins": 71.80995178222656, "rewards/rejected": -71.8677749633789, "step": 2075 }, { "epoch": 1.678773204196933, "grad_norm": 2.93797714563844e-14, "learning_rate": 1.7287319475699964e-06, "logits/chosen": -2.307939291000366, "logits/rejected": -1.5694466829299927, "logps/chosen": -173.24276733398438, "logps/rejected": -863.1575927734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.4526156485080719, "rewards/margins": 74.87712097167969, "rewards/rejected": -74.42449951171875, "step": 2080 }, { "epoch": 1.6828087167070218, "grad_norm": 1.295182971289277e-11, "learning_rate": 1.7219116590843942e-06, "logits/chosen": -2.4003868103027344, "logits/rejected": -1.6388174295425415, "logps/chosen": -172.3368682861328, "logps/rejected": -866.6731567382812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.11429212242364883, "rewards/margins": 73.57392883300781, "rewards/rejected": -73.45964050292969, "step": 2085 }, { "epoch": 1.6868442292171104, "grad_norm": 3.4780432006664697e-11, "learning_rate": 1.715169931455573e-06, "logits/chosen": -2.505866289138794, "logits/rejected": -1.59027099609375, "logps/chosen": -210.74667358398438, "logps/rejected": -993.9722900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1159766912460327, "rewards/margins": 82.5609130859375, "rewards/rejected": -83.67689514160156, "step": 2090 }, { "epoch": 1.6908797417271995, "grad_norm": 5.202167195010165e-15, "learning_rate": 1.7085070433999332e-06, "logits/chosen": -2.4701247215270996, "logits/rejected": -1.5382825136184692, "logps/chosen": -191.43820190429688, "logps/rejected": -899.6550903320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.17037799954414368, "rewards/margins": 77.23709869384766, "rewards/rejected": -77.40747833251953, "step": 2095 }, { "epoch": 1.694915254237288, "grad_norm": 1.4999115194314072e-07, "learning_rate": 1.7019232703744922e-06, "logits/chosen": -2.30924391746521, "logits/rejected": -1.5826889276504517, "logps/chosen": -185.04721069335938, "logps/rejected": -847.9348754882812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.085605502128601, "rewards/margins": 73.63111877441406, "rewards/rejected": -72.5455093383789, "step": 2100 }, { "epoch": 1.6989507667473769, "grad_norm": 9.688919588510259e-13, "learning_rate": 1.695418884565496e-06, "logits/chosen": -2.47183895111084, "logits/rejected": -1.5339562892913818, "logps/chosen": -196.5715789794922, "logps/rejected": -886.7354736328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.2402210682630539, "rewards/margins": 75.64317321777344, "rewards/rejected": -75.4029541015625, "step": 2105 }, { "epoch": 1.7029862792574657, "grad_norm": 5.784409409792524e-12, "learning_rate": 1.6889941548771634e-06, "logits/chosen": -2.274028778076172, "logits/rejected": -1.5766329765319824, "logps/chosen": -179.0311737060547, "logps/rejected": -811.05810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.1082003116607666, "rewards/margins": 67.59437561035156, "rewards/rejected": -68.70256805419922, "step": 2110 }, { "epoch": 1.7070217917675545, "grad_norm": 1.904601042851395e-12, "learning_rate": 1.6826493469205724e-06, "logits/chosen": -2.412832736968994, "logits/rejected": -1.5763767957687378, "logps/chosen": -185.71588134765625, "logps/rejected": -880.5065307617188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.693077802658081, "rewards/margins": 74.66535949707031, "rewards/rejected": -72.97228240966797, "step": 2115 }, { "epoch": 1.7110573042776434, "grad_norm": 1.133748761539266e-09, "learning_rate": 1.6763847230026774e-06, "logits/chosen": -2.4004642963409424, "logits/rejected": -1.5032494068145752, "logps/chosen": -194.87184143066406, "logps/rejected": -876.7891845703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0161066055297852, "rewards/margins": 73.76185607910156, "rewards/rejected": -74.77796173095703, "step": 2120 }, { "epoch": 1.715092816787732, "grad_norm": 2.185217784500537e-09, "learning_rate": 1.6702005421154662e-06, "logits/chosen": -2.360910415649414, "logits/rejected": -1.480509638786316, "logps/chosen": -202.68218994140625, "logps/rejected": -882.1689453125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3602263927459717, "rewards/margins": 74.96501159667969, "rewards/rejected": -74.60478210449219, "step": 2125 }, { "epoch": 1.7191283292978208, "grad_norm": 2.189284487030818e-08, "learning_rate": 1.6640970599252513e-06, "logits/chosen": -2.391505718231201, "logits/rejected": -1.6087669134140015, "logps/chosen": -181.651123046875, "logps/rejected": -907.7843017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473064422607422, "rewards/margins": 78.56361389160156, "rewards/rejected": -77.41630554199219, "step": 2130 }, { "epoch": 1.7231638418079096, "grad_norm": 5.855683299360592e-11, "learning_rate": 1.6580745287621011e-06, "logits/chosen": -2.4884731769561768, "logits/rejected": -1.5622793436050415, "logps/chosen": -168.3282012939453, "logps/rejected": -801.8345947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.08271006494760513, "rewards/margins": 67.98313903808594, "rewards/rejected": -67.90042114257812, "step": 2135 }, { "epoch": 1.7271993543179984, "grad_norm": 7.02206034475239e-06, "learning_rate": 1.6521331976094085e-06, "logits/chosen": -2.5047221183776855, "logits/rejected": -1.6489397287368774, "logps/chosen": -179.41793823242188, "logps/rejected": -790.6056518554688, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.9858964681625366, "rewards/margins": 65.54296112060547, "rewards/rejected": -66.52885437011719, "step": 2140 }, { "epoch": 1.7312348668280872, "grad_norm": 1.3888107330828348e-09, "learning_rate": 1.6462733120935954e-06, "logits/chosen": -2.4042506217956543, "logits/rejected": -1.679070234298706, "logps/chosen": -194.14877319335938, "logps/rejected": -851.5810546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.8009588122367859, "rewards/margins": 72.96479034423828, "rewards/rejected": -72.16383361816406, "step": 2145 }, { "epoch": 1.7352703793381759, "grad_norm": 6.150206388610968e-08, "learning_rate": 1.640495114473961e-06, "logits/chosen": -2.484240770339966, "logits/rejected": -1.745948076248169, "logps/chosen": -186.33224487304688, "logps/rejected": -825.8488159179688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.465944528579712, "rewards/margins": 71.541015625, "rewards/rejected": -70.0750732421875, "step": 2150 }, { "epoch": 1.739305891848265, "grad_norm": 1.1800571542153193e-07, "learning_rate": 1.6347988436326635e-06, "logits/chosen": -2.3655037879943848, "logits/rejected": -1.643615961074829, "logps/chosen": -161.09011840820312, "logps/rejected": -792.63427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6866084933280945, "rewards/margins": 68.07306671142578, "rewards/rejected": -67.3864517211914, "step": 2155 }, { "epoch": 1.7433414043583535, "grad_norm": 3.7949415965509445e-16, "learning_rate": 1.629184735064846e-06, "logits/chosen": -2.5563695430755615, "logits/rejected": -1.6402267217636108, "logps/chosen": -189.49514770507812, "logps/rejected": -838.0086669921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.036682605743408, "rewards/margins": 74.20914459228516, "rewards/rejected": -71.17245483398438, "step": 2160 }, { "epoch": 1.7473769168684423, "grad_norm": 9.153041822737573e-13, "learning_rate": 1.6236530208689e-06, "logits/chosen": -2.514753580093384, "logits/rejected": -1.671907663345337, "logps/chosen": -206.55599975585938, "logps/rejected": -862.80517578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.2243609428405762, "rewards/margins": 73.25970458984375, "rewards/rejected": -72.03533935546875, "step": 2165 }, { "epoch": 1.7514124293785311, "grad_norm": 1.10996394298013e-08, "learning_rate": 1.6182039297368708e-06, "logits/chosen": -2.4308342933654785, "logits/rejected": -1.6771306991577148, "logps/chosen": -188.88375854492188, "logps/rejected": -952.5618286132812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.9673004150390625, "rewards/margins": 84.53007507324219, "rewards/rejected": -82.56277465820312, "step": 2170 }, { "epoch": 1.7554479418886197, "grad_norm": 1.1109630577266216e-05, "learning_rate": 1.612837686945001e-06, "logits/chosen": -2.3945653438568115, "logits/rejected": -1.7170158624649048, "logps/chosen": -191.9729766845703, "logps/rejected": -886.5382080078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.6629137992858887, "rewards/margins": 79.09255981445312, "rewards/rejected": -76.42964935302734, "step": 2175 }, { "epoch": 1.7594834543987088, "grad_norm": 4.032548947541237e-11, "learning_rate": 1.607554514344419e-06, "logits/chosen": -2.500761032104492, "logits/rejected": -1.5455501079559326, "logps/chosen": -173.4666290283203, "logps/rejected": -760.9808349609375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028765201568604, "rewards/margins": 65.66234588623047, "rewards/rejected": -64.65946960449219, "step": 2180 }, { "epoch": 1.7635189669087974, "grad_norm": 2.9484915842203918e-08, "learning_rate": 1.6023546303519668e-06, "logits/chosen": -2.486499786376953, "logits/rejected": -1.6204200983047485, "logps/chosen": -187.43856811523438, "logps/rejected": -826.2936401367188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5737768411636353, "rewards/margins": 70.15225219726562, "rewards/rejected": -69.57848358154297, "step": 2185 }, { "epoch": 1.7675544794188862, "grad_norm": 2.9496299066428255e-12, "learning_rate": 1.59723824994117e-06, "logits/chosen": -2.4456582069396973, "logits/rejected": -1.7996530532836914, "logps/chosen": -177.2115478515625, "logps/rejected": -822.51904296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.3664863109588623, "rewards/margins": 72.08853912353516, "rewards/rejected": -69.72205352783203, "step": 2190 }, { "epoch": 1.771589991928975, "grad_norm": 0.004116610623896122, "learning_rate": 1.59220558463335e-06, "logits/chosen": -2.4213509559631348, "logits/rejected": -1.684949517250061, "logps/chosen": -161.568603515625, "logps/rejected": -776.70068359375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 0.6104915738105774, "rewards/margins": 66.18531799316406, "rewards/rejected": -65.57481384277344, "step": 2195 }, { "epoch": 1.7756255044390636, "grad_norm": 0.002312577096745372, "learning_rate": 1.5872568424888794e-06, "logits/chosen": -2.4742507934570312, "logits/rejected": -1.771022081375122, "logps/chosen": -182.96835327148438, "logps/rejected": -823.8525390625, "loss": 0.0114, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.9101009368896484, "rewards/margins": 73.0784912109375, "rewards/rejected": -70.16838073730469, "step": 2200 }, { "epoch": 1.7796610169491527, "grad_norm": 8.287468628020989e-11, "learning_rate": 1.5823922280985822e-06, "logits/chosen": -2.5519115924835205, "logits/rejected": -1.6476218700408936, "logps/chosen": -189.91453552246094, "logps/rejected": -917.6154174804688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.07526250183582306, "rewards/margins": 78.27094268798828, "rewards/rejected": -78.34620666503906, "step": 2205 }, { "epoch": 1.7836965294592413, "grad_norm": 1.789987741362964e-10, "learning_rate": 1.5776119425752724e-06, "logits/chosen": -2.5597891807556152, "logits/rejected": -1.6173295974731445, "logps/chosen": -206.1993408203125, "logps/rejected": -843.0588989257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.5722044706344604, "rewards/margins": 72.11141204833984, "rewards/rejected": -72.68360900878906, "step": 2210 }, { "epoch": 1.78773204196933, "grad_norm": 9.607204388853516e-15, "learning_rate": 1.572916183545442e-06, "logits/chosen": -2.5835394859313965, "logits/rejected": -1.6839256286621094, "logps/chosen": -194.66444396972656, "logps/rejected": -812.2095947265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.3429100811481476, "rewards/margins": 69.46500396728516, "rewards/rejected": -69.1220932006836, "step": 2215 }, { "epoch": 1.791767554479419, "grad_norm": 5.013395948383348e-12, "learning_rate": 1.5683051451410892e-06, "logits/chosen": -2.4575138092041016, "logits/rejected": -1.6742404699325562, "logps/chosen": -187.59310913085938, "logps/rejected": -842.63427734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.1451218128204346, "rewards/margins": 74.1231689453125, "rewards/rejected": -71.97804260253906, "step": 2220 }, { "epoch": 1.7958030669895075, "grad_norm": 7.023188454413631e-12, "learning_rate": 1.5637790179916958e-06, "logits/chosen": -2.3671071529388428, "logits/rejected": -1.640991449356079, "logps/chosen": -191.244384765625, "logps/rejected": -913.2353515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.0969510078430176, "rewards/margins": 76.70948791503906, "rewards/rejected": -77.80644226074219, "step": 2225 }, { "epoch": 1.7998385794995966, "grad_norm": 4.6204579717823435e-08, "learning_rate": 1.55933798921634e-06, "logits/chosen": -2.4800097942352295, "logits/rejected": -1.601954460144043, "logps/chosen": -194.17987060546875, "logps/rejected": -813.6891479492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.11832503974437714, "rewards/margins": 67.49776458740234, "rewards/rejected": -67.6160888671875, "step": 2230 }, { "epoch": 1.8038740920096852, "grad_norm": 1.3645123225769495e-11, "learning_rate": 1.5549822424159672e-06, "logits/chosen": -2.387376308441162, "logits/rejected": -1.6466537714004517, "logps/chosen": -180.13661193847656, "logps/rejected": -863.0101318359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 1.3498878479003906, "rewards/margins": 75.14349365234375, "rewards/rejected": -73.7936019897461, "step": 2235 }, { "epoch": 1.807909604519774, "grad_norm": 0.003801523707807064, "learning_rate": 1.5507119576657964e-06, "logits/chosen": -2.3709187507629395, "logits/rejected": -1.5481702089309692, "logps/chosen": -220.72622680664062, "logps/rejected": -1001.6251220703125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.8153938055038452, "rewards/margins": 85.33943939208984, "rewards/rejected": -86.15483093261719, "step": 2240 }, { "epoch": 1.8119451170298628, "grad_norm": 1.6762128794353681e-12, "learning_rate": 1.5465273115078738e-06, "logits/chosen": -2.345972776412964, "logits/rejected": -1.6707206964492798, "logps/chosen": -216.82894897460938, "logps/rejected": -915.7076416015625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.6842430830001831, "rewards/margins": 76.03483581542969, "rewards/rejected": -76.71907043457031, "step": 2245 }, { "epoch": 1.8159806295399514, "grad_norm": 1.235048063819022e-08, "learning_rate": 1.542428476943779e-06, "logits/chosen": -2.4782257080078125, "logits/rejected": -1.5702565908432007, "logps/chosen": -212.83322143554688, "logps/rejected": -932.3889770507812, "loss": 0.0312, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.119983434677124, "rewards/margins": 78.3243637084961, "rewards/rejected": -79.44435119628906, "step": 2250 }, { "epoch": 1.8200161420500405, "grad_norm": 4.955849966514769e-11, "learning_rate": 1.5384156234274674e-06, "logits/chosen": -2.5281379222869873, "logits/rejected": -1.4845737218856812, "logps/chosen": -188.3463134765625, "logps/rejected": -873.7918701171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.44021672010421753, "rewards/margins": 76.51277160644531, "rewards/rejected": -76.07255554199219, "step": 2255 }, { "epoch": 1.824051654560129, "grad_norm": 8.726830369454319e-11, "learning_rate": 1.5344889168582694e-06, "logits/chosen": -2.4591104984283447, "logits/rejected": -1.5029065608978271, "logps/chosen": -179.2944793701172, "logps/rejected": -815.4078979492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.34952065348625183, "rewards/margins": 70.95982360839844, "rewards/rejected": -70.61029815673828, "step": 2260 }, { "epoch": 1.828087167070218, "grad_norm": 2.3204255183983946e-10, "learning_rate": 1.530648519574028e-06, "logits/chosen": -2.495850086212158, "logits/rejected": -1.655462622642517, "logps/chosen": -182.66537475585938, "logps/rejected": -823.4406127929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.10992805659770966, "rewards/margins": 69.70909118652344, "rewards/rejected": -69.81901550292969, "step": 2265 }, { "epoch": 1.8321226795803067, "grad_norm": 7.799196808126393e-14, "learning_rate": 1.5268945903443903e-06, "logits/chosen": -2.383436441421509, "logits/rejected": -1.579558253288269, "logps/chosen": -211.02163696289062, "logps/rejected": -876.3792114257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.4417477548122406, "rewards/margins": 74.88987731933594, "rewards/rejected": -75.33163452148438, "step": 2270 }, { "epoch": 1.8361581920903953, "grad_norm": 1.3911010364464627e-11, "learning_rate": 1.5232272843642413e-06, "logits/chosen": -2.4409260749816895, "logits/rejected": -1.6151212453842163, "logps/chosen": -213.76712036132812, "logps/rejected": -1009.7847900390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.24899239838123322, "rewards/margins": 84.60908508300781, "rewards/rejected": -84.85807037353516, "step": 2275 }, { "epoch": 1.8401937046004844, "grad_norm": 1.160803719812975e-08, "learning_rate": 1.5196467532472893e-06, "logits/chosen": -2.491570472717285, "logits/rejected": -1.613014817237854, "logps/chosen": -197.33676147460938, "logps/rejected": -914.71533203125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.209885835647583, "rewards/margins": 78.54813385009766, "rewards/rejected": -78.75801086425781, "step": 2280 }, { "epoch": 1.844229217110573, "grad_norm": 1.3560916712479498e-12, "learning_rate": 1.516153145019798e-06, "logits/chosen": -2.5091030597686768, "logits/rejected": -1.4988980293273926, "logps/chosen": -209.3756866455078, "logps/rejected": -886.7674560546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.323325514793396, "rewards/margins": 75.43795013427734, "rewards/rejected": -76.76127624511719, "step": 2285 }, { "epoch": 1.8482647296206618, "grad_norm": 2.73991680038721e-12, "learning_rate": 1.512746604114466e-06, "logits/chosen": -2.401498317718506, "logits/rejected": -1.609773874282837, "logps/chosen": -193.27420043945312, "logps/rejected": -858.9059448242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3177366852760315, "rewards/margins": 73.52757263183594, "rewards/rejected": -73.84529876708984, "step": 2290 }, { "epoch": 1.8523002421307506, "grad_norm": 3.025668882106203e-10, "learning_rate": 1.509427271364456e-06, "logits/chosen": -2.479397773742676, "logits/rejected": -1.469905138015747, "logps/chosen": -202.07891845703125, "logps/rejected": -908.0569458007812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.56782603263855, "rewards/margins": 76.47935485839844, "rewards/rejected": -79.04718017578125, "step": 2295 }, { "epoch": 1.8563357546408394, "grad_norm": 4.678490928000656e-10, "learning_rate": 1.5061952839975717e-06, "logits/chosen": -2.422562837600708, "logits/rejected": -1.5041239261627197, "logps/chosen": -195.02108764648438, "logps/rejected": -901.0309448242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3389101028442383, "rewards/margins": 76.97557067871094, "rewards/rejected": -78.31448364257812, "step": 2300 }, { "epoch": 1.8603712671509283, "grad_norm": 5.00715753581904e-11, "learning_rate": 1.5030507756305867e-06, "logits/chosen": -2.5354232788085938, "logits/rejected": -1.4611945152282715, "logps/chosen": -195.6452178955078, "logps/rejected": -938.2527465820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.9477550983428955, "rewards/margins": 79.07939147949219, "rewards/rejected": -81.02714538574219, "step": 2305 }, { "epoch": 1.8644067796610169, "grad_norm": 0.026829760521650314, "learning_rate": 1.4999938762637178e-06, "logits/chosen": -2.4119842052459717, "logits/rejected": -1.4781947135925293, "logps/chosen": -196.9988250732422, "logps/rejected": -845.21728515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.053754210472107, "rewards/margins": 72.31950378417969, "rewards/rejected": -73.37326049804688, "step": 2310 }, { "epoch": 1.8684422921711057, "grad_norm": 1.1250428855191785e-07, "learning_rate": 1.497024712275253e-06, "logits/chosen": -2.404822587966919, "logits/rejected": -1.535852074623108, "logps/chosen": -203.3873291015625, "logps/rejected": -856.896484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6305880546569824, "rewards/margins": 72.65642547607422, "rewards/rejected": -73.28702545166016, "step": 2315 }, { "epoch": 1.8724778046811945, "grad_norm": 5.29262396102982e-15, "learning_rate": 1.4941434064163247e-06, "logits/chosen": -2.4139046669006348, "logits/rejected": -1.5582201480865479, "logps/chosen": -198.60598754882812, "logps/rejected": -839.0086059570312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.1756175458431244, "rewards/margins": 71.35997772216797, "rewards/rejected": -71.53559112548828, "step": 2320 }, { "epoch": 1.8765133171912833, "grad_norm": 3.715653544393571e-13, "learning_rate": 1.4913500778058363e-06, "logits/chosen": -2.5995001792907715, "logits/rejected": -1.4311624765396118, "logps/chosen": -175.4932098388672, "logps/rejected": -834.4580078125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.3912749290466309, "rewards/margins": 70.7298812866211, "rewards/rejected": -72.12115478515625, "step": 2325 }, { "epoch": 1.8805488297013722, "grad_norm": 5.734391850253928e-10, "learning_rate": 1.488644841925537e-06, "logits/chosen": -2.4160866737365723, "logits/rejected": -1.466683030128479, "logps/chosen": -199.48072814941406, "logps/rejected": -899.3963012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.7429065704345703, "rewards/margins": 75.60400390625, "rewards/rejected": -77.3469009399414, "step": 2330 }, { "epoch": 1.8845843422114608, "grad_norm": 1.2748210690208261e-12, "learning_rate": 1.4860278106152472e-06, "logits/chosen": -2.3627941608428955, "logits/rejected": -1.479870080947876, "logps/chosen": -203.81935119628906, "logps/rejected": -828.3458251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.9774500131607056, "rewards/margins": 69.66748046875, "rewards/rejected": -70.64493560791016, "step": 2335 }, { "epoch": 1.8886198547215496, "grad_norm": 1.003888971951028e-09, "learning_rate": 1.4834990920682367e-06, "logits/chosen": -2.55894136428833, "logits/rejected": -1.5916869640350342, "logps/chosen": -184.75204467773438, "logps/rejected": -871.3912353515625, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.762712836265564, "rewards/margins": 73.69685363769531, "rewards/rejected": -74.4595718383789, "step": 2340 }, { "epoch": 1.8926553672316384, "grad_norm": 4.5260759029552844e-10, "learning_rate": 1.4810587908267487e-06, "logits/chosen": -2.483802318572998, "logits/rejected": -1.548654556274414, "logps/chosen": -198.77725219726562, "logps/rejected": -785.607421875, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9332286715507507, "rewards/margins": 66.09583282470703, "rewards/rejected": -67.029052734375, "step": 2345 }, { "epoch": 1.8966908797417272, "grad_norm": 0.6762046813964844, "learning_rate": 1.4787070077776807e-06, "logits/chosen": -2.432281970977783, "logits/rejected": -1.5511175394058228, "logps/chosen": -192.3115997314453, "logps/rejected": -936.86376953125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": 1.708142876625061, "rewards/margins": 81.94723510742188, "rewards/rejected": -80.23909759521484, "step": 2350 }, { "epoch": 1.900726392251816, "grad_norm": 1.5075913850637335e-08, "learning_rate": 1.4764438401484116e-06, "logits/chosen": -2.4118828773498535, "logits/rejected": -1.4781147241592407, "logps/chosen": -177.75355529785156, "logps/rejected": -991.2161865234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.18818549811840057, "rewards/margins": 84.88606262207031, "rewards/rejected": -85.07425689697266, "step": 2355 }, { "epoch": 1.9047619047619047, "grad_norm": 7.572409496323096e-14, "learning_rate": 1.474269381502784e-06, "logits/chosen": -2.367128849029541, "logits/rejected": -1.4666637182235718, "logps/chosen": -165.86483764648438, "logps/rejected": -752.9413452148438, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.6395973563194275, "rewards/margins": 64.8212661743164, "rewards/rejected": -64.18167114257812, "step": 2360 }, { "epoch": 1.9087974172719937, "grad_norm": 1.967657148327362e-08, "learning_rate": 1.4721837217372333e-06, "logits/chosen": -2.2737574577331543, "logits/rejected": -1.531235933303833, "logps/chosen": -212.21414184570312, "logps/rejected": -921.0654296875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.84429931640625, "rewards/margins": 82.64971923828125, "rewards/rejected": -79.80542755126953, "step": 2365 }, { "epoch": 1.9128329297820823, "grad_norm": 1.0951581685958445e-07, "learning_rate": 1.4701869470770748e-06, "logits/chosen": -2.2953131198883057, "logits/rejected": -1.5066636800765991, "logps/chosen": -199.25428771972656, "logps/rejected": -878.6552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.852959394454956, "rewards/margins": 73.86479949951172, "rewards/rejected": -75.71775817871094, "step": 2370 }, { "epoch": 1.9168684422921711, "grad_norm": 7.57677257891627e-17, "learning_rate": 1.468279140072936e-06, "logits/chosen": -2.3187386989593506, "logits/rejected": -1.4851253032684326, "logps/chosen": -189.5048828125, "logps/rejected": -931.3338012695312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7439261078834534, "rewards/margins": 78.32169342041016, "rewards/rejected": -79.06562042236328, "step": 2375 }, { "epoch": 1.92090395480226, "grad_norm": 6.188864707946777, "learning_rate": 1.4664603795973456e-06, "logits/chosen": -2.3929553031921387, "logits/rejected": -1.6458581686019897, "logps/chosen": -205.9731903076172, "logps/rejected": -844.3062744140625, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0255391597747803, "rewards/margins": 71.34619140625, "rewards/rejected": -72.37173461914062, "step": 2380 }, { "epoch": 1.9249394673123486, "grad_norm": 1.538379312826521e-09, "learning_rate": 1.4647307408414716e-06, "logits/chosen": -2.1848132610321045, "logits/rejected": -1.534245491027832, "logps/chosen": -189.89036560058594, "logps/rejected": -832.71044921875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.841910719871521, "rewards/margins": 72.65717315673828, "rewards/rejected": -71.81526184082031, "step": 2385 }, { "epoch": 1.9289749798224376, "grad_norm": 2.3228279109194228e-12, "learning_rate": 1.463090295312013e-06, "logits/chosen": -2.2082409858703613, "logits/rejected": -1.5119680166244507, "logps/chosen": -197.41903686523438, "logps/rejected": -891.4517822265625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4848177433013916, "rewards/margins": 73.56373596191406, "rewards/rejected": -76.04855346679688, "step": 2390 }, { "epoch": 1.9330104923325262, "grad_norm": 1.9855264099533088e-09, "learning_rate": 1.461539110828244e-06, "logits/chosen": -2.3398635387420654, "logits/rejected": -1.4428939819335938, "logps/chosen": -209.0952606201172, "logps/rejected": -853.8142700195312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.5013784170150757, "rewards/margins": 73.63616943359375, "rewards/rejected": -73.13479614257812, "step": 2395 }, { "epoch": 1.937046004842615, "grad_norm": 1.6247260833804691e-15, "learning_rate": 1.4600772515192106e-06, "logits/chosen": -2.3283591270446777, "logits/rejected": -1.546920657157898, "logps/chosen": -216.7084503173828, "logps/rejected": -994.2333984375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9621613025665283, "rewards/margins": 82.83660888671875, "rewards/rejected": -85.79876708984375, "step": 2400 }, { "epoch": 1.9410815173527038, "grad_norm": 1.073509547921958e-08, "learning_rate": 1.458704777821078e-06, "logits/chosen": -2.3645713329315186, "logits/rejected": -1.4067027568817139, "logps/chosen": -200.91204833984375, "logps/rejected": -918.9840698242188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.800323486328125, "rewards/margins": 78.18113708496094, "rewards/rejected": -78.98146057128906, "step": 2405 }, { "epoch": 1.9451170298627924, "grad_norm": 1.3772864793093476e-11, "learning_rate": 1.4574217464746327e-06, "logits/chosen": -2.310903787612915, "logits/rejected": -1.5039275884628296, "logps/chosen": -214.58486938476562, "logps/rejected": -954.1383666992188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.346979856491089, "rewards/margins": 79.47184753417969, "rewards/rejected": -81.81883239746094, "step": 2410 }, { "epoch": 1.9491525423728815, "grad_norm": 3.1530107953315766e-16, "learning_rate": 1.456228210522938e-06, "logits/chosen": -2.370971202850342, "logits/rejected": -1.4916362762451172, "logps/chosen": -193.93911743164062, "logps/rejected": -913.7125854492188, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.38698410987854, "rewards/margins": 77.61229705810547, "rewards/rejected": -78.9992904663086, "step": 2415 }, { "epoch": 1.95318805488297, "grad_norm": 0.00017189676873385906, "learning_rate": 1.4551242193091386e-06, "logits/chosen": -2.3814711570739746, "logits/rejected": -1.50387704372406, "logps/chosen": -208.2183074951172, "logps/rejected": -895.21142578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7379634380340576, "rewards/margins": 73.07186126708984, "rewards/rejected": -76.80982208251953, "step": 2420 }, { "epoch": 1.957223567393059, "grad_norm": 6.9545963015116286e-06, "learning_rate": 1.454109818474423e-06, "logits/chosen": -2.37473726272583, "logits/rejected": -1.494429588317871, "logps/chosen": -219.91476440429688, "logps/rejected": -959.1448364257812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.8762229681015015, "rewards/margins": 81.72378540039062, "rewards/rejected": -83.60001373291016, "step": 2425 }, { "epoch": 1.9612590799031477, "grad_norm": 1.08978312596264e-07, "learning_rate": 1.4531850499561346e-06, "logits/chosen": -2.1920294761657715, "logits/rejected": -1.5068225860595703, "logps/chosen": -235.8538055419922, "logps/rejected": -892.3212890625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.7669581174850464, "rewards/margins": 75.7994384765625, "rewards/rejected": -76.56639099121094, "step": 2430 }, { "epoch": 1.9652945924132363, "grad_norm": 1.1074299463564863e-11, "learning_rate": 1.4523499519860403e-06, "logits/chosen": -2.343214273452759, "logits/rejected": -1.5643360614776611, "logps/chosen": -210.5376434326172, "logps/rejected": -847.8191528320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2607460021972656, "rewards/margins": 69.53022766113281, "rewards/rejected": -72.79096984863281, "step": 2435 }, { "epoch": 1.9693301049233254, "grad_norm": 2.3133022478513026e-15, "learning_rate": 1.4516045590887472e-06, "logits/chosen": -2.3995468616485596, "logits/rejected": -1.596701979637146, "logps/chosen": -222.043212890625, "logps/rejected": -907.4011840820312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5724616050720215, "rewards/margins": 73.8661117553711, "rewards/rejected": -77.4385757446289, "step": 2440 }, { "epoch": 1.973365617433414, "grad_norm": 1.5661232033448158e-10, "learning_rate": 1.450948902080277e-06, "logits/chosen": -2.359708070755005, "logits/rejected": -1.4867304563522339, "logps/chosen": -217.18637084960938, "logps/rejected": -930.2879028320312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.6154511570930481, "rewards/margins": 81.12036895751953, "rewards/rejected": -81.7358169555664, "step": 2445 }, { "epoch": 1.9774011299435028, "grad_norm": 9.271602152693958e-08, "learning_rate": 1.4503830080667925e-06, "logits/chosen": -2.252596139907837, "logits/rejected": -1.4889358282089233, "logps/chosen": -214.47802734375, "logps/rejected": -975.5628662109375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3346073627471924, "rewards/margins": 83.19673156738281, "rewards/rejected": -84.53134155273438, "step": 2450 }, { "epoch": 1.9814366424535916, "grad_norm": 4.904167003050297e-11, "learning_rate": 1.4499069004434752e-06, "logits/chosen": -2.3398680686950684, "logits/rejected": -1.4827831983566284, "logps/chosen": -243.18582153320312, "logps/rejected": -1041.0335693359375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.123208999633789, "rewards/margins": 87.16325378417969, "rewards/rejected": -90.28645324707031, "step": 2455 }, { "epoch": 1.9854721549636802, "grad_norm": 1.8831811110509022e-12, "learning_rate": 1.4495205988935588e-06, "logits/chosen": -2.3878846168518066, "logits/rejected": -1.3839771747589111, "logps/chosen": -205.76416015625, "logps/rejected": -898.6096801757812, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 0.49735814332962036, "rewards/margins": 78.51222229003906, "rewards/rejected": -78.01486206054688, "step": 2460 }, { "epoch": 1.9895076674737693, "grad_norm": 3.8299350535453414e-07, "learning_rate": 1.449224119387517e-06, "logits/chosen": -2.297084093093872, "logits/rejected": -1.427427053451538, "logps/chosen": -186.83853149414062, "logps/rejected": -814.7015380859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.4254884719848633, "rewards/margins": 67.08326721191406, "rewards/rejected": -69.50875854492188, "step": 2465 }, { "epoch": 1.9935431799838579, "grad_norm": 2.649472655158469e-11, "learning_rate": 1.4490174741824002e-06, "logits/chosen": -2.300013303756714, "logits/rejected": -1.4767221212387085, "logps/chosen": -211.55496215820312, "logps/rejected": -890.3798828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.835497260093689, "rewards/margins": 76.04972839355469, "rewards/rejected": -76.88522338867188, "step": 2470 }, { "epoch": 1.9975786924939467, "grad_norm": 2.895541366728239e-16, "learning_rate": 1.4489006718213316e-06, "logits/chosen": -2.3112311363220215, "logits/rejected": -1.6639906167984009, "logps/chosen": -214.3192138671875, "logps/rejected": -874.0631103515625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -0.3078431189060211, "rewards/margins": 73.70103454589844, "rewards/rejected": -74.00887298583984, "step": 2475 }, { "epoch": 1.9983857949959645, "eval_logits/chosen": -2.316248655319214, "eval_logits/rejected": -1.1027339696884155, "eval_logps/chosen": -135.53904724121094, "eval_logps/rejected": -1003.654296875, "eval_loss": 1.6357729691662826e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.6742682456970215, "eval_rewards/margins": 91.90789031982422, "eval_rewards/rejected": -93.58216857910156, "eval_runtime": 19.6333, "eval_samples_per_second": 10.187, "eval_steps_per_second": 10.187, "step": 2476 } ], "logging_steps": 5, "max_steps": 2478, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }