{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 6736, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009501187648456057, "grad_norm": 54.75, "learning_rate": 9.980997624703087e-07, "logits/chosen": 0.09168118238449097, "logits/rejected": 0.12129797041416168, "logps/chosen": -36.49408721923828, "logps/rejected": -49.25672149658203, "loss": 0.6922, "rewards/accuracies": 0.515625, "rewards/chosen": 0.008192603476345539, "rewards/margins": 0.0024515376426279545, "rewards/rejected": 0.005741065833717585, "step": 16 }, { "epoch": 0.019002375296912115, "grad_norm": 59.75, "learning_rate": 9.961995249406174e-07, "logits/chosen": 0.08897414803504944, "logits/rejected": 0.1399449110031128, "logps/chosen": -37.52133560180664, "logps/rejected": -51.12624740600586, "loss": 0.6829, "rewards/accuracies": 0.671875, "rewards/chosen": 0.017915938049554825, "rewards/margins": 0.021237602457404137, "rewards/rejected": -0.003321664407849312, "step": 32 }, { "epoch": 0.028503562945368172, "grad_norm": 63.75, "learning_rate": 9.942992874109262e-07, "logits/chosen": 0.09679778665304184, "logits/rejected": 0.16931985318660736, "logps/chosen": -36.65215301513672, "logps/rejected": -49.573421478271484, "loss": 0.6798, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03151966631412506, "rewards/margins": 0.027938464656472206, "rewards/rejected": 0.003581203753128648, "step": 48 }, { "epoch": 0.03800475059382423, "grad_norm": 63.25, "learning_rate": 9.923990498812351e-07, "logits/chosen": 0.16125047206878662, "logits/rejected": 0.24480265378952026, "logps/chosen": -38.162559509277344, "logps/rejected": -52.15303039550781, "loss": 0.6703, "rewards/accuracies": 0.734375, "rewards/chosen": 0.03711671382188797, "rewards/margins": 0.047640666365623474, "rewards/rejected": -0.010523954406380653, "step": 64 }, { "epoch": 0.047505938242280284, "grad_norm": 64.5, "learning_rate": 9.904988123515439e-07, "logits/chosen": 0.06547170132398605, "logits/rejected": 0.13260145485401154, "logps/chosen": -36.432579040527344, "logps/rejected": -48.84739303588867, "loss": 0.6626, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.0562339723110199, "rewards/margins": 0.0634559616446495, "rewards/rejected": -0.00722198560833931, "step": 80 }, { "epoch": 0.057007125890736345, "grad_norm": 55.25, "learning_rate": 9.885985748218526e-07, "logits/chosen": 0.07628384232521057, "logits/rejected": 0.13675282895565033, "logps/chosen": -35.15069580078125, "logps/rejected": -47.46355438232422, "loss": 0.6585, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06429558247327805, "rewards/margins": 0.0723925530910492, "rewards/rejected": -0.008096965961158276, "step": 96 }, { "epoch": 0.0665083135391924, "grad_norm": 50.75, "learning_rate": 9.866983372921614e-07, "logits/chosen": 0.11520403623580933, "logits/rejected": 0.16309119760990143, "logps/chosen": -36.81483840942383, "logps/rejected": -50.9425163269043, "loss": 0.6533, "rewards/accuracies": 0.796875, "rewards/chosen": 0.06526956707239151, "rewards/margins": 0.08356323093175888, "rewards/rejected": -0.018293654546141624, "step": 112 }, { "epoch": 0.07600950118764846, "grad_norm": 70.5, "learning_rate": 9.847980997624703e-07, "logits/chosen": 0.08769215643405914, "logits/rejected": 0.17430077493190765, "logps/chosen": -37.91537094116211, "logps/rejected": -52.815677642822266, "loss": 0.6492, "rewards/accuracies": 0.8046875, "rewards/chosen": 0.058046698570251465, "rewards/margins": 0.09332208335399628, "rewards/rejected": -0.035275377333164215, "step": 128 }, { "epoch": 0.0855106888361045, "grad_norm": 56.5, "learning_rate": 9.82897862232779e-07, "logits/chosen": 0.1254298835992813, "logits/rejected": 0.1748633235692978, "logps/chosen": -36.702362060546875, "logps/rejected": -50.70151901245117, "loss": 0.6422, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.06117745488882065, "rewards/margins": 0.10814622789621353, "rewards/rejected": -0.04696878045797348, "step": 144 }, { "epoch": 0.09501187648456057, "grad_norm": 61.75, "learning_rate": 9.809976247030878e-07, "logits/chosen": 0.09889174997806549, "logits/rejected": 0.09486458450555801, "logps/chosen": -37.1884651184082, "logps/rejected": -48.955718994140625, "loss": 0.6406, "rewards/accuracies": 0.765625, "rewards/chosen": 0.08231806755065918, "rewards/margins": 0.11266255378723145, "rewards/rejected": -0.030344482511281967, "step": 160 }, { "epoch": 0.10451306413301663, "grad_norm": 61.0, "learning_rate": 9.790973871733966e-07, "logits/chosen": 0.12300257384777069, "logits/rejected": 0.1372351050376892, "logps/chosen": -37.07274627685547, "logps/rejected": -51.80104446411133, "loss": 0.6329, "rewards/accuracies": 0.84375, "rewards/chosen": 0.07423266023397446, "rewards/margins": 0.1288163661956787, "rewards/rejected": -0.05458369851112366, "step": 176 }, { "epoch": 0.11401425178147269, "grad_norm": 56.25, "learning_rate": 9.771971496437053e-07, "logits/chosen": 0.14542920887470245, "logits/rejected": 0.1965111494064331, "logps/chosen": -35.992820739746094, "logps/rejected": -49.162193298339844, "loss": 0.6215, "rewards/accuracies": 0.859375, "rewards/chosen": 0.09972672164440155, "rewards/margins": 0.1545056849718094, "rewards/rejected": -0.05477897450327873, "step": 192 }, { "epoch": 0.12351543942992874, "grad_norm": 73.5, "learning_rate": 9.752969121140143e-07, "logits/chosen": 0.09255525469779968, "logits/rejected": 0.19868624210357666, "logps/chosen": -35.94331741333008, "logps/rejected": -51.20383071899414, "loss": 0.6101, "rewards/accuracies": 0.875, "rewards/chosen": 0.10126903653144836, "rewards/margins": 0.17989249527454376, "rewards/rejected": -0.0786234438419342, "step": 208 }, { "epoch": 0.1330166270783848, "grad_norm": 58.25, "learning_rate": 9.73396674584323e-07, "logits/chosen": 0.13323335349559784, "logits/rejected": 0.2155267596244812, "logps/chosen": -38.20836639404297, "logps/rejected": -52.9131965637207, "loss": 0.6177, "rewards/accuracies": 0.875, "rewards/chosen": 0.05846577137708664, "rewards/margins": 0.16329729557037354, "rewards/rejected": -0.1048315167427063, "step": 224 }, { "epoch": 0.14251781472684086, "grad_norm": 54.75, "learning_rate": 9.714964370546317e-07, "logits/chosen": 0.09854070097208023, "logits/rejected": 0.20600098371505737, "logps/chosen": -35.198726654052734, "logps/rejected": -50.93071365356445, "loss": 0.6034, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.0857614204287529, "rewards/margins": 0.19499169290065765, "rewards/rejected": -0.10923026502132416, "step": 240 }, { "epoch": 0.15201900237529692, "grad_norm": 60.25, "learning_rate": 9.695961995249405e-07, "logits/chosen": 0.11161897331476212, "logits/rejected": 0.14389154314994812, "logps/chosen": -37.637821197509766, "logps/rejected": -51.35750198364258, "loss": 0.6147, "rewards/accuracies": 0.828125, "rewards/chosen": 0.03235001862049103, "rewards/margins": 0.17306238412857056, "rewards/rejected": -0.14071235060691833, "step": 256 }, { "epoch": 0.16152019002375298, "grad_norm": 61.5, "learning_rate": 9.676959619952494e-07, "logits/chosen": 0.045116275548934937, "logits/rejected": 0.12899161875247955, "logps/chosen": -36.590667724609375, "logps/rejected": -52.80071258544922, "loss": 0.6001, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.04269418492913246, "rewards/margins": 0.20594461262226105, "rewards/rejected": -0.16325044631958008, "step": 272 }, { "epoch": 0.171021377672209, "grad_norm": 57.0, "learning_rate": 9.657957244655582e-07, "logits/chosen": 0.10446017980575562, "logits/rejected": 0.1502765715122223, "logps/chosen": -36.7647705078125, "logps/rejected": -50.46417236328125, "loss": 0.5972, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.0390743762254715, "rewards/margins": 0.21257053315639496, "rewards/rejected": -0.17349614202976227, "step": 288 }, { "epoch": 0.18052256532066507, "grad_norm": 55.0, "learning_rate": 9.63895486935867e-07, "logits/chosen": 0.04691855609416962, "logits/rejected": 0.17598745226860046, "logps/chosen": -35.485595703125, "logps/rejected": -51.60365676879883, "loss": 0.582, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.03419807553291321, "rewards/margins": 0.2475091964006424, "rewards/rejected": -0.2133111208677292, "step": 304 }, { "epoch": 0.19002375296912113, "grad_norm": 59.25, "learning_rate": 9.619952494061757e-07, "logits/chosen": 0.15173594653606415, "logits/rejected": 0.22723865509033203, "logps/chosen": -38.54058837890625, "logps/rejected": -53.069339752197266, "loss": 0.5939, "rewards/accuracies": 0.859375, "rewards/chosen": -0.0007697370601817966, "rewards/margins": 0.22217878699302673, "rewards/rejected": -0.22294853627681732, "step": 320 }, { "epoch": 0.1995249406175772, "grad_norm": 54.5, "learning_rate": 9.600950118764846e-07, "logits/chosen": 0.13400091230869293, "logits/rejected": 0.1886436641216278, "logps/chosen": -37.70890808105469, "logps/rejected": -53.3675651550293, "loss": 0.5775, "rewards/accuracies": 0.859375, "rewards/chosen": -0.012358499690890312, "rewards/margins": 0.26150980591773987, "rewards/rejected": -0.2738683223724365, "step": 336 }, { "epoch": 0.20902612826603326, "grad_norm": 59.0, "learning_rate": 9.581947743467934e-07, "logits/chosen": 0.15047906339168549, "logits/rejected": 0.2155291885137558, "logps/chosen": -36.07099533081055, "logps/rejected": -51.531551361083984, "loss": 0.577, "rewards/accuracies": 0.8984375, "rewards/chosen": 0.0014424873515963554, "rewards/margins": 0.26147347688674927, "rewards/rejected": -0.26003098487854004, "step": 352 }, { "epoch": 0.21852731591448932, "grad_norm": 61.25, "learning_rate": 9.562945368171021e-07, "logits/chosen": 0.11939611285924911, "logits/rejected": 0.20481155812740326, "logps/chosen": -37.29627990722656, "logps/rejected": -53.07925033569336, "loss": 0.5697, "rewards/accuracies": 0.8359375, "rewards/chosen": 0.005910781677812338, "rewards/margins": 0.28491517901420593, "rewards/rejected": -0.27900439500808716, "step": 368 }, { "epoch": 0.22802850356294538, "grad_norm": 55.75, "learning_rate": 9.543942992874109e-07, "logits/chosen": 0.10844097286462784, "logits/rejected": 0.21932217478752136, "logps/chosen": -37.1666259765625, "logps/rejected": -53.12284851074219, "loss": 0.5754, "rewards/accuracies": 0.828125, "rewards/chosen": -0.007143110036849976, "rewards/margins": 0.2737848162651062, "rewards/rejected": -0.28092795610427856, "step": 384 }, { "epoch": 0.2375296912114014, "grad_norm": 54.75, "learning_rate": 9.524940617577196e-07, "logits/chosen": 0.10385459661483765, "logits/rejected": 0.13318760693073273, "logps/chosen": -35.50322341918945, "logps/rejected": -50.462791442871094, "loss": 0.5679, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0004334240220487118, "rewards/margins": 0.2884422540664673, "rewards/rejected": -0.28800880908966064, "step": 400 }, { "epoch": 0.24703087885985747, "grad_norm": 54.25, "learning_rate": 9.505938242280285e-07, "logits/chosen": 0.08173692971467972, "logits/rejected": 0.14498800039291382, "logps/chosen": -35.70138931274414, "logps/rejected": -54.012001037597656, "loss": 0.5657, "rewards/accuracies": 0.859375, "rewards/chosen": -0.03053681179881096, "rewards/margins": 0.29130610823631287, "rewards/rejected": -0.3218429386615753, "step": 416 }, { "epoch": 0.25653206650831356, "grad_norm": 63.75, "learning_rate": 9.486935866983372e-07, "logits/chosen": 0.08363573253154755, "logits/rejected": 0.18085773289203644, "logps/chosen": -39.11247634887695, "logps/rejected": -55.508888244628906, "loss": 0.5693, "rewards/accuracies": 0.875, "rewards/chosen": -0.05329626053571701, "rewards/margins": 0.2859145998954773, "rewards/rejected": -0.3392108976840973, "step": 432 }, { "epoch": 0.2660332541567696, "grad_norm": 65.5, "learning_rate": 9.467933491686461e-07, "logits/chosen": 0.07687534391880035, "logits/rejected": 0.2082725167274475, "logps/chosen": -37.038570404052734, "logps/rejected": -54.50634765625, "loss": 0.5379, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.019455045461654663, "rewards/margins": 0.36107519268989563, "rewards/rejected": -0.3805302679538727, "step": 448 }, { "epoch": 0.2755344418052256, "grad_norm": 63.25, "learning_rate": 9.448931116389548e-07, "logits/chosen": 0.10213326662778854, "logits/rejected": 0.20626387000083923, "logps/chosen": -37.784019470214844, "logps/rejected": -54.457462310791016, "loss": 0.5493, "rewards/accuracies": 0.8515625, "rewards/chosen": -0.04002974182367325, "rewards/margins": 0.3406682312488556, "rewards/rejected": -0.380698025226593, "step": 464 }, { "epoch": 0.2850356294536817, "grad_norm": 61.75, "learning_rate": 9.429928741092636e-07, "logits/chosen": 0.13567008078098297, "logits/rejected": 0.24827228486537933, "logps/chosen": -38.69795227050781, "logps/rejected": -54.621788024902344, "loss": 0.5563, "rewards/accuracies": 0.8125, "rewards/chosen": -0.040405262261629105, "rewards/margins": 0.3285280168056488, "rewards/rejected": -0.3689332604408264, "step": 480 }, { "epoch": 0.29453681710213775, "grad_norm": 58.0, "learning_rate": 9.410926365795724e-07, "logits/chosen": 0.10663773119449615, "logits/rejected": 0.16733931005001068, "logps/chosen": -38.08431625366211, "logps/rejected": -54.35865020751953, "loss": 0.5339, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.04221474006772041, "rewards/margins": 0.37557515501976013, "rewards/rejected": -0.41778990626335144, "step": 496 }, { "epoch": 0.30403800475059384, "grad_norm": 70.0, "learning_rate": 9.391923990498812e-07, "logits/chosen": 0.1082092821598053, "logits/rejected": 0.23803919553756714, "logps/chosen": -38.825382232666016, "logps/rejected": -55.53346252441406, "loss": 0.5598, "rewards/accuracies": 0.8515625, "rewards/chosen": -0.09616291522979736, "rewards/margins": 0.31281211972236633, "rewards/rejected": -0.4089750051498413, "step": 512 }, { "epoch": 0.31353919239904987, "grad_norm": 50.25, "learning_rate": 9.3729216152019e-07, "logits/chosen": 0.07085791230201721, "logits/rejected": 0.15105192363262177, "logps/chosen": -39.11968994140625, "logps/rejected": -55.4873046875, "loss": 0.5426, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.10313617438077927, "rewards/margins": 0.351255863904953, "rewards/rejected": -0.45439204573631287, "step": 528 }, { "epoch": 0.32304038004750596, "grad_norm": 51.25, "learning_rate": 9.353919239904988e-07, "logits/chosen": 0.10827435553073883, "logits/rejected": 0.17179493606090546, "logps/chosen": -37.49391174316406, "logps/rejected": -55.33747100830078, "loss": 0.5326, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.05370311439037323, "rewards/margins": 0.38469013571739197, "rewards/rejected": -0.438393235206604, "step": 544 }, { "epoch": 0.332541567695962, "grad_norm": 65.5, "learning_rate": 9.334916864608076e-07, "logits/chosen": 0.14238286018371582, "logits/rejected": 0.19145098328590393, "logps/chosen": -37.788291931152344, "logps/rejected": -51.97111892700195, "loss": 0.553, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.08003263175487518, "rewards/margins": 0.33586975932121277, "rewards/rejected": -0.41590237617492676, "step": 560 }, { "epoch": 0.342042755344418, "grad_norm": 52.75, "learning_rate": 9.315914489311163e-07, "logits/chosen": 0.16290903091430664, "logits/rejected": 0.1991579532623291, "logps/chosen": -37.21266174316406, "logps/rejected": -55.373538970947266, "loss": 0.524, "rewards/accuracies": 0.9375, "rewards/chosen": -0.07777812331914902, "rewards/margins": 0.40286707878112793, "rewards/rejected": -0.48064517974853516, "step": 576 }, { "epoch": 0.3515439429928741, "grad_norm": 51.75, "learning_rate": 9.296912114014252e-07, "logits/chosen": 0.08858010172843933, "logits/rejected": 0.16938555240631104, "logps/chosen": -37.08319091796875, "logps/rejected": -53.559356689453125, "loss": 0.5302, "rewards/accuracies": 0.890625, "rewards/chosen": -0.08191077411174774, "rewards/margins": 0.3878284990787506, "rewards/rejected": -0.46973931789398193, "step": 592 }, { "epoch": 0.36104513064133015, "grad_norm": 57.0, "learning_rate": 9.277909738717339e-07, "logits/chosen": 0.1242557018995285, "logits/rejected": 0.12163582444190979, "logps/chosen": -40.13254165649414, "logps/rejected": -55.82794952392578, "loss": 0.5285, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.10593372583389282, "rewards/margins": 0.4058433473110199, "rewards/rejected": -0.5117770433425903, "step": 608 }, { "epoch": 0.37054631828978624, "grad_norm": 54.25, "learning_rate": 9.258907363420428e-07, "logits/chosen": 0.12377818673849106, "logits/rejected": 0.218975231051445, "logps/chosen": -38.33209228515625, "logps/rejected": -54.284889221191406, "loss": 0.5255, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.10532137006521225, "rewards/margins": 0.40542417764663696, "rewards/rejected": -0.5107455253601074, "step": 624 }, { "epoch": 0.38004750593824227, "grad_norm": 61.25, "learning_rate": 9.239904988123515e-07, "logits/chosen": 0.14407610893249512, "logits/rejected": 0.13527539372444153, "logps/chosen": -39.09244918823242, "logps/rejected": -56.88185501098633, "loss": 0.5161, "rewards/accuracies": 0.859375, "rewards/chosen": -0.12786737084388733, "rewards/margins": 0.4305534064769745, "rewards/rejected": -0.5584207773208618, "step": 640 }, { "epoch": 0.38954869358669836, "grad_norm": 60.5, "learning_rate": 9.220902612826604e-07, "logits/chosen": 0.08770464360713959, "logits/rejected": 0.1615772247314453, "logps/chosen": -40.214927673339844, "logps/rejected": -57.730621337890625, "loss": 0.4933, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.10945181548595428, "rewards/margins": 0.4860302805900574, "rewards/rejected": -0.5954821109771729, "step": 656 }, { "epoch": 0.3990498812351544, "grad_norm": 49.5, "learning_rate": 9.201900237529691e-07, "logits/chosen": 0.1510995626449585, "logits/rejected": 0.20847982168197632, "logps/chosen": -37.78511428833008, "logps/rejected": -55.67912292480469, "loss": 0.4915, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.07964207231998444, "rewards/margins": 0.5031406283378601, "rewards/rejected": -0.5827827453613281, "step": 672 }, { "epoch": 0.4085510688836104, "grad_norm": 45.75, "learning_rate": 9.18289786223278e-07, "logits/chosen": 0.08455921709537506, "logits/rejected": 0.19820302724838257, "logps/chosen": -37.23653793334961, "logps/rejected": -52.45553207397461, "loss": 0.5195, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.10724642127752304, "rewards/margins": 0.42769795656204224, "rewards/rejected": -0.5349443554878235, "step": 688 }, { "epoch": 0.4180522565320665, "grad_norm": 49.75, "learning_rate": 9.163895486935866e-07, "logits/chosen": 0.0849796012043953, "logits/rejected": 0.1387515664100647, "logps/chosen": -36.640594482421875, "logps/rejected": -56.54729080200195, "loss": 0.4855, "rewards/accuracies": 0.921875, "rewards/chosen": -0.07652697712182999, "rewards/margins": 0.5180978775024414, "rewards/rejected": -0.5946248173713684, "step": 704 }, { "epoch": 0.42755344418052255, "grad_norm": 50.25, "learning_rate": 9.144893111638954e-07, "logits/chosen": 0.1335923671722412, "logits/rejected": 0.12739677727222443, "logps/chosen": -38.18653106689453, "logps/rejected": -56.48445510864258, "loss": 0.4906, "rewards/accuracies": 0.921875, "rewards/chosen": -0.10945376753807068, "rewards/margins": 0.4993862211704254, "rewards/rejected": -0.6088399887084961, "step": 720 }, { "epoch": 0.43705463182897863, "grad_norm": 47.75, "learning_rate": 9.125890736342042e-07, "logits/chosen": 0.05496565252542496, "logits/rejected": 0.15505348145961761, "logps/chosen": -38.44123077392578, "logps/rejected": -58.174560546875, "loss": 0.4734, "rewards/accuracies": 0.953125, "rewards/chosen": -0.1328837275505066, "rewards/margins": 0.5390680432319641, "rewards/rejected": -0.6719517707824707, "step": 736 }, { "epoch": 0.44655581947743467, "grad_norm": 51.0, "learning_rate": 9.106888361045129e-07, "logits/chosen": 0.17514106631278992, "logits/rejected": 0.21084193885326385, "logps/chosen": -39.44230270385742, "logps/rejected": -55.33711624145508, "loss": 0.4927, "rewards/accuracies": 0.921875, "rewards/chosen": -0.13082744181156158, "rewards/margins": 0.4961455464363098, "rewards/rejected": -0.626973032951355, "step": 752 }, { "epoch": 0.45605700712589076, "grad_norm": 53.5, "learning_rate": 9.087885985748218e-07, "logits/chosen": 0.05614431947469711, "logits/rejected": 0.16487029194831848, "logps/chosen": -38.37213897705078, "logps/rejected": -56.517974853515625, "loss": 0.4808, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.13590717315673828, "rewards/margins": 0.5437510013580322, "rewards/rejected": -0.6796582341194153, "step": 768 }, { "epoch": 0.4655581947743468, "grad_norm": 56.5, "learning_rate": 9.068883610451305e-07, "logits/chosen": 0.09643438458442688, "logits/rejected": 0.1764623522758484, "logps/chosen": -37.71235275268555, "logps/rejected": -55.301570892333984, "loss": 0.4736, "rewards/accuracies": 0.890625, "rewards/chosen": -0.10847848653793335, "rewards/margins": 0.5602878928184509, "rewards/rejected": -0.6687663793563843, "step": 784 }, { "epoch": 0.4750593824228028, "grad_norm": 50.5, "learning_rate": 9.049881235154394e-07, "logits/chosen": 0.10280097275972366, "logits/rejected": 0.24883826076984406, "logps/chosen": -37.97513198852539, "logps/rejected": -55.975555419921875, "loss": 0.4709, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.1473047137260437, "rewards/margins": 0.5804386734962463, "rewards/rejected": -0.72774338722229, "step": 800 }, { "epoch": 0.4845605700712589, "grad_norm": 57.0, "learning_rate": 9.030878859857481e-07, "logits/chosen": 0.13920438289642334, "logits/rejected": 0.2100549191236496, "logps/chosen": -38.78316116333008, "logps/rejected": -57.39037322998047, "loss": 0.4858, "rewards/accuracies": 0.921875, "rewards/chosen": -0.19835878908634186, "rewards/margins": 0.5184602737426758, "rewards/rejected": -0.716819167137146, "step": 816 }, { "epoch": 0.49406175771971494, "grad_norm": 47.25, "learning_rate": 9.01187648456057e-07, "logits/chosen": 0.04185810685157776, "logits/rejected": 0.16661542654037476, "logps/chosen": -38.728607177734375, "logps/rejected": -56.995845794677734, "loss": 0.4705, "rewards/accuracies": 0.921875, "rewards/chosen": -0.18127599358558655, "rewards/margins": 0.5616625547409058, "rewards/rejected": -0.7429385185241699, "step": 832 }, { "epoch": 0.503562945368171, "grad_norm": 55.5, "learning_rate": 8.992874109263657e-07, "logits/chosen": 0.09122167527675629, "logits/rejected": 0.22163383662700653, "logps/chosen": -37.88745880126953, "logps/rejected": -58.1086311340332, "loss": 0.502, "rewards/accuracies": 0.890625, "rewards/chosen": -0.20832395553588867, "rewards/margins": 0.4904058575630188, "rewards/rejected": -0.6987298130989075, "step": 848 }, { "epoch": 0.5130641330166271, "grad_norm": 53.75, "learning_rate": 8.973871733966746e-07, "logits/chosen": 0.0702139213681221, "logits/rejected": 0.1391507089138031, "logps/chosen": -40.5086784362793, "logps/rejected": -56.50019454956055, "loss": 0.4854, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.26017051935195923, "rewards/margins": 0.5463624000549316, "rewards/rejected": -0.8065329194068909, "step": 864 }, { "epoch": 0.5225653206650831, "grad_norm": 46.0, "learning_rate": 8.954869358669833e-07, "logits/chosen": 0.1008288711309433, "logits/rejected": 0.20593173801898956, "logps/chosen": -38.77046585083008, "logps/rejected": -56.287872314453125, "loss": 0.4734, "rewards/accuracies": 0.859375, "rewards/chosen": -0.18773072957992554, "rewards/margins": 0.567348301410675, "rewards/rejected": -0.7550791501998901, "step": 880 }, { "epoch": 0.5320665083135392, "grad_norm": 56.0, "learning_rate": 8.935866983372922e-07, "logits/chosen": 0.061835043132305145, "logits/rejected": 0.16514989733695984, "logps/chosen": -40.11195373535156, "logps/rejected": -58.53046798706055, "loss": 0.4531, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.1927071213722229, "rewards/margins": 0.6243357062339783, "rewards/rejected": -0.817042887210846, "step": 896 }, { "epoch": 0.5415676959619953, "grad_norm": 49.5, "learning_rate": 8.916864608076009e-07, "logits/chosen": 0.14350858330726624, "logits/rejected": 0.1577494591474533, "logps/chosen": -39.235504150390625, "logps/rejected": -58.71210861206055, "loss": 0.4787, "rewards/accuracies": 0.921875, "rewards/chosen": -0.22734667360782623, "rewards/margins": 0.5474963784217834, "rewards/rejected": -0.7748430967330933, "step": 912 }, { "epoch": 0.5510688836104513, "grad_norm": 38.25, "learning_rate": 8.897862232779097e-07, "logits/chosen": 0.07540839910507202, "logits/rejected": 0.12306182831525803, "logps/chosen": -37.44085693359375, "logps/rejected": -57.793243408203125, "loss": 0.4301, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.15863735973834991, "rewards/margins": 0.693757176399231, "rewards/rejected": -0.8523945808410645, "step": 928 }, { "epoch": 0.5605700712589073, "grad_norm": 47.25, "learning_rate": 8.878859857482185e-07, "logits/chosen": 0.005182682536542416, "logits/rejected": 0.1266525387763977, "logps/chosen": -38.00994873046875, "logps/rejected": -57.80335235595703, "loss": 0.4626, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.16963782906532288, "rewards/margins": 0.6042665839195251, "rewards/rejected": -0.7739044427871704, "step": 944 }, { "epoch": 0.5700712589073634, "grad_norm": 55.5, "learning_rate": 8.859857482185272e-07, "logits/chosen": 0.11473742127418518, "logits/rejected": 0.16739153861999512, "logps/chosen": -41.33670425415039, "logps/rejected": -59.52238082885742, "loss": 0.4705, "rewards/accuracies": 0.859375, "rewards/chosen": -0.25219887495040894, "rewards/margins": 0.5897648334503174, "rewards/rejected": -0.8419637680053711, "step": 960 }, { "epoch": 0.5795724465558195, "grad_norm": 46.5, "learning_rate": 8.840855106888361e-07, "logits/chosen": 0.03330547362565994, "logits/rejected": 0.15450525283813477, "logps/chosen": -38.50295639038086, "logps/rejected": -55.91643142700195, "loss": 0.4555, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.1985245943069458, "rewards/margins": 0.6212279200553894, "rewards/rejected": -0.8197525143623352, "step": 976 }, { "epoch": 0.5890736342042755, "grad_norm": 49.25, "learning_rate": 8.821852731591448e-07, "logits/chosen": 0.10139049589633942, "logits/rejected": 0.19957150518894196, "logps/chosen": -39.53318405151367, "logps/rejected": -60.60167694091797, "loss": 0.4703, "rewards/accuracies": 0.890625, "rewards/chosen": -0.23673467338085175, "rewards/margins": 0.5786986351013184, "rewards/rejected": -0.8154332637786865, "step": 992 }, { "epoch": 0.5985748218527316, "grad_norm": 42.5, "learning_rate": 8.802850356294537e-07, "logits/chosen": 0.08935532718896866, "logits/rejected": 0.18748882412910461, "logps/chosen": -39.9503059387207, "logps/rejected": -58.329071044921875, "loss": 0.4571, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.23434796929359436, "rewards/margins": 0.6296550631523132, "rewards/rejected": -0.86400306224823, "step": 1008 }, { "epoch": 0.6080760095011877, "grad_norm": 48.0, "learning_rate": 8.783847980997624e-07, "logits/chosen": 0.12793917953968048, "logits/rejected": 0.26207882165908813, "logps/chosen": -40.79100036621094, "logps/rejected": -63.207977294921875, "loss": 0.4433, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2365538626909256, "rewards/margins": 0.6767521500587463, "rewards/rejected": -0.9133059978485107, "step": 1024 }, { "epoch": 0.6175771971496437, "grad_norm": 44.75, "learning_rate": 8.764845605700713e-07, "logits/chosen": 0.07575452327728271, "logits/rejected": 0.13339047133922577, "logps/chosen": -38.435760498046875, "logps/rejected": -56.613590240478516, "loss": 0.4691, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.20284534990787506, "rewards/margins": 0.6009599566459656, "rewards/rejected": -0.8038052916526794, "step": 1040 }, { "epoch": 0.6270783847980997, "grad_norm": 49.75, "learning_rate": 8.7458432304038e-07, "logits/chosen": 0.08708982169628143, "logits/rejected": 0.1805320382118225, "logps/chosen": -38.411643981933594, "logps/rejected": -62.70646286010742, "loss": 0.4134, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2167506217956543, "rewards/margins": 0.7745501399040222, "rewards/rejected": -0.9913008213043213, "step": 1056 }, { "epoch": 0.6365795724465558, "grad_norm": 74.5, "learning_rate": 8.726840855106889e-07, "logits/chosen": 0.10599172860383987, "logits/rejected": 0.17720839381217957, "logps/chosen": -41.43864059448242, "logps/rejected": -60.35655212402344, "loss": 0.4606, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.24390998482704163, "rewards/margins": 0.6325302124023438, "rewards/rejected": -0.8764402270317078, "step": 1072 }, { "epoch": 0.6460807600950119, "grad_norm": 54.5, "learning_rate": 8.707838479809976e-07, "logits/chosen": 0.1116759404540062, "logits/rejected": 0.12336824834346771, "logps/chosen": -40.634971618652344, "logps/rejected": -57.01872253417969, "loss": 0.4681, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.251184344291687, "rewards/margins": 0.6177032589912415, "rewards/rejected": -0.8688876628875732, "step": 1088 }, { "epoch": 0.6555819477434679, "grad_norm": 40.0, "learning_rate": 8.688836104513065e-07, "logits/chosen": 0.05607762932777405, "logits/rejected": 0.16543659567832947, "logps/chosen": -40.00341796875, "logps/rejected": -58.64830780029297, "loss": 0.4405, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2381763756275177, "rewards/margins": 0.6710238456726074, "rewards/rejected": -0.909200131893158, "step": 1104 }, { "epoch": 0.665083135391924, "grad_norm": 50.25, "learning_rate": 8.669833729216152e-07, "logits/chosen": 0.049548353999853134, "logits/rejected": 0.13943257927894592, "logps/chosen": -39.979637145996094, "logps/rejected": -59.84418487548828, "loss": 0.4359, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.2502007484436035, "rewards/margins": 0.7026790380477905, "rewards/rejected": -0.9528796672821045, "step": 1120 }, { "epoch": 0.6745843230403801, "grad_norm": 46.5, "learning_rate": 8.65083135391924e-07, "logits/chosen": 0.08080411702394485, "logits/rejected": 0.15538814663887024, "logps/chosen": -37.723243713378906, "logps/rejected": -58.846256256103516, "loss": 0.422, "rewards/accuracies": 0.921875, "rewards/chosen": -0.18230988085269928, "rewards/margins": 0.7491470575332642, "rewards/rejected": -0.931456983089447, "step": 1136 }, { "epoch": 0.684085510688836, "grad_norm": 38.5, "learning_rate": 8.631828978622328e-07, "logits/chosen": 0.10914994031190872, "logits/rejected": 0.22802668809890747, "logps/chosen": -40.43394088745117, "logps/rejected": -58.36912536621094, "loss": 0.4662, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.2760140001773834, "rewards/margins": 0.613175094127655, "rewards/rejected": -0.8891890048980713, "step": 1152 }, { "epoch": 0.6935866983372921, "grad_norm": 50.0, "learning_rate": 8.612826603325414e-07, "logits/chosen": 0.09185300767421722, "logits/rejected": 0.1087762862443924, "logps/chosen": -38.46809768676758, "logps/rejected": -57.867942810058594, "loss": 0.4468, "rewards/accuracies": 0.890625, "rewards/chosen": -0.26649224758148193, "rewards/margins": 0.6601444482803345, "rewards/rejected": -0.9266366362571716, "step": 1168 }, { "epoch": 0.7030878859857482, "grad_norm": 48.75, "learning_rate": 8.593824228028503e-07, "logits/chosen": 0.09283562749624252, "logits/rejected": 0.1531636118888855, "logps/chosen": -41.238792419433594, "logps/rejected": -59.725738525390625, "loss": 0.447, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30539485812187195, "rewards/margins": 0.671943187713623, "rewards/rejected": -0.9773380160331726, "step": 1184 }, { "epoch": 0.7125890736342043, "grad_norm": 54.25, "learning_rate": 8.57482185273159e-07, "logits/chosen": 0.17938324809074402, "logits/rejected": 0.21232807636260986, "logps/chosen": -39.86995315551758, "logps/rejected": -58.8725471496582, "loss": 0.4251, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.25202998518943787, "rewards/margins": 0.7341017723083496, "rewards/rejected": -0.9861317873001099, "step": 1200 }, { "epoch": 0.7220902612826603, "grad_norm": 54.25, "learning_rate": 8.555819477434679e-07, "logits/chosen": 0.0566461905837059, "logits/rejected": 0.12666890025138855, "logps/chosen": -39.986106872558594, "logps/rejected": -61.40876388549805, "loss": 0.4067, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.2511664927005768, "rewards/margins": 0.7968044281005859, "rewards/rejected": -1.0479708909988403, "step": 1216 }, { "epoch": 0.7315914489311164, "grad_norm": 58.75, "learning_rate": 8.536817102137766e-07, "logits/chosen": 0.08908773213624954, "logits/rejected": 0.13832062482833862, "logps/chosen": -39.75481414794922, "logps/rejected": -60.21009826660156, "loss": 0.4281, "rewards/accuracies": 0.921875, "rewards/chosen": -0.24241387844085693, "rewards/margins": 0.7212178707122803, "rewards/rejected": -0.9636316895484924, "step": 1232 }, { "epoch": 0.7410926365795725, "grad_norm": 52.5, "learning_rate": 8.517814726840855e-07, "logits/chosen": 0.0984007716178894, "logits/rejected": 0.15883538126945496, "logps/chosen": -38.90775680541992, "logps/rejected": -59.53275680541992, "loss": 0.4418, "rewards/accuracies": 0.859375, "rewards/chosen": -0.27383100986480713, "rewards/margins": 0.7038523554801941, "rewards/rejected": -0.9776833653450012, "step": 1248 }, { "epoch": 0.7505938242280285, "grad_norm": 41.25, "learning_rate": 8.498812351543942e-07, "logits/chosen": 0.11719369888305664, "logits/rejected": 0.17845718562602997, "logps/chosen": -38.104705810546875, "logps/rejected": -61.452178955078125, "loss": 0.3903, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.20659206807613373, "rewards/margins": 0.855985701084137, "rewards/rejected": -1.062577724456787, "step": 1264 }, { "epoch": 0.7600950118764845, "grad_norm": 45.75, "learning_rate": 8.479809976247031e-07, "logits/chosen": 0.0768904983997345, "logits/rejected": 0.20530077815055847, "logps/chosen": -39.765953063964844, "logps/rejected": -61.358497619628906, "loss": 0.4049, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.2901615798473358, "rewards/margins": 0.8085931539535522, "rewards/rejected": -1.098754644393921, "step": 1280 }, { "epoch": 0.7695961995249406, "grad_norm": 41.5, "learning_rate": 8.460807600950118e-07, "logits/chosen": 0.1819642186164856, "logits/rejected": 0.20391146838665009, "logps/chosen": -41.09254455566406, "logps/rejected": -62.414024353027344, "loss": 0.4161, "rewards/accuracies": 0.921875, "rewards/chosen": -0.2619169354438782, "rewards/margins": 0.7796422243118286, "rewards/rejected": -1.0415592193603516, "step": 1296 }, { "epoch": 0.7790973871733967, "grad_norm": 45.75, "learning_rate": 8.441805225653206e-07, "logits/chosen": 0.07217932492494583, "logits/rejected": 0.15043236315250397, "logps/chosen": -41.25364685058594, "logps/rejected": -59.63226318359375, "loss": 0.4309, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3054697513580322, "rewards/margins": 0.7218286395072937, "rewards/rejected": -1.0272983312606812, "step": 1312 }, { "epoch": 0.7885985748218527, "grad_norm": 62.75, "learning_rate": 8.422802850356294e-07, "logits/chosen": 0.10031426697969437, "logits/rejected": 0.197869673371315, "logps/chosen": -40.79019546508789, "logps/rejected": -61.019493103027344, "loss": 0.4362, "rewards/accuracies": 0.890625, "rewards/chosen": -0.2853235602378845, "rewards/margins": 0.7121058106422424, "rewards/rejected": -0.9974292516708374, "step": 1328 }, { "epoch": 0.7980997624703088, "grad_norm": 54.0, "learning_rate": 8.403800475059381e-07, "logits/chosen": 0.044592004269361496, "logits/rejected": 0.19066573679447174, "logps/chosen": -40.317413330078125, "logps/rejected": -58.56877899169922, "loss": 0.4502, "rewards/accuracies": 0.890625, "rewards/chosen": -0.30221185088157654, "rewards/margins": 0.6648804545402527, "rewards/rejected": -0.9670923352241516, "step": 1344 }, { "epoch": 0.8076009501187649, "grad_norm": 40.75, "learning_rate": 8.38479809976247e-07, "logits/chosen": 0.06595605611801147, "logits/rejected": 0.18088053166866302, "logps/chosen": -39.756439208984375, "logps/rejected": -63.843971252441406, "loss": 0.3955, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.2804669737815857, "rewards/margins": 0.8086118102073669, "rewards/rejected": -1.0890789031982422, "step": 1360 }, { "epoch": 0.8171021377672208, "grad_norm": 48.0, "learning_rate": 8.365795724465557e-07, "logits/chosen": 0.1287180781364441, "logits/rejected": 0.22206488251686096, "logps/chosen": -41.125389099121094, "logps/rejected": -61.68186950683594, "loss": 0.4428, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30739057064056396, "rewards/margins": 0.7095006704330444, "rewards/rejected": -1.0168912410736084, "step": 1376 }, { "epoch": 0.8266033254156769, "grad_norm": 52.25, "learning_rate": 8.346793349168646e-07, "logits/chosen": 0.053852953016757965, "logits/rejected": 0.13252206146717072, "logps/chosen": -40.2414436340332, "logps/rejected": -61.29638671875, "loss": 0.4071, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3235268294811249, "rewards/margins": 0.811761736869812, "rewards/rejected": -1.1352884769439697, "step": 1392 }, { "epoch": 0.836104513064133, "grad_norm": 56.0, "learning_rate": 8.327790973871733e-07, "logits/chosen": 0.0468582957983017, "logits/rejected": 0.11399667710065842, "logps/chosen": -39.64308547973633, "logps/rejected": -59.60944747924805, "loss": 0.4147, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2946662902832031, "rewards/margins": 0.7918639183044434, "rewards/rejected": -1.086530089378357, "step": 1408 }, { "epoch": 0.8456057007125891, "grad_norm": 60.5, "learning_rate": 8.308788598574822e-07, "logits/chosen": 0.09335757046937943, "logits/rejected": 0.10522627085447311, "logps/chosen": -42.684288024902344, "logps/rejected": -61.48929214477539, "loss": 0.4301, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3570723533630371, "rewards/margins": 0.7129464745521545, "rewards/rejected": -1.0700188875198364, "step": 1424 }, { "epoch": 0.8551068883610451, "grad_norm": 51.75, "learning_rate": 8.289786223277909e-07, "logits/chosen": 0.07347086817026138, "logits/rejected": 0.15308743715286255, "logps/chosen": -41.02301788330078, "logps/rejected": -61.934791564941406, "loss": 0.4048, "rewards/accuracies": 0.90625, "rewards/chosen": -0.29406049847602844, "rewards/margins": 0.8210353255271912, "rewards/rejected": -1.1150959730148315, "step": 1440 }, { "epoch": 0.8646080760095012, "grad_norm": 45.0, "learning_rate": 8.270783847980998e-07, "logits/chosen": 0.0793566182255745, "logits/rejected": 0.17386886477470398, "logps/chosen": -37.58202362060547, "logps/rejected": -59.483882904052734, "loss": 0.4083, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.2546355128288269, "rewards/margins": 0.7988392114639282, "rewards/rejected": -1.0534747838974, "step": 1456 }, { "epoch": 0.8741092636579573, "grad_norm": 51.5, "learning_rate": 8.251781472684085e-07, "logits/chosen": 0.06765860319137573, "logits/rejected": 0.17789000272750854, "logps/chosen": -38.726783752441406, "logps/rejected": -60.934757232666016, "loss": 0.4203, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.3265742063522339, "rewards/margins": 0.7777712345123291, "rewards/rejected": -1.1043453216552734, "step": 1472 }, { "epoch": 0.8836104513064132, "grad_norm": 56.5, "learning_rate": 8.232779097387174e-07, "logits/chosen": 0.05681402608752251, "logits/rejected": 0.1788835972547531, "logps/chosen": -38.938507080078125, "logps/rejected": -61.65933609008789, "loss": 0.3918, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.25624170899391174, "rewards/margins": 0.8536828756332397, "rewards/rejected": -1.1099246740341187, "step": 1488 }, { "epoch": 0.8931116389548693, "grad_norm": 49.25, "learning_rate": 8.213776722090261e-07, "logits/chosen": 0.057860612869262695, "logits/rejected": 0.14475533366203308, "logps/chosen": -37.889766693115234, "logps/rejected": -61.03107452392578, "loss": 0.3916, "rewards/accuracies": 0.921875, "rewards/chosen": -0.23101815581321716, "rewards/margins": 0.8708871603012085, "rewards/rejected": -1.101905345916748, "step": 1504 }, { "epoch": 0.9026128266033254, "grad_norm": 48.5, "learning_rate": 8.194774346793349e-07, "logits/chosen": 0.07886646687984467, "logits/rejected": 0.19103488326072693, "logps/chosen": -40.67872619628906, "logps/rejected": -61.686668395996094, "loss": 0.4185, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3377780318260193, "rewards/margins": 0.795622706413269, "rewards/rejected": -1.133400797843933, "step": 1520 }, { "epoch": 0.9121140142517815, "grad_norm": 44.0, "learning_rate": 8.175771971496437e-07, "logits/chosen": 0.0845412164926529, "logits/rejected": 0.16773808002471924, "logps/chosen": -40.12324523925781, "logps/rejected": -59.303245544433594, "loss": 0.4103, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.3262190520763397, "rewards/margins": 0.8095906376838684, "rewards/rejected": -1.1358096599578857, "step": 1536 }, { "epoch": 0.9216152019002375, "grad_norm": 46.0, "learning_rate": 8.156769596199525e-07, "logits/chosen": 0.08066831529140472, "logits/rejected": 0.17000192403793335, "logps/chosen": -40.7078742980957, "logps/rejected": -62.315185546875, "loss": 0.3972, "rewards/accuracies": 0.890625, "rewards/chosen": -0.28264638781547546, "rewards/margins": 0.8620734214782715, "rewards/rejected": -1.1447197198867798, "step": 1552 }, { "epoch": 0.9311163895486936, "grad_norm": 37.75, "learning_rate": 8.137767220902613e-07, "logits/chosen": 0.08594940602779388, "logits/rejected": 0.22140294313430786, "logps/chosen": -38.64836120605469, "logps/rejected": -62.625946044921875, "loss": 0.376, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.2832408845424652, "rewards/margins": 0.9163691997528076, "rewards/rejected": -1.1996098756790161, "step": 1568 }, { "epoch": 0.9406175771971497, "grad_norm": 61.5, "learning_rate": 8.1187648456057e-07, "logits/chosen": 0.07032456994056702, "logits/rejected": 0.17861232161521912, "logps/chosen": -39.57463836669922, "logps/rejected": -60.60888671875, "loss": 0.4058, "rewards/accuracies": 0.921875, "rewards/chosen": -0.35363835096359253, "rewards/margins": 0.8107983469963074, "rewards/rejected": -1.1644368171691895, "step": 1584 }, { "epoch": 0.9501187648456056, "grad_norm": 52.25, "learning_rate": 8.099762470308789e-07, "logits/chosen": 0.08332876116037369, "logits/rejected": 0.1855737268924713, "logps/chosen": -38.6751594543457, "logps/rejected": -59.24980163574219, "loss": 0.4035, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3069637417793274, "rewards/margins": 0.8082422614097595, "rewards/rejected": -1.115206003189087, "step": 1600 }, { "epoch": 0.9596199524940617, "grad_norm": 45.75, "learning_rate": 8.080760095011876e-07, "logits/chosen": 0.12488888204097748, "logits/rejected": 0.14733757078647614, "logps/chosen": -40.226043701171875, "logps/rejected": -58.75604248046875, "loss": 0.4261, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.35558950901031494, "rewards/margins": 0.7374365925788879, "rewards/rejected": -1.0930261611938477, "step": 1616 }, { "epoch": 0.9691211401425178, "grad_norm": 43.5, "learning_rate": 8.061757719714965e-07, "logits/chosen": 0.04622222110629082, "logits/rejected": 0.1545945107936859, "logps/chosen": -39.63484573364258, "logps/rejected": -61.277587890625, "loss": 0.3831, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3327168822288513, "rewards/margins": 0.8751581311225891, "rewards/rejected": -1.2078750133514404, "step": 1632 }, { "epoch": 0.9786223277909739, "grad_norm": 54.0, "learning_rate": 8.042755344418051e-07, "logits/chosen": 0.09918095171451569, "logits/rejected": 0.18718725442886353, "logps/chosen": -42.15406799316406, "logps/rejected": -62.292816162109375, "loss": 0.4012, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3841949701309204, "rewards/margins": 0.8375911116600037, "rewards/rejected": -1.2217860221862793, "step": 1648 }, { "epoch": 0.9881235154394299, "grad_norm": 47.0, "learning_rate": 8.02375296912114e-07, "logits/chosen": 0.06581688672304153, "logits/rejected": 0.16606493294239044, "logps/chosen": -40.35301971435547, "logps/rejected": -60.86550521850586, "loss": 0.4171, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3261529803276062, "rewards/margins": 0.7935004830360413, "rewards/rejected": -1.1196534633636475, "step": 1664 }, { "epoch": 0.997624703087886, "grad_norm": 46.75, "learning_rate": 8.004750593824227e-07, "logits/chosen": 0.08406564593315125, "logits/rejected": 0.1631183624267578, "logps/chosen": -42.632301330566406, "logps/rejected": -66.31587219238281, "loss": 0.3864, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3757409453392029, "rewards/margins": 0.8852084875106812, "rewards/rejected": -1.2609493732452393, "step": 1680 }, { "epoch": 1.007125890736342, "grad_norm": 53.5, "learning_rate": 7.985748218527315e-07, "logits/chosen": 0.04815902188420296, "logits/rejected": 0.10080163925886154, "logps/chosen": -40.61830520629883, "logps/rejected": -62.35159683227539, "loss": 0.3894, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3303859233856201, "rewards/margins": 0.8965721726417542, "rewards/rejected": -1.2269580364227295, "step": 1696 }, { "epoch": 1.0166270783847982, "grad_norm": 58.5, "learning_rate": 7.966745843230403e-07, "logits/chosen": 0.07239064574241638, "logits/rejected": 0.1947220116853714, "logps/chosen": -42.951637268066406, "logps/rejected": -64.53704071044922, "loss": 0.4144, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.397784024477005, "rewards/margins": 0.7604966759681702, "rewards/rejected": -1.158280611038208, "step": 1712 }, { "epoch": 1.0261282660332542, "grad_norm": 59.75, "learning_rate": 7.947743467933491e-07, "logits/chosen": 0.08597906678915024, "logits/rejected": 0.14526161551475525, "logps/chosen": -40.39187240600586, "logps/rejected": -64.09424591064453, "loss": 0.3834, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.33142587542533875, "rewards/margins": 0.926814079284668, "rewards/rejected": -1.258239984512329, "step": 1728 }, { "epoch": 1.03562945368171, "grad_norm": 40.5, "learning_rate": 7.928741092636579e-07, "logits/chosen": 0.0840529203414917, "logits/rejected": 0.18427981436252594, "logps/chosen": -42.51103591918945, "logps/rejected": -64.92820739746094, "loss": 0.4181, "rewards/accuracies": 0.90625, "rewards/chosen": -0.39203089475631714, "rewards/margins": 0.7877532839775085, "rewards/rejected": -1.1797842979431152, "step": 1744 }, { "epoch": 1.0451306413301662, "grad_norm": 56.5, "learning_rate": 7.909738717339667e-07, "logits/chosen": 0.049816541373729706, "logits/rejected": 0.143840953707695, "logps/chosen": -41.118350982666016, "logps/rejected": -61.427818298339844, "loss": 0.4304, "rewards/accuracies": 0.890625, "rewards/chosen": -0.38534730672836304, "rewards/margins": 0.7327947020530701, "rewards/rejected": -1.118142008781433, "step": 1760 }, { "epoch": 1.0546318289786223, "grad_norm": 59.0, "learning_rate": 7.890736342042755e-07, "logits/chosen": 0.09707099944353104, "logits/rejected": 0.10315439105033875, "logps/chosen": -39.37982940673828, "logps/rejected": -61.087913513183594, "loss": 0.4052, "rewards/accuracies": 0.921875, "rewards/chosen": -0.30062562227249146, "rewards/margins": 0.834286630153656, "rewards/rejected": -1.1349122524261475, "step": 1776 }, { "epoch": 1.0641330166270784, "grad_norm": 46.5, "learning_rate": 7.871733966745842e-07, "logits/chosen": -0.01599489152431488, "logits/rejected": 0.1043221652507782, "logps/chosen": -42.238426208496094, "logps/rejected": -64.75145721435547, "loss": 0.389, "rewards/accuracies": 0.921875, "rewards/chosen": -0.383517324924469, "rewards/margins": 0.8845040202140808, "rewards/rejected": -1.2680213451385498, "step": 1792 }, { "epoch": 1.0736342042755345, "grad_norm": 60.5, "learning_rate": 7.852731591448931e-07, "logits/chosen": 0.04319116473197937, "logits/rejected": 0.10541323572397232, "logps/chosen": -40.8631706237793, "logps/rejected": -59.777008056640625, "loss": 0.3936, "rewards/accuracies": 0.921875, "rewards/chosen": -0.30645447969436646, "rewards/margins": 0.8425821661949158, "rewards/rejected": -1.1490366458892822, "step": 1808 }, { "epoch": 1.0831353919239906, "grad_norm": 48.75, "learning_rate": 7.833729216152018e-07, "logits/chosen": 0.10437559336423874, "logits/rejected": 0.1490824967622757, "logps/chosen": -39.50224304199219, "logps/rejected": -62.01877975463867, "loss": 0.3812, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3324194550514221, "rewards/margins": 0.897403359413147, "rewards/rejected": -1.2298228740692139, "step": 1824 }, { "epoch": 1.0926365795724466, "grad_norm": 48.75, "learning_rate": 7.814726840855107e-07, "logits/chosen": -0.01841648668050766, "logits/rejected": 0.10108716785907745, "logps/chosen": -39.490970611572266, "logps/rejected": -59.66122055053711, "loss": 0.3821, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3041851818561554, "rewards/margins": 0.8931726813316345, "rewards/rejected": -1.1973577737808228, "step": 1840 }, { "epoch": 1.1021377672209025, "grad_norm": 44.0, "learning_rate": 7.795724465558194e-07, "logits/chosen": 0.08540444076061249, "logits/rejected": 0.18863236904144287, "logps/chosen": -41.12372589111328, "logps/rejected": -62.18980026245117, "loss": 0.3981, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3796708285808563, "rewards/margins": 0.852729320526123, "rewards/rejected": -1.2324001789093018, "step": 1856 }, { "epoch": 1.1116389548693586, "grad_norm": 51.0, "learning_rate": 7.776722090261282e-07, "logits/chosen": 0.14734028279781342, "logits/rejected": 0.21354371309280396, "logps/chosen": -43.433292388916016, "logps/rejected": -66.97840118408203, "loss": 0.3905, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.40543994307518005, "rewards/margins": 0.8719741702079773, "rewards/rejected": -1.277414083480835, "step": 1872 }, { "epoch": 1.1211401425178147, "grad_norm": 40.5, "learning_rate": 7.75771971496437e-07, "logits/chosen": 0.04771037772297859, "logits/rejected": 0.14470888674259186, "logps/chosen": -40.52289962768555, "logps/rejected": -63.578758239746094, "loss": 0.3997, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3288933336734772, "rewards/margins": 0.8785545229911804, "rewards/rejected": -1.20744788646698, "step": 1888 }, { "epoch": 1.1306413301662708, "grad_norm": 56.25, "learning_rate": 7.738717339667458e-07, "logits/chosen": 0.02750781551003456, "logits/rejected": 0.12446936219930649, "logps/chosen": -39.09437561035156, "logps/rejected": -60.32615661621094, "loss": 0.3828, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.29237014055252075, "rewards/margins": 0.9157527089118958, "rewards/rejected": -1.208122730255127, "step": 1904 }, { "epoch": 1.1401425178147269, "grad_norm": 50.25, "learning_rate": 7.719714964370546e-07, "logits/chosen": -0.0005194246768951416, "logits/rejected": 0.11599244177341461, "logps/chosen": -38.9823112487793, "logps/rejected": -60.105751037597656, "loss": 0.3918, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3398272395133972, "rewards/margins": 0.8596587181091309, "rewards/rejected": -1.1994860172271729, "step": 1920 }, { "epoch": 1.149643705463183, "grad_norm": 52.25, "learning_rate": 7.700712589073634e-07, "logits/chosen": 0.14513945579528809, "logits/rejected": 0.1485050767660141, "logps/chosen": -39.983741760253906, "logps/rejected": -62.480751037597656, "loss": 0.3717, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3441847264766693, "rewards/margins": 0.9267476201057434, "rewards/rejected": -1.2709323167800903, "step": 1936 }, { "epoch": 1.159144893111639, "grad_norm": 56.25, "learning_rate": 7.681710213776722e-07, "logits/chosen": 0.0954248383641243, "logits/rejected": 0.15733623504638672, "logps/chosen": -40.76930618286133, "logps/rejected": -63.525718688964844, "loss": 0.391, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.36353814601898193, "rewards/margins": 0.9048362970352173, "rewards/rejected": -1.2683744430541992, "step": 1952 }, { "epoch": 1.168646080760095, "grad_norm": 55.5, "learning_rate": 7.66270783847981e-07, "logits/chosen": 0.08181347697973251, "logits/rejected": 0.12661109864711761, "logps/chosen": -41.755645751953125, "logps/rejected": -61.083946228027344, "loss": 0.3934, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3673307001590729, "rewards/margins": 0.8563836812973022, "rewards/rejected": -1.2237144708633423, "step": 1968 }, { "epoch": 1.178147268408551, "grad_norm": 52.0, "learning_rate": 7.643705463182898e-07, "logits/chosen": 0.042737413197755814, "logits/rejected": 0.1984286606311798, "logps/chosen": -40.4138298034668, "logps/rejected": -65.36979675292969, "loss": 0.3825, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.35838350653648376, "rewards/margins": 0.8961877822875977, "rewards/rejected": -1.2545711994171143, "step": 1984 }, { "epoch": 1.187648456057007, "grad_norm": 52.0, "learning_rate": 7.624703087885986e-07, "logits/chosen": 0.06372962146997452, "logits/rejected": 0.20503537356853485, "logps/chosen": -39.09539794921875, "logps/rejected": -65.76801300048828, "loss": 0.3814, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.301858514547348, "rewards/margins": 0.8968943953514099, "rewards/rejected": -1.1987528800964355, "step": 2000 }, { "epoch": 1.1971496437054632, "grad_norm": 44.0, "learning_rate": 7.605700712589074e-07, "logits/chosen": 0.09966941177845001, "logits/rejected": 0.17389996349811554, "logps/chosen": -44.16709899902344, "logps/rejected": -61.92054748535156, "loss": 0.4317, "rewards/accuracies": 0.8359375, "rewards/chosen": -0.43170446157455444, "rewards/margins": 0.799846887588501, "rewards/rejected": -1.2315512895584106, "step": 2016 }, { "epoch": 1.2066508313539193, "grad_norm": 41.0, "learning_rate": 7.586698337292161e-07, "logits/chosen": 0.0820382758975029, "logits/rejected": 0.1380407214164734, "logps/chosen": -40.85121536254883, "logps/rejected": -61.79035568237305, "loss": 0.3944, "rewards/accuracies": 0.875, "rewards/chosen": -0.33426377177238464, "rewards/margins": 0.8878821730613708, "rewards/rejected": -1.222145915031433, "step": 2032 }, { "epoch": 1.2161520190023754, "grad_norm": 49.25, "learning_rate": 7.567695961995249e-07, "logits/chosen": 0.07470327615737915, "logits/rejected": 0.17876245081424713, "logps/chosen": -40.400997161865234, "logps/rejected": -62.64784240722656, "loss": 0.3868, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3647741377353668, "rewards/margins": 0.8964493274688721, "rewards/rejected": -1.261223554611206, "step": 2048 }, { "epoch": 1.2256532066508314, "grad_norm": 59.25, "learning_rate": 7.548693586698337e-07, "logits/chosen": 0.09447329491376877, "logits/rejected": 0.20229777693748474, "logps/chosen": -41.54475021362305, "logps/rejected": -62.25804138183594, "loss": 0.411, "rewards/accuracies": 0.890625, "rewards/chosen": -0.39685580134391785, "rewards/margins": 0.8458825945854187, "rewards/rejected": -1.2427384853363037, "step": 2064 }, { "epoch": 1.2351543942992875, "grad_norm": 46.75, "learning_rate": 7.529691211401425e-07, "logits/chosen": 0.005977040156722069, "logits/rejected": 0.12502533197402954, "logps/chosen": -39.64662170410156, "logps/rejected": -61.320465087890625, "loss": 0.3843, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35564541816711426, "rewards/margins": 0.9019960165023804, "rewards/rejected": -1.2576414346694946, "step": 2080 }, { "epoch": 1.2446555819477434, "grad_norm": 56.75, "learning_rate": 7.510688836104513e-07, "logits/chosen": 0.08646165579557419, "logits/rejected": 0.19107480347156525, "logps/chosen": -41.432594299316406, "logps/rejected": -63.792335510253906, "loss": 0.4067, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4125351309776306, "rewards/margins": 0.8356191515922546, "rewards/rejected": -1.2481542825698853, "step": 2096 }, { "epoch": 1.2541567695961995, "grad_norm": 50.0, "learning_rate": 7.4916864608076e-07, "logits/chosen": 0.0462346225976944, "logits/rejected": 0.18776997923851013, "logps/chosen": -39.760746002197266, "logps/rejected": -62.441749572753906, "loss": 0.3845, "rewards/accuracies": 0.921875, "rewards/chosen": -0.30308061838150024, "rewards/margins": 0.87934410572052, "rewards/rejected": -1.182424783706665, "step": 2112 }, { "epoch": 1.2636579572446556, "grad_norm": 46.25, "learning_rate": 7.472684085510688e-07, "logits/chosen": 0.08780578523874283, "logits/rejected": 0.16588638722896576, "logps/chosen": -41.076515197753906, "logps/rejected": -62.57638931274414, "loss": 0.3855, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3240537941455841, "rewards/margins": 0.8670827150344849, "rewards/rejected": -1.191136360168457, "step": 2128 }, { "epoch": 1.2731591448931117, "grad_norm": 41.0, "learning_rate": 7.453681710213776e-07, "logits/chosen": 0.11139998584985733, "logits/rejected": 0.18487989902496338, "logps/chosen": -40.500728607177734, "logps/rejected": -61.71664810180664, "loss": 0.4028, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3737491965293884, "rewards/margins": 0.8448764085769653, "rewards/rejected": -1.218625545501709, "step": 2144 }, { "epoch": 1.2826603325415677, "grad_norm": 51.25, "learning_rate": 7.434679334916864e-07, "logits/chosen": 0.05119692161679268, "logits/rejected": 0.12550300359725952, "logps/chosen": -43.28956604003906, "logps/rejected": -63.49314880371094, "loss": 0.3864, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.43977251648902893, "rewards/margins": 0.8914352655410767, "rewards/rejected": -1.3312077522277832, "step": 2160 }, { "epoch": 1.2921615201900236, "grad_norm": 66.0, "learning_rate": 7.415676959619952e-07, "logits/chosen": 0.023000139743089676, "logits/rejected": 0.10380115360021591, "logps/chosen": -40.769859313964844, "logps/rejected": -61.942955017089844, "loss": 0.3927, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3861158490180969, "rewards/margins": 0.8635023832321167, "rewards/rejected": -1.2496182918548584, "step": 2176 }, { "epoch": 1.3016627078384797, "grad_norm": 56.0, "learning_rate": 7.39667458432304e-07, "logits/chosen": 0.08992882817983627, "logits/rejected": 0.16640856862068176, "logps/chosen": -41.653472900390625, "logps/rejected": -65.48009490966797, "loss": 0.3645, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3618718385696411, "rewards/margins": 0.9549592733383179, "rewards/rejected": -1.3168312311172485, "step": 2192 }, { "epoch": 1.3111638954869358, "grad_norm": 48.5, "learning_rate": 7.377672209026128e-07, "logits/chosen": 0.022954029962420464, "logits/rejected": 0.1399441808462143, "logps/chosen": -40.20348358154297, "logps/rejected": -62.03968811035156, "loss": 0.3814, "rewards/accuracies": 0.921875, "rewards/chosen": -0.33127692341804504, "rewards/margins": 0.9193905591964722, "rewards/rejected": -1.250667691230774, "step": 2208 }, { "epoch": 1.3206650831353919, "grad_norm": 45.0, "learning_rate": 7.358669833729216e-07, "logits/chosen": 0.05273166671395302, "logits/rejected": 0.10670921206474304, "logps/chosen": -40.3055419921875, "logps/rejected": -60.818115234375, "loss": 0.4315, "rewards/accuracies": 0.890625, "rewards/chosen": -0.36937782168388367, "rewards/margins": 0.7719258666038513, "rewards/rejected": -1.1413036584854126, "step": 2224 }, { "epoch": 1.330166270783848, "grad_norm": 46.5, "learning_rate": 7.339667458432304e-07, "logits/chosen": 0.020988432690501213, "logits/rejected": 0.13837015628814697, "logps/chosen": -38.807682037353516, "logps/rejected": -61.88154983520508, "loss": 0.3805, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.2889925241470337, "rewards/margins": 0.9347207546234131, "rewards/rejected": -1.2237131595611572, "step": 2240 }, { "epoch": 1.339667458432304, "grad_norm": 49.25, "learning_rate": 7.320665083135391e-07, "logits/chosen": 0.09183872491121292, "logits/rejected": 0.19175560772418976, "logps/chosen": -42.616172790527344, "logps/rejected": -64.80868530273438, "loss": 0.3724, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3987230360507965, "rewards/margins": 0.9598705768585205, "rewards/rejected": -1.3585937023162842, "step": 2256 }, { "epoch": 1.3491686460807601, "grad_norm": 56.75, "learning_rate": 7.301662707838479e-07, "logits/chosen": 0.04315632954239845, "logits/rejected": 0.12694051861763, "logps/chosen": -42.23524856567383, "logps/rejected": -63.53181076049805, "loss": 0.4178, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.45748934149742126, "rewards/margins": 0.7994433045387268, "rewards/rejected": -1.2569326162338257, "step": 2272 }, { "epoch": 1.3586698337292162, "grad_norm": 51.75, "learning_rate": 7.282660332541567e-07, "logits/chosen": 0.05002054572105408, "logits/rejected": 0.10560965538024902, "logps/chosen": -41.06108474731445, "logps/rejected": -61.318199157714844, "loss": 0.3899, "rewards/accuracies": 0.890625, "rewards/chosen": -0.39432668685913086, "rewards/margins": 0.906670331954956, "rewards/rejected": -1.300997018814087, "step": 2288 }, { "epoch": 1.3681710213776723, "grad_norm": 59.0, "learning_rate": 7.263657957244655e-07, "logits/chosen": 0.0978277325630188, "logits/rejected": 0.1469835340976715, "logps/chosen": -40.50400161743164, "logps/rejected": -64.60050964355469, "loss": 0.3702, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.39239317178726196, "rewards/margins": 0.9568536281585693, "rewards/rejected": -1.3492467403411865, "step": 2304 }, { "epoch": 1.3776722090261282, "grad_norm": 50.25, "learning_rate": 7.244655581947743e-07, "logits/chosen": 0.0486019104719162, "logits/rejected": 0.17572590708732605, "logps/chosen": -40.24808883666992, "logps/rejected": -61.60661697387695, "loss": 0.4009, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3608768582344055, "rewards/margins": 0.831383466720581, "rewards/rejected": -1.1922603845596313, "step": 2320 }, { "epoch": 1.3871733966745843, "grad_norm": 54.25, "learning_rate": 7.225653206650831e-07, "logits/chosen": 0.03973031044006348, "logits/rejected": 0.14902335405349731, "logps/chosen": -39.89193344116211, "logps/rejected": -61.921695709228516, "loss": 0.3674, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3707820177078247, "rewards/margins": 0.9807763695716858, "rewards/rejected": -1.3515583276748657, "step": 2336 }, { "epoch": 1.3966745843230404, "grad_norm": 45.25, "learning_rate": 7.206650831353919e-07, "logits/chosen": 0.01807180978357792, "logits/rejected": 0.14278706908226013, "logps/chosen": -39.26378631591797, "logps/rejected": -60.40557861328125, "loss": 0.3723, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.36147797107696533, "rewards/margins": 0.9218441247940063, "rewards/rejected": -1.2833220958709717, "step": 2352 }, { "epoch": 1.4061757719714965, "grad_norm": 75.5, "learning_rate": 7.187648456057007e-07, "logits/chosen": 0.07302900403738022, "logits/rejected": 0.14269563555717468, "logps/chosen": -41.234161376953125, "logps/rejected": -66.46778869628906, "loss": 0.3693, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.39566075801849365, "rewards/margins": 0.9585368633270264, "rewards/rejected": -1.35419762134552, "step": 2368 }, { "epoch": 1.4156769596199525, "grad_norm": 40.75, "learning_rate": 7.168646080760095e-07, "logits/chosen": 0.051808636635541916, "logits/rejected": 0.21229980885982513, "logps/chosen": -40.08659744262695, "logps/rejected": -61.43524932861328, "loss": 0.3833, "rewards/accuracies": 0.921875, "rewards/chosen": -0.37025976181030273, "rewards/margins": 0.8906149864196777, "rewards/rejected": -1.2608747482299805, "step": 2384 }, { "epoch": 1.4251781472684084, "grad_norm": 44.75, "learning_rate": 7.149643705463183e-07, "logits/chosen": 0.0380956195294857, "logits/rejected": 0.13877098262310028, "logps/chosen": -42.67876052856445, "logps/rejected": -63.79023361206055, "loss": 0.3799, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3804953396320343, "rewards/margins": 0.9205693006515503, "rewards/rejected": -1.3010647296905518, "step": 2400 }, { "epoch": 1.4346793349168645, "grad_norm": 51.5, "learning_rate": 7.130641330166271e-07, "logits/chosen": 0.08207520097494125, "logits/rejected": 0.14595362544059753, "logps/chosen": -42.30644989013672, "logps/rejected": -63.68936538696289, "loss": 0.3859, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4032093584537506, "rewards/margins": 0.8909638524055481, "rewards/rejected": -1.294173240661621, "step": 2416 }, { "epoch": 1.4441805225653206, "grad_norm": 47.0, "learning_rate": 7.111638954869358e-07, "logits/chosen": 0.017898384481668472, "logits/rejected": 0.18962475657463074, "logps/chosen": -41.686912536621094, "logps/rejected": -62.23005294799805, "loss": 0.3961, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3815171718597412, "rewards/margins": 0.8675276637077332, "rewards/rejected": -1.2490447759628296, "step": 2432 }, { "epoch": 1.4536817102137767, "grad_norm": 46.5, "learning_rate": 7.092636579572447e-07, "logits/chosen": 0.07278081774711609, "logits/rejected": 0.13306282460689545, "logps/chosen": -39.96598815917969, "logps/rejected": -62.889739990234375, "loss": 0.3467, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3247521221637726, "rewards/margins": 1.0032382011413574, "rewards/rejected": -1.3279902935028076, "step": 2448 }, { "epoch": 1.4631828978622328, "grad_norm": 46.75, "learning_rate": 7.073634204275534e-07, "logits/chosen": 0.028041554614901543, "logits/rejected": 0.126488596200943, "logps/chosen": -42.11606216430664, "logps/rejected": -60.70077896118164, "loss": 0.4265, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.48028531670570374, "rewards/margins": 0.801145076751709, "rewards/rejected": -1.2814303636550903, "step": 2464 }, { "epoch": 1.4726840855106889, "grad_norm": 52.75, "learning_rate": 7.054631828978623e-07, "logits/chosen": 0.05731602758169174, "logits/rejected": 0.13755974173545837, "logps/chosen": -39.297462463378906, "logps/rejected": -62.45978546142578, "loss": 0.373, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.31363803148269653, "rewards/margins": 0.9406128525733948, "rewards/rejected": -1.2542507648468018, "step": 2480 }, { "epoch": 1.482185273159145, "grad_norm": 42.0, "learning_rate": 7.03562945368171e-07, "logits/chosen": 0.0816030278801918, "logits/rejected": 0.14841099083423615, "logps/chosen": -41.25545883178711, "logps/rejected": -61.70344924926758, "loss": 0.4112, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.4528937041759491, "rewards/margins": 0.8154016733169556, "rewards/rejected": -1.268295407295227, "step": 2496 }, { "epoch": 1.491686460807601, "grad_norm": 47.75, "learning_rate": 7.016627078384798e-07, "logits/chosen": 0.07149530947208405, "logits/rejected": 0.17929969727993011, "logps/chosen": -41.5832633972168, "logps/rejected": -67.25575256347656, "loss": 0.3869, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.41033345460891724, "rewards/margins": 0.9050929546356201, "rewards/rejected": -1.3154264688491821, "step": 2512 }, { "epoch": 1.5011876484560571, "grad_norm": 43.5, "learning_rate": 6.997624703087886e-07, "logits/chosen": 0.07096850126981735, "logits/rejected": 0.14748625457286835, "logps/chosen": -42.1703987121582, "logps/rejected": -63.72838592529297, "loss": 0.3933, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.43840450048446655, "rewards/margins": 0.9121676683425903, "rewards/rejected": -1.3505722284317017, "step": 2528 }, { "epoch": 1.5106888361045132, "grad_norm": 40.25, "learning_rate": 6.978622327790974e-07, "logits/chosen": 0.07429444789886475, "logits/rejected": 0.169959157705307, "logps/chosen": -42.15153121948242, "logps/rejected": -67.66064453125, "loss": 0.3566, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.385272741317749, "rewards/margins": 1.010432481765747, "rewards/rejected": -1.3957051038742065, "step": 2544 }, { "epoch": 1.520190023752969, "grad_norm": 46.5, "learning_rate": 6.959619952494062e-07, "logits/chosen": 0.01588086411356926, "logits/rejected": 0.1419978141784668, "logps/chosen": -42.120887756347656, "logps/rejected": -65.12626647949219, "loss": 0.3811, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4491952657699585, "rewards/margins": 0.9161649942398071, "rewards/rejected": -1.3653602600097656, "step": 2560 }, { "epoch": 1.5296912114014252, "grad_norm": 45.25, "learning_rate": 6.94061757719715e-07, "logits/chosen": 0.09590751677751541, "logits/rejected": 0.20653869211673737, "logps/chosen": -43.40055465698242, "logps/rejected": -65.3685302734375, "loss": 0.3946, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.42235833406448364, "rewards/margins": 0.85853511095047, "rewards/rejected": -1.2808934450149536, "step": 2576 }, { "epoch": 1.5391923990498813, "grad_norm": 48.5, "learning_rate": 6.921615201900237e-07, "logits/chosen": 0.10696500539779663, "logits/rejected": 0.1343913972377777, "logps/chosen": -42.464046478271484, "logps/rejected": -63.380943298339844, "loss": 0.4106, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.44905325770378113, "rewards/margins": 0.8345274925231934, "rewards/rejected": -1.2835807800292969, "step": 2592 }, { "epoch": 1.5486935866983373, "grad_norm": 57.0, "learning_rate": 6.902612826603324e-07, "logits/chosen": 0.04652204364538193, "logits/rejected": 0.10454034060239792, "logps/chosen": -40.00874328613281, "logps/rejected": -60.814361572265625, "loss": 0.3879, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.38040611147880554, "rewards/margins": 0.8988234400749207, "rewards/rejected": -1.279229760169983, "step": 2608 }, { "epoch": 1.5581947743467932, "grad_norm": 51.25, "learning_rate": 6.883610451306413e-07, "logits/chosen": 0.11851513385772705, "logits/rejected": 0.14123034477233887, "logps/chosen": -41.09252166748047, "logps/rejected": -64.00010681152344, "loss": 0.3637, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3220548927783966, "rewards/margins": 1.0083606243133545, "rewards/rejected": -1.3304154872894287, "step": 2624 }, { "epoch": 1.5676959619952493, "grad_norm": 50.75, "learning_rate": 6.8646080760095e-07, "logits/chosen": 0.016296017915010452, "logits/rejected": 0.11665618419647217, "logps/chosen": -39.934112548828125, "logps/rejected": -61.658660888671875, "loss": 0.3787, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35257166624069214, "rewards/margins": 0.9432335495948792, "rewards/rejected": -1.2958052158355713, "step": 2640 }, { "epoch": 1.5771971496437054, "grad_norm": 47.75, "learning_rate": 6.845605700712589e-07, "logits/chosen": 0.029370354488492012, "logits/rejected": 0.13856028020381927, "logps/chosen": -40.105552673339844, "logps/rejected": -66.02179718017578, "loss": 0.3523, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3618098795413971, "rewards/margins": 1.0319128036499023, "rewards/rejected": -1.3937227725982666, "step": 2656 }, { "epoch": 1.5866983372921615, "grad_norm": 56.25, "learning_rate": 6.826603325415676e-07, "logits/chosen": 0.020885644480586052, "logits/rejected": 0.17086170613765717, "logps/chosen": -39.169334411621094, "logps/rejected": -64.6575698852539, "loss": 0.3559, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3273469805717468, "rewards/margins": 0.9927965998649597, "rewards/rejected": -1.320143461227417, "step": 2672 }, { "epoch": 1.5961995249406176, "grad_norm": 51.75, "learning_rate": 6.807600950118765e-07, "logits/chosen": 0.03238772973418236, "logits/rejected": 0.07314316928386688, "logps/chosen": -42.21445083618164, "logps/rejected": -64.58654022216797, "loss": 0.378, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4178285002708435, "rewards/margins": 0.9217057228088379, "rewards/rejected": -1.3395342826843262, "step": 2688 }, { "epoch": 1.6057007125890737, "grad_norm": 46.0, "learning_rate": 6.788598574821852e-07, "logits/chosen": 0.13759638369083405, "logits/rejected": 0.21088193356990814, "logps/chosen": -41.520233154296875, "logps/rejected": -66.49211120605469, "loss": 0.364, "rewards/accuracies": 0.921875, "rewards/chosen": -0.36787232756614685, "rewards/margins": 0.9915460348129272, "rewards/rejected": -1.359418511390686, "step": 2704 }, { "epoch": 1.6152019002375297, "grad_norm": 43.5, "learning_rate": 6.76959619952494e-07, "logits/chosen": 0.03905269503593445, "logits/rejected": 0.13661767542362213, "logps/chosen": -38.46835708618164, "logps/rejected": -60.601619720458984, "loss": 0.3568, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.31080418825149536, "rewards/margins": 0.9947383999824524, "rewards/rejected": -1.3055424690246582, "step": 2720 }, { "epoch": 1.6247030878859858, "grad_norm": 44.0, "learning_rate": 6.750593824228028e-07, "logits/chosen": 0.07454045116901398, "logits/rejected": 0.17212067544460297, "logps/chosen": -40.81611251831055, "logps/rejected": -62.773468017578125, "loss": 0.3971, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4132643938064575, "rewards/margins": 0.8847667574882507, "rewards/rejected": -1.2980310916900635, "step": 2736 }, { "epoch": 1.634204275534442, "grad_norm": 38.75, "learning_rate": 6.731591448931116e-07, "logits/chosen": 0.0922178328037262, "logits/rejected": 0.13588553667068481, "logps/chosen": -39.945396423339844, "logps/rejected": -61.7259407043457, "loss": 0.3703, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.35344234108924866, "rewards/margins": 0.9801003336906433, "rewards/rejected": -1.3335425853729248, "step": 2752 }, { "epoch": 1.643705463182898, "grad_norm": 55.25, "learning_rate": 6.712589073634204e-07, "logits/chosen": 0.09883704781532288, "logits/rejected": 0.133531391620636, "logps/chosen": -40.637210845947266, "logps/rejected": -62.80479431152344, "loss": 0.3582, "rewards/accuracies": 0.9375, "rewards/chosen": -0.34419822692871094, "rewards/margins": 1.0021624565124512, "rewards/rejected": -1.3463605642318726, "step": 2768 }, { "epoch": 1.6532066508313539, "grad_norm": 35.75, "learning_rate": 6.693586698337292e-07, "logits/chosen": 0.06989182531833649, "logits/rejected": 0.08873751759529114, "logps/chosen": -37.793495178222656, "logps/rejected": -60.03639602661133, "loss": 0.3492, "rewards/accuracies": 0.921875, "rewards/chosen": -0.29584282636642456, "rewards/margins": 1.0541434288024902, "rewards/rejected": -1.3499860763549805, "step": 2784 }, { "epoch": 1.66270783847981, "grad_norm": 44.5, "learning_rate": 6.67458432304038e-07, "logits/chosen": -0.0033259475603699684, "logits/rejected": 0.14055626094341278, "logps/chosen": -41.03327178955078, "logps/rejected": -61.638668060302734, "loss": 0.4027, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4033549129962921, "rewards/margins": 0.8529766798019409, "rewards/rejected": -1.2563316822052002, "step": 2800 }, { "epoch": 1.672209026128266, "grad_norm": 55.25, "learning_rate": 6.655581947743467e-07, "logits/chosen": 0.054215628653764725, "logits/rejected": 0.1540539413690567, "logps/chosen": -42.16969299316406, "logps/rejected": -63.94367218017578, "loss": 0.3909, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4552299976348877, "rewards/margins": 0.8886625170707703, "rewards/rejected": -1.3438924551010132, "step": 2816 }, { "epoch": 1.6817102137767221, "grad_norm": 62.0, "learning_rate": 6.636579572446556e-07, "logits/chosen": 0.054992396384477615, "logits/rejected": 0.13705193996429443, "logps/chosen": -40.24445724487305, "logps/rejected": -60.925933837890625, "loss": 0.3809, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.36651647090911865, "rewards/margins": 0.9117545485496521, "rewards/rejected": -1.278270959854126, "step": 2832 }, { "epoch": 1.691211401425178, "grad_norm": 49.25, "learning_rate": 6.617577197149643e-07, "logits/chosen": 0.11320605874061584, "logits/rejected": 0.17928367853164673, "logps/chosen": -39.65250015258789, "logps/rejected": -62.30231475830078, "loss": 0.3734, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3596932888031006, "rewards/margins": 0.9421689510345459, "rewards/rejected": -1.3018622398376465, "step": 2848 }, { "epoch": 1.700712589073634, "grad_norm": 40.0, "learning_rate": 6.598574821852732e-07, "logits/chosen": 0.025672361254692078, "logits/rejected": 0.07097212225198746, "logps/chosen": -41.138633728027344, "logps/rejected": -63.86958694458008, "loss": 0.3738, "rewards/accuracies": 0.921875, "rewards/chosen": -0.39380064606666565, "rewards/margins": 0.9426943063735962, "rewards/rejected": -1.3364949226379395, "step": 2864 }, { "epoch": 1.7102137767220902, "grad_norm": 50.75, "learning_rate": 6.579572446555819e-07, "logits/chosen": 0.07025223225355148, "logits/rejected": 0.06127552688121796, "logps/chosen": -41.751319885253906, "logps/rejected": -61.100685119628906, "loss": 0.3783, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3228084146976471, "rewards/margins": 0.9433329105377197, "rewards/rejected": -1.2661415338516235, "step": 2880 }, { "epoch": 1.7197149643705463, "grad_norm": 50.75, "learning_rate": 6.560570071258908e-07, "logits/chosen": -0.0122856879606843, "logits/rejected": 0.08213039487600327, "logps/chosen": -39.67725372314453, "logps/rejected": -61.21652603149414, "loss": 0.3817, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39572060108184814, "rewards/margins": 0.9105318188667297, "rewards/rejected": -1.306252360343933, "step": 2896 }, { "epoch": 1.7292161520190024, "grad_norm": 64.5, "learning_rate": 6.541567695961995e-07, "logits/chosen": 0.011255351826548576, "logits/rejected": 0.10233234614133835, "logps/chosen": -42.118900299072266, "logps/rejected": -61.916690826416016, "loss": 0.4019, "rewards/accuracies": 0.890625, "rewards/chosen": -0.41262125968933105, "rewards/margins": 0.8827176094055176, "rewards/rejected": -1.2953388690948486, "step": 2912 }, { "epoch": 1.7387173396674585, "grad_norm": 44.25, "learning_rate": 6.522565320665084e-07, "logits/chosen": 0.07264780253171921, "logits/rejected": 0.10011003911495209, "logps/chosen": -41.43605422973633, "logps/rejected": -59.60805892944336, "loss": 0.406, "rewards/accuracies": 0.875, "rewards/chosen": -0.3696948289871216, "rewards/margins": 0.8629406690597534, "rewards/rejected": -1.2326353788375854, "step": 2928 }, { "epoch": 1.7482185273159145, "grad_norm": 41.0, "learning_rate": 6.503562945368171e-07, "logits/chosen": 0.05529964715242386, "logits/rejected": 0.17115283012390137, "logps/chosen": -40.79829788208008, "logps/rejected": -65.05158996582031, "loss": 0.3523, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.336140513420105, "rewards/margins": 1.0045894384384155, "rewards/rejected": -1.340729832649231, "step": 2944 }, { "epoch": 1.7577197149643706, "grad_norm": 45.5, "learning_rate": 6.484560570071259e-07, "logits/chosen": 0.04629014432430267, "logits/rejected": 0.1502920687198639, "logps/chosen": -41.52702713012695, "logps/rejected": -62.9970817565918, "loss": 0.3898, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4380848705768585, "rewards/margins": 0.9207091331481934, "rewards/rejected": -1.3587939739227295, "step": 2960 }, { "epoch": 1.7672209026128267, "grad_norm": 44.5, "learning_rate": 6.465558194774347e-07, "logits/chosen": -0.011330801993608475, "logits/rejected": 0.10742415487766266, "logps/chosen": -40.159461975097656, "logps/rejected": -64.75343322753906, "loss": 0.3439, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3370579481124878, "rewards/margins": 1.050633192062378, "rewards/rejected": -1.3876910209655762, "step": 2976 }, { "epoch": 1.7767220902612828, "grad_norm": 45.75, "learning_rate": 6.446555819477434e-07, "logits/chosen": 0.06578174233436584, "logits/rejected": 0.1567022204399109, "logps/chosen": -41.37073516845703, "logps/rejected": -63.61731719970703, "loss": 0.4027, "rewards/accuracies": 0.890625, "rewards/chosen": -0.42729830741882324, "rewards/margins": 0.8533509373664856, "rewards/rejected": -1.280649185180664, "step": 2992 }, { "epoch": 1.7862232779097387, "grad_norm": 45.0, "learning_rate": 6.427553444180523e-07, "logits/chosen": 0.028342464938759804, "logits/rejected": 0.17107811570167542, "logps/chosen": -39.00218963623047, "logps/rejected": -63.736061096191406, "loss": 0.3565, "rewards/accuracies": 0.9765625, "rewards/chosen": -0.35766667127609253, "rewards/margins": 0.9768067598342896, "rewards/rejected": -1.3344734907150269, "step": 3008 }, { "epoch": 1.7957244655581948, "grad_norm": 38.25, "learning_rate": 6.40855106888361e-07, "logits/chosen": 0.011551400646567345, "logits/rejected": 0.13449110090732574, "logps/chosen": -41.99809646606445, "logps/rejected": -65.39181518554688, "loss": 0.393, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4379570782184601, "rewards/margins": 0.8674665689468384, "rewards/rejected": -1.3054237365722656, "step": 3024 }, { "epoch": 1.8052256532066508, "grad_norm": 63.5, "learning_rate": 6.389548693586699e-07, "logits/chosen": 0.05890597403049469, "logits/rejected": 0.1898549646139145, "logps/chosen": -39.68592834472656, "logps/rejected": -65.33939361572266, "loss": 0.3571, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.34970682859420776, "rewards/margins": 1.0283163785934448, "rewards/rejected": -1.3780232667922974, "step": 3040 }, { "epoch": 1.814726840855107, "grad_norm": 58.25, "learning_rate": 6.370546318289785e-07, "logits/chosen": 0.019725359976291656, "logits/rejected": 0.15305927395820618, "logps/chosen": -42.078033447265625, "logps/rejected": -61.74312973022461, "loss": 0.4095, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.4093819856643677, "rewards/margins": 0.836819052696228, "rewards/rejected": -1.2462011575698853, "step": 3056 }, { "epoch": 1.8242280285035628, "grad_norm": 53.0, "learning_rate": 6.351543942992874e-07, "logits/chosen": 0.057205233722925186, "logits/rejected": 0.1036025658249855, "logps/chosen": -42.514862060546875, "logps/rejected": -60.93022155761719, "loss": 0.4002, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3770271837711334, "rewards/margins": 0.8727684617042542, "rewards/rejected": -1.24979567527771, "step": 3072 }, { "epoch": 1.833729216152019, "grad_norm": 46.0, "learning_rate": 6.332541567695961e-07, "logits/chosen": 0.032310646027326584, "logits/rejected": 0.1559012234210968, "logps/chosen": -43.822120666503906, "logps/rejected": -64.49073791503906, "loss": 0.4014, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.49134525656700134, "rewards/margins": 0.861031174659729, "rewards/rejected": -1.3523763418197632, "step": 3088 }, { "epoch": 1.843230403800475, "grad_norm": 61.75, "learning_rate": 6.31353919239905e-07, "logits/chosen": 0.004301354289054871, "logits/rejected": 0.08244706690311432, "logps/chosen": -38.925811767578125, "logps/rejected": -63.42317199707031, "loss": 0.3441, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.34742915630340576, "rewards/margins": 1.040870189666748, "rewards/rejected": -1.3882992267608643, "step": 3104 }, { "epoch": 1.852731591448931, "grad_norm": 54.25, "learning_rate": 6.294536817102137e-07, "logits/chosen": -0.00210411474108696, "logits/rejected": 0.1236819252371788, "logps/chosen": -38.81071090698242, "logps/rejected": -61.1712646484375, "loss": 0.3845, "rewards/accuracies": 0.953125, "rewards/chosen": -0.37758293747901917, "rewards/margins": 0.8995540738105774, "rewards/rejected": -1.2771369218826294, "step": 3120 }, { "epoch": 1.8622327790973872, "grad_norm": 50.0, "learning_rate": 6.275534441805226e-07, "logits/chosen": 0.04914906993508339, "logits/rejected": 0.13415980339050293, "logps/chosen": -41.947120666503906, "logps/rejected": -61.855255126953125, "loss": 0.3982, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4452968239784241, "rewards/margins": 0.8472458720207214, "rewards/rejected": -1.2925426959991455, "step": 3136 }, { "epoch": 1.8717339667458432, "grad_norm": 46.25, "learning_rate": 6.256532066508313e-07, "logits/chosen": 0.0254144836217165, "logits/rejected": 0.12364037334918976, "logps/chosen": -41.690284729003906, "logps/rejected": -62.46638870239258, "loss": 0.3497, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3699144124984741, "rewards/margins": 1.013372540473938, "rewards/rejected": -1.383286952972412, "step": 3152 }, { "epoch": 1.8812351543942993, "grad_norm": 46.25, "learning_rate": 6.2375296912114e-07, "logits/chosen": 0.03663626313209534, "logits/rejected": 0.14503881335258484, "logps/chosen": -41.162559509277344, "logps/rejected": -64.2937240600586, "loss": 0.355, "rewards/accuracies": 0.953125, "rewards/chosen": -0.39480462670326233, "rewards/margins": 1.0223582983016968, "rewards/rejected": -1.4171628952026367, "step": 3168 }, { "epoch": 1.8907363420427554, "grad_norm": 40.0, "learning_rate": 6.218527315914489e-07, "logits/chosen": 0.0167838204652071, "logits/rejected": 0.09799753874540329, "logps/chosen": -39.695152282714844, "logps/rejected": -64.00861358642578, "loss": 0.3409, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3396395742893219, "rewards/margins": 1.0993303060531616, "rewards/rejected": -1.4389699697494507, "step": 3184 }, { "epoch": 1.9002375296912115, "grad_norm": 55.5, "learning_rate": 6.199524940617576e-07, "logits/chosen": 0.04460003226995468, "logits/rejected": 0.14700554311275482, "logps/chosen": -44.2281494140625, "logps/rejected": -61.84652328491211, "loss": 0.4352, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.5021332502365112, "rewards/margins": 0.7495644688606262, "rewards/rejected": -1.2516977787017822, "step": 3200 }, { "epoch": 1.9097387173396676, "grad_norm": 32.5, "learning_rate": 6.180522565320665e-07, "logits/chosen": 0.08530572056770325, "logits/rejected": 0.15016531944274902, "logps/chosen": -41.49757385253906, "logps/rejected": -67.6200942993164, "loss": 0.3317, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.38793495297431946, "rewards/margins": 1.099698543548584, "rewards/rejected": -1.487633466720581, "step": 3216 }, { "epoch": 1.9192399049881235, "grad_norm": 61.0, "learning_rate": 6.161520190023752e-07, "logits/chosen": 0.029300598427653313, "logits/rejected": 0.13747388124465942, "logps/chosen": -41.173736572265625, "logps/rejected": -63.320472717285156, "loss": 0.4127, "rewards/accuracies": 0.859375, "rewards/chosen": -0.4380740225315094, "rewards/margins": 0.867578387260437, "rewards/rejected": -1.3056524991989136, "step": 3232 }, { "epoch": 1.9287410926365796, "grad_norm": 44.75, "learning_rate": 6.142517814726841e-07, "logits/chosen": 0.08258551359176636, "logits/rejected": 0.16131961345672607, "logps/chosen": -42.77522277832031, "logps/rejected": -64.70687866210938, "loss": 0.395, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4444977045059204, "rewards/margins": 0.8981250524520874, "rewards/rejected": -1.3426228761672974, "step": 3248 }, { "epoch": 1.9382422802850356, "grad_norm": 67.5, "learning_rate": 6.123515439429928e-07, "logits/chosen": 0.044392723590135574, "logits/rejected": 0.17256534099578857, "logps/chosen": -40.32421875, "logps/rejected": -63.82672119140625, "loss": 0.3611, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.40335986018180847, "rewards/margins": 1.0055795907974243, "rewards/rejected": -1.4089393615722656, "step": 3264 }, { "epoch": 1.9477434679334917, "grad_norm": 49.25, "learning_rate": 6.104513064133017e-07, "logits/chosen": 0.1122560128569603, "logits/rejected": 0.1721959263086319, "logps/chosen": -41.03020477294922, "logps/rejected": -62.24818420410156, "loss": 0.4023, "rewards/accuracies": 0.90625, "rewards/chosen": -0.41969960927963257, "rewards/margins": 0.8544268608093262, "rewards/rejected": -1.2741265296936035, "step": 3280 }, { "epoch": 1.9572446555819476, "grad_norm": 54.5, "learning_rate": 6.085510688836104e-07, "logits/chosen": 0.03646089881658554, "logits/rejected": 0.1658436357975006, "logps/chosen": -41.14552307128906, "logps/rejected": -63.396366119384766, "loss": 0.3697, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4058351218700409, "rewards/margins": 0.9656636714935303, "rewards/rejected": -1.3714988231658936, "step": 3296 }, { "epoch": 1.9667458432304037, "grad_norm": 46.75, "learning_rate": 6.066508313539193e-07, "logits/chosen": 0.03148533031344414, "logits/rejected": 0.14277218282222748, "logps/chosen": -42.05448913574219, "logps/rejected": -65.2642593383789, "loss": 0.3928, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4272368550300598, "rewards/margins": 0.8805699348449707, "rewards/rejected": -1.3078068494796753, "step": 3312 }, { "epoch": 1.9762470308788598, "grad_norm": 41.75, "learning_rate": 6.04750593824228e-07, "logits/chosen": 0.06896740198135376, "logits/rejected": 0.10403262823820114, "logps/chosen": -40.34957504272461, "logps/rejected": -61.979915618896484, "loss": 0.3788, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3596211373806, "rewards/margins": 0.9470376968383789, "rewards/rejected": -1.3066588640213013, "step": 3328 }, { "epoch": 1.9857482185273159, "grad_norm": 55.5, "learning_rate": 6.028503562945369e-07, "logits/chosen": -0.00018313713371753693, "logits/rejected": 0.09493206441402435, "logps/chosen": -40.674354553222656, "logps/rejected": -62.300506591796875, "loss": 0.3676, "rewards/accuracies": 0.9375, "rewards/chosen": -0.35612305998802185, "rewards/margins": 0.9978376626968384, "rewards/rejected": -1.3539607524871826, "step": 3344 }, { "epoch": 1.995249406175772, "grad_norm": 59.75, "learning_rate": 6.009501187648456e-07, "logits/chosen": -0.004904988221824169, "logits/rejected": 0.09877481311559677, "logps/chosen": -40.2232666015625, "logps/rejected": -61.96391296386719, "loss": 0.3655, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3332652151584625, "rewards/margins": 1.0086623430252075, "rewards/rejected": -1.3419275283813477, "step": 3360 }, { "epoch": 2.004750593824228, "grad_norm": 58.5, "learning_rate": 5.990498812351543e-07, "logits/chosen": 0.06698215007781982, "logits/rejected": 0.17540045082569122, "logps/chosen": -43.14054870605469, "logps/rejected": -63.408565521240234, "loss": 0.4218, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4646109938621521, "rewards/margins": 0.7780359387397766, "rewards/rejected": -1.2426469326019287, "step": 3376 }, { "epoch": 2.014251781472684, "grad_norm": 44.25, "learning_rate": 5.971496437054632e-07, "logits/chosen": 0.09281051158905029, "logits/rejected": 0.1449931114912033, "logps/chosen": -41.34113311767578, "logps/rejected": -64.54020690917969, "loss": 0.3712, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.37270694971084595, "rewards/margins": 0.932529628276825, "rewards/rejected": -1.305236577987671, "step": 3392 }, { "epoch": 2.02375296912114, "grad_norm": 43.75, "learning_rate": 5.952494061757719e-07, "logits/chosen": 0.03906689211726189, "logits/rejected": 0.14161323010921478, "logps/chosen": -39.38663101196289, "logps/rejected": -63.778724670410156, "loss": 0.3588, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3333834707736969, "rewards/margins": 1.012413501739502, "rewards/rejected": -1.3457969427108765, "step": 3408 }, { "epoch": 2.0332541567695963, "grad_norm": 35.5, "learning_rate": 5.933491686460808e-07, "logits/chosen": 0.009185846894979477, "logits/rejected": 0.027248330414295197, "logps/chosen": -40.65418243408203, "logps/rejected": -59.860408782958984, "loss": 0.391, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.4065134525299072, "rewards/margins": 0.879849910736084, "rewards/rejected": -1.2863633632659912, "step": 3424 }, { "epoch": 2.0427553444180524, "grad_norm": 50.25, "learning_rate": 5.914489311163895e-07, "logits/chosen": -0.04159889370203018, "logits/rejected": 0.12180589139461517, "logps/chosen": -40.518218994140625, "logps/rejected": -63.04229736328125, "loss": 0.3689, "rewards/accuracies": 0.9921875, "rewards/chosen": -0.39485669136047363, "rewards/margins": 0.9171349406242371, "rewards/rejected": -1.311991572380066, "step": 3440 }, { "epoch": 2.0522565320665085, "grad_norm": 47.0, "learning_rate": 5.895486935866984e-07, "logits/chosen": 0.01238052174448967, "logits/rejected": 0.09721352905035019, "logps/chosen": -42.85576248168945, "logps/rejected": -65.13483428955078, "loss": 0.39, "rewards/accuracies": 0.90625, "rewards/chosen": -0.43811485171318054, "rewards/margins": 0.9170427918434143, "rewards/rejected": -1.3551576137542725, "step": 3456 }, { "epoch": 2.0617577197149646, "grad_norm": 43.0, "learning_rate": 5.876484560570071e-07, "logits/chosen": 0.00015027448534965515, "logits/rejected": 0.10659853368997574, "logps/chosen": -40.47695541381836, "logps/rejected": -62.642822265625, "loss": 0.3501, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37582921981811523, "rewards/margins": 1.0362910032272339, "rewards/rejected": -1.4121201038360596, "step": 3472 }, { "epoch": 2.07125890736342, "grad_norm": 44.0, "learning_rate": 5.85748218527316e-07, "logits/chosen": -0.007269053254276514, "logits/rejected": 0.11244423687458038, "logps/chosen": -40.462440490722656, "logps/rejected": -65.11763000488281, "loss": 0.3694, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3953917920589447, "rewards/margins": 1.003770351409912, "rewards/rejected": -1.3991621732711792, "step": 3488 }, { "epoch": 2.0807600950118763, "grad_norm": 40.25, "learning_rate": 5.838479809976247e-07, "logits/chosen": 0.007935550063848495, "logits/rejected": 0.1055232435464859, "logps/chosen": -39.9696159362793, "logps/rejected": -65.34225463867188, "loss": 0.3638, "rewards/accuracies": 0.921875, "rewards/chosen": -0.38794174790382385, "rewards/margins": 0.9854438304901123, "rewards/rejected": -1.3733854293823242, "step": 3504 }, { "epoch": 2.0902612826603324, "grad_norm": 49.0, "learning_rate": 5.819477434679335e-07, "logits/chosen": -0.002856359351426363, "logits/rejected": 0.12148334830999374, "logps/chosen": -40.93573760986328, "logps/rejected": -62.00894546508789, "loss": 0.4115, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.42675450444221497, "rewards/margins": 0.8534324765205383, "rewards/rejected": -1.2801870107650757, "step": 3520 }, { "epoch": 2.0997624703087885, "grad_norm": 52.75, "learning_rate": 5.800475059382422e-07, "logits/chosen": 0.019883442670106888, "logits/rejected": 0.12564308941364288, "logps/chosen": -42.050662994384766, "logps/rejected": -61.40731430053711, "loss": 0.3947, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.45309561491012573, "rewards/margins": 0.8529905080795288, "rewards/rejected": -1.3060861825942993, "step": 3536 }, { "epoch": 2.1092636579572446, "grad_norm": 44.25, "learning_rate": 5.78147268408551e-07, "logits/chosen": -0.0019861001055687666, "logits/rejected": 0.13409556448459625, "logps/chosen": -42.92340850830078, "logps/rejected": -66.3511734008789, "loss": 0.3933, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.43070507049560547, "rewards/margins": 0.907360315322876, "rewards/rejected": -1.3380653858184814, "step": 3552 }, { "epoch": 2.1187648456057007, "grad_norm": 48.25, "learning_rate": 5.762470308788598e-07, "logits/chosen": 0.13548046350479126, "logits/rejected": 0.11156810820102692, "logps/chosen": -41.66028594970703, "logps/rejected": -62.116031646728516, "loss": 0.3611, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3860835134983063, "rewards/margins": 0.9740406274795532, "rewards/rejected": -1.360124111175537, "step": 3568 }, { "epoch": 2.1282660332541568, "grad_norm": 49.25, "learning_rate": 5.743467933491685e-07, "logits/chosen": 0.022612586617469788, "logits/rejected": 0.13782542943954468, "logps/chosen": -39.91732406616211, "logps/rejected": -64.74801635742188, "loss": 0.3778, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.38275399804115295, "rewards/margins": 0.9404458999633789, "rewards/rejected": -1.3231998682022095, "step": 3584 }, { "epoch": 2.137767220902613, "grad_norm": 46.5, "learning_rate": 5.724465558194774e-07, "logits/chosen": 0.06513424217700958, "logits/rejected": 0.1354922503232956, "logps/chosen": -40.324771881103516, "logps/rejected": -62.070045471191406, "loss": 0.3937, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4051070213317871, "rewards/margins": 0.8988076448440552, "rewards/rejected": -1.3039146661758423, "step": 3600 }, { "epoch": 2.147268408551069, "grad_norm": 53.75, "learning_rate": 5.705463182897861e-07, "logits/chosen": 0.09106743335723877, "logits/rejected": 0.22678810358047485, "logps/chosen": -44.392295837402344, "logps/rejected": -65.7562255859375, "loss": 0.3996, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5041301846504211, "rewards/margins": 0.863517701625824, "rewards/rejected": -1.3676478862762451, "step": 3616 }, { "epoch": 2.156769596199525, "grad_norm": 49.5, "learning_rate": 5.68646080760095e-07, "logits/chosen": 0.045438431203365326, "logits/rejected": 0.09103736281394958, "logps/chosen": -38.36294174194336, "logps/rejected": -58.81281661987305, "loss": 0.3819, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3600079417228699, "rewards/margins": 0.9226837754249573, "rewards/rejected": -1.2826919555664062, "step": 3632 }, { "epoch": 2.166270783847981, "grad_norm": 46.0, "learning_rate": 5.667458432304037e-07, "logits/chosen": 0.13296620547771454, "logits/rejected": 0.18886858224868774, "logps/chosen": -40.72220230102539, "logps/rejected": -64.00652313232422, "loss": 0.3879, "rewards/accuracies": 0.890625, "rewards/chosen": -0.399859756231308, "rewards/margins": 0.9217446446418762, "rewards/rejected": -1.3216043710708618, "step": 3648 }, { "epoch": 2.175771971496437, "grad_norm": 45.5, "learning_rate": 5.648456057007126e-07, "logits/chosen": 0.02620471641421318, "logits/rejected": 0.11671482771635056, "logps/chosen": -40.500267028808594, "logps/rejected": -62.4256591796875, "loss": 0.3847, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.39798420667648315, "rewards/margins": 0.910751223564148, "rewards/rejected": -1.3087353706359863, "step": 3664 }, { "epoch": 2.1852731591448933, "grad_norm": 38.5, "learning_rate": 5.629453681710213e-07, "logits/chosen": 0.0530809611082077, "logits/rejected": 0.16400957107543945, "logps/chosen": -42.896827697753906, "logps/rejected": -63.195579528808594, "loss": 0.4002, "rewards/accuracies": 0.921875, "rewards/chosen": -0.394864559173584, "rewards/margins": 0.8955377340316772, "rewards/rejected": -1.2904024124145508, "step": 3680 }, { "epoch": 2.1947743467933494, "grad_norm": 50.5, "learning_rate": 5.610451306413302e-07, "logits/chosen": 0.040826160460710526, "logits/rejected": 0.10787297785282135, "logps/chosen": -40.30386734008789, "logps/rejected": -64.05851745605469, "loss": 0.3583, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3300076425075531, "rewards/margins": 1.0026148557662964, "rewards/rejected": -1.3326225280761719, "step": 3696 }, { "epoch": 2.204275534441805, "grad_norm": 58.0, "learning_rate": 5.591448931116389e-07, "logits/chosen": 0.06701106578111649, "logits/rejected": 0.13853205740451813, "logps/chosen": -42.53071594238281, "logps/rejected": -62.566688537597656, "loss": 0.3845, "rewards/accuracies": 0.890625, "rewards/chosen": -0.41142404079437256, "rewards/margins": 0.9055157899856567, "rewards/rejected": -1.3169398307800293, "step": 3712 }, { "epoch": 2.213776722090261, "grad_norm": 57.0, "learning_rate": 5.572446555819477e-07, "logits/chosen": 0.01710950955748558, "logits/rejected": 0.095205157995224, "logps/chosen": -40.496944427490234, "logps/rejected": -61.76165771484375, "loss": 0.3853, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3820299506187439, "rewards/margins": 0.9107472896575928, "rewards/rejected": -1.292777180671692, "step": 3728 }, { "epoch": 2.223277909738717, "grad_norm": 56.0, "learning_rate": 5.553444180522565e-07, "logits/chosen": 0.08512625843286514, "logits/rejected": 0.16064119338989258, "logps/chosen": -41.3702278137207, "logps/rejected": -66.45726013183594, "loss": 0.3534, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3733311891555786, "rewards/margins": 1.0442577600479126, "rewards/rejected": -1.4175888299942017, "step": 3744 }, { "epoch": 2.2327790973871733, "grad_norm": 60.75, "learning_rate": 5.534441805225653e-07, "logits/chosen": 0.09103821218013763, "logits/rejected": 0.13160774111747742, "logps/chosen": -43.8742790222168, "logps/rejected": -65.8563003540039, "loss": 0.3973, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.4512379467487335, "rewards/margins": 0.9139763712882996, "rewards/rejected": -1.365214228630066, "step": 3760 }, { "epoch": 2.2422802850356294, "grad_norm": 44.75, "learning_rate": 5.515439429928741e-07, "logits/chosen": -0.0005833394825458527, "logits/rejected": 0.10005127638578415, "logps/chosen": -42.24885559082031, "logps/rejected": -64.85872650146484, "loss": 0.3583, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.42689433693885803, "rewards/margins": 0.9682207703590393, "rewards/rejected": -1.3951151371002197, "step": 3776 }, { "epoch": 2.2517814726840855, "grad_norm": 53.5, "learning_rate": 5.496437054631829e-07, "logits/chosen": 0.05875023454427719, "logits/rejected": 0.13941840827465057, "logps/chosen": -42.663665771484375, "logps/rejected": -63.30080795288086, "loss": 0.3864, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4756395220756531, "rewards/margins": 0.941820502281189, "rewards/rejected": -1.4174600839614868, "step": 3792 }, { "epoch": 2.2612826603325415, "grad_norm": 39.25, "learning_rate": 5.477434679334917e-07, "logits/chosen": 0.02674500085413456, "logits/rejected": 0.17680123448371887, "logps/chosen": -38.55767822265625, "logps/rejected": -62.6890754699707, "loss": 0.3493, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.2811494767665863, "rewards/margins": 1.024692416191101, "rewards/rejected": -1.3058419227600098, "step": 3808 }, { "epoch": 2.2707838479809976, "grad_norm": 48.25, "learning_rate": 5.458432304038004e-07, "logits/chosen": 0.009942879900336266, "logits/rejected": 0.1574607640504837, "logps/chosen": -41.19435501098633, "logps/rejected": -63.181053161621094, "loss": 0.3878, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.42380547523498535, "rewards/margins": 0.9084890484809875, "rewards/rejected": -1.3322944641113281, "step": 3824 }, { "epoch": 2.2802850356294537, "grad_norm": 49.25, "learning_rate": 5.439429928741093e-07, "logits/chosen": 0.06854081898927689, "logits/rejected": 0.1759743094444275, "logps/chosen": -40.84571075439453, "logps/rejected": -64.14063262939453, "loss": 0.4032, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4499894678592682, "rewards/margins": 0.8732954263687134, "rewards/rejected": -1.3232848644256592, "step": 3840 }, { "epoch": 2.28978622327791, "grad_norm": 58.5, "learning_rate": 5.42042755344418e-07, "logits/chosen": 0.019341815263032913, "logits/rejected": 0.16274170577526093, "logps/chosen": -41.712154388427734, "logps/rejected": -63.17491149902344, "loss": 0.3988, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4367818534374237, "rewards/margins": 0.8796077370643616, "rewards/rejected": -1.3163896799087524, "step": 3856 }, { "epoch": 2.299287410926366, "grad_norm": 45.5, "learning_rate": 5.401425178147269e-07, "logits/chosen": 0.0484287403523922, "logits/rejected": 0.15023472905158997, "logps/chosen": -40.65409851074219, "logps/rejected": -64.70632934570312, "loss": 0.3934, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3949016332626343, "rewards/margins": 0.9213113784790039, "rewards/rejected": -1.3162128925323486, "step": 3872 }, { "epoch": 2.308788598574822, "grad_norm": 45.5, "learning_rate": 5.382422802850356e-07, "logits/chosen": 0.0558081790804863, "logits/rejected": 0.14007531106472015, "logps/chosen": -40.67841339111328, "logps/rejected": -63.90584945678711, "loss": 0.3557, "rewards/accuracies": 0.953125, "rewards/chosen": -0.40134915709495544, "rewards/margins": 0.9840033054351807, "rewards/rejected": -1.385352373123169, "step": 3888 }, { "epoch": 2.318289786223278, "grad_norm": 43.5, "learning_rate": 5.363420427553445e-07, "logits/chosen": 0.06617691367864609, "logits/rejected": 0.13138173520565033, "logps/chosen": -42.65940856933594, "logps/rejected": -63.277469635009766, "loss": 0.3987, "rewards/accuracies": 0.90625, "rewards/chosen": -0.452262282371521, "rewards/margins": 0.8770920634269714, "rewards/rejected": -1.3293542861938477, "step": 3904 }, { "epoch": 2.3277909738717337, "grad_norm": 49.75, "learning_rate": 5.344418052256532e-07, "logits/chosen": 0.0712217167019844, "logits/rejected": 0.11869163066148758, "logps/chosen": -42.18745803833008, "logps/rejected": -62.14987564086914, "loss": 0.387, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3666439354419708, "rewards/margins": 0.9148606061935425, "rewards/rejected": -1.281504511833191, "step": 3920 }, { "epoch": 2.33729216152019, "grad_norm": 41.25, "learning_rate": 5.32541567695962e-07, "logits/chosen": 0.052989356219768524, "logits/rejected": 0.1383398026227951, "logps/chosen": -39.75597381591797, "logps/rejected": -61.32164764404297, "loss": 0.3859, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3883766829967499, "rewards/margins": 0.9118002653121948, "rewards/rejected": -1.3001768589019775, "step": 3936 }, { "epoch": 2.346793349168646, "grad_norm": 45.25, "learning_rate": 5.306413301662708e-07, "logits/chosen": 0.07647877931594849, "logits/rejected": 0.12356515228748322, "logps/chosen": -41.482666015625, "logps/rejected": -64.74012756347656, "loss": 0.4029, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.4555918574333191, "rewards/margins": 0.8827881217002869, "rewards/rejected": -1.338379979133606, "step": 3952 }, { "epoch": 2.356294536817102, "grad_norm": 49.0, "learning_rate": 5.287410926365796e-07, "logits/chosen": 0.025606969371438026, "logits/rejected": 0.11031323671340942, "logps/chosen": -40.05935287475586, "logps/rejected": -61.486900329589844, "loss": 0.367, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.34517067670822144, "rewards/margins": 1.008548378944397, "rewards/rejected": -1.3537191152572632, "step": 3968 }, { "epoch": 2.365795724465558, "grad_norm": 52.75, "learning_rate": 5.268408551068883e-07, "logits/chosen": 0.020188216120004654, "logits/rejected": 0.07814847677946091, "logps/chosen": -41.763946533203125, "logps/rejected": -62.15244674682617, "loss": 0.3841, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4187239706516266, "rewards/margins": 0.9112571477890015, "rewards/rejected": -1.3299810886383057, "step": 3984 }, { "epoch": 2.375296912114014, "grad_norm": 53.5, "learning_rate": 5.24940617577197e-07, "logits/chosen": -0.013901928439736366, "logits/rejected": 0.1482914239168167, "logps/chosen": -40.2778205871582, "logps/rejected": -64.91542053222656, "loss": 0.3821, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42066463828086853, "rewards/margins": 0.9068177938461304, "rewards/rejected": -1.3274823427200317, "step": 4000 }, { "epoch": 2.3847980997624703, "grad_norm": 60.75, "learning_rate": 5.230403800475059e-07, "logits/chosen": 0.07787059247493744, "logits/rejected": 0.17709147930145264, "logps/chosen": -42.24334716796875, "logps/rejected": -63.37432861328125, "loss": 0.3924, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3965498208999634, "rewards/margins": 0.8994001150131226, "rewards/rejected": -1.295949935913086, "step": 4016 }, { "epoch": 2.3942992874109263, "grad_norm": 47.25, "learning_rate": 5.211401425178146e-07, "logits/chosen": 0.06299006193876266, "logits/rejected": 0.14556781947612762, "logps/chosen": -42.120365142822266, "logps/rejected": -64.56780242919922, "loss": 0.3795, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4348159730434418, "rewards/margins": 0.9128706455230713, "rewards/rejected": -1.3476866483688354, "step": 4032 }, { "epoch": 2.4038004750593824, "grad_norm": 44.25, "learning_rate": 5.192399049881235e-07, "logits/chosen": 0.086119145154953, "logits/rejected": 0.1857333481311798, "logps/chosen": -40.12603759765625, "logps/rejected": -61.00786590576172, "loss": 0.4011, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.41915708780288696, "rewards/margins": 0.8710625767707825, "rewards/rejected": -1.2902196645736694, "step": 4048 }, { "epoch": 2.4133016627078385, "grad_norm": 37.75, "learning_rate": 5.173396674584322e-07, "logits/chosen": 0.027670690789818764, "logits/rejected": 0.1215248703956604, "logps/chosen": -40.98219299316406, "logps/rejected": -62.411865234375, "loss": 0.3782, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3689427971839905, "rewards/margins": 0.9613568186759949, "rewards/rejected": -1.3302994966506958, "step": 4064 }, { "epoch": 2.4228028503562946, "grad_norm": 45.5, "learning_rate": 5.154394299287411e-07, "logits/chosen": 0.03985697776079178, "logits/rejected": 0.12585824728012085, "logps/chosen": -42.074466705322266, "logps/rejected": -64.21247100830078, "loss": 0.3755, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3887619376182556, "rewards/margins": 0.9904804229736328, "rewards/rejected": -1.3792424201965332, "step": 4080 }, { "epoch": 2.4323040380047507, "grad_norm": 43.75, "learning_rate": 5.135391923990498e-07, "logits/chosen": 0.05401856452226639, "logits/rejected": 0.10051050782203674, "logps/chosen": -41.186561584472656, "logps/rejected": -63.16551971435547, "loss": 0.4086, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.4640354812145233, "rewards/margins": 0.8580038547515869, "rewards/rejected": -1.3220393657684326, "step": 4096 }, { "epoch": 2.441805225653207, "grad_norm": 48.25, "learning_rate": 5.116389548693586e-07, "logits/chosen": 0.06853917986154556, "logits/rejected": 0.1522480845451355, "logps/chosen": -41.95103073120117, "logps/rejected": -64.08912658691406, "loss": 0.4026, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4834258258342743, "rewards/margins": 0.8559648394584656, "rewards/rejected": -1.3393906354904175, "step": 4112 }, { "epoch": 2.451306413301663, "grad_norm": 41.5, "learning_rate": 5.097387173396674e-07, "logits/chosen": 0.07197491824626923, "logits/rejected": 0.16541120409965515, "logps/chosen": -42.12483215332031, "logps/rejected": -66.00977325439453, "loss": 0.3502, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.4519379436969757, "rewards/margins": 1.0152369737625122, "rewards/rejected": -1.467175006866455, "step": 4128 }, { "epoch": 2.460807600950119, "grad_norm": 41.0, "learning_rate": 5.078384798099762e-07, "logits/chosen": 0.048155199736356735, "logits/rejected": 0.14590026438236237, "logps/chosen": -41.200191497802734, "logps/rejected": -63.48173522949219, "loss": 0.3958, "rewards/accuracies": 0.890625, "rewards/chosen": -0.46383845806121826, "rewards/margins": 0.8997552394866943, "rewards/rejected": -1.3635936975479126, "step": 4144 }, { "epoch": 2.470308788598575, "grad_norm": 44.0, "learning_rate": 5.05938242280285e-07, "logits/chosen": 0.06484143435955048, "logits/rejected": 0.13990077376365662, "logps/chosen": -41.676612854003906, "logps/rejected": -66.45744323730469, "loss": 0.3469, "rewards/accuracies": 0.9375, "rewards/chosen": -0.404636025428772, "rewards/margins": 1.0517361164093018, "rewards/rejected": -1.4563721418380737, "step": 4160 }, { "epoch": 2.4798099762470307, "grad_norm": 56.0, "learning_rate": 5.040380047505938e-07, "logits/chosen": 0.10340757668018341, "logits/rejected": 0.12481331825256348, "logps/chosen": -41.96209716796875, "logps/rejected": -66.62129211425781, "loss": 0.3694, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4350067377090454, "rewards/margins": 0.967208981513977, "rewards/rejected": -1.4022157192230225, "step": 4176 }, { "epoch": 2.489311163895487, "grad_norm": 49.5, "learning_rate": 5.021377672209026e-07, "logits/chosen": 0.07601352035999298, "logits/rejected": 0.17555804550647736, "logps/chosen": -41.83412170410156, "logps/rejected": -62.92101287841797, "loss": 0.385, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4230985641479492, "rewards/margins": 0.9050178527832031, "rewards/rejected": -1.3281164169311523, "step": 4192 }, { "epoch": 2.498812351543943, "grad_norm": 44.5, "learning_rate": 5.002375296912114e-07, "logits/chosen": 0.03947605937719345, "logits/rejected": 0.14804242551326752, "logps/chosen": -42.070068359375, "logps/rejected": -67.8903579711914, "loss": 0.3496, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3808472752571106, "rewards/margins": 1.0300160646438599, "rewards/rejected": -1.4108633995056152, "step": 4208 }, { "epoch": 2.508313539192399, "grad_norm": 45.25, "learning_rate": 4.983372921615201e-07, "logits/chosen": 0.05546602979302406, "logits/rejected": 0.106099434196949, "logps/chosen": -39.989471435546875, "logps/rejected": -63.233856201171875, "loss": 0.3484, "rewards/accuracies": 0.9375, "rewards/chosen": -0.33619987964630127, "rewards/margins": 1.0194822549819946, "rewards/rejected": -1.3556820154190063, "step": 4224 }, { "epoch": 2.517814726840855, "grad_norm": 52.0, "learning_rate": 4.96437054631829e-07, "logits/chosen": 0.026712900027632713, "logits/rejected": 0.09488269686698914, "logps/chosen": -41.7761344909668, "logps/rejected": -60.326847076416016, "loss": 0.3839, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4112342596054077, "rewards/margins": 0.9102669358253479, "rewards/rejected": -1.3215012550354004, "step": 4240 }, { "epoch": 2.527315914489311, "grad_norm": 47.5, "learning_rate": 4.945368171021377e-07, "logits/chosen": 0.050753381103277206, "logits/rejected": 0.10667050629854202, "logps/chosen": -42.3249626159668, "logps/rejected": -64.19955444335938, "loss": 0.4062, "rewards/accuracies": 0.890625, "rewards/chosen": -0.46207451820373535, "rewards/margins": 0.864716649055481, "rewards/rejected": -1.3267912864685059, "step": 4256 }, { "epoch": 2.5368171021377672, "grad_norm": 47.25, "learning_rate": 4.926365795724465e-07, "logits/chosen": 0.007675782777369022, "logits/rejected": 0.06300470232963562, "logps/chosen": -42.04328918457031, "logps/rejected": -61.39952850341797, "loss": 0.3991, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.45607635378837585, "rewards/margins": 0.8779357671737671, "rewards/rejected": -1.3340120315551758, "step": 4272 }, { "epoch": 2.5463182897862233, "grad_norm": 52.75, "learning_rate": 4.907363420427553e-07, "logits/chosen": 0.08944439142942429, "logits/rejected": 0.16112002730369568, "logps/chosen": -42.05665588378906, "logps/rejected": -65.32984924316406, "loss": 0.3578, "rewards/accuracies": 0.9375, "rewards/chosen": -0.43149158358573914, "rewards/margins": 0.9926659464836121, "rewards/rejected": -1.4241576194763184, "step": 4288 }, { "epoch": 2.5558194774346794, "grad_norm": 43.75, "learning_rate": 4.888361045130641e-07, "logits/chosen": 0.048501964658498764, "logits/rejected": 0.11412826180458069, "logps/chosen": -40.358360290527344, "logps/rejected": -62.57841110229492, "loss": 0.3745, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3942060172557831, "rewards/margins": 0.9297899007797241, "rewards/rejected": -1.32399582862854, "step": 4304 }, { "epoch": 2.5653206650831355, "grad_norm": 62.5, "learning_rate": 4.869358669833729e-07, "logits/chosen": 0.048068106174468994, "logits/rejected": 0.11154348403215408, "logps/chosen": -42.39956283569336, "logps/rejected": -64.91427612304688, "loss": 0.3943, "rewards/accuracies": 0.890625, "rewards/chosen": -0.42655476927757263, "rewards/margins": 0.9072946310043335, "rewards/rejected": -1.3338494300842285, "step": 4320 }, { "epoch": 2.5748218527315916, "grad_norm": 44.25, "learning_rate": 4.850356294536817e-07, "logits/chosen": 0.0894806981086731, "logits/rejected": 0.12811443209648132, "logps/chosen": -42.88471603393555, "logps/rejected": -63.97032165527344, "loss": 0.3795, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4201812446117401, "rewards/margins": 0.9957151412963867, "rewards/rejected": -1.4158962965011597, "step": 4336 }, { "epoch": 2.5843230403800472, "grad_norm": 47.25, "learning_rate": 4.831353919239905e-07, "logits/chosen": 0.11298425495624542, "logits/rejected": 0.12845686078071594, "logps/chosen": -42.59984588623047, "logps/rejected": -62.72551727294922, "loss": 0.4004, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4689531922340393, "rewards/margins": 0.8442877531051636, "rewards/rejected": -1.3132410049438477, "step": 4352 }, { "epoch": 2.5938242280285033, "grad_norm": 41.5, "learning_rate": 4.812351543942993e-07, "logits/chosen": 0.03180324286222458, "logits/rejected": 0.11583473533391953, "logps/chosen": -41.20764923095703, "logps/rejected": -62.94646453857422, "loss": 0.3847, "rewards/accuracies": 0.921875, "rewards/chosen": -0.42716217041015625, "rewards/margins": 0.8983253240585327, "rewards/rejected": -1.3254876136779785, "step": 4368 }, { "epoch": 2.6033254156769594, "grad_norm": 53.5, "learning_rate": 4.793349168646081e-07, "logits/chosen": 0.09349140524864197, "logits/rejected": 0.1733095794916153, "logps/chosen": -42.4179573059082, "logps/rejected": -64.96019744873047, "loss": 0.3653, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4492948353290558, "rewards/margins": 0.9919961094856262, "rewards/rejected": -1.4412909746170044, "step": 4384 }, { "epoch": 2.6128266033254155, "grad_norm": 42.5, "learning_rate": 4.774346793349168e-07, "logits/chosen": -0.035068899393081665, "logits/rejected": 0.11121690273284912, "logps/chosen": -38.56016540527344, "logps/rejected": -62.15085220336914, "loss": 0.3403, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.33855387568473816, "rewards/margins": 1.0478910207748413, "rewards/rejected": -1.3864448070526123, "step": 4400 }, { "epoch": 2.6223277909738716, "grad_norm": 38.75, "learning_rate": 4.755344418052256e-07, "logits/chosen": 0.08347325026988983, "logits/rejected": 0.181388258934021, "logps/chosen": -42.476497650146484, "logps/rejected": -65.1061019897461, "loss": 0.3708, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.4493361711502075, "rewards/margins": 0.9572871327400208, "rewards/rejected": -1.4066232442855835, "step": 4416 }, { "epoch": 2.6318289786223277, "grad_norm": 34.25, "learning_rate": 4.736342042755344e-07, "logits/chosen": -0.0012733405455946922, "logits/rejected": 0.10057978332042694, "logps/chosen": -40.456119537353516, "logps/rejected": -63.483036041259766, "loss": 0.3623, "rewards/accuracies": 0.96875, "rewards/chosen": -0.35637179017066956, "rewards/margins": 0.99317467212677, "rewards/rejected": -1.3495464324951172, "step": 4432 }, { "epoch": 2.6413301662707838, "grad_norm": 60.0, "learning_rate": 4.717339667458432e-07, "logits/chosen": 0.04187817499041557, "logits/rejected": 0.11814681440591812, "logps/chosen": -42.3458251953125, "logps/rejected": -61.30575180053711, "loss": 0.4308, "rewards/accuracies": 0.84375, "rewards/chosen": -0.46507805585861206, "rewards/margins": 0.7858736515045166, "rewards/rejected": -1.2509517669677734, "step": 4448 }, { "epoch": 2.65083135391924, "grad_norm": 43.25, "learning_rate": 4.69833729216152e-07, "logits/chosen": 0.02010425180196762, "logits/rejected": 0.12688446044921875, "logps/chosen": -41.3795051574707, "logps/rejected": -62.74733352661133, "loss": 0.3521, "rewards/accuracies": 0.953125, "rewards/chosen": -0.40290382504463196, "rewards/margins": 0.9980502128601074, "rewards/rejected": -1.4009541273117065, "step": 4464 }, { "epoch": 2.660332541567696, "grad_norm": 49.75, "learning_rate": 4.679334916864608e-07, "logits/chosen": 0.04653356224298477, "logits/rejected": 0.1141427606344223, "logps/chosen": -42.21457290649414, "logps/rejected": -63.52908706665039, "loss": 0.3789, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.39218419790267944, "rewards/margins": 0.9429512023925781, "rewards/rejected": -1.3351353406906128, "step": 4480 }, { "epoch": 2.669833729216152, "grad_norm": 49.0, "learning_rate": 4.660332541567696e-07, "logits/chosen": 0.03440314531326294, "logits/rejected": 0.06475966423749924, "logps/chosen": -40.21307373046875, "logps/rejected": -61.209136962890625, "loss": 0.3654, "rewards/accuracies": 0.90625, "rewards/chosen": -0.36982235312461853, "rewards/margins": 0.9860597252845764, "rewards/rejected": -1.355882167816162, "step": 4496 }, { "epoch": 2.679334916864608, "grad_norm": 40.75, "learning_rate": 4.641330166270784e-07, "logits/chosen": 0.05200592428445816, "logits/rejected": 0.09623152762651443, "logps/chosen": -40.480892181396484, "logps/rejected": -61.244468688964844, "loss": 0.3711, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.32693397998809814, "rewards/margins": 0.9501146078109741, "rewards/rejected": -1.2770487070083618, "step": 4512 }, { "epoch": 2.688836104513064, "grad_norm": 44.0, "learning_rate": 4.622327790973872e-07, "logits/chosen": 0.011258577927947044, "logits/rejected": 0.15000179409980774, "logps/chosen": -42.406429290771484, "logps/rejected": -67.35293579101562, "loss": 0.3564, "rewards/accuracies": 0.921875, "rewards/chosen": -0.42154744267463684, "rewards/margins": 1.0057048797607422, "rewards/rejected": -1.427252173423767, "step": 4528 }, { "epoch": 2.6983372921615203, "grad_norm": 42.25, "learning_rate": 4.60332541567696e-07, "logits/chosen": 0.01903606206178665, "logits/rejected": 0.128606379032135, "logps/chosen": -38.58045959472656, "logps/rejected": -61.52817916870117, "loss": 0.3749, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3256430923938751, "rewards/margins": 0.9467271566390991, "rewards/rejected": -1.2723702192306519, "step": 4544 }, { "epoch": 2.7078384798099764, "grad_norm": 62.25, "learning_rate": 4.584323040380047e-07, "logits/chosen": 0.03335714340209961, "logits/rejected": 0.16635264456272125, "logps/chosen": -39.833648681640625, "logps/rejected": -61.806251525878906, "loss": 0.3795, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3701300024986267, "rewards/margins": 0.9247270226478577, "rewards/rejected": -1.2948570251464844, "step": 4560 }, { "epoch": 2.7173396674584325, "grad_norm": 34.0, "learning_rate": 4.565320665083135e-07, "logits/chosen": 0.011373243294656277, "logits/rejected": 0.1352601945400238, "logps/chosen": -41.90055465698242, "logps/rejected": -65.95397186279297, "loss": 0.372, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4336491525173187, "rewards/margins": 0.9594928026199341, "rewards/rejected": -1.3931418657302856, "step": 4576 }, { "epoch": 2.7268408551068886, "grad_norm": 40.75, "learning_rate": 4.5463182897862227e-07, "logits/chosen": 0.03081374615430832, "logits/rejected": 0.1375807374715805, "logps/chosen": -40.30317687988281, "logps/rejected": -62.725467681884766, "loss": 0.3491, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.33172154426574707, "rewards/margins": 1.030013918876648, "rewards/rejected": -1.3617355823516846, "step": 4592 }, { "epoch": 2.7363420427553447, "grad_norm": 53.5, "learning_rate": 4.5273159144893107e-07, "logits/chosen": 0.002102881669998169, "logits/rejected": 0.05711708217859268, "logps/chosen": -39.306236267089844, "logps/rejected": -62.230323791503906, "loss": 0.3496, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3352733254432678, "rewards/margins": 1.0495156049728394, "rewards/rejected": -1.3847888708114624, "step": 4608 }, { "epoch": 2.7458432304038007, "grad_norm": 54.0, "learning_rate": 4.5083135391923986e-07, "logits/chosen": 0.02053050696849823, "logits/rejected": 0.12392740696668625, "logps/chosen": -41.42273712158203, "logps/rejected": -63.603023529052734, "loss": 0.3822, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4146535396575928, "rewards/margins": 0.9132061004638672, "rewards/rejected": -1.32785964012146, "step": 4624 }, { "epoch": 2.7553444180522564, "grad_norm": 38.5, "learning_rate": 4.4893111638954866e-07, "logits/chosen": 0.0008830556180328131, "logits/rejected": 0.11984378099441528, "logps/chosen": -41.754310607910156, "logps/rejected": -67.43799591064453, "loss": 0.3531, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4328157305717468, "rewards/margins": 1.053360939025879, "rewards/rejected": -1.4861767292022705, "step": 4640 }, { "epoch": 2.7648456057007125, "grad_norm": 51.5, "learning_rate": 4.4703087885985746e-07, "logits/chosen": 0.0631004199385643, "logits/rejected": 0.13722942769527435, "logps/chosen": -41.07665252685547, "logps/rejected": -62.44814682006836, "loss": 0.3926, "rewards/accuracies": 0.90625, "rewards/chosen": -0.39121901988983154, "rewards/margins": 0.9159356355667114, "rewards/rejected": -1.307154655456543, "step": 4656 }, { "epoch": 2.7743467933491686, "grad_norm": 50.25, "learning_rate": 4.4513064133016626e-07, "logits/chosen": 0.034338682889938354, "logits/rejected": 0.18351401388645172, "logps/chosen": -40.274269104003906, "logps/rejected": -65.64212799072266, "loss": 0.3389, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.34793734550476074, "rewards/margins": 1.0671257972717285, "rewards/rejected": -1.4150630235671997, "step": 4672 }, { "epoch": 2.7838479809976246, "grad_norm": 44.25, "learning_rate": 4.4323040380047505e-07, "logits/chosen": 0.0658361166715622, "logits/rejected": 0.22124770283699036, "logps/chosen": -41.521907806396484, "logps/rejected": -66.72534942626953, "loss": 0.337, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3596152663230896, "rewards/margins": 1.0657168626785278, "rewards/rejected": -1.4253321886062622, "step": 4688 }, { "epoch": 2.7933491686460807, "grad_norm": 53.5, "learning_rate": 4.4133016627078385e-07, "logits/chosen": 0.09054507315158844, "logits/rejected": 0.17531618475914001, "logps/chosen": -39.683006286621094, "logps/rejected": -65.05414581298828, "loss": 0.3834, "rewards/accuracies": 0.890625, "rewards/chosen": -0.39716941118240356, "rewards/margins": 0.9602290987968445, "rewards/rejected": -1.357398271560669, "step": 4704 }, { "epoch": 2.802850356294537, "grad_norm": 36.25, "learning_rate": 4.3942992874109265e-07, "logits/chosen": 0.03398045897483826, "logits/rejected": 0.1653180718421936, "logps/chosen": -41.86040496826172, "logps/rejected": -66.90827941894531, "loss": 0.3572, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3843865394592285, "rewards/margins": 1.0734546184539795, "rewards/rejected": -1.457841157913208, "step": 4720 }, { "epoch": 2.812351543942993, "grad_norm": 37.5, "learning_rate": 4.3752969121140144e-07, "logits/chosen": 0.039868682622909546, "logits/rejected": 0.1837385594844818, "logps/chosen": -40.565216064453125, "logps/rejected": -63.868896484375, "loss": 0.3527, "rewards/accuracies": 0.9375, "rewards/chosen": -0.378643661737442, "rewards/margins": 0.9983595013618469, "rewards/rejected": -1.3770031929016113, "step": 4736 }, { "epoch": 2.821852731591449, "grad_norm": 55.25, "learning_rate": 4.356294536817102e-07, "logits/chosen": 0.09310074895620346, "logits/rejected": 0.14162081480026245, "logps/chosen": -41.470603942871094, "logps/rejected": -65.76705169677734, "loss": 0.3606, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4034523069858551, "rewards/margins": 1.0022685527801514, "rewards/rejected": -1.4057209491729736, "step": 4752 }, { "epoch": 2.831353919239905, "grad_norm": 41.75, "learning_rate": 4.33729216152019e-07, "logits/chosen": 0.11650769412517548, "logits/rejected": 0.15587171912193298, "logps/chosen": -41.557228088378906, "logps/rejected": -60.604087829589844, "loss": 0.3735, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.4029754400253296, "rewards/margins": 0.8826481103897095, "rewards/rejected": -1.2856236696243286, "step": 4768 }, { "epoch": 2.840855106888361, "grad_norm": 43.75, "learning_rate": 4.318289786223278e-07, "logits/chosen": 0.02145340107381344, "logits/rejected": 0.14118751883506775, "logps/chosen": -42.241329193115234, "logps/rejected": -64.31900024414062, "loss": 0.3717, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4430907368659973, "rewards/margins": 0.9653340578079224, "rewards/rejected": -1.4084248542785645, "step": 4784 }, { "epoch": 2.850356294536817, "grad_norm": 43.25, "learning_rate": 4.299287410926365e-07, "logits/chosen": 0.026168465614318848, "logits/rejected": 0.06277532130479813, "logps/chosen": -41.47759246826172, "logps/rejected": -60.46055603027344, "loss": 0.4074, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.43215835094451904, "rewards/margins": 0.8435641527175903, "rewards/rejected": -1.2757225036621094, "step": 4800 }, { "epoch": 2.859857482185273, "grad_norm": 42.25, "learning_rate": 4.280285035629453e-07, "logits/chosen": 0.074520543217659, "logits/rejected": 0.16152548789978027, "logps/chosen": -41.94219970703125, "logps/rejected": -64.68109893798828, "loss": 0.3645, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3893333375453949, "rewards/margins": 0.9812647104263306, "rewards/rejected": -1.3705980777740479, "step": 4816 }, { "epoch": 2.869358669833729, "grad_norm": 39.25, "learning_rate": 4.261282660332541e-07, "logits/chosen": 0.02079102396965027, "logits/rejected": 0.15321360528469086, "logps/chosen": -39.940887451171875, "logps/rejected": -64.5802230834961, "loss": 0.3704, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41797706484794617, "rewards/margins": 0.9673931002616882, "rewards/rejected": -1.385370135307312, "step": 4832 }, { "epoch": 2.878859857482185, "grad_norm": 56.75, "learning_rate": 4.242280285035629e-07, "logits/chosen": 0.020523881539702415, "logits/rejected": 0.18188393115997314, "logps/chosen": -40.60096740722656, "logps/rejected": -64.39540100097656, "loss": 0.3794, "rewards/accuracies": 0.90625, "rewards/chosen": -0.42755821347236633, "rewards/margins": 0.9751715064048767, "rewards/rejected": -1.4027297496795654, "step": 4848 }, { "epoch": 2.888361045130641, "grad_norm": 41.0, "learning_rate": 4.223277909738717e-07, "logits/chosen": 0.038833338767290115, "logits/rejected": 0.15533462166786194, "logps/chosen": -39.362998962402344, "logps/rejected": -65.98983001708984, "loss": 0.3129, "rewards/accuracies": 0.96875, "rewards/chosen": -0.29717180132865906, "rewards/margins": 1.1468960046768188, "rewards/rejected": -1.4440677165985107, "step": 4864 }, { "epoch": 2.8978622327790973, "grad_norm": 63.75, "learning_rate": 4.204275534441805e-07, "logits/chosen": 0.05080319941043854, "logits/rejected": 0.11638832837343216, "logps/chosen": -40.73345184326172, "logps/rejected": -62.49778747558594, "loss": 0.387, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3901386260986328, "rewards/margins": 0.9359588623046875, "rewards/rejected": -1.3260974884033203, "step": 4880 }, { "epoch": 2.9073634204275534, "grad_norm": 41.0, "learning_rate": 4.185273159144893e-07, "logits/chosen": 0.05678907781839371, "logits/rejected": 0.19164641201496124, "logps/chosen": -40.55916213989258, "logps/rejected": -65.99730682373047, "loss": 0.3831, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.3813077211380005, "rewards/margins": 0.9531018733978271, "rewards/rejected": -1.3344097137451172, "step": 4896 }, { "epoch": 2.9168646080760094, "grad_norm": 53.0, "learning_rate": 4.166270783847981e-07, "logits/chosen": -0.0027349982410669327, "logits/rejected": 0.0990891233086586, "logps/chosen": -39.93202209472656, "logps/rejected": -62.99840545654297, "loss": 0.3721, "rewards/accuracies": 0.921875, "rewards/chosen": -0.36999353766441345, "rewards/margins": 0.9719130992889404, "rewards/rejected": -1.3419066667556763, "step": 4912 }, { "epoch": 2.9263657957244655, "grad_norm": 40.25, "learning_rate": 4.147268408551069e-07, "logits/chosen": 0.05395427346229553, "logits/rejected": 0.13786588609218597, "logps/chosen": -40.425132751464844, "logps/rejected": -61.25279998779297, "loss": 0.3538, "rewards/accuracies": 0.9375, "rewards/chosen": -0.34139615297317505, "rewards/margins": 1.0007226467132568, "rewards/rejected": -1.3421188592910767, "step": 4928 }, { "epoch": 2.9358669833729216, "grad_norm": 65.0, "learning_rate": 4.1282660332541564e-07, "logits/chosen": 0.09111806005239487, "logits/rejected": 0.13585761189460754, "logps/chosen": -43.16571807861328, "logps/rejected": -64.36392211914062, "loss": 0.3918, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4209873378276825, "rewards/margins": 0.8701137900352478, "rewards/rejected": -1.2911012172698975, "step": 4944 }, { "epoch": 2.9453681710213777, "grad_norm": 53.0, "learning_rate": 4.1092636579572444e-07, "logits/chosen": 0.05853826552629471, "logits/rejected": 0.18161046504974365, "logps/chosen": -40.96638488769531, "logps/rejected": -66.1131591796875, "loss": 0.3467, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3545738160610199, "rewards/margins": 1.0768417119979858, "rewards/rejected": -1.4314155578613281, "step": 4960 }, { "epoch": 2.954869358669834, "grad_norm": 41.5, "learning_rate": 4.0902612826603324e-07, "logits/chosen": 0.06567525863647461, "logits/rejected": 0.18003502488136292, "logps/chosen": -39.08511734008789, "logps/rejected": -61.72608947753906, "loss": 0.3658, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3397156000137329, "rewards/margins": 1.0000014305114746, "rewards/rejected": -1.3397170305252075, "step": 4976 }, { "epoch": 2.96437054631829, "grad_norm": 44.25, "learning_rate": 4.0712589073634203e-07, "logits/chosen": 0.016267111524939537, "logits/rejected": 0.13151559233665466, "logps/chosen": -40.71261978149414, "logps/rejected": -62.006256103515625, "loss": 0.3973, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4248185455799103, "rewards/margins": 0.8792138695716858, "rewards/rejected": -1.3040324449539185, "step": 4992 }, { "epoch": 2.973871733966746, "grad_norm": 58.5, "learning_rate": 4.0522565320665083e-07, "logits/chosen": 0.03460274264216423, "logits/rejected": 0.13233664631843567, "logps/chosen": -40.716407775878906, "logps/rejected": -62.79671096801758, "loss": 0.3879, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4130927622318268, "rewards/margins": 0.9299353957176208, "rewards/rejected": -1.3430280685424805, "step": 5008 }, { "epoch": 2.983372921615202, "grad_norm": 47.0, "learning_rate": 4.0332541567695963e-07, "logits/chosen": -0.026960894465446472, "logits/rejected": 0.11078447848558426, "logps/chosen": -39.08903121948242, "logps/rejected": -62.35736083984375, "loss": 0.3588, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3670194745063782, "rewards/margins": 1.0107150077819824, "rewards/rejected": -1.3777345418930054, "step": 5024 }, { "epoch": 2.992874109263658, "grad_norm": 54.0, "learning_rate": 4.0142517814726837e-07, "logits/chosen": 0.026894917711615562, "logits/rejected": 0.1446702629327774, "logps/chosen": -41.22724914550781, "logps/rejected": -64.66578674316406, "loss": 0.3601, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3958403468132019, "rewards/margins": 0.9907092452049255, "rewards/rejected": -1.386549472808838, "step": 5040 }, { "epoch": 3.002375296912114, "grad_norm": 38.25, "learning_rate": 3.9952494061757717e-07, "logits/chosen": 0.0721140205860138, "logits/rejected": 0.08904615044593811, "logps/chosen": -42.36117172241211, "logps/rejected": -62.753578186035156, "loss": 0.3669, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4690694510936737, "rewards/margins": 0.9362343549728394, "rewards/rejected": -1.4053038358688354, "step": 5056 }, { "epoch": 3.01187648456057, "grad_norm": 38.75, "learning_rate": 3.9762470308788596e-07, "logits/chosen": 0.08058139681816101, "logits/rejected": 0.14635083079338074, "logps/chosen": -40.83808135986328, "logps/rejected": -62.58879470825195, "loss": 0.3736, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.40799736976623535, "rewards/margins": 0.9339090585708618, "rewards/rejected": -1.3419064283370972, "step": 5072 }, { "epoch": 3.021377672209026, "grad_norm": 49.25, "learning_rate": 3.9572446555819476e-07, "logits/chosen": 0.07044284790754318, "logits/rejected": 0.14013811945915222, "logps/chosen": -40.04065704345703, "logps/rejected": -60.96607971191406, "loss": 0.3681, "rewards/accuracies": 0.921875, "rewards/chosen": -0.35846540331840515, "rewards/margins": 0.9638465642929077, "rewards/rejected": -1.3223118782043457, "step": 5088 }, { "epoch": 3.030878859857482, "grad_norm": 52.0, "learning_rate": 3.9382422802850356e-07, "logits/chosen": 0.060851648449897766, "logits/rejected": 0.1331576406955719, "logps/chosen": -41.8791618347168, "logps/rejected": -64.0675048828125, "loss": 0.3568, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.359686017036438, "rewards/margins": 0.9825432300567627, "rewards/rejected": -1.3422291278839111, "step": 5104 }, { "epoch": 3.040380047505938, "grad_norm": 40.5, "learning_rate": 3.919239904988123e-07, "logits/chosen": 0.009696897119283676, "logits/rejected": 0.12189489603042603, "logps/chosen": -39.73085021972656, "logps/rejected": -63.392818450927734, "loss": 0.3708, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3765300512313843, "rewards/margins": 0.9506427049636841, "rewards/rejected": -1.3271726369857788, "step": 5120 }, { "epoch": 3.0498812351543942, "grad_norm": 48.75, "learning_rate": 3.900237529691211e-07, "logits/chosen": 0.10276854038238525, "logits/rejected": 0.13192720711231232, "logps/chosen": -42.13685989379883, "logps/rejected": -63.64410400390625, "loss": 0.3788, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.39574676752090454, "rewards/margins": 0.9484460949897766, "rewards/rejected": -1.3441928625106812, "step": 5136 }, { "epoch": 3.0593824228028503, "grad_norm": 52.25, "learning_rate": 3.881235154394299e-07, "logits/chosen": 0.11582867801189423, "logits/rejected": 0.11568093299865723, "logps/chosen": -43.7314338684082, "logps/rejected": -63.54302215576172, "loss": 0.3977, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.4822978973388672, "rewards/margins": 0.8951501846313477, "rewards/rejected": -1.3774480819702148, "step": 5152 }, { "epoch": 3.0688836104513064, "grad_norm": 49.25, "learning_rate": 3.862232779097387e-07, "logits/chosen": 0.08060070872306824, "logits/rejected": 0.10565990209579468, "logps/chosen": -41.34340286254883, "logps/rejected": -63.646820068359375, "loss": 0.3848, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3923201858997345, "rewards/margins": 0.9066511392593384, "rewards/rejected": -1.29897141456604, "step": 5168 }, { "epoch": 3.0783847980997625, "grad_norm": 46.25, "learning_rate": 3.843230403800475e-07, "logits/chosen": 0.055783893913030624, "logits/rejected": 0.14008364081382751, "logps/chosen": -41.782936096191406, "logps/rejected": -62.20945739746094, "loss": 0.3836, "rewards/accuracies": 0.9375, "rewards/chosen": -0.39728862047195435, "rewards/margins": 0.8838520050048828, "rewards/rejected": -1.2811405658721924, "step": 5184 }, { "epoch": 3.0878859857482186, "grad_norm": 43.5, "learning_rate": 3.824228028503563e-07, "logits/chosen": 0.025185655802488327, "logits/rejected": 0.12884651124477386, "logps/chosen": -41.46108627319336, "logps/rejected": -65.40599822998047, "loss": 0.3744, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.3796524405479431, "rewards/margins": 0.9922239780426025, "rewards/rejected": -1.3718763589859009, "step": 5200 }, { "epoch": 3.0973871733966747, "grad_norm": 57.25, "learning_rate": 3.805225653206651e-07, "logits/chosen": 0.036128733307123184, "logits/rejected": 0.14582857489585876, "logps/chosen": -40.435264587402344, "logps/rejected": -62.72749328613281, "loss": 0.3616, "rewards/accuracies": 0.90625, "rewards/chosen": -0.34147587418556213, "rewards/margins": 1.0060913562774658, "rewards/rejected": -1.3475672006607056, "step": 5216 }, { "epoch": 3.1068883610451308, "grad_norm": 54.25, "learning_rate": 3.786223277909739e-07, "logits/chosen": 0.08208269625902176, "logits/rejected": 0.1514563262462616, "logps/chosen": -43.135169982910156, "logps/rejected": -62.251564025878906, "loss": 0.4333, "rewards/accuracies": 0.8515625, "rewards/chosen": -0.4657037854194641, "rewards/margins": 0.7968182563781738, "rewards/rejected": -1.2625218629837036, "step": 5232 }, { "epoch": 3.116389548693587, "grad_norm": 47.75, "learning_rate": 3.767220902612827e-07, "logits/chosen": 0.0023325812071561813, "logits/rejected": 0.07401884347200394, "logps/chosen": -42.72369384765625, "logps/rejected": -65.19200134277344, "loss": 0.3696, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4340641498565674, "rewards/margins": 0.9625180959701538, "rewards/rejected": -1.3965823650360107, "step": 5248 }, { "epoch": 3.125890736342043, "grad_norm": 55.5, "learning_rate": 3.748218527315915e-07, "logits/chosen": 0.04036543890833855, "logits/rejected": 0.1542033553123474, "logps/chosen": -40.91764831542969, "logps/rejected": -67.87660217285156, "loss": 0.3403, "rewards/accuracies": 0.9375, "rewards/chosen": -0.42045873403549194, "rewards/margins": 1.0873808860778809, "rewards/rejected": -1.5078396797180176, "step": 5264 }, { "epoch": 3.1353919239904986, "grad_norm": 55.0, "learning_rate": 3.729216152019002e-07, "logits/chosen": 0.0443246066570282, "logits/rejected": 0.21922534704208374, "logps/chosen": -42.55435562133789, "logps/rejected": -65.33242797851562, "loss": 0.4199, "rewards/accuracies": 0.875, "rewards/chosen": -0.4705503284931183, "rewards/margins": 0.8472099304199219, "rewards/rejected": -1.3177603483200073, "step": 5280 }, { "epoch": 3.1448931116389547, "grad_norm": 53.25, "learning_rate": 3.71021377672209e-07, "logits/chosen": 0.009390661492943764, "logits/rejected": 0.12596291303634644, "logps/chosen": -40.291473388671875, "logps/rejected": -62.212562561035156, "loss": 0.3862, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4117724299430847, "rewards/margins": 0.9006836414337158, "rewards/rejected": -1.3124560117721558, "step": 5296 }, { "epoch": 3.1543942992874108, "grad_norm": 50.0, "learning_rate": 3.6912114014251776e-07, "logits/chosen": -0.022532382979989052, "logits/rejected": 0.11396850645542145, "logps/chosen": -40.82377624511719, "logps/rejected": -63.843467712402344, "loss": 0.3762, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.40509068965911865, "rewards/margins": 0.9473409652709961, "rewards/rejected": -1.3524316549301147, "step": 5312 }, { "epoch": 3.163895486935867, "grad_norm": 51.25, "learning_rate": 3.6722090261282656e-07, "logits/chosen": -0.019587915390729904, "logits/rejected": 0.09787953644990921, "logps/chosen": -38.83479690551758, "logps/rejected": -62.37450408935547, "loss": 0.3457, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.31038060784339905, "rewards/margins": 1.0302282571792603, "rewards/rejected": -1.340608835220337, "step": 5328 }, { "epoch": 3.173396674584323, "grad_norm": 45.0, "learning_rate": 3.6532066508313535e-07, "logits/chosen": 0.0068716611713171005, "logits/rejected": 0.10715761035680771, "logps/chosen": -39.36448287963867, "logps/rejected": -61.61016845703125, "loss": 0.3694, "rewards/accuracies": 0.921875, "rewards/chosen": -0.35794466733932495, "rewards/margins": 0.9621202349662781, "rewards/rejected": -1.3200650215148926, "step": 5344 }, { "epoch": 3.182897862232779, "grad_norm": 57.5, "learning_rate": 3.6342042755344415e-07, "logits/chosen": 0.08224662393331528, "logits/rejected": 0.10815519094467163, "logps/chosen": -43.51884078979492, "logps/rejected": -65.097900390625, "loss": 0.3834, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4219568073749542, "rewards/margins": 0.934171736240387, "rewards/rejected": -1.3561286926269531, "step": 5360 }, { "epoch": 3.192399049881235, "grad_norm": 47.5, "learning_rate": 3.6152019002375295e-07, "logits/chosen": 0.011415719985961914, "logits/rejected": 0.10578812658786774, "logps/chosen": -41.20673370361328, "logps/rejected": -63.52978515625, "loss": 0.365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.41507184505462646, "rewards/margins": 0.9821330308914185, "rewards/rejected": -1.397204875946045, "step": 5376 }, { "epoch": 3.201900237529691, "grad_norm": 42.5, "learning_rate": 3.5961995249406174e-07, "logits/chosen": 0.03810262307524681, "logits/rejected": 0.12873801589012146, "logps/chosen": -41.731388092041016, "logps/rejected": -65.47847747802734, "loss": 0.3798, "rewards/accuracies": 0.921875, "rewards/chosen": -0.44025665521621704, "rewards/margins": 0.9518187046051025, "rewards/rejected": -1.3920754194259644, "step": 5392 }, { "epoch": 3.2114014251781473, "grad_norm": 49.25, "learning_rate": 3.5771971496437054e-07, "logits/chosen": -0.023601891472935677, "logits/rejected": 0.06935597211122513, "logps/chosen": -38.68426513671875, "logps/rejected": -61.103973388671875, "loss": 0.3634, "rewards/accuracies": 0.921875, "rewards/chosen": -0.37336137890815735, "rewards/margins": 1.0120704174041748, "rewards/rejected": -1.3854318857192993, "step": 5408 }, { "epoch": 3.2209026128266034, "grad_norm": 52.5, "learning_rate": 3.5581947743467934e-07, "logits/chosen": 0.04542490094900131, "logits/rejected": 0.09217678010463715, "logps/chosen": -43.481048583984375, "logps/rejected": -63.5504264831543, "loss": 0.415, "rewards/accuracies": 0.890625, "rewards/chosen": -0.49973058700561523, "rewards/margins": 0.8297138214111328, "rewards/rejected": -1.3294442892074585, "step": 5424 }, { "epoch": 3.2304038004750595, "grad_norm": 47.0, "learning_rate": 3.5391923990498813e-07, "logits/chosen": 0.10594628006219864, "logits/rejected": 0.17065931856632233, "logps/chosen": -40.92080307006836, "logps/rejected": -64.85243225097656, "loss": 0.3541, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3807818293571472, "rewards/margins": 1.0301458835601807, "rewards/rejected": -1.4109277725219727, "step": 5440 }, { "epoch": 3.2399049881235156, "grad_norm": 66.0, "learning_rate": 3.5201900237529693e-07, "logits/chosen": 0.04224825277924538, "logits/rejected": 0.11179321259260178, "logps/chosen": -42.496681213378906, "logps/rejected": -64.15071105957031, "loss": 0.3789, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.41750308871269226, "rewards/margins": 0.9374274015426636, "rewards/rejected": -1.3549304008483887, "step": 5456 }, { "epoch": 3.2494061757719717, "grad_norm": 51.75, "learning_rate": 3.5011876484560573e-07, "logits/chosen": 0.06084592267870903, "logits/rejected": 0.13545851409435272, "logps/chosen": -38.884830474853516, "logps/rejected": -61.21929931640625, "loss": 0.3627, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3169533908367157, "rewards/margins": 0.991154670715332, "rewards/rejected": -1.3081080913543701, "step": 5472 }, { "epoch": 3.2589073634204277, "grad_norm": 52.5, "learning_rate": 3.482185273159145e-07, "logits/chosen": 0.03806290030479431, "logits/rejected": 0.11884848773479462, "logps/chosen": -41.503875732421875, "logps/rejected": -63.86762237548828, "loss": 0.3766, "rewards/accuracies": 0.9375, "rewards/chosen": -0.40684768557548523, "rewards/margins": 0.953045129776001, "rewards/rejected": -1.3598929643630981, "step": 5488 }, { "epoch": 3.268408551068884, "grad_norm": 57.0, "learning_rate": 3.463182897862232e-07, "logits/chosen": -0.0012608803808689117, "logits/rejected": 0.10703101754188538, "logps/chosen": -41.227787017822266, "logps/rejected": -64.53129577636719, "loss": 0.3804, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4662329852581024, "rewards/margins": 0.9123408794403076, "rewards/rejected": -1.3785738945007324, "step": 5504 }, { "epoch": 3.2779097387173395, "grad_norm": 35.75, "learning_rate": 3.44418052256532e-07, "logits/chosen": 0.09872997552156448, "logits/rejected": 0.16530072689056396, "logps/chosen": -42.996360778808594, "logps/rejected": -64.78094482421875, "loss": 0.3656, "rewards/accuracies": 0.921875, "rewards/chosen": -0.43684783577919006, "rewards/margins": 0.9832981824874878, "rewards/rejected": -1.420146107673645, "step": 5520 }, { "epoch": 3.2874109263657956, "grad_norm": 50.25, "learning_rate": 3.425178147268408e-07, "logits/chosen": 0.03449930623173714, "logits/rejected": 0.17792941629886627, "logps/chosen": -40.4708366394043, "logps/rejected": -65.352294921875, "loss": 0.3781, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.43737536668777466, "rewards/margins": 0.9423834681510925, "rewards/rejected": -1.3797588348388672, "step": 5536 }, { "epoch": 3.2969121140142517, "grad_norm": 47.0, "learning_rate": 3.406175771971496e-07, "logits/chosen": 0.09654662013053894, "logits/rejected": 0.17820453643798828, "logps/chosen": -41.23036193847656, "logps/rejected": -63.637550354003906, "loss": 0.3973, "rewards/accuracies": 0.859375, "rewards/chosen": -0.36412250995635986, "rewards/margins": 0.9050154685974121, "rewards/rejected": -1.269137978553772, "step": 5552 }, { "epoch": 3.3064133016627077, "grad_norm": 62.5, "learning_rate": 3.387173396674584e-07, "logits/chosen": 0.03694698214530945, "logits/rejected": 0.13538768887519836, "logps/chosen": -42.770050048828125, "logps/rejected": -63.76899719238281, "loss": 0.4137, "rewards/accuracies": 0.859375, "rewards/chosen": -0.49573254585266113, "rewards/margins": 0.8381365537643433, "rewards/rejected": -1.3338689804077148, "step": 5568 }, { "epoch": 3.315914489311164, "grad_norm": 45.0, "learning_rate": 3.368171021377672e-07, "logits/chosen": 0.02915555238723755, "logits/rejected": 0.1136770024895668, "logps/chosen": -42.1306266784668, "logps/rejected": -64.29971313476562, "loss": 0.3729, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4326779246330261, "rewards/margins": 0.928207516670227, "rewards/rejected": -1.360885500907898, "step": 5584 }, { "epoch": 3.32541567695962, "grad_norm": 45.75, "learning_rate": 3.34916864608076e-07, "logits/chosen": 0.07809650897979736, "logits/rejected": 0.12652210891246796, "logps/chosen": -41.568660736083984, "logps/rejected": -63.57560348510742, "loss": 0.3613, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3894152045249939, "rewards/margins": 1.0010182857513428, "rewards/rejected": -1.3904335498809814, "step": 5600 }, { "epoch": 3.334916864608076, "grad_norm": 58.5, "learning_rate": 3.330166270783848e-07, "logits/chosen": 0.040801357477903366, "logits/rejected": 0.1780150681734085, "logps/chosen": -41.86552810668945, "logps/rejected": -65.90018463134766, "loss": 0.3516, "rewards/accuracies": 0.921875, "rewards/chosen": -0.38096481561660767, "rewards/margins": 1.0388493537902832, "rewards/rejected": -1.4198143482208252, "step": 5616 }, { "epoch": 3.344418052256532, "grad_norm": 53.5, "learning_rate": 3.311163895486936e-07, "logits/chosen": 0.01211823895573616, "logits/rejected": 0.09781680256128311, "logps/chosen": -41.53929138183594, "logps/rejected": -63.851688385009766, "loss": 0.3701, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4125053286552429, "rewards/margins": 0.9792938828468323, "rewards/rejected": -1.3917990922927856, "step": 5632 }, { "epoch": 3.353919239904988, "grad_norm": 50.75, "learning_rate": 3.292161520190024e-07, "logits/chosen": 0.09114633500576019, "logits/rejected": 0.15826475620269775, "logps/chosen": -41.09789276123047, "logps/rejected": -65.02250671386719, "loss": 0.3713, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.41848012804985046, "rewards/margins": 0.9512667059898376, "rewards/rejected": -1.3697468042373657, "step": 5648 }, { "epoch": 3.3634204275534443, "grad_norm": 51.25, "learning_rate": 3.273159144893112e-07, "logits/chosen": 0.021810825914144516, "logits/rejected": 0.11112373322248459, "logps/chosen": -43.460018157958984, "logps/rejected": -64.75310516357422, "loss": 0.3962, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.4909437894821167, "rewards/margins": 0.8337816596031189, "rewards/rejected": -1.3247253894805908, "step": 5664 }, { "epoch": 3.3729216152019004, "grad_norm": 71.0, "learning_rate": 3.2541567695961993e-07, "logits/chosen": 0.06712229549884796, "logits/rejected": 0.15457472205162048, "logps/chosen": -41.23008346557617, "logps/rejected": -63.24055480957031, "loss": 0.3614, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3526281416416168, "rewards/margins": 0.9992504119873047, "rewards/rejected": -1.3518785238265991, "step": 5680 }, { "epoch": 3.3824228028503565, "grad_norm": 37.5, "learning_rate": 3.235154394299287e-07, "logits/chosen": 0.008397895842790604, "logits/rejected": 0.11099565029144287, "logps/chosen": -42.79941940307617, "logps/rejected": -62.79872131347656, "loss": 0.3956, "rewards/accuracies": 0.890625, "rewards/chosen": -0.432941198348999, "rewards/margins": 0.8876244425773621, "rewards/rejected": -1.3205657005310059, "step": 5696 }, { "epoch": 3.391923990498812, "grad_norm": 46.75, "learning_rate": 3.216152019002375e-07, "logits/chosen": 0.03148173540830612, "logits/rejected": 0.09412042796611786, "logps/chosen": -40.27482223510742, "logps/rejected": -64.9065170288086, "loss": 0.3578, "rewards/accuracies": 0.921875, "rewards/chosen": -0.36367034912109375, "rewards/margins": 1.0289617776870728, "rewards/rejected": -1.392632246017456, "step": 5712 }, { "epoch": 3.401425178147268, "grad_norm": 61.25, "learning_rate": 3.1971496437054627e-07, "logits/chosen": 0.03281719982624054, "logits/rejected": 0.10020038485527039, "logps/chosen": -41.20103073120117, "logps/rejected": -62.454811096191406, "loss": 0.3892, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4000158905982971, "rewards/margins": 0.9230848550796509, "rewards/rejected": -1.3231008052825928, "step": 5728 }, { "epoch": 3.4109263657957243, "grad_norm": 42.5, "learning_rate": 3.1781472684085506e-07, "logits/chosen": 0.03375418484210968, "logits/rejected": 0.07718026638031006, "logps/chosen": -42.45466613769531, "logps/rejected": -62.32984924316406, "loss": 0.4054, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.4012918472290039, "rewards/margins": 0.8690704107284546, "rewards/rejected": -1.2703622579574585, "step": 5744 }, { "epoch": 3.4204275534441804, "grad_norm": 50.25, "learning_rate": 3.1591448931116386e-07, "logits/chosen": 0.04832587391138077, "logits/rejected": 0.14456292986869812, "logps/chosen": -38.7043571472168, "logps/rejected": -61.933982849121094, "loss": 0.368, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.33057889342308044, "rewards/margins": 0.9730508923530579, "rewards/rejected": -1.3036296367645264, "step": 5760 }, { "epoch": 3.4299287410926365, "grad_norm": 41.5, "learning_rate": 3.1401425178147266e-07, "logits/chosen": 0.07459306716918945, "logits/rejected": 0.08237803727388382, "logps/chosen": -41.34931945800781, "logps/rejected": -62.95779037475586, "loss": 0.3687, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.37355130910873413, "rewards/margins": 1.0145928859710693, "rewards/rejected": -1.3881440162658691, "step": 5776 }, { "epoch": 3.4394299287410925, "grad_norm": 57.5, "learning_rate": 3.1211401425178145e-07, "logits/chosen": 0.03901844099164009, "logits/rejected": 0.11285356432199478, "logps/chosen": -40.3481559753418, "logps/rejected": -63.19662094116211, "loss": 0.3758, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3881966769695282, "rewards/margins": 0.9612338542938232, "rewards/rejected": -1.3494304418563843, "step": 5792 }, { "epoch": 3.4489311163895486, "grad_norm": 35.75, "learning_rate": 3.1021377672209025e-07, "logits/chosen": -0.005038658622652292, "logits/rejected": 0.06839510053396225, "logps/chosen": -39.993309020996094, "logps/rejected": -60.90965270996094, "loss": 0.3949, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.37709131836891174, "rewards/margins": 0.8873628973960876, "rewards/rejected": -1.2644541263580322, "step": 5808 }, { "epoch": 3.4584323040380047, "grad_norm": 56.0, "learning_rate": 3.0831353919239905e-07, "logits/chosen": 0.09682201594114304, "logits/rejected": 0.11965522170066833, "logps/chosen": -41.68549346923828, "logps/rejected": -64.51807403564453, "loss": 0.3576, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.36759233474731445, "rewards/margins": 1.0216158628463745, "rewards/rejected": -1.389208197593689, "step": 5824 }, { "epoch": 3.467933491686461, "grad_norm": 47.25, "learning_rate": 3.0641330166270784e-07, "logits/chosen": 0.04335436224937439, "logits/rejected": 0.09136922657489777, "logps/chosen": -41.07777404785156, "logps/rejected": -62.33134460449219, "loss": 0.3905, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.41379624605178833, "rewards/margins": 0.8802096843719482, "rewards/rejected": -1.2940058708190918, "step": 5840 }, { "epoch": 3.477434679334917, "grad_norm": 46.75, "learning_rate": 3.0451306413301664e-07, "logits/chosen": 0.07700560986995697, "logits/rejected": 0.11378947645425797, "logps/chosen": -41.164493560791016, "logps/rejected": -62.910099029541016, "loss": 0.3575, "rewards/accuracies": 0.96875, "rewards/chosen": -0.35974451899528503, "rewards/margins": 0.9829455614089966, "rewards/rejected": -1.3426902294158936, "step": 5856 }, { "epoch": 3.486935866983373, "grad_norm": 37.5, "learning_rate": 3.026128266033254e-07, "logits/chosen": -0.009967565536499023, "logits/rejected": 0.13184106349945068, "logps/chosen": -41.89753723144531, "logps/rejected": -65.06842041015625, "loss": 0.373, "rewards/accuracies": 0.921875, "rewards/chosen": -0.43801864981651306, "rewards/margins": 0.9392563104629517, "rewards/rejected": -1.377274990081787, "step": 5872 }, { "epoch": 3.496437054631829, "grad_norm": 47.25, "learning_rate": 3.007125890736342e-07, "logits/chosen": 0.0052648792043328285, "logits/rejected": 0.1504189670085907, "logps/chosen": -40.17364501953125, "logps/rejected": -62.997554779052734, "loss": 0.3612, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3528364896774292, "rewards/margins": 0.9979708194732666, "rewards/rejected": -1.3508071899414062, "step": 5888 }, { "epoch": 3.505938242280285, "grad_norm": 54.75, "learning_rate": 2.98812351543943e-07, "logits/chosen": 0.017038684338331223, "logits/rejected": 0.06112390756607056, "logps/chosen": -41.39101791381836, "logps/rejected": -64.11455535888672, "loss": 0.386, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4848855137825012, "rewards/margins": 0.9375608563423157, "rewards/rejected": -1.4224462509155273, "step": 5904 }, { "epoch": 3.5154394299287413, "grad_norm": 49.75, "learning_rate": 2.969121140142518e-07, "logits/chosen": 0.020917030051350594, "logits/rejected": 0.09810462594032288, "logps/chosen": -39.80662155151367, "logps/rejected": -60.70354080200195, "loss": 0.3528, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3079432249069214, "rewards/margins": 1.035301685333252, "rewards/rejected": -1.3432449102401733, "step": 5920 }, { "epoch": 3.5249406175771973, "grad_norm": 48.25, "learning_rate": 2.9501187648456057e-07, "logits/chosen": 0.07180722057819366, "logits/rejected": 0.20187009871006012, "logps/chosen": -42.24882507324219, "logps/rejected": -66.23828887939453, "loss": 0.3684, "rewards/accuracies": 0.921875, "rewards/chosen": -0.36295828223228455, "rewards/margins": 0.9907328486442566, "rewards/rejected": -1.3536912202835083, "step": 5936 }, { "epoch": 3.5344418052256534, "grad_norm": 55.75, "learning_rate": 2.9311163895486937e-07, "logits/chosen": 0.04826946556568146, "logits/rejected": 0.18203888833522797, "logps/chosen": -44.12788772583008, "logps/rejected": -65.50685119628906, "loss": 0.4068, "rewards/accuracies": 0.90625, "rewards/chosen": -0.45171403884887695, "rewards/margins": 0.8484476208686829, "rewards/rejected": -1.300161600112915, "step": 5952 }, { "epoch": 3.5439429928741095, "grad_norm": 67.5, "learning_rate": 2.912114014251781e-07, "logits/chosen": 0.03645118325948715, "logits/rejected": 0.10001954436302185, "logps/chosen": -41.21052551269531, "logps/rejected": -59.9468994140625, "loss": 0.4013, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.432812362909317, "rewards/margins": 0.8651708364486694, "rewards/rejected": -1.297983169555664, "step": 5968 }, { "epoch": 3.553444180522565, "grad_norm": 59.0, "learning_rate": 2.893111638954869e-07, "logits/chosen": 0.014936832711100578, "logits/rejected": 0.13327737152576447, "logps/chosen": -39.648658752441406, "logps/rejected": -64.09784698486328, "loss": 0.3456, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.34701600670814514, "rewards/margins": 1.0596061944961548, "rewards/rejected": -1.4066221714019775, "step": 5984 }, { "epoch": 3.5629453681710213, "grad_norm": 67.0, "learning_rate": 2.874109263657957e-07, "logits/chosen": -0.038970720022916794, "logits/rejected": 0.1254507154226303, "logps/chosen": -39.5800666809082, "logps/rejected": -63.40620422363281, "loss": 0.3871, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3893817365169525, "rewards/margins": 0.9221255779266357, "rewards/rejected": -1.3115073442459106, "step": 6000 }, { "epoch": 3.5724465558194773, "grad_norm": 46.25, "learning_rate": 2.855106888361045e-07, "logits/chosen": 0.027300620451569557, "logits/rejected": 0.14734551310539246, "logps/chosen": -39.858192443847656, "logps/rejected": -64.0389633178711, "loss": 0.365, "rewards/accuracies": 0.9375, "rewards/chosen": -0.40028518438339233, "rewards/margins": 0.9700754284858704, "rewards/rejected": -1.3703604936599731, "step": 6016 }, { "epoch": 3.5819477434679334, "grad_norm": 50.0, "learning_rate": 2.836104513064133e-07, "logits/chosen": 0.056467145681381226, "logits/rejected": 0.12261079996824265, "logps/chosen": -40.67634963989258, "logps/rejected": -62.402809143066406, "loss": 0.3536, "rewards/accuracies": 0.953125, "rewards/chosen": -0.3845675587654114, "rewards/margins": 1.0051778554916382, "rewards/rejected": -1.3897454738616943, "step": 6032 }, { "epoch": 3.5914489311163895, "grad_norm": 55.25, "learning_rate": 2.8171021377672204e-07, "logits/chosen": 0.10420281440019608, "logits/rejected": 0.13047069311141968, "logps/chosen": -40.67375946044922, "logps/rejected": -61.41259765625, "loss": 0.3815, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3753702640533447, "rewards/margins": 0.9167128205299377, "rewards/rejected": -1.2920830249786377, "step": 6048 }, { "epoch": 3.6009501187648456, "grad_norm": 68.0, "learning_rate": 2.7980997624703084e-07, "logits/chosen": 0.05103810504078865, "logits/rejected": 0.10168743878602982, "logps/chosen": -41.25544738769531, "logps/rejected": -60.72285461425781, "loss": 0.3892, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.3910224437713623, "rewards/margins": 0.9060119390487671, "rewards/rejected": -1.2970343828201294, "step": 6064 }, { "epoch": 3.6104513064133017, "grad_norm": 54.25, "learning_rate": 2.7790973871733964e-07, "logits/chosen": 0.013073207810521126, "logits/rejected": 0.12633521854877472, "logps/chosen": -41.78628921508789, "logps/rejected": -66.28352355957031, "loss": 0.353, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.40143895149230957, "rewards/margins": 1.0364677906036377, "rewards/rejected": -1.4379067420959473, "step": 6080 }, { "epoch": 3.619952494061758, "grad_norm": 50.75, "learning_rate": 2.7600950118764843e-07, "logits/chosen": 0.08037324994802475, "logits/rejected": 0.17064939439296722, "logps/chosen": -42.0279655456543, "logps/rejected": -64.91426086425781, "loss": 0.3877, "rewards/accuracies": 0.921875, "rewards/chosen": -0.43144869804382324, "rewards/margins": 0.8975583910942078, "rewards/rejected": -1.3290070295333862, "step": 6096 }, { "epoch": 3.629453681710214, "grad_norm": 56.5, "learning_rate": 2.7410926365795723e-07, "logits/chosen": 0.055456578731536865, "logits/rejected": 0.22028645873069763, "logps/chosen": -39.780517578125, "logps/rejected": -63.519813537597656, "loss": 0.3566, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3360990881919861, "rewards/margins": 0.9839707612991333, "rewards/rejected": -1.3200697898864746, "step": 6112 }, { "epoch": 3.63895486935867, "grad_norm": 39.5, "learning_rate": 2.7220902612826603e-07, "logits/chosen": 0.060147836804389954, "logits/rejected": 0.1302787959575653, "logps/chosen": -41.454872131347656, "logps/rejected": -64.60635375976562, "loss": 0.3871, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4015429615974426, "rewards/margins": 0.9040000438690186, "rewards/rejected": -1.3055431842803955, "step": 6128 }, { "epoch": 3.648456057007126, "grad_norm": 47.25, "learning_rate": 2.703087885985748e-07, "logits/chosen": -0.013463707640767097, "logits/rejected": 0.07376192510128021, "logps/chosen": -39.62220764160156, "logps/rejected": -62.20975875854492, "loss": 0.363, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35915249586105347, "rewards/margins": 0.9867640733718872, "rewards/rejected": -1.3459166288375854, "step": 6144 }, { "epoch": 3.6579572446555817, "grad_norm": 49.75, "learning_rate": 2.684085510688836e-07, "logits/chosen": 0.010295089334249496, "logits/rejected": 0.17823557555675507, "logps/chosen": -41.6696662902832, "logps/rejected": -68.72691345214844, "loss": 0.3555, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4070267975330353, "rewards/margins": 1.028420329093933, "rewards/rejected": -1.4354472160339355, "step": 6160 }, { "epoch": 3.667458432304038, "grad_norm": 37.0, "learning_rate": 2.665083135391924e-07, "logits/chosen": 0.10623430460691452, "logits/rejected": 0.16167762875556946, "logps/chosen": -42.180198669433594, "logps/rejected": -66.37389373779297, "loss": 0.3498, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4009030759334564, "rewards/margins": 1.0136104822158813, "rewards/rejected": -1.4145135879516602, "step": 6176 }, { "epoch": 3.676959619952494, "grad_norm": 51.5, "learning_rate": 2.646080760095012e-07, "logits/chosen": 0.048040006309747696, "logits/rejected": 0.14534232020378113, "logps/chosen": -43.282859802246094, "logps/rejected": -66.5873794555664, "loss": 0.3543, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4304080605506897, "rewards/margins": 1.0149825811386108, "rewards/rejected": -1.4453905820846558, "step": 6192 }, { "epoch": 3.68646080760095, "grad_norm": 47.25, "learning_rate": 2.6270783847980996e-07, "logits/chosen": 0.08240100741386414, "logits/rejected": 0.16390424966812134, "logps/chosen": -40.04043960571289, "logps/rejected": -62.7987060546875, "loss": 0.3618, "rewards/accuracies": 0.953125, "rewards/chosen": -0.33928748965263367, "rewards/margins": 0.9862152934074402, "rewards/rejected": -1.325502634048462, "step": 6208 }, { "epoch": 3.695961995249406, "grad_norm": 80.5, "learning_rate": 2.6080760095011876e-07, "logits/chosen": 0.021123535931110382, "logits/rejected": 0.1238473653793335, "logps/chosen": -40.08449935913086, "logps/rejected": -64.84993743896484, "loss": 0.3652, "rewards/accuracies": 0.890625, "rewards/chosen": -0.3657268285751343, "rewards/margins": 1.0157949924468994, "rewards/rejected": -1.3815219402313232, "step": 6224 }, { "epoch": 3.705463182897862, "grad_norm": 56.25, "learning_rate": 2.589073634204275e-07, "logits/chosen": 0.09684562683105469, "logits/rejected": 0.18495050072669983, "logps/chosen": -41.702301025390625, "logps/rejected": -63.34584426879883, "loss": 0.3913, "rewards/accuracies": 0.921875, "rewards/chosen": -0.4372140169143677, "rewards/margins": 0.9003235101699829, "rewards/rejected": -1.3375376462936401, "step": 6240 }, { "epoch": 3.7149643705463182, "grad_norm": 40.25, "learning_rate": 2.570071258907363e-07, "logits/chosen": 0.028134455904364586, "logits/rejected": 0.16499164700508118, "logps/chosen": -41.335540771484375, "logps/rejected": -63.30027770996094, "loss": 0.3824, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4398137331008911, "rewards/margins": 0.9075998067855835, "rewards/rejected": -1.347413420677185, "step": 6256 }, { "epoch": 3.7244655581947743, "grad_norm": 41.75, "learning_rate": 2.551068883610451e-07, "logits/chosen": 0.016426438465714455, "logits/rejected": 0.13184240460395813, "logps/chosen": -40.241729736328125, "logps/rejected": -61.8122673034668, "loss": 0.3795, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35463353991508484, "rewards/margins": 0.9521807432174683, "rewards/rejected": -1.3068143129348755, "step": 6272 }, { "epoch": 3.7339667458432304, "grad_norm": 50.5, "learning_rate": 2.532066508313539e-07, "logits/chosen": 0.028402097523212433, "logits/rejected": 0.19204524159431458, "logps/chosen": -41.228050231933594, "logps/rejected": -63.15939712524414, "loss": 0.3886, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.4359303116798401, "rewards/margins": 0.902795135974884, "rewards/rejected": -1.3387255668640137, "step": 6288 }, { "epoch": 3.7434679334916865, "grad_norm": 49.75, "learning_rate": 2.513064133016627e-07, "logits/chosen": 0.03932081162929535, "logits/rejected": 0.220359206199646, "logps/chosen": -39.12273025512695, "logps/rejected": -63.374549865722656, "loss": 0.384, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.32400816679000854, "rewards/margins": 0.9151921272277832, "rewards/rejected": -1.239200234413147, "step": 6304 }, { "epoch": 3.7529691211401426, "grad_norm": 35.25, "learning_rate": 2.494061757719715e-07, "logits/chosen": 0.0723482072353363, "logits/rejected": 0.16996870934963226, "logps/chosen": -42.35596466064453, "logps/rejected": -63.216426849365234, "loss": 0.3757, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.47743964195251465, "rewards/margins": 0.9289887547492981, "rewards/rejected": -1.4064284563064575, "step": 6320 }, { "epoch": 3.7624703087885987, "grad_norm": 51.25, "learning_rate": 2.475059382422803e-07, "logits/chosen": 0.08452095091342926, "logits/rejected": 0.16239051520824432, "logps/chosen": -42.04669189453125, "logps/rejected": -65.5211181640625, "loss": 0.3721, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.42908474802970886, "rewards/margins": 0.9674755930900574, "rewards/rejected": -1.3965604305267334, "step": 6336 }, { "epoch": 3.7719714964370548, "grad_norm": 39.5, "learning_rate": 2.456057007125891e-07, "logits/chosen": 0.028654370456933975, "logits/rejected": 0.0740758553147316, "logps/chosen": -41.6036262512207, "logps/rejected": -61.02503967285156, "loss": 0.409, "rewards/accuracies": 0.859375, "rewards/chosen": -0.42152711749076843, "rewards/margins": 0.8335566520690918, "rewards/rejected": -1.2550837993621826, "step": 6352 }, { "epoch": 3.781472684085511, "grad_norm": 34.75, "learning_rate": 2.437054631828979e-07, "logits/chosen": 0.05283776670694351, "logits/rejected": 0.15333330631256104, "logps/chosen": -40.32767868041992, "logps/rejected": -63.946720123291016, "loss": 0.3505, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3486955463886261, "rewards/margins": 1.0453296899795532, "rewards/rejected": -1.394025444984436, "step": 6368 }, { "epoch": 3.790973871733967, "grad_norm": 44.25, "learning_rate": 2.418052256532066e-07, "logits/chosen": 0.030136309564113617, "logits/rejected": 0.09111961722373962, "logps/chosen": -43.44806671142578, "logps/rejected": -64.65709686279297, "loss": 0.3773, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.48328760266304016, "rewards/margins": 0.9284934997558594, "rewards/rejected": -1.4117810726165771, "step": 6384 }, { "epoch": 3.800475059382423, "grad_norm": 69.5, "learning_rate": 2.399049881235154e-07, "logits/chosen": -0.005254952237010002, "logits/rejected": 0.14121603965759277, "logps/chosen": -39.49591827392578, "logps/rejected": -61.99911880493164, "loss": 0.3777, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3417325019836426, "rewards/margins": 0.9448993802070618, "rewards/rejected": -1.2866318225860596, "step": 6400 }, { "epoch": 3.809976247030879, "grad_norm": 59.75, "learning_rate": 2.380047505938242e-07, "logits/chosen": 0.11867545545101166, "logits/rejected": 0.2170010209083557, "logps/chosen": -40.522804260253906, "logps/rejected": -64.37654876708984, "loss": 0.3459, "rewards/accuracies": 0.9609375, "rewards/chosen": -0.36430996656417847, "rewards/margins": 1.050180435180664, "rewards/rejected": -1.4144904613494873, "step": 6416 }, { "epoch": 3.8194774346793348, "grad_norm": 46.75, "learning_rate": 2.36104513064133e-07, "logits/chosen": 0.09830942749977112, "logits/rejected": 0.1840059906244278, "logps/chosen": -40.83268356323242, "logps/rejected": -63.659610748291016, "loss": 0.3769, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4339376389980316, "rewards/margins": 0.9776801466941833, "rewards/rejected": -1.4116177558898926, "step": 6432 }, { "epoch": 3.828978622327791, "grad_norm": 54.0, "learning_rate": 2.342042755344418e-07, "logits/chosen": 0.03068104013800621, "logits/rejected": 0.1621265709400177, "logps/chosen": -43.22722625732422, "logps/rejected": -66.03999328613281, "loss": 0.4005, "rewards/accuracies": 0.875, "rewards/chosen": -0.4799574315547943, "rewards/margins": 0.8937349915504456, "rewards/rejected": -1.373692512512207, "step": 6448 }, { "epoch": 3.838479809976247, "grad_norm": 50.0, "learning_rate": 2.323040380047506e-07, "logits/chosen": 0.058121610432863235, "logits/rejected": 0.12576644122600555, "logps/chosen": -41.369140625, "logps/rejected": -59.46518325805664, "loss": 0.3966, "rewards/accuracies": 0.890625, "rewards/chosen": -0.40906572341918945, "rewards/margins": 0.8850200176239014, "rewards/rejected": -1.2940857410430908, "step": 6464 }, { "epoch": 3.847980997624703, "grad_norm": 52.75, "learning_rate": 2.3040380047505937e-07, "logits/chosen": 0.06827502697706223, "logits/rejected": 0.14090177416801453, "logps/chosen": -42.05507278442383, "logps/rejected": -63.853248596191406, "loss": 0.3718, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.40036866068840027, "rewards/margins": 0.9455254077911377, "rewards/rejected": -1.3458940982818604, "step": 6480 }, { "epoch": 3.857482185273159, "grad_norm": 43.75, "learning_rate": 2.2850356294536814e-07, "logits/chosen": 0.029515203088521957, "logits/rejected": 0.14883099496364594, "logps/chosen": -41.896156311035156, "logps/rejected": -64.60520935058594, "loss": 0.3798, "rewards/accuracies": 0.9375, "rewards/chosen": -0.389555424451828, "rewards/margins": 0.9199389815330505, "rewards/rejected": -1.3094943761825562, "step": 6496 }, { "epoch": 3.866983372921615, "grad_norm": 39.75, "learning_rate": 2.2660332541567694e-07, "logits/chosen": 0.005287522915750742, "logits/rejected": 0.15816916525363922, "logps/chosen": -41.17705535888672, "logps/rejected": -63.35408401489258, "loss": 0.4071, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.4392343759536743, "rewards/margins": 0.8662595748901367, "rewards/rejected": -1.305493950843811, "step": 6512 }, { "epoch": 3.8764845605700713, "grad_norm": 46.0, "learning_rate": 2.2470308788598574e-07, "logits/chosen": 0.07539123296737671, "logits/rejected": 0.12418274581432343, "logps/chosen": -41.912052154541016, "logps/rejected": -64.26847076416016, "loss": 0.3873, "rewards/accuracies": 0.890625, "rewards/chosen": -0.4488585293292999, "rewards/margins": 0.9296143054962158, "rewards/rejected": -1.3784728050231934, "step": 6528 }, { "epoch": 3.8859857482185274, "grad_norm": 47.5, "learning_rate": 2.2280285035629453e-07, "logits/chosen": 0.04668428376317024, "logits/rejected": 0.13457974791526794, "logps/chosen": -40.552486419677734, "logps/rejected": -63.59270477294922, "loss": 0.371, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.3902091681957245, "rewards/margins": 0.9766399264335632, "rewards/rejected": -1.3668489456176758, "step": 6544 }, { "epoch": 3.8954869358669835, "grad_norm": 51.75, "learning_rate": 2.209026128266033e-07, "logits/chosen": 0.019167862832546234, "logits/rejected": 0.10863950103521347, "logps/chosen": -40.27218246459961, "logps/rejected": -63.5718994140625, "loss": 0.355, "rewards/accuracies": 0.921875, "rewards/chosen": -0.400563508272171, "rewards/margins": 1.0269160270690918, "rewards/rejected": -1.4274795055389404, "step": 6560 }, { "epoch": 3.9049881235154396, "grad_norm": 40.5, "learning_rate": 2.190023752969121e-07, "logits/chosen": 0.032968372106552124, "logits/rejected": 0.1309332251548767, "logps/chosen": -39.79338073730469, "logps/rejected": -63.8279914855957, "loss": 0.3562, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.40201592445373535, "rewards/margins": 1.0143156051635742, "rewards/rejected": -1.4163316488265991, "step": 6576 }, { "epoch": 3.9144893111638956, "grad_norm": 42.75, "learning_rate": 2.171021377672209e-07, "logits/chosen": 0.06808724999427795, "logits/rejected": 0.1723027527332306, "logps/chosen": -39.6837272644043, "logps/rejected": -63.00306701660156, "loss": 0.3484, "rewards/accuracies": 0.9453125, "rewards/chosen": -0.3555365800857544, "rewards/margins": 1.008614420890808, "rewards/rejected": -1.3641510009765625, "step": 6592 }, { "epoch": 3.9239904988123513, "grad_norm": 49.0, "learning_rate": 2.1520190023752967e-07, "logits/chosen": 0.08254537731409073, "logits/rejected": 0.11683779954910278, "logps/chosen": -40.13639450073242, "logps/rejected": -59.88142776489258, "loss": 0.3912, "rewards/accuracies": 0.921875, "rewards/chosen": -0.36244481801986694, "rewards/margins": 0.8833273649215698, "rewards/rejected": -1.245772123336792, "step": 6608 }, { "epoch": 3.9334916864608074, "grad_norm": 55.25, "learning_rate": 2.1330166270783847e-07, "logits/chosen": 0.017773086205124855, "logits/rejected": 0.12275815010070801, "logps/chosen": -41.630889892578125, "logps/rejected": -64.5182876586914, "loss": 0.3651, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.39863353967666626, "rewards/margins": 0.9960266351699829, "rewards/rejected": -1.3946601152420044, "step": 6624 }, { "epoch": 3.9429928741092635, "grad_norm": 44.25, "learning_rate": 2.1140142517814726e-07, "logits/chosen": 0.027539458125829697, "logits/rejected": 0.1340925693511963, "logps/chosen": -40.22187042236328, "logps/rejected": -62.83479690551758, "loss": 0.3761, "rewards/accuracies": 0.90625, "rewards/chosen": -0.42398306727409363, "rewards/margins": 0.9288855195045471, "rewards/rejected": -1.352868676185608, "step": 6640 }, { "epoch": 3.9524940617577196, "grad_norm": 61.75, "learning_rate": 2.0950118764845603e-07, "logits/chosen": 0.031753845512866974, "logits/rejected": 0.13772860169410706, "logps/chosen": -41.15782928466797, "logps/rejected": -64.47724914550781, "loss": 0.3647, "rewards/accuracies": 0.890625, "rewards/chosen": -0.38045597076416016, "rewards/margins": 1.0232834815979004, "rewards/rejected": -1.4037394523620605, "step": 6656 }, { "epoch": 3.9619952494061756, "grad_norm": 57.25, "learning_rate": 2.0760095011876483e-07, "logits/chosen": 0.0039275167509913445, "logits/rejected": 0.13760821521282196, "logps/chosen": -39.61132049560547, "logps/rejected": -63.1502685546875, "loss": 0.3637, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.3538341522216797, "rewards/margins": 0.979717493057251, "rewards/rejected": -1.3335516452789307, "step": 6672 }, { "epoch": 3.9714964370546317, "grad_norm": 58.5, "learning_rate": 2.0570071258907363e-07, "logits/chosen": -0.005867550149559975, "logits/rejected": 0.10856153070926666, "logps/chosen": -40.953468322753906, "logps/rejected": -64.8403091430664, "loss": 0.3771, "rewards/accuracies": 0.921875, "rewards/chosen": -0.37854593992233276, "rewards/margins": 0.982690155506134, "rewards/rejected": -1.3612359762191772, "step": 6688 }, { "epoch": 3.980997624703088, "grad_norm": 46.0, "learning_rate": 2.0380047505938242e-07, "logits/chosen": 0.07198520749807358, "logits/rejected": 0.0809374749660492, "logps/chosen": -40.85484313964844, "logps/rejected": -63.39386749267578, "loss": 0.379, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4362155795097351, "rewards/margins": 0.933577835559845, "rewards/rejected": -1.3697935342788696, "step": 6704 }, { "epoch": 3.990498812351544, "grad_norm": 57.5, "learning_rate": 2.0190023752969122e-07, "logits/chosen": 0.05437842011451721, "logits/rejected": 0.1582392305135727, "logps/chosen": -41.71647262573242, "logps/rejected": -66.48292541503906, "loss": 0.3764, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.4557511806488037, "rewards/margins": 0.9482683539390564, "rewards/rejected": -1.4040195941925049, "step": 6720 }, { "epoch": 4.0, "grad_norm": 65.5, "learning_rate": 2e-07, "logits/chosen": 0.005993685685098171, "logits/rejected": 0.11152346432209015, "logps/chosen": -41.31070327758789, "logps/rejected": -63.83424758911133, "loss": 0.3716, "rewards/accuracies": 0.9296875, "rewards/chosen": -0.42573827505111694, "rewards/margins": 0.946398138999939, "rewards/rejected": -1.3721364736557007, "step": 6736 } ], "logging_steps": 16, "max_steps": 8420, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }