diff --git "a/checkpoint-415/trainer_state.json" "b/checkpoint-415/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-415/trainer_state.json" @@ -0,0 +1,6259 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 415, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0024110910186859553, + "grad_norm": 0.13540981709957123, + "learning_rate": 0.0, + "logits/chosen": -0.0662384033203125, + "logits/rejected": 0.145843505859375, + "logps/chosen": -1.815253496170044, + "logps/rejected": -2.616685628890991, + "loss": -0.1005, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20678018033504486, + "rewards/margins": 0.10050030052661896, + "rewards/rejected": 0.1062798798084259, + "step": 1 + }, + { + "epoch": 0.004822182037371911, + "grad_norm": 0.23861679434776306, + "learning_rate": 1.1904761904761906e-07, + "logits/chosen": -0.1087646484375, + "logits/rejected": -0.007598876953125, + "logps/chosen": -1.9724370241165161, + "logps/rejected": -2.4898905754089355, + "loss": -0.0593, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18243205547332764, + "rewards/margins": 0.05933766812086105, + "rewards/rejected": 0.12309440970420837, + "step": 2 + }, + { + "epoch": 0.007233273056057866, + "grad_norm": 0.3336013853549957, + "learning_rate": 2.3809523809523811e-07, + "logits/chosen": -0.2941436767578125, + "logits/rejected": 0.0058441162109375, + "logps/chosen": -1.8367376327514648, + "logps/rejected": -2.0668880939483643, + "loss": 0.0019, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2006993591785431, + "rewards/margins": -0.0019104601815342903, + "rewards/rejected": 0.2026098221540451, + "step": 3 + }, + { + "epoch": 0.009644364074743821, + "grad_norm": 0.2428356111049652, + "learning_rate": 3.5714285714285716e-07, + "logits/chosen": -0.24017333984375, + "logits/rejected": -0.49212646484375, + "logps/chosen": -2.2315659523010254, + "logps/rejected": -2.835834503173828, + "loss": -0.0708, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.14392711222171783, + "rewards/margins": 0.07077126950025558, + "rewards/rejected": 0.07315585017204285, + "step": 4 + }, + { + "epoch": 0.012055455093429777, + "grad_norm": 0.29473310708999634, + "learning_rate": 4.7619047619047623e-07, + "logits/chosen": -0.380950927734375, + "logits/rejected": -0.092864990234375, + "logps/chosen": -2.0910398960113525, + "logps/rejected": -2.601386070251465, + "loss": -0.0565, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17952492833137512, + "rewards/margins": 0.05648491531610489, + "rewards/rejected": 0.12304002046585083, + "step": 5 + }, + { + "epoch": 0.014466546112115732, + "grad_norm": 0.138897106051445, + "learning_rate": 5.952380952380953e-07, + "logits/chosen": -0.25400543212890625, + "logits/rejected": -0.037200927734375, + "logps/chosen": -1.6507453918457031, + "logps/rejected": -2.2166829109191895, + "loss": -0.076, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2409430742263794, + "rewards/margins": 0.07597146928310394, + "rewards/rejected": 0.16497160494327545, + "step": 6 + }, + { + "epoch": 0.016877637130801686, + "grad_norm": 0.29587748646736145, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": -0.3331298828125, + "logits/rejected": -0.20941162109375, + "logps/chosen": -1.9136180877685547, + "logps/rejected": -2.5555055141448975, + "loss": -0.0722, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18882396817207336, + "rewards/margins": 0.07221025228500366, + "rewards/rejected": 0.1166137233376503, + "step": 7 + }, + { + "epoch": 0.019288728149487643, + "grad_norm": 0.31592094898223877, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": 0.0233612060546875, + "logits/rejected": -0.042755126953125, + "logps/chosen": -2.3161673545837402, + "logps/rejected": -2.351764440536499, + "loss": -0.0255, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1771397888660431, + "rewards/margins": 0.025483673438429832, + "rewards/rejected": 0.1516561359167099, + "step": 8 + }, + { + "epoch": 0.0216998191681736, + "grad_norm": 0.23820726573467255, + "learning_rate": 9.523809523809525e-07, + "logits/chosen": -0.013631820678710938, + "logits/rejected": 0.48004150390625, + "logps/chosen": -1.8839356899261475, + "logps/rejected": -2.195974349975586, + "loss": -0.0324, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1884903907775879, + "rewards/margins": 0.03240886330604553, + "rewards/rejected": 0.15608154237270355, + "step": 9 + }, + { + "epoch": 0.024110910186859555, + "grad_norm": 0.17708325386047363, + "learning_rate": 1.0714285714285714e-06, + "logits/chosen": -0.29457855224609375, + "logits/rejected": -0.29869842529296875, + "logps/chosen": -2.0385050773620605, + "logps/rejected": -2.320281505584717, + "loss": -0.035, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19530463218688965, + "rewards/margins": 0.035025566816329956, + "rewards/rejected": 0.1602790802717209, + "step": 10 + }, + { + "epoch": 0.02652200120554551, + "grad_norm": 0.11751855164766312, + "learning_rate": 1.1904761904761906e-06, + "logits/chosen": -0.378936767578125, + "logits/rejected": -0.184967041015625, + "logps/chosen": -1.6468758583068848, + "logps/rejected": -2.522773504257202, + "loss": -0.0912, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2361975610256195, + "rewards/margins": 0.09120555222034454, + "rewards/rejected": 0.14499200880527496, + "step": 11 + }, + { + "epoch": 0.028933092224231464, + "grad_norm": 0.284108430147171, + "learning_rate": 1.3095238095238096e-06, + "logits/chosen": -0.11008930206298828, + "logits/rejected": -0.20126724243164062, + "logps/chosen": -2.2452898025512695, + "logps/rejected": -2.788154125213623, + "loss": -0.0746, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1762125939130783, + "rewards/margins": 0.07461583614349365, + "rewards/rejected": 0.10159675031900406, + "step": 12 + }, + { + "epoch": 0.03134418324291742, + "grad_norm": 0.18186624348163605, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -0.263336181640625, + "logits/rejected": -0.1172943115234375, + "logps/chosen": -2.009011745452881, + "logps/rejected": -2.4328689575195312, + "loss": -0.0533, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1683908998966217, + "rewards/margins": 0.05330117046833038, + "rewards/rejected": 0.11508972942829132, + "step": 13 + }, + { + "epoch": 0.03375527426160337, + "grad_norm": 0.15661412477493286, + "learning_rate": 1.5476190476190479e-06, + "logits/chosen": -0.40081787109375, + "logits/rejected": -0.5296630859375, + "logps/chosen": -2.0725817680358887, + "logps/rejected": -2.5020666122436523, + "loss": -0.0266, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16588371992111206, + "rewards/margins": 0.026602834463119507, + "rewards/rejected": 0.13928088545799255, + "step": 14 + }, + { + "epoch": 0.03616636528028933, + "grad_norm": 0.2272009253501892, + "learning_rate": 1.6666666666666667e-06, + "logits/chosen": -0.352447509765625, + "logits/rejected": -0.480438232421875, + "logps/chosen": -1.8715345859527588, + "logps/rejected": -2.3423707485198975, + "loss": -0.0386, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21626010537147522, + "rewards/margins": 0.038555338978767395, + "rewards/rejected": 0.17770478129386902, + "step": 15 + }, + { + "epoch": 0.038577456298975285, + "grad_norm": 0.10181388258934021, + "learning_rate": 1.7857142857142859e-06, + "logits/chosen": -0.586456298828125, + "logits/rejected": -0.514312744140625, + "logps/chosen": -2.07165789604187, + "logps/rejected": -2.6690852642059326, + "loss": -0.055, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16787435114383698, + "rewards/margins": 0.0550290048122406, + "rewards/rejected": 0.11284533143043518, + "step": 16 + }, + { + "epoch": 0.040988547317661245, + "grad_norm": 0.17082856595516205, + "learning_rate": 1.904761904761905e-06, + "logits/chosen": -0.29706573486328125, + "logits/rejected": -0.07115745544433594, + "logps/chosen": -1.9141883850097656, + "logps/rejected": -2.4803802967071533, + "loss": -0.0552, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17581021785736084, + "rewards/margins": 0.05516396462917328, + "rewards/rejected": 0.12064626812934875, + "step": 17 + }, + { + "epoch": 0.0433996383363472, + "grad_norm": 0.10969637334346771, + "learning_rate": 2.023809523809524e-06, + "logits/chosen": -0.38360595703125, + "logits/rejected": -0.23956298828125, + "logps/chosen": -1.9388043880462646, + "logps/rejected": -2.6233248710632324, + "loss": -0.0387, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16570636630058289, + "rewards/margins": 0.03866109251976013, + "rewards/rejected": 0.12704528868198395, + "step": 18 + }, + { + "epoch": 0.04581072935503315, + "grad_norm": 0.22710655629634857, + "learning_rate": 2.1428571428571427e-06, + "logits/chosen": -0.72705078125, + "logits/rejected": -0.4123687744140625, + "logps/chosen": -1.919374704360962, + "logps/rejected": -2.848342180252075, + "loss": -0.0726, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.15873771905899048, + "rewards/margins": 0.07257579267024994, + "rewards/rejected": 0.08616192638874054, + "step": 19 + }, + { + "epoch": 0.04822182037371911, + "grad_norm": 0.23456093668937683, + "learning_rate": 2.261904761904762e-06, + "logits/chosen": 0.058868408203125, + "logits/rejected": -0.426483154296875, + "logps/chosen": -2.212374210357666, + "logps/rejected": -2.4866414070129395, + "loss": -0.0117, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15827080607414246, + "rewards/margins": 0.011696278117597103, + "rewards/rejected": 0.14657454192638397, + "step": 20 + }, + { + "epoch": 0.05063291139240506, + "grad_norm": 0.1652158945798874, + "learning_rate": 2.380952380952381e-06, + "logits/chosen": -0.48993492126464844, + "logits/rejected": -0.0250244140625, + "logps/chosen": -2.017307996749878, + "logps/rejected": -2.5289230346679688, + "loss": -0.0759, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1964530050754547, + "rewards/margins": 0.07587158679962158, + "rewards/rejected": 0.12058141827583313, + "step": 21 + }, + { + "epoch": 0.05304400241109102, + "grad_norm": 0.21531249582767487, + "learning_rate": 2.5e-06, + "logits/chosen": -0.0325775146484375, + "logits/rejected": -0.158721923828125, + "logps/chosen": -2.0455565452575684, + "logps/rejected": -2.7017455101013184, + "loss": -0.0617, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16096433997154236, + "rewards/margins": 0.06167904660105705, + "rewards/rejected": 0.0992853045463562, + "step": 22 + }, + { + "epoch": 0.055455093429776975, + "grad_norm": 0.1420794427394867, + "learning_rate": 2.6190476190476192e-06, + "logits/chosen": -0.12445068359375, + "logits/rejected": -0.3396759033203125, + "logps/chosen": -1.604001760482788, + "logps/rejected": -2.3732666969299316, + "loss": -0.1029, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25396451354026794, + "rewards/margins": 0.10291750729084015, + "rewards/rejected": 0.1510470062494278, + "step": 23 + }, + { + "epoch": 0.05786618444846293, + "grad_norm": 0.1173233762383461, + "learning_rate": 2.7380952380952387e-06, + "logits/chosen": -0.10762214660644531, + "logits/rejected": -0.34763336181640625, + "logps/chosen": -1.9975299835205078, + "logps/rejected": -2.353853940963745, + "loss": -0.0196, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.18310482800006866, + "rewards/margins": 0.01959044113755226, + "rewards/rejected": 0.1635143905878067, + "step": 24 + }, + { + "epoch": 0.06027727546714889, + "grad_norm": 0.30177751183509827, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": -0.092987060546875, + "logits/rejected": -0.303619384765625, + "logps/chosen": -1.813647747039795, + "logps/rejected": -2.292224884033203, + "loss": -0.0482, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23531828820705414, + "rewards/margins": 0.048219405114650726, + "rewards/rejected": 0.187098890542984, + "step": 25 + }, + { + "epoch": 0.06268836648583484, + "grad_norm": 0.2024577260017395, + "learning_rate": 2.9761904761904763e-06, + "logits/chosen": -0.3021240234375, + "logits/rejected": -0.23987579345703125, + "logps/chosen": -2.396427631378174, + "logps/rejected": -2.6892919540405273, + "loss": -0.0411, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.119843989610672, + "rewards/margins": 0.04106573015451431, + "rewards/rejected": 0.07877825200557709, + "step": 26 + }, + { + "epoch": 0.0650994575045208, + "grad_norm": 0.08189426362514496, + "learning_rate": 3.0952380952380957e-06, + "logits/chosen": -0.4184112548828125, + "logits/rejected": -0.45281982421875, + "logps/chosen": -2.265611171722412, + "logps/rejected": -2.5915441513061523, + "loss": -0.0434, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14376436173915863, + "rewards/margins": 0.043388731777668, + "rewards/rejected": 0.10037562996149063, + "step": 27 + }, + { + "epoch": 0.06751054852320675, + "grad_norm": 0.15428940951824188, + "learning_rate": 3.2142857142857147e-06, + "logits/chosen": -0.04058837890625, + "logits/rejected": -0.27496337890625, + "logps/chosen": -2.4062347412109375, + "logps/rejected": -2.330113410949707, + "loss": 0.0221, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.13391344249248505, + "rewards/margins": -0.02209368534386158, + "rewards/rejected": 0.15600712597370148, + "step": 28 + }, + { + "epoch": 0.0699216395418927, + "grad_norm": 0.09272627532482147, + "learning_rate": 3.3333333333333333e-06, + "logits/chosen": -0.19900131225585938, + "logits/rejected": -0.03436279296875, + "logps/chosen": -2.168255090713501, + "logps/rejected": -2.554178237915039, + "loss": -0.0443, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18573640286922455, + "rewards/margins": 0.044263195246458054, + "rewards/rejected": 0.1414732187986374, + "step": 29 + }, + { + "epoch": 0.07233273056057866, + "grad_norm": 0.2111942172050476, + "learning_rate": 3.4523809523809528e-06, + "logits/chosen": 0.30554962158203125, + "logits/rejected": -0.11358642578125, + "logps/chosen": -2.004312515258789, + "logps/rejected": -2.436143636703491, + "loss": -0.0419, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1876247525215149, + "rewards/margins": 0.04193827509880066, + "rewards/rejected": 0.14568647742271423, + "step": 30 + }, + { + "epoch": 0.07474382157926461, + "grad_norm": 0.16993696987628937, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": -0.46051025390625, + "logits/rejected": -0.5606842041015625, + "logps/chosen": -2.133668899536133, + "logps/rejected": -2.9231319427490234, + "loss": -0.0732, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13931167125701904, + "rewards/margins": 0.07323409616947174, + "rewards/rejected": 0.0660775825381279, + "step": 31 + }, + { + "epoch": 0.07715491259795057, + "grad_norm": 0.19994620978832245, + "learning_rate": 3.690476190476191e-06, + "logits/chosen": -0.259307861328125, + "logits/rejected": -0.45316505432128906, + "logps/chosen": -1.7578485012054443, + "logps/rejected": -2.067509889602661, + "loss": -0.0414, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23030422627925873, + "rewards/margins": 0.041372161358594894, + "rewards/rejected": 0.18893206119537354, + "step": 32 + }, + { + "epoch": 0.07956600361663653, + "grad_norm": 0.22370131313800812, + "learning_rate": 3.80952380952381e-06, + "logits/chosen": -0.51361083984375, + "logits/rejected": -0.2445068359375, + "logps/chosen": -1.789047122001648, + "logps/rejected": -2.7003893852233887, + "loss": -0.0928, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19410739839076996, + "rewards/margins": 0.09284965693950653, + "rewards/rejected": 0.10125774145126343, + "step": 33 + }, + { + "epoch": 0.08197709463532249, + "grad_norm": 0.1592206358909607, + "learning_rate": 3.928571428571429e-06, + "logits/chosen": -0.3609161376953125, + "logits/rejected": -0.21234130859375, + "logps/chosen": -1.8671579360961914, + "logps/rejected": -2.793840169906616, + "loss": -0.0922, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.17160022258758545, + "rewards/margins": 0.09217934310436249, + "rewards/rejected": 0.07942087948322296, + "step": 34 + }, + { + "epoch": 0.08438818565400844, + "grad_norm": 0.4399804174900055, + "learning_rate": 4.047619047619048e-06, + "logits/chosen": -0.196014404296875, + "logits/rejected": -0.144134521484375, + "logps/chosen": -1.9747718572616577, + "logps/rejected": -2.4401257038116455, + "loss": -0.0573, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18049296736717224, + "rewards/margins": 0.057283900678157806, + "rewards/rejected": 0.12320907413959503, + "step": 35 + }, + { + "epoch": 0.0867992766726944, + "grad_norm": 0.2127927541732788, + "learning_rate": 4.166666666666667e-06, + "logits/chosen": -0.4849700927734375, + "logits/rejected": -0.421234130859375, + "logps/chosen": -1.9074232578277588, + "logps/rejected": -2.594005823135376, + "loss": -0.0657, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1906270831823349, + "rewards/margins": 0.06572838127613068, + "rewards/rejected": 0.12489870190620422, + "step": 36 + }, + { + "epoch": 0.08921036769138035, + "grad_norm": 0.27844470739364624, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": -0.184722900390625, + "logits/rejected": -0.0819091796875, + "logps/chosen": -1.9601411819458008, + "logps/rejected": -2.3807597160339355, + "loss": -0.0551, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20333774387836456, + "rewards/margins": 0.05508831888437271, + "rewards/rejected": 0.14824943244457245, + "step": 37 + }, + { + "epoch": 0.0916214587100663, + "grad_norm": 0.2028970569372177, + "learning_rate": 4.404761904761905e-06, + "logits/chosen": -0.230224609375, + "logits/rejected": -0.2232666015625, + "logps/chosen": -2.1484603881835938, + "logps/rejected": -2.5777299404144287, + "loss": -0.0697, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17912191152572632, + "rewards/margins": 0.06971441209316254, + "rewards/rejected": 0.10940749943256378, + "step": 38 + }, + { + "epoch": 0.09403254972875226, + "grad_norm": 0.302728533744812, + "learning_rate": 4.523809523809524e-06, + "logits/chosen": -0.109100341796875, + "logits/rejected": -0.40990447998046875, + "logps/chosen": -1.839449405670166, + "logps/rejected": -2.3926053047180176, + "loss": -0.0287, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18382522463798523, + "rewards/margins": 0.028690434992313385, + "rewards/rejected": 0.15513478219509125, + "step": 39 + }, + { + "epoch": 0.09644364074743822, + "grad_norm": 0.2610414922237396, + "learning_rate": 4.642857142857144e-06, + "logits/chosen": -0.2518310546875, + "logits/rejected": -0.1559295654296875, + "logps/chosen": -1.984920859336853, + "logps/rejected": -2.2792441844940186, + "loss": -0.0397, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.19426622986793518, + "rewards/margins": 0.03965914994478226, + "rewards/rejected": 0.15460708737373352, + "step": 40 + }, + { + "epoch": 0.09885473176612417, + "grad_norm": 0.15944337844848633, + "learning_rate": 4.761904761904762e-06, + "logits/chosen": -0.17547607421875, + "logits/rejected": -0.48273468017578125, + "logps/chosen": -2.3791887760162354, + "logps/rejected": -2.613189697265625, + "loss": -0.0304, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1360052227973938, + "rewards/margins": 0.030447155237197876, + "rewards/rejected": 0.10555806756019592, + "step": 41 + }, + { + "epoch": 0.10126582278481013, + "grad_norm": 0.10842674225568771, + "learning_rate": 4.880952380952381e-06, + "logits/chosen": -0.442657470703125, + "logits/rejected": -0.25125885009765625, + "logps/chosen": -1.9484907388687134, + "logps/rejected": -2.234405517578125, + "loss": -0.0251, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19080810248851776, + "rewards/margins": 0.025104153901338577, + "rewards/rejected": 0.16570396721363068, + "step": 42 + }, + { + "epoch": 0.10367691380349608, + "grad_norm": 0.20101885497570038, + "learning_rate": 5e-06, + "logits/chosen": -0.455078125, + "logits/rejected": -0.4127197265625, + "logps/chosen": -1.673321008682251, + "logps/rejected": -2.464716911315918, + "loss": -0.0917, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23471905291080475, + "rewards/margins": 0.0917198434472084, + "rewards/rejected": 0.14299920201301575, + "step": 43 + }, + { + "epoch": 0.10608800482218204, + "grad_norm": 0.2515440285205841, + "learning_rate": 4.986595174262735e-06, + "logits/chosen": -0.4605236053466797, + "logits/rejected": -0.3711700439453125, + "logps/chosen": -1.846736192703247, + "logps/rejected": -2.428391218185425, + "loss": -0.0862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22247973084449768, + "rewards/margins": 0.08620914071798325, + "rewards/rejected": 0.13627058267593384, + "step": 44 + }, + { + "epoch": 0.10849909584086799, + "grad_norm": 0.33906176686286926, + "learning_rate": 4.97319034852547e-06, + "logits/chosen": -0.020111083984375, + "logits/rejected": -0.1829071044921875, + "logps/chosen": -2.013679027557373, + "logps/rejected": -2.2912840843200684, + "loss": -0.0559, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.20378942787647247, + "rewards/margins": 0.055882230401039124, + "rewards/rejected": 0.14790719747543335, + "step": 45 + }, + { + "epoch": 0.11091018685955395, + "grad_norm": 0.17432667315006256, + "learning_rate": 4.959785522788204e-06, + "logits/chosen": -0.3087158203125, + "logits/rejected": -0.006011962890625, + "logps/chosen": -2.0413284301757812, + "logps/rejected": -2.6799116134643555, + "loss": -0.0552, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17880913615226746, + "rewards/margins": 0.0551818385720253, + "rewards/rejected": 0.12362729012966156, + "step": 46 + }, + { + "epoch": 0.11332127787823991, + "grad_norm": 0.2301454395055771, + "learning_rate": 4.946380697050938e-06, + "logits/chosen": -0.3030548095703125, + "logits/rejected": -0.25323486328125, + "logps/chosen": -1.760693073272705, + "logps/rejected": -2.335937976837158, + "loss": -0.0687, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21892692148685455, + "rewards/margins": 0.06873352080583572, + "rewards/rejected": 0.15019339323043823, + "step": 47 + }, + { + "epoch": 0.11573236889692586, + "grad_norm": 0.16875001788139343, + "learning_rate": 4.9329758713136735e-06, + "logits/chosen": -0.521636962890625, + "logits/rejected": -0.40252685546875, + "logps/chosen": -1.7948408126831055, + "logps/rejected": -2.5222525596618652, + "loss": -0.0787, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19600583612918854, + "rewards/margins": 0.07869600504636765, + "rewards/rejected": 0.11730983853340149, + "step": 48 + }, + { + "epoch": 0.11814345991561181, + "grad_norm": 0.2114410102367401, + "learning_rate": 4.919571045576408e-06, + "logits/chosen": -0.545257568359375, + "logits/rejected": -0.3851318359375, + "logps/chosen": -1.9391257762908936, + "logps/rejected": -2.7119221687316895, + "loss": -0.0832, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1780286729335785, + "rewards/margins": 0.08319505304098129, + "rewards/rejected": 0.09483359754085541, + "step": 49 + }, + { + "epoch": 0.12055455093429777, + "grad_norm": 0.15858511626720428, + "learning_rate": 4.906166219839142e-06, + "logits/chosen": -0.374267578125, + "logits/rejected": -0.422882080078125, + "logps/chosen": -2.1403720378875732, + "logps/rejected": -2.6450939178466797, + "loss": -0.0686, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1798945665359497, + "rewards/margins": 0.06862486898899078, + "rewards/rejected": 0.11126969754695892, + "step": 50 + }, + { + "epoch": 0.12296564195298372, + "grad_norm": 0.22379671037197113, + "learning_rate": 4.892761394101877e-06, + "logits/chosen": -0.444732666015625, + "logits/rejected": -0.4769287109375, + "logps/chosen": -1.6993770599365234, + "logps/rejected": -3.4388632774353027, + "loss": -0.1163, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.21231791377067566, + "rewards/margins": 0.11625936627388, + "rewards/rejected": 0.09605855494737625, + "step": 51 + }, + { + "epoch": 0.12537673297166968, + "grad_norm": 0.17817257344722748, + "learning_rate": 4.8793565683646115e-06, + "logits/chosen": -0.21722412109375, + "logits/rejected": -0.457275390625, + "logps/chosen": -1.8493852615356445, + "logps/rejected": -2.4138283729553223, + "loss": -0.0433, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18631546199321747, + "rewards/margins": 0.04325813055038452, + "rewards/rejected": 0.14305734634399414, + "step": 52 + }, + { + "epoch": 0.12778782399035563, + "grad_norm": 0.19372503459453583, + "learning_rate": 4.865951742627346e-06, + "logits/chosen": -0.451568603515625, + "logits/rejected": 0.0186767578125, + "logps/chosen": -1.7283393144607544, + "logps/rejected": -2.3081676959991455, + "loss": -0.092, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2489156574010849, + "rewards/margins": 0.09201367199420929, + "rewards/rejected": 0.15690195560455322, + "step": 53 + }, + { + "epoch": 0.1301989150090416, + "grad_norm": 0.2276526838541031, + "learning_rate": 4.852546916890081e-06, + "logits/chosen": -0.2716064453125, + "logits/rejected": -0.15972900390625, + "logps/chosen": -1.8576874732971191, + "logps/rejected": -2.2848258018493652, + "loss": -0.0768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22309163212776184, + "rewards/margins": 0.07676225900650024, + "rewards/rejected": 0.1463293582201004, + "step": 54 + }, + { + "epoch": 0.13261000602772754, + "grad_norm": 0.13054406642913818, + "learning_rate": 4.839142091152815e-06, + "logits/chosen": -0.52606201171875, + "logits/rejected": -0.4135894775390625, + "logps/chosen": -1.5982319116592407, + "logps/rejected": -2.2557473182678223, + "loss": -0.0826, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2298174500465393, + "rewards/margins": 0.08261054754257202, + "rewards/rejected": 0.14720690250396729, + "step": 55 + }, + { + "epoch": 0.1350210970464135, + "grad_norm": 0.10972320288419724, + "learning_rate": 4.8257372654155495e-06, + "logits/chosen": -0.3395843505859375, + "logits/rejected": -0.5084991455078125, + "logps/chosen": -2.2133383750915527, + "logps/rejected": -3.0123062133789062, + "loss": -0.083, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13503645360469818, + "rewards/margins": 0.08296732604503632, + "rewards/rejected": 0.05206912383437157, + "step": 56 + }, + { + "epoch": 0.13743218806509946, + "grad_norm": 0.15790365636348724, + "learning_rate": 4.812332439678285e-06, + "logits/chosen": -0.11748504638671875, + "logits/rejected": -0.09547805786132812, + "logps/chosen": -1.7487789392471313, + "logps/rejected": -2.369713306427002, + "loss": -0.0841, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.220793217420578, + "rewards/margins": 0.08413002640008926, + "rewards/rejected": 0.13666319847106934, + "step": 57 + }, + { + "epoch": 0.1398432790837854, + "grad_norm": 0.4171365797519684, + "learning_rate": 4.798927613941019e-06, + "logits/chosen": 0.0216064453125, + "logits/rejected": -0.113677978515625, + "logps/chosen": -1.3385279178619385, + "logps/rejected": -1.974929928779602, + "loss": -0.0785, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3086555600166321, + "rewards/margins": 0.07846418023109436, + "rewards/rejected": 0.2301914095878601, + "step": 58 + }, + { + "epoch": 0.14225437010247136, + "grad_norm": 0.26115328073501587, + "learning_rate": 4.785522788203753e-06, + "logits/chosen": -0.2350311279296875, + "logits/rejected": 0.15704345703125, + "logps/chosen": -1.844977855682373, + "logps/rejected": -2.1639933586120605, + "loss": -0.0797, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2549409568309784, + "rewards/margins": 0.0797463059425354, + "rewards/rejected": 0.175194650888443, + "step": 59 + }, + { + "epoch": 0.14466546112115733, + "grad_norm": 0.1251203715801239, + "learning_rate": 4.772117962466488e-06, + "logits/chosen": -0.2930641174316406, + "logits/rejected": 0.09967041015625, + "logps/chosen": -2.120539665222168, + "logps/rejected": -2.4138970375061035, + "loss": -0.0541, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2148587554693222, + "rewards/margins": 0.05413561686873436, + "rewards/rejected": 0.16072311997413635, + "step": 60 + }, + { + "epoch": 0.14707655213984328, + "grad_norm": 0.1257125586271286, + "learning_rate": 4.758713136729223e-06, + "logits/chosen": -0.34749794006347656, + "logits/rejected": -0.5482635498046875, + "logps/chosen": -2.239009380340576, + "logps/rejected": -2.781611442565918, + "loss": -0.0664, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14124281704425812, + "rewards/margins": 0.06636959314346313, + "rewards/rejected": 0.07487322390079498, + "step": 61 + }, + { + "epoch": 0.14948764315852922, + "grad_norm": 0.3138192594051361, + "learning_rate": 4.745308310991958e-06, + "logits/chosen": -0.50518798828125, + "logits/rejected": -0.3048248291015625, + "logps/chosen": -1.6394758224487305, + "logps/rejected": -2.2009780406951904, + "loss": -0.0566, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22183741629123688, + "rewards/margins": 0.05656104534864426, + "rewards/rejected": 0.16527636349201202, + "step": 62 + }, + { + "epoch": 0.1518987341772152, + "grad_norm": 0.13117310404777527, + "learning_rate": 4.731903485254692e-06, + "logits/chosen": 0.00091552734375, + "logits/rejected": -0.3922119140625, + "logps/chosen": -2.129087448120117, + "logps/rejected": -2.641709089279175, + "loss": -0.0559, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1729794293642044, + "rewards/margins": 0.05590403825044632, + "rewards/rejected": 0.11707539856433868, + "step": 63 + }, + { + "epoch": 0.15430982519590114, + "grad_norm": 0.4374631345272064, + "learning_rate": 4.718498659517426e-06, + "logits/chosen": -0.44207763671875, + "logits/rejected": -0.157684326171875, + "logps/chosen": -2.1939444541931152, + "logps/rejected": -2.554738998413086, + "loss": -0.0677, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.17465618252754211, + "rewards/margins": 0.06774517148733139, + "rewards/rejected": 0.10691101849079132, + "step": 64 + }, + { + "epoch": 0.1567209162145871, + "grad_norm": 0.15238653123378754, + "learning_rate": 4.7050938337801614e-06, + "logits/chosen": -0.4530010223388672, + "logits/rejected": -0.314239501953125, + "logps/chosen": -1.8891229629516602, + "logps/rejected": -2.7334225177764893, + "loss": -0.096, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1933603286743164, + "rewards/margins": 0.09599657356739044, + "rewards/rejected": 0.09736377000808716, + "step": 65 + }, + { + "epoch": 0.15913200723327306, + "grad_norm": 0.26014086604118347, + "learning_rate": 4.691689008042896e-06, + "logits/chosen": -0.505615234375, + "logits/rejected": -0.43780517578125, + "logps/chosen": -2.039263963699341, + "logps/rejected": -2.278066635131836, + "loss": -0.0365, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.19455406069755554, + "rewards/margins": 0.03649631887674332, + "rewards/rejected": 0.15805774927139282, + "step": 66 + }, + { + "epoch": 0.161543098251959, + "grad_norm": 0.10262659192085266, + "learning_rate": 4.67828418230563e-06, + "logits/chosen": -0.4066314697265625, + "logits/rejected": -0.291351318359375, + "logps/chosen": -1.8935928344726562, + "logps/rejected": -2.512824535369873, + "loss": -0.0746, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1891571581363678, + "rewards/margins": 0.07459196448326111, + "rewards/rejected": 0.1145651787519455, + "step": 67 + }, + { + "epoch": 0.16395418927064498, + "grad_norm": 0.13350163400173187, + "learning_rate": 4.664879356568365e-06, + "logits/chosen": -0.48486328125, + "logits/rejected": -0.358642578125, + "logps/chosen": -1.6513335704803467, + "logps/rejected": -2.6039624214172363, + "loss": -0.0848, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22321400046348572, + "rewards/margins": 0.08479839563369751, + "rewards/rejected": 0.1384156197309494, + "step": 68 + }, + { + "epoch": 0.16636528028933092, + "grad_norm": 0.10497214645147324, + "learning_rate": 4.651474530831099e-06, + "logits/chosen": -0.389892578125, + "logits/rejected": -0.4625816345214844, + "logps/chosen": -2.3515095710754395, + "logps/rejected": -2.8483352661132812, + "loss": -0.0658, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14101114869117737, + "rewards/margins": 0.06579185277223587, + "rewards/rejected": 0.0752192884683609, + "step": 69 + }, + { + "epoch": 0.16877637130801687, + "grad_norm": 0.15526430308818817, + "learning_rate": 4.638069705093834e-06, + "logits/chosen": -0.49468994140625, + "logits/rejected": -0.4281005859375, + "logps/chosen": -2.015451669692993, + "logps/rejected": -2.8719983100891113, + "loss": -0.1041, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18385550379753113, + "rewards/margins": 0.10408008843660355, + "rewards/rejected": 0.07977539300918579, + "step": 70 + }, + { + "epoch": 0.17118746232670284, + "grad_norm": 0.27161529660224915, + "learning_rate": 4.624664879356569e-06, + "logits/chosen": -0.5327186584472656, + "logits/rejected": -0.1277618408203125, + "logps/chosen": -1.5442382097244263, + "logps/rejected": -2.3167052268981934, + "loss": -0.0813, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2626121938228607, + "rewards/margins": 0.0812920331954956, + "rewards/rejected": 0.18132014572620392, + "step": 71 + }, + { + "epoch": 0.1735985533453888, + "grad_norm": 0.20697109401226044, + "learning_rate": 4.611260053619303e-06, + "logits/chosen": -0.15019989013671875, + "logits/rejected": 0.14874267578125, + "logps/chosen": -1.754292368888855, + "logps/rejected": -2.4399638175964355, + "loss": -0.0952, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22703246772289276, + "rewards/margins": 0.09517242014408112, + "rewards/rejected": 0.13186006247997284, + "step": 72 + }, + { + "epoch": 0.17600964436407474, + "grad_norm": 0.3675103783607483, + "learning_rate": 4.597855227882037e-06, + "logits/chosen": 0.14675140380859375, + "logits/rejected": -0.2359771728515625, + "logps/chosen": -2.012716054916382, + "logps/rejected": -2.7197463512420654, + "loss": -0.0733, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1592480093240738, + "rewards/margins": 0.07329224050045013, + "rewards/rejected": 0.08595577627420425, + "step": 73 + }, + { + "epoch": 0.1784207353827607, + "grad_norm": 0.3158482611179352, + "learning_rate": 4.5844504021447725e-06, + "logits/chosen": -0.10469627380371094, + "logits/rejected": -0.5006103515625, + "logps/chosen": -1.9160065650939941, + "logps/rejected": -2.619612455368042, + "loss": -0.0668, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1975511908531189, + "rewards/margins": 0.06682511419057846, + "rewards/rejected": 0.13072606921195984, + "step": 74 + }, + { + "epoch": 0.18083182640144665, + "grad_norm": 0.14642076194286346, + "learning_rate": 4.571045576407508e-06, + "logits/chosen": -0.4441871643066406, + "logits/rejected": -0.4947662353515625, + "logps/chosen": -1.870173454284668, + "logps/rejected": -2.7414560317993164, + "loss": -0.1017, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.18330451846122742, + "rewards/margins": 0.10166242718696594, + "rewards/rejected": 0.08164208382368088, + "step": 75 + }, + { + "epoch": 0.1832429174201326, + "grad_norm": 0.09241756796836853, + "learning_rate": 4.557640750670242e-06, + "logits/chosen": -0.4321136474609375, + "logits/rejected": 0.004730224609375, + "logps/chosen": -2.0175940990448, + "logps/rejected": -3.004261016845703, + "loss": -0.0993, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16436198353767395, + "rewards/margins": 0.09928404539823532, + "rewards/rejected": 0.06507793813943863, + "step": 76 + }, + { + "epoch": 0.18565400843881857, + "grad_norm": 0.4576946198940277, + "learning_rate": 4.544235924932976e-06, + "logits/chosen": -0.48431396484375, + "logits/rejected": -0.4161376953125, + "logps/chosen": -1.9736279249191284, + "logps/rejected": -2.632988929748535, + "loss": -0.0868, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18318729102611542, + "rewards/margins": 0.08684973418712616, + "rewards/rejected": 0.09633754938840866, + "step": 77 + }, + { + "epoch": 0.18806509945750452, + "grad_norm": 0.2209533154964447, + "learning_rate": 4.530831099195711e-06, + "logits/chosen": -0.2362060546875, + "logits/rejected": -0.30464935302734375, + "logps/chosen": -1.8351658582687378, + "logps/rejected": -2.5220913887023926, + "loss": -0.0685, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18719515204429626, + "rewards/margins": 0.0684509128332138, + "rewards/rejected": 0.11874424666166306, + "step": 78 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.12729401886463165, + "learning_rate": 4.517426273458446e-06, + "logits/chosen": -0.425689697265625, + "logits/rejected": -0.289306640625, + "logps/chosen": -2.03001070022583, + "logps/rejected": -2.797839403152466, + "loss": -0.0785, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.191144660115242, + "rewards/margins": 0.07852694392204285, + "rewards/rejected": 0.11261773109436035, + "step": 79 + }, + { + "epoch": 0.19288728149487644, + "grad_norm": 0.2784936726093292, + "learning_rate": 4.50402144772118e-06, + "logits/chosen": -0.1843414306640625, + "logits/rejected": -0.2737884521484375, + "logps/chosen": -2.0708518028259277, + "logps/rejected": -2.5952236652374268, + "loss": -0.086, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20133110880851746, + "rewards/margins": 0.08602908253669739, + "rewards/rejected": 0.11530201137065887, + "step": 80 + }, + { + "epoch": 0.19529837251356238, + "grad_norm": 0.25862249732017517, + "learning_rate": 4.490616621983915e-06, + "logits/chosen": -0.47052001953125, + "logits/rejected": 0.07061767578125, + "logps/chosen": -1.6248124837875366, + "logps/rejected": -2.3930277824401855, + "loss": -0.097, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24042755365371704, + "rewards/margins": 0.09697020053863525, + "rewards/rejected": 0.1434573531150818, + "step": 81 + }, + { + "epoch": 0.19770946353224833, + "grad_norm": 0.17928369343280792, + "learning_rate": 4.477211796246649e-06, + "logits/chosen": -0.230255126953125, + "logits/rejected": -0.19580078125, + "logps/chosen": -2.028120756149292, + "logps/rejected": -2.502490520477295, + "loss": -0.0635, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17482265830039978, + "rewards/margins": 0.06347900629043579, + "rewards/rejected": 0.11134365200996399, + "step": 82 + }, + { + "epoch": 0.2001205545509343, + "grad_norm": 0.3341054618358612, + "learning_rate": 4.463806970509384e-06, + "logits/chosen": -0.3468017578125, + "logits/rejected": -0.13421630859375, + "logps/chosen": -1.7436274290084839, + "logps/rejected": -2.3278002738952637, + "loss": -0.0815, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2389451116323471, + "rewards/margins": 0.08148396015167236, + "rewards/rejected": 0.15746116638183594, + "step": 83 + }, + { + "epoch": 0.20253164556962025, + "grad_norm": 0.32325637340545654, + "learning_rate": 4.450402144772119e-06, + "logits/chosen": -0.1004638671875, + "logits/rejected": 0.097198486328125, + "logps/chosen": -1.5258474349975586, + "logps/rejected": -2.320789337158203, + "loss": -0.1054, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27151721715927124, + "rewards/margins": 0.10544810444116592, + "rewards/rejected": 0.1660691201686859, + "step": 84 + }, + { + "epoch": 0.2049427365883062, + "grad_norm": 0.34552255272865295, + "learning_rate": 4.436997319034853e-06, + "logits/chosen": -0.3366241455078125, + "logits/rejected": -0.1417083740234375, + "logps/chosen": -1.6111032962799072, + "logps/rejected": -2.463747262954712, + "loss": -0.1116, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23705938458442688, + "rewards/margins": 0.11157448589801788, + "rewards/rejected": 0.1254848837852478, + "step": 85 + }, + { + "epoch": 0.20735382760699217, + "grad_norm": 0.19965040683746338, + "learning_rate": 4.423592493297587e-06, + "logits/chosen": -0.31158447265625, + "logits/rejected": -0.24356842041015625, + "logps/chosen": -1.8350573778152466, + "logps/rejected": -2.5424113273620605, + "loss": -0.0809, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1971243917942047, + "rewards/margins": 0.08091898262500763, + "rewards/rejected": 0.11620542407035828, + "step": 86 + }, + { + "epoch": 0.20976491862567812, + "grad_norm": 0.2497408092021942, + "learning_rate": 4.4101876675603224e-06, + "logits/chosen": -0.20909881591796875, + "logits/rejected": -0.13914060592651367, + "logps/chosen": -2.1212737560272217, + "logps/rejected": -2.6856305599212646, + "loss": -0.0844, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19036230444908142, + "rewards/margins": 0.08436112105846405, + "rewards/rejected": 0.10600116848945618, + "step": 87 + }, + { + "epoch": 0.2121760096443641, + "grad_norm": 0.2111276239156723, + "learning_rate": 4.396782841823057e-06, + "logits/chosen": -0.3738212585449219, + "logits/rejected": -0.41668701171875, + "logps/chosen": -1.758054494857788, + "logps/rejected": -2.4501168727874756, + "loss": -0.0747, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21286746859550476, + "rewards/margins": 0.07473389059305191, + "rewards/rejected": 0.13813358545303345, + "step": 88 + }, + { + "epoch": 0.21458710066305003, + "grad_norm": 0.10888315737247467, + "learning_rate": 4.383378016085791e-06, + "logits/chosen": -0.314666748046875, + "logits/rejected": -0.07592010498046875, + "logps/chosen": -1.9387867450714111, + "logps/rejected": -2.9789085388183594, + "loss": -0.1106, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17687290906906128, + "rewards/margins": 0.1105976328253746, + "rewards/rejected": 0.06627528369426727, + "step": 89 + }, + { + "epoch": 0.21699819168173598, + "grad_norm": 0.21843066811561584, + "learning_rate": 4.369973190348526e-06, + "logits/chosen": -0.3724937438964844, + "logits/rejected": -0.12108993530273438, + "logps/chosen": -1.7286062240600586, + "logps/rejected": -2.1806440353393555, + "loss": -0.0559, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.251477986574173, + "rewards/margins": 0.05585840716958046, + "rewards/rejected": 0.1956195831298828, + "step": 90 + }, + { + "epoch": 0.21940928270042195, + "grad_norm": 0.1985900104045868, + "learning_rate": 4.35656836461126e-06, + "logits/chosen": -0.2830810546875, + "logits/rejected": -0.233734130859375, + "logps/chosen": -1.8169076442718506, + "logps/rejected": -2.9106485843658447, + "loss": -0.1253, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.19795525074005127, + "rewards/margins": 0.12529826164245605, + "rewards/rejected": 0.07265698909759521, + "step": 91 + }, + { + "epoch": 0.2218203737191079, + "grad_norm": 0.5389503240585327, + "learning_rate": 4.343163538873995e-06, + "logits/chosen": -0.24054336547851562, + "logits/rejected": -0.027099609375, + "logps/chosen": -1.6436183452606201, + "logps/rejected": -2.21420955657959, + "loss": -0.0827, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23773282766342163, + "rewards/margins": 0.08266165852546692, + "rewards/rejected": 0.1550711691379547, + "step": 92 + }, + { + "epoch": 0.22423146473779385, + "grad_norm": 0.39058253169059753, + "learning_rate": 4.32975871313673e-06, + "logits/chosen": -0.3819732666015625, + "logits/rejected": -0.41412353515625, + "logps/chosen": -2.3253557682037354, + "logps/rejected": -2.9037134647369385, + "loss": -0.0723, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1553516536951065, + "rewards/margins": 0.07226738333702087, + "rewards/rejected": 0.08308427035808563, + "step": 93 + }, + { + "epoch": 0.22664255575647982, + "grad_norm": 0.28771349787712097, + "learning_rate": 4.316353887399464e-06, + "logits/chosen": -0.4373664855957031, + "logits/rejected": -0.2580432891845703, + "logps/chosen": -1.682684063911438, + "logps/rejected": -2.8237454891204834, + "loss": -0.1139, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21207323670387268, + "rewards/margins": 0.11388413608074188, + "rewards/rejected": 0.0981891006231308, + "step": 94 + }, + { + "epoch": 0.22905364677516576, + "grad_norm": 0.3826136291027069, + "learning_rate": 4.302949061662199e-06, + "logits/chosen": -0.2050018310546875, + "logits/rejected": -0.1048583984375, + "logps/chosen": -1.630781650543213, + "logps/rejected": -2.340024709701538, + "loss": -0.1231, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.30112412571907043, + "rewards/margins": 0.1230817511677742, + "rewards/rejected": 0.17804236710071564, + "step": 95 + }, + { + "epoch": 0.2314647377938517, + "grad_norm": 0.28904107213020325, + "learning_rate": 4.2895442359249335e-06, + "logits/chosen": -0.10987091064453125, + "logits/rejected": 0.16712188720703125, + "logps/chosen": -2.0005404949188232, + "logps/rejected": -2.7157270908355713, + "loss": -0.0922, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19579149782657623, + "rewards/margins": 0.09215118736028671, + "rewards/rejected": 0.10364031046628952, + "step": 96 + }, + { + "epoch": 0.23387582881253768, + "grad_norm": 0.28353551030158997, + "learning_rate": 4.276139410187668e-06, + "logits/chosen": -0.291778564453125, + "logits/rejected": -0.335479736328125, + "logps/chosen": -2.219102621078491, + "logps/rejected": -2.8513545989990234, + "loss": -0.0835, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18798625469207764, + "rewards/margins": 0.08353492617607117, + "rewards/rejected": 0.10445132851600647, + "step": 97 + }, + { + "epoch": 0.23628691983122363, + "grad_norm": 0.1202004924416542, + "learning_rate": 4.262734584450403e-06, + "logits/chosen": -0.10400390625, + "logits/rejected": -0.391357421875, + "logps/chosen": -1.8681118488311768, + "logps/rejected": -2.7251126766204834, + "loss": -0.0929, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1838112473487854, + "rewards/margins": 0.0928950309753418, + "rewards/rejected": 0.0909162238240242, + "step": 98 + }, + { + "epoch": 0.23869801084990958, + "grad_norm": 0.24099823832511902, + "learning_rate": 4.249329758713137e-06, + "logits/chosen": -0.458831787109375, + "logits/rejected": -0.258544921875, + "logps/chosen": -2.1151838302612305, + "logps/rejected": -2.593945026397705, + "loss": -0.0772, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1863017976284027, + "rewards/margins": 0.07718081772327423, + "rewards/rejected": 0.10912098735570908, + "step": 99 + }, + { + "epoch": 0.24110910186859555, + "grad_norm": 0.14569653570652008, + "learning_rate": 4.2359249329758715e-06, + "logits/chosen": -0.12518310546875, + "logits/rejected": -0.225616455078125, + "logps/chosen": -1.9331969022750854, + "logps/rejected": -3.222170352935791, + "loss": -0.1074, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.18711033463478088, + "rewards/margins": 0.10739375650882721, + "rewards/rejected": 0.07971656322479248, + "step": 100 + }, + { + "epoch": 0.2435201928872815, + "grad_norm": 0.16426680982112885, + "learning_rate": 4.222520107238607e-06, + "logits/chosen": -0.196533203125, + "logits/rejected": -0.091705322265625, + "logps/chosen": -1.6990966796875, + "logps/rejected": -2.578277111053467, + "loss": -0.0945, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23317289352416992, + "rewards/margins": 0.09446683526039124, + "rewards/rejected": 0.1387060582637787, + "step": 101 + }, + { + "epoch": 0.24593128390596744, + "grad_norm": 0.13737422227859497, + "learning_rate": 4.209115281501341e-06, + "logits/chosen": -0.21097087860107422, + "logits/rejected": -0.35211181640625, + "logps/chosen": -1.9149645566940308, + "logps/rejected": -2.552746295928955, + "loss": -0.0875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23537056148052216, + "rewards/margins": 0.08745107054710388, + "rewards/rejected": 0.14791950583457947, + "step": 102 + }, + { + "epoch": 0.24834237492465341, + "grad_norm": 0.3818114697933197, + "learning_rate": 4.195710455764075e-06, + "logits/chosen": -0.14261245727539062, + "logits/rejected": 0.32440948486328125, + "logps/chosen": -1.8553460836410522, + "logps/rejected": -2.6073460578918457, + "loss": -0.0916, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21273291110992432, + "rewards/margins": 0.0915689468383789, + "rewards/rejected": 0.12116396427154541, + "step": 103 + }, + { + "epoch": 0.25075346594333936, + "grad_norm": 0.25290530920028687, + "learning_rate": 4.18230563002681e-06, + "logits/chosen": -0.0988311767578125, + "logits/rejected": 0.1678466796875, + "logps/chosen": -1.1757745742797852, + "logps/rejected": -1.6883862018585205, + "loss": -0.1212, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.371662974357605, + "rewards/margins": 0.12118181586265564, + "rewards/rejected": 0.25048115849494934, + "step": 104 + }, + { + "epoch": 0.25316455696202533, + "grad_norm": 0.4634414613246918, + "learning_rate": 4.168900804289545e-06, + "logits/chosen": -0.224853515625, + "logits/rejected": -0.10296630859375, + "logps/chosen": -1.812528133392334, + "logps/rejected": -2.460575819015503, + "loss": -0.1094, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23638510704040527, + "rewards/margins": 0.10943854600191116, + "rewards/rejected": 0.1269465684890747, + "step": 105 + }, + { + "epoch": 0.25557564798071125, + "grad_norm": 0.324603796005249, + "learning_rate": 4.155495978552279e-06, + "logits/chosen": -0.236083984375, + "logits/rejected": 0.09223747253417969, + "logps/chosen": -1.4622098207473755, + "logps/rejected": -2.4940311908721924, + "loss": -0.139, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2819633185863495, + "rewards/margins": 0.13900119066238403, + "rewards/rejected": 0.14296211302280426, + "step": 106 + }, + { + "epoch": 0.2579867389993972, + "grad_norm": 0.15879696607589722, + "learning_rate": 4.142091152815014e-06, + "logits/chosen": -0.4085693359375, + "logits/rejected": -0.4613189697265625, + "logps/chosen": -2.0223631858825684, + "logps/rejected": -2.7616350650787354, + "loss": -0.0924, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1746683120727539, + "rewards/margins": 0.09240110218524933, + "rewards/rejected": 0.08226720243692398, + "step": 107 + }, + { + "epoch": 0.2603978300180832, + "grad_norm": 0.3156600892543793, + "learning_rate": 4.128686327077748e-06, + "logits/chosen": -0.250762939453125, + "logits/rejected": -0.24658203125, + "logps/chosen": -1.8081755638122559, + "logps/rejected": -2.5799002647399902, + "loss": -0.0881, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23830097913742065, + "rewards/margins": 0.08809483796358109, + "rewards/rejected": 0.15020614862442017, + "step": 108 + }, + { + "epoch": 0.2628089210367691, + "grad_norm": 0.18173182010650635, + "learning_rate": 4.115281501340483e-06, + "logits/chosen": -0.24599409103393555, + "logits/rejected": -0.0904541015625, + "logps/chosen": -1.885944128036499, + "logps/rejected": -2.5246598720550537, + "loss": -0.0545, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19110161066055298, + "rewards/margins": 0.054463762789964676, + "rewards/rejected": 0.1366378515958786, + "step": 109 + }, + { + "epoch": 0.2652200120554551, + "grad_norm": 0.12534259259700775, + "learning_rate": 4.101876675603218e-06, + "logits/chosen": -0.46356964111328125, + "logits/rejected": -0.44899749755859375, + "logps/chosen": -1.6304409503936768, + "logps/rejected": -2.6182172298431396, + "loss": -0.1224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24056994915008545, + "rewards/margins": 0.12238539755344391, + "rewards/rejected": 0.11818452924489975, + "step": 110 + }, + { + "epoch": 0.26763110307414106, + "grad_norm": 0.40160471200942993, + "learning_rate": 4.088471849865952e-06, + "logits/chosen": -0.1213531494140625, + "logits/rejected": 0.256622314453125, + "logps/chosen": -1.5642027854919434, + "logps/rejected": -1.8273777961730957, + "loss": -0.0704, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33564794063568115, + "rewards/margins": 0.07044831663370132, + "rewards/rejected": 0.2651996314525604, + "step": 111 + }, + { + "epoch": 0.270042194092827, + "grad_norm": 0.1428900957107544, + "learning_rate": 4.075067024128686e-06, + "logits/chosen": -0.3281402587890625, + "logits/rejected": -0.0436859130859375, + "logps/chosen": -1.6408134698867798, + "logps/rejected": -2.3986873626708984, + "loss": -0.0943, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2439131736755371, + "rewards/margins": 0.09430273622274399, + "rewards/rejected": 0.14961043000221252, + "step": 112 + }, + { + "epoch": 0.27245328511151296, + "grad_norm": 0.5922664999961853, + "learning_rate": 4.0616621983914214e-06, + "logits/chosen": 0.1334686279296875, + "logits/rejected": -0.14678192138671875, + "logps/chosen": -1.1047554016113281, + "logps/rejected": -1.5507018566131592, + "loss": -0.0672, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.360832154750824, + "rewards/margins": 0.06723302602767944, + "rewards/rejected": 0.2935991585254669, + "step": 113 + }, + { + "epoch": 0.27486437613019893, + "grad_norm": 0.1327289640903473, + "learning_rate": 4.048257372654156e-06, + "logits/chosen": -0.4437551498413086, + "logits/rejected": -0.544647216796875, + "logps/chosen": -1.9275341033935547, + "logps/rejected": -2.640573024749756, + "loss": -0.1109, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2113649547100067, + "rewards/margins": 0.11089782416820526, + "rewards/rejected": 0.10046711564064026, + "step": 114 + }, + { + "epoch": 0.27727546714888485, + "grad_norm": 0.23644386231899261, + "learning_rate": 4.03485254691689e-06, + "logits/chosen": -0.3495635986328125, + "logits/rejected": -0.0189208984375, + "logps/chosen": -1.701964259147644, + "logps/rejected": -2.173644781112671, + "loss": -0.0611, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24110406637191772, + "rewards/margins": 0.0611141100525856, + "rewards/rejected": 0.17998996376991272, + "step": 115 + }, + { + "epoch": 0.2796865581675708, + "grad_norm": 0.15662124752998352, + "learning_rate": 4.021447721179625e-06, + "logits/chosen": -0.34195709228515625, + "logits/rejected": -0.4056396484375, + "logps/chosen": -2.156498432159424, + "logps/rejected": -2.556830406188965, + "loss": -0.0638, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16725093126296997, + "rewards/margins": 0.06381096690893173, + "rewards/rejected": 0.10343997180461884, + "step": 116 + }, + { + "epoch": 0.2820976491862568, + "grad_norm": 0.10304291546344757, + "learning_rate": 4.008042895442359e-06, + "logits/chosen": -0.18689727783203125, + "logits/rejected": -0.379608154296875, + "logps/chosen": -1.9597594738006592, + "logps/rejected": -2.607123851776123, + "loss": -0.061, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19024546444416046, + "rewards/margins": 0.060972318053245544, + "rewards/rejected": 0.12927314639091492, + "step": 117 + }, + { + "epoch": 0.2845087402049427, + "grad_norm": 0.25902849435806274, + "learning_rate": 3.994638069705094e-06, + "logits/chosen": -0.3756866455078125, + "logits/rejected": -0.5497055053710938, + "logps/chosen": -1.8840107917785645, + "logps/rejected": -2.55940842628479, + "loss": -0.0636, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19889983534812927, + "rewards/margins": 0.06361623853445053, + "rewards/rejected": 0.13528360426425934, + "step": 118 + }, + { + "epoch": 0.2869198312236287, + "grad_norm": 0.30815866589546204, + "learning_rate": 3.981233243967829e-06, + "logits/chosen": -0.3415679931640625, + "logits/rejected": -0.3078479766845703, + "logps/chosen": -1.5039807558059692, + "logps/rejected": -2.126880645751953, + "loss": -0.0663, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26701006293296814, + "rewards/margins": 0.06632599234580994, + "rewards/rejected": 0.2006840854883194, + "step": 119 + }, + { + "epoch": 0.28933092224231466, + "grad_norm": 0.456194669008255, + "learning_rate": 3.967828418230563e-06, + "logits/chosen": -0.3817405700683594, + "logits/rejected": 0.17862510681152344, + "logps/chosen": -1.5209466218948364, + "logps/rejected": -2.282346487045288, + "loss": -0.089, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.29129108786582947, + "rewards/margins": 0.08900464326143265, + "rewards/rejected": 0.20228645205497742, + "step": 120 + }, + { + "epoch": 0.2917420132610006, + "grad_norm": 0.12140549719333649, + "learning_rate": 3.954423592493297e-06, + "logits/chosen": -0.21002197265625, + "logits/rejected": -0.15264892578125, + "logps/chosen": -2.0790646076202393, + "logps/rejected": -2.9354448318481445, + "loss": -0.0988, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18235059082508087, + "rewards/margins": 0.0988008975982666, + "rewards/rejected": 0.08354970812797546, + "step": 121 + }, + { + "epoch": 0.29415310427968655, + "grad_norm": 0.19598804414272308, + "learning_rate": 3.9410187667560325e-06, + "logits/chosen": -0.2721099853515625, + "logits/rejected": -0.1711578369140625, + "logps/chosen": -1.8032771348953247, + "logps/rejected": -2.242562770843506, + "loss": -0.0646, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2550719380378723, + "rewards/margins": 0.06460437178611755, + "rewards/rejected": 0.19046758115291595, + "step": 122 + }, + { + "epoch": 0.2965641952983725, + "grad_norm": 0.20384636521339417, + "learning_rate": 3.927613941018767e-06, + "logits/chosen": -0.3419189453125, + "logits/rejected": -0.120880126953125, + "logps/chosen": -1.8109452724456787, + "logps/rejected": -2.5486836433410645, + "loss": -0.0827, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21803930401802063, + "rewards/margins": 0.08268535882234573, + "rewards/rejected": 0.1353539377450943, + "step": 123 + }, + { + "epoch": 0.29897528631705844, + "grad_norm": 0.20745395123958588, + "learning_rate": 3.914209115281501e-06, + "logits/chosen": 0.0353851318359375, + "logits/rejected": -0.04217529296875, + "logps/chosen": -1.878538966178894, + "logps/rejected": -1.977557897567749, + "loss": -0.0154, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.2521553039550781, + "rewards/margins": 0.015359786339104176, + "rewards/rejected": 0.23679551482200623, + "step": 124 + }, + { + "epoch": 0.3013863773357444, + "grad_norm": 0.3772227466106415, + "learning_rate": 3.900804289544236e-06, + "logits/chosen": -0.07265853881835938, + "logits/rejected": -0.25904083251953125, + "logps/chosen": -2.103529691696167, + "logps/rejected": -2.508167266845703, + "loss": -0.0648, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19217947125434875, + "rewards/margins": 0.06476208567619324, + "rewards/rejected": 0.12741738557815552, + "step": 125 + }, + { + "epoch": 0.3037974683544304, + "grad_norm": 0.23701998591423035, + "learning_rate": 3.8873994638069705e-06, + "logits/chosen": -0.275390625, + "logits/rejected": 0.25140380859375, + "logps/chosen": -1.7639429569244385, + "logps/rejected": -2.795426845550537, + "loss": -0.1427, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22117727994918823, + "rewards/margins": 0.1427486389875412, + "rewards/rejected": 0.07842864096164703, + "step": 126 + }, + { + "epoch": 0.30620855937311636, + "grad_norm": 0.16641807556152344, + "learning_rate": 3.873994638069705e-06, + "logits/chosen": -0.335418701171875, + "logits/rejected": 0.2322998046875, + "logps/chosen": -1.8744635581970215, + "logps/rejected": -2.488361120223999, + "loss": -0.1248, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.267265260219574, + "rewards/margins": 0.12475024908781052, + "rewards/rejected": 0.14251500368118286, + "step": 127 + }, + { + "epoch": 0.3086196503918023, + "grad_norm": 0.3444962501525879, + "learning_rate": 3.86058981233244e-06, + "logits/chosen": -0.4952392578125, + "logits/rejected": -0.5163421630859375, + "logps/chosen": -1.9972844123840332, + "logps/rejected": -2.637503147125244, + "loss": -0.0566, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2014060914516449, + "rewards/margins": 0.0565631203353405, + "rewards/rejected": 0.1448429822921753, + "step": 128 + }, + { + "epoch": 0.31103074141048825, + "grad_norm": 0.15936574339866638, + "learning_rate": 3.847184986595174e-06, + "logits/chosen": -0.3858642578125, + "logits/rejected": -0.163482666015625, + "logps/chosen": -1.529799461364746, + "logps/rejected": -2.809572219848633, + "loss": -0.1517, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2267850637435913, + "rewards/margins": 0.15168723464012146, + "rewards/rejected": 0.07509784400463104, + "step": 129 + }, + { + "epoch": 0.3134418324291742, + "grad_norm": 0.2980387806892395, + "learning_rate": 3.833780160857909e-06, + "logits/chosen": 0.0192108154296875, + "logits/rejected": 0.3061981201171875, + "logps/chosen": -1.6735682487487793, + "logps/rejected": -2.1918599605560303, + "loss": -0.0486, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24807590246200562, + "rewards/margins": 0.048588160425424576, + "rewards/rejected": 0.19948776066303253, + "step": 130 + }, + { + "epoch": 0.31585292344786015, + "grad_norm": 0.12159116566181183, + "learning_rate": 3.820375335120644e-06, + "logits/chosen": -0.23541259765625, + "logits/rejected": -0.31982421875, + "logps/chosen": -2.0227720737457275, + "logps/rejected": -2.5635733604431152, + "loss": -0.0752, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18979406356811523, + "rewards/margins": 0.07516118884086609, + "rewards/rejected": 0.11463286727666855, + "step": 131 + }, + { + "epoch": 0.3182640144665461, + "grad_norm": 0.17405179142951965, + "learning_rate": 3.806970509383378e-06, + "logits/chosen": -0.409698486328125, + "logits/rejected": -0.24062347412109375, + "logps/chosen": -2.0117478370666504, + "logps/rejected": -2.7961604595184326, + "loss": -0.0895, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19796901941299438, + "rewards/margins": 0.08948354423046112, + "rewards/rejected": 0.10848548263311386, + "step": 132 + }, + { + "epoch": 0.3206751054852321, + "grad_norm": 0.33214181661605835, + "learning_rate": 3.7935656836461126e-06, + "logits/chosen": -0.18270492553710938, + "logits/rejected": -0.3084716796875, + "logps/chosen": -1.5752081871032715, + "logps/rejected": -2.373183250427246, + "loss": -0.0872, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2828559875488281, + "rewards/margins": 0.08717310428619385, + "rewards/rejected": 0.19568288326263428, + "step": 133 + }, + { + "epoch": 0.323086196503918, + "grad_norm": 0.13382308185100555, + "learning_rate": 3.7801608579088473e-06, + "logits/chosen": -0.10395431518554688, + "logits/rejected": -0.142425537109375, + "logps/chosen": -1.8472363948822021, + "logps/rejected": -2.658161163330078, + "loss": -0.1066, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20788323879241943, + "rewards/margins": 0.10657000541687012, + "rewards/rejected": 0.10131323337554932, + "step": 134 + }, + { + "epoch": 0.325497287522604, + "grad_norm": 0.2537650763988495, + "learning_rate": 3.7667560321715816e-06, + "logits/chosen": -0.189178466796875, + "logits/rejected": 0.0062255859375, + "logps/chosen": -2.1654839515686035, + "logps/rejected": -2.7937369346618652, + "loss": -0.1056, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19018790125846863, + "rewards/margins": 0.1055651307106018, + "rewards/rejected": 0.08462277054786682, + "step": 135 + }, + { + "epoch": 0.32790837854128996, + "grad_norm": 0.19514308869838715, + "learning_rate": 3.7533512064343163e-06, + "logits/chosen": -0.3323822021484375, + "logits/rejected": 0.1976318359375, + "logps/chosen": -1.761709213256836, + "logps/rejected": -2.5666615962982178, + "loss": -0.1247, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2264426350593567, + "rewards/margins": 0.12468908727169037, + "rewards/rejected": 0.10175354033708572, + "step": 136 + }, + { + "epoch": 0.3303194695599759, + "grad_norm": 0.20550787448883057, + "learning_rate": 3.7399463806970514e-06, + "logits/chosen": -0.4935302734375, + "logits/rejected": 0.031219482421875, + "logps/chosen": -1.7155025005340576, + "logps/rejected": -2.9084179401397705, + "loss": -0.1335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.203782320022583, + "rewards/margins": 0.13346055150032043, + "rewards/rejected": 0.07032175362110138, + "step": 137 + }, + { + "epoch": 0.33273056057866185, + "grad_norm": 0.2004317045211792, + "learning_rate": 3.726541554959786e-06, + "logits/chosen": -0.1445465087890625, + "logits/rejected": 0.095794677734375, + "logps/chosen": -1.7848482131958008, + "logps/rejected": -3.0859837532043457, + "loss": -0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2135561853647232, + "rewards/margins": 0.13746829330921173, + "rewards/rejected": 0.07608789205551147, + "step": 138 + }, + { + "epoch": 0.3351416515973478, + "grad_norm": 0.27391791343688965, + "learning_rate": 3.713136729222521e-06, + "logits/chosen": -0.16164779663085938, + "logits/rejected": 0.152679443359375, + "logps/chosen": -2.033043384552002, + "logps/rejected": -2.719478130340576, + "loss": -0.1146, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20201122760772705, + "rewards/margins": 0.11455677449703217, + "rewards/rejected": 0.08745446056127548, + "step": 139 + }, + { + "epoch": 0.33755274261603374, + "grad_norm": 0.1601397842168808, + "learning_rate": 3.699731903485255e-06, + "logits/chosen": -0.25620079040527344, + "logits/rejected": -0.3312492370605469, + "logps/chosen": -1.734803557395935, + "logps/rejected": -2.4838268756866455, + "loss": -0.0742, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2659730017185211, + "rewards/margins": 0.07417897880077362, + "rewards/rejected": 0.1917940378189087, + "step": 140 + }, + { + "epoch": 0.3399638336347197, + "grad_norm": 0.39302170276641846, + "learning_rate": 3.68632707774799e-06, + "logits/chosen": -0.60992431640625, + "logits/rejected": -0.39996337890625, + "logps/chosen": -1.718180775642395, + "logps/rejected": -2.8960657119750977, + "loss": -0.1128, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24195371568202972, + "rewards/margins": 0.11284901201725006, + "rewards/rejected": 0.12910471856594086, + "step": 141 + }, + { + "epoch": 0.3423749246534057, + "grad_norm": 0.27056244015693665, + "learning_rate": 3.6729222520107246e-06, + "logits/chosen": -0.5115966796875, + "logits/rejected": -0.11415481567382812, + "logps/chosen": -1.8849842548370361, + "logps/rejected": -2.7099342346191406, + "loss": -0.1179, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22585538029670715, + "rewards/margins": 0.1178693100810051, + "rewards/rejected": 0.10798607766628265, + "step": 142 + }, + { + "epoch": 0.3447860156720916, + "grad_norm": 0.4331517517566681, + "learning_rate": 3.659517426273459e-06, + "logits/chosen": -0.35406494140625, + "logits/rejected": -0.33056640625, + "logps/chosen": -1.701088786125183, + "logps/rejected": -2.8120265007019043, + "loss": -0.1106, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2163262814283371, + "rewards/margins": 0.11060433089733124, + "rewards/rejected": 0.10572194308042526, + "step": 143 + }, + { + "epoch": 0.3471971066907776, + "grad_norm": 0.1848490983247757, + "learning_rate": 3.6461126005361935e-06, + "logits/chosen": -0.24874114990234375, + "logits/rejected": -0.34979248046875, + "logps/chosen": -2.4686079025268555, + "logps/rejected": -3.148228168487549, + "loss": -0.0763, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12093396484851837, + "rewards/margins": 0.07626403868198395, + "rewards/rejected": 0.04466991871595383, + "step": 144 + }, + { + "epoch": 0.34960819770946355, + "grad_norm": 0.24051529169082642, + "learning_rate": 3.6327077747989283e-06, + "logits/chosen": -0.0614013671875, + "logits/rejected": 0.448883056640625, + "logps/chosen": -1.967170238494873, + "logps/rejected": -2.4206724166870117, + "loss": -0.0695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23632943630218506, + "rewards/margins": 0.0694933831691742, + "rewards/rejected": 0.16683605313301086, + "step": 145 + }, + { + "epoch": 0.35201928872814947, + "grad_norm": 0.12445730715990067, + "learning_rate": 3.6193029490616625e-06, + "logits/chosen": -0.1702408790588379, + "logits/rejected": -0.166351318359375, + "logps/chosen": -1.990149736404419, + "logps/rejected": -2.574845790863037, + "loss": -0.0849, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2130918800830841, + "rewards/margins": 0.08488123118877411, + "rewards/rejected": 0.12821064889431, + "step": 146 + }, + { + "epoch": 0.35443037974683544, + "grad_norm": 0.22781485319137573, + "learning_rate": 3.6058981233243972e-06, + "logits/chosen": -0.3178863525390625, + "logits/rejected": -0.218231201171875, + "logps/chosen": -1.5530338287353516, + "logps/rejected": -2.4394164085388184, + "loss": -0.0883, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26591306924819946, + "rewards/margins": 0.08833837509155273, + "rewards/rejected": 0.17757469415664673, + "step": 147 + }, + { + "epoch": 0.3568414707655214, + "grad_norm": 0.4100662171840668, + "learning_rate": 3.592493297587132e-06, + "logits/chosen": -0.02611064910888672, + "logits/rejected": -0.0902557373046875, + "logps/chosen": -1.5063302516937256, + "logps/rejected": -2.0787148475646973, + "loss": -0.0513, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2762554883956909, + "rewards/margins": 0.051341067999601364, + "rewards/rejected": 0.22491443157196045, + "step": 148 + }, + { + "epoch": 0.35925256178420734, + "grad_norm": 0.2707577347755432, + "learning_rate": 3.5790884718498662e-06, + "logits/chosen": -0.2430877685546875, + "logits/rejected": -0.121826171875, + "logps/chosen": -1.9725404977798462, + "logps/rejected": -2.7294797897338867, + "loss": -0.0799, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.187856987118721, + "rewards/margins": 0.07990557700395584, + "rewards/rejected": 0.10795141756534576, + "step": 149 + }, + { + "epoch": 0.3616636528028933, + "grad_norm": 0.32208871841430664, + "learning_rate": 3.565683646112601e-06, + "logits/chosen": -0.16693115234375, + "logits/rejected": 0.0411529541015625, + "logps/chosen": -1.4728728532791138, + "logps/rejected": -2.335010290145874, + "loss": -0.1053, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27311092615127563, + "rewards/margins": 0.10532081127166748, + "rewards/rejected": 0.16779009997844696, + "step": 150 + }, + { + "epoch": 0.3640747438215793, + "grad_norm": 0.08245796710252762, + "learning_rate": 3.5522788203753356e-06, + "logits/chosen": -0.33685302734375, + "logits/rejected": -0.46466064453125, + "logps/chosen": -2.0189056396484375, + "logps/rejected": -2.807873249053955, + "loss": -0.112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19605892896652222, + "rewards/margins": 0.11196976155042648, + "rewards/rejected": 0.08408917486667633, + "step": 151 + }, + { + "epoch": 0.3664858348402652, + "grad_norm": 0.3357884883880615, + "learning_rate": 3.53887399463807e-06, + "logits/chosen": -0.1822509765625, + "logits/rejected": 0.108062744140625, + "logps/chosen": -1.4196295738220215, + "logps/rejected": -2.2184829711914062, + "loss": -0.1068, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2789333462715149, + "rewards/margins": 0.10681991279125214, + "rewards/rejected": 0.17211341857910156, + "step": 152 + }, + { + "epoch": 0.3688969258589512, + "grad_norm": 0.17198774218559265, + "learning_rate": 3.5254691689008046e-06, + "logits/chosen": -0.47198486328125, + "logits/rejected": -0.17987060546875, + "logps/chosen": -1.7352066040039062, + "logps/rejected": -2.5751121044158936, + "loss": -0.1275, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22849620878696442, + "rewards/margins": 0.1274811327457428, + "rewards/rejected": 0.10101506859064102, + "step": 153 + }, + { + "epoch": 0.37130801687763715, + "grad_norm": 0.3036087155342102, + "learning_rate": 3.5120643431635393e-06, + "logits/chosen": -0.26273632049560547, + "logits/rejected": -0.06774520874023438, + "logps/chosen": -1.6372747421264648, + "logps/rejected": -2.0982563495635986, + "loss": -0.0822, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28292781114578247, + "rewards/margins": 0.08221237361431122, + "rewards/rejected": 0.20071543753147125, + "step": 154 + }, + { + "epoch": 0.37371910789632307, + "grad_norm": 0.19731618463993073, + "learning_rate": 3.498659517426274e-06, + "logits/chosen": -0.419708251953125, + "logits/rejected": -0.3563232421875, + "logps/chosen": -2.0185813903808594, + "logps/rejected": -2.8528635501861572, + "loss": -0.0953, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17664362490177155, + "rewards/margins": 0.09534762799739838, + "rewards/rejected": 0.08129599690437317, + "step": 155 + }, + { + "epoch": 0.37613019891500904, + "grad_norm": 0.2865287959575653, + "learning_rate": 3.4852546916890083e-06, + "logits/chosen": -0.380615234375, + "logits/rejected": -0.3313713073730469, + "logps/chosen": -1.8958239555358887, + "logps/rejected": -2.5793466567993164, + "loss": -0.0997, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2176969051361084, + "rewards/margins": 0.09974562376737595, + "rewards/rejected": 0.11795127391815186, + "step": 156 + }, + { + "epoch": 0.378541289933695, + "grad_norm": 0.17225690186023712, + "learning_rate": 3.471849865951743e-06, + "logits/chosen": -0.20720672607421875, + "logits/rejected": 0.1303863525390625, + "logps/chosen": -1.7372939586639404, + "logps/rejected": -2.2546074390411377, + "loss": -0.0945, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.267435222864151, + "rewards/margins": 0.09453544020652771, + "rewards/rejected": 0.1728997826576233, + "step": 157 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.34359943866729736, + "learning_rate": 3.4584450402144778e-06, + "logits/chosen": -0.07238006591796875, + "logits/rejected": -0.11508941650390625, + "logps/chosen": -1.454909086227417, + "logps/rejected": -2.7384259700775146, + "loss": -0.1615, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.265159547328949, + "rewards/margins": 0.16154341399669647, + "rewards/rejected": 0.1036161482334137, + "step": 158 + }, + { + "epoch": 0.3833634719710669, + "grad_norm": 0.3315942585468292, + "learning_rate": 3.445040214477212e-06, + "logits/chosen": -0.4606971740722656, + "logits/rejected": -0.0023193359375, + "logps/chosen": -1.9082099199295044, + "logps/rejected": -2.8033318519592285, + "loss": -0.0934, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21753202378749847, + "rewards/margins": 0.0933612659573555, + "rewards/rejected": 0.12417076528072357, + "step": 159 + }, + { + "epoch": 0.3857745629897529, + "grad_norm": 0.22415418922901154, + "learning_rate": 3.4316353887399467e-06, + "logits/chosen": -0.5545654296875, + "logits/rejected": -0.234130859375, + "logps/chosen": -1.9420737028121948, + "logps/rejected": -2.807130813598633, + "loss": -0.1164, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20228226482868195, + "rewards/margins": 0.11638941615819931, + "rewards/rejected": 0.08589284121990204, + "step": 160 + }, + { + "epoch": 0.3881856540084388, + "grad_norm": 0.218908429145813, + "learning_rate": 3.4182305630026814e-06, + "logits/chosen": -0.22979736328125, + "logits/rejected": -0.46307373046875, + "logps/chosen": -1.7105002403259277, + "logps/rejected": -2.8572072982788086, + "loss": -0.1365, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21908557415008545, + "rewards/margins": 0.13646164536476135, + "rewards/rejected": 0.08262395113706589, + "step": 161 + }, + { + "epoch": 0.39059674502712477, + "grad_norm": 0.33666086196899414, + "learning_rate": 3.4048257372654157e-06, + "logits/chosen": -0.1859893798828125, + "logits/rejected": -0.230438232421875, + "logps/chosen": -1.9788360595703125, + "logps/rejected": -2.3243966102600098, + "loss": -0.0598, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.196782648563385, + "rewards/margins": 0.059835951775312424, + "rewards/rejected": 0.13694670796394348, + "step": 162 + }, + { + "epoch": 0.39300783604581074, + "grad_norm": 0.4421437680721283, + "learning_rate": 3.3914209115281504e-06, + "logits/chosen": -0.11785888671875, + "logits/rejected": -0.4765777587890625, + "logps/chosen": -2.41336989402771, + "logps/rejected": -2.7111411094665527, + "loss": -0.0402, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.18764996528625488, + "rewards/margins": 0.04022819921374321, + "rewards/rejected": 0.14742176234722137, + "step": 163 + }, + { + "epoch": 0.39541892706449666, + "grad_norm": 0.1515016406774521, + "learning_rate": 3.378016085790885e-06, + "logits/chosen": -0.38008880615234375, + "logits/rejected": -0.2392425537109375, + "logps/chosen": -2.081786870956421, + "logps/rejected": -2.6870241165161133, + "loss": -0.062, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1730046272277832, + "rewards/margins": 0.06201169267296791, + "rewards/rejected": 0.11099293828010559, + "step": 164 + }, + { + "epoch": 0.39783001808318263, + "grad_norm": 0.1712828278541565, + "learning_rate": 3.3646112600536194e-06, + "logits/chosen": -0.060943603515625, + "logits/rejected": -0.1932373046875, + "logps/chosen": -1.9017142057418823, + "logps/rejected": -2.711599826812744, + "loss": -0.1037, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19711501896381378, + "rewards/margins": 0.10365079343318939, + "rewards/rejected": 0.09346422553062439, + "step": 165 + }, + { + "epoch": 0.4002411091018686, + "grad_norm": 0.17834404110908508, + "learning_rate": 3.351206434316354e-06, + "logits/chosen": -0.387451171875, + "logits/rejected": -0.4556121826171875, + "logps/chosen": -1.8684295415878296, + "logps/rejected": -3.169869899749756, + "loss": -0.1356, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18259389698505402, + "rewards/margins": 0.13558465242385864, + "rewards/rejected": 0.04700924828648567, + "step": 166 + }, + { + "epoch": 0.4026522001205545, + "grad_norm": 0.208110511302948, + "learning_rate": 3.337801608579089e-06, + "logits/chosen": -0.530792236328125, + "logits/rejected": -0.3457603454589844, + "logps/chosen": -1.9081766605377197, + "logps/rejected": -2.8117613792419434, + "loss": -0.0735, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17980918288230896, + "rewards/margins": 0.07350005954504013, + "rewards/rejected": 0.10630912333726883, + "step": 167 + }, + { + "epoch": 0.4050632911392405, + "grad_norm": 0.34093332290649414, + "learning_rate": 3.324396782841823e-06, + "logits/chosen": -0.400634765625, + "logits/rejected": -0.31293487548828125, + "logps/chosen": -1.3186595439910889, + "logps/rejected": -2.1169416904449463, + "loss": -0.0863, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.290740966796875, + "rewards/margins": 0.08634192496538162, + "rewards/rejected": 0.2043990194797516, + "step": 168 + }, + { + "epoch": 0.4074743821579265, + "grad_norm": 0.21918420493602753, + "learning_rate": 3.310991957104558e-06, + "logits/chosen": -0.2548065185546875, + "logits/rejected": -0.4208984375, + "logps/chosen": -1.9677972793579102, + "logps/rejected": -2.6887378692626953, + "loss": -0.0935, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19425921142101288, + "rewards/margins": 0.09353058785200119, + "rewards/rejected": 0.10072863101959229, + "step": 169 + }, + { + "epoch": 0.4098854731766124, + "grad_norm": 0.41718512773513794, + "learning_rate": 3.2975871313672925e-06, + "logits/chosen": -0.235504150390625, + "logits/rejected": -0.24871826171875, + "logps/chosen": -2.591125011444092, + "logps/rejected": -2.8518929481506348, + "loss": -0.0418, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.14805354177951813, + "rewards/margins": 0.041785478591918945, + "rewards/rejected": 0.10626806318759918, + "step": 170 + }, + { + "epoch": 0.41229656419529837, + "grad_norm": 0.37480252981185913, + "learning_rate": 3.2841823056300272e-06, + "logits/chosen": -0.327606201171875, + "logits/rejected": -0.2918701171875, + "logps/chosen": -1.7574467658996582, + "logps/rejected": -2.363612651824951, + "loss": -0.0836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23645339906215668, + "rewards/margins": 0.083559550344944, + "rewards/rejected": 0.15289384126663208, + "step": 171 + }, + { + "epoch": 0.41470765521398434, + "grad_norm": 0.14828062057495117, + "learning_rate": 3.2707774798927615e-06, + "logits/chosen": -0.390533447265625, + "logits/rejected": -0.071685791015625, + "logps/chosen": -1.7845356464385986, + "logps/rejected": -2.920743942260742, + "loss": -0.1457, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2436465620994568, + "rewards/margins": 0.14567814767360687, + "rewards/rejected": 0.09796842187643051, + "step": 172 + }, + { + "epoch": 0.41711874623267026, + "grad_norm": 0.33334192633628845, + "learning_rate": 3.2573726541554962e-06, + "logits/chosen": -0.3453369140625, + "logits/rejected": 0.0274505615234375, + "logps/chosen": -1.6034345626831055, + "logps/rejected": -2.4277515411376953, + "loss": -0.1169, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23403647541999817, + "rewards/margins": 0.11686831712722778, + "rewards/rejected": 0.11716815084218979, + "step": 173 + }, + { + "epoch": 0.41952983725135623, + "grad_norm": 0.28393813967704773, + "learning_rate": 3.243967828418231e-06, + "logits/chosen": -0.17050933837890625, + "logits/rejected": -0.3485107421875, + "logps/chosen": -1.439255952835083, + "logps/rejected": -2.649294853210449, + "loss": -0.1729, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27926915884017944, + "rewards/margins": 0.17292553186416626, + "rewards/rejected": 0.10634364187717438, + "step": 174 + }, + { + "epoch": 0.4219409282700422, + "grad_norm": 0.2537970542907715, + "learning_rate": 3.2305630026809652e-06, + "logits/chosen": -0.3106689453125, + "logits/rejected": -0.14837646484375, + "logps/chosen": -1.7542948722839355, + "logps/rejected": -2.611116409301758, + "loss": -0.0954, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20454636216163635, + "rewards/margins": 0.09543466567993164, + "rewards/rejected": 0.10911169648170471, + "step": 175 + }, + { + "epoch": 0.4243520192887282, + "grad_norm": 0.1256629228591919, + "learning_rate": 3.2171581769437e-06, + "logits/chosen": -0.4407958984375, + "logits/rejected": -0.6070556640625, + "logps/chosen": -2.1641149520874023, + "logps/rejected": -3.1083664894104004, + "loss": -0.1277, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1784818470478058, + "rewards/margins": 0.12774190306663513, + "rewards/rejected": 0.050739943981170654, + "step": 176 + }, + { + "epoch": 0.4267631103074141, + "grad_norm": 0.2930721342563629, + "learning_rate": 3.2037533512064346e-06, + "logits/chosen": -0.3129463195800781, + "logits/rejected": -0.1395263671875, + "logps/chosen": -2.037263870239258, + "logps/rejected": -2.617213726043701, + "loss": -0.0795, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.22663629055023193, + "rewards/margins": 0.07951289415359497, + "rewards/rejected": 0.14712341129779816, + "step": 177 + }, + { + "epoch": 0.42917420132610007, + "grad_norm": 0.30528175830841064, + "learning_rate": 3.190348525469169e-06, + "logits/chosen": -0.5257072448730469, + "logits/rejected": -0.46124267578125, + "logps/chosen": -2.009727954864502, + "logps/rejected": -2.3708386421203613, + "loss": -0.0418, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2033851444721222, + "rewards/margins": 0.041812196373939514, + "rewards/rejected": 0.16157294809818268, + "step": 178 + }, + { + "epoch": 0.43158529234478604, + "grad_norm": 0.4607950747013092, + "learning_rate": 3.1769436997319036e-06, + "logits/chosen": -0.27509307861328125, + "logits/rejected": 0.08876991271972656, + "logps/chosen": -1.8411033153533936, + "logps/rejected": -2.2585511207580566, + "loss": -0.0419, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24301722645759583, + "rewards/margins": 0.041885629296302795, + "rewards/rejected": 0.20113162696361542, + "step": 179 + }, + { + "epoch": 0.43399638336347196, + "grad_norm": 0.3540400564670563, + "learning_rate": 3.1635388739946383e-06, + "logits/chosen": -0.3549690246582031, + "logits/rejected": 0.013580322265625, + "logps/chosen": -1.647626280784607, + "logps/rejected": -2.341634750366211, + "loss": -0.0615, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.26883983612060547, + "rewards/margins": 0.06151677295565605, + "rewards/rejected": 0.2073230743408203, + "step": 180 + }, + { + "epoch": 0.43640747438215793, + "grad_norm": 0.22180259227752686, + "learning_rate": 3.1501340482573726e-06, + "logits/chosen": -0.128265380859375, + "logits/rejected": -0.42620849609375, + "logps/chosen": -1.700305700302124, + "logps/rejected": -2.5668506622314453, + "loss": -0.1232, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2558213174343109, + "rewards/margins": 0.12323971837759018, + "rewards/rejected": 0.13258159160614014, + "step": 181 + }, + { + "epoch": 0.4388185654008439, + "grad_norm": 0.31690463423728943, + "learning_rate": 3.1367292225201073e-06, + "logits/chosen": -0.514068603515625, + "logits/rejected": -0.55712890625, + "logps/chosen": -2.1217963695526123, + "logps/rejected": -2.891569137573242, + "loss": -0.0943, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2072557955980301, + "rewards/margins": 0.09428170323371887, + "rewards/rejected": 0.11297409236431122, + "step": 182 + }, + { + "epoch": 0.4412296564195298, + "grad_norm": 0.3443291187286377, + "learning_rate": 3.123324396782842e-06, + "logits/chosen": -0.3392219543457031, + "logits/rejected": -0.42657470703125, + "logps/chosen": -1.4376801252365112, + "logps/rejected": -2.3941378593444824, + "loss": -0.1013, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26512905955314636, + "rewards/margins": 0.10133281350135803, + "rewards/rejected": 0.16379624605178833, + "step": 183 + }, + { + "epoch": 0.4436407474382158, + "grad_norm": 0.15042658150196075, + "learning_rate": 3.1099195710455763e-06, + "logits/chosen": -0.5136451721191406, + "logits/rejected": -0.4971199035644531, + "logps/chosen": -2.0282766819000244, + "logps/rejected": -2.8578245639801025, + "loss": -0.0986, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1815551370382309, + "rewards/margins": 0.09861013293266296, + "rewards/rejected": 0.08294501900672913, + "step": 184 + }, + { + "epoch": 0.4460518384569018, + "grad_norm": 0.1746578812599182, + "learning_rate": 3.096514745308311e-06, + "logits/chosen": -0.48212432861328125, + "logits/rejected": -0.40771484375, + "logps/chosen": -2.101836681365967, + "logps/rejected": -3.1708879470825195, + "loss": -0.1342, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.18410921096801758, + "rewards/margins": 0.13416406512260437, + "rewards/rejected": 0.049945168197155, + "step": 185 + }, + { + "epoch": 0.4484629294755877, + "grad_norm": 0.1299658715724945, + "learning_rate": 3.0831099195710457e-06, + "logits/chosen": -0.458953857421875, + "logits/rejected": -0.0218505859375, + "logps/chosen": -1.8297481536865234, + "logps/rejected": -2.854112148284912, + "loss": -0.1357, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22957220673561096, + "rewards/margins": 0.13569310307502747, + "rewards/rejected": 0.0938790813088417, + "step": 186 + }, + { + "epoch": 0.45087402049427366, + "grad_norm": 0.304977685213089, + "learning_rate": 3.0697050938337804e-06, + "logits/chosen": -0.389617919921875, + "logits/rejected": -0.551361083984375, + "logps/chosen": -2.085005283355713, + "logps/rejected": -2.4760684967041016, + "loss": -0.0631, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19750094413757324, + "rewards/margins": 0.06314624100923538, + "rewards/rejected": 0.13435471057891846, + "step": 187 + }, + { + "epoch": 0.45328511151295964, + "grad_norm": 0.29833269119262695, + "learning_rate": 3.0563002680965147e-06, + "logits/chosen": -0.4839935302734375, + "logits/rejected": -0.221923828125, + "logps/chosen": -2.39060378074646, + "logps/rejected": -2.94338321685791, + "loss": -0.0827, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.18073594570159912, + "rewards/margins": 0.08274209499359131, + "rewards/rejected": 0.09799385815858841, + "step": 188 + }, + { + "epoch": 0.45569620253164556, + "grad_norm": 0.12078873813152313, + "learning_rate": 3.0428954423592494e-06, + "logits/chosen": -0.4688262939453125, + "logits/rejected": -0.5797119140625, + "logps/chosen": -2.113844394683838, + "logps/rejected": -3.068427562713623, + "loss": -0.0828, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15905998647212982, + "rewards/margins": 0.08275354653596878, + "rewards/rejected": 0.07630644738674164, + "step": 189 + }, + { + "epoch": 0.45810729355033153, + "grad_norm": 0.5458232760429382, + "learning_rate": 3.029490616621984e-06, + "logits/chosen": -0.15084075927734375, + "logits/rejected": 0.00341796875, + "logps/chosen": -1.4732913970947266, + "logps/rejected": -2.269859552383423, + "loss": -0.1328, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.32623231410980225, + "rewards/margins": 0.13282997906208038, + "rewards/rejected": 0.19340234994888306, + "step": 190 + }, + { + "epoch": 0.4605183845690175, + "grad_norm": 0.544780433177948, + "learning_rate": 3.0160857908847184e-06, + "logits/chosen": -0.19171142578125, + "logits/rejected": -0.109649658203125, + "logps/chosen": -1.7311904430389404, + "logps/rejected": -2.5916244983673096, + "loss": -0.0771, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2369753122329712, + "rewards/margins": 0.07712734490633011, + "rewards/rejected": 0.15984795987606049, + "step": 191 + }, + { + "epoch": 0.4629294755877034, + "grad_norm": 0.2684464752674103, + "learning_rate": 3.002680965147453e-06, + "logits/chosen": -0.46929931640625, + "logits/rejected": -0.3169097900390625, + "logps/chosen": -1.6674377918243408, + "logps/rejected": -2.835958957672119, + "loss": -0.1372, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2350844442844391, + "rewards/margins": 0.13724592328071594, + "rewards/rejected": 0.09783852845430374, + "step": 192 + }, + { + "epoch": 0.4653405666063894, + "grad_norm": 0.21394720673561096, + "learning_rate": 2.989276139410188e-06, + "logits/chosen": -0.35595703125, + "logits/rejected": -0.549102783203125, + "logps/chosen": -2.270293951034546, + "logps/rejected": -2.8809475898742676, + "loss": -0.1019, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19606401026248932, + "rewards/margins": 0.10188289731740952, + "rewards/rejected": 0.094181127846241, + "step": 193 + }, + { + "epoch": 0.46775165762507537, + "grad_norm": 0.3073039948940277, + "learning_rate": 2.975871313672922e-06, + "logits/chosen": -0.30355167388916016, + "logits/rejected": -0.343505859375, + "logps/chosen": -1.8982429504394531, + "logps/rejected": -2.5876522064208984, + "loss": -0.0955, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22451213002204895, + "rewards/margins": 0.09545783698558807, + "rewards/rejected": 0.12905429303646088, + "step": 194 + }, + { + "epoch": 0.4701627486437613, + "grad_norm": 0.26982581615448, + "learning_rate": 2.962466487935657e-06, + "logits/chosen": -0.348876953125, + "logits/rejected": -0.025299072265625, + "logps/chosen": -1.3984405994415283, + "logps/rejected": -3.069946527481079, + "loss": -0.1708, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.30713117122650146, + "rewards/margins": 0.17078398168087006, + "rewards/rejected": 0.1363471895456314, + "step": 195 + }, + { + "epoch": 0.47257383966244726, + "grad_norm": 0.236332967877388, + "learning_rate": 2.9490616621983915e-06, + "logits/chosen": -0.23291015625, + "logits/rejected": -0.0711517333984375, + "logps/chosen": -2.158919095993042, + "logps/rejected": -3.132253646850586, + "loss": -0.0836, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1541297733783722, + "rewards/margins": 0.0835646465420723, + "rewards/rejected": 0.0705651119351387, + "step": 196 + }, + { + "epoch": 0.47498493068113323, + "grad_norm": 0.2244856059551239, + "learning_rate": 2.935656836461126e-06, + "logits/chosen": -0.45892333984375, + "logits/rejected": -0.52117919921875, + "logps/chosen": -2.1706812381744385, + "logps/rejected": -2.848109722137451, + "loss": -0.0699, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1689453274011612, + "rewards/margins": 0.0699153020977974, + "rewards/rejected": 0.0990300327539444, + "step": 197 + }, + { + "epoch": 0.47739602169981915, + "grad_norm": 0.31412187218666077, + "learning_rate": 2.9222520107238605e-06, + "logits/chosen": -0.48822021484375, + "logits/rejected": -0.121551513671875, + "logps/chosen": -1.3786568641662598, + "logps/rejected": -2.4970614910125732, + "loss": -0.1442, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.29272276163101196, + "rewards/margins": 0.14422528445720673, + "rewards/rejected": 0.14849749207496643, + "step": 198 + }, + { + "epoch": 0.4798071127185051, + "grad_norm": 0.3352288007736206, + "learning_rate": 2.9088471849865957e-06, + "logits/chosen": -0.3072967529296875, + "logits/rejected": -0.2575225830078125, + "logps/chosen": -2.0018115043640137, + "logps/rejected": -2.711092710494995, + "loss": -0.0974, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21645545959472656, + "rewards/margins": 0.09735508263111115, + "rewards/rejected": 0.11910038441419601, + "step": 199 + }, + { + "epoch": 0.4822182037371911, + "grad_norm": 0.3519986867904663, + "learning_rate": 2.8954423592493304e-06, + "logits/chosen": -0.4149932861328125, + "logits/rejected": -0.36700439453125, + "logps/chosen": -1.6919000148773193, + "logps/rejected": -2.6575915813446045, + "loss": -0.0984, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22143259644508362, + "rewards/margins": 0.09836116433143616, + "rewards/rejected": 0.12307143211364746, + "step": 200 + }, + { + "epoch": 0.484629294755877, + "grad_norm": 0.15217958390712738, + "learning_rate": 2.8820375335120647e-06, + "logits/chosen": -0.4061279296875, + "logits/rejected": -0.32421875, + "logps/chosen": -1.7373387813568115, + "logps/rejected": -2.9932193756103516, + "loss": -0.1478, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2508403956890106, + "rewards/margins": 0.1478143036365509, + "rewards/rejected": 0.10302607715129852, + "step": 201 + }, + { + "epoch": 0.487040385774563, + "grad_norm": 0.23008199036121368, + "learning_rate": 2.8686327077747994e-06, + "logits/chosen": -0.44512939453125, + "logits/rejected": -0.377685546875, + "logps/chosen": -2.259744644165039, + "logps/rejected": -2.910661458969116, + "loss": -0.0963, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17576928436756134, + "rewards/margins": 0.09627437591552734, + "rewards/rejected": 0.07949492335319519, + "step": 202 + }, + { + "epoch": 0.48945147679324896, + "grad_norm": 0.25174668431282043, + "learning_rate": 2.855227882037534e-06, + "logits/chosen": -0.556671142578125, + "logits/rejected": -0.4662437438964844, + "logps/chosen": -1.7317817211151123, + "logps/rejected": -2.8880553245544434, + "loss": -0.1042, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20446676015853882, + "rewards/margins": 0.10418954491615295, + "rewards/rejected": 0.10027721524238586, + "step": 203 + }, + { + "epoch": 0.4918625678119349, + "grad_norm": 0.1880040019750595, + "learning_rate": 2.8418230563002683e-06, + "logits/chosen": -0.40822601318359375, + "logits/rejected": -0.5498809814453125, + "logps/chosen": -2.4135918617248535, + "logps/rejected": -2.8308472633361816, + "loss": -0.0557, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.16259118914604187, + "rewards/margins": 0.05568936467170715, + "rewards/rejected": 0.10690181702375412, + "step": 204 + }, + { + "epoch": 0.49427365883062085, + "grad_norm": 0.2413100153207779, + "learning_rate": 2.828418230563003e-06, + "logits/chosen": -0.31159210205078125, + "logits/rejected": -0.45654296875, + "logps/chosen": -1.912713885307312, + "logps/rejected": -2.6146907806396484, + "loss": -0.0828, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20836248993873596, + "rewards/margins": 0.08280447125434875, + "rewards/rejected": 0.1255580186843872, + "step": 205 + }, + { + "epoch": 0.49668474984930683, + "grad_norm": 0.23288388550281525, + "learning_rate": 2.8150134048257378e-06, + "logits/chosen": -0.6185302734375, + "logits/rejected": -0.24359130859375, + "logps/chosen": -2.4498584270477295, + "logps/rejected": -2.596996545791626, + "loss": -0.0523, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.21683993935585022, + "rewards/margins": 0.052317216992378235, + "rewards/rejected": 0.1645227074623108, + "step": 206 + }, + { + "epoch": 0.49909584086799275, + "grad_norm": 0.23177851736545563, + "learning_rate": 2.8016085790884725e-06, + "logits/chosen": -0.185089111328125, + "logits/rejected": -0.5606689453125, + "logps/chosen": -1.6990635395050049, + "logps/rejected": -2.5500454902648926, + "loss": -0.1057, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24100631475448608, + "rewards/margins": 0.10570210963487625, + "rewards/rejected": 0.13530419766902924, + "step": 207 + }, + { + "epoch": 0.5015069318866787, + "grad_norm": 0.15125368535518646, + "learning_rate": 2.7882037533512068e-06, + "logits/chosen": -0.388916015625, + "logits/rejected": -0.1047515869140625, + "logps/chosen": -1.4806641340255737, + "logps/rejected": -2.5625932216644287, + "loss": -0.102, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23970326781272888, + "rewards/margins": 0.10200240463018417, + "rewards/rejected": 0.1377008855342865, + "step": 208 + }, + { + "epoch": 0.5039180229053647, + "grad_norm": 0.12147095054388046, + "learning_rate": 2.7747989276139415e-06, + "logits/chosen": -0.576751708984375, + "logits/rejected": -0.6204833984375, + "logps/chosen": -1.9968822002410889, + "logps/rejected": -3.049175500869751, + "loss": -0.1141, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17157778143882751, + "rewards/margins": 0.11413666605949402, + "rewards/rejected": 0.0574411116540432, + "step": 209 + }, + { + "epoch": 0.5063291139240507, + "grad_norm": 0.21282178163528442, + "learning_rate": 2.761394101876676e-06, + "logits/chosen": -0.4432373046875, + "logits/rejected": 0.006011962890625, + "logps/chosen": -1.6359626054763794, + "logps/rejected": -2.696181297302246, + "loss": -0.1483, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2820802330970764, + "rewards/margins": 0.148276224732399, + "rewards/rejected": 0.13380399346351624, + "step": 210 + }, + { + "epoch": 0.5087402049427366, + "grad_norm": 0.13324294984340668, + "learning_rate": 2.7479892761394105e-06, + "logits/chosen": -0.6666259765625, + "logits/rejected": -0.25689697265625, + "logps/chosen": -1.9157309532165527, + "logps/rejected": -2.68827486038208, + "loss": -0.1183, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22823314368724823, + "rewards/margins": 0.11827092617750168, + "rewards/rejected": 0.10996222496032715, + "step": 211 + }, + { + "epoch": 0.5111512959614225, + "grad_norm": 0.13334231078624725, + "learning_rate": 2.734584450402145e-06, + "logits/chosen": -0.203125, + "logits/rejected": -0.62152099609375, + "logps/chosen": -2.153001070022583, + "logps/rejected": -3.000319004058838, + "loss": -0.1032, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18005381524562836, + "rewards/margins": 0.10323041677474976, + "rewards/rejected": 0.0768233984708786, + "step": 212 + }, + { + "epoch": 0.5135623869801085, + "grad_norm": 0.22844156622886658, + "learning_rate": 2.72117962466488e-06, + "logits/chosen": -0.603271484375, + "logits/rejected": -0.47894287109375, + "logps/chosen": -2.080747604370117, + "logps/rejected": -2.9058759212493896, + "loss": -0.0987, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19490177929401398, + "rewards/margins": 0.09865477681159973, + "rewards/rejected": 0.09624700248241425, + "step": 213 + }, + { + "epoch": 0.5159734779987944, + "grad_norm": 0.26035332679748535, + "learning_rate": 2.707774798927614e-06, + "logits/chosen": -0.440673828125, + "logits/rejected": -0.388214111328125, + "logps/chosen": -2.438211441040039, + "logps/rejected": -2.77759051322937, + "loss": -0.0468, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1824999749660492, + "rewards/margins": 0.04681158438324928, + "rewards/rejected": 0.13568837940692902, + "step": 214 + }, + { + "epoch": 0.5183845690174804, + "grad_norm": 0.21330344676971436, + "learning_rate": 2.694369973190349e-06, + "logits/chosen": -0.528106689453125, + "logits/rejected": -0.48358154296875, + "logps/chosen": -2.168797016143799, + "logps/rejected": -2.979780673980713, + "loss": -0.0952, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18293826282024384, + "rewards/margins": 0.09519477188587189, + "rewards/rejected": 0.08774348348379135, + "step": 215 + }, + { + "epoch": 0.5207956600361664, + "grad_norm": 0.1514255404472351, + "learning_rate": 2.6809651474530836e-06, + "logits/chosen": -0.53485107421875, + "logits/rejected": -0.12870216369628906, + "logps/chosen": -1.296417474746704, + "logps/rejected": -2.472968101501465, + "loss": -0.127, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.29904893040657043, + "rewards/margins": 0.12699376046657562, + "rewards/rejected": 0.17205515503883362, + "step": 216 + }, + { + "epoch": 0.5232067510548524, + "grad_norm": 0.14495502412319183, + "learning_rate": 2.667560321715818e-06, + "logits/chosen": -0.61651611328125, + "logits/rejected": -0.722412109375, + "logps/chosen": -2.337038278579712, + "logps/rejected": -3.0052199363708496, + "loss": -0.0707, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1695079803466797, + "rewards/margins": 0.07073226571083069, + "rewards/rejected": 0.098775714635849, + "step": 217 + }, + { + "epoch": 0.5256178420735382, + "grad_norm": 0.32379505038261414, + "learning_rate": 2.6541554959785526e-06, + "logits/chosen": -0.50653076171875, + "logits/rejected": -0.3614501953125, + "logps/chosen": -1.3938970565795898, + "logps/rejected": -2.695472240447998, + "loss": -0.1493, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27173522114753723, + "rewards/margins": 0.14925940334796906, + "rewards/rejected": 0.12247580289840698, + "step": 218 + }, + { + "epoch": 0.5280289330922242, + "grad_norm": 0.15786728262901306, + "learning_rate": 2.6407506702412873e-06, + "logits/chosen": -0.232025146484375, + "logits/rejected": -0.5867919921875, + "logps/chosen": -1.990861177444458, + "logps/rejected": -2.50642728805542, + "loss": -0.0588, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2022039145231247, + "rewards/margins": 0.0587797686457634, + "rewards/rejected": 0.1434241384267807, + "step": 219 + }, + { + "epoch": 0.5304400241109102, + "grad_norm": 0.2451145052909851, + "learning_rate": 2.6273458445040215e-06, + "logits/chosen": -0.5715179443359375, + "logits/rejected": -0.500457763671875, + "logps/chosen": -1.699583888053894, + "logps/rejected": -2.585405111312866, + "loss": -0.1102, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23460426926612854, + "rewards/margins": 0.11020025610923767, + "rewards/rejected": 0.12440400570631027, + "step": 220 + }, + { + "epoch": 0.5328511151295962, + "grad_norm": 0.6377310156822205, + "learning_rate": 2.6139410187667563e-06, + "logits/chosen": -0.15139007568359375, + "logits/rejected": -0.387603759765625, + "logps/chosen": -2.167043447494507, + "logps/rejected": -2.4354560375213623, + "loss": -0.0494, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.23499956727027893, + "rewards/margins": 0.04943379759788513, + "rewards/rejected": 0.1855657696723938, + "step": 221 + }, + { + "epoch": 0.5352622061482821, + "grad_norm": 0.2983133792877197, + "learning_rate": 2.600536193029491e-06, + "logits/chosen": -0.3441162109375, + "logits/rejected": -0.6526336669921875, + "logps/chosen": -1.7902055978775024, + "logps/rejected": -2.6743669509887695, + "loss": -0.1032, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21519100666046143, + "rewards/margins": 0.10320259630680084, + "rewards/rejected": 0.11198841035366058, + "step": 222 + }, + { + "epoch": 0.5376732971669681, + "grad_norm": 0.15829414129257202, + "learning_rate": 2.5871313672922257e-06, + "logits/chosen": -0.5672607421875, + "logits/rejected": -0.138763427734375, + "logps/chosen": -1.5828574895858765, + "logps/rejected": -2.4980366230010986, + "loss": -0.1061, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26392680406570435, + "rewards/margins": 0.10605490207672119, + "rewards/rejected": 0.15787190198898315, + "step": 223 + }, + { + "epoch": 0.540084388185654, + "grad_norm": 0.8204041719436646, + "learning_rate": 2.57372654155496e-06, + "logits/chosen": -0.4825439453125, + "logits/rejected": -0.3645515441894531, + "logps/chosen": -1.9334183931350708, + "logps/rejected": -2.572599172592163, + "loss": -0.116, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24374160170555115, + "rewards/margins": 0.11597643792629242, + "rewards/rejected": 0.12776517868041992, + "step": 224 + }, + { + "epoch": 0.5424954792043399, + "grad_norm": 0.5508759617805481, + "learning_rate": 2.5603217158176947e-06, + "logits/chosen": -0.3161468505859375, + "logits/rejected": -0.457672119140625, + "logps/chosen": -1.8335989713668823, + "logps/rejected": -2.5756654739379883, + "loss": -0.1277, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.26014891266822815, + "rewards/margins": 0.12774372100830078, + "rewards/rejected": 0.13240516185760498, + "step": 225 + }, + { + "epoch": 0.5449065702230259, + "grad_norm": 0.2274271547794342, + "learning_rate": 2.5469168900804294e-06, + "logits/chosen": -0.45947265625, + "logits/rejected": 0.2047119140625, + "logps/chosen": -1.5705689191818237, + "logps/rejected": -2.3291680812835693, + "loss": -0.1085, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.27440863847732544, + "rewards/margins": 0.10849922895431519, + "rewards/rejected": 0.16590939462184906, + "step": 226 + }, + { + "epoch": 0.5473176612417119, + "grad_norm": 0.214279904961586, + "learning_rate": 2.5335120643431636e-06, + "logits/chosen": -0.49285888671875, + "logits/rejected": -0.5154132843017578, + "logps/chosen": -2.2102606296539307, + "logps/rejected": -2.8159403800964355, + "loss": -0.0608, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.17781740427017212, + "rewards/margins": 0.06079640984535217, + "rewards/rejected": 0.11702098697423935, + "step": 227 + }, + { + "epoch": 0.5497287522603979, + "grad_norm": 0.20388254523277283, + "learning_rate": 2.5201072386058984e-06, + "logits/chosen": -0.787841796875, + "logits/rejected": -0.389617919921875, + "logps/chosen": -1.90524423122406, + "logps/rejected": -3.0185534954071045, + "loss": -0.1183, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18031735718250275, + "rewards/margins": 0.11827754974365234, + "rewards/rejected": 0.06203979626297951, + "step": 228 + }, + { + "epoch": 0.5521398432790838, + "grad_norm": 0.2535056471824646, + "learning_rate": 2.506702412868633e-06, + "logits/chosen": -0.522613525390625, + "logits/rejected": -0.31377410888671875, + "logps/chosen": -1.932642936706543, + "logps/rejected": -2.427947521209717, + "loss": -0.082, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2685334384441376, + "rewards/margins": 0.0820438414812088, + "rewards/rejected": 0.18648961186408997, + "step": 229 + }, + { + "epoch": 0.5545509342977697, + "grad_norm": 0.24235057830810547, + "learning_rate": 2.4932975871313673e-06, + "logits/chosen": -0.423065185546875, + "logits/rejected": -0.3415374755859375, + "logps/chosen": -2.437142848968506, + "logps/rejected": -3.146867275238037, + "loss": -0.0833, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17581258714199066, + "rewards/margins": 0.08331093192100525, + "rewards/rejected": 0.09250165522098541, + "step": 230 + }, + { + "epoch": 0.5569620253164557, + "grad_norm": 0.23205387592315674, + "learning_rate": 2.479892761394102e-06, + "logits/chosen": -0.56787109375, + "logits/rejected": -0.478912353515625, + "logps/chosen": -1.8429834842681885, + "logps/rejected": -2.6143712997436523, + "loss": -0.1018, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23154723644256592, + "rewards/margins": 0.1018138974905014, + "rewards/rejected": 0.12973333895206451, + "step": 231 + }, + { + "epoch": 0.5593731163351416, + "grad_norm": 0.3556649088859558, + "learning_rate": 2.4664879356568368e-06, + "logits/chosen": -0.5091133117675781, + "logits/rejected": -0.552734375, + "logps/chosen": -2.168527841567993, + "logps/rejected": -2.955108165740967, + "loss": -0.1008, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20043154060840607, + "rewards/margins": 0.10076121985912323, + "rewards/rejected": 0.09967032819986343, + "step": 232 + }, + { + "epoch": 0.5617842073538276, + "grad_norm": 0.684489369392395, + "learning_rate": 2.453083109919571e-06, + "logits/chosen": -0.424774169921875, + "logits/rejected": -0.5048828125, + "logps/chosen": -1.532679557800293, + "logps/rejected": -1.9847021102905273, + "loss": -0.0807, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.31033581495285034, + "rewards/margins": 0.08072011917829514, + "rewards/rejected": 0.2296156883239746, + "step": 233 + }, + { + "epoch": 0.5641952983725136, + "grad_norm": 0.2072058618068695, + "learning_rate": 2.4396782841823058e-06, + "logits/chosen": -0.54815673828125, + "logits/rejected": -0.291412353515625, + "logps/chosen": -1.9623987674713135, + "logps/rejected": -2.844301700592041, + "loss": -0.1038, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2099665403366089, + "rewards/margins": 0.10378723591566086, + "rewards/rejected": 0.10617929697036743, + "step": 234 + }, + { + "epoch": 0.5666063893911996, + "grad_norm": 0.23502588272094727, + "learning_rate": 2.4262734584450405e-06, + "logits/chosen": -0.4100189208984375, + "logits/rejected": -0.51544189453125, + "logps/chosen": -1.59712815284729, + "logps/rejected": -2.7786502838134766, + "loss": -0.1362, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.254263311624527, + "rewards/margins": 0.1362040787935257, + "rewards/rejected": 0.11805924028158188, + "step": 235 + }, + { + "epoch": 0.5690174804098854, + "grad_norm": 0.13915672898292542, + "learning_rate": 2.4128686327077747e-06, + "logits/chosen": -0.4629058837890625, + "logits/rejected": -0.5181884765625, + "logps/chosen": -2.209294319152832, + "logps/rejected": -2.994799852371216, + "loss": -0.0922, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19737109541893005, + "rewards/margins": 0.0922246128320694, + "rewards/rejected": 0.10514649003744125, + "step": 236 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.22934621572494507, + "learning_rate": 2.3994638069705094e-06, + "logits/chosen": -0.699554443359375, + "logits/rejected": -0.52276611328125, + "logps/chosen": -2.1282799243927, + "logps/rejected": -2.948793411254883, + "loss": -0.14, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21877320110797882, + "rewards/margins": 0.14000186324119568, + "rewards/rejected": 0.07877133786678314, + "step": 237 + }, + { + "epoch": 0.5738396624472574, + "grad_norm": 0.17939257621765137, + "learning_rate": 2.386058981233244e-06, + "logits/chosen": -0.588653564453125, + "logits/rejected": -0.4595947265625, + "logps/chosen": -1.9606249332427979, + "logps/rejected": -2.8305306434631348, + "loss": -0.1198, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22058779001235962, + "rewards/margins": 0.119773268699646, + "rewards/rejected": 0.10081452131271362, + "step": 238 + }, + { + "epoch": 0.5762507534659433, + "grad_norm": 0.4613324701786041, + "learning_rate": 2.372654155495979e-06, + "logits/chosen": -0.375701904296875, + "logits/rejected": -0.18255615234375, + "logps/chosen": -1.7975425720214844, + "logps/rejected": -2.716414451599121, + "loss": -0.1062, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2103354036808014, + "rewards/margins": 0.10619811713695526, + "rewards/rejected": 0.10413727164268494, + "step": 239 + }, + { + "epoch": 0.5786618444846293, + "grad_norm": 0.26412999629974365, + "learning_rate": 2.359249329758713e-06, + "logits/chosen": -0.67718505859375, + "logits/rejected": -0.6405868530273438, + "logps/chosen": -2.0427653789520264, + "logps/rejected": -3.129185914993286, + "loss": -0.1358, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.20130077004432678, + "rewards/margins": 0.13584360480308533, + "rewards/rejected": 0.06545715779066086, + "step": 240 + }, + { + "epoch": 0.5810729355033153, + "grad_norm": 0.13894815742969513, + "learning_rate": 2.345844504021448e-06, + "logits/chosen": -0.6707534790039062, + "logits/rejected": -0.89532470703125, + "logps/chosen": -2.6446471214294434, + "logps/rejected": -3.0613820552825928, + "loss": -0.0638, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.14486713707447052, + "rewards/margins": 0.0638175681233406, + "rewards/rejected": 0.08104956150054932, + "step": 241 + }, + { + "epoch": 0.5834840265220012, + "grad_norm": 0.3322769105434418, + "learning_rate": 2.3324396782841826e-06, + "logits/chosen": -0.5447845458984375, + "logits/rejected": 0.2573699951171875, + "logps/chosen": -1.9078049659729004, + "logps/rejected": -2.7448787689208984, + "loss": -0.1251, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2469547986984253, + "rewards/margins": 0.12514016032218933, + "rewards/rejected": 0.12181464582681656, + "step": 242 + }, + { + "epoch": 0.5858951175406871, + "grad_norm": 0.17555266618728638, + "learning_rate": 2.319034852546917e-06, + "logits/chosen": -0.5060272216796875, + "logits/rejected": -0.4970703125, + "logps/chosen": -1.9672725200653076, + "logps/rejected": -2.8103106021881104, + "loss": -0.1021, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19932040572166443, + "rewards/margins": 0.1021261066198349, + "rewards/rejected": 0.09719430655241013, + "step": 243 + }, + { + "epoch": 0.5883062085593731, + "grad_norm": 0.19972020387649536, + "learning_rate": 2.3056300268096516e-06, + "logits/chosen": -0.665679931640625, + "logits/rejected": -0.5572586059570312, + "logps/chosen": -2.1146159172058105, + "logps/rejected": -3.1303868293762207, + "loss": -0.1227, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17992740869522095, + "rewards/margins": 0.12266412377357483, + "rewards/rejected": 0.057263292372226715, + "step": 244 + }, + { + "epoch": 0.5907172995780591, + "grad_norm": 0.4251721501350403, + "learning_rate": 2.2922252010723863e-06, + "logits/chosen": -0.5277976989746094, + "logits/rejected": -0.02923583984375, + "logps/chosen": -2.0203771591186523, + "logps/rejected": -2.5307390689849854, + "loss": -0.0689, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23671527206897736, + "rewards/margins": 0.06891129910945892, + "rewards/rejected": 0.16780397295951843, + "step": 245 + }, + { + "epoch": 0.593128390596745, + "grad_norm": 0.5262976288795471, + "learning_rate": 2.278820375335121e-06, + "logits/chosen": -0.67388916015625, + "logits/rejected": -0.5382308959960938, + "logps/chosen": -2.340686082839966, + "logps/rejected": -3.4546706676483154, + "loss": -0.1314, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19458642601966858, + "rewards/margins": 0.1313656121492386, + "rewards/rejected": 0.06322081387042999, + "step": 246 + }, + { + "epoch": 0.595539481615431, + "grad_norm": 0.4661294221878052, + "learning_rate": 2.2654155495978557e-06, + "logits/chosen": -0.7741546630859375, + "logits/rejected": -0.673004150390625, + "logps/chosen": -2.1867899894714355, + "logps/rejected": -3.1655240058898926, + "loss": -0.1271, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17740848660469055, + "rewards/margins": 0.12705540657043457, + "rewards/rejected": 0.05035308003425598, + "step": 247 + }, + { + "epoch": 0.5979505726341169, + "grad_norm": 0.17859813570976257, + "learning_rate": 2.25201072386059e-06, + "logits/chosen": -0.3651885986328125, + "logits/rejected": -0.595367431640625, + "logps/chosen": -2.5258560180664062, + "logps/rejected": -3.2914695739746094, + "loss": -0.0912, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1555439531803131, + "rewards/margins": 0.09116679430007935, + "rewards/rejected": 0.06437715888023376, + "step": 248 + }, + { + "epoch": 0.6003616636528029, + "grad_norm": 0.2604820430278778, + "learning_rate": 2.2386058981233247e-06, + "logits/chosen": -0.49649810791015625, + "logits/rejected": -0.45428466796875, + "logps/chosen": -2.331780195236206, + "logps/rejected": -2.895789623260498, + "loss": -0.0789, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18511176109313965, + "rewards/margins": 0.07894974201917648, + "rewards/rejected": 0.10616202652454376, + "step": 249 + }, + { + "epoch": 0.6027727546714888, + "grad_norm": 0.20211994647979736, + "learning_rate": 2.2252010723860594e-06, + "logits/chosen": -0.781890869140625, + "logits/rejected": -0.24432373046875, + "logps/chosen": -2.092327356338501, + "logps/rejected": -2.87355375289917, + "loss": -0.1088, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21041500568389893, + "rewards/margins": 0.10880758613348007, + "rewards/rejected": 0.10160740464925766, + "step": 250 + }, + { + "epoch": 0.6051838456901748, + "grad_norm": 0.14812637865543365, + "learning_rate": 2.2117962466487937e-06, + "logits/chosen": -0.6563720703125, + "logits/rejected": -0.57281494140625, + "logps/chosen": -2.0487260818481445, + "logps/rejected": -3.338346004486084, + "loss": -0.1481, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.202968031167984, + "rewards/margins": 0.14812180399894714, + "rewards/rejected": 0.05484623461961746, + "step": 251 + }, + { + "epoch": 0.6075949367088608, + "grad_norm": 0.4227518141269684, + "learning_rate": 2.1983914209115284e-06, + "logits/chosen": -0.49267578125, + "logits/rejected": -0.381866455078125, + "logps/chosen": -1.4009172916412354, + "logps/rejected": -2.55879545211792, + "loss": -0.1211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2638285160064697, + "rewards/margins": 0.12105178833007812, + "rewards/rejected": 0.1427766978740692, + "step": 252 + }, + { + "epoch": 0.6100060277275468, + "grad_norm": 0.16262991726398468, + "learning_rate": 2.184986595174263e-06, + "logits/chosen": -0.830078125, + "logits/rejected": -0.5396881103515625, + "logps/chosen": -1.639129877090454, + "logps/rejected": -3.2362990379333496, + "loss": -0.148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20805802941322327, + "rewards/margins": 0.14800900220870972, + "rewards/rejected": 0.060049042105674744, + "step": 253 + }, + { + "epoch": 0.6124171187462327, + "grad_norm": 0.4719944894313812, + "learning_rate": 2.1715817694369974e-06, + "logits/chosen": -0.3321533203125, + "logits/rejected": -0.1128692626953125, + "logps/chosen": -1.4619243144989014, + "logps/rejected": -2.550328016281128, + "loss": -0.1132, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26330697536468506, + "rewards/margins": 0.11315856128931046, + "rewards/rejected": 0.1501484215259552, + "step": 254 + }, + { + "epoch": 0.6148282097649186, + "grad_norm": 0.28602099418640137, + "learning_rate": 2.158176943699732e-06, + "logits/chosen": -0.840911865234375, + "logits/rejected": -0.4417266845703125, + "logps/chosen": -1.915555715560913, + "logps/rejected": -3.222545623779297, + "loss": -0.141, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1985325813293457, + "rewards/margins": 0.14100342988967896, + "rewards/rejected": 0.05752915143966675, + "step": 255 + }, + { + "epoch": 0.6172393007836046, + "grad_norm": 0.46066752076148987, + "learning_rate": 2.1447721179624668e-06, + "logits/chosen": -0.331573486328125, + "logits/rejected": -0.20660400390625, + "logps/chosen": -1.4899280071258545, + "logps/rejected": -2.2292163372039795, + "loss": -0.0888, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2837684154510498, + "rewards/margins": 0.08876757323741913, + "rewards/rejected": 0.1950008124113083, + "step": 256 + }, + { + "epoch": 0.6196503918022905, + "grad_norm": 0.2576768696308136, + "learning_rate": 2.1313672922252015e-06, + "logits/chosen": -0.8979339599609375, + "logits/rejected": -0.9300537109375, + "logps/chosen": -2.1375041007995605, + "logps/rejected": -3.143777370452881, + "loss": -0.0999, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18273016810417175, + "rewards/margins": 0.09985122084617615, + "rewards/rejected": 0.0828789547085762, + "step": 257 + }, + { + "epoch": 0.6220614828209765, + "grad_norm": 0.22504042088985443, + "learning_rate": 2.1179624664879358e-06, + "logits/chosen": -0.4912567138671875, + "logits/rejected": -0.4622802734375, + "logps/chosen": -1.9178471565246582, + "logps/rejected": -3.0572853088378906, + "loss": -0.069, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1981804221868515, + "rewards/margins": 0.06895191967487335, + "rewards/rejected": 0.12922850251197815, + "step": 258 + }, + { + "epoch": 0.6244725738396625, + "grad_norm": 0.14527130126953125, + "learning_rate": 2.1045576407506705e-06, + "logits/chosen": -0.8145751953125, + "logits/rejected": -0.672088623046875, + "logps/chosen": -2.6195173263549805, + "logps/rejected": -3.425107955932617, + "loss": -0.0947, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1423579752445221, + "rewards/margins": 0.09470826387405396, + "rewards/rejected": 0.047649722546339035, + "step": 259 + }, + { + "epoch": 0.6268836648583485, + "grad_norm": 0.18013450503349304, + "learning_rate": 2.091152815013405e-06, + "logits/chosen": -0.41049957275390625, + "logits/rejected": -0.474639892578125, + "logps/chosen": -2.501359224319458, + "logps/rejected": -2.831785202026367, + "loss": -0.0471, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.15878508985042572, + "rewards/margins": 0.04714329540729523, + "rewards/rejected": 0.1116417944431305, + "step": 260 + }, + { + "epoch": 0.6292947558770343, + "grad_norm": 0.284231573343277, + "learning_rate": 2.0777479892761395e-06, + "logits/chosen": -0.44830322265625, + "logits/rejected": -0.00118255615234375, + "logps/chosen": -1.8145167827606201, + "logps/rejected": -2.2997889518737793, + "loss": -0.0847, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2791876196861267, + "rewards/margins": 0.08465435355901718, + "rewards/rejected": 0.19453327357769012, + "step": 261 + }, + { + "epoch": 0.6317058468957203, + "grad_norm": 0.16854453086853027, + "learning_rate": 2.064343163538874e-06, + "logits/chosen": -0.4799642562866211, + "logits/rejected": -0.48046875, + "logps/chosen": -1.4381766319274902, + "logps/rejected": -2.5089263916015625, + "loss": -0.1302, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30055326223373413, + "rewards/margins": 0.1301727294921875, + "rewards/rejected": 0.17038053274154663, + "step": 262 + }, + { + "epoch": 0.6341169379144063, + "grad_norm": 0.36369168758392334, + "learning_rate": 2.050938337801609e-06, + "logits/chosen": -0.5082874298095703, + "logits/rejected": -0.10931396484375, + "logps/chosen": -1.9076807498931885, + "logps/rejected": -2.5438785552978516, + "loss": -0.0958, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23563198745250702, + "rewards/margins": 0.0957505851984024, + "rewards/rejected": 0.13988140225410461, + "step": 263 + }, + { + "epoch": 0.6365280289330922, + "grad_norm": 0.13624438643455505, + "learning_rate": 2.037533512064343e-06, + "logits/chosen": -0.684356689453125, + "logits/rejected": -0.4652099609375, + "logps/chosen": -1.546952247619629, + "logps/rejected": -2.5537309646606445, + "loss": -0.1039, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2630748450756073, + "rewards/margins": 0.10387227684259415, + "rewards/rejected": 0.15920256078243256, + "step": 264 + }, + { + "epoch": 0.6389391199517782, + "grad_norm": 0.3461492359638214, + "learning_rate": 2.024128686327078e-06, + "logits/chosen": -0.6019287109375, + "logits/rejected": -0.2730712890625, + "logps/chosen": -1.7226495742797852, + "logps/rejected": -2.5613324642181396, + "loss": -0.1264, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27113670110702515, + "rewards/margins": 0.12641693651676178, + "rewards/rejected": 0.14471974968910217, + "step": 265 + }, + { + "epoch": 0.6413502109704642, + "grad_norm": 0.6156812310218811, + "learning_rate": 2.0107238605898126e-06, + "logits/chosen": -0.559967041015625, + "logits/rejected": -0.327117919921875, + "logps/chosen": -2.04434871673584, + "logps/rejected": -2.833914279937744, + "loss": -0.1184, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23786146938800812, + "rewards/margins": 0.11837325245141983, + "rewards/rejected": 0.11948820948600769, + "step": 266 + }, + { + "epoch": 0.64376130198915, + "grad_norm": 0.10644577443599701, + "learning_rate": 1.997319034852547e-06, + "logits/chosen": -0.72119140625, + "logits/rejected": -0.68402099609375, + "logps/chosen": -2.442497730255127, + "logps/rejected": -3.2481184005737305, + "loss": -0.0935, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16179729998111725, + "rewards/margins": 0.09352242946624756, + "rewards/rejected": 0.06827487051486969, + "step": 267 + }, + { + "epoch": 0.646172393007836, + "grad_norm": 0.3586520850658417, + "learning_rate": 1.9839142091152816e-06, + "logits/chosen": -0.652679443359375, + "logits/rejected": -0.437744140625, + "logps/chosen": -1.997248888015747, + "logps/rejected": -2.738507032394409, + "loss": -0.0943, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2078447937965393, + "rewards/margins": 0.09425204992294312, + "rewards/rejected": 0.11359275877475739, + "step": 268 + }, + { + "epoch": 0.648583484026522, + "grad_norm": 0.08498626202344894, + "learning_rate": 1.9705093833780163e-06, + "logits/chosen": -0.6308250427246094, + "logits/rejected": -0.734619140625, + "logps/chosen": -2.0818867683410645, + "logps/rejected": -2.9867730140686035, + "loss": -0.1121, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21084338426589966, + "rewards/margins": 0.11205324530601501, + "rewards/rejected": 0.09879015386104584, + "step": 269 + }, + { + "epoch": 0.650994575045208, + "grad_norm": 0.335497111082077, + "learning_rate": 1.9571045576407505e-06, + "logits/chosen": -0.58026123046875, + "logits/rejected": -0.13519287109375, + "logps/chosen": -2.177055597305298, + "logps/rejected": -2.7995100021362305, + "loss": -0.0957, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24520690739154816, + "rewards/margins": 0.09572410583496094, + "rewards/rejected": 0.1494828164577484, + "step": 270 + }, + { + "epoch": 0.6534056660638939, + "grad_norm": 0.23109914362430573, + "learning_rate": 1.9436997319034853e-06, + "logits/chosen": -0.74755859375, + "logits/rejected": -0.6177444458007812, + "logps/chosen": -2.025217056274414, + "logps/rejected": -3.0153648853302, + "loss": -0.1312, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18951821327209473, + "rewards/margins": 0.13122421503067017, + "rewards/rejected": 0.05829399824142456, + "step": 271 + }, + { + "epoch": 0.6558167570825799, + "grad_norm": 0.38043326139450073, + "learning_rate": 1.93029490616622e-06, + "logits/chosen": -0.71258544921875, + "logits/rejected": -0.55999755859375, + "logps/chosen": -2.0828866958618164, + "logps/rejected": -2.7454192638397217, + "loss": -0.0679, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.20676158368587494, + "rewards/margins": 0.06794621795415878, + "rewards/rejected": 0.13881537318229675, + "step": 272 + }, + { + "epoch": 0.6582278481012658, + "grad_norm": 0.23813959956169128, + "learning_rate": 1.9168900804289547e-06, + "logits/chosen": -0.563690185546875, + "logits/rejected": -0.50830078125, + "logps/chosen": -1.8330724239349365, + "logps/rejected": -2.955345630645752, + "loss": -0.1403, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24473488330841064, + "rewards/margins": 0.14027172327041626, + "rewards/rejected": 0.10446316748857498, + "step": 273 + }, + { + "epoch": 0.6606389391199518, + "grad_norm": 0.09323907643556595, + "learning_rate": 1.903485254691689e-06, + "logits/chosen": -0.26568603515625, + "logits/rejected": -0.57470703125, + "logps/chosen": -2.566443920135498, + "logps/rejected": -3.1312742233276367, + "loss": -0.0741, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15308275818824768, + "rewards/margins": 0.07411119341850281, + "rewards/rejected": 0.07897157967090607, + "step": 274 + }, + { + "epoch": 0.6630500301386377, + "grad_norm": 0.49775800108909607, + "learning_rate": 1.8900804289544237e-06, + "logits/chosen": -0.7232894897460938, + "logits/rejected": -0.23431396484375, + "logps/chosen": -1.658186674118042, + "logps/rejected": -2.5616846084594727, + "loss": -0.1136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3052350878715515, + "rewards/margins": 0.11359696090221405, + "rewards/rejected": 0.19163814187049866, + "step": 275 + }, + { + "epoch": 0.6654611211573237, + "grad_norm": 0.13795824348926544, + "learning_rate": 1.8766756032171582e-06, + "logits/chosen": -0.5635833740234375, + "logits/rejected": -0.5099639892578125, + "logps/chosen": -2.1662073135375977, + "logps/rejected": -2.86527943611145, + "loss": -0.1587, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24876488745212555, + "rewards/margins": 0.1587027609348297, + "rewards/rejected": 0.09006214141845703, + "step": 276 + }, + { + "epoch": 0.6678722121760097, + "grad_norm": 0.24294748902320862, + "learning_rate": 1.863270777479893e-06, + "logits/chosen": -0.59161376953125, + "logits/rejected": -0.4210090637207031, + "logps/chosen": -2.643289089202881, + "logps/rejected": -3.131368637084961, + "loss": -0.087, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.19423611462116241, + "rewards/margins": 0.08702246844768524, + "rewards/rejected": 0.10721364617347717, + "step": 277 + }, + { + "epoch": 0.6702833031946956, + "grad_norm": 0.33085882663726807, + "learning_rate": 1.8498659517426276e-06, + "logits/chosen": -0.6314697265625, + "logits/rejected": -0.68646240234375, + "logps/chosen": -2.2205257415771484, + "logps/rejected": -3.088937520980835, + "loss": -0.1027, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.18147246539592743, + "rewards/margins": 0.10274698585271835, + "rewards/rejected": 0.07872546464204788, + "step": 278 + }, + { + "epoch": 0.6726943942133815, + "grad_norm": 0.44562533497810364, + "learning_rate": 1.8364611260053623e-06, + "logits/chosen": -0.592193603515625, + "logits/rejected": -0.4844970703125, + "logps/chosen": -1.524603247642517, + "logps/rejected": -2.0741117000579834, + "loss": -0.1084, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.31715652346611023, + "rewards/margins": 0.1083594262599945, + "rewards/rejected": 0.20879709720611572, + "step": 279 + }, + { + "epoch": 0.6751054852320675, + "grad_norm": 0.8114678859710693, + "learning_rate": 1.8230563002680968e-06, + "logits/chosen": -0.4227294921875, + "logits/rejected": -0.52398681640625, + "logps/chosen": -2.330390214920044, + "logps/rejected": -2.480011463165283, + "loss": 0.0044, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.1879083663225174, + "rewards/margins": -0.004414182156324387, + "rewards/rejected": 0.1923225373029709, + "step": 280 + }, + { + "epoch": 0.6775165762507535, + "grad_norm": 0.19452427327632904, + "learning_rate": 1.8096514745308313e-06, + "logits/chosen": -0.4300384521484375, + "logits/rejected": 0.026641845703125, + "logps/chosen": -2.2282979488372803, + "logps/rejected": -2.492774724960327, + "loss": -0.1163, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.25205597281455994, + "rewards/margins": 0.11632832884788513, + "rewards/rejected": 0.1357276439666748, + "step": 281 + }, + { + "epoch": 0.6799276672694394, + "grad_norm": 0.2906542718410492, + "learning_rate": 1.796246648793566e-06, + "logits/chosen": -0.5941267013549805, + "logits/rejected": -0.255035400390625, + "logps/chosen": -2.0498180389404297, + "logps/rejected": -2.835390329360962, + "loss": -0.1131, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23062816262245178, + "rewards/margins": 0.11311490833759308, + "rewards/rejected": 0.1175132542848587, + "step": 282 + }, + { + "epoch": 0.6823387582881254, + "grad_norm": 0.3081793785095215, + "learning_rate": 1.7828418230563005e-06, + "logits/chosen": -0.6996917724609375, + "logits/rejected": -0.7662849426269531, + "logps/chosen": -1.6972362995147705, + "logps/rejected": -2.563687562942505, + "loss": -0.1131, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26854556798934937, + "rewards/margins": 0.11308521777391434, + "rewards/rejected": 0.15546032786369324, + "step": 283 + }, + { + "epoch": 0.6847498493068114, + "grad_norm": 0.5938827395439148, + "learning_rate": 1.769436997319035e-06, + "logits/chosen": -0.54541015625, + "logits/rejected": -0.540802001953125, + "logps/chosen": -2.7020649909973145, + "logps/rejected": -3.036040782928467, + "loss": -0.0786, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.18849295377731323, + "rewards/margins": 0.07859738916158676, + "rewards/rejected": 0.10989557206630707, + "step": 284 + }, + { + "epoch": 0.6871609403254972, + "grad_norm": 0.22290563583374023, + "learning_rate": 1.7560321715817697e-06, + "logits/chosen": -0.6795654296875, + "logits/rejected": -0.8431549072265625, + "logps/chosen": -1.9319499731063843, + "logps/rejected": -2.9910027980804443, + "loss": -0.1096, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20556455850601196, + "rewards/margins": 0.10960977524518967, + "rewards/rejected": 0.0959547683596611, + "step": 285 + }, + { + "epoch": 0.6895720313441832, + "grad_norm": 0.3192528486251831, + "learning_rate": 1.7426273458445042e-06, + "logits/chosen": -0.5517578125, + "logits/rejected": -0.222625732421875, + "logps/chosen": -1.9587316513061523, + "logps/rejected": -2.663393020629883, + "loss": -0.0712, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18889790773391724, + "rewards/margins": 0.07118511945009232, + "rewards/rejected": 0.11771276593208313, + "step": 286 + }, + { + "epoch": 0.6919831223628692, + "grad_norm": 0.3668079376220703, + "learning_rate": 1.7292225201072389e-06, + "logits/chosen": -0.34499359130859375, + "logits/rejected": -0.25164794921875, + "logps/chosen": -1.4858005046844482, + "logps/rejected": -2.4136972427368164, + "loss": -0.1089, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2776893377304077, + "rewards/margins": 0.10889081656932831, + "rewards/rejected": 0.1687985062599182, + "step": 287 + }, + { + "epoch": 0.6943942133815552, + "grad_norm": 0.24516712129116058, + "learning_rate": 1.7158176943699734e-06, + "logits/chosen": -0.7342529296875, + "logits/rejected": -0.75341796875, + "logps/chosen": -2.7940664291381836, + "logps/rejected": -3.426623821258545, + "loss": -0.0756, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1540980488061905, + "rewards/margins": 0.07558852434158325, + "rewards/rejected": 0.07850953191518784, + "step": 288 + }, + { + "epoch": 0.6968053044002411, + "grad_norm": 0.29001298546791077, + "learning_rate": 1.7024128686327079e-06, + "logits/chosen": -0.388916015625, + "logits/rejected": -0.257965087890625, + "logps/chosen": -1.8340270519256592, + "logps/rejected": -2.532681465148926, + "loss": -0.0941, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21397188305854797, + "rewards/margins": 0.09409509599208832, + "rewards/rejected": 0.11987678706645966, + "step": 289 + }, + { + "epoch": 0.6992163954189271, + "grad_norm": 0.18041586875915527, + "learning_rate": 1.6890080428954426e-06, + "logits/chosen": -0.69573974609375, + "logits/rejected": -0.15386962890625, + "logps/chosen": -1.6624269485473633, + "logps/rejected": -2.6766140460968018, + "loss": -0.1433, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2679325044155121, + "rewards/margins": 0.14325588941574097, + "rewards/rejected": 0.12467662990093231, + "step": 290 + }, + { + "epoch": 0.701627486437613, + "grad_norm": 0.1510167270898819, + "learning_rate": 1.675603217158177e-06, + "logits/chosen": -0.57171630859375, + "logits/rejected": -0.6336669921875, + "logps/chosen": -1.8552274703979492, + "logps/rejected": -2.634805202484131, + "loss": -0.0747, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2126169055700302, + "rewards/margins": 0.07465604692697525, + "rewards/rejected": 0.13796085119247437, + "step": 291 + }, + { + "epoch": 0.7040385774562989, + "grad_norm": 0.262734979391098, + "learning_rate": 1.6621983914209116e-06, + "logits/chosen": -0.7406005859375, + "logits/rejected": -0.66455078125, + "logps/chosen": -2.416321277618408, + "logps/rejected": -3.2232372760772705, + "loss": -0.0912, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16113072633743286, + "rewards/margins": 0.09117282927036285, + "rewards/rejected": 0.0699579119682312, + "step": 292 + }, + { + "epoch": 0.7064496684749849, + "grad_norm": 0.3963961899280548, + "learning_rate": 1.6487935656836463e-06, + "logits/chosen": -0.5918731689453125, + "logits/rejected": -0.7611083984375, + "logps/chosen": -2.0132744312286377, + "logps/rejected": -3.0947957038879395, + "loss": -0.1158, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19502262771129608, + "rewards/margins": 0.11575621366500854, + "rewards/rejected": 0.07926641404628754, + "step": 293 + }, + { + "epoch": 0.7088607594936709, + "grad_norm": 0.7935790419578552, + "learning_rate": 1.6353887399463808e-06, + "logits/chosen": -0.67559814453125, + "logits/rejected": -0.44921875, + "logps/chosen": -1.5360987186431885, + "logps/rejected": -2.802678108215332, + "loss": -0.1287, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.26868733763694763, + "rewards/margins": 0.12870383262634277, + "rewards/rejected": 0.13998351991176605, + "step": 294 + }, + { + "epoch": 0.7112718505123569, + "grad_norm": 0.30940330028533936, + "learning_rate": 1.6219839142091155e-06, + "logits/chosen": -0.783935546875, + "logits/rejected": -0.4868316650390625, + "logps/chosen": -1.7163124084472656, + "logps/rejected": -2.6496479511260986, + "loss": -0.13, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2559107542037964, + "rewards/margins": 0.1300193816423416, + "rewards/rejected": 0.12589135766029358, + "step": 295 + }, + { + "epoch": 0.7136829415310428, + "grad_norm": 0.10088784247636795, + "learning_rate": 1.60857908847185e-06, + "logits/chosen": -0.6380615234375, + "logits/rejected": -0.35400390625, + "logps/chosen": -1.991320013999939, + "logps/rejected": -2.672006607055664, + "loss": -0.1093, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.22304484248161316, + "rewards/margins": 0.10933370143175125, + "rewards/rejected": 0.11371114104986191, + "step": 296 + }, + { + "epoch": 0.7160940325497287, + "grad_norm": 0.4254037141799927, + "learning_rate": 1.5951742627345845e-06, + "logits/chosen": -0.362945556640625, + "logits/rejected": -0.5895767211914062, + "logps/chosen": -1.6407079696655273, + "logps/rejected": -2.352750539779663, + "loss": -0.0742, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2973887324333191, + "rewards/margins": 0.07424384355545044, + "rewards/rejected": 0.22314488887786865, + "step": 297 + }, + { + "epoch": 0.7185051235684147, + "grad_norm": 0.15900881588459015, + "learning_rate": 1.5817694369973192e-06, + "logits/chosen": -0.526092529296875, + "logits/rejected": -0.6285552978515625, + "logps/chosen": -2.120389223098755, + "logps/rejected": -3.233734369277954, + "loss": -0.1425, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2321762889623642, + "rewards/margins": 0.14250144362449646, + "rewards/rejected": 0.08967486023902893, + "step": 298 + }, + { + "epoch": 0.7209162145871006, + "grad_norm": 0.1536959558725357, + "learning_rate": 1.5683646112600537e-06, + "logits/chosen": -0.45995330810546875, + "logits/rejected": -0.459197998046875, + "logps/chosen": -2.1839547157287598, + "logps/rejected": -2.783351421356201, + "loss": -0.1009, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19937080144882202, + "rewards/margins": 0.10089893639087677, + "rewards/rejected": 0.09847186505794525, + "step": 299 + }, + { + "epoch": 0.7233273056057866, + "grad_norm": 0.0934535339474678, + "learning_rate": 1.5549597855227882e-06, + "logits/chosen": -0.72784423828125, + "logits/rejected": -0.614715576171875, + "logps/chosen": -2.0260772705078125, + "logps/rejected": -3.0592477321624756, + "loss": -0.0941, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16385474801063538, + "rewards/margins": 0.09406370669603348, + "rewards/rejected": 0.0697910487651825, + "step": 300 + }, + { + "epoch": 0.7257383966244726, + "grad_norm": 0.3281485140323639, + "learning_rate": 1.5415549597855229e-06, + "logits/chosen": -0.8062744140625, + "logits/rejected": -0.7749481201171875, + "logps/chosen": -2.4853672981262207, + "logps/rejected": -3.063077926635742, + "loss": -0.0701, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1718243956565857, + "rewards/margins": 0.07009021937847137, + "rewards/rejected": 0.10173418372869492, + "step": 301 + }, + { + "epoch": 0.7281494876431586, + "grad_norm": 0.28484320640563965, + "learning_rate": 1.5281501340482574e-06, + "logits/chosen": -0.6176834106445312, + "logits/rejected": 0.177490234375, + "logps/chosen": -1.5012640953063965, + "logps/rejected": -2.9272823333740234, + "loss": -0.1626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24188072979450226, + "rewards/margins": 0.16259074211120605, + "rewards/rejected": 0.0792900025844574, + "step": 302 + }, + { + "epoch": 0.7305605786618445, + "grad_norm": 0.10089068114757538, + "learning_rate": 1.514745308310992e-06, + "logits/chosen": -0.45220375061035156, + "logits/rejected": -0.5378036499023438, + "logps/chosen": -2.402207612991333, + "logps/rejected": -3.134814977645874, + "loss": -0.0841, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18568626046180725, + "rewards/margins": 0.08407771587371826, + "rewards/rejected": 0.10160855948925018, + "step": 303 + }, + { + "epoch": 0.7329716696805304, + "grad_norm": 0.4633752107620239, + "learning_rate": 1.5013404825737266e-06, + "logits/chosen": -0.19050437211990356, + "logits/rejected": -0.718505859375, + "logps/chosen": -1.7944018840789795, + "logps/rejected": -2.602524757385254, + "loss": -0.0984, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.231818288564682, + "rewards/margins": 0.09840609133243561, + "rewards/rejected": 0.1334121823310852, + "step": 304 + }, + { + "epoch": 0.7353827606992164, + "grad_norm": 0.21570847928524017, + "learning_rate": 1.487935656836461e-06, + "logits/chosen": -0.6647491455078125, + "logits/rejected": -0.3929443359375, + "logps/chosen": -1.6984490156173706, + "logps/rejected": -2.989982843399048, + "loss": -0.1345, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22221681475639343, + "rewards/margins": 0.13453607261180878, + "rewards/rejected": 0.08768075704574585, + "step": 305 + }, + { + "epoch": 0.7377938517179023, + "grad_norm": 0.17630194127559662, + "learning_rate": 1.4745308310991958e-06, + "logits/chosen": -0.689453125, + "logits/rejected": -0.79541015625, + "logps/chosen": -2.5726208686828613, + "logps/rejected": -3.2732038497924805, + "loss": -0.077, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14044827222824097, + "rewards/margins": 0.07698767632246017, + "rewards/rejected": 0.06346059590578079, + "step": 306 + }, + { + "epoch": 0.7402049427365883, + "grad_norm": 0.3801437020301819, + "learning_rate": 1.4611260053619303e-06, + "logits/chosen": -0.6834335327148438, + "logits/rejected": -0.8072509765625, + "logps/chosen": -2.256196975708008, + "logps/rejected": -3.0800771713256836, + "loss": -0.0838, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1884252429008484, + "rewards/margins": 0.08377772569656372, + "rewards/rejected": 0.10464751720428467, + "step": 307 + }, + { + "epoch": 0.7426160337552743, + "grad_norm": 0.1341785490512848, + "learning_rate": 1.4477211796246652e-06, + "logits/chosen": -0.7215576171875, + "logits/rejected": -0.6524658203125, + "logps/chosen": -2.456303358078003, + "logps/rejected": -3.424696445465088, + "loss": -0.1058, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17319267988204956, + "rewards/margins": 0.10575427114963531, + "rewards/rejected": 0.06743840128183365, + "step": 308 + }, + { + "epoch": 0.7450271247739603, + "grad_norm": 0.4489308297634125, + "learning_rate": 1.4343163538873997e-06, + "logits/chosen": -0.68035888671875, + "logits/rejected": -0.2259063720703125, + "logps/chosen": -2.080988645553589, + "logps/rejected": -2.792454481124878, + "loss": -0.0852, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2477894276380539, + "rewards/margins": 0.08518936485052109, + "rewards/rejected": 0.1626000702381134, + "step": 309 + }, + { + "epoch": 0.7474382157926461, + "grad_norm": 0.16423240303993225, + "learning_rate": 1.4209115281501342e-06, + "logits/chosen": -0.749267578125, + "logits/rejected": -0.22937774658203125, + "logps/chosen": -1.698309302330017, + "logps/rejected": -2.550537347793579, + "loss": -0.1194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3133883774280548, + "rewards/margins": 0.11937491595745087, + "rewards/rejected": 0.19401347637176514, + "step": 310 + }, + { + "epoch": 0.7498493068113321, + "grad_norm": 0.16148002445697784, + "learning_rate": 1.4075067024128689e-06, + "logits/chosen": -0.4085540771484375, + "logits/rejected": -0.3878173828125, + "logps/chosen": -1.9870270490646362, + "logps/rejected": -2.7122538089752197, + "loss": -0.1046, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2003295123577118, + "rewards/margins": 0.10455606132745743, + "rewards/rejected": 0.09577345848083496, + "step": 311 + }, + { + "epoch": 0.7522603978300181, + "grad_norm": 0.1515049934387207, + "learning_rate": 1.3941018766756034e-06, + "logits/chosen": -0.687255859375, + "logits/rejected": -0.88848876953125, + "logps/chosen": -2.3736040592193604, + "logps/rejected": -2.9982097148895264, + "loss": -0.1111, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19204746186733246, + "rewards/margins": 0.11114997416734695, + "rewards/rejected": 0.08089748024940491, + "step": 312 + }, + { + "epoch": 0.754671488848704, + "grad_norm": 0.4956166744232178, + "learning_rate": 1.380697050938338e-06, + "logits/chosen": -0.72039794921875, + "logits/rejected": -0.5407710075378418, + "logps/chosen": -1.729560136795044, + "logps/rejected": -2.8213071823120117, + "loss": -0.1143, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20637483894824982, + "rewards/margins": 0.11425817012786865, + "rewards/rejected": 0.09211666882038116, + "step": 313 + }, + { + "epoch": 0.75708257986739, + "grad_norm": 0.1496104747056961, + "learning_rate": 1.3672922252010726e-06, + "logits/chosen": -0.5347900390625, + "logits/rejected": -0.1898193359375, + "logps/chosen": -1.3573123216629028, + "logps/rejected": -2.506791353225708, + "loss": -0.1309, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.28250664472579956, + "rewards/margins": 0.13090378046035767, + "rewards/rejected": 0.1516028344631195, + "step": 314 + }, + { + "epoch": 0.759493670886076, + "grad_norm": 0.12428668141365051, + "learning_rate": 1.353887399463807e-06, + "logits/chosen": -0.739501953125, + "logits/rejected": -0.18463134765625, + "logps/chosen": -2.221066474914551, + "logps/rejected": -3.033491611480713, + "loss": -0.1139, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19889354705810547, + "rewards/margins": 0.11390172690153122, + "rewards/rejected": 0.08499181270599365, + "step": 315 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.2698236107826233, + "learning_rate": 1.3404825737265418e-06, + "logits/chosen": -0.76678466796875, + "logits/rejected": -0.61297607421875, + "logps/chosen": -2.259842872619629, + "logps/rejected": -3.022620677947998, + "loss": -0.1094, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19041167199611664, + "rewards/margins": 0.10941335558891296, + "rewards/rejected": 0.08099831640720367, + "step": 316 + }, + { + "epoch": 0.7643158529234478, + "grad_norm": 0.09925764054059982, + "learning_rate": 1.3270777479892763e-06, + "logits/chosen": -0.670684814453125, + "logits/rejected": -0.6322784423828125, + "logps/chosen": -2.2238829135894775, + "logps/rejected": -2.945051670074463, + "loss": -0.0891, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1835918128490448, + "rewards/margins": 0.08908460289239883, + "rewards/rejected": 0.09450721740722656, + "step": 317 + }, + { + "epoch": 0.7667269439421338, + "grad_norm": 0.20127759873867035, + "learning_rate": 1.3136729222520108e-06, + "logits/chosen": -0.703582763671875, + "logits/rejected": -0.824951171875, + "logps/chosen": -2.539128541946411, + "logps/rejected": -3.199209451675415, + "loss": -0.0892, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14936614036560059, + "rewards/margins": 0.08916693925857544, + "rewards/rejected": 0.06019919738173485, + "step": 318 + }, + { + "epoch": 0.7691380349608198, + "grad_norm": 0.09584791213274002, + "learning_rate": 1.3002680965147455e-06, + "logits/chosen": -0.830841064453125, + "logits/rejected": -0.667236328125, + "logps/chosen": -2.205950975418091, + "logps/rejected": -3.2253026962280273, + "loss": -0.1306, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1911124885082245, + "rewards/margins": 0.13062451779842377, + "rewards/rejected": 0.060487959533929825, + "step": 319 + }, + { + "epoch": 0.7715491259795058, + "grad_norm": 0.6067792177200317, + "learning_rate": 1.28686327077748e-06, + "logits/chosen": -0.6119766235351562, + "logits/rejected": -0.4876708984375, + "logps/chosen": -1.6387662887573242, + "logps/rejected": -2.564568519592285, + "loss": -0.0936, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23836247622966766, + "rewards/margins": 0.09364160150289536, + "rewards/rejected": 0.1447208821773529, + "step": 320 + }, + { + "epoch": 0.7739602169981917, + "grad_norm": 0.46343255043029785, + "learning_rate": 1.2734584450402147e-06, + "logits/chosen": -0.786834716796875, + "logits/rejected": -0.773651123046875, + "logps/chosen": -2.510172128677368, + "logps/rejected": -3.5082144737243652, + "loss": -0.0835, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1386396884918213, + "rewards/margins": 0.08353696018457413, + "rewards/rejected": 0.05510272458195686, + "step": 321 + }, + { + "epoch": 0.7763713080168776, + "grad_norm": 0.2656538784503937, + "learning_rate": 1.2600536193029492e-06, + "logits/chosen": -0.648895263671875, + "logits/rejected": -0.421630859375, + "logps/chosen": -1.6907479763031006, + "logps/rejected": -3.048745632171631, + "loss": -0.1701, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2478935867547989, + "rewards/margins": 0.17013061046600342, + "rewards/rejected": 0.07776296138763428, + "step": 322 + }, + { + "epoch": 0.7787823990355636, + "grad_norm": 0.41179776191711426, + "learning_rate": 1.2466487935656837e-06, + "logits/chosen": -0.71209716796875, + "logits/rejected": -0.47601318359375, + "logps/chosen": -1.7523152828216553, + "logps/rejected": -2.710491895675659, + "loss": -0.1393, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.27367299795150757, + "rewards/margins": 0.1393078863620758, + "rewards/rejected": 0.13436509668827057, + "step": 323 + }, + { + "epoch": 0.7811934900542495, + "grad_norm": 0.25614243745803833, + "learning_rate": 1.2332439678284184e-06, + "logits/chosen": -0.5819282531738281, + "logits/rejected": -0.48223876953125, + "logps/chosen": -1.960573434829712, + "logps/rejected": -3.019958019256592, + "loss": -0.1653, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.26249444484710693, + "rewards/margins": 0.16525691747665405, + "rewards/rejected": 0.09723751246929169, + "step": 324 + }, + { + "epoch": 0.7836045810729355, + "grad_norm": 0.3553808033466339, + "learning_rate": 1.2198391420911529e-06, + "logits/chosen": -0.63043212890625, + "logits/rejected": -0.8211669921875, + "logps/chosen": -2.7220582962036133, + "logps/rejected": -3.312553644180298, + "loss": -0.076, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15031234920024872, + "rewards/margins": 0.07595318555831909, + "rewards/rejected": 0.07435917109251022, + "step": 325 + }, + { + "epoch": 0.7860156720916215, + "grad_norm": 0.3252660632133484, + "learning_rate": 1.2064343163538874e-06, + "logits/chosen": -0.7486572265625, + "logits/rejected": -0.483856201171875, + "logps/chosen": -2.5184779167175293, + "logps/rejected": -3.1500484943389893, + "loss": -0.0962, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17655092477798462, + "rewards/margins": 0.09616627544164658, + "rewards/rejected": 0.08038467168807983, + "step": 326 + }, + { + "epoch": 0.7884267631103075, + "grad_norm": 1.2538331747055054, + "learning_rate": 1.193029490616622e-06, + "logits/chosen": -0.51934814453125, + "logits/rejected": -0.21263885498046875, + "logps/chosen": -1.6122972965240479, + "logps/rejected": -2.1897401809692383, + "loss": -0.0769, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2627054750919342, + "rewards/margins": 0.07686497271060944, + "rewards/rejected": 0.18584050238132477, + "step": 327 + }, + { + "epoch": 0.7908378541289933, + "grad_norm": 0.11358801275491714, + "learning_rate": 1.1796246648793566e-06, + "logits/chosen": -0.511932373046875, + "logits/rejected": -0.362030029296875, + "logps/chosen": -2.6542270183563232, + "logps/rejected": -3.073587417602539, + "loss": -0.0821, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.15208172798156738, + "rewards/margins": 0.08211520314216614, + "rewards/rejected": 0.06996653228998184, + "step": 328 + }, + { + "epoch": 0.7932489451476793, + "grad_norm": 0.3655044138431549, + "learning_rate": 1.1662198391420913e-06, + "logits/chosen": -0.5436553955078125, + "logits/rejected": -0.6343221664428711, + "logps/chosen": -1.558677315711975, + "logps/rejected": -2.7468161582946777, + "loss": -0.1391, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2532871663570404, + "rewards/margins": 0.13911029696464539, + "rewards/rejected": 0.11417686939239502, + "step": 329 + }, + { + "epoch": 0.7956600361663653, + "grad_norm": 0.24528777599334717, + "learning_rate": 1.1528150134048258e-06, + "logits/chosen": -0.7559814453125, + "logits/rejected": -0.7530593872070312, + "logps/chosen": -2.7780535221099854, + "logps/rejected": -3.227290630340576, + "loss": -0.0649, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15753725171089172, + "rewards/margins": 0.0649256557226181, + "rewards/rejected": 0.09261158853769302, + "step": 330 + }, + { + "epoch": 0.7980711271850512, + "grad_norm": 0.317238986492157, + "learning_rate": 1.1394101876675605e-06, + "logits/chosen": -0.6470184326171875, + "logits/rejected": -0.31689453125, + "logps/chosen": -2.395307779312134, + "logps/rejected": -3.2115113735198975, + "loss": -0.1122, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16952499747276306, + "rewards/margins": 0.1121913269162178, + "rewards/rejected": 0.05733366310596466, + "step": 331 + }, + { + "epoch": 0.8004822182037372, + "grad_norm": 0.35996726155281067, + "learning_rate": 1.126005361930295e-06, + "logits/chosen": -0.545654296875, + "logits/rejected": -0.43060302734375, + "logps/chosen": -2.0136656761169434, + "logps/rejected": -2.4377989768981934, + "loss": -0.0687, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2312278151512146, + "rewards/margins": 0.06869004666805267, + "rewards/rejected": 0.16253775358200073, + "step": 332 + }, + { + "epoch": 0.8028933092224232, + "grad_norm": 0.031628288328647614, + "learning_rate": 1.1126005361930297e-06, + "logits/chosen": -0.4132080078125, + "logits/rejected": -0.45574951171875, + "logps/chosen": -2.588603973388672, + "logps/rejected": -3.0727972984313965, + "loss": -0.0857, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19641953706741333, + "rewards/margins": 0.08566197752952576, + "rewards/rejected": 0.11075755953788757, + "step": 333 + }, + { + "epoch": 0.805304400241109, + "grad_norm": 0.17958490550518036, + "learning_rate": 1.0991957104557642e-06, + "logits/chosen": -0.59326171875, + "logits/rejected": -0.479888916015625, + "logps/chosen": -1.4320169687271118, + "logps/rejected": -2.4326488971710205, + "loss": -0.1338, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2660183012485504, + "rewards/margins": 0.1338462233543396, + "rewards/rejected": 0.132172092795372, + "step": 334 + }, + { + "epoch": 0.807715491259795, + "grad_norm": 0.20675577223300934, + "learning_rate": 1.0857908847184987e-06, + "logits/chosen": -0.67340087890625, + "logits/rejected": -0.41010284423828125, + "logps/chosen": -1.8122917413711548, + "logps/rejected": -2.3817529678344727, + "loss": -0.0713, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23399695754051208, + "rewards/margins": 0.07131601125001907, + "rewards/rejected": 0.16268093883991241, + "step": 335 + }, + { + "epoch": 0.810126582278481, + "grad_norm": 0.11301539838314056, + "learning_rate": 1.0723860589812334e-06, + "logits/chosen": -0.7158584594726562, + "logits/rejected": -0.7340087890625, + "logps/chosen": -1.748389482498169, + "logps/rejected": -2.919363498687744, + "loss": -0.1095, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23092560470104218, + "rewards/margins": 0.10945893824100494, + "rewards/rejected": 0.12146666646003723, + "step": 336 + }, + { + "epoch": 0.812537673297167, + "grad_norm": 0.43449512124061584, + "learning_rate": 1.0589812332439679e-06, + "logits/chosen": -0.58447265625, + "logits/rejected": -0.50311279296875, + "logps/chosen": -1.736456036567688, + "logps/rejected": -2.672661066055298, + "loss": -0.1263, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24446260929107666, + "rewards/margins": 0.12634806334972382, + "rewards/rejected": 0.11811454594135284, + "step": 337 + }, + { + "epoch": 0.814948764315853, + "grad_norm": 0.12449906766414642, + "learning_rate": 1.0455764075067026e-06, + "logits/chosen": -0.682525634765625, + "logits/rejected": -0.61590576171875, + "logps/chosen": -2.408848285675049, + "logps/rejected": -3.2679600715637207, + "loss": -0.123, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2024296224117279, + "rewards/margins": 0.12303236126899719, + "rewards/rejected": 0.0793972760438919, + "step": 338 + }, + { + "epoch": 0.8173598553345389, + "grad_norm": 0.19680745899677277, + "learning_rate": 1.032171581769437e-06, + "logits/chosen": -0.710418701171875, + "logits/rejected": -0.49053955078125, + "logps/chosen": -1.753347635269165, + "logps/rejected": -2.8181285858154297, + "loss": -0.102, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21920640766620636, + "rewards/margins": 0.10195432603359222, + "rewards/rejected": 0.11725206673145294, + "step": 339 + }, + { + "epoch": 0.8197709463532248, + "grad_norm": 0.34077131748199463, + "learning_rate": 1.0187667560321716e-06, + "logits/chosen": -0.4638214111328125, + "logits/rejected": -0.764739990234375, + "logps/chosen": -2.0815467834472656, + "logps/rejected": -2.8254013061523438, + "loss": -0.0965, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20986172556877136, + "rewards/margins": 0.09645634144544601, + "rewards/rejected": 0.11340540647506714, + "step": 340 + }, + { + "epoch": 0.8221820373719108, + "grad_norm": 0.4509377181529999, + "learning_rate": 1.0053619302949063e-06, + "logits/chosen": -0.481964111328125, + "logits/rejected": -0.1854095458984375, + "logps/chosen": -1.481386423110962, + "logps/rejected": -2.277862310409546, + "loss": -0.115, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29776546359062195, + "rewards/margins": 0.11503616720438004, + "rewards/rejected": 0.1827293038368225, + "step": 341 + }, + { + "epoch": 0.8245931283905967, + "grad_norm": 0.356119304895401, + "learning_rate": 9.919571045576408e-07, + "logits/chosen": -0.743408203125, + "logits/rejected": -0.6505126953125, + "logps/chosen": -1.952662706375122, + "logps/rejected": -2.7529821395874023, + "loss": -0.0956, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20616883039474487, + "rewards/margins": 0.09555038809776306, + "rewards/rejected": 0.11061841994524002, + "step": 342 + }, + { + "epoch": 0.8270042194092827, + "grad_norm": 0.2967340052127838, + "learning_rate": 9.785522788203753e-07, + "logits/chosen": -0.6893081665039062, + "logits/rejected": -0.8092041015625, + "logps/chosen": -2.09456729888916, + "logps/rejected": -2.9612343311309814, + "loss": -0.1016, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19902732968330383, + "rewards/margins": 0.1016278937458992, + "rewards/rejected": 0.09739943593740463, + "step": 343 + }, + { + "epoch": 0.8294153104279687, + "grad_norm": 0.7897835373878479, + "learning_rate": 9.6514745308311e-07, + "logits/chosen": -0.658203125, + "logits/rejected": -0.3927040100097656, + "logps/chosen": -1.6565616130828857, + "logps/rejected": -2.568018913269043, + "loss": -0.0928, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22605273127555847, + "rewards/margins": 0.09283320605754852, + "rewards/rejected": 0.13321952521800995, + "step": 344 + }, + { + "epoch": 0.8318264014466547, + "grad_norm": 0.6379087567329407, + "learning_rate": 9.517426273458445e-07, + "logits/chosen": -0.7298583984375, + "logits/rejected": -0.8280029296875, + "logps/chosen": -2.670103073120117, + "logps/rejected": -3.2749359607696533, + "loss": -0.0856, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1672106683254242, + "rewards/margins": 0.08561202883720398, + "rewards/rejected": 0.08159864693880081, + "step": 345 + }, + { + "epoch": 0.8342374924653405, + "grad_norm": 0.1369830071926117, + "learning_rate": 9.383378016085791e-07, + "logits/chosen": -0.644287109375, + "logits/rejected": -0.6427001953125, + "logps/chosen": -2.470473289489746, + "logps/rejected": -3.2199714183807373, + "loss": -0.092, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1711222231388092, + "rewards/margins": 0.09201842546463013, + "rewards/rejected": 0.07910380512475967, + "step": 346 + }, + { + "epoch": 0.8366485834840265, + "grad_norm": 0.13143277168273926, + "learning_rate": 9.249329758713138e-07, + "logits/chosen": -0.83502197265625, + "logits/rejected": -0.7808837890625, + "logps/chosen": -1.8317530155181885, + "logps/rejected": -3.274116039276123, + "loss": -0.1325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19241732358932495, + "rewards/margins": 0.13249869644641876, + "rewards/rejected": 0.05991861969232559, + "step": 347 + }, + { + "epoch": 0.8390596745027125, + "grad_norm": 0.3424263000488281, + "learning_rate": 9.115281501340484e-07, + "logits/chosen": -0.6896247863769531, + "logits/rejected": -0.622711181640625, + "logps/chosen": -1.7714059352874756, + "logps/rejected": -2.8001139163970947, + "loss": -0.1207, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21763411164283752, + "rewards/margins": 0.12070479243993759, + "rewards/rejected": 0.09692931175231934, + "step": 348 + }, + { + "epoch": 0.8414707655213984, + "grad_norm": 0.5700894594192505, + "learning_rate": 8.98123324396783e-07, + "logits/chosen": -0.79388427734375, + "logits/rejected": -0.565826416015625, + "logps/chosen": -2.2087557315826416, + "logps/rejected": -3.132368803024292, + "loss": -0.1254, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22162456810474396, + "rewards/margins": 0.12537109851837158, + "rewards/rejected": 0.09625348448753357, + "step": 349 + }, + { + "epoch": 0.8438818565400844, + "grad_norm": 0.21551689505577087, + "learning_rate": 8.847184986595175e-07, + "logits/chosen": -0.548614501953125, + "logits/rejected": -0.2332763671875, + "logps/chosen": -1.6717592477798462, + "logps/rejected": -2.53096866607666, + "loss": -0.1049, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26083993911743164, + "rewards/margins": 0.10488566011190414, + "rewards/rejected": 0.1559543013572693, + "step": 350 + }, + { + "epoch": 0.8462929475587704, + "grad_norm": 0.39866408705711365, + "learning_rate": 8.713136729222521e-07, + "logits/chosen": -0.853759765625, + "logits/rejected": -0.879302978515625, + "logps/chosen": -1.844583511352539, + "logps/rejected": -2.8574059009552, + "loss": -0.1335, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.24363258481025696, + "rewards/margins": 0.13351930677890778, + "rewards/rejected": 0.11011329293251038, + "step": 351 + }, + { + "epoch": 0.8487040385774564, + "grad_norm": 0.25307849049568176, + "learning_rate": 8.579088471849867e-07, + "logits/chosen": -0.765716552734375, + "logits/rejected": -0.3944091796875, + "logps/chosen": -1.8458311557769775, + "logps/rejected": -2.8168089389801025, + "loss": -0.1164, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22035478055477142, + "rewards/margins": 0.11644255369901657, + "rewards/rejected": 0.10391222685575485, + "step": 352 + }, + { + "epoch": 0.8511151295961422, + "grad_norm": 0.5324075222015381, + "learning_rate": 8.445040214477213e-07, + "logits/chosen": -0.5719451904296875, + "logits/rejected": -0.65869140625, + "logps/chosen": -1.841377854347229, + "logps/rejected": -2.890317916870117, + "loss": -0.1277, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23612016439437866, + "rewards/margins": 0.1276772916316986, + "rewards/rejected": 0.10844288766384125, + "step": 353 + }, + { + "epoch": 0.8535262206148282, + "grad_norm": 0.2615352272987366, + "learning_rate": 8.310991957104558e-07, + "logits/chosen": -0.8394775390625, + "logits/rejected": -0.75970458984375, + "logps/chosen": -2.243168830871582, + "logps/rejected": -3.2211532592773438, + "loss": -0.1223, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20771005749702454, + "rewards/margins": 0.12234125286340714, + "rewards/rejected": 0.0853688046336174, + "step": 354 + }, + { + "epoch": 0.8559373116335142, + "grad_norm": 0.570944607257843, + "learning_rate": 8.176943699731904e-07, + "logits/chosen": -0.5220870971679688, + "logits/rejected": -0.7337646484375, + "logps/chosen": -1.5883126258850098, + "logps/rejected": -2.644714117050171, + "loss": -0.1297, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2606670558452606, + "rewards/margins": 0.12971048057079315, + "rewards/rejected": 0.13095657527446747, + "step": 355 + }, + { + "epoch": 0.8583484026522001, + "grad_norm": 0.4165022373199463, + "learning_rate": 8.04289544235925e-07, + "logits/chosen": -0.6927490234375, + "logits/rejected": -0.737884521484375, + "logps/chosen": -2.259829044342041, + "logps/rejected": -2.9725074768066406, + "loss": -0.084, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19348900020122528, + "rewards/margins": 0.08402594923973083, + "rewards/rejected": 0.10946306586265564, + "step": 356 + }, + { + "epoch": 0.8607594936708861, + "grad_norm": 1.0125479698181152, + "learning_rate": 7.908847184986596e-07, + "logits/chosen": -0.650634765625, + "logits/rejected": -0.416748046875, + "logps/chosen": -1.9553372859954834, + "logps/rejected": -2.74334979057312, + "loss": -0.0942, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2425239533185959, + "rewards/margins": 0.094170942902565, + "rewards/rejected": 0.14835301041603088, + "step": 357 + }, + { + "epoch": 0.8631705846895721, + "grad_norm": 0.33252832293510437, + "learning_rate": 7.774798927613941e-07, + "logits/chosen": -0.7840576171875, + "logits/rejected": -0.6617355346679688, + "logps/chosen": -1.8228936195373535, + "logps/rejected": -3.4139397144317627, + "loss": -0.1881, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.25794464349746704, + "rewards/margins": 0.18806813657283783, + "rewards/rejected": 0.0698765367269516, + "step": 358 + }, + { + "epoch": 0.865581675708258, + "grad_norm": 0.3364689350128174, + "learning_rate": 7.640750670241287e-07, + "logits/chosen": -0.86126708984375, + "logits/rejected": -0.5726318359375, + "logps/chosen": -1.7675869464874268, + "logps/rejected": -2.9138216972351074, + "loss": -0.1138, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2292586714029312, + "rewards/margins": 0.11380802094936371, + "rewards/rejected": 0.11545064300298691, + "step": 359 + }, + { + "epoch": 0.8679927667269439, + "grad_norm": 0.4650719463825226, + "learning_rate": 7.506702412868633e-07, + "logits/chosen": -0.48486328125, + "logits/rejected": -0.335845947265625, + "logps/chosen": -1.743593454360962, + "logps/rejected": -3.025881767272949, + "loss": -0.1399, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2064819633960724, + "rewards/margins": 0.13988713920116425, + "rewards/rejected": 0.06659482419490814, + "step": 360 + }, + { + "epoch": 0.8704038577456299, + "grad_norm": 0.2281513512134552, + "learning_rate": 7.372654155495979e-07, + "logits/chosen": -0.822021484375, + "logits/rejected": -0.3350830078125, + "logps/chosen": -1.898543119430542, + "logps/rejected": -2.8283238410949707, + "loss": -0.1206, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24126467108726501, + "rewards/margins": 0.12059096992015839, + "rewards/rejected": 0.12067370861768723, + "step": 361 + }, + { + "epoch": 0.8728149487643159, + "grad_norm": 0.3579182028770447, + "learning_rate": 7.238605898123326e-07, + "logits/chosen": -0.762451171875, + "logits/rejected": -0.5715484619140625, + "logps/chosen": -2.3051161766052246, + "logps/rejected": -2.918890953063965, + "loss": -0.08, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19016730785369873, + "rewards/margins": 0.08000713586807251, + "rewards/rejected": 0.11016017198562622, + "step": 362 + }, + { + "epoch": 0.8752260397830018, + "grad_norm": 0.24361228942871094, + "learning_rate": 7.104557640750671e-07, + "logits/chosen": -0.61834716796875, + "logits/rejected": -0.76898193359375, + "logps/chosen": -2.3034181594848633, + "logps/rejected": -3.01597261428833, + "loss": -0.0915, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19367441534996033, + "rewards/margins": 0.0915202796459198, + "rewards/rejected": 0.10215415060520172, + "step": 363 + }, + { + "epoch": 0.8776371308016878, + "grad_norm": 0.16889825463294983, + "learning_rate": 6.970509383378017e-07, + "logits/chosen": -0.6359405517578125, + "logits/rejected": -0.72900390625, + "logps/chosen": -3.064101219177246, + "logps/rejected": -3.4576215744018555, + "loss": -0.0735, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13077300786972046, + "rewards/margins": 0.07352845370769501, + "rewards/rejected": 0.05724453926086426, + "step": 364 + }, + { + "epoch": 0.8800482218203737, + "grad_norm": 0.18460078537464142, + "learning_rate": 6.836461126005363e-07, + "logits/chosen": -0.8011474609375, + "logits/rejected": -0.77813720703125, + "logps/chosen": -2.028651714324951, + "logps/rejected": -3.184272527694702, + "loss": -0.1296, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19649113714694977, + "rewards/margins": 0.12956087291240692, + "rewards/rejected": 0.06693026423454285, + "step": 365 + }, + { + "epoch": 0.8824593128390597, + "grad_norm": 0.11158600449562073, + "learning_rate": 6.702412868632709e-07, + "logits/chosen": -0.537017822265625, + "logits/rejected": -0.615234375, + "logps/chosen": -2.300839900970459, + "logps/rejected": -3.0153896808624268, + "loss": -0.1314, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21872344613075256, + "rewards/margins": 0.13141807913780212, + "rewards/rejected": 0.08730537444353104, + "step": 366 + }, + { + "epoch": 0.8848704038577456, + "grad_norm": 0.3159348964691162, + "learning_rate": 6.568364611260054e-07, + "logits/chosen": -0.5802001953125, + "logits/rejected": -0.6844482421875, + "logps/chosen": -1.9563806056976318, + "logps/rejected": -2.8836090564727783, + "loss": -0.1088, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23880364000797272, + "rewards/margins": 0.10880030691623688, + "rewards/rejected": 0.13000334799289703, + "step": 367 + }, + { + "epoch": 0.8872814948764316, + "grad_norm": 0.17917028069496155, + "learning_rate": 6.4343163538874e-07, + "logits/chosen": -0.83807373046875, + "logits/rejected": -0.387908935546875, + "logps/chosen": -1.787778615951538, + "logps/rejected": -2.983999013900757, + "loss": -0.1358, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20905858278274536, + "rewards/margins": 0.13576525449752808, + "rewards/rejected": 0.07329332828521729, + "step": 368 + }, + { + "epoch": 0.8896925858951176, + "grad_norm": 0.8127462863922119, + "learning_rate": 6.300268096514746e-07, + "logits/chosen": -0.67694091796875, + "logits/rejected": -0.2607421875, + "logps/chosen": -2.443679094314575, + "logps/rejected": -2.846816062927246, + "loss": -0.1211, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2383415400981903, + "rewards/margins": 0.12107516080141068, + "rewards/rejected": 0.11726637184619904, + "step": 369 + }, + { + "epoch": 0.8921036769138035, + "grad_norm": 0.45784062147140503, + "learning_rate": 6.166219839142092e-07, + "logits/chosen": -0.596343994140625, + "logits/rejected": -0.48443603515625, + "logps/chosen": -1.6567652225494385, + "logps/rejected": -2.5580644607543945, + "loss": -0.0927, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.25336623191833496, + "rewards/margins": 0.09265391528606415, + "rewards/rejected": 0.16071230173110962, + "step": 370 + }, + { + "epoch": 0.8945147679324894, + "grad_norm": 0.16701728105545044, + "learning_rate": 6.032171581769437e-07, + "logits/chosen": -0.617431640625, + "logits/rejected": -0.76031494140625, + "logps/chosen": -1.9505524635314941, + "logps/rejected": -3.2255501747131348, + "loss": -0.1219, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18996216356754303, + "rewards/margins": 0.12192671000957489, + "rewards/rejected": 0.06803545355796814, + "step": 371 + }, + { + "epoch": 0.8969258589511754, + "grad_norm": 0.24312333762645721, + "learning_rate": 5.898123324396783e-07, + "logits/chosen": -0.7477149963378906, + "logits/rejected": -0.7543106079101562, + "logps/chosen": -2.029766082763672, + "logps/rejected": -2.523604393005371, + "loss": -0.052, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.24739180505275726, + "rewards/margins": 0.051989682018756866, + "rewards/rejected": 0.1954021155834198, + "step": 372 + }, + { + "epoch": 0.8993369499698614, + "grad_norm": 0.2546791434288025, + "learning_rate": 5.764075067024129e-07, + "logits/chosen": -0.76751708984375, + "logits/rejected": -0.46405029296875, + "logps/chosen": -1.5008107423782349, + "logps/rejected": -2.0352025032043457, + "loss": -0.0602, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3135339021682739, + "rewards/margins": 0.06018254905939102, + "rewards/rejected": 0.2533513307571411, + "step": 373 + }, + { + "epoch": 0.9017480409885473, + "grad_norm": 0.09645721316337585, + "learning_rate": 5.630026809651475e-07, + "logits/chosen": -0.4677734375, + "logits/rejected": -0.711151123046875, + "logps/chosen": -2.6174230575561523, + "logps/rejected": -3.0249276161193848, + "loss": -0.0902, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21756963431835175, + "rewards/margins": 0.09020111709833145, + "rewards/rejected": 0.1273685246706009, + "step": 374 + }, + { + "epoch": 0.9041591320072333, + "grad_norm": 0.3882448673248291, + "learning_rate": 5.495978552278821e-07, + "logits/chosen": -0.84515380859375, + "logits/rejected": -0.5882568359375, + "logps/chosen": -2.2469420433044434, + "logps/rejected": -2.8480148315429688, + "loss": -0.0631, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18355120718479156, + "rewards/margins": 0.0631059855222702, + "rewards/rejected": 0.12044522166252136, + "step": 375 + }, + { + "epoch": 0.9065702230259193, + "grad_norm": 0.1412249356508255, + "learning_rate": 5.361930294906167e-07, + "logits/chosen": -0.473724365234375, + "logits/rejected": -0.41229248046875, + "logps/chosen": -2.6998796463012695, + "logps/rejected": -3.3412230014801025, + "loss": -0.0744, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15408103168010712, + "rewards/margins": 0.07438866794109344, + "rewards/rejected": 0.07969234883785248, + "step": 376 + }, + { + "epoch": 0.9089813140446051, + "grad_norm": 0.20396974682807922, + "learning_rate": 5.227882037533513e-07, + "logits/chosen": -0.859619140625, + "logits/rejected": -0.53076171875, + "logps/chosen": -2.203944206237793, + "logps/rejected": -3.39310359954834, + "loss": -0.1483, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.18894213438034058, + "rewards/margins": 0.14831510186195374, + "rewards/rejected": 0.040627021342515945, + "step": 377 + }, + { + "epoch": 0.9113924050632911, + "grad_norm": 0.250082790851593, + "learning_rate": 5.093833780160858e-07, + "logits/chosen": -0.85601806640625, + "logits/rejected": -0.7227783203125, + "logps/chosen": -2.5119528770446777, + "logps/rejected": -3.0417776107788086, + "loss": -0.1028, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.22248879075050354, + "rewards/margins": 0.10282251238822937, + "rewards/rejected": 0.11966629326343536, + "step": 378 + }, + { + "epoch": 0.9138034960819771, + "grad_norm": 0.38608309626579285, + "learning_rate": 4.959785522788204e-07, + "logits/chosen": -0.6385498046875, + "logits/rejected": -0.7056884765625, + "logps/chosen": -1.922960877418518, + "logps/rejected": -2.9718003273010254, + "loss": -0.0979, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20073366165161133, + "rewards/margins": 0.09787726402282715, + "rewards/rejected": 0.10285639017820358, + "step": 379 + }, + { + "epoch": 0.9162145871006631, + "grad_norm": 0.1388954073190689, + "learning_rate": 4.82573726541555e-07, + "logits/chosen": -0.782958984375, + "logits/rejected": -0.6474609375, + "logps/chosen": -2.078749656677246, + "logps/rejected": -2.8820574283599854, + "loss": -0.1111, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2187328338623047, + "rewards/margins": 0.11110672354698181, + "rewards/rejected": 0.10762612521648407, + "step": 380 + }, + { + "epoch": 0.918625678119349, + "grad_norm": 0.2792143225669861, + "learning_rate": 4.6916890080428954e-07, + "logits/chosen": -0.68646240234375, + "logits/rejected": -0.8287353515625, + "logps/chosen": -2.4176182746887207, + "logps/rejected": -3.2421364784240723, + "loss": -0.0815, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14206507802009583, + "rewards/margins": 0.08154135942459106, + "rewards/rejected": 0.06052371859550476, + "step": 381 + }, + { + "epoch": 0.921036769138035, + "grad_norm": 0.14331650733947754, + "learning_rate": 4.557640750670242e-07, + "logits/chosen": -0.8070068359375, + "logits/rejected": -0.755096435546875, + "logps/chosen": -1.9265367984771729, + "logps/rejected": -2.9522743225097656, + "loss": -0.1121, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.230011448264122, + "rewards/margins": 0.11208435148000717, + "rewards/rejected": 0.11792707443237305, + "step": 382 + }, + { + "epoch": 0.9234478601567209, + "grad_norm": 0.5208033323287964, + "learning_rate": 4.4235924932975874e-07, + "logits/chosen": -0.7991943359375, + "logits/rejected": -0.31329345703125, + "logps/chosen": -1.5004363059997559, + "logps/rejected": -2.7850728034973145, + "loss": -0.2007, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3518799841403961, + "rewards/margins": 0.2006998360157013, + "rewards/rejected": 0.15118016302585602, + "step": 383 + }, + { + "epoch": 0.9258589511754068, + "grad_norm": 0.22296123206615448, + "learning_rate": 4.2895442359249334e-07, + "logits/chosen": -0.2904052734375, + "logits/rejected": -0.5080642700195312, + "logps/chosen": -1.890813946723938, + "logps/rejected": -2.844127655029297, + "loss": -0.0948, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21389998495578766, + "rewards/margins": 0.0947994664311409, + "rewards/rejected": 0.11910051852464676, + "step": 384 + }, + { + "epoch": 0.9282700421940928, + "grad_norm": 0.3174738585948944, + "learning_rate": 4.155495978552279e-07, + "logits/chosen": -0.652313232421875, + "logits/rejected": -0.671630859375, + "logps/chosen": -2.2779366970062256, + "logps/rejected": -3.0082509517669678, + "loss": -0.1049, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18445079028606415, + "rewards/margins": 0.10485711693763733, + "rewards/rejected": 0.07959368079900742, + "step": 385 + }, + { + "epoch": 0.9306811332127788, + "grad_norm": 0.13899242877960205, + "learning_rate": 4.021447721179625e-07, + "logits/chosen": -0.70849609375, + "logits/rejected": -0.70794677734375, + "logps/chosen": -1.9691016674041748, + "logps/rejected": -3.1534197330474854, + "loss": -0.0975, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.18175840377807617, + "rewards/margins": 0.09753191471099854, + "rewards/rejected": 0.08422648906707764, + "step": 386 + }, + { + "epoch": 0.9330922242314648, + "grad_norm": 0.3441121280193329, + "learning_rate": 3.8873994638069704e-07, + "logits/chosen": -0.576690673828125, + "logits/rejected": -0.625762939453125, + "logps/chosen": -1.7553675174713135, + "logps/rejected": -2.7304201126098633, + "loss": -0.1106, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2128778100013733, + "rewards/margins": 0.11058033257722855, + "rewards/rejected": 0.10229746997356415, + "step": 387 + }, + { + "epoch": 0.9355033152501507, + "grad_norm": 0.6303390264511108, + "learning_rate": 3.7533512064343164e-07, + "logits/chosen": -0.8516845703125, + "logits/rejected": -0.5465087890625, + "logps/chosen": -2.1568257808685303, + "logps/rejected": -3.1136550903320312, + "loss": -0.1011, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.19368943572044373, + "rewards/margins": 0.10105198621749878, + "rewards/rejected": 0.09263744205236435, + "step": 388 + }, + { + "epoch": 0.9379144062688366, + "grad_norm": 0.4192253649234772, + "learning_rate": 3.619302949061663e-07, + "logits/chosen": -0.984130859375, + "logits/rejected": -0.9190673828125, + "logps/chosen": -2.7901992797851562, + "logps/rejected": -3.138003349304199, + "loss": -0.0652, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.1714577078819275, + "rewards/margins": 0.06517384946346283, + "rewards/rejected": 0.10628385841846466, + "step": 389 + }, + { + "epoch": 0.9403254972875226, + "grad_norm": 0.30952024459838867, + "learning_rate": 3.4852546916890084e-07, + "logits/chosen": -0.665283203125, + "logits/rejected": -0.424072265625, + "logps/chosen": -1.1938724517822266, + "logps/rejected": -2.3507652282714844, + "loss": -0.1703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36022892594337463, + "rewards/margins": 0.17034617066383362, + "rewards/rejected": 0.18988275527954102, + "step": 390 + }, + { + "epoch": 0.9427365883062085, + "grad_norm": 0.3559146225452423, + "learning_rate": 3.3512064343163545e-07, + "logits/chosen": -0.702880859375, + "logits/rejected": -0.72674560546875, + "logps/chosen": -2.6314926147460938, + "logps/rejected": -3.2541961669921875, + "loss": -0.0778, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.15094122290611267, + "rewards/margins": 0.07783782482147217, + "rewards/rejected": 0.0731033906340599, + "step": 391 + }, + { + "epoch": 0.9451476793248945, + "grad_norm": 0.3946284055709839, + "learning_rate": 3.2171581769437e-07, + "logits/chosen": -0.7203216552734375, + "logits/rejected": -0.8667755126953125, + "logps/chosen": -1.8220577239990234, + "logps/rejected": -3.0527234077453613, + "loss": -0.1217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19296303391456604, + "rewards/margins": 0.12171687185764313, + "rewards/rejected": 0.0712461769580841, + "step": 392 + }, + { + "epoch": 0.9475587703435805, + "grad_norm": 0.15024717152118683, + "learning_rate": 3.083109919571046e-07, + "logits/chosen": -0.6586227416992188, + "logits/rejected": -0.4817962646484375, + "logps/chosen": -1.9061774015426636, + "logps/rejected": -2.863325834274292, + "loss": -0.1145, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19597814977169037, + "rewards/margins": 0.11450601369142532, + "rewards/rejected": 0.08147213608026505, + "step": 393 + }, + { + "epoch": 0.9499698613622665, + "grad_norm": 0.21850810945034027, + "learning_rate": 2.9490616621983914e-07, + "logits/chosen": -0.2873382568359375, + "logits/rejected": -0.47409820556640625, + "logps/chosen": -1.8596577644348145, + "logps/rejected": -2.84549617767334, + "loss": -0.0926, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21761712431907654, + "rewards/margins": 0.09261301904916763, + "rewards/rejected": 0.1250040978193283, + "step": 394 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.19099746644496918, + "learning_rate": 2.8150134048257374e-07, + "logits/chosen": -0.5827713012695312, + "logits/rejected": -0.5249900817871094, + "logps/chosen": -1.9461772441864014, + "logps/rejected": -2.7794950008392334, + "loss": -0.1245, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.25487175583839417, + "rewards/margins": 0.1244988888502121, + "rewards/rejected": 0.13037288188934326, + "step": 395 + }, + { + "epoch": 0.9547920433996383, + "grad_norm": 0.13578198850154877, + "learning_rate": 2.6809651474530835e-07, + "logits/chosen": -0.7562255859375, + "logits/rejected": -0.61749267578125, + "logps/chosen": -2.6253232955932617, + "logps/rejected": -3.218390464782715, + "loss": -0.0705, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.15692347288131714, + "rewards/margins": 0.07054620236158371, + "rewards/rejected": 0.08637728542089462, + "step": 396 + }, + { + "epoch": 0.9572031344183243, + "grad_norm": 0.48070481419563293, + "learning_rate": 2.546916890080429e-07, + "logits/chosen": -0.5872154235839844, + "logits/rejected": -0.822418212890625, + "logps/chosen": -1.757201910018921, + "logps/rejected": -2.6169118881225586, + "loss": -0.1148, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24008452892303467, + "rewards/margins": 0.11483413726091385, + "rewards/rejected": 0.12525039911270142, + "step": 397 + }, + { + "epoch": 0.9596142254370102, + "grad_norm": 0.13527289032936096, + "learning_rate": 2.412868632707775e-07, + "logits/chosen": -0.8331298828125, + "logits/rejected": -0.6849288940429688, + "logps/chosen": -2.3393187522888184, + "logps/rejected": -3.325896739959717, + "loss": -0.1248, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1991003304719925, + "rewards/margins": 0.12477909028530121, + "rewards/rejected": 0.07432122528553009, + "step": 398 + }, + { + "epoch": 0.9620253164556962, + "grad_norm": 0.4991070330142975, + "learning_rate": 2.278820375335121e-07, + "logits/chosen": -0.744598388671875, + "logits/rejected": -0.8030853271484375, + "logps/chosen": -2.3529510498046875, + "logps/rejected": -3.100037097930908, + "loss": -0.0877, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1853184998035431, + "rewards/margins": 0.08771476149559021, + "rewards/rejected": 0.09760373830795288, + "step": 399 + }, + { + "epoch": 0.9644364074743822, + "grad_norm": 0.28707462549209595, + "learning_rate": 2.1447721179624667e-07, + "logits/chosen": -0.47296142578125, + "logits/rejected": -0.4329681396484375, + "logps/chosen": -1.794015645980835, + "logps/rejected": -2.4885170459747314, + "loss": -0.0949, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2280758023262024, + "rewards/margins": 0.09485641866922379, + "rewards/rejected": 0.1332193911075592, + "step": 400 + }, + { + "epoch": 0.9668474984930681, + "grad_norm": 0.457259863615036, + "learning_rate": 2.0107238605898125e-07, + "logits/chosen": -0.6519775390625, + "logits/rejected": -0.484130859375, + "logps/chosen": -1.5918774604797363, + "logps/rejected": -2.6084401607513428, + "loss": -0.101, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2561843991279602, + "rewards/margins": 0.10101480782032013, + "rewards/rejected": 0.15516959130764008, + "step": 401 + }, + { + "epoch": 0.969258589511754, + "grad_norm": 0.18163321912288666, + "learning_rate": 1.8766756032171582e-07, + "logits/chosen": -0.8362579345703125, + "logits/rejected": -1.0521240234375, + "logps/chosen": -2.4465787410736084, + "logps/rejected": -3.3151872158050537, + "loss": -0.0983, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1536177396774292, + "rewards/margins": 0.09829680621623993, + "rewards/rejected": 0.05532093346118927, + "step": 402 + }, + { + "epoch": 0.97166968053044, + "grad_norm": 0.11386421322822571, + "learning_rate": 1.7426273458445042e-07, + "logits/chosen": -0.8059768676757812, + "logits/rejected": -0.6087646484375, + "logps/chosen": -2.302320957183838, + "logps/rejected": -3.2788963317871094, + "loss": -0.1136, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18182218074798584, + "rewards/margins": 0.11363334953784943, + "rewards/rejected": 0.06818882375955582, + "step": 403 + }, + { + "epoch": 0.974080771549126, + "grad_norm": 0.44338279962539673, + "learning_rate": 1.60857908847185e-07, + "logits/chosen": -0.674041748046875, + "logits/rejected": -0.6409912109375, + "logps/chosen": -1.7187930345535278, + "logps/rejected": -2.5694637298583984, + "loss": -0.1264, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2568460702896118, + "rewards/margins": 0.12642240524291992, + "rewards/rejected": 0.1304236501455307, + "step": 404 + }, + { + "epoch": 0.976491862567812, + "grad_norm": 0.21830198168754578, + "learning_rate": 1.4745308310991957e-07, + "logits/chosen": -0.866943359375, + "logits/rejected": -0.568603515625, + "logps/chosen": -2.568779468536377, + "logps/rejected": -3.350574016571045, + "loss": -0.1063, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15692029893398285, + "rewards/margins": 0.10625879466533661, + "rewards/rejected": 0.05066150799393654, + "step": 405 + }, + { + "epoch": 0.9789029535864979, + "grad_norm": 0.12280982732772827, + "learning_rate": 1.3404825737265417e-07, + "logits/chosen": -0.7132568359375, + "logits/rejected": -0.4779052734375, + "logps/chosen": -2.23645281791687, + "logps/rejected": -3.3312318325042725, + "loss": -0.1267, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17549726366996765, + "rewards/margins": 0.12666451930999756, + "rewards/rejected": 0.048832736909389496, + "step": 406 + }, + { + "epoch": 0.9813140446051839, + "grad_norm": 0.33424296975135803, + "learning_rate": 1.2064343163538875e-07, + "logits/chosen": -0.8260498046875, + "logits/rejected": -0.8294677734375, + "logps/chosen": -1.5432744026184082, + "logps/rejected": -2.5166401863098145, + "loss": -0.1231, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.29473647475242615, + "rewards/margins": 0.12308693677186966, + "rewards/rejected": 0.17164954543113708, + "step": 407 + }, + { + "epoch": 0.9837251356238698, + "grad_norm": 0.35786840319633484, + "learning_rate": 1.0723860589812334e-07, + "logits/chosen": -0.370391845703125, + "logits/rejected": -0.1624755859375, + "logps/chosen": -2.3019862174987793, + "logps/rejected": -2.7111849784851074, + "loss": -0.0591, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.18332113325595856, + "rewards/margins": 0.0591268390417099, + "rewards/rejected": 0.12419429421424866, + "step": 408 + }, + { + "epoch": 0.9861362266425557, + "grad_norm": 0.15782079100608826, + "learning_rate": 9.383378016085791e-08, + "logits/chosen": -0.8964691162109375, + "logits/rejected": -0.742095947265625, + "logps/chosen": -1.6657278537750244, + "logps/rejected": -2.7062299251556396, + "loss": -0.1231, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2638601064682007, + "rewards/margins": 0.1231328621506691, + "rewards/rejected": 0.140727236866951, + "step": 409 + }, + { + "epoch": 0.9885473176612417, + "grad_norm": 0.31421858072280884, + "learning_rate": 8.04289544235925e-08, + "logits/chosen": -0.582672119140625, + "logits/rejected": -0.33551025390625, + "logps/chosen": -1.4744884967803955, + "logps/rejected": -2.814042568206787, + "loss": -0.1417, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26842617988586426, + "rewards/margins": 0.14174434542655945, + "rewards/rejected": 0.126681849360466, + "step": 410 + }, + { + "epoch": 0.9909584086799277, + "grad_norm": 0.6276865005493164, + "learning_rate": 6.702412868632709e-08, + "logits/chosen": -0.559173583984375, + "logits/rejected": -0.6410598754882812, + "logps/chosen": -2.050459146499634, + "logps/rejected": -3.0755696296691895, + "loss": -0.1167, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.2281588762998581, + "rewards/margins": 0.11667966842651367, + "rewards/rejected": 0.11147919297218323, + "step": 411 + }, + { + "epoch": 0.9933694996986137, + "grad_norm": 0.2700299024581909, + "learning_rate": 5.361930294906167e-08, + "logits/chosen": -0.823974609375, + "logits/rejected": -0.64111328125, + "logps/chosen": -1.7341108322143555, + "logps/rejected": -3.0539872646331787, + "loss": -0.1569, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23255713284015656, + "rewards/margins": 0.1569197177886963, + "rewards/rejected": 0.07563740015029907, + "step": 412 + }, + { + "epoch": 0.9957805907172996, + "grad_norm": 0.22828982770442963, + "learning_rate": 4.021447721179625e-08, + "logits/chosen": -0.7105712890625, + "logits/rejected": -0.6495590209960938, + "logps/chosen": -1.727156162261963, + "logps/rejected": -2.8687305450439453, + "loss": -0.1414, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22719953954219818, + "rewards/margins": 0.14144930243492126, + "rewards/rejected": 0.08575025200843811, + "step": 413 + }, + { + "epoch": 0.9981916817359855, + "grad_norm": 0.22799554467201233, + "learning_rate": 2.6809651474530834e-08, + "logits/chosen": -0.39013671875, + "logits/rejected": -0.277984619140625, + "logps/chosen": -1.4473698139190674, + "logps/rejected": -2.3746933937072754, + "loss": -0.136, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3296416699886322, + "rewards/margins": 0.13600146770477295, + "rewards/rejected": 0.19364023208618164, + "step": 414 + }, + { + "epoch": 1.0, + "grad_norm": 0.4860399067401886, + "learning_rate": 1.3404825737265417e-08, + "logits/chosen": -0.5096028447151184, + "logits/rejected": -0.7128499150276184, + "logps/chosen": -1.7511554956436157, + "logps/rejected": -2.9001548290252686, + "loss": -0.1274, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23462635278701782, + "rewards/margins": 0.12739022076129913, + "rewards/rejected": 0.1072361171245575, + "step": 415 + } + ], + "logging_steps": 1, + "max_steps": 415, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}