{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0007099751508697, "eval_steps": 500, "global_step": 1409, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007099751508697196, "grad_norm": 0.193258345245447, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": 55250422.69090909, "logits/rejected": 165588823.67123288, "logps/chosen": -186.03636363636363, "logps/rejected": -308.82191780821915, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0014199503017394391, "grad_norm": 0.18556685201750664, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "logits/chosen": 127453674.36619718, "logits/rejected": 178662633.54385966, "logps/chosen": -276.50704225352115, "logps/rejected": -310.4561403508772, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.002129925452609159, "grad_norm": 0.17716453494234405, "kl": 0.1640625, "learning_rate": 1.0000000000000001e-07, "logits/chosen": 116355778.20689656, "logits/rejected": 145482430.17142856, "logps/chosen": -211.86206896551724, "logps/rejected": -307.42857142857144, "loss": 0.4969, "rewards/chosen": 0.013730805495689655, "rewards/margins": 0.023649890317118227, "rewards/rejected": -0.009919084821428571, "step": 3 }, { "epoch": 0.0028399006034788782, "grad_norm": 0.16219875866186287, "kl": 0.1484375, "learning_rate": 1.5000000000000002e-07, "logits/chosen": 138666232.24242425, "logits/rejected": 103639898.83870968, "logps/chosen": -184.72727272727272, "logps/rejected": -289.80645161290323, "loss": 0.5006, "rewards/chosen": 0.00439453125, "rewards/margins": -0.000444965977822581, "rewards/rejected": 0.004839497227822581, "step": 4 }, { "epoch": 0.003549875754348598, "grad_norm": 0.17450606223719384, "kl": 0.08984375, "learning_rate": 2.0000000000000002e-07, "logits/chosen": 116428093.79310344, "logits/rejected": 123312537.6, "logps/chosen": -214.89655172413794, "logps/rejected": -291.8857142857143, "loss": 0.501, "rewards/chosen": -0.0020541487068965517, "rewards/margins": -0.0047606219211822655, "rewards/rejected": 0.002706473214285714, "step": 5 }, { "epoch": 0.004259850905218318, "grad_norm": 0.15601944144213353, "kl": 0.12109375, "learning_rate": 2.5000000000000004e-07, "logits/chosen": 95530792.42105263, "logits/rejected": 163658515.69230768, "logps/chosen": -243.78947368421052, "logps/rejected": -281.84615384615387, "loss": 0.5009, "rewards/chosen": 0.007484837582236842, "rewards/margins": -0.0032244848336285417, "rewards/rejected": 0.010709322415865384, "step": 6 }, { "epoch": 0.004969826056088037, "grad_norm": 0.16525719026477595, "kl": 0.009765625, "learning_rate": 3.0000000000000004e-07, "logits/chosen": 160563200.0, "logits/rejected": 47054848.0, "logps/chosen": -218.75, "logps/rejected": -275.0, "loss": 0.4991, "rewards/chosen": 0.009368896484375, "rewards/margins": 0.010906219482421875, "rewards/rejected": -0.001537322998046875, "step": 7 }, { "epoch": 0.0056798012069577564, "grad_norm": 0.15925762446122826, "kl": 0.07421875, "learning_rate": 3.5000000000000004e-07, "logits/chosen": 94507140.12903225, "logits/rejected": 120109614.54545455, "logps/chosen": -221.41935483870967, "logps/rejected": -271.5151515151515, "loss": 0.5008, "rewards/chosen": -0.012776036416330645, "rewards/margins": -0.01691902884057307, "rewards/rejected": 0.004142992424242424, "step": 8 }, { "epoch": 0.006389776357827476, "grad_norm": 0.16934118843602133, "kl": 0.15234375, "learning_rate": 4.0000000000000003e-07, "logits/chosen": 147040314.5142857, "logits/rejected": 128866374.62068966, "logps/chosen": -246.17142857142858, "logps/rejected": -277.7931034482759, "loss": 0.5025, "rewards/chosen": -0.01484375, "rewards/margins": -0.02411267510775862, "rewards/rejected": 0.00926892510775862, "step": 9 }, { "epoch": 0.007099751508697196, "grad_norm": 0.17508542564356352, "kl": 0.115234375, "learning_rate": 4.5000000000000003e-07, "logits/chosen": 110471514.58461538, "logits/rejected": 149796571.42857143, "logps/chosen": -215.13846153846154, "logps/rejected": -282.1587301587302, "loss": 0.4998, "rewards/chosen": -0.014008037860576923, "rewards/margins": 0.009522468091804029, "rewards/rejected": -0.023530505952380952, "step": 10 }, { "epoch": 0.007809726659566915, "grad_norm": 0.17190842124330508, "kl": 0.076171875, "learning_rate": 5.000000000000001e-07, "logits/chosen": 178134558.11764705, "logits/rejected": 103669213.86666666, "logps/chosen": -259.29411764705884, "logps/rejected": -285.3333333333333, "loss": 0.5012, "rewards/chosen": -0.005916819852941176, "rewards/margins": -0.02065070657169118, "rewards/rejected": 0.01473388671875, "step": 11 }, { "epoch": 0.008519701810436636, "grad_norm": 0.16059148908937998, "kl": 0.00390625, "learning_rate": 5.5e-07, "logits/chosen": 156770706.8852459, "logits/rejected": 87078759.1641791, "logps/chosen": -238.1639344262295, "logps/rejected": -265.3134328358209, "loss": 0.5014, "rewards/chosen": -0.017410028176229508, "rewards/margins": -0.01308290888984891, "rewards/rejected": -0.004327119286380597, "step": 12 }, { "epoch": 0.009229676961306355, "grad_norm": 0.16432598417951516, "kl": 0.0, "learning_rate": 6.000000000000001e-07, "logits/chosen": 166793489.06666666, "logits/rejected": 88450469.64705883, "logps/chosen": -301.6, "logps/rejected": -237.41176470588235, "loss": 0.4985, "rewards/chosen": 0.0028076171875, "rewards/margins": 0.009151683134191176, "rewards/rejected": -0.006344065946691176, "step": 13 }, { "epoch": 0.009939652112176074, "grad_norm": 0.133698619917347, "kl": 0.05078125, "learning_rate": 6.5e-07, "logits/chosen": 135657563.70149255, "logits/rejected": 64805434.75409836, "logps/chosen": -175.88059701492537, "logps/rejected": -240.52459016393442, "loss": 0.5005, "rewards/chosen": 0.0016761893656716419, "rewards/margins": -0.0037989643228529477, "rewards/rejected": 0.00547515368852459, "step": 14 }, { "epoch": 0.010649627263045794, "grad_norm": 0.16477366931576812, "kl": 0.08203125, "learning_rate": 7.000000000000001e-07, "logits/chosen": 134332640.43835616, "logits/rejected": 94562490.18181819, "logps/chosen": -261.26027397260276, "logps/rejected": -231.27272727272728, "loss": 0.5047, "rewards/chosen": -0.002735712756849315, "rewards/margins": -0.021485712756849313, "rewards/rejected": 0.01875, "step": 15 }, { "epoch": 0.011359602413915513, "grad_norm": 0.20587707731788926, "kl": 0.025390625, "learning_rate": 7.5e-07, "logits/chosen": 144104301.7142857, "logits/rejected": 152509553.7777778, "logps/chosen": -258.2857142857143, "logps/rejected": -297.55555555555554, "loss": 0.5032, "rewards/chosen": -0.015546526227678572, "rewards/margins": -0.02894713386656746, "rewards/rejected": 0.013400607638888888, "step": 16 }, { "epoch": 0.012069577564785232, "grad_norm": 0.14782781454074068, "kl": 0.064453125, "learning_rate": 8.000000000000001e-07, "logits/chosen": 79441369.79104477, "logits/rejected": 150719907.67213115, "logps/chosen": -164.53731343283582, "logps/rejected": -290.0983606557377, "loss": 0.5003, "rewards/chosen": 0.002171758395522388, "rewards/margins": 0.004125383683688371, "rewards/rejected": -0.0019536252881659838, "step": 17 }, { "epoch": 0.012779552715654952, "grad_norm": 0.2012841005110524, "kl": 0.15625, "learning_rate": 8.500000000000001e-07, "logits/chosen": 132362555.07692307, "logits/rejected": 139626172.63157895, "logps/chosen": -276.0, "logps/rejected": -318.3157894736842, "loss": 0.4995, "rewards/chosen": 0.005333533653846154, "rewards/margins": 0.005410630693319838, "rewards/rejected": -7.70970394736842e-05, "step": 18 }, { "epoch": 0.013489527866524671, "grad_norm": 0.15943475861207523, "kl": 0.08203125, "learning_rate": 9.000000000000001e-07, "logits/chosen": 134502087.59322035, "logits/rejected": 79144692.86956522, "logps/chosen": -177.6271186440678, "logps/rejected": -273.6231884057971, "loss": 0.4968, "rewards/chosen": 0.008058709613347457, "rewards/margins": 0.021865038870593833, "rewards/rejected": -0.013806329257246376, "step": 19 }, { "epoch": 0.014199503017394392, "grad_norm": 0.15546208246853152, "kl": 0.0, "learning_rate": 9.500000000000001e-07, "logits/chosen": 217038587.93650794, "logits/rejected": 63229132.8, "logps/chosen": -255.4920634920635, "logps/rejected": -234.33846153846153, "loss": 0.4991, "rewards/chosen": 0.010817754836309524, "rewards/margins": 0.006937796903617216, "rewards/rejected": 0.003879957932692308, "step": 20 }, { "epoch": 0.014909478168264111, "grad_norm": 0.18205448063586435, "kl": 0.029296875, "learning_rate": 1.0000000000000002e-06, "logits/chosen": 182482617.50724638, "logits/rejected": 142179796.6101695, "logps/chosen": -285.2173913043478, "logps/rejected": -261.1525423728813, "loss": 0.5022, "rewards/chosen": -0.005661231884057971, "rewards/margins": -0.016494454871346105, "rewards/rejected": 0.010833222987288135, "step": 21 }, { "epoch": 0.01561945331913383, "grad_norm": 0.16098629518565086, "kl": 0.0234375, "learning_rate": 1.0500000000000001e-06, "logits/chosen": 182655174.19354838, "logits/rejected": 110513555.39393939, "logps/chosen": -254.96774193548387, "logps/rejected": -235.15151515151516, "loss": 0.4996, "rewards/chosen": -0.0006457913306451613, "rewards/margins": 0.001980569938294233, "rewards/rejected": -0.002626361268939394, "step": 22 }, { "epoch": 0.01632942847000355, "grad_norm": 0.15301444054614843, "kl": 0.14453125, "learning_rate": 1.1e-06, "logits/chosen": 127110712.8888889, "logits/rejected": 70104795.42857143, "logps/chosen": -257.3333333333333, "logps/rejected": -290.2857142857143, "loss": 0.496, "rewards/chosen": 0.0137939453125, "rewards/margins": 0.032139369419642856, "rewards/rejected": -0.018345424107142856, "step": 23 }, { "epoch": 0.01703940362087327, "grad_norm": 0.14449593784612436, "kl": 0.021484375, "learning_rate": 1.1500000000000002e-06, "logits/chosen": 147379164.68965518, "logits/rejected": 87361360.45714286, "logps/chosen": -235.58620689655172, "logps/rejected": -214.17142857142858, "loss": 0.5012, "rewards/chosen": -0.010735873518318966, "rewards/margins": -0.011545025304033252, "rewards/rejected": 0.0008091517857142858, "step": 24 }, { "epoch": 0.01774937877174299, "grad_norm": 0.14072830121891264, "kl": 0.099609375, "learning_rate": 1.2000000000000002e-06, "logits/chosen": 151481240.11594203, "logits/rejected": 108696454.50847457, "logps/chosen": -188.8695652173913, "logps/rejected": -258.4406779661017, "loss": 0.4996, "rewards/chosen": 0.006453804347826087, "rewards/margins": 0.0013392651529108325, "rewards/rejected": 0.0051145391949152545, "step": 25 }, { "epoch": 0.01845935392261271, "grad_norm": 0.1574771162963366, "kl": 0.0048828125, "learning_rate": 1.25e-06, "logits/chosen": 160315619.55555555, "logits/rejected": 101562075.42857143, "logps/chosen": -243.33333333333334, "logps/rejected": -264.0, "loss": 0.5017, "rewards/chosen": 0.011691623263888888, "rewards/margins": -0.02002921937003968, "rewards/rejected": 0.03172084263392857, "step": 26 }, { "epoch": 0.019169329073482427, "grad_norm": 0.17520394689846938, "kl": 0.0, "learning_rate": 1.3e-06, "logits/chosen": 118106274.53968254, "logits/rejected": 140154281.35384616, "logps/chosen": -244.06349206349208, "logps/rejected": -282.5846153846154, "loss": 0.5008, "rewards/chosen": -0.006053137400793651, "rewards/margins": -0.004152596535409035, "rewards/rejected": -0.0019005408653846154, "step": 27 }, { "epoch": 0.01987930422435215, "grad_norm": 0.1806156468368268, "kl": 0.01171875, "learning_rate": 1.3500000000000002e-06, "logits/chosen": 108459965.93548387, "logits/rejected": 183405474.9090909, "logps/chosen": -258.06451612903226, "logps/rejected": -292.3636363636364, "loss": 0.5006, "rewards/chosen": -0.014703566028225807, "rewards/margins": -0.0046512607488697465, "rewards/rejected": -0.01005230527935606, "step": 28 }, { "epoch": 0.020589279375221866, "grad_norm": 0.15025205546260437, "kl": 0.0, "learning_rate": 1.4000000000000001e-06, "logits/chosen": 123938729.46478873, "logits/rejected": 119795208.98245615, "logps/chosen": -218.59154929577466, "logps/rejected": -285.1929824561403, "loss": 0.4993, "rewards/chosen": 0.003964706205985915, "rewards/margins": 0.005279639045898196, "rewards/rejected": -0.0013149328399122808, "step": 29 }, { "epoch": 0.021299254526091587, "grad_norm": 0.14862686066877295, "kl": 0.0869140625, "learning_rate": 1.45e-06, "logits/chosen": 157258437.97333333, "logits/rejected": 48511478.33962264, "logps/chosen": -263.68, "logps/rejected": -245.1320754716981, "loss": 0.5015, "rewards/chosen": -0.001435546875, "rewards/margins": -0.011090617629716982, "rewards/rejected": 0.009655070754716982, "step": 30 }, { "epoch": 0.022009229676961305, "grad_norm": 0.17903654315320508, "kl": 0.169921875, "learning_rate": 1.5e-06, "logits/chosen": 148493078.45614034, "logits/rejected": 99599951.32394366, "logps/chosen": -252.6315789473684, "logps/rejected": -274.0281690140845, "loss": 0.5, "rewards/chosen": 0.012344092653508772, "rewards/margins": 0.0003450967248115888, "rewards/rejected": 0.011998995928697184, "step": 31 }, { "epoch": 0.022719204827831026, "grad_norm": 0.17513729672618147, "kl": 0.0, "learning_rate": 1.5500000000000002e-06, "logits/chosen": 147582628.88135594, "logits/rejected": 114401161.27536231, "logps/chosen": -225.6271186440678, "logps/rejected": -279.18840579710144, "loss": 0.5008, "rewards/chosen": -0.024583719544491525, "rewards/margins": -0.011449661573477033, "rewards/rejected": -0.013134057971014492, "step": 32 }, { "epoch": 0.023429179978700747, "grad_norm": 0.16534778625341848, "kl": 0.0, "learning_rate": 1.6000000000000001e-06, "logits/chosen": 95492236.2739726, "logits/rejected": 139708453.23636365, "logps/chosen": -190.68493150684932, "logps/rejected": -334.25454545454545, "loss": 0.4967, "rewards/chosen": -0.002675513698630137, "rewards/margins": 0.031237838574097135, "rewards/rejected": -0.03391335227272727, "step": 33 }, { "epoch": 0.024139155129570464, "grad_norm": 0.18389876955039927, "kl": 0.046875, "learning_rate": 1.6500000000000003e-06, "logits/chosen": 176859818.66666666, "logits/rejected": 98134377.41176471, "logps/chosen": -268.26666666666665, "logps/rejected": -272.47058823529414, "loss": 0.5009, "rewards/chosen": -0.013655598958333333, "rewards/margins": -0.008837411917892156, "rewards/rejected": -0.004818187040441176, "step": 34 }, { "epoch": 0.024849130280440185, "grad_norm": 0.15515785312260286, "kl": 0.044921875, "learning_rate": 1.7000000000000002e-06, "logits/chosen": 165393301.01492536, "logits/rejected": 106851613.37704918, "logps/chosen": -217.7910447761194, "logps/rejected": -217.18032786885246, "loss": 0.4997, "rewards/chosen": -0.0004390887360074627, "rewards/margins": 0.005328233241451553, "rewards/rejected": -0.005767321977459016, "step": 35 }, { "epoch": 0.025559105431309903, "grad_norm": 0.1541925542464671, "kl": 0.185546875, "learning_rate": 1.75e-06, "logits/chosen": 99724272.71641791, "logits/rejected": 141746847.47540984, "logps/chosen": -207.044776119403, "logps/rejected": -263.344262295082, "loss": 0.4973, "rewards/chosen": -0.0064423973880597014, "rewards/margins": 0.025319897693907512, "rewards/rejected": -0.031762295081967214, "step": 36 }, { "epoch": 0.026269080582179624, "grad_norm": 0.1626234500380143, "kl": 0.0703125, "learning_rate": 1.8000000000000001e-06, "logits/chosen": 92216433.77777778, "logits/rejected": 143654912.0, "logps/chosen": -215.11111111111111, "logps/rejected": -276.85714285714283, "loss": 0.4973, "rewards/chosen": 0.002266777886284722, "rewards/margins": 0.02572171650235615, "rewards/rejected": -0.023454938616071428, "step": 37 }, { "epoch": 0.026979055733049342, "grad_norm": 0.1896893626086524, "kl": 0.087890625, "learning_rate": 1.85e-06, "logits/chosen": 130218159.54285714, "logits/rejected": 139352134.62068966, "logps/chosen": -317.25714285714287, "logps/rejected": -262.62068965517244, "loss": 0.4987, "rewards/chosen": 0.006675502232142857, "rewards/margins": -24404702.889876224, "rewards/rejected": 24404702.896551725, "step": 38 }, { "epoch": 0.027689030883919063, "grad_norm": 0.18846667371865355, "kl": 0.080078125, "learning_rate": 1.9000000000000002e-06, "logits/chosen": 124336232.13559322, "logits/rejected": 101939823.30434783, "logps/chosen": -226.4406779661017, "logps/rejected": -297.5072463768116, "loss": 0.4991, "rewards/chosen": -0.0016303628177966102, "rewards/margins": 0.004335160280029476, "rewards/rejected": -0.005965523097826087, "step": 39 }, { "epoch": 0.028399006034788784, "grad_norm": 0.20425775691987258, "kl": 0.04296875, "learning_rate": 1.9500000000000004e-06, "logits/chosen": 133466509.37313433, "logits/rejected": 84779948.06557377, "logps/chosen": -265.55223880597015, "logps/rejected": -312.39344262295083, "loss": 0.4962, "rewards/chosen": 0.021444292210820896, "rewards/margins": 0.028414306939304503, "rewards/rejected": -0.006970014728483607, "step": 40 }, { "epoch": 0.0291089811856585, "grad_norm": 0.16886240537222386, "kl": 0.03125, "learning_rate": 2.0000000000000003e-06, "logits/chosen": 125966638.16393442, "logits/rejected": 127581963.46268657, "logps/chosen": -174.95081967213116, "logps/rejected": -237.61194029850745, "loss": 0.4975, "rewards/chosen": 0.013479764344262296, "rewards/margins": 0.022997604829336923, "rewards/rejected": -0.009517840485074628, "step": 41 }, { "epoch": 0.029818956336528223, "grad_norm": 0.17185438866788133, "kl": 0.0, "learning_rate": 2.05e-06, "logits/chosen": 182032793.6, "logits/rejected": 98041856.0, "logps/chosen": -256.53333333333336, "logps/rejected": -247.52941176470588, "loss": 0.5001, "rewards/chosen": 0.023583984375, "rewards/margins": 0.0023293887867647037, "rewards/rejected": 0.021254595588235295, "step": 42 }, { "epoch": 0.03052893148739794, "grad_norm": 0.16842872700883396, "kl": 0.048828125, "learning_rate": 2.1000000000000002e-06, "logits/chosen": 127456760.35820895, "logits/rejected": 95781401.18032786, "logps/chosen": -161.43283582089552, "logps/rejected": -254.1639344262295, "loss": 0.4984, "rewards/chosen": -0.0009109724813432836, "rewards/margins": 0.00995928878914852, "rewards/rejected": -0.010870261270491803, "step": 43 }, { "epoch": 0.03123890663826766, "grad_norm": 0.16977724130704808, "kl": 0.05859375, "learning_rate": 2.15e-06, "logits/chosen": 105769405.2173913, "logits/rejected": 104844270.6440678, "logps/chosen": -218.66666666666666, "logps/rejected": -288.271186440678, "loss": 0.497, "rewards/chosen": 0.0036231884057971015, "rewards/margins": 0.027159999846475068, "rewards/rejected": -0.023536811440677968, "step": 44 }, { "epoch": 0.03194888178913738, "grad_norm": 0.1653620060200305, "kl": 0.048828125, "learning_rate": 2.2e-06, "logits/chosen": 149596842.66666666, "logits/rejected": 95386590.96774194, "logps/chosen": -192.36363636363637, "logps/rejected": -253.67741935483872, "loss": 0.5005, "rewards/chosen": -0.011171283143939394, "rewards/margins": -0.0016872074130926205, "rewards/rejected": -0.009484075730846774, "step": 45 }, { "epoch": 0.0326588569400071, "grad_norm": 0.15724569851041262, "kl": 0.033203125, "learning_rate": 2.25e-06, "logits/chosen": 116781407.08571428, "logits/rejected": 110859793.65517241, "logps/chosen": -210.5142857142857, "logps/rejected": -232.0, "loss": 0.4969, "rewards/chosen": 0.017557198660714285, "rewards/margins": -13251351.154856594, "rewards/rejected": 13251351.172413792, "step": 46 }, { "epoch": 0.03336883209087682, "grad_norm": 0.17431449120532952, "kl": 0.1328125, "learning_rate": 2.3000000000000004e-06, "logits/chosen": 96214791.75757575, "logits/rejected": 149777242.83870968, "logps/chosen": -240.72727272727272, "logps/rejected": -252.38709677419354, "loss": 0.4967, "rewards/chosen": 0.017851858428030304, "rewards/margins": 0.02674330248044966, "rewards/rejected": -0.008891444052419355, "step": 47 }, { "epoch": 0.03407880724174654, "grad_norm": 0.20874085935058095, "kl": 0.154296875, "learning_rate": 2.35e-06, "logits/chosen": 137182667.03448275, "logits/rejected": 105217111.77142857, "logps/chosen": -273.6551724137931, "logps/rejected": -302.4, "loss": 0.5007, "rewards/chosen": -0.014143318965517241, "rewards/margins": -0.01265406115301724, "rewards/rejected": -0.0014892578125, "step": 48 }, { "epoch": 0.034788782392616256, "grad_norm": 0.19272707373853493, "kl": 0.04296875, "learning_rate": 2.4000000000000003e-06, "logits/chosen": 152532855.46666667, "logits/rejected": 116021850.35294117, "logps/chosen": -281.06666666666666, "logps/rejected": -293.88235294117646, "loss": 0.4965, "rewards/chosen": -0.010953776041666667, "rewards/margins": 0.02374291513480392, "rewards/rejected": -0.03469669117647059, "step": 49 }, { "epoch": 0.03549875754348598, "grad_norm": 0.19457372562093073, "kl": 0.0234375, "learning_rate": 2.4500000000000003e-06, "logits/chosen": 141388634.83870968, "logits/rejected": 82360878.54545455, "logps/chosen": -274.5806451612903, "logps/rejected": -256.24242424242425, "loss": 0.4977, "rewards/chosen": 0.011498235887096774, "rewards/margins": 0.015034575849217986, "rewards/rejected": -0.003536339962121212, "step": 50 }, { "epoch": 0.0362087326943557, "grad_norm": 0.1970747016347854, "kl": 0.025390625, "learning_rate": 2.5e-06, "logits/chosen": 121702466.06451613, "logits/rejected": 118298437.81818181, "logps/chosen": -253.80645161290323, "logps/rejected": -282.90909090909093, "loss": 0.4959, "rewards/chosen": 0.014538180443548387, "rewards/margins": 0.03319644275415445, "rewards/rejected": -0.01865826231060606, "step": 51 }, { "epoch": 0.03691870784522542, "grad_norm": 0.16885006100940733, "kl": 0.07421875, "learning_rate": 2.55e-06, "logits/chosen": 188926041.04347825, "logits/rejected": 39668163.25423729, "logps/chosen": -190.14492753623188, "logps/rejected": -268.20338983050846, "loss": 0.4985, "rewards/chosen": -0.009974382925724638, "rewards/margins": 0.017799716650546547, "rewards/rejected": -0.027774099576271187, "step": 52 }, { "epoch": 0.037628682996095134, "grad_norm": 0.20943562421702488, "kl": 0.0390625, "learning_rate": 2.6e-06, "logits/chosen": 125829120.0, "logits/rejected": 104158549.33333333, "logps/chosen": -250.57142857142858, "logps/rejected": -276.0, "loss": 0.4953, "rewards/chosen": 0.022321428571428572, "rewards/margins": 0.0382753402467758, "rewards/rejected": -0.015953911675347224, "step": 53 }, { "epoch": 0.038338658146964855, "grad_norm": 0.18213076170676917, "kl": 0.064453125, "learning_rate": 2.6500000000000005e-06, "logits/chosen": 168905755.6756757, "logits/rejected": 125052397.03703703, "logps/chosen": -210.3783783783784, "logps/rejected": -317.3333333333333, "loss": 0.4995, "rewards/chosen": -0.007522170608108108, "rewards/margins": 0.004268917354854855, "rewards/rejected": -0.011791087962962963, "step": 54 }, { "epoch": 0.039048633297834576, "grad_norm": 0.17997447396487476, "kl": 0.185546875, "learning_rate": 2.7000000000000004e-06, "logits/chosen": 109369654.3030303, "logits/rejected": 129482223.48387097, "logps/chosen": -259.1515151515151, "logps/rejected": -221.41935483870967, "loss": 0.4965, "rewards/chosen": 0.017650257457386364, "rewards/margins": 0.03778004576383798, "rewards/rejected": -0.020129788306451613, "step": 55 }, { "epoch": 0.0397586084487043, "grad_norm": 0.20707263159563127, "kl": 0.0, "learning_rate": 2.7500000000000004e-06, "logits/chosen": 151883567.72881356, "logits/rejected": 78088518.49275362, "logps/chosen": -244.0677966101695, "logps/rejected": -265.5072463768116, "loss": 0.4965, "rewards/chosen": 0.012082891949152543, "rewards/margins": 0.026419961695529355, "rewards/rejected": -0.014337069746376812, "step": 56 }, { "epoch": 0.04046858359957402, "grad_norm": 0.19775062598296936, "kl": 0.1171875, "learning_rate": 2.8000000000000003e-06, "logits/chosen": 161417153.93939394, "logits/rejected": 106548851.61290322, "logps/chosen": -272.72727272727275, "logps/rejected": -304.7741935483871, "loss": 0.4948, "rewards/chosen": -0.0001775568181818182, "rewards/margins": 0.03349809842375367, "rewards/rejected": -0.033675655241935484, "step": 57 }, { "epoch": 0.04117855875044373, "grad_norm": 0.19387497762057962, "kl": 0.05078125, "learning_rate": 2.85e-06, "logits/chosen": 113725557.02857143, "logits/rejected": 114728677.51724137, "logps/chosen": -265.37142857142857, "logps/rejected": -255.72413793103448, "loss": 0.4924, "rewards/chosen": 0.01568080357142857, "rewards/margins": 10051866.498439424, "rewards/rejected": -10051866.48275862, "step": 58 }, { "epoch": 0.04188853390131345, "grad_norm": 0.19791328775196437, "kl": 0.0, "learning_rate": 2.9e-06, "logits/chosen": 123298074.48275863, "logits/rejected": 84065835.88571429, "logps/chosen": -177.93103448275863, "logps/rejected": -247.31428571428572, "loss": 0.4966, "rewards/chosen": -0.00452081088362069, "rewards/margins": 0.02375764893780788, "rewards/rejected": -0.02827845982142857, "step": 59 }, { "epoch": 0.042598509052183174, "grad_norm": 0.24358611691711385, "kl": 0.080078125, "learning_rate": 2.95e-06, "logits/chosen": 134274923.05454546, "logits/rejected": 130655442.4109589, "logps/chosen": -240.5818181818182, "logps/rejected": -291.94520547945206, "loss": 0.487, "rewards/chosen": 0.042311789772727273, "rewards/margins": 0.09461808258094645, "rewards/rejected": -0.052306292808219176, "step": 60 }, { "epoch": 0.043308484203052895, "grad_norm": 0.17961285024375298, "kl": 0.052734375, "learning_rate": 3e-06, "logits/chosen": 145767260.7536232, "logits/rejected": 83530630.50847457, "logps/chosen": -237.91304347826087, "logps/rejected": -245.28813559322035, "loss": 0.4937, "rewards/chosen": 0.00864753170289855, "rewards/margins": 0.06187846390628838, "rewards/rejected": -0.05323093220338983, "step": 61 }, { "epoch": 0.04401845935392261, "grad_norm": 0.22467202299286215, "kl": 0.0, "learning_rate": 3.05e-06, "logits/chosen": 107164467.2, "logits/rejected": 149404001.10344827, "logps/chosen": -218.28571428571428, "logps/rejected": -331.3103448275862, "loss": 0.4898, "rewards/chosen": 0.02953404017857143, "rewards/margins": 0.08385111991995074, "rewards/rejected": -0.05431707974137931, "step": 62 }, { "epoch": 0.04472843450479233, "grad_norm": 0.23082125810118975, "kl": 0.0, "learning_rate": 3.1000000000000004e-06, "logits/chosen": 138682632.2580645, "logits/rejected": 106509901.57575758, "logps/chosen": -256.51612903225805, "logps/rejected": -285.57575757575756, "loss": 0.4875, "rewards/chosen": 0.03792448966733871, "rewards/margins": 0.09900403512188416, "rewards/rejected": -0.061079545454545456, "step": 63 }, { "epoch": 0.04543840965566205, "grad_norm": 0.2208084722820857, "kl": 0.0, "learning_rate": 3.1500000000000003e-06, "logits/chosen": 135056588.8, "logits/rejected": 114603188.70588236, "logps/chosen": -264.1333333333333, "logps/rejected": -255.76470588235293, "loss": 0.486, "rewards/chosen": 0.03433024088541667, "rewards/margins": 0.1138339173560049, "rewards/rejected": -0.07950367647058823, "step": 64 }, { "epoch": 0.04614838480653177, "grad_norm": 0.21935205683678607, "kl": 0.0859375, "learning_rate": 3.2000000000000003e-06, "logits/chosen": 141604711.16417912, "logits/rejected": 80379366.81967214, "logps/chosen": -222.80597014925374, "logps/rejected": -249.70491803278688, "loss": 0.4898, "rewards/chosen": 0.03105869577891791, "rewards/margins": -29530844.296810158, "rewards/rejected": 29530844.327868853, "step": 65 }, { "epoch": 0.046858359957401494, "grad_norm": 0.17446425084594594, "kl": 0.09375, "learning_rate": 3.2500000000000002e-06, "logits/chosen": 119821062.91891892, "logits/rejected": 52855997.62962963, "logps/chosen": -200.86486486486487, "logps/rejected": -225.62962962962962, "loss": 0.494, "rewards/chosen": 0.003998627533783784, "rewards/margins": 0.04092715762637638, "rewards/rejected": -0.036928530092592594, "step": 66 }, { "epoch": 0.04756833510827121, "grad_norm": 0.2077473654841623, "kl": 0.044921875, "learning_rate": 3.3000000000000006e-06, "logits/chosen": 82156849.40350877, "logits/rejected": 108933754.59154929, "logps/chosen": -199.71929824561403, "logps/rejected": -235.04225352112675, "loss": 0.4929, "rewards/chosen": 0.013312088815789474, "rewards/margins": 0.0548503810693106, "rewards/rejected": -0.041538292253521125, "step": 67 }, { "epoch": 0.04827831025914093, "grad_norm": 0.22854709393988337, "kl": 0.0888671875, "learning_rate": 3.3500000000000005e-06, "logits/chosen": 172334385.40350878, "logits/rejected": 90687055.32394366, "logps/chosen": -188.35087719298247, "logps/rejected": -280.11267605633805, "loss": 0.4837, "rewards/chosen": 0.05461896929824561, "rewards/margins": 0.1340919006362738, "rewards/rejected": -0.07947293133802817, "step": 68 }, { "epoch": 0.04898828541001065, "grad_norm": 0.1844528702622798, "kl": 0.015625, "learning_rate": 3.4000000000000005e-06, "logits/chosen": 143232351.52238807, "logits/rejected": 64014705.31147541, "logps/chosen": -196.0597014925373, "logps/rejected": -212.45901639344262, "loss": 0.4904, "rewards/chosen": 0.024610832555970148, "rewards/margins": 0.0790101665723636, "rewards/rejected": -0.05439933401639344, "step": 69 }, { "epoch": 0.04969826056088037, "grad_norm": 0.2627617305846521, "kl": 0.044921875, "learning_rate": 3.45e-06, "logits/chosen": 165021796.72131148, "logits/rejected": 110867349.01492538, "logps/chosen": -208.13114754098362, "logps/rejected": -288.0, "loss": 0.4848, "rewards/chosen": 0.04008709016393443, "rewards/margins": 0.09739819091020308, "rewards/rejected": -0.057311100746268655, "step": 70 }, { "epoch": 0.05040823571175009, "grad_norm": 0.221439079803056, "kl": 0.03125, "learning_rate": 3.5e-06, "logits/chosen": 165021796.72131148, "logits/rejected": 87141360.71641791, "logps/chosen": -201.44262295081967, "logps/rejected": -229.2537313432836, "loss": 0.4865, "rewards/chosen": 0.020972079918032786, "rewards/margins": 0.10679297544042085, "rewards/rejected": -0.08582089552238806, "step": 71 }, { "epoch": 0.051118210862619806, "grad_norm": 0.2170434066310585, "kl": 0.05859375, "learning_rate": 3.5500000000000003e-06, "logits/chosen": 144113664.0, "logits/rejected": 124911616.0, "logps/chosen": -213.25, "logps/rejected": -253.0, "loss": 0.4869, "rewards/chosen": 0.0238037109375, "rewards/margins": 0.1177978515625, "rewards/rejected": -0.093994140625, "step": 72 }, { "epoch": 0.05182818601348953, "grad_norm": 0.22851054595416265, "kl": 0.044921875, "learning_rate": 3.6000000000000003e-06, "logits/chosen": 164915170.3188406, "logits/rejected": 142606336.0, "logps/chosen": -179.36231884057972, "logps/rejected": -308.33898305084745, "loss": 0.4819, "rewards/chosen": 0.039713541666666664, "rewards/margins": 0.1508099399717514, "rewards/rejected": -0.11109639830508475, "step": 73 }, { "epoch": 0.05253816116435925, "grad_norm": 0.24920846642405237, "kl": 0.0, "learning_rate": 3.65e-06, "logits/chosen": 147932436.31746033, "logits/rejected": 106406266.09230769, "logps/chosen": -227.42857142857142, "logps/rejected": -285.53846153846155, "loss": 0.477, "rewards/chosen": 0.060887896825396824, "rewards/margins": 0.18672924297924298, "rewards/rejected": -0.12584134615384615, "step": 74 }, { "epoch": 0.05324813631522897, "grad_norm": 0.27493890612893185, "kl": 0.0, "learning_rate": 3.7e-06, "logits/chosen": 150740743.75757575, "logits/rejected": 133811827.61290322, "logps/chosen": -295.75757575757575, "logps/rejected": -309.16129032258067, "loss": 0.4713, "rewards/chosen": 0.10973011363636363, "rewards/margins": 0.20310208944281524, "rewards/rejected": -0.09337197580645161, "step": 75 }, { "epoch": 0.053958111466098684, "grad_norm": 0.24022684457291554, "kl": 0.03125, "learning_rate": 3.7500000000000005e-06, "logits/chosen": 161261598.56716418, "logits/rejected": 114621389.63934426, "logps/chosen": -225.19402985074626, "logps/rejected": -291.9344262295082, "loss": 0.4755, "rewards/chosen": 0.04843094099813433, "rewards/margins": 0.17881003935879006, "rewards/rejected": -0.13037909836065573, "step": 76 }, { "epoch": 0.054668086616968405, "grad_norm": 0.2328901884310424, "kl": 0.0, "learning_rate": 3.8000000000000005e-06, "logits/chosen": 145312338.58064517, "logits/rejected": 108289303.27272727, "logps/chosen": -231.48387096774192, "logps/rejected": -250.9090909090909, "loss": 0.4819, "rewards/chosen": 0.030052923387096774, "rewards/margins": 0.14392602944770283, "rewards/rejected": -0.11387310606060606, "step": 77 }, { "epoch": 0.055378061767838126, "grad_norm": 0.25054925485837937, "kl": 0.021484375, "learning_rate": 3.85e-06, "logits/chosen": 97225341.90163934, "logits/rejected": 113308809.5522388, "logps/chosen": -216.78688524590163, "logps/rejected": -233.07462686567163, "loss": 0.4708, "rewards/chosen": 0.10079405737704918, "rewards/margins": 0.23745450513824323, "rewards/rejected": -0.13666044776119404, "step": 78 }, { "epoch": 0.05608803691870785, "grad_norm": 0.23969186342674445, "kl": 0.0, "learning_rate": 3.900000000000001e-06, "logits/chosen": 177050468.84848484, "logits/rejected": 89737810.58064516, "logps/chosen": -243.15151515151516, "logps/rejected": -249.03225806451613, "loss": 0.4709, "rewards/chosen": 0.07990056818181818, "rewards/margins": 0.20049028592375367, "rewards/rejected": -0.12058971774193548, "step": 79 }, { "epoch": 0.05679801206957757, "grad_norm": 0.26986895266435285, "kl": 0.126953125, "learning_rate": 3.95e-06, "logits/chosen": 177394386.82352942, "logits/rejected": 62355319.46666667, "logps/chosen": -272.0, "logps/rejected": -270.1333333333333, "loss": 0.4738, "rewards/chosen": 0.06158088235294118, "rewards/margins": 0.22915900735294117, "rewards/rejected": -0.167578125, "step": 80 }, { "epoch": 0.05750798722044728, "grad_norm": 0.28250436801731516, "kl": 0.0, "learning_rate": 4.000000000000001e-06, "logits/chosen": 124718863.05882353, "logits/rejected": 94791270.4, "logps/chosen": -256.94117647058823, "logps/rejected": -276.26666666666665, "loss": 0.4631, "rewards/chosen": 0.08111213235294118, "rewards/margins": 0.2746017156862745, "rewards/rejected": -0.19348958333333333, "step": 81 }, { "epoch": 0.058217962371317, "grad_norm": 0.2671797404977399, "kl": 0.0, "learning_rate": 4.05e-06, "logits/chosen": 165945608.2580645, "logits/rejected": 126337520.48484848, "logps/chosen": -322.06451612903226, "logps/rejected": -250.1818181818182, "loss": 0.4657, "rewards/chosen": 0.048324092741935484, "rewards/margins": 0.2031536381964809, "rewards/rejected": -0.15482954545454544, "step": 82 }, { "epoch": 0.058927937522186724, "grad_norm": 0.28408178258059597, "kl": 0.265625, "learning_rate": 4.1e-06, "logits/chosen": 131840955.73333333, "logits/rejected": 58866748.23529412, "logps/chosen": -243.2, "logps/rejected": -232.0, "loss": 0.4583, "rewards/chosen": 0.149609375, "rewards/margins": 0.34928768382352937, "rewards/rejected": -0.1996783088235294, "step": 83 }, { "epoch": 0.059637912673056445, "grad_norm": 0.29720974115303783, "kl": 0.0048828125, "learning_rate": 4.15e-06, "logits/chosen": 120154473.41176471, "logits/rejected": 130792379.73333333, "logps/chosen": -193.41176470588235, "logps/rejected": -349.6, "loss": 0.4605, "rewards/chosen": 0.06425206801470588, "rewards/margins": 0.20201248468137256, "rewards/rejected": -0.13776041666666666, "step": 84 }, { "epoch": 0.06034788782392616, "grad_norm": 0.21848992929550343, "kl": 0.0, "learning_rate": 4.2000000000000004e-06, "logits/chosen": 129161580.71232876, "logits/rejected": 92808508.50909092, "logps/chosen": -241.75342465753425, "logps/rejected": -249.8909090909091, "loss": 0.4715, "rewards/chosen": 0.035584332191780824, "rewards/margins": 0.24836842310087176, "rewards/rejected": -0.21278409090909092, "step": 85 }, { "epoch": 0.06105786297479588, "grad_norm": 0.2712247003926014, "kl": 0.0, "learning_rate": 4.25e-06, "logits/chosen": 115054097.65517241, "logits/rejected": 73909628.34285714, "logps/chosen": -177.24137931034483, "logps/rejected": -235.88571428571427, "loss": 0.4568, "rewards/chosen": 0.06613685344827586, "rewards/margins": 0.3339939963054187, "rewards/rejected": -0.26785714285714285, "step": 86 }, { "epoch": 0.0617678381256656, "grad_norm": 0.31521357884594375, "kl": 0.009765625, "learning_rate": 4.3e-06, "logits/chosen": 186087287.46666667, "logits/rejected": 61125812.705882356, "logps/chosen": -258.93333333333334, "logps/rejected": -295.29411764705884, "loss": 0.4476, "rewards/chosen": 0.08919270833333333, "rewards/margins": 0.41823682598039214, "rewards/rejected": -0.3290441176470588, "step": 87 }, { "epoch": 0.06247781327653532, "grad_norm": 0.27935713675491464, "kl": 0.0, "learning_rate": 4.350000000000001e-06, "logits/chosen": 209155959.46666667, "logits/rejected": 63654731.294117644, "logps/chosen": -237.33333333333334, "logps/rejected": -248.94117647058823, "loss": 0.4485, "rewards/chosen": 0.12779947916666667, "rewards/margins": 0.4309934129901961, "rewards/rejected": -0.30319393382352944, "step": 88 }, { "epoch": 0.06318778842740504, "grad_norm": 0.3123470571112547, "kl": 0.046875, "learning_rate": 4.4e-06, "logits/chosen": 161647144.63492063, "logits/rejected": 61728862.52307692, "logps/chosen": -304.76190476190476, "logps/rejected": -271.26153846153846, "loss": 0.4356, "rewards/chosen": 0.2333829365079365, "rewards/margins": 0.5247290903540904, "rewards/rejected": -0.29134615384615387, "step": 89 }, { "epoch": 0.06389776357827476, "grad_norm": 0.28506265618675347, "kl": 0.125, "learning_rate": 4.450000000000001e-06, "logits/chosen": 120036985.90476191, "logits/rejected": 150168182.15384614, "logps/chosen": -230.47619047619048, "logps/rejected": -264.4923076923077, "loss": 0.4369, "rewards/chosen": 0.13076636904761904, "rewards/margins": 0.49759329212454206, "rewards/rejected": -0.36682692307692305, "step": 90 }, { "epoch": 0.06460773872914448, "grad_norm": 0.25741604413159636, "kl": 0.0, "learning_rate": 4.5e-06, "logits/chosen": 164086256.4848485, "logits/rejected": 74279770.83870968, "logps/chosen": -268.6060606060606, "logps/rejected": -240.51612903225808, "loss": 0.4377, "rewards/chosen": 0.16628196022727273, "rewards/margins": 0.5443061537756598, "rewards/rejected": -0.3780241935483871, "step": 91 }, { "epoch": 0.0653177138800142, "grad_norm": 0.28447638064422553, "kl": 0.0, "learning_rate": 4.5500000000000005e-06, "logits/chosen": 107628836.57142857, "logits/rejected": 82954012.44444445, "logps/chosen": -216.42857142857142, "logps/rejected": -234.44444444444446, "loss": 0.4433, "rewards/chosen": 0.07198660714285714, "rewards/margins": 0.44264632936507936, "rewards/rejected": -0.3706597222222222, "step": 92 }, { "epoch": 0.06602768903088392, "grad_norm": 0.2934580291203776, "kl": 0.041015625, "learning_rate": 4.600000000000001e-06, "logits/chosen": 142335735.7419355, "logits/rejected": 74941408.96969697, "logps/chosen": -253.93548387096774, "logps/rejected": -252.36363636363637, "loss": 0.4304, "rewards/chosen": 0.17855342741935484, "rewards/margins": 0.6027958516617791, "rewards/rejected": -0.42424242424242425, "step": 93 }, { "epoch": 0.06673766418175364, "grad_norm": 0.3279034151285564, "kl": 0.0, "learning_rate": 4.65e-06, "logits/chosen": 212603081.44262296, "logits/rejected": 96093382.68656716, "logps/chosen": -300.59016393442624, "logps/rejected": -293.7313432835821, "loss": 0.4144, "rewards/chosen": 0.1754610655737705, "rewards/margins": 0.7407595730364571, "rewards/rejected": -0.5652985074626866, "step": 94 }, { "epoch": 0.06744763933262336, "grad_norm": 0.29814008038232975, "kl": 0.0, "learning_rate": 4.7e-06, "logits/chosen": 97394206.11764705, "logits/rejected": 148548266.66666666, "logps/chosen": -201.41176470588235, "logps/rejected": -302.1333333333333, "loss": 0.4139, "rewards/chosen": 0.16233915441176472, "rewards/margins": 0.7961933210784314, "rewards/rejected": -0.6338541666666667, "step": 95 }, { "epoch": 0.06815761448349308, "grad_norm": 0.28599079701371377, "kl": 0.0, "learning_rate": 4.75e-06, "logits/chosen": 115343360.0, "logits/rejected": 129076323.09677419, "logps/chosen": -188.36363636363637, "logps/rejected": -266.06451612903226, "loss": 0.4187, "rewards/chosen": 0.11783854166666667, "rewards/margins": 0.7302377352150538, "rewards/rejected": -0.6123991935483871, "step": 96 }, { "epoch": 0.06886758963436279, "grad_norm": 0.26646654481491167, "kl": 0.0, "learning_rate": 4.800000000000001e-06, "logits/chosen": 105696460.8, "logits/rejected": 101531083.03448276, "logps/chosen": -205.02857142857144, "logps/rejected": -284.41379310344826, "loss": 0.4179, "rewards/chosen": 0.119140625, "rewards/margins": 0.7834725215517241, "rewards/rejected": -0.6643318965517241, "step": 97 }, { "epoch": 0.06957756478523251, "grad_norm": 0.27264328311354336, "kl": 0.0, "learning_rate": 4.85e-06, "logits/chosen": 120348562.77333333, "logits/rejected": 139401254.64150944, "logps/chosen": -285.6533333333333, "logps/rejected": -275.92452830188677, "loss": 0.3969, "rewards/chosen": 0.319375, "rewards/margins": 0.7733844339622642, "rewards/rejected": -0.4540094339622642, "step": 98 }, { "epoch": 0.07028753993610223, "grad_norm": 0.28440484923646836, "kl": 0.0, "learning_rate": 4.9000000000000005e-06, "logits/chosen": 71086221.2413793, "logits/rejected": 74913265.37142856, "logps/chosen": -195.0344827586207, "logps/rejected": -250.28571428571428, "loss": 0.3993, "rewards/chosen": 0.17699353448275862, "rewards/margins": 0.866279248768473, "rewards/rejected": -0.6892857142857143, "step": 99 }, { "epoch": 0.07099751508697195, "grad_norm": 0.26353115121576565, "kl": 0.0, "learning_rate": 4.95e-06, "logits/chosen": 104916674.70422535, "logits/rejected": 153202472.42105263, "logps/chosen": -200.7887323943662, "logps/rejected": -332.0701754385965, "loss": 0.4053, "rewards/chosen": 0.0907377860915493, "rewards/margins": 0.951483400126637, "rewards/rejected": -0.8607456140350878, "step": 100 }, { "epoch": 0.07170749023784168, "grad_norm": 0.27994801078171117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112956945.65517241, "logits/rejected": 139131055.54285714, "logps/chosen": -180.41379310344828, "logps/rejected": -297.6, "loss": 0.3795, "rewards/chosen": 0.04000538793103448, "rewards/margins": 1.0882196736453202, "rewards/rejected": -1.0482142857142858, "step": 101 }, { "epoch": 0.0724174653887114, "grad_norm": 0.2696159796311511, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96590566.0289855, "logits/rejected": 154122899.52542374, "logps/chosen": -249.97101449275362, "logps/rejected": -317.8305084745763, "loss": 0.3792, "rewards/chosen": 0.26913496376811596, "rewards/margins": -19869459.256288763, "rewards/rejected": 19869459.525423728, "step": 102 }, { "epoch": 0.07312744053958112, "grad_norm": 0.24916830933996315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139225852.17910448, "logits/rejected": 142056263.3442623, "logps/chosen": -231.6417910447761, "logps/rejected": -342.55737704918033, "loss": 0.3775, "rewards/chosen": 0.17747201492537312, "rewards/margins": 0.7297261132860289, "rewards/rejected": -0.5522540983606558, "step": 103 }, { "epoch": 0.07383741569045084, "grad_norm": 0.23792162637666706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145498959.44827586, "logits/rejected": 105217111.77142857, "logps/chosen": -209.93103448275863, "logps/rejected": -295.54285714285714, "loss": 0.3729, "rewards/chosen": 0.07825969827586207, "rewards/margins": 1.2175454125615763, "rewards/rejected": -1.1392857142857142, "step": 104 }, { "epoch": 0.07454739084132056, "grad_norm": 0.27800419411889965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81617030.29508197, "logits/rejected": 98159233.91044776, "logps/chosen": -174.68852459016392, "logps/rejected": -313.07462686567163, "loss": 0.3841, "rewards/chosen": 0.11609887295081968, "rewards/margins": 0.9127406639955958, "rewards/rejected": -0.7966417910447762, "step": 105 }, { "epoch": 0.07525736599219027, "grad_norm": 0.1757010250366332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112447292.95238096, "logits/rejected": 81741265.45454545, "logps/chosen": -262.4761904761905, "logps/rejected": -255.0909090909091, "loss": 0.395, "rewards/chosen": 0.37462797619047616, "rewards/margins": 1.3099972943722944, "rewards/rejected": -0.9353693181818182, "step": 106 }, { "epoch": 0.07596734114305999, "grad_norm": 0.2545312068270128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133518677.33333333, "logits/rejected": 108371746.5945946, "logps/chosen": -217.33333333333334, "logps/rejected": -273.2972972972973, "loss": 0.3698, "rewards/chosen": 0.11682581018518519, "rewards/margins": 1.1793258101851851, "rewards/rejected": -1.0625, "step": 107 }, { "epoch": 0.07667731629392971, "grad_norm": 0.2487064582071713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147747740.9032258, "logits/rejected": 92322350.54545455, "logps/chosen": -201.5483870967742, "logps/rejected": -296.72727272727275, "loss": 0.3491, "rewards/chosen": 0.050592237903225805, "rewards/margins": 1.508925571236559, "rewards/rejected": -1.4583333333333333, "step": 108 }, { "epoch": 0.07738729144479943, "grad_norm": 0.238698146781549, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96163951.7090909, "logits/rejected": 155591441.53424656, "logps/chosen": -219.4181818181818, "logps/rejected": -291.28767123287673, "loss": 0.3446, "rewards/chosen": 0.17375710227272728, "rewards/margins": 1.5419077872042342, "rewards/rejected": -1.3681506849315068, "step": 109 }, { "epoch": 0.07809726659566915, "grad_norm": 0.21907333824640984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151208213.69491526, "logits/rejected": 46380492.057971016, "logps/chosen": -251.38983050847457, "logps/rejected": -247.65217391304347, "loss": 0.3639, "rewards/chosen": 0.21252648305084745, "rewards/margins": 1.4208598163841808, "rewards/rejected": -1.2083333333333333, "step": 110 }, { "epoch": 0.07880724174653887, "grad_norm": 0.18960340423946143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107456886.9859155, "logits/rejected": 40958850.24561404, "logps/chosen": -226.70422535211267, "logps/rejected": -249.82456140350877, "loss": 0.3831, "rewards/chosen": 0.1141202684859155, "rewards/margins": 1.1196027246262663, "rewards/rejected": -1.0054824561403508, "step": 111 }, { "epoch": 0.0795172168974086, "grad_norm": 0.17090910865332667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 41815939.878787875, "logits/rejected": 178325570.06451613, "logps/chosen": -174.66666666666666, "logps/rejected": -308.9032258064516, "loss": 0.35, "rewards/chosen": 0.061197916666666664, "rewards/margins": 1.2184559811827957, "rewards/rejected": -1.157258064516129, "step": 112 }, { "epoch": 0.08022719204827831, "grad_norm": 0.17240059694357995, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115952210.58064516, "logits/rejected": 29773203.393939395, "logps/chosen": -211.09677419354838, "logps/rejected": -248.24242424242425, "loss": 0.3608, "rewards/chosen": 0.13927041330645162, "rewards/margins": 1.5928688981549364, "rewards/rejected": -1.4535984848484849, "step": 113 }, { "epoch": 0.08093716719914804, "grad_norm": 0.16370622499297388, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110354680.24242425, "logits/rejected": 123968743.22580644, "logps/chosen": -234.66666666666666, "logps/rejected": -301.16129032258067, "loss": 0.3378, "rewards/chosen": 0.2099313446969697, "rewards/margins": 1.9417861834066472, "rewards/rejected": -1.7318548387096775, "step": 114 }, { "epoch": 0.08164714235001774, "grad_norm": 0.19334730328534688, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135867945.96721312, "logits/rejected": 111430762.98507462, "logps/chosen": -256.5245901639344, "logps/rejected": -303.76119402985074, "loss": 0.3323, "rewards/chosen": 0.1804559426229508, "rewards/margins": 2.0871723605333985, "rewards/rejected": -1.9067164179104477, "step": 115 }, { "epoch": 0.08235711750088746, "grad_norm": 0.15957782321849037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149663418.92063493, "logits/rejected": 34199709.538461536, "logps/chosen": -238.22222222222223, "logps/rejected": -319.0153846153846, "loss": 0.3365, "rewards/chosen": 0.3273809523809524, "rewards/margins": 2.3725732600732603, "rewards/rejected": -2.0451923076923078, "step": 116 }, { "epoch": 0.08306709265175719, "grad_norm": 0.14569101378261698, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146538496.0, "logits/rejected": 74645504.0, "logps/chosen": -179.75, "logps/rejected": -254.25, "loss": 0.3569, "rewards/chosen": 0.07471466064453125, "rewards/margins": 1.8872146606445312, "rewards/rejected": -1.8125, "step": 117 }, { "epoch": 0.0837770678026269, "grad_norm": 0.19478959255449588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97401059.55555555, "logits/rejected": 91764569.94594595, "logps/chosen": -226.66666666666666, "logps/rejected": -236.54054054054055, "loss": 0.3359, "rewards/chosen": 0.08434606481481481, "rewards/margins": 1.692454172922923, "rewards/rejected": -1.6081081081081081, "step": 118 }, { "epoch": 0.08448704295349663, "grad_norm": 0.1826370418370257, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 201867792.51612905, "logits/rejected": 61897759.03030303, "logps/chosen": -245.67741935483872, "logps/rejected": -255.27272727272728, "loss": 0.3343, "rewards/chosen": 0.15246975806451613, "rewards/margins": 1.9536061217008798, "rewards/rejected": -1.8011363636363635, "step": 119 }, { "epoch": 0.08519701810436635, "grad_norm": 0.1368640191759488, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113500408.24242425, "logits/rejected": 128805722.83870968, "logps/chosen": -185.93939393939394, "logps/rejected": -323.0967741935484, "loss": 0.3387, "rewards/chosen": 0.11274857954545454, "rewards/margins": 2.370813095674487, "rewards/rejected": -2.2580645161290325, "step": 120 }, { "epoch": 0.08590699325523607, "grad_norm": 0.15511418281527517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146407424.0, "logits/rejected": 137363456.0, "logps/chosen": -255.75, "logps/rejected": -272.25, "loss": 0.3473, "rewards/chosen": 0.09539794921875, "rewards/margins": 2.13250732421875, "rewards/rejected": -2.037109375, "step": 121 }, { "epoch": 0.08661696840610579, "grad_norm": 0.14243812553718102, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141674268.44444445, "logits/rejected": 65610898.28571428, "logps/chosen": -223.33333333333334, "logps/rejected": -272.2857142857143, "loss": 0.368, "rewards/chosen": 0.2024875217013889, "rewards/margins": 2.0707910931299605, "rewards/rejected": -1.8683035714285714, "step": 122 }, { "epoch": 0.08732694355697551, "grad_norm": 0.17489891931969742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114988457.35384615, "logits/rejected": 85816791.36507936, "logps/chosen": -206.27692307692308, "logps/rejected": -251.17460317460316, "loss": 0.3427, "rewards/chosen": 0.07151442307692307, "rewards/margins": 2.075482677045177, "rewards/rejected": -2.003968253968254, "step": 123 }, { "epoch": 0.08803691870784522, "grad_norm": 0.14092356820187404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130755762.79365079, "logits/rejected": 119763511.13846155, "logps/chosen": -221.20634920634922, "logps/rejected": -301.04615384615386, "loss": 0.3085, "rewards/chosen": 0.29247271825396826, "rewards/margins": 2.773241949023199, "rewards/rejected": -2.480769230769231, "step": 124 }, { "epoch": 0.08874689385871494, "grad_norm": 0.16185059068967508, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138412032.0, "logits/rejected": 32957828.413793102, "logps/chosen": -227.88571428571427, "logps/rejected": -240.68965517241378, "loss": 0.3628, "rewards/chosen": 0.049107142857142856, "rewards/margins": 1.846520935960591, "rewards/rejected": -1.7974137931034482, "step": 125 }, { "epoch": 0.08945686900958466, "grad_norm": 0.14756139652705072, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111149056.0, "logits/rejected": 83484407.74193548, "logps/chosen": -232.96969696969697, "logps/rejected": -330.19354838709677, "loss": 0.3173, "rewards/chosen": 0.2461529356060606, "rewards/margins": 2.761273903347996, "rewards/rejected": -2.5151209677419355, "step": 126 }, { "epoch": 0.09016684416045438, "grad_norm": 0.15867127737126493, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143180898.19178084, "logits/rejected": 69644511.41818182, "logps/chosen": -252.93150684931507, "logps/rejected": -281.6, "loss": 0.3385, "rewards/chosen": 0.2266695205479452, "rewards/margins": -1827845.3006032067, "rewards/rejected": 1827845.5272727273, "step": 127 }, { "epoch": 0.0908768193113241, "grad_norm": 0.1564912479921012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103181934.43137255, "logits/rejected": 108779546.5974026, "logps/chosen": -191.05882352941177, "logps/rejected": -324.57142857142856, "loss": 0.3018, "rewards/chosen": 0.02152267156862745, "rewards/margins": 2.526392801438757, "rewards/rejected": -2.5048701298701297, "step": 128 }, { "epoch": 0.09158679446219382, "grad_norm": 0.15379699739681754, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144603623.6190476, "logits/rejected": 99308213.16923077, "logps/chosen": -245.33333333333334, "logps/rejected": -304.24615384615385, "loss": 0.3045, "rewards/chosen": 0.5225694444444444, "rewards/margins": 3.080261752136752, "rewards/rejected": -2.5576923076923075, "step": 129 }, { "epoch": 0.09229676961306355, "grad_norm": 0.16308921182210548, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85482419.58208956, "logits/rejected": 134080209.83606558, "logps/chosen": -204.4179104477612, "logps/rejected": -315.8032786885246, "loss": 0.2924, "rewards/chosen": 0.40706623134328357, "rewards/margins": 3.1160826247859066, "rewards/rejected": -2.709016393442623, "step": 130 }, { "epoch": 0.09300674476393327, "grad_norm": 0.15742118018333667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155126646.4477612, "logits/rejected": 52703836.32786885, "logps/chosen": -267.94029850746267, "logps/rejected": -313.9672131147541, "loss": 0.3385, "rewards/chosen": 0.10075355643656717, "rewards/margins": 2.557720769551321, "rewards/rejected": -2.456967213114754, "step": 131 }, { "epoch": 0.09371671991480299, "grad_norm": 0.15130600437759212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166755359.03030303, "logits/rejected": 39237037.41935484, "logps/chosen": -244.4848484848485, "logps/rejected": -308.38709677419354, "loss": 0.3085, "rewards/chosen": 0.30314867424242425, "rewards/margins": 2.198309964565005, "rewards/rejected": -1.8951612903225807, "step": 132 }, { "epoch": 0.09442669506567271, "grad_norm": 0.15131919891499243, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145003081.14285713, "logits/rejected": 87140281.37931034, "logps/chosen": -202.97142857142856, "logps/rejected": -360.2758620689655, "loss": 0.3115, "rewards/chosen": 0.16752232142857143, "rewards/margins": 3.4089016317733987, "rewards/rejected": -3.2413793103448274, "step": 133 }, { "epoch": 0.09513667021654242, "grad_norm": 0.15938322759121926, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94028669.67272727, "logits/rejected": 114682613.47945206, "logps/chosen": -223.12727272727273, "logps/rejected": -299.83561643835617, "loss": 0.2702, "rewards/chosen": 0.5355113636363636, "rewards/margins": 2.395100404732254, "rewards/rejected": -1.8595890410958904, "step": 134 }, { "epoch": 0.09584664536741214, "grad_norm": 0.18961456870583215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94411408.90566038, "logits/rejected": 85172333.22666667, "logps/chosen": -233.81132075471697, "logps/rejected": -273.92, "loss": 0.2685, "rewards/chosen": 0.466686320754717, "rewards/margins": 2.373352987421384, "rewards/rejected": -1.9066666666666667, "step": 135 }, { "epoch": 0.09655662051828186, "grad_norm": 0.15223908160736876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154723214.2222222, "logits/rejected": 26284977.230769232, "logps/chosen": -235.93650793650792, "logps/rejected": -246.15384615384616, "loss": 0.3092, "rewards/chosen": 0.5498511904761905, "rewards/margins": 2.328697344322344, "rewards/rejected": -1.7788461538461537, "step": 136 }, { "epoch": 0.09726659566915158, "grad_norm": 0.15803037312843862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110061643.85185185, "logits/rejected": 101116734.27027027, "logps/chosen": -146.37037037037038, "logps/rejected": -268.97297297297297, "loss": 0.2621, "rewards/chosen": 0.4427083333333333, "rewards/margins": 3.1758164414414414, "rewards/rejected": -2.733108108108108, "step": 137 }, { "epoch": 0.0979765708200213, "grad_norm": 0.13481761582918983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106102784.0, "logits/rejected": 147456000.0, "logps/chosen": -226.5, "logps/rejected": -324.0, "loss": 0.2753, "rewards/chosen": 0.587158203125, "rewards/margins": 3.505126953125, "rewards/rejected": -2.91796875, "step": 138 }, { "epoch": 0.09868654597089102, "grad_norm": 0.14589443053452944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122030505.05660377, "logits/rejected": 105360916.48, "logps/chosen": -228.52830188679246, "logps/rejected": -337.06666666666666, "loss": 0.2508, "rewards/chosen": 0.5928655660377359, "rewards/margins": 3.7795322327044025, "rewards/rejected": -3.1866666666666665, "step": 139 }, { "epoch": 0.09939652112176074, "grad_norm": 0.16026790060098484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101114990.27692308, "logits/rejected": 110117124.06349206, "logps/chosen": -248.86153846153846, "logps/rejected": -279.36507936507934, "loss": 0.2781, "rewards/chosen": 0.5370192307692307, "rewards/margins": 2.8267017704517703, "rewards/rejected": -2.2896825396825395, "step": 140 }, { "epoch": 0.10010649627263046, "grad_norm": 0.14751904654669978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119304647.1111111, "logits/rejected": 83111746.95384616, "logps/chosen": -216.38095238095238, "logps/rejected": -325.9076923076923, "loss": 0.2851, "rewards/chosen": 0.29365079365079366, "rewards/margins": 2.724420024420024, "rewards/rejected": -2.4307692307692306, "step": 141 }, { "epoch": 0.10081647142350018, "grad_norm": 0.1469704073279115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154751037.13432837, "logits/rejected": 35239029.50819672, "logps/chosen": -265.3134328358209, "logps/rejected": -227.40983606557376, "loss": 0.2935, "rewards/chosen": 0.8703358208955224, "rewards/margins": 3.353942378272572, "rewards/rejected": -2.4836065573770494, "step": 142 }, { "epoch": 0.10152644657436989, "grad_norm": 0.14593334647409217, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102261126.09523809, "logits/rejected": 72980889.6, "logps/chosen": -215.87301587301587, "logps/rejected": -267.0769230769231, "loss": 0.3008, "rewards/chosen": 0.49975198412698413, "rewards/margins": 3.3074442918192917, "rewards/rejected": -2.8076923076923075, "step": 143 }, { "epoch": 0.10223642172523961, "grad_norm": 0.12944099198089018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114665818.58461538, "logits/rejected": 72709591.36507936, "logps/chosen": -286.03076923076924, "logps/rejected": -307.04761904761904, "loss": 0.2613, "rewards/chosen": 0.9740384615384615, "rewards/margins": 3.88276862026862, "rewards/rejected": -2.9087301587301586, "step": 144 }, { "epoch": 0.10294639687610933, "grad_norm": 0.1380861361913777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90387251.2, "logits/rejected": 165602692.41379312, "logps/chosen": -231.65714285714284, "logps/rejected": -370.48275862068965, "loss": 0.2747, "rewards/chosen": 0.6897321428571429, "rewards/margins": 3.3341286945812807, "rewards/rejected": -2.644396551724138, "step": 145 }, { "epoch": 0.10365637202697905, "grad_norm": 0.13261154411960593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125022523.07692307, "logits/rejected": 107396257.68421052, "logps/chosen": -232.46153846153845, "logps/rejected": -300.63157894736844, "loss": 0.2457, "rewards/chosen": 0.6604567307692307, "rewards/margins": 3.90716725708502, "rewards/rejected": -3.2467105263157894, "step": 146 }, { "epoch": 0.10436634717784878, "grad_norm": 0.13078302616883689, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74908797.75438596, "logits/rejected": 118031259.04225352, "logps/chosen": -218.3859649122807, "logps/rejected": -337.1267605633803, "loss": 0.2664, "rewards/chosen": 0.4849232456140351, "rewards/margins": 3.8053457808253026, "rewards/rejected": -3.3204225352112675, "step": 147 }, { "epoch": 0.1050763223287185, "grad_norm": 0.14670819854674433, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153664046.54545453, "logits/rejected": 99648545.03225806, "logps/chosen": -231.5151515151515, "logps/rejected": -323.8709677419355, "loss": 0.2611, "rewards/chosen": 0.5327888257575758, "rewards/margins": 4.246498503176931, "rewards/rejected": -3.713709677419355, "step": 148 }, { "epoch": 0.10578629747958822, "grad_norm": 0.2290845846539068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121154218.66666667, "logits/rejected": 134517321.14285713, "logps/chosen": -260.44444444444446, "logps/rejected": -375.42857142857144, "loss": 0.2841, "rewards/chosen": 0.66796875, "rewards/margins": 4.226004464285714, "rewards/rejected": -3.5580357142857144, "step": 149 }, { "epoch": 0.10649627263045794, "grad_norm": 0.1463855377887239, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106319251.39393939, "logits/rejected": 129143973.16129032, "logps/chosen": -195.63636363636363, "logps/rejected": -337.5483870967742, "loss": 0.2654, "rewards/chosen": 0.5179924242424242, "rewards/margins": 4.25186339198436, "rewards/rejected": -3.7338709677419355, "step": 150 }, { "epoch": 0.10720624778132766, "grad_norm": 0.14406473246363238, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89235337.27536231, "logits/rejected": 168198699.3898305, "logps/chosen": -252.52173913043478, "logps/rejected": -388.33898305084745, "loss": 0.2869, "rewards/chosen": 0.421195652173913, "rewards/margins": 9366375.47204311, "rewards/rejected": -9366375.050847458, "step": 151 }, { "epoch": 0.10791622293219737, "grad_norm": 0.12864381989809498, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142606336.0, "logits/rejected": 59440173.85074627, "logps/chosen": -278.0327868852459, "logps/rejected": -296.5970149253731, "loss": 0.2412, "rewards/chosen": 0.9129098360655737, "rewards/margins": 4.274850134573037, "rewards/rejected": -3.361940298507463, "step": 152 }, { "epoch": 0.10862619808306709, "grad_norm": 0.14273251943600057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149317222.4, "logits/rejected": 66584576.0, "logps/chosen": -227.73333333333332, "logps/rejected": -291.29411764705884, "loss": 0.2711, "rewards/chosen": 0.434375, "rewards/margins": 3.1292279411764707, "rewards/rejected": -2.6948529411764706, "step": 153 }, { "epoch": 0.10933617323393681, "grad_norm": 0.13495481604521944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92740721.77777778, "logits/rejected": 90460934.91891892, "logps/chosen": -166.5185185185185, "logps/rejected": -307.4594594594595, "loss": 0.2448, "rewards/chosen": 0.3385416666666667, "rewards/margins": 4.058136261261262, "rewards/rejected": -3.7195945945945947, "step": 154 }, { "epoch": 0.11004614838480653, "grad_norm": 0.14873535853614253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 157810688.0, "logits/rejected": 107872256.0, "logps/chosen": -250.0, "logps/rejected": -335.5, "loss": 0.269, "rewards/chosen": 0.649658203125, "rewards/margins": 4.204345703125, "rewards/rejected": -3.5546875, "step": 155 }, { "epoch": 0.11075612353567625, "grad_norm": 0.13984344703595414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119698983.38461539, "logits/rejected": 95070890.66666667, "logps/chosen": -220.30769230769232, "logps/rejected": -313.3968253968254, "loss": 0.2674, "rewards/chosen": 0.5504807692307693, "rewards/margins": 4.236988705738706, "rewards/rejected": -3.6865079365079363, "step": 156 }, { "epoch": 0.11146609868654597, "grad_norm": 0.13977977521356166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127360373.84126984, "logits/rejected": 118505219.93846154, "logps/chosen": -272.76190476190476, "logps/rejected": -357.9076923076923, "loss": 0.268, "rewards/chosen": 0.3472222222222222, "rewards/margins": 4.166452991452991, "rewards/rejected": -3.8192307692307694, "step": 157 }, { "epoch": 0.1121760738374157, "grad_norm": 0.14489005490083723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109750954.66666667, "logits/rejected": 102873807.56756757, "logps/chosen": -231.40740740740742, "logps/rejected": -314.81081081081084, "loss": 0.2198, "rewards/chosen": 0.6322337962962963, "rewards/margins": 4.382233796296296, "rewards/rejected": -3.75, "step": 158 }, { "epoch": 0.11288604898828541, "grad_norm": 0.1396007131267906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143538403.55555555, "logits/rejected": 109826237.04615384, "logps/chosen": -206.22222222222223, "logps/rejected": -295.1384615384615, "loss": 0.2708, "rewards/chosen": 0.1654265873015873, "rewards/margins": 4.026965048840049, "rewards/rejected": -3.8615384615384616, "step": 159 }, { "epoch": 0.11359602413915514, "grad_norm": 0.11743311850831349, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97063185.06666666, "logits/rejected": 118427407.05882353, "logps/chosen": -213.6, "logps/rejected": -316.47058823529414, "loss": 0.2265, "rewards/chosen": 0.89375, "rewards/margins": 3.695220588235294, "rewards/rejected": -2.801470588235294, "step": 160 }, { "epoch": 0.11430599929002484, "grad_norm": 0.16371980367054315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141225932.15189874, "logits/rejected": 107104548.57142857, "logps/chosen": -221.16455696202533, "logps/rejected": -302.0408163265306, "loss": 0.2935, "rewards/chosen": 0.5408425632911392, "rewards/margins": 3.551046644923792, "rewards/rejected": -3.010204081632653, "step": 161 }, { "epoch": 0.11501597444089456, "grad_norm": 0.14926014154949283, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110241333.49253732, "logits/rejected": 105476431.73770492, "logps/chosen": -196.53731343283582, "logps/rejected": -349.9016393442623, "loss": 0.2421, "rewards/chosen": 0.8022388059701493, "rewards/margins": 4.5686322485931, "rewards/rejected": -3.7663934426229506, "step": 162 }, { "epoch": 0.11572594959176428, "grad_norm": 0.12672924505639546, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74066082.53968254, "logits/rejected": 116020901.41538462, "logps/chosen": -236.44444444444446, "logps/rejected": -307.9384615384615, "loss": 0.2347, "rewards/chosen": 0.7013888888888888, "rewards/margins": 10566656.701388888, "rewards/rejected": -10566656.0, "step": 163 }, { "epoch": 0.116435924742634, "grad_norm": 0.1471870276004415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143638780.06153846, "logits/rejected": 111981259.17460318, "logps/chosen": -213.53846153846155, "logps/rejected": -301.2063492063492, "loss": 0.2383, "rewards/chosen": 0.7567307692307692, "rewards/margins": 3.665460927960928, "rewards/rejected": -2.9087301587301586, "step": 164 }, { "epoch": 0.11714589989350373, "grad_norm": 0.141729595128919, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138831462.4, "logits/rejected": 67293906.8235294, "logps/chosen": -293.06666666666666, "logps/rejected": -276.47058823529414, "loss": 0.2319, "rewards/chosen": 1.0861979166666667, "rewards/margins": 5.027374387254902, "rewards/rejected": -3.9411764705882355, "step": 165 }, { "epoch": 0.11785587504437345, "grad_norm": 0.1508855011106279, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88692053.33333333, "logits/rejected": 132490661.64705883, "logps/chosen": -169.33333333333334, "logps/rejected": -258.11764705882354, "loss": 0.2566, "rewards/chosen": 0.37552083333333336, "rewards/margins": 4.45640318627451, "rewards/rejected": -4.080882352941177, "step": 166 }, { "epoch": 0.11856585019524317, "grad_norm": 0.14020612080381578, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113614129.40350877, "logits/rejected": 88789280.45070423, "logps/chosen": -254.31578947368422, "logps/rejected": -294.76056338028167, "loss": 0.2318, "rewards/chosen": 0.6710526315789473, "rewards/margins": 4.938658265381765, "rewards/rejected": -4.267605633802817, "step": 167 }, { "epoch": 0.11927582534611289, "grad_norm": 0.15851511336207794, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168051780.26666668, "logits/rejected": 56499742.11764706, "logps/chosen": -235.2, "logps/rejected": -317.6470588235294, "loss": 0.2393, "rewards/chosen": 0.48020833333333335, "rewards/margins": 4.516973039215687, "rewards/rejected": -4.036764705882353, "step": 168 }, { "epoch": 0.11998580049698261, "grad_norm": 0.12221667536757443, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132361232.78688525, "logits/rejected": 75685276.65671642, "logps/chosen": -256.5245901639344, "logps/rejected": -291.1044776119403, "loss": 0.229, "rewards/chosen": 1.0932377049180328, "rewards/margins": 3.2947302422314655, "rewards/rejected": -2.201492537313433, "step": 169 }, { "epoch": 0.12069577564785232, "grad_norm": 0.1627845354832446, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110245111.1724138, "logits/rejected": 123672049.37142856, "logps/chosen": -181.51724137931035, "logps/rejected": -358.85714285714283, "loss": 0.2449, "rewards/chosen": 0.4951508620689655, "rewards/margins": 4.6380080049261085, "rewards/rejected": -4.142857142857143, "step": 170 }, { "epoch": 0.12140575079872204, "grad_norm": 0.14809248335839706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88317159.22580644, "logits/rejected": 126146870.3030303, "logps/chosen": -196.6451612903226, "logps/rejected": -289.45454545454544, "loss": 0.2509, "rewards/chosen": 0.5962701612903226, "rewards/margins": 4.372785312805474, "rewards/rejected": -3.7765151515151514, "step": 171 }, { "epoch": 0.12211572594959176, "grad_norm": 0.1315419255774214, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111925778.96296297, "logits/rejected": 103157206.48648648, "logps/chosen": -208.0, "logps/rejected": -301.4054054054054, "loss": 0.2391, "rewards/chosen": 0.6018518518518519, "rewards/margins": 4.392392392392392, "rewards/rejected": -3.7905405405405403, "step": 172 }, { "epoch": 0.12282570110046148, "grad_norm": 0.1330037300695357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91302095.76811594, "logits/rejected": 106848117.15254237, "logps/chosen": -186.43478260869566, "logps/rejected": -336.8135593220339, "loss": 0.2392, "rewards/chosen": 0.7923460144927537, "rewards/margins": 5.177939234831737, "rewards/rejected": -4.385593220338983, "step": 173 }, { "epoch": 0.1235356762513312, "grad_norm": 0.15545324292418775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 210256400.51612905, "logits/rejected": 72613888.0, "logps/chosen": -332.9032258064516, "logps/rejected": -299.8787878787879, "loss": 0.2287, "rewards/chosen": 1.3508064516129032, "rewards/margins": 5.7485337243401755, "rewards/rejected": -4.3977272727272725, "step": 174 }, { "epoch": 0.12424565140220092, "grad_norm": 0.1452637248296067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161380839.6190476, "logits/rejected": 66444429.78461538, "logps/chosen": -221.46031746031747, "logps/rejected": -321.4769230769231, "loss": 0.2426, "rewards/chosen": 0.8572668650793651, "rewards/margins": 4.02265148046398, "rewards/rejected": -3.1653846153846152, "step": 175 }, { "epoch": 0.12495562655307065, "grad_norm": 0.15058213826964345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136005464.13114753, "logits/rejected": 109928325.73134328, "logps/chosen": -227.14754098360655, "logps/rejected": -318.56716417910445, "loss": 0.2145, "rewards/chosen": 0.7940573770491803, "rewards/margins": 4.939579765108881, "rewards/rejected": -4.145522388059701, "step": 176 }, { "epoch": 0.12566560170394037, "grad_norm": 0.14385362168056195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115081216.0, "logits/rejected": 122814464.0, "logps/chosen": -189.5, "logps/rejected": -347.0, "loss": 0.2396, "rewards/chosen": 0.9130859375, "rewards/margins": 5.1513671875, "rewards/rejected": -4.23828125, "step": 177 }, { "epoch": 0.12637557685481007, "grad_norm": 0.16682179707236713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141368672.52459016, "logits/rejected": 95154359.40298508, "logps/chosen": -280.1311475409836, "logps/rejected": -290.86567164179104, "loss": 0.2278, "rewards/chosen": 0.9897540983606558, "rewards/margins": 5.314380964032298, "rewards/rejected": -4.324626865671642, "step": 178 }, { "epoch": 0.1270855520056798, "grad_norm": 0.14706767354618822, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108035591.87692308, "logits/rejected": 139277523.3015873, "logps/chosen": -214.15384615384616, "logps/rejected": -317.46031746031747, "loss": 0.2417, "rewards/chosen": 0.6194711538461538, "rewards/margins": 4.917090201465201, "rewards/rejected": -4.2976190476190474, "step": 179 }, { "epoch": 0.12779552715654952, "grad_norm": 0.14490018668149265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 189195374.2769231, "logits/rejected": 55641104.253968254, "logps/chosen": -236.30769230769232, "logps/rejected": -288.76190476190476, "loss": 0.2462, "rewards/chosen": 0.9009615384615385, "rewards/margins": 5.008104395604395, "rewards/rejected": -4.107142857142857, "step": 180 }, { "epoch": 0.12850550230741925, "grad_norm": 0.15586084448719048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90282393.6, "logits/rejected": 102575405.1764706, "logps/chosen": -158.8, "logps/rejected": -308.47058823529414, "loss": 0.2513, "rewards/chosen": 0.49895833333333334, "rewards/margins": 4.737928921568628, "rewards/rejected": -4.238970588235294, "step": 181 }, { "epoch": 0.12921547745828896, "grad_norm": 0.1496951119601363, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94804585.65079366, "logits/rejected": 149188166.8923077, "logps/chosen": -204.06349206349208, "logps/rejected": -315.5692307692308, "loss": 0.2629, "rewards/chosen": 0.6190476190476191, "rewards/margins": 4.988278388278388, "rewards/rejected": -4.369230769230769, "step": 182 }, { "epoch": 0.1299254526091587, "grad_norm": 0.23054671300597207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119871992.57971014, "logits/rejected": 98921593.49152543, "logps/chosen": -249.97101449275362, "logps/rejected": -342.23728813559325, "loss": 0.2576, "rewards/chosen": 0.7028985507246377, "rewards/margins": 4.98255956767379, "rewards/rejected": -4.279661016949152, "step": 183 }, { "epoch": 0.1306354277600284, "grad_norm": 0.16866490033921078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141812819.02702704, "logits/rejected": 75031438.22222222, "logps/chosen": -201.51351351351352, "logps/rejected": -238.22222222222223, "loss": 0.2701, "rewards/chosen": 0.575168918918919, "rewards/margins": 3.950168918918919, "rewards/rejected": -3.375, "step": 184 }, { "epoch": 0.1313454029108981, "grad_norm": 0.16789600361961488, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114277871.48387097, "logits/rejected": 82265553.45454545, "logps/chosen": -200.70967741935485, "logps/rejected": -313.45454545454544, "loss": 0.2201, "rewards/chosen": 0.9430443548387096, "rewards/margins": 4.920317082111437, "rewards/rejected": -3.977272727272727, "step": 185 }, { "epoch": 0.13205537806176784, "grad_norm": 0.15604230206937253, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124357434.38596492, "logits/rejected": 81523091.83098592, "logps/chosen": -214.87719298245614, "logps/rejected": -258.92957746478874, "loss": 0.2123, "rewards/chosen": 0.8760279605263158, "rewards/margins": 5.520394157709415, "rewards/rejected": -4.644366197183099, "step": 186 }, { "epoch": 0.13276535321263755, "grad_norm": 0.21195183627548012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107263156.70588236, "logits/rejected": 72561459.2, "logps/chosen": -176.7058823529412, "logps/rejected": -274.1333333333333, "loss": 0.2501, "rewards/chosen": 0.640625, "rewards/margins": 4.903125, "rewards/rejected": -4.2625, "step": 187 }, { "epoch": 0.13347532836350728, "grad_norm": 0.18344028508447963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142606336.0, "logits/rejected": 84140280.24242425, "logps/chosen": -253.16129032258064, "logps/rejected": -305.93939393939394, "loss": 0.2306, "rewards/chosen": 0.7535282258064516, "rewards/margins": 5.033831256109482, "rewards/rejected": -4.28030303030303, "step": 188 }, { "epoch": 0.134185303514377, "grad_norm": 0.185535959408914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130432190.91525424, "logits/rejected": 119507270.49275362, "logps/chosen": -256.0, "logps/rejected": -266.6666666666667, "loss": 0.2187, "rewards/chosen": 0.9925847457627118, "rewards/margins": 5.1991064848931465, "rewards/rejected": -4.206521739130435, "step": 189 }, { "epoch": 0.13489527866524673, "grad_norm": 0.20339257857354778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92274688.0, "logits/rejected": 116677911.27272727, "logps/chosen": -252.90322580645162, "logps/rejected": -335.5151515151515, "loss": 0.2098, "rewards/chosen": 1.3870967741935485, "rewards/margins": 5.724217986314761, "rewards/rejected": -4.337121212121212, "step": 190 }, { "epoch": 0.13560525381611643, "grad_norm": 0.24012841042826583, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125435904.0, "logits/rejected": 122421248.0, "logps/chosen": -236.75, "logps/rejected": -291.5, "loss": 0.2464, "rewards/chosen": 0.556640625, "rewards/margins": 4.677734375, "rewards/rejected": -4.12109375, "step": 191 }, { "epoch": 0.13631522896698617, "grad_norm": 0.253387028206057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109272656.84210527, "logits/rejected": 97356248.61538461, "logps/chosen": -206.10526315789474, "logps/rejected": -371.0769230769231, "loss": 0.2646, "rewards/chosen": 0.6051603618421053, "rewards/margins": 4.850352669534413, "rewards/rejected": -4.2451923076923075, "step": 192 }, { "epoch": 0.13702520411785588, "grad_norm": 0.24080578576403963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108884131.84, "logits/rejected": 145455297.20754716, "logps/chosen": -219.30666666666667, "logps/rejected": -307.3207547169811, "loss": 0.2463, "rewards/chosen": 0.9, "rewards/margins": 6.04622641509434, "rewards/rejected": -5.14622641509434, "step": 193 }, { "epoch": 0.13773517926872558, "grad_norm": 0.22377141604155223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123764231.87692308, "logits/rejected": 119238070.85714285, "logps/chosen": -259.9384615384615, "logps/rejected": -347.1746031746032, "loss": 0.2366, "rewards/chosen": 0.6153846153846154, "rewards/margins": 4.956654456654457, "rewards/rejected": -4.341269841269841, "step": 194 }, { "epoch": 0.13844515441959532, "grad_norm": 0.2547355960195006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159716433.26984128, "logits/rejected": 78659331.93846154, "logps/chosen": -266.92063492063494, "logps/rejected": -274.7076923076923, "loss": 0.2336, "rewards/chosen": 1.1845238095238095, "rewards/margins": 4.230677655677655, "rewards/rejected": -3.046153846153846, "step": 195 }, { "epoch": 0.13915512957046502, "grad_norm": 0.16134894817615691, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81142306.13333334, "logits/rejected": 139152203.29411766, "logps/chosen": -200.26666666666668, "logps/rejected": -335.05882352941177, "loss": 0.2512, "rewards/chosen": 0.6260416666666667, "rewards/margins": 3.6003063725490194, "rewards/rejected": -2.974264705882353, "step": 196 }, { "epoch": 0.13986510472133476, "grad_norm": 0.2482654176859797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 108565607.88405797, "logps/chosen": -230.77966101694915, "logps/rejected": -327.8840579710145, "loss": 0.1993, "rewards/chosen": 1.0254237288135593, "rewards/margins": 5.402235323016457, "rewards/rejected": -4.3768115942028984, "step": 197 }, { "epoch": 0.14057507987220447, "grad_norm": 0.2753154707938262, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100281995.63636364, "logits/rejected": 108815128.77419356, "logps/chosen": -201.45454545454547, "logps/rejected": -306.3225806451613, "loss": 0.2657, "rewards/chosen": 0.5662878787878788, "rewards/margins": 4.691287878787879, "rewards/rejected": -4.125, "step": 198 }, { "epoch": 0.1412850550230742, "grad_norm": 0.2577847388945329, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 30253996.06557377, "logits/rejected": 151245350.20895523, "logps/chosen": -210.75409836065575, "logps/rejected": -336.0, "loss": 0.2157, "rewards/chosen": 0.9123975409836066, "rewards/margins": 5.546725899192562, "rewards/rejected": -4.634328358208955, "step": 199 }, { "epoch": 0.1419950301739439, "grad_norm": 0.2499748535354651, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141341387.17460316, "logits/rejected": 56074618.092307694, "logps/chosen": -187.55555555555554, "logps/rejected": -262.15384615384613, "loss": 0.231, "rewards/chosen": 0.8263888888888888, "rewards/margins": 3.5648504273504273, "rewards/rejected": -2.7384615384615385, "step": 200 }, { "epoch": 0.14270500532481364, "grad_norm": 0.19438607792055437, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123166069.84126984, "logits/rejected": 85467009.96923077, "logps/chosen": -254.73015873015873, "logps/rejected": -273.2307692307692, "loss": 0.1981, "rewards/chosen": 0.9156746031746031, "rewards/margins": 5.727213064713064, "rewards/rejected": -4.811538461538461, "step": 201 }, { "epoch": 0.14341498047568335, "grad_norm": 0.23913165053226854, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132692526.54545455, "logits/rejected": 100416572.23529412, "logps/chosen": -233.97402597402598, "logps/rejected": -330.03921568627453, "loss": 0.2454, "rewards/chosen": 1.0121753246753247, "rewards/margins": 5.796489050165521, "rewards/rejected": -4.784313725490196, "step": 202 }, { "epoch": 0.14412495562655306, "grad_norm": 0.1799571113932335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166313271.6521739, "logits/rejected": 78092253.28813559, "logps/chosen": -273.8550724637681, "logps/rejected": -329.22033898305085, "loss": 0.2369, "rewards/chosen": 0.967391304347826, "rewards/margins": -16190761.337693442, "rewards/rejected": 16190762.305084746, "step": 203 }, { "epoch": 0.1448349307774228, "grad_norm": 0.2391641267015947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120935765.33333333, "logits/rejected": 107310518.85714285, "logps/chosen": -206.0, "logps/rejected": -340.0, "loss": 0.232, "rewards/chosen": 0.8489583333333334, "rewards/margins": 4.822172619047619, "rewards/rejected": -3.9732142857142856, "step": 204 }, { "epoch": 0.1455449059282925, "grad_norm": 0.22270546285809, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163771439.26153848, "logits/rejected": 102127973.58730158, "logps/chosen": -260.9230769230769, "logps/rejected": -325.3333333333333, "loss": 0.2201, "rewards/chosen": 1.020673076923077, "rewards/margins": 5.869879426129426, "rewards/rejected": -4.849206349206349, "step": 205 }, { "epoch": 0.14625488107916224, "grad_norm": 0.21191939204610696, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99050102.15384616, "logits/rejected": 132220440.38095239, "logps/chosen": -252.30769230769232, "logps/rejected": -324.57142857142856, "loss": 0.193, "rewards/chosen": 1.1576923076923078, "rewards/margins": 6.121978021978022, "rewards/rejected": -4.964285714285714, "step": 206 }, { "epoch": 0.14696485623003194, "grad_norm": 0.1928272048163964, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156009043.78181818, "logits/rejected": 113303664.21917808, "logps/chosen": -206.83636363636364, "logps/rejected": -291.94520547945206, "loss": 0.2085, "rewards/chosen": 0.8, "rewards/margins": 5.3, "rewards/rejected": -4.5, "step": 207 }, { "epoch": 0.14767483138090168, "grad_norm": 0.16590999097724002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128746896.69565217, "logits/rejected": 114170376.6779661, "logps/chosen": -229.79710144927537, "logps/rejected": -321.6271186440678, "loss": 0.2356, "rewards/chosen": 1.0330615942028984, "rewards/margins": 5.60085820437239, "rewards/rejected": -4.567796610169491, "step": 208 }, { "epoch": 0.14838480653177138, "grad_norm": 0.22762146174235465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143943650.3188406, "logits/rejected": 97464250.57627119, "logps/chosen": -222.14492753623188, "logps/rejected": -304.54237288135596, "loss": 0.2311, "rewards/chosen": 0.8396739130434783, "rewards/margins": 5.161707811348563, "rewards/rejected": -4.322033898305085, "step": 209 }, { "epoch": 0.14909478168264112, "grad_norm": 0.2364580807280037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166556419.71014494, "logits/rejected": 67570948.33898304, "logps/chosen": -227.71014492753622, "logps/rejected": -283.1186440677966, "loss": 0.238, "rewards/chosen": 0.8777173913043478, "rewards/margins": 4.869242815033162, "rewards/rejected": -3.9915254237288136, "step": 210 }, { "epoch": 0.14980475683351083, "grad_norm": 0.19225594172423283, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105946505.84615384, "logits/rejected": 146201453.7142857, "logps/chosen": -230.64615384615385, "logps/rejected": -316.6984126984127, "loss": 0.218, "rewards/chosen": 0.5903846153846154, "rewards/margins": 5.0903846153846155, "rewards/rejected": -4.5, "step": 211 }, { "epoch": 0.15051473198438053, "grad_norm": 0.2300722003474391, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102498304.0, "logits/rejected": 113180672.0, "logps/chosen": -161.125, "logps/rejected": -277.25, "loss": 0.2036, "rewards/chosen": 0.94921875, "rewards/margins": 11518247.94921875, "rewards/rejected": -11518247.0, "step": 212 }, { "epoch": 0.15122470713525027, "grad_norm": 0.17727277702281258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89325568.0, "logits/rejected": 117243904.0, "logps/chosen": -208.75, "logps/rejected": -384.0, "loss": 0.2071, "rewards/chosen": 0.80517578125, "rewards/margins": 5.60986328125, "rewards/rejected": -4.8046875, "step": 213 }, { "epoch": 0.15193468228611998, "grad_norm": 0.3400160293858959, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183196374.70967743, "logits/rejected": 36493622.303030305, "logps/chosen": -228.6451612903226, "logps/rejected": -326.54545454545456, "loss": 0.2163, "rewards/chosen": 0.7883064516129032, "rewards/margins": 5.064821603128054, "rewards/rejected": -4.276515151515151, "step": 214 }, { "epoch": 0.1526446574369897, "grad_norm": 0.2209179748225996, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79298560.0, "logits/rejected": 118947840.0, "logps/chosen": -170.75, "logps/rejected": -314.5, "loss": 0.2357, "rewards/chosen": 0.771484375, "rewards/margins": 5.185546875, "rewards/rejected": -4.4140625, "step": 215 }, { "epoch": 0.15335463258785942, "grad_norm": 0.1919742878401442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117505039.75384615, "logits/rejected": 85883367.61904761, "logps/chosen": -225.47692307692307, "logps/rejected": -329.14285714285717, "loss": 0.2082, "rewards/chosen": 1.2961538461538462, "rewards/margins": 5.776312576312576, "rewards/rejected": -4.48015873015873, "step": 216 }, { "epoch": 0.15406460773872915, "grad_norm": 0.19349310863075875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155122671.74603173, "logits/rejected": 62511261.538461536, "logps/chosen": -213.33333333333334, "logps/rejected": -300.8, "loss": 0.2082, "rewards/chosen": 0.939484126984127, "rewards/margins": 5.685637973137974, "rewards/rejected": -4.746153846153846, "step": 217 }, { "epoch": 0.15477458288959886, "grad_norm": 0.1847379294366493, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103498881.8028169, "logits/rejected": 122113113.8245614, "logps/chosen": -204.16901408450704, "logps/rejected": -340.2105263157895, "loss": 0.2438, "rewards/chosen": 0.829225352112676, "rewards/margins": 4.833611317024957, "rewards/rejected": -4.004385964912281, "step": 218 }, { "epoch": 0.1554845580404686, "grad_norm": 0.20969839597869047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117309440.0, "logits/rejected": 133300224.0, "logps/chosen": -235.5, "logps/rejected": -358.0, "loss": 0.2027, "rewards/chosen": 1.0380859375, "rewards/margins": 6.3505859375, "rewards/rejected": -5.3125, "step": 219 }, { "epoch": 0.1561945331913383, "grad_norm": 0.18457917855983155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130830020.92307693, "logits/rejected": 111976879.15789473, "logps/chosen": -262.9230769230769, "logps/rejected": -331.36842105263156, "loss": 0.1948, "rewards/chosen": 0.8503605769230769, "rewards/margins": 5.7977289979757085, "rewards/rejected": -4.947368421052632, "step": 220 }, { "epoch": 0.156904508342208, "grad_norm": 0.2005368195624318, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68363122.21538462, "logits/rejected": 115310071.87301587, "logps/chosen": -210.33846153846153, "logps/rejected": -298.41269841269843, "loss": 0.2161, "rewards/chosen": 1.103846153846154, "rewards/margins": 4.635592185592186, "rewards/rejected": -3.5317460317460316, "step": 221 }, { "epoch": 0.15761448349307774, "grad_norm": 0.262385618172805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125693819.87096775, "logits/rejected": 110164030.06060606, "logps/chosen": -198.4516129032258, "logps/rejected": -307.3939393939394, "loss": 0.2146, "rewards/chosen": 0.7358870967741935, "rewards/margins": 5.868462854349952, "rewards/rejected": -5.132575757575758, "step": 222 }, { "epoch": 0.15832445864394745, "grad_norm": 0.1885262787840731, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147047363.7647059, "logits/rejected": 82487978.66666667, "logps/chosen": -194.58823529411765, "logps/rejected": -302.6666666666667, "loss": 0.2274, "rewards/chosen": 0.9283088235294118, "rewards/margins": 5.461642156862745, "rewards/rejected": -4.533333333333333, "step": 223 }, { "epoch": 0.1590344337948172, "grad_norm": 0.19938378106182414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96468992.0, "logits/rejected": 70399779.44615385, "logps/chosen": -156.95238095238096, "logps/rejected": -240.4923076923077, "loss": 0.2162, "rewards/chosen": 0.6656746031746031, "rewards/margins": 5.569520757020757, "rewards/rejected": -4.903846153846154, "step": 224 }, { "epoch": 0.1597444089456869, "grad_norm": 0.19283605230435025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164975957.33333334, "logits/rejected": 88080384.0, "logps/chosen": -214.6031746031746, "logps/rejected": -292.67692307692306, "loss": 0.2188, "rewards/chosen": 0.9503968253968254, "rewards/margins": -50750423.66498779, "rewards/rejected": 50750424.615384616, "step": 225 }, { "epoch": 0.16045438409655663, "grad_norm": 0.2006188531363677, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97803543.27272727, "logits/rejected": 108936991.56164384, "logps/chosen": -194.03636363636363, "logps/rejected": -368.6575342465753, "loss": 0.1966, "rewards/chosen": 0.76875, "rewards/margins": 5.933133561643835, "rewards/rejected": -5.164383561643835, "step": 226 }, { "epoch": 0.16116435924742634, "grad_norm": 0.203307304449532, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147309040.4848485, "logits/rejected": 83277229.41935484, "logps/chosen": -284.8484848484849, "logps/rejected": -267.61290322580646, "loss": 0.2194, "rewards/chosen": 1.2121212121212122, "rewards/margins": 5.712121212121212, "rewards/rejected": -4.5, "step": 227 }, { "epoch": 0.16187433439829607, "grad_norm": 0.22224575435016117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131596288.0, "logits/rejected": 98533376.0, "logps/chosen": -247.25, "logps/rejected": -320.5, "loss": 0.1837, "rewards/chosen": 1.30078125, "rewards/margins": 6.52734375, "rewards/rejected": -5.2265625, "step": 228 }, { "epoch": 0.16258430954916578, "grad_norm": 0.21900271583419353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126776220.90322581, "logits/rejected": 64312661.333333336, "logps/chosen": -173.29032258064515, "logps/rejected": -299.8787878787879, "loss": 0.212, "rewards/chosen": 0.9390120967741935, "rewards/margins": 5.692799975562073, "rewards/rejected": -4.753787878787879, "step": 229 }, { "epoch": 0.16329428470003549, "grad_norm": 0.26915279475687515, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167893734.0289855, "logits/rejected": 121279366.50847457, "logps/chosen": -257.6231884057971, "logps/rejected": -382.64406779661016, "loss": 0.2289, "rewards/chosen": 0.8713768115942029, "rewards/margins": 5.795105625153525, "rewards/rejected": -4.923728813559322, "step": 230 }, { "epoch": 0.16400425985090522, "grad_norm": 0.522692245736694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107085824.0, "logits/rejected": 101580800.0, "logps/chosen": -187.25, "logps/rejected": -298.75, "loss": 0.2126, "rewards/chosen": 0.5517578125, "rewards/margins": 5.7197265625, "rewards/rejected": -5.16796875, "step": 231 }, { "epoch": 0.16471423500177493, "grad_norm": 0.258799171886571, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121568239.74603175, "logits/rejected": 120086149.90769231, "logps/chosen": -217.65079365079364, "logps/rejected": -349.53846153846155, "loss": 0.1919, "rewards/chosen": 1.0783730158730158, "rewards/margins": 6.33221916971917, "rewards/rejected": -5.253846153846154, "step": 232 }, { "epoch": 0.16542421015264466, "grad_norm": 0.19457916981980536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132911305.44262294, "logits/rejected": 116384110.80597015, "logps/chosen": -207.7377049180328, "logps/rejected": -337.1940298507463, "loss": 0.1969, "rewards/chosen": 0.9738729508196722, "rewards/margins": 5.824619219476388, "rewards/rejected": -4.850746268656716, "step": 233 }, { "epoch": 0.16613418530351437, "grad_norm": 0.29598291339862126, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127965840.90566038, "logits/rejected": 110058536.96, "logps/chosen": -252.67924528301887, "logps/rejected": -304.64, "loss": 0.1851, "rewards/chosen": 1.1202830188679245, "rewards/margins": 6.093616352201258, "rewards/rejected": -4.973333333333334, "step": 234 }, { "epoch": 0.1668441604543841, "grad_norm": 0.2387509197495391, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156370742.08450705, "logits/rejected": 70567325.19298245, "logps/chosen": -236.8450704225352, "logps/rejected": -303.43859649122805, "loss": 0.2336, "rewards/chosen": 0.852112676056338, "rewards/margins": 4.303867062021251, "rewards/rejected": -3.4517543859649122, "step": 235 }, { "epoch": 0.1675541356052538, "grad_norm": 0.25595584866077803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140556846.54545453, "logits/rejected": 119064113.5483871, "logps/chosen": -241.93939393939394, "logps/rejected": -323.8709677419355, "loss": 0.2356, "rewards/chosen": 0.8423295454545454, "rewards/margins": 5.830232771260997, "rewards/rejected": -4.987903225806452, "step": 236 }, { "epoch": 0.16826411075612355, "grad_norm": 0.22552780689185373, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99325457.65517241, "logits/rejected": 120676117.94285715, "logps/chosen": -222.20689655172413, "logps/rejected": -319.0857142857143, "loss": 0.2056, "rewards/chosen": 0.7252155172413793, "rewards/margins": 4.903786945812808, "rewards/rejected": -4.178571428571429, "step": 237 }, { "epoch": 0.16897408590699325, "grad_norm": 0.20083900999851736, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132464884.53731343, "logits/rejected": 80620023.60655738, "logps/chosen": -294.6865671641791, "logps/rejected": -284.0655737704918, "loss": 0.2181, "rewards/chosen": 1.0914179104477613, "rewards/margins": 5.734860533398582, "rewards/rejected": -4.64344262295082, "step": 238 }, { "epoch": 0.16968406105786296, "grad_norm": 0.16459718077540803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172310259.40983605, "logits/rejected": 98910452.53731343, "logps/chosen": -244.98360655737704, "logps/rejected": -356.2985074626866, "loss": 0.1914, "rewards/chosen": 1.0788934426229508, "rewards/margins": 6.295311353070712, "rewards/rejected": -5.2164179104477615, "step": 239 }, { "epoch": 0.1703940362087327, "grad_norm": 0.21826038059056418, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117538816.0, "logits/rejected": 94765056.0, "logps/chosen": -202.75, "logps/rejected": -291.5, "loss": 0.2239, "rewards/chosen": 0.779296875, "rewards/margins": 5.505859375, "rewards/rejected": -4.7265625, "step": 240 }, { "epoch": 0.1711040113596024, "grad_norm": 0.3155067574959115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134582450.08695653, "logits/rejected": 27156341.15254237, "logps/chosen": -215.18840579710144, "logps/rejected": -289.89830508474574, "loss": 0.2094, "rewards/chosen": 1.335144927536232, "rewards/margins": 5.9283652665192825, "rewards/rejected": -4.593220338983051, "step": 241 }, { "epoch": 0.17181398651047214, "grad_norm": 0.29800719016877647, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62988144.28070176, "logits/rejected": 135281072.67605633, "logps/chosen": -204.21052631578948, "logps/rejected": -330.36619718309856, "loss": 0.1885, "rewards/chosen": 0.8097587719298246, "rewards/margins": 6.1970827155917965, "rewards/rejected": -5.387323943661972, "step": 242 }, { "epoch": 0.17252396166134185, "grad_norm": 0.197016563139447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118638884.57142857, "logits/rejected": 131765673.35384615, "logps/chosen": -265.6507936507937, "logps/rejected": -323.44615384615383, "loss": 0.2143, "rewards/chosen": 0.9136904761904762, "rewards/margins": 5.575228937728938, "rewards/rejected": -4.661538461538462, "step": 243 }, { "epoch": 0.17323393681221158, "grad_norm": 0.23156432281414477, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118965713.45454545, "logits/rejected": 59870307.09677419, "logps/chosen": -191.27272727272728, "logps/rejected": -283.61290322580646, "loss": 0.2229, "rewards/chosen": 0.7386363636363636, "rewards/margins": 5.295087976539589, "rewards/rejected": -4.556451612903226, "step": 244 }, { "epoch": 0.1739439119630813, "grad_norm": 0.23029272185788602, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88346689.01587301, "logits/rejected": 148542889.35384616, "logps/chosen": -219.3015873015873, "logps/rejected": -304.9846153846154, "loss": 0.2307, "rewards/chosen": 0.6944444444444444, "rewards/margins": 11599254.355982905, "rewards/rejected": -11599253.661538461, "step": 245 }, { "epoch": 0.17465388711395102, "grad_norm": 0.24393933343211854, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99265194.66666667, "logits/rejected": 108955112.36923076, "logps/chosen": -258.031746031746, "logps/rejected": -311.6307692307692, "loss": 0.2198, "rewards/chosen": 0.753968253968254, "rewards/margins": -20100819.922954824, "rewards/rejected": 20100820.676923078, "step": 246 }, { "epoch": 0.17536386226482073, "grad_norm": 0.3223457814075444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138280960.0, "logits/rejected": 85622784.0, "logps/chosen": -244.25, "logps/rejected": -299.875, "loss": 0.2001, "rewards/chosen": 1.3994140625, "rewards/margins": 6.6220703125, "rewards/rejected": -5.22265625, "step": 247 }, { "epoch": 0.17607383741569044, "grad_norm": 0.252145581274685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145358848.0, "logits/rejected": 84978688.0, "logps/chosen": -257.75, "logps/rejected": -309.0, "loss": 0.233, "rewards/chosen": 0.829833984375, "rewards/margins": 5.704833984375, "rewards/rejected": -4.875, "step": 248 }, { "epoch": 0.17678381256656017, "grad_norm": 0.22876566724906036, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160635583.0447761, "logits/rejected": 36545452.06557377, "logps/chosen": -222.08955223880596, "logps/rejected": -332.59016393442624, "loss": 0.2183, "rewards/chosen": 0.8917910447761194, "rewards/margins": 6.219659897235136, "rewards/rejected": -5.327868852459017, "step": 249 }, { "epoch": 0.17749378771742988, "grad_norm": 0.18108744837210006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89451598.76923077, "logits/rejected": 144337318.6031746, "logps/chosen": -174.64615384615385, "logps/rejected": -355.04761904761904, "loss": 0.2181, "rewards/chosen": 0.8774038461538461, "rewards/margins": 6.536134004884005, "rewards/rejected": -5.658730158730159, "step": 250 }, { "epoch": 0.17820376286829961, "grad_norm": 0.21961678477563767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87974006.72463769, "logits/rejected": 131942851.2542373, "logps/chosen": -168.57971014492753, "logps/rejected": -320.8135593220339, "loss": 0.2044, "rewards/chosen": 1.0045289855072463, "rewards/margins": 5.818088307541144, "rewards/rejected": -4.813559322033898, "step": 251 }, { "epoch": 0.17891373801916932, "grad_norm": 0.20193292959507503, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152613798.17543858, "logits/rejected": 82911347.38028169, "logps/chosen": -189.47368421052633, "logps/rejected": -315.0422535211268, "loss": 0.2166, "rewards/chosen": 0.5444078947368421, "rewards/margins": 4.0514501482579695, "rewards/rejected": -3.507042253521127, "step": 252 }, { "epoch": 0.17962371317003906, "grad_norm": 0.17182413816386707, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125369470.24657534, "logits/rejected": 130480984.43636364, "logps/chosen": -226.41095890410958, "logps/rejected": -356.6545454545454, "loss": 0.2101, "rewards/chosen": 1.5077054794520548, "rewards/margins": 6.598614570361145, "rewards/rejected": -5.090909090909091, "step": 253 }, { "epoch": 0.18033368832090876, "grad_norm": 0.269655292421174, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183043728.41025642, "logits/rejected": 9248440.32, "logps/chosen": -273.43589743589746, "logps/rejected": -262.08, "loss": 0.2512, "rewards/chosen": 1.0432692307692308, "rewards/margins": 5.928269230769231, "rewards/rejected": -4.885, "step": 254 }, { "epoch": 0.1810436634717785, "grad_norm": 0.2252211912708628, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138909661.2881356, "logits/rejected": 84402769.6231884, "logps/chosen": -202.84745762711864, "logps/rejected": -280.1159420289855, "loss": 0.206, "rewards/chosen": 0.9782838983050848, "rewards/margins": 5.920312883812331, "rewards/rejected": -4.942028985507246, "step": 255 }, { "epoch": 0.1817536386226482, "grad_norm": 0.22356269648927096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64996291.76470588, "logits/rejected": 157146589.86666667, "logps/chosen": -193.7058823529412, "logps/rejected": -362.1333333333333, "loss": 0.2048, "rewards/chosen": 0.8658088235294118, "rewards/margins": 6.0783088235294125, "rewards/rejected": -5.2125, "step": 256 }, { "epoch": 0.1824636137735179, "grad_norm": 0.2316926569402313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142270791.68, "logits/rejected": 70709634.41509435, "logps/chosen": -223.78666666666666, "logps/rejected": -328.75471698113205, "loss": 0.2474, "rewards/chosen": 0.9302083333333333, "rewards/margins": 5.826434748427673, "rewards/rejected": -4.89622641509434, "step": 257 }, { "epoch": 0.18317358892438765, "grad_norm": 0.22662356875177275, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140064333.57575756, "logits/rejected": 54999502.451612905, "logps/chosen": -185.0909090909091, "logps/rejected": -312.0, "loss": 0.2236, "rewards/chosen": 0.6832386363636364, "rewards/margins": 4.646948313782992, "rewards/rejected": -3.963709677419355, "step": 258 }, { "epoch": 0.18388356407525736, "grad_norm": 0.24414927707282474, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142241613.91304347, "logits/rejected": 86800765.83050847, "logps/chosen": -212.40579710144928, "logps/rejected": -324.33898305084745, "loss": 0.2081, "rewards/chosen": 1.0914855072463767, "rewards/margins": 5.218604151314174, "rewards/rejected": -4.127118644067797, "step": 259 }, { "epoch": 0.1845935392261271, "grad_norm": 0.3042198619011691, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153681920.0, "logits/rejected": 105512960.0, "logps/chosen": -216.0, "logps/rejected": -330.0, "loss": 0.2281, "rewards/chosen": 0.74951171875, "rewards/margins": 5.80419921875, "rewards/rejected": -5.0546875, "step": 260 }, { "epoch": 0.1853035143769968, "grad_norm": 0.21775588858354164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172831955.3015873, "logits/rejected": 88144911.75384615, "logps/chosen": -242.79365079365078, "logps/rejected": -291.9384615384615, "loss": 0.235, "rewards/chosen": 0.7018849206349206, "rewards/margins": 4.282654151404151, "rewards/rejected": -3.580769230769231, "step": 261 }, { "epoch": 0.18601348952786653, "grad_norm": 0.21354273931341347, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172354825.4814815, "logits/rejected": 43416714.37837838, "logps/chosen": -319.7037037037037, "logps/rejected": -310.4864864864865, "loss": 0.185, "rewards/chosen": 1.0202546296296295, "rewards/margins": 5.868227602602603, "rewards/rejected": -4.847972972972973, "step": 262 }, { "epoch": 0.18672346467873624, "grad_norm": 0.2104698793001246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154763738.89855072, "logits/rejected": 112293159.05084746, "logps/chosen": -253.91304347826087, "logps/rejected": -344.9491525423729, "loss": 0.2224, "rewards/chosen": 0.782608695652174, "rewards/margins": 6.1597273397199706, "rewards/rejected": -5.377118644067797, "step": 263 }, { "epoch": 0.18743343982960597, "grad_norm": 0.2026078245843651, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101694395.73333333, "logits/rejected": 111519141.64705883, "logps/chosen": -208.0, "logps/rejected": -285.1764705882353, "loss": 0.2016, "rewards/chosen": 0.8140625, "rewards/margins": 5.997886029411765, "rewards/rejected": -5.1838235294117645, "step": 264 }, { "epoch": 0.18814341498047568, "grad_norm": 0.23834015752820376, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128535122.58064516, "logits/rejected": 103809024.0, "logps/chosen": -225.29032258064515, "logps/rejected": -257.2121212121212, "loss": 0.198, "rewards/chosen": 1.0171370967741935, "rewards/margins": 6.066379521016618, "rewards/rejected": -5.049242424242424, "step": 265 }, { "epoch": 0.18885339013134542, "grad_norm": 0.18280608748418029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120465911.60655738, "logits/rejected": 120664491.94029851, "logps/chosen": -205.11475409836066, "logps/rejected": -321.43283582089555, "loss": 0.2055, "rewards/chosen": 0.8145491803278688, "rewards/margins": 6.105593956447271, "rewards/rejected": -5.291044776119403, "step": 266 }, { "epoch": 0.18956336528221512, "grad_norm": 0.4250933937619147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128508814.22222222, "logits/rejected": 127701577.14285715, "logps/chosen": -217.33333333333334, "logps/rejected": -389.7142857142857, "loss": 0.2249, "rewards/chosen": 0.8958333333333334, "rewards/margins": 6.9226190476190474, "rewards/rejected": -6.026785714285714, "step": 267 }, { "epoch": 0.19027334043308483, "grad_norm": 0.1964241027132096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103413659.27868852, "logits/rejected": 87970831.28358209, "logps/chosen": -168.13114754098362, "logps/rejected": -326.44776119402985, "loss": 0.1907, "rewards/chosen": 0.951844262295082, "rewards/margins": 6.116023366772694, "rewards/rejected": -5.164179104477612, "step": 268 }, { "epoch": 0.19098331558395457, "grad_norm": 0.21734627385771824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114837150.89655173, "logits/rejected": 90716803.65714286, "logps/chosen": -241.10344827586206, "logps/rejected": -273.14285714285717, "loss": 0.2275, "rewards/chosen": 0.6567887931034483, "rewards/margins": 5.456788793103448, "rewards/rejected": -4.8, "step": 269 }, { "epoch": 0.19169329073482427, "grad_norm": 0.17173178219402838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 178296050.03636363, "logits/rejected": 115716825.42465754, "logps/chosen": -262.4, "logps/rejected": -358.5753424657534, "loss": 0.1949, "rewards/chosen": 0.7977272727272727, "rewards/margins": 6.414165628891657, "rewards/rejected": -5.616438356164384, "step": 270 }, { "epoch": 0.192403265885694, "grad_norm": 0.19094755201439878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103109973.33333333, "logits/rejected": 132490661.64705883, "logps/chosen": -232.8, "logps/rejected": -351.05882352941177, "loss": 0.1763, "rewards/chosen": 1.3583333333333334, "rewards/margins": 10100406.064215686, "rewards/rejected": -10100404.705882354, "step": 271 }, { "epoch": 0.19311324103656372, "grad_norm": 0.2547679174680736, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123299222.34920634, "logits/rejected": 87757745.23076923, "logps/chosen": -215.61904761904762, "logps/rejected": -277.66153846153844, "loss": 0.1942, "rewards/chosen": 1.1304563492063493, "rewards/margins": 5.399687118437118, "rewards/rejected": -4.269230769230769, "step": 272 }, { "epoch": 0.19382321618743345, "grad_norm": 0.31345960935756534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140296429.44927537, "logits/rejected": 120284107.9322034, "logps/chosen": -244.63768115942028, "logps/rejected": -335.1864406779661, "loss": 0.225, "rewards/chosen": 1.1539855072463767, "rewards/margins": 6.166697371653156, "rewards/rejected": -5.012711864406779, "step": 273 }, { "epoch": 0.19453319133830316, "grad_norm": 0.4794997005387614, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161762410.98507464, "logits/rejected": 47254679.08196721, "logps/chosen": -281.7910447761194, "logps/rejected": -283.0163934426229, "loss": 0.2209, "rewards/chosen": 1.1473880597014925, "rewards/margins": 5.233453633471985, "rewards/rejected": -4.086065573770492, "step": 274 }, { "epoch": 0.1952431664891729, "grad_norm": 0.2239942155364995, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94090133.01492538, "logits/rejected": 122047370.49180327, "logps/chosen": -217.07462686567163, "logps/rejected": -350.95081967213116, "loss": 0.2279, "rewards/chosen": 0.9001865671641791, "rewards/margins": 6.244448862246147, "rewards/rejected": -5.344262295081967, "step": 275 }, { "epoch": 0.1959531416400426, "grad_norm": 0.30029487209530464, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110616942.80597015, "logits/rejected": 104032491.01639344, "logps/chosen": -182.6865671641791, "logps/rejected": -312.1311475409836, "loss": 0.2018, "rewards/chosen": 1.1445895522388059, "rewards/margins": 6.11999938830438, "rewards/rejected": -4.975409836065574, "step": 276 }, { "epoch": 0.1966631167909123, "grad_norm": 0.3586992984745448, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112057821.86666666, "logits/rejected": 77066480.94117647, "logps/chosen": -197.33333333333334, "logps/rejected": -275.52941176470586, "loss": 0.2224, "rewards/chosen": 0.325, "rewards/margins": 5.060294117647059, "rewards/rejected": -4.735294117647059, "step": 277 }, { "epoch": 0.19737309194178204, "grad_norm": 0.21710966666080694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119817284.26666667, "logits/rejected": 105351047.52941176, "logps/chosen": -181.33333333333334, "logps/rejected": -312.94117647058823, "loss": 0.2043, "rewards/chosen": 0.8161458333333333, "rewards/margins": 4.588204656862745, "rewards/rejected": -3.7720588235294117, "step": 278 }, { "epoch": 0.19808306709265175, "grad_norm": 0.2032859163500201, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86199115.29411764, "logits/rejected": 131002094.93333334, "logps/chosen": -200.7058823529412, "logps/rejected": -292.8, "loss": 0.1794, "rewards/chosen": 1.4926470588235294, "rewards/margins": 6.225980392156863, "rewards/rejected": -4.733333333333333, "step": 279 }, { "epoch": 0.19879304224352148, "grad_norm": 0.1476814498485769, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91016396.8, "logits/rejected": 140046576.94117647, "logps/chosen": -297.6, "logps/rejected": -323.7647058823529, "loss": 0.1846, "rewards/chosen": 1.3385416666666667, "rewards/margins": 6.401041666666667, "rewards/rejected": -5.0625, "step": 280 }, { "epoch": 0.1995030173943912, "grad_norm": 0.2323628108278076, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137992601.6, "logits/rejected": 59460427.294117644, "logps/chosen": -198.13333333333333, "logps/rejected": -328.0, "loss": 0.222, "rewards/chosen": 0.4786458333333333, "rewards/margins": 5.68452818627451, "rewards/rejected": -5.205882352941177, "step": 281 }, { "epoch": 0.20021299254526093, "grad_norm": 0.21394576284735622, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88290099.2, "logits/rejected": 136348168.12698412, "logps/chosen": -212.43076923076924, "logps/rejected": -406.3492063492063, "loss": 0.1901, "rewards/chosen": 1.2846153846153847, "rewards/margins": 6.471123321123321, "rewards/rejected": -5.186507936507937, "step": 282 }, { "epoch": 0.20092296769613063, "grad_norm": 0.19563845754107664, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172965107.80952382, "logits/rejected": 54590479.75384615, "logps/chosen": -207.23809523809524, "logps/rejected": -311.1384615384615, "loss": 0.1869, "rewards/chosen": 1.0426587301587302, "rewards/margins": 6.865735653235653, "rewards/rejected": -5.823076923076923, "step": 283 }, { "epoch": 0.20163294284700037, "grad_norm": 0.19917946082946128, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70788078.03508772, "logits/rejected": 131027693.97183098, "logps/chosen": -193.96491228070175, "logps/rejected": -333.5211267605634, "loss": 0.1877, "rewards/chosen": 0.7154605263157895, "rewards/margins": 6.659122498146775, "rewards/rejected": -5.943661971830986, "step": 284 }, { "epoch": 0.20234291799787008, "grad_norm": 0.22534511665469284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150368928.47761193, "logits/rejected": 105613949.90163934, "logps/chosen": -198.44776119402985, "logps/rejected": -321.8360655737705, "loss": 0.219, "rewards/chosen": 0.8269589552238806, "rewards/margins": 4.843352397846831, "rewards/rejected": -4.016393442622951, "step": 285 }, { "epoch": 0.20305289314873978, "grad_norm": 0.15314222983017475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109919691.03448276, "logits/rejected": 166753543.31428573, "logps/chosen": -208.27586206896552, "logps/rejected": -425.6, "loss": 0.1791, "rewards/chosen": 1.0829741379310345, "rewards/margins": 6.968688423645321, "rewards/rejected": -5.885714285714286, "step": 286 }, { "epoch": 0.20376286829960952, "grad_norm": 0.3338241772162338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106168320.0, "logits/rejected": 134380183.43661973, "logps/chosen": -189.47368421052633, "logps/rejected": -328.3380281690141, "loss": 0.175, "rewards/chosen": 0.8168859649122807, "rewards/margins": 6.147871880405239, "rewards/rejected": -5.330985915492958, "step": 287 }, { "epoch": 0.20447284345047922, "grad_norm": 0.26813599812894623, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153944064.0, "logits/rejected": 86310912.0, "logps/chosen": -285.75, "logps/rejected": -379.5, "loss": 0.1766, "rewards/chosen": 1.6318359375, "rewards/margins": 7.0224609375, "rewards/rejected": -5.390625, "step": 288 }, { "epoch": 0.20518281860134896, "grad_norm": 0.22203089146612165, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 234605987.67213115, "logits/rejected": 65293418.985074624, "logps/chosen": -264.91803278688525, "logps/rejected": -333.6119402985075, "loss": 0.2017, "rewards/chosen": 0.7033811475409836, "rewards/margins": 6.352634878884267, "rewards/rejected": -5.649253731343284, "step": 289 }, { "epoch": 0.20589279375221867, "grad_norm": 0.20294178295355056, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154449076.70588234, "logits/rejected": 111987916.8, "logps/chosen": -196.23529411764707, "logps/rejected": -358.93333333333334, "loss": 0.211, "rewards/chosen": 0.8033088235294118, "rewards/margins": 5.740808823529412, "rewards/rejected": -4.9375, "step": 290 }, { "epoch": 0.2066027689030884, "grad_norm": 0.19993440804375143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152818554.4347826, "logits/rejected": 58933525.69491526, "logps/chosen": -235.1304347826087, "logps/rejected": -331.66101694915255, "loss": 0.2374, "rewards/chosen": 0.7604166666666666, "rewards/margins": 6.332450564971752, "rewards/rejected": -5.572033898305085, "step": 291 }, { "epoch": 0.2073127440539581, "grad_norm": 0.21073885746278367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64891874.74285714, "logits/rejected": 138412032.0, "logps/chosen": -181.71428571428572, "logps/rejected": -356.41379310344826, "loss": 0.1938, "rewards/chosen": 1.0919642857142857, "rewards/margins": 6.255757389162562, "rewards/rejected": -5.163793103448276, "step": 292 }, { "epoch": 0.20802271920482784, "grad_norm": 0.2819543762079154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133798297.6, "logits/rejected": 112135951.05882353, "logps/chosen": -266.0, "logps/rejected": -337.4117647058824, "loss": 0.191, "rewards/chosen": 0.9026041666666667, "rewards/margins": 6.6673100490196076, "rewards/rejected": -5.764705882352941, "step": 293 }, { "epoch": 0.20873269435569755, "grad_norm": 0.19275377680878686, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94555800.70175439, "logits/rejected": 87844085.18309858, "logps/chosen": -217.82456140350877, "logps/rejected": -307.83098591549293, "loss": 0.19, "rewards/chosen": 0.26096491228070173, "rewards/margins": 5.007443785520139, "rewards/rejected": -4.746478873239437, "step": 294 }, { "epoch": 0.20944266950656726, "grad_norm": 0.31537602976441914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139111082.66666666, "logits/rejected": 86353317.64705883, "logps/chosen": -207.33333333333334, "logps/rejected": -292.47058823529414, "loss": 0.1912, "rewards/chosen": 0.43020833333333336, "rewards/margins": 5.702267156862746, "rewards/rejected": -5.272058823529412, "step": 295 }, { "epoch": 0.210152644657437, "grad_norm": 0.16853155489663862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135565897.14285713, "logits/rejected": 110216988.44444445, "logps/chosen": -201.71428571428572, "logps/rejected": -339.1111111111111, "loss": 0.1807, "rewards/chosen": 0.6417410714285714, "rewards/margins": 6.2806299603174605, "rewards/rejected": -5.638888888888889, "step": 296 }, { "epoch": 0.2108626198083067, "grad_norm": 0.17864139134516527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148274314.3783784, "logits/rejected": 95303907.55555555, "logps/chosen": -248.43243243243242, "logps/rejected": -314.6666666666667, "loss": 0.2385, "rewards/chosen": 1.0033783783783783, "rewards/margins": 6.383008008008008, "rewards/rejected": -5.37962962962963, "step": 297 }, { "epoch": 0.21157259495917644, "grad_norm": 0.2082102270989477, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173714090.66666666, "logits/rejected": 38331538.28571428, "logps/chosen": -272.22222222222223, "logps/rejected": -346.57142857142856, "loss": 0.2111, "rewards/chosen": 1.3116319444444444, "rewards/margins": 7.195560515873016, "rewards/rejected": -5.883928571428571, "step": 298 }, { "epoch": 0.21228257011004614, "grad_norm": 0.19293599409273732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141897439.54929578, "logits/rejected": 110376421.05263157, "logps/chosen": -204.8450704225352, "logps/rejected": -355.36842105263156, "loss": 0.2329, "rewards/chosen": 0.7601232394366197, "rewards/margins": 6.154860081541883, "rewards/rejected": -5.394736842105263, "step": 299 }, { "epoch": 0.21299254526091588, "grad_norm": 0.20932004315134553, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161914597.5172414, "logits/rejected": 62764763.428571425, "logps/chosen": -277.6551724137931, "logps/rejected": -364.34285714285716, "loss": 0.1722, "rewards/chosen": 1.1508620689655173, "rewards/margins": 7.329433497536947, "rewards/rejected": -6.178571428571429, "step": 300 }, { "epoch": 0.21370252041178558, "grad_norm": 0.1836751611522209, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68993804.1904762, "logits/rejected": 114859401.84615384, "logps/chosen": -172.6984126984127, "logps/rejected": -353.4769230769231, "loss": 0.1949, "rewards/chosen": 0.9146825396825397, "rewards/margins": 6.241605616605616, "rewards/rejected": -5.326923076923077, "step": 301 }, { "epoch": 0.21441249556265532, "grad_norm": 0.2121491243375906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135615829.33333334, "logits/rejected": 74871708.90322581, "logps/chosen": -240.4848484848485, "logps/rejected": -321.2903225806452, "loss": 0.2101, "rewards/chosen": 0.9550189393939394, "rewards/margins": 6.2413092619745845, "rewards/rejected": -5.286290322580645, "step": 302 }, { "epoch": 0.21512247071352503, "grad_norm": 0.20529215266035905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102101343.08571428, "logits/rejected": 110638327.1724138, "logps/chosen": -239.77142857142857, "logps/rejected": -358.3448275862069, "loss": 0.2353, "rewards/chosen": 0.93125, "rewards/margins": 6.008836206896552, "rewards/rejected": -5.077586206896552, "step": 303 }, { "epoch": 0.21583244586439473, "grad_norm": 0.16706953905701138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86489747.52542374, "logits/rejected": 136588421.5652174, "logps/chosen": -199.32203389830508, "logps/rejected": -349.2173913043478, "loss": 0.1682, "rewards/chosen": 1.1906779661016949, "rewards/margins": 6.8863301400147385, "rewards/rejected": -5.695652173913044, "step": 304 }, { "epoch": 0.21654242101526447, "grad_norm": 0.3108514484859258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99483648.0, "logits/rejected": 111214592.0, "logps/chosen": -220.75, "logps/rejected": -300.25, "loss": 0.1852, "rewards/chosen": 1.0966796875, "rewards/margins": 6.6748046875, "rewards/rejected": -5.578125, "step": 305 }, { "epoch": 0.21725239616613418, "grad_norm": 0.22543220892403298, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97473261.97183098, "logits/rejected": 103753835.78947368, "logps/chosen": -190.8732394366197, "logps/rejected": -330.3859649122807, "loss": 0.2283, "rewards/chosen": 0.7086267605633803, "rewards/margins": 6.761258339510748, "rewards/rejected": -6.052631578947368, "step": 306 }, { "epoch": 0.2179623713170039, "grad_norm": 0.24275899320324412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127273060.72131148, "logits/rejected": 146487632.23880598, "logps/chosen": -207.47540983606558, "logps/rejected": -370.6268656716418, "loss": 0.2004, "rewards/chosen": 0.7725409836065574, "rewards/margins": 6.078511132860289, "rewards/rejected": -5.3059701492537314, "step": 307 }, { "epoch": 0.21867234646787362, "grad_norm": 0.22640216095708715, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166947051.01639345, "logits/rejected": 87203962.26865672, "logps/chosen": -240.52459016393442, "logps/rejected": -352.0, "loss": 0.2032, "rewards/chosen": 0.7587090163934426, "rewards/margins": 6.348261255199413, "rewards/rejected": -5.58955223880597, "step": 308 }, { "epoch": 0.21938232161874335, "grad_norm": 0.21210015251560613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95884974.98734178, "logits/rejected": 89915392.0, "logps/chosen": -200.91139240506328, "logps/rejected": -278.2040816326531, "loss": 0.253, "rewards/chosen": 0.9572784810126582, "rewards/margins": 6.018502970808576, "rewards/rejected": -5.061224489795919, "step": 309 }, { "epoch": 0.22009229676961306, "grad_norm": 0.5976355060220286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92274688.0, "logits/rejected": 134402770.82352942, "logps/chosen": -184.53333333333333, "logps/rejected": -362.8235294117647, "loss": 0.1779, "rewards/chosen": 1.2229166666666667, "rewards/margins": 6.406740196078431, "rewards/rejected": -5.1838235294117645, "step": 310 }, { "epoch": 0.2208022719204828, "grad_norm": 0.19422942252253617, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154553747.3939394, "logits/rejected": 75632772.12903225, "logps/chosen": -226.42424242424244, "logps/rejected": -332.9032258064516, "loss": 0.2066, "rewards/chosen": 1.0449810606060606, "rewards/margins": 8845019.883690737, "rewards/rejected": -8845018.838709677, "step": 311 }, { "epoch": 0.2215122470713525, "grad_norm": 0.21430442506004538, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134863005.53846154, "logits/rejected": 93073603.04761904, "logps/chosen": -244.92307692307693, "logps/rejected": -350.984126984127, "loss": 0.1875, "rewards/chosen": 1.2221153846153847, "rewards/margins": 7.245924908424908, "rewards/rejected": -6.023809523809524, "step": 312 }, { "epoch": 0.2222222222222222, "grad_norm": 0.259266590675451, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91619328.0, "logits/rejected": 106823680.0, "logps/chosen": -176.0, "logps/rejected": -321.0, "loss": 0.1848, "rewards/chosen": 0.7236328125, "rewards/margins": 6.0595703125, "rewards/rejected": -5.3359375, "step": 313 }, { "epoch": 0.22293219737309194, "grad_norm": 0.26448530252239055, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 194175647.47540984, "logits/rejected": 80129986.86567163, "logps/chosen": -206.68852459016392, "logps/rejected": -345.7910447761194, "loss": 0.196, "rewards/chosen": 0.5655737704918032, "rewards/margins": 6.573036457058968, "rewards/rejected": -6.007462686567164, "step": 314 }, { "epoch": 0.22364217252396165, "grad_norm": 0.25982040486651586, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144106382.2222222, "logits/rejected": 107778633.14285715, "logps/chosen": -264.0, "logps/rejected": -348.2857142857143, "loss": 0.2101, "rewards/chosen": 1.1111111111111112, "rewards/margins": 6.678075396825397, "rewards/rejected": -5.566964285714286, "step": 315 }, { "epoch": 0.2243521476748314, "grad_norm": 0.2106639912505279, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155933398.70967743, "logits/rejected": 44834567.75757576, "logps/chosen": -216.51612903225808, "logps/rejected": -306.42424242424244, "loss": 0.2061, "rewards/chosen": 1.060483870967742, "rewards/margins": 6.249877810361681, "rewards/rejected": -5.1893939393939394, "step": 316 }, { "epoch": 0.2250621228257011, "grad_norm": 0.18468155091915206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144572416.0, "logits/rejected": 63799296.0, "logps/chosen": -246.25, "logps/rejected": -307.75, "loss": 0.1989, "rewards/chosen": 1.2412109375, "rewards/margins": 6.7099609375, "rewards/rejected": -5.46875, "step": 317 }, { "epoch": 0.22577209797657083, "grad_norm": 0.1953103634258263, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143787830.08450705, "logits/rejected": 80574787.36842105, "logps/chosen": -210.70422535211267, "logps/rejected": -313.82456140350877, "loss": 0.1992, "rewards/chosen": 0.9850352112676056, "rewards/margins": 5.463105386706203, "rewards/rejected": -4.478070175438597, "step": 318 }, { "epoch": 0.22648207312744054, "grad_norm": 0.16652200895272432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131203072.0, "logits/rejected": 116195328.0, "logps/chosen": -196.0, "logps/rejected": -367.0, "loss": 0.1891, "rewards/chosen": 0.6806640625, "rewards/margins": 6.5087890625, "rewards/rejected": -5.828125, "step": 319 }, { "epoch": 0.22719204827831027, "grad_norm": 0.26701002590569517, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175033940.0597015, "logits/rejected": 104995118.16393442, "logps/chosen": -246.6865671641791, "logps/rejected": -388.1967213114754, "loss": 0.1954, "rewards/chosen": 1.248134328358209, "rewards/margins": 7.125183508686078, "rewards/rejected": -5.877049180327869, "step": 320 }, { "epoch": 0.22790202342917998, "grad_norm": 0.20221540610952235, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80157809.77777778, "logits/rejected": 141961058.46153846, "logps/chosen": -253.71428571428572, "logps/rejected": -354.2153846153846, "loss": 0.1772, "rewards/chosen": 1.0357142857142858, "rewards/margins": 6.474175824175824, "rewards/rejected": -5.438461538461539, "step": 321 }, { "epoch": 0.22861199858004969, "grad_norm": 0.19424775561623236, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136788430.4516129, "logits/rejected": 122778717.0909091, "logps/chosen": -224.0, "logps/rejected": -313.2121212121212, "loss": 0.2145, "rewards/chosen": 0.844758064516129, "rewards/margins": 6.06066715542522, "rewards/rejected": -5.215909090909091, "step": 322 }, { "epoch": 0.22932197373091942, "grad_norm": 0.18497929074513747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96833714.08695652, "logits/rejected": 122701164.47457626, "logps/chosen": -221.2173913043478, "logps/rejected": -387.2542372881356, "loss": 0.2094, "rewards/chosen": 0.9809782608695652, "rewards/margins": 3.3072494473102427, "rewards/rejected": -2.3262711864406778, "step": 323 }, { "epoch": 0.23003194888178913, "grad_norm": 0.20963183257820453, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154426647.27272728, "logits/rejected": 94016677.16129032, "logps/chosen": -224.96969696969697, "logps/rejected": -320.258064516129, "loss": 0.2018, "rewards/chosen": 1.2888257575757576, "rewards/margins": 6.300922531769306, "rewards/rejected": -5.012096774193548, "step": 324 }, { "epoch": 0.23074192403265886, "grad_norm": 0.3719072849642422, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 201168316.3773585, "logits/rejected": 66102231.04, "logps/chosen": -284.07547169811323, "logps/rejected": -326.82666666666665, "loss": 0.1749, "rewards/chosen": 1.1863207547169812, "rewards/margins": 6.266320754716981, "rewards/rejected": -5.08, "step": 325 }, { "epoch": 0.23145189918352857, "grad_norm": 0.18279597694065836, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161516248.94915253, "logits/rejected": 119993566.60869566, "logps/chosen": -292.33898305084745, "logps/rejected": -378.8985507246377, "loss": 0.1841, "rewards/chosen": 1.271186440677966, "rewards/margins": 7.256693687054778, "rewards/rejected": -5.9855072463768115, "step": 326 }, { "epoch": 0.2321618743343983, "grad_norm": 0.22138590253855162, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119634455.63076924, "logits/rejected": 93706077.46031746, "logps/chosen": -196.6769230769231, "logps/rejected": -346.92063492063494, "loss": 0.2084, "rewards/chosen": 1.0067307692307692, "rewards/margins": 5.38768315018315, "rewards/rejected": -4.380952380952381, "step": 327 }, { "epoch": 0.232871849485268, "grad_norm": 0.1808500843543461, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143826497.16363636, "logits/rejected": 80417160.76712328, "logps/chosen": -205.0909090909091, "logps/rejected": -333.36986301369865, "loss": 0.1469, "rewards/chosen": 1.5397727272727273, "rewards/margins": 7.163060398505604, "rewards/rejected": -5.623287671232877, "step": 328 }, { "epoch": 0.23358182463613775, "grad_norm": 0.23257123137322303, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112525312.0, "logits/rejected": 133038080.0, "logps/chosen": -210.75, "logps/rejected": -295.75, "loss": 0.1985, "rewards/chosen": 1.1455078125, "rewards/margins": 6.1611328125, "rewards/rejected": -5.015625, "step": 329 }, { "epoch": 0.23429179978700745, "grad_norm": 0.2599284017904435, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143431444.98360655, "logits/rejected": 106547841.91044776, "logps/chosen": -254.1639344262295, "logps/rejected": -332.8955223880597, "loss": 0.1828, "rewards/chosen": 1.0850409836065573, "rewards/margins": 6.913399192561781, "rewards/rejected": -5.8283582089552235, "step": 330 }, { "epoch": 0.23500177493787716, "grad_norm": 0.17567118026495385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59397797.41538461, "logits/rejected": 97467635.8095238, "logps/chosen": -186.2153846153846, "logps/rejected": -323.8095238095238, "loss": 0.1741, "rewards/chosen": 1.5009615384615385, "rewards/margins": 5.969215506715507, "rewards/rejected": -4.468253968253968, "step": 331 }, { "epoch": 0.2357117500887469, "grad_norm": 0.24581287929608153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114707859.39393939, "logits/rejected": 61206395.87096774, "logps/chosen": -197.8181818181818, "logps/rejected": -282.06451612903226, "loss": 0.214, "rewards/chosen": 1.0388257575757576, "rewards/margins": 5.692051564027371, "rewards/rejected": -4.653225806451613, "step": 332 }, { "epoch": 0.2364217252396166, "grad_norm": 0.16193541389806843, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109326940.32786885, "logits/rejected": 118316933.73134328, "logps/chosen": -241.70491803278688, "logps/rejected": -324.53731343283584, "loss": 0.1873, "rewards/chosen": 0.5906762295081968, "rewards/margins": 6.359332945926107, "rewards/rejected": -5.768656716417911, "step": 333 }, { "epoch": 0.23713170039048634, "grad_norm": 0.25757314711738094, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117773393.26984127, "logits/rejected": 107696821.16923077, "logps/chosen": -247.74603174603175, "logps/rejected": -347.32307692307694, "loss": 0.1724, "rewards/chosen": 1.4464285714285714, "rewards/margins": 7.261813186813186, "rewards/rejected": -5.815384615384615, "step": 334 }, { "epoch": 0.23784167554135605, "grad_norm": 0.33817731367193216, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 182958433.10344827, "logits/rejected": 86642336.91428572, "logps/chosen": -232.41379310344828, "logps/rejected": -364.8, "loss": 0.1927, "rewards/chosen": 0.9617456896551724, "rewards/margins": 6.068888546798029, "rewards/rejected": -5.107142857142857, "step": 335 }, { "epoch": 0.23855165069222578, "grad_norm": 0.1876438136727318, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124076276.53731343, "logits/rejected": 110839640.13114753, "logps/chosen": -277.25373134328356, "logps/rejected": -328.91803278688525, "loss": 0.2117, "rewards/chosen": 1.242537313432836, "rewards/margins": 7.062209444580376, "rewards/rejected": -5.819672131147541, "step": 336 }, { "epoch": 0.2392616258430955, "grad_norm": 0.17053858215006842, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159018829.91304347, "logits/rejected": 42207405.55932204, "logps/chosen": -247.8840579710145, "logps/rejected": -308.06779661016947, "loss": 0.2179, "rewards/chosen": 1.0733695652173914, "rewards/margins": 6.192013633014001, "rewards/rejected": -5.11864406779661, "step": 337 }, { "epoch": 0.23997160099396522, "grad_norm": 0.21413222707358137, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59588043.03448276, "logits/rejected": 113485882.51428571, "logps/chosen": -232.82758620689654, "logps/rejected": -389.48571428571427, "loss": 0.1772, "rewards/chosen": 0.7984913793103449, "rewards/margins": 6.998491379310345, "rewards/rejected": -6.2, "step": 338 }, { "epoch": 0.24068157614483493, "grad_norm": 0.24292426643269505, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145249322.0821918, "logits/rejected": 91664607.41818182, "logps/chosen": -359.8904109589041, "logps/rejected": -349.3818181818182, "loss": 0.2093, "rewards/chosen": 1.5102739726027397, "rewards/margins": 5.55118306351183, "rewards/rejected": -4.040909090909091, "step": 339 }, { "epoch": 0.24139155129570464, "grad_norm": 0.22562335552547436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 52756480.0, "logits/rejected": 144703488.0, "logps/chosen": -168.0, "logps/rejected": -321.5, "loss": 0.1825, "rewards/chosen": 1.099609375, "rewards/margins": 6.951171875, "rewards/rejected": -5.8515625, "step": 340 }, { "epoch": 0.24210152644657437, "grad_norm": 0.2732426312594956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103098698.32258065, "logits/rejected": 116741461.33333333, "logps/chosen": -212.25806451612902, "logps/rejected": -295.75757575757575, "loss": 0.1996, "rewards/chosen": 0.9871471774193549, "rewards/margins": 6.365935056207234, "rewards/rejected": -5.378787878787879, "step": 341 }, { "epoch": 0.24281150159744408, "grad_norm": 0.21437904424260862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123731968.0, "logits/rejected": 131203072.0, "logps/chosen": -241.64705882352942, "logps/rejected": -373.6, "loss": 0.2033, "rewards/chosen": 1.2996323529411764, "rewards/margins": 7.324632352941177, "rewards/rejected": -6.025, "step": 342 }, { "epoch": 0.24352147674831381, "grad_norm": 0.2709942915229927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121705905.89830509, "logits/rejected": 96894501.10144928, "logps/chosen": -181.6949152542373, "logps/rejected": -315.82608695652175, "loss": 0.1898, "rewards/chosen": 1.0434322033898304, "rewards/margins": 6.90575104396954, "rewards/rejected": -5.86231884057971, "step": 343 }, { "epoch": 0.24423145189918352, "grad_norm": 0.18969536014051078, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136721790.08955225, "logits/rejected": 124935251.93442623, "logps/chosen": -198.56716417910448, "logps/rejected": -347.8032786885246, "loss": 0.18, "rewards/chosen": 1.2136194029850746, "rewards/margins": 6.254603009542452, "rewards/rejected": -5.040983606557377, "step": 344 }, { "epoch": 0.24494142705005326, "grad_norm": 0.1915016447004691, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138863726.2769231, "logits/rejected": 85816791.36507936, "logps/chosen": -214.15384615384616, "logps/rejected": -293.3333333333333, "loss": 0.2083, "rewards/chosen": 1.1519230769230768, "rewards/margins": 6.7709706959706955, "rewards/rejected": -5.619047619047619, "step": 345 }, { "epoch": 0.24565140220092296, "grad_norm": 0.21243754249723767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 188044629.33333334, "logits/rejected": 70816329.14285715, "logps/chosen": -270.44444444444446, "logps/rejected": -314.0, "loss": 0.2169, "rewards/chosen": 1.0538194444444444, "rewards/margins": 6.3261408730158735, "rewards/rejected": -5.272321428571429, "step": 346 }, { "epoch": 0.2463613773517927, "grad_norm": 0.18649798881607346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143885954.16949153, "logits/rejected": 90071158.72463769, "logps/chosen": -236.47457627118644, "logps/rejected": -383.07246376811594, "loss": 0.1907, "rewards/chosen": 0.8990333686440678, "rewards/margins": 7.094685542557111, "rewards/rejected": -6.195652173913044, "step": 347 }, { "epoch": 0.2470713525026624, "grad_norm": 0.22060119950497453, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98248393.6969697, "logits/rejected": 124205518.4516129, "logps/chosen": -233.8181818181818, "logps/rejected": -383.48387096774195, "loss": 0.194, "rewards/chosen": 0.8674242424242424, "rewards/margins": 7.133553274682307, "rewards/rejected": -6.266129032258065, "step": 348 }, { "epoch": 0.2477813276535321, "grad_norm": 0.22019761122956924, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 178289220.7761194, "logits/rejected": 78866667.01639344, "logps/chosen": -260.05970149253733, "logps/rejected": -305.3114754098361, "loss": 0.2103, "rewards/chosen": 1.1455223880597014, "rewards/margins": 6.629128945436751, "rewards/rejected": -5.483606557377049, "step": 349 }, { "epoch": 0.24849130280440185, "grad_norm": 0.21489537041419657, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114206171.94366197, "logits/rejected": 107359465.54385965, "logps/chosen": -192.90140845070422, "logps/rejected": -302.3157894736842, "loss": 0.2164, "rewards/chosen": 1.1355633802816902, "rewards/margins": 6.920651099579936, "rewards/rejected": -5.785087719298246, "step": 350 }, { "epoch": 0.24920127795527156, "grad_norm": 0.18399615058101043, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59657281.36170213, "logits/rejected": 121065219.16049382, "logps/chosen": -174.46808510638297, "logps/rejected": -341.3333333333333, "loss": 0.1548, "rewards/chosen": 0.4720744680851064, "rewards/margins": 6.058494221171527, "rewards/rejected": -5.58641975308642, "step": 351 }, { "epoch": 0.2499112531061413, "grad_norm": 0.20235820648148808, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127227221.33333333, "logits/rejected": 118797492.70588236, "logps/chosen": -255.2, "logps/rejected": -347.29411764705884, "loss": 0.2102, "rewards/chosen": 0.5604166666666667, "rewards/margins": 5.0236519607843135, "rewards/rejected": -4.463235294117647, "step": 352 }, { "epoch": 0.250621228257011, "grad_norm": 0.1794571436674447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163848456.2580645, "logits/rejected": 134599028.36363637, "logps/chosen": -285.93548387096774, "logps/rejected": -381.57575757575756, "loss": 0.182, "rewards/chosen": 1.747983870967742, "rewards/margins": 7.551014173998045, "rewards/rejected": -5.803030303030303, "step": 353 }, { "epoch": 0.25133120340788073, "grad_norm": 0.18066259384684652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158381927.16417912, "logits/rejected": 101282127.73770492, "logps/chosen": -270.8059701492537, "logps/rejected": -337.8360655737705, "loss": 0.1933, "rewards/chosen": 0.9813432835820896, "rewards/margins": 6.956753119647663, "rewards/rejected": -5.975409836065574, "step": 354 }, { "epoch": 0.25204117855875047, "grad_norm": 0.1904725753915106, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137399613.79310346, "logits/rejected": 88200221.25714286, "logps/chosen": -262.3448275862069, "logps/rejected": -325.0285714285714, "loss": 0.1709, "rewards/chosen": 1.2974137931034482, "rewards/margins": 7.211699507389163, "rewards/rejected": -5.914285714285715, "step": 355 }, { "epoch": 0.25275115370962015, "grad_norm": 0.25593814083373073, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130356305.26984127, "logits/rejected": 59236478.03076923, "logps/chosen": -248.12698412698413, "logps/rejected": -315.0769230769231, "loss": 0.1871, "rewards/chosen": 1.415922619047619, "rewards/margins": 7.34669184981685, "rewards/rejected": -5.930769230769231, "step": 356 }, { "epoch": 0.2534611288604899, "grad_norm": 0.3591256284373522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143012236.38709676, "logits/rejected": 72733044.36363636, "logps/chosen": -198.19354838709677, "logps/rejected": -332.6060606060606, "loss": 0.1603, "rewards/chosen": 1.3709677419354838, "rewards/margins": 5.689149560117302, "rewards/rejected": -4.318181818181818, "step": 357 }, { "epoch": 0.2541711040113596, "grad_norm": 0.2153292934928838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139810133.33333334, "logits/rejected": 101745697.03225806, "logps/chosen": -234.1818181818182, "logps/rejected": -279.48387096774195, "loss": 0.1833, "rewards/chosen": 1.2040719696969697, "rewards/margins": 6.740362292277615, "rewards/rejected": -5.536290322580645, "step": 358 }, { "epoch": 0.2548810791622293, "grad_norm": 0.147531124315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128323030.48648648, "logits/rejected": 46991739.25925926, "logps/chosen": -254.48648648648648, "logps/rejected": -290.3703703703704, "loss": 0.1855, "rewards/chosen": 1.6182432432432432, "rewards/margins": 5.720095095095095, "rewards/rejected": -4.101851851851852, "step": 359 }, { "epoch": 0.25559105431309903, "grad_norm": 0.15583632878625125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119385696.46376811, "logits/rejected": 63767638.779661015, "logps/chosen": -161.15942028985506, "logps/rejected": -306.4406779661017, "loss": 0.1844, "rewards/chosen": 0.8043478260869565, "rewards/margins": 5.524686809137805, "rewards/rejected": -4.720338983050848, "step": 360 }, { "epoch": 0.25630102946396877, "grad_norm": 0.17695939913710113, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115443224.38095239, "logits/rejected": 128281174.64615385, "logps/chosen": -231.87301587301587, "logps/rejected": -318.03076923076924, "loss": 0.1684, "rewards/chosen": 1.2668650793650793, "rewards/margins": 6.866865079365079, "rewards/rejected": -5.6, "step": 361 }, { "epoch": 0.2570110046148385, "grad_norm": 0.22963239271727076, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153052527.0943396, "logits/rejected": 133770335.57333334, "logps/chosen": -188.30188679245282, "logps/rejected": -359.25333333333333, "loss": 0.1822, "rewards/chosen": 0.8903301886792453, "rewards/margins": 5.703663522012579, "rewards/rejected": -4.8133333333333335, "step": 362 }, { "epoch": 0.2577209797657082, "grad_norm": 0.2784396950836412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158964121.6, "logits/rejected": 82344056.47058824, "logps/chosen": -220.53333333333333, "logps/rejected": -337.4117647058824, "loss": 0.188, "rewards/chosen": 1.14375, "rewards/margins": 6.901102941176471, "rewards/rejected": -5.757352941176471, "step": 363 }, { "epoch": 0.2584309549165779, "grad_norm": 0.20778258815626385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109750954.66666667, "logits/rejected": 122560030.11764705, "logps/chosen": -186.93333333333334, "logps/rejected": -354.8235294117647, "loss": 0.1767, "rewards/chosen": 1.0427083333333333, "rewards/margins": 7.00594362745098, "rewards/rejected": -5.963235294117647, "step": 364 }, { "epoch": 0.25914093006744765, "grad_norm": 0.17126803321702885, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144800279.63076922, "logits/rejected": 144869928.63492063, "logps/chosen": -243.93846153846152, "logps/rejected": -321.015873015873, "loss": 0.1975, "rewards/chosen": 0.7903846153846154, "rewards/margins": 7.3935592185592185, "rewards/rejected": -6.603174603174603, "step": 365 }, { "epoch": 0.2598509052183174, "grad_norm": 0.15947613692519996, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138539132.12121212, "logits/rejected": 90143710.96774194, "logps/chosen": -265.93939393939394, "logps/rejected": -317.16129032258067, "loss": 0.1985, "rewards/chosen": 1.1174242424242424, "rewards/margins": 7.810972629521016, "rewards/rejected": -6.693548387096774, "step": 366 }, { "epoch": 0.26056088036918706, "grad_norm": 0.2157111736397337, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142056263.3442623, "logits/rejected": 82258439.64179105, "logps/chosen": -226.0983606557377, "logps/rejected": -320.4776119402985, "loss": 0.183, "rewards/chosen": 1.3022540983606556, "rewards/margins": 7.473895889405432, "rewards/rejected": -6.1716417910447765, "step": 367 }, { "epoch": 0.2612708555200568, "grad_norm": 0.23961584911924985, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155730448.51612905, "logits/rejected": 96405441.93939394, "logps/chosen": -188.38709677419354, "logps/rejected": -324.8484848484849, "loss": 0.2065, "rewards/chosen": 0.8870967741935484, "rewards/margins": 6.500733137829911, "rewards/rejected": -5.613636363636363, "step": 368 }, { "epoch": 0.26198083067092653, "grad_norm": 0.13614088517485068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117440512.0, "logits/rejected": 74942343.52941176, "logps/chosen": -196.8, "logps/rejected": -328.0, "loss": 0.1703, "rewards/chosen": 1.1, "rewards/margins": 7.357352941176471, "rewards/rejected": -6.257352941176471, "step": 369 }, { "epoch": 0.2626908058217962, "grad_norm": 0.18336810798745598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80290962.28571428, "logits/rejected": 132337522.7586207, "logps/chosen": -238.85714285714286, "logps/rejected": -324.6896551724138, "loss": 0.2071, "rewards/chosen": 1.1705357142857142, "rewards/margins": 6.851570197044335, "rewards/rejected": -5.681034482758621, "step": 370 }, { "epoch": 0.26340078097266595, "grad_norm": 0.20395473078399937, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109576192.0, "logits/rejected": 94741925.64705883, "logps/chosen": -181.33333333333334, "logps/rejected": -310.11764705882354, "loss": 0.1683, "rewards/chosen": 1.49375, "rewards/margins": 7.111397058823529, "rewards/rejected": -5.617647058823529, "step": 371 }, { "epoch": 0.2641107561235357, "grad_norm": 0.19107983292526537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151563663.18644068, "logits/rejected": 58537894.95652174, "logps/chosen": -207.1864406779661, "logps/rejected": -255.07246376811594, "loss": 0.213, "rewards/chosen": 0.3289194915254237, "rewards/margins": 6.575296303119627, "rewards/rejected": -6.246376811594203, "step": 372 }, { "epoch": 0.2648207312744054, "grad_norm": 0.15415004593708634, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 177864704.0, "logits/rejected": 26468352.0, "logps/chosen": -219.5, "logps/rejected": -317.0, "loss": 0.2037, "rewards/chosen": 0.984375, "rewards/margins": 6.921875, "rewards/rejected": -5.9375, "step": 373 }, { "epoch": 0.2655307064252751, "grad_norm": 0.19043450982206342, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142344192.0, "logits/rejected": 84934656.0, "logps/chosen": -243.25, "logps/rejected": -316.0, "loss": 0.1674, "rewards/chosen": 1.4345703125, "rewards/margins": 8.2080078125, "rewards/rejected": -6.7734375, "step": 374 }, { "epoch": 0.26624068157614483, "grad_norm": 0.17411741190212937, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144542168.6153846, "logits/rejected": 86976619.78947368, "logps/chosen": -253.23076923076923, "logps/rejected": -344.0, "loss": 0.1679, "rewards/chosen": 1.4284855769230769, "rewards/margins": 7.520590840080971, "rewards/rejected": -6.092105263157895, "step": 375 }, { "epoch": 0.26695065672701457, "grad_norm": 0.22538347234330117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82217890.9090909, "logits/rejected": 94980690.58064516, "logps/chosen": -157.8181818181818, "logps/rejected": -309.4193548387097, "loss": 0.2115, "rewards/chosen": 0.9635416666666666, "rewards/margins": 6.931283602150538, "rewards/rejected": -5.967741935483871, "step": 376 }, { "epoch": 0.26766063187788425, "grad_norm": 0.15959726433799978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89744790.34920634, "logits/rejected": 106019099.56923077, "logps/chosen": -225.26984126984127, "logps/rejected": -325.4153846153846, "loss": 0.1887, "rewards/chosen": 0.8839285714285714, "rewards/margins": 8.014697802197803, "rewards/rejected": -7.130769230769231, "step": 377 }, { "epoch": 0.268370607028754, "grad_norm": 0.1667100317739596, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107057890.62295082, "logits/rejected": 99912077.37313433, "logps/chosen": -217.9672131147541, "logps/rejected": -338.6268656716418, "loss": 0.1908, "rewards/chosen": 1.0420081967213115, "rewards/margins": 7.392754465378028, "rewards/rejected": -6.350746268656716, "step": 378 }, { "epoch": 0.2690805821796237, "grad_norm": 0.14685094355860243, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111448649.14285715, "logits/rejected": 108406626.46153846, "logps/chosen": -188.6984126984127, "logps/rejected": -407.1384615384615, "loss": 0.1792, "rewards/chosen": 0.9672619047619048, "rewards/margins": 6.259569597069597, "rewards/rejected": -5.292307692307692, "step": 379 }, { "epoch": 0.26979055733049345, "grad_norm": 0.1709336676565556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131203072.0, "logits/rejected": 110067712.0, "logps/chosen": -218.5, "logps/rejected": -318.0, "loss": 0.1912, "rewards/chosen": 1.2529296875, "rewards/margins": 7.5029296875, "rewards/rejected": -6.25, "step": 380 }, { "epoch": 0.27050053248136313, "grad_norm": 0.17917388799986542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133728392.53333333, "logits/rejected": 131380404.70588236, "logps/chosen": -243.46666666666667, "logps/rejected": -352.0, "loss": 0.1732, "rewards/chosen": 1.6489583333333333, "rewards/margins": 6.693075980392157, "rewards/rejected": -5.044117647058823, "step": 381 }, { "epoch": 0.27121050763223287, "grad_norm": 0.17131891196810195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124702292.05970149, "logits/rejected": 79588637.37704918, "logps/chosen": -202.26865671641792, "logps/rejected": -375.60655737704917, "loss": 0.2051, "rewards/chosen": 1.023787313432836, "rewards/margins": 7.064770919990213, "rewards/rejected": -6.040983606557377, "step": 382 }, { "epoch": 0.2719204827831026, "grad_norm": 0.2025185043954365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139876709.58730158, "logits/rejected": 69028564.67692308, "logps/chosen": -278.0952380952381, "logps/rejected": -316.8, "loss": 0.1743, "rewards/chosen": 1.1676587301587302, "rewards/margins": 7.375351037851038, "rewards/rejected": -6.207692307692308, "step": 383 }, { "epoch": 0.27263045793397234, "grad_norm": 0.20995154047199213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179376401.06666666, "logits/rejected": 139275565.17647058, "logps/chosen": -240.0, "logps/rejected": -353.88235294117646, "loss": 0.2092, "rewards/chosen": 0.79375, "rewards/margins": 6.867279411764706, "rewards/rejected": -6.073529411764706, "step": 384 }, { "epoch": 0.273340433084842, "grad_norm": 0.16959996779489242, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128503748.63768116, "logits/rejected": 113708292.33898304, "logps/chosen": -239.65217391304347, "logps/rejected": -292.8813559322034, "loss": 0.2023, "rewards/chosen": 1.2971014492753623, "rewards/margins": 6.7250675509702775, "rewards/rejected": -5.427966101694915, "step": 385 }, { "epoch": 0.27405040823571175, "grad_norm": 0.16664183263575932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103147614.52307692, "logits/rejected": 131621254.09523809, "logps/chosen": -201.35384615384615, "logps/rejected": -349.968253968254, "loss": 0.1818, "rewards/chosen": 1.2875, "rewards/margins": 7.835119047619047, "rewards/rejected": -6.5476190476190474, "step": 386 }, { "epoch": 0.2747603833865815, "grad_norm": 0.18503112207018613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109150208.0, "logits/rejected": 122814464.0, "logps/chosen": -214.25, "logps/rejected": -364.0, "loss": 0.1962, "rewards/chosen": 0.55078125, "rewards/margins": 7.08203125, "rewards/rejected": -6.53125, "step": 387 }, { "epoch": 0.27547035853745117, "grad_norm": 0.18222000449267545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62020691.93442623, "logits/rejected": 139601461.49253732, "logps/chosen": -173.24590163934425, "logps/rejected": -341.97014925373134, "loss": 0.1707, "rewards/chosen": 1.3032786885245902, "rewards/margins": 8.027159285539517, "rewards/rejected": -6.723880597014926, "step": 388 }, { "epoch": 0.2761803336883209, "grad_norm": 0.18514436890583627, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78880613.43396227, "logits/rejected": 128177930.24, "logps/chosen": -179.9245283018868, "logps/rejected": -326.82666666666665, "loss": 0.1628, "rewards/chosen": 1.054245283018868, "rewards/margins": 7.4409119496855345, "rewards/rejected": -6.386666666666667, "step": 389 }, { "epoch": 0.27689030883919064, "grad_norm": 0.21642700432930048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107121192.63492064, "logits/rejected": 97695019.32307692, "logps/chosen": -209.26984126984127, "logps/rejected": -326.6461538461538, "loss": 0.1713, "rewards/chosen": 1.1656746031746033, "rewards/margins": 7.319520757020758, "rewards/rejected": -6.153846153846154, "step": 390 }, { "epoch": 0.27760028399006037, "grad_norm": 0.1714463123034678, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153155646.06060606, "logits/rejected": 50669898.32258064, "logps/chosen": -229.0909090909091, "logps/rejected": -331.35483870967744, "loss": 0.184, "rewards/chosen": 1.1136363636363635, "rewards/margins": 7.516862170087976, "rewards/rejected": -6.403225806451613, "step": 391 }, { "epoch": 0.27831025914093005, "grad_norm": 0.19076164164103007, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136547896.8888889, "logits/rejected": 109950683.42857143, "logps/chosen": -222.0, "logps/rejected": -355.42857142857144, "loss": 0.2043, "rewards/chosen": 0.8151041666666666, "rewards/margins": 4.127604166666667, "rewards/rejected": -3.3125, "step": 392 }, { "epoch": 0.2790202342917998, "grad_norm": 0.18473043558588886, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91925162.66666667, "logits/rejected": 79467081.14285715, "logps/chosen": -207.77777777777777, "logps/rejected": -326.85714285714283, "loss": 0.1877, "rewards/chosen": 1.2456597222222223, "rewards/margins": -869542.2543402778, "rewards/rejected": 869543.5, "step": 393 }, { "epoch": 0.2797302094426695, "grad_norm": 0.15738293906525994, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104698724.84848484, "logits/rejected": 79827076.12903225, "logps/chosen": -232.72727272727272, "logps/rejected": -352.0, "loss": 0.1929, "rewards/chosen": 1.331439393939394, "rewards/margins": 7.299181329423265, "rewards/rejected": -5.967741935483871, "step": 394 }, { "epoch": 0.2804401845935392, "grad_norm": 0.16442113222463195, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121002797.5890411, "logits/rejected": 106001501.0909091, "logps/chosen": -232.986301369863, "logps/rejected": -315.92727272727274, "loss": 0.2185, "rewards/chosen": 0.9704623287671232, "rewards/margins": 6.20682596513076, "rewards/rejected": -5.236363636363636, "step": 395 }, { "epoch": 0.28115015974440893, "grad_norm": 0.24381356591300807, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168056519.59322035, "logits/rejected": 23216840.347826086, "logps/chosen": -231.59322033898306, "logps/rejected": -300.28985507246375, "loss": 0.1814, "rewards/chosen": 1.1117584745762712, "rewards/margins": 7.111758474576272, "rewards/rejected": -6.0, "step": 396 }, { "epoch": 0.28186013489527867, "grad_norm": 0.18207905339209732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137518163.93442622, "logits/rejected": 67953984.95522387, "logps/chosen": -190.95081967213116, "logps/rejected": -362.9850746268657, "loss": 0.1707, "rewards/chosen": 1.2817622950819672, "rewards/margins": 7.259374235380474, "rewards/rejected": -5.977611940298507, "step": 397 }, { "epoch": 0.2825701100461484, "grad_norm": 0.17823594413819252, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141541115.93650794, "logits/rejected": 75884638.52307692, "logps/chosen": -261.7142857142857, "logps/rejected": -322.46153846153845, "loss": 0.1626, "rewards/chosen": 1.4861111111111112, "rewards/margins": 7.470726495726495, "rewards/rejected": -5.984615384615385, "step": 398 }, { "epoch": 0.2832800851970181, "grad_norm": 0.17074336135219398, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100990976.0, "logits/rejected": 101449728.0, "logps/chosen": -223.75, "logps/rejected": -347.0, "loss": 0.2001, "rewards/chosen": 1.22607421875, "rewards/margins": 8.24951171875, "rewards/rejected": -7.0234375, "step": 399 }, { "epoch": 0.2839900603478878, "grad_norm": 0.17137106579103484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127071876.74074075, "logits/rejected": 81335489.72972973, "logps/chosen": -249.92592592592592, "logps/rejected": -300.5405405405405, "loss": 0.1753, "rewards/chosen": 0.9305555555555556, "rewards/margins": 7.2008258258258255, "rewards/rejected": -6.27027027027027, "step": 400 }, { "epoch": 0.28470003549875755, "grad_norm": 0.20726041058573155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140035633.5483871, "logits/rejected": 82011353.21212122, "logps/chosen": -205.67741935483872, "logps/rejected": -267.27272727272725, "loss": 0.192, "rewards/chosen": 0.717741935483871, "rewards/margins": 6.687438905180841, "rewards/rejected": -5.96969696969697, "step": 401 }, { "epoch": 0.2854100106496273, "grad_norm": 0.20859570187821475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121555678.18867925, "logits/rejected": 101054764.37333333, "logps/chosen": -234.8679245283019, "logps/rejected": -306.9866666666667, "loss": 0.1676, "rewards/chosen": 0.9457547169811321, "rewards/margins": 5.132421383647799, "rewards/rejected": -4.1866666666666665, "step": 402 }, { "epoch": 0.28611998580049697, "grad_norm": 0.2676970480838386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150478721.96923077, "logits/rejected": 82687707.42857143, "logps/chosen": -257.7230769230769, "logps/rejected": -366.4761904761905, "loss": 0.2207, "rewards/chosen": 0.9870192307692308, "rewards/margins": 6.947336691086691, "rewards/rejected": -5.9603174603174605, "step": 403 }, { "epoch": 0.2868299609513667, "grad_norm": 0.17239245630912703, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169132474.8108108, "logits/rejected": 109828626.96296297, "logps/chosen": -300.5405405405405, "logps/rejected": -381.6296296296296, "loss": 0.1937, "rewards/chosen": 1.5194256756756757, "rewards/margins": 7.037944194194194, "rewards/rejected": -5.518518518518518, "step": 404 }, { "epoch": 0.28753993610223644, "grad_norm": 0.18383983701303944, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137333496.68571427, "logits/rejected": 42123828.96551724, "logps/chosen": -195.65714285714284, "logps/rejected": -285.51724137931035, "loss": 0.2058, "rewards/chosen": 0.9651785714285714, "rewards/margins": 5.594488916256157, "rewards/rejected": -4.629310344827586, "step": 405 }, { "epoch": 0.2882499112531061, "grad_norm": 0.20071278868986417, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81464017.12676056, "logits/rejected": 179839982.0350877, "logps/chosen": -229.18309859154928, "logps/rejected": -406.4561403508772, "loss": 0.216, "rewards/chosen": 1.0105633802816902, "rewards/margins": 7.51056338028169, "rewards/rejected": -6.5, "step": 406 }, { "epoch": 0.28895988640397585, "grad_norm": 0.17035951700181573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169297361.45454547, "logits/rejected": 34095632.51612903, "logps/chosen": -272.24242424242425, "logps/rejected": -320.51612903225805, "loss": 0.1943, "rewards/chosen": 1.0227272727272727, "rewards/margins": 5.7807917888563045, "rewards/rejected": -4.758064516129032, "step": 407 }, { "epoch": 0.2896698615548456, "grad_norm": 0.21231150353181827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150820181.33333334, "logits/rejected": 125229933.71428572, "logps/chosen": -210.77777777777777, "logps/rejected": -368.0, "loss": 0.1906, "rewards/chosen": 1.0677083333333333, "rewards/margins": 7.8757440476190474, "rewards/rejected": -6.808035714285714, "step": 408 }, { "epoch": 0.2903798367057153, "grad_norm": 0.2327250853944305, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133060678.62068966, "logits/rejected": 105816298.05714285, "logps/chosen": -219.0344827586207, "logps/rejected": -360.6857142857143, "loss": 0.1838, "rewards/chosen": 0.8836206896551724, "rewards/margins": 7.605049261083744, "rewards/rejected": -6.7214285714285715, "step": 409 }, { "epoch": 0.291089811856585, "grad_norm": 0.1762595250647935, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113865039.73770492, "logits/rejected": 93151109.73134328, "logps/chosen": -218.88524590163934, "logps/rejected": -373.4925373134328, "loss": 0.2068, "rewards/chosen": 1.0320184426229508, "rewards/margins": 6.293212472473697, "rewards/rejected": -5.2611940298507465, "step": 410 }, { "epoch": 0.29179978700745474, "grad_norm": 0.1904161921061122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120979456.0, "logits/rejected": 100007936.0, "logps/chosen": -206.75, "logps/rejected": -384.0, "loss": 0.1877, "rewards/chosen": 0.876953125, "rewards/margins": 7.611328125, "rewards/rejected": -6.734375, "step": 411 }, { "epoch": 0.29250976215832447, "grad_norm": 0.17880433912591573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140767736.98630136, "logits/rejected": 55169396.36363637, "logps/chosen": -198.57534246575344, "logps/rejected": -324.07272727272726, "loss": 0.2263, "rewards/chosen": 0.7461472602739726, "rewards/margins": 6.637056351183063, "rewards/rejected": -5.890909090909091, "step": 412 }, { "epoch": 0.29321973730919415, "grad_norm": 0.18644648898391536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76968608.47761194, "logits/rejected": 141093636.19672132, "logps/chosen": -205.37313432835822, "logps/rejected": -386.62295081967216, "loss": 0.2099, "rewards/chosen": 0.9682835820895522, "rewards/margins": 7.214185221433814, "rewards/rejected": -6.245901639344262, "step": 413 }, { "epoch": 0.2939297124600639, "grad_norm": 0.2852674862249383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128547650.37037037, "logits/rejected": 69815323.67567568, "logps/chosen": -215.55555555555554, "logps/rejected": -303.13513513513516, "loss": 0.1723, "rewards/chosen": 1.2395833333333333, "rewards/margins": 5.4287725225225225, "rewards/rejected": -4.1891891891891895, "step": 414 }, { "epoch": 0.2946396876109336, "grad_norm": 0.14606682331924145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98363193.80645162, "logits/rejected": 102823998.06060606, "logps/chosen": -212.0, "logps/rejected": -359.27272727272725, "loss": 0.1619, "rewards/chosen": 1.4294354838709677, "rewards/margins": 8.164283968719452, "rewards/rejected": -6.734848484848484, "step": 415 }, { "epoch": 0.29534966276180336, "grad_norm": 0.17240851556433537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142976421.6470588, "logits/rejected": 81369497.6, "logps/chosen": -218.11764705882354, "logps/rejected": -356.8, "loss": 0.1898, "rewards/chosen": 1.4779411764705883, "rewards/margins": 7.627941176470589, "rewards/rejected": -6.15, "step": 416 }, { "epoch": 0.29605963791267303, "grad_norm": 0.2291200493825254, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92641689.6, "logits/rejected": 105731413.33333333, "logps/chosen": -189.4, "logps/rejected": -318.3333333333333, "loss": 0.2059, "rewards/chosen": 1.328125, "rewards/margins": 5.828125, "rewards/rejected": -4.5, "step": 417 }, { "epoch": 0.29676961306354277, "grad_norm": 0.2754562484624052, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124850449.06666666, "logits/rejected": 114294784.0, "logps/chosen": -267.73333333333335, "logps/rejected": -369.4117647058824, "loss": 0.1802, "rewards/chosen": 1.490625, "rewards/margins": 8.005330882352942, "rewards/rejected": -6.514705882352941, "step": 418 }, { "epoch": 0.2974795882144125, "grad_norm": 0.16182441233841927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148009168.27118644, "logits/rejected": 107471441.6231884, "logps/chosen": -264.6779661016949, "logps/rejected": -355.2463768115942, "loss": 0.1523, "rewards/chosen": 1.5031779661016949, "rewards/margins": 7.111873618275608, "rewards/rejected": -5.608695652173913, "step": 419 }, { "epoch": 0.29818956336528224, "grad_norm": 0.20972710512837717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 208037478.4, "logits/rejected": 52256431.34246575, "logps/chosen": -306.90909090909093, "logps/rejected": -318.24657534246575, "loss": 0.1911, "rewards/chosen": 0.8556818181818182, "rewards/margins": 5.218695516811955, "rewards/rejected": -4.363013698630137, "step": 420 }, { "epoch": 0.2988995385161519, "grad_norm": 0.1974182646506903, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133649008.81355932, "logits/rejected": 98171028.4057971, "logps/chosen": -201.76271186440678, "logps/rejected": -341.7971014492754, "loss": 0.18, "rewards/chosen": 0.5889830508474576, "rewards/margins": 7.603475804470646, "rewards/rejected": -7.0144927536231885, "step": 421 }, { "epoch": 0.29960951366702165, "grad_norm": 0.26856864064615427, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120914898.14925373, "logits/rejected": 69924763.27868852, "logps/chosen": -222.80597014925374, "logps/rejected": -328.91803278688525, "loss": 0.1941, "rewards/chosen": 1.3264925373134329, "rewards/margins": 7.547804012723269, "rewards/rejected": -6.221311475409836, "step": 422 }, { "epoch": 0.3003194888178914, "grad_norm": 0.19608973036511684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139062872.27586207, "logits/rejected": 109890764.8, "logps/chosen": -252.9655172413793, "logps/rejected": -346.0571428571429, "loss": 0.1695, "rewards/chosen": 0.8890086206896551, "rewards/margins": 6.939008620689655, "rewards/rejected": -6.05, "step": 423 }, { "epoch": 0.30102946396876107, "grad_norm": 0.28902334076070085, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132686474.15873016, "logits/rejected": 102349083.56923077, "logps/chosen": -261.07936507936506, "logps/rejected": -356.18461538461537, "loss": 0.1862, "rewards/chosen": 1.3581349206349207, "rewards/margins": 7.173519536019536, "rewards/rejected": -5.815384615384615, "step": 424 }, { "epoch": 0.3017394391196308, "grad_norm": 0.16658126091815506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118191730.62686567, "logits/rejected": 93856146.8852459, "logps/chosen": -206.44776119402985, "logps/rejected": -302.42622950819674, "loss": 0.188, "rewards/chosen": 1.234141791044776, "rewards/margins": 7.209551627110351, "rewards/rejected": -5.975409836065574, "step": 425 }, { "epoch": 0.30244941427050054, "grad_norm": 0.17721219149026984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120076931.65714286, "logits/rejected": 86670230.06896552, "logps/chosen": -184.45714285714286, "logps/rejected": -324.6896551724138, "loss": 0.2114, "rewards/chosen": 1.0709821428571429, "rewards/margins": 7.8296028325123155, "rewards/rejected": -6.758620689655173, "step": 426 }, { "epoch": 0.3031593894213703, "grad_norm": 0.17529242503234008, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97668634.03389831, "logits/rejected": 101757462.26086956, "logps/chosen": -233.76271186440678, "logps/rejected": -350.60869565217394, "loss": 0.1812, "rewards/chosen": 1.1980932203389831, "rewards/margins": 7.524180176860723, "rewards/rejected": -6.326086956521739, "step": 427 }, { "epoch": 0.30386936457223995, "grad_norm": 0.15124920395425273, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142471035.87096775, "logits/rejected": 81725377.93939394, "logps/chosen": -200.0, "logps/rejected": -340.6060606060606, "loss": 0.1606, "rewards/chosen": 1.375, "rewards/margins": 7.367424242424242, "rewards/rejected": -5.992424242424242, "step": 428 }, { "epoch": 0.3045793397231097, "grad_norm": 0.162520818371563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119668736.0, "logits/rejected": 128712704.0, "logps/chosen": -216.5, "logps/rejected": -386.5, "loss": 0.1564, "rewards/chosen": 1.380859375, "rewards/margins": 8.349609375, "rewards/rejected": -6.96875, "step": 429 }, { "epoch": 0.3052893148739794, "grad_norm": 0.17782326051298916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133746757.42372881, "logits/rejected": 63826365.217391305, "logps/chosen": -290.4406779661017, "logps/rejected": -339.0144927536232, "loss": 0.1693, "rewards/chosen": 1.4004237288135593, "rewards/margins": 7.972887496929502, "rewards/rejected": -6.572463768115942, "step": 430 }, { "epoch": 0.30599929002484916, "grad_norm": 0.15945958890909237, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116226371.36842105, "logits/rejected": 93363593.84615384, "logps/chosen": -234.73684210526315, "logps/rejected": -349.2307692307692, "loss": 0.1935, "rewards/chosen": 1.4383223684210527, "rewards/margins": 8.62101467611336, "rewards/rejected": -7.1826923076923075, "step": 431 }, { "epoch": 0.30670926517571884, "grad_norm": 0.22277027223194118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162227147.93220338, "logits/rejected": 48264889.507246375, "logps/chosen": -210.98305084745763, "logps/rejected": -327.42028985507244, "loss": 0.158, "rewards/chosen": 1.2139830508474576, "rewards/margins": 8.25021493490543, "rewards/rejected": -7.036231884057971, "step": 432 }, { "epoch": 0.30741924032658857, "grad_norm": 0.15970999054171894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114487022.93333334, "logits/rejected": 109206106.35294117, "logps/chosen": -230.93333333333334, "logps/rejected": -335.7647058823529, "loss": 0.1652, "rewards/chosen": 1.2197916666666666, "rewards/margins": 7.4550857843137255, "rewards/rejected": -6.235294117647059, "step": 433 }, { "epoch": 0.3081292154774583, "grad_norm": 0.20589861630232587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130186838.44155844, "logits/rejected": 120709601.88235295, "logps/chosen": -211.53246753246754, "logps/rejected": -321.88235294117646, "loss": 0.2037, "rewards/chosen": 1.5925324675324675, "rewards/margins": 6.244493251846193, "rewards/rejected": -4.651960784313726, "step": 434 }, { "epoch": 0.308839190628328, "grad_norm": 0.15815691850566405, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119264122.43478261, "logits/rejected": 49905108.61016949, "logps/chosen": -262.0289855072464, "logps/rejected": -357.96610169491527, "loss": 0.1943, "rewards/chosen": 1.1603260869565217, "rewards/margins": 5.389139646278555, "rewards/rejected": -4.228813559322034, "step": 435 }, { "epoch": 0.3095491657791977, "grad_norm": 0.1911485413856106, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106891201.93939394, "logits/rejected": 102557497.80645162, "logps/chosen": -187.03030303030303, "logps/rejected": -338.5806451612903, "loss": 0.1978, "rewards/chosen": 1.1141098484848484, "rewards/margins": 8.275400171065494, "rewards/rejected": -7.161290322580645, "step": 436 }, { "epoch": 0.31025914093006746, "grad_norm": 0.21957450722135327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135918121.5135135, "logits/rejected": 71458512.5925926, "logps/chosen": -251.67567567567568, "logps/rejected": -293.3333333333333, "loss": 0.2019, "rewards/chosen": 1.5236486486486487, "rewards/margins": 7.1023523523523515, "rewards/rejected": -5.578703703703703, "step": 437 }, { "epoch": 0.3109691160809372, "grad_norm": 0.3052114628038131, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56679783.783783786, "logits/rejected": 137169275.25925925, "logps/chosen": -194.59459459459458, "logps/rejected": -333.9259259259259, "loss": 0.2263, "rewards/chosen": 1.052364864864865, "rewards/margins": 7.7699574574574575, "rewards/rejected": -6.717592592592593, "step": 438 }, { "epoch": 0.31167909123180687, "grad_norm": 0.2594303307519694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114881275.66101696, "logits/rejected": 76470064.23188406, "logps/chosen": -197.15254237288136, "logps/rejected": -313.9710144927536, "loss": 0.1836, "rewards/chosen": 1.4194915254237288, "rewards/margins": 7.180361090641121, "rewards/rejected": -5.760869565217392, "step": 439 }, { "epoch": 0.3123890663826766, "grad_norm": 0.1875189905589556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81216977.45454545, "logits/rejected": 147747740.9032258, "logps/chosen": -174.1818181818182, "logps/rejected": -335.2258064516129, "loss": 0.2063, "rewards/chosen": 0.7414772727272727, "rewards/margins": 6.8624450146627565, "rewards/rejected": -6.120967741935484, "step": 440 }, { "epoch": 0.31309904153354634, "grad_norm": 0.2000286974235827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152532855.46666667, "logits/rejected": 49113449.4117647, "logps/chosen": -274.93333333333334, "logps/rejected": -290.3529411764706, "loss": 0.1801, "rewards/chosen": 1.2479166666666666, "rewards/margins": 7.725857843137255, "rewards/rejected": -6.477941176470588, "step": 441 }, { "epoch": 0.313809016684416, "grad_norm": 0.17569901526773152, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107975733.89473684, "logits/rejected": 137081147.07692307, "logps/chosen": -212.8421052631579, "logps/rejected": -435.0769230769231, "loss": 0.1996, "rewards/chosen": 1.381578947368421, "rewards/margins": 6.27580971659919, "rewards/rejected": -4.894230769230769, "step": 442 }, { "epoch": 0.31451899183528575, "grad_norm": 0.19629312987495365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168689664.0, "logits/rejected": 52445184.0, "logps/chosen": -203.75, "logps/rejected": -327.5, "loss": 0.1903, "rewards/chosen": 0.9990234375, "rewards/margins": 6.9912109375, "rewards/rejected": -5.9921875, "step": 443 }, { "epoch": 0.3152289669861555, "grad_norm": 0.16505167742457652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133222469.42372881, "logits/rejected": 48078729.27536232, "logps/chosen": -242.84745762711864, "logps/rejected": -296.3478260869565, "loss": 0.187, "rewards/chosen": 0.6461864406779662, "rewards/margins": 6.602708179808401, "rewards/rejected": -5.956521739130435, "step": 444 }, { "epoch": 0.3159389421370252, "grad_norm": 0.18458143420480724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72351744.0, "logits/rejected": 86930332.90322581, "logps/chosen": -200.0, "logps/rejected": -301.4193548387097, "loss": 0.1895, "rewards/chosen": 1.293560606060606, "rewards/margins": 7.325818670576735, "rewards/rejected": -6.032258064516129, "step": 445 }, { "epoch": 0.3166489172878949, "grad_norm": 0.24475800883020574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127701577.14285715, "logits/rejected": 30335886.222222224, "logps/chosen": -266.0, "logps/rejected": -341.77777777777777, "loss": 0.1645, "rewards/chosen": 1.3169642857142858, "rewards/margins": 8.025297619047619, "rewards/rejected": -6.708333333333333, "step": 446 }, { "epoch": 0.31735889243876464, "grad_norm": 0.14664714496776324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 186394869.76, "logits/rejected": 73454093.12820514, "logps/chosen": -257.44, "logps/rejected": -364.71794871794873, "loss": 0.1488, "rewards/chosen": 0.8625, "rewards/margins": 7.356089743589743, "rewards/rejected": -6.493589743589744, "step": 447 }, { "epoch": 0.3180688675896344, "grad_norm": 0.1576729421961212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138041946.3529412, "logits/rejected": 116391936.0, "logps/chosen": -192.0, "logps/rejected": -322.4, "loss": 0.1785, "rewards/chosen": 1.1709558823529411, "rewards/margins": 5.9459558823529415, "rewards/rejected": -4.775, "step": 448 }, { "epoch": 0.3187788427405041, "grad_norm": 0.2131257826380154, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101528787.3015873, "logits/rejected": 69367335.38461539, "logps/chosen": -238.22222222222223, "logps/rejected": -292.4307692307692, "loss": 0.2145, "rewards/chosen": 0.9285714285714286, "rewards/margins": 4.997802197802198, "rewards/rejected": -4.069230769230769, "step": 449 }, { "epoch": 0.3194888178913738, "grad_norm": 0.1857684938873278, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159528183.1724138, "logits/rejected": 92274688.0, "logps/chosen": -282.48275862068965, "logps/rejected": -363.42857142857144, "loss": 0.1803, "rewards/chosen": 1.3485991379310345, "rewards/margins": 7.498599137931035, "rewards/rejected": -6.15, "step": 450 }, { "epoch": 0.3201987930422435, "grad_norm": 0.26111127979158544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98951649.88235295, "logits/rejected": 109113070.93333334, "logps/chosen": -196.7058823529412, "logps/rejected": -346.4, "loss": 0.2043, "rewards/chosen": 0.9434742647058824, "rewards/margins": 7.260140931372549, "rewards/rejected": -6.316666666666666, "step": 451 }, { "epoch": 0.32090876819311326, "grad_norm": 0.18539654612554948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143130624.0, "logits/rejected": 111214592.0, "logps/chosen": -220.0, "logps/rejected": -338.25, "loss": 0.1954, "rewards/chosen": 0.79345703125, "rewards/margins": 7.25439453125, "rewards/rejected": -6.4609375, "step": 452 }, { "epoch": 0.32161874334398294, "grad_norm": 0.1716599120750327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124565083.1780822, "logits/rejected": 106802231.85454546, "logps/chosen": -192.43835616438355, "logps/rejected": -354.3272727272727, "loss": 0.225, "rewards/chosen": 0.9657534246575342, "rewards/margins": 7.256662515566625, "rewards/rejected": -6.290909090909091, "step": 453 }, { "epoch": 0.3223287184948527, "grad_norm": 0.21134023303933908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118177792.0, "logits/rejected": 121044992.0, "logps/chosen": -237.75, "logps/rejected": -369.5, "loss": 0.1777, "rewards/chosen": 0.9326171875, "rewards/margins": 5.9873046875, "rewards/rejected": -5.0546875, "step": 454 }, { "epoch": 0.3230386936457224, "grad_norm": 0.17349243092430786, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85378967.86440678, "logits/rejected": 130084211.01449275, "logps/chosen": -197.6949152542373, "logps/rejected": -368.231884057971, "loss": 0.1729, "rewards/chosen": 0.9353813559322034, "rewards/margins": 7.717990051584377, "rewards/rejected": -6.782608695652174, "step": 455 }, { "epoch": 0.32374866879659214, "grad_norm": 0.23727231023064824, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120795955.2, "logits/rejected": 65288694.33962264, "logps/chosen": -203.09333333333333, "logps/rejected": -300.9811320754717, "loss": 0.2248, "rewards/chosen": 0.9266666666666666, "rewards/margins": 6.780440251572327, "rewards/rejected": -5.85377358490566, "step": 456 }, { "epoch": 0.3244586439474618, "grad_norm": 0.33015598623750786, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111366002.7586207, "logits/rejected": 129544074.97142857, "logps/chosen": -164.9655172413793, "logps/rejected": -402.2857142857143, "loss": 0.1714, "rewards/chosen": 0.8448275862068966, "rewards/margins": 7.459113300492611, "rewards/rejected": -6.614285714285714, "step": 457 }, { "epoch": 0.32516861909833156, "grad_norm": 0.17883186602188086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119406592.0, "logits/rejected": 84738048.0, "logps/chosen": -250.5, "logps/rejected": -335.5, "loss": 0.1965, "rewards/chosen": 1.302734375, "rewards/margins": 7.349609375, "rewards/rejected": -6.046875, "step": 458 }, { "epoch": 0.3258785942492013, "grad_norm": 0.16145573263509785, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161308806.29508197, "logits/rejected": 46348624.23880597, "logps/chosen": -258.88524590163934, "logps/rejected": -289.1940298507463, "loss": 0.1822, "rewards/chosen": 1.7510245901639345, "rewards/margins": 6.758487276731099, "rewards/rejected": -5.007462686567164, "step": 459 }, { "epoch": 0.32658856940007097, "grad_norm": 0.2039664649192835, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72934286.22222222, "logits/rejected": 113620699.42857143, "logps/chosen": -214.44444444444446, "logps/rejected": -356.0, "loss": 0.2171, "rewards/chosen": 1.0095486111111112, "rewards/margins": 5.2193700396825395, "rewards/rejected": -4.209821428571429, "step": 460 }, { "epoch": 0.3272985445509407, "grad_norm": 0.22187825679797102, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117810597.64705883, "logits/rejected": 79482060.8, "logps/chosen": -192.7058823529412, "logps/rejected": -350.4, "loss": 0.2095, "rewards/chosen": 1.03125, "rewards/margins": 7.197916666666667, "rewards/rejected": -6.166666666666667, "step": 461 }, { "epoch": 0.32800851970181044, "grad_norm": 0.15844845127619384, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111966589.83050847, "logits/rejected": 87594087.88405797, "logps/chosen": -169.4915254237288, "logps/rejected": -347.3623188405797, "loss": 0.1644, "rewards/chosen": 1.1345338983050848, "rewards/margins": 8.141780275116679, "rewards/rejected": -7.007246376811594, "step": 462 }, { "epoch": 0.3287184948526802, "grad_norm": 0.24486704147891306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173364565.33333334, "logits/rejected": 45925937.548387095, "logps/chosen": -243.3939393939394, "logps/rejected": -316.9032258064516, "loss": 0.2105, "rewards/chosen": 1.228219696969697, "rewards/margins": 6.744348729227762, "rewards/rejected": -5.516129032258065, "step": 463 }, { "epoch": 0.32942847000354986, "grad_norm": 0.15379203511937697, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 185839931.07692307, "logits/rejected": 89079027.8095238, "logps/chosen": -242.46153846153845, "logps/rejected": -356.57142857142856, "loss": 0.1854, "rewards/chosen": 0.9692307692307692, "rewards/margins": 8.247008547008546, "rewards/rejected": -7.277777777777778, "step": 464 }, { "epoch": 0.3301384451544196, "grad_norm": 0.19639269914630286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129823695.23809524, "logits/rejected": 101437629.04615384, "logps/chosen": -276.8253968253968, "logps/rejected": -285.04615384615386, "loss": 0.168, "rewards/chosen": 1.630952380952381, "rewards/margins": 7.677106227106227, "rewards/rejected": -6.046153846153846, "step": 465 }, { "epoch": 0.3308484203052893, "grad_norm": 0.22524153699209157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127667719.01369864, "logits/rejected": 97765413.23636363, "logps/chosen": -202.08219178082192, "logps/rejected": -328.72727272727275, "loss": 0.2106, "rewards/chosen": 1.2953767123287672, "rewards/margins": 6.840831257783313, "rewards/rejected": -5.545454545454546, "step": 466 }, { "epoch": 0.33155839545615906, "grad_norm": 0.3093365595655324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112273615.76811594, "logits/rejected": 127322007.86440678, "logps/chosen": -213.1014492753623, "logps/rejected": -369.89830508474574, "loss": 0.1837, "rewards/chosen": 1.2518115942028984, "rewards/margins": 7.158591255219847, "rewards/rejected": -5.906779661016949, "step": 467 }, { "epoch": 0.33226837060702874, "grad_norm": 0.20602364648886629, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166253532.68965518, "logits/rejected": 48294414.62857143, "logps/chosen": -225.93103448275863, "logps/rejected": -330.9714285714286, "loss": 0.1947, "rewards/chosen": 0.771484375, "rewards/margins": 7.414341517857143, "rewards/rejected": -6.642857142857143, "step": 468 }, { "epoch": 0.3329783457578985, "grad_norm": 0.20149505478611127, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 186904072.98245615, "logits/rejected": 55116699.042253524, "logps/chosen": -241.40350877192984, "logps/rejected": -318.8732394366197, "loss": 0.1788, "rewards/chosen": 0.7412280701754386, "rewards/margins": 7.029960464541635, "rewards/rejected": -6.288732394366197, "step": 469 }, { "epoch": 0.3336883209087682, "grad_norm": 0.21658509674992799, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143458304.0, "logits/rejected": 97501184.0, "logps/chosen": -221.5, "logps/rejected": -330.5, "loss": 0.1837, "rewards/chosen": 1.4326171875, "rewards/margins": 8.0654296875, "rewards/rejected": -6.6328125, "step": 470 }, { "epoch": 0.3343982960596379, "grad_norm": 0.20276095378991263, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123596667.87096775, "logits/rejected": 76911460.84848484, "logps/chosen": -237.29032258064515, "logps/rejected": -316.3636363636364, "loss": 0.165, "rewards/chosen": 1.5403225806451613, "rewards/margins": 8.41911045943304, "rewards/rejected": -6.878787878787879, "step": 471 }, { "epoch": 0.3351082712105076, "grad_norm": 0.1542718161239687, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 174696090.41269842, "logits/rejected": 49750898.21538462, "logps/chosen": -292.57142857142856, "logps/rejected": -323.9384615384615, "loss": 0.165, "rewards/chosen": 1.6944444444444444, "rewards/margins": 8.47136752136752, "rewards/rejected": -6.776923076923077, "step": 472 }, { "epoch": 0.33581824636137736, "grad_norm": 0.24893961505036086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179587100.84507042, "logits/rejected": 92421856.56140351, "logps/chosen": -215.88732394366198, "logps/rejected": -321.6842105263158, "loss": 0.2102, "rewards/chosen": 0.903169014084507, "rewards/margins": -23510574.254725724, "rewards/rejected": 23510575.157894738, "step": 473 }, { "epoch": 0.3365282215122471, "grad_norm": 0.1672564410185234, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112988096.98461539, "logits/rejected": 106455430.09523809, "logps/chosen": -239.01538461538462, "logps/rejected": -354.53968253968253, "loss": 0.1677, "rewards/chosen": 1.3990384615384615, "rewards/margins": 8.391101953601954, "rewards/rejected": -6.992063492063492, "step": 474 }, { "epoch": 0.3372381966631168, "grad_norm": 0.18603002872782562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170072262.19354838, "logits/rejected": 100218445.57575758, "logps/chosen": -275.8709677419355, "logps/rejected": -369.45454545454544, "loss": 0.1827, "rewards/chosen": 1.3508064516129032, "rewards/margins": 7.320503421309873, "rewards/rejected": -5.96969696969697, "step": 475 }, { "epoch": 0.3379481718139865, "grad_norm": 0.17916905604882388, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80853275.56923077, "logits/rejected": 154456909.2063492, "logps/chosen": -181.66153846153847, "logps/rejected": -363.6825396825397, "loss": 0.1731, "rewards/chosen": 1.1211538461538462, "rewards/margins": -5584135.513766789, "rewards/rejected": 5584136.634920635, "step": 476 }, { "epoch": 0.33865814696485624, "grad_norm": 0.21361862094284487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148897792.0, "logits/rejected": 55640064.0, "logps/chosen": -215.5, "logps/rejected": -307.75, "loss": 0.1808, "rewards/chosen": 1.1689453125, "rewards/margins": 7.4697265625, "rewards/rejected": -6.30078125, "step": 477 }, { "epoch": 0.3393681221157259, "grad_norm": 0.17982885815185684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98536606.64788732, "logits/rejected": 90803002.38596492, "logps/chosen": -185.2394366197183, "logps/rejected": -335.1578947368421, "loss": 0.192, "rewards/chosen": 1.2940140845070423, "rewards/margins": 6.092259698542129, "rewards/rejected": -4.798245614035087, "step": 478 }, { "epoch": 0.34007809726659566, "grad_norm": 0.15708276758227413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116626691.82089552, "logits/rejected": 93443592.39344262, "logps/chosen": -198.92537313432837, "logps/rejected": -365.11475409836066, "loss": 0.1681, "rewards/chosen": 0.9692164179104478, "rewards/margins": 7.534790188402251, "rewards/rejected": -6.565573770491803, "step": 479 }, { "epoch": 0.3407880724174654, "grad_norm": 0.16374413817749908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 46152764.23529412, "logits/rejected": 118978423.46666667, "logps/chosen": -227.76470588235293, "logps/rejected": -353.6, "loss": 0.1877, "rewards/chosen": 1.4356617647058822, "rewards/margins": 6.235661764705882, "rewards/rejected": -4.8, "step": 480 }, { "epoch": 0.34149804756833513, "grad_norm": 0.15026831436218266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130208466.8235294, "logits/rejected": 81579212.8, "logps/chosen": -261.1764705882353, "logps/rejected": -344.8, "loss": 0.1833, "rewards/chosen": 1.1443014705882353, "rewards/margins": 8.310968137254902, "rewards/rejected": -7.166666666666667, "step": 481 }, { "epoch": 0.3422080227192048, "grad_norm": 0.16562683184638136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134647913.02564102, "logits/rejected": 101166612.48, "logps/chosen": -198.56410256410257, "logps/rejected": -344.0, "loss": 0.2217, "rewards/chosen": 1.0048076923076923, "rewards/margins": 4.934807692307692, "rewards/rejected": -3.93, "step": 482 }, { "epoch": 0.34291799787007454, "grad_norm": 0.23375877780961368, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141916861.369863, "logits/rejected": 98451753.89090909, "logps/chosen": -256.0, "logps/rejected": -335.7090909090909, "loss": 0.2083, "rewards/chosen": 1.2363013698630136, "rewards/margins": 7.336301369863014, "rewards/rejected": -6.1, "step": 483 }, { "epoch": 0.3436279730209443, "grad_norm": 0.180381344509433, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103233998.4516129, "logits/rejected": 88080384.0, "logps/chosen": -210.83870967741936, "logps/rejected": -326.3030303030303, "loss": 0.1653, "rewards/chosen": 1.5060483870967742, "rewards/margins": 8.422715053763442, "rewards/rejected": -6.916666666666667, "step": 484 }, { "epoch": 0.344337948171814, "grad_norm": 0.14554082713373143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117174206.98412699, "logits/rejected": 129894368.4923077, "logps/chosen": -236.1904761904762, "logps/rejected": -342.4, "loss": 0.1774, "rewards/chosen": 1.5773809523809523, "rewards/margins": 7.731227106227107, "rewards/rejected": -6.153846153846154, "step": 485 }, { "epoch": 0.3450479233226837, "grad_norm": 0.20066153604156936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80181111.46666667, "logits/rejected": 101403467.29411764, "logps/chosen": -170.53333333333333, "logps/rejected": -310.8235294117647, "loss": 0.1703, "rewards/chosen": 1.2885416666666667, "rewards/margins": 7.82530637254902, "rewards/rejected": -6.536764705882353, "step": 486 }, { "epoch": 0.3457578984735534, "grad_norm": 0.24535948594164145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146931712.0, "logits/rejected": 95125504.0, "logps/chosen": -231.0, "logps/rejected": -366.75, "loss": 0.1762, "rewards/chosen": 1.509765625, "rewards/margins": 7.478515625, "rewards/rejected": -5.96875, "step": 487 }, { "epoch": 0.34646787362442316, "grad_norm": 0.16214211045653934, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 51302551.7037037, "logits/rejected": 170266070.4864865, "logps/chosen": -255.85185185185185, "logps/rejected": -384.86486486486484, "loss": 0.1639, "rewards/chosen": 1.1597222222222223, "rewards/margins": 7.875938438438438, "rewards/rejected": -6.716216216216216, "step": 488 }, { "epoch": 0.34717784877529284, "grad_norm": 0.2329110845621555, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126329932.41791044, "logits/rejected": 80482505.44262294, "logps/chosen": -261.4925373134328, "logps/rejected": -325.24590163934425, "loss": 0.1682, "rewards/chosen": 1.3479477611940298, "rewards/margins": 7.839751039882555, "rewards/rejected": -6.491803278688525, "step": 489 }, { "epoch": 0.3478878239261626, "grad_norm": 0.13558189873777213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85179323.73333333, "logits/rejected": 121388092.23529412, "logps/chosen": -236.8, "logps/rejected": -304.0, "loss": 0.1675, "rewards/chosen": 0.9739583333333334, "rewards/margins": 7.268075980392156, "rewards/rejected": -6.294117647058823, "step": 490 }, { "epoch": 0.3485977990770323, "grad_norm": 0.16272453036084975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142900673.12280703, "logits/rejected": 82084301.52112676, "logps/chosen": -254.17543859649123, "logps/rejected": -351.5492957746479, "loss": 0.151, "rewards/chosen": 1.5888157894736843, "rewards/margins": 6.307125648628614, "rewards/rejected": -4.71830985915493, "step": 491 }, { "epoch": 0.34930777422790205, "grad_norm": 0.16888508926884352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136681049.3968254, "logits/rejected": 99534060.3076923, "logps/chosen": -236.95238095238096, "logps/rejected": -400.24615384615385, "loss": 0.1973, "rewards/chosen": 1.2445436507936507, "rewards/margins": 7.71377442002442, "rewards/rejected": -6.469230769230769, "step": 492 }, { "epoch": 0.3500177493787717, "grad_norm": 0.21261217434438767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67014961.67164179, "logits/rejected": 83679802.75409836, "logps/chosen": -156.53731343283582, "logps/rejected": -294.2950819672131, "loss": 0.2015, "rewards/chosen": 0.8292910447761194, "rewards/margins": 6.427651700513824, "rewards/rejected": -5.598360655737705, "step": 493 }, { "epoch": 0.35072772452964146, "grad_norm": 0.16051978935306183, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128929257.73913044, "logits/rejected": 67926397.83050847, "logps/chosen": -236.8695652173913, "logps/rejected": -376.6779661016949, "loss": 0.189, "rewards/chosen": 1.5480072463768115, "rewards/margins": 8.903939449766643, "rewards/rejected": -7.3559322033898304, "step": 494 }, { "epoch": 0.3514376996805112, "grad_norm": 0.1922981569178144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145500996.50704226, "logits/rejected": 92109123.36842105, "logps/chosen": -267.0422535211268, "logps/rejected": -384.0, "loss": 0.1923, "rewards/chosen": 1.857394366197183, "rewards/margins": 6.5679206819866565, "rewards/rejected": -4.7105263157894735, "step": 495 }, { "epoch": 0.3521476748313809, "grad_norm": 0.2511776982460633, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106896497.77777778, "logits/rejected": 67146313.14285715, "logps/chosen": -202.22222222222223, "logps/rejected": -321.14285714285717, "loss": 0.2149, "rewards/chosen": 0.9904513888888888, "rewards/margins": 4.294022817460317, "rewards/rejected": -3.3035714285714284, "step": 496 }, { "epoch": 0.3528576499822506, "grad_norm": 0.2467477486459445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107254345.14285715, "logits/rejected": 94577443.13725491, "logps/chosen": -199.27272727272728, "logps/rejected": -348.54901960784315, "loss": 0.2099, "rewards/chosen": 1.1891233766233766, "rewards/margins": 6.375397886427298, "rewards/rejected": -5.186274509803922, "step": 497 }, { "epoch": 0.35356762513312034, "grad_norm": 0.21992710099922047, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107503237.90769231, "logits/rejected": 109850819.04761904, "logps/chosen": -209.72307692307692, "logps/rejected": -329.6507936507937, "loss": 0.192, "rewards/chosen": 1.1211538461538462, "rewards/margins": 7.3433760683760685, "rewards/rejected": -6.222222222222222, "step": 498 }, { "epoch": 0.3542776002839901, "grad_norm": 0.18561155498871176, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110430032.45714286, "logits/rejected": 103338972.68965517, "logps/chosen": -178.28571428571428, "logps/rejected": -349.2413793103448, "loss": 0.2047, "rewards/chosen": 1.0794642857142858, "rewards/margins": 7.639809113300493, "rewards/rejected": -6.560344827586207, "step": 499 }, { "epoch": 0.35498757543485976, "grad_norm": 0.14721683433839797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92687242.49180327, "logits/rejected": 124013674.98507462, "logps/chosen": -179.80327868852459, "logps/rejected": -358.6865671641791, "loss": 0.1701, "rewards/chosen": 1.209016393442623, "rewards/margins": 6.283643259114266, "rewards/rejected": -5.074626865671642, "step": 500 }, { "epoch": 0.3556975505857295, "grad_norm": 0.16446719781563335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154456909.2063492, "logits/rejected": 71948445.53846154, "logps/chosen": -247.36507936507937, "logps/rejected": -336.73846153846154, "loss": 0.1753, "rewards/chosen": 1.4712301587301588, "rewards/margins": 8.001999389499389, "rewards/rejected": -6.530769230769231, "step": 501 }, { "epoch": 0.35640752573659923, "grad_norm": 0.1856935070131649, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116774749.46031746, "logits/rejected": 96565783.63076924, "logps/chosen": -248.88888888888889, "logps/rejected": -316.3076923076923, "loss": 0.1705, "rewards/chosen": 1.4722222222222223, "rewards/margins": 7.67991452991453, "rewards/rejected": -6.207692307692308, "step": 502 }, { "epoch": 0.35711750088746896, "grad_norm": 0.19882334122695025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103174359.57894737, "logits/rejected": 108164647.38461539, "logps/chosen": -239.78947368421052, "logps/rejected": -301.84615384615387, "loss": 0.1926, "rewards/chosen": 1.2467105263157894, "rewards/margins": 7.568825910931174, "rewards/rejected": -6.322115384615385, "step": 503 }, { "epoch": 0.35782747603833864, "grad_norm": 0.18719658798104705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152725926.6031746, "logits/rejected": 92081104.73846154, "logps/chosen": -311.6190476190476, "logps/rejected": -377.10769230769233, "loss": 0.1864, "rewards/chosen": 1.4543650793650793, "rewards/margins": 8.046672771672771, "rewards/rejected": -6.592307692307692, "step": 504 }, { "epoch": 0.3585374511892084, "grad_norm": 0.20238262730493123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108767544.40677966, "logits/rejected": 119871992.57971014, "logps/chosen": -184.67796610169492, "logps/rejected": -378.8985507246377, "loss": 0.1812, "rewards/chosen": 0.9899364406779662, "rewards/margins": 8.18558861459101, "rewards/rejected": -7.195652173913044, "step": 505 }, { "epoch": 0.3592474263400781, "grad_norm": 0.1386987180628323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167619639.85454544, "logits/rejected": 60803043.94520548, "logps/chosen": -260.07272727272726, "logps/rejected": -333.5890410958904, "loss": 0.17, "rewards/chosen": 1.3363636363636364, "rewards/margins": 8.267870485678705, "rewards/rejected": -6.931506849315069, "step": 506 }, { "epoch": 0.3599574014909478, "grad_norm": 0.26482975201307823, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103760628.18461539, "logits/rejected": 93905806.22222222, "logps/chosen": -194.2153846153846, "logps/rejected": -311.36507936507934, "loss": 0.2039, "rewards/chosen": 0.8538461538461538, "rewards/margins": 7.33003663003663, "rewards/rejected": -6.476190476190476, "step": 507 }, { "epoch": 0.3606673766418175, "grad_norm": 0.2071779048925806, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 208009042.44067797, "logits/rejected": -22620367.76811594, "logps/chosen": -251.66101694915255, "logps/rejected": -335.768115942029, "loss": 0.1631, "rewards/chosen": 1.257415254237288, "rewards/margins": 7.344371775976419, "rewards/rejected": -6.086956521739131, "step": 508 }, { "epoch": 0.36137735179268726, "grad_norm": 0.18047808577947322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 188362379.63636363, "logits/rejected": 71675243.35483871, "logps/chosen": -308.1212121212121, "logps/rejected": -365.93548387096774, "loss": 0.1697, "rewards/chosen": 1.75, "rewards/margins": 7.209677419354839, "rewards/rejected": -5.459677419354839, "step": 509 }, { "epoch": 0.362087326943557, "grad_norm": 0.17985199065122776, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173251815.22580644, "logits/rejected": 1048576.0, "logps/chosen": -170.96774193548387, "logps/rejected": -298.42424242424244, "loss": 0.1933, "rewards/chosen": 0.59375, "rewards/margins": 6.267992424242424, "rewards/rejected": -5.674242424242424, "step": 510 }, { "epoch": 0.3627973020944267, "grad_norm": 0.1671307757688729, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119968933.16129032, "logits/rejected": 109623854.54545455, "logps/chosen": -213.41935483870967, "logps/rejected": -362.6666666666667, "loss": 0.169, "rewards/chosen": 1.3689516129032258, "rewards/margins": 8.012891006842619, "rewards/rejected": -6.643939393939394, "step": 511 }, { "epoch": 0.3635072772452964, "grad_norm": 0.231153430003135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107933422.93333334, "logits/rejected": 108656214.94339623, "logps/chosen": -218.24, "logps/rejected": -324.52830188679246, "loss": 0.21, "rewards/chosen": 1.1858333333333333, "rewards/margins": 6.30375786163522, "rewards/rejected": -5.117924528301887, "step": 512 }, { "epoch": 0.36421725239616615, "grad_norm": 0.23942088496239067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120865860.26666667, "logits/rejected": 106954752.0, "logps/chosen": -219.33333333333334, "logps/rejected": -345.88235294117646, "loss": 0.1569, "rewards/chosen": 0.859375, "rewards/margins": 7.410845588235294, "rewards/rejected": -6.551470588235294, "step": 513 }, { "epoch": 0.3649272275470358, "grad_norm": 0.20745055195456447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132023784.36923076, "logits/rejected": 141807420.95238096, "logps/chosen": -295.87692307692305, "logps/rejected": -379.93650793650795, "loss": 0.1703, "rewards/chosen": 1.4653846153846153, "rewards/margins": 8.417765567765567, "rewards/rejected": -6.9523809523809526, "step": 514 }, { "epoch": 0.36563720269790556, "grad_norm": 0.1973871679234739, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122498349.1764706, "logits/rejected": 89967820.8, "logps/chosen": -239.2941176470588, "logps/rejected": -334.93333333333334, "loss": 0.1602, "rewards/chosen": 1.8125, "rewards/margins": 6.5125, "rewards/rejected": -4.7, "step": 515 }, { "epoch": 0.3663471778487753, "grad_norm": 0.17171347809749468, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82822735.32394366, "logits/rejected": 117072590.59649123, "logps/chosen": -211.6056338028169, "logps/rejected": -340.49122807017545, "loss": 0.2101, "rewards/chosen": 1.2095070422535212, "rewards/margins": -5436169.457159624, "rewards/rejected": 5436170.666666667, "step": 516 }, { "epoch": 0.36705715299964503, "grad_norm": 0.13979024504654966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141407963.42857143, "logits/rejected": 41620401.23076923, "logps/chosen": -248.12698412698413, "logps/rejected": -330.83076923076925, "loss": 0.1311, "rewards/chosen": 2.0833333333333335, "rewards/margins": 9.59871794871795, "rewards/rejected": -7.515384615384615, "step": 517 }, { "epoch": 0.3677671281505147, "grad_norm": 0.15134378123669773, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151806744.77419356, "logits/rejected": 100663296.0, "logps/chosen": -290.06451612903226, "logps/rejected": -370.42424242424244, "loss": 0.17, "rewards/chosen": 1.418850806451613, "rewards/margins": 9.206729594330401, "rewards/rejected": -7.787878787878788, "step": 518 }, { "epoch": 0.36847710330138445, "grad_norm": 0.13192893482493806, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144155002.0923077, "logits/rejected": 49499444.82539683, "logps/chosen": -232.86153846153846, "logps/rejected": -327.1111111111111, "loss": 0.1528, "rewards/chosen": 1.775, "rewards/margins": 9.917857142857143, "rewards/rejected": -8.142857142857142, "step": 519 }, { "epoch": 0.3691870784522542, "grad_norm": 0.15514642086901115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113246208.0, "logits/rejected": 110100480.0, "logps/chosen": -179.52941176470588, "logps/rejected": -380.8, "loss": 0.187, "rewards/chosen": 1.3547794117647058, "rewards/margins": 7.929779411764706, "rewards/rejected": -6.575, "step": 520 }, { "epoch": 0.3698970536031239, "grad_norm": 0.14897999057269537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102038477.63934426, "logits/rejected": 115061653.01492538, "logps/chosen": -186.49180327868854, "logps/rejected": -353.910447761194, "loss": 0.1932, "rewards/chosen": 0.9851434426229508, "rewards/margins": 7.5075315023244436, "rewards/rejected": -6.522388059701493, "step": 521 }, { "epoch": 0.3706070287539936, "grad_norm": 0.15133162901825595, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 40103432.98245614, "logits/rejected": 137496374.08450705, "logps/chosen": -174.59649122807016, "logps/rejected": -315.943661971831, "loss": 0.1739, "rewards/chosen": 1.0548245614035088, "rewards/margins": 7.815387941685199, "rewards/rejected": -6.76056338028169, "step": 522 }, { "epoch": 0.37131700390486333, "grad_norm": 0.15921698738133208, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149037602.13333333, "logits/rejected": 132614023.52941176, "logps/chosen": -256.26666666666665, "logps/rejected": -324.70588235294116, "loss": 0.1791, "rewards/chosen": 1.046875, "rewards/margins": 7.318933823529412, "rewards/rejected": -6.272058823529412, "step": 523 }, { "epoch": 0.37202697905573306, "grad_norm": 0.19488750496379353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82960865.88235295, "logits/rejected": 140788804.26666668, "logps/chosen": -202.94117647058823, "logps/rejected": -434.6666666666667, "loss": 0.1702, "rewards/chosen": 1.40625, "rewards/margins": 8.472916666666666, "rewards/rejected": -7.066666666666666, "step": 524 }, { "epoch": 0.37273695420660274, "grad_norm": 0.1862920548465119, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115142263.23287672, "logits/rejected": 70921867.63636364, "logps/chosen": -209.75342465753425, "logps/rejected": -297.0181818181818, "loss": 0.2217, "rewards/chosen": 0.9426369863013698, "rewards/margins": 7.506273349937733, "rewards/rejected": -6.5636363636363635, "step": 525 }, { "epoch": 0.3734469293574725, "grad_norm": 0.20363668682423178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118472956.06153846, "logits/rejected": 112314140.44444445, "logps/chosen": -228.43076923076924, "logps/rejected": -379.6825396825397, "loss": 0.1819, "rewards/chosen": 1.4956730769230768, "rewards/margins": 8.134561965811965, "rewards/rejected": -6.638888888888889, "step": 526 }, { "epoch": 0.3741569045083422, "grad_norm": 0.180142553999124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90740186.53658536, "logits/rejected": 101210379.13043478, "logps/chosen": -199.8048780487805, "logps/rejected": -310.2608695652174, "loss": 0.1999, "rewards/chosen": 1.4603658536585367, "rewards/margins": 7.319061505832449, "rewards/rejected": -5.858695652173913, "step": 527 }, { "epoch": 0.37486687965921195, "grad_norm": 0.16499543938626068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 85006971.5862069, "logps/chosen": -258.0571428571429, "logps/rejected": -307.0344827586207, "loss": 0.1762, "rewards/chosen": 1.9428571428571428, "rewards/margins": 6.813546798029557, "rewards/rejected": -4.870689655172414, "step": 528 }, { "epoch": 0.37557685481008163, "grad_norm": 0.15249956135390114, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144551520.46376812, "logits/rejected": 96824441.49152543, "logps/chosen": -246.7246376811594, "logps/rejected": -341.6949152542373, "loss": 0.1922, "rewards/chosen": 1.3451086956521738, "rewards/margins": 7.463752763448784, "rewards/rejected": -6.11864406779661, "step": 529 }, { "epoch": 0.37628682996095136, "grad_norm": 0.19705892862323118, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102692797.93548387, "logits/rejected": 124939419.15151516, "logps/chosen": -224.0, "logps/rejected": -371.3939393939394, "loss": 0.1512, "rewards/chosen": 1.377016129032258, "rewards/margins": 6.301258553274682, "rewards/rejected": -4.924242424242424, "step": 530 }, { "epoch": 0.3769968051118211, "grad_norm": 0.1601197063040403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155559333.6470588, "logits/rejected": 67803545.6, "logps/chosen": -249.64705882352942, "logps/rejected": -297.6, "loss": 0.1935, "rewards/chosen": 1.734375, "rewards/margins": 7.709375, "rewards/rejected": -5.975, "step": 531 }, { "epoch": 0.37770678026269083, "grad_norm": 0.1663970882605342, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131507562.33846153, "logits/rejected": 98832449.01587301, "logps/chosen": -232.12307692307692, "logps/rejected": -332.1904761904762, "loss": 0.1789, "rewards/chosen": 1.3673076923076923, "rewards/margins": 7.708577533577533, "rewards/rejected": -6.341269841269841, "step": 532 }, { "epoch": 0.3784167554135605, "grad_norm": 0.1470051684112572, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131072000.0, "logits/rejected": 122333866.66666667, "logps/chosen": -264.0, "logps/rejected": -398.22222222222223, "loss": 0.1682, "rewards/chosen": 1.4232700892857142, "rewards/margins": 7.8816034226190474, "rewards/rejected": -6.458333333333333, "step": 533 }, { "epoch": 0.37912673056443025, "grad_norm": 0.16247073615836918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118755675.11864407, "logits/rejected": 66181862.02898551, "logps/chosen": -202.71186440677965, "logps/rejected": -327.18840579710144, "loss": 0.1845, "rewards/chosen": 0.5794491525423728, "rewards/margins": 7.825825964136576, "rewards/rejected": -7.246376811594203, "step": 534 }, { "epoch": 0.3798367057153, "grad_norm": 0.1932968344591622, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147556989.90163934, "logits/rejected": 91147860.05970149, "logps/chosen": -236.327868852459, "logps/rejected": -291.82089552238807, "loss": 0.1727, "rewards/chosen": 1.3227459016393444, "rewards/margins": 7.591402618057256, "rewards/rejected": -6.268656716417911, "step": 535 }, { "epoch": 0.38054668086616966, "grad_norm": 0.14199692067736142, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169266021.69863012, "logits/rejected": 100396385.74545455, "logps/chosen": -231.23287671232876, "logps/rejected": -360.72727272727275, "loss": 0.1805, "rewards/chosen": 1.6284246575342465, "rewards/margins": 8.873879202988793, "rewards/rejected": -7.245454545454545, "step": 536 }, { "epoch": 0.3812566560170394, "grad_norm": 0.1727967288669775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127426950.09523809, "logits/rejected": 94468631.63076924, "logps/chosen": -219.68253968253967, "logps/rejected": -313.10769230769233, "loss": 0.1844, "rewards/chosen": 1.244047619047619, "rewards/margins": 5.736355311355311, "rewards/rejected": -4.492307692307692, "step": 537 }, { "epoch": 0.38196663116790913, "grad_norm": 0.17950896303136715, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120676117.94285715, "logits/rejected": 101033913.37931034, "logps/chosen": -220.57142857142858, "logps/rejected": -362.2068965517241, "loss": 0.1898, "rewards/chosen": 1.4321428571428572, "rewards/margins": 5.708004926108374, "rewards/rejected": -4.275862068965517, "step": 538 }, { "epoch": 0.38267660631877887, "grad_norm": 0.22227168570445777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90173016.27586207, "logits/rejected": 120316606.17142858, "logps/chosen": -181.3793103448276, "logps/rejected": -316.8, "loss": 0.1571, "rewards/chosen": 1.1810344827586208, "rewards/margins": 7.9596059113300495, "rewards/rejected": -6.7785714285714285, "step": 539 }, { "epoch": 0.38338658146964855, "grad_norm": 0.30881391495264293, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141208234.66666666, "logits/rejected": 83096243.53246753, "logps/chosen": -203.76470588235293, "logps/rejected": -316.46753246753246, "loss": 0.1753, "rewards/chosen": 0.49203431372549017, "rewards/margins": 6.719307040998218, "rewards/rejected": -6.2272727272727275, "step": 540 }, { "epoch": 0.3840965566205183, "grad_norm": 0.19766551952706138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85324127.08571428, "logits/rejected": 107099383.1724138, "logps/chosen": -213.4857142857143, "logps/rejected": -337.6551724137931, "loss": 0.1681, "rewards/chosen": 1.65, "rewards/margins": 8.71896551724138, "rewards/rejected": -7.068965517241379, "step": 541 }, { "epoch": 0.384806531771388, "grad_norm": 0.13685152846231724, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179489580.6984127, "logits/rejected": 77207457.47692308, "logps/chosen": -239.4920634920635, "logps/rejected": -380.55384615384617, "loss": 0.1479, "rewards/chosen": 1.6478174603174602, "rewards/margins": 8.763202075702075, "rewards/rejected": -7.115384615384615, "step": 542 }, { "epoch": 0.3855165069222577, "grad_norm": 0.17092442656195012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102390362.35294117, "logits/rejected": 94235661.2987013, "logps/chosen": -210.35294117647058, "logps/rejected": -337.038961038961, "loss": 0.1445, "rewards/chosen": 1.196078431372549, "rewards/margins": 8.170104405398522, "rewards/rejected": -6.974025974025974, "step": 543 }, { "epoch": 0.38622648207312743, "grad_norm": 0.17609280051978368, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140640256.0, "logits/rejected": 42106880.0, "logps/chosen": -233.5, "logps/rejected": -269.25, "loss": 0.1769, "rewards/chosen": 0.98779296875, "rewards/margins": 8.00341796875, "rewards/rejected": -7.015625, "step": 544 }, { "epoch": 0.38693645722399717, "grad_norm": 0.19389867819096096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169551561.6969697, "logits/rejected": 45731443.61290322, "logps/chosen": -278.06060606060606, "logps/rejected": -376.0, "loss": 0.1865, "rewards/chosen": 1.3910984848484849, "rewards/margins": 8.987872678396872, "rewards/rejected": -7.596774193548387, "step": 545 }, { "epoch": 0.3876464323748669, "grad_norm": 0.20424635925411502, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148198741.33333334, "logits/rejected": 37782561.03225806, "logps/chosen": -215.75757575757575, "logps/rejected": -339.61290322580646, "loss": 0.1617, "rewards/chosen": 1.7007575757575757, "rewards/margins": 8.587854349951124, "rewards/rejected": -6.887096774193548, "step": 546 }, { "epoch": 0.3883564075257366, "grad_norm": 0.19266490271712686, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 55073715.58208955, "logits/rejected": 68965359.21311475, "logps/chosen": -174.56716417910448, "logps/rejected": -311.344262295082, "loss": 0.1983, "rewards/chosen": 0.9169776119402985, "rewards/margins": 7.335010398825545, "rewards/rejected": -6.418032786885246, "step": 547 }, { "epoch": 0.3890663826766063, "grad_norm": 0.17231985726775928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 213310317.7142857, "logits/rejected": 89187214.22222222, "logps/chosen": -266.0, "logps/rejected": -342.6666666666667, "loss": 0.17, "rewards/chosen": 1.1071428571428572, "rewards/margins": 7.572420634920634, "rewards/rejected": -6.465277777777778, "step": 548 }, { "epoch": 0.38977635782747605, "grad_norm": 0.1838740820227176, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68717885.79310344, "logits/rejected": 151474293.02857143, "logps/chosen": -218.20689655172413, "logps/rejected": -341.0285714285714, "loss": 0.1747, "rewards/chosen": 1.3566810344827587, "rewards/margins": 7.663823891625616, "rewards/rejected": -6.307142857142857, "step": 549 }, { "epoch": 0.3904863329783458, "grad_norm": 0.1939766032064981, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100172833.03225806, "logits/rejected": 109306104.24242425, "logps/chosen": -226.83870967741936, "logps/rejected": -336.969696969697, "loss": 0.161, "rewards/chosen": 1.4879032258064515, "rewards/margins": 8.283357771260997, "rewards/rejected": -6.795454545454546, "step": 550 }, { "epoch": 0.39119630812921546, "grad_norm": 0.17867783675923213, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144179200.0, "logits/rejected": 46137344.0, "logps/chosen": -231.0, "logps/rejected": -288.0, "loss": 0.1714, "rewards/chosen": 1.482421875, "rewards/margins": 6.708984375, "rewards/rejected": -5.2265625, "step": 551 }, { "epoch": 0.3919062832800852, "grad_norm": 0.19229892848945143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148727752.64864865, "logits/rejected": 119949084.44444445, "logps/chosen": -263.56756756756755, "logps/rejected": -388.14814814814815, "loss": 0.1962, "rewards/chosen": 1.2263513513513513, "rewards/margins": 9.35598098098098, "rewards/rejected": -8.12962962962963, "step": 552 }, { "epoch": 0.39261625843095493, "grad_norm": 0.14892977713332972, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107632293.41538462, "logits/rejected": 77428183.36507936, "logps/chosen": -186.58461538461538, "logps/rejected": -312.63492063492066, "loss": 0.1617, "rewards/chosen": 1.4961538461538462, "rewards/margins": 7.722344322344322, "rewards/rejected": -6.226190476190476, "step": 553 }, { "epoch": 0.3933262335818246, "grad_norm": 0.2449798747014588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 60051781.07936508, "logits/rejected": 178612822.64615384, "logps/chosen": -180.06349206349208, "logps/rejected": -415.5076923076923, "loss": 0.1785, "rewards/chosen": 1.0734126984126984, "rewards/margins": 8.273412698412699, "rewards/rejected": -7.2, "step": 554 }, { "epoch": 0.39403620873269435, "grad_norm": 0.21477170015920144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96278341.81818181, "logits/rejected": 65349962.32258064, "logps/chosen": -202.3030303030303, "logps/rejected": -296.0, "loss": 0.1961, "rewards/chosen": 1.2054924242424243, "rewards/margins": 5.878879521016618, "rewards/rejected": -4.673387096774194, "step": 555 }, { "epoch": 0.3947461838835641, "grad_norm": 0.14084616165537034, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150607777.47692308, "logits/rejected": 42625446.603174604, "logps/chosen": -224.73846153846154, "logps/rejected": -365.2063492063492, "loss": 0.1644, "rewards/chosen": 1.4403846153846154, "rewards/margins": 8.495940170940171, "rewards/rejected": -7.055555555555555, "step": 556 }, { "epoch": 0.3954561590344338, "grad_norm": 0.1378653394769045, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140176685.17647058, "logits/rejected": 107024657.06666666, "logps/chosen": -253.64705882352942, "logps/rejected": -328.0, "loss": 0.207, "rewards/chosen": 1.1948529411764706, "rewards/margins": 8.811519607843136, "rewards/rejected": -7.616666666666666, "step": 557 }, { "epoch": 0.3961661341853035, "grad_norm": 0.18803999335428367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119930880.0, "logits/rejected": 74047488.0, "logps/chosen": -211.75, "logps/rejected": -313.25, "loss": 0.161, "rewards/chosen": 1.36328125, "rewards/margins": 8.80078125, "rewards/rejected": -7.4375, "step": 558 }, { "epoch": 0.39687610933617323, "grad_norm": 0.1379532346237037, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 185561794.20689654, "logits/rejected": 43800517.48571429, "logps/chosen": -216.82758620689654, "logps/rejected": -320.22857142857146, "loss": 0.1753, "rewards/chosen": 0.7446120689655172, "rewards/margins": 8.280326354679802, "rewards/rejected": -7.535714285714286, "step": 559 }, { "epoch": 0.39758608448704297, "grad_norm": 0.15533131184243057, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179516211.2, "logits/rejected": 105323633.77777778, "logps/chosen": -272.0, "logps/rejected": -440.8888888888889, "loss": 0.1879, "rewards/chosen": 1.2817307692307693, "rewards/margins": 8.273794261294261, "rewards/rejected": -6.992063492063492, "step": 560 }, { "epoch": 0.39829605963791265, "grad_norm": 0.1808444484593907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125467542.06896552, "logits/rejected": 98326469.48571429, "logps/chosen": -209.24137931034483, "logps/rejected": -373.0285714285714, "loss": 0.1752, "rewards/chosen": 0.5899784482758621, "rewards/margins": 6.618549876847291, "rewards/rejected": -6.0285714285714285, "step": 561 }, { "epoch": 0.3990060347887824, "grad_norm": 0.14994011414336245, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152347945.29032257, "logits/rejected": 79183375.51515152, "logps/chosen": -226.06451612903226, "logps/rejected": -362.6666666666667, "loss": 0.173, "rewards/chosen": 1.2338709677419355, "rewards/margins": 8.082355816226784, "rewards/rejected": -6.848484848484849, "step": 562 }, { "epoch": 0.3997160099396521, "grad_norm": 0.1481534520898313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83337594.09230769, "logits/rejected": 100496855.36507936, "logps/chosen": -225.96923076923076, "logps/rejected": -358.6031746031746, "loss": 0.1634, "rewards/chosen": 1.7096153846153845, "rewards/margins": 8.947710622710623, "rewards/rejected": -7.238095238095238, "step": 563 }, { "epoch": 0.40042598509052185, "grad_norm": 0.21136029725599892, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143743774.37288135, "logits/rejected": 75406291.47826087, "logps/chosen": -202.03389830508473, "logps/rejected": -342.2608695652174, "loss": 0.1841, "rewards/chosen": 0.7748940677966102, "rewards/margins": 7.832865082289364, "rewards/rejected": -7.057971014492754, "step": 564 }, { "epoch": 0.40113596024139153, "grad_norm": 0.18257160404345588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164158605.7846154, "logits/rejected": 60850696.12698413, "logps/chosen": -195.3230769230769, "logps/rejected": -354.2857142857143, "loss": 0.1855, "rewards/chosen": 0.9240384615384616, "rewards/margins": 8.170070207570207, "rewards/rejected": -7.246031746031746, "step": 565 }, { "epoch": 0.40184593539226127, "grad_norm": 0.1773292950923495, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104794998.4477612, "logits/rejected": 103241761.5737705, "logps/chosen": -195.5820895522388, "logps/rejected": -360.91803278688525, "loss": 0.1934, "rewards/chosen": 1.164179104477612, "rewards/margins": 7.803523366772694, "rewards/rejected": -6.639344262295082, "step": 566 }, { "epoch": 0.402555910543131, "grad_norm": 0.1846554465323305, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87209259.32307692, "logits/rejected": 109451361.52380952, "logps/chosen": -204.30769230769232, "logps/rejected": -349.46031746031747, "loss": 0.1943, "rewards/chosen": 1.0384615384615385, "rewards/margins": 8.744810744810746, "rewards/rejected": -7.7063492063492065, "step": 567 }, { "epoch": 0.40326588569400074, "grad_norm": 0.17056222662665346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107288389.81818181, "logits/rejected": 64944061.93548387, "logps/chosen": -216.72727272727272, "logps/rejected": -302.7096774193548, "loss": 0.1801, "rewards/chosen": 1.0880681818181819, "rewards/margins": 7.950971407624634, "rewards/rejected": -6.862903225806452, "step": 568 }, { "epoch": 0.4039758608448704, "grad_norm": 0.15702981268212754, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86984856.8358209, "logits/rejected": 135936705.04918033, "logps/chosen": -168.955223880597, "logps/rejected": -349.37704918032784, "loss": 0.1869, "rewards/chosen": 1.2243470149253732, "rewards/margins": 7.937461769023734, "rewards/rejected": -6.713114754098361, "step": 569 }, { "epoch": 0.40468583599574015, "grad_norm": 0.16459915066777536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131830201.1076923, "logits/rejected": 96535568.25396825, "logps/chosen": -247.3846153846154, "logps/rejected": -327.6190476190476, "loss": 0.2067, "rewards/chosen": 1.15, "rewards/margins": 7.907936507936508, "rewards/rejected": -6.757936507936508, "step": 570 }, { "epoch": 0.4053958111466099, "grad_norm": 0.13837929177017028, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166693624.68571427, "logits/rejected": 84898498.20689656, "logps/chosen": -289.14285714285717, "logps/rejected": -355.3103448275862, "loss": 0.1595, "rewards/chosen": 1.4625, "rewards/margins": 8.859051724137931, "rewards/rejected": -7.396551724137931, "step": 571 }, { "epoch": 0.40610578629747957, "grad_norm": 0.13945326824160767, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91930892.59016393, "logits/rejected": 126204729.31343284, "logps/chosen": -206.68852459016392, "logps/rejected": -376.35820895522386, "loss": 0.1823, "rewards/chosen": 1.2725409836065573, "rewards/margins": 7.906869341815512, "rewards/rejected": -6.634328358208955, "step": 572 }, { "epoch": 0.4068157614483493, "grad_norm": 0.1651463983868889, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106485240.35820895, "logits/rejected": 106885992.91803278, "logps/chosen": -195.22388059701493, "logps/rejected": -380.8524590163934, "loss": 0.1785, "rewards/chosen": 1.705223880597015, "rewards/margins": 7.901945192072425, "rewards/rejected": -6.19672131147541, "step": 573 }, { "epoch": 0.40752573659921904, "grad_norm": 0.17485986953835844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75852921.49152543, "logits/rejected": 142971058.08695653, "logps/chosen": -294.50847457627117, "logps/rejected": -368.69565217391306, "loss": 0.1278, "rewards/chosen": 1.6483050847457628, "rewards/margins": 8.822218128224023, "rewards/rejected": -7.173913043478261, "step": 574 }, { "epoch": 0.40823571175008877, "grad_norm": 0.14976069026329403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 57089137.777777776, "logits/rejected": 109165263.56756757, "logps/chosen": -221.92592592592592, "logps/rejected": -395.6756756756757, "loss": 0.1548, "rewards/chosen": 0.8981481481481481, "rewards/margins": 7.9994994994995, "rewards/rejected": -7.101351351351352, "step": 575 }, { "epoch": 0.40894568690095845, "grad_norm": 0.12273462233410902, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126682198.77966101, "logits/rejected": 98596537.50724638, "logps/chosen": -241.08474576271186, "logps/rejected": -380.28985507246375, "loss": 0.1356, "rewards/chosen": 1.478813559322034, "rewards/margins": 7.478813559322034, "rewards/rejected": -6.0, "step": 576 }, { "epoch": 0.4096556620518282, "grad_norm": 0.228927666223811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86908446.11764705, "logits/rejected": 96538897.06666666, "logps/chosen": -213.41176470588235, "logps/rejected": -374.4, "loss": 0.1812, "rewards/chosen": 1.3161764705882353, "rewards/margins": 8.407843137254902, "rewards/rejected": -7.091666666666667, "step": 577 }, { "epoch": 0.4103656372026979, "grad_norm": 0.17621767292414167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144703488.0, "logits/rejected": 47535445.333333336, "logps/chosen": -249.41176470588235, "logps/rejected": -325.8666666666667, "loss": 0.2283, "rewards/chosen": 0.9136029411764706, "rewards/margins": 4.48860294117647, "rewards/rejected": -3.575, "step": 578 }, { "epoch": 0.4110756123535676, "grad_norm": 0.16327171322313916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103980609.16363636, "logits/rejected": 125599295.12328768, "logps/chosen": -161.8909090909091, "logps/rejected": -386.63013698630135, "loss": 0.1554, "rewards/chosen": 0.803409090909091, "rewards/margins": 8.022587173100872, "rewards/rejected": -7.219178082191781, "step": 579 }, { "epoch": 0.41178558750443733, "grad_norm": 0.1485037077176721, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135775612.34285715, "logits/rejected": 105363809.10344827, "logps/chosen": -231.77142857142857, "logps/rejected": -368.0, "loss": 0.1914, "rewards/chosen": 1.24375, "rewards/margins": 7.2609913793103456, "rewards/rejected": -6.017241379310345, "step": 580 }, { "epoch": 0.41249556265530707, "grad_norm": 0.18414522763901814, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106095263.47540984, "logits/rejected": 108551091.58208956, "logps/chosen": -160.52459016393442, "logps/rejected": -345.7910447761194, "loss": 0.1719, "rewards/chosen": 1.1331967213114753, "rewards/margins": 8.81976388549058, "rewards/rejected": -7.686567164179104, "step": 581 }, { "epoch": 0.4132055378061768, "grad_norm": 0.16163743396826408, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100313770.66666667, "logits/rejected": 60517814.85714286, "logps/chosen": -221.77777777777777, "logps/rejected": -344.85714285714283, "loss": 0.1711, "rewards/chosen": 1.5138888888888888, "rewards/margins": 6.415674603174603, "rewards/rejected": -4.901785714285714, "step": 582 }, { "epoch": 0.4139155129570465, "grad_norm": 0.15846339269525986, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104951502.3283582, "logits/rejected": 117853066.49180327, "logps/chosen": -186.50746268656715, "logps/rejected": -352.0, "loss": 0.1761, "rewards/chosen": 1.4029850746268657, "rewards/margins": 4.501345730364571, "rewards/rejected": -3.098360655737705, "step": 583 }, { "epoch": 0.4146254881079162, "grad_norm": 0.1521668425241513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154854919.42028984, "logits/rejected": 63625458.983050846, "logps/chosen": -280.57971014492756, "logps/rejected": -346.03389830508473, "loss": 0.2101, "rewards/chosen": 1.2427536231884058, "rewards/margins": 8.32749938590027, "rewards/rejected": -7.084745762711864, "step": 584 }, { "epoch": 0.41533546325878595, "grad_norm": 0.1338585069435172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146872955.58620688, "logits/rejected": 91136234.05714285, "logps/chosen": -272.2758620689655, "logps/rejected": -319.0857142857143, "loss": 0.1751, "rewards/chosen": 1.2920258620689655, "rewards/margins": 8.163454433497536, "rewards/rejected": -6.871428571428571, "step": 585 }, { "epoch": 0.4160454384096557, "grad_norm": 0.134948027556117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95501075.6923077, "logits/rejected": 115975834.41269842, "logps/chosen": -204.8, "logps/rejected": -323.04761904761904, "loss": 0.1841, "rewards/chosen": 1.3846153846153846, "rewards/margins": 8.178266178266178, "rewards/rejected": -6.7936507936507935, "step": 586 }, { "epoch": 0.41675541356052537, "grad_norm": 0.13535321117710955, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62836887.7037037, "logits/rejected": 121408096.86486487, "logps/chosen": -195.25925925925927, "logps/rejected": -344.2162162162162, "loss": 0.1431, "rewards/chosen": 1.3425925925925926, "rewards/margins": 9.126376376376378, "rewards/rejected": -7.783783783783784, "step": 587 }, { "epoch": 0.4174653887113951, "grad_norm": 0.1602601087859153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 21336242.086956523, "logits/rejected": 173603265.5609756, "logps/chosen": -235.82608695652175, "logps/rejected": -350.4390243902439, "loss": 0.1391, "rewards/chosen": 0.9891304347826086, "rewards/margins": 7.354984093319194, "rewards/rejected": -6.365853658536586, "step": 588 }, { "epoch": 0.41817536386226484, "grad_norm": 0.18050792121831075, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128299737.42465754, "logits/rejected": 117135471.7090909, "logps/chosen": -248.32876712328766, "logps/rejected": -368.8727272727273, "loss": 0.2041, "rewards/chosen": 1.0667808219178083, "rewards/margins": 8.366780821917809, "rewards/rejected": -7.3, "step": 589 }, { "epoch": 0.4188853390131345, "grad_norm": 0.27277620503791256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114603188.70588236, "logits/rejected": 100873011.2, "logps/chosen": -218.7058823529412, "logps/rejected": -373.3333333333333, "loss": 0.1589, "rewards/chosen": 1.0983455882352942, "rewards/margins": 8.298345588235295, "rewards/rejected": -7.2, "step": 590 }, { "epoch": 0.41959531416400425, "grad_norm": 0.14779621972013693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173272886.55737704, "logits/rejected": 49971689.07462686, "logps/chosen": -180.98360655737704, "logps/rejected": -367.76119402985074, "loss": 0.1587, "rewards/chosen": 0.9692622950819673, "rewards/margins": 8.74538169806704, "rewards/rejected": -7.776119402985074, "step": 591 }, { "epoch": 0.420305289314874, "grad_norm": 0.16895109870121797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159136828.2352941, "logits/rejected": 40824558.93333333, "logps/chosen": -267.7647058823529, "logps/rejected": -331.2, "loss": 0.2071, "rewards/chosen": 1.3511029411764706, "rewards/margins": 7.934436274509803, "rewards/rejected": -6.583333333333333, "step": 592 }, { "epoch": 0.4210152644657437, "grad_norm": 0.14418326544139523, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104698724.84848484, "logits/rejected": 83344879.48387097, "logps/chosen": -196.6060606060606, "logps/rejected": -349.16129032258067, "loss": 0.179, "rewards/chosen": 1.3863636363636365, "rewards/margins": 8.29765395894428, "rewards/rejected": -6.911290322580645, "step": 593 }, { "epoch": 0.4217252396166134, "grad_norm": 0.20442301694179038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115015680.0, "logits/rejected": 96616448.0, "logps/chosen": -192.375, "logps/rejected": -367.0, "loss": 0.1796, "rewards/chosen": 0.890625, "rewards/margins": 8.234375, "rewards/rejected": -7.34375, "step": 594 }, { "epoch": 0.42243521476748314, "grad_norm": 0.1430155096233023, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139374659.147541, "logits/rejected": 70990160.23880596, "logps/chosen": -242.62295081967213, "logps/rejected": -318.8059701492537, "loss": 0.1742, "rewards/chosen": 1.2202868852459017, "rewards/margins": 8.862077930022021, "rewards/rejected": -7.641791044776119, "step": 595 }, { "epoch": 0.42314518991835287, "grad_norm": 0.17344506719694328, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 170568362.66666666, "logits/rejected": 70921867.63636364, "logps/chosen": -240.31372549019608, "logps/rejected": -363.42857142857144, "loss": 0.155, "rewards/chosen": 1.0147058823529411, "rewards/margins": 8.478991596638656, "rewards/rejected": -7.464285714285714, "step": 596 }, { "epoch": 0.42385516506922255, "grad_norm": 0.21886734638048705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146950436.57142857, "logits/rejected": 85691960.8888889, "logps/chosen": -238.28571428571428, "logps/rejected": -324.22222222222223, "loss": 0.1763, "rewards/chosen": 1.1595982142857142, "rewards/margins": 8.097098214285714, "rewards/rejected": -6.9375, "step": 597 }, { "epoch": 0.4245651402200923, "grad_norm": 0.1470053121619513, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129260823.27272727, "logits/rejected": 142335735.7419355, "logps/chosen": -190.78787878787878, "logps/rejected": -414.4516129032258, "loss": 0.1899, "rewards/chosen": 1.1803977272727273, "rewards/margins": 8.462655791788857, "rewards/rejected": -7.282258064516129, "step": 598 }, { "epoch": 0.425275115370962, "grad_norm": 0.1812086247277244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107766552.77419356, "logits/rejected": 55781065.696969695, "logps/chosen": -199.3548387096774, "logps/rejected": -424.24242424242425, "loss": 0.1876, "rewards/chosen": 0.8709677419354839, "rewards/margins": 6.810361681329423, "rewards/rejected": -5.9393939393939394, "step": 599 }, { "epoch": 0.42598509052183176, "grad_norm": 0.16765542465098487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151227960.8888889, "logits/rejected": 91758465.96923077, "logps/chosen": -258.2857142857143, "logps/rejected": -381.04615384615386, "loss": 0.1736, "rewards/chosen": 1.4613095238095237, "rewards/margins": 7.076694139194139, "rewards/rejected": -5.615384615384615, "step": 600 }, { "epoch": 0.42669506567270143, "grad_norm": 0.14950364337510721, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117530389.94285715, "logits/rejected": 47945233.655172415, "logps/chosen": -215.31428571428572, "logps/rejected": -358.0689655172414, "loss": 0.1566, "rewards/chosen": 1.9107142857142858, "rewards/margins": -24551892.15825123, "rewards/rejected": 24551894.068965517, "step": 601 }, { "epoch": 0.42740504082357117, "grad_norm": 0.1577596052553197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109941604.84848484, "logits/rejected": 113449158.19354838, "logps/chosen": -218.42424242424244, "logps/rejected": -334.7096774193548, "loss": 0.183, "rewards/chosen": 1.375, "rewards/margins": 6.495967741935484, "rewards/rejected": -5.120967741935484, "step": 602 }, { "epoch": 0.4281150159744409, "grad_norm": 0.17009911814835013, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91003686.78787878, "logits/rejected": 83344879.48387097, "logps/chosen": -141.21212121212122, "logps/rejected": -332.9032258064516, "loss": 0.1845, "rewards/chosen": 0.9356060606060606, "rewards/margins": 7.975928641251222, "rewards/rejected": -7.040322580645161, "step": 603 }, { "epoch": 0.42882499112531064, "grad_norm": 0.15554170898129904, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110335235.82089552, "logits/rejected": 41324208.26229508, "logps/chosen": -180.77611940298507, "logps/rejected": -301.11475409836066, "loss": 0.1864, "rewards/chosen": 1.2369402985074627, "rewards/margins": 9.441858331294348, "rewards/rejected": -8.204918032786885, "step": 604 }, { "epoch": 0.4295349662761803, "grad_norm": 0.3345211107048372, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119907749.64705883, "logits/rejected": 89784320.0, "logps/chosen": -216.7058823529412, "logps/rejected": -378.1333333333333, "loss": 0.1709, "rewards/chosen": 1.5238970588235294, "rewards/margins": 7.34889705882353, "rewards/rejected": -5.825, "step": 605 }, { "epoch": 0.43024494142705005, "grad_norm": 0.14651724740959252, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101964976.55172414, "logits/rejected": 100064109.71428572, "logps/chosen": -210.20689655172413, "logps/rejected": -333.25714285714287, "loss": 0.1592, "rewards/chosen": 0.8162715517241379, "rewards/margins": 7.951985837438424, "rewards/rejected": -7.135714285714286, "step": 606 }, { "epoch": 0.4309549165779198, "grad_norm": 0.1501097465679541, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99310294.70967741, "logits/rejected": 95579291.15151516, "logps/chosen": -198.96774193548387, "logps/rejected": -355.6363636363636, "loss": 0.1713, "rewards/chosen": 1.3324092741935485, "rewards/margins": 7.802106243890519, "rewards/rejected": -6.46969696969697, "step": 607 }, { "epoch": 0.43166489172878947, "grad_norm": 0.16583885670858123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151901820.54054055, "logits/rejected": 72099309.03703703, "logps/chosen": -223.56756756756758, "logps/rejected": -329.48148148148147, "loss": 0.2246, "rewards/chosen": 1.2449324324324325, "rewards/margins": 8.874562062062061, "rewards/rejected": -7.62962962962963, "step": 608 }, { "epoch": 0.4323748668796592, "grad_norm": 0.1356890149124914, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149253912.1509434, "logits/rejected": 120013018.45333333, "logps/chosen": -274.2641509433962, "logps/rejected": -323.41333333333336, "loss": 0.1377, "rewards/chosen": 1.8755896226415094, "rewards/margins": 9.335589622641509, "rewards/rejected": -7.46, "step": 609 }, { "epoch": 0.43308484203052894, "grad_norm": 0.1761190088006202, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118711513.21212122, "logits/rejected": 73738570.32258065, "logps/chosen": -208.0, "logps/rejected": -357.16129032258067, "loss": 0.1959, "rewards/chosen": 0.8977272727272727, "rewards/margins": 7.6154692082111435, "rewards/rejected": -6.717741935483871, "step": 610 }, { "epoch": 0.4337948171813987, "grad_norm": 0.17757485357574795, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124127083.5942029, "logits/rejected": 31146261.694915254, "logps/chosen": -198.95652173913044, "logps/rejected": -313.76271186440675, "loss": 0.1811, "rewards/chosen": 1.6141304347826086, "rewards/margins": 8.588706705969049, "rewards/rejected": -6.97457627118644, "step": 611 }, { "epoch": 0.43450479233226835, "grad_norm": 0.1691661427129053, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 183291084.8, "logits/rejected": 62667836.23529412, "logps/chosen": -292.8, "logps/rejected": -328.47058823529414, "loss": 0.1515, "rewards/chosen": 1.74375, "rewards/margins": 8.515808823529412, "rewards/rejected": -6.772058823529412, "step": 612 }, { "epoch": 0.4352147674831381, "grad_norm": 0.16489984710716135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118066527.52238806, "logits/rejected": 50357432.655737706, "logps/chosen": -248.83582089552237, "logps/rejected": -359.8688524590164, "loss": 0.2025, "rewards/chosen": 1.1135727611940298, "rewards/margins": 8.179146531685832, "rewards/rejected": -7.065573770491803, "step": 613 }, { "epoch": 0.4359247426340078, "grad_norm": 0.131346938999672, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134512065.12280703, "logits/rejected": 94578601.46478873, "logps/chosen": -277.05263157894734, "logps/rejected": -384.0, "loss": 0.1268, "rewards/chosen": 1.8782894736842106, "rewards/margins": 10.15997961452928, "rewards/rejected": -8.28169014084507, "step": 614 }, { "epoch": 0.4366347177848775, "grad_norm": 0.19047568839933068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91607412.36363636, "logits/rejected": 51786124.38709678, "logps/chosen": -175.5151515151515, "logps/rejected": -312.51612903225805, "loss": 0.1736, "rewards/chosen": 1.0918560606060606, "rewards/margins": 8.043468963831867, "rewards/rejected": -6.951612903225806, "step": 615 }, { "epoch": 0.43734469293574724, "grad_norm": 0.12628799672207203, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130303044.26666667, "logits/rejected": 138905479.52941176, "logps/chosen": -218.13333333333333, "logps/rejected": -400.94117647058823, "loss": 0.14, "rewards/chosen": 1.6895833333333334, "rewards/margins": 9.38811274509804, "rewards/rejected": -7.698529411764706, "step": 616 }, { "epoch": 0.43805466808661697, "grad_norm": 0.17134875372074831, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90628821.87341772, "logits/rejected": 103830423.51020408, "logps/chosen": -172.65822784810126, "logps/rejected": -340.57142857142856, "loss": 0.2015, "rewards/chosen": 1.2721518987341771, "rewards/margins": 8.358886592611729, "rewards/rejected": -7.086734693877551, "step": 617 }, { "epoch": 0.4387646432374867, "grad_norm": 0.14111320425030827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137522331.15151516, "logits/rejected": 84494930.58064516, "logps/chosen": -203.5151515151515, "logps/rejected": -321.03225806451616, "loss": 0.1693, "rewards/chosen": 1.356060606060606, "rewards/margins": 7.0415444770283475, "rewards/rejected": -5.685483870967742, "step": 618 }, { "epoch": 0.4394746183883564, "grad_norm": 0.15034405099748574, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82043128.24242425, "logits/rejected": 70423717.16129032, "logps/chosen": -172.72727272727272, "logps/rejected": -300.9032258064516, "loss": 0.1473, "rewards/chosen": 2.0132575757575757, "rewards/margins": 7.989064027370478, "rewards/rejected": -5.975806451612903, "step": 619 }, { "epoch": 0.4401845935392261, "grad_norm": 0.15540668807487484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 175361852.95238096, "logits/rejected": 108600209.72307692, "logps/chosen": -251.17460317460316, "logps/rejected": -405.16923076923075, "loss": 0.1574, "rewards/chosen": 1.4871031746031746, "rewards/margins": 8.925564713064713, "rewards/rejected": -7.438461538461539, "step": 620 }, { "epoch": 0.44089456869009586, "grad_norm": 0.1538072611590281, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173764022.85714287, "logits/rejected": 35360312.88888889, "logps/chosen": -218.85714285714286, "logps/rejected": -320.6666666666667, "loss": 0.1655, "rewards/chosen": 1.0731026785714286, "rewards/margins": 8.580047123015873, "rewards/rejected": -7.506944444444445, "step": 621 }, { "epoch": 0.4416045438409656, "grad_norm": 0.1872259396217002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128649427.86206897, "logits/rejected": 72022191.54285714, "logps/chosen": -212.9655172413793, "logps/rejected": -361.14285714285717, "loss": 0.1595, "rewards/chosen": 1.1217672413793103, "rewards/margins": 8.60033866995074, "rewards/rejected": -7.478571428571429, "step": 622 }, { "epoch": 0.44231451899183527, "grad_norm": 0.18383606798876262, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125148962.5945946, "logits/rejected": 123964984.8888889, "logps/chosen": -241.0810810810811, "logps/rejected": -352.0, "loss": 0.1893, "rewards/chosen": 1.508445945945946, "rewards/margins": 6.730668168168169, "rewards/rejected": -5.222222222222222, "step": 623 }, { "epoch": 0.443024494142705, "grad_norm": 0.15650806466821318, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98160243.61290322, "logits/rejected": 135210697.6969697, "logps/chosen": -233.5483870967742, "logps/rejected": -368.72727272727275, "loss": 0.179, "rewards/chosen": 1.0514112903225807, "rewards/margins": 8.642320381231672, "rewards/rejected": -7.590909090909091, "step": 624 }, { "epoch": 0.44373446929357474, "grad_norm": 0.18124185359970146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117378831.05882353, "logits/rejected": 53477376.0, "logps/chosen": -197.1764705882353, "logps/rejected": -302.1333333333333, "loss": 0.2, "rewards/chosen": 0.9871323529411765, "rewards/margins": 7.12046568627451, "rewards/rejected": -6.133333333333334, "step": 625 }, { "epoch": 0.4444444444444444, "grad_norm": 0.19458317089048674, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121914436.26666667, "logits/rejected": 81126148.83018868, "logps/chosen": -263.25333333333333, "logps/rejected": -370.7169811320755, "loss": 0.1778, "rewards/chosen": 1.885, "rewards/margins": 8.918018867924529, "rewards/rejected": -7.033018867924528, "step": 626 }, { "epoch": 0.44515441959531415, "grad_norm": 0.18752404341240803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126732508.55384615, "logits/rejected": 55358155.17460317, "logps/chosen": -261.04615384615386, "logps/rejected": -304.0, "loss": 0.1826, "rewards/chosen": 1.7490384615384615, "rewards/margins": 7.518879731379731, "rewards/rejected": -5.76984126984127, "step": 627 }, { "epoch": 0.4458643947461839, "grad_norm": 0.32199839759896387, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137013930.66666666, "logits/rejected": 96880932.57142857, "logps/chosen": -210.88888888888889, "logps/rejected": -349.7142857142857, "loss": 0.2045, "rewards/chosen": 1.1258680555555556, "rewards/margins": 8.2062251984127, "rewards/rejected": -7.080357142857143, "step": 628 }, { "epoch": 0.4465743698970536, "grad_norm": 0.12330479668329504, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158433898.26415095, "logits/rejected": 71806484.48, "logps/chosen": -212.83018867924528, "logps/rejected": -317.8666666666667, "loss": 0.1436, "rewards/chosen": 1.25, "rewards/margins": 8.316666666666666, "rewards/rejected": -7.066666666666666, "step": 629 }, { "epoch": 0.4472843450479233, "grad_norm": 0.1690040701826016, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109406806.64615385, "logits/rejected": 112713597.96825397, "logps/chosen": -218.33846153846153, "logps/rejected": -312.3809523809524, "loss": 0.2174, "rewards/chosen": 1.126923076923077, "rewards/margins": 6.745970695970696, "rewards/rejected": -5.619047619047619, "step": 630 }, { "epoch": 0.44799432019879304, "grad_norm": 0.1709473036507212, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82092863.07246377, "logits/rejected": 147582628.88135594, "logps/chosen": -204.52173913043478, "logps/rejected": -413.2881355932203, "loss": 0.1798, "rewards/chosen": 1.3147644927536233, "rewards/margins": 9.46730686563498, "rewards/rejected": -8.152542372881356, "step": 631 }, { "epoch": 0.4487042953496628, "grad_norm": 0.1477430778908799, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129769223.75757575, "logits/rejected": 53443550.96774194, "logps/chosen": -260.3636363636364, "logps/rejected": -323.0967741935484, "loss": 0.138, "rewards/chosen": 2.2821969696969697, "rewards/margins": 9.14510019550342, "rewards/rejected": -6.862903225806452, "step": 632 }, { "epoch": 0.4494142705005325, "grad_norm": 0.1709089364036667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95582871.43661971, "logits/rejected": 94390236.07017544, "logps/chosen": -199.66197183098592, "logps/rejected": -365.4736842105263, "loss": 0.1836, "rewards/chosen": 1.596830985915493, "rewards/margins": 8.71086607363479, "rewards/rejected": -7.114035087719298, "step": 633 }, { "epoch": 0.4501242456514022, "grad_norm": 0.15667249443558662, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113122846.11764705, "logits/rejected": 97517568.0, "logps/chosen": -177.1764705882353, "logps/rejected": -346.6666666666667, "loss": 0.1868, "rewards/chosen": 0.9761029411764706, "rewards/margins": 7.759436274509804, "rewards/rejected": -6.783333333333333, "step": 634 }, { "epoch": 0.4508342208022719, "grad_norm": 0.15262119695522458, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139949943.46666667, "logits/rejected": 76607728.94117647, "logps/chosen": -222.4, "logps/rejected": -356.70588235294116, "loss": 0.1687, "rewards/chosen": 2.0, "rewards/margins": 8.897058823529413, "rewards/rejected": -6.897058823529412, "step": 635 }, { "epoch": 0.45154419595314166, "grad_norm": 0.18550871210288367, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168666944.85333332, "logits/rejected": 31140728.75471698, "logps/chosen": -258.3466666666667, "logps/rejected": -321.50943396226415, "loss": 0.1917, "rewards/chosen": 1.2875, "rewards/margins": 7.079952830188679, "rewards/rejected": -5.7924528301886795, "step": 636 }, { "epoch": 0.45225417110401134, "grad_norm": 0.14893357487449116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103438938.35294117, "logits/rejected": 136594500.26666668, "logps/chosen": -185.64705882352942, "logps/rejected": -350.4, "loss": 0.1544, "rewards/chosen": 1.125, "rewards/margins": 8.466666666666667, "rewards/rejected": -7.341666666666667, "step": 637 }, { "epoch": 0.4529641462548811, "grad_norm": 0.13273878676195697, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142193781.5081967, "logits/rejected": 96844601.31343284, "logps/chosen": -203.54098360655738, "logps/rejected": -363.46268656716416, "loss": 0.1723, "rewards/chosen": 1.3872950819672132, "rewards/margins": 7.909683141668706, "rewards/rejected": -6.522388059701493, "step": 638 }, { "epoch": 0.4536741214057508, "grad_norm": 0.151946473544798, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118372579.55555555, "logits/rejected": 95944704.0, "logps/chosen": -190.66666666666666, "logps/rejected": -364.0, "loss": 0.1916, "rewards/chosen": 1.1727430555555556, "rewards/margins": 6.485243055555555, "rewards/rejected": -5.3125, "step": 639 }, { "epoch": 0.45438409655662054, "grad_norm": 0.14142103489728342, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125163357.46031746, "logits/rejected": 70722418.21538462, "logps/chosen": -208.25396825396825, "logps/rejected": -358.89230769230767, "loss": 0.1639, "rewards/chosen": 1.5476190476190477, "rewards/margins": 9.063003663003663, "rewards/rejected": -7.515384615384615, "step": 640 }, { "epoch": 0.4550940717074902, "grad_norm": 0.15386172383926947, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124659554.46153846, "logits/rejected": 110486797.4736842, "logps/chosen": -299.6923076923077, "logps/rejected": -352.63157894736844, "loss": 0.1478, "rewards/chosen": 1.6105769230769231, "rewards/margins": 8.985576923076923, "rewards/rejected": -7.375, "step": 641 }, { "epoch": 0.45580404685835996, "grad_norm": 0.14726048335703956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119537664.0, "logits/rejected": 98762752.0, "logps/chosen": -219.875, "logps/rejected": -345.0, "loss": 0.161, "rewards/chosen": 1.07421875, "rewards/margins": 8.85546875, "rewards/rejected": -7.78125, "step": 642 }, { "epoch": 0.4565140220092297, "grad_norm": 0.17077516588370076, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81917983.5076923, "logits/rejected": 138977930.15873015, "logps/chosen": -230.4, "logps/rejected": -358.6031746031746, "loss": 0.1479, "rewards/chosen": 2.0865384615384617, "rewards/margins": 9.316697191697191, "rewards/rejected": -7.23015873015873, "step": 643 }, { "epoch": 0.45722399716009937, "grad_norm": 0.15626664742758237, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120727093.49253732, "logits/rejected": 53735222.55737705, "logps/chosen": -269.3731343283582, "logps/rejected": -323.9344262295082, "loss": 0.1909, "rewards/chosen": 0.7350746268656716, "rewards/margins": 8.103927085882065, "rewards/rejected": -7.368852459016393, "step": 644 }, { "epoch": 0.4579339723109691, "grad_norm": 0.2027034850744542, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 180421648.25396827, "logits/rejected": 32747835.076923076, "logps/chosen": -240.25396825396825, "logps/rejected": -327.87692307692305, "loss": 0.1778, "rewards/chosen": 1.3263888888888888, "rewards/margins": 8.25715811965812, "rewards/rejected": -6.930769230769231, "step": 645 }, { "epoch": 0.45864394746183884, "grad_norm": 0.20968444762292032, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147883041.03225806, "logits/rejected": 89923335.75757575, "logps/chosen": -206.83870967741936, "logps/rejected": -384.969696969697, "loss": 0.1801, "rewards/chosen": 1.0715725806451613, "rewards/margins": 8.957936217008799, "rewards/rejected": -7.886363636363637, "step": 646 }, { "epoch": 0.4593539226127086, "grad_norm": 0.13539021247152672, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124870421.94285715, "logits/rejected": 142389389.24137932, "logps/chosen": -256.6857142857143, "logps/rejected": -471.7241379310345, "loss": 0.1747, "rewards/chosen": 1.7107142857142856, "rewards/margins": 9.598645320197045, "rewards/rejected": -7.887931034482759, "step": 647 }, { "epoch": 0.46006389776357826, "grad_norm": 0.16839994061005817, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168970532.57142857, "logits/rejected": 38167438.222222224, "logps/chosen": -228.28571428571428, "logps/rejected": -308.8888888888889, "loss": 0.1649, "rewards/chosen": 1.0658482142857142, "rewards/margins": 8.003348214285714, "rewards/rejected": -6.9375, "step": 648 }, { "epoch": 0.460773872914448, "grad_norm": 0.12310369670352016, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124418308.65454546, "logits/rejected": 130540529.97260274, "logps/chosen": -272.0, "logps/rejected": -438.7945205479452, "loss": 0.1445, "rewards/chosen": 1.7522727272727272, "rewards/margins": 10.581039850560398, "rewards/rejected": -8.82876712328767, "step": 649 }, { "epoch": 0.4614838480653177, "grad_norm": 0.1914185089498648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128275797.33333333, "logits/rejected": 67417268.70588236, "logps/chosen": -246.26666666666668, "logps/rejected": -356.2352941176471, "loss": 0.1696, "rewards/chosen": 1.59375, "rewards/margins": 8.424632352941178, "rewards/rejected": -6.830882352941177, "step": 650 }, { "epoch": 0.46219382321618746, "grad_norm": 0.13681120584181136, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142321976.40677965, "logits/rejected": 78536822.72463769, "logps/chosen": -210.71186440677965, "logps/rejected": -321.8550724637681, "loss": 0.1777, "rewards/chosen": 1.0360169491525424, "rewards/margins": 8.036016949152543, "rewards/rejected": -7.0, "step": 651 }, { "epoch": 0.46290379836705714, "grad_norm": 0.13640264337477, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88701762.37037037, "logits/rejected": 88278763.24324325, "logps/chosen": -190.8148148148148, "logps/rejected": -339.8918918918919, "loss": 0.1664, "rewards/chosen": 0.8900462962962963, "rewards/margins": 7.96437062062062, "rewards/rejected": -7.074324324324325, "step": 652 }, { "epoch": 0.4636137735179269, "grad_norm": 0.13366668933265938, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113845394.28571428, "logits/rejected": 107116071.38461539, "logps/chosen": -160.88888888888889, "logps/rejected": -296.3692307692308, "loss": 0.1924, "rewards/chosen": 0.9464285714285714, "rewards/margins": 7.9387362637362635, "rewards/rejected": -6.992307692307692, "step": 653 }, { "epoch": 0.4643237486687966, "grad_norm": 0.14096524535213012, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120848384.0, "logits/rejected": 118095872.0, "logps/chosen": -215.0, "logps/rejected": -397.5, "loss": 0.1576, "rewards/chosen": 1.6572265625, "rewards/margins": 9.5322265625, "rewards/rejected": -7.875, "step": 654 }, { "epoch": 0.4650337238196663, "grad_norm": 0.1818346774327194, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97390467.87878788, "logits/rejected": 105737050.83870968, "logps/chosen": -276.8484848484849, "logps/rejected": -333.4193548387097, "loss": 0.1772, "rewards/chosen": 1.5364583333333333, "rewards/margins": 7.609038978494623, "rewards/rejected": -6.07258064516129, "step": 655 }, { "epoch": 0.465743698970536, "grad_norm": 0.30755433580018426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141348044.8, "logits/rejected": 70073803.03448276, "logps/chosen": -225.37142857142857, "logps/rejected": -353.6551724137931, "loss": 0.1718, "rewards/chosen": 1.3232142857142857, "rewards/margins": 8.974076354679802, "rewards/rejected": -7.650862068965517, "step": 656 }, { "epoch": 0.46645367412140576, "grad_norm": 0.15063488900899116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90380486.19354838, "logits/rejected": 97739993.21212122, "logps/chosen": -177.03225806451613, "logps/rejected": -365.8181818181818, "loss": 0.1678, "rewards/chosen": 1.2983870967741935, "rewards/margins": 8.040811339198436, "rewards/rejected": -6.742424242424242, "step": 657 }, { "epoch": 0.4671636492722755, "grad_norm": 0.15108328803525906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111305559.88059701, "logits/rejected": 91849241.18032786, "logps/chosen": -213.49253731343285, "logps/rejected": -287.73770491803276, "loss": 0.1557, "rewards/chosen": 1.7929104477611941, "rewards/margins": 7.71094323464644, "rewards/rejected": -5.918032786885246, "step": 658 }, { "epoch": 0.4678736244231452, "grad_norm": 0.17396307286831197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85370218.33846153, "logits/rejected": 127892983.87301587, "logps/chosen": -205.53846153846155, "logps/rejected": -346.1587301587302, "loss": 0.1689, "rewards/chosen": 1.5490384615384616, "rewards/margins": 7.088721001221002, "rewards/rejected": -5.5396825396825395, "step": 659 }, { "epoch": 0.4685835995740149, "grad_norm": 0.21759955741490145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132089275.2238806, "logits/rejected": 123009997.63934426, "logps/chosen": -299.2238805970149, "logps/rejected": -380.327868852459, "loss": 0.1876, "rewards/chosen": 1.6735074626865671, "rewards/margins": 8853295.83744189, "rewards/rejected": -8853294.163934426, "step": 660 }, { "epoch": 0.46929357472488464, "grad_norm": 0.2253185536848369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139580936.39344263, "logits/rejected": 60034888.59701493, "logps/chosen": -234.2295081967213, "logps/rejected": -345.3134328358209, "loss": 0.1517, "rewards/chosen": 1.6198770491803278, "rewards/margins": 9.328832273060925, "rewards/rejected": -7.708955223880597, "step": 661 }, { "epoch": 0.4700035498757543, "grad_norm": 0.16476500796595708, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138471950.62857142, "logits/rejected": 67591344.55172414, "logps/chosen": -203.65714285714284, "logps/rejected": -350.62068965517244, "loss": 0.197, "rewards/chosen": 1.1008928571428571, "rewards/margins": 8.178479064039408, "rewards/rejected": -7.077586206896552, "step": 662 }, { "epoch": 0.47071352502662406, "grad_norm": 0.1745768933236528, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81544260.26666667, "logits/rejected": 123485244.23529412, "logps/chosen": -221.6, "logps/rejected": -380.70588235294116, "loss": 0.1536, "rewards/chosen": 1.3125, "rewards/margins": 9.547794117647058, "rewards/rejected": -8.235294117647058, "step": 663 }, { "epoch": 0.4714235001774938, "grad_norm": 0.20441146922839643, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56056306.16216216, "logits/rejected": 113323880.2962963, "logps/chosen": -180.54054054054055, "logps/rejected": -280.2962962962963, "loss": 0.1945, "rewards/chosen": 1.4214527027027026, "rewards/margins": 8.884415665665665, "rewards/rejected": -7.462962962962963, "step": 664 }, { "epoch": 0.4721334753283635, "grad_norm": 0.14858871189937778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133702638.03508772, "logits/rejected": 98772905.46478873, "logps/chosen": -215.57894736842104, "logps/rejected": -382.19718309859155, "loss": 0.1558, "rewards/chosen": 1.3607456140350878, "rewards/margins": 9.00863293797875, "rewards/rejected": -7.647887323943662, "step": 665 }, { "epoch": 0.4728434504792332, "grad_norm": 0.15570200977798246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91523469.37313433, "logits/rejected": 116134089.44262294, "logps/chosen": -201.3134328358209, "logps/rejected": -384.26229508196724, "loss": 0.1714, "rewards/chosen": 1.3451492537313432, "rewards/margins": 8.804165647173965, "rewards/rejected": -7.459016393442623, "step": 666 }, { "epoch": 0.47355342563010294, "grad_norm": 0.15714168120678723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166594811.50877193, "logits/rejected": 94283227.94366197, "logps/chosen": -253.19298245614036, "logps/rejected": -329.46478873239437, "loss": 0.1496, "rewards/chosen": 1.0515350877192982, "rewards/margins": 8.847309735606622, "rewards/rejected": -7.795774647887324, "step": 667 }, { "epoch": 0.4742634007809727, "grad_norm": 0.15526611012361352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110799530.66666667, "logits/rejected": 122758290.28571428, "logps/chosen": -198.22222222222223, "logps/rejected": -346.85714285714283, "loss": 0.1919, "rewards/chosen": 0.9861111111111112, "rewards/margins": 7.441468253968255, "rewards/rejected": -6.455357142857143, "step": 668 }, { "epoch": 0.4749733759318424, "grad_norm": 0.15034980103934506, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146381209.6, "logits/rejected": 91349473.88235295, "logps/chosen": -206.0, "logps/rejected": -411.29411764705884, "loss": 0.1696, "rewards/chosen": 1.3317708333333333, "rewards/margins": 9.442064950980392, "rewards/rejected": -8.110294117647058, "step": 669 }, { "epoch": 0.4756833510827121, "grad_norm": 0.2192870121474621, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110310195.2, "logits/rejected": 90485940.70588236, "logps/chosen": -192.8, "logps/rejected": -320.0, "loss": 0.1739, "rewards/chosen": 1.4432291666666666, "rewards/margins": 8.384405637254902, "rewards/rejected": -6.9411764705882355, "step": 670 }, { "epoch": 0.4763933262335818, "grad_norm": 0.17781724083029948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117440512.0, "logits/rejected": 99221504.0, "logps/chosen": -264.0, "logps/rejected": -339.5, "loss": 0.1947, "rewards/chosen": 0.8876953125, "rewards/margins": 7.8798828125, "rewards/rejected": -6.9921875, "step": 671 }, { "epoch": 0.47710330138445156, "grad_norm": 0.18929395751740755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168022566.20895523, "logits/rejected": 60387663.73770492, "logps/chosen": -219.9402985074627, "logps/rejected": -371.9344262295082, "loss": 0.1874, "rewards/chosen": 0.6688432835820896, "rewards/margins": 7.922941644237827, "rewards/rejected": -7.254098360655738, "step": 672 }, { "epoch": 0.47781327653532124, "grad_norm": 0.18611448542228193, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 151253055.0153846, "logits/rejected": 83886080.0, "logps/chosen": -251.56923076923076, "logps/rejected": -336.5079365079365, "loss": 0.1628, "rewards/chosen": 1.925, "rewards/margins": 9.619444444444445, "rewards/rejected": -7.694444444444445, "step": 673 }, { "epoch": 0.478523251686191, "grad_norm": 0.1743410486869646, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127679548.23529412, "logits/rejected": 126108740.26666667, "logps/chosen": -249.1764705882353, "logps/rejected": -400.0, "loss": 0.1928, "rewards/chosen": 0.7150735294117647, "rewards/margins": 8.556740196078431, "rewards/rejected": -7.841666666666667, "step": 674 }, { "epoch": 0.4792332268370607, "grad_norm": 0.19625445416039514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138020563.62666667, "logits/rejected": 51360439.54716981, "logps/chosen": -222.08, "logps/rejected": -300.37735849056605, "loss": 0.2173, "rewards/chosen": 1.445, "rewards/margins": 7.638396226415095, "rewards/rejected": -6.193396226415095, "step": 675 }, { "epoch": 0.47994320198793045, "grad_norm": 0.15520895368059232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142999552.0, "logits/rejected": 123338752.0, "logps/chosen": -244.75, "logps/rejected": -358.5, "loss": 0.1607, "rewards/chosen": 1.671875, "rewards/margins": 9.5703125, "rewards/rejected": -7.8984375, "step": 676 }, { "epoch": 0.4806531771388001, "grad_norm": 0.19349958832526917, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154299547.15151516, "logits/rejected": 78879975.22580644, "logps/chosen": -307.1515151515151, "logps/rejected": -328.258064516129, "loss": 0.1881, "rewards/chosen": 1.3996212121212122, "rewards/margins": 8.488330889540567, "rewards/rejected": -7.088709677419355, "step": 677 }, { "epoch": 0.48136315228966986, "grad_norm": 0.19468462410745888, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82785934.68852459, "logits/rejected": 100976303.76119404, "logps/chosen": -157.50819672131146, "logps/rejected": -321.43283582089555, "loss": 0.1473, "rewards/chosen": 1.6823770491803278, "rewards/margins": 8.52566063126988, "rewards/rejected": -6.843283582089552, "step": 678 }, { "epoch": 0.4820731274405396, "grad_norm": 0.16972151663558324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118339291.42857143, "logits/rejected": 107013006.22222222, "logps/chosen": -262.14285714285717, "logps/rejected": -336.0, "loss": 0.1554, "rewards/chosen": 1.4854910714285714, "rewards/margins": 9.18687996031746, "rewards/rejected": -7.701388888888889, "step": 679 }, { "epoch": 0.4827831025914093, "grad_norm": 0.17306809332996162, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140247040.0, "logits/rejected": 89849856.0, "logps/chosen": -212.5, "logps/rejected": -308.25, "loss": 0.1662, "rewards/chosen": 1.35546875, "rewards/margins": 7.99609375, "rewards/rejected": -6.640625, "step": 680 }, { "epoch": 0.483493077742279, "grad_norm": 0.18252848312213485, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85949406.96774194, "logits/rejected": 122651616.96969697, "logps/chosen": -206.70967741935485, "logps/rejected": -362.6666666666667, "loss": 0.1697, "rewards/chosen": 1.122983870967742, "rewards/margins": 9.3805596285435, "rewards/rejected": -8.257575757575758, "step": 681 }, { "epoch": 0.48420305289314874, "grad_norm": 0.16325857305991884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127275431.72413793, "logits/rejected": 57162371.657142855, "logps/chosen": -254.89655172413794, "logps/rejected": -305.37142857142857, "loss": 0.1653, "rewards/chosen": 1.3114224137931034, "rewards/margins": 8.554279556650247, "rewards/rejected": -7.242857142857143, "step": 682 }, { "epoch": 0.4849130280440185, "grad_norm": 0.15402104601649028, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154664960.0, "logits/rejected": 80183296.0, "logps/chosen": -227.25, "logps/rejected": -277.75, "loss": 0.1517, "rewards/chosen": 1.83984375, "rewards/margins": 7.92578125, "rewards/rejected": -6.0859375, "step": 683 }, { "epoch": 0.48562300319488816, "grad_norm": 0.15873943767224302, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140276167.1111111, "logits/rejected": 99114629.90769231, "logps/chosen": -249.9047619047619, "logps/rejected": -374.6461538461538, "loss": 0.1621, "rewards/chosen": 1.2777777777777777, "rewards/margins": 8.57008547008547, "rewards/rejected": -7.292307692307692, "step": 684 }, { "epoch": 0.4863329783457579, "grad_norm": 0.17215921712749843, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117885362.42424242, "logits/rejected": 88249509.16129032, "logps/chosen": -205.0909090909091, "logps/rejected": -343.741935483871, "loss": 0.1841, "rewards/chosen": 1.0482954545454546, "rewards/margins": 8.927327712609971, "rewards/rejected": -7.879032258064516, "step": 685 }, { "epoch": 0.48704295349662763, "grad_norm": 0.17080022155905775, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 207298143.45762712, "logits/rejected": 92213900.98550725, "logps/chosen": -331.1186440677966, "logps/rejected": -390.95652173913044, "loss": 0.1729, "rewards/chosen": 1.646186440677966, "rewards/margins": 8.05198354212724, "rewards/rejected": -6.405797101449275, "step": 686 }, { "epoch": 0.48775292864749736, "grad_norm": 0.1959075172308139, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116557500.63157895, "logits/rejected": 104532689.12676056, "logps/chosen": -188.91228070175438, "logps/rejected": -358.98591549295776, "loss": 0.1728, "rewards/chosen": 0.6359649122807017, "rewards/margins": 7.650049419322956, "rewards/rejected": -7.014084507042254, "step": 687 }, { "epoch": 0.48846290379836704, "grad_norm": 0.17341414673807884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76623720.2962963, "logits/rejected": 102193650.16216215, "logps/chosen": -214.5185185185185, "logps/rejected": -344.64864864864865, "loss": 0.162, "rewards/chosen": 1.3449074074074074, "rewards/margins": 9.101664164164164, "rewards/rejected": -7.756756756756757, "step": 688 }, { "epoch": 0.4891728789492368, "grad_norm": 0.19254448233498225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 193905900.30769232, "logits/rejected": 81722351.74603175, "logps/chosen": -249.84615384615384, "logps/rejected": -401.77777777777777, "loss": 0.2003, "rewards/chosen": 0.9298076923076923, "rewards/margins": 8.572664835164836, "rewards/rejected": -7.642857142857143, "step": 689 }, { "epoch": 0.4898828541001065, "grad_norm": 0.21169486635609533, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145402538.66666666, "logits/rejected": 27156341.15254237, "logps/chosen": -242.31884057971016, "logps/rejected": -347.66101694915255, "loss": 0.2096, "rewards/chosen": 0.917572463768116, "rewards/margins": 8.671809751903709, "rewards/rejected": -7.754237288135593, "step": 690 }, { "epoch": 0.4905928292509762, "grad_norm": 0.13026661065324652, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94210520.61538461, "logits/rejected": 121035629.71428572, "logps/chosen": -263.38461538461536, "logps/rejected": -387.04761904761904, "loss": 0.1703, "rewards/chosen": 1.6201923076923077, "rewards/margins": 8.858287545787546, "rewards/rejected": -7.238095238095238, "step": 691 }, { "epoch": 0.4913028044018459, "grad_norm": 0.24106852592384284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88854717.04615384, "logits/rejected": 101395634.79365079, "logps/chosen": -210.7076923076923, "logps/rejected": -340.8253968253968, "loss": 0.1868, "rewards/chosen": 1.226923076923077, "rewards/margins": 8.465018315018316, "rewards/rejected": -7.238095238095238, "step": 692 }, { "epoch": 0.49201277955271566, "grad_norm": 0.1532574329663959, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163158425.6, "logits/rejected": 51873671.52941176, "logps/chosen": -197.6, "logps/rejected": -286.11764705882354, "loss": 0.1554, "rewards/chosen": 1.2208333333333334, "rewards/margins": 8.478186274509804, "rewards/rejected": -7.257352941176471, "step": 693 }, { "epoch": 0.4927227547035854, "grad_norm": 0.19017316589952452, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98962902.48648648, "logits/rejected": 99537047.7037037, "logps/chosen": -222.7027027027027, "logps/rejected": -332.14814814814815, "loss": 0.19, "rewards/chosen": 1.6993243243243243, "rewards/margins": 9.13450950950951, "rewards/rejected": -7.435185185185185, "step": 694 }, { "epoch": 0.4934327298544551, "grad_norm": 0.17477731962547327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123791886.62857144, "logits/rejected": 75388998.62068966, "logps/chosen": -231.77142857142857, "logps/rejected": -314.48275862068965, "loss": 0.1993, "rewards/chosen": 1.3928571428571428, "rewards/margins": 6.845443349753694, "rewards/rejected": -5.452586206896552, "step": 695 }, { "epoch": 0.4941427050053248, "grad_norm": 0.21227928755566786, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131907821.44927536, "logits/rejected": 64638490.0338983, "logps/chosen": -193.3913043478261, "logps/rejected": -327.59322033898303, "loss": 0.1884, "rewards/chosen": 1.1684782608695652, "rewards/margins": 8.236274871039056, "rewards/rejected": -7.067796610169491, "step": 696 }, { "epoch": 0.49485268015619455, "grad_norm": 0.2569800563925406, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86046782.06060606, "logits/rejected": 115681610.32258065, "logps/chosen": -238.3030303030303, "logps/rejected": -386.5806451612903, "loss": 0.1556, "rewards/chosen": 2.047348484848485, "rewards/margins": 9.684445259042032, "rewards/rejected": -7.637096774193548, "step": 697 }, { "epoch": 0.4955626553070642, "grad_norm": 0.14899757184659404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108111801.37931034, "logits/rejected": 102760448.0, "logps/chosen": -229.51724137931035, "logps/rejected": -386.74285714285713, "loss": 0.1161, "rewards/chosen": 2.2198275862068964, "rewards/margins": 10.369827586206897, "rewards/rejected": -8.15, "step": 698 }, { "epoch": 0.49627263045793396, "grad_norm": 0.16262683354705582, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137315060.18461537, "logits/rejected": 28627789.206349205, "logps/chosen": -208.98461538461538, "logps/rejected": -299.1746031746032, "loss": 0.1908, "rewards/chosen": 1.091826923076923, "rewards/margins": 10036774.67912851, "rewards/rejected": -10036773.587301588, "step": 699 }, { "epoch": 0.4969826056088037, "grad_norm": 0.13974082070034372, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96098906.35294117, "logits/rejected": 125269879.46666667, "logps/chosen": -204.94117647058823, "logps/rejected": -386.4, "loss": 0.149, "rewards/chosen": 2.0404411764705883, "rewards/margins": 8.048774509803922, "rewards/rejected": -6.008333333333334, "step": 700 }, { "epoch": 0.49769258075967343, "grad_norm": 0.12546254792759465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111613990.64150943, "logits/rejected": 99377042.77333333, "logps/chosen": -198.9433962264151, "logps/rejected": -345.6, "loss": 0.1279, "rewards/chosen": 1.4127358490566038, "rewards/margins": 9.046069182389937, "rewards/rejected": -7.633333333333334, "step": 701 }, { "epoch": 0.4984025559105431, "grad_norm": 0.15900694984304006, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107429578.86792453, "logits/rejected": 72281838.93333334, "logps/chosen": -192.60377358490567, "logps/rejected": -343.04, "loss": 0.1523, "rewards/chosen": 1.022995283018868, "rewards/margins": 9.016328616352201, "rewards/rejected": -7.993333333333333, "step": 702 }, { "epoch": 0.49911253106141285, "grad_norm": 0.1738476684189449, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100900709.43396227, "logits/rejected": 95294586.88, "logps/chosen": -279.24528301886795, "logps/rejected": -349.8666666666667, "loss": 0.1516, "rewards/chosen": 1.9599056603773586, "rewards/margins": 9.43990566037736, "rewards/rejected": -7.48, "step": 703 }, { "epoch": 0.4998225062122826, "grad_norm": 0.18500805939791284, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159099192.40677965, "logits/rejected": 17628234.20289855, "logps/chosen": -250.57627118644066, "logps/rejected": -333.4492753623188, "loss": 0.1602, "rewards/chosen": 1.3315677966101696, "rewards/margins": 8.9185243183493, "rewards/rejected": -7.586956521739131, "step": 704 }, { "epoch": 0.5005324813631523, "grad_norm": 0.15321767968831115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106446351.51515152, "logits/rejected": 81010952.25806452, "logps/chosen": -188.36363636363637, "logps/rejected": -323.0967741935484, "loss": 0.1818, "rewards/chosen": 1.4526515151515151, "rewards/margins": 9.283296676441838, "rewards/rejected": -7.830645161290323, "step": 705 }, { "epoch": 0.501242456514022, "grad_norm": 0.18045076119844963, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116425761.03225806, "logits/rejected": 111975206.78787878, "logps/chosen": -248.51612903225808, "logps/rejected": -369.93939393939394, "loss": 0.1677, "rewards/chosen": 1.404233870967742, "rewards/margins": 5.737567204301075, "rewards/rejected": -4.333333333333333, "step": 706 }, { "epoch": 0.5019524316648917, "grad_norm": 0.18446576125676312, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138139674.5974026, "logits/rejected": 48070013.49019608, "logps/chosen": -187.84415584415584, "logps/rejected": -286.7450980392157, "loss": 0.2263, "rewards/chosen": 1.0998376623376624, "rewards/margins": 7.60964158390629, "rewards/rejected": -6.509803921568627, "step": 707 }, { "epoch": 0.5026624068157615, "grad_norm": 0.19579608146798633, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127160645.07936507, "logits/rejected": 61849852.06153846, "logps/chosen": -212.82539682539684, "logps/rejected": -376.61538461538464, "loss": 0.1827, "rewards/chosen": 1.0416666666666667, "rewards/margins": 8.510897435897435, "rewards/rejected": -7.469230769230769, "step": 708 }, { "epoch": 0.5033723819666311, "grad_norm": 0.1700553322821592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167212919.46666667, "logits/rejected": 20462652.23529412, "logps/chosen": -240.8, "logps/rejected": -255.2941176470588, "loss": 0.1733, "rewards/chosen": 1.4239583333333334, "rewards/margins": 9.196017156862746, "rewards/rejected": -7.772058823529412, "step": 709 }, { "epoch": 0.5040823571175009, "grad_norm": 0.20482038845600747, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69952118.15384616, "logits/rejected": 135416100.57142857, "logps/chosen": -244.1846153846154, "logps/rejected": -361.14285714285717, "loss": 0.1786, "rewards/chosen": 1.3211538461538461, "rewards/margins": 8.940201465201465, "rewards/rejected": -7.619047619047619, "step": 710 }, { "epoch": 0.5047923322683706, "grad_norm": 0.15059439006973419, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113179631.74603175, "logits/rejected": 90209799.87692308, "logps/chosen": -289.77777777777777, "logps/rejected": -366.2769230769231, "loss": 0.1472, "rewards/chosen": 2.492063492063492, "rewards/margins": 8.468986568986569, "rewards/rejected": -5.976923076923077, "step": 711 }, { "epoch": 0.5055023074192403, "grad_norm": 0.1487019220251956, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85490719.03030303, "logits/rejected": 86930332.90322581, "logps/chosen": -218.9090909090909, "logps/rejected": -433.03225806451616, "loss": 0.1534, "rewards/chosen": 1.5454545454545454, "rewards/margins": 10.505131964809383, "rewards/rejected": -8.959677419354838, "step": 712 }, { "epoch": 0.5062122825701101, "grad_norm": 0.16795781819639324, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146068301.2063492, "logits/rejected": 66883016.86153846, "logps/chosen": -241.52380952380952, "logps/rejected": -342.4, "loss": 0.17, "rewards/chosen": 1.3333333333333333, "rewards/margins": 8.525641025641026, "rewards/rejected": -7.1923076923076925, "step": 713 }, { "epoch": 0.5069222577209798, "grad_norm": 0.1621948604613473, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 196128018.028169, "logits/rejected": 7652765.192982456, "logps/chosen": -288.90140845070425, "logps/rejected": -305.4035087719298, "loss": 0.1868, "rewards/chosen": 1.2605633802816902, "rewards/margins": 9.672844082036075, "rewards/rejected": -8.412280701754385, "step": 714 }, { "epoch": 0.5076322328718494, "grad_norm": 0.1336226499161084, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136296801.10344827, "logits/rejected": 117800023.77142857, "logps/chosen": -276.13793103448273, "logps/rejected": -373.0285714285714, "loss": 0.1407, "rewards/chosen": 1.9116379310344827, "rewards/margins": 8.125923645320198, "rewards/rejected": -6.214285714285714, "step": 715 }, { "epoch": 0.5083422080227192, "grad_norm": 0.15075092345542854, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86882011.42857143, "logits/rejected": 86827918.22222222, "logps/chosen": -198.42857142857142, "logps/rejected": -334.44444444444446, "loss": 0.1399, "rewards/chosen": 1.53125, "rewards/margins": 9.628472222222221, "rewards/rejected": -8.097222222222221, "step": 716 }, { "epoch": 0.5090521831735889, "grad_norm": 0.15588978336713907, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111681169.19402985, "logits/rejected": 63877187.14754099, "logps/chosen": -204.65671641791045, "logps/rejected": -339.9344262295082, "loss": 0.1638, "rewards/chosen": 1.6417910447761195, "rewards/margins": 7.100807438218742, "rewards/rejected": -5.459016393442623, "step": 717 }, { "epoch": 0.5097621583244586, "grad_norm": 0.13983746886272266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92833928.53333333, "logits/rejected": 65597680.941176474, "logps/chosen": -190.4, "logps/rejected": -352.47058823529414, "loss": 0.152, "rewards/chosen": 1.44375, "rewards/margins": 9.296691176470588, "rewards/rejected": -7.852941176470588, "step": 718 }, { "epoch": 0.5104721334753284, "grad_norm": 0.1522810073218657, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162446059.68253967, "logits/rejected": 60204394.33846154, "logps/chosen": -221.46031746031747, "logps/rejected": -350.5230769230769, "loss": 0.1873, "rewards/chosen": 1.1259920634920635, "rewards/margins": 6.20291514041514, "rewards/rejected": -5.076923076923077, "step": 719 }, { "epoch": 0.5111821086261981, "grad_norm": 0.1252879424597241, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82529099.29411764, "logits/rejected": 116671556.26666667, "logps/chosen": -209.64705882352942, "logps/rejected": -366.4, "loss": 0.1488, "rewards/chosen": 1.8272058823529411, "rewards/margins": 9.710539215686275, "rewards/rejected": -7.883333333333334, "step": 720 }, { "epoch": 0.5118920837770679, "grad_norm": 0.2862365191365487, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143122558.03076923, "logits/rejected": 54376155.428571425, "logps/chosen": -195.3230769230769, "logps/rejected": -305.77777777777777, "loss": 0.1741, "rewards/chosen": 1.4903846153846154, "rewards/margins": 6.712606837606838, "rewards/rejected": -5.222222222222222, "step": 721 }, { "epoch": 0.5126020589279375, "grad_norm": 0.15059011779016973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120061952.0, "logits/rejected": 93687352.8888889, "logps/chosen": -193.57142857142858, "logps/rejected": -361.3333333333333, "loss": 0.1464, "rewards/chosen": 0.50390625, "rewards/margins": 8.274739583333332, "rewards/rejected": -7.770833333333333, "step": 722 }, { "epoch": 0.5133120340788072, "grad_norm": 0.1540676172531951, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90287912.42105263, "logits/rejected": 75083949.07042253, "logps/chosen": -167.43859649122808, "logps/rejected": -308.28169014084506, "loss": 0.1507, "rewards/chosen": 1.2017543859649122, "rewards/margins": 9.215838893007167, "rewards/rejected": -8.014084507042254, "step": 723 }, { "epoch": 0.514022009229677, "grad_norm": 0.20521803225226098, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 59203584.0, "logits/rejected": 122814464.0, "logps/chosen": -158.375, "logps/rejected": -383.0, "loss": 0.1811, "rewards/chosen": 1.46484375, "rewards/margins": 8.59765625, "rewards/rejected": -7.1328125, "step": 724 }, { "epoch": 0.5147319843805467, "grad_norm": 0.16870985063556734, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147128320.0, "logits/rejected": 65847296.0, "logps/chosen": -209.5, "logps/rejected": -348.0, "loss": 0.179, "rewards/chosen": 1.185546875, "rewards/margins": 8.873046875, "rewards/rejected": -7.6875, "step": 725 }, { "epoch": 0.5154419595314164, "grad_norm": 0.19503586752415505, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131677515.71830986, "logits/rejected": 32303499.228070177, "logps/chosen": -267.0422535211268, "logps/rejected": -371.0877192982456, "loss": 0.1746, "rewards/chosen": 1.5501760563380282, "rewards/margins": 8.111579565109958, "rewards/rejected": -6.56140350877193, "step": 726 }, { "epoch": 0.5161519346822862, "grad_norm": 0.1515856998956132, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108902107.42857143, "logits/rejected": 116741461.33333333, "logps/chosen": -181.14285714285714, "logps/rejected": -369.3333333333333, "loss": 0.1617, "rewards/chosen": 0.9955357142857143, "rewards/margins": 8.516369047619047, "rewards/rejected": -7.520833333333333, "step": 727 }, { "epoch": 0.5168619098331558, "grad_norm": 0.13086570084364116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116671556.26666667, "logits/rejected": 109483670.58823529, "logps/chosen": -190.26666666666668, "logps/rejected": -415.05882352941177, "loss": 0.1427, "rewards/chosen": 0.81875, "rewards/margins": 8.870220588235293, "rewards/rejected": -8.051470588235293, "step": 728 }, { "epoch": 0.5175718849840255, "grad_norm": 0.20524704732510168, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102867082.84745763, "logits/rejected": 94827742.60869566, "logps/chosen": -187.9322033898305, "logps/rejected": -343.18840579710144, "loss": 0.1494, "rewards/chosen": 1.465042372881356, "rewards/margins": 9.689680054040776, "rewards/rejected": -8.22463768115942, "step": 729 }, { "epoch": 0.5182818601348953, "grad_norm": 0.1711020171982761, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129397408.47761194, "logits/rejected": 75703749.24590164, "logps/chosen": -214.6865671641791, "logps/rejected": -356.72131147540983, "loss": 0.1556, "rewards/chosen": 1.9029850746268657, "rewards/margins": 7.886591632003915, "rewards/rejected": -5.983606557377049, "step": 730 }, { "epoch": 0.518991835285765, "grad_norm": 0.17254313426716122, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105934515.8918919, "logits/rejected": 89400813.03703703, "logps/chosen": -242.59459459459458, "logps/rejected": -365.3333333333333, "loss": 0.2093, "rewards/chosen": 0.981418918918919, "rewards/margins": 7.129567067067067, "rewards/rejected": -6.148148148148148, "step": 731 }, { "epoch": 0.5197018104366348, "grad_norm": 0.16129757614597115, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149004426.84745762, "logits/rejected": 64495022.376811594, "logps/chosen": -266.03389830508473, "logps/rejected": -314.8985507246377, "loss": 0.1683, "rewards/chosen": 0.909957627118644, "rewards/margins": 8.380972119872267, "rewards/rejected": -7.471014492753623, "step": 732 }, { "epoch": 0.5204117855875044, "grad_norm": 0.14165193432560497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 66619528.53333333, "logits/rejected": 125088948.70588236, "logps/chosen": -185.33333333333334, "logps/rejected": -361.88235294117646, "loss": 0.1599, "rewards/chosen": 1.3958333333333333, "rewards/margins": 8.47671568627451, "rewards/rejected": -7.080882352941177, "step": 733 }, { "epoch": 0.5211217607383741, "grad_norm": 1.10412158910853, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150994944.0, "logits/rejected": 96331473.83606558, "logps/chosen": -264.5970149253731, "logps/rejected": -360.91803278688525, "loss": 0.1634, "rewards/chosen": 2.1156716417910446, "rewards/margins": 9.492720822118914, "rewards/rejected": -7.377049180327869, "step": 734 }, { "epoch": 0.5218317358892439, "grad_norm": 0.17104464966174351, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115034955.29411764, "logits/rejected": 71722598.4, "logps/chosen": -165.64705882352942, "logps/rejected": -276.26666666666665, "loss": 0.1981, "rewards/chosen": 1.2022058823529411, "rewards/margins": 8.118872549019608, "rewards/rejected": -6.916666666666667, "step": 735 }, { "epoch": 0.5225417110401136, "grad_norm": 0.1773554236278442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67383900.32786885, "logits/rejected": 112933200.23880596, "logps/chosen": -183.34426229508196, "logps/rejected": -349.13432835820896, "loss": 0.1871, "rewards/chosen": 0.9713114754098361, "rewards/margins": 7.829520430633717, "rewards/rejected": -6.858208955223881, "step": 736 }, { "epoch": 0.5232516861909833, "grad_norm": 0.13973100952106582, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143912758.55737704, "logits/rejected": 46825961.07462686, "logps/chosen": -226.62295081967213, "logps/rejected": -350.56716417910445, "loss": 0.1644, "rewards/chosen": 1.3504098360655739, "rewards/margins": 9.492200880841693, "rewards/rejected": -8.14179104477612, "step": 737 }, { "epoch": 0.5239616613418531, "grad_norm": 0.158856631831485, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82720995.55555555, "logits/rejected": 87219053.71428572, "logps/chosen": -186.66666666666666, "logps/rejected": -314.2857142857143, "loss": 0.1906, "rewards/chosen": 1.5616319444444444, "rewards/margins": 7.391989087301588, "rewards/rejected": -5.830357142857143, "step": 738 }, { "epoch": 0.5246716364927227, "grad_norm": 0.15065819951565432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106337942.58823529, "logits/rejected": 96608802.13333334, "logps/chosen": -220.94117647058823, "logps/rejected": -352.8, "loss": 0.1423, "rewards/chosen": 1.8823529411764706, "rewards/margins": 8.83235294117647, "rewards/rejected": -6.95, "step": 739 }, { "epoch": 0.5253816116435924, "grad_norm": 0.1533118434804592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129433600.0, "logits/rejected": 109248512.0, "logps/chosen": -231.0, "logps/rejected": -362.0, "loss": 0.1543, "rewards/chosen": 1.6171875, "rewards/margins": 8.9921875, "rewards/rejected": -7.375, "step": 740 }, { "epoch": 0.5260915867944622, "grad_norm": 0.1669600227365016, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154927104.0, "logits/rejected": 63427282.823529415, "logps/chosen": -225.86666666666667, "logps/rejected": -309.4117647058824, "loss": 0.1827, "rewards/chosen": 1.25625, "rewards/margins": 8.190073529411764, "rewards/rejected": -6.9338235294117645, "step": 741 }, { "epoch": 0.5268015619453319, "grad_norm": 0.14336531692026774, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105404683.13043478, "logits/rejected": 104786510.10169491, "logps/chosen": -204.63768115942028, "logps/rejected": -335.45762711864404, "loss": 0.1629, "rewards/chosen": 1.618659420289855, "rewards/margins": 9.656795013510195, "rewards/rejected": -8.038135593220339, "step": 742 }, { "epoch": 0.5275115370962017, "grad_norm": 0.13937301705854596, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119013376.0, "logits/rejected": 169082880.0, "logps/chosen": -257.5, "logps/rejected": -414.0, "loss": 0.1856, "rewards/chosen": 1.2373046875, "rewards/margins": 7.7060546875, "rewards/rejected": -6.46875, "step": 743 }, { "epoch": 0.5282215122470714, "grad_norm": 0.13408651733347143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76083440.94117647, "logits/rejected": 108282948.26666667, "logps/chosen": -180.7058823529412, "logps/rejected": -332.53333333333336, "loss": 0.1526, "rewards/chosen": 0.9375, "rewards/margins": 9.2375, "rewards/rejected": -8.3, "step": 744 }, { "epoch": 0.528931487397941, "grad_norm": 0.16259147682358943, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97708218.18181819, "logits/rejected": 124116727.74193548, "logps/chosen": -191.27272727272728, "logps/rejected": -336.7741935483871, "loss": 0.1693, "rewards/chosen": 0.8939393939393939, "rewards/margins": 9.426197458455524, "rewards/rejected": -8.53225806451613, "step": 745 }, { "epoch": 0.5296414625488108, "grad_norm": 0.12339810853510778, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87599070.4262295, "logits/rejected": 109427513.31343284, "logps/chosen": -176.2622950819672, "logps/rejected": -386.86567164179104, "loss": 0.1501, "rewards/chosen": 1.6670081967213115, "rewards/margins": 9.189396256422805, "rewards/rejected": -7.522388059701493, "step": 746 }, { "epoch": 0.5303514376996805, "grad_norm": 0.1515541929407973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146997248.0, "logits/rejected": 77070336.0, "logps/chosen": -235.25, "logps/rejected": -375.5, "loss": 0.1788, "rewards/chosen": 1.2646484375, "rewards/margins": 9.8427734375, "rewards/rejected": -8.578125, "step": 747 }, { "epoch": 0.5310614128505502, "grad_norm": 0.15317298974609764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114036439.1884058, "logits/rejected": 105426319.18644068, "logps/chosen": -234.20289855072463, "logps/rejected": -336.0, "loss": 0.1548, "rewards/chosen": 1.858695652173913, "rewards/margins": 9.3671702284451, "rewards/rejected": -7.508474576271187, "step": 748 }, { "epoch": 0.53177138800142, "grad_norm": 0.1544688068013832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125829120.0, "logits/rejected": 69685365.02857143, "logps/chosen": -209.3793103448276, "logps/rejected": -313.14285714285717, "loss": 0.1621, "rewards/chosen": 1.2553879310344827, "rewards/margins": 7.01253078817734, "rewards/rejected": -5.757142857142857, "step": 749 }, { "epoch": 0.5324813631522897, "grad_norm": 0.15096327475768223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 186062075.80327868, "logits/rejected": 63086714.268656716, "logps/chosen": -238.1639344262295, "logps/rejected": -384.23880597014926, "loss": 0.1555, "rewards/chosen": 1.721311475409836, "rewards/margins": 9.803401027648642, "rewards/rejected": -8.082089552238806, "step": 750 }, { "epoch": 0.5331913383031593, "grad_norm": 0.17812403268318694, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153550848.0, "logits/rejected": 56889344.0, "logps/chosen": -192.0, "logps/rejected": -328.5, "loss": 0.1564, "rewards/chosen": 1.109375, "rewards/margins": 8.625, "rewards/rejected": -7.515625, "step": 751 }, { "epoch": 0.5339013134540291, "grad_norm": 0.1677495365909811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147897611.81538463, "logits/rejected": 107753667.04761904, "logps/chosen": -273.7230769230769, "logps/rejected": -360.63492063492066, "loss": 0.1866, "rewards/chosen": 1.1961538461538461, "rewards/margins": 9.354884004884005, "rewards/rejected": -8.158730158730158, "step": 752 }, { "epoch": 0.5346112886048988, "grad_norm": 0.15858353800695382, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79691776.0, "logits/rejected": 150994944.0, "logps/chosen": -228.92307692307693, "logps/rejected": -408.3809523809524, "loss": 0.1541, "rewards/chosen": 1.7346153846153847, "rewards/margins": 9.877472527472527, "rewards/rejected": -8.142857142857142, "step": 753 }, { "epoch": 0.5353212637557685, "grad_norm": 0.1588944239589832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139061853.7464789, "logits/rejected": 75101956.49122807, "logps/chosen": -189.29577464788733, "logps/rejected": -359.29824561403507, "loss": 0.1906, "rewards/chosen": 1.1153169014084507, "rewards/margins": 8.510053743513714, "rewards/rejected": -7.394736842105263, "step": 754 }, { "epoch": 0.5360312389066383, "grad_norm": 0.15060632833340157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136577024.0, "logits/rejected": 63913984.0, "logps/chosen": -266.0, "logps/rejected": -383.5, "loss": 0.1817, "rewards/chosen": 1.3779296875, "rewards/margins": 7.6748046875, "rewards/rejected": -6.296875, "step": 755 }, { "epoch": 0.536741214057508, "grad_norm": 0.18783424041335361, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131627128.47058824, "logits/rejected": 81579212.8, "logps/chosen": -216.47058823529412, "logps/rejected": -341.06666666666666, "loss": 0.1881, "rewards/chosen": 1.3253676470588236, "rewards/margins": 8.500367647058823, "rewards/rejected": -7.175, "step": 756 }, { "epoch": 0.5374511892083778, "grad_norm": 0.17894217168596027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 158643380.70588234, "logits/rejected": 77734434.13333334, "logps/chosen": -248.0, "logps/rejected": -330.4, "loss": 0.1494, "rewards/chosen": 2.0919117647058822, "rewards/margins": 9.200245098039215, "rewards/rejected": -7.108333333333333, "step": 757 }, { "epoch": 0.5381611643592474, "grad_norm": 0.1589817052433356, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95300087.60655738, "logits/rejected": 133591712.47761194, "logps/chosen": -210.36065573770492, "logps/rejected": -362.9850746268657, "loss": 0.1526, "rewards/chosen": 1.2704918032786885, "rewards/margins": 9.337655982383165, "rewards/rejected": -8.067164179104477, "step": 758 }, { "epoch": 0.5388711395101171, "grad_norm": 0.14724929822808475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110804747.46268657, "logits/rejected": 91071404.06557377, "logps/chosen": -218.02985074626866, "logps/rejected": -343.08196721311475, "loss": 0.1911, "rewards/chosen": 1.3666044776119404, "rewards/margins": 9.325620871054562, "rewards/rejected": -7.959016393442623, "step": 759 }, { "epoch": 0.5395811146609869, "grad_norm": 0.17463688370823593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119770680.8888889, "logits/rejected": 80197339.42857143, "logps/chosen": -214.44444444444446, "logps/rejected": -321.14285714285717, "loss": 0.1737, "rewards/chosen": 1.7413194444444444, "rewards/margins": 7.250248015873016, "rewards/rejected": -5.508928571428571, "step": 760 }, { "epoch": 0.5402910898118566, "grad_norm": 0.16384966130518241, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115875473.19402985, "logits/rejected": 56313688.13114754, "logps/chosen": -246.44776119402985, "logps/rejected": -379.8032786885246, "loss": 0.1831, "rewards/chosen": 1.523320895522388, "rewards/margins": 7.121681551260092, "rewards/rejected": -5.598360655737705, "step": 761 }, { "epoch": 0.5410010649627263, "grad_norm": 0.15767515028043894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98915669.33333333, "logits/rejected": 107818285.1764706, "logps/chosen": -244.26666666666668, "logps/rejected": -333.6470588235294, "loss": 0.1382, "rewards/chosen": 2.05625, "rewards/margins": 9.850367647058823, "rewards/rejected": -7.794117647058823, "step": 762 }, { "epoch": 0.541711040113596, "grad_norm": 0.25296415809722594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136179579.87096775, "logits/rejected": 62326721.93939394, "logps/chosen": -243.09677419354838, "logps/rejected": -370.42424242424244, "loss": 0.1779, "rewards/chosen": 0.9737903225806451, "rewards/margins": 8.890456989247312, "rewards/rejected": -7.916666666666667, "step": 763 }, { "epoch": 0.5424210152644657, "grad_norm": 0.1743986707479402, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145631735.60655737, "logits/rejected": 143357554.62686568, "logps/chosen": -237.11475409836066, "logps/rejected": -421.7313432835821, "loss": 0.1971, "rewards/chosen": 0.9405737704918032, "rewards/margins": 9.656991680939564, "rewards/rejected": -8.716417910447761, "step": 764 }, { "epoch": 0.5431309904153354, "grad_norm": 0.12855451380206998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108585870.22222222, "logits/rejected": 134047688.64864865, "logps/chosen": -206.5185185185185, "logps/rejected": -355.8918918918919, "loss": 0.1581, "rewards/chosen": 0.8854166666666666, "rewards/margins": 8.290822072072071, "rewards/rejected": -7.405405405405405, "step": 765 }, { "epoch": 0.5438409655662052, "grad_norm": 0.2013136582416383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142976421.6470588, "logits/rejected": 699050.6666666666, "logps/chosen": -208.7058823529412, "logps/rejected": -296.26666666666665, "loss": 0.1894, "rewards/chosen": 1.353860294117647, "rewards/margins": 9.245526960784314, "rewards/rejected": -7.891666666666667, "step": 766 }, { "epoch": 0.5445509407170749, "grad_norm": 0.16655300514087457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104042040.8888889, "logits/rejected": 85076355.45945945, "logps/chosen": -143.40740740740742, "logps/rejected": -333.8378378378378, "loss": 0.163, "rewards/chosen": 0.8599537037037037, "rewards/margins": 8.36671046046046, "rewards/rejected": -7.506756756756757, "step": 767 }, { "epoch": 0.5452609158679447, "grad_norm": 0.18143979369849353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147076581.0526316, "logits/rejected": 87838404.92307693, "logps/chosen": -230.52631578947367, "logps/rejected": -327.38461538461536, "loss": 0.2254, "rewards/chosen": 1.0986842105263157, "rewards/margins": 8.4832995951417, "rewards/rejected": -7.384615384615385, "step": 768 }, { "epoch": 0.5459708910188144, "grad_norm": 0.13728406554039713, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72351744.0, "logits/rejected": 150672305.23076922, "logps/chosen": -242.24, "logps/rejected": -377.43589743589746, "loss": 0.1599, "rewards/chosen": 1.1475, "rewards/margins": 6.634679487179487, "rewards/rejected": -5.487179487179487, "step": 769 }, { "epoch": 0.546680866169684, "grad_norm": 0.15601655165309322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133255100.85245901, "logits/rejected": 66357645.37313433, "logps/chosen": -243.14754098360655, "logps/rejected": -320.23880597014926, "loss": 0.1586, "rewards/chosen": 2.055327868852459, "rewards/margins": 9.406074137509176, "rewards/rejected": -7.350746268656716, "step": 770 }, { "epoch": 0.5473908413205538, "grad_norm": 0.1506626607770608, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141573891.93846154, "logits/rejected": 55225002.666666664, "logps/chosen": -212.92307692307693, "logps/rejected": -302.22222222222223, "loss": 0.1495, "rewards/chosen": 1.7673076923076922, "rewards/margins": 9.076831501831501, "rewards/rejected": -7.309523809523809, "step": 771 }, { "epoch": 0.5481008164714235, "grad_norm": 0.1592966506286869, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120569595.93650794, "logits/rejected": 104728544.4923077, "logps/chosen": -175.74603174603175, "logps/rejected": -366.2769230769231, "loss": 0.1562, "rewards/chosen": 1.378968253968254, "rewards/margins": 7.686660561660561, "rewards/rejected": -6.3076923076923075, "step": 772 }, { "epoch": 0.5488107916222932, "grad_norm": 0.14257733891833732, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 52751438.76923077, "logits/rejected": 133485389.20634921, "logps/chosen": -158.03076923076924, "logps/rejected": -339.55555555555554, "loss": 0.1835, "rewards/chosen": 1.0826923076923076, "rewards/margins": 8.431898656898657, "rewards/rejected": -7.349206349206349, "step": 773 }, { "epoch": 0.549520766773163, "grad_norm": 0.13964161980295875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141304655.44827586, "logits/rejected": 138831462.4, "logps/chosen": -248.27586206896552, "logps/rejected": -422.85714285714283, "loss": 0.1557, "rewards/chosen": 1.2413793103448276, "rewards/margins": 9.055665024630542, "rewards/rejected": -7.814285714285714, "step": 774 }, { "epoch": 0.5502307419240327, "grad_norm": 0.17092668360831997, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82680217.6, "logits/rejected": 125612173.2413793, "logps/chosen": -195.65714285714284, "logps/rejected": -363.58620689655174, "loss": 0.1606, "rewards/chosen": 1.95, "rewards/margins": 9.682758620689654, "rewards/rejected": -7.732758620689655, "step": 775 }, { "epoch": 0.5509407170749023, "grad_norm": 0.18127379593714416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113009909.18309858, "logits/rejected": 86663886.59649123, "logps/chosen": -226.70422535211267, "logps/rejected": -320.0, "loss": 0.1835, "rewards/chosen": 1.5193661971830985, "rewards/margins": 8.931646898937485, "rewards/rejected": -7.412280701754386, "step": 776 }, { "epoch": 0.5516506922257721, "grad_norm": 0.15294954674655017, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105080025.21212122, "logits/rejected": 127114471.22580644, "logps/chosen": -245.33333333333334, "logps/rejected": -377.2903225806452, "loss": 0.1693, "rewards/chosen": 1.3977272727272727, "rewards/margins": 8.857404692082111, "rewards/rejected": -7.459677419354839, "step": 777 }, { "epoch": 0.5523606673766418, "grad_norm": 0.18140667331391946, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129690542.73015873, "logits/rejected": 63430782.03076923, "logps/chosen": -191.74603174603175, "logps/rejected": -364.3076923076923, "loss": 0.1672, "rewards/chosen": 1.0724206349206349, "rewards/margins": 7.626266788766788, "rewards/rejected": -6.553846153846154, "step": 778 }, { "epoch": 0.5530706425275116, "grad_norm": 0.14086250470286357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123875405.28301887, "logits/rejected": 77063345.49333334, "logps/chosen": -214.9433962264151, "logps/rejected": -326.82666666666665, "loss": 0.1432, "rewards/chosen": 1.5200471698113207, "rewards/margins": 7.960047169811321, "rewards/rejected": -6.44, "step": 779 }, { "epoch": 0.5537806176783813, "grad_norm": 0.2117037530279583, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128974848.0, "logits/rejected": 54596169.14285714, "logps/chosen": -240.44444444444446, "logps/rejected": -369.14285714285717, "loss": 0.1781, "rewards/chosen": 1.8177083333333333, "rewards/margins": 9.728422619047619, "rewards/rejected": -7.910714285714286, "step": 780 }, { "epoch": 0.554490592829251, "grad_norm": 0.16888121813385068, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135379227.56923077, "logits/rejected": 68906422.85714285, "logps/chosen": -277.16923076923075, "logps/rejected": -339.04761904761904, "loss": 0.1759, "rewards/chosen": 1.708653846153846, "rewards/margins": 8.875320512820513, "rewards/rejected": -7.166666666666667, "step": 781 }, { "epoch": 0.5552005679801207, "grad_norm": 0.16342600974999286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113410048.0, "logits/rejected": 94961664.0, "logps/chosen": -206.375, "logps/rejected": -320.5, "loss": 0.167, "rewards/chosen": 1.28125, "rewards/margins": 9.53125, "rewards/rejected": -8.25, "step": 782 }, { "epoch": 0.5559105431309904, "grad_norm": 0.18548888553945928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112197632.0, "logits/rejected": 83951616.0, "logps/chosen": -252.75, "logps/rejected": -321.25, "loss": 0.1694, "rewards/chosen": 1.87109375, "rewards/margins": 9.87890625, "rewards/rejected": -8.0078125, "step": 783 }, { "epoch": 0.5566205182818601, "grad_norm": 0.18599849282400915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84417752.33802816, "logits/rejected": 109668172.3508772, "logps/chosen": -203.49295774647888, "logps/rejected": -363.50877192982455, "loss": 0.1995, "rewards/chosen": 1.2341549295774648, "rewards/margins": 8.497312824314307, "rewards/rejected": -7.2631578947368425, "step": 784 }, { "epoch": 0.5573304934327299, "grad_norm": 0.19717584658401457, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 46599428.33898305, "logits/rejected": 102608480.46376811, "logps/chosen": -238.91525423728814, "logps/rejected": -326.95652173913044, "loss": 0.162, "rewards/chosen": 1.496292372881356, "rewards/margins": 9.148466285924835, "rewards/rejected": -7.6521739130434785, "step": 785 }, { "epoch": 0.5580404685835996, "grad_norm": 0.14538848773678628, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137013930.66666666, "logits/rejected": 101156743.52941176, "logps/chosen": -230.66666666666666, "logps/rejected": -401.88235294117646, "loss": 0.1573, "rewards/chosen": 1.0208333333333333, "rewards/margins": 8.528186274509805, "rewards/rejected": -7.507352941176471, "step": 786 }, { "epoch": 0.5587504437344692, "grad_norm": 0.12617722835476677, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89811366.6031746, "logits/rejected": 129958896.24615385, "logps/chosen": -263.6190476190476, "logps/rejected": -387.44615384615383, "loss": 0.1497, "rewards/chosen": 1.3055555555555556, "rewards/margins": 8.836324786324786, "rewards/rejected": -7.530769230769231, "step": 787 }, { "epoch": 0.559460418885339, "grad_norm": 0.14748325130056475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 198895111.42028984, "logits/rejected": 91919238.50847457, "logps/chosen": -258.7826086956522, "logps/rejected": -365.5593220338983, "loss": 0.1931, "rewards/chosen": 1.2753623188405796, "rewards/margins": 9.504175878162615, "rewards/rejected": -8.228813559322035, "step": 788 }, { "epoch": 0.5601703940362087, "grad_norm": 0.18718445264670514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167521753.79104477, "logits/rejected": 87599070.4262295, "logps/chosen": -237.37313432835822, "logps/rejected": -399.73770491803276, "loss": 0.1555, "rewards/chosen": 1.4067164179104477, "rewards/margins": 9.791962319549791, "rewards/rejected": -8.385245901639344, "step": 789 }, { "epoch": 0.5608803691870784, "grad_norm": 0.14036415217942858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109534581.84126984, "logits/rejected": 90129140.18461539, "logps/chosen": -198.34920634920636, "logps/rejected": -419.9384615384615, "loss": 0.1605, "rewards/chosen": 1.4861111111111112, "rewards/margins": -12296700.544658119, "rewards/rejected": 12296702.03076923, "step": 790 }, { "epoch": 0.5615903443379482, "grad_norm": 0.1310851556701587, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78357224.72727273, "logits/rejected": 87563278.02739726, "logps/chosen": -166.4, "logps/rejected": -356.3835616438356, "loss": 0.1359, "rewards/chosen": 1.1681818181818182, "rewards/margins": 9.264072229140723, "rewards/rejected": -8.095890410958905, "step": 791 }, { "epoch": 0.5623003194888179, "grad_norm": 0.17484607395520765, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111806372.29850747, "logits/rejected": 105201395.40983607, "logps/chosen": -197.49253731343285, "logps/rejected": -324.1967213114754, "loss": 0.1568, "rewards/chosen": 1.6119402985074627, "rewards/margins": 7.65292390506484, "rewards/rejected": -6.040983606557377, "step": 792 }, { "epoch": 0.5630102946396877, "grad_norm": 0.2031756353236043, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120490914.9090909, "logits/rejected": -2919564.5490196077, "logps/chosen": -261.4025974025974, "logps/rejected": -291.45098039215685, "loss": 0.1928, "rewards/chosen": 1.9074675324675325, "rewards/margins": 8.829036159918513, "rewards/rejected": -6.921568627450981, "step": 793 }, { "epoch": 0.5637202697905573, "grad_norm": 0.17099105442301188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122386034.62686567, "logits/rejected": 122803720.39344262, "logps/chosen": -213.73134328358208, "logps/rejected": -374.55737704918033, "loss": 0.1819, "rewards/chosen": 1.521455223880597, "rewards/margins": 8.742766699290433, "rewards/rejected": -7.221311475409836, "step": 794 }, { "epoch": 0.564430244941427, "grad_norm": 0.18098293754307415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84436152.6557377, "logits/rejected": 81757627.2238806, "logps/chosen": -228.19672131147541, "logps/rejected": -399.2835820895522, "loss": 0.1935, "rewards/chosen": 1.1885245901639345, "rewards/margins": 6.382554440910203, "rewards/rejected": -5.1940298507462686, "step": 795 }, { "epoch": 0.5651402200922968, "grad_norm": 0.17534722582609855, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118520863.03030303, "logits/rejected": 90245186.06451613, "logps/chosen": -210.78787878787878, "logps/rejected": -413.4193548387097, "loss": 0.1753, "rewards/chosen": 1.4611742424242424, "rewards/margins": 9.009561339198436, "rewards/rejected": -7.548387096774194, "step": 796 }, { "epoch": 0.5658501952431665, "grad_norm": 0.16469435844923008, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119826926.34482759, "logits/rejected": 101022807.77142857, "logps/chosen": -214.06896551724137, "logps/rejected": -398.62857142857143, "loss": 0.1371, "rewards/chosen": 0.9051724137931034, "rewards/margins": 10.148029556650245, "rewards/rejected": -9.242857142857142, "step": 797 }, { "epoch": 0.5665601703940362, "grad_norm": 0.16435434280612912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176848358.81967214, "logits/rejected": 58681130.02985074, "logps/chosen": -295.8688524590164, "logps/rejected": -375.4029850746269, "loss": 0.1341, "rewards/chosen": 2.1864754098360657, "rewards/margins": 10.261102275507707, "rewards/rejected": -8.074626865671641, "step": 798 }, { "epoch": 0.567270145544906, "grad_norm": 0.16585327610546544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 62011171.44615385, "logits/rejected": 96202686.98412699, "logps/chosen": -202.83076923076922, "logps/rejected": -326.85714285714283, "loss": 0.1687, "rewards/chosen": 1.6115384615384616, "rewards/margins": -12299917.626556776, "rewards/rejected": 12299919.238095239, "step": 799 }, { "epoch": 0.5679801206957756, "grad_norm": 0.24386110445730394, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176293920.5079365, "logits/rejected": 49041092.92307692, "logps/chosen": -232.63492063492063, "logps/rejected": -353.96923076923076, "loss": 0.1631, "rewards/chosen": 1.6646825396825398, "rewards/margins": 9.718528693528693, "rewards/rejected": -8.053846153846154, "step": 800 }, { "epoch": 0.5686900958466453, "grad_norm": 0.15060108696089577, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102912415.53623189, "logits/rejected": 38175275.38983051, "logps/chosen": -233.04347826086956, "logps/rejected": -315.66101694915255, "loss": 0.1621, "rewards/chosen": 1.588768115942029, "rewards/margins": 7.656564726111521, "rewards/rejected": -6.067796610169491, "step": 801 }, { "epoch": 0.5694000709975151, "grad_norm": 0.20825406914687555, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160160274.96296296, "logits/rejected": 65351790.7027027, "logps/chosen": -258.6666666666667, "logps/rejected": -372.3243243243243, "loss": 0.1528, "rewards/chosen": 1.4195601851851851, "rewards/margins": 9.608749374374375, "rewards/rejected": -8.18918918918919, "step": 802 }, { "epoch": 0.5701100461483848, "grad_norm": 0.15703042455133157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112328704.0, "logits/rejected": 71696384.0, "logps/chosen": -226.25, "logps/rejected": -363.5, "loss": 0.1734, "rewards/chosen": 1.82421875, "rewards/margins": 10.41015625, "rewards/rejected": -8.5859375, "step": 803 }, { "epoch": 0.5708200212992546, "grad_norm": 0.18721122050301225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100910019.76470588, "logits/rejected": 112267537.06666666, "logps/chosen": -228.7058823529412, "logps/rejected": -355.73333333333335, "loss": 0.1805, "rewards/chosen": 1.2738970588235294, "rewards/margins": 9.17389705882353, "rewards/rejected": -7.9, "step": 804 }, { "epoch": 0.5715299964501243, "grad_norm": 0.14324599011921466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143380669.04615384, "logits/rejected": 73500184.38095239, "logps/chosen": -236.8, "logps/rejected": -430.22222222222223, "loss": 0.1613, "rewards/chosen": 1.7942307692307693, "rewards/margins": 10.556135531135531, "rewards/rejected": -8.761904761904763, "step": 805 }, { "epoch": 0.5722399716009939, "grad_norm": 0.20266337274039334, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109471334.4, "logits/rejected": 98689505.88235295, "logps/chosen": -228.53333333333333, "logps/rejected": -320.0, "loss": 0.1538, "rewards/chosen": 1.59375, "rewards/margins": 9.181985294117647, "rewards/rejected": -7.588235294117647, "step": 806 }, { "epoch": 0.5729499467518637, "grad_norm": 0.16975532489892872, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136734310.4, "logits/rejected": 107818285.1764706, "logps/chosen": -241.6, "logps/rejected": -385.88235294117646, "loss": 0.1527, "rewards/chosen": 1.503125, "rewards/margins": 7.716360294117647, "rewards/rejected": -6.213235294117647, "step": 807 }, { "epoch": 0.5736599219027334, "grad_norm": 0.1453102030052512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81659872.4923077, "logits/rejected": 133418812.95238096, "logps/chosen": -187.3230769230769, "logps/rejected": -402.2857142857143, "loss": 0.1811, "rewards/chosen": 1.185576923076923, "rewards/margins": 9.264942002442002, "rewards/rejected": -8.079365079365079, "step": 808 }, { "epoch": 0.5743698970536031, "grad_norm": 0.1601543570196534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132150969.50724638, "logits/rejected": 44395641.49152543, "logps/chosen": -248.57971014492753, "logps/rejected": -368.0, "loss": 0.1715, "rewards/chosen": 1.8605072463768115, "rewards/margins": 7.233388602309015, "rewards/rejected": -5.372881355932203, "step": 809 }, { "epoch": 0.5750798722044729, "grad_norm": 0.12660274455578197, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79880863.47540984, "logits/rejected": 114686043.70149253, "logps/chosen": -221.37704918032787, "logps/rejected": -371.5820895522388, "loss": 0.1331, "rewards/chosen": 1.7520491803278688, "rewards/margins": 10.184885001223392, "rewards/rejected": -8.432835820895523, "step": 810 }, { "epoch": 0.5757898473553426, "grad_norm": 0.13318415550926593, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101728516.06349206, "logits/rejected": 80724220.06153846, "logps/chosen": -213.96825396825398, "logps/rejected": -341.9076923076923, "loss": 0.1342, "rewards/chosen": 1.871031746031746, "rewards/margins": 10.078724053724052, "rewards/rejected": -8.207692307692307, "step": 811 }, { "epoch": 0.5764998225062122, "grad_norm": 0.16486933783680344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110231552.0, "logits/rejected": 106168320.0, "logps/chosen": -209.25, "logps/rejected": -333.5, "loss": 0.1688, "rewards/chosen": 1.560546875, "rewards/margins": 9.310546875, "rewards/rejected": -7.75, "step": 812 }, { "epoch": 0.577209797657082, "grad_norm": 0.13769283656576625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79575267.55555555, "logits/rejected": 88250423.35135135, "logps/chosen": -175.7037037037037, "logps/rejected": -363.6756756756757, "loss": 0.1357, "rewards/chosen": 1.5266203703703705, "rewards/margins": 9.053647397397398, "rewards/rejected": -7.527027027027027, "step": 813 }, { "epoch": 0.5779197728079517, "grad_norm": 0.17159502934161905, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130217007.26153846, "logits/rejected": 69206016.0, "logps/chosen": -243.69230769230768, "logps/rejected": -397.46031746031747, "loss": 0.1742, "rewards/chosen": 1.6826923076923077, "rewards/margins": 9.51602564102564, "rewards/rejected": -7.833333333333333, "step": 814 }, { "epoch": 0.5786297479588215, "grad_norm": 0.13821120337210918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115487991.1724138, "logits/rejected": 123612130.74285714, "logps/chosen": -229.24137931034483, "logps/rejected": -406.4, "loss": 0.1212, "rewards/chosen": 2.3168103448275863, "rewards/margins": 8.352524630541872, "rewards/rejected": -6.035714285714286, "step": 815 }, { "epoch": 0.5793397231096912, "grad_norm": 0.14078210287672793, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104240790.58823529, "logits/rejected": 140928614.4, "logps/chosen": -209.88235294117646, "logps/rejected": -374.93333333333334, "loss": 0.1586, "rewards/chosen": 1.5533088235294117, "rewards/margins": 8.869975490196078, "rewards/rejected": -7.316666666666666, "step": 816 }, { "epoch": 0.5800496982605609, "grad_norm": 0.16227820207119223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 193625574.81967214, "logits/rejected": 50390336.95522388, "logps/chosen": -220.59016393442624, "logps/rejected": -371.1044776119403, "loss": 0.1683, "rewards/chosen": 0.9836065573770492, "rewards/margins": 7.819427452899437, "rewards/rejected": -6.835820895522388, "step": 817 }, { "epoch": 0.5807596734114306, "grad_norm": 0.16412435847673315, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120417114.83870968, "logits/rejected": 51729749.333333336, "logps/chosen": -184.0, "logps/rejected": -406.3030303030303, "loss": 0.1535, "rewards/chosen": 1.247983870967742, "rewards/margins": 10.0055596285435, "rewards/rejected": -8.757575757575758, "step": 818 }, { "epoch": 0.5814696485623003, "grad_norm": 0.13461941047570358, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165941918.25454545, "logits/rejected": 91470300.93150684, "logps/chosen": -191.85454545454544, "logps/rejected": -399.3424657534247, "loss": 0.1624, "rewards/chosen": 0.8267045454545454, "rewards/margins": 8.92259495641345, "rewards/rejected": -8.095890410958905, "step": 819 }, { "epoch": 0.58217962371317, "grad_norm": 0.1660700166106514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98503542.4477612, "logits/rejected": 121772334.16393442, "logps/chosen": -176.11940298507463, "logps/rejected": -350.95081967213116, "loss": 0.1691, "rewards/chosen": 1.6007462686567164, "rewards/margins": 8.740090530951798, "rewards/rejected": -7.139344262295082, "step": 820 }, { "epoch": 0.5828895988640398, "grad_norm": 0.17505654684547176, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111439430.8923077, "logits/rejected": 113645665.52380952, "logps/chosen": -206.52307692307693, "logps/rejected": -330.1587301587302, "loss": 0.1735, "rewards/chosen": 1.0278846153846153, "rewards/margins": 9.305662393162393, "rewards/rejected": -8.277777777777779, "step": 821 }, { "epoch": 0.5835995740149095, "grad_norm": 0.16226429981740592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128270580.53731343, "logits/rejected": 57551351.60655738, "logps/chosen": -207.28358208955223, "logps/rejected": -309.5081967213115, "loss": 0.1589, "rewards/chosen": 1.9813432835820894, "rewards/margins": 9.858392463909958, "rewards/rejected": -7.877049180327869, "step": 822 }, { "epoch": 0.5843095491657792, "grad_norm": 0.1621175208239178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117673528.8888889, "logits/rejected": 114032640.0, "logps/chosen": -285.3333333333333, "logps/rejected": -440.0, "loss": 0.1466, "rewards/chosen": 2.3020833333333335, "rewards/margins": 11.444940476190476, "rewards/rejected": -9.142857142857142, "step": 823 }, { "epoch": 0.5850195243166489, "grad_norm": 0.18530642101896225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68354048.0, "logits/rejected": 105349120.0, "logps/chosen": -185.125, "logps/rejected": -356.5, "loss": 0.1649, "rewards/chosen": 1.015625, "rewards/margins": 9.359375, "rewards/rejected": -8.34375, "step": 824 }, { "epoch": 0.5857294994675186, "grad_norm": 0.13968524182880074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156307729.06666666, "logits/rejected": 105166004.70588236, "logps/chosen": -262.93333333333334, "logps/rejected": -337.88235294117646, "loss": 0.1595, "rewards/chosen": 1.9354166666666666, "rewards/margins": 9.72218137254902, "rewards/rejected": -7.786764705882353, "step": 825 }, { "epoch": 0.5864394746183883, "grad_norm": 0.12914218146164166, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152077345.03225806, "logits/rejected": 2351352.242424242, "logps/chosen": -209.5483870967742, "logps/rejected": -335.030303030303, "loss": 0.1366, "rewards/chosen": 2.0725806451612905, "rewards/margins": 9.640762463343108, "rewards/rejected": -7.568181818181818, "step": 826 }, { "epoch": 0.5871494497692581, "grad_norm": 0.2075717307440123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125960192.0, "logits/rejected": 91357184.0, "logps/chosen": -145.25, "logps/rejected": -335.5, "loss": 0.1548, "rewards/chosen": 1.2255859375, "rewards/margins": 9.2646484375, "rewards/rejected": -8.0390625, "step": 827 }, { "epoch": 0.5878594249201278, "grad_norm": 0.16286322026332503, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172148825.04347825, "logits/rejected": 7117876.06779661, "logps/chosen": -228.17391304347825, "logps/rejected": -299.3898305084746, "loss": 0.1654, "rewards/chosen": 1.5978260869565217, "rewards/margins": 8.987656595431098, "rewards/rejected": -7.389830508474576, "step": 828 }, { "epoch": 0.5885694000709976, "grad_norm": 0.15789326381417537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156415275.32307693, "logits/rejected": 83087164.95238096, "logps/chosen": -238.52307692307693, "logps/rejected": -345.6507936507937, "loss": 0.1588, "rewards/chosen": 1.6990384615384615, "rewards/margins": 9.849832112332113, "rewards/rejected": -8.15079365079365, "step": 829 }, { "epoch": 0.5892793752218672, "grad_norm": 0.13678402975836929, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112178566.98181818, "logits/rejected": 86758890.9589041, "logps/chosen": -190.25454545454545, "logps/rejected": -319.56164383561645, "loss": 0.1527, "rewards/chosen": 1.1727272727272726, "rewards/margins": 9.23437110834371, "rewards/rejected": -8.061643835616438, "step": 830 }, { "epoch": 0.5899893503727369, "grad_norm": 0.13868271761879022, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100044464.26229508, "logits/rejected": 125641315.34328358, "logps/chosen": -160.65573770491804, "logps/rejected": -377.07462686567163, "loss": 0.1792, "rewards/chosen": 1.1475409836065573, "rewards/margins": 9.08037680450208, "rewards/rejected": -7.932835820895522, "step": 831 }, { "epoch": 0.5906993255236067, "grad_norm": 0.17722773344192413, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74938231.46666667, "logits/rejected": 127001057.88235295, "logps/chosen": -210.0, "logps/rejected": -372.70588235294116, "loss": 0.1708, "rewards/chosen": 1.5145833333333334, "rewards/margins": 6.580759803921569, "rewards/rejected": -5.0661764705882355, "step": 832 }, { "epoch": 0.5914093006744764, "grad_norm": 0.21332062790854706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118057321.41176471, "logits/rejected": 86472567.46666667, "logps/chosen": -224.0, "logps/rejected": -373.6, "loss": 0.1644, "rewards/chosen": 0.8198529411764706, "rewards/margins": 9.353186274509804, "rewards/rejected": -8.533333333333333, "step": 833 }, { "epoch": 0.5921192758253461, "grad_norm": 0.17136590907730692, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152543610.0923077, "logits/rejected": 59502526.984126985, "logps/chosen": -245.16923076923078, "logps/rejected": -359.6190476190476, "loss": 0.1727, "rewards/chosen": 1.4519230769230769, "rewards/margins": 8.86462148962149, "rewards/rejected": -7.412698412698413, "step": 834 }, { "epoch": 0.5928292509762159, "grad_norm": 0.13338519624055042, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87017039.32394366, "logits/rejected": 95420416.0, "logps/chosen": -161.69014084507043, "logps/rejected": -303.1578947368421, "loss": 0.1644, "rewards/chosen": 1.3855633802816902, "rewards/margins": 9.148721275018532, "rewards/rejected": -7.7631578947368425, "step": 835 }, { "epoch": 0.5935392261270855, "grad_norm": 0.16987485235710184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173764022.85714287, "logits/rejected": 19950062.344827585, "logps/chosen": -213.4857142857143, "logps/rejected": -307.86206896551727, "loss": 0.2029, "rewards/chosen": 1.10625, "rewards/margins": 8.25280172413793, "rewards/rejected": -7.146551724137931, "step": 836 }, { "epoch": 0.5942492012779552, "grad_norm": 0.18943465168236226, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92694118.4, "logits/rejected": 63284645.64705882, "logps/chosen": -156.8, "logps/rejected": -349.4117647058824, "loss": 0.1551, "rewards/chosen": 1.375, "rewards/margins": 9.713235294117647, "rewards/rejected": -8.338235294117647, "step": 837 }, { "epoch": 0.594959176428825, "grad_norm": 0.1736728755365522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110758061.55932203, "logits/rejected": 39693920.46376812, "logps/chosen": -176.54237288135593, "logps/rejected": -310.95652173913044, "loss": 0.1768, "rewards/chosen": 0.9194915254237288, "rewards/margins": 8.223839351510685, "rewards/rejected": -7.304347826086956, "step": 838 }, { "epoch": 0.5956691515796947, "grad_norm": 0.17208024260765756, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78942793.14285715, "logits/rejected": 77449992.8275862, "logps/chosen": -271.77142857142854, "logps/rejected": -357.7931034482759, "loss": 0.1693, "rewards/chosen": 2.1446428571428573, "rewards/margins": 9.851539408866994, "rewards/rejected": -7.706896551724138, "step": 839 }, { "epoch": 0.5963791267305645, "grad_norm": 0.20606414001030116, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79869500.7457627, "logits/rejected": 119324909.44927536, "logps/chosen": -199.864406779661, "logps/rejected": -375.18840579710144, "loss": 0.1661, "rewards/chosen": 1.2229872881355932, "rewards/margins": 8.69037859248342, "rewards/rejected": -7.467391304347826, "step": 840 }, { "epoch": 0.5970891018814342, "grad_norm": 0.5250767442449576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130586837.97014925, "logits/rejected": 49025225.44262295, "logps/chosen": -245.49253731343285, "logps/rejected": -346.4918032786885, "loss": 0.1841, "rewards/chosen": 1.3274253731343284, "rewards/margins": -18531219.656181186, "rewards/rejected": 18531220.98360656, "step": 841 }, { "epoch": 0.5977990770323038, "grad_norm": 0.2378405679174382, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134610944.0, "logits/rejected": 56885248.0, "logps/chosen": -223.25, "logps/rejected": -318.5, "loss": 0.1677, "rewards/chosen": 1.48046875, "rewards/margins": 10.87109375, "rewards/rejected": -9.390625, "step": 842 }, { "epoch": 0.5985090521831736, "grad_norm": 0.1512821809254421, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160736553.29032257, "logits/rejected": 93990539.63636364, "logps/chosen": -249.5483870967742, "logps/rejected": -352.4848484848485, "loss": 0.1645, "rewards/chosen": 1.4112903225806452, "rewards/margins": 10.593108504398828, "rewards/rejected": -9.181818181818182, "step": 843 }, { "epoch": 0.5992190273340433, "grad_norm": 0.1526351592756251, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105906176.0, "logits/rejected": 101318656.0, "logps/chosen": -171.75, "logps/rejected": -388.5, "loss": 0.1804, "rewards/chosen": 1.1201171875, "rewards/margins": 9.1513671875, "rewards/rejected": -8.03125, "step": 844 }, { "epoch": 0.599929002484913, "grad_norm": 0.6447027581761832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103492786.79365079, "logits/rejected": 90564702.52307692, "logps/chosen": -221.20634920634922, "logps/rejected": -332.55384615384617, "loss": 0.1495, "rewards/chosen": 1.867063492063492, "rewards/margins": 9.728601953601952, "rewards/rejected": -7.861538461538461, "step": 845 }, { "epoch": 0.6006389776357828, "grad_norm": 0.13835408878029548, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126979171.09677419, "logits/rejected": 88715884.60606061, "logps/chosen": -217.8709677419355, "logps/rejected": -372.8484848484849, "loss": 0.1491, "rewards/chosen": 1.2862903225806452, "rewards/margins": 10.83174486803519, "rewards/rejected": -9.545454545454545, "step": 846 }, { "epoch": 0.6013489527866525, "grad_norm": 0.17754355432044205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123762361.50724638, "logits/rejected": 74431123.52542374, "logps/chosen": -239.07246376811594, "logps/rejected": -384.54237288135596, "loss": 0.187, "rewards/chosen": 1.9420289855072463, "rewards/margins": 9.425079832964874, "rewards/rejected": -7.483050847457627, "step": 847 }, { "epoch": 0.6020589279375221, "grad_norm": 0.16529646809782414, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123432374.85714285, "logits/rejected": 80498372.92307693, "logps/chosen": -222.47619047619048, "logps/rejected": -375.87692307692305, "loss": 0.1812, "rewards/chosen": 1.4563492063492063, "rewards/margins": 8.764041514041514, "rewards/rejected": -7.3076923076923075, "step": 848 }, { "epoch": 0.6027689030883919, "grad_norm": 0.16422307050061194, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137598211.82089552, "logits/rejected": 99975705.18032786, "logps/chosen": -204.4179104477612, "logps/rejected": -327.08196721311475, "loss": 0.1569, "rewards/chosen": 1.7854477611940298, "rewards/margins": 9.416595302177637, "rewards/rejected": -7.631147540983607, "step": 849 }, { "epoch": 0.6034788782392616, "grad_norm": 0.15095724518871848, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109471334.4, "logits/rejected": 107694923.29411764, "logps/chosen": -224.8, "logps/rejected": -352.0, "loss": 0.1459, "rewards/chosen": 1.8208333333333333, "rewards/margins": 9.78406862745098, "rewards/rejected": -7.963235294117647, "step": 850 }, { "epoch": 0.6041888533901314, "grad_norm": 0.15813494386407637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135162923.26760563, "logits/rejected": 100589711.71929824, "logps/chosen": -218.3661971830986, "logps/rejected": -331.7894736842105, "loss": 0.1771, "rewards/chosen": 1.8794014084507042, "rewards/margins": 9.870629478626142, "rewards/rejected": -7.991228070175438, "step": 851 }, { "epoch": 0.6048988285410011, "grad_norm": 0.16468053588240178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111042421.15254237, "logits/rejected": 88566680.11594203, "logps/chosen": -189.5593220338983, "logps/rejected": -351.07246376811594, "loss": 0.1711, "rewards/chosen": 1.2076271186440677, "rewards/margins": 8.164148857774503, "rewards/rejected": -6.956521739130435, "step": 852 }, { "epoch": 0.6056088036918708, "grad_norm": 0.16900878696486346, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103793373.6119403, "logits/rejected": 53563324.85245901, "logps/chosen": -257.67164179104475, "logps/rejected": -348.327868852459, "loss": 0.1682, "rewards/chosen": 1.3619402985074627, "rewards/margins": 7.271776364081234, "rewards/rejected": -5.909836065573771, "step": 853 }, { "epoch": 0.6063187788427405, "grad_norm": 0.1541078630410781, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122868434.8235294, "logits/rejected": 165465292.8, "logps/chosen": -241.88235294117646, "logps/rejected": -384.53333333333336, "loss": 0.1781, "rewards/chosen": 1.59375, "rewards/margins": 5.41875, "rewards/rejected": -3.825, "step": 854 }, { "epoch": 0.6070287539936102, "grad_norm": 0.17331165361911952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102706674.87179486, "logits/rejected": 128010158.08, "logps/chosen": -220.10256410256412, "logps/rejected": -352.0, "loss": 0.2029, "rewards/chosen": 1.7195512820512822, "rewards/margins": 7.939551282051282, "rewards/rejected": -6.22, "step": 855 }, { "epoch": 0.6077387291444799, "grad_norm": 0.1633410824922079, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111245847.63076924, "logits/rejected": 90477129.14285715, "logps/chosen": -238.52307692307693, "logps/rejected": -378.41269841269843, "loss": 0.1759, "rewards/chosen": 1.4211538461538462, "rewards/margins": 9.135439560439561, "rewards/rejected": -7.714285714285714, "step": 856 }, { "epoch": 0.6084487042953497, "grad_norm": 0.15428505800608397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98064009.0140845, "logits/rejected": 100773672.42105263, "logps/chosen": -190.64788732394365, "logps/rejected": -391.29824561403507, "loss": 0.1719, "rewards/chosen": 1.4375, "rewards/margins": 9.384868421052632, "rewards/rejected": -7.947368421052632, "step": 857 }, { "epoch": 0.6091586794462194, "grad_norm": 0.15152296100197557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116841325.71428572, "logits/rejected": 77121308.44444445, "logps/chosen": -189.42857142857142, "logps/rejected": -372.0, "loss": 0.1382, "rewards/chosen": 1.1875, "rewards/margins": 9.819444444444445, "rewards/rejected": -8.631944444444445, "step": 858 }, { "epoch": 0.609868654597089, "grad_norm": 0.16855624345727932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137056949.16923076, "logits/rejected": -8646590.984126983, "logps/chosen": -151.01538461538462, "logps/rejected": -304.76190476190476, "loss": 0.1667, "rewards/chosen": 1.1278846153846154, "rewards/margins": 8.834233821733822, "rewards/rejected": -7.7063492063492065, "step": 859 }, { "epoch": 0.6105786297479588, "grad_norm": 0.1550064877725562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131596288.0, "logits/rejected": 85188856.24242425, "logps/chosen": -206.06451612903226, "logps/rejected": -351.030303030303, "loss": 0.1532, "rewards/chosen": 1.7237903225806452, "rewards/margins": 9.738941837732161, "rewards/rejected": -8.015151515151516, "step": 860 }, { "epoch": 0.6112886048988285, "grad_norm": 0.15756337446181334, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85343422.91525424, "logits/rejected": 63583217.15942029, "logps/chosen": -212.74576271186442, "logps/rejected": -326.95652173913044, "loss": 0.1469, "rewards/chosen": 1.63135593220339, "rewards/margins": 8.805268975681651, "rewards/rejected": -7.173913043478261, "step": 861 }, { "epoch": 0.6119985800496983, "grad_norm": 0.21588435932881092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145906771.93442622, "logits/rejected": 40761435.70149254, "logps/chosen": -286.42622950819674, "logps/rejected": -287.2835820895522, "loss": 0.1604, "rewards/chosen": 1.9077868852459017, "rewards/margins": 9.997339124051871, "rewards/rejected": -8.08955223880597, "step": 862 }, { "epoch": 0.612708555200568, "grad_norm": 0.1807619878412378, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84716874.83076923, "logits/rejected": 84152385.01587301, "logps/chosen": -189.7846153846154, "logps/rejected": -333.2063492063492, "loss": 0.1763, "rewards/chosen": 1.5, "rewards/margins": 7.373015873015873, "rewards/rejected": -5.873015873015873, "step": 863 }, { "epoch": 0.6134185303514377, "grad_norm": 0.7184866839496814, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136634784.54237288, "logits/rejected": 79266266.89855072, "logps/chosen": -219.38983050847457, "logps/rejected": -308.8695652173913, "loss": 0.1426, "rewards/chosen": 1.6588983050847457, "rewards/margins": 9.151651928273152, "rewards/rejected": -7.492753623188406, "step": 864 }, { "epoch": 0.6141285055023075, "grad_norm": 0.13226740481718238, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83518158.59649123, "logits/rejected": 84358677.63380282, "logps/chosen": -180.49122807017545, "logps/rejected": -368.67605633802816, "loss": 0.141, "rewards/chosen": 1.4989035087719298, "rewards/margins": 9.224255621447986, "rewards/rejected": -7.725352112676056, "step": 865 }, { "epoch": 0.6148384806531771, "grad_norm": 0.14153071547955415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124480950.85714285, "logits/rejected": 91459128.8888889, "logps/chosen": -248.0, "logps/rejected": -370.6666666666667, "loss": 0.1553, "rewards/chosen": 0.8258928571428571, "rewards/margins": 8.985615079365079, "rewards/rejected": -8.159722222222221, "step": 866 }, { "epoch": 0.6155484558040468, "grad_norm": 0.17152171749293169, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147799283.80952382, "logits/rejected": 67044336.24615385, "logps/chosen": -213.33333333333334, "logps/rejected": -327.38461538461536, "loss": 0.1841, "rewards/chosen": 1.0401785714285714, "rewards/margins": 8.94787087912088, "rewards/rejected": -7.907692307692308, "step": 867 }, { "epoch": 0.6162584309549166, "grad_norm": 0.19370480350459332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124518400.0, "logits/rejected": 98631680.0, "logps/chosen": -203.375, "logps/rejected": -370.5, "loss": 0.1782, "rewards/chosen": 1.193359375, "rewards/margins": 9.154296875, "rewards/rejected": -7.9609375, "step": 868 }, { "epoch": 0.6169684061057863, "grad_norm": 0.19130910977149573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125724262.4, "logits/rejected": 46630791.52941176, "logps/chosen": -264.93333333333334, "logps/rejected": -324.70588235294116, "loss": 0.1406, "rewards/chosen": 1.9208333333333334, "rewards/margins": 9.722303921568628, "rewards/rejected": -7.801470588235294, "step": 869 }, { "epoch": 0.617678381256656, "grad_norm": 0.14750828195891158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92388047.56756757, "logits/rejected": 84895819.85185185, "logps/chosen": -213.40540540540542, "logps/rejected": -330.3703703703704, "loss": 0.1868, "rewards/chosen": 1.7027027027027026, "rewards/margins": 7.23973973973974, "rewards/rejected": -5.537037037037037, "step": 870 }, { "epoch": 0.6183883564075258, "grad_norm": 0.152148534844095, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106684151.74193548, "logits/rejected": 96627867.15151516, "logps/chosen": -172.90322580645162, "logps/rejected": -345.6969696969697, "loss": 0.1692, "rewards/chosen": 1.4203629032258065, "rewards/margins": 8.5643022971652, "rewards/rejected": -7.143939393939394, "step": 871 }, { "epoch": 0.6190983315583954, "grad_norm": 0.18706839159082864, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129346923.35483871, "logits/rejected": 71912851.39393939, "logps/chosen": -203.8709677419355, "logps/rejected": -355.8787878787879, "loss": 0.1605, "rewards/chosen": 1.7600806451612903, "rewards/margins": 10.108565493646138, "rewards/rejected": -8.348484848484848, "step": 872 }, { "epoch": 0.6198083067092651, "grad_norm": 0.2523408664933973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77514985.31645569, "logits/rejected": 98865737.14285715, "logps/chosen": -173.16455696202533, "logps/rejected": -324.57142857142856, "loss": 0.233, "rewards/chosen": 1.0625, "rewards/margins": 8.082908163265305, "rewards/rejected": -7.020408163265306, "step": 873 }, { "epoch": 0.6205182818601349, "grad_norm": 0.15449813784917682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92629136.22535211, "logits/rejected": 92789777.96491228, "logps/chosen": -201.01408450704224, "logps/rejected": -281.2631578947368, "loss": 0.1603, "rewards/chosen": 1.806338028169014, "rewards/margins": 6.920373115888312, "rewards/rejected": -5.114035087719298, "step": 874 }, { "epoch": 0.6212282570110046, "grad_norm": 0.13769908288230032, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113671654.81967214, "logits/rejected": 100475491.34328358, "logps/chosen": -227.14754098360655, "logps/rejected": -365.85074626865674, "loss": 0.1548, "rewards/chosen": 1.0942622950819672, "rewards/margins": 9.019635429410325, "rewards/rejected": -7.925373134328358, "step": 875 }, { "epoch": 0.6219382321618744, "grad_norm": 0.32768115245352925, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103968976.27118644, "logits/rejected": 46866788.17391305, "logps/chosen": -186.84745762711864, "logps/rejected": -337.15942028985506, "loss": 0.1367, "rewards/chosen": 1.5911016949152543, "rewards/margins": 10.417188651436993, "rewards/rejected": -8.826086956521738, "step": 876 }, { "epoch": 0.6226482073127441, "grad_norm": 0.1592525224802156, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108983144.91803278, "logits/rejected": 110679544.35820895, "logps/chosen": -187.54098360655738, "logps/rejected": -342.44776119402985, "loss": 0.1602, "rewards/chosen": 1.3381147540983607, "rewards/margins": 10.032144604844628, "rewards/rejected": -8.694029850746269, "step": 877 }, { "epoch": 0.6233581824636137, "grad_norm": 0.1833278295752836, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136378430.06060606, "logits/rejected": 82938979.09677419, "logps/chosen": -223.75757575757575, "logps/rejected": -414.96774193548384, "loss": 0.1887, "rewards/chosen": 1.3409090909090908, "rewards/margins": 8.19574780058651, "rewards/rejected": -6.854838709677419, "step": 878 }, { "epoch": 0.6240681576144835, "grad_norm": 0.13880209021905918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109784242.79365079, "logits/rejected": 55364812.8, "logps/chosen": -247.36507936507937, "logps/rejected": -348.0615384615385, "loss": 0.1281, "rewards/chosen": 1.8095238095238095, "rewards/margins": 10.14798534798535, "rewards/rejected": -8.338461538461539, "step": 879 }, { "epoch": 0.6247781327653532, "grad_norm": 0.16495595167872656, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153511526.4, "logits/rejected": 75096545.88235295, "logps/chosen": -201.33333333333334, "logps/rejected": -323.7647058823529, "loss": 0.1609, "rewards/chosen": 1.1854166666666666, "rewards/margins": -26230491.049877454, "rewards/rejected": 26230492.23529412, "step": 880 }, { "epoch": 0.6254881079162229, "grad_norm": 0.15292982579181658, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 55193227.63636363, "logits/rejected": 152144068.38356164, "logps/chosen": -177.01818181818183, "logps/rejected": -380.05479452054794, "loss": 0.1565, "rewards/chosen": 1.196590909090909, "rewards/margins": 9.744536114570362, "rewards/rejected": -8.547945205479452, "step": 881 }, { "epoch": 0.6261980830670927, "grad_norm": 0.1535321658356205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129260823.27272727, "logits/rejected": 104586999.74193548, "logps/chosen": -234.66666666666666, "logps/rejected": -369.5483870967742, "loss": 0.1787, "rewards/chosen": 1.2045454545454546, "rewards/margins": 9.591642228739003, "rewards/rejected": -8.387096774193548, "step": 882 }, { "epoch": 0.6269080582179624, "grad_norm": 0.1629301914067029, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154257180.44444445, "logits/rejected": 99240228.57142857, "logps/chosen": -266.22222222222223, "logps/rejected": -400.57142857142856, "loss": 0.1779, "rewards/chosen": 2.125, "rewards/margins": 9.232142857142858, "rewards/rejected": -7.107142857142857, "step": 883 }, { "epoch": 0.627618033368832, "grad_norm": 0.17633994596456673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123699704.12307692, "logits/rejected": 128658610.79365079, "logps/chosen": -199.3846153846154, "logps/rejected": -373.3333333333333, "loss": 0.1912, "rewards/chosen": 0.8701923076923077, "rewards/margins": 8.624160561660561, "rewards/rejected": -7.753968253968254, "step": 884 }, { "epoch": 0.6283280085197018, "grad_norm": 0.1623126114718755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179236590.93333334, "logits/rejected": 52922247.52941176, "logps/chosen": -288.53333333333336, "logps/rejected": -362.3529411764706, "loss": 0.1538, "rewards/chosen": 1.91875, "rewards/margins": 10.727573529411764, "rewards/rejected": -8.808823529411764, "step": 885 }, { "epoch": 0.6290379836705715, "grad_norm": 0.18330269412256975, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138339716.41379312, "logits/rejected": 55679385.6, "logps/chosen": -214.06896551724137, "logps/rejected": -362.0571428571429, "loss": 0.1537, "rewards/chosen": 1.769396551724138, "rewards/margins": 9.705110837438424, "rewards/rejected": -7.935714285714286, "step": 886 }, { "epoch": 0.6297479588214413, "grad_norm": 0.15347952439968404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80425779.2, "logits/rejected": 101465148.23529412, "logps/chosen": -207.6, "logps/rejected": -445.6470588235294, "loss": 0.1496, "rewards/chosen": 1.2875, "rewards/margins": 8.566911764705882, "rewards/rejected": -7.279411764705882, "step": 887 }, { "epoch": 0.630457933972311, "grad_norm": 0.15374099292942847, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155001443.3432836, "logits/rejected": 53657868.59016393, "logps/chosen": -273.67164179104475, "logps/rejected": -329.1803278688525, "loss": 0.1656, "rewards/chosen": 1.5690298507462686, "rewards/margins": 9.929685588451187, "rewards/rejected": -8.360655737704919, "step": 888 }, { "epoch": 0.6311679091231807, "grad_norm": 0.18212959952037983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150444871.3442623, "logits/rejected": 88080384.0, "logps/chosen": -275.8032786885246, "logps/rejected": -320.7164179104478, "loss": 0.1678, "rewards/chosen": 1.7074795081967213, "rewards/margins": 9.737330254465379, "rewards/rejected": -8.029850746268657, "step": 889 }, { "epoch": 0.6318778842740504, "grad_norm": 0.15440398712081563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134540366.76923078, "logits/rejected": 72296555.78947368, "logps/chosen": -193.23076923076923, "logps/rejected": -344.8421052631579, "loss": 0.1452, "rewards/chosen": 1.3413461538461537, "rewards/margins": 9.597925101214575, "rewards/rejected": -8.256578947368421, "step": 890 }, { "epoch": 0.6325878594249201, "grad_norm": 0.17967167161800557, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77290198.70967741, "logits/rejected": 95706391.27272727, "logps/chosen": -203.09677419354838, "logps/rejected": -363.1515151515151, "loss": 0.1727, "rewards/chosen": 0.9243951612903226, "rewards/margins": 8.363789100684262, "rewards/rejected": -7.4393939393939394, "step": 891 }, { "epoch": 0.6332978345757898, "grad_norm": 0.24707216154346695, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120021622.15384616, "logits/rejected": 62614966.85714286, "logps/chosen": -198.4, "logps/rejected": -356.06349206349205, "loss": 0.1701, "rewards/chosen": 1.4346153846153846, "rewards/margins": 9.4504884004884, "rewards/rejected": -8.015873015873016, "step": 892 }, { "epoch": 0.6340078097266596, "grad_norm": 0.25015030316776976, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104018739.2, "logits/rejected": 87547773.96825397, "logps/chosen": -189.7846153846154, "logps/rejected": -348.44444444444446, "loss": 0.1587, "rewards/chosen": 1.603846153846154, "rewards/margins": 10.318131868131868, "rewards/rejected": -8.714285714285714, "step": 893 }, { "epoch": 0.6347177848775293, "grad_norm": 0.18257411241895188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139676980.82539684, "logits/rejected": 83208538.58461538, "logps/chosen": -190.6031746031746, "logps/rejected": -382.5230769230769, "loss": 0.1497, "rewards/chosen": 1.3888888888888888, "rewards/margins": 10.965811965811966, "rewards/rejected": -9.576923076923077, "step": 894 }, { "epoch": 0.635427760028399, "grad_norm": 0.17734150721133293, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56770790.76056338, "logits/rejected": 107433049.8245614, "logps/chosen": -183.77464788732394, "logps/rejected": -372.7719298245614, "loss": 0.1743, "rewards/chosen": 1.4154929577464788, "rewards/margins": 10.748826291079812, "rewards/rejected": -9.333333333333334, "step": 895 }, { "epoch": 0.6361377351792687, "grad_norm": 0.24186718225783788, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 169045430.85714287, "logits/rejected": 56098816.0, "logps/chosen": -218.85714285714286, "logps/rejected": -364.44444444444446, "loss": 0.1623, "rewards/chosen": 1.1506696428571428, "rewards/margins": 9.449280753968253, "rewards/rejected": -8.29861111111111, "step": 896 }, { "epoch": 0.6368477103301384, "grad_norm": 0.15363088865633867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131274356.77192983, "logits/rejected": 80016686.87323944, "logps/chosen": -186.94736842105263, "logps/rejected": -384.90140845070425, "loss": 0.1461, "rewards/chosen": 1.0109649122807018, "rewards/margins": 10.891246602421548, "rewards/rejected": -9.880281690140846, "step": 897 }, { "epoch": 0.6375576854810082, "grad_norm": 0.13280250224205783, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131647025.5483871, "logits/rejected": 84839330.9090909, "logps/chosen": -239.48387096774192, "logps/rejected": -398.06060606060606, "loss": 0.1622, "rewards/chosen": 1.5408266129032258, "rewards/margins": 9.760523582600195, "rewards/rejected": -8.219696969696969, "step": 898 }, { "epoch": 0.6382676606318779, "grad_norm": 0.15861342464215536, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123265934.22222222, "logits/rejected": 51754715.428571425, "logps/chosen": -222.44444444444446, "logps/rejected": -337.2857142857143, "loss": 0.1972, "rewards/chosen": 1.2673611111111112, "rewards/margins": 9.38343253968254, "rewards/rejected": -8.116071428571429, "step": 899 }, { "epoch": 0.6389776357827476, "grad_norm": 0.136319380141563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140998519.46666667, "logits/rejected": 61804303.058823526, "logps/chosen": -250.93333333333334, "logps/rejected": -365.1764705882353, "loss": 0.1422, "rewards/chosen": 1.8395833333333333, "rewards/margins": 10.648406862745098, "rewards/rejected": -8.808823529411764, "step": 900 }, { "epoch": 0.6396876109336174, "grad_norm": 0.16626283655599383, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 173272886.55737704, "logits/rejected": 26042245.731343284, "logps/chosen": -301.9016393442623, "logps/rejected": -357.25373134328356, "loss": 0.188, "rewards/chosen": 1.5102459016393444, "rewards/margins": 9.666962319549791, "rewards/rejected": -8.156716417910447, "step": 901 }, { "epoch": 0.640397586084487, "grad_norm": 0.18800481275543526, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105593442.80701755, "logits/rejected": 120630546.02816902, "logps/chosen": -213.89473684210526, "logps/rejected": -395.2676056338028, "loss": 0.1374, "rewards/chosen": 1.7412280701754386, "rewards/margins": 10.917284408203608, "rewards/rejected": -9.17605633802817, "step": 902 }, { "epoch": 0.6411075612353567, "grad_norm": 0.14781779630186148, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100293210.35294117, "logits/rejected": 102760448.0, "logps/chosen": -192.7058823529412, "logps/rejected": -390.4, "loss": 0.1546, "rewards/chosen": 1.681985294117647, "rewards/margins": 10.356985294117647, "rewards/rejected": -8.675, "step": 903 }, { "epoch": 0.6418175363862265, "grad_norm": 0.14723378518284286, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 167501559.7419355, "logits/rejected": 49696147.39393939, "logps/chosen": -209.29032258064515, "logps/rejected": -374.7878787878788, "loss": 0.1569, "rewards/chosen": 1.404233870967742, "rewards/margins": 10.06332478005865, "rewards/rejected": -8.659090909090908, "step": 904 }, { "epoch": 0.6425275115370962, "grad_norm": 0.1613924417234702, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127153637.05263157, "logits/rejected": 60172130.461538464, "logps/chosen": -258.94736842105266, "logps/rejected": -392.3076923076923, "loss": 0.1752, "rewards/chosen": 2.2434210526315788, "rewards/margins": 11.176113360323887, "rewards/rejected": -8.932692307692308, "step": 905 }, { "epoch": 0.6432374866879659, "grad_norm": 0.1590599007197146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134945905.7777778, "logits/rejected": 78877257.14285715, "logps/chosen": -252.66666666666666, "logps/rejected": -366.57142857142856, "loss": 0.1829, "rewards/chosen": 1.7673611111111112, "rewards/margins": 10.062003968253968, "rewards/rejected": -8.294642857142858, "step": 906 }, { "epoch": 0.6439474618388357, "grad_norm": 0.24215053533277067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138720436.70588234, "logits/rejected": 63054370.13333333, "logps/chosen": -263.52941176470586, "logps/rejected": -350.93333333333334, "loss": 0.1965, "rewards/chosen": 1.2113970588235294, "rewards/margins": -1317260.655269608, "rewards/rejected": 1317261.8666666667, "step": 907 }, { "epoch": 0.6446574369897053, "grad_norm": 0.1395678866368576, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 193084097.04918033, "logits/rejected": 28921917.134328358, "logps/chosen": -244.45901639344262, "logps/rejected": -325.25373134328356, "loss": 0.1554, "rewards/chosen": 1.5061475409836065, "rewards/margins": 9.841968436505995, "rewards/rejected": -8.335820895522389, "step": 908 }, { "epoch": 0.645367412140575, "grad_norm": 0.15456391592460172, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111178593.35211268, "logits/rejected": 68727718.1754386, "logps/chosen": -205.07042253521126, "logps/rejected": -324.7719298245614, "loss": 0.1832, "rewards/chosen": 1.7235915492957747, "rewards/margins": 9.179731900172968, "rewards/rejected": -7.456140350877193, "step": 909 }, { "epoch": 0.6460773872914448, "grad_norm": 0.1630101349840426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116896805.92592593, "logits/rejected": 43586753.72972973, "logps/chosen": -170.07407407407408, "logps/rejected": -310.9189189189189, "loss": 0.1454, "rewards/chosen": 1.5486111111111112, "rewards/margins": 7.683746246246246, "rewards/rejected": -6.135135135135135, "step": 910 }, { "epoch": 0.6467873624423145, "grad_norm": 0.15332092433435923, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 156015398.78787878, "logits/rejected": 71709068.38709678, "logps/chosen": -208.96969696969697, "logps/rejected": -377.80645161290323, "loss": 0.1609, "rewards/chosen": 1.1193181818181819, "rewards/margins": 9.385447214076246, "rewards/rejected": -8.266129032258064, "step": 911 }, { "epoch": 0.6474973375931843, "grad_norm": 0.15431057251973734, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117720132.26666667, "logits/rejected": 74880662.58823529, "logps/chosen": -212.0, "logps/rejected": -342.11764705882354, "loss": 0.1506, "rewards/chosen": 1.4197916666666666, "rewards/margins": 8.677144607843138, "rewards/rejected": -7.257352941176471, "step": 912 }, { "epoch": 0.648207312744054, "grad_norm": 0.20703601308630173, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145643590.62068966, "logits/rejected": 89357399.77142857, "logps/chosen": -230.06896551724137, "logps/rejected": -360.6857142857143, "loss": 0.1617, "rewards/chosen": 1.396551724137931, "rewards/margins": -16867438.14630542, "rewards/rejected": 16867439.542857144, "step": 913 }, { "epoch": 0.6489172878949236, "grad_norm": 0.17400301689430384, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143025766.4, "logits/rejected": 36946883.76470588, "logps/chosen": -213.33333333333334, "logps/rejected": -289.1764705882353, "loss": 0.1553, "rewards/chosen": 1.5125, "rewards/margins": 10.0125, "rewards/rejected": -8.5, "step": 914 }, { "epoch": 0.6496272630457934, "grad_norm": 0.15009495697503866, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110216988.44444445, "logits/rejected": 47841280.0, "logps/chosen": -278.8888888888889, "logps/rejected": -335.42857142857144, "loss": 0.1489, "rewards/chosen": 1.8125, "rewards/margins": 7.316964285714286, "rewards/rejected": -5.504464285714286, "step": 915 }, { "epoch": 0.6503372381966631, "grad_norm": 0.15779111581830527, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121509612.89552239, "logits/rejected": 71165649.83606558, "logps/chosen": -191.52238805970148, "logps/rejected": -317.37704918032784, "loss": 0.1598, "rewards/chosen": 1.710820895522388, "rewards/margins": 9.35016515781747, "rewards/rejected": -7.639344262295082, "step": 916 }, { "epoch": 0.6510472133475328, "grad_norm": 0.14777281994504268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164254356.6451613, "logits/rejected": 24339673.21212121, "logps/chosen": -200.25806451612902, "logps/rejected": -333.09090909090907, "loss": 0.1711, "rewards/chosen": 1.3810483870967742, "rewards/margins": 9.623472629521016, "rewards/rejected": -8.242424242424242, "step": 917 }, { "epoch": 0.6517571884984026, "grad_norm": 0.21283405689970897, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88445106.08695652, "logits/rejected": 106741482.30508475, "logps/chosen": -294.72463768115944, "logps/rejected": -334.64406779661016, "loss": 0.2029, "rewards/chosen": 1.3034420289855073, "rewards/margins": 9.388187791697373, "rewards/rejected": -8.084745762711865, "step": 918 }, { "epoch": 0.6524671636492723, "grad_norm": 0.16791276747381403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148598198.85714287, "logits/rejected": 93307132.06153846, "logps/chosen": -216.38095238095238, "logps/rejected": -326.89230769230767, "loss": 0.1728, "rewards/chosen": 1.623015873015873, "rewards/margins": 9.323015873015873, "rewards/rejected": -7.7, "step": 919 }, { "epoch": 0.6531771388001419, "grad_norm": 0.1873806730647581, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108021724.07017544, "logits/rejected": 82763660.61971831, "logps/chosen": -252.0701754385965, "logps/rejected": -351.77464788732397, "loss": 0.1495, "rewards/chosen": 1.625548245614035, "rewards/margins": 10.188928527304174, "rewards/rejected": -8.56338028169014, "step": 920 }, { "epoch": 0.6538871139510117, "grad_norm": 0.16755658439080556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74197828.50704226, "logits/rejected": 99706700.3508772, "logps/chosen": -194.92957746478874, "logps/rejected": -358.17543859649123, "loss": 0.1776, "rewards/chosen": 1.3934859154929577, "rewards/margins": 12343916.060152581, "rewards/rejected": -12343914.666666666, "step": 921 }, { "epoch": 0.6545970891018814, "grad_norm": 0.16264935128085098, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165798369.88235295, "logits/rejected": 55644433.06666667, "logps/chosen": -280.47058823529414, "logps/rejected": -330.93333333333334, "loss": 0.1767, "rewards/chosen": 1.4871323529411764, "rewards/margins": 9.94546568627451, "rewards/rejected": -8.458333333333334, "step": 922 }, { "epoch": 0.6553070642527512, "grad_norm": 0.15340207499779415, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126280814.27692308, "logits/rejected": 54692392.634920634, "logps/chosen": -205.2923076923077, "logps/rejected": -389.58730158730157, "loss": 0.1607, "rewards/chosen": 1.7538461538461538, "rewards/margins": 10.404639804639803, "rewards/rejected": -8.65079365079365, "step": 923 }, { "epoch": 0.6560170394036209, "grad_norm": 0.18601038802147407, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101987813.05263157, "logits/rejected": 81264640.0, "logps/chosen": -197.68421052631578, "logps/rejected": -364.0, "loss": 0.1855, "rewards/chosen": 1.5986842105263157, "rewards/margins": 8.694838056680162, "rewards/rejected": -7.096153846153846, "step": 924 }, { "epoch": 0.6567270145544906, "grad_norm": 0.18760212128106366, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127343729.77777778, "logits/rejected": 88379977.14285715, "logps/chosen": -216.22222222222223, "logps/rejected": -421.42857142857144, "loss": 0.2049, "rewards/chosen": 1.1805555555555556, "rewards/margins": 10.475198412698413, "rewards/rejected": -9.294642857142858, "step": 925 }, { "epoch": 0.6574369897053604, "grad_norm": 0.15846271268485432, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114425856.0, "logits/rejected": 75235328.0, "logps/chosen": -194.0, "logps/rejected": -360.0, "loss": 0.1795, "rewards/chosen": 1.2578125, "rewards/margins": 7.0625, "rewards/rejected": -5.8046875, "step": 926 }, { "epoch": 0.65814696485623, "grad_norm": 0.14498006901581492, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104382773.13207547, "logits/rejected": 85004561.06666666, "logps/chosen": -213.73584905660377, "logps/rejected": -336.2133333333333, "loss": 0.1219, "rewards/chosen": 1.9316037735849056, "rewards/margins": 9.991603773584906, "rewards/rejected": -8.06, "step": 927 }, { "epoch": 0.6588569400070997, "grad_norm": 0.14199070165215322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121084743.3442623, "logits/rejected": 79629174.4477612, "logps/chosen": -245.24590163934425, "logps/rejected": -331.46268656716416, "loss": 0.1629, "rewards/chosen": 1.3954918032786885, "rewards/margins": 6.7089246390995845, "rewards/rejected": -5.313432835820896, "step": 928 }, { "epoch": 0.6595669151579695, "grad_norm": 0.15194744703844912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 141281818.9473684, "logits/rejected": 63209933.52112676, "logps/chosen": -183.1578947368421, "logps/rejected": -316.3943661971831, "loss": 0.1477, "rewards/chosen": 1.2489035087719298, "rewards/margins": 8.896790832715592, "rewards/rejected": -7.647887323943662, "step": 929 }, { "epoch": 0.6602768903088392, "grad_norm": 0.17716236618846073, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104202240.0, "logits/rejected": 68763648.0, "logps/chosen": -195.75, "logps/rejected": -374.0, "loss": 0.1865, "rewards/chosen": 0.9443359375, "rewards/margins": 12632390.944335938, "rewards/rejected": -12632390.0, "step": 930 }, { "epoch": 0.6609868654597089, "grad_norm": 0.15043396346165167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139670323.2, "logits/rejected": 102590825.41176471, "logps/chosen": -244.53333333333333, "logps/rejected": -326.8235294117647, "loss": 0.1483, "rewards/chosen": 1.4958333333333333, "rewards/margins": 9.532598039215687, "rewards/rejected": -8.036764705882353, "step": 931 }, { "epoch": 0.6616968406105787, "grad_norm": 0.17951425587895137, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97380797.2173913, "logits/rejected": 102867082.84745763, "logps/chosen": -169.3913043478261, "logps/rejected": -367.45762711864404, "loss": 0.1895, "rewards/chosen": 1.3070652173913044, "rewards/margins": 8.807065217391305, "rewards/rejected": -7.5, "step": 932 }, { "epoch": 0.6624068157614483, "grad_norm": 0.2495642429051953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126863331.94520548, "logits/rejected": 65354882.32727273, "logps/chosen": -238.68493150684932, "logps/rejected": -339.7818181818182, "loss": 0.1924, "rewards/chosen": 1.2910958904109588, "rewards/margins": -21925239.72708593, "rewards/rejected": 21925241.01818182, "step": 933 }, { "epoch": 0.6631167909123181, "grad_norm": 0.19523371442168044, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123795518.06060606, "logits/rejected": 101948647.22580644, "logps/chosen": -253.0909090909091, "logps/rejected": -386.5806451612903, "loss": 0.1675, "rewards/chosen": 1.9384469696969697, "rewards/margins": 10.204576001955033, "rewards/rejected": -8.266129032258064, "step": 934 }, { "epoch": 0.6638267660631878, "grad_norm": 0.17306082345582469, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117776056.32, "logits/rejected": 110634660.2264151, "logps/chosen": -222.50666666666666, "logps/rejected": -353.811320754717, "loss": 0.183, "rewards/chosen": 1.8783333333333334, "rewards/margins": 10.44437106918239, "rewards/rejected": -8.566037735849056, "step": 935 }, { "epoch": 0.6645367412140575, "grad_norm": 0.13385153301767472, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84881338.57627119, "logits/rejected": 76105342.14492753, "logps/chosen": -198.3728813559322, "logps/rejected": -378.8985507246377, "loss": 0.1459, "rewards/chosen": 1.6779661016949152, "rewards/margins": 10.004053058216654, "rewards/rejected": -8.326086956521738, "step": 936 }, { "epoch": 0.6652467163649273, "grad_norm": 0.15870938748171184, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83727804.3773585, "logits/rejected": 95294586.88, "logps/chosen": -193.0566037735849, "logps/rejected": -359.68, "loss": 0.1494, "rewards/chosen": 1.240566037735849, "rewards/margins": 9.193899371069183, "rewards/rejected": -7.953333333333333, "step": 937 }, { "epoch": 0.665956691515797, "grad_norm": 0.16790982590490763, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64072688.71641791, "logits/rejected": 80069950.95081967, "logps/chosen": -144.71641791044777, "logps/rejected": -359.344262295082, "loss": 0.2018, "rewards/chosen": 0.4832089552238806, "rewards/margins": 8.302881086371421, "rewards/rejected": -7.819672131147541, "step": 938 }, { "epoch": 0.6666666666666666, "grad_norm": 0.15067204536695544, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139544494.08, "logits/rejected": 66517359.58974359, "logps/chosen": -233.6, "logps/rejected": -355.6923076923077, "loss": 0.1421, "rewards/chosen": 1.07875, "rewards/margins": 9.642852564102563, "rewards/rejected": -8.564102564102564, "step": 939 }, { "epoch": 0.6673766418175364, "grad_norm": 0.1564757046262168, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115072759.74193548, "logits/rejected": 95912928.96969697, "logps/chosen": -221.16129032258064, "logps/rejected": -344.24242424242425, "loss": 0.152, "rewards/chosen": 1.6673387096774193, "rewards/margins": 9.584005376344086, "rewards/rejected": -7.916666666666667, "step": 940 }, { "epoch": 0.6680866169684061, "grad_norm": 0.16028253411298285, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142082048.0, "logits/rejected": 87883776.0, "logps/chosen": -233.0, "logps/rejected": -338.0, "loss": 0.1642, "rewards/chosen": 1.7578125, "rewards/margins": 9.1640625, "rewards/rejected": -7.40625, "step": 941 }, { "epoch": 0.6687965921192758, "grad_norm": 0.15991755712808872, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149012182.1090909, "logits/rejected": 23032761.8630137, "logps/chosen": -217.01818181818183, "logps/rejected": -337.972602739726, "loss": 0.1361, "rewards/chosen": 1.2954545454545454, "rewards/margins": 7.459838107098381, "rewards/rejected": -6.164383561643835, "step": 942 }, { "epoch": 0.6695065672701456, "grad_norm": 0.20081666580272675, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116481813.94285715, "logits/rejected": 62263719.72413793, "logps/chosen": -192.9142857142857, "logps/rejected": -348.6896551724138, "loss": 0.1736, "rewards/chosen": 1.4232142857142858, "rewards/margins": 8.328386699507389, "rewards/rejected": -6.905172413793103, "step": 943 }, { "epoch": 0.6702165424210152, "grad_norm": 0.20174870617130644, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 24411585.12280702, "logits/rejected": 137998509.07042253, "logps/chosen": -214.87719298245614, "logps/rejected": -396.16901408450707, "loss": 0.1622, "rewards/chosen": 1.4945175438596492, "rewards/margins": 9.804376698789227, "rewards/rejected": -8.309859154929578, "step": 944 }, { "epoch": 0.670926517571885, "grad_norm": 0.13877770133139106, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80759076.57142857, "logits/rejected": 88313400.8888889, "logps/chosen": -251.42857142857142, "logps/rejected": -335.1111111111111, "loss": 0.1491, "rewards/chosen": 1.4084821428571428, "rewards/margins": 9.915426587301587, "rewards/rejected": -8.506944444444445, "step": 945 }, { "epoch": 0.6716364927227547, "grad_norm": 0.15157909153017998, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136963686.4, "logits/rejected": 34572167.52941176, "logps/chosen": -269.06666666666666, "logps/rejected": -311.05882352941177, "loss": 0.1419, "rewards/chosen": 2.275, "rewards/margins": 10.576470588235294, "rewards/rejected": -8.301470588235293, "step": 946 }, { "epoch": 0.6723464678736244, "grad_norm": 0.18109420378274396, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162549064.4528302, "logits/rejected": 28591172.266666666, "logps/chosen": -251.47169811320754, "logps/rejected": -370.3466666666667, "loss": 0.1492, "rewards/chosen": 1.2794811320754718, "rewards/margins": 9.772814465408805, "rewards/rejected": -8.493333333333334, "step": 947 }, { "epoch": 0.6730564430244942, "grad_norm": 0.1757517657859469, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71021461.01492538, "logits/rejected": 119365766.29508197, "logps/chosen": -208.47761194029852, "logps/rejected": -367.21311475409834, "loss": 0.1601, "rewards/chosen": 1.9962686567164178, "rewards/margins": 10.51266209933937, "rewards/rejected": -8.51639344262295, "step": 948 }, { "epoch": 0.6737664181753639, "grad_norm": 0.15530440322442637, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93356032.0, "logits/rejected": 67207168.0, "logps/chosen": -201.875, "logps/rejected": -310.25, "loss": 0.1446, "rewards/chosen": 1.876953125, "rewards/margins": 7.791015625, "rewards/rejected": -5.9140625, "step": 949 }, { "epoch": 0.6744763933262335, "grad_norm": 0.14405094359385573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74820971.35483871, "logits/rejected": 97358692.84848484, "logps/chosen": -139.61290322580646, "logps/rejected": -365.57575757575756, "loss": 0.1802, "rewards/chosen": 1.0745967741935485, "rewards/margins": 9.06702101661779, "rewards/rejected": -7.992424242424242, "step": 950 }, { "epoch": 0.6751863684771033, "grad_norm": 0.14026015572458878, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117066020.57142857, "logits/rejected": 69963320.8888889, "logps/chosen": -241.42857142857142, "logps/rejected": -354.6666666666667, "loss": 0.1356, "rewards/chosen": 1.4553571428571428, "rewards/margins": 9.413690476190476, "rewards/rejected": -7.958333333333333, "step": 951 }, { "epoch": 0.675896343627973, "grad_norm": 0.24554502606590117, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77878983.59322034, "logits/rejected": 81697747.47826087, "logps/chosen": -210.98305084745763, "logps/rejected": -363.1304347826087, "loss": 0.1624, "rewards/chosen": 1.3432203389830508, "rewards/margins": 7.212785556374356, "rewards/rejected": -5.869565217391305, "step": 952 }, { "epoch": 0.6766063187788427, "grad_norm": 0.18237678338063568, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142453815.85454544, "logits/rejected": 70218681.8630137, "logps/chosen": -229.23636363636365, "logps/rejected": -374.35616438356163, "loss": 0.1626, "rewards/chosen": 1.7477272727272728, "rewards/margins": 10.343617683686178, "rewards/rejected": -8.595890410958905, "step": 953 }, { "epoch": 0.6773162939297125, "grad_norm": 0.17623365246625908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121841093.24590164, "logits/rejected": 57969037.37313433, "logps/chosen": -238.68852459016392, "logps/rejected": -342.92537313432837, "loss": 0.1465, "rewards/chosen": 1.7305327868852458, "rewards/margins": 10.551428309273305, "rewards/rejected": -8.82089552238806, "step": 954 }, { "epoch": 0.6780262690805822, "grad_norm": 0.1427274848827826, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87648617.41176471, "logits/rejected": 69406993.06666666, "logps/chosen": -253.64705882352942, "logps/rejected": -377.3333333333333, "loss": 0.1345, "rewards/chosen": 2.1636029411764706, "rewards/margins": 7.73860294117647, "rewards/rejected": -5.575, "step": 955 }, { "epoch": 0.6787362442314518, "grad_norm": 0.18027161089227295, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107617010.5263158, "logits/rejected": 104076209.23076923, "logps/chosen": -204.6315789473684, "logps/rejected": -368.0, "loss": 0.1819, "rewards/chosen": 1.8717105263157894, "rewards/margins": 9.49671052631579, "rewards/rejected": -7.625, "step": 956 }, { "epoch": 0.6794462193823216, "grad_norm": 0.22810097013793978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145187446.15384614, "logits/rejected": 100863024.76190476, "logps/chosen": -262.15384615384613, "logps/rejected": -368.25396825396825, "loss": 0.1706, "rewards/chosen": 1.7923076923076924, "rewards/margins": 10.03040293040293, "rewards/rejected": -8.238095238095237, "step": 957 }, { "epoch": 0.6801561945331913, "grad_norm": 0.15271854780590186, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99702101.33333333, "logits/rejected": 80899510.85714285, "logps/chosen": -217.11111111111111, "logps/rejected": -363.7142857142857, "loss": 0.1865, "rewards/chosen": 1.6137152777777777, "rewards/margins": 6.149429563492063, "rewards/rejected": -4.535714285714286, "step": 958 }, { "epoch": 0.6808661696840611, "grad_norm": 0.14393335800719592, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152113425.06666666, "logits/rejected": 91688719.05882353, "logps/chosen": -265.6, "logps/rejected": -368.0, "loss": 0.139, "rewards/chosen": 2.04375, "rewards/margins": 10.565808823529412, "rewards/rejected": -8.522058823529411, "step": 959 }, { "epoch": 0.6815761448349308, "grad_norm": 0.15774065511450142, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95944704.0, "logits/rejected": 72413424.94117647, "logps/chosen": -208.26666666666668, "logps/rejected": -359.52941176470586, "loss": 0.1474, "rewards/chosen": 1.1322916666666667, "rewards/margins": 9.941115196078432, "rewards/rejected": -8.808823529411764, "step": 960 }, { "epoch": 0.6822861199858005, "grad_norm": 0.16535883327952489, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68092912.24615385, "logits/rejected": 119437799.61904761, "logps/chosen": -201.6, "logps/rejected": -355.55555555555554, "loss": 0.1637, "rewards/chosen": 1.448076923076923, "rewards/margins": 10.186172161172161, "rewards/rejected": -8.738095238095237, "step": 961 }, { "epoch": 0.6829960951366703, "grad_norm": 0.17443652591347777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96502817.03225806, "logits/rejected": 75688122.18181819, "logps/chosen": -170.58064516129033, "logps/rejected": -378.1818181818182, "loss": 0.1538, "rewards/chosen": 1.4838709677419355, "rewards/margins": 10.264173998044967, "rewards/rejected": -8.780303030303031, "step": 962 }, { "epoch": 0.6837060702875399, "grad_norm": 0.19625424666471825, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86282825.14285715, "logits/rejected": 110787478.06896552, "logps/chosen": -162.74285714285713, "logps/rejected": -409.37931034482756, "loss": 0.1772, "rewards/chosen": 1.3017857142857143, "rewards/margins": 10.353509852216748, "rewards/rejected": -9.051724137931034, "step": 963 }, { "epoch": 0.6844160454384096, "grad_norm": 0.2040914095770177, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73812874.49180327, "logits/rejected": 106391338.02985075, "logps/chosen": -207.21311475409837, "logps/rejected": -366.32835820895525, "loss": 0.1737, "rewards/chosen": 1.3329918032786885, "rewards/margins": 9.840454489845852, "rewards/rejected": -8.507462686567164, "step": 964 }, { "epoch": 0.6851260205892794, "grad_norm": 0.17526252393547545, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101877436.63157895, "logits/rejected": 92859470.76923077, "logps/chosen": -198.73684210526315, "logps/rejected": -325.84615384615387, "loss": 0.1924, "rewards/chosen": 1.4120065789473684, "rewards/margins": 9.671621963562753, "rewards/rejected": -8.259615384615385, "step": 965 }, { "epoch": 0.6858359957401491, "grad_norm": 0.16367296911514437, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90946491.73333333, "logits/rejected": 94556882.8235294, "logps/chosen": -233.33333333333334, "logps/rejected": -334.11764705882354, "loss": 0.1402, "rewards/chosen": 1.7958333333333334, "rewards/margins": 10.604656862745099, "rewards/rejected": -8.808823529411764, "step": 966 }, { "epoch": 0.6865459708910188, "grad_norm": 0.3465763433731365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88820555.29411764, "logits/rejected": 111848106.66666667, "logps/chosen": -192.7058823529412, "logps/rejected": -391.46666666666664, "loss": 0.1899, "rewards/chosen": 1.1112132352941178, "rewards/margins": 9.477879901960785, "rewards/rejected": -8.366666666666667, "step": 967 }, { "epoch": 0.6872559460418886, "grad_norm": 0.15709146408306832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91799552.0, "logits/rejected": 117768192.0, "logps/chosen": -228.25, "logps/rejected": -420.0, "loss": 0.1623, "rewards/chosen": 1.533203125, "rewards/margins": 10.462890625, "rewards/rejected": -8.9296875, "step": 968 }, { "epoch": 0.6879659211927582, "grad_norm": 0.17453447487348908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101997847.27272727, "logits/rejected": 46813844.64516129, "logps/chosen": -204.6060606060606, "logps/rejected": -324.64516129032256, "loss": 0.1722, "rewards/chosen": 1.4318181818181819, "rewards/margins": 10.39149560117302, "rewards/rejected": -8.959677419354838, "step": 969 }, { "epoch": 0.688675896343628, "grad_norm": 0.15316188210943077, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108916603.87096775, "logits/rejected": 104476299.63636364, "logps/chosen": -180.1290322580645, "logps/rejected": -368.969696969697, "loss": 0.1703, "rewards/chosen": 1.1053427419354838, "rewards/margins": 10.287160923753666, "rewards/rejected": -9.181818181818182, "step": 970 }, { "epoch": 0.6893858714944977, "grad_norm": 0.22508327800811265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108719022.73015873, "logits/rejected": 78465748.67692308, "logps/chosen": -207.23809523809524, "logps/rejected": -314.83076923076925, "loss": 0.148, "rewards/chosen": 2.1646825396825395, "rewards/margins": 10.356990231990231, "rewards/rejected": -8.192307692307692, "step": 971 }, { "epoch": 0.6900958466453674, "grad_norm": 0.17002167004214022, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93885001.14285715, "logits/rejected": 79924792.8888889, "logps/chosen": -221.71428571428572, "logps/rejected": -348.6666666666667, "loss": 0.1317, "rewards/chosen": 1.5267857142857142, "rewards/margins": 9.867063492063492, "rewards/rejected": -8.340277777777779, "step": 972 }, { "epoch": 0.6908058217962372, "grad_norm": 0.18867487785188977, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105126465.64102565, "logits/rejected": 101732843.52, "logps/chosen": -237.33333333333334, "logps/rejected": -368.0, "loss": 0.1812, "rewards/chosen": 2.0240384615384617, "rewards/margins": 7.6140384615384615, "rewards/rejected": -5.59, "step": 973 }, { "epoch": 0.6915157969471069, "grad_norm": 0.1412249153005764, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84111927.13846155, "logits/rejected": 108452717.71428572, "logps/chosen": -207.75384615384615, "logps/rejected": -350.984126984127, "loss": 0.1602, "rewards/chosen": 1.9807692307692308, "rewards/margins": 11.17124542124542, "rewards/rejected": -9.19047619047619, "step": 974 }, { "epoch": 0.6922257720979765, "grad_norm": 0.17729281878541894, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 49228835.31034483, "logits/rejected": 107134507.88571429, "logps/chosen": -190.3448275862069, "logps/rejected": -385.8285714285714, "loss": 0.1467, "rewards/chosen": 1.9396551724137931, "rewards/margins": 10.511083743842365, "rewards/rejected": -8.571428571428571, "step": 975 }, { "epoch": 0.6929357472488463, "grad_norm": 0.14324371897315216, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106795876.84848484, "logits/rejected": 123055467.35483871, "logps/chosen": -241.93939393939394, "logps/rejected": -407.2258064516129, "loss": 0.1581, "rewards/chosen": 1.9829545454545454, "rewards/margins": 11.160373900293255, "rewards/rejected": -9.17741935483871, "step": 976 }, { "epoch": 0.693645722399716, "grad_norm": 0.1547792445236877, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147768556.30769232, "logits/rejected": 93989026.53968254, "logps/chosen": -219.3230769230769, "logps/rejected": -401.26984126984127, "loss": 0.1842, "rewards/chosen": 1.4153846153846155, "rewards/margins": 10.478876678876679, "rewards/rejected": -9.063492063492063, "step": 977 }, { "epoch": 0.6943556975505857, "grad_norm": 0.16979792931795876, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99414256.94117647, "logits/rejected": 83221981.86666666, "logps/chosen": -200.94117647058823, "logps/rejected": -349.8666666666667, "loss": 0.1668, "rewards/chosen": 1.3308823529411764, "rewards/margins": 9.380882352941176, "rewards/rejected": -8.05, "step": 978 }, { "epoch": 0.6950656727014555, "grad_norm": 0.1442531551743138, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136024505.1076923, "logits/rejected": 76163234.53968254, "logps/chosen": -242.7076923076923, "logps/rejected": -377.9047619047619, "loss": 0.1505, "rewards/chosen": 1.9057692307692307, "rewards/margins": 10.199420024420025, "rewards/rejected": -8.293650793650794, "step": 979 }, { "epoch": 0.6957756478523252, "grad_norm": 0.2425502041160041, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61577245.68115942, "logits/rejected": 77496875.38983051, "logps/chosen": -169.04347826086956, "logps/rejected": -381.0169491525424, "loss": 0.1807, "rewards/chosen": 0.6793478260869565, "rewards/margins": 7.747144436256448, "rewards/rejected": -7.067796610169491, "step": 980 }, { "epoch": 0.6964856230031949, "grad_norm": 0.1430675995411403, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135281072.67605633, "logits/rejected": 44872614.1754386, "logps/chosen": -234.14084507042253, "logps/rejected": -354.8070175438597, "loss": 0.1581, "rewards/chosen": 2.183098591549296, "rewards/margins": 10.8409933283914, "rewards/rejected": -8.657894736842104, "step": 981 }, { "epoch": 0.6971955981540646, "grad_norm": 0.14723433753590442, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104364152.47058824, "logits/rejected": 93532979.2, "logps/chosen": -220.7058823529412, "logps/rejected": -423.6, "loss": 0.1434, "rewards/chosen": 2.014705882352941, "rewards/margins": 9.86470588235294, "rewards/rejected": -7.85, "step": 982 }, { "epoch": 0.6979055733049343, "grad_norm": 0.16245916207243025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81153427.39393939, "logits/rejected": 121634816.0, "logps/chosen": -187.15151515151516, "logps/rejected": -400.51612903225805, "loss": 0.1451, "rewards/chosen": 1.803030303030303, "rewards/margins": 10.206256109481915, "rewards/rejected": -8.403225806451612, "step": 983 }, { "epoch": 0.6986155484558041, "grad_norm": 0.14054873508480695, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119312969.14285715, "logits/rejected": 67072706.20689655, "logps/chosen": -213.25714285714287, "logps/rejected": -344.55172413793105, "loss": 0.1616, "rewards/chosen": 1.8339285714285714, "rewards/margins": 10.523583743842366, "rewards/rejected": -8.689655172413794, "step": 984 }, { "epoch": 0.6993255236066738, "grad_norm": 0.14908335085270719, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153946491.25925925, "logits/rejected": 69376055.35135135, "logps/chosen": -259.25925925925924, "logps/rejected": -359.35135135135135, "loss": 0.1301, "rewards/chosen": 2.252314814814815, "rewards/margins": 10.259071571571571, "rewards/rejected": -8.006756756756756, "step": 985 }, { "epoch": 0.7000354987575435, "grad_norm": 0.18149843613331612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132091038.64788732, "logits/rejected": 89478485.33333333, "logps/chosen": -218.14084507042253, "logps/rejected": -380.0701754385965, "loss": 0.1734, "rewards/chosen": 1.4911971830985915, "rewards/margins": 10.456109463800347, "rewards/rejected": -8.964912280701755, "step": 986 }, { "epoch": 0.7007454739084132, "grad_norm": 0.1733187558433564, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132511570.44067797, "logits/rejected": 48720792.11594203, "logps/chosen": -203.38983050847457, "logps/rejected": -311.6521739130435, "loss": 0.1792, "rewards/chosen": 0.996822033898305, "rewards/margins": 4.59102493244903, "rewards/rejected": -3.5942028985507246, "step": 987 }, { "epoch": 0.7014554490592829, "grad_norm": 0.14749824423631294, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114684255.08571428, "logits/rejected": 117946721.10344827, "logps/chosen": -247.31428571428572, "logps/rejected": -372.9655172413793, "loss": 0.1856, "rewards/chosen": 1.7339285714285715, "rewards/margins": 10.147721674876847, "rewards/rejected": -8.413793103448276, "step": 988 }, { "epoch": 0.7021654242101526, "grad_norm": 0.14726929232046718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140742200.8888889, "logits/rejected": 39620040.86153846, "logps/chosen": -239.23809523809524, "logps/rejected": -344.12307692307695, "loss": 0.1305, "rewards/chosen": 1.9305555555555556, "rewards/margins": 10.961324786324786, "rewards/rejected": -9.03076923076923, "step": 989 }, { "epoch": 0.7028753993610224, "grad_norm": 0.2542321842247692, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137725691.34545454, "logits/rejected": 96756273.09589042, "logps/chosen": -249.3090909090909, "logps/rejected": -398.90410958904107, "loss": 0.1317, "rewards/chosen": 2.147727272727273, "rewards/margins": 10.818960149439603, "rewards/rejected": -8.67123287671233, "step": 990 }, { "epoch": 0.7035853745118921, "grad_norm": 0.1325114122942323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134217728.0, "logits/rejected": 86077134.3283582, "logps/chosen": -255.7377049180328, "logps/rejected": -346.74626865671644, "loss": 0.1478, "rewards/chosen": 1.4364754098360655, "rewards/margins": 9.734982872522632, "rewards/rejected": -8.298507462686567, "step": 991 }, { "epoch": 0.7042953496627617, "grad_norm": 0.14639139803008813, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131257042.8235294, "logits/rejected": 78049006.93333334, "logps/chosen": -216.0, "logps/rejected": -355.2, "loss": 0.1588, "rewards/chosen": 1.90625, "rewards/margins": 9.372916666666667, "rewards/rejected": -7.466666666666667, "step": 992 }, { "epoch": 0.7050053248136315, "grad_norm": 0.16123602173536436, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152552828.34285715, "logits/rejected": 73111057.65517241, "logps/chosen": -281.8285714285714, "logps/rejected": -331.58620689655174, "loss": 0.1576, "rewards/chosen": 2.2035714285714287, "rewards/margins": 7.979433497536945, "rewards/rejected": -5.775862068965517, "step": 993 }, { "epoch": 0.7057152999645012, "grad_norm": 0.16848458605728217, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114511156.82539682, "logits/rejected": 137702226.7076923, "logps/chosen": -243.8095238095238, "logps/rejected": -347.5692307692308, "loss": 0.1539, "rewards/chosen": 1.8363095238095237, "rewards/margins": 9.58246336996337, "rewards/rejected": -7.746153846153846, "step": 994 }, { "epoch": 0.706425275115371, "grad_norm": 0.20353032996710588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127986190.62857144, "logits/rejected": 103700550.62068966, "logps/chosen": -220.57142857142858, "logps/rejected": -422.62068965517244, "loss": 0.1936, "rewards/chosen": 1.4714285714285715, "rewards/margins": 8.367980295566502, "rewards/rejected": -6.896551724137931, "step": 995 }, { "epoch": 0.7071352502662407, "grad_norm": 0.14739983440349388, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112066560.0, "logits/rejected": 75087872.0, "logps/chosen": -211.25, "logps/rejected": -362.25, "loss": 0.1357, "rewards/chosen": 2.15625, "rewards/margins": 10.875, "rewards/rejected": -8.71875, "step": 996 }, { "epoch": 0.7078452254171104, "grad_norm": 0.1855026289225484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125265706.02985075, "logits/rejected": 95643883.01639344, "logps/chosen": -204.4179104477612, "logps/rejected": -368.78688524590166, "loss": 0.1938, "rewards/chosen": 1.5475746268656716, "rewards/margins": -30632236.616359796, "rewards/rejected": 30632238.163934425, "step": 997 }, { "epoch": 0.7085552005679802, "grad_norm": 0.14108669533723475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101089835.38983051, "logits/rejected": 91788391.88405797, "logps/chosen": -195.66101694915255, "logps/rejected": -419.2463768115942, "loss": 0.1545, "rewards/chosen": 1.5635593220338984, "rewards/margins": 9.824428887251289, "rewards/rejected": -8.26086956521739, "step": 998 }, { "epoch": 0.7092651757188498, "grad_norm": 0.14125780117744646, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121119726.03508772, "logits/rejected": 101608491.26760563, "logps/chosen": -203.08771929824562, "logps/rejected": -368.22535211267603, "loss": 0.1602, "rewards/chosen": 1.4714912280701755, "rewards/margins": 9.35177291821102, "rewards/rejected": -7.880281690140845, "step": 999 }, { "epoch": 0.7099751508697195, "grad_norm": 0.22426316865729887, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92806801.19402985, "logits/rejected": 55651881.96721312, "logps/chosen": -188.17910447761193, "logps/rejected": -337.3114754098361, "loss": 0.1577, "rewards/chosen": 1.6380597014925373, "rewards/margins": 9.236420357230243, "rewards/rejected": -7.598360655737705, "step": 1000 }, { "epoch": 0.7106851260205893, "grad_norm": 0.14956751270666507, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 19263355.870967742, "logits/rejected": 118647963.15151516, "logps/chosen": -171.2258064516129, "logps/rejected": -388.8484848484849, "loss": 0.1648, "rewards/chosen": 1.3125, "rewards/margins": 10.456439393939394, "rewards/rejected": -9.143939393939394, "step": 1001 }, { "epoch": 0.711395101171459, "grad_norm": 0.1886957248303164, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137831282.2153846, "logits/rejected": 54093206.34920635, "logps/chosen": -216.6153846153846, "logps/rejected": -353.015873015873, "loss": 0.166, "rewards/chosen": 1.4567307692307692, "rewards/margins": 10.218635531135533, "rewards/rejected": -8.761904761904763, "step": 1002 }, { "epoch": 0.7121050763223287, "grad_norm": 0.19596281080078684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80950067.2, "logits/rejected": 113122846.11764705, "logps/chosen": -230.13333333333333, "logps/rejected": -373.1764705882353, "loss": 0.1482, "rewards/chosen": 1.675, "rewards/margins": 10.27794117647059, "rewards/rejected": -8.602941176470589, "step": 1003 }, { "epoch": 0.7128150514731985, "grad_norm": 0.17609116511491002, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130156576.50793651, "logits/rejected": 54913118.52307692, "logps/chosen": -224.0, "logps/rejected": -350.7692307692308, "loss": 0.1701, "rewards/chosen": 1.7003968253968254, "rewards/margins": 8.161935286935288, "rewards/rejected": -6.461538461538462, "step": 1004 }, { "epoch": 0.7135250266240681, "grad_norm": 0.14750902072348254, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108022393.01818182, "logits/rejected": 83052964.8219178, "logps/chosen": -181.6727272727273, "logps/rejected": -377.4246575342466, "loss": 0.1494, "rewards/chosen": 1.3477272727272727, "rewards/margins": 9.114850560398505, "rewards/rejected": -7.767123287671233, "step": 1005 }, { "epoch": 0.7142350017749379, "grad_norm": 0.18046708066921957, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103649071.72881356, "logits/rejected": 75619046.0289855, "logps/chosen": -252.20338983050848, "logps/rejected": -346.8985507246377, "loss": 0.1401, "rewards/chosen": 2.11228813559322, "rewards/margins": 8.242722918201915, "rewards/rejected": -6.130434782608695, "step": 1006 }, { "epoch": 0.7149449769258076, "grad_norm": 0.15428424653930398, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91142891.68253969, "logits/rejected": 120021622.15384616, "logps/chosen": -248.12698412698413, "logps/rejected": -437.16923076923075, "loss": 0.1625, "rewards/chosen": 2.0317460317460316, "rewards/margins": 11.647130647130647, "rewards/rejected": -9.615384615384615, "step": 1007 }, { "epoch": 0.7156549520766773, "grad_norm": 0.15290433915209964, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142727910.0289855, "logits/rejected": 104857600.0, "logps/chosen": -253.68115942028984, "logps/rejected": -380.7457627118644, "loss": 0.1716, "rewards/chosen": 1.858695652173913, "rewards/margins": 11.384119380987471, "rewards/rejected": -9.525423728813559, "step": 1008 }, { "epoch": 0.7163649272275471, "grad_norm": 0.3173837475157258, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102429318.73684211, "logits/rejected": 49545216.0, "logps/chosen": -204.0, "logps/rejected": -302.15384615384613, "loss": 0.1963, "rewards/chosen": 1.7023026315789473, "rewards/margins": 10.365764170040485, "rewards/rejected": -8.663461538461538, "step": 1009 }, { "epoch": 0.7170749023784168, "grad_norm": 0.15848289655514314, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124472139.29411764, "logits/rejected": 44145049.6, "logps/chosen": -244.47058823529412, "logps/rejected": -366.93333333333334, "loss": 0.1725, "rewards/chosen": 1.630514705882353, "rewards/margins": 10.555514705882354, "rewards/rejected": -8.925, "step": 1010 }, { "epoch": 0.7177848775292864, "grad_norm": 0.15058719404615928, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84756094.14492753, "logits/rejected": 46350613.69491526, "logps/chosen": -187.2463768115942, "logps/rejected": -342.50847457627117, "loss": 0.1726, "rewards/chosen": 1.6621376811594204, "rewards/margins": 10.077391918447555, "rewards/rejected": -8.415254237288135, "step": 1011 }, { "epoch": 0.7184948526801562, "grad_norm": 0.1578917384043509, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105201395.40983607, "logits/rejected": 108050279.1641791, "logps/chosen": -225.31147540983608, "logps/rejected": -372.53731343283584, "loss": 0.149, "rewards/chosen": 0.7684426229508197, "rewards/margins": 8.925159040861267, "rewards/rejected": -8.156716417910447, "step": 1012 }, { "epoch": 0.7192048278310259, "grad_norm": 0.18857132040154123, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73902454.9859155, "logits/rejected": 78808764.63157895, "logps/chosen": -219.49295774647888, "logps/rejected": -420.49122807017545, "loss": 0.1849, "rewards/chosen": 1.6267605633802817, "rewards/margins": 10.082900914257475, "rewards/rejected": -8.456140350877194, "step": 1013 }, { "epoch": 0.7199148029818956, "grad_norm": 0.13697224871035915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86873803.39726028, "logits/rejected": 87165263.12727273, "logps/chosen": -192.0, "logps/rejected": -317.96363636363634, "loss": 0.1612, "rewards/chosen": 1.9554794520547945, "rewards/margins": 10.010024906600249, "rewards/rejected": -8.054545454545455, "step": 1014 }, { "epoch": 0.7206247781327654, "grad_norm": 0.17198220808814532, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122647234.20689656, "logits/rejected": 57237269.942857146, "logps/chosen": -225.6551724137931, "logps/rejected": -300.34285714285716, "loss": 0.1546, "rewards/chosen": 1.6303879310344827, "rewards/margins": 9.751816502463054, "rewards/rejected": -8.121428571428572, "step": 1015 }, { "epoch": 0.721334753283635, "grad_norm": 0.17964942139826265, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127409582.3768116, "logits/rejected": 46954877.83050848, "logps/chosen": -182.4927536231884, "logps/rejected": -343.3220338983051, "loss": 0.1603, "rewards/chosen": 1.6467391304347827, "rewards/margins": -19017101.539701547, "rewards/rejected": 19017103.186440676, "step": 1016 }, { "epoch": 0.7220447284345048, "grad_norm": 0.16287220821546858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96468992.0, "logits/rejected": 109182976.0, "logps/chosen": -230.875, "logps/rejected": -316.0, "loss": 0.1529, "rewards/chosen": 2.0087890625, "rewards/margins": 8.3056640625, "rewards/rejected": -6.296875, "step": 1017 }, { "epoch": 0.7227547035853745, "grad_norm": 0.2162284601037316, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90786386.58064516, "logits/rejected": 133137376.96969697, "logps/chosen": -198.70967741935485, "logps/rejected": -350.54545454545456, "loss": 0.1714, "rewards/chosen": 1.5675403225806452, "rewards/margins": 10.446328201368525, "rewards/rejected": -8.878787878787879, "step": 1018 }, { "epoch": 0.7234646787362442, "grad_norm": 0.17184772204670046, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96726838.55737706, "logits/rejected": 102666545.6716418, "logps/chosen": -166.0327868852459, "logps/rejected": -387.1044776119403, "loss": 0.1641, "rewards/chosen": 1.2756147540983607, "rewards/margins": 9.09651027648642, "rewards/rejected": -7.82089552238806, "step": 1019 }, { "epoch": 0.724174653887114, "grad_norm": 0.1645185585163597, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135730427.80327868, "logits/rejected": 88518594.86567163, "logps/chosen": -263.60655737704917, "logps/rejected": -361.7910447761194, "loss": 0.1453, "rewards/chosen": 1.790983606557377, "rewards/margins": 10.238744800587229, "rewards/rejected": -8.447761194029852, "step": 1020 }, { "epoch": 0.7248846290379837, "grad_norm": 0.154604326899712, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78203474.58064516, "logits/rejected": 107081852.12121212, "logps/chosen": -180.25806451612902, "logps/rejected": -347.6363636363636, "loss": 0.1478, "rewards/chosen": 1.8790322580645162, "rewards/margins": 6.742668621700879, "rewards/rejected": -4.863636363636363, "step": 1021 }, { "epoch": 0.7255946041888534, "grad_norm": 0.20019624120205526, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95251290.83870968, "logits/rejected": 108162203.15151516, "logps/chosen": -190.19354838709677, "logps/rejected": -383.5151515151515, "loss": 0.1764, "rewards/chosen": 1.3296370967741935, "rewards/margins": 9.9356977028348, "rewards/rejected": -8.606060606060606, "step": 1022 }, { "epoch": 0.7263045793397231, "grad_norm": 0.16050156482766814, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86721118.81481482, "logits/rejected": 62801200.432432435, "logps/chosen": -184.88888888888889, "logps/rejected": -308.3243243243243, "loss": 0.1422, "rewards/chosen": 1.6168981481481481, "rewards/margins": 9.285817067067068, "rewards/rejected": -7.668918918918919, "step": 1023 }, { "epoch": 0.7270145544905928, "grad_norm": 0.14867397552707343, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161647144.63492063, "logits/rejected": 35502363.56923077, "logps/chosen": -210.03174603174602, "logps/rejected": -338.2153846153846, "loss": 0.162, "rewards/chosen": 1.5178571428571428, "rewards/margins": 10.433241758241758, "rewards/rejected": -8.915384615384616, "step": 1024 }, { "epoch": 0.7277245296414625, "grad_norm": 0.15803628213374749, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91415199.47540984, "logits/rejected": 53774733.37313433, "logps/chosen": -178.75409836065575, "logps/rejected": -341.0149253731343, "loss": 0.1708, "rewards/chosen": 1.2248975409836065, "rewards/margins": 10.120419929043308, "rewards/rejected": -8.895522388059701, "step": 1025 }, { "epoch": 0.7284345047923323, "grad_norm": 0.15467401241760875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 152372178.14925373, "logits/rejected": 26678523.80327869, "logps/chosen": -231.40298507462686, "logps/rejected": -363.8032786885246, "loss": 0.164, "rewards/chosen": 1.6986940298507462, "rewards/margins": 11.239677636408123, "rewards/rejected": -9.540983606557377, "step": 1026 }, { "epoch": 0.729144479943202, "grad_norm": 0.17769873265283717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127267167.08571428, "logits/rejected": 94227208.8275862, "logps/chosen": -221.02857142857144, "logps/rejected": -376.2758620689655, "loss": 0.1591, "rewards/chosen": 1.5214285714285714, "rewards/margins": 9.754187192118227, "rewards/rejected": -8.232758620689655, "step": 1027 }, { "epoch": 0.7298544550940717, "grad_norm": 0.16748496933942092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84270557.86666666, "logits/rejected": 110902332.23529412, "logps/chosen": -243.2, "logps/rejected": -415.05882352941177, "loss": 0.1625, "rewards/chosen": 1.3833333333333333, "rewards/margins": 10.677450980392157, "rewards/rejected": -9.294117647058824, "step": 1028 }, { "epoch": 0.7305644302449414, "grad_norm": 0.17555405430350868, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162133915.27868852, "logits/rejected": 62225942.92537314, "logps/chosen": -271.08196721311475, "logps/rejected": -332.8955223880597, "loss": 0.1585, "rewards/chosen": 2.0256147540983607, "rewards/margins": 10.756958037680452, "rewards/rejected": -8.73134328358209, "step": 1029 }, { "epoch": 0.7312744053958111, "grad_norm": 0.20822789832518018, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75969331.2, "logits/rejected": 174927149.17647058, "logps/chosen": -292.8, "logps/rejected": -406.5882352941176, "loss": 0.1437, "rewards/chosen": 1.6208333333333333, "rewards/margins": 8.054656862745098, "rewards/rejected": -6.4338235294117645, "step": 1030 }, { "epoch": 0.7319843805466809, "grad_norm": 0.20899832452325823, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138978829.83783785, "logits/rejected": 38253605.925925925, "logps/chosen": -209.94594594594594, "logps/rejected": -371.55555555555554, "loss": 0.175, "rewards/chosen": 1.5929054054054055, "rewards/margins": 10.185497997997999, "rewards/rejected": -8.592592592592593, "step": 1031 }, { "epoch": 0.7326943556975506, "grad_norm": 0.19179973300341818, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147477140.6451613, "logits/rejected": 10104459.636363637, "logps/chosen": -200.51612903225808, "logps/rejected": -321.45454545454544, "loss": 0.1669, "rewards/chosen": 1.1360887096774193, "rewards/margins": 10.105785679374389, "rewards/rejected": -8.969696969696969, "step": 1032 }, { "epoch": 0.7334043308484203, "grad_norm": 0.2471229230816794, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155189248.0, "logits/rejected": 39668163.25423729, "logps/chosen": -238.6086956521739, "logps/rejected": -358.77966101694915, "loss": 0.1937, "rewards/chosen": 1.4166666666666667, "rewards/margins": 10.747175141242938, "rewards/rejected": -9.330508474576272, "step": 1033 }, { "epoch": 0.7341143059992901, "grad_norm": 0.15557416506271365, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100727823.75384615, "logits/rejected": 66509677.71428572, "logps/chosen": -247.3846153846154, "logps/rejected": -346.6666666666667, "loss": 0.1495, "rewards/chosen": 1.626923076923077, "rewards/margins": 9.706288156288156, "rewards/rejected": -8.079365079365079, "step": 1034 }, { "epoch": 0.7348242811501597, "grad_norm": 0.1984828554984502, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97307852.8, "logits/rejected": 88803539.86206897, "logps/chosen": -193.6, "logps/rejected": -330.2068965517241, "loss": 0.1588, "rewards/chosen": 1.7571428571428571, "rewards/margins": 10.877832512315273, "rewards/rejected": -9.120689655172415, "step": 1035 }, { "epoch": 0.7355342563010294, "grad_norm": 0.15846184557144716, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56717006.32835821, "logits/rejected": 114415112.39344262, "logps/chosen": -165.2537313432836, "logps/rejected": -451.672131147541, "loss": 0.1621, "rewards/chosen": 1.4598880597014925, "rewards/margins": 10.869724125275264, "rewards/rejected": -9.40983606557377, "step": 1036 }, { "epoch": 0.7362442314518992, "grad_norm": 0.15953317358749397, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95192464.69565217, "logits/rejected": 65971425.62711865, "logps/chosen": -196.17391304347825, "logps/rejected": -334.3728813559322, "loss": 0.1787, "rewards/chosen": 0.8641304347826086, "rewards/margins": 7.906503316138541, "rewards/rejected": -7.0423728813559325, "step": 1037 }, { "epoch": 0.7369542066027689, "grad_norm": 0.1675922721970083, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124532698.76363637, "logits/rejected": 87103628.2739726, "logps/chosen": -205.96363636363637, "logps/rejected": -369.972602739726, "loss": 0.1422, "rewards/chosen": 1.518181818181818, "rewards/margins": 7.634620174346201, "rewards/rejected": -6.116438356164384, "step": 1038 }, { "epoch": 0.7376641817536386, "grad_norm": 0.2529248636163344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110162160.94117647, "logits/rejected": 65081617.06666667, "logps/chosen": -199.2941176470588, "logps/rejected": -376.0, "loss": 0.1731, "rewards/chosen": 1.630514705882353, "rewards/margins": 11.163848039215686, "rewards/rejected": -9.533333333333333, "step": 1039 }, { "epoch": 0.7383741569045084, "grad_norm": 0.21368319344714182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121586192.51612903, "logits/rejected": 85061756.12121212, "logps/chosen": -241.03225806451613, "logps/rejected": -433.2121212121212, "loss": 0.1686, "rewards/chosen": 1.4183467741935485, "rewards/margins": 11.728952834799609, "rewards/rejected": -10.31060606060606, "step": 1040 }, { "epoch": 0.739084132055378, "grad_norm": 0.1470184678767412, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113743837.28813559, "logits/rejected": 76135735.6521739, "logps/chosen": -198.64406779661016, "logps/rejected": -409.5072463768116, "loss": 0.1234, "rewards/chosen": 1.8029661016949152, "rewards/margins": 10.39716900024564, "rewards/rejected": -8.594202898550725, "step": 1041 }, { "epoch": 0.7397941072062478, "grad_norm": 0.15114125999436331, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97019064.6557377, "logits/rejected": 92525094.20895523, "logps/chosen": -220.327868852459, "logps/rejected": -403.1044776119403, "loss": 0.1467, "rewards/chosen": 2.1741803278688523, "rewards/margins": 9.883135551749449, "rewards/rejected": -7.708955223880597, "step": 1042 }, { "epoch": 0.7405040823571175, "grad_norm": 0.15816572962857683, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135720686.93333334, "logits/rejected": 40123452.23529412, "logps/chosen": -232.26666666666668, "logps/rejected": -326.8235294117647, "loss": 0.1489, "rewards/chosen": 1.58125, "rewards/margins": 8.390073529411765, "rewards/rejected": -6.8088235294117645, "step": 1043 }, { "epoch": 0.7412140575079872, "grad_norm": 0.18106134104332153, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128991232.0, "logits/rejected": 61210624.0, "logps/chosen": -244.375, "logps/rejected": -370.0, "loss": 0.1763, "rewards/chosen": 0.9482421875, "rewards/margins": 9.7060546875, "rewards/rejected": -8.7578125, "step": 1044 }, { "epoch": 0.741924032658857, "grad_norm": 0.17586435908073178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146432718.59649122, "logits/rejected": 63712068.50704225, "logps/chosen": -277.89473684210526, "logps/rejected": -351.5492957746479, "loss": 0.1463, "rewards/chosen": 1.8596491228070176, "rewards/margins": 10.958240672102791, "rewards/rejected": -9.098591549295774, "step": 1045 }, { "epoch": 0.7426340078097267, "grad_norm": 0.16292376052979562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125602400.86486487, "logits/rejected": 55390056.2962963, "logps/chosen": -209.51351351351352, "logps/rejected": -365.3333333333333, "loss": 0.1837, "rewards/chosen": 1.4341216216216217, "rewards/margins": 9.95264014014014, "rewards/rejected": -8.518518518518519, "step": 1046 }, { "epoch": 0.7433439829605963, "grad_norm": 0.16256262627863313, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82901054.06060606, "logits/rejected": 106075301.16129032, "logps/chosen": -252.12121212121212, "logps/rejected": -396.9032258064516, "loss": 0.1537, "rewards/chosen": 2.077651515151515, "rewards/margins": 10.384103128054742, "rewards/rejected": -8.306451612903226, "step": 1047 }, { "epoch": 0.7440539581114661, "grad_norm": 0.15580118138601648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128246176.54237288, "logits/rejected": 87791645.68115942, "logps/chosen": -212.33898305084745, "logps/rejected": -408.1159420289855, "loss": 0.1615, "rewards/chosen": 1.4597457627118644, "rewards/margins": 11.27858634242201, "rewards/rejected": -9.818840579710145, "step": 1048 }, { "epoch": 0.7447639332623358, "grad_norm": 0.14946267149219206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112421099.01639344, "logits/rejected": 83886080.0, "logps/chosen": -192.0, "logps/rejected": -355.82089552238807, "loss": 0.1406, "rewards/chosen": 1.7172131147540983, "rewards/margins": 10.575422069977979, "rewards/rejected": -8.85820895522388, "step": 1049 }, { "epoch": 0.7454739084132055, "grad_norm": 0.17775211461295543, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131338056.59701492, "logits/rejected": 33004359.344262294, "logps/chosen": -191.044776119403, "logps/rejected": -330.75409836065575, "loss": 0.1797, "rewards/chosen": 1.271455223880597, "rewards/margins": 10.14030768289699, "rewards/rejected": -8.868852459016393, "step": 1050 }, { "epoch": 0.7461838835640753, "grad_norm": 0.17375004766279215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115681610.32258065, "logits/rejected": 70635892.36363636, "logps/chosen": -235.09677419354838, "logps/rejected": -338.90909090909093, "loss": 0.1592, "rewards/chosen": 1.9793346774193548, "rewards/margins": 10.403577101661778, "rewards/rejected": -8.424242424242424, "step": 1051 }, { "epoch": 0.746893858714945, "grad_norm": 0.17414457899891486, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83852791.87301587, "logits/rejected": 114472235.32307692, "logps/chosen": -225.52380952380952, "logps/rejected": -420.9230769230769, "loss": 0.1387, "rewards/chosen": 1.8968253968253967, "rewards/margins": 10114894.696825398, "rewards/rejected": -10114892.8, "step": 1052 }, { "epoch": 0.7476038338658147, "grad_norm": 0.157379121548602, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123480309.76, "logits/rejected": 78504708.83018868, "logps/chosen": -205.22666666666666, "logps/rejected": -374.9433962264151, "loss": 0.173, "rewards/chosen": 1.57, "rewards/margins": 10.437924528301886, "rewards/rejected": -8.867924528301886, "step": 1053 }, { "epoch": 0.7483138090166844, "grad_norm": 0.1889962027765158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117330135.57894737, "logits/rejected": 64124455.384615384, "logps/chosen": -202.8421052631579, "logps/rejected": -325.84615384615387, "loss": 0.2032, "rewards/chosen": 1.2549342105263157, "rewards/margins": 9.985703441295545, "rewards/rejected": -8.73076923076923, "step": 1054 }, { "epoch": 0.7490237841675541, "grad_norm": 0.1654675988337385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142481132.8955224, "logits/rejected": 97844173.63934426, "logps/chosen": -221.8507462686567, "logps/rejected": -327.08196721311475, "loss": 0.1513, "rewards/chosen": 2.0634328358208953, "rewards/margins": 8.07162955713237, "rewards/rejected": -6.008196721311475, "step": 1055 }, { "epoch": 0.7497337593184239, "grad_norm": 0.19470995820323206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104734238.11764705, "logits/rejected": 87241523.2, "logps/chosen": -187.2941176470588, "logps/rejected": -340.26666666666665, "loss": 0.1842, "rewards/chosen": 1.2077205882352942, "rewards/margins": 8.741053921568627, "rewards/rejected": -7.533333333333333, "step": 1056 }, { "epoch": 0.7504437344692936, "grad_norm": 0.13676532646199735, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 23667858.285714287, "logits/rejected": 157834885.9076923, "logps/chosen": -211.8095238095238, "logps/rejected": -417.96923076923076, "loss": 0.141, "rewards/chosen": 2.0337301587301586, "rewards/margins": 11.249114774114773, "rewards/rejected": -9.215384615384615, "step": 1057 }, { "epoch": 0.7511537096201633, "grad_norm": 0.17560989643634303, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145592111.72881356, "logits/rejected": 57177785.507246375, "logps/chosen": -221.01694915254237, "logps/rejected": -405.7971014492754, "loss": 0.1543, "rewards/chosen": 1.7097457627118644, "rewards/margins": 10.85467329894375, "rewards/rejected": -9.144927536231885, "step": 1058 }, { "epoch": 0.751863684771033, "grad_norm": 0.17138074335394163, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88760541.4054054, "logits/rejected": 79458759.1111111, "logps/chosen": -179.45945945945945, "logps/rejected": -320.0, "loss": 0.1671, "rewards/chosen": 1.9560810810810811, "rewards/margins": 10.641266266266266, "rewards/rejected": -8.685185185185185, "step": 1059 }, { "epoch": 0.7525736599219027, "grad_norm": 0.16541907897030717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 48742896.484848484, "logits/rejected": 120552414.96774194, "logps/chosen": -170.42424242424244, "logps/rejected": -387.61290322580646, "loss": 0.1624, "rewards/chosen": 1.3238636363636365, "rewards/margins": 10.606121700879767, "rewards/rejected": -9.28225806451613, "step": 1060 }, { "epoch": 0.7532836350727724, "grad_norm": 0.13705967759276558, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123370390.06896552, "logits/rejected": 87181604.57142857, "logps/chosen": -253.51724137931035, "logps/rejected": -374.4, "loss": 0.1359, "rewards/chosen": 1.800646551724138, "rewards/margins": 10.086360837438423, "rewards/rejected": -8.285714285714286, "step": 1061 }, { "epoch": 0.7539936102236422, "grad_norm": 0.15140955247388124, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67616239.48387097, "logits/rejected": 89224285.0909091, "logps/chosen": -177.16129032258064, "logps/rejected": -350.54545454545456, "loss": 0.1452, "rewards/chosen": 1.5201612903225807, "rewards/margins": 9.95955522971652, "rewards/rejected": -8.43939393939394, "step": 1062 }, { "epoch": 0.7547035853745119, "grad_norm": 0.19437312568679305, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111590561.68421052, "logits/rejected": 55332548.92307692, "logps/chosen": -224.6315789473684, "logps/rejected": -397.53846153846155, "loss": 0.1829, "rewards/chosen": 1.5855263157894737, "rewards/margins": 855778.8162955466, "rewards/rejected": -855777.2307692308, "step": 1063 }, { "epoch": 0.7554135605253817, "grad_norm": 0.16525498734697355, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112047835.42857143, "logits/rejected": 70182276.4137931, "logps/chosen": -212.57142857142858, "logps/rejected": -388.9655172413793, "loss": 0.185, "rewards/chosen": 1.2705357142857143, "rewards/margins": 11.675708128078817, "rewards/rejected": -10.405172413793103, "step": 1064 }, { "epoch": 0.7561235356762513, "grad_norm": 0.18365069752917038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99017185.88235295, "logits/rejected": 119258043.73333333, "logps/chosen": -243.76470588235293, "logps/rejected": -387.46666666666664, "loss": 0.1524, "rewards/chosen": 1.2683823529411764, "rewards/margins": 9.085049019607842, "rewards/rejected": -7.816666666666666, "step": 1065 }, { "epoch": 0.756833510827121, "grad_norm": 0.1525907677341466, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 65485262.451612905, "logits/rejected": 133582227.39393939, "logps/chosen": -165.09677419354838, "logps/rejected": -371.3939393939394, "loss": 0.1525, "rewards/chosen": 1.6108870967741935, "rewards/margins": 10.285129521016618, "rewards/rejected": -8.674242424242424, "step": 1066 }, { "epoch": 0.7575434859779908, "grad_norm": 0.14605200353556522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117512827.5862069, "logits/rejected": 78193810.28571428, "logps/chosen": -189.51724137931035, "logps/rejected": -334.62857142857143, "loss": 0.14, "rewards/chosen": 1.478448275862069, "rewards/margins": 9.292733990147783, "rewards/rejected": -7.814285714285714, "step": 1067 }, { "epoch": 0.7582534611288605, "grad_norm": 0.2002806711643844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92574281.14285715, "logits/rejected": 77305361.65517241, "logps/chosen": -187.88571428571427, "logps/rejected": -345.6551724137931, "loss": 0.1865, "rewards/chosen": 1.4696428571428573, "rewards/margins": 9.254125615763547, "rewards/rejected": -7.7844827586206895, "step": 1068 }, { "epoch": 0.7589634362797302, "grad_norm": 0.1784020385553684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128492170.15873016, "logits/rejected": 59914019.44615385, "logps/chosen": -230.34920634920636, "logps/rejected": -346.83076923076925, "loss": 0.155, "rewards/chosen": 1.5317460317460319, "rewards/margins": 10.531746031746032, "rewards/rejected": -9.0, "step": 1069 }, { "epoch": 0.7596734114306, "grad_norm": 0.22718442750165418, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131905483.48717949, "logits/rejected": 106157834.24, "logps/chosen": -251.7948717948718, "logps/rejected": -407.68, "loss": 0.2008, "rewards/chosen": 1.7403846153846154, "rewards/margins": 9.020384615384616, "rewards/rejected": -7.28, "step": 1070 }, { "epoch": 0.7603833865814696, "grad_norm": 0.12008531038861585, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123999252.07843137, "logits/rejected": 93527532.05194806, "logps/chosen": -295.52941176470586, "logps/rejected": -374.4415584415584, "loss": 0.1067, "rewards/chosen": 2.6813725490196076, "rewards/margins": 11.16189202953909, "rewards/rejected": -8.480519480519481, "step": 1071 }, { "epoch": 0.7610933617323393, "grad_norm": 0.16461008174612937, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136052736.0, "logits/rejected": 95264768.0, "logps/chosen": -222.875, "logps/rejected": -425.75, "loss": 0.1615, "rewards/chosen": 1.767578125, "rewards/margins": 11.400390625, "rewards/rejected": -9.6328125, "step": 1072 }, { "epoch": 0.7618033368832091, "grad_norm": 0.1754042511838401, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 154989519.23809522, "logits/rejected": 46395455.015384614, "logps/chosen": -222.47619047619048, "logps/rejected": -364.3076923076923, "loss": 0.1676, "rewards/chosen": 1.4166666666666667, "rewards/margins": 7.716666666666667, "rewards/rejected": -6.3, "step": 1073 }, { "epoch": 0.7625133120340788, "grad_norm": 0.1444995562445229, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111914682.92063493, "logits/rejected": 98824255.01538461, "logps/chosen": -211.68253968253967, "logps/rejected": -336.24615384615385, "loss": 0.1773, "rewards/chosen": 1.4751984126984128, "rewards/margins": 10.059813797313797, "rewards/rejected": -8.584615384615384, "step": 1074 }, { "epoch": 0.7632232871849485, "grad_norm": 0.16256897725582561, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133948094.17142858, "logits/rejected": 81427350.06896552, "logps/chosen": -174.97142857142856, "logps/rejected": -307.0344827586207, "loss": 0.1744, "rewards/chosen": 1.4785714285714286, "rewards/margins": 10.263054187192118, "rewards/rejected": -8.78448275862069, "step": 1075 }, { "epoch": 0.7639332623358183, "grad_norm": 0.18003006321816317, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84526019.76470588, "logits/rejected": 105277030.4, "logps/chosen": -227.76470588235293, "logps/rejected": -367.46666666666664, "loss": 0.1589, "rewards/chosen": 2.1066176470588234, "rewards/margins": 10.764950980392157, "rewards/rejected": -8.658333333333333, "step": 1076 }, { "epoch": 0.7646432374866879, "grad_norm": 0.1500330536721157, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117755084.8, "logits/rejected": 74572257.88235295, "logps/chosen": -239.33333333333334, "logps/rejected": -358.11764705882354, "loss": 0.15, "rewards/chosen": 1.8510416666666667, "rewards/margins": -11210773.678370098, "rewards/rejected": 11210775.529411765, "step": 1077 }, { "epoch": 0.7653532126375577, "grad_norm": 0.2102198612573654, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112447292.95238096, "logits/rejected": 23294519.138461538, "logps/chosen": -235.93650793650792, "logps/rejected": -289.96923076923076, "loss": 0.146, "rewards/chosen": 1.5654761904761905, "rewards/margins": 9.480860805860805, "rewards/rejected": -7.915384615384616, "step": 1078 }, { "epoch": 0.7660631877884274, "grad_norm": 0.1690993062525537, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 64117338.35294118, "logits/rejected": 99964245.33333333, "logps/chosen": -197.1764705882353, "logps/rejected": -364.26666666666665, "loss": 0.1683, "rewards/chosen": 1.599264705882353, "rewards/margins": 9.14093137254902, "rewards/rejected": -7.541666666666667, "step": 1079 }, { "epoch": 0.7667731629392971, "grad_norm": 0.15455256338831658, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95468811.81538461, "logits/rejected": 133252372.31746031, "logps/chosen": -213.16923076923078, "logps/rejected": -422.6031746031746, "loss": 0.1668, "rewards/chosen": 1.3461538461538463, "rewards/margins": 10.242979242979244, "rewards/rejected": -8.896825396825397, "step": 1080 }, { "epoch": 0.7674831380901669, "grad_norm": 0.15954225888174797, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111049191.61904761, "logits/rejected": 81498553.1076923, "logps/chosen": -168.76190476190476, "logps/rejected": -377.10769230769233, "loss": 0.1576, "rewards/chosen": 1.4712301587301588, "rewards/margins": 10.178922466422465, "rewards/rejected": -8.707692307692307, "step": 1081 }, { "epoch": 0.7681931132410366, "grad_norm": 0.13196605995856475, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109351497.14285715, "logits/rejected": 109634446.22222222, "logps/chosen": -218.57142857142858, "logps/rejected": -379.55555555555554, "loss": 0.136, "rewards/chosen": 2.1138392857142856, "rewards/margins": 10.32217261904762, "rewards/rejected": -8.208333333333334, "step": 1082 }, { "epoch": 0.7689030883919062, "grad_norm": 0.17604102123599086, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105651116.97297297, "logits/rejected": 80856860.44444445, "logps/chosen": -219.24324324324326, "logps/rejected": -317.037037037037, "loss": 0.175, "rewards/chosen": 0.7635135135135135, "rewards/margins": 9.06906906906907, "rewards/rejected": -8.305555555555555, "step": 1083 }, { "epoch": 0.769613063542776, "grad_norm": 0.1612595945832187, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109411415.77142857, "logits/rejected": 71556272.55172414, "logps/chosen": -205.02857142857144, "logps/rejected": -333.7931034482759, "loss": 0.1684, "rewards/chosen": 1.7678571428571428, "rewards/margins": 10.423029556650246, "rewards/rejected": -8.655172413793103, "step": 1084 }, { "epoch": 0.7703230386936457, "grad_norm": 0.14265905075145013, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88661133.78461538, "logits/rejected": 89728146.28571428, "logps/chosen": -199.3846153846154, "logps/rejected": -392.12698412698415, "loss": 0.1494, "rewards/chosen": 1.3942307692307692, "rewards/margins": 10.798992673992675, "rewards/rejected": -9.404761904761905, "step": 1085 }, { "epoch": 0.7710330138445154, "grad_norm": 0.15007785383485245, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142606336.0, "logits/rejected": 39556625.655172415, "logps/chosen": -220.11428571428573, "logps/rejected": -363.58620689655174, "loss": 0.1651, "rewards/chosen": 2.0035714285714286, "rewards/margins": 8.857019704433498, "rewards/rejected": -6.853448275862069, "step": 1086 }, { "epoch": 0.7717429889953852, "grad_norm": 0.15295950378844717, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124049718.3030303, "logits/rejected": 84156680.25806452, "logps/chosen": -210.3030303030303, "logps/rejected": -408.258064516129, "loss": 0.1359, "rewards/chosen": 2.0606060606060606, "rewards/margins": 12.23802541544477, "rewards/rejected": -10.17741935483871, "step": 1087 }, { "epoch": 0.7724529641462549, "grad_norm": 0.14255297845208015, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77042741.89473684, "logits/rejected": 97089276.39436619, "logps/chosen": -192.56140350877192, "logps/rejected": -276.7323943661972, "loss": 0.1543, "rewards/chosen": 1.5054824561403508, "rewards/margins": 9.449144427971337, "rewards/rejected": -7.943661971830986, "step": 1088 }, { "epoch": 0.7731629392971247, "grad_norm": 0.1814503942866232, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95582871.43661971, "logits/rejected": 101914228.77192983, "logps/chosen": -191.09859154929578, "logps/rejected": -405.3333333333333, "loss": 0.1772, "rewards/chosen": 1.2711267605633803, "rewards/margins": -1729349.044662713, "rewards/rejected": 1729350.3157894737, "step": 1089 }, { "epoch": 0.7738729144479943, "grad_norm": 0.17809287666274895, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117440512.0, "logits/rejected": 66727563.63636363, "logps/chosen": -206.02739726027397, "logps/rejected": -334.54545454545456, "loss": 0.1836, "rewards/chosen": 1.6883561643835616, "rewards/margins": 10.342901618929016, "rewards/rejected": -8.654545454545454, "step": 1090 }, { "epoch": 0.774582889598864, "grad_norm": 0.13720137319211004, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134217728.0, "logits/rejected": 73497111.63076924, "logps/chosen": -225.77777777777777, "logps/rejected": -332.8, "loss": 0.1469, "rewards/chosen": 1.1111111111111112, "rewards/margins": 9.803418803418802, "rewards/rejected": -8.692307692307692, "step": 1091 }, { "epoch": 0.7752928647497338, "grad_norm": 0.15339008144097158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 89012451.55555555, "logits/rejected": 80982331.07692307, "logps/chosen": -240.5079365079365, "logps/rejected": -353.96923076923076, "loss": 0.1569, "rewards/chosen": 1.4260912698412698, "rewards/margins": 8.395322039072038, "rewards/rejected": -6.969230769230769, "step": 1092 }, { "epoch": 0.7760028399006035, "grad_norm": 0.17493194708935053, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83639356.23529412, "logits/rejected": 104228454.4, "logps/chosen": -184.0, "logps/rejected": -344.53333333333336, "loss": 0.1611, "rewards/chosen": 1.619485294117647, "rewards/margins": 9.619485294117647, "rewards/rejected": -8.0, "step": 1093 }, { "epoch": 0.7767128150514732, "grad_norm": 0.1526216733552698, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128705214.17142858, "logits/rejected": 26250557.79310345, "logps/chosen": -211.88571428571427, "logps/rejected": -350.0689655172414, "loss": 0.1638, "rewards/chosen": 1.8607142857142858, "rewards/margins": 11.9814039408867, "rewards/rejected": -10.120689655172415, "step": 1094 }, { "epoch": 0.777422790202343, "grad_norm": 0.1952433695686494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106488718.22222222, "logits/rejected": 134367524.57142857, "logps/chosen": -196.66666666666666, "logps/rejected": -371.42857142857144, "loss": 0.1802, "rewards/chosen": 1.4496527777777777, "rewards/margins": 10.146081349206348, "rewards/rejected": -8.696428571428571, "step": 1095 }, { "epoch": 0.7781327653532126, "grad_norm": 0.17156240360872005, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165365592.13114753, "logits/rejected": 35275974.686567165, "logps/chosen": -233.70491803278688, "logps/rejected": -370.6268656716418, "loss": 0.1637, "rewards/chosen": 1.459016393442623, "rewards/margins": 9.869464154636653, "rewards/rejected": -8.41044776119403, "step": 1096 }, { "epoch": 0.7788427405040823, "grad_norm": 0.19226571164295203, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84156680.25806452, "logits/rejected": 147499690.66666666, "logps/chosen": -230.32258064516128, "logps/rejected": -416.72727272727275, "loss": 0.1454, "rewards/chosen": 2.0141129032258065, "rewards/margins": 11.18835532746823, "rewards/rejected": -9.174242424242424, "step": 1097 }, { "epoch": 0.7795527156549521, "grad_norm": 0.1614467615828684, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85777628.86274509, "logits/rejected": 107526702.54545455, "logps/chosen": -221.49019607843138, "logps/rejected": -368.6233766233766, "loss": 0.1281, "rewards/chosen": 1.5735294117647058, "rewards/margins": 10.625477463712759, "rewards/rejected": -9.051948051948052, "step": 1098 }, { "epoch": 0.7802626908058218, "grad_norm": 0.15174318954686927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125119314.70769231, "logits/rejected": 61466526.47619048, "logps/chosen": -209.35384615384615, "logps/rejected": -338.2857142857143, "loss": 0.1566, "rewards/chosen": 1.6153846153846154, "rewards/margins": 10.615384615384615, "rewards/rejected": -9.0, "step": 1099 }, { "epoch": 0.7809726659566916, "grad_norm": 0.15234809389219386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108714861.71428572, "logits/rejected": 95769941.33333333, "logps/chosen": -204.0, "logps/rejected": -375.1111111111111, "loss": 0.1519, "rewards/chosen": 1.046875, "rewards/margins": 9.519097222222221, "rewards/rejected": -8.472222222222221, "step": 1100 }, { "epoch": 0.7816826411075612, "grad_norm": 0.18331079035186454, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145196935.52941176, "logits/rejected": 91960115.2, "logps/chosen": -256.2352941176471, "logps/rejected": -325.06666666666666, "loss": 0.1759, "rewards/chosen": 1.4797794117647058, "rewards/margins": 9.438112745098039, "rewards/rejected": -7.958333333333333, "step": 1101 }, { "epoch": 0.7823926162584309, "grad_norm": 0.15332119868416377, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140771328.0, "logits/rejected": 75563008.0, "logps/chosen": -281.75, "logps/rejected": -357.0, "loss": 0.1533, "rewards/chosen": 1.810546875, "rewards/margins": 10.388671875, "rewards/rejected": -8.578125, "step": 1102 }, { "epoch": 0.7831025914093007, "grad_norm": 0.1795742582368739, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112102306.9090909, "logits/rejected": 83277229.41935484, "logps/chosen": -226.9090909090909, "logps/rejected": -387.0967741935484, "loss": 0.1603, "rewards/chosen": 1.4053030303030303, "rewards/margins": 8.744012707722385, "rewards/rejected": -7.338709677419355, "step": 1103 }, { "epoch": 0.7838125665601704, "grad_norm": 0.1429643871626653, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83129730.09836066, "logits/rejected": 80877292.89552239, "logps/chosen": -213.24590163934425, "logps/rejected": -342.44776119402985, "loss": 0.1493, "rewards/chosen": 1.3422131147540983, "rewards/margins": 11.028780278933203, "rewards/rejected": -9.686567164179104, "step": 1104 }, { "epoch": 0.7845225417110401, "grad_norm": 0.16634524271911977, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147436140.6060606, "logits/rejected": 70220766.96774194, "logps/chosen": -234.9090909090909, "logps/rejected": -332.1290322580645, "loss": 0.1565, "rewards/chosen": 2.009469696969697, "rewards/margins": 11.735276148582601, "rewards/rejected": -9.725806451612904, "step": 1105 }, { "epoch": 0.7852325168619099, "grad_norm": 0.1443775966340912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87440574.91525424, "logits/rejected": 58933010.55072464, "logps/chosen": -167.1864406779661, "logps/rejected": -331.1304347826087, "loss": 0.1275, "rewards/chosen": 1.9194915254237288, "rewards/margins": 10.948477032670105, "rewards/rejected": -9.028985507246377, "step": 1106 }, { "epoch": 0.7859424920127795, "grad_norm": 0.16879587555454742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93714523.70149253, "logits/rejected": 70443679.47540984, "logps/chosen": -199.6417910447761, "logps/rejected": -385.57377049180326, "loss": 0.1611, "rewards/chosen": 1.2546641791044777, "rewards/margins": 11.221877293858576, "rewards/rejected": -9.967213114754099, "step": 1107 }, { "epoch": 0.7866524671636492, "grad_norm": 0.17388314277850223, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135499320.8888889, "logits/rejected": 86807113.14285715, "logps/chosen": -249.33333333333334, "logps/rejected": -390.2857142857143, "loss": 0.1606, "rewards/chosen": 2.078125, "rewards/margins": 9.935267857142858, "rewards/rejected": -7.857142857142857, "step": 1108 }, { "epoch": 0.787362442314519, "grad_norm": 0.15799181953554225, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111969680.69565217, "logits/rejected": 87867114.30508475, "logps/chosen": -238.3768115942029, "logps/rejected": -374.77966101694915, "loss": 0.1646, "rewards/chosen": 2.045289855072464, "rewards/margins": 9.299527143208056, "rewards/rejected": -7.254237288135593, "step": 1109 }, { "epoch": 0.7880724174653887, "grad_norm": 0.16879395038052655, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106857960.36923076, "logits/rejected": 82354826.15873016, "logps/chosen": -241.23076923076923, "logps/rejected": -398.22222222222223, "loss": 0.1706, "rewards/chosen": 1.5432692307692308, "rewards/margins": 8.606761294261293, "rewards/rejected": -7.063492063492063, "step": 1110 }, { "epoch": 0.7887823926162584, "grad_norm": 0.1741503048275345, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103042154.98507462, "logits/rejected": 92618483.40983607, "logps/chosen": -227.34328358208955, "logps/rejected": -375.60655737704917, "loss": 0.1552, "rewards/chosen": 1.7789179104477613, "rewards/margins": 7.467442500611696, "rewards/rejected": -5.688524590163935, "step": 1111 }, { "epoch": 0.7894923677671282, "grad_norm": 0.1452769612579996, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96345630.11764705, "logits/rejected": 105696460.8, "logps/chosen": -226.58823529411765, "logps/rejected": -411.73333333333335, "loss": 0.1474, "rewards/chosen": 2.238970588235294, "rewards/margins": 9.10563725490196, "rewards/rejected": -6.866666666666666, "step": 1112 }, { "epoch": 0.7902023429179978, "grad_norm": 0.14368716760379038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123253670.1754386, "logits/rejected": 42489481.01408451, "logps/chosen": -197.05263157894737, "logps/rejected": -395.71830985915494, "loss": 0.1438, "rewards/chosen": 1.0526315789473684, "rewards/margins": 8.285025945144552, "rewards/rejected": -7.232394366197183, "step": 1113 }, { "epoch": 0.7909123180688676, "grad_norm": 0.14287553862389882, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106857960.36923076, "logits/rejected": 81555911.1111111, "logps/chosen": -185.1076923076923, "logps/rejected": -350.4761904761905, "loss": 0.1489, "rewards/chosen": 1.5038461538461538, "rewards/margins": 10.805433455433455, "rewards/rejected": -9.301587301587302, "step": 1114 }, { "epoch": 0.7916222932197373, "grad_norm": 0.1971828592202541, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90681914.32911393, "logits/rejected": 85940432.97959183, "logps/chosen": -231.08860759493672, "logps/rejected": -350.6938775510204, "loss": 0.2002, "rewards/chosen": 1.4430379746835442, "rewards/margins": 7.56548695427538, "rewards/rejected": -6.122448979591836, "step": 1115 }, { "epoch": 0.792332268370607, "grad_norm": 0.1668744609916945, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103586598.78787878, "logits/rejected": 105162025.29032259, "logps/chosen": -181.0909090909091, "logps/rejected": -369.2903225806452, "loss": 0.1722, "rewards/chosen": 1.603219696969697, "rewards/margins": 9.970155180840665, "rewards/rejected": -8.366935483870968, "step": 1116 }, { "epoch": 0.7930422435214768, "grad_norm": 0.17271383600210144, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 136282616.12307692, "logits/rejected": 44106768.253968254, "logps/chosen": -214.15384615384616, "logps/rejected": -374.85714285714283, "loss": 0.1757, "rewards/chosen": 1.0711538461538461, "rewards/margins": 10.880677655677657, "rewards/rejected": -9.80952380952381, "step": 1117 }, { "epoch": 0.7937522186723465, "grad_norm": 0.18549147502201932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120537844.18461539, "logits/rejected": 83220317.46031746, "logps/chosen": -196.06153846153848, "logps/rejected": -345.14285714285717, "loss": 0.1764, "rewards/chosen": 1.7134615384615384, "rewards/margins": 10.269017094017094, "rewards/rejected": -8.555555555555555, "step": 1118 }, { "epoch": 0.7944621938232161, "grad_norm": 0.17557664700222428, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91947008.0, "logits/rejected": 81133568.0, "logps/chosen": -186.25, "logps/rejected": -330.0, "loss": 0.1634, "rewards/chosen": 1.62109375, "rewards/margins": 9.62109375, "rewards/rejected": -8.0, "step": 1119 }, { "epoch": 0.7951721689740859, "grad_norm": 0.15470941461825777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121141368.47058824, "logits/rejected": 53687091.2, "logps/chosen": -233.41176470588235, "logps/rejected": -318.6666666666667, "loss": 0.174, "rewards/chosen": 1.822610294117647, "rewards/margins": 9.497610294117647, "rewards/rejected": -7.675, "step": 1120 }, { "epoch": 0.7958821441249556, "grad_norm": 0.16932912826115074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71172096.0, "logits/rejected": 113639424.0, "logps/chosen": -201.75, "logps/rejected": -416.5, "loss": 0.1646, "rewards/chosen": 1.755859375, "rewards/margins": 12.005859375, "rewards/rejected": -10.25, "step": 1121 }, { "epoch": 0.7965921192758253, "grad_norm": 0.16253791786052982, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91177716.18461539, "logits/rejected": 51596596.82539683, "logps/chosen": -203.07692307692307, "logps/rejected": -355.8095238095238, "loss": 0.162, "rewards/chosen": 1.1865384615384615, "rewards/margins": 10.218284493284493, "rewards/rejected": -9.031746031746032, "step": 1122 }, { "epoch": 0.7973020944266951, "grad_norm": 0.15368661929164534, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 155511886.76923078, "logits/rejected": 50773153.684210524, "logps/chosen": -188.6153846153846, "logps/rejected": -370.5263157894737, "loss": 0.1374, "rewards/chosen": 1.2475961538461537, "rewards/margins": 10.471280364372468, "rewards/rejected": -9.223684210526315, "step": 1123 }, { "epoch": 0.7980120695775648, "grad_norm": 0.21749800028850103, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106741997.44927536, "logits/rejected": 77110324.0677966, "logps/chosen": -232.1159420289855, "logps/rejected": -421.4237288135593, "loss": 0.145, "rewards/chosen": 2.2246376811594204, "rewards/margins": 10.089044460820437, "rewards/rejected": -7.864406779661017, "step": 1124 }, { "epoch": 0.7987220447284346, "grad_norm": 0.1766866132083025, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 140697600.0, "logits/rejected": 8708096.0, "logps/chosen": -219.75, "logps/rejected": -328.5, "loss": 0.1668, "rewards/chosen": 1.478515625, "rewards/margins": 9.048828125, "rewards/rejected": -7.5703125, "step": 1125 }, { "epoch": 0.7994320198793042, "grad_norm": 0.1498955041872987, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102828098.06451613, "logits/rejected": 54641136.484848484, "logps/chosen": -209.80645161290323, "logps/rejected": -349.09090909090907, "loss": 0.1493, "rewards/chosen": 1.6370967741935485, "rewards/margins": 11.015884652981427, "rewards/rejected": -9.378787878787879, "step": 1126 }, { "epoch": 0.8001419950301739, "grad_norm": 0.163602731935244, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 185880901.07936507, "logits/rejected": 72771174.4, "logps/chosen": -229.33333333333334, "logps/rejected": -363.81538461538463, "loss": 0.1692, "rewards/chosen": 1.3412698412698412, "rewards/margins": 10.402808302808303, "rewards/rejected": -9.061538461538461, "step": 1127 }, { "epoch": 0.8008519701810437, "grad_norm": 0.14346968056811338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 45298483.2, "logits/rejected": 122520681.93103448, "logps/chosen": -257.14285714285717, "logps/rejected": -384.55172413793105, "loss": 0.1376, "rewards/chosen": 2.8214285714285716, "rewards/margins": -16904136.213054188, "rewards/rejected": 16904139.03448276, "step": 1128 }, { "epoch": 0.8015619453319134, "grad_norm": 0.1486881304333515, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79159165.96825397, "logits/rejected": 99437268.67692308, "logps/chosen": -192.12698412698413, "logps/rejected": -355.9384615384615, "loss": 0.141, "rewards/chosen": 1.5625, "rewards/margins": 10.300961538461538, "rewards/rejected": -8.738461538461538, "step": 1129 }, { "epoch": 0.8022719204827831, "grad_norm": 0.1793329568395936, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114751855.58974358, "logits/rejected": 80111206.4, "logps/chosen": -206.15384615384616, "logps/rejected": -366.4, "loss": 0.1686, "rewards/chosen": 2.0961538461538463, "rewards/margins": 8.486153846153845, "rewards/rejected": -6.39, "step": 1130 }, { "epoch": 0.8029818956336529, "grad_norm": 0.16043126308913871, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96833714.08695652, "logits/rejected": 114596916.0677966, "logps/chosen": -210.31884057971016, "logps/rejected": -388.8813559322034, "loss": 0.1582, "rewards/chosen": 1.806159420289855, "rewards/margins": 10.170566199950873, "rewards/rejected": -8.364406779661017, "step": 1131 }, { "epoch": 0.8036918707845225, "grad_norm": 0.16006242352477001, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104366080.0, "logits/rejected": 109772800.0, "logps/chosen": -235.75, "logps/rejected": -362.5, "loss": 0.1554, "rewards/chosen": 1.978515625, "rewards/margins": 9.564453125, "rewards/rejected": -7.5859375, "step": 1132 }, { "epoch": 0.8044018459353922, "grad_norm": 0.14375518145465319, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76148312.27586207, "logits/rejected": 80268492.8, "logps/chosen": -206.48275862068965, "logps/rejected": -328.22857142857146, "loss": 0.1472, "rewards/chosen": 1.3599137931034482, "rewards/margins": 8.317056650246306, "rewards/rejected": -6.957142857142857, "step": 1133 }, { "epoch": 0.805111821086262, "grad_norm": 0.1482594287738332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 92490571.29411764, "logits/rejected": 67796992.0, "logps/chosen": -166.11764705882354, "logps/rejected": -358.1333333333333, "loss": 0.1592, "rewards/chosen": 1.5873161764705883, "rewards/margins": 5269195.1873161765, "rewards/rejected": -5269193.6, "step": 1134 }, { "epoch": 0.8058217962371317, "grad_norm": 0.1405317158702092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112197632.0, "logits/rejected": 104487514.35294117, "logps/chosen": -221.33333333333334, "logps/rejected": -371.7647058823529, "loss": 0.1382, "rewards/chosen": 2.20625, "rewards/margins": 11.162132352941175, "rewards/rejected": -8.955882352941176, "step": 1135 }, { "epoch": 0.8065317713880015, "grad_norm": 0.17298271902450438, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97555017.14285715, "logits/rejected": 68914744.8888889, "logps/chosen": -206.85714285714286, "logps/rejected": -320.8888888888889, "loss": 0.1642, "rewards/chosen": 1.443359375, "rewards/margins": 9.360026041666668, "rewards/rejected": -7.916666666666667, "step": 1136 }, { "epoch": 0.8072417465388712, "grad_norm": 0.16948751967769188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116628711.22580644, "logits/rejected": 46840366.54545455, "logps/chosen": -201.03225806451613, "logps/rejected": -358.7878787878788, "loss": 0.1512, "rewards/chosen": 1.7560483870967742, "rewards/margins": 11.528775659824047, "rewards/rejected": -9.772727272727273, "step": 1137 }, { "epoch": 0.8079517216897408, "grad_norm": 0.19010996338488256, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72520521.64383562, "logits/rejected": 93342329.01818182, "logps/chosen": -175.23287671232876, "logps/rejected": -406.6909090909091, "loss": 0.1843, "rewards/chosen": 1.5702054794520548, "rewards/margins": 10.706569115815691, "rewards/rejected": -9.136363636363637, "step": 1138 }, { "epoch": 0.8086616968406106, "grad_norm": 0.15323500668724285, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63550581.50819672, "logits/rejected": 72367394.3880597, "logps/chosen": -203.54098360655738, "logps/rejected": -394.02985074626866, "loss": 0.1507, "rewards/chosen": 1.2346311475409837, "rewards/margins": 10.697317714705163, "rewards/rejected": -9.462686567164178, "step": 1139 }, { "epoch": 0.8093716719914803, "grad_norm": 0.18652290518189388, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81048756.70588236, "logits/rejected": 102620637.86666666, "logps/chosen": -172.94117647058823, "logps/rejected": -382.4, "loss": 0.1609, "rewards/chosen": 1.1875, "rewards/margins": 7.5875, "rewards/rejected": -6.4, "step": 1140 }, { "epoch": 0.81008164714235, "grad_norm": 0.16206472226765972, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106700551.75757575, "logits/rejected": 63523410.58064516, "logps/chosen": -248.4848484848485, "logps/rejected": -402.06451612903226, "loss": 0.1653, "rewards/chosen": 1.331439393939394, "rewards/margins": 10.484665200391007, "rewards/rejected": -9.153225806451612, "step": 1141 }, { "epoch": 0.8107916222932198, "grad_norm": 0.22419049549223588, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96606510.16393442, "logits/rejected": 74652351.04477613, "logps/chosen": -247.60655737704917, "logps/rejected": -369.1940298507463, "loss": 0.1407, "rewards/chosen": 1.8944672131147542, "rewards/margins": 12.424317959383412, "rewards/rejected": -10.529850746268657, "step": 1142 }, { "epoch": 0.8115015974440895, "grad_norm": 0.1604571568722953, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137425136.94117647, "logits/rejected": 36455492.266666666, "logps/chosen": -195.05882352941177, "logps/rejected": -346.1333333333333, "loss": 0.1819, "rewards/chosen": 1.619485294117647, "rewards/margins": 11.036151960784313, "rewards/rejected": -9.416666666666666, "step": 1143 }, { "epoch": 0.8122115725949591, "grad_norm": 0.269458191721884, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111448649.14285715, "logits/rejected": 32903591.724137932, "logps/chosen": -193.94285714285715, "logps/rejected": -362.48275862068965, "loss": 0.2039, "rewards/chosen": 0.9, "rewards/margins": 10.641379310344828, "rewards/rejected": -9.741379310344827, "step": 1144 }, { "epoch": 0.8129215477458289, "grad_norm": 0.12718461394496208, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114396259.09677419, "logits/rejected": 42832740.84848485, "logps/chosen": -272.258064516129, "logps/rejected": -354.90909090909093, "loss": 0.1343, "rewards/chosen": 1.0625, "rewards/margins": 9.820075757575758, "rewards/rejected": -8.757575757575758, "step": 1145 }, { "epoch": 0.8136315228966986, "grad_norm": 0.1318071759733228, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82153650.08695652, "logits/rejected": 80082770.44067797, "logps/chosen": -180.8695652173913, "logps/rejected": -362.03389830508473, "loss": 0.1565, "rewards/chosen": 1.701086956521739, "rewards/margins": 11.38752763448784, "rewards/rejected": -9.686440677966102, "step": 1146 }, { "epoch": 0.8143414980475683, "grad_norm": 0.15927076429225143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129661846.06896552, "logits/rejected": 51140549.48571429, "logps/chosen": -279.17241379310343, "logps/rejected": -360.6857142857143, "loss": 0.1095, "rewards/chosen": 3.1379310344827585, "rewards/margins": 11.645073891625616, "rewards/rejected": -8.507142857142858, "step": 1147 }, { "epoch": 0.8150514731984381, "grad_norm": 0.1518039876372811, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 91919238.50847457, "logits/rejected": 85892051.47826087, "logps/chosen": -173.01694915254237, "logps/rejected": -382.1449275362319, "loss": 0.1614, "rewards/chosen": 1.2383474576271187, "rewards/margins": 10.129651805453207, "rewards/rejected": -8.891304347826088, "step": 1148 }, { "epoch": 0.8157614483493077, "grad_norm": 0.1798984931246863, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71240566.4477612, "logits/rejected": 90624470.03278689, "logps/chosen": -195.82089552238807, "logps/rejected": -335.73770491803276, "loss": 0.1627, "rewards/chosen": 2.001865671641791, "rewards/margins": 10.657603376559825, "rewards/rejected": -8.655737704918034, "step": 1149 }, { "epoch": 0.8164714235001775, "grad_norm": 0.1560171025058289, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93755030.58823529, "logits/rejected": 58388206.93333333, "logps/chosen": -218.11764705882354, "logps/rejected": -353.3333333333333, "loss": 0.1558, "rewards/chosen": 1.9393382352941178, "rewards/margins": 10.456004901960785, "rewards/rejected": -8.516666666666667, "step": 1150 }, { "epoch": 0.8171813986510472, "grad_norm": 0.2264393338621722, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 79536431.4074074, "logits/rejected": 88363782.91891892, "logps/chosen": -173.4814814814815, "logps/rejected": -398.7027027027027, "loss": 0.1574, "rewards/chosen": 1.0405092592592593, "rewards/margins": 11.026995745745745, "rewards/rejected": -9.986486486486486, "step": 1151 }, { "epoch": 0.8178913738019169, "grad_norm": 0.15857666300620923, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114794874.09230769, "logits/rejected": 72301811.8095238, "logps/chosen": -211.44615384615383, "logps/rejected": -344.3809523809524, "loss": 0.1877, "rewards/chosen": 0.8884615384615384, "rewards/margins": 9.983699633699633, "rewards/rejected": -9.095238095238095, "step": 1152 }, { "epoch": 0.8186013489527867, "grad_norm": 0.17178519392317362, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106172481.01587301, "logits/rejected": 71238640.24615385, "logps/chosen": -260.3174603174603, "logps/rejected": -337.4769230769231, "loss": 0.1523, "rewards/chosen": 2.2123015873015874, "rewards/margins": 10.927686202686202, "rewards/rejected": -8.715384615384615, "step": 1153 }, { "epoch": 0.8193113241036564, "grad_norm": 0.17953055443055735, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 160808032.6037736, "logits/rejected": 52652496.21333333, "logps/chosen": -235.47169811320754, "logps/rejected": -350.29333333333335, "loss": 0.1339, "rewards/chosen": 2.134433962264151, "rewards/margins": 8.441100628930817, "rewards/rejected": -6.306666666666667, "step": 1154 }, { "epoch": 0.820021299254526, "grad_norm": 0.18920716721391787, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115652775.86885247, "logits/rejected": 94121433.79104477, "logps/chosen": -256.0, "logps/rejected": -372.05970149253733, "loss": 0.1391, "rewards/chosen": 2.3114754098360657, "rewards/margins": 8.520430633716662, "rewards/rejected": -6.208955223880597, "step": 1155 }, { "epoch": 0.8207312744053958, "grad_norm": 0.17545921201503126, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105639588.88135593, "logits/rejected": 72123792.69565217, "logps/chosen": -211.52542372881356, "logps/rejected": -384.231884057971, "loss": 0.1233, "rewards/chosen": 1.9777542372881356, "rewards/margins": 11.34731945467944, "rewards/rejected": -9.369565217391305, "step": 1156 }, { "epoch": 0.8214412495562655, "grad_norm": 0.1664046485248411, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123032917.33333333, "logits/rejected": 31677142.70967742, "logps/chosen": -199.03030303030303, "logps/rejected": -365.93548387096774, "loss": 0.1558, "rewards/chosen": 1.6988636363636365, "rewards/margins": 10.198863636363637, "rewards/rejected": -8.5, "step": 1157 }, { "epoch": 0.8221512247071352, "grad_norm": 0.18807467731956867, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113013191.1111111, "logits/rejected": 91424491.24324325, "logps/chosen": -211.55555555555554, "logps/rejected": -343.35135135135135, "loss": 0.1404, "rewards/chosen": 0.7013888888888888, "rewards/margins": 9.61355105105105, "rewards/rejected": -8.912162162162161, "step": 1158 }, { "epoch": 0.822861199858005, "grad_norm": 0.19416556393422038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137289612.6197183, "logits/rejected": 74651252.77192983, "logps/chosen": -233.91549295774647, "logps/rejected": -374.4561403508772, "loss": 0.1677, "rewards/chosen": 2.132042253521127, "rewards/margins": 11.193445762293056, "rewards/rejected": -9.06140350877193, "step": 1159 }, { "epoch": 0.8235711750088747, "grad_norm": 0.16714367036305514, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124726307.31034483, "logits/rejected": 82148439.77142857, "logps/chosen": -239.44827586206895, "logps/rejected": -359.77142857142854, "loss": 0.1291, "rewards/chosen": 2.002155172413793, "rewards/margins": 11.145012315270936, "rewards/rejected": -9.142857142857142, "step": 1160 }, { "epoch": 0.8242811501597445, "grad_norm": 0.17166406654690772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142468817.83606556, "logits/rejected": 89144610.3880597, "logps/chosen": -270.95081967213116, "logps/rejected": -394.5074626865672, "loss": 0.1692, "rewards/chosen": 2.028688524590164, "rewards/margins": 9.954061658918523, "rewards/rejected": -7.925373134328358, "step": 1161 }, { "epoch": 0.8249911253106141, "grad_norm": 0.13539961205798842, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134102815.56164384, "logits/rejected": -17749531.927272726, "logps/chosen": -183.8904109589041, "logps/rejected": -288.2909090909091, "loss": 0.1566, "rewards/chosen": 1.6815068493150684, "rewards/margins": 10.654234122042341, "rewards/rejected": -8.972727272727273, "step": 1162 }, { "epoch": 0.8257011004614838, "grad_norm": 0.22727813861105625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132938109.83050847, "logits/rejected": 45407899.82608695, "logps/chosen": -203.9322033898305, "logps/rejected": -371.0144927536232, "loss": 0.1664, "rewards/chosen": 0.8167372881355932, "rewards/margins": 9.729780766396463, "rewards/rejected": -8.91304347826087, "step": 1163 }, { "epoch": 0.8264110756123536, "grad_norm": 0.253567251880129, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106115891.2, "logits/rejected": 50244266.666666664, "logps/chosen": -233.2, "logps/rejected": -319.0, "loss": 0.2162, "rewards/chosen": 1.65, "rewards/margins": 8.717708333333333, "rewards/rejected": -7.067708333333333, "step": 1164 }, { "epoch": 0.8271210507632233, "grad_norm": 0.15460546663698613, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70589982.11764705, "logits/rejected": 54875477.333333336, "logps/chosen": -184.0, "logps/rejected": -368.53333333333336, "loss": 0.1639, "rewards/chosen": 1.6075367647058822, "rewards/margins": 9.674203431372549, "rewards/rejected": -8.066666666666666, "step": 1165 }, { "epoch": 0.827831025914093, "grad_norm": 0.15553806775170984, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88749041.1594203, "logits/rejected": 51184726.779661015, "logps/chosen": -201.15942028985506, "logps/rejected": -349.2881355932203, "loss": 0.1489, "rewards/chosen": 2.2300724637681157, "rewards/margins": 11.187699582412183, "rewards/rejected": -8.957627118644067, "step": 1166 }, { "epoch": 0.8285410010649628, "grad_norm": 0.1803066675895298, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149496978.2857143, "logits/rejected": 70720625.77777778, "logps/chosen": -205.42857142857142, "logps/rejected": -357.77777777777777, "loss": 0.1644, "rewards/chosen": 1.2366071428571428, "rewards/margins": 9.889384920634921, "rewards/rejected": -8.652777777777779, "step": 1167 }, { "epoch": 0.8292509762158324, "grad_norm": 0.18730800116721943, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143286493.4054054, "logits/rejected": 69167179.85185185, "logps/chosen": -206.7027027027027, "logps/rejected": -370.6666666666667, "loss": 0.1897, "rewards/chosen": 1.4991554054054055, "rewards/margins": 11.286192442442442, "rewards/rejected": -9.787037037037036, "step": 1168 }, { "epoch": 0.8299609513667021, "grad_norm": 0.18964297110018988, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115588481.66233766, "logits/rejected": 92110205.49019608, "logps/chosen": -218.1818181818182, "logps/rejected": -409.0980392156863, "loss": 0.1674, "rewards/chosen": 1.9464285714285714, "rewards/margins": 9.946428571428571, "rewards/rejected": -8.0, "step": 1169 }, { "epoch": 0.8306709265175719, "grad_norm": 0.16252336969525702, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93268075.78947368, "logits/rejected": 76183079.38461539, "logps/chosen": -192.0, "logps/rejected": -329.0769230769231, "loss": 0.1949, "rewards/chosen": 1.3355263157894737, "rewards/margins": 9.840334008097166, "rewards/rejected": -8.504807692307692, "step": 1170 }, { "epoch": 0.8313809016684416, "grad_norm": 0.1696334262813264, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85511372.8, "logits/rejected": 56731045.64705882, "logps/chosen": -230.4, "logps/rejected": -378.8235294117647, "loss": 0.1541, "rewards/chosen": 1.7104166666666667, "rewards/margins": 12.563357843137256, "rewards/rejected": -10.852941176470589, "step": 1171 }, { "epoch": 0.8320908768193114, "grad_norm": 0.16521525461218317, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107933422.93333334, "logits/rejected": 82282375.52941176, "logps/chosen": -201.46666666666667, "logps/rejected": -400.47058823529414, "loss": 0.1521, "rewards/chosen": 1.2364583333333334, "rewards/margins": 11.368811274509804, "rewards/rejected": -10.132352941176471, "step": 1172 }, { "epoch": 0.832800851970181, "grad_norm": 0.16820934789973624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88715884.60606061, "logits/rejected": 121431865.80645162, "logps/chosen": -258.42424242424244, "logps/rejected": -394.3225806451613, "loss": 0.1642, "rewards/chosen": 1.8200757575757576, "rewards/margins": 11.634591886608016, "rewards/rejected": -9.814516129032258, "step": 1173 }, { "epoch": 0.8335108271210507, "grad_norm": 0.22997514935936064, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131797937.23076923, "logits/rejected": -1930711.365079365, "logps/chosen": -259.6923076923077, "logps/rejected": -315.6825396825397, "loss": 0.1645, "rewards/chosen": 1.6605769230769232, "rewards/margins": 10.422481684981687, "rewards/rejected": -8.761904761904763, "step": 1174 }, { "epoch": 0.8342208022719205, "grad_norm": 0.16594565327460886, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75998284.41791044, "logits/rejected": 94062424.13114753, "logps/chosen": -197.01492537313433, "logps/rejected": -372.4590163934426, "loss": 0.1849, "rewards/chosen": 1.412313432835821, "rewards/margins": 10.895919990212871, "rewards/rejected": -9.48360655737705, "step": 1175 }, { "epoch": 0.8349307774227902, "grad_norm": 0.16195255843170983, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110836048.23880596, "logits/rejected": 85742575.21311475, "logps/chosen": -259.82089552238807, "logps/rejected": -403.40983606557376, "loss": 0.156, "rewards/chosen": 1.8945895522388059, "rewards/margins": 12.08311414240274, "rewards/rejected": -10.188524590163935, "step": 1176 }, { "epoch": 0.8356407525736599, "grad_norm": 0.13149675750984705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 38422820.571428575, "logits/rejected": 117149240.8888889, "logps/chosen": -159.42857142857142, "logps/rejected": -400.8888888888889, "loss": 0.1258, "rewards/chosen": 1.3002232142857142, "rewards/margins": 10.355778769841269, "rewards/rejected": -9.055555555555555, "step": 1177 }, { "epoch": 0.8363507277245297, "grad_norm": 0.1806967094461706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119078912.0, "logits/rejected": 73531392.0, "logps/chosen": -264.5, "logps/rejected": -445.5, "loss": 0.1698, "rewards/chosen": 1.19677734375, "rewards/margins": 14008765.196777344, "rewards/rejected": -14008764.0, "step": 1178 }, { "epoch": 0.8370607028753994, "grad_norm": 0.14693134728046522, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80880162.13333334, "logits/rejected": 92675614.11764705, "logps/chosen": -200.93333333333334, "logps/rejected": -418.8235294117647, "loss": 0.1416, "rewards/chosen": 1.53125, "rewards/margins": 12.060661764705882, "rewards/rejected": -10.529411764705882, "step": 1179 }, { "epoch": 0.837770678026269, "grad_norm": 0.1658266521717404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 25598569.65079365, "logits/rejected": 113826957.78461538, "logps/chosen": -171.3015873015873, "logps/rejected": -379.0769230769231, "loss": 0.1421, "rewards/chosen": 2.0853174603174605, "rewards/margins": 10.208394383394383, "rewards/rejected": -8.123076923076923, "step": 1180 }, { "epoch": 0.8384806531771388, "grad_norm": 0.16036716868931056, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107676722.36065574, "logits/rejected": 59283669.97014926, "logps/chosen": -203.27868852459017, "logps/rejected": -346.5074626865672, "loss": 0.1407, "rewards/chosen": 1.7971311475409837, "rewards/margins": 10.782205774406656, "rewards/rejected": -8.985074626865671, "step": 1181 }, { "epoch": 0.8391906283280085, "grad_norm": 0.14100789869886407, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68774249.41176471, "logits/rejected": 99894340.26666667, "logps/chosen": -223.1764705882353, "logps/rejected": -398.6666666666667, "loss": 0.1336, "rewards/chosen": 2.0919117647058822, "rewards/margins": 12.500245098039215, "rewards/rejected": -10.408333333333333, "step": 1182 }, { "epoch": 0.8399006034788783, "grad_norm": 0.17315806517072654, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 78583281.37142856, "logits/rejected": 103194341.51724137, "logps/chosen": -210.97142857142856, "logps/rejected": -388.41379310344826, "loss": 0.1863, "rewards/chosen": 1.0160714285714285, "rewards/margins": 7.447105911330049, "rewards/rejected": -6.431034482758621, "step": 1183 }, { "epoch": 0.840610578629748, "grad_norm": 0.16786697627889755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159516704.5079365, "logits/rejected": 64398698.33846154, "logps/chosen": -231.61904761904762, "logps/rejected": -391.87692307692305, "loss": 0.1645, "rewards/chosen": 1.9821428571428572, "rewards/margins": 10.77445054945055, "rewards/rejected": -8.792307692307693, "step": 1184 }, { "epoch": 0.8413205537806177, "grad_norm": 0.15805087615555585, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 163646615.0819672, "logits/rejected": 50069504.0, "logps/chosen": -299.5409836065574, "logps/rejected": -351.04477611940297, "loss": 0.1536, "rewards/chosen": 2.055327868852459, "rewards/margins": 11.36129801810619, "rewards/rejected": -9.305970149253731, "step": 1185 }, { "epoch": 0.8420305289314874, "grad_norm": 0.17160122054027624, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69259986.8235294, "logits/rejected": 67598199.46666667, "logps/chosen": -207.05882352941177, "logps/rejected": -365.3333333333333, "loss": 0.1719, "rewards/chosen": 1.6516544117647058, "rewards/margins": 10.676654411764707, "rewards/rejected": -9.025, "step": 1186 }, { "epoch": 0.8427405040823571, "grad_norm": 0.1901867278555484, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127595142.73684211, "logits/rejected": 52517412.05633803, "logps/chosen": -224.28070175438597, "logps/rejected": -373.1830985915493, "loss": 0.1782, "rewards/chosen": 1.543859649122807, "rewards/margins": 7.0227328885594265, "rewards/rejected": -5.47887323943662, "step": 1187 }, { "epoch": 0.8434504792332268, "grad_norm": 0.1806216412468188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73999506.28571428, "logits/rejected": 118019036.68965517, "logps/chosen": -260.34285714285716, "logps/rejected": -368.55172413793105, "loss": 0.1327, "rewards/chosen": 2.6267857142857145, "rewards/margins": 11.092302955665025, "rewards/rejected": -8.46551724137931, "step": 1188 }, { "epoch": 0.8441604543840966, "grad_norm": 0.16560979378898663, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90207929.50724638, "logits/rejected": 85556692.61016949, "logps/chosen": -179.0144927536232, "logps/rejected": -354.9830508474576, "loss": 0.1759, "rewards/chosen": 1.443840579710145, "rewards/margins": 10.494688037337264, "rewards/rejected": -9.05084745762712, "step": 1189 }, { "epoch": 0.8448704295349663, "grad_norm": 0.16109153298901344, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107494019.65714286, "logits/rejected": 76220627.86206897, "logps/chosen": -220.34285714285716, "logps/rejected": -377.9310344827586, "loss": 0.1695, "rewards/chosen": 1.6803571428571429, "rewards/margins": 9.344150246305418, "rewards/rejected": -7.663793103448276, "step": 1190 }, { "epoch": 0.845580404685836, "grad_norm": 0.24736194984554188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135535937.82857144, "logits/rejected": 47511340.137931034, "logps/chosen": -254.85714285714286, "logps/rejected": -345.37931034482756, "loss": 0.1662, "rewards/chosen": 1.9303571428571429, "rewards/margins": 10.068288177339902, "rewards/rejected": -8.137931034482758, "step": 1191 }, { "epoch": 0.8462903798367057, "grad_norm": 0.24167962752946043, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85517198.22222222, "logits/rejected": 91900196.57142857, "logps/chosen": -230.11111111111111, "logps/rejected": -343.42857142857144, "loss": 0.1709, "rewards/chosen": 1.7239583333333333, "rewards/margins": 7.402529761904762, "rewards/rejected": -5.678571428571429, "step": 1192 }, { "epoch": 0.8470003549875754, "grad_norm": 0.17136047000086133, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102940203.88571429, "logits/rejected": 83741448.8275862, "logps/chosen": -208.68571428571428, "logps/rejected": -423.17241379310343, "loss": 0.1745, "rewards/chosen": 1.7758928571428572, "rewards/margins": 10.655203201970442, "rewards/rejected": -8.879310344827585, "step": 1193 }, { "epoch": 0.8477103301384451, "grad_norm": 0.19185360516546826, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 85474831.51515152, "logits/rejected": 110540205.41935484, "logps/chosen": -221.21212121212122, "logps/rejected": -345.80645161290323, "loss": 0.1881, "rewards/chosen": 1.353219696969697, "rewards/margins": 8.667735826001955, "rewards/rejected": -7.314516129032258, "step": 1194 }, { "epoch": 0.8484203052893149, "grad_norm": 0.16226645666722966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148435707.66101694, "logits/rejected": 28417929.27536232, "logps/chosen": -258.1694915254237, "logps/rejected": -369.6231884057971, "loss": 0.162, "rewards/chosen": 1.597457627118644, "rewards/margins": 11.206153279292556, "rewards/rejected": -9.608695652173912, "step": 1195 }, { "epoch": 0.8491302804401846, "grad_norm": 0.15727278960663096, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87621632.0, "logits/rejected": 83099648.0, "logps/chosen": -171.125, "logps/rejected": -339.0, "loss": 0.1677, "rewards/chosen": 1.412109375, "rewards/margins": 9.154296875, "rewards/rejected": -7.7421875, "step": 1196 }, { "epoch": 0.8498402555910544, "grad_norm": 0.15997726210159038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148536214.06896552, "logits/rejected": 81519294.17142858, "logps/chosen": -238.3448275862069, "logps/rejected": -325.0285714285714, "loss": 0.1473, "rewards/chosen": 1.3060344827586208, "rewards/margins": 6.848891625615764, "rewards/rejected": -5.542857142857143, "step": 1197 }, { "epoch": 0.850550230741924, "grad_norm": 0.17894068412536665, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113185420.98550725, "logits/rejected": 69525920.54237288, "logps/chosen": -219.82608695652175, "logps/rejected": -306.4406779661017, "loss": 0.1696, "rewards/chosen": 1.7318840579710144, "rewards/margins": 10.342053549496438, "rewards/rejected": -8.610169491525424, "step": 1198 }, { "epoch": 0.8512602058927937, "grad_norm": 0.16608247491396322, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109088696.14035088, "logits/rejected": 78214908.39436619, "logps/chosen": -236.0701754385965, "logps/rejected": -395.2676056338028, "loss": 0.1178, "rewards/chosen": 2.258771929824561, "rewards/margins": 12.723560662218928, "rewards/rejected": -10.464788732394366, "step": 1199 }, { "epoch": 0.8519701810436635, "grad_norm": 0.17558925405631423, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101711872.0, "logits/rejected": 89980928.0, "logps/chosen": -185.5, "logps/rejected": -333.5, "loss": 0.1559, "rewards/chosen": 1.478515625, "rewards/margins": 9.595703125, "rewards/rejected": -8.1171875, "step": 1200 }, { "epoch": 0.8526801561945332, "grad_norm": 0.3818612116835549, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90532985.49152543, "logits/rejected": 50088499.942028984, "logps/chosen": -186.84745762711864, "logps/rejected": -308.40579710144925, "loss": 0.1663, "rewards/chosen": 1.3167372881355932, "rewards/margins": 9.954418447555883, "rewards/rejected": -8.63768115942029, "step": 1201 }, { "epoch": 0.8533901313454029, "grad_norm": 0.1889737301246809, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 122563063.60655738, "logits/rejected": 77398994.14925373, "logps/chosen": -220.85245901639345, "logps/rejected": -384.95522388059703, "loss": 0.1654, "rewards/chosen": 1.3555327868852458, "rewards/margins": 10.101801443601664, "rewards/rejected": -8.746268656716419, "step": 1202 }, { "epoch": 0.8541001064962727, "grad_norm": 0.1562294419599777, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73673861.56521739, "logits/rejected": 94478474.84745763, "logps/chosen": -179.1304347826087, "logps/rejected": -341.96610169491527, "loss": 0.1609, "rewards/chosen": 1.7463768115942029, "rewards/margins": 9.678580201424712, "rewards/rejected": -7.932203389830509, "step": 1203 }, { "epoch": 0.8548100816471423, "grad_norm": 0.20717999705406573, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130974132.90666667, "logits/rejected": 54644658.716981135, "logps/chosen": -206.72, "logps/rejected": -349.58490566037733, "loss": 0.1805, "rewards/chosen": 1.4383333333333332, "rewards/margins": 10.900597484276728, "rewards/rejected": -9.462264150943396, "step": 1204 }, { "epoch": 0.855520056798012, "grad_norm": 0.15234578867435028, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98146713.6, "logits/rejected": 73100726.85714285, "logps/chosen": -199.3846153846154, "logps/rejected": -309.8412698412698, "loss": 0.1423, "rewards/chosen": 1.1576923076923078, "rewards/margins": 9.72912087912088, "rewards/rejected": -8.571428571428571, "step": 1205 }, { "epoch": 0.8562300319488818, "grad_norm": 0.1786933477291749, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88737700.29850747, "logits/rejected": 83159812.19672132, "logps/chosen": -218.7462686567164, "logps/rejected": -360.655737704918, "loss": 0.1665, "rewards/chosen": 1.2649253731343284, "rewards/margins": 8.773122094445803, "rewards/rejected": -7.508196721311475, "step": 1206 }, { "epoch": 0.8569400070997515, "grad_norm": 0.18686397188430062, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129885905.83606558, "logits/rejected": 79785678.3283582, "logps/chosen": -262.8196721311475, "logps/rejected": -343.4029850746269, "loss": 0.1365, "rewards/chosen": 2.4692622950819674, "rewards/margins": 11.140904086126742, "rewards/rejected": -8.671641791044776, "step": 1207 }, { "epoch": 0.8576499822506213, "grad_norm": 0.5166842861835598, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 144703488.0, "logits/rejected": 33648880.941176474, "logps/chosen": -245.33333333333334, "logps/rejected": -306.3529411764706, "loss": 0.149, "rewards/chosen": 1.7291666666666667, "rewards/margins": 10.302696078431373, "rewards/rejected": -8.573529411764707, "step": 1208 }, { "epoch": 0.858359957401491, "grad_norm": 0.18123643451433236, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121634816.0, "logits/rejected": 63111168.0, "logps/chosen": -216.875, "logps/rejected": -414.0, "loss": 0.1717, "rewards/chosen": 1.3798828125, "rewards/margins": -5595290.6201171875, "rewards/rejected": 5595292.0, "step": 1209 }, { "epoch": 0.8590699325523606, "grad_norm": 0.1766999808437067, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149302505.54385966, "logits/rejected": 81552629.18309858, "logps/chosen": -280.42105263157896, "logps/rejected": -333.9718309859155, "loss": 0.1594, "rewards/chosen": 1.4605263157894737, "rewards/margins": 7.735174203113417, "rewards/rejected": -6.274647887323944, "step": 1210 }, { "epoch": 0.8597799077032304, "grad_norm": 0.15370422083438723, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68548699.70149253, "logits/rejected": 98600523.5409836, "logps/chosen": -231.40298507462686, "logps/rejected": -332.8524590163934, "loss": 0.1713, "rewards/chosen": 1.7649253731343284, "rewards/margins": 10.289515537068755, "rewards/rejected": -8.524590163934427, "step": 1211 }, { "epoch": 0.8604898828541001, "grad_norm": 0.1801753730346229, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138269852.20338982, "logits/rejected": 64221480.8115942, "logps/chosen": -319.45762711864404, "logps/rejected": -349.6811594202899, "loss": 0.1399, "rewards/chosen": 2.8612288135593222, "rewards/margins": 12.122098378776712, "rewards/rejected": -9.26086956521739, "step": 1212 }, { "epoch": 0.8611998580049698, "grad_norm": 0.17911918652932168, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68999254.53521127, "logits/rejected": 111995275.22807017, "logps/chosen": -189.29577464788733, "logps/rejected": -436.7719298245614, "loss": 0.1641, "rewards/chosen": 1.5455545774647887, "rewards/margins": 8.949063349394613, "rewards/rejected": -7.4035087719298245, "step": 1213 }, { "epoch": 0.8619098331558396, "grad_norm": 0.22285192537944357, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71112517.81818181, "logits/rejected": 82155158.58823529, "logps/chosen": -155.2207792207792, "logps/rejected": -371.921568627451, "loss": 0.1787, "rewards/chosen": 1.5974025974025974, "rewards/margins": 7.62681436210848, "rewards/rejected": -6.029411764705882, "step": 1214 }, { "epoch": 0.8626198083067093, "grad_norm": 0.14253923787275335, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112984064.0, "logits/rejected": 85262336.0, "logps/chosen": -173.25, "logps/rejected": -410.0, "loss": 0.1437, "rewards/chosen": 1.73046875, "rewards/margins": 10.90234375, "rewards/rejected": -9.171875, "step": 1215 }, { "epoch": 0.8633297834575789, "grad_norm": 0.16184367020889906, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 81653627.87096775, "logits/rejected": 125130069.33333333, "logps/chosen": -204.38709677419354, "logps/rejected": -378.1818181818182, "loss": 0.1199, "rewards/chosen": 2.530241935483871, "rewards/margins": 12.318120723362657, "rewards/rejected": -9.787878787878787, "step": 1216 }, { "epoch": 0.8640397586084487, "grad_norm": 0.24423845748194323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135517371.49295774, "logits/rejected": 29654465.12280702, "logps/chosen": -221.9718309859155, "logps/rejected": -334.5964912280702, "loss": 0.1815, "rewards/chosen": 1.6320422535211268, "rewards/margins": 10.991691376328143, "rewards/rejected": -9.359649122807017, "step": 1217 }, { "epoch": 0.8647497337593184, "grad_norm": 0.1742829807306908, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111382072.8888889, "logits/rejected": 44947068.54054054, "logps/chosen": -220.44444444444446, "logps/rejected": -329.0810810810811, "loss": 0.1242, "rewards/chosen": 1.6828703703703705, "rewards/margins": 7.980167667667668, "rewards/rejected": -6.297297297297297, "step": 1218 }, { "epoch": 0.8654597089101882, "grad_norm": 0.16518560245025912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119193355.46268657, "logits/rejected": 80723162.22950819, "logps/chosen": -189.37313432835822, "logps/rejected": -346.2295081967213, "loss": 0.1763, "rewards/chosen": 1.0317164179104477, "rewards/margins": 7.875978712992415, "rewards/rejected": -6.844262295081967, "step": 1219 }, { "epoch": 0.8661696840610579, "grad_norm": 0.18853864964893755, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129515023.51515152, "logits/rejected": 99580894.96774194, "logps/chosen": -193.45454545454547, "logps/rejected": -404.1290322580645, "loss": 0.1626, "rewards/chosen": 1.4867424242424243, "rewards/margins": 11.067387585532746, "rewards/rejected": -9.580645161290322, "step": 1220 }, { "epoch": 0.8668796592119276, "grad_norm": 0.15191750952095048, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93371659.81538461, "logits/rejected": 84868079.74603175, "logps/chosen": -199.26153846153846, "logps/rejected": -372.3174603174603, "loss": 0.1543, "rewards/chosen": 1.675, "rewards/margins": 11.667063492063493, "rewards/rejected": -9.992063492063492, "step": 1221 }, { "epoch": 0.8675896343627973, "grad_norm": 0.1664274858885097, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123262456.35820895, "logits/rejected": 69842037.50819673, "logps/chosen": -264.35820895522386, "logps/rejected": -372.4590163934426, "loss": 0.1551, "rewards/chosen": 2.4029850746268657, "rewards/margins": 11.427575238561293, "rewards/rejected": -9.024590163934427, "step": 1222 }, { "epoch": 0.868299609513667, "grad_norm": 0.17848946781543246, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 110498215.72413793, "logits/rejected": 105456786.28571428, "logps/chosen": -249.93103448275863, "logps/rejected": -375.77142857142854, "loss": 0.1548, "rewards/chosen": 1.5635775862068966, "rewards/margins": 10.063577586206897, "rewards/rejected": -8.5, "step": 1223 }, { "epoch": 0.8690095846645367, "grad_norm": 0.17202176763187838, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96119466.66666667, "logits/rejected": 74036955.42857143, "logps/chosen": -218.66666666666666, "logps/rejected": -317.42857142857144, "loss": 0.1534, "rewards/chosen": 2.189236111111111, "rewards/margins": 11.00173611111111, "rewards/rejected": -8.8125, "step": 1224 }, { "epoch": 0.8697195598154065, "grad_norm": 0.18939101083787802, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134099578.59154929, "logits/rejected": 76748404.77192983, "logps/chosen": -249.91549295774647, "logps/rejected": -410.94736842105266, "loss": 0.1472, "rewards/chosen": 1.3221830985915493, "rewards/margins": 9.269551519644182, "rewards/rejected": -7.947368421052632, "step": 1225 }, { "epoch": 0.8704295349662762, "grad_norm": 0.167163091670918, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84450697.84615384, "logits/rejected": 106388853.84126984, "logps/chosen": -192.0, "logps/rejected": -338.7936507936508, "loss": 0.1586, "rewards/chosen": 1.2384615384615385, "rewards/margins": 10.635286935286935, "rewards/rejected": -9.396825396825397, "step": 1226 }, { "epoch": 0.8711395101171459, "grad_norm": 0.22478839095697384, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149962499.93846154, "logits/rejected": 33321415.111111112, "logps/chosen": -250.33846153846153, "logps/rejected": -372.3174603174603, "loss": 0.1835, "rewards/chosen": 1.6615384615384616, "rewards/margins": 8.344078144078145, "rewards/rejected": -6.682539682539683, "step": 1227 }, { "epoch": 0.8718494852680156, "grad_norm": 0.19182490383471673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148466870.3561644, "logits/rejected": 52466930.03636364, "logps/chosen": -224.43835616438355, "logps/rejected": -367.7090909090909, "loss": 0.1737, "rewards/chosen": 2.037671232876712, "rewards/margins": 10.16494396014944, "rewards/rejected": -8.127272727272727, "step": 1228 }, { "epoch": 0.8725594604188853, "grad_norm": 0.16417995964612084, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 181253851.42857143, "logits/rejected": 44127573.333333336, "logps/chosen": -204.57142857142858, "logps/rejected": -397.3333333333333, "loss": 0.165, "rewards/chosen": 0.9274553571428571, "rewards/margins": 11.344122023809524, "rewards/rejected": -10.416666666666666, "step": 1229 }, { "epoch": 0.873269435569755, "grad_norm": 0.15411689736334655, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 68030339.87878788, "logits/rejected": 132458826.32258065, "logps/chosen": -222.06060606060606, "logps/rejected": -332.64516129032256, "loss": 0.1542, "rewards/chosen": 1.3598484848484849, "rewards/margins": 9.367913000977516, "rewards/rejected": -8.008064516129032, "step": 1230 }, { "epoch": 0.8739794107206248, "grad_norm": 0.15336095615935158, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149596842.66666666, "logits/rejected": 90786386.58064516, "logps/chosen": -270.3030303030303, "logps/rejected": -439.741935483871, "loss": 0.1679, "rewards/chosen": 2.053030303030303, "rewards/margins": 4.50464320625611, "rewards/rejected": -2.4516129032258065, "step": 1231 }, { "epoch": 0.8746893858714945, "grad_norm": 0.1938040366137974, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 72430803.3015873, "logits/rejected": 87564161.96923077, "logps/chosen": -217.9047619047619, "logps/rejected": -349.53846153846155, "loss": 0.1588, "rewards/chosen": 2.0615079365079363, "rewards/margins": 8.338431013431013, "rewards/rejected": -6.276923076923077, "step": 1232 }, { "epoch": 0.8753993610223643, "grad_norm": 0.1591176175202231, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94033589.67741935, "logits/rejected": 88461684.36363636, "logps/chosen": -225.03225806451613, "logps/rejected": -366.54545454545456, "loss": 0.1429, "rewards/chosen": 1.752016129032258, "rewards/margins": 11.55504643206256, "rewards/rejected": -9.803030303030303, "step": 1233 }, { "epoch": 0.8761093361732339, "grad_norm": 0.1873089492139948, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86643978.52054794, "logits/rejected": 92503468.21818182, "logps/chosen": -219.83561643835617, "logps/rejected": -389.5272727272727, "loss": 0.1615, "rewards/chosen": 2.368150684931507, "rewards/margins": 9.24996886674969, "rewards/rejected": -6.881818181818182, "step": 1234 }, { "epoch": 0.8768193113241036, "grad_norm": 0.1910256144720497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96079520.91428572, "logits/rejected": 94733417.93103448, "logps/chosen": -238.85714285714286, "logps/rejected": -371.3103448275862, "loss": 0.1643, "rewards/chosen": 2.241964285714286, "rewards/margins": 11.690240147783252, "rewards/rejected": -9.448275862068966, "step": 1235 }, { "epoch": 0.8775292864749734, "grad_norm": 0.1689639457830429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 44771017.696969695, "logits/rejected": 104992900.12903225, "logps/chosen": -209.21212121212122, "logps/rejected": -366.7096774193548, "loss": 0.1603, "rewards/chosen": 1.7556818181818181, "rewards/margins": 10.699230205278592, "rewards/rejected": -8.943548387096774, "step": 1236 }, { "epoch": 0.8782392616258431, "grad_norm": 0.15839215159366135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112066560.0, "logits/rejected": 32505856.0, "logps/chosen": -192.875, "logps/rejected": -359.5, "loss": 0.1531, "rewards/chosen": 1.57421875, "rewards/margins": 9.45703125, "rewards/rejected": -7.8828125, "step": 1237 }, { "epoch": 0.8789492367767128, "grad_norm": 0.15187406306320447, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63581835.63636363, "logits/rejected": 95048340.64516129, "logps/chosen": -203.27272727272728, "logps/rejected": -317.4193548387097, "loss": 0.149, "rewards/chosen": 1.9109848484848484, "rewards/margins": 8.588404203323558, "rewards/rejected": -6.67741935483871, "step": 1238 }, { "epoch": 0.8796592119275826, "grad_norm": 0.17897629081063648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 142471035.87096775, "logits/rejected": 96532542.06060606, "logps/chosen": -245.16129032258064, "logps/rejected": -354.42424242424244, "loss": 0.1643, "rewards/chosen": 2.0766129032258065, "rewards/margins": 9.546309872922777, "rewards/rejected": -7.46969696969697, "step": 1239 }, { "epoch": 0.8803691870784522, "grad_norm": 0.1648855947991986, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 65606426.74626866, "logits/rejected": 90899506.36065574, "logps/chosen": -189.37313432835822, "logps/rejected": -363.0163934426229, "loss": 0.175, "rewards/chosen": 1.6361940298507462, "rewards/margins": 10.496849767555664, "rewards/rejected": -8.860655737704919, "step": 1240 }, { "epoch": 0.8810791622293219, "grad_norm": 0.13710853984910834, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105100748.05797102, "logits/rejected": 52677614.644067794, "logps/chosen": -181.79710144927537, "logps/rejected": -335.1864406779661, "loss": 0.1601, "rewards/chosen": 1.5960144927536233, "rewards/margins": 11.036692458855319, "rewards/rejected": -9.440677966101696, "step": 1241 }, { "epoch": 0.8817891373801917, "grad_norm": 0.15827691362307816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138132411.73333332, "logits/rejected": 102760448.0, "logps/chosen": -239.33333333333334, "logps/rejected": -369.4117647058824, "loss": 0.1615, "rewards/chosen": 1.6276041666666667, "rewards/margins": 10.348192401960784, "rewards/rejected": -8.720588235294118, "step": 1242 }, { "epoch": 0.8824991125310614, "grad_norm": 0.14250210463197205, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 105640536.74666667, "logits/rejected": 56266983.8490566, "logps/chosen": -197.12, "logps/rejected": -333.2830188679245, "loss": 0.1663, "rewards/chosen": 1.8416666666666666, "rewards/margins": 9.077515723270439, "rewards/rejected": -7.235849056603773, "step": 1243 }, { "epoch": 0.8832090876819312, "grad_norm": 0.1630788131375268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100446349.2413793, "logits/rejected": 96708666.51428571, "logps/chosen": -251.58620689655172, "logps/rejected": -394.51428571428573, "loss": 0.1536, "rewards/chosen": 1.5732758620689655, "rewards/margins": 11.058990147783252, "rewards/rejected": -9.485714285714286, "step": 1244 }, { "epoch": 0.8839190628328009, "grad_norm": 0.1580315309918693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113881708.60606061, "logits/rejected": 46610894.451612905, "logps/chosen": -221.33333333333334, "logps/rejected": -363.35483870967744, "loss": 0.1539, "rewards/chosen": 1.8863636363636365, "rewards/margins": 10.765395894428153, "rewards/rejected": -8.879032258064516, "step": 1245 }, { "epoch": 0.8846290379836705, "grad_norm": 0.1857479504981429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112606398.91525424, "logits/rejected": 73005204.4057971, "logps/chosen": -215.32203389830508, "logps/rejected": -343.18840579710144, "loss": 0.1464, "rewards/chosen": 1.5201271186440677, "rewards/margins": 11.40418508965856, "rewards/rejected": -9.884057971014492, "step": 1246 }, { "epoch": 0.8853390131345403, "grad_norm": 0.12849441970383424, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75421211.92727272, "logits/rejected": 115142263.23287672, "logps/chosen": -255.4181818181818, "logps/rejected": -381.8082191780822, "loss": 0.1277, "rewards/chosen": 1.7818181818181817, "rewards/margins": 11.685927770859276, "rewards/rejected": -9.904109589041095, "step": 1247 }, { "epoch": 0.88604898828541, "grad_norm": 0.1913602074873261, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114015163.73333333, "logits/rejected": 101033381.64705883, "logps/chosen": -252.26666666666668, "logps/rejected": -393.88235294117646, "loss": 0.1619, "rewards/chosen": 1.5635416666666666, "rewards/margins": 10.637071078431372, "rewards/rejected": -9.073529411764707, "step": 1248 }, { "epoch": 0.8867589634362797, "grad_norm": 0.16563777547637215, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 145087473.5774648, "logits/rejected": 82671939.36842105, "logps/chosen": -310.3098591549296, "logps/rejected": -435.9298245614035, "loss": 0.1376, "rewards/chosen": 2.507042253521127, "rewards/margins": -2455743.4578700275, "rewards/rejected": 2455745.964912281, "step": 1249 }, { "epoch": 0.8874689385871495, "grad_norm": 0.15834564122377634, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123312537.6, "logits/rejected": 58473532.23529412, "logps/chosen": -239.46666666666667, "logps/rejected": -376.47058823529414, "loss": 0.1596, "rewards/chosen": 1.4708333333333334, "rewards/margins": 10.581127450980391, "rewards/rejected": -9.110294117647058, "step": 1250 }, { "epoch": 0.8881789137380192, "grad_norm": 0.27213821116452247, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 162287300.92307693, "logits/rejected": 52082769.92, "logps/chosen": -309.94871794871796, "logps/rejected": -384.96, "loss": 0.1821, "rewards/chosen": 2.046474358974359, "rewards/margins": 11.406474358974359, "rewards/rejected": -9.36, "step": 1251 }, { "epoch": 0.8888888888888888, "grad_norm": 0.18219802021737386, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118862309.96610169, "logits/rejected": 52155258.43478261, "logps/chosen": -240.27118644067798, "logps/rejected": -314.4347826086956, "loss": 0.141, "rewards/chosen": 2.207627118644068, "rewards/margins": 9.338061901252763, "rewards/rejected": -7.130434782608695, "step": 1252 }, { "epoch": 0.8895988640397586, "grad_norm": 0.22627041759376426, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107324837.64705883, "logits/rejected": 100383675.73333333, "logps/chosen": -262.3529411764706, "logps/rejected": -375.46666666666664, "loss": 0.1571, "rewards/chosen": 1.4871323529411764, "rewards/margins": 12.020465686274509, "rewards/rejected": -10.533333333333333, "step": 1253 }, { "epoch": 0.8903088391906283, "grad_norm": 0.15960123071675264, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73400320.0, "logits/rejected": 75038720.0, "logps/chosen": -194.25, "logps/rejected": -372.75, "loss": 0.1631, "rewards/chosen": 1.0859375, "rewards/margins": 11.2890625, "rewards/rejected": -10.203125, "step": 1254 }, { "epoch": 0.8910188143414981, "grad_norm": 0.17069349937130288, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147774317.7142857, "logits/rejected": 74361514.66666667, "logps/chosen": -224.28571428571428, "logps/rejected": -389.3333333333333, "loss": 0.1246, "rewards/chosen": 1.9732142857142858, "rewards/margins": 11.973214285714286, "rewards/rejected": -10.0, "step": 1255 }, { "epoch": 0.8917287894923678, "grad_norm": 0.18509566396471083, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118208483.15492958, "logits/rejected": 99890661.05263157, "logps/chosen": -204.61971830985917, "logps/rejected": -368.0, "loss": 0.163, "rewards/chosen": 1.6830985915492958, "rewards/margins": -23216312.70286632, "rewards/rejected": 23216314.38596491, "step": 1256 }, { "epoch": 0.8924387646432375, "grad_norm": 0.19915063793941143, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93199902.11764705, "logits/rejected": 115430741.33333333, "logps/chosen": -211.05882352941177, "logps/rejected": -375.46666666666664, "loss": 0.1731, "rewards/chosen": 1.8106617647058822, "rewards/margins": 11.20232843137255, "rewards/rejected": -9.391666666666667, "step": 1257 }, { "epoch": 0.8931487397941072, "grad_norm": 0.17768656800659333, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96352483.55555555, "logits/rejected": 115902756.57142857, "logps/chosen": -180.22222222222223, "logps/rejected": -354.85714285714283, "loss": 0.1769, "rewards/chosen": 1.4635416666666667, "rewards/margins": 10.972470238095237, "rewards/rejected": -9.508928571428571, "step": 1258 }, { "epoch": 0.8938587149449769, "grad_norm": 0.16380972695842752, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123450261.01492538, "logits/rejected": 41977419.54098361, "logps/chosen": -238.32835820895522, "logps/rejected": -341.5081967213115, "loss": 0.1668, "rewards/chosen": 1.830223880597015, "rewards/margins": 6523745.830223881, "rewards/rejected": -6523744.0, "step": 1259 }, { "epoch": 0.8945686900958466, "grad_norm": 0.1627268693753494, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 166423990.85714287, "logits/rejected": 48205368.88888889, "logps/chosen": -249.14285714285714, "logps/rejected": -352.8888888888889, "loss": 0.1446, "rewards/chosen": 2.0535714285714284, "rewards/margins": 11.331349206349207, "rewards/rejected": -9.277777777777779, "step": 1260 }, { "epoch": 0.8952786652467164, "grad_norm": 0.16788336472548973, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99906946.09836066, "logits/rejected": 61686004.53731343, "logps/chosen": -259.1475409836066, "logps/rejected": -350.089552238806, "loss": 0.1692, "rewards/chosen": 1.9610655737704918, "rewards/margins": 10.908826767800344, "rewards/rejected": -8.947761194029852, "step": 1261 }, { "epoch": 0.8959886403975861, "grad_norm": 0.19405341424478698, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 54584206.222222224, "logits/rejected": 107766930.28571428, "logps/chosen": -213.11111111111111, "logps/rejected": -344.85714285714283, "loss": 0.194, "rewards/chosen": 1.5902777777777777, "rewards/margins": 7.679563492063492, "rewards/rejected": -6.089285714285714, "step": 1262 }, { "epoch": 0.8966986155484558, "grad_norm": 0.1673644320736706, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 57482936.32, "logits/rejected": 93726562.46153846, "logps/chosen": -223.04, "logps/rejected": -306.87179487179486, "loss": 0.1199, "rewards/chosen": 2.225, "rewards/margins": 11.019871794871795, "rewards/rejected": -8.794871794871796, "step": 1263 }, { "epoch": 0.8974085906993255, "grad_norm": 0.15050231484573742, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 116252125.86666666, "logits/rejected": 120524559.05882353, "logps/chosen": -213.06666666666666, "logps/rejected": -393.1764705882353, "loss": 0.1246, "rewards/chosen": 1.8645833333333333, "rewards/margins": 11.732230392156863, "rewards/rejected": -9.867647058823529, "step": 1264 }, { "epoch": 0.8981185658501952, "grad_norm": 0.1683676038464604, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147792020.94545454, "logits/rejected": 71719725.5890411, "logps/chosen": -241.45454545454547, "logps/rejected": -384.43835616438355, "loss": 0.1583, "rewards/chosen": 1.7772727272727273, "rewards/margins": 10.69508094645081, "rewards/rejected": -8.917808219178083, "step": 1265 }, { "epoch": 0.898828541001065, "grad_norm": 0.1824149697028473, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98900472.57971014, "logits/rejected": 44324551.59322034, "logps/chosen": -198.95652173913044, "logps/rejected": -344.9491525423729, "loss": 0.1711, "rewards/chosen": 1.7128623188405796, "rewards/margins": 11.128116556128715, "rewards/rejected": -9.415254237288135, "step": 1266 }, { "epoch": 0.8995385161519347, "grad_norm": 0.20364921300862715, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109946688.85333334, "logits/rejected": 88871762.11320755, "logps/chosen": -196.26666666666668, "logps/rejected": -400.0, "loss": 0.1873, "rewards/chosen": 1.5683333333333334, "rewards/margins": 9.804182389937107, "rewards/rejected": -8.235849056603774, "step": 1267 }, { "epoch": 0.9002484913028044, "grad_norm": 0.22622113851600237, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36667392.0, "logits/rejected": 75634688.0, "logps/chosen": -228.5, "logps/rejected": -354.0, "loss": 0.1589, "rewards/chosen": 1.677734375, "rewards/margins": 11.365234375, "rewards/rejected": -9.6875, "step": 1268 }, { "epoch": 0.9009584664536742, "grad_norm": 0.19099776762673268, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125088948.70588236, "logits/rejected": 47535445.333333336, "logps/chosen": -210.58823529411765, "logps/rejected": -344.8, "loss": 0.1637, "rewards/chosen": 1.6985294117647058, "rewards/margins": -14103617.501470588, "rewards/rejected": 14103619.2, "step": 1269 }, { "epoch": 0.9016684416045438, "grad_norm": 1.3526956586846572, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95260463.72881356, "logits/rejected": 116711067.82608695, "logps/chosen": -237.5593220338983, "logps/rejected": -391.42028985507244, "loss": 0.1399, "rewards/chosen": 1.9618644067796611, "rewards/margins": 11.512589044460821, "rewards/rejected": -9.55072463768116, "step": 1270 }, { "epoch": 0.9023784167554135, "grad_norm": 0.1661696692619912, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109919691.03448276, "logits/rejected": 59079767.77142857, "logps/chosen": -193.3793103448276, "logps/rejected": -351.0857142857143, "loss": 0.1298, "rewards/chosen": 1.2780172413793103, "rewards/margins": 10.749445812807881, "rewards/rejected": -9.471428571428572, "step": 1271 }, { "epoch": 0.9030883919062833, "grad_norm": 0.15282436340451785, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 97908827.70149253, "logits/rejected": 111458471.86885247, "logps/chosen": -251.9402985074627, "logps/rejected": -339.9344262295082, "loss": 0.1596, "rewards/chosen": 1.7388059701492538, "rewards/margins": 10.435527281624664, "rewards/rejected": -8.69672131147541, "step": 1272 }, { "epoch": 0.903798367057153, "grad_norm": 0.1800863698767531, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80208881.97260274, "logits/rejected": 103866219.05454545, "logps/chosen": -175.34246575342465, "logps/rejected": -357.23636363636365, "loss": 0.1887, "rewards/chosen": 0.6318493150684932, "rewards/margins": 10.422758405977584, "rewards/rejected": -9.790909090909091, "step": 1273 }, { "epoch": 0.9045083422080227, "grad_norm": 0.19398815323566954, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 132922428.23529412, "logits/rejected": 59559116.8, "logps/chosen": -248.47058823529412, "logps/rejected": -355.46666666666664, "loss": 0.1822, "rewards/chosen": 0.8952205882352942, "rewards/margins": 9.38688725490196, "rewards/rejected": -8.491666666666667, "step": 1274 }, { "epoch": 0.9052183173588925, "grad_norm": 0.16792986101995308, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139074290.52631578, "logits/rejected": 69058329.23943663, "logps/chosen": -199.859649122807, "logps/rejected": -299.71830985915494, "loss": 0.1567, "rewards/chosen": 1.419407894736842, "rewards/margins": 9.419407894736842, "rewards/rejected": -8.0, "step": 1275 }, { "epoch": 0.9059282925097621, "grad_norm": 0.15493932938657803, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82388114.28571428, "logits/rejected": 92224971.03448276, "logps/chosen": -242.05714285714285, "logps/rejected": -334.62068965517244, "loss": 0.1587, "rewards/chosen": 2.1232142857142855, "rewards/margins": 10.924938423645319, "rewards/rejected": -8.801724137931034, "step": 1276 }, { "epoch": 0.9066382676606318, "grad_norm": 0.22738204842195714, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102973202.55072464, "logits/rejected": 117227242.30508475, "logps/chosen": -204.05797101449275, "logps/rejected": -382.91525423728814, "loss": 0.1631, "rewards/chosen": 1.2934782608695652, "rewards/margins": 11.581613854089904, "rewards/rejected": -10.288135593220339, "step": 1277 }, { "epoch": 0.9073482428115016, "grad_norm": 0.18970500180915897, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 130547712.0, "logits/rejected": 76971146.37837838, "logps/chosen": -229.1851851851852, "logps/rejected": -341.18918918918916, "loss": 0.1658, "rewards/chosen": 0.7847222222222222, "rewards/margins": 10.027965465465465, "rewards/rejected": -9.243243243243244, "step": 1278 }, { "epoch": 0.9080582179623713, "grad_norm": 0.14979466686744913, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 73046255.37662338, "logits/rejected": 88100944.31372549, "logps/chosen": -182.44155844155844, "logps/rejected": -399.6862745098039, "loss": 0.1432, "rewards/chosen": 1.6412337662337662, "rewards/margins": 10.102018079959256, "rewards/rejected": -8.46078431372549, "step": 1279 }, { "epoch": 0.9087681931132411, "grad_norm": 0.17332999762523327, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104066221.88679245, "logits/rejected": 65878534.82666667, "logps/chosen": -236.52830188679246, "logps/rejected": -325.97333333333336, "loss": 0.1474, "rewards/chosen": 0.6043632075471698, "rewards/margins": 9.911029874213837, "rewards/rejected": -9.306666666666667, "step": 1280 }, { "epoch": 0.9094781682641108, "grad_norm": 0.1749190360838474, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123032917.33333333, "logits/rejected": 40726296.1509434, "logps/chosen": -225.28, "logps/rejected": -307.62264150943395, "loss": 0.1949, "rewards/chosen": 1.3616666666666666, "rewards/margins": 10.427704402515722, "rewards/rejected": -9.066037735849056, "step": 1281 }, { "epoch": 0.9101881434149804, "grad_norm": 0.13417514613573844, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107799872.95522387, "logits/rejected": 40395960.655737706, "logps/chosen": -299.94029850746267, "logps/rejected": -353.04918032786884, "loss": 0.143, "rewards/chosen": 2.373134328358209, "rewards/margins": 12.291167115243454, "rewards/rejected": -9.918032786885245, "step": 1282 }, { "epoch": 0.9108981185658502, "grad_norm": 0.17954373396785528, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 63183425.64102564, "logits/rejected": 148142817.28, "logps/chosen": -160.6153846153846, "logps/rejected": -431.36, "loss": 0.1922, "rewards/chosen": 1.5144230769230769, "rewards/margins": 9.294423076923078, "rewards/rejected": -7.78, "step": 1283 }, { "epoch": 0.9116080937167199, "grad_norm": 0.16609932234886146, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 118489088.0, "logits/rejected": 104202240.0, "logps/chosen": -236.5, "logps/rejected": -376.0, "loss": 0.1503, "rewards/chosen": 1.71484375, "rewards/margins": 10.55078125, "rewards/rejected": -8.8359375, "step": 1284 }, { "epoch": 0.9123180688675896, "grad_norm": 0.17957801156289369, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 134288817.8983051, "logits/rejected": 86591102.14492753, "logps/chosen": -261.1525423728813, "logps/rejected": -409.5072463768116, "loss": 0.1378, "rewards/chosen": 2.1440677966101696, "rewards/margins": 12.216531564726111, "rewards/rejected": -10.072463768115941, "step": 1285 }, { "epoch": 0.9130280440184594, "grad_norm": 0.1617208122072535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148779085.28301886, "logits/rejected": 62299395.413333334, "logps/chosen": -227.9245283018868, "logps/rejected": -348.16, "loss": 0.1453, "rewards/chosen": 1.2358490566037736, "rewards/margins": 10.635849056603774, "rewards/rejected": -9.4, "step": 1286 }, { "epoch": 0.9137380191693291, "grad_norm": 0.19784526650495263, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102662144.0, "logits/rejected": 107282432.0, "logps/chosen": -211.25, "logps/rejected": -408.5, "loss": 0.1657, "rewards/chosen": 1.283203125, "rewards/margins": 12.001953125, "rewards/rejected": -10.71875, "step": 1287 }, { "epoch": 0.9144479943201987, "grad_norm": 0.19531616637608548, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128345702.4, "logits/rejected": 53939983.058823526, "logps/chosen": -212.0, "logps/rejected": -342.11764705882354, "loss": 0.1617, "rewards/chosen": 1.5729166666666667, "rewards/margins": 10.37438725490196, "rewards/rejected": -8.801470588235293, "step": 1288 }, { "epoch": 0.9151579694710685, "grad_norm": 0.19227140419289332, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77175193.6, "logits/rejected": 93279573.33333333, "logps/chosen": -196.4, "logps/rejected": -390.0, "loss": 0.1985, "rewards/chosen": 1.5203125, "rewards/margins": 8.8328125, "rewards/rejected": -7.3125, "step": 1289 }, { "epoch": 0.9158679446219382, "grad_norm": 0.15185038112352872, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 148155722.83076924, "logits/rejected": 52070952.634920634, "logps/chosen": -186.58461538461538, "logps/rejected": -384.0, "loss": 0.1465, "rewards/chosen": 1.123076923076923, "rewards/margins": -13831548.845177045, "rewards/rejected": 13831549.968253968, "step": 1290 }, { "epoch": 0.916577919772808, "grad_norm": 0.19165383484953355, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 111425763.55555555, "logits/rejected": 67576978.28571428, "logps/chosen": -207.77777777777777, "logps/rejected": -358.85714285714283, "loss": 0.1732, "rewards/chosen": 1.4913194444444444, "rewards/margins": 10.268105158730158, "rewards/rejected": -8.776785714285714, "step": 1291 }, { "epoch": 0.9172878949236777, "grad_norm": 0.1633877535650206, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100221790.31578948, "logits/rejected": 107752260.50704226, "logps/chosen": -231.01754385964912, "logps/rejected": -402.4788732394366, "loss": 0.1396, "rewards/chosen": 2.1030701754385963, "rewards/margins": 10.969267358537188, "rewards/rejected": -8.866197183098592, "step": 1292 }, { "epoch": 0.9179978700745474, "grad_norm": 0.16093261913034612, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102003143.1111111, "logits/rejected": 92124891.42857143, "logps/chosen": -209.77777777777777, "logps/rejected": -364.2857142857143, "loss": 0.1566, "rewards/chosen": 1.9409722222222223, "rewards/margins": 11.673115079365079, "rewards/rejected": -9.732142857142858, "step": 1293 }, { "epoch": 0.9187078452254172, "grad_norm": 0.1940863848714648, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123978691.76470588, "logits/rejected": 49562692.266666666, "logps/chosen": -195.05882352941177, "logps/rejected": -330.6666666666667, "loss": 0.1757, "rewards/chosen": 1.5863970588235294, "rewards/margins": 10.88639705882353, "rewards/rejected": -9.3, "step": 1294 }, { "epoch": 0.9194178203762868, "grad_norm": 0.1845757666924306, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147381389.7846154, "logits/rejected": 53793613.20634921, "logps/chosen": -265.3538461538462, "logps/rejected": -324.06349206349205, "loss": 0.1463, "rewards/chosen": 2.3596153846153847, "rewards/margins": 11.192948717948719, "rewards/rejected": -8.833333333333334, "step": 1295 }, { "epoch": 0.9201277955271565, "grad_norm": 0.17835242793719497, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 147512880.3018868, "logits/rejected": 69094167.89333333, "logps/chosen": -218.8679245283019, "logps/rejected": -349.8666666666667, "loss": 0.1562, "rewards/chosen": 1.3679245283018868, "rewards/margins": 9.807924528301886, "rewards/rejected": -8.44, "step": 1296 }, { "epoch": 0.9208377706780263, "grad_norm": 0.17933462970985148, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 108416403.39393939, "logits/rejected": 68225090.06451613, "logps/chosen": -196.84848484848484, "logps/rejected": -366.4516129032258, "loss": 0.1945, "rewards/chosen": 0.9962121212121212, "rewards/margins": 10.092986314760509, "rewards/rejected": -9.096774193548388, "step": 1297 }, { "epoch": 0.921547745828896, "grad_norm": 0.1439865836524259, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 67451667.6923077, "logits/rejected": 93273331.8095238, "logps/chosen": -176.4923076923077, "logps/rejected": -361.14285714285717, "loss": 0.1573, "rewards/chosen": 1.7846153846153847, "rewards/margins": 11.308424908424909, "rewards/rejected": -9.523809523809524, "step": 1298 }, { "epoch": 0.9222577209797657, "grad_norm": 0.15724401899199927, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 117874405.51724137, "logits/rejected": 68427073.82857142, "logps/chosen": -145.93103448275863, "logps/rejected": -360.6857142857143, "loss": 0.1515, "rewards/chosen": 1.081896551724138, "rewards/margins": 10.153325123152708, "rewards/rejected": -9.071428571428571, "step": 1299 }, { "epoch": 0.9229676961306355, "grad_norm": 0.19454771870490445, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106301540.72131148, "logits/rejected": 53399124.059701495, "logps/chosen": -195.14754098360655, "logps/rejected": -359.64179104477614, "loss": 0.1601, "rewards/chosen": 1.3924180327868851, "rewards/margins": 10.884955346219721, "rewards/rejected": -9.492537313432836, "step": 1300 }, { "epoch": 0.9236776712815051, "grad_norm": 0.18082466233255465, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 107561822.31578948, "logits/rejected": 87489636.95774648, "logps/chosen": -231.1578947368421, "logps/rejected": -379.0422535211268, "loss": 0.1428, "rewards/chosen": 2.012609649122807, "rewards/margins": 11.083032184334074, "rewards/rejected": -9.070422535211268, "step": 1301 }, { "epoch": 0.9243876464323749, "grad_norm": 0.1622832220033678, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80800270.62857144, "logits/rejected": 102471185.65517241, "logps/chosen": -241.14285714285714, "logps/rejected": -425.1034482758621, "loss": 0.1637, "rewards/chosen": 2.0142857142857142, "rewards/margins": 11.540147783251232, "rewards/rejected": -9.525862068965518, "step": 1302 }, { "epoch": 0.9250976215832446, "grad_norm": 0.14872543422168297, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 159886868.48, "logits/rejected": 31363177.025641024, "logps/chosen": -245.6, "logps/rejected": -332.3076923076923, "loss": 0.1355, "rewards/chosen": 1.8475, "rewards/margins": 10.501346153846153, "rewards/rejected": -8.653846153846153, "step": 1303 }, { "epoch": 0.9258075967341143, "grad_norm": 0.16854126467510178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 86341495.46666667, "logits/rejected": 92336368.94117647, "logps/chosen": -224.53333333333333, "logps/rejected": -387.29411764705884, "loss": 0.1575, "rewards/chosen": 1.3854166666666667, "rewards/margins": 10.267769607843137, "rewards/rejected": -8.882352941176471, "step": 1304 }, { "epoch": 0.9265175718849841, "grad_norm": 0.152536196430188, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102073449.93103448, "logits/rejected": 79392182.85714285, "logps/chosen": -220.9655172413793, "logps/rejected": -383.54285714285714, "loss": 0.1559, "rewards/chosen": 1.4956896551724137, "rewards/margins": 11.324261083743842, "rewards/rejected": -9.82857142857143, "step": 1305 }, { "epoch": 0.9272275470358537, "grad_norm": 0.16167924608905532, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36936458.81690141, "logits/rejected": 124283850.10526316, "logps/chosen": -170.3661971830986, "logps/rejected": -331.7894736842105, "loss": 0.1559, "rewards/chosen": 1.744718309859155, "rewards/margins": 8.156999011613541, "rewards/rejected": -6.412280701754386, "step": 1306 }, { "epoch": 0.9279375221867234, "grad_norm": 0.17195299836360978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 179539512.8888889, "logits/rejected": 14268123.42857143, "logps/chosen": -282.44444444444446, "logps/rejected": -378.2857142857143, "loss": 0.1689, "rewards/chosen": 1.7777777777777777, "rewards/margins": 11.027777777777779, "rewards/rejected": -9.25, "step": 1307 }, { "epoch": 0.9286474973375932, "grad_norm": 0.14134118829530565, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88225015.1724138, "logits/rejected": 121155466.97142857, "logps/chosen": -204.9655172413793, "logps/rejected": -363.8857142857143, "loss": 0.1285, "rewards/chosen": 1.9612068965517242, "rewards/margins": 9.104064039408868, "rewards/rejected": -7.142857142857143, "step": 1308 }, { "epoch": 0.9293574724884629, "grad_norm": 0.22540495138864416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128129222.19354838, "logits/rejected": 92572578.9090909, "logps/chosen": -241.80645161290323, "logps/rejected": -360.72727272727275, "loss": 0.1629, "rewards/chosen": 1.5050403225806452, "rewards/margins": 10.611100928641251, "rewards/rejected": -9.106060606060606, "step": 1309 }, { "epoch": 0.9300674476393326, "grad_norm": 0.20581055425702507, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 176018588.20338982, "logits/rejected": 42824451.71014493, "logps/chosen": -281.76271186440675, "logps/rejected": -352.69565217391306, "loss": 0.1458, "rewards/chosen": 1.6927966101694916, "rewards/margins": 12.113086465241954, "rewards/rejected": -10.420289855072463, "step": 1310 }, { "epoch": 0.9307774227902024, "grad_norm": 0.17163634956446952, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 129710416.23880596, "logits/rejected": 52153763.67213115, "logps/chosen": -208.11940298507463, "logps/rejected": -414.42622950819674, "loss": 0.1571, "rewards/chosen": 1.6222014925373134, "rewards/margins": 10.892693295816, "rewards/rejected": -9.270491803278688, "step": 1311 }, { "epoch": 0.931487397941072, "grad_norm": 0.17973682548074832, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96275408.73846154, "logits/rejected": 56423375.23809524, "logps/chosen": -199.3846153846154, "logps/rejected": -297.9047619047619, "loss": 0.1752, "rewards/chosen": 1.5932692307692307, "rewards/margins": 10.394856532356533, "rewards/rejected": -8.801587301587302, "step": 1312 }, { "epoch": 0.9321973730919417, "grad_norm": 0.17207395510368295, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 109739494.81967214, "logits/rejected": 56263145.07462686, "logps/chosen": -224.52459016393442, "logps/rejected": -376.35820895522386, "loss": 0.1349, "rewards/chosen": 2.1362704918032787, "rewards/margins": 9.315374969415219, "rewards/rejected": -7.17910447761194, "step": 1313 }, { "epoch": 0.9329073482428115, "grad_norm": 0.18528877758941847, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 98886048.54237288, "logits/rejected": 61638032.69565217, "logps/chosen": -220.74576271186442, "logps/rejected": -408.1159420289855, "loss": 0.1458, "rewards/chosen": 1.2627118644067796, "rewards/margins": 11.530827806435767, "rewards/rejected": -10.268115942028986, "step": 1314 }, { "epoch": 0.9336173233936812, "grad_norm": 0.1638581477465997, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95390456.68571429, "logits/rejected": 58810650.48275862, "logps/chosen": -168.0, "logps/rejected": -355.86206896551727, "loss": 0.1734, "rewards/chosen": 1.457142857142857, "rewards/margins": 9.620935960591133, "rewards/rejected": -8.163793103448276, "step": 1315 }, { "epoch": 0.934327298544551, "grad_norm": 0.15030547688162135, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104786510.10169491, "logits/rejected": 64722973.68115942, "logps/chosen": -226.98305084745763, "logps/rejected": -370.5507246376812, "loss": 0.1393, "rewards/chosen": 1.11864406779661, "rewards/margins": 10.212846966347335, "rewards/rejected": -9.094202898550725, "step": 1316 }, { "epoch": 0.9350372736954207, "grad_norm": 0.23519501966749354, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 87547773.96825397, "logits/rejected": 74142389.16923077, "logps/chosen": -219.42857142857142, "logps/rejected": -361.84615384615387, "loss": 0.1515, "rewards/chosen": 1.3075396825396826, "rewards/margins": 11.284462759462759, "rewards/rejected": -9.976923076923077, "step": 1317 }, { "epoch": 0.9357472488462903, "grad_norm": 0.17641497509036932, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119290940.23529412, "logits/rejected": 76720810.66666667, "logps/chosen": -186.35294117647058, "logps/rejected": -375.46666666666664, "loss": 0.1579, "rewards/chosen": 1.8547794117647058, "rewards/margins": 10.596446078431374, "rewards/rejected": -8.741666666666667, "step": 1318 }, { "epoch": 0.9364572239971601, "grad_norm": 0.17575410452773632, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70196337.77777778, "logits/rejected": 115792749.71428572, "logps/chosen": -170.66666666666666, "logps/rejected": -367.42857142857144, "loss": 0.1868, "rewards/chosen": 1.1111111111111112, "rewards/margins": 10.334325396825397, "rewards/rejected": -9.223214285714286, "step": 1319 }, { "epoch": 0.9371671991480298, "grad_norm": 0.16587589104147257, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 172805324.8, "logits/rejected": -7586755.764705882, "logps/chosen": -254.4, "logps/rejected": -299.7647058823529, "loss": 0.1372, "rewards/chosen": 2.1375, "rewards/margins": 10.858088235294119, "rewards/rejected": -8.720588235294118, "step": 1320 }, { "epoch": 0.9378771742988995, "grad_norm": 0.18903218296137667, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56410349.44927536, "logits/rejected": 95260463.72881356, "logps/chosen": -205.68115942028984, "logps/rejected": -398.64406779661016, "loss": 0.1768, "rewards/chosen": 1.6032608695652173, "rewards/margins": 8744066.688006632, "rewards/rejected": -8744065.084745763, "step": 1321 }, { "epoch": 0.9385871494497693, "grad_norm": 0.176079773674827, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 115687668.53731343, "logits/rejected": 78866667.01639344, "logps/chosen": -230.92537313432837, "logps/rejected": -371.9344262295082, "loss": 0.1698, "rewards/chosen": 1.9738805970149254, "rewards/margins": 10.400110105211647, "rewards/rejected": -8.426229508196721, "step": 1322 }, { "epoch": 0.939297124600639, "grad_norm": 0.18197713612206182, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131917625.80645162, "logits/rejected": 59419306.666666664, "logps/chosen": -252.6451612903226, "logps/rejected": -328.24242424242425, "loss": 0.152, "rewards/chosen": 2.2278225806451615, "rewards/margins": 10.932368035190617, "rewards/rejected": -8.704545454545455, "step": 1323 }, { "epoch": 0.9400070997515086, "grad_norm": 0.1582752097392429, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 51729749.333333336, "logits/rejected": 49829513.01408451, "logps/chosen": -184.56140350877192, "logps/rejected": -338.4788732394366, "loss": 0.1416, "rewards/chosen": 1.5964912280701755, "rewards/margins": 11.730294044971583, "rewards/rejected": -10.133802816901408, "step": 1324 }, { "epoch": 0.9407170749023784, "grad_norm": 0.1905429558105092, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94138823.1111111, "logits/rejected": 52279003.428571425, "logps/chosen": -218.22222222222223, "logps/rejected": -361.14285714285717, "loss": 0.1957, "rewards/chosen": 1.4618055555555556, "rewards/margins": 11.283234126984127, "rewards/rejected": -9.821428571428571, "step": 1325 }, { "epoch": 0.9414270500532481, "grad_norm": 0.18482691943530105, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95630131.2, "logits/rejected": 60490964.52830189, "logps/chosen": -217.81333333333333, "logps/rejected": -362.8679245283019, "loss": 0.1555, "rewards/chosen": 1.77, "rewards/margins": 10.27, "rewards/rejected": -8.5, "step": 1326 }, { "epoch": 0.9421370252041179, "grad_norm": 0.14850831424550898, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103151442.44067797, "logits/rejected": 59198953.73913044, "logps/chosen": -208.8135593220339, "logps/rejected": -319.536231884058, "loss": 0.1413, "rewards/chosen": 1.7648305084745763, "rewards/margins": 10.924250798329648, "rewards/rejected": -9.159420289855072, "step": 1327 }, { "epoch": 0.9428470003549876, "grad_norm": 0.1752700328540682, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 77594624.0, "logits/rejected": 102660583.61904761, "logps/chosen": -179.56923076923076, "logps/rejected": -405.8412698412698, "loss": 0.1852, "rewards/chosen": 1.025, "rewards/margins": 8.501190476190477, "rewards/rejected": -7.476190476190476, "step": 1328 }, { "epoch": 0.9435569755058573, "grad_norm": 0.16281594054727178, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103361631.57333334, "logits/rejected": 84360906.86792453, "logps/chosen": -217.17333333333335, "logps/rejected": -389.7358490566038, "loss": 0.1541, "rewards/chosen": 2.2733333333333334, "rewards/margins": 8.962012578616353, "rewards/rejected": -6.688679245283019, "step": 1329 }, { "epoch": 0.944266950656727, "grad_norm": 0.1949832299883609, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146496704.92753622, "logits/rejected": 53424058.576271184, "logps/chosen": -237.68115942028984, "logps/rejected": -344.6779661016949, "loss": 0.1916, "rewards/chosen": 1.5452898550724639, "rewards/margins": 11.206306804225004, "rewards/rejected": -9.661016949152541, "step": 1330 }, { "epoch": 0.9449769258075967, "grad_norm": 0.19950702629472966, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114993834.66666667, "logits/rejected": 45088768.0, "logps/chosen": -196.96969696969697, "logps/rejected": -409.2903225806452, "loss": 0.1432, "rewards/chosen": 1.9659090909090908, "rewards/margins": 12.691715542521994, "rewards/rejected": -10.725806451612904, "step": 1331 }, { "epoch": 0.9456869009584664, "grad_norm": 0.16999474770282805, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120236714.66666667, "logits/rejected": 53113756.90322581, "logps/chosen": -204.36363636363637, "logps/rejected": -378.3225806451613, "loss": 0.1685, "rewards/chosen": 1.6893939393939394, "rewards/margins": 8.979716520039101, "rewards/rejected": -7.290322580645161, "step": 1332 }, { "epoch": 0.9463968761093362, "grad_norm": 0.14314258249921233, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 119498095.09433962, "logits/rejected": 105137220.26666667, "logps/chosen": -180.52830188679246, "logps/rejected": -381.0133333333333, "loss": 0.1194, "rewards/chosen": 1.875, "rewards/margins": 10.968333333333334, "rewards/rejected": -9.093333333333334, "step": 1333 }, { "epoch": 0.9471068512602059, "grad_norm": 0.1614440518372338, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 112614189.5890411, "logits/rejected": 84877460.94545455, "logps/chosen": -222.02739726027397, "logps/rejected": -387.4909090909091, "loss": 0.1588, "rewards/chosen": 1.9914383561643836, "rewards/margins": 13.100529265255293, "rewards/rejected": -11.10909090909091, "step": 1334 }, { "epoch": 0.9478168264110756, "grad_norm": 0.1904144816623038, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 44842511.515151516, "logits/rejected": 146394739.61290324, "logps/chosen": -206.06060606060606, "logps/rejected": -389.6774193548387, "loss": 0.1686, "rewards/chosen": 1.8143939393939394, "rewards/margins": 11.016006842619745, "rewards/rejected": -9.201612903225806, "step": 1335 }, { "epoch": 0.9485268015619454, "grad_norm": 0.18989175390550586, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 29438379.94029851, "logits/rejected": 73297181.37704918, "logps/chosen": -210.38805970149255, "logps/rejected": -374.55737704918033, "loss": 0.1455, "rewards/chosen": 2.042910447761194, "rewards/margins": 8139517.321598972, "rewards/rejected": -8139515.278688525, "step": 1336 }, { "epoch": 0.949236776712815, "grad_norm": 0.22499440195824247, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 93838189.71428572, "logits/rejected": 70764316.44444445, "logps/chosen": -251.14285714285714, "logps/rejected": -373.3333333333333, "loss": 0.1312, "rewards/chosen": 2.0580357142857144, "rewards/margins": 11.460813492063494, "rewards/rejected": -9.402777777777779, "step": 1337 }, { "epoch": 0.9499467518636848, "grad_norm": 0.18841397032539753, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 184549376.0, "logits/rejected": 40052649.464788735, "logps/chosen": -259.0877192982456, "logps/rejected": -441.23943661971833, "loss": 0.1562, "rewards/chosen": 1.599780701754386, "rewards/margins": 11.684287744007907, "rewards/rejected": -10.084507042253522, "step": 1338 }, { "epoch": 0.9506567270145545, "grad_norm": 0.13568689559266456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80740352.0, "logits/rejected": 108213043.2, "logps/chosen": -205.51724137931035, "logps/rejected": -383.0857142857143, "loss": 0.1293, "rewards/chosen": 1.9073275862068966, "rewards/margins": 12.328756157635468, "rewards/rejected": -10.42142857142857, "step": 1339 }, { "epoch": 0.9513667021654242, "grad_norm": 0.15804912641873273, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 60311198.89655172, "logits/rejected": 86582418.28571428, "logps/chosen": -210.3448275862069, "logps/rejected": -391.77142857142854, "loss": 0.1231, "rewards/chosen": 2.021551724137931, "rewards/margins": 12.62155172413793, "rewards/rejected": -10.6, "step": 1340 }, { "epoch": 0.952076677316294, "grad_norm": 0.18076012184346443, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120236714.66666667, "logits/rejected": 42120264.112676054, "logps/chosen": -221.75438596491227, "logps/rejected": -371.83098591549293, "loss": 0.16, "rewards/chosen": 1.4144736842105263, "rewards/margins": 11.808839881393624, "rewards/rejected": -10.394366197183098, "step": 1341 }, { "epoch": 0.9527866524671637, "grad_norm": 0.17040493923567224, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139885706.3783784, "logits/rejected": 55263838.81481481, "logps/chosen": -253.83783783783784, "logps/rejected": -406.51851851851853, "loss": 0.1741, "rewards/chosen": 1.8935810810810811, "rewards/margins": 12.217655155155155, "rewards/rejected": -10.324074074074074, "step": 1342 }, { "epoch": 0.9534966276180333, "grad_norm": 0.20572363899786705, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153930956.8, "logits/rejected": 59280701.79310345, "logps/chosen": -272.0, "logps/rejected": -338.7586206896552, "loss": 0.1613, "rewards/chosen": 1.9214285714285715, "rewards/margins": 11.766256157635468, "rewards/rejected": -9.844827586206897, "step": 1343 }, { "epoch": 0.9542066027689031, "grad_norm": 0.17428122087009862, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 71604813.15068494, "logits/rejected": 67833334.69090909, "logps/chosen": -250.95890410958904, "logps/rejected": -336.8727272727273, "loss": 0.1565, "rewards/chosen": 2.1969178082191783, "rewards/margins": 11.096917808219178, "rewards/rejected": -8.9, "step": 1344 }, { "epoch": 0.9549165779197728, "grad_norm": 0.1421707106658915, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 99024896.0, "logits/rejected": 29065216.0, "logps/chosen": -208.5, "logps/rejected": -305.25, "loss": 0.1562, "rewards/chosen": 2.17578125, "rewards/margins": 10.74609375, "rewards/rejected": -8.5703125, "step": 1345 }, { "epoch": 0.9556265530706425, "grad_norm": 0.16805180532164404, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 90177536.0, "logits/rejected": 66912256.0, "logps/chosen": -212.5, "logps/rejected": -318.5, "loss": 0.1641, "rewards/chosen": 0.841796875, "rewards/margins": 10.185546875, "rewards/rejected": -9.34375, "step": 1346 }, { "epoch": 0.9563365282215123, "grad_norm": 0.21801148362068623, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 75763415.1884058, "logits/rejected": 70272364.47457626, "logps/chosen": -186.66666666666666, "logps/rejected": -349.8305084745763, "loss": 0.1523, "rewards/chosen": 1.6884057971014492, "rewards/margins": 11.035863424220093, "rewards/rejected": -9.347457627118644, "step": 1347 }, { "epoch": 0.957046503372382, "grad_norm": 0.13960538035724074, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138476559.75384617, "logits/rejected": 58453950.984126985, "logps/chosen": -222.27692307692308, "logps/rejected": -360.12698412698415, "loss": 0.1146, "rewards/chosen": 2.480769230769231, "rewards/margins": 10.337912087912088, "rewards/rejected": -7.857142857142857, "step": 1348 }, { "epoch": 0.9577564785232516, "grad_norm": 0.20499839688669821, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150470656.0, "logits/rejected": 44236800.0, "logps/chosen": -242.5, "logps/rejected": -395.0, "loss": 0.1531, "rewards/chosen": 1.2734375, "rewards/margins": 11.8203125, "rewards/rejected": -10.546875, "step": 1349 }, { "epoch": 0.9584664536741214, "grad_norm": 0.17379135823153607, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 58458112.0, "logits/rejected": 99549184.0, "logps/chosen": -223.5, "logps/rejected": -449.5, "loss": 0.1577, "rewards/chosen": 1.740234375, "rewards/margins": 9.943359375, "rewards/rejected": -8.203125, "step": 1350 }, { "epoch": 0.9591764288249911, "grad_norm": 0.2056820507099594, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131013745.77777778, "logits/rejected": 38572617.14285714, "logps/chosen": -237.77777777777777, "logps/rejected": -337.42857142857144, "loss": 0.1888, "rewards/chosen": 1.5668402777777777, "rewards/margins": 8.031125992063492, "rewards/rejected": -6.464285714285714, "step": 1351 }, { "epoch": 0.9598864039758609, "grad_norm": 0.18039616200598535, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143457354.20289856, "logits/rejected": 73080415.45762712, "logps/chosen": -223.768115942029, "logps/rejected": -388.6101694915254, "loss": 0.1573, "rewards/chosen": 1.983695652173913, "rewards/margins": 11.271831245394253, "rewards/rejected": -9.288135593220339, "step": 1352 }, { "epoch": 0.9605963791267306, "grad_norm": 0.16773532699397772, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84279296.0, "logits/rejected": 87949312.0, "logps/chosen": -211.75, "logps/rejected": -367.5, "loss": 0.1518, "rewards/chosen": 1.763671875, "rewards/margins": 10.810546875, "rewards/rejected": -9.046875, "step": 1353 }, { "epoch": 0.9613063542776003, "grad_norm": 0.16110294274239942, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 70404388.57142857, "logits/rejected": 104857600.0, "logps/chosen": -178.15873015873015, "logps/rejected": -392.3692307692308, "loss": 0.1511, "rewards/chosen": 1.7876984126984128, "rewards/margins": 9.941544566544566, "rewards/rejected": -8.153846153846153, "step": 1354 }, { "epoch": 0.96201632942847, "grad_norm": 0.1743452511419368, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76953827.55555555, "logits/rejected": 70011172.57142857, "logps/chosen": -189.77777777777777, "logps/rejected": -340.57142857142856, "loss": 0.1671, "rewards/chosen": 1.4184027777777777, "rewards/margins": 9.498759920634921, "rewards/rejected": -8.080357142857142, "step": 1355 }, { "epoch": 0.9627263045793397, "grad_norm": 0.1787123062082456, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 127659361.74545455, "logits/rejected": 83081692.93150684, "logps/chosen": -197.0909090909091, "logps/rejected": -425.64383561643837, "loss": 0.1334, "rewards/chosen": 1.7704545454545455, "rewards/margins": 12.112920298879203, "rewards/rejected": -10.342465753424657, "step": 1356 }, { "epoch": 0.9634362797302094, "grad_norm": 0.16129476534896858, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 74123475.86206897, "logits/rejected": 84665022.17142858, "logps/chosen": -203.17241379310346, "logps/rejected": -372.57142857142856, "loss": 0.1502, "rewards/chosen": 1.7370689655172413, "rewards/margins": 10.679926108374385, "rewards/rejected": -8.942857142857143, "step": 1357 }, { "epoch": 0.9641462548810792, "grad_norm": 0.17824808525342323, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139410675.80952382, "logits/rejected": 76788027.07692307, "logps/chosen": -228.44444444444446, "logps/rejected": -369.7230769230769, "loss": 0.1402, "rewards/chosen": 2.2430555555555554, "rewards/margins": 7.504594017094017, "rewards/rejected": -5.2615384615384615, "step": 1358 }, { "epoch": 0.9648562300319489, "grad_norm": 0.16775144157796512, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 113712241.77777778, "logits/rejected": 29135433.14285714, "logps/chosen": -258.0, "logps/rejected": -368.0, "loss": 0.1586, "rewards/chosen": 1.9895833333333333, "rewards/margins": 11.302083333333334, "rewards/rejected": -9.3125, "step": 1359 }, { "epoch": 0.9655662051828185, "grad_norm": 0.17232578478374685, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 123012050.14925373, "logits/rejected": 66214995.93442623, "logps/chosen": -218.98507462686567, "logps/rejected": -350.42622950819674, "loss": 0.1582, "rewards/chosen": 1.3563432835820894, "rewards/margins": 10.716999021287009, "rewards/rejected": -9.360655737704919, "step": 1360 }, { "epoch": 0.9662761803336883, "grad_norm": 0.1829846622839147, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 161380839.6190476, "logits/rejected": 69948085.16923077, "logps/chosen": -272.5079365079365, "logps/rejected": -366.2769230769231, "loss": 0.1155, "rewards/chosen": 2.0535714285714284, "rewards/margins": 12.830494505494507, "rewards/rejected": -10.776923076923078, "step": 1361 }, { "epoch": 0.966986155484558, "grad_norm": 0.15623654251777916, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 137878211.4909091, "logits/rejected": 54195578.73972603, "logps/chosen": -177.8909090909091, "logps/rejected": -362.52054794520546, "loss": 0.1382, "rewards/chosen": 0.9068181818181819, "rewards/margins": 10.523256537982565, "rewards/rejected": -9.616438356164384, "step": 1362 }, { "epoch": 0.9676961306354278, "grad_norm": 0.15517199363665965, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 139144370.7936508, "logits/rejected": 85208898.95384616, "logps/chosen": -231.61904761904762, "logps/rejected": -420.4307692307692, "loss": 0.1283, "rewards/chosen": 1.6488095238095237, "rewards/margins": 11.648809523809524, "rewards/rejected": -10.0, "step": 1363 }, { "epoch": 0.9684061057862975, "grad_norm": 0.18106849932773353, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104651322.75409836, "logits/rejected": 99223460.29850747, "logps/chosen": -203.14754098360655, "logps/rejected": -382.56716417910445, "loss": 0.16, "rewards/chosen": 1.5327868852459017, "rewards/margins": 11.771592855395156, "rewards/rejected": -10.238805970149254, "step": 1364 }, { "epoch": 0.9691160809371672, "grad_norm": 0.1648229943872837, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125703916.89552239, "logits/rejected": 142468817.83606556, "logps/chosen": -238.56716417910448, "logps/rejected": -463.21311475409834, "loss": 0.1655, "rewards/chosen": 1.3320895522388059, "rewards/margins": 12.381269880107657, "rewards/rejected": -11.049180327868852, "step": 1365 }, { "epoch": 0.969826056088037, "grad_norm": 0.17628752845723875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 82116608.0, "logits/rejected": 92274688.0, "logps/chosen": -184.0, "logps/rejected": -331.75, "loss": 0.1624, "rewards/chosen": 1.71875, "rewards/margins": 10.59375, "rewards/rejected": -8.875, "step": 1366 }, { "epoch": 0.9705360312389066, "grad_norm": 0.19186419374850303, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95008475.42857143, "logits/rejected": 67641217.96923077, "logps/chosen": -269.2063492063492, "logps/rejected": -372.18461538461537, "loss": 0.123, "rewards/chosen": 2.8353174603174605, "rewards/margins": 12.350702075702076, "rewards/rejected": -9.515384615384615, "step": 1367 }, { "epoch": 0.9712460063897763, "grad_norm": 0.17623105653153562, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125592821.18309858, "logits/rejected": 22001699.92982456, "logps/chosen": -243.38028169014083, "logps/rejected": -366.03508771929825, "loss": 0.1425, "rewards/chosen": 1.5017605633802817, "rewards/margins": 8048478.6947430195, "rewards/rejected": -8048477.192982456, "step": 1368 }, { "epoch": 0.9719559815406461, "grad_norm": 0.20359535816333207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 164029550.2769231, "logits/rejected": 38206447.746031746, "logps/chosen": -271.0153846153846, "logps/rejected": -357.58730158730157, "loss": 0.1451, "rewards/chosen": 1.8615384615384616, "rewards/margins": 9.274236874236875, "rewards/rejected": -7.412698412698413, "step": 1369 }, { "epoch": 0.9726659566915158, "grad_norm": 0.1592258751093186, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128938690.20689656, "logits/rejected": 81069904.45714286, "logps/chosen": -213.93103448275863, "logps/rejected": -444.34285714285716, "loss": 0.1461, "rewards/chosen": 0.8254310344827587, "rewards/margins": 11.325431034482758, "rewards/rejected": -10.5, "step": 1370 }, { "epoch": 0.9733759318423855, "grad_norm": 0.16561984712488553, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 121054066.21538462, "logits/rejected": 51729749.333333336, "logps/chosen": -291.44615384615383, "logps/rejected": -363.6825396825397, "loss": 0.1383, "rewards/chosen": 2.519230769230769, "rewards/margins": 10502571.154151404, "rewards/rejected": -10502568.634920634, "step": 1371 }, { "epoch": 0.9740859069932553, "grad_norm": 0.16890395059274693, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 133836427.63636364, "logits/rejected": 82237109.67741935, "logps/chosen": -207.03030303030303, "logps/rejected": -381.93548387096774, "loss": 0.169, "rewards/chosen": 1.7604166666666667, "rewards/margins": 10.05880376344086, "rewards/rejected": -8.298387096774194, "step": 1372 }, { "epoch": 0.9747958821441249, "grad_norm": 0.17558590277453978, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128478154.10526316, "logits/rejected": 77151563.71830986, "logps/chosen": -198.59649122807016, "logps/rejected": -425.46478873239437, "loss": 0.165, "rewards/chosen": 1.0016447368421053, "rewards/margins": 10.649532060785766, "rewards/rejected": -9.647887323943662, "step": 1373 }, { "epoch": 0.9755058572949947, "grad_norm": 0.1825777402336961, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 135944794.3529412, "logits/rejected": 101082726.4, "logps/chosen": -235.52941176470588, "logps/rejected": -428.26666666666665, "loss": 0.1714, "rewards/chosen": 1.4623161764705883, "rewards/margins": 11.828982843137256, "rewards/rejected": -10.366666666666667, "step": 1374 }, { "epoch": 0.9762158324458644, "grad_norm": 0.17960562303564997, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 101577055.08571428, "logits/rejected": 70833116.68965517, "logps/chosen": -222.62857142857143, "logps/rejected": -346.48275862068965, "loss": 0.1534, "rewards/chosen": 1.9928571428571429, "rewards/margins": 11.880788177339902, "rewards/rejected": -9.887931034482758, "step": 1375 }, { "epoch": 0.9769258075967341, "grad_norm": 0.18314648316643364, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 146021697.82857144, "logits/rejected": 93684841.93103448, "logps/chosen": -252.34285714285716, "logps/rejected": -400.55172413793105, "loss": 0.1589, "rewards/chosen": 1.8125, "rewards/margins": 10.553879310344827, "rewards/rejected": -8.741379310344827, "step": 1376 }, { "epoch": 0.9776357827476039, "grad_norm": 0.16695381960573155, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103179878.4, "logits/rejected": 93138221.1764706, "logps/chosen": -193.33333333333334, "logps/rejected": -362.3529411764706, "loss": 0.1467, "rewards/chosen": 1.2947916666666666, "rewards/margins": 10.383026960784314, "rewards/rejected": -9.088235294117647, "step": 1377 }, { "epoch": 0.9783457578984736, "grad_norm": 0.1955970164436094, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 128712704.0, "logits/rejected": 67764224.0, "logps/chosen": -237.125, "logps/rejected": -398.0, "loss": 0.1379, "rewards/chosen": 2.341796875, "rewards/margins": 12.537109375, "rewards/rejected": -10.1953125, "step": 1378 }, { "epoch": 0.9790557330493432, "grad_norm": 0.19496134597226816, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 80290962.28571428, "logits/rejected": 113277048.47058824, "logps/chosen": -244.98701298701297, "logps/rejected": -409.72549019607845, "loss": 0.1836, "rewards/chosen": 1.9431818181818181, "rewards/margins": 10.01180926916221, "rewards/rejected": -8.068627450980392, "step": 1379 }, { "epoch": 0.979765708200213, "grad_norm": 0.1753566189265145, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 83531177.35384615, "logits/rejected": 106821599.49206349, "logps/chosen": -223.75384615384615, "logps/rejected": -408.8888888888889, "loss": 0.1589, "rewards/chosen": 1.4692307692307693, "rewards/margins": 10.405738705738706, "rewards/rejected": -8.936507936507937, "step": 1380 }, { "epoch": 0.9804756833510827, "grad_norm": 0.15745671563036853, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94574790.19354838, "logits/rejected": 54430626.90909091, "logps/chosen": -224.51612903225808, "logps/rejected": -353.93939393939394, "loss": 0.154, "rewards/chosen": 1.5383064516129032, "rewards/margins": 9.129215542521994, "rewards/rejected": -7.590909090909091, "step": 1381 }, { "epoch": 0.9811856585019524, "grad_norm": 0.1434346871978416, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 56445043.9245283, "logits/rejected": 76175551.14666666, "logps/chosen": -199.24528301886792, "logps/rejected": -342.61333333333334, "loss": 0.1367, "rewards/chosen": 1.7735849056603774, "rewards/margins": 11.186918238993712, "rewards/rejected": -9.413333333333334, "step": 1382 }, { "epoch": 0.9818956336528222, "grad_norm": 0.19238841293943, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 165703736.10958904, "logits/rejected": 6177065.890909091, "logps/chosen": -228.6027397260274, "logps/rejected": -294.4, "loss": 0.1922, "rewards/chosen": 1.33861301369863, "rewards/margins": 10.574976650062267, "rewards/rejected": -9.236363636363636, "step": 1383 }, { "epoch": 0.9826056088036919, "grad_norm": 0.1570484153980499, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100229402.48275863, "logits/rejected": 53132843.885714285, "logps/chosen": -215.17241379310346, "logps/rejected": -358.85714285714283, "loss": 0.1447, "rewards/chosen": 1.9504310344827587, "rewards/margins": 11.736145320197044, "rewards/rejected": -9.785714285714286, "step": 1384 }, { "epoch": 0.9833155839545616, "grad_norm": 0.187176599200556, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 114250477.97183098, "logits/rejected": 17255513.824561402, "logps/chosen": -208.22535211267606, "logps/rejected": -310.17543859649123, "loss": 0.1614, "rewards/chosen": 1.732394366197183, "rewards/margins": 7.8025698047936745, "rewards/rejected": -6.0701754385964914, "step": 1385 }, { "epoch": 0.9840255591054313, "grad_norm": 0.14785744895682082, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 88995504.87272727, "logits/rejected": 83541342.6849315, "logps/chosen": -199.27272727272728, "logps/rejected": -440.986301369863, "loss": 0.1408, "rewards/chosen": 1.425, "rewards/margins": 12.137328767123288, "rewards/rejected": -10.712328767123287, "step": 1386 }, { "epoch": 0.984735534256301, "grad_norm": 0.1780428431863678, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 150208512.0, "logits/rejected": 37289984.0, "logps/chosen": -231.75, "logps/rejected": -366.25, "loss": 0.1414, "rewards/chosen": 1.5390625, "rewards/margins": 12.1015625, "rewards/rejected": -10.5625, "step": 1387 }, { "epoch": 0.9854455094071708, "grad_norm": 0.2025810415705296, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 76821989.05263157, "logits/rejected": 93574331.49295774, "logps/chosen": -227.64912280701753, "logps/rejected": -409.23943661971833, "loss": 0.1469, "rewards/chosen": 1.7039473684210527, "rewards/margins": 11.492679762787251, "rewards/rejected": -9.788732394366198, "step": 1388 }, { "epoch": 0.9861554845580405, "grad_norm": 0.16621133832009444, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 54141192.258064516, "logits/rejected": 102442697.6969697, "logps/chosen": -208.7741935483871, "logps/rejected": -370.90909090909093, "loss": 0.1481, "rewards/chosen": 1.621975806451613, "rewards/margins": 11.637127321603128, "rewards/rejected": -10.015151515151516, "step": 1389 }, { "epoch": 0.9868654597089102, "grad_norm": 0.17557847785318786, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 94791270.4, "logits/rejected": 48767106.03174603, "logps/chosen": -271.5076923076923, "logps/rejected": -313.3968253968254, "loss": 0.1457, "rewards/chosen": 2.2596153846153846, "rewards/margins": 10.767551892551893, "rewards/rejected": -8.507936507936508, "step": 1390 }, { "epoch": 0.9875754348597799, "grad_norm": 0.14661256742540718, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 120259634.36065574, "logits/rejected": 97658421.49253732, "logps/chosen": -164.327868852459, "logps/rejected": -398.8059701492537, "loss": 0.1328, "rewards/chosen": 1.4487704918032787, "rewards/margins": 10.881606312698802, "rewards/rejected": -9.432835820895523, "step": 1391 }, { "epoch": 0.9882854100106496, "grad_norm": 0.18575393853005673, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 125518430.81481482, "logits/rejected": 47710208.0, "logps/chosen": -285.6296296296296, "logps/rejected": -359.7837837837838, "loss": 0.1479, "rewards/chosen": 1.3796296296296295, "rewards/margins": 10.541791791791791, "rewards/rejected": -9.162162162162161, "step": 1392 }, { "epoch": 0.9889953851615193, "grad_norm": 0.16642923213367716, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 84433163.13043478, "logits/rejected": 68974973.83050847, "logps/chosen": -253.44927536231884, "logps/rejected": -340.8813559322034, "loss": 0.1593, "rewards/chosen": 1.9782608695652173, "rewards/margins": 12.707074428887251, "rewards/rejected": -10.728813559322035, "step": 1393 }, { "epoch": 0.9897053603123891, "grad_norm": 0.23019320426669093, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 168165376.0, "logits/rejected": 53342208.0, "logps/chosen": -294.0, "logps/rejected": -324.5, "loss": 0.1613, "rewards/chosen": 1.685546875, "rewards/margins": 10.857421875, "rewards/rejected": -9.171875, "step": 1394 }, { "epoch": 0.9904153354632588, "grad_norm": 0.16265650733581563, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124504602.94736843, "logits/rejected": 36404786.47887324, "logps/chosen": -198.17543859649123, "logps/rejected": -369.5774647887324, "loss": 0.1439, "rewards/chosen": 1.493421052631579, "rewards/margins": 9.092012601927355, "rewards/rejected": -7.598591549295775, "step": 1395 }, { "epoch": 0.9911253106141285, "grad_norm": 0.1785756529338266, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 131249451.32307692, "logits/rejected": 73783133.46031746, "logps/chosen": -197.2923076923077, "logps/rejected": -385.5238095238095, "loss": 0.1541, "rewards/chosen": 1.8269230769230769, "rewards/margins": 9703823.033272283, "rewards/rejected": -9703821.206349207, "step": 1396 }, { "epoch": 0.9918352857649982, "grad_norm": 0.1695020864977385, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 106509901.57575758, "logits/rejected": 65282312.258064516, "logps/chosen": -184.72727272727272, "logps/rejected": -323.35483870967744, "loss": 0.1652, "rewards/chosen": 1.6458333333333333, "rewards/margins": 8.31518817204301, "rewards/rejected": -6.669354838709677, "step": 1397 }, { "epoch": 0.9925452609158679, "grad_norm": 0.18589079789798252, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 69164072.96, "logits/rejected": 97537352.4528302, "logps/chosen": -213.33333333333334, "logps/rejected": -367.39622641509436, "loss": 0.175, "rewards/chosen": 1.5316666666666667, "rewards/margins": 10.404308176100628, "rewards/rejected": -8.872641509433961, "step": 1398 }, { "epoch": 0.9932552360667377, "grad_norm": 0.15790577983412027, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 124098137.3968254, "logits/rejected": 45350912.0, "logps/chosen": -192.5079365079365, "logps/rejected": -348.55384615384617, "loss": 0.1404, "rewards/chosen": 1.3134920634920635, "rewards/margins": 11.575030525030526, "rewards/rejected": -10.261538461538462, "step": 1399 }, { "epoch": 0.9939652112176074, "grad_norm": 0.1649937868284555, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 102963398.19354838, "logits/rejected": 46645744.484848484, "logps/chosen": -189.5483870967742, "logps/rejected": -398.06060606060606, "loss": 0.1467, "rewards/chosen": 1.6350806451612903, "rewards/margins": 11.619929130009774, "rewards/rejected": -9.984848484848484, "step": 1400 }, { "epoch": 0.9946751863684771, "grad_norm": 0.14563246916042552, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100154895.51515152, "logits/rejected": 68867765.67741935, "logps/chosen": -205.57575757575756, "logps/rejected": -376.7741935483871, "loss": 0.1417, "rewards/chosen": 1.9611742424242424, "rewards/margins": 11.340206500488758, "rewards/rejected": -9.379032258064516, "step": 1401 }, { "epoch": 0.9953851615193469, "grad_norm": 0.15763476820493352, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 153282746.1818182, "logits/rejected": 44209317.161290325, "logps/chosen": -238.06060606060606, "logps/rejected": -334.96774193548384, "loss": 0.1535, "rewards/chosen": 1.9696969696969697, "rewards/margins": 11.155180840664713, "rewards/rejected": -9.185483870967742, "step": 1402 }, { "epoch": 0.9960951366702165, "grad_norm": 0.16204954023484566, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 61324783.48387097, "logits/rejected": 92719538.42424242, "logps/chosen": -243.2258064516129, "logps/rejected": -345.2121212121212, "loss": 0.136, "rewards/chosen": 2.1794354838709675, "rewards/margins": -5806899.638746334, "rewards/rejected": 5806901.818181818, "step": 1403 }, { "epoch": 0.9968051118210862, "grad_norm": 0.16551477705637543, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 60626757.81818182, "logits/rejected": 112039627.39726028, "logps/chosen": -198.83636363636364, "logps/rejected": -408.54794520547944, "loss": 0.1335, "rewards/chosen": 1.8284090909090909, "rewards/margins": 11.4174501867995, "rewards/rejected": -9.58904109589041, "step": 1404 }, { "epoch": 0.997515086971956, "grad_norm": 0.17338033598002167, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 103167358.08955224, "logits/rejected": 98325487.21311475, "logps/chosen": -213.97014925373134, "logps/rejected": -414.95081967213116, "loss": 0.1487, "rewards/chosen": 1.6119402985074627, "rewards/margins": 11.77587472473697, "rewards/rejected": -10.163934426229508, "step": 1405 }, { "epoch": 0.9982250621228257, "grad_norm": 0.14452643111292748, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 95321493.73584905, "logits/rejected": 104130587.30666667, "logps/chosen": -182.03773584905662, "logps/rejected": -406.61333333333334, "loss": 0.1415, "rewards/chosen": 1.080188679245283, "rewards/margins": 11.093522012578617, "rewards/rejected": -10.013333333333334, "step": 1406 }, { "epoch": 0.9989350372736954, "grad_norm": 0.16164883243001807, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 104181099.35483871, "logits/rejected": 70572342.3030303, "logps/chosen": -254.96774193548387, "logps/rejected": -346.1818181818182, "loss": 0.1554, "rewards/chosen": 1.9435483870967742, "rewards/margins": 10.557184750733137, "rewards/rejected": -8.613636363636363, "step": 1407 }, { "epoch": 0.9996450124245652, "grad_norm": 0.14109566332228207, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 149851042.9090909, "logits/rejected": 35448633.80645161, "logps/chosen": -239.27272727272728, "logps/rejected": -340.64516129032256, "loss": 0.1265, "rewards/chosen": 2.4545454545454546, "rewards/margins": 11.890029325513197, "rewards/rejected": -9.435483870967742, "step": 1408 }, { "epoch": 1.0007099751508697, "grad_norm": 0.18286241535339387, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36321507.55555555, "logits/rejected": 119537664.0, "logps/chosen": -168.22222222222223, "logps/rejected": -415.27272727272725, "loss": 0.1479, "rewards/chosen": 1.7222222222222223, "rewards/margins": 11.767676767676768, "rewards/rejected": -10.045454545454545, "step": 1409 } ], "logging_steps": 1, "max_steps": 1409, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }