{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 3997, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00025021894157387716, "grad_norm": 88.5, "kl": 0.0, "learning_rate": 1.4285714285714287e-07, "logits/chosen": -59541664.0, "logits/rejected": -25616944.0, "logps/chosen": -488.6276448567708, "logps/rejected": -316.3020833333333, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0005004378831477543, "grad_norm": 98.0, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "logits/chosen": -56574988.8, "logits/rejected": -65667410.28571428, "logps/chosen": -472.650927734375, "logps/rejected": -482.46128627232144, "loss": 0.5, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0007506568247216314, "grad_norm": 76.0, "kl": 0.0, "learning_rate": 4.285714285714286e-07, "logits/chosen": -16256480.0, "logits/rejected": -27352000.0, "logps/chosen": -367.8279622395833, "logps/rejected": -355.7858072916667, "loss": 0.5002, "rewards/chosen": 0.003980398178100586, "rewards/margins": -0.0015324751536051435, "rewards/rejected": 0.0055128733317057295, "step": 3 }, { "epoch": 0.0010008757662955086, "grad_norm": 93.0, "kl": 0.08193652331829071, "learning_rate": 5.714285714285715e-07, "logits/chosen": -85845922.9090909, "logits/rejected": -38409316.92307692, "logps/chosen": -430.83473899147725, "logps/rejected": -493.1432542067308, "loss": 0.4964, "rewards/chosen": 0.008217828517610376, "rewards/margins": 0.026016472363388624, "rewards/rejected": -0.017798643845778245, "step": 4 }, { "epoch": 0.0012510947078693856, "grad_norm": 86.5, "kl": 0.052356719970703125, "learning_rate": 7.142857142857143e-07, "logits/chosen": -63304915.2, "logits/rejected": -19663620.57142857, "logps/chosen": -542.8798828125, "logps/rejected": -435.6636439732143, "loss": 0.5014, "rewards/chosen": -0.020672836899757387, "rewards/margins": -0.018652871676853727, "rewards/rejected": -0.0020199652229036602, "step": 5 }, { "epoch": 0.0015013136494432628, "grad_norm": 94.0, "kl": 0.04557546228170395, "learning_rate": 8.571428571428572e-07, "logits/chosen": -45416180.0, "logits/rejected": -37515092.0, "logps/chosen": -473.4654235839844, "logps/rejected": -438.4719543457031, "loss": 0.5, "rewards/chosen": 0.006817246787250042, "rewards/margins": 0.001284862868487835, "rewards/rejected": 0.005532383918762207, "step": 6 }, { "epoch": 0.00175153259101714, "grad_norm": 112.0, "kl": 0.0428212508559227, "learning_rate": 1.0000000000000002e-06, "logits/chosen": -80972763.42857143, "logits/rejected": -25930512.0, "logps/chosen": -587.2008928571429, "logps/rejected": -637.29287109375, "loss": 0.4812, "rewards/chosen": 0.06468875067574638, "rewards/margins": 0.15806072013718742, "rewards/rejected": -0.09337196946144104, "step": 7 }, { "epoch": 0.0020017515325910173, "grad_norm": 101.0, "kl": 0.0089480085298419, "learning_rate": 1.142857142857143e-06, "logits/chosen": -58541333.333333336, "logits/rejected": -18188266.666666668, "logps/chosen": -537.5132378472222, "logps/rejected": -592.4221354166667, "loss": 0.5031, "rewards/chosen": -0.04628287421332465, "rewards/margins": -0.039186521536774106, "rewards/rejected": -0.007096352676550548, "step": 8 }, { "epoch": 0.0022519704741648943, "grad_norm": 81.0, "kl": 0.1360289305448532, "learning_rate": 1.2857142857142856e-06, "logits/chosen": -30847316.57142857, "logits/rejected": -29223027.2, "logps/chosen": -365.27615792410717, "logps/rejected": -316.3841796875, "loss": 0.4918, "rewards/chosen": 0.01241051937852587, "rewards/margins": 0.08507478024278368, "rewards/rejected": -0.07266426086425781, "step": 9 }, { "epoch": 0.0025021894157387712, "grad_norm": 80.0, "kl": 0.05688444897532463, "learning_rate": 1.4285714285714286e-06, "logits/chosen": -41497478.4, "logits/rejected": -35174166.85714286, "logps/chosen": -405.414208984375, "logps/rejected": -407.22471400669644, "loss": 0.4902, "rewards/chosen": -0.002581767737865448, "rewards/margins": 0.06627030351332255, "rewards/rejected": -0.068852071251188, "step": 10 }, { "epoch": 0.0027524083573126487, "grad_norm": 85.5, "kl": 0.0, "learning_rate": 1.5714285714285714e-06, "logits/chosen": 4115368.727272727, "logits/rejected": -23189848.615384616, "logps/chosen": -608.0187766335227, "logps/rejected": -409.3933293269231, "loss": 0.4965, "rewards/chosen": -0.04436756805940108, "rewards/margins": 0.01926962437329592, "rewards/rejected": -0.063637192432697, "step": 11 }, { "epoch": 0.0030026272988865257, "grad_norm": 76.5, "kl": 0.005174319259822369, "learning_rate": 1.7142857142857145e-06, "logits/chosen": -36525562.18181818, "logits/rejected": -59515072.0, "logps/chosen": -335.98237748579544, "logps/rejected": -383.279296875, "loss": 0.4785, "rewards/chosen": 0.026898888024416836, "rewards/margins": 0.16427097829071793, "rewards/rejected": -0.1373720902663011, "step": 12 }, { "epoch": 0.0032528462404604027, "grad_norm": 90.5, "kl": 0.0, "learning_rate": 1.8571428571428573e-06, "logits/chosen": -61187712.0, "logits/rejected": -30487753.14285714, "logps/chosen": -554.416796875, "logps/rejected": -470.5059291294643, "loss": 0.4635, "rewards/chosen": 0.04775131344795227, "rewards/margins": 0.2668748813016074, "rewards/rejected": -0.21912356785365514, "step": 13 }, { "epoch": 0.00350306518203428, "grad_norm": 87.0, "kl": 0.0, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -82098724.57142857, "logits/rejected": -23819310.4, "logps/chosen": -455.3506556919643, "logps/rejected": -330.45283203125, "loss": 0.4759, "rewards/chosen": 0.030299640127590725, "rewards/margins": 0.2239126924957548, "rewards/rejected": -0.19361305236816406, "step": 14 }, { "epoch": 0.003753284123608157, "grad_norm": 90.0, "kl": 0.0, "learning_rate": 2.1428571428571427e-06, "logits/chosen": -91756363.63636364, "logits/rejected": -43028002.461538464, "logps/chosen": -455.7848011363636, "logps/rejected": -466.22325721153845, "loss": 0.4504, "rewards/chosen": 0.07364630699157715, "rewards/margins": 0.3964222211104173, "rewards/rejected": -0.32277591411884016, "step": 15 }, { "epoch": 0.0040035030651820345, "grad_norm": 87.0, "kl": 0.0, "learning_rate": 2.285714285714286e-06, "logits/chosen": -9058512.0, "logits/rejected": -45979446.85714286, "logps/chosen": -545.936767578125, "logps/rejected": -541.0837751116071, "loss": 0.4381, "rewards/chosen": 0.058348214626312254, "rewards/margins": 0.4579805663653782, "rewards/rejected": -0.39963235173906597, "step": 16 }, { "epoch": 0.0042537220067559115, "grad_norm": 81.0, "kl": 0.019661586731672287, "learning_rate": 2.428571428571429e-06, "logits/chosen": -5214717.090909091, "logits/rejected": -40260457.84615385, "logps/chosen": -561.8737571022727, "logps/rejected": -403.4113957331731, "loss": 0.4358, "rewards/chosen": 0.1934277577833696, "rewards/margins": 0.5138672065067958, "rewards/rejected": -0.3204394487234262, "step": 17 }, { "epoch": 0.0045039409483297885, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 2.571428571428571e-06, "logits/chosen": -51075559.384615384, "logits/rejected": -69050926.54545455, "logps/chosen": -383.6446063701923, "logps/rejected": -542.7177734375, "loss": 0.435, "rewards/chosen": 0.054797039582179144, "rewards/margins": 0.5791361086018436, "rewards/rejected": -0.5243390690196644, "step": 18 }, { "epoch": 0.0047541598899036655, "grad_norm": 84.0, "kl": 0.28668975830078125, "learning_rate": 2.7142857142857144e-06, "logits/chosen": -84931226.66666667, "logits/rejected": -37310128.0, "logps/chosen": -557.7818603515625, "logps/rejected": -442.5623372395833, "loss": 0.4194, "rewards/chosen": 0.20295224587122598, "rewards/margins": 0.7280211249987284, "rewards/rejected": -0.5250688791275024, "step": 19 }, { "epoch": 0.0050043788314775425, "grad_norm": 76.5, "kl": 0.022904079407453537, "learning_rate": 2.8571428571428573e-06, "logits/chosen": -30446306.285714287, "logits/rejected": -60657049.6, "logps/chosen": -415.9957798549107, "logps/rejected": -440.586328125, "loss": 0.4335, "rewards/chosen": 0.11211512769971575, "rewards/margins": 0.6526112862995693, "rewards/rejected": -0.5404961585998536, "step": 20 }, { "epoch": 0.00525459777305142, "grad_norm": 77.0, "kl": 0.0, "learning_rate": 3e-06, "logits/chosen": -47356508.0, "logits/rejected": -38769844.0, "logps/chosen": -540.397705078125, "logps/rejected": -476.3580322265625, "loss": 0.3676, "rewards/chosen": 0.2587902247905731, "rewards/margins": 1.0264058411121368, "rewards/rejected": -0.7676156163215637, "step": 21 }, { "epoch": 0.005504816714625297, "grad_norm": 78.5, "kl": 0.0, "learning_rate": 3.142857142857143e-06, "logits/chosen": -44589024.0, "logits/rejected": -50823984.0, "logps/chosen": -392.33087158203125, "logps/rejected": -431.8802185058594, "loss": 0.4085, "rewards/chosen": 0.20118574798107147, "rewards/margins": 0.9541785567998886, "rewards/rejected": -0.7529928088188171, "step": 22 }, { "epoch": 0.005755035656199174, "grad_norm": 73.5, "kl": 0.2415180206298828, "learning_rate": 3.285714285714286e-06, "logits/chosen": -29925634.285714287, "logits/rejected": -40795820.8, "logps/chosen": -434.2471400669643, "logps/rejected": -425.832421875, "loss": 0.3729, "rewards/chosen": 0.42611227716718403, "rewards/margins": 1.3466090270451136, "rewards/rejected": -0.9204967498779297, "step": 23 }, { "epoch": 0.006005254597773051, "grad_norm": 67.0, "kl": 0.8058691024780273, "learning_rate": 3.428571428571429e-06, "logits/chosen": -54340292.571428575, "logits/rejected": -37819292.8, "logps/chosen": -382.73416573660717, "logps/rejected": -552.03896484375, "loss": 0.3819, "rewards/chosen": 0.33219385147094727, "rewards/margins": 1.3029109001159669, "rewards/rejected": -0.9707170486450195, "step": 24 }, { "epoch": 0.006255473539346928, "grad_norm": 62.0, "kl": 0.8036088943481445, "learning_rate": 3.5714285714285718e-06, "logits/chosen": -59392403.692307696, "logits/rejected": -47422577.45454545, "logps/chosen": -459.494140625, "logps/rejected": -587.9924538352273, "loss": 0.3155, "rewards/chosen": 0.5894884696373572, "rewards/margins": 2.0235542017263133, "rewards/rejected": -1.434065732088956, "step": 25 }, { "epoch": 0.006505692480920805, "grad_norm": 52.0, "kl": 0.0, "learning_rate": 3.7142857142857146e-06, "logits/chosen": -41334444.307692304, "logits/rejected": -39218932.36363637, "logps/chosen": -381.20650540865387, "logps/rejected": -425.54092684659093, "loss": 0.3401, "rewards/chosen": 0.5716312848604642, "rewards/margins": 1.5807496250926198, "rewards/rejected": -1.0091183402321555, "step": 26 }, { "epoch": 0.006755911422494683, "grad_norm": 65.5, "kl": 0.18149185180664062, "learning_rate": 3.857142857142858e-06, "logits/chosen": -45916352.0, "logits/rejected": -43130663.384615384, "logps/chosen": -632.1180308948864, "logps/rejected": -468.31385216346155, "loss": 0.2855, "rewards/chosen": 0.8257727189497515, "rewards/margins": 2.0523407942765246, "rewards/rejected": -1.226568075326773, "step": 27 }, { "epoch": 0.00700613036406856, "grad_norm": 60.75, "kl": 2.256681442260742, "learning_rate": 4.000000000000001e-06, "logits/chosen": -52474614.15384615, "logits/rejected": -35245515.63636363, "logps/chosen": -576.4675856370193, "logps/rejected": -331.1305042613636, "loss": 0.297, "rewards/chosen": 1.36091555081881, "rewards/margins": 2.2001000517731777, "rewards/rejected": -0.8391845009543679, "step": 28 }, { "epoch": 0.007256349305642437, "grad_norm": 46.75, "kl": 0.9984372854232788, "learning_rate": 4.1428571428571435e-06, "logits/chosen": -46850481.777777776, "logits/rejected": -56521501.86666667, "logps/chosen": -426.9078776041667, "logps/rejected": -384.9435546875, "loss": 0.2869, "rewards/chosen": 1.1934963862101238, "rewards/margins": 2.367296028137207, "rewards/rejected": -1.1737996419270833, "step": 29 }, { "epoch": 0.007506568247216314, "grad_norm": 66.5, "kl": 2.4080123901367188, "learning_rate": 4.2857142857142855e-06, "logits/chosen": -49813892.92307692, "logits/rejected": -23674167.272727273, "logps/chosen": -528.6504657451923, "logps/rejected": -403.7041015625, "loss": 0.3331, "rewards/chosen": 0.9016431661752554, "rewards/margins": 1.9492643729790107, "rewards/rejected": -1.0476212068037554, "step": 30 }, { "epoch": 0.007756787188790191, "grad_norm": 46.75, "kl": 0.9658675193786621, "learning_rate": 4.428571428571429e-06, "logits/chosen": -48486181.333333336, "logits/rejected": -28836466.666666668, "logps/chosen": -339.1027018229167, "logps/rejected": -356.8131510416667, "loss": 0.2965, "rewards/chosen": 1.1228853861490886, "rewards/margins": 2.131355444590251, "rewards/rejected": -1.008470058441162, "step": 31 }, { "epoch": 0.008007006130364069, "grad_norm": 66.5, "kl": 5.168900012969971, "learning_rate": 4.571428571428572e-06, "logits/chosen": -51749624.47058824, "logits/rejected": -26386843.42857143, "logps/chosen": -440.3121553308824, "logps/rejected": -363.31431361607144, "loss": 0.3235, "rewards/chosen": 1.1607482012580423, "rewards/margins": 2.905660212540827, "rewards/rejected": -1.7449120112827845, "step": 32 }, { "epoch": 0.008257225071937945, "grad_norm": 49.25, "kl": 5.057428359985352, "learning_rate": 4.714285714285715e-06, "logits/chosen": -42186612.705882356, "logits/rejected": -63770464.0, "logps/chosen": -444.23133042279414, "logps/rejected": -495.78916713169644, "loss": 0.2656, "rewards/chosen": 1.668528388528263, "rewards/margins": 3.3087558425775097, "rewards/rejected": -1.6402274540492467, "step": 33 }, { "epoch": 0.008507444013511823, "grad_norm": 49.0, "kl": 5.5824666023254395, "learning_rate": 4.857142857142858e-06, "logits/chosen": -9483001.846153846, "logits/rejected": -43397184.0, "logps/chosen": -628.7256234975962, "logps/rejected": -417.3606622869318, "loss": 0.2009, "rewards/chosen": 2.539976560152494, "rewards/margins": 3.367193662203275, "rewards/rejected": -0.8272171020507812, "step": 34 }, { "epoch": 0.0087576629550857, "grad_norm": 38.5, "kl": 6.245134353637695, "learning_rate": 5e-06, "logits/chosen": -44029211.428571425, "logits/rejected": -69293568.0, "logps/chosen": -381.298828125, "logps/rejected": -482.6978515625, "loss": 0.253, "rewards/chosen": 1.7988497870309013, "rewards/margins": 2.8967734473092213, "rewards/rejected": -1.0979236602783202, "step": 35 }, { "epoch": 0.009007881896659577, "grad_norm": 45.5, "kl": 4.012667655944824, "learning_rate": 5e-06, "logits/chosen": -46534710.15384615, "logits/rejected": -37244465.45454545, "logps/chosen": -459.57083834134613, "logps/rejected": -457.34614701704544, "loss": 0.2657, "rewards/chosen": 1.8993977766770582, "rewards/margins": 3.029151496353683, "rewards/rejected": -1.1297537196766247, "step": 36 }, { "epoch": 0.009258100838233455, "grad_norm": 44.25, "kl": 4.958836078643799, "learning_rate": 5e-06, "logits/chosen": -54757964.8, "logits/rejected": -41609389.71428572, "logps/chosen": -385.8251953125, "logps/rejected": -402.88065011160717, "loss": 0.2729, "rewards/chosen": 2.25915412902832, "rewards/margins": 3.121587835039411, "rewards/rejected": -0.8624337060110909, "step": 37 }, { "epoch": 0.009508319779807331, "grad_norm": 46.0, "kl": 7.350357532501221, "learning_rate": 5e-06, "logits/chosen": -68632512.0, "logits/rejected": -33406549.333333332, "logps/chosen": -517.378662109375, "logps/rejected": -517.5869140625, "loss": 0.2025, "rewards/chosen": 2.505164623260498, "rewards/margins": 3.9591061274210615, "rewards/rejected": -1.4539415041605632, "step": 38 }, { "epoch": 0.009758538721381209, "grad_norm": 30.75, "kl": 7.688513278961182, "learning_rate": 5e-06, "logits/chosen": -63052647.384615384, "logits/rejected": -40342722.90909091, "logps/chosen": -486.09878305288464, "logps/rejected": -492.6912286931818, "loss": 0.1482, "rewards/chosen": 3.5016696636493387, "rewards/margins": 5.5918673135183905, "rewards/rejected": -2.0901976498690518, "step": 39 }, { "epoch": 0.010008757662955085, "grad_norm": 52.75, "kl": 0.9967638850212097, "learning_rate": 5e-06, "logits/chosen": -45448392.0, "logits/rejected": -21358310.0, "logps/chosen": -449.9379577636719, "logps/rejected": -443.4837646484375, "loss": 0.2033, "rewards/chosen": 3.020130157470703, "rewards/margins": 4.729620933532715, "rewards/rejected": -1.7094907760620117, "step": 40 }, { "epoch": 0.010258976604528963, "grad_norm": 39.0, "kl": 3.558825969696045, "learning_rate": 5e-06, "logits/chosen": -78859318.15384616, "logits/rejected": -52826565.81818182, "logps/chosen": -418.4805438701923, "logps/rejected": -416.9651988636364, "loss": 0.1979, "rewards/chosen": 2.3863435891958384, "rewards/margins": 4.1012449064454835, "rewards/rejected": -1.7149013172496448, "step": 41 }, { "epoch": 0.01050919554610284, "grad_norm": 36.5, "kl": 5.200307369232178, "learning_rate": 5e-06, "logits/chosen": -49897130.666666664, "logits/rejected": -51501777.777777776, "logps/chosen": -446.23912760416664, "logps/rejected": -361.51161024305554, "loss": 0.2222, "rewards/chosen": 2.500630187988281, "rewards/margins": 3.2733071857028535, "rewards/rejected": -0.7726769977145724, "step": 42 }, { "epoch": 0.010759414487676717, "grad_norm": 34.0, "kl": 5.424537181854248, "learning_rate": 5e-06, "logits/chosen": -66925262.76923077, "logits/rejected": -28641832.727272727, "logps/chosen": -444.0276442307692, "logps/rejected": -439.81986860795456, "loss": 0.262, "rewards/chosen": 2.707141582782452, "rewards/margins": 4.80719745075786, "rewards/rejected": -2.100055867975408, "step": 43 }, { "epoch": 0.011009633429250595, "grad_norm": 27.875, "kl": 4.094232082366943, "learning_rate": 5e-06, "logits/chosen": -30213911.466666665, "logits/rejected": -60019619.55555555, "logps/chosen": -424.7192708333333, "logps/rejected": -568.1768120659722, "loss": 0.1609, "rewards/chosen": 2.563251241048177, "rewards/margins": 5.800806850857205, "rewards/rejected": -3.2375556098090277, "step": 44 }, { "epoch": 0.01125985237082447, "grad_norm": 37.0, "kl": 3.9221110343933105, "learning_rate": 5e-06, "logits/chosen": -32619204.923076924, "logits/rejected": -81111790.54545455, "logps/chosen": -342.2945087139423, "logps/rejected": -644.5071910511364, "loss": 0.1991, "rewards/chosen": 1.9459634927602916, "rewards/margins": 3.8989322635677315, "rewards/rejected": -1.9529687708074397, "step": 45 }, { "epoch": 0.011510071312398349, "grad_norm": 35.5, "kl": 5.3883137702941895, "learning_rate": 5e-06, "logits/chosen": -23232284.0, "logits/rejected": -24105454.0, "logps/chosen": -347.0953369140625, "logps/rejected": -546.0194091796875, "loss": 0.2393, "rewards/chosen": 2.0438919067382812, "rewards/margins": 4.417301893234253, "rewards/rejected": -2.3734099864959717, "step": 46 }, { "epoch": 0.011760290253972227, "grad_norm": 29.25, "kl": 5.374369144439697, "learning_rate": 5e-06, "logits/chosen": -61469664.0, "logits/rejected": -81658696.0, "logps/chosen": -452.052490234375, "logps/rejected": -572.4196166992188, "loss": 0.175, "rewards/chosen": 2.912391185760498, "rewards/margins": 6.923405170440674, "rewards/rejected": -4.011013984680176, "step": 47 }, { "epoch": 0.012010509195546103, "grad_norm": 42.0, "kl": 3.7215304374694824, "learning_rate": 5e-06, "logits/chosen": -60276736.0, "logits/rejected": -55375522.90909091, "logps/chosen": -453.3477313701923, "logps/rejected": -561.0490500710227, "loss": 0.2311, "rewards/chosen": 2.7364683884840746, "rewards/margins": 4.453294393899558, "rewards/rejected": -1.716826005415483, "step": 48 }, { "epoch": 0.01226072813711998, "grad_norm": 26.0, "kl": 3.1391959190368652, "learning_rate": 5e-06, "logits/chosen": -53692666.18181818, "logits/rejected": -52693277.538461536, "logps/chosen": -464.56005859375, "logps/rejected": -463.7102614182692, "loss": 0.1819, "rewards/chosen": 2.6378198103471235, "rewards/margins": 5.295058990691926, "rewards/rejected": -2.657239180344802, "step": 49 }, { "epoch": 0.012510947078693857, "grad_norm": 34.25, "kl": 2.9572463035583496, "learning_rate": 5e-06, "logits/chosen": -36200530.28571428, "logits/rejected": -24690896.0, "logps/chosen": -300.24166434151783, "logps/rejected": -411.090625, "loss": 0.2121, "rewards/chosen": 2.0287959235055104, "rewards/margins": 4.210692514692034, "rewards/rejected": -2.1818965911865233, "step": 50 }, { "epoch": 0.012761166020267735, "grad_norm": 35.5, "kl": 3.992845058441162, "learning_rate": 5e-06, "logits/chosen": -43658215.384615384, "logits/rejected": -38654816.0, "logps/chosen": -418.28568209134613, "logps/rejected": -510.7462713068182, "loss": 0.1616, "rewards/chosen": 2.639216789832482, "rewards/margins": 5.9850447594702665, "rewards/rejected": -3.345827969637784, "step": 51 }, { "epoch": 0.01301138496184161, "grad_norm": 35.75, "kl": 4.593269348144531, "learning_rate": 5e-06, "logits/chosen": -53449028.266666666, "logits/rejected": -42102737.777777776, "logps/chosen": -324.2541015625, "logps/rejected": -351.76752387152777, "loss": 0.2912, "rewards/chosen": 1.6974297841389974, "rewards/margins": 3.6081525166829427, "rewards/rejected": -1.9107227325439453, "step": 52 }, { "epoch": 0.013261603903415489, "grad_norm": 24.25, "kl": 2.7101986408233643, "learning_rate": 5e-06, "logits/chosen": -70743861.33333333, "logits/rejected": -23802826.666666668, "logps/chosen": -488.441162109375, "logps/rejected": -642.853515625, "loss": 0.1279, "rewards/chosen": 3.102839152018229, "rewards/margins": 7.0102189381917315, "rewards/rejected": -3.9073797861735025, "step": 53 }, { "epoch": 0.013511822844989366, "grad_norm": 33.75, "kl": 1.6181539297103882, "learning_rate": 5e-06, "logits/chosen": -79371884.3076923, "logits/rejected": -19914660.363636363, "logps/chosen": -541.4880558894231, "logps/rejected": -677.6415127840909, "loss": 0.1736, "rewards/chosen": 3.6615911630483775, "rewards/margins": 7.590517110757895, "rewards/rejected": -3.928925947709517, "step": 54 }, { "epoch": 0.013762041786563243, "grad_norm": 34.0, "kl": 17.674467086791992, "learning_rate": 5e-06, "logits/chosen": -65001088.0, "logits/rejected": -17387204.0, "logps/chosen": -424.58978271484375, "logps/rejected": -450.2849426269531, "loss": 0.3174, "rewards/chosen": 3.732567310333252, "rewards/margins": 6.20811915397644, "rewards/rejected": -2.4755518436431885, "step": 55 }, { "epoch": 0.01401226072813712, "grad_norm": 23.5, "kl": 11.61301040649414, "learning_rate": 5e-06, "logits/chosen": -72500420.26666667, "logits/rejected": -57343502.222222224, "logps/chosen": -424.5728515625, "logps/rejected": -460.6795247395833, "loss": 0.1971, "rewards/chosen": 3.042181905110677, "rewards/margins": 6.413038084242078, "rewards/rejected": -3.370856179131402, "step": 56 }, { "epoch": 0.014262479669710997, "grad_norm": 29.75, "kl": 10.680805206298828, "learning_rate": 5e-06, "logits/chosen": -58818313.84615385, "logits/rejected": -38190298.18181818, "logps/chosen": -472.00473257211536, "logps/rejected": -425.52903053977275, "loss": 0.2436, "rewards/chosen": 3.4778641920823317, "rewards/margins": 5.632143033967985, "rewards/rejected": -2.1542788418856533, "step": 57 }, { "epoch": 0.014512698611284874, "grad_norm": 19.5, "kl": 5.999025821685791, "learning_rate": 5e-06, "logits/chosen": -80188544.0, "logits/rejected": -60439285.333333336, "logps/chosen": -452.6884358723958, "logps/rejected": -470.2325032552083, "loss": 0.1122, "rewards/chosen": 4.041547139485677, "rewards/margins": 6.5902516047159825, "rewards/rejected": -2.548704465230306, "step": 58 }, { "epoch": 0.014762917552858752, "grad_norm": 23.75, "kl": 6.040742874145508, "learning_rate": 5e-06, "logits/chosen": -56422065.23076923, "logits/rejected": -51661288.72727273, "logps/chosen": -452.8036358173077, "logps/rejected": -417.02823153409093, "loss": 0.1324, "rewards/chosen": 3.348560333251953, "rewards/margins": 5.20868561484597, "rewards/rejected": -1.8601252815940164, "step": 59 }, { "epoch": 0.015013136494432628, "grad_norm": 26.375, "kl": 2.8226964473724365, "learning_rate": 5e-06, "logits/chosen": -35815285.333333336, "logits/rejected": -34013146.666666664, "logps/chosen": -462.9361979166667, "logps/rejected": -472.0421549479167, "loss": 0.1239, "rewards/chosen": 3.104276657104492, "rewards/margins": 6.3904050191243496, "rewards/rejected": -3.286128362019857, "step": 60 }, { "epoch": 0.015263355436006506, "grad_norm": 28.375, "kl": 6.3608245849609375, "learning_rate": 5e-06, "logits/chosen": -35580477.333333336, "logits/rejected": -49281386.666666664, "logps/chosen": -451.8302408854167, "logps/rejected": -531.75537109375, "loss": 0.1522, "rewards/chosen": 3.9960447947184243, "rewards/margins": 8.040406862894693, "rewards/rejected": -4.0443620681762695, "step": 61 }, { "epoch": 0.015513574377580382, "grad_norm": 38.75, "kl": 15.220528602600098, "learning_rate": 5e-06, "logits/chosen": -61638628.571428575, "logits/rejected": -35349504.0, "logps/chosen": -506.88438197544644, "logps/rejected": -523.688134765625, "loss": 0.1738, "rewards/chosen": 3.3980004446847096, "rewards/margins": 6.051696504865374, "rewards/rejected": -2.653696060180664, "step": 62 }, { "epoch": 0.01576379331915426, "grad_norm": 31.125, "kl": 6.477090835571289, "learning_rate": 5e-06, "logits/chosen": -46088345.6, "logits/rejected": -20498917.333333332, "logps/chosen": -412.12242838541664, "logps/rejected": -664.1638454861111, "loss": 0.1967, "rewards/chosen": 3.4296048482259116, "rewards/margins": 7.458029429117838, "rewards/rejected": -4.028424580891927, "step": 63 }, { "epoch": 0.016014012260728138, "grad_norm": 30.625, "kl": 8.509873390197754, "learning_rate": 5e-06, "logits/chosen": -40442308.0, "logits/rejected": -56075968.0, "logps/chosen": -437.4285888671875, "logps/rejected": -570.4381713867188, "loss": 0.1208, "rewards/chosen": 3.924570083618164, "rewards/margins": 7.112115859985352, "rewards/rejected": -3.1875457763671875, "step": 64 }, { "epoch": 0.016264231202302016, "grad_norm": 24.5, "kl": 0.34754371643066406, "learning_rate": 5e-06, "logits/chosen": -84957613.71428572, "logits/rejected": -46729106.823529415, "logps/chosen": -515.1071428571429, "logps/rejected": -652.2908432904412, "loss": 0.0864, "rewards/chosen": 4.909284319196429, "rewards/margins": 9.977046806271337, "rewards/rejected": -5.0677624870749085, "step": 65 }, { "epoch": 0.01651445014387589, "grad_norm": 30.875, "kl": 6.232099533081055, "learning_rate": 5e-06, "logits/chosen": -4933568.0, "logits/rejected": -36256624.0, "logps/chosen": -379.7061767578125, "logps/rejected": -517.9809919084821, "loss": 0.1841, "rewards/chosen": 3.948345184326172, "rewards/margins": 6.235989706856864, "rewards/rejected": -2.287644522530692, "step": 66 }, { "epoch": 0.016764669085449768, "grad_norm": 27.5, "kl": 7.00858211517334, "learning_rate": 5e-06, "logits/chosen": -69562180.57142857, "logits/rejected": 14068864.0, "logps/chosen": -482.22202845982144, "logps/rejected": -514.0224609375, "loss": 0.158, "rewards/chosen": 3.3606959751674106, "rewards/margins": 6.84954103742327, "rewards/rejected": -3.4888450622558596, "step": 67 }, { "epoch": 0.017014888027023646, "grad_norm": 28.5, "kl": 2.7451655864715576, "learning_rate": 5e-06, "logits/chosen": -53321006.54545455, "logits/rejected": -56115938.461538464, "logps/chosen": -363.21928267045456, "logps/rejected": -505.3498347355769, "loss": 0.1439, "rewards/chosen": 2.931535547429865, "rewards/margins": 6.898419573590472, "rewards/rejected": -3.966884026160607, "step": 68 }, { "epoch": 0.017265106968597524, "grad_norm": 38.25, "kl": 4.104753017425537, "learning_rate": 5e-06, "logits/chosen": -28209125.818181816, "logits/rejected": -44421026.461538464, "logps/chosen": -415.62442294034093, "logps/rejected": -705.1533203125, "loss": 0.1929, "rewards/chosen": 4.027130473743785, "rewards/margins": 7.716762596077018, "rewards/rejected": -3.6896321223332333, "step": 69 }, { "epoch": 0.0175153259101714, "grad_norm": 28.5, "kl": 11.870689392089844, "learning_rate": 5e-06, "logits/chosen": 13527932.57142857, "logits/rejected": -30350771.2, "logps/chosen": -632.6715262276786, "logps/rejected": -554.3529296875, "loss": 0.2055, "rewards/chosen": 4.735430036272321, "rewards/margins": 7.6904417855399, "rewards/rejected": -2.955011749267578, "step": 70 }, { "epoch": 0.017765544851745276, "grad_norm": 37.0, "kl": 22.042312622070312, "learning_rate": 5e-06, "logits/chosen": -66415018.666666664, "logits/rejected": -37259733.333333336, "logps/chosen": -532.311328125, "logps/rejected": -393.3572591145833, "loss": 0.1746, "rewards/chosen": 5.246820068359375, "rewards/margins": 7.600402450561523, "rewards/rejected": -2.3535823822021484, "step": 71 }, { "epoch": 0.018015763793319154, "grad_norm": 34.0, "kl": 8.992449760437012, "learning_rate": 5e-06, "logits/chosen": -44114397.538461536, "logits/rejected": -35993207.27272727, "logps/chosen": -402.97408353365387, "logps/rejected": -380.97194602272725, "loss": 0.1486, "rewards/chosen": 4.046647585355318, "rewards/margins": 6.243044846541398, "rewards/rejected": -2.1963972611860796, "step": 72 }, { "epoch": 0.018265982734893032, "grad_norm": 31.125, "kl": 3.5951037406921387, "learning_rate": 5e-06, "logits/chosen": -44106713.6, "logits/rejected": -29727241.14285714, "logps/chosen": -367.2716796875, "logps/rejected": -408.199951171875, "loss": 0.1656, "rewards/chosen": 2.72470645904541, "rewards/margins": 5.160595566885812, "rewards/rejected": -2.435889107840402, "step": 73 }, { "epoch": 0.01851620167646691, "grad_norm": 24.75, "kl": 0.0681304931640625, "learning_rate": 5e-06, "logits/chosen": -42291921.45454545, "logits/rejected": -5568534.153846154, "logps/chosen": -345.49429598721593, "logps/rejected": -627.5030423677885, "loss": 0.1198, "rewards/chosen": 2.857915011319247, "rewards/margins": 7.511329810936134, "rewards/rejected": -4.653414799616887, "step": 74 }, { "epoch": 0.018766420618040784, "grad_norm": 29.125, "kl": 3.71746826171875, "learning_rate": 5e-06, "logits/chosen": -77624949.33333333, "logits/rejected": -32048949.333333332, "logps/chosen": -498.4450276692708, "logps/rejected": -601.1195068359375, "loss": 0.1168, "rewards/chosen": 4.614773432413737, "rewards/margins": 7.70643170674642, "rewards/rejected": -3.091658274332682, "step": 75 }, { "epoch": 0.019016639559614662, "grad_norm": 23.375, "kl": 9.963424682617188, "learning_rate": 5e-06, "logits/chosen": -28286677.333333332, "logits/rejected": -48645717.333333336, "logps/chosen": -356.578125, "logps/rejected": -511.0622151692708, "loss": 0.1213, "rewards/chosen": 4.0463972091674805, "rewards/margins": 8.232327461242676, "rewards/rejected": -4.185930252075195, "step": 76 }, { "epoch": 0.01926685850118854, "grad_norm": 23.125, "kl": 6.5861496925354, "learning_rate": 5e-06, "logits/chosen": -65989980.0, "logits/rejected": -39129820.0, "logps/chosen": -412.86334228515625, "logps/rejected": -353.802001953125, "loss": 0.1798, "rewards/chosen": 3.7884206771850586, "rewards/margins": 7.3194260597229, "rewards/rejected": -3.531005382537842, "step": 77 }, { "epoch": 0.019517077442762418, "grad_norm": 25.125, "kl": 2.9897258281707764, "learning_rate": 5e-06, "logits/chosen": -48174040.0, "logits/rejected": -41698836.0, "logps/chosen": -446.5084533691406, "logps/rejected": -447.56201171875, "loss": 0.1329, "rewards/chosen": 3.867509365081787, "rewards/margins": 7.806653261184692, "rewards/rejected": -3.9391438961029053, "step": 78 }, { "epoch": 0.019767296384336296, "grad_norm": 20.375, "kl": 6.163200855255127, "learning_rate": 5e-06, "logits/chosen": -50068060.44444445, "logits/rejected": -58402713.6, "logps/chosen": -468.9856228298611, "logps/rejected": -660.4854166666667, "loss": 0.1099, "rewards/chosen": 3.648191663953993, "rewards/margins": 8.640795220269098, "rewards/rejected": -4.992603556315104, "step": 79 }, { "epoch": 0.02001751532591017, "grad_norm": 14.625, "kl": 1.4809026718139648, "learning_rate": 5e-06, "logits/chosen": -54611097.6, "logits/rejected": -22446710.85714286, "logps/chosen": -528.61337890625, "logps/rejected": -472.2774135044643, "loss": 0.0852, "rewards/chosen": 5.32177963256836, "rewards/margins": 9.697051021030973, "rewards/rejected": -4.375271388462612, "step": 80 }, { "epoch": 0.020267734267484048, "grad_norm": 31.0, "kl": 7.566961288452148, "learning_rate": 5e-06, "logits/chosen": -29653430.85714286, "logits/rejected": -33127379.2, "logps/chosen": -489.04771205357144, "logps/rejected": -366.3197509765625, "loss": 0.1507, "rewards/chosen": 3.8084632328578403, "rewards/margins": 8.1127623966762, "rewards/rejected": -4.304299163818359, "step": 81 }, { "epoch": 0.020517953209057926, "grad_norm": 21.75, "kl": 4.328423976898193, "learning_rate": 5e-06, "logits/chosen": -63439066.666666664, "logits/rejected": -61543376.0, "logps/chosen": -424.947265625, "logps/rejected": -386.8924560546875, "loss": 0.1485, "rewards/chosen": 3.516907056172689, "rewards/margins": 6.636496225992839, "rewards/rejected": -3.11958916982015, "step": 82 }, { "epoch": 0.020768172150631804, "grad_norm": 21.0, "kl": 5.679565906524658, "learning_rate": 5e-06, "logits/chosen": -43479936.0, "logits/rejected": -64066611.2, "logps/chosen": -403.6904296875, "logps/rejected": -428.721630859375, "loss": 0.132, "rewards/chosen": 3.5876573835100447, "rewards/margins": 7.082850864955358, "rewards/rejected": -3.4951934814453125, "step": 83 }, { "epoch": 0.02101839109220568, "grad_norm": 40.25, "kl": 0.9630905985832214, "learning_rate": 5e-06, "logits/chosen": -47370304.0, "logits/rejected": -22394812.0, "logps/chosen": -311.91552734375, "logps/rejected": -397.6983337402344, "loss": 0.1971, "rewards/chosen": 2.817434787750244, "rewards/margins": 5.503911256790161, "rewards/rejected": -2.686476469039917, "step": 84 }, { "epoch": 0.021268610033779556, "grad_norm": 27.375, "kl": 2.1226773262023926, "learning_rate": 5e-06, "logits/chosen": -63888085.333333336, "logits/rejected": -20810644.0, "logps/chosen": -368.785400390625, "logps/rejected": -496.9632975260417, "loss": 0.1262, "rewards/chosen": 3.5727866490681968, "rewards/margins": 8.387812932332357, "rewards/rejected": -4.81502628326416, "step": 85 }, { "epoch": 0.021518828975353434, "grad_norm": 25.0, "kl": 4.572732448577881, "learning_rate": 5e-06, "logits/chosen": -15606040.615384616, "logits/rejected": -42046190.54545455, "logps/chosen": -381.45601712740387, "logps/rejected": -488.03151633522725, "loss": 0.1288, "rewards/chosen": 3.206278287447416, "rewards/margins": 6.891124298522522, "rewards/rejected": -3.6848460110751065, "step": 86 }, { "epoch": 0.02176904791692731, "grad_norm": 20.625, "kl": 4.554980278015137, "learning_rate": 5e-06, "logits/chosen": -39270776.615384616, "logits/rejected": -32091194.181818184, "logps/chosen": -354.12777944711536, "logps/rejected": -391.29092684659093, "loss": 0.1579, "rewards/chosen": 3.6647494389460635, "rewards/margins": 8.14450275981343, "rewards/rejected": -4.479753320867365, "step": 87 }, { "epoch": 0.02201926685850119, "grad_norm": 21.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43021610.666666664, "logits/rejected": -37736665.6, "logps/chosen": -382.052490234375, "logps/rejected": -483.31591796875, "loss": 0.0882, "rewards/chosen": 3.0966402689615884, "rewards/margins": 7.5184684753417965, "rewards/rejected": -4.421828206380209, "step": 88 }, { "epoch": 0.022269485800075067, "grad_norm": 23.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25497083.2, "logits/rejected": -52602468.571428575, "logps/chosen": -394.87197265625, "logps/rejected": -571.9961286272321, "loss": 0.1, "rewards/chosen": 4.237030792236328, "rewards/margins": 9.36163624354771, "rewards/rejected": -5.124605451311384, "step": 89 }, { "epoch": 0.02251970474164894, "grad_norm": 22.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67350298.66666667, "logits/rejected": -45903715.55555555, "logps/chosen": -454.5413818359375, "logps/rejected": -465.64171006944446, "loss": 0.1194, "rewards/chosen": 3.133005142211914, "rewards/margins": 7.996957355075413, "rewards/rejected": -4.863952212863499, "step": 90 }, { "epoch": 0.02276992368322282, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54561536.0, "logits/rejected": -43982166.85714286, "logps/chosen": -463.916015625, "logps/rejected": -568.4403599330357, "loss": 0.065, "rewards/chosen": 3.3763851165771483, "rewards/margins": 9.043676485334124, "rewards/rejected": -5.667291368756976, "step": 91 }, { "epoch": 0.023020142624796697, "grad_norm": 29.75, "kl": 3.878533363342285, "learning_rate": 5e-06, "logits/chosen": -78604160.0, "logits/rejected": -49953117.86666667, "logps/chosen": -414.2492947048611, "logps/rejected": -565.402734375, "loss": 0.1407, "rewards/chosen": 3.157175064086914, "rewards/margins": 7.866324742635091, "rewards/rejected": -4.709149678548177, "step": 92 }, { "epoch": 0.023270361566370575, "grad_norm": 22.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -115785036.8, "logits/rejected": -50407865.2631579, "logps/chosen": -614.140771484375, "logps/rejected": -535.7399773848684, "loss": 0.1498, "rewards/chosen": 5.492605209350586, "rewards/margins": 10.425421564202558, "rewards/rejected": -4.9328163548519735, "step": 93 }, { "epoch": 0.023520580507944453, "grad_norm": 18.0, "kl": 2.0050878524780273, "learning_rate": 5e-06, "logits/chosen": -63961902.54545455, "logits/rejected": -33704019.692307696, "logps/chosen": -452.275390625, "logps/rejected": -492.4416691706731, "loss": 0.1261, "rewards/chosen": 3.503257404674183, "rewards/margins": 9.732444416392934, "rewards/rejected": -6.22918701171875, "step": 94 }, { "epoch": 0.023770799449518328, "grad_norm": 30.125, "kl": 0.7020899653434753, "learning_rate": 5e-06, "logits/chosen": -72260728.8888889, "logits/rejected": -33154007.466666665, "logps/chosen": -533.3898654513889, "logps/rejected": -419.801171875, "loss": 0.1758, "rewards/chosen": 3.7349586486816406, "rewards/margins": 7.236947886149089, "rewards/rejected": -3.501989237467448, "step": 95 }, { "epoch": 0.024021018391092205, "grad_norm": 33.25, "kl": 0.9786033630371094, "learning_rate": 5e-06, "logits/chosen": -65529914.18181818, "logits/rejected": -19485442.46153846, "logps/chosen": -422.83536044034093, "logps/rejected": -591.4449368990385, "loss": 0.1803, "rewards/chosen": 1.8292180841619319, "rewards/margins": 8.98130712975989, "rewards/rejected": -7.152089045597957, "step": 96 }, { "epoch": 0.024271237332666083, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64227723.63636363, "logits/rejected": -39207089.23076923, "logps/chosen": -403.1671697443182, "logps/rejected": -562.7894381009615, "loss": 0.095, "rewards/chosen": 2.459473870017312, "rewards/margins": 10.224654604504993, "rewards/rejected": -7.76518073448768, "step": 97 }, { "epoch": 0.02452145627423996, "grad_norm": 29.375, "kl": 1.1252658367156982, "learning_rate": 5e-06, "logits/chosen": -44272933.333333336, "logits/rejected": -40071653.333333336, "logps/chosen": -422.0517985026042, "logps/rejected": -501.0906982421875, "loss": 0.133, "rewards/chosen": 2.880974769592285, "rewards/margins": 8.332698504130047, "rewards/rejected": -5.451723734537761, "step": 98 }, { "epoch": 0.024771675215813835, "grad_norm": 29.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71466904.61538461, "logits/rejected": -54177931.63636363, "logps/chosen": -490.6721379206731, "logps/rejected": -404.0007990056818, "loss": 0.1428, "rewards/chosen": 3.21109859759991, "rewards/margins": 8.094061604746571, "rewards/rejected": -4.882963007146662, "step": 99 }, { "epoch": 0.025021894157387713, "grad_norm": 28.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61428824.0, "logits/rejected": -60440752.0, "logps/chosen": -516.803955078125, "logps/rejected": -664.438720703125, "loss": 0.1351, "rewards/chosen": 2.4329707622528076, "rewards/margins": 10.272232294082642, "rewards/rejected": -7.839261531829834, "step": 100 }, { "epoch": 0.02527211309896159, "grad_norm": 15.5, "kl": 2.8989791870117188, "learning_rate": 5e-06, "logits/chosen": -56807532.307692304, "logits/rejected": -33420832.0, "logps/chosen": -545.9090670072115, "logps/rejected": -378.61714311079544, "loss": 0.109, "rewards/chosen": 3.6259260911207933, "rewards/margins": 7.566817356989934, "rewards/rejected": -3.9408912658691406, "step": 101 }, { "epoch": 0.02552233204053547, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52177293.71428572, "logits/rejected": -83865396.70588236, "logps/chosen": -437.81612723214283, "logps/rejected": -750.4253216911765, "loss": 0.0519, "rewards/chosen": 4.108769825526646, "rewards/margins": 12.032810675997695, "rewards/rejected": -7.924040850471048, "step": 102 }, { "epoch": 0.025772550982109347, "grad_norm": 24.5, "kl": 2.3272581100463867, "learning_rate": 5e-06, "logits/chosen": -27030333.09090909, "logits/rejected": -46858318.76923077, "logps/chosen": -335.79365678267044, "logps/rejected": -514.9300255408654, "loss": 0.1732, "rewards/chosen": 2.424674294211648, "rewards/margins": 7.565785227955638, "rewards/rejected": -5.14111093374399, "step": 103 }, { "epoch": 0.02602276992368322, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50505687.27272727, "logits/rejected": -51117435.07692308, "logps/chosen": -499.69588955965907, "logps/rejected": -542.9323918269231, "loss": 0.0748, "rewards/chosen": 4.681109341708097, "rewards/margins": 11.172604914311762, "rewards/rejected": -6.491495572603666, "step": 104 }, { "epoch": 0.0262729888652571, "grad_norm": 21.375, "kl": 5.133856296539307, "learning_rate": 5e-06, "logits/chosen": 14268272.0, "logits/rejected": -63890316.8, "logps/chosen": -495.60836356026783, "logps/rejected": -505.9212890625, "loss": 0.1042, "rewards/chosen": 3.7813717978341237, "rewards/margins": 9.263994325910296, "rewards/rejected": -5.482622528076172, "step": 105 }, { "epoch": 0.026523207806830977, "grad_norm": 20.0, "kl": 3.6473140716552734, "learning_rate": 5e-06, "logits/chosen": -54861888.0, "logits/rejected": -48034137.6, "logps/chosen": -366.1411830357143, "logps/rejected": -525.676171875, "loss": 0.1435, "rewards/chosen": 3.1206065586635043, "rewards/margins": 6.874971335274832, "rewards/rejected": -3.754364776611328, "step": 106 }, { "epoch": 0.026773426748404855, "grad_norm": 28.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72654528.0, "logits/rejected": -55476534.85714286, "logps/chosen": -372.630517578125, "logps/rejected": -451.62901088169644, "loss": 0.1135, "rewards/chosen": 1.8628992080688476, "rewards/margins": 6.851275280543735, "rewards/rejected": -4.988376072474888, "step": 107 }, { "epoch": 0.027023645689978733, "grad_norm": 30.25, "kl": 7.377291679382324, "learning_rate": 5e-06, "logits/chosen": -60453529.6, "logits/rejected": -3780721.3333333335, "logps/chosen": -439.65693359375, "logps/rejected": -445.7779134114583, "loss": 0.1769, "rewards/chosen": 3.740906270345052, "rewards/margins": 7.646404774983724, "rewards/rejected": -3.905498504638672, "step": 108 }, { "epoch": 0.027273864631552607, "grad_norm": 23.125, "kl": 7.282306671142578, "learning_rate": 5e-06, "logits/chosen": -33562976.0, "logits/rejected": -81358826.66666667, "logps/chosen": -396.25547960069446, "logps/rejected": -531.6099039713541, "loss": 0.1514, "rewards/chosen": 3.3797940148247614, "rewards/margins": 8.526789559258354, "rewards/rejected": -5.146995544433594, "step": 109 }, { "epoch": 0.027524083573126485, "grad_norm": 18.5, "kl": 0.6033732295036316, "learning_rate": 5e-06, "logits/chosen": -11015164.666666666, "logits/rejected": -43291568.0, "logps/chosen": -376.6101481119792, "logps/rejected": -580.6302897135416, "loss": 0.116, "rewards/chosen": 4.113841374715169, "rewards/margins": 10.078737258911133, "rewards/rejected": -5.964895884195964, "step": 110 }, { "epoch": 0.027774302514700363, "grad_norm": 19.375, "kl": 8.607128143310547, "learning_rate": 5e-06, "logits/chosen": -80056891.07692307, "logits/rejected": -8103525.818181818, "logps/chosen": -391.80014272836536, "logps/rejected": -500.23606178977275, "loss": 0.1769, "rewards/chosen": 3.5198141244741588, "rewards/margins": 6.782989395248307, "rewards/rejected": -3.263175270774148, "step": 111 }, { "epoch": 0.02802452145627424, "grad_norm": 23.25, "kl": 6.622926712036133, "learning_rate": 5e-06, "logits/chosen": -68026336.0, "logits/rejected": -53684298.666666664, "logps/chosen": -458.5303141276042, "logps/rejected": -464.957763671875, "loss": 0.1257, "rewards/chosen": 3.841912269592285, "rewards/margins": 7.030071258544922, "rewards/rejected": -3.1881589889526367, "step": 112 }, { "epoch": 0.02827474039784812, "grad_norm": 13.5625, "kl": 2.5748486518859863, "learning_rate": 5e-06, "logits/chosen": -50560486.4, "logits/rejected": -22140749.714285713, "logps/chosen": -516.96865234375, "logps/rejected": -348.55751255580356, "loss": 0.0899, "rewards/chosen": 5.615151977539062, "rewards/margins": 9.130515943254743, "rewards/rejected": -3.515363965715681, "step": 113 }, { "epoch": 0.028524959339421993, "grad_norm": 22.25, "kl": 7.2938947677612305, "learning_rate": 5e-06, "logits/chosen": -36971982.76923077, "logits/rejected": -34339246.54545455, "logps/chosen": -370.43306790865387, "logps/rejected": -484.18075284090907, "loss": 0.1932, "rewards/chosen": 3.307170867919922, "rewards/margins": 7.3687522194602275, "rewards/rejected": -4.061581351540306, "step": 114 }, { "epoch": 0.02877517828099587, "grad_norm": 21.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70304418.9090909, "logits/rejected": -68940150.15384616, "logps/chosen": -339.2457386363636, "logps/rejected": -573.2331730769231, "loss": 0.1163, "rewards/chosen": 3.3149663751775567, "rewards/margins": 8.222295054188976, "rewards/rejected": -4.907328679011418, "step": 115 }, { "epoch": 0.02902539722256975, "grad_norm": 16.5, "kl": 5.182840347290039, "learning_rate": 5e-06, "logits/chosen": -32940020.363636363, "logits/rejected": -31526279.384615384, "logps/chosen": -338.5106756036932, "logps/rejected": -444.4393780048077, "loss": 0.1589, "rewards/chosen": 3.36935112693093, "rewards/margins": 7.280587523133605, "rewards/rejected": -3.9112363962026744, "step": 116 }, { "epoch": 0.029275616164143627, "grad_norm": 84.5, "kl": 2.374370574951172, "learning_rate": 5e-06, "logits/chosen": -41273824.0, "logits/rejected": -9672752.0, "logps/chosen": -451.16943359375, "logps/rejected": -619.67578125, "loss": 0.114, "rewards/chosen": 3.3866065979003905, "rewards/margins": 8.341240583147322, "rewards/rejected": -4.9546339852469305, "step": 117 }, { "epoch": 0.029525835105717504, "grad_norm": 14.6875, "kl": 2.604111433029175, "learning_rate": 5e-06, "logits/chosen": -53653376.0, "logits/rejected": -59455158.15384615, "logps/chosen": -418.7755681818182, "logps/rejected": -505.4073016826923, "loss": 0.082, "rewards/chosen": 4.632992137562145, "rewards/margins": 9.54545422534009, "rewards/rejected": -4.912462087777945, "step": 118 }, { "epoch": 0.02977605404729138, "grad_norm": 13.5, "kl": 0.20227432250976562, "learning_rate": 5e-06, "logits/chosen": -32224620.0, "logits/rejected": -41155708.0, "logps/chosen": -416.0848083496094, "logps/rejected": -377.306396484375, "loss": 0.0932, "rewards/chosen": 3.985408306121826, "rewards/margins": 8.498955726623535, "rewards/rejected": -4.513547420501709, "step": 119 }, { "epoch": 0.030026272988865257, "grad_norm": 11.0625, "kl": 0.35961565375328064, "learning_rate": 5e-06, "logits/chosen": -52238199.46666667, "logits/rejected": -55250368.0, "logps/chosen": -430.0063151041667, "logps/rejected": -558.1921657986111, "loss": 0.0457, "rewards/chosen": 4.434163411458333, "rewards/margins": 10.700937059190537, "rewards/rejected": -6.2667736477322045, "step": 120 }, { "epoch": 0.030276491930439135, "grad_norm": 22.875, "kl": 10.833108901977539, "learning_rate": 5e-06, "logits/chosen": -55598536.0, "logits/rejected": -36607028.0, "logps/chosen": -454.464111328125, "logps/rejected": -471.4989318847656, "loss": 0.0801, "rewards/chosen": 5.157410621643066, "rewards/margins": 10.901457786560059, "rewards/rejected": -5.744047164916992, "step": 121 }, { "epoch": 0.030526710872013012, "grad_norm": 17.0, "kl": 7.9545464515686035, "learning_rate": 5e-06, "logits/chosen": -49838222.222222224, "logits/rejected": -50103086.93333333, "logps/chosen": -415.40961371527777, "logps/rejected": -517.4078776041666, "loss": 0.0859, "rewards/chosen": 5.3493804931640625, "rewards/margins": 9.173910522460938, "rewards/rejected": -3.824530029296875, "step": 122 }, { "epoch": 0.030776929813586887, "grad_norm": 20.625, "kl": 1.4038188457489014, "learning_rate": 5e-06, "logits/chosen": -40942124.307692304, "logits/rejected": -46390813.09090909, "logps/chosen": -450.25863882211536, "logps/rejected": -705.0929509943181, "loss": 0.0696, "rewards/chosen": 4.544291276198167, "rewards/margins": 12.617362842693197, "rewards/rejected": -8.07307156649503, "step": 123 }, { "epoch": 0.031027148755160765, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39217398.15384615, "logits/rejected": -54239610.18181818, "logps/chosen": -331.6940354567308, "logps/rejected": -562.7880415482955, "loss": 0.1387, "rewards/chosen": 3.295840336726262, "rewards/margins": 9.500584155529529, "rewards/rejected": -6.204743818803267, "step": 124 }, { "epoch": 0.03127736769673464, "grad_norm": 20.125, "kl": 8.88840103149414, "learning_rate": 5e-06, "logits/chosen": -75248842.66666667, "logits/rejected": -27488213.333333332, "logps/chosen": -512.0596516927084, "logps/rejected": -431.65234375, "loss": 0.1034, "rewards/chosen": 6.4001725514729815, "rewards/margins": 10.636566162109375, "rewards/rejected": -4.2363936106363935, "step": 125 }, { "epoch": 0.03152758663830852, "grad_norm": 10.4375, "kl": 0.7614803314208984, "learning_rate": 5e-06, "logits/chosen": -52280902.4, "logits/rejected": -44913298.28571428, "logps/chosen": -436.962109375, "logps/rejected": -555.4670061383929, "loss": 0.062, "rewards/chosen": 5.041194152832031, "rewards/margins": 10.586859348842076, "rewards/rejected": -5.545665196010044, "step": 126 }, { "epoch": 0.0317778055798824, "grad_norm": 22.25, "kl": 10.628379821777344, "learning_rate": 5e-06, "logits/chosen": -64422680.615384616, "logits/rejected": -74492311.27272727, "logps/chosen": -516.8713942307693, "logps/rejected": -438.9396306818182, "loss": 0.1222, "rewards/chosen": 5.40548588679387, "rewards/margins": 9.113758994149162, "rewards/rejected": -3.708273107355291, "step": 127 }, { "epoch": 0.032028024521456276, "grad_norm": 20.875, "kl": 16.492826461791992, "learning_rate": 5e-06, "logits/chosen": -96028784.0, "logits/rejected": -46677208.0, "logps/chosen": -388.1448059082031, "logps/rejected": -429.96466064453125, "loss": 0.1979, "rewards/chosen": 4.420779228210449, "rewards/margins": 9.198101997375488, "rewards/rejected": -4.777322769165039, "step": 128 }, { "epoch": 0.032278243463030154, "grad_norm": 20.125, "kl": 2.5171051025390625, "learning_rate": 5e-06, "logits/chosen": -78527726.54545455, "logits/rejected": -58059150.76923077, "logps/chosen": -589.0059925426136, "logps/rejected": -637.1475360576923, "loss": 0.0816, "rewards/chosen": 6.104273015802557, "rewards/margins": 11.774602183095226, "rewards/rejected": -5.670329167292668, "step": 129 }, { "epoch": 0.03252846240460403, "grad_norm": 24.75, "kl": 6.977313041687012, "learning_rate": 5e-06, "logits/chosen": -45330215.384615384, "logits/rejected": -52318446.54545455, "logps/chosen": -332.63683143028845, "logps/rejected": -449.8136541193182, "loss": 0.1588, "rewards/chosen": 3.3289369436410756, "rewards/margins": 8.708844031487311, "rewards/rejected": -5.379907087846235, "step": 130 }, { "epoch": 0.0327786813461779, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23178550.4, "logits/rejected": -45325394.28571428, "logps/chosen": -428.61552734375, "logps/rejected": -615.2171456473214, "loss": 0.0438, "rewards/chosen": 4.332117080688477, "rewards/margins": 10.99391692025321, "rewards/rejected": -6.661799839564732, "step": 131 }, { "epoch": 0.03302890028775178, "grad_norm": 19.25, "kl": 8.29294204711914, "learning_rate": 5e-06, "logits/chosen": -46093674.666666664, "logits/rejected": -22097666.666666668, "logps/chosen": -460.8732096354167, "logps/rejected": -280.8202311197917, "loss": 0.1448, "rewards/chosen": 4.674725850423177, "rewards/margins": 6.884338537851969, "rewards/rejected": -2.2096126874287925, "step": 132 }, { "epoch": 0.03327911922932566, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63178402.90909091, "logits/rejected": -24557321.846153848, "logps/chosen": -361.7850452769886, "logps/rejected": -507.1363055889423, "loss": 0.0875, "rewards/chosen": 3.1080398559570312, "rewards/margins": 9.310384310208834, "rewards/rejected": -6.2023444542518025, "step": 133 }, { "epoch": 0.033529338170899536, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69576857.6, "logits/rejected": -41444192.0, "logps/chosen": -365.8089599609375, "logps/rejected": -486.6082240513393, "loss": 0.0939, "rewards/chosen": 3.5887569427490233, "rewards/margins": 8.99369032723563, "rewards/rejected": -5.404933384486607, "step": 134 }, { "epoch": 0.033779557112473414, "grad_norm": 24.125, "kl": 6.132481575012207, "learning_rate": 5e-06, "logits/chosen": -34672068.571428575, "logits/rejected": -64537472.0, "logps/chosen": -431.24354771205356, "logps/rejected": -673.63056640625, "loss": 0.1251, "rewards/chosen": 3.8119286128452847, "rewards/margins": 11.959659630911691, "rewards/rejected": -8.147731018066406, "step": 135 }, { "epoch": 0.03402977605404729, "grad_norm": 23.125, "kl": 15.4842529296875, "learning_rate": 5e-06, "logits/chosen": -100979224.0, "logits/rejected": -17410722.0, "logps/chosen": -524.205322265625, "logps/rejected": -326.41387939453125, "loss": 0.1419, "rewards/chosen": 5.6238112449646, "rewards/margins": 10.311150550842285, "rewards/rejected": -4.6873393058776855, "step": 136 }, { "epoch": 0.03427999499562117, "grad_norm": 28.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30986018.46153846, "logits/rejected": -26399616.0, "logps/chosen": -356.25304236778845, "logps/rejected": -426.40380859375, "loss": 0.1682, "rewards/chosen": 2.9559669494628906, "rewards/margins": 7.164258089932528, "rewards/rejected": -4.208291140469638, "step": 137 }, { "epoch": 0.03453021393719505, "grad_norm": 23.75, "kl": 10.649872779846191, "learning_rate": 5e-06, "logits/chosen": -74135760.0, "logits/rejected": -38775632.0, "logps/chosen": -489.68231201171875, "logps/rejected": -550.9609375, "loss": 0.0558, "rewards/chosen": 4.875946521759033, "rewards/margins": 9.918260097503662, "rewards/rejected": -5.042313575744629, "step": 138 }, { "epoch": 0.034780432878768926, "grad_norm": 24.875, "kl": 4.290389060974121, "learning_rate": 5e-06, "logits/chosen": -43735931.428571425, "logits/rejected": -33822678.4, "logps/chosen": -338.83150809151783, "logps/rejected": -483.46162109375, "loss": 0.1309, "rewards/chosen": 3.1358509063720703, "rewards/margins": 9.684260940551757, "rewards/rejected": -6.548410034179687, "step": 139 }, { "epoch": 0.0350306518203428, "grad_norm": 18.75, "kl": 5.464059829711914, "learning_rate": 5e-06, "logits/chosen": -59807507.692307696, "logits/rejected": -62122222.54545455, "logps/chosen": -364.8303786057692, "logps/rejected": -498.06516335227275, "loss": 0.1168, "rewards/chosen": 3.21218255849985, "rewards/margins": 8.490869775518672, "rewards/rejected": -5.278687217018821, "step": 140 }, { "epoch": 0.035280870761916674, "grad_norm": 25.75, "kl": 5.874354362487793, "learning_rate": 5e-06, "logits/chosen": -80815156.70588236, "logits/rejected": -99578194.28571428, "logps/chosen": -403.36790556066177, "logps/rejected": -484.09176199776783, "loss": 0.1625, "rewards/chosen": 4.2788184670841, "rewards/margins": 10.05712951531931, "rewards/rejected": -5.778311048235212, "step": 141 }, { "epoch": 0.03553108970349055, "grad_norm": 23.0, "kl": 1.0059306621551514, "learning_rate": 5e-06, "logits/chosen": -50342840.0, "logits/rejected": -55544748.0, "logps/chosen": -416.4798583984375, "logps/rejected": -632.0045166015625, "loss": 0.1058, "rewards/chosen": 3.6286752223968506, "rewards/margins": 8.574209451675415, "rewards/rejected": -4.9455342292785645, "step": 142 }, { "epoch": 0.03578130864506443, "grad_norm": 21.75, "kl": 8.205643653869629, "learning_rate": 5e-06, "logits/chosen": -59896755.2, "logits/rejected": -28854793.14285714, "logps/chosen": -527.664453125, "logps/rejected": -552.7942592075893, "loss": 0.0741, "rewards/chosen": 5.277662658691407, "rewards/margins": 10.381992994035993, "rewards/rejected": -5.104330335344587, "step": 143 }, { "epoch": 0.03603152758663831, "grad_norm": 21.25, "kl": 0.39021429419517517, "learning_rate": 5e-06, "logits/chosen": -54364202.666666664, "logits/rejected": -43710101.333333336, "logps/chosen": -355.3279215494792, "logps/rejected": -591.236328125, "loss": 0.1382, "rewards/chosen": 3.4100462595621743, "rewards/margins": 9.961926142374674, "rewards/rejected": -6.5518798828125, "step": 144 }, { "epoch": 0.036281746528212186, "grad_norm": 19.625, "kl": 3.4919161796569824, "learning_rate": 5e-06, "logits/chosen": -44140093.09090909, "logits/rejected": -60244603.07692308, "logps/chosen": -431.66645951704544, "logps/rejected": -479.35659555288464, "loss": 0.0717, "rewards/chosen": 6.132303411310369, "rewards/margins": 10.587619514732094, "rewards/rejected": -4.455316103421724, "step": 145 }, { "epoch": 0.036531965469786064, "grad_norm": 28.125, "kl": 9.083163261413574, "learning_rate": 5e-06, "logits/chosen": -27821803.42857143, "logits/rejected": -15193225.6, "logps/chosen": -330.38779994419644, "logps/rejected": -567.6158203125, "loss": 0.2525, "rewards/chosen": 4.288521902901786, "rewards/margins": 6.976574461800711, "rewards/rejected": -2.6880525588989257, "step": 146 }, { "epoch": 0.03678218441135994, "grad_norm": 16.625, "kl": 1.34625244140625, "learning_rate": 5e-06, "logits/chosen": -55421824.0, "logits/rejected": -7098027.636363637, "logps/chosen": -323.37161959134613, "logps/rejected": -678.7225230823864, "loss": 0.0908, "rewards/chosen": 3.7793003962590146, "rewards/margins": 8.31683389623682, "rewards/rejected": -4.537533499977806, "step": 147 }, { "epoch": 0.03703240335293382, "grad_norm": 20.625, "kl": 6.829098701477051, "learning_rate": 5e-06, "logits/chosen": -38491163.07692308, "logits/rejected": -31447584.0, "logps/chosen": -433.2761793870192, "logps/rejected": -421.71799538352275, "loss": 0.1703, "rewards/chosen": 4.942574134239783, "rewards/margins": 8.83587601134827, "rewards/rejected": -3.8933018771084873, "step": 148 }, { "epoch": 0.0372826222945077, "grad_norm": 20.0, "kl": 3.9278724193573, "learning_rate": 5e-06, "logits/chosen": -52982498.461538464, "logits/rejected": -51161460.36363637, "logps/chosen": -428.42202524038464, "logps/rejected": -591.8447265625, "loss": 0.0763, "rewards/chosen": 4.8494253892164965, "rewards/margins": 10.34067967554906, "rewards/rejected": -5.4912542863325635, "step": 149 }, { "epoch": 0.03753284123608157, "grad_norm": 19.375, "kl": 6.239824295043945, "learning_rate": 5e-06, "logits/chosen": -48351232.0, "logits/rejected": -20709194.0, "logps/chosen": -283.2247314453125, "logps/rejected": -323.7087097167969, "loss": 0.1278, "rewards/chosen": 4.198636054992676, "rewards/margins": 7.8641743659973145, "rewards/rejected": -3.6655383110046387, "step": 150 }, { "epoch": 0.037783060177655446, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54331776.0, "logits/rejected": 35268820.571428575, "logps/chosen": -556.68505859375, "logps/rejected": -581.5312848772321, "loss": 0.0675, "rewards/chosen": 6.455515289306641, "rewards/margins": 11.3932131086077, "rewards/rejected": -4.93769781930106, "step": 151 }, { "epoch": 0.038033279119229324, "grad_norm": 14.375, "kl": 4.6379618644714355, "learning_rate": 5e-06, "logits/chosen": -44820281.6, "logits/rejected": -57210107.428571425, "logps/chosen": -297.274853515625, "logps/rejected": -502.84061104910717, "loss": 0.075, "rewards/chosen": 3.327361297607422, "rewards/margins": 8.49720960344587, "rewards/rejected": -5.169848305838449, "step": 152 }, { "epoch": 0.0382834980608032, "grad_norm": 22.625, "kl": 1.0584895610809326, "learning_rate": 5e-06, "logits/chosen": -81029024.0, "logits/rejected": -52466168.0, "logps/chosen": -656.5376586914062, "logps/rejected": -481.95355224609375, "loss": 0.0681, "rewards/chosen": 6.785458087921143, "rewards/margins": 12.110761165618896, "rewards/rejected": -5.325303077697754, "step": 153 }, { "epoch": 0.03853371700237708, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49451008.0, "logits/rejected": -97331114.66666667, "logps/chosen": -333.88014729817706, "logps/rejected": -486.0155029296875, "loss": 0.1643, "rewards/chosen": 3.8546034495035806, "rewards/margins": 8.570431391398111, "rewards/rejected": -4.715827941894531, "step": 154 }, { "epoch": 0.03878393594395096, "grad_norm": 19.0, "kl": 8.156567573547363, "learning_rate": 5e-06, "logits/chosen": -40026600.0, "logits/rejected": -12795388.0, "logps/chosen": -357.08233642578125, "logps/rejected": -397.34918212890625, "loss": 0.1987, "rewards/chosen": 3.712439775466919, "rewards/margins": 7.8179771900177, "rewards/rejected": -4.105537414550781, "step": 155 }, { "epoch": 0.039034154885524835, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18985547.42857143, "logits/rejected": -4583904.0, "logps/chosen": -363.89854213169644, "logps/rejected": -481.7215360753676, "loss": 0.126, "rewards/chosen": 4.135683332170759, "rewards/margins": 8.639876806435463, "rewards/rejected": -4.504193474264706, "step": 156 }, { "epoch": 0.03928437382709871, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40897821.86666667, "logits/rejected": -43241447.11111111, "logps/chosen": -368.06103515625, "logps/rejected": -526.2424045138889, "loss": 0.1085, "rewards/chosen": 4.275669860839844, "rewards/margins": 10.76724616156684, "rewards/rejected": -6.491576300726996, "step": 157 }, { "epoch": 0.03953459276867259, "grad_norm": 16.5, "kl": 0.36798352003097534, "learning_rate": 5e-06, "logits/chosen": -57061719.27272727, "logits/rejected": -39024620.307692304, "logps/chosen": -391.3818359375, "logps/rejected": -388.5891676682692, "loss": 0.0616, "rewards/chosen": 4.3716558976606885, "rewards/margins": 9.17924667571808, "rewards/rejected": -4.807590778057392, "step": 158 }, { "epoch": 0.03978481171024647, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28181875.2, "logits/rejected": -31613298.285714287, "logps/chosen": -428.174462890625, "logps/rejected": -416.0806361607143, "loss": 0.0532, "rewards/chosen": 4.767366790771485, "rewards/margins": 10.949200330461775, "rewards/rejected": -6.18183353969029, "step": 159 }, { "epoch": 0.04003503065182034, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -97138384.0, "logits/rejected": -46946064.0, "logps/chosen": -571.2589111328125, "logps/rejected": -585.5206298828125, "loss": 0.0944, "rewards/chosen": 6.3586201667785645, "rewards/margins": 11.417516708374023, "rewards/rejected": -5.058896541595459, "step": 160 }, { "epoch": 0.04028524959339422, "grad_norm": 23.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45920308.36363637, "logits/rejected": -27952059.076923076, "logps/chosen": -352.62990500710225, "logps/rejected": -485.89554537259613, "loss": 0.1199, "rewards/chosen": 4.034776167436079, "rewards/margins": 10.131546820793952, "rewards/rejected": -6.096770653357873, "step": 161 }, { "epoch": 0.040535468534968096, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37612216.0, "logits/rejected": -30060588.0, "logps/chosen": -300.6482238769531, "logps/rejected": -473.1891174316406, "loss": 0.1184, "rewards/chosen": 3.2378549575805664, "rewards/margins": 8.858850479125977, "rewards/rejected": -5.62099552154541, "step": 162 }, { "epoch": 0.040785687476541974, "grad_norm": 18.5, "kl": 4.293770790100098, "learning_rate": 5e-06, "logits/chosen": -45873765.333333336, "logits/rejected": -55967301.333333336, "logps/chosen": -384.5272623697917, "logps/rejected": -594.3080647786459, "loss": 0.1583, "rewards/chosen": 2.8155123392740884, "rewards/margins": 8.81442387898763, "rewards/rejected": -5.998911539713542, "step": 163 }, { "epoch": 0.04103590641811585, "grad_norm": 21.5, "kl": 7.401231288909912, "learning_rate": 5e-06, "logits/chosen": -52780744.0, "logits/rejected": -56251888.0, "logps/chosen": -586.0512084960938, "logps/rejected": -395.9722595214844, "loss": 0.0694, "rewards/chosen": 5.403119087219238, "rewards/margins": 11.10162353515625, "rewards/rejected": -5.698504447937012, "step": 164 }, { "epoch": 0.04128612535968973, "grad_norm": 18.75, "kl": 4.575628280639648, "learning_rate": 5e-06, "logits/chosen": -46542098.28571428, "logits/rejected": -65141612.8, "logps/chosen": -417.9397670200893, "logps/rejected": -495.366015625, "loss": 0.076, "rewards/chosen": 3.4580459594726562, "rewards/margins": 9.116856384277344, "rewards/rejected": -5.658810424804687, "step": 165 }, { "epoch": 0.04153634430126361, "grad_norm": 26.625, "kl": 2.5202600955963135, "learning_rate": 5e-06, "logits/chosen": -50662144.0, "logits/rejected": -33581750.4, "logps/chosen": -390.7208775111607, "logps/rejected": -318.3043701171875, "loss": 0.1234, "rewards/chosen": 4.517326354980469, "rewards/margins": 7.158951377868652, "rewards/rejected": -2.6416250228881837, "step": 166 }, { "epoch": 0.041786563242837485, "grad_norm": 24.125, "kl": 3.483273983001709, "learning_rate": 5e-06, "logits/chosen": -59899664.0, "logits/rejected": -17329224.0, "logps/chosen": -469.6291097005208, "logps/rejected": -388.012939453125, "loss": 0.0926, "rewards/chosen": 5.202668190002441, "rewards/margins": 10.415954271952312, "rewards/rejected": -5.21328608194987, "step": 167 }, { "epoch": 0.04203678218441136, "grad_norm": 13.875, "kl": 0.2973499298095703, "learning_rate": 5e-06, "logits/chosen": -71792883.2, "logits/rejected": -51374930.28571428, "logps/chosen": -482.91533203125, "logps/rejected": -594.4489397321429, "loss": 0.0549, "rewards/chosen": 4.24849853515625, "rewards/margins": 11.637181854248047, "rewards/rejected": -7.388683319091797, "step": 168 }, { "epoch": 0.042287001125985234, "grad_norm": 14.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19547772.0, "logits/rejected": -43603864.0, "logps/chosen": -274.3015543619792, "logps/rejected": -582.5530598958334, "loss": 0.0915, "rewards/chosen": 2.4779138565063477, "rewards/margins": 9.535144488016766, "rewards/rejected": -7.057230631510417, "step": 169 }, { "epoch": 0.04253722006755911, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22624244.57142857, "logits/rejected": -46519928.47058824, "logps/chosen": -293.7974330357143, "logps/rejected": -620.6377527573529, "loss": 0.0856, "rewards/chosen": 3.239545004708426, "rewards/margins": 11.214790760969915, "rewards/rejected": -7.975245756261489, "step": 170 }, { "epoch": 0.04278743900913299, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47868133.81818182, "logits/rejected": 57006483.692307696, "logps/chosen": -385.4287109375, "logps/rejected": -448.4245793269231, "loss": 0.1033, "rewards/chosen": 3.897959275679155, "rewards/margins": 10.785419837578193, "rewards/rejected": -6.887460561899038, "step": 171 }, { "epoch": 0.04303765795070687, "grad_norm": 14.6875, "kl": 0.7472852468490601, "learning_rate": 5e-06, "logits/chosen": -40471397.333333336, "logits/rejected": -77715264.0, "logps/chosen": -422.4617513020833, "logps/rejected": -555.0777994791666, "loss": 0.0723, "rewards/chosen": 4.81877326965332, "rewards/margins": 11.356734593709309, "rewards/rejected": -6.537961324055989, "step": 172 }, { "epoch": 0.043287876892280745, "grad_norm": 15.6875, "kl": 4.088824272155762, "learning_rate": 5e-06, "logits/chosen": -52978038.15384615, "logits/rejected": 90874722.9090909, "logps/chosen": -365.61767578125, "logps/rejected": -551.6926491477273, "loss": 0.0937, "rewards/chosen": 4.703894981971154, "rewards/margins": 12.280280480018028, "rewards/rejected": -7.576385498046875, "step": 173 }, { "epoch": 0.04353809583385462, "grad_norm": 12.875, "kl": 1.2338712215423584, "learning_rate": 5e-06, "logits/chosen": -32180448.0, "logits/rejected": -40426395.428571425, "logps/chosen": -308.8005126953125, "logps/rejected": -429.81005859375, "loss": 0.0771, "rewards/chosen": 3.661548614501953, "rewards/margins": 9.841023581368582, "rewards/rejected": -6.179474966866629, "step": 174 }, { "epoch": 0.0437883147754285, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57538949.81818182, "logits/rejected": -34008918.15384615, "logps/chosen": -347.4867498224432, "logps/rejected": -562.6441556490385, "loss": 0.0613, "rewards/chosen": 4.080998854203657, "rewards/margins": 11.374724674891759, "rewards/rejected": -7.293725820688101, "step": 175 }, { "epoch": 0.04403853371700238, "grad_norm": 18.375, "kl": 5.854638576507568, "learning_rate": 5e-06, "logits/chosen": -47022602.666666664, "logits/rejected": -48280176.0, "logps/chosen": -415.4212239583333, "logps/rejected": -594.5871175130209, "loss": 0.0913, "rewards/chosen": 5.363189697265625, "rewards/margins": 13.202273050944012, "rewards/rejected": -7.839083353678386, "step": 176 }, { "epoch": 0.04428875265857626, "grad_norm": 13.25, "kl": 1.109082579612732, "learning_rate": 5e-06, "logits/chosen": -46307733.333333336, "logits/rejected": -40083797.333333336, "logps/chosen": -273.48581949869794, "logps/rejected": -353.5043131510417, "loss": 0.1185, "rewards/chosen": 3.213988939921061, "rewards/margins": 7.801450411478678, "rewards/rejected": -4.587461471557617, "step": 177 }, { "epoch": 0.044538971600150135, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26819675.42857143, "logits/rejected": -42344256.0, "logps/chosen": -211.00730678013392, "logps/rejected": -606.9449103860294, "loss": 0.109, "rewards/chosen": 2.6112401144845143, "rewards/margins": 8.805287032568154, "rewards/rejected": -6.1940469180836395, "step": 178 }, { "epoch": 0.044789190541724005, "grad_norm": 20.25, "kl": 1.9853570461273193, "learning_rate": 5e-06, "logits/chosen": -49304214.4, "logits/rejected": -75421686.85714285, "logps/chosen": -552.79306640625, "logps/rejected": -567.9908621651786, "loss": 0.0845, "rewards/chosen": 4.3198188781738285, "rewards/margins": 10.149826158796039, "rewards/rejected": -5.83000728062221, "step": 179 }, { "epoch": 0.04503940948329788, "grad_norm": 15.375, "kl": 1.1453670263290405, "learning_rate": 5e-06, "logits/chosen": -50428322.461538464, "logits/rejected": -59052404.36363637, "logps/chosen": -387.38439002403845, "logps/rejected": -631.0325816761364, "loss": 0.0927, "rewards/chosen": 4.393623938927283, "rewards/margins": 12.610069168197526, "rewards/rejected": -8.216445229270242, "step": 180 }, { "epoch": 0.04528962842487176, "grad_norm": 31.0, "kl": 9.287070274353027, "learning_rate": 5e-06, "logits/chosen": -72337800.53333333, "logits/rejected": -54460508.44444445, "logps/chosen": -430.4421875, "logps/rejected": -463.251953125, "loss": 0.1243, "rewards/chosen": 4.164467112223307, "rewards/margins": 8.997049374050565, "rewards/rejected": -4.832582261827257, "step": 181 }, { "epoch": 0.04553984736644564, "grad_norm": 26.5, "kl": 7.495224952697754, "learning_rate": 5e-06, "logits/chosen": -68060749.71428572, "logits/rejected": -37737881.6, "logps/chosen": -436.6040736607143, "logps/rejected": -428.13115234375, "loss": 0.108, "rewards/chosen": 4.3831939697265625, "rewards/margins": 10.0071533203125, "rewards/rejected": -5.623959350585937, "step": 182 }, { "epoch": 0.04579006630801952, "grad_norm": 16.375, "kl": 8.493086814880371, "learning_rate": 5e-06, "logits/chosen": -47477799.384615384, "logits/rejected": -58968791.27272727, "logps/chosen": -429.14663461538464, "logps/rejected": -332.81716086647725, "loss": 0.1017, "rewards/chosen": 5.500474783090445, "rewards/margins": 10.639228500686325, "rewards/rejected": -5.138753717595881, "step": 183 }, { "epoch": 0.046040285249593395, "grad_norm": 12.25, "kl": 4.708995342254639, "learning_rate": 5e-06, "logits/chosen": -44649705.14285714, "logits/rejected": -67075948.8, "logps/chosen": -523.5865304129464, "logps/rejected": -622.985791015625, "loss": 0.0371, "rewards/chosen": 5.529398236955915, "rewards/margins": 13.241002546037947, "rewards/rejected": -7.711604309082031, "step": 184 }, { "epoch": 0.04629050419116727, "grad_norm": 17.0, "kl": 2.1218771934509277, "learning_rate": 5e-06, "logits/chosen": -50696034.461538464, "logits/rejected": -49664046.54545455, "logps/chosen": -322.49057241586536, "logps/rejected": -529.3594193892045, "loss": 0.1227, "rewards/chosen": 3.6770201462965746, "rewards/margins": 9.060277925504671, "rewards/rejected": -5.383257779208097, "step": 185 }, { "epoch": 0.04654072313274115, "grad_norm": 11.6875, "kl": 4.476684093475342, "learning_rate": 5e-06, "logits/chosen": -53033125.333333336, "logits/rejected": -43685322.666666664, "logps/chosen": -444.3708089192708, "logps/rejected": -532.5503743489584, "loss": 0.0768, "rewards/chosen": 4.921902974446614, "rewards/margins": 10.608036041259766, "rewards/rejected": -5.686133066813151, "step": 186 }, { "epoch": 0.04679094207431503, "grad_norm": 19.125, "kl": 2.0418787002563477, "learning_rate": 5e-06, "logits/chosen": -50350249.6, "logits/rejected": -37511499.428571425, "logps/chosen": -381.4614990234375, "logps/rejected": -589.4267578125, "loss": 0.1072, "rewards/chosen": 4.3655342102050785, "rewards/margins": 10.5390745980399, "rewards/rejected": -6.173540387834821, "step": 187 }, { "epoch": 0.047041161015888906, "grad_norm": 10.4375, "kl": 8.362052917480469, "learning_rate": 5e-06, "logits/chosen": -56223128.0, "logits/rejected": -49902480.0, "logps/chosen": -370.11468505859375, "logps/rejected": -875.49365234375, "loss": 0.1507, "rewards/chosen": 4.879919052124023, "rewards/margins": 12.756749629974365, "rewards/rejected": -7.876830577850342, "step": 188 }, { "epoch": 0.04729137995746278, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34136556.307692304, "logits/rejected": -44233218.90909091, "logps/chosen": -375.6189528245192, "logps/rejected": -477.4869939630682, "loss": 0.0834, "rewards/chosen": 4.037762275108924, "rewards/margins": 8.266784908054593, "rewards/rejected": -4.229022632945668, "step": 189 }, { "epoch": 0.047541598899036655, "grad_norm": 16.375, "kl": 1.9649031162261963, "learning_rate": 5e-06, "logits/chosen": -31002606.769230768, "logits/rejected": -41109626.18181818, "logps/chosen": -327.640625, "logps/rejected": -631.2844460227273, "loss": 0.1154, "rewards/chosen": 3.483311286339393, "rewards/margins": 11.118699747365671, "rewards/rejected": -7.635388461026278, "step": 190 }, { "epoch": 0.04779181784061053, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61394141.09090909, "logits/rejected": -37747611.07692308, "logps/chosen": -453.96484375, "logps/rejected": -758.7204777644231, "loss": 0.0453, "rewards/chosen": 5.382943933660334, "rewards/margins": 12.17320659610775, "rewards/rejected": -6.790262662447416, "step": 191 }, { "epoch": 0.04804203678218441, "grad_norm": 9.8125, "kl": 8.697744369506836, "learning_rate": 5e-06, "logits/chosen": -29078030.769230768, "logits/rejected": -53254888.72727273, "logps/chosen": -411.91165865384613, "logps/rejected": -551.6779119318181, "loss": 0.0816, "rewards/chosen": 5.3462360088641825, "rewards/margins": 11.406404882044225, "rewards/rejected": -6.060168873180043, "step": 192 }, { "epoch": 0.04829225572375829, "grad_norm": 14.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72034519.27272727, "logits/rejected": -31024893.53846154, "logps/chosen": -347.4210094105114, "logps/rejected": -458.4655198317308, "loss": 0.0981, "rewards/chosen": 3.841581171209162, "rewards/margins": 10.548764422223286, "rewards/rejected": -6.707183251014123, "step": 193 }, { "epoch": 0.048542474665332166, "grad_norm": 17.375, "kl": 3.349236249923706, "learning_rate": 5e-06, "logits/chosen": -59783954.28571428, "logits/rejected": 59085209.6, "logps/chosen": -360.4351283482143, "logps/rejected": -575.68037109375, "loss": 0.0747, "rewards/chosen": 4.300768171037946, "rewards/margins": 9.780179868425641, "rewards/rejected": -5.479411697387695, "step": 194 }, { "epoch": 0.048792693606906044, "grad_norm": 17.125, "kl": 6.444109916687012, "learning_rate": 5e-06, "logits/chosen": -32805256.533333335, "logits/rejected": -63079182.222222224, "logps/chosen": -363.3408528645833, "logps/rejected": -466.26019965277777, "loss": 0.1606, "rewards/chosen": 3.773480987548828, "rewards/margins": 9.127889251708984, "rewards/rejected": -5.354408264160156, "step": 195 }, { "epoch": 0.04904291254847992, "grad_norm": 14.625, "kl": 12.910234451293945, "learning_rate": 5e-06, "logits/chosen": -11549870.933333334, "logits/rejected": -54725589.333333336, "logps/chosen": -429.72649739583335, "logps/rejected": -525.4338650173611, "loss": 0.1118, "rewards/chosen": 4.961282857259115, "rewards/margins": 9.972749413384332, "rewards/rejected": -5.011466556125217, "step": 196 }, { "epoch": 0.0492931314900538, "grad_norm": 27.375, "kl": 4.634339809417725, "learning_rate": 5e-06, "logits/chosen": -25323805.866666667, "logits/rejected": -45791729.777777776, "logps/chosen": -450.1834309895833, "logps/rejected": -538.20947265625, "loss": 0.0775, "rewards/chosen": 5.164319356282552, "rewards/margins": 11.035110473632812, "rewards/rejected": -5.870791117350261, "step": 197 }, { "epoch": 0.04954335043162767, "grad_norm": 15.8125, "kl": 16.27667999267578, "learning_rate": 5e-06, "logits/chosen": -59654941.09090909, "logits/rejected": -59627126.15384615, "logps/chosen": -378.24147727272725, "logps/rejected": -449.65981820913464, "loss": 0.1025, "rewards/chosen": 5.658745158802379, "rewards/margins": 9.16282013579682, "rewards/rejected": -3.504074976994441, "step": 198 }, { "epoch": 0.04979356937320155, "grad_norm": 11.9375, "kl": 3.600116014480591, "learning_rate": 5e-06, "logits/chosen": -33377289.6, "logits/rejected": -31516605.714285713, "logps/chosen": -322.9662353515625, "logps/rejected": -415.9325474330357, "loss": 0.0971, "rewards/chosen": 4.778628540039063, "rewards/margins": 9.22843393598284, "rewards/rejected": -4.449805395943778, "step": 199 }, { "epoch": 0.05004378831477543, "grad_norm": 13.125, "kl": 4.291494369506836, "learning_rate": 5e-06, "logits/chosen": -63218432.0, "logits/rejected": -69397736.0, "logps/chosen": -384.4539489746094, "logps/rejected": -540.909423828125, "loss": 0.0536, "rewards/chosen": 4.746562957763672, "rewards/margins": 10.028171062469482, "rewards/rejected": -5.2816081047058105, "step": 200 }, { "epoch": 0.050294007256349305, "grad_norm": 20.125, "kl": 4.140326499938965, "learning_rate": 5e-06, "logits/chosen": -26370669.714285713, "logits/rejected": -55841625.6, "logps/chosen": -383.97140066964283, "logps/rejected": -646.44501953125, "loss": 0.1091, "rewards/chosen": 4.063012531825474, "rewards/margins": 10.766155079432895, "rewards/rejected": -6.703142547607422, "step": 201 }, { "epoch": 0.05054422619792318, "grad_norm": 19.625, "kl": 3.0051372051239014, "learning_rate": 5e-06, "logits/chosen": -124939721.14285715, "logits/rejected": -46234996.705882356, "logps/chosen": -421.8641880580357, "logps/rejected": -554.2309857536765, "loss": 0.0387, "rewards/chosen": 5.835538591657366, "rewards/margins": 11.872191100561318, "rewards/rejected": -6.036652508903952, "step": 202 }, { "epoch": 0.05079444513949706, "grad_norm": 7.28125, "kl": 7.439305782318115, "learning_rate": 5e-06, "logits/chosen": -66490786.461538464, "logits/rejected": -65816610.90909091, "logps/chosen": -453.8348858173077, "logps/rejected": -551.6194513494319, "loss": 0.0628, "rewards/chosen": 6.551427401029146, "rewards/margins": 10.499076229709011, "rewards/rejected": -3.947648828679865, "step": 203 }, { "epoch": 0.05104466408107094, "grad_norm": 20.25, "kl": 5.730766773223877, "learning_rate": 5e-06, "logits/chosen": -72196405.33333333, "logits/rejected": -51363594.666666664, "logps/chosen": -440.599365234375, "logps/rejected": -537.4098307291666, "loss": 0.095, "rewards/chosen": 4.975851694742839, "rewards/margins": 10.486460367838543, "rewards/rejected": -5.510608673095703, "step": 204 }, { "epoch": 0.051294883022644816, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70233012.36363636, "logits/rejected": -45291643.07692308, "logps/chosen": -519.3923117897727, "logps/rejected": -454.42078575721155, "loss": 0.0654, "rewards/chosen": 5.860253767533735, "rewards/margins": 11.40235217968067, "rewards/rejected": -5.542098412146935, "step": 205 }, { "epoch": 0.051545101964218694, "grad_norm": 21.375, "kl": 18.074310302734375, "learning_rate": 5e-06, "logits/chosen": -71379493.64705883, "logits/rejected": -60079908.571428575, "logps/chosen": -548.6579733455883, "logps/rejected": -558.7813197544643, "loss": 0.1183, "rewards/chosen": 6.103434394387638, "rewards/margins": 11.75511711184718, "rewards/rejected": -5.6516827174595425, "step": 206 }, { "epoch": 0.05179532090579257, "grad_norm": 25.5, "kl": 18.960453033447266, "learning_rate": 5e-06, "logits/chosen": -70417160.0, "logits/rejected": -59043764.0, "logps/chosen": -387.33612060546875, "logps/rejected": -598.7929077148438, "loss": 0.1292, "rewards/chosen": 4.7016496658325195, "rewards/margins": 11.90509033203125, "rewards/rejected": -7.2034406661987305, "step": 207 }, { "epoch": 0.05204553984736644, "grad_norm": 10.125, "kl": 0.08770434558391571, "learning_rate": 5e-06, "logits/chosen": -53707336.0, "logits/rejected": -58144924.0, "logps/chosen": -316.13916015625, "logps/rejected": -602.5560302734375, "loss": 0.0665, "rewards/chosen": 4.102164268493652, "rewards/margins": 11.597392082214355, "rewards/rejected": -7.495227813720703, "step": 208 }, { "epoch": 0.05229575878894032, "grad_norm": 17.75, "kl": 5.174756050109863, "learning_rate": 5e-06, "logits/chosen": -74477824.0, "logits/rejected": -45505221.81818182, "logps/chosen": -460.4802809495192, "logps/rejected": -369.1199396306818, "loss": 0.1004, "rewards/chosen": 5.138316814716045, "rewards/margins": 9.534788705252268, "rewards/rejected": -4.396471890536222, "step": 209 }, { "epoch": 0.0525459777305142, "grad_norm": 13.8125, "kl": 0.1292479932308197, "learning_rate": 5e-06, "logits/chosen": -69940625.45454545, "logits/rejected": -42426825.84615385, "logps/chosen": -371.20749733664775, "logps/rejected": -638.8646334134615, "loss": 0.0675, "rewards/chosen": 4.9522316672585225, "rewards/margins": 13.068748954292776, "rewards/rejected": -8.116517287034254, "step": 210 }, { "epoch": 0.052796196672088076, "grad_norm": 21.25, "kl": 6.5166192054748535, "learning_rate": 5e-06, "logits/chosen": -70400464.0, "logits/rejected": -35769348.0, "logps/chosen": -424.12115478515625, "logps/rejected": -300.89501953125, "loss": 0.0704, "rewards/chosen": 4.73396110534668, "rewards/margins": 9.076435565948486, "rewards/rejected": -4.342474460601807, "step": 211 }, { "epoch": 0.053046415613661954, "grad_norm": 13.0, "kl": 4.748558044433594, "learning_rate": 5e-06, "logits/chosen": -59821738.666666664, "logits/rejected": -64486704.0, "logps/chosen": -477.7695719401042, "logps/rejected": -744.5475260416666, "loss": 0.0284, "rewards/chosen": 6.058443705240886, "rewards/margins": 14.159394582112629, "rewards/rejected": -8.100950876871744, "step": 212 }, { "epoch": 0.05329663455523583, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52458066.28571428, "logits/rejected": -69845586.8235294, "logps/chosen": -502.26499720982144, "logps/rejected": -596.6792279411765, "loss": 0.0869, "rewards/chosen": 5.950331551688058, "rewards/margins": 10.853909468450468, "rewards/rejected": -4.9035779167624085, "step": 213 }, { "epoch": 0.05354685349680971, "grad_norm": 20.875, "kl": 9.591470718383789, "learning_rate": 5e-06, "logits/chosen": -61181298.28571428, "logits/rejected": -76267212.8, "logps/chosen": -359.708740234375, "logps/rejected": -512.524658203125, "loss": 0.1162, "rewards/chosen": 4.980873107910156, "rewards/margins": 11.222312927246094, "rewards/rejected": -6.2414398193359375, "step": 214 }, { "epoch": 0.05379707243838359, "grad_norm": 18.75, "kl": 2.8346781730651855, "learning_rate": 5e-06, "logits/chosen": -58141400.615384616, "logits/rejected": -33345233.454545453, "logps/chosen": -434.98084435096155, "logps/rejected": -382.52738813920456, "loss": 0.0696, "rewards/chosen": 4.866139045128455, "rewards/margins": 8.685153507686161, "rewards/rejected": -3.819014462557706, "step": 215 }, { "epoch": 0.054047291379957466, "grad_norm": 15.8125, "kl": 10.940942764282227, "learning_rate": 5e-06, "logits/chosen": -50631176.0, "logits/rejected": -55207144.0, "logps/chosen": -439.8216247558594, "logps/rejected": -594.8463134765625, "loss": 0.0943, "rewards/chosen": 5.669807434082031, "rewards/margins": 13.193309783935547, "rewards/rejected": -7.523502349853516, "step": 216 }, { "epoch": 0.054297510321531336, "grad_norm": 33.75, "kl": 3.2548866271972656, "learning_rate": 5e-06, "logits/chosen": -53489436.44444445, "logits/rejected": 4758829.333333333, "logps/chosen": -395.04747178819446, "logps/rejected": -472.0696614583333, "loss": 0.0891, "rewards/chosen": 4.646226671006945, "rewards/margins": 8.308933427598742, "rewards/rejected": -3.662706756591797, "step": 217 }, { "epoch": 0.054547729263105214, "grad_norm": 13.6875, "kl": 4.568079948425293, "learning_rate": 5e-06, "logits/chosen": -60876618.666666664, "logits/rejected": -21642705.333333332, "logps/chosen": -503.5177408854167, "logps/rejected": -483.5083414713542, "loss": 0.0767, "rewards/chosen": 5.721874872843425, "rewards/margins": 11.690620422363281, "rewards/rejected": -5.9687455495198565, "step": 218 }, { "epoch": 0.05479794820467909, "grad_norm": 8.9375, "kl": 5.388765811920166, "learning_rate": 5e-06, "logits/chosen": -76596437.33333333, "logits/rejected": -47909795.55555555, "logps/chosen": -510.7514973958333, "logps/rejected": -329.8972981770833, "loss": 0.0261, "rewards/chosen": 5.45499267578125, "rewards/margins": 9.806046125623915, "rewards/rejected": -4.351053449842665, "step": 219 }, { "epoch": 0.05504816714625297, "grad_norm": 23.25, "kl": 8.13897705078125, "learning_rate": 5e-06, "logits/chosen": -103544469.33333333, "logits/rejected": -30345754.666666668, "logps/chosen": -445.0954182942708, "logps/rejected": -572.4197591145834, "loss": 0.169, "rewards/chosen": 5.362514495849609, "rewards/margins": 12.308923085530598, "rewards/rejected": -6.946408589680989, "step": 220 }, { "epoch": 0.05529838608782685, "grad_norm": 16.25, "kl": 8.565845489501953, "learning_rate": 5e-06, "logits/chosen": -35269291.428571425, "logits/rejected": -24489796.8, "logps/chosen": -421.0266810825893, "logps/rejected": -347.6060302734375, "loss": 0.1262, "rewards/chosen": 5.468855721609933, "rewards/margins": 9.556808907645088, "rewards/rejected": -4.087953186035156, "step": 221 }, { "epoch": 0.055548605029400726, "grad_norm": 16.0, "kl": 3.1567230224609375, "learning_rate": 5e-06, "logits/chosen": -38834736.0, "logits/rejected": -53589360.0, "logps/chosen": -404.2430826822917, "logps/rejected": -549.4991861979166, "loss": 0.0491, "rewards/chosen": 4.901371320088704, "rewards/margins": 11.388110796610514, "rewards/rejected": -6.48673947652181, "step": 222 }, { "epoch": 0.055798823970974604, "grad_norm": 14.125, "kl": 2.0810012817382812, "learning_rate": 5e-06, "logits/chosen": -69334592.0, "logits/rejected": -35340559.058823526, "logps/chosen": -588.7001255580357, "logps/rejected": -495.96559053308823, "loss": 0.0385, "rewards/chosen": 6.903956821986607, "rewards/margins": 12.074652856137572, "rewards/rejected": -5.170696034150965, "step": 223 }, { "epoch": 0.05604904291254848, "grad_norm": 22.0, "kl": 1.7932794094085693, "learning_rate": 5e-06, "logits/chosen": -33799616.0, "logits/rejected": -1172655.3333333333, "logps/chosen": -399.4767252604167, "logps/rejected": -460.508544921875, "loss": 0.104, "rewards/chosen": 4.64273738861084, "rewards/margins": 9.231263796488445, "rewards/rejected": -4.5885264078776045, "step": 224 }, { "epoch": 0.05629926185412236, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51637317.81818182, "logits/rejected": -58171603.692307696, "logps/chosen": -483.3936878551136, "logps/rejected": -606.2865459735577, "loss": 0.0551, "rewards/chosen": 5.742768721147017, "rewards/margins": 13.433718381228147, "rewards/rejected": -7.69094966008113, "step": 225 }, { "epoch": 0.05654948079569624, "grad_norm": 9.875, "kl": 3.7677154541015625, "learning_rate": 5e-06, "logits/chosen": -53113203.2, "logits/rejected": -49358720.0, "logps/chosen": -384.63059895833334, "logps/rejected": -585.7591145833334, "loss": 0.0748, "rewards/chosen": 5.463581339518229, "rewards/margins": 13.2242184109158, "rewards/rejected": -7.76063707139757, "step": 226 }, { "epoch": 0.05679969973727011, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76321258.66666667, "logits/rejected": -60759840.0, "logps/chosen": -530.7881673177084, "logps/rejected": -690.87646484375, "loss": 0.0407, "rewards/chosen": 5.5117142995198565, "rewards/margins": 13.567319234212238, "rewards/rejected": -8.055604934692383, "step": 227 }, { "epoch": 0.057049918678843986, "grad_norm": 9.625, "kl": 2.9436187744140625, "learning_rate": 5e-06, "logits/chosen": -57832976.0, "logits/rejected": -43035576.0, "logps/chosen": -475.21234130859375, "logps/rejected": -455.1776123046875, "loss": 0.0263, "rewards/chosen": 5.860957145690918, "rewards/margins": 10.955077171325684, "rewards/rejected": -5.094120025634766, "step": 228 }, { "epoch": 0.057300137620417864, "grad_norm": 18.875, "kl": 3.507528066635132, "learning_rate": 5e-06, "logits/chosen": -50419136.0, "logits/rejected": -58293361.23076923, "logps/chosen": -354.875, "logps/rejected": -387.4791917067308, "loss": 0.1223, "rewards/chosen": 4.062959844415838, "rewards/margins": 8.998005526882785, "rewards/rejected": -4.9350456824669475, "step": 229 }, { "epoch": 0.05755035656199174, "grad_norm": 16.125, "kl": 0.5412664413452148, "learning_rate": 5e-06, "logits/chosen": -38644117.333333336, "logits/rejected": -56184810.666666664, "logps/chosen": -420.4574381510417, "logps/rejected": -551.7562255859375, "loss": 0.0768, "rewards/chosen": 3.9948673248291016, "rewards/margins": 10.442398707071941, "rewards/rejected": -6.447531382242839, "step": 230 }, { "epoch": 0.05780057550356562, "grad_norm": 16.375, "kl": 0.26791128516197205, "learning_rate": 5e-06, "logits/chosen": -41563904.0, "logits/rejected": -73339136.0, "logps/chosen": -442.39808872767856, "logps/rejected": -640.181005859375, "loss": 0.0442, "rewards/chosen": 5.2872499738420755, "rewards/margins": 11.534498814174107, "rewards/rejected": -6.247248840332031, "step": 231 }, { "epoch": 0.0580507944451395, "grad_norm": 21.5, "kl": 7.603259086608887, "learning_rate": 5e-06, "logits/chosen": -85483227.42857143, "logits/rejected": -63368768.0, "logps/chosen": -552.6742466517857, "logps/rejected": -519.0142578125, "loss": 0.1034, "rewards/chosen": 5.500741141183036, "rewards/margins": 11.000902502877372, "rewards/rejected": -5.500161361694336, "step": 232 }, { "epoch": 0.058301013386713375, "grad_norm": 15.125, "kl": 6.472053527832031, "learning_rate": 5e-06, "logits/chosen": -38552170.666666664, "logits/rejected": -17897214.666666668, "logps/chosen": -468.6722819010417, "logps/rejected": -590.6138916015625, "loss": 0.0491, "rewards/chosen": 4.910961151123047, "rewards/margins": 12.375680923461914, "rewards/rejected": -7.464719772338867, "step": 233 }, { "epoch": 0.05855123232828725, "grad_norm": 15.5625, "kl": 0.8780374526977539, "learning_rate": 5e-06, "logits/chosen": -96147411.2, "logits/rejected": -43601572.571428575, "logps/chosen": -609.7056640625, "logps/rejected": -413.4390345982143, "loss": 0.0475, "rewards/chosen": 5.946613311767578, "rewards/margins": 12.216070992606028, "rewards/rejected": -6.269457680838449, "step": 234 }, { "epoch": 0.05880145126986113, "grad_norm": 16.125, "kl": 5.695685386657715, "learning_rate": 5e-06, "logits/chosen": -93081225.84615384, "logits/rejected": -49701550.54545455, "logps/chosen": -468.4971454326923, "logps/rejected": -421.6673029119318, "loss": 0.0623, "rewards/chosen": 4.9567741980919475, "rewards/margins": 11.436547979608282, "rewards/rejected": -6.479773781516335, "step": 235 }, { "epoch": 0.05905167021143501, "grad_norm": 14.6875, "kl": 1.8992418050765991, "learning_rate": 5e-06, "logits/chosen": -54616021.333333336, "logits/rejected": -32344803.555555556, "logps/chosen": -413.20299479166664, "logps/rejected": -326.95220269097223, "loss": 0.1424, "rewards/chosen": 3.883965555826823, "rewards/margins": 8.674331834581164, "rewards/rejected": -4.79036627875434, "step": 236 }, { "epoch": 0.05930188915300888, "grad_norm": 13.0625, "kl": 4.48274040222168, "learning_rate": 5e-06, "logits/chosen": -63192994.461538464, "logits/rejected": -39326164.36363637, "logps/chosen": -534.2183368389423, "logps/rejected": -460.103515625, "loss": 0.1283, "rewards/chosen": 5.91471686730018, "rewards/margins": 11.04461389821726, "rewards/rejected": -5.129897030917081, "step": 237 }, { "epoch": 0.05955210809458276, "grad_norm": 17.375, "kl": 8.008248329162598, "learning_rate": 5e-06, "logits/chosen": -105651729.06666666, "logits/rejected": -33743971.55555555, "logps/chosen": -534.3837890625, "logps/rejected": -353.31640625, "loss": 0.0603, "rewards/chosen": 4.986274210611979, "rewards/margins": 11.95004645453559, "rewards/rejected": -6.963772243923611, "step": 238 }, { "epoch": 0.059802327036156636, "grad_norm": 18.0, "kl": 1.8964078426361084, "learning_rate": 5e-06, "logits/chosen": -52408721.45454545, "logits/rejected": -48703926.15384615, "logps/chosen": -353.20503373579544, "logps/rejected": -686.5046574519231, "loss": 0.1101, "rewards/chosen": 4.136805447665128, "rewards/margins": 12.106794343961703, "rewards/rejected": -7.969988896296575, "step": 239 }, { "epoch": 0.06005254597773051, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53857590.15384615, "logits/rejected": -43563741.09090909, "logps/chosen": -363.1044921875, "logps/rejected": -474.27756569602275, "loss": 0.0699, "rewards/chosen": 4.238471397986779, "rewards/margins": 10.517301492757731, "rewards/rejected": -6.278830094770952, "step": 240 }, { "epoch": 0.06030276491930439, "grad_norm": 13.0625, "kl": 0.6801236867904663, "learning_rate": 5e-06, "logits/chosen": -34695546.666666664, "logits/rejected": -48599274.666666664, "logps/chosen": -383.9803466796875, "logps/rejected": -568.0260416666666, "loss": 0.07, "rewards/chosen": 4.735932032267253, "rewards/margins": 11.453741073608398, "rewards/rejected": -6.7178090413411455, "step": 241 }, { "epoch": 0.06055298386087827, "grad_norm": 28.75, "kl": 7.704229354858398, "learning_rate": 5e-06, "logits/chosen": -40987808.0, "logits/rejected": -33765154.28571428, "logps/chosen": -432.7125, "logps/rejected": -600.5432477678571, "loss": 0.1087, "rewards/chosen": 4.74278564453125, "rewards/margins": 10.816473606654576, "rewards/rejected": -6.0736879621233255, "step": 242 }, { "epoch": 0.06080320280245215, "grad_norm": 23.125, "kl": 2.1749091148376465, "learning_rate": 5e-06, "logits/chosen": -91359545.6, "logits/rejected": -62814555.428571425, "logps/chosen": -501.44365234375, "logps/rejected": -450.42354910714283, "loss": 0.0623, "rewards/chosen": 5.585881042480469, "rewards/margins": 11.966207994733537, "rewards/rejected": -6.3803269522530695, "step": 243 }, { "epoch": 0.061053421744026025, "grad_norm": 20.25, "kl": 3.0634572505950928, "learning_rate": 5e-06, "logits/chosen": -32003923.692307692, "logits/rejected": -44226141.09090909, "logps/chosen": -304.0939190204327, "logps/rejected": -494.07901278409093, "loss": 0.1763, "rewards/chosen": 2.9934842036320615, "rewards/margins": 8.667183669297012, "rewards/rejected": -5.67369946566495, "step": 244 }, { "epoch": 0.0613036406855999, "grad_norm": 14.75, "kl": 1.1172847747802734, "learning_rate": 5e-06, "logits/chosen": -71501435.73333333, "logits/rejected": -49895534.222222224, "logps/chosen": -448.57200520833334, "logps/rejected": -532.8487413194445, "loss": 0.0571, "rewards/chosen": 4.671438598632813, "rewards/margins": 10.938484361436632, "rewards/rejected": -6.26704576280382, "step": 245 }, { "epoch": 0.061553859627173774, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42116583.384615384, "logits/rejected": -64825611.63636363, "logps/chosen": -414.7998046875, "logps/rejected": -464.74116654829544, "loss": 0.0804, "rewards/chosen": 5.251841031588041, "rewards/margins": 11.372012318431082, "rewards/rejected": -6.12017128684304, "step": 246 }, { "epoch": 0.06180407856874765, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27848082.0, "logits/rejected": -49274160.0, "logps/chosen": -323.40582275390625, "logps/rejected": -380.6914367675781, "loss": 0.088, "rewards/chosen": 3.8589885234832764, "rewards/margins": 8.341522932052612, "rewards/rejected": -4.482534408569336, "step": 247 }, { "epoch": 0.06205429751032153, "grad_norm": 17.25, "kl": 0.6061592102050781, "learning_rate": 5e-06, "logits/chosen": -72504634.66666667, "logits/rejected": -68203098.66666667, "logps/chosen": -358.8920084635417, "logps/rejected": -500.3172200520833, "loss": 0.0906, "rewards/chosen": 3.5224291483561196, "rewards/margins": 9.297661463419596, "rewards/rejected": -5.775232315063477, "step": 248 }, { "epoch": 0.06230451645189541, "grad_norm": 8.0625, "kl": 3.0028254985809326, "learning_rate": 5e-06, "logits/chosen": -44604178.666666664, "logits/rejected": -50529397.333333336, "logps/chosen": -586.1739095052084, "logps/rejected": -642.0966389973959, "loss": 0.0191, "rewards/chosen": 6.448746999104817, "rewards/margins": 13.13725471496582, "rewards/rejected": -6.688507715861003, "step": 249 }, { "epoch": 0.06255473539346929, "grad_norm": 16.25, "kl": 4.768838405609131, "learning_rate": 5e-06, "logits/chosen": -54296755.2, "logits/rejected": -50749120.0, "logps/chosen": -484.103125, "logps/rejected": -633.3464704241071, "loss": 0.0487, "rewards/chosen": 6.53180160522461, "rewards/margins": 14.83898173740932, "rewards/rejected": -8.30718013218471, "step": 250 }, { "epoch": 0.06280495433504316, "grad_norm": 16.5, "kl": 0.9286988973617554, "learning_rate": 5e-06, "logits/chosen": -46544112.0, "logits/rejected": -36229072.0, "logps/chosen": -341.8699544270833, "logps/rejected": -429.4635416666667, "loss": 0.0906, "rewards/chosen": 4.2282514572143555, "rewards/margins": 10.53867244720459, "rewards/rejected": -6.310420989990234, "step": 251 }, { "epoch": 0.06305517327661704, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62439566.222222224, "logits/rejected": -41197469.86666667, "logps/chosen": -521.8043619791666, "logps/rejected": -565.6182291666667, "loss": 0.0467, "rewards/chosen": 6.853285471598308, "rewards/margins": 15.23663813273112, "rewards/rejected": -8.383352661132813, "step": 252 }, { "epoch": 0.06330539221819091, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46289870.222222224, "logits/rejected": -48755741.86666667, "logps/chosen": -429.5540364583333, "logps/rejected": -520.4573567708334, "loss": 0.0305, "rewards/chosen": 5.6796459621853295, "rewards/margins": 11.141227383083766, "rewards/rejected": -5.461581420898438, "step": 253 }, { "epoch": 0.0635556111597648, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63038912.0, "logits/rejected": -41706232.88888889, "logps/chosen": -318.92742919921875, "logps/rejected": -603.8517795138889, "loss": 0.0363, "rewards/chosen": 3.1966425577799478, "rewards/margins": 10.984870062934027, "rewards/rejected": -7.7882275051540795, "step": 254 }, { "epoch": 0.06380583010133867, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47767109.333333336, "logits/rejected": -42213802.666666664, "logps/chosen": -420.8616536458333, "logps/rejected": -464.4077962239583, "loss": 0.0585, "rewards/chosen": 5.024503707885742, "rewards/margins": 10.489451726277668, "rewards/rejected": -5.464948018391927, "step": 255 }, { "epoch": 0.06405604904291255, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69395494.4, "logits/rejected": -39749270.85714286, "logps/chosen": -396.38232421875, "logps/rejected": -442.34340122767856, "loss": 0.0836, "rewards/chosen": 4.396539306640625, "rewards/margins": 9.73754381452288, "rewards/rejected": -5.341004507882254, "step": 256 }, { "epoch": 0.06430626798448642, "grad_norm": 16.5, "kl": 1.3632354736328125, "learning_rate": 5e-06, "logits/chosen": -46325056.0, "logits/rejected": -39119104.0, "logps/chosen": -459.4122869318182, "logps/rejected": -449.71987680288464, "loss": 0.0746, "rewards/chosen": 6.272554570978338, "rewards/margins": 12.569833928888494, "rewards/rejected": -6.297279357910156, "step": 257 }, { "epoch": 0.06455648692606031, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45575133.09090909, "logits/rejected": -57118119.384615384, "logps/chosen": -478.939453125, "logps/rejected": -490.60069861778845, "loss": 0.0495, "rewards/chosen": 4.3977227644486865, "rewards/margins": 12.05151786003913, "rewards/rejected": -7.653795095590445, "step": 258 }, { "epoch": 0.06480670586763418, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52410422.4, "logits/rejected": -30478674.285714287, "logps/chosen": -292.6397705078125, "logps/rejected": -452.7027064732143, "loss": 0.0918, "rewards/chosen": 3.266457366943359, "rewards/margins": 10.335402025495256, "rewards/rejected": -7.068944658551898, "step": 259 }, { "epoch": 0.06505692480920806, "grad_norm": 15.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53376630.15384615, "logits/rejected": -24394141.09090909, "logps/chosen": -463.49906099759613, "logps/rejected": -558.4932972301136, "loss": 0.067, "rewards/chosen": 5.315664438100962, "rewards/margins": 13.556491131549116, "rewards/rejected": -8.240826693448154, "step": 260 }, { "epoch": 0.06530714375078193, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65685779.2, "logits/rejected": -59654674.28571428, "logps/chosen": -493.353076171875, "logps/rejected": -534.4528459821429, "loss": 0.0196, "rewards/chosen": 4.4684593200683596, "rewards/margins": 12.256799752371652, "rewards/rejected": -7.7883404323032925, "step": 261 }, { "epoch": 0.0655573626923558, "grad_norm": 11.375, "kl": 0.8784777522087097, "learning_rate": 5e-06, "logits/chosen": -87586154.66666667, "logits/rejected": -65527802.666666664, "logps/chosen": -483.6160481770833, "logps/rejected": -566.429931640625, "loss": 0.059, "rewards/chosen": 5.209161122639974, "rewards/margins": 12.132217407226562, "rewards/rejected": -6.923056284586589, "step": 262 }, { "epoch": 0.06580758163392969, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70752604.44444445, "logits/rejected": 51250338.13333333, "logps/chosen": -425.29399956597223, "logps/rejected": -520.7492838541667, "loss": 0.0508, "rewards/chosen": 3.386476516723633, "rewards/margins": 11.8880552927653, "rewards/rejected": -8.501578776041667, "step": 263 }, { "epoch": 0.06605780057550356, "grad_norm": 25.75, "kl": 5.149889945983887, "learning_rate": 5e-06, "logits/chosen": -48233130.666666664, "logits/rejected": -43924462.222222224, "logps/chosen": -552.0205729166667, "logps/rejected": -537.4425998263889, "loss": 0.0788, "rewards/chosen": 5.195638020833333, "rewards/margins": 11.78333960639106, "rewards/rejected": -6.587701585557726, "step": 264 }, { "epoch": 0.06630801951707745, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61328758.85714286, "logits/rejected": -56918505.4117647, "logps/chosen": -479.367431640625, "logps/rejected": -608.0940372242648, "loss": 0.0618, "rewards/chosen": 3.7964319501604353, "rewards/margins": 12.093412864108046, "rewards/rejected": -8.29698091394761, "step": 265 }, { "epoch": 0.06655823845865132, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55310618.666666664, "logits/rejected": -21998644.0, "logps/chosen": -439.9580078125, "logps/rejected": -547.12060546875, "loss": 0.0748, "rewards/chosen": 5.038155873616536, "rewards/margins": 12.72253672281901, "rewards/rejected": -7.684380849202474, "step": 266 }, { "epoch": 0.0668084574002252, "grad_norm": 15.5, "kl": 5.1014509201049805, "learning_rate": 5e-06, "logits/chosen": -75437512.0, "logits/rejected": -80797840.0, "logps/chosen": -525.6380004882812, "logps/rejected": -414.5377197265625, "loss": 0.029, "rewards/chosen": 7.350945472717285, "rewards/margins": 13.114680767059326, "rewards/rejected": -5.763735294342041, "step": 267 }, { "epoch": 0.06705867634179907, "grad_norm": 17.25, "kl": 1.123401403427124, "learning_rate": 5e-06, "logits/chosen": -58302169.6, "logits/rejected": -46181952.0, "logps/chosen": -475.768017578125, "logps/rejected": -429.35302734375, "loss": 0.0977, "rewards/chosen": 4.8388420104980465, "rewards/margins": 11.183382197788784, "rewards/rejected": -6.344540187290737, "step": 268 }, { "epoch": 0.06730889528337296, "grad_norm": 20.75, "kl": 7.065022945404053, "learning_rate": 5e-06, "logits/chosen": -77816576.0, "logits/rejected": -52881866.666666664, "logps/chosen": -483.0377604166667, "logps/rejected": -646.4273681640625, "loss": 0.0953, "rewards/chosen": 4.503868103027344, "rewards/margins": 16.014129638671875, "rewards/rejected": -11.510261535644531, "step": 269 }, { "epoch": 0.06755911422494683, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41498552.88888889, "logits/rejected": -60701781.333333336, "logps/chosen": -437.0484212239583, "logps/rejected": -565.5378255208333, "loss": 0.0493, "rewards/chosen": 4.081297556559245, "rewards/margins": 13.958947499593098, "rewards/rejected": -9.877649943033854, "step": 270 }, { "epoch": 0.0678093331665207, "grad_norm": 15.5625, "kl": 0.29158782958984375, "learning_rate": 5e-06, "logits/chosen": -78919831.27272727, "logits/rejected": -25148347.076923076, "logps/chosen": -400.4002574573864, "logps/rejected": -301.63927283653845, "loss": 0.0679, "rewards/chosen": 5.59576381336559, "rewards/margins": 11.211588586127007, "rewards/rejected": -5.615824772761418, "step": 271 }, { "epoch": 0.06805955210809458, "grad_norm": 18.25, "kl": 7.191731929779053, "learning_rate": 5e-06, "logits/chosen": -52007271.384615384, "logits/rejected": -45008203.63636363, "logps/chosen": -379.2160456730769, "logps/rejected": -450.93212890625, "loss": 0.0883, "rewards/chosen": 4.103186387282151, "rewards/margins": 12.224304412628388, "rewards/rejected": -8.121118025346236, "step": 272 }, { "epoch": 0.06830977104966846, "grad_norm": 22.0, "kl": 5.124803066253662, "learning_rate": 5e-06, "logits/chosen": -78945928.53333333, "logits/rejected": -29661472.0, "logps/chosen": -470.85325520833334, "logps/rejected": -476.0070529513889, "loss": 0.085, "rewards/chosen": 5.226908365885417, "rewards/margins": 10.916728973388672, "rewards/rejected": -5.689820607503255, "step": 273 }, { "epoch": 0.06855998999124234, "grad_norm": 22.375, "kl": 7.02076530456543, "learning_rate": 5e-06, "logits/chosen": -58399515.428571425, "logits/rejected": -61953196.8, "logps/chosen": -535.1898018973214, "logps/rejected": -317.746240234375, "loss": 0.0809, "rewards/chosen": 6.339141845703125, "rewards/margins": 9.71686019897461, "rewards/rejected": -3.3777183532714843, "step": 274 }, { "epoch": 0.06881020893281621, "grad_norm": 10.4375, "kl": 6.946084022521973, "learning_rate": 5e-06, "logits/chosen": -53368098.461538464, "logits/rejected": -55329297.45454545, "logps/chosen": -428.52201021634613, "logps/rejected": -597.0437233664773, "loss": 0.047, "rewards/chosen": 4.962626530573918, "rewards/margins": 11.407653968650978, "rewards/rejected": -6.44502743807706, "step": 275 }, { "epoch": 0.0690604278743901, "grad_norm": 21.75, "kl": 5.97451114654541, "learning_rate": 5e-06, "logits/chosen": -81498112.0, "logits/rejected": 81226272.0, "logps/chosen": -430.68505859375, "logps/rejected": -447.3458984375, "loss": 0.0851, "rewards/chosen": 4.633864266531808, "rewards/margins": 9.713998086111886, "rewards/rejected": -5.0801338195800785, "step": 276 }, { "epoch": 0.06931064681596397, "grad_norm": 12.25, "kl": 0.1952921599149704, "learning_rate": 5e-06, "logits/chosen": -78502592.0, "logits/rejected": -31381964.0, "logps/chosen": -487.24859619140625, "logps/rejected": -431.78997802734375, "loss": 0.0301, "rewards/chosen": 5.563851833343506, "rewards/margins": 10.804934978485107, "rewards/rejected": -5.241083145141602, "step": 277 }, { "epoch": 0.06956086575753785, "grad_norm": 10.75, "kl": 0.5088316798210144, "learning_rate": 5e-06, "logits/chosen": -30100498.666666668, "logits/rejected": -39578920.0, "logps/chosen": -342.0590006510417, "logps/rejected": -634.4694417317709, "loss": 0.0814, "rewards/chosen": 5.2512868245442705, "rewards/margins": 13.84987513224284, "rewards/rejected": -8.598588307698568, "step": 278 }, { "epoch": 0.06981108469911172, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61760720.0, "logits/rejected": -42346205.333333336, "logps/chosen": -427.8580729166667, "logps/rejected": -348.9947916666667, "loss": 0.0705, "rewards/chosen": 4.600465456644694, "rewards/margins": 9.550864537556967, "rewards/rejected": -4.950399080912272, "step": 279 }, { "epoch": 0.0700613036406856, "grad_norm": 7.6875, "kl": 4.760970115661621, "learning_rate": 5e-06, "logits/chosen": -41277586.28571428, "logits/rejected": -52429590.4, "logps/chosen": -442.60525948660717, "logps/rejected": -449.628955078125, "loss": 0.0498, "rewards/chosen": 5.872192927769253, "rewards/margins": 11.140544673374722, "rewards/rejected": -5.268351745605469, "step": 280 }, { "epoch": 0.07031152258225948, "grad_norm": 8.875, "kl": 1.9243710041046143, "learning_rate": 5e-06, "logits/chosen": -45989888.0, "logits/rejected": -58634885.333333336, "logps/chosen": -356.4975992838542, "logps/rejected": -542.73876953125, "loss": 0.037, "rewards/chosen": 4.887360254923503, "rewards/margins": 11.935904184977215, "rewards/rejected": -7.048543930053711, "step": 281 }, { "epoch": 0.07056174152383335, "grad_norm": 10.9375, "kl": 11.589720726013184, "learning_rate": 5e-06, "logits/chosen": -59778560.0, "logits/rejected": -66434538.666666664, "logps/chosen": -458.0664388020833, "logps/rejected": -428.1740993923611, "loss": 0.1556, "rewards/chosen": 5.7575327555338545, "rewards/margins": 10.448915269639757, "rewards/rejected": -4.691382514105903, "step": 282 }, { "epoch": 0.07081196046540723, "grad_norm": 16.25, "kl": 10.859646797180176, "learning_rate": 5e-06, "logits/chosen": -50777248.0, "logits/rejected": -43125244.0, "logps/chosen": -399.5986633300781, "logps/rejected": -522.7695922851562, "loss": 0.0645, "rewards/chosen": 5.321485996246338, "rewards/margins": 12.317273139953613, "rewards/rejected": -6.995787143707275, "step": 283 }, { "epoch": 0.0710621794069811, "grad_norm": 7.90625, "kl": 1.132131814956665, "learning_rate": 5e-06, "logits/chosen": -55504290.90909091, "logits/rejected": -39513597.538461536, "logps/chosen": -495.51247336647725, "logps/rejected": -362.1698467548077, "loss": 0.032, "rewards/chosen": 6.6378936767578125, "rewards/margins": 10.8924319927509, "rewards/rejected": -4.254538315993089, "step": 284 }, { "epoch": 0.07131239834855499, "grad_norm": 16.875, "kl": 11.234561920166016, "learning_rate": 5e-06, "logits/chosen": -71418130.28571428, "logits/rejected": -41369632.0, "logps/chosen": -428.03271484375, "logps/rejected": -731.75654296875, "loss": 0.0669, "rewards/chosen": 6.893448965890067, "rewards/margins": 15.258488028390065, "rewards/rejected": -8.3650390625, "step": 285 }, { "epoch": 0.07156261729012886, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54508409.6, "logits/rejected": -55452480.0, "logps/chosen": -340.187939453125, "logps/rejected": -539.3630022321429, "loss": 0.079, "rewards/chosen": 5.844940948486328, "rewards/margins": 13.450724138532365, "rewards/rejected": -7.605783190046038, "step": 286 }, { "epoch": 0.07181283623170275, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57286966.85714286, "logits/rejected": -38474522.35294118, "logps/chosen": -254.78475516183036, "logps/rejected": -456.94514016544116, "loss": 0.1102, "rewards/chosen": 3.0357987540108815, "rewards/margins": 9.18617187628225, "rewards/rejected": -6.150373122271369, "step": 287 }, { "epoch": 0.07206305517327662, "grad_norm": 15.5625, "kl": 2.9759058952331543, "learning_rate": 5e-06, "logits/chosen": -92077286.4, "logits/rejected": -37994189.71428572, "logps/chosen": -515.35732421875, "logps/rejected": -623.5980747767857, "loss": 0.0347, "rewards/chosen": 7.753670501708984, "rewards/margins": 13.778749411446707, "rewards/rejected": -6.025078909737723, "step": 288 }, { "epoch": 0.0723132741148505, "grad_norm": 7.09375, "kl": 5.745540618896484, "learning_rate": 5e-06, "logits/chosen": -73232245.33333333, "logits/rejected": -49373632.0, "logps/chosen": -429.4737548828125, "logps/rejected": -579.3299153645834, "loss": 0.0339, "rewards/chosen": 5.881547292073567, "rewards/margins": 12.978575388590494, "rewards/rejected": -7.097028096516927, "step": 289 }, { "epoch": 0.07256349305642437, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24547368.727272727, "logits/rejected": -39950163.692307696, "logps/chosen": -547.5836736505681, "logps/rejected": -678.9582331730769, "loss": 0.0331, "rewards/chosen": 5.565066944469105, "rewards/margins": 13.200354889556245, "rewards/rejected": -7.635287945087139, "step": 290 }, { "epoch": 0.07281371199799824, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64983193.6, "logits/rejected": -55468856.88888889, "logps/chosen": -355.5180989583333, "logps/rejected": -449.8848470052083, "loss": 0.0891, "rewards/chosen": 4.4547876993815105, "rewards/margins": 10.654435390896268, "rewards/rejected": -6.199647691514757, "step": 291 }, { "epoch": 0.07306393093957213, "grad_norm": 12.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69864550.4, "logits/rejected": -37210121.14285714, "logps/chosen": -418.81591796875, "logps/rejected": -421.0170200892857, "loss": 0.0631, "rewards/chosen": 5.072563171386719, "rewards/margins": 12.084261757986887, "rewards/rejected": -7.0116985866001675, "step": 292 }, { "epoch": 0.073314149881146, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54616715.63636363, "logits/rejected": -51849550.76923077, "logps/chosen": -451.50221946022725, "logps/rejected": -551.5064978966346, "loss": 0.0619, "rewards/chosen": 4.917368108575994, "rewards/margins": 12.405217417470226, "rewards/rejected": -7.487849308894231, "step": 293 }, { "epoch": 0.07356436882271988, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50288458.666666664, "logits/rejected": -57870400.0, "logps/chosen": -452.7593994140625, "logps/rejected": -536.1645100911459, "loss": 0.0448, "rewards/chosen": 5.461680094401042, "rewards/margins": 13.866125106811523, "rewards/rejected": -8.404445012410482, "step": 294 }, { "epoch": 0.07381458776429375, "grad_norm": 14.1875, "kl": 7.171340465545654, "learning_rate": 5e-06, "logits/chosen": -54457760.0, "logits/rejected": -68416877.71428572, "logps/chosen": -398.9943115234375, "logps/rejected": -708.7611607142857, "loss": 0.1173, "rewards/chosen": 4.315273666381836, "rewards/margins": 13.175844301496234, "rewards/rejected": -8.860570635114398, "step": 295 }, { "epoch": 0.07406480670586764, "grad_norm": 20.75, "kl": 4.5795793533325195, "learning_rate": 5e-06, "logits/chosen": -75763680.0, "logits/rejected": -44389234.28571428, "logps/chosen": -454.39814453125, "logps/rejected": -446.1196986607143, "loss": 0.0631, "rewards/chosen": 4.975424957275391, "rewards/margins": 12.284866659981864, "rewards/rejected": -7.309441702706473, "step": 296 }, { "epoch": 0.07431502564744151, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38831180.307692304, "logits/rejected": -55790394.18181818, "logps/chosen": -338.1838566706731, "logps/rejected": -588.3064630681819, "loss": 0.1255, "rewards/chosen": 4.476335672231821, "rewards/margins": 12.39231731174709, "rewards/rejected": -7.91598163951527, "step": 297 }, { "epoch": 0.0745652445890154, "grad_norm": 13.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55339756.8, "logits/rejected": -62088137.14285714, "logps/chosen": -392.377001953125, "logps/rejected": -520.7548130580357, "loss": 0.0632, "rewards/chosen": 4.347047805786133, "rewards/margins": 13.66012328011649, "rewards/rejected": -9.313075474330358, "step": 298 }, { "epoch": 0.07481546353058927, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55607074.90909091, "logits/rejected": -68183310.76923077, "logps/chosen": -368.6839488636364, "logps/rejected": -563.94970703125, "loss": 0.0925, "rewards/chosen": 3.4984796697443183, "rewards/margins": 11.65328008478338, "rewards/rejected": -8.154800415039062, "step": 299 }, { "epoch": 0.07506568247216314, "grad_norm": 12.0, "kl": 7.9532246589660645, "learning_rate": 5e-06, "logits/chosen": -48124446.11764706, "logits/rejected": -48984662.85714286, "logps/chosen": -462.72409237132354, "logps/rejected": -419.2373046875, "loss": 0.072, "rewards/chosen": 6.3196240593405335, "rewards/margins": 12.44195976577887, "rewards/rejected": -6.122335706438337, "step": 300 }, { "epoch": 0.07531590141373702, "grad_norm": 14.0, "kl": 5.24700403213501, "learning_rate": 5e-06, "logits/chosen": -68653216.0, "logits/rejected": -35296780.8, "logps/chosen": -466.408203125, "logps/rejected": -407.138232421875, "loss": 0.1249, "rewards/chosen": 4.35285895211356, "rewards/margins": 10.530777958461215, "rewards/rejected": -6.177919006347656, "step": 301 }, { "epoch": 0.07556612035531089, "grad_norm": 16.625, "kl": 3.634018659591675, "learning_rate": 5e-06, "logits/chosen": -50885474.13333333, "logits/rejected": -56333162.666666664, "logps/chosen": -377.9595052083333, "logps/rejected": -554.6233723958334, "loss": 0.1282, "rewards/chosen": 4.844576009114584, "rewards/margins": 12.83094991048177, "rewards/rejected": -7.9863739013671875, "step": 302 }, { "epoch": 0.07581633929688478, "grad_norm": 5.3125, "kl": 0.5038427114486694, "learning_rate": 5e-06, "logits/chosen": -16485792.0, "logits/rejected": -45833536.0, "logps/chosen": -608.5945638020834, "logps/rejected": -612.8363850911459, "loss": 0.0134, "rewards/chosen": 5.650701522827148, "rewards/margins": 13.277856826782227, "rewards/rejected": -7.627155303955078, "step": 303 }, { "epoch": 0.07606655823845865, "grad_norm": 217.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71577656.8888889, "logits/rejected": 1446080.0, "logps/chosen": -392.09239366319446, "logps/rejected": -441.5680338541667, "loss": 0.0545, "rewards/chosen": 4.2894774542914496, "rewards/margins": 10.09934582180447, "rewards/rejected": -5.809868367513021, "step": 304 }, { "epoch": 0.07631677718003253, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45708024.88888889, "logits/rejected": -48254118.4, "logps/chosen": -469.90489366319446, "logps/rejected": -553.3817057291667, "loss": 0.0403, "rewards/chosen": 5.711358812120226, "rewards/margins": 15.100974697536893, "rewards/rejected": -9.389615885416667, "step": 305 }, { "epoch": 0.0765669961216064, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65677869.71428572, "logits/rejected": -37207715.2, "logps/chosen": -538.8613630022321, "logps/rejected": -673.329736328125, "loss": 0.0181, "rewards/chosen": 6.0059612819126675, "rewards/margins": 17.722608620779855, "rewards/rejected": -11.716647338867187, "step": 306 }, { "epoch": 0.07681721506318029, "grad_norm": 13.5, "kl": 3.474886894226074, "learning_rate": 5e-06, "logits/chosen": -57763948.0, "logits/rejected": -35655440.0, "logps/chosen": -380.42388916015625, "logps/rejected": -565.8240966796875, "loss": 0.0494, "rewards/chosen": 5.190339088439941, "rewards/margins": 12.820594310760498, "rewards/rejected": -7.630255222320557, "step": 307 }, { "epoch": 0.07706743400475416, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50644028.8, "logits/rejected": -64086038.85714286, "logps/chosen": -335.427197265625, "logps/rejected": -471.4610072544643, "loss": 0.1226, "rewards/chosen": 4.116854476928711, "rewards/margins": 10.397411291939871, "rewards/rejected": -6.280556815011161, "step": 308 }, { "epoch": 0.07731765294632803, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40328700.44444445, "logits/rejected": -66934630.4, "logps/chosen": -318.47987196180554, "logps/rejected": -568.573828125, "loss": 0.0707, "rewards/chosen": 3.737506866455078, "rewards/margins": 11.43120091756185, "rewards/rejected": -7.693694051106771, "step": 309 }, { "epoch": 0.07756787188790192, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -78686771.2, "logits/rejected": -47525301.333333336, "logps/chosen": -442.12867838541666, "logps/rejected": -514.9216579861111, "loss": 0.0935, "rewards/chosen": 4.877289326985677, "rewards/margins": 13.47551778157552, "rewards/rejected": -8.598228454589844, "step": 310 }, { "epoch": 0.07781809082947579, "grad_norm": 20.75, "kl": 0.8121821284294128, "learning_rate": 5e-06, "logits/chosen": -44828251.428571425, "logits/rejected": -53163142.4, "logps/chosen": -362.40244838169644, "logps/rejected": -622.3521484375, "loss": 0.084, "rewards/chosen": 3.6696810041155135, "rewards/margins": 11.712674931117467, "rewards/rejected": -8.042993927001953, "step": 311 }, { "epoch": 0.07806830977104967, "grad_norm": 27.5, "kl": 7.953427791595459, "learning_rate": 5e-06, "logits/chosen": -48007264.0, "logits/rejected": -50813360.0, "logps/chosen": -312.81829833984375, "logps/rejected": -782.123779296875, "loss": 0.194, "rewards/chosen": 2.631920337677002, "rewards/margins": 13.814859867095947, "rewards/rejected": -11.182939529418945, "step": 312 }, { "epoch": 0.07831852871262354, "grad_norm": 18.375, "kl": 4.491418838500977, "learning_rate": 5e-06, "logits/chosen": -59036037.333333336, "logits/rejected": -62983744.0, "logps/chosen": -405.5749104817708, "logps/rejected": -400.713134765625, "loss": 0.0604, "rewards/chosen": 5.748222351074219, "rewards/margins": 12.755852381388348, "rewards/rejected": -7.007630030314128, "step": 313 }, { "epoch": 0.07856874765419743, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49776497.23076923, "logits/rejected": -39559080.72727273, "logps/chosen": -410.9743088942308, "logps/rejected": -649.0001775568181, "loss": 0.0764, "rewards/chosen": 3.891269096961388, "rewards/margins": 12.504322852288093, "rewards/rejected": -8.613053755326705, "step": 314 }, { "epoch": 0.0788189665957713, "grad_norm": 13.5625, "kl": 2.1953799724578857, "learning_rate": 5e-06, "logits/chosen": -22781229.714285713, "logits/rejected": -51122547.2, "logps/chosen": -473.47377232142856, "logps/rejected": -650.680078125, "loss": 0.0604, "rewards/chosen": 3.87014525277274, "rewards/margins": 11.899960163661412, "rewards/rejected": -8.029814910888671, "step": 315 }, { "epoch": 0.07906918553734518, "grad_norm": 15.8125, "kl": 9.544827461242676, "learning_rate": 5e-06, "logits/chosen": -48614872.0, "logits/rejected": -43650948.0, "logps/chosen": -453.4354553222656, "logps/rejected": -512.2210693359375, "loss": 0.0898, "rewards/chosen": 5.919229984283447, "rewards/margins": 13.66039228439331, "rewards/rejected": -7.741162300109863, "step": 316 }, { "epoch": 0.07931940447891905, "grad_norm": 10.25, "kl": 4.059052467346191, "learning_rate": 5e-06, "logits/chosen": -39676822.15384615, "logits/rejected": -43106210.90909091, "logps/chosen": -295.5746882512019, "logps/rejected": -311.35336026278407, "loss": 0.0869, "rewards/chosen": 4.078121478740986, "rewards/margins": 8.50689033028129, "rewards/rejected": -4.428768851540306, "step": 317 }, { "epoch": 0.07956962342049294, "grad_norm": 9.3125, "kl": 5.8941545486450195, "learning_rate": 5e-06, "logits/chosen": -64744792.615384616, "logits/rejected": -51566289.45454545, "logps/chosen": -515.5871018629807, "logps/rejected": -568.5423473011364, "loss": 0.0258, "rewards/chosen": 5.717002281775842, "rewards/margins": 16.027895253855032, "rewards/rejected": -10.31089297207919, "step": 318 }, { "epoch": 0.07981984236206681, "grad_norm": 16.875, "kl": 2.7760798931121826, "learning_rate": 5e-06, "logits/chosen": -71659475.2, "logits/rejected": -19278834.285714287, "logps/chosen": -399.245458984375, "logps/rejected": -608.02392578125, "loss": 0.1061, "rewards/chosen": 4.554701614379883, "rewards/margins": 10.876112747192384, "rewards/rejected": -6.3214111328125, "step": 319 }, { "epoch": 0.08007006130364068, "grad_norm": 21.5, "kl": 3.1826140880584717, "learning_rate": 5e-06, "logits/chosen": -47259182.54545455, "logits/rejected": -36462040.615384616, "logps/chosen": -488.42813387784093, "logps/rejected": -390.6603440504808, "loss": 0.0642, "rewards/chosen": 6.43964316628196, "rewards/margins": 11.169853984059152, "rewards/rejected": -4.730210817777193, "step": 320 }, { "epoch": 0.08032028024521456, "grad_norm": 4.78125, "kl": 1.7273375988006592, "learning_rate": 5e-06, "logits/chosen": -66869568.0, "logits/rejected": -85939221.33333333, "logps/chosen": -534.5769856770834, "logps/rejected": -714.4563802083334, "loss": 0.0283, "rewards/chosen": 6.996565500895183, "rewards/margins": 15.929007212320965, "rewards/rejected": -8.932441711425781, "step": 321 }, { "epoch": 0.08057049918678844, "grad_norm": 20.0, "kl": 2.56874418258667, "learning_rate": 5e-06, "logits/chosen": -57239475.2, "logits/rejected": -56795510.85714286, "logps/chosen": -335.910400390625, "logps/rejected": -404.29600306919644, "loss": 0.1252, "rewards/chosen": 4.564379119873047, "rewards/margins": 8.60868355887277, "rewards/rejected": -4.044304438999721, "step": 322 }, { "epoch": 0.08082071812836232, "grad_norm": 5.03125, "kl": 0.07329623401165009, "learning_rate": 5e-06, "logits/chosen": -69165893.81818181, "logits/rejected": -1601237.5384615385, "logps/chosen": -447.9138849431818, "logps/rejected": -450.7696063701923, "loss": 0.0382, "rewards/chosen": 6.760638150301847, "rewards/margins": 13.934056635503168, "rewards/rejected": -7.1734184852013225, "step": 323 }, { "epoch": 0.08107093706993619, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39580979.2, "logits/rejected": -68372598.85714285, "logps/chosen": -313.34326171875, "logps/rejected": -575.8517020089286, "loss": 0.0425, "rewards/chosen": 6.435821533203125, "rewards/margins": 13.611937386648997, "rewards/rejected": -7.176115853445871, "step": 324 }, { "epoch": 0.08132115601151008, "grad_norm": 20.125, "kl": 0.8204015493392944, "learning_rate": 5e-06, "logits/chosen": -65441307.428571425, "logits/rejected": -57284736.0, "logps/chosen": -425.15806361607144, "logps/rejected": -590.3502987132352, "loss": 0.0802, "rewards/chosen": 6.303914751325335, "rewards/margins": 11.97205852861164, "rewards/rejected": -5.668143777286305, "step": 325 }, { "epoch": 0.08157137495308395, "grad_norm": 19.0, "kl": 10.519261360168457, "learning_rate": 5e-06, "logits/chosen": -80258446.76923077, "logits/rejected": -51475234.90909091, "logps/chosen": -530.1903545673077, "logps/rejected": -660.7665571732955, "loss": 0.0618, "rewards/chosen": 7.094483595628005, "rewards/margins": 13.829765106414582, "rewards/rejected": -6.735281510786577, "step": 326 }, { "epoch": 0.08182159389465783, "grad_norm": 10.9375, "kl": 2.997100830078125, "learning_rate": 5e-06, "logits/chosen": -68122611.2, "logits/rejected": -37164114.28571428, "logps/chosen": -521.449755859375, "logps/rejected": -398.49173409598217, "loss": 0.0351, "rewards/chosen": 6.441287231445313, "rewards/margins": 12.19258804321289, "rewards/rejected": -5.751300811767578, "step": 327 }, { "epoch": 0.0820718128362317, "grad_norm": 14.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28296394.666666668, "logits/rejected": -53455300.266666666, "logps/chosen": -365.97100151909723, "logps/rejected": -708.0231119791666, "loss": 0.0643, "rewards/chosen": 4.3345459832085504, "rewards/margins": 15.565710533989801, "rewards/rejected": -11.23116455078125, "step": 328 }, { "epoch": 0.08232203177780557, "grad_norm": 25.0, "kl": 6.122256755828857, "learning_rate": 5e-06, "logits/chosen": -59985792.0, "logits/rejected": -43732233.14285714, "logps/chosen": -395.7806181066176, "logps/rejected": -368.55897739955356, "loss": 0.1264, "rewards/chosen": 5.007666195140166, "rewards/margins": 11.517860701104173, "rewards/rejected": -6.510194505964007, "step": 329 }, { "epoch": 0.08257225071937946, "grad_norm": 19.25, "kl": 8.461600303649902, "learning_rate": 5e-06, "logits/chosen": -42237380.92307692, "logits/rejected": -49793361.45454545, "logps/chosen": -479.0254657451923, "logps/rejected": -346.64839311079544, "loss": 0.0445, "rewards/chosen": 6.577327434833233, "rewards/margins": 12.408399461866258, "rewards/rejected": -5.831072027033025, "step": 330 }, { "epoch": 0.08282246966095333, "grad_norm": 18.25, "kl": 1.3572839498519897, "learning_rate": 5e-06, "logits/chosen": -79799195.42857143, "logits/rejected": -47506073.6, "logps/chosen": -401.2435825892857, "logps/rejected": -499.124609375, "loss": 0.057, "rewards/chosen": 5.039963858468192, "rewards/margins": 14.429381125313895, "rewards/rejected": -9.389417266845703, "step": 331 }, { "epoch": 0.08307268860252721, "grad_norm": 15.625, "kl": 3.3528761863708496, "learning_rate": 5e-06, "logits/chosen": -91620602.18181819, "logits/rejected": -45963298.461538464, "logps/chosen": -401.23073508522725, "logps/rejected": -722.0138221153846, "loss": 0.0619, "rewards/chosen": 3.542086514559659, "rewards/margins": 12.321062368112845, "rewards/rejected": -8.778975853553185, "step": 332 }, { "epoch": 0.08332290754410109, "grad_norm": 19.25, "kl": 2.2864317893981934, "learning_rate": 5e-06, "logits/chosen": -78876793.6, "logits/rejected": -32254505.14285714, "logps/chosen": -462.73212890625, "logps/rejected": -509.91015625, "loss": 0.078, "rewards/chosen": 4.534415817260742, "rewards/margins": 13.48143185206822, "rewards/rejected": -8.947016034807477, "step": 333 }, { "epoch": 0.08357312648567497, "grad_norm": 23.5, "kl": 15.974513053894043, "learning_rate": 5e-06, "logits/chosen": -58769088.0, "logits/rejected": -43553034.666666664, "logps/chosen": -456.30121527777777, "logps/rejected": -764.964599609375, "loss": 0.1248, "rewards/chosen": 5.548309326171875, "rewards/margins": 13.340513229370117, "rewards/rejected": -7.792203903198242, "step": 334 }, { "epoch": 0.08382334542724884, "grad_norm": 31.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57135756.8, "logits/rejected": -54594267.428571425, "logps/chosen": -427.58017578125, "logps/rejected": -613.2606026785714, "loss": 0.1121, "rewards/chosen": 4.363252639770508, "rewards/margins": 11.321872438703265, "rewards/rejected": -6.958619798932757, "step": 335 }, { "epoch": 0.08407356436882273, "grad_norm": 18.5, "kl": 3.6958415508270264, "learning_rate": 5e-06, "logits/chosen": -29753862.4, "logits/rejected": -55726363.428571425, "logps/chosen": -332.8385009765625, "logps/rejected": -564.3188127790179, "loss": 0.0776, "rewards/chosen": 3.554238128662109, "rewards/margins": 12.512367466517858, "rewards/rejected": -8.958129337855748, "step": 336 }, { "epoch": 0.0843237833103966, "grad_norm": 14.75, "kl": 1.9886001348495483, "learning_rate": 5e-06, "logits/chosen": -71776645.33333333, "logits/rejected": -33673770.666666664, "logps/chosen": -468.2509765625, "logps/rejected": -374.058349609375, "loss": 0.0554, "rewards/chosen": 5.30703608194987, "rewards/margins": 11.932327906290691, "rewards/rejected": -6.62529182434082, "step": 337 }, { "epoch": 0.08457400225197047, "grad_norm": 2.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46806941.09090909, "logits/rejected": -59882958.76923077, "logps/chosen": -452.71950461647725, "logps/rejected": -889.8108473557693, "loss": 0.0187, "rewards/chosen": 6.304441972212358, "rewards/margins": 18.56909040971236, "rewards/rejected": -12.2646484375, "step": 338 }, { "epoch": 0.08482422119354435, "grad_norm": 23.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39775402.666666664, "logits/rejected": -60844488.53333333, "logps/chosen": -373.6398111979167, "logps/rejected": -589.4291666666667, "loss": 0.0569, "rewards/chosen": 4.993715074327257, "rewards/margins": 12.940140448676216, "rewards/rejected": -7.946425374348959, "step": 339 }, { "epoch": 0.08507444013511822, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54369786.18181818, "logits/rejected": -34264625.23076923, "logps/chosen": -363.16166548295456, "logps/rejected": -634.0920222355769, "loss": 0.0525, "rewards/chosen": 5.306921872225675, "rewards/margins": 15.834063736708849, "rewards/rejected": -10.527141864483173, "step": 340 }, { "epoch": 0.08532465907669211, "grad_norm": 13.8125, "kl": 3.7437191009521484, "learning_rate": 5e-06, "logits/chosen": -69789661.53846154, "logits/rejected": -71503592.72727273, "logps/chosen": -552.8607271634615, "logps/rejected": -348.5597478693182, "loss": 0.0393, "rewards/chosen": 6.496727576622596, "rewards/margins": 11.194468304827495, "rewards/rejected": -4.6977407282049, "step": 341 }, { "epoch": 0.08557487801826598, "grad_norm": 18.75, "kl": 2.2659192085266113, "learning_rate": 5e-06, "logits/chosen": -55322080.0, "logits/rejected": -19687675.42857143, "logps/chosen": -468.987841796875, "logps/rejected": -484.4410923549107, "loss": 0.0457, "rewards/chosen": 5.919325637817383, "rewards/margins": 12.086318915230887, "rewards/rejected": -6.166993277413504, "step": 342 }, { "epoch": 0.08582509695983986, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -82238848.0, "logits/rejected": -32393040.0, "logps/chosen": -356.1858825683594, "logps/rejected": -407.47186279296875, "loss": 0.0844, "rewards/chosen": 3.6388773918151855, "rewards/margins": 9.83605670928955, "rewards/rejected": -6.197179317474365, "step": 343 }, { "epoch": 0.08607531590141373, "grad_norm": 13.1875, "kl": 1.406911849975586, "learning_rate": 5e-06, "logits/chosen": -54211136.0, "logits/rejected": -49059731.2, "logps/chosen": -490.14634486607144, "logps/rejected": -461.68271484375, "loss": 0.0765, "rewards/chosen": 5.853201729910714, "rewards/margins": 13.903588540213448, "rewards/rejected": -8.050386810302735, "step": 344 }, { "epoch": 0.08632553484298762, "grad_norm": 23.75, "kl": 10.15842056274414, "learning_rate": 5e-06, "logits/chosen": -16359782.666666666, "logits/rejected": -95072714.66666667, "logps/chosen": -399.1624348958333, "logps/rejected": -492.3768717447917, "loss": 0.1384, "rewards/chosen": 4.814750989278157, "rewards/margins": 13.772103945414226, "rewards/rejected": -8.957352956136068, "step": 345 }, { "epoch": 0.08657575378456149, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39405501.333333336, "logits/rejected": -35529018.666666664, "logps/chosen": -424.7720133463542, "logps/rejected": -584.3602294921875, "loss": 0.0367, "rewards/chosen": 5.987045923868815, "rewards/margins": 15.205389658610027, "rewards/rejected": -9.218343734741211, "step": 346 }, { "epoch": 0.08682597272613538, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71689997.71428572, "logits/rejected": -36729336.47058824, "logps/chosen": -670.90380859375, "logps/rejected": -450.77039292279414, "loss": 0.0563, "rewards/chosen": 7.845038822719029, "rewards/margins": 14.11451801332105, "rewards/rejected": -6.269479190602022, "step": 347 }, { "epoch": 0.08707619166770925, "grad_norm": 10.1875, "kl": 0.8384997248649597, "learning_rate": 5e-06, "logits/chosen": -73047540.36363636, "logits/rejected": -50919163.07692308, "logps/chosen": -502.95854048295456, "logps/rejected": -549.7272385817307, "loss": 0.0266, "rewards/chosen": 6.3045786077326, "rewards/margins": 15.477944274048705, "rewards/rejected": -9.173365666316105, "step": 348 }, { "epoch": 0.08732641060928312, "grad_norm": 15.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8290919.0, "logits/rejected": 18450308.0, "logps/chosen": -378.5304870605469, "logps/rejected": -619.2883911132812, "loss": 0.0528, "rewards/chosen": 4.09270715713501, "rewards/margins": 12.661085605621338, "rewards/rejected": -8.568378448486328, "step": 349 }, { "epoch": 0.087576629550857, "grad_norm": 24.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65208251.07692308, "logits/rejected": -50788544.0, "logps/chosen": -550.4088792067307, "logps/rejected": -394.7405894886364, "loss": 0.0686, "rewards/chosen": 4.863852867713342, "rewards/margins": 11.727734759137348, "rewards/rejected": -6.863881891424006, "step": 350 }, { "epoch": 0.08782684849243087, "grad_norm": 8.5625, "kl": 3.084390640258789, "learning_rate": 5e-06, "logits/chosen": -50031324.44444445, "logits/rejected": -53625011.2, "logps/chosen": -503.4275716145833, "logps/rejected": -781.49453125, "loss": 0.0382, "rewards/chosen": 5.849625481499566, "rewards/margins": 18.398342725965712, "rewards/rejected": -12.548717244466145, "step": 351 }, { "epoch": 0.08807706743400476, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50025744.0, "logits/rejected": -73963898.66666667, "logps/chosen": -418.3951822916667, "logps/rejected": -627.3041585286459, "loss": 0.0376, "rewards/chosen": 5.052695910135905, "rewards/margins": 13.235371589660645, "rewards/rejected": -8.18267567952474, "step": 352 }, { "epoch": 0.08832728637557863, "grad_norm": 14.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26449706.666666668, "logits/rejected": -36583244.8, "logps/chosen": -303.1364474826389, "logps/rejected": -434.3656901041667, "loss": 0.0688, "rewards/chosen": 3.8088162740071616, "rewards/margins": 12.002457427978516, "rewards/rejected": -8.193641153971354, "step": 353 }, { "epoch": 0.08857750531715251, "grad_norm": 12.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56242976.0, "logits/rejected": 33482381.714285713, "logps/chosen": -548.902880859375, "logps/rejected": -346.14878627232144, "loss": 0.0369, "rewards/chosen": 7.538912200927735, "rewards/margins": 13.006964656284879, "rewards/rejected": -5.468052455357143, "step": 354 }, { "epoch": 0.08882772425872638, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53564284.8, "logits/rejected": -62660818.28571428, "logps/chosen": -397.53154296875, "logps/rejected": -714.7157505580357, "loss": 0.0661, "rewards/chosen": 3.3319602966308595, "rewards/margins": 15.111241912841797, "rewards/rejected": -11.779281616210938, "step": 355 }, { "epoch": 0.08907794320030027, "grad_norm": 20.625, "kl": 3.1753997802734375, "learning_rate": 5e-06, "logits/chosen": -39247488.0, "logits/rejected": -55188656.0, "logps/chosen": -421.8942565917969, "logps/rejected": -615.5950927734375, "loss": 0.0858, "rewards/chosen": 4.690866470336914, "rewards/margins": 15.615058898925781, "rewards/rejected": -10.924192428588867, "step": 356 }, { "epoch": 0.08932816214187414, "grad_norm": 17.375, "kl": 1.4323711395263672, "learning_rate": 5e-06, "logits/chosen": -60970124.8, "logits/rejected": -73626752.0, "logps/chosen": -333.5826171875, "logps/rejected": -555.0177525111607, "loss": 0.0877, "rewards/chosen": 3.657939910888672, "rewards/margins": 11.130105699811663, "rewards/rejected": -7.472165788922991, "step": 357 }, { "epoch": 0.08957838108344801, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67144472.0, "logits/rejected": -38479056.0, "logps/chosen": -423.0672607421875, "logps/rejected": -615.0408935546875, "loss": 0.1016, "rewards/chosen": 4.16230583190918, "rewards/margins": 13.634326934814453, "rewards/rejected": -9.472021102905273, "step": 358 }, { "epoch": 0.0898286000250219, "grad_norm": 18.625, "kl": 3.3815784454345703, "learning_rate": 5e-06, "logits/chosen": -45535152.0, "logits/rejected": -51963908.0, "logps/chosen": -386.4666748046875, "logps/rejected": -322.1533508300781, "loss": 0.1285, "rewards/chosen": 3.6765856742858887, "rewards/margins": 8.868733882904053, "rewards/rejected": -5.192148208618164, "step": 359 }, { "epoch": 0.09007881896659577, "grad_norm": 19.875, "kl": 0.9900690913200378, "learning_rate": 5e-06, "logits/chosen": -57026585.6, "logits/rejected": -60664571.428571425, "logps/chosen": -374.341943359375, "logps/rejected": -529.1930454799107, "loss": 0.0431, "rewards/chosen": 4.683572387695312, "rewards/margins": 12.21929212297712, "rewards/rejected": -7.535719735281808, "step": 360 }, { "epoch": 0.09032903790816965, "grad_norm": 5.65625, "kl": 3.256103515625, "learning_rate": 5e-06, "logits/chosen": -61525415.384615384, "logits/rejected": -63263197.09090909, "logps/chosen": -541.3863431490385, "logps/rejected": -405.3743341619318, "loss": 0.0149, "rewards/chosen": 6.213826693021334, "rewards/margins": 12.861952775008195, "rewards/rejected": -6.64812608198686, "step": 361 }, { "epoch": 0.09057925684974352, "grad_norm": 13.375, "kl": 4.158720016479492, "learning_rate": 5e-06, "logits/chosen": -56053405.538461536, "logits/rejected": -46150685.09090909, "logps/chosen": -394.01810396634613, "logps/rejected": -508.25053267045456, "loss": 0.0663, "rewards/chosen": 4.731486100416917, "rewards/margins": 11.334641063129986, "rewards/rejected": -6.603154962713068, "step": 362 }, { "epoch": 0.09082947579131741, "grad_norm": 15.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58984179.2, "logits/rejected": -44649984.0, "logps/chosen": -429.757373046875, "logps/rejected": -497.41650390625, "loss": 0.0432, "rewards/chosen": 5.909109497070313, "rewards/margins": 14.319558933803014, "rewards/rejected": -8.410449436732701, "step": 363 }, { "epoch": 0.09107969473289128, "grad_norm": 15.4375, "kl": 2.5075480937957764, "learning_rate": 5e-06, "logits/chosen": -36391227.733333334, "logits/rejected": -60230762.666666664, "logps/chosen": -371.2073567708333, "logps/rejected": -653.6985677083334, "loss": 0.107, "rewards/chosen": 4.0661875406901045, "rewards/margins": 14.488671196831596, "rewards/rejected": -10.422483656141493, "step": 364 }, { "epoch": 0.09132991367446516, "grad_norm": 19.625, "kl": 5.928158760070801, "learning_rate": 5e-06, "logits/chosen": -66486496.0, "logits/rejected": 1660486.6666666667, "logps/chosen": -516.28369140625, "logps/rejected": -445.3562418619792, "loss": 0.0979, "rewards/chosen": 6.100261688232422, "rewards/margins": 12.259714762369793, "rewards/rejected": -6.15945307413737, "step": 365 }, { "epoch": 0.09158013261603903, "grad_norm": 13.6875, "kl": 6.553426265716553, "learning_rate": 5e-06, "logits/chosen": -55526043.428571425, "logits/rejected": -49916358.4, "logps/chosen": -441.85703822544644, "logps/rejected": -593.2875, "loss": 0.0348, "rewards/chosen": 5.074913569859096, "rewards/margins": 13.679854365757535, "rewards/rejected": -8.604940795898438, "step": 366 }, { "epoch": 0.0918303515576129, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41151360.0, "logits/rejected": -41403968.0, "logps/chosen": -417.757666015625, "logps/rejected": -400.60878208705356, "loss": 0.0609, "rewards/chosen": 4.611019897460937, "rewards/margins": 11.120613861083985, "rewards/rejected": -6.509593963623047, "step": 367 }, { "epoch": 0.09208057049918679, "grad_norm": 14.0, "kl": 8.070347785949707, "learning_rate": 5e-06, "logits/chosen": -37015507.692307696, "logits/rejected": -31232366.545454547, "logps/chosen": -395.2751652644231, "logps/rejected": -530.2959872159091, "loss": 0.0552, "rewards/chosen": 5.743570767916166, "rewards/margins": 12.627804416042942, "rewards/rejected": -6.884233648126775, "step": 368 }, { "epoch": 0.09233078944076066, "grad_norm": 27.75, "kl": 2.749298095703125, "learning_rate": 5e-06, "logits/chosen": -65480448.0, "logits/rejected": -25416629.333333332, "logps/chosen": -481.6826171875, "logps/rejected": -474.8369140625, "loss": 0.1008, "rewards/chosen": 6.059861501057942, "rewards/margins": 10.086954752604166, "rewards/rejected": -4.027093251546224, "step": 369 }, { "epoch": 0.09258100838233455, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44566824.0, "logits/rejected": -45385685.333333336, "logps/chosen": -399.710693359375, "logps/rejected": -653.25244140625, "loss": 0.0388, "rewards/chosen": 4.439666112263997, "rewards/margins": 15.766674677530926, "rewards/rejected": -11.327008565266928, "step": 370 }, { "epoch": 0.09283122732390842, "grad_norm": 18.75, "kl": 6.107187271118164, "learning_rate": 5e-06, "logits/chosen": -63404730.666666664, "logits/rejected": -22978010.666666668, "logps/chosen": -474.6171061197917, "logps/rejected": -350.288818359375, "loss": 0.088, "rewards/chosen": 4.971944491068522, "rewards/margins": 10.882649421691895, "rewards/rejected": -5.910704930623372, "step": 371 }, { "epoch": 0.0930814462654823, "grad_norm": 14.75, "kl": 3.7354979515075684, "learning_rate": 5e-06, "logits/chosen": -34817354.666666664, "logits/rejected": -6763334.666666667, "logps/chosen": -350.4503173828125, "logps/rejected": -509.0269775390625, "loss": 0.0934, "rewards/chosen": 5.426799774169922, "rewards/margins": 10.002487182617188, "rewards/rejected": -4.575687408447266, "step": 372 }, { "epoch": 0.09333166520705617, "grad_norm": 13.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50732711.384615384, "logits/rejected": -43489856.0, "logps/chosen": -204.86395733173077, "logps/rejected": -454.76589133522725, "loss": 0.1408, "rewards/chosen": 3.477960146390475, "rewards/margins": 8.843580846186285, "rewards/rejected": -5.36562069979581, "step": 373 }, { "epoch": 0.09358188414863006, "grad_norm": 12.75, "kl": 0.46170300245285034, "learning_rate": 5e-06, "logits/chosen": -53032192.0, "logits/rejected": -76971019.63636364, "logps/chosen": -434.9354717548077, "logps/rejected": -654.2144886363636, "loss": 0.0481, "rewards/chosen": 6.249504089355469, "rewards/margins": 12.367059881036932, "rewards/rejected": -6.117555791681463, "step": 374 }, { "epoch": 0.09383210309020393, "grad_norm": 18.75, "kl": 17.76038360595703, "learning_rate": 5e-06, "logits/chosen": -66216277.333333336, "logits/rejected": -48391557.333333336, "logps/chosen": -435.092529296875, "logps/rejected": -399.892333984375, "loss": 0.1204, "rewards/chosen": 6.640109592013889, "rewards/margins": 12.17822986178928, "rewards/rejected": -5.538120269775391, "step": 375 }, { "epoch": 0.09408232203177781, "grad_norm": 17.5, "kl": 13.891247749328613, "learning_rate": 5e-06, "logits/chosen": -59493808.0, "logits/rejected": -28179568.0, "logps/chosen": -455.0027262369792, "logps/rejected": -370.8207600911458, "loss": 0.1225, "rewards/chosen": 6.140077590942383, "rewards/margins": 10.166110038757324, "rewards/rejected": -4.026032447814941, "step": 376 }, { "epoch": 0.09433254097335168, "grad_norm": 5.6875, "kl": 7.242225170135498, "learning_rate": 5e-06, "logits/chosen": -58986362.666666664, "logits/rejected": -28042061.333333332, "logps/chosen": -565.0171712239584, "logps/rejected": -347.8406168619792, "loss": 0.0162, "rewards/chosen": 7.618915557861328, "rewards/margins": 13.646324157714844, "rewards/rejected": -6.027408599853516, "step": 377 }, { "epoch": 0.09458275991492555, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52458016.0, "logits/rejected": -24554508.8, "logps/chosen": -299.80918666294644, "logps/rejected": -501.97216796875, "loss": 0.0837, "rewards/chosen": 3.9322482517787387, "rewards/margins": 12.154504721505301, "rewards/rejected": -8.222256469726563, "step": 378 }, { "epoch": 0.09483297885649944, "grad_norm": 23.75, "kl": 8.254425048828125, "learning_rate": 5e-06, "logits/chosen": -49368411.428571425, "logits/rejected": -47947078.4, "logps/chosen": -448.5439453125, "logps/rejected": -664.28271484375, "loss": 0.0619, "rewards/chosen": 6.7694887433733255, "rewards/margins": 14.484971836635044, "rewards/rejected": -7.715483093261719, "step": 379 }, { "epoch": 0.09508319779807331, "grad_norm": 12.4375, "kl": 1.073413610458374, "learning_rate": 5e-06, "logits/chosen": -28673864.533333335, "logits/rejected": -38044817.777777776, "logps/chosen": -329.4783203125, "logps/rejected": -461.26161024305554, "loss": 0.1649, "rewards/chosen": 4.78260498046875, "rewards/margins": 11.099345228407117, "rewards/rejected": -6.316740247938368, "step": 380 }, { "epoch": 0.0953334167396472, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30628922.181818184, "logits/rejected": -29202161.230769232, "logps/chosen": -285.32712624289775, "logps/rejected": -498.37661508413464, "loss": 0.1054, "rewards/chosen": 4.811467950994318, "rewards/margins": 10.034194626174607, "rewards/rejected": -5.222726675180288, "step": 381 }, { "epoch": 0.09558363568122107, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39937210.18181818, "logits/rejected": -34606724.92307692, "logps/chosen": -437.4091796875, "logps/rejected": -572.4448617788462, "loss": 0.0646, "rewards/chosen": 5.796208815141157, "rewards/margins": 14.80409995659248, "rewards/rejected": -9.007891141451323, "step": 382 }, { "epoch": 0.09583385462279495, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31486598.4, "logits/rejected": -56153600.0, "logps/chosen": -402.7879638671875, "logps/rejected": -559.7917131696429, "loss": 0.0637, "rewards/chosen": 5.576757431030273, "rewards/margins": 14.116545813424247, "rewards/rejected": -8.539788382393974, "step": 383 }, { "epoch": 0.09608407356436882, "grad_norm": 13.125, "kl": 8.177698135375977, "learning_rate": 5e-06, "logits/chosen": -28956149.333333332, "logits/rejected": -38146065.777777776, "logps/chosen": -396.94127604166664, "logps/rejected": -384.12681749131946, "loss": 0.0715, "rewards/chosen": 5.464049275716146, "rewards/margins": 11.317184702555338, "rewards/rejected": -5.853135426839192, "step": 384 }, { "epoch": 0.0963342925059427, "grad_norm": 8.8125, "kl": 5.718497276306152, "learning_rate": 5e-06, "logits/chosen": -40536068.571428575, "logits/rejected": -93329408.0, "logps/chosen": -372.83241489955356, "logps/rejected": -517.881298828125, "loss": 0.0322, "rewards/chosen": 4.840836661202567, "rewards/margins": 11.606588309151785, "rewards/rejected": -6.765751647949219, "step": 385 }, { "epoch": 0.09658451144751658, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46708416.0, "logits/rejected": -38078960.0, "logps/chosen": -339.2285888671875, "logps/rejected": -552.4586007254464, "loss": 0.054, "rewards/chosen": 5.0961761474609375, "rewards/margins": 14.008648463657924, "rewards/rejected": -8.912472316196986, "step": 386 }, { "epoch": 0.09683473038909045, "grad_norm": 12.25, "kl": 1.0777747631072998, "learning_rate": 5e-06, "logits/chosen": -65940394.666666664, "logits/rejected": -61548618.666666664, "logps/chosen": -545.5083821614584, "logps/rejected": -683.36474609375, "loss": 0.0414, "rewards/chosen": 5.95986811319987, "rewards/margins": 15.873210906982422, "rewards/rejected": -9.913342793782553, "step": 387 }, { "epoch": 0.09708494933066433, "grad_norm": 24.125, "kl": 7.892280101776123, "learning_rate": 5e-06, "logits/chosen": -45052066.13333333, "logits/rejected": -22081038.222222224, "logps/chosen": -353.6652018229167, "logps/rejected": -462.7194010416667, "loss": 0.0798, "rewards/chosen": 5.11092274983724, "rewards/margins": 13.230683898925781, "rewards/rejected": -8.119761149088541, "step": 388 }, { "epoch": 0.0973351682722382, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27990958.769230768, "logits/rejected": -65293504.0, "logps/chosen": -320.1760441706731, "logps/rejected": -499.63383345170456, "loss": 0.0959, "rewards/chosen": 4.739841461181641, "rewards/margins": 10.843053991144354, "rewards/rejected": -6.103212529962713, "step": 389 }, { "epoch": 0.09758538721381209, "grad_norm": 13.75, "kl": 0.3630460202693939, "learning_rate": 5e-06, "logits/chosen": -45893707.63636363, "logits/rejected": -34649604.92307692, "logps/chosen": -292.3326970880682, "logps/rejected": -452.06381460336536, "loss": 0.0554, "rewards/chosen": 3.9567704634232954, "rewards/margins": 9.537572794027263, "rewards/rejected": -5.580802330603967, "step": 390 }, { "epoch": 0.09783560615538596, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53935556.571428575, "logits/rejected": -41159052.8, "logps/chosen": -393.78916713169644, "logps/rejected": -526.728857421875, "loss": 0.0606, "rewards/chosen": 5.70606449672154, "rewards/margins": 13.297688184465681, "rewards/rejected": -7.591623687744141, "step": 391 }, { "epoch": 0.09808582509695984, "grad_norm": 22.625, "kl": 8.097393989562988, "learning_rate": 5e-06, "logits/chosen": -44183943.52941176, "logits/rejected": -50947894.85714286, "logps/chosen": -410.19680606617646, "logps/rejected": -846.0643833705357, "loss": 0.1159, "rewards/chosen": 5.803953282973346, "rewards/margins": 20.649368670808165, "rewards/rejected": -14.845415387834821, "step": 392 }, { "epoch": 0.09833604403853372, "grad_norm": 13.125, "kl": 1.3518741130828857, "learning_rate": 5e-06, "logits/chosen": -40081449.84615385, "logits/rejected": -45273658.18181818, "logps/chosen": -468.5323016826923, "logps/rejected": -567.9649325284091, "loss": 0.0829, "rewards/chosen": 5.369481013371394, "rewards/margins": 13.613824164117133, "rewards/rejected": -8.244343150745738, "step": 393 }, { "epoch": 0.0985862629801076, "grad_norm": 18.75, "kl": 8.161908149719238, "learning_rate": 5e-06, "logits/chosen": -60935847.384615384, "logits/rejected": -40088401.45454545, "logps/chosen": -450.28797325721155, "logps/rejected": -503.8245738636364, "loss": 0.1072, "rewards/chosen": 6.421485900878906, "rewards/margins": 12.865921714089133, "rewards/rejected": -6.4444358132102275, "step": 394 }, { "epoch": 0.09883648192168147, "grad_norm": 16.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49220906.666666664, "logits/rejected": -35500326.4, "logps/chosen": -517.005859375, "logps/rejected": -534.5302083333333, "loss": 0.0846, "rewards/chosen": 6.194328308105469, "rewards/margins": 13.034056599934896, "rewards/rejected": -6.839728291829427, "step": 395 }, { "epoch": 0.09908670086325534, "grad_norm": 13.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77013294.54545455, "logits/rejected": -18858615.384615384, "logps/chosen": -552.6618874289773, "logps/rejected": -617.21435546875, "loss": 0.0566, "rewards/chosen": 7.5582143610174, "rewards/margins": 15.697539429564577, "rewards/rejected": -8.139325068547176, "step": 396 }, { "epoch": 0.09933691980482923, "grad_norm": 7.125, "kl": 2.094575881958008, "learning_rate": 5e-06, "logits/chosen": -57553609.14285714, "logits/rejected": -50990073.6, "logps/chosen": -456.2685546875, "logps/rejected": -521.18876953125, "loss": 0.0339, "rewards/chosen": 6.946342468261719, "rewards/margins": 14.4228515625, "rewards/rejected": -7.476509094238281, "step": 397 }, { "epoch": 0.0995871387464031, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45192153.6, "logits/rejected": -36347890.28571428, "logps/chosen": -328.09736328125, "logps/rejected": -487.84291294642856, "loss": 0.0884, "rewards/chosen": 4.698400497436523, "rewards/margins": 11.430424554007395, "rewards/rejected": -6.732024056570871, "step": 398 }, { "epoch": 0.09983735768797698, "grad_norm": 10.0625, "kl": 2.2654342651367188, "learning_rate": 5e-06, "logits/chosen": 20556571.636363637, "logits/rejected": -22432477.53846154, "logps/chosen": -473.34170809659093, "logps/rejected": -640.076171875, "loss": 0.0478, "rewards/chosen": 5.259163943204013, "rewards/margins": 13.574018491731657, "rewards/rejected": -8.314854548527645, "step": 399 }, { "epoch": 0.10008757662955085, "grad_norm": 11.75, "kl": 1.1438745260238647, "learning_rate": 5e-06, "logits/chosen": -70439456.0, "logits/rejected": -48127808.0, "logps/chosen": -391.1977132161458, "logps/rejected": -468.4460042317708, "loss": 0.0664, "rewards/chosen": 4.742527008056641, "rewards/margins": 11.225823720296223, "rewards/rejected": -6.483296712239583, "step": 400 }, { "epoch": 0.10033779557112474, "grad_norm": 16.375, "kl": 1.0232124328613281, "learning_rate": 5e-06, "logits/chosen": -57168663.27272727, "logits/rejected": -54569028.92307692, "logps/chosen": -300.26895419034093, "logps/rejected": -476.3874323918269, "loss": 0.0886, "rewards/chosen": 4.676924618807706, "rewards/margins": 10.507736339435711, "rewards/rejected": -5.830811720628005, "step": 401 }, { "epoch": 0.10058801451269861, "grad_norm": 11.25, "kl": 0.8866307139396667, "learning_rate": 5e-06, "logits/chosen": -44025165.71428572, "logits/rejected": -46981712.0, "logps/chosen": -429.13818359375, "logps/rejected": -417.379736328125, "loss": 0.0927, "rewards/chosen": 5.455929347446987, "rewards/margins": 12.826463862827847, "rewards/rejected": -7.370534515380859, "step": 402 }, { "epoch": 0.1008382334542725, "grad_norm": 11.8125, "kl": 1.8641650676727295, "learning_rate": 5e-06, "logits/chosen": -63372405.333333336, "logits/rejected": -44824485.333333336, "logps/chosen": -416.6009928385417, "logps/rejected": -511.1730550130208, "loss": 0.0466, "rewards/chosen": 5.438326517740886, "rewards/margins": 12.913864135742188, "rewards/rejected": -7.475537618001302, "step": 403 }, { "epoch": 0.10108845239584636, "grad_norm": 8.9375, "kl": 0.44264063239097595, "learning_rate": 5e-06, "logits/chosen": -81691273.84615384, "logits/rejected": -30523296.0, "logps/chosen": -401.87289663461536, "logps/rejected": -425.74338600852275, "loss": 0.0489, "rewards/chosen": 4.802876985990084, "rewards/margins": 10.41016777412041, "rewards/rejected": -5.607290788130327, "step": 404 }, { "epoch": 0.10133867133742024, "grad_norm": 14.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43013686.4, "logits/rejected": -31936971.42857143, "logps/chosen": -422.82099609375, "logps/rejected": -417.3173828125, "loss": 0.0853, "rewards/chosen": 4.338645553588867, "rewards/margins": 10.886906923566546, "rewards/rejected": -6.548261369977679, "step": 405 }, { "epoch": 0.10158889027899412, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51162449.777777776, "logits/rejected": -54308573.86666667, "logps/chosen": -520.4177517361111, "logps/rejected": -607.8944661458333, "loss": 0.004, "rewards/chosen": 7.769977145724827, "rewards/margins": 19.033067152235244, "rewards/rejected": -11.263090006510417, "step": 406 }, { "epoch": 0.10183910922056799, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47403029.333333336, "logits/rejected": -51197397.333333336, "logps/chosen": -332.00469970703125, "logps/rejected": -457.2459309895833, "loss": 0.0488, "rewards/chosen": 3.972020467122396, "rewards/margins": 11.739330291748047, "rewards/rejected": -7.767309824625651, "step": 407 }, { "epoch": 0.10208932816214188, "grad_norm": 18.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -92978826.66666667, "logits/rejected": -39795232.0, "logps/chosen": -444.3814290364583, "logps/rejected": -519.9366319444445, "loss": 0.0436, "rewards/chosen": 6.104851404825847, "rewards/margins": 13.283403820461697, "rewards/rejected": -7.178552415635851, "step": 408 }, { "epoch": 0.10233954710371575, "grad_norm": 25.5, "kl": 3.115605354309082, "learning_rate": 5e-06, "logits/chosen": -30063936.0, "logits/rejected": -55245927.11111111, "logps/chosen": -266.97451171875, "logps/rejected": -378.625, "loss": 0.1341, "rewards/chosen": 3.777569580078125, "rewards/margins": 9.546980455186631, "rewards/rejected": -5.769410875108507, "step": 409 }, { "epoch": 0.10258976604528963, "grad_norm": 15.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2317781.3333333335, "logits/rejected": -73145820.44444445, "logps/chosen": -510.08118489583336, "logps/rejected": -891.1088324652778, "loss": 0.0647, "rewards/chosen": 6.504647827148437, "rewards/margins": 17.896556260850694, "rewards/rejected": -11.391908433702257, "step": 410 }, { "epoch": 0.1028399849868635, "grad_norm": 24.5, "kl": 10.833757400512695, "learning_rate": 5e-06, "logits/chosen": -35619772.23529412, "logits/rejected": -44575968.0, "logps/chosen": -382.6388154871324, "logps/rejected": -406.9873744419643, "loss": 0.1755, "rewards/chosen": 4.639774995691636, "rewards/margins": 10.885245315167083, "rewards/rejected": -6.245470319475446, "step": 411 }, { "epoch": 0.10309020392843739, "grad_norm": 19.25, "kl": 7.022817134857178, "learning_rate": 5e-06, "logits/chosen": -58954845.538461536, "logits/rejected": -45061486.54545455, "logps/chosen": -487.4467022235577, "logps/rejected": -543.7039683948864, "loss": 0.0714, "rewards/chosen": 6.067454998309795, "rewards/margins": 15.676225195397864, "rewards/rejected": -9.608770197088068, "step": 412 }, { "epoch": 0.10334042287001126, "grad_norm": 12.5, "kl": 1.0240873098373413, "learning_rate": 5e-06, "logits/chosen": -48003172.571428575, "logits/rejected": -37617283.2, "logps/chosen": -261.23702566964283, "logps/rejected": -470.354443359375, "loss": 0.096, "rewards/chosen": 3.7107960837227956, "rewards/margins": 9.509578432355609, "rewards/rejected": -5.7987823486328125, "step": 413 }, { "epoch": 0.10359064181158514, "grad_norm": 12.0625, "kl": 1.8942980766296387, "learning_rate": 5e-06, "logits/chosen": -42673890.13333333, "logits/rejected": -45731893.333333336, "logps/chosen": -375.79625651041664, "logps/rejected": -445.3735622829861, "loss": 0.079, "rewards/chosen": 5.186106363932292, "rewards/margins": 12.294780731201172, "rewards/rejected": -7.10867436726888, "step": 414 }, { "epoch": 0.10384086075315901, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -7241936.0, "logits/rejected": -50001260.307692304, "logps/chosen": -552.8135209517045, "logps/rejected": -572.2059420072115, "loss": 0.03, "rewards/chosen": 5.882486516779119, "rewards/margins": 15.103575806517702, "rewards/rejected": -9.221089289738583, "step": 415 }, { "epoch": 0.10409107969473289, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43977437.86666667, "logits/rejected": -79319217.77777778, "logps/chosen": -486.6686197916667, "logps/rejected": -732.5679796006945, "loss": 0.0268, "rewards/chosen": 6.040640258789063, "rewards/margins": 16.642068820529516, "rewards/rejected": -10.601428561740452, "step": 416 }, { "epoch": 0.10434129863630677, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41147776.0, "logits/rejected": -41775364.266666666, "logps/chosen": -368.81426323784723, "logps/rejected": -450.8942057291667, "loss": 0.0977, "rewards/chosen": 5.021399603949653, "rewards/margins": 10.690480719672308, "rewards/rejected": -5.669081115722657, "step": 417 }, { "epoch": 0.10459151757788064, "grad_norm": 13.25, "kl": 4.804970741271973, "learning_rate": 5e-06, "logits/chosen": -47708659.2, "logits/rejected": -40883541.333333336, "logps/chosen": -427.544140625, "logps/rejected": -615.9753146701389, "loss": 0.0508, "rewards/chosen": 5.232835896809896, "rewards/margins": 14.803684997558594, "rewards/rejected": -9.570849100748697, "step": 418 }, { "epoch": 0.10484173651945453, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37544085.333333336, "logits/rejected": -33258616.0, "logps/chosen": -348.0797932942708, "logps/rejected": -572.90478515625, "loss": 0.0556, "rewards/chosen": 5.4418894449869795, "rewards/margins": 15.198873519897461, "rewards/rejected": -9.756984074910482, "step": 419 }, { "epoch": 0.1050919554610284, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54339653.81818182, "logits/rejected": -37937309.538461536, "logps/chosen": -371.27934126420456, "logps/rejected": -563.3169696514423, "loss": 0.0581, "rewards/chosen": 4.655000166459517, "rewards/margins": 14.421822981400922, "rewards/rejected": -9.766822814941406, "step": 420 }, { "epoch": 0.10534217440260228, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63854405.81818182, "logits/rejected": -57682284.307692304, "logps/chosen": -497.42746803977275, "logps/rejected": -663.4538762019231, "loss": 0.0264, "rewards/chosen": 6.401584972034801, "rewards/margins": 15.93082081187855, "rewards/rejected": -9.52923583984375, "step": 421 }, { "epoch": 0.10559239334417615, "grad_norm": 2.859375, "kl": 1.827080488204956, "learning_rate": 5e-06, "logits/chosen": -73727705.6, "logits/rejected": -37536870.85714286, "logps/chosen": -520.97421875, "logps/rejected": -383.84995814732144, "loss": 0.0072, "rewards/chosen": 7.687107086181641, "rewards/margins": 15.212650844029017, "rewards/rejected": -7.525543757847378, "step": 422 }, { "epoch": 0.10584261228575004, "grad_norm": 4.625, "kl": 5.112859725952148, "learning_rate": 5e-06, "logits/chosen": -71892704.0, "logits/rejected": -18449296.0, "logps/chosen": -536.8836263020834, "logps/rejected": -331.226318359375, "loss": 0.0121, "rewards/chosen": 8.18552271525065, "rewards/margins": 14.193390528361002, "rewards/rejected": -6.007867813110352, "step": 423 }, { "epoch": 0.10609283122732391, "grad_norm": 11.4375, "kl": 4.349067211151123, "learning_rate": 5e-06, "logits/chosen": -59904878.93333333, "logits/rejected": -62533589.333333336, "logps/chosen": -395.45517578125, "logps/rejected": -482.7253689236111, "loss": 0.0882, "rewards/chosen": 4.3522796630859375, "rewards/margins": 12.742725796169704, "rewards/rejected": -8.390446133083767, "step": 424 }, { "epoch": 0.10634305016889778, "grad_norm": 14.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42599539.2, "logits/rejected": -81123666.28571428, "logps/chosen": -436.782666015625, "logps/rejected": -501.5443638392857, "loss": 0.1078, "rewards/chosen": 4.611692810058594, "rewards/margins": 11.875519452776228, "rewards/rejected": -7.263826642717634, "step": 425 }, { "epoch": 0.10659326911047166, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37022701.333333336, "logits/rejected": -43165373.333333336, "logps/chosen": -283.94740804036456, "logps/rejected": -503.1743570963542, "loss": 0.0585, "rewards/chosen": 4.248084704081218, "rewards/margins": 10.905401547749838, "rewards/rejected": -6.65731684366862, "step": 426 }, { "epoch": 0.10684348805204553, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23991613.09090909, "logits/rejected": -49280157.538461536, "logps/chosen": -492.92959872159093, "logps/rejected": -610.6906174879807, "loss": 0.0308, "rewards/chosen": 5.263313640247691, "rewards/margins": 15.086980726335431, "rewards/rejected": -9.82366708608774, "step": 427 }, { "epoch": 0.10709370699361942, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53200699.07692308, "logits/rejected": 9402146.909090908, "logps/chosen": -270.28173828125, "logps/rejected": -583.5978338068181, "loss": 0.0781, "rewards/chosen": 4.015569833608774, "rewards/margins": 11.952680040906358, "rewards/rejected": -7.937110207297585, "step": 428 }, { "epoch": 0.10734392593519329, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -105375266.9090909, "logits/rejected": -44693144.615384616, "logps/chosen": -422.41526100852275, "logps/rejected": -547.7668269230769, "loss": 0.0656, "rewards/chosen": 3.8282331986860796, "rewards/margins": 14.670419759683677, "rewards/rejected": -10.842186560997597, "step": 429 }, { "epoch": 0.10759414487676718, "grad_norm": 19.375, "kl": 1.44424569606781, "learning_rate": 5e-06, "logits/chosen": -39972753.06666667, "logits/rejected": -27854250.666666668, "logps/chosen": -400.51256510416664, "logps/rejected": -663.8101128472222, "loss": 0.0632, "rewards/chosen": 6.041231282552084, "rewards/margins": 14.498454962836373, "rewards/rejected": -8.457223680284288, "step": 430 }, { "epoch": 0.10784436381834105, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -92005045.33333333, "logits/rejected": -51833440.0, "logps/chosen": -465.8907063802083, "logps/rejected": -594.366455078125, "loss": 0.0596, "rewards/chosen": 5.629677454630534, "rewards/margins": 13.738147099812824, "rewards/rejected": -8.108469645182291, "step": 431 }, { "epoch": 0.10809458275991493, "grad_norm": 5.3125, "kl": 0.06840769946575165, "learning_rate": 5e-06, "logits/chosen": -30050475.636363637, "logits/rejected": -55986180.92307692, "logps/chosen": -455.12349076704544, "logps/rejected": -548.5456730769231, "loss": 0.0332, "rewards/chosen": 6.364178050648082, "rewards/margins": 16.604269227781494, "rewards/rejected": -10.240091177133413, "step": 432 }, { "epoch": 0.1083448017014888, "grad_norm": 7.8125, "kl": 0.8455416560173035, "learning_rate": 5e-06, "logits/chosen": -59441879.27272727, "logits/rejected": -43116731.07692308, "logps/chosen": -540.3319424715909, "logps/rejected": -570.4277719350962, "loss": 0.01, "rewards/chosen": 7.734629544344815, "rewards/margins": 15.767583246831293, "rewards/rejected": -8.032953702486479, "step": 433 }, { "epoch": 0.10859502064306267, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42783740.8, "logits/rejected": -59091515.428571425, "logps/chosen": -336.4683349609375, "logps/rejected": -619.0197405133929, "loss": 0.0688, "rewards/chosen": 4.17241325378418, "rewards/margins": 14.749433081490654, "rewards/rejected": -10.577019827706474, "step": 434 }, { "epoch": 0.10884523958463656, "grad_norm": 15.0625, "kl": 1.6571426391601562, "learning_rate": 5e-06, "logits/chosen": -52426240.0, "logits/rejected": -43346029.333333336, "logps/chosen": -327.007080078125, "logps/rejected": -359.2379964192708, "loss": 0.0616, "rewards/chosen": 3.975179354349772, "rewards/margins": 10.891711870829264, "rewards/rejected": -6.916532516479492, "step": 435 }, { "epoch": 0.10909545852621043, "grad_norm": 9.25, "kl": 2.7744154930114746, "learning_rate": 5e-06, "logits/chosen": -56368116.36363637, "logits/rejected": -55251067.07692308, "logps/chosen": -512.1365855823864, "logps/rejected": -612.5079627403846, "loss": 0.0327, "rewards/chosen": 7.093271428888494, "rewards/margins": 17.611389480270706, "rewards/rejected": -10.518118051382212, "step": 436 }, { "epoch": 0.10934567746778431, "grad_norm": 16.25, "kl": 1.4108521938323975, "learning_rate": 5e-06, "logits/chosen": -39318958.54545455, "logits/rejected": -36087556.92307692, "logps/chosen": -297.08469460227275, "logps/rejected": -451.4909855769231, "loss": 0.0989, "rewards/chosen": 4.057796825062145, "rewards/margins": 11.61541518631515, "rewards/rejected": -7.557618361253005, "step": 437 }, { "epoch": 0.10959589640935818, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64789038.54545455, "logits/rejected": -46188721.23076923, "logps/chosen": -406.27787642045456, "logps/rejected": -488.8867938701923, "loss": 0.0522, "rewards/chosen": 4.8656369989568535, "rewards/margins": 12.836351861486902, "rewards/rejected": -7.970714862530048, "step": 438 }, { "epoch": 0.10984611535093207, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47823145.6, "logits/rejected": -57922779.428571425, "logps/chosen": -363.1122314453125, "logps/rejected": -569.68994140625, "loss": 0.0802, "rewards/chosen": 4.555009460449218, "rewards/margins": 13.741898018973213, "rewards/rejected": -9.186888558523995, "step": 439 }, { "epoch": 0.11009633429250594, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46450609.23076923, "logits/rejected": -62006528.0, "logps/chosen": -415.20688100961536, "logps/rejected": -607.5606800426136, "loss": 0.0706, "rewards/chosen": 5.4302203838641825, "rewards/margins": 14.26920756093272, "rewards/rejected": -8.838987177068537, "step": 440 }, { "epoch": 0.11034655323407982, "grad_norm": 8.4375, "kl": 0.7785409688949585, "learning_rate": 5e-06, "logits/chosen": -48214857.84615385, "logits/rejected": -68788165.81818181, "logps/chosen": -421.80551382211536, "logps/rejected": -653.4932084517045, "loss": 0.0484, "rewards/chosen": 5.005285409780649, "rewards/margins": 16.069748244919143, "rewards/rejected": -11.064462835138494, "step": 441 }, { "epoch": 0.1105967721756537, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57273272.0, "logits/rejected": -46521432.0, "logps/chosen": -472.7195129394531, "logps/rejected": -615.48193359375, "loss": 0.0665, "rewards/chosen": 5.434605598449707, "rewards/margins": 15.126108169555664, "rewards/rejected": -9.691502571105957, "step": 442 }, { "epoch": 0.11084699111722758, "grad_norm": 15.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59167099.07692308, "logits/rejected": -56120965.81818182, "logps/chosen": -430.2843674879808, "logps/rejected": -611.7582120028409, "loss": 0.0408, "rewards/chosen": 5.39390857403095, "rewards/margins": 15.747195317195011, "rewards/rejected": -10.353286743164062, "step": 443 }, { "epoch": 0.11109721005880145, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40118033.45454545, "logits/rejected": -34688561.23076923, "logps/chosen": -360.45780806107956, "logps/rejected": -517.9439227764423, "loss": 0.0346, "rewards/chosen": 4.406946702436968, "rewards/margins": 14.288353046337207, "rewards/rejected": -9.88140634390024, "step": 444 }, { "epoch": 0.11134742900037532, "grad_norm": 12.1875, "kl": 1.8678210973739624, "learning_rate": 5e-06, "logits/chosen": -42519163.07692308, "logits/rejected": -29502612.363636363, "logps/chosen": -329.134765625, "logps/rejected": -450.5138494318182, "loss": 0.0965, "rewards/chosen": 4.272696568415715, "rewards/margins": 14.40737827007587, "rewards/rejected": -10.134681701660156, "step": 445 }, { "epoch": 0.11159764794194921, "grad_norm": 15.625, "kl": 0.2287565916776657, "learning_rate": 5e-06, "logits/chosen": -16067683.555555556, "logits/rejected": -75281416.53333333, "logps/chosen": -523.2049696180555, "logps/rejected": -596.6475260416667, "loss": 0.0885, "rewards/chosen": 4.291886647542317, "rewards/margins": 12.028658294677733, "rewards/rejected": -7.736771647135416, "step": 446 }, { "epoch": 0.11184786688352308, "grad_norm": 3.4375, "kl": 1.16470468044281, "learning_rate": 5e-06, "logits/chosen": -91582615.27272727, "logits/rejected": -32545806.769230768, "logps/chosen": -589.0840287642045, "logps/rejected": -544.3393930288462, "loss": 0.0063, "rewards/chosen": 8.398947975852273, "rewards/margins": 16.89925725976904, "rewards/rejected": -8.500309283916767, "step": 447 }, { "epoch": 0.11209808582509696, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66744974.76923077, "logits/rejected": -36661026.90909091, "logps/chosen": -386.50450721153845, "logps/rejected": -571.6574928977273, "loss": 0.053, "rewards/chosen": 5.015506450946514, "rewards/margins": 14.494260667920946, "rewards/rejected": -9.478754216974432, "step": 448 }, { "epoch": 0.11234830476667083, "grad_norm": 10.1875, "kl": 1.5537364482879639, "learning_rate": 5e-06, "logits/chosen": -63999374.76923077, "logits/rejected": -74214469.81818181, "logps/chosen": -385.4125225360577, "logps/rejected": -595.1963778409091, "loss": 0.0543, "rewards/chosen": 5.21881338266226, "rewards/margins": 14.902449441122842, "rewards/rejected": -9.683636058460582, "step": 449 }, { "epoch": 0.11259852370824472, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48373746.28571428, "logits/rejected": -66640519.52941176, "logps/chosen": -399.30106026785717, "logps/rejected": -635.9929917279412, "loss": 0.0336, "rewards/chosen": 5.6707354954310825, "rewards/margins": 16.89242595384101, "rewards/rejected": -11.221690458409926, "step": 450 }, { "epoch": 0.11284874264981859, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55439051.63636363, "logits/rejected": -27698461.53846154, "logps/chosen": -360.3815252130682, "logps/rejected": -319.3821364182692, "loss": 0.0666, "rewards/chosen": 4.7006613991477275, "rewards/margins": 12.42681895436107, "rewards/rejected": -7.726157555213342, "step": 451 }, { "epoch": 0.11309896159139247, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67583872.0, "logits/rejected": -83846882.46153846, "logps/chosen": -341.9344371448864, "logps/rejected": -519.6827674278846, "loss": 0.0478, "rewards/chosen": 5.524294072931463, "rewards/margins": 13.071720736843723, "rewards/rejected": -7.54742666391226, "step": 452 }, { "epoch": 0.11334918053296635, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42845492.0, "logits/rejected": -37726132.0, "logps/chosen": -332.44293212890625, "logps/rejected": -580.2208862304688, "loss": 0.0755, "rewards/chosen": 4.65379524230957, "rewards/margins": 12.033156871795654, "rewards/rejected": -7.379361629486084, "step": 453 }, { "epoch": 0.11359939947454022, "grad_norm": 9.5, "kl": 5.602470397949219, "learning_rate": 5e-06, "logits/chosen": -57169546.666666664, "logits/rejected": -13590354.666666666, "logps/chosen": -388.2919108072917, "logps/rejected": -596.8492838541666, "loss": 0.068, "rewards/chosen": 5.011869430541992, "rewards/margins": 13.917655309041342, "rewards/rejected": -8.90578587849935, "step": 454 }, { "epoch": 0.1138496184161141, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -84295982.54545455, "logits/rejected": -54136497.23076923, "logps/chosen": -457.11319247159093, "logps/rejected": -520.7634465144231, "loss": 0.0264, "rewards/chosen": 6.40874550559304, "rewards/margins": 14.554382430923567, "rewards/rejected": -8.145636925330528, "step": 455 }, { "epoch": 0.11409983735768797, "grad_norm": 30.75, "kl": 11.186366081237793, "learning_rate": 5e-06, "logits/chosen": -66573275.428571425, "logits/rejected": -67588889.6, "logps/chosen": -326.14327566964283, "logps/rejected": -690.95556640625, "loss": 0.2057, "rewards/chosen": 3.4635941641671315, "rewards/margins": 12.93242656162807, "rewards/rejected": -9.468832397460938, "step": 456 }, { "epoch": 0.11435005629926186, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73573179.73333333, "logits/rejected": -77340636.44444445, "logps/chosen": -524.6490885416666, "logps/rejected": -512.8880750868055, "loss": 0.0285, "rewards/chosen": 6.236517333984375, "rewards/margins": 13.422470262315539, "rewards/rejected": -7.185952928331163, "step": 457 }, { "epoch": 0.11460027524083573, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79265735.1111111, "logits/rejected": -36078596.266666666, "logps/chosen": -396.03301323784723, "logps/rejected": -469.51373697916665, "loss": 0.0173, "rewards/chosen": 4.9003550211588545, "rewards/margins": 14.78834228515625, "rewards/rejected": -9.887987263997395, "step": 458 }, { "epoch": 0.11485049418240961, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -82130585.6, "logits/rejected": -48419026.28571428, "logps/chosen": -509.479736328125, "logps/rejected": -520.1116768973214, "loss": 0.0365, "rewards/chosen": 6.426950073242187, "rewards/margins": 15.140568215506416, "rewards/rejected": -8.71361814226423, "step": 459 }, { "epoch": 0.11510071312398348, "grad_norm": 16.0, "kl": 9.457859992980957, "learning_rate": 5e-06, "logits/chosen": -45067478.85714286, "logits/rejected": -62056377.6, "logps/chosen": -474.35306222098217, "logps/rejected": -622.212841796875, "loss": 0.0838, "rewards/chosen": 6.8805084228515625, "rewards/margins": 15.898917388916015, "rewards/rejected": -9.018408966064452, "step": 460 }, { "epoch": 0.11535093206555737, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35475108.571428575, "logits/rejected": -77169062.4, "logps/chosen": -363.6903599330357, "logps/rejected": -648.9046875, "loss": 0.0722, "rewards/chosen": 4.659856523786273, "rewards/margins": 16.018920244489397, "rewards/rejected": -11.359063720703125, "step": 461 }, { "epoch": 0.11560115100713124, "grad_norm": 6.28125, "kl": 1.090598464012146, "learning_rate": 5e-06, "logits/chosen": -74379520.0, "logits/rejected": -40001571.55555555, "logps/chosen": -486.63287760416665, "logps/rejected": -554.8781467013889, "loss": 0.0109, "rewards/chosen": 8.59778544108073, "rewards/margins": 19.568378363715276, "rewards/rejected": -10.970592922634548, "step": 462 }, { "epoch": 0.11585136994870511, "grad_norm": 13.5625, "kl": 1.3382396697998047, "learning_rate": 5e-06, "logits/chosen": -22800081.777777776, "logits/rejected": -37535044.266666666, "logps/chosen": -508.11675347222223, "logps/rejected": -432.5379231770833, "loss": 0.0531, "rewards/chosen": 6.193984561496311, "rewards/margins": 12.401752302381727, "rewards/rejected": -6.207767740885417, "step": 463 }, { "epoch": 0.116101588890279, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50146112.0, "logits/rejected": -39513017.6, "logps/chosen": -459.5585239955357, "logps/rejected": -578.97080078125, "loss": 0.0283, "rewards/chosen": 6.518726348876953, "rewards/margins": 17.24543685913086, "rewards/rejected": -10.726710510253906, "step": 464 }, { "epoch": 0.11635180783185287, "grad_norm": 9.6875, "kl": 0.2913055419921875, "learning_rate": 5e-06, "logits/chosen": -33372720.0, "logits/rejected": -55401685.333333336, "logps/chosen": -376.7622884114583, "logps/rejected": -360.7132568359375, "loss": 0.0812, "rewards/chosen": 5.447196960449219, "rewards/margins": 12.01614761352539, "rewards/rejected": -6.568950653076172, "step": 465 }, { "epoch": 0.11660202677342675, "grad_norm": 15.0, "kl": 0.9293226003646851, "learning_rate": 5e-06, "logits/chosen": -54136459.63636363, "logits/rejected": -25768428.307692308, "logps/chosen": -369.4251819957386, "logps/rejected": -394.2653245192308, "loss": 0.0812, "rewards/chosen": 5.809360850941051, "rewards/margins": 11.81350361217152, "rewards/rejected": -6.004142761230469, "step": 466 }, { "epoch": 0.11685224571500062, "grad_norm": 4.90625, "kl": 0.4398040771484375, "learning_rate": 5e-06, "logits/chosen": -21144785.333333332, "logits/rejected": -77457888.0, "logps/chosen": -427.287353515625, "logps/rejected": -368.9241536458333, "loss": 0.0163, "rewards/chosen": 5.185189247131348, "rewards/margins": 11.416746457417805, "rewards/rejected": -6.231557210286458, "step": 467 }, { "epoch": 0.1171024646565745, "grad_norm": 3.4375, "kl": 1.4409472942352295, "learning_rate": 5e-06, "logits/chosen": -57688384.0, "logits/rejected": -49374240.0, "logps/chosen": -552.608447265625, "logps/rejected": -549.7452218191964, "loss": 0.0062, "rewards/chosen": 7.94610595703125, "rewards/margins": 16.24700927734375, "rewards/rejected": -8.3009033203125, "step": 468 }, { "epoch": 0.11735268359814838, "grad_norm": 9.0625, "kl": 3.589811325073242, "learning_rate": 5e-06, "logits/chosen": -64632625.23076923, "logits/rejected": -41390429.09090909, "logps/chosen": -343.5602463942308, "logps/rejected": -445.1888316761364, "loss": 0.0812, "rewards/chosen": 4.639742631178636, "rewards/margins": 9.403381907856549, "rewards/rejected": -4.763639276677912, "step": 469 }, { "epoch": 0.11760290253972226, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55867793.45454545, "logits/rejected": -25705326.769230768, "logps/chosen": -446.68696732954544, "logps/rejected": -461.61177884615387, "loss": 0.0355, "rewards/chosen": 5.096036737615412, "rewards/margins": 13.319340899274065, "rewards/rejected": -8.223304161658653, "step": 470 }, { "epoch": 0.11785312148129613, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46496901.81818182, "logits/rejected": -27688430.769230768, "logps/chosen": -290.997802734375, "logps/rejected": -648.7838040865385, "loss": 0.0984, "rewards/chosen": 3.826521786776456, "rewards/margins": 10.63640783883475, "rewards/rejected": -6.809886052058293, "step": 471 }, { "epoch": 0.11810334042287002, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35014086.85714286, "logits/rejected": -46966264.47058824, "logps/chosen": -364.2755650111607, "logps/rejected": -588.3134765625, "loss": 0.0637, "rewards/chosen": 4.479448318481445, "rewards/margins": 14.524808210485121, "rewards/rejected": -10.045359892003676, "step": 472 }, { "epoch": 0.11835355936444389, "grad_norm": 10.0, "kl": 5.845806121826172, "learning_rate": 5e-06, "logits/chosen": -42262010.666666664, "logits/rejected": -5774304.0, "logps/chosen": -383.2425537109375, "logps/rejected": -824.1959635416666, "loss": 0.0602, "rewards/chosen": 6.369055430094401, "rewards/margins": 18.599793752034504, "rewards/rejected": -12.230738321940104, "step": 473 }, { "epoch": 0.11860377830601776, "grad_norm": 6.65625, "kl": 2.1483943462371826, "learning_rate": 5e-06, "logits/chosen": -61046961.23076923, "logits/rejected": -63630714.18181818, "logps/chosen": -368.2033128004808, "logps/rejected": -500.08198686079544, "loss": 0.0166, "rewards/chosen": 6.370841393103967, "rewards/margins": 14.787518694684223, "rewards/rejected": -8.416677301580256, "step": 474 }, { "epoch": 0.11885399724759164, "grad_norm": 10.6875, "kl": 15.326486587524414, "learning_rate": 5e-06, "logits/chosen": -59713250.13333333, "logits/rejected": -25291317.333333332, "logps/chosen": -460.09765625, "logps/rejected": -703.8848741319445, "loss": 0.0689, "rewards/chosen": 6.759861246744792, "rewards/margins": 16.236472913953993, "rewards/rejected": -9.476611667209202, "step": 475 }, { "epoch": 0.11910421618916552, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20665618.90909091, "logits/rejected": -27693661.53846154, "logps/chosen": -301.78457919034093, "logps/rejected": -550.5569786658654, "loss": 0.0703, "rewards/chosen": 4.220409046519887, "rewards/margins": 13.16304186840991, "rewards/rejected": -8.942632821890024, "step": 476 }, { "epoch": 0.1193544351307394, "grad_norm": 11.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68123658.66666667, "logits/rejected": -52094229.333333336, "logps/chosen": -459.8805338541667, "logps/rejected": -468.3011474609375, "loss": 0.0423, "rewards/chosen": 5.906017939249675, "rewards/margins": 16.16494305928548, "rewards/rejected": -10.258925120035807, "step": 477 }, { "epoch": 0.11960465407231327, "grad_norm": 6.15625, "kl": 2.859895706176758, "learning_rate": 5e-06, "logits/chosen": -50117834.666666664, "logits/rejected": 13962564.0, "logps/chosen": -417.3835042317708, "logps/rejected": -478.0733235677083, "loss": 0.0527, "rewards/chosen": 6.168768564860026, "rewards/margins": 12.145323435465496, "rewards/rejected": -5.976554870605469, "step": 478 }, { "epoch": 0.11985487301388716, "grad_norm": 18.5, "kl": 1.1742995977401733, "learning_rate": 5e-06, "logits/chosen": -49570372.92307692, "logits/rejected": -31609547.636363637, "logps/chosen": -434.6548602764423, "logps/rejected": -373.51686789772725, "loss": 0.087, "rewards/chosen": 5.331655062161959, "rewards/margins": 11.418631546980851, "rewards/rejected": -6.086976484818892, "step": 479 }, { "epoch": 0.12010509195546103, "grad_norm": 15.3125, "kl": 1.315460205078125, "learning_rate": 5e-06, "logits/chosen": -40150051.2, "logits/rejected": -38186505.14285714, "logps/chosen": -401.6398681640625, "logps/rejected": -547.6748744419643, "loss": 0.0514, "rewards/chosen": 5.968011474609375, "rewards/margins": 15.018058122907366, "rewards/rejected": -9.050046648297991, "step": 480 }, { "epoch": 0.12035531089703491, "grad_norm": 28.5, "kl": 12.87321662902832, "learning_rate": 5e-06, "logits/chosen": -55077428.36363637, "logits/rejected": 53048851.692307696, "logps/chosen": -451.60715553977275, "logps/rejected": -631.8985877403846, "loss": 0.0932, "rewards/chosen": 7.126282431862571, "rewards/margins": 14.064957358620383, "rewards/rejected": -6.9386749267578125, "step": 481 }, { "epoch": 0.12060552983860878, "grad_norm": 12.75, "kl": 2.398913860321045, "learning_rate": 5e-06, "logits/chosen": -47802420.36363637, "logits/rejected": -45856064.0, "logps/chosen": -375.00363991477275, "logps/rejected": -495.0247145432692, "loss": 0.0733, "rewards/chosen": 5.409019817005504, "rewards/margins": 12.809068879881105, "rewards/rejected": -7.400049062875601, "step": 482 }, { "epoch": 0.12085574878018265, "grad_norm": 5.09375, "kl": 6.2386674880981445, "learning_rate": 5e-06, "logits/chosen": -38800280.0, "logits/rejected": -33975160.0, "logps/chosen": -497.49139404296875, "logps/rejected": -713.5885620117188, "loss": 0.0938, "rewards/chosen": 6.65096378326416, "rewards/margins": 14.327479839324951, "rewards/rejected": -7.676516056060791, "step": 483 }, { "epoch": 0.12110596772175654, "grad_norm": 22.625, "kl": 11.31030559539795, "learning_rate": 5e-06, "logits/chosen": -65001774.93333333, "logits/rejected": -24923900.444444444, "logps/chosen": -507.31178385416666, "logps/rejected": -521.2833116319445, "loss": 0.061, "rewards/chosen": 7.19973398844401, "rewards/margins": 15.36088375515408, "rewards/rejected": -8.16114976671007, "step": 484 }, { "epoch": 0.12135618666333041, "grad_norm": 5.90625, "kl": 5.078367233276367, "learning_rate": 5e-06, "logits/chosen": -74973979.42857143, "logits/rejected": -48408723.2, "logps/chosen": -400.0113002232143, "logps/rejected": -327.6156494140625, "loss": 0.0571, "rewards/chosen": 6.5358734130859375, "rewards/margins": 12.664257049560547, "rewards/rejected": -6.128383636474609, "step": 485 }, { "epoch": 0.1216064056049043, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48302883.2, "logits/rejected": -12341133.714285715, "logps/chosen": -360.29853515625, "logps/rejected": -503.53627232142856, "loss": 0.0337, "rewards/chosen": 6.144741439819336, "rewards/margins": 13.616791697910855, "rewards/rejected": -7.472050258091518, "step": 486 }, { "epoch": 0.12185662454647816, "grad_norm": 10.3125, "kl": 7.17965841293335, "learning_rate": 5e-06, "logits/chosen": -72141056.0, "logits/rejected": -44767606.15384615, "logps/chosen": -547.6633522727273, "logps/rejected": -548.7711838942307, "loss": 0.0105, "rewards/chosen": 8.66045448996804, "rewards/margins": 16.591907901363772, "rewards/rejected": -7.931453411395733, "step": 487 }, { "epoch": 0.12210684348805205, "grad_norm": 18.75, "kl": 2.8869330883026123, "learning_rate": 5e-06, "logits/chosen": -39958440.0, "logits/rejected": -20978050.666666668, "logps/chosen": -266.3028564453125, "logps/rejected": -570.3648274739584, "loss": 0.1361, "rewards/chosen": 3.7354443868001304, "rewards/margins": 14.502566019694012, "rewards/rejected": -10.76712163289388, "step": 488 }, { "epoch": 0.12235706242962592, "grad_norm": 8.125, "kl": 7.888890743255615, "learning_rate": 5e-06, "logits/chosen": -39167570.28571428, "logits/rejected": -76855008.0, "logps/chosen": -388.73032924107144, "logps/rejected": -485.32421875, "loss": 0.0769, "rewards/chosen": 6.364748273577009, "rewards/margins": 13.185390363420758, "rewards/rejected": -6.82064208984375, "step": 489 }, { "epoch": 0.1226072813711998, "grad_norm": 27.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33452950.85714286, "logits/rejected": -65937460.705882356, "logps/chosen": -416.3704310825893, "logps/rejected": -577.2244944852941, "loss": 0.0699, "rewards/chosen": 5.780368259974888, "rewards/margins": 11.687593027323235, "rewards/rejected": -5.907224767348346, "step": 490 }, { "epoch": 0.12285750031277368, "grad_norm": 14.9375, "kl": 8.293676376342773, "learning_rate": 5e-06, "logits/chosen": -95655424.0, "logits/rejected": -58857070.222222224, "logps/chosen": -504.1125, "logps/rejected": -525.0114474826389, "loss": 0.0645, "rewards/chosen": 5.967329915364584, "rewards/margins": 15.612222290039062, "rewards/rejected": -9.644892374674479, "step": 491 }, { "epoch": 0.12310771925434755, "grad_norm": 15.6875, "kl": 0.24074110388755798, "learning_rate": 5e-06, "logits/chosen": -69385493.33333333, "logits/rejected": -34415170.13333333, "logps/chosen": -425.4816080729167, "logps/rejected": -573.4834635416667, "loss": 0.0401, "rewards/chosen": 6.937757703993055, "rewards/margins": 15.098754204644097, "rewards/rejected": -8.160996500651041, "step": 492 }, { "epoch": 0.12335793819592143, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49814382.54545455, "logits/rejected": -44867017.84615385, "logps/chosen": -516.3655894886364, "logps/rejected": -450.52944711538464, "loss": 0.0215, "rewards/chosen": 5.593210740522905, "rewards/margins": 13.613422767265693, "rewards/rejected": -8.020212026742788, "step": 493 }, { "epoch": 0.1236081571374953, "grad_norm": 22.625, "kl": 12.503021240234375, "learning_rate": 5e-06, "logits/chosen": -7045472.0, "logits/rejected": -61588666.18181818, "logps/chosen": -560.8127253605769, "logps/rejected": -613.7776988636364, "loss": 0.0884, "rewards/chosen": 6.884715153620793, "rewards/margins": 13.994393942239402, "rewards/rejected": -7.109678788618608, "step": 494 }, { "epoch": 0.12385837607906919, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67192540.44444445, "logits/rejected": -44729378.13333333, "logps/chosen": -484.6663411458333, "logps/rejected": -537.2735026041667, "loss": 0.0621, "rewards/chosen": 6.838818868001302, "rewards/margins": 14.29560546875, "rewards/rejected": -7.456786600748698, "step": 495 }, { "epoch": 0.12410859502064306, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41143004.8, "logits/rejected": -40233846.85714286, "logps/chosen": -306.438134765625, "logps/rejected": -423.08740234375, "loss": 0.0433, "rewards/chosen": 5.018299102783203, "rewards/margins": 12.230145590645925, "rewards/rejected": -7.211846487862723, "step": 496 }, { "epoch": 0.12435881396221694, "grad_norm": 5.125, "kl": 2.0190443992614746, "learning_rate": 5e-06, "logits/chosen": -58310645.333333336, "logits/rejected": -59065100.8, "logps/chosen": -442.364501953125, "logps/rejected": -632.051171875, "loss": 0.0371, "rewards/chosen": 6.31556150648329, "rewards/margins": 16.375715721978082, "rewards/rejected": -10.060154215494792, "step": 497 }, { "epoch": 0.12460903290379081, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10794531.692307692, "logits/rejected": -47191863.27272727, "logps/chosen": -393.6940730168269, "logps/rejected": -479.57790305397725, "loss": 0.0533, "rewards/chosen": 6.14932133601262, "rewards/margins": 13.989344029993443, "rewards/rejected": -7.840022693980824, "step": 498 }, { "epoch": 0.1248592518453647, "grad_norm": 15.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -660263.5, "logits/rejected": -41052148.0, "logps/chosen": -345.80963134765625, "logps/rejected": -500.7593688964844, "loss": 0.0649, "rewards/chosen": 4.8870849609375, "rewards/margins": 13.218653678894043, "rewards/rejected": -8.331568717956543, "step": 499 }, { "epoch": 0.12510947078693857, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49384086.85714286, "logits/rejected": -47163286.5882353, "logps/chosen": -392.75770786830356, "logps/rejected": -514.7162798713235, "loss": 0.0447, "rewards/chosen": 5.240893227713449, "rewards/margins": 13.04556098104525, "rewards/rejected": -7.804667753331802, "step": 500 }, { "epoch": 0.12535968972851244, "grad_norm": 24.125, "kl": 13.870463371276855, "learning_rate": 5e-06, "logits/chosen": -34992692.705882356, "logits/rejected": -49290697.14285714, "logps/chosen": -449.64694393382354, "logps/rejected": -478.52779715401783, "loss": 0.1207, "rewards/chosen": 5.975617352653952, "rewards/margins": 15.519418796571363, "rewards/rejected": -9.543801443917411, "step": 501 }, { "epoch": 0.1256099086700863, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46354883.55555555, "logits/rejected": -52046830.93333333, "logps/chosen": -504.72667100694446, "logps/rejected": -782.2417317708333, "loss": 0.0263, "rewards/chosen": 6.27376471625434, "rewards/margins": 19.49381883409288, "rewards/rejected": -13.220054117838542, "step": 502 }, { "epoch": 0.1258601276116602, "grad_norm": 14.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63085602.461538464, "logits/rejected": -49758004.36363637, "logps/chosen": -346.48940805288464, "logps/rejected": -501.6334783380682, "loss": 0.0873, "rewards/chosen": 4.194590348463792, "rewards/margins": 10.549574978701717, "rewards/rejected": -6.354984630237926, "step": 503 }, { "epoch": 0.12611034655323408, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37878294.4, "logits/rejected": -44222112.0, "logps/chosen": -300.1798828125, "logps/rejected": -666.5936104910714, "loss": 0.0332, "rewards/chosen": 4.9337005615234375, "rewards/margins": 17.644949776785715, "rewards/rejected": -12.711249215262276, "step": 504 }, { "epoch": 0.12636056549480795, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53258168.88888889, "logits/rejected": -62086340.266666666, "logps/chosen": -497.5511067708333, "logps/rejected": -525.2665364583333, "loss": 0.0172, "rewards/chosen": 8.90140872531467, "rewards/margins": 18.32415042453342, "rewards/rejected": -9.42274169921875, "step": 505 }, { "epoch": 0.12661078443638182, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58714240.0, "logits/rejected": -68869421.71428572, "logps/chosen": -466.70322265625, "logps/rejected": -527.516357421875, "loss": 0.0408, "rewards/chosen": 6.711792755126953, "rewards/margins": 16.323926326206752, "rewards/rejected": -9.612133571079799, "step": 506 }, { "epoch": 0.12686100337795572, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28285740.8, "logits/rejected": -51621261.71428572, "logps/chosen": -253.264501953125, "logps/rejected": -547.0942034040179, "loss": 0.0528, "rewards/chosen": 4.269728088378907, "rewards/margins": 13.727525547572546, "rewards/rejected": -9.457797459193639, "step": 507 }, { "epoch": 0.1271112223195296, "grad_norm": 16.375, "kl": 1.6754951477050781, "learning_rate": 5e-06, "logits/chosen": -60989648.0, "logits/rejected": -48107856.0, "logps/chosen": -577.9093831380209, "logps/rejected": -504.8100992838542, "loss": 0.035, "rewards/chosen": 5.744365692138672, "rewards/margins": 13.332722345987957, "rewards/rejected": -7.588356653849284, "step": 508 }, { "epoch": 0.12736144126110346, "grad_norm": 19.375, "kl": 5.535085201263428, "learning_rate": 5e-06, "logits/chosen": -59012608.0, "logits/rejected": -43437836.0, "logps/chosen": -449.87152099609375, "logps/rejected": -410.8323669433594, "loss": 0.0751, "rewards/chosen": 5.659799098968506, "rewards/margins": 12.933778762817383, "rewards/rejected": -7.273979663848877, "step": 509 }, { "epoch": 0.12761166020267734, "grad_norm": 11.125, "kl": 2.6640734672546387, "learning_rate": 5e-06, "logits/chosen": -38895342.93333333, "logits/rejected": 20717648.0, "logps/chosen": -480.3834635416667, "logps/rejected": -503.7947591145833, "loss": 0.0827, "rewards/chosen": 5.3864802042643225, "rewards/margins": 14.387341478135852, "rewards/rejected": -9.000861273871529, "step": 510 }, { "epoch": 0.12786187914425123, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25693600.0, "logits/rejected": -39176096.0, "logps/chosen": -292.12186373197113, "logps/rejected": -862.1125710227273, "loss": 0.0745, "rewards/chosen": 4.115102327786959, "rewards/margins": 15.687349919672613, "rewards/rejected": -11.572247591885654, "step": 511 }, { "epoch": 0.1281120980858251, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46853792.0, "logits/rejected": -53366701.71428572, "logps/chosen": -295.2908935546875, "logps/rejected": -556.5693359375, "loss": 0.0817, "rewards/chosen": 4.939128494262695, "rewards/margins": 15.05255252293178, "rewards/rejected": -10.113424028669085, "step": 512 }, { "epoch": 0.12836231702739898, "grad_norm": 11.0625, "kl": 9.1113862991333, "learning_rate": 5e-06, "logits/chosen": -48826320.0, "logits/rejected": -78610368.0, "logps/chosen": -402.9245910644531, "logps/rejected": -859.9910888671875, "loss": 0.0929, "rewards/chosen": 5.665109634399414, "rewards/margins": 21.817462921142578, "rewards/rejected": -16.152353286743164, "step": 513 }, { "epoch": 0.12861253596897285, "grad_norm": 9.125, "kl": 0.6761309504508972, "learning_rate": 5e-06, "logits/chosen": -52341338.666666664, "logits/rejected": -62310213.333333336, "logps/chosen": -522.9697265625, "logps/rejected": -595.3945719401041, "loss": 0.0305, "rewards/chosen": 4.998908996582031, "rewards/margins": 15.923812866210938, "rewards/rejected": -10.924903869628906, "step": 514 }, { "epoch": 0.12886275491054672, "grad_norm": 8.875, "kl": 0.22754161059856415, "learning_rate": 5e-06, "logits/chosen": -71374148.26666667, "logits/rejected": -48037955.55555555, "logps/chosen": -380.0354817708333, "logps/rejected": -605.1385091145834, "loss": 0.0613, "rewards/chosen": 4.615861002604166, "rewards/margins": 13.637049526638455, "rewards/rejected": -9.021188524034288, "step": 515 }, { "epoch": 0.12911297385212062, "grad_norm": 17.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24142262.85714286, "logits/rejected": -54316363.294117644, "logps/chosen": -352.91859654017856, "logps/rejected": -469.2028377757353, "loss": 0.063, "rewards/chosen": 4.072139195033482, "rewards/margins": 13.999904985187435, "rewards/rejected": -9.927765790153952, "step": 516 }, { "epoch": 0.1293631927936945, "grad_norm": 14.5, "kl": 9.314910888671875, "learning_rate": 5e-06, "logits/chosen": -44396648.0, "logits/rejected": -68413600.0, "logps/chosen": -431.5744323730469, "logps/rejected": -652.3296508789062, "loss": 0.0826, "rewards/chosen": 5.625824928283691, "rewards/margins": 15.972043991088867, "rewards/rejected": -10.346219062805176, "step": 517 }, { "epoch": 0.12961341173526836, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33778769.23076923, "logits/rejected": -55241437.09090909, "logps/chosen": -296.46570763221155, "logps/rejected": -525.5427024147727, "loss": 0.0907, "rewards/chosen": 3.4785006596491885, "rewards/margins": 13.163419843553664, "rewards/rejected": -9.684919183904475, "step": 518 }, { "epoch": 0.12986363067684223, "grad_norm": 11.5625, "kl": 14.970568656921387, "learning_rate": 5e-06, "logits/chosen": -56321322.666666664, "logits/rejected": -28531660.444444444, "logps/chosen": -448.6962565104167, "logps/rejected": -724.5801866319445, "loss": 0.1089, "rewards/chosen": 6.593615214029948, "rewards/margins": 18.20522138807509, "rewards/rejected": -11.61160617404514, "step": 519 }, { "epoch": 0.13011384961841613, "grad_norm": 19.125, "kl": 4.355490684509277, "learning_rate": 5e-06, "logits/chosen": -52453877.333333336, "logits/rejected": -61738533.333333336, "logps/chosen": -398.1594645182292, "logps/rejected": -449.4321695963542, "loss": 0.0581, "rewards/chosen": 5.456010182698567, "rewards/margins": 13.074945449829102, "rewards/rejected": -7.618935267130534, "step": 520 }, { "epoch": 0.13036406855999, "grad_norm": 11.4375, "kl": 5.234340667724609, "learning_rate": 5e-06, "logits/chosen": -54637986.13333333, "logits/rejected": -66116152.88888889, "logps/chosen": -407.49892578125, "logps/rejected": -543.0160590277778, "loss": 0.0588, "rewards/chosen": 6.215244547526042, "rewards/margins": 14.532039048936632, "rewards/rejected": -8.316794501410591, "step": 521 }, { "epoch": 0.13061428750156387, "grad_norm": 15.6875, "kl": 18.19165802001953, "learning_rate": 5e-06, "logits/chosen": -25189714.82352941, "logits/rejected": -77390098.28571428, "logps/chosen": -517.8507582720588, "logps/rejected": -708.9725167410714, "loss": 0.1545, "rewards/chosen": 7.169649011948529, "rewards/margins": 20.293441387785585, "rewards/rejected": -13.123792375837054, "step": 522 }, { "epoch": 0.13086450644313774, "grad_norm": 13.75, "kl": 2.2514073848724365, "learning_rate": 5e-06, "logits/chosen": -49493472.0, "logits/rejected": -34294025.14285714, "logps/chosen": -484.2455078125, "logps/rejected": -619.6990094866071, "loss": 0.0245, "rewards/chosen": 5.915974426269531, "rewards/margins": 16.226895141601563, "rewards/rejected": -10.310920715332031, "step": 523 }, { "epoch": 0.1311147253847116, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63092003.55555555, "logits/rejected": -67593390.93333334, "logps/chosen": -361.02989366319446, "logps/rejected": -579.9860677083333, "loss": 0.0337, "rewards/chosen": 5.46497556898329, "rewards/margins": 13.815851423475477, "rewards/rejected": -8.350875854492188, "step": 524 }, { "epoch": 0.1313649443262855, "grad_norm": 6.9375, "kl": 2.5712223052978516, "learning_rate": 5e-06, "logits/chosen": -17019883.076923076, "logits/rejected": -47405239.27272727, "logps/chosen": -296.4245793269231, "logps/rejected": -495.5245916193182, "loss": 0.1468, "rewards/chosen": 4.746087587796724, "rewards/margins": 10.624109521612422, "rewards/rejected": -5.878021933815696, "step": 525 }, { "epoch": 0.13161516326785938, "grad_norm": 15.125, "kl": 17.520803451538086, "learning_rate": 5e-06, "logits/chosen": -51876352.0, "logits/rejected": -65297644.8, "logps/chosen": -404.55092075892856, "logps/rejected": -424.95859375, "loss": 0.0571, "rewards/chosen": 6.774650573730469, "rewards/margins": 11.588059616088866, "rewards/rejected": -4.813409042358399, "step": 526 }, { "epoch": 0.13186538220943325, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43696916.36363637, "logits/rejected": -60581454.76923077, "logps/chosen": -436.27299360795456, "logps/rejected": -458.74594350961536, "loss": 0.0766, "rewards/chosen": 8.198015386408025, "rewards/margins": 13.268264503745765, "rewards/rejected": -5.07024911733774, "step": 527 }, { "epoch": 0.13211560115100712, "grad_norm": 24.5, "kl": 5.007396697998047, "learning_rate": 5e-06, "logits/chosen": -46646429.09090909, "logits/rejected": -37675318.15384615, "logps/chosen": -485.1492365056818, "logps/rejected": -466.7234450120192, "loss": 0.0798, "rewards/chosen": 7.680758389559659, "rewards/margins": 11.481649385465609, "rewards/rejected": -3.8008909959059496, "step": 528 }, { "epoch": 0.13236582009258102, "grad_norm": 14.1875, "kl": 5.743526458740234, "learning_rate": 5e-06, "logits/chosen": -30546733.333333332, "logits/rejected": -24016501.333333332, "logps/chosen": -409.8443603515625, "logps/rejected": -327.99269612630206, "loss": 0.1558, "rewards/chosen": 5.747198104858398, "rewards/margins": 11.759943008422852, "rewards/rejected": -6.012744903564453, "step": 529 }, { "epoch": 0.1326160390341549, "grad_norm": 15.0, "kl": 13.818990707397461, "learning_rate": 5e-06, "logits/chosen": -47930131.692307696, "logits/rejected": -41468194.90909091, "logps/chosen": -468.44125600961536, "logps/rejected": -552.3140536221591, "loss": 0.0647, "rewards/chosen": 7.358089153583233, "rewards/margins": 13.968134846720663, "rewards/rejected": -6.610045693137429, "step": 530 }, { "epoch": 0.13286625797572876, "grad_norm": 11.8125, "kl": 5.293578147888184, "learning_rate": 5e-06, "logits/chosen": -89926336.0, "logits/rejected": -47984923.428571425, "logps/chosen": -488.002587890625, "logps/rejected": -619.7414899553571, "loss": 0.0323, "rewards/chosen": 8.120950317382812, "rewards/margins": 18.287205723353793, "rewards/rejected": -10.166255405970983, "step": 531 }, { "epoch": 0.13311647691730263, "grad_norm": 17.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57388392.72727273, "logits/rejected": -66457895.384615384, "logps/chosen": -337.86514559659093, "logps/rejected": -636.5494290865385, "loss": 0.0944, "rewards/chosen": 5.181604905561968, "rewards/margins": 14.031535302008784, "rewards/rejected": -8.849930396446815, "step": 532 }, { "epoch": 0.1333666958588765, "grad_norm": 16.0, "kl": 3.3148531913757324, "learning_rate": 5e-06, "logits/chosen": -56274967.27272727, "logits/rejected": -37210028.307692304, "logps/chosen": -419.67276278409093, "logps/rejected": -511.70474008413464, "loss": 0.0532, "rewards/chosen": 7.069847800514915, "rewards/margins": 12.83716065733583, "rewards/rejected": -5.767312856820913, "step": 533 }, { "epoch": 0.1336169148004504, "grad_norm": 8.875, "kl": 4.858542442321777, "learning_rate": 5e-06, "logits/chosen": -51129984.0, "logits/rejected": -127513344.0, "logps/chosen": -489.1588134765625, "logps/rejected": -339.93943277994794, "loss": 0.0793, "rewards/chosen": 5.95106824239095, "rewards/margins": 11.520591100056965, "rewards/rejected": -5.569522857666016, "step": 534 }, { "epoch": 0.13386713374202427, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61036231.11111111, "logits/rejected": -45355780.266666666, "logps/chosen": -242.20610894097223, "logps/rejected": -535.25869140625, "loss": 0.1034, "rewards/chosen": 3.5311997731526694, "rewards/margins": 10.262492497762045, "rewards/rejected": -6.731292724609375, "step": 535 }, { "epoch": 0.13411735268359815, "grad_norm": 26.5, "kl": 5.973757743835449, "learning_rate": 5e-06, "logits/chosen": -71781450.66666667, "logits/rejected": -70372618.66666667, "logps/chosen": -412.5847574869792, "logps/rejected": -782.48046875, "loss": 0.159, "rewards/chosen": 4.9632829030354815, "rewards/margins": 13.216527938842773, "rewards/rejected": -8.253245035807291, "step": 536 }, { "epoch": 0.13436757162517202, "grad_norm": 1.8671875, "kl": 2.53564715385437, "learning_rate": 5e-06, "logits/chosen": -34058551.27272727, "logits/rejected": -65766291.692307696, "logps/chosen": -467.0050603693182, "logps/rejected": -673.2614182692307, "loss": 0.0042, "rewards/chosen": 7.107920559969815, "rewards/margins": 18.238155498371256, "rewards/rejected": -11.130234938401442, "step": 537 }, { "epoch": 0.13461779056674592, "grad_norm": 15.5625, "kl": 2.483484983444214, "learning_rate": 5e-06, "logits/chosen": -37301804.307692304, "logits/rejected": -49108584.72727273, "logps/chosen": -403.10366586538464, "logps/rejected": -386.72878196022725, "loss": 0.0422, "rewards/chosen": 6.849193279559795, "rewards/margins": 12.445289665168815, "rewards/rejected": -5.59609638560902, "step": 538 }, { "epoch": 0.13486800950831979, "grad_norm": 13.875, "kl": 3.3756346702575684, "learning_rate": 5e-06, "logits/chosen": -40917779.2, "logits/rejected": -38514413.71428572, "logps/chosen": -252.2524169921875, "logps/rejected": -540.5044991629464, "loss": 0.0961, "rewards/chosen": 3.6661636352539064, "rewards/margins": 10.864099557059152, "rewards/rejected": -7.197935921805246, "step": 539 }, { "epoch": 0.13511822844989366, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62873988.0, "logits/rejected": -62200472.0, "logps/chosen": -373.53582763671875, "logps/rejected": -349.1548767089844, "loss": 0.0465, "rewards/chosen": 5.450967788696289, "rewards/margins": 12.257472515106201, "rewards/rejected": -6.806504726409912, "step": 540 }, { "epoch": 0.13536844739146753, "grad_norm": 15.125, "kl": 6.65887975692749, "learning_rate": 5e-06, "logits/chosen": -75504914.28571428, "logits/rejected": -44939814.4, "logps/chosen": -454.80154854910717, "logps/rejected": -561.496923828125, "loss": 0.0378, "rewards/chosen": 6.142020089285714, "rewards/margins": 15.195914132254465, "rewards/rejected": -9.05389404296875, "step": 541 }, { "epoch": 0.1356186663330414, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88219491.55555555, "logits/rejected": -53119641.6, "logps/chosen": -325.59288194444446, "logps/rejected": -455.66611328125, "loss": 0.0911, "rewards/chosen": 3.8690032958984375, "rewards/margins": 11.863264973958334, "rewards/rejected": -7.994261678059896, "step": 542 }, { "epoch": 0.1358688852746153, "grad_norm": 12.3125, "kl": 5.512020111083984, "learning_rate": 5e-06, "logits/chosen": -64126796.8, "logits/rejected": -40754148.571428575, "logps/chosen": -554.72607421875, "logps/rejected": -515.8116629464286, "loss": 0.0328, "rewards/chosen": 7.9691215515136715, "rewards/margins": 16.311395263671876, "rewards/rejected": -8.342273712158203, "step": 543 }, { "epoch": 0.13611910421618917, "grad_norm": 11.5, "kl": 7.555688381195068, "learning_rate": 5e-06, "logits/chosen": -80192728.61538461, "logits/rejected": -31945445.818181816, "logps/chosen": -503.6399489182692, "logps/rejected": -663.5648082386364, "loss": 0.0438, "rewards/chosen": 6.85441648043119, "rewards/margins": 16.89773826332359, "rewards/rejected": -10.0433217828924, "step": 544 }, { "epoch": 0.13636932315776304, "grad_norm": 12.1875, "kl": 10.995513916015625, "learning_rate": 5e-06, "logits/chosen": -79398520.47058824, "logits/rejected": -112806070.85714285, "logps/chosen": -579.7518956801471, "logps/rejected": -900.4176897321429, "loss": 0.0834, "rewards/chosen": 6.391205731560202, "rewards/margins": 23.74232572667739, "rewards/rejected": -17.351119995117188, "step": 545 }, { "epoch": 0.1366195420993369, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27158245.818181816, "logits/rejected": -34891116.307692304, "logps/chosen": -412.5660955255682, "logps/rejected": -411.27328725961536, "loss": 0.0498, "rewards/chosen": 5.319623773748225, "rewards/margins": 13.087192508724186, "rewards/rejected": -7.767568734975962, "step": 546 }, { "epoch": 0.1368697610409108, "grad_norm": 12.875, "kl": 1.4945749044418335, "learning_rate": 5e-06, "logits/chosen": -27459042.666666668, "logits/rejected": -29302160.0, "logps/chosen": -307.5785725911458, "logps/rejected": -464.4750569661458, "loss": 0.1028, "rewards/chosen": 3.5340277353922525, "rewards/margins": 12.774560928344727, "rewards/rejected": -9.240533192952475, "step": 547 }, { "epoch": 0.13711997998248468, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70222515.2, "logits/rejected": -24017531.42857143, "logps/chosen": -489.0595703125, "logps/rejected": -525.2247488839286, "loss": 0.0384, "rewards/chosen": 5.771475219726563, "rewards/margins": 13.34258989606585, "rewards/rejected": -7.571114676339286, "step": 548 }, { "epoch": 0.13737019892405855, "grad_norm": 14.6875, "kl": 1.2472445964813232, "learning_rate": 5e-06, "logits/chosen": -102107372.8, "logits/rejected": -43376987.428571425, "logps/chosen": -456.514794921875, "logps/rejected": -312.28536551339283, "loss": 0.0448, "rewards/chosen": 5.982299423217773, "rewards/margins": 11.620232336861747, "rewards/rejected": -5.637932913643973, "step": 549 }, { "epoch": 0.13762041786563242, "grad_norm": 16.25, "kl": 0.9486293792724609, "learning_rate": 5e-06, "logits/chosen": -69926752.0, "logits/rejected": -54773900.8, "logps/chosen": -391.06689453125, "logps/rejected": -451.2779296875, "loss": 0.0811, "rewards/chosen": 4.650173732212612, "rewards/margins": 12.90467812674386, "rewards/rejected": -8.25450439453125, "step": 550 }, { "epoch": 0.1378706368072063, "grad_norm": 10.4375, "kl": 1.4036941528320312, "learning_rate": 5e-06, "logits/chosen": -62808960.0, "logits/rejected": -46912640.0, "logps/chosen": -405.81338778409093, "logps/rejected": -471.51160606971155, "loss": 0.0796, "rewards/chosen": 4.9530112526633525, "rewards/margins": 13.210419981629698, "rewards/rejected": -8.257408728966347, "step": 551 }, { "epoch": 0.1381208557487802, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50658583.27272727, "logits/rejected": -49524686.76923077, "logps/chosen": -403.10813210227275, "logps/rejected": -632.4972956730769, "loss": 0.0522, "rewards/chosen": 5.546658602627841, "rewards/margins": 15.684891494004042, "rewards/rejected": -10.138232891376202, "step": 552 }, { "epoch": 0.13837107469035406, "grad_norm": 24.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -82265824.0, "logits/rejected": -51365882.666666664, "logps/chosen": -461.3359781901042, "logps/rejected": -644.1104329427084, "loss": 0.0771, "rewards/chosen": 4.5082963307698565, "rewards/margins": 16.37892468770345, "rewards/rejected": -11.870628356933594, "step": 553 }, { "epoch": 0.13862129363192793, "grad_norm": 21.125, "kl": 0.04537200927734375, "learning_rate": 5e-06, "logits/chosen": -50316848.0, "logits/rejected": -84522186.66666667, "logps/chosen": -326.8287353515625, "logps/rejected": -470.2118733723958, "loss": 0.0984, "rewards/chosen": 3.7203763326009116, "rewards/margins": 12.240569432576498, "rewards/rejected": -8.520193099975586, "step": 554 }, { "epoch": 0.1388715125735018, "grad_norm": 16.375, "kl": 3.3425607681274414, "learning_rate": 5e-06, "logits/chosen": -31000344.615384616, "logits/rejected": -46964930.90909091, "logps/chosen": -421.4607496995192, "logps/rejected": -458.14936967329544, "loss": 0.073, "rewards/chosen": 4.11741696871244, "rewards/margins": 14.60420632529092, "rewards/rejected": -10.48678935657848, "step": 555 }, { "epoch": 0.1391217315150757, "grad_norm": 14.1875, "kl": 0.7640914916992188, "learning_rate": 5e-06, "logits/chosen": -40181469.333333336, "logits/rejected": -30542840.0, "logps/chosen": -306.20522054036456, "logps/rejected": -431.0502115885417, "loss": 0.039, "rewards/chosen": 4.890604654947917, "rewards/margins": 11.853193918863933, "rewards/rejected": -6.962589263916016, "step": 556 }, { "epoch": 0.13937195045664957, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27684346.666666668, "logits/rejected": -44124005.333333336, "logps/chosen": -427.5699869791667, "logps/rejected": -468.980224609375, "loss": 0.0667, "rewards/chosen": 5.270424524943034, "rewards/margins": 12.634319305419922, "rewards/rejected": -7.363894780476888, "step": 557 }, { "epoch": 0.13962216939822344, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74414016.0, "logits/rejected": -47394377.14285714, "logps/chosen": -408.1877685546875, "logps/rejected": -541.2767508370536, "loss": 0.0637, "rewards/chosen": 5.466611862182617, "rewards/margins": 15.036128725324359, "rewards/rejected": -9.569516863141741, "step": 558 }, { "epoch": 0.13987238833979732, "grad_norm": 5.09375, "kl": 0.3665122985839844, "learning_rate": 5e-06, "logits/chosen": -50733922.461538464, "logits/rejected": -60117550.54545455, "logps/chosen": -438.68111478365387, "logps/rejected": -583.6179421164773, "loss": 0.0324, "rewards/chosen": 5.679512023925781, "rewards/margins": 13.910150007768111, "rewards/rejected": -8.23063798384233, "step": 559 }, { "epoch": 0.1401226072813712, "grad_norm": 13.0, "kl": 1.3728488683700562, "learning_rate": 5e-06, "logits/chosen": -57274709.333333336, "logits/rejected": -28212930.666666668, "logps/chosen": -401.2125244140625, "logps/rejected": -535.2503255208334, "loss": 0.0603, "rewards/chosen": 5.87824821472168, "rewards/margins": 13.921934127807617, "rewards/rejected": -8.043685913085938, "step": 560 }, { "epoch": 0.14037282622294509, "grad_norm": 28.25, "kl": 2.3028316497802734, "learning_rate": 5e-06, "logits/chosen": -33563565.333333336, "logits/rejected": -44590602.666666664, "logps/chosen": -306.4778238932292, "logps/rejected": -550.0835774739584, "loss": 0.1334, "rewards/chosen": 5.0559336344401045, "rewards/margins": 11.945223490397137, "rewards/rejected": -6.889289855957031, "step": 561 }, { "epoch": 0.14062304516451896, "grad_norm": 9.125, "kl": 1.8545424938201904, "learning_rate": 5e-06, "logits/chosen": -48026560.0, "logits/rejected": -30056333.333333332, "logps/chosen": -388.0746663411458, "logps/rejected": -397.7444661458333, "loss": 0.0811, "rewards/chosen": 5.234479268391927, "rewards/margins": 13.937051773071289, "rewards/rejected": -8.702572504679361, "step": 562 }, { "epoch": 0.14087326410609283, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56461430.15384615, "logits/rejected": -104653649.45454545, "logps/chosen": -402.31497896634613, "logps/rejected": -790.8291015625, "loss": 0.0716, "rewards/chosen": 5.494441105769231, "rewards/margins": 16.66681687148301, "rewards/rejected": -11.17237576571378, "step": 563 }, { "epoch": 0.1411234830476667, "grad_norm": 1.4296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69598738.28571428, "logits/rejected": -29663859.2, "logps/chosen": -533.5154854910714, "logps/rejected": -601.884765625, "loss": 0.0264, "rewards/chosen": 7.532101222446987, "rewards/margins": 19.468496486118863, "rewards/rejected": -11.936395263671875, "step": 564 }, { "epoch": 0.1413737019892406, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88706675.2, "logits/rejected": -55017769.14285714, "logps/chosen": -437.87451171875, "logps/rejected": -772.2451869419643, "loss": 0.05, "rewards/chosen": 4.9825439453125, "rewards/margins": 18.630482264927455, "rewards/rejected": -13.647938319614955, "step": 565 }, { "epoch": 0.14162392093081447, "grad_norm": 9.75, "kl": 3.747547149658203, "learning_rate": 5e-06, "logits/chosen": -60948240.0, "logits/rejected": -84816202.66666667, "logps/chosen": -360.94189453125, "logps/rejected": -462.5052083333333, "loss": 0.0659, "rewards/chosen": 5.374217987060547, "rewards/margins": 11.517545064290363, "rewards/rejected": -6.143327077229817, "step": 566 }, { "epoch": 0.14187413987238834, "grad_norm": 11.125, "kl": 1.3537509441375732, "learning_rate": 5e-06, "logits/chosen": -44645988.571428575, "logits/rejected": -45456985.6, "logps/chosen": -462.3232421875, "logps/rejected": -501.261083984375, "loss": 0.0216, "rewards/chosen": 6.114850725446429, "rewards/margins": 17.424613298688616, "rewards/rejected": -11.309762573242187, "step": 567 }, { "epoch": 0.1421243588139622, "grad_norm": 13.5625, "kl": 5.523778438568115, "learning_rate": 5e-06, "logits/chosen": -65928890.666666664, "logits/rejected": -44987210.666666664, "logps/chosen": -441.14794921875, "logps/rejected": -499.1584879557292, "loss": 0.0334, "rewards/chosen": 5.457477569580078, "rewards/margins": 13.345184326171875, "rewards/rejected": -7.887706756591797, "step": 568 }, { "epoch": 0.14237457775553608, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65111108.92307692, "logits/rejected": -10760162.909090908, "logps/chosen": -466.7189753605769, "logps/rejected": -671.4787819602273, "loss": 0.0181, "rewards/chosen": 6.900662348820613, "rewards/margins": 16.807293818547176, "rewards/rejected": -9.906631469726562, "step": 569 }, { "epoch": 0.14262479669710998, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63227194.666666664, "logits/rejected": -70243680.0, "logps/chosen": -451.7148844401042, "logps/rejected": -532.2007242838541, "loss": 0.0684, "rewards/chosen": 5.362323760986328, "rewards/margins": 14.59932009379069, "rewards/rejected": -9.236996332804361, "step": 570 }, { "epoch": 0.14287501563868385, "grad_norm": 6.875, "kl": 0.22522418200969696, "learning_rate": 5e-06, "logits/chosen": -38687499.63636363, "logits/rejected": -69070168.61538461, "logps/chosen": -416.4705699573864, "logps/rejected": -423.6765700120192, "loss": 0.0206, "rewards/chosen": 6.458177046342329, "rewards/margins": 14.118144322108556, "rewards/rejected": -7.659967275766226, "step": 571 }, { "epoch": 0.14312523458025772, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69844089.6, "logits/rejected": -47900699.428571425, "logps/chosen": -364.145556640625, "logps/rejected": -686.8136160714286, "loss": 0.0559, "rewards/chosen": 5.256170654296875, "rewards/margins": 14.609437779017858, "rewards/rejected": -9.353267124720983, "step": 572 }, { "epoch": 0.1433754535218316, "grad_norm": 7.25, "kl": 6.505663871765137, "learning_rate": 5e-06, "logits/chosen": -71723273.84615384, "logits/rejected": -37394836.36363637, "logps/chosen": -557.0062725360577, "logps/rejected": -574.6129705255681, "loss": 0.0489, "rewards/chosen": 7.415489783653846, "rewards/margins": 15.985279323337796, "rewards/rejected": -8.56978953968395, "step": 573 }, { "epoch": 0.1436256724634055, "grad_norm": 13.6875, "kl": 4.055292129516602, "learning_rate": 5e-06, "logits/chosen": -76171057.23076923, "logits/rejected": -50149015.27272727, "logps/chosen": -408.18558443509613, "logps/rejected": -559.4716796875, "loss": 0.0607, "rewards/chosen": 5.947671743539663, "rewards/margins": 16.2534689736533, "rewards/rejected": -10.305797230113637, "step": 574 }, { "epoch": 0.14387589140497936, "grad_norm": 11.1875, "kl": 2.946934223175049, "learning_rate": 5e-06, "logits/chosen": -22387913.846153848, "logits/rejected": -37404657.45454545, "logps/chosen": -539.9292743389423, "logps/rejected": -442.79350142045456, "loss": 0.0303, "rewards/chosen": 6.466593228853666, "rewards/margins": 14.180487466025186, "rewards/rejected": -7.71389423717152, "step": 575 }, { "epoch": 0.14412611034655323, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50337417.84615385, "logits/rejected": -31008706.90909091, "logps/chosen": -519.6180889423077, "logps/rejected": -499.71657492897725, "loss": 0.0135, "rewards/chosen": 7.569244384765625, "rewards/margins": 16.236910733309657, "rewards/rejected": -8.667666348544033, "step": 576 }, { "epoch": 0.1443763292881271, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72365312.0, "logits/rejected": -84830858.66666667, "logps/chosen": -383.6020100911458, "logps/rejected": -745.9160970052084, "loss": 0.06, "rewards/chosen": 6.994204203287761, "rewards/margins": 19.90826161702474, "rewards/rejected": -12.914057413736979, "step": 577 }, { "epoch": 0.144626548229701, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53728320.0, "logits/rejected": -62778950.4, "logps/chosen": -298.8453892299107, "logps/rejected": -609.87529296875, "loss": 0.0502, "rewards/chosen": 4.344665254865374, "rewards/margins": 12.591643251691545, "rewards/rejected": -8.246977996826171, "step": 578 }, { "epoch": 0.14487676717127487, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70246381.71428572, "logits/rejected": -30079724.8, "logps/chosen": -445.68729073660717, "logps/rejected": -556.24599609375, "loss": 0.0208, "rewards/chosen": 6.037149156842913, "rewards/margins": 17.830748094831193, "rewards/rejected": -11.793598937988282, "step": 579 }, { "epoch": 0.14512698611284874, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22964480.0, "logits/rejected": -46092390.4, "logps/chosen": -174.85899135044642, "logps/rejected": -524.25390625, "loss": 0.135, "rewards/chosen": 2.951103482927595, "rewards/margins": 12.166678128923689, "rewards/rejected": -9.215574645996094, "step": 580 }, { "epoch": 0.14537720505442261, "grad_norm": 23.0, "kl": 6.223883628845215, "learning_rate": 5e-06, "logits/chosen": -57095116.8, "logits/rejected": -56019975.11111111, "logps/chosen": -516.7806640625, "logps/rejected": -630.8013237847222, "loss": 0.0569, "rewards/chosen": 6.289462280273438, "rewards/margins": 16.177319505479602, "rewards/rejected": -9.887857225206163, "step": 581 }, { "epoch": 0.14562742399599649, "grad_norm": 21.0, "kl": 3.0273406505584717, "learning_rate": 5e-06, "logits/chosen": -57891221.333333336, "logits/rejected": -70378224.0, "logps/chosen": -536.1588948567709, "logps/rejected": -587.8660481770834, "loss": 0.047, "rewards/chosen": 7.852675120035808, "rewards/margins": 15.25135103861491, "rewards/rejected": -7.398675918579102, "step": 582 }, { "epoch": 0.14587764293757038, "grad_norm": 13.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 53221752.88888889, "logits/rejected": -39718408.53333333, "logps/chosen": -571.5025499131945, "logps/rejected": -556.9738932291667, "loss": 0.0487, "rewards/chosen": 5.060741424560547, "rewards/margins": 16.374883270263673, "rewards/rejected": -11.314141845703125, "step": 583 }, { "epoch": 0.14612786187914426, "grad_norm": 6.21875, "kl": 3.0376155376434326, "learning_rate": 5e-06, "logits/chosen": -62987258.666666664, "logits/rejected": -61405440.0, "logps/chosen": -421.086181640625, "logps/rejected": -643.8584798177084, "loss": 0.0463, "rewards/chosen": 5.113549868265788, "rewards/margins": 13.810180346171062, "rewards/rejected": -8.696630477905273, "step": 584 }, { "epoch": 0.14637808082071813, "grad_norm": 14.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33535074.666666668, "logits/rejected": -22897408.0, "logps/chosen": -206.67183430989584, "logps/rejected": -308.8408610026042, "loss": 0.1215, "rewards/chosen": 2.9940287272135415, "rewards/margins": 9.589642842610678, "rewards/rejected": -6.595614115397136, "step": 585 }, { "epoch": 0.146628299762292, "grad_norm": 7.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71851984.0, "logits/rejected": -37640056.0, "logps/chosen": -449.3997802734375, "logps/rejected": -503.60772705078125, "loss": 0.0564, "rewards/chosen": 5.691864967346191, "rewards/margins": 13.455193519592285, "rewards/rejected": -7.763328552246094, "step": 586 }, { "epoch": 0.1468785187038659, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47702720.0, "logits/rejected": -62317696.0, "logps/chosen": -430.5486949573864, "logps/rejected": -817.4065504807693, "loss": 0.0333, "rewards/chosen": 5.671214363791726, "rewards/margins": 16.31571744371961, "rewards/rejected": -10.644503079927885, "step": 587 }, { "epoch": 0.14712873764543977, "grad_norm": 22.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60553320.72727273, "logits/rejected": -70311276.3076923, "logps/chosen": -354.89208984375, "logps/rejected": -566.9461388221154, "loss": 0.0739, "rewards/chosen": 4.223767367276278, "rewards/margins": 13.48609705571528, "rewards/rejected": -9.262329688439003, "step": 588 }, { "epoch": 0.14737895658701364, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16443514.666666666, "logits/rejected": -41768354.13333333, "logps/chosen": -362.9908854166667, "logps/rejected": -483.8588541666667, "loss": 0.0802, "rewards/chosen": 5.540819803873698, "rewards/margins": 13.324569193522136, "rewards/rejected": -7.783749389648437, "step": 589 }, { "epoch": 0.1476291755285875, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45180163.2, "logits/rejected": -63016658.28571428, "logps/chosen": -355.2489501953125, "logps/rejected": -715.7284458705357, "loss": 0.0695, "rewards/chosen": 4.424626159667969, "rewards/margins": 15.527487836565289, "rewards/rejected": -11.102861676897321, "step": 590 }, { "epoch": 0.14787939447016138, "grad_norm": 11.0625, "kl": 3.0407485961914062, "learning_rate": 5e-06, "logits/chosen": -41149545.14285714, "logits/rejected": -48740230.4, "logps/chosen": -484.2509068080357, "logps/rejected": -575.022314453125, "loss": 0.0294, "rewards/chosen": 6.6061521257672995, "rewards/margins": 16.47402779715402, "rewards/rejected": -9.867875671386718, "step": 591 }, { "epoch": 0.14812961341173528, "grad_norm": 13.0, "kl": 1.0395148992538452, "learning_rate": 5e-06, "logits/chosen": -27352150.4, "logits/rejected": -44808640.0, "logps/chosen": -326.3874267578125, "logps/rejected": -615.2264229910714, "loss": 0.0912, "rewards/chosen": 4.599735260009766, "rewards/margins": 15.051055363246373, "rewards/rejected": -10.451320103236608, "step": 592 }, { "epoch": 0.14837983235330915, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64602268.44444445, "logits/rejected": -37757606.4, "logps/chosen": -399.9679904513889, "logps/rejected": -622.66484375, "loss": 0.0381, "rewards/chosen": 5.446172926161024, "rewards/margins": 16.01845279269748, "rewards/rejected": -10.572279866536459, "step": 593 }, { "epoch": 0.14863005129488302, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48084785.23076923, "logits/rejected": -31335467.636363637, "logps/chosen": -305.96542593149036, "logps/rejected": -653.6206942471591, "loss": 0.0687, "rewards/chosen": 4.481809762807993, "rewards/margins": 13.614516144865876, "rewards/rejected": -9.132706382057883, "step": 594 }, { "epoch": 0.1488802702364569, "grad_norm": 3.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49751385.6, "logits/rejected": -59806578.28571428, "logps/chosen": -427.02548828125, "logps/rejected": -788.4076450892857, "loss": 0.037, "rewards/chosen": 7.2791259765625, "rewards/margins": 22.111955043247768, "rewards/rejected": -14.832829066685267, "step": 595 }, { "epoch": 0.1491304891780308, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44918483.2, "logits/rejected": -41798006.85714286, "logps/chosen": -463.986181640625, "logps/rejected": -465.1258021763393, "loss": 0.0722, "rewards/chosen": 6.066531372070313, "rewards/margins": 15.181361934116907, "rewards/rejected": -9.114830562046595, "step": 596 }, { "epoch": 0.14938070811960466, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49981060.571428575, "logits/rejected": -30313939.2, "logps/chosen": -436.2766810825893, "logps/rejected": -507.420361328125, "loss": 0.0286, "rewards/chosen": 6.159907749720982, "rewards/margins": 16.996255711146762, "rewards/rejected": -10.836347961425782, "step": 597 }, { "epoch": 0.14963092706117853, "grad_norm": 22.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53047340.8, "logits/rejected": -42007465.14285714, "logps/chosen": -321.1655517578125, "logps/rejected": -500.06089564732144, "loss": 0.0677, "rewards/chosen": 4.893730545043946, "rewards/margins": 13.315142331804548, "rewards/rejected": -8.421411786760602, "step": 598 }, { "epoch": 0.1498811460027524, "grad_norm": 6.59375, "kl": 0.5697571635246277, "learning_rate": 5e-06, "logits/chosen": -49678749.538461536, "logits/rejected": -28427377.454545453, "logps/chosen": -437.12631460336536, "logps/rejected": -677.50927734375, "loss": 0.0496, "rewards/chosen": 5.437606224646935, "rewards/margins": 15.197456466568099, "rewards/rejected": -9.759850241921164, "step": 599 }, { "epoch": 0.15013136494432627, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44754730.666666664, "logits/rejected": -31375938.666666668, "logps/chosen": -263.7760009765625, "logps/rejected": -404.2611897786458, "loss": 0.0664, "rewards/chosen": 4.977120717366536, "rewards/margins": 15.294183095296223, "rewards/rejected": -10.317062377929688, "step": 600 }, { "epoch": 0.15038158388590017, "grad_norm": 14.0625, "kl": 1.9754600524902344, "learning_rate": 5e-06, "logits/chosen": -38264466.666666664, "logits/rejected": -13454984.0, "logps/chosen": -379.0237223307292, "logps/rejected": -513.689208984375, "loss": 0.1188, "rewards/chosen": 4.791454950968425, "rewards/margins": 13.80802281697591, "rewards/rejected": -9.016567866007486, "step": 601 }, { "epoch": 0.15063180282747404, "grad_norm": 1.921875, "kl": 1.9281539916992188, "learning_rate": 5e-06, "logits/chosen": -73308996.26666667, "logits/rejected": -42650944.0, "logps/chosen": -518.6475260416667, "logps/rejected": -616.7601453993055, "loss": 0.017, "rewards/chosen": 6.334693908691406, "rewards/margins": 15.746002197265625, "rewards/rejected": -9.411308288574219, "step": 602 }, { "epoch": 0.1508820217690479, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11186326.153846154, "logits/rejected": -59520768.0, "logps/chosen": -312.8362379807692, "logps/rejected": -550.9716796875, "loss": 0.057, "rewards/chosen": 4.333173018235427, "rewards/margins": 15.060367370818877, "rewards/rejected": -10.727194352583451, "step": 603 }, { "epoch": 0.15113224071062178, "grad_norm": 13.1875, "kl": 1.423807144165039, "learning_rate": 5e-06, "logits/chosen": -57009925.81818182, "logits/rejected": -62002540.307692304, "logps/chosen": -354.83389559659093, "logps/rejected": -456.54995492788464, "loss": 0.0408, "rewards/chosen": 5.314810319380327, "rewards/margins": 14.247145806159175, "rewards/rejected": -8.932335486778847, "step": 604 }, { "epoch": 0.15138245965219568, "grad_norm": 11.8125, "kl": 3.8470964431762695, "learning_rate": 5e-06, "logits/chosen": -81040907.63636364, "logits/rejected": -47877080.615384616, "logps/chosen": -441.78884055397725, "logps/rejected": -457.900390625, "loss": 0.05, "rewards/chosen": 6.137951937588778, "rewards/margins": 15.409236774577963, "rewards/rejected": -9.271284836989183, "step": 605 }, { "epoch": 0.15163267859376955, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33345590.4, "logits/rejected": -47074130.28571428, "logps/chosen": -307.0808349609375, "logps/rejected": -507.64501953125, "loss": 0.0887, "rewards/chosen": 4.056896209716797, "rewards/margins": 12.964509691510882, "rewards/rejected": -8.907613481794085, "step": 606 }, { "epoch": 0.15188289753534343, "grad_norm": 12.5625, "kl": 1.1803348064422607, "learning_rate": 5e-06, "logits/chosen": -63267341.71428572, "logits/rejected": -57571315.2, "logps/chosen": -444.02797154017856, "logps/rejected": -610.296875, "loss": 0.0584, "rewards/chosen": 6.373085021972656, "rewards/margins": 18.516206359863283, "rewards/rejected": -12.143121337890625, "step": 607 }, { "epoch": 0.1521331164769173, "grad_norm": 16.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54215261.09090909, "logits/rejected": -61871547.07692308, "logps/chosen": -387.6716974431818, "logps/rejected": -354.5124323918269, "loss": 0.076, "rewards/chosen": 5.259364734996449, "rewards/margins": 11.558089569732026, "rewards/rejected": -6.298724834735577, "step": 608 }, { "epoch": 0.15238333541849117, "grad_norm": 21.875, "kl": 11.305414199829102, "learning_rate": 5e-06, "logits/chosen": -56930860.307692304, "logits/rejected": -31004282.181818184, "logps/chosen": -433.7024113581731, "logps/rejected": -658.8701171875, "loss": 0.0973, "rewards/chosen": 5.5040740966796875, "rewards/margins": 16.440994262695312, "rewards/rejected": -10.936920166015625, "step": 609 }, { "epoch": 0.15263355436006507, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -97271246.22222222, "logits/rejected": -46111820.8, "logps/chosen": -458.6940104166667, "logps/rejected": -552.3071614583333, "loss": 0.0254, "rewards/chosen": 6.719607883029514, "rewards/margins": 16.007958306206596, "rewards/rejected": -9.288350423177084, "step": 610 }, { "epoch": 0.15288377330163894, "grad_norm": 13.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21712582.4, "logits/rejected": -55264342.85714286, "logps/chosen": -265.624169921875, "logps/rejected": -575.4641462053571, "loss": 0.0957, "rewards/chosen": 3.5226036071777345, "rewards/margins": 11.884290313720703, "rewards/rejected": -8.361686706542969, "step": 611 }, { "epoch": 0.1531339922432128, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29308192.0, "logits/rejected": -53309746.28571428, "logps/chosen": -312.162841796875, "logps/rejected": -548.2455705915179, "loss": 0.0854, "rewards/chosen": 2.817942237854004, "rewards/margins": 12.169474220275879, "rewards/rejected": -9.351531982421875, "step": 612 }, { "epoch": 0.15338421118478668, "grad_norm": 10.125, "kl": 7.217921257019043, "learning_rate": 5e-06, "logits/chosen": -43526417.45454545, "logits/rejected": -58481019.07692308, "logps/chosen": -427.17489346590907, "logps/rejected": -537.5062349759615, "loss": 0.0465, "rewards/chosen": 5.5406410910866475, "rewards/margins": 16.36690686632703, "rewards/rejected": -10.826265775240385, "step": 613 }, { "epoch": 0.15363443012636058, "grad_norm": 9.3125, "kl": 2.2072792053222656, "learning_rate": 5e-06, "logits/chosen": -56456826.18181818, "logits/rejected": -37112531.692307696, "logps/chosen": -512.4351029829545, "logps/rejected": -406.7492487980769, "loss": 0.047, "rewards/chosen": 7.514734441583807, "rewards/margins": 13.133255324997268, "rewards/rejected": -5.618520883413462, "step": 614 }, { "epoch": 0.15388464906793445, "grad_norm": 12.875, "kl": 0.842522144317627, "learning_rate": 5e-06, "logits/chosen": -19764147.2, "logits/rejected": -36850731.428571425, "logps/chosen": -357.34423828125, "logps/rejected": -360.9752720424107, "loss": 0.0883, "rewards/chosen": 6.33453483581543, "rewards/margins": 12.28083588736398, "rewards/rejected": -5.9463010515485495, "step": 615 }, { "epoch": 0.15413486800950832, "grad_norm": 12.6875, "kl": 1.6806972026824951, "learning_rate": 5e-06, "logits/chosen": -57333124.92307692, "logits/rejected": -22525808.0, "logps/chosen": -454.38611778846155, "logps/rejected": -517.8662997159091, "loss": 0.0464, "rewards/chosen": 6.989862295297476, "rewards/margins": 17.49558988984648, "rewards/rejected": -10.505727594549006, "step": 616 }, { "epoch": 0.1543850869510822, "grad_norm": 14.625, "kl": 2.119199752807617, "learning_rate": 5e-06, "logits/chosen": -53081521.23076923, "logits/rejected": -68264372.36363636, "logps/chosen": -368.2942457932692, "logps/rejected": -647.6526544744319, "loss": 0.068, "rewards/chosen": 5.668980525090144, "rewards/margins": 14.989180664916137, "rewards/rejected": -9.320200139825994, "step": 617 }, { "epoch": 0.15463530589265606, "grad_norm": 12.1875, "kl": 3.436605453491211, "learning_rate": 5e-06, "logits/chosen": -51985715.2, "logits/rejected": -71383497.14285715, "logps/chosen": -349.806884765625, "logps/rejected": -594.2551618303571, "loss": 0.0599, "rewards/chosen": 5.950720977783203, "rewards/margins": 13.747690800258091, "rewards/rejected": -7.796969822474888, "step": 618 }, { "epoch": 0.15488552483422996, "grad_norm": 13.375, "kl": 9.635443687438965, "learning_rate": 5e-06, "logits/chosen": -45207136.0, "logits/rejected": -52954444.8, "logps/chosen": -364.88204520089283, "logps/rejected": -584.69951171875, "loss": 0.1153, "rewards/chosen": 4.932337624686105, "rewards/margins": 12.84488797869001, "rewards/rejected": -7.912550354003907, "step": 619 }, { "epoch": 0.15513574377580383, "grad_norm": 14.6875, "kl": 1.0316712856292725, "learning_rate": 5e-06, "logits/chosen": -82956437.33333333, "logits/rejected": -59409184.0, "logps/chosen": -407.2008870442708, "logps/rejected": -469.4278157552083, "loss": 0.077, "rewards/chosen": 6.241847991943359, "rewards/margins": 14.080009460449219, "rewards/rejected": -7.838161468505859, "step": 620 }, { "epoch": 0.1553859627173777, "grad_norm": 3.484375, "kl": 5.205187797546387, "learning_rate": 5e-06, "logits/chosen": 104229688.8888889, "logits/rejected": -53546018.13333333, "logps/chosen": -437.0675998263889, "logps/rejected": -628.8258463541666, "loss": 0.0076, "rewards/chosen": 7.225312974717882, "rewards/margins": 16.454711574978298, "rewards/rejected": -9.229398600260417, "step": 621 }, { "epoch": 0.15563618165895157, "grad_norm": 10.5, "kl": 7.199731826782227, "learning_rate": 5e-06, "logits/chosen": -69697683.6923077, "logits/rejected": -51939397.81818182, "logps/chosen": -494.4455378605769, "logps/rejected": -478.54190340909093, "loss": 0.0652, "rewards/chosen": 7.387502230130709, "rewards/margins": 17.04444442428909, "rewards/rejected": -9.65694219415838, "step": 622 }, { "epoch": 0.15588640060052547, "grad_norm": 11.375, "kl": 11.347785949707031, "learning_rate": 5e-06, "logits/chosen": -83514090.66666667, "logits/rejected": -40512921.6, "logps/chosen": -520.8565538194445, "logps/rejected": -495.40930989583336, "loss": 0.0688, "rewards/chosen": 9.910608927408854, "rewards/margins": 16.37061462402344, "rewards/rejected": -6.460005696614584, "step": 623 }, { "epoch": 0.15613661954209934, "grad_norm": 20.125, "kl": 6.3800764083862305, "learning_rate": 5e-06, "logits/chosen": -66245792.0, "logits/rejected": -30638410.666666668, "logps/chosen": -405.9878743489583, "logps/rejected": -304.5220133463542, "loss": 0.1018, "rewards/chosen": 6.917832056681315, "rewards/margins": 11.399139722188314, "rewards/rejected": -4.481307665506999, "step": 624 }, { "epoch": 0.1563868384836732, "grad_norm": 8.0, "kl": 8.922649383544922, "learning_rate": 5e-06, "logits/chosen": -53217347.2, "logits/rejected": -43760681.14285714, "logps/chosen": -364.7166748046875, "logps/rejected": -381.60023716517856, "loss": 0.0746, "rewards/chosen": 7.687879180908203, "rewards/margins": 14.588166264125277, "rewards/rejected": -6.9002870832170755, "step": 625 }, { "epoch": 0.15663705742524708, "grad_norm": 0.8515625, "kl": 1.8622817993164062, "learning_rate": 5e-06, "logits/chosen": -78571328.0, "logits/rejected": -48910037.333333336, "logps/chosen": -487.4912923177083, "logps/rejected": -679.3701985677084, "loss": 0.0015, "rewards/chosen": 8.298712412516275, "rewards/margins": 19.89679718017578, "rewards/rejected": -11.598084767659506, "step": 626 }, { "epoch": 0.15688727636682095, "grad_norm": 5.71875, "kl": 6.2126946449279785, "learning_rate": 5e-06, "logits/chosen": -48173108.36363637, "logits/rejected": -60336866.461538464, "logps/chosen": -402.76979758522725, "logps/rejected": -534.7078575721154, "loss": 0.0341, "rewards/chosen": 7.176218206232244, "rewards/margins": 14.030700416831703, "rewards/rejected": -6.854482210599459, "step": 627 }, { "epoch": 0.15713749530839485, "grad_norm": 14.0625, "kl": 9.500288963317871, "learning_rate": 5e-06, "logits/chosen": -74956672.0, "logits/rejected": -42692579.55555555, "logps/chosen": -543.0876627604167, "logps/rejected": -716.7958984375, "loss": 0.0436, "rewards/chosen": 7.639811706542969, "rewards/margins": 16.986564297146266, "rewards/rejected": -9.346752590603298, "step": 628 }, { "epoch": 0.15738771424996872, "grad_norm": 1.8828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83722230.85714285, "logits/rejected": -56166113.88235294, "logps/chosen": -504.71351841517856, "logps/rejected": -530.2498276654412, "loss": 0.0046, "rewards/chosen": 7.7960935320172995, "rewards/margins": 16.280020641679524, "rewards/rejected": -8.483927109662224, "step": 629 }, { "epoch": 0.1576379331915426, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27695993.6, "logits/rejected": -15187328.0, "logps/chosen": -326.235009765625, "logps/rejected": -658.9176897321429, "loss": 0.0436, "rewards/chosen": 5.420652770996094, "rewards/margins": 14.86701158796038, "rewards/rejected": -9.446358816964286, "step": 630 }, { "epoch": 0.15788815213311647, "grad_norm": 22.0, "kl": 1.486368179321289, "learning_rate": 5e-06, "logits/chosen": -66076716.8, "logits/rejected": -29961417.14285714, "logps/chosen": -429.9384765625, "logps/rejected": -505.41622488839283, "loss": 0.0499, "rewards/chosen": 7.387535095214844, "rewards/margins": 15.074607304164342, "rewards/rejected": -7.687072208949497, "step": 631 }, { "epoch": 0.15813837107469036, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75176978.28571428, "logits/rejected": -37523501.176470585, "logps/chosen": -436.0302734375, "logps/rejected": -549.9060202205883, "loss": 0.0469, "rewards/chosen": 6.377547127859933, "rewards/margins": 14.81867153905019, "rewards/rejected": -8.441124411190257, "step": 632 }, { "epoch": 0.15838859001626424, "grad_norm": 13.8125, "kl": 3.222527265548706, "learning_rate": 5e-06, "logits/chosen": -29587706.181818184, "logits/rejected": -45230843.07692308, "logps/chosen": -299.69731001420456, "logps/rejected": -654.0891676682693, "loss": 0.0876, "rewards/chosen": 4.940381136807528, "rewards/margins": 15.726164170912096, "rewards/rejected": -10.785783034104567, "step": 633 }, { "epoch": 0.1586388089578381, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79841837.71428572, "logits/rejected": -43428491.294117644, "logps/chosen": -381.05824497767856, "logps/rejected": -388.09558823529414, "loss": 0.0625, "rewards/chosen": 5.998417445591518, "rewards/margins": 12.576788830156087, "rewards/rejected": -6.578371384564568, "step": 634 }, { "epoch": 0.15888902789941198, "grad_norm": 3.546875, "kl": 2.7077102661132812, "learning_rate": 5e-06, "logits/chosen": -50009619.2, "logits/rejected": -50984548.571428575, "logps/chosen": -351.46279296875, "logps/rejected": -432.29830496651783, "loss": 0.0195, "rewards/chosen": 6.222322463989258, "rewards/margins": 15.356186730521065, "rewards/rejected": -9.133864266531807, "step": 635 }, { "epoch": 0.15913924684098588, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59448686.93333333, "logits/rejected": -31012142.222222224, "logps/chosen": -341.6755859375, "logps/rejected": -556.4164496527778, "loss": 0.0569, "rewards/chosen": 5.288178507486979, "rewards/margins": 15.303597513834635, "rewards/rejected": -10.015419006347656, "step": 636 }, { "epoch": 0.15938946578255975, "grad_norm": 5.375, "kl": 6.035285472869873, "learning_rate": 5e-06, "logits/chosen": -59883669.333333336, "logits/rejected": -37449261.333333336, "logps/chosen": -456.2738444010417, "logps/rejected": -547.1472981770834, "loss": 0.0383, "rewards/chosen": 6.626650492350261, "rewards/margins": 15.629468282063801, "rewards/rejected": -9.002817789713541, "step": 637 }, { "epoch": 0.15963968472413362, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32597276.444444444, "logits/rejected": -26964051.2, "logps/chosen": -459.62065972222223, "logps/rejected": -666.8653645833333, "loss": 0.0536, "rewards/chosen": 7.42759026421441, "rewards/margins": 16.06582777235243, "rewards/rejected": -8.63823750813802, "step": 638 }, { "epoch": 0.1598899036657075, "grad_norm": 18.375, "kl": 4.807146072387695, "learning_rate": 5e-06, "logits/chosen": -65658290.28571428, "logits/rejected": -62354956.8, "logps/chosen": -429.42843191964283, "logps/rejected": -757.53955078125, "loss": 0.0804, "rewards/chosen": 6.848824092320034, "rewards/margins": 18.877921077183316, "rewards/rejected": -12.029096984863282, "step": 639 }, { "epoch": 0.16014012260728136, "grad_norm": 1.4453125, "kl": 0.20021185278892517, "learning_rate": 5e-06, "logits/chosen": -66669568.0, "logits/rejected": -30092720.0, "logps/chosen": -480.6462890625, "logps/rejected": -553.7813197544643, "loss": 0.044, "rewards/chosen": 7.846075439453125, "rewards/margins": 18.060491943359374, "rewards/rejected": -10.21441650390625, "step": 640 }, { "epoch": 0.16039034154885526, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49854862.76923077, "logits/rejected": -40396616.72727273, "logps/chosen": -495.71788611778845, "logps/rejected": -692.5755948153409, "loss": 0.0362, "rewards/chosen": 6.8903632530799275, "rewards/margins": 19.248831048712027, "rewards/rejected": -12.358467795632102, "step": 641 }, { "epoch": 0.16064056049042913, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57201440.0, "logits/rejected": -62493845.333333336, "logps/chosen": -386.4222819010417, "logps/rejected": -622.29833984375, "loss": 0.0286, "rewards/chosen": 5.635476430257161, "rewards/margins": 16.534656524658203, "rewards/rejected": -10.899180094401041, "step": 642 }, { "epoch": 0.160890779432003, "grad_norm": 11.875, "kl": 5.814295768737793, "learning_rate": 5e-06, "logits/chosen": -82580775.38461539, "logits/rejected": -7982277.818181818, "logps/chosen": -421.7467698317308, "logps/rejected": -440.0792347301136, "loss": 0.1019, "rewards/chosen": 5.027743412898137, "rewards/margins": 13.461065279020296, "rewards/rejected": -8.433321866122158, "step": 643 }, { "epoch": 0.16114099837357687, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56197717.333333336, "logits/rejected": -68045883.73333333, "logps/chosen": -331.20252821180554, "logps/rejected": -543.6722330729167, "loss": 0.0434, "rewards/chosen": 5.1691436767578125, "rewards/margins": 16.076640828450522, "rewards/rejected": -10.907497151692708, "step": 644 }, { "epoch": 0.16139121731515077, "grad_norm": 8.6875, "kl": 1.6532491445541382, "learning_rate": 5e-06, "logits/chosen": -66253376.0, "logits/rejected": -18327956.8, "logps/chosen": -441.36575753348217, "logps/rejected": -411.46396484375, "loss": 0.0324, "rewards/chosen": 6.985917227608817, "rewards/margins": 14.692185538155691, "rewards/rejected": -7.706268310546875, "step": 645 }, { "epoch": 0.16164143625672464, "grad_norm": 13.125, "kl": 1.1214256286621094, "learning_rate": 5e-06, "logits/chosen": -44702904.88888889, "logits/rejected": -53356019.2, "logps/chosen": -384.03917100694446, "logps/rejected": -466.74095052083334, "loss": 0.0506, "rewards/chosen": 5.757381863064236, "rewards/margins": 13.231995815700955, "rewards/rejected": -7.474613952636719, "step": 646 }, { "epoch": 0.1618916551982985, "grad_norm": 9.5, "kl": 0.4956817626953125, "learning_rate": 5e-06, "logits/chosen": -82611601.45454545, "logits/rejected": -42202840.615384616, "logps/chosen": -476.5628551136364, "logps/rejected": -474.54184194711536, "loss": 0.0255, "rewards/chosen": 5.4842071533203125, "rewards/margins": 15.649455143855167, "rewards/rejected": -10.165247990534855, "step": 647 }, { "epoch": 0.16214187413987238, "grad_norm": 27.25, "kl": 0.2697928845882416, "learning_rate": 5e-06, "logits/chosen": -51315141.81818182, "logits/rejected": -34651052.307692304, "logps/chosen": -411.80122514204544, "logps/rejected": -355.74793419471155, "loss": 0.1193, "rewards/chosen": 6.381439902565696, "rewards/margins": 12.210419394753195, "rewards/rejected": -5.8289794921875, "step": 648 }, { "epoch": 0.16239209308144625, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37384800.0, "logits/rejected": -47076154.18181818, "logps/chosen": -306.23563326322113, "logps/rejected": -484.53488991477275, "loss": 0.0711, "rewards/chosen": 4.964331993689904, "rewards/margins": 12.463073143592247, "rewards/rejected": -7.498741149902344, "step": 649 }, { "epoch": 0.16264231202302015, "grad_norm": 14.5625, "kl": 11.090937614440918, "learning_rate": 5e-06, "logits/chosen": -93352152.61538461, "logits/rejected": -48217291.63636363, "logps/chosen": -507.54800180288464, "logps/rejected": -510.0047496448864, "loss": 0.0721, "rewards/chosen": 7.322487464317908, "rewards/margins": 17.584539133352, "rewards/rejected": -10.262051669034092, "step": 650 }, { "epoch": 0.16289253096459402, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45809664.0, "logits/rejected": -52096361.14285714, "logps/chosen": -338.61005859375, "logps/rejected": -508.8189174107143, "loss": 0.0752, "rewards/chosen": 3.650387191772461, "rewards/margins": 12.195720509120397, "rewards/rejected": -8.545333317347936, "step": 651 }, { "epoch": 0.1631427499061679, "grad_norm": 10.375, "kl": 0.15212313830852509, "learning_rate": 5e-06, "logits/chosen": -45016186.18181818, "logits/rejected": -47008290.461538464, "logps/chosen": -345.62868430397725, "logps/rejected": -493.28162560096155, "loss": 0.0557, "rewards/chosen": 5.799393393776634, "rewards/margins": 15.5548777546916, "rewards/rejected": -9.755484360914965, "step": 652 }, { "epoch": 0.16339296884774177, "grad_norm": 15.1875, "kl": 8.093406677246094, "learning_rate": 5e-06, "logits/chosen": -47084368.0, "logits/rejected": -33976602.666666664, "logps/chosen": -571.6569010416666, "logps/rejected": -402.2288004557292, "loss": 0.0805, "rewards/chosen": 6.332256317138672, "rewards/margins": 13.123188018798828, "rewards/rejected": -6.790931701660156, "step": 653 }, { "epoch": 0.16364318778931566, "grad_norm": 10.75, "kl": 1.6146190166473389, "learning_rate": 5e-06, "logits/chosen": -49042076.44444445, "logits/rejected": -48191219.2, "logps/chosen": -499.9162326388889, "logps/rejected": -544.7228515625, "loss": 0.0293, "rewards/chosen": 6.670145670572917, "rewards/margins": 17.234286499023437, "rewards/rejected": -10.56414082845052, "step": 654 }, { "epoch": 0.16389340673088953, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33668744.72727273, "logits/rejected": -54299268.92307692, "logps/chosen": -392.8492542613636, "logps/rejected": -707.9043719951923, "loss": 0.0508, "rewards/chosen": 5.799758564342152, "rewards/margins": 17.658680495682297, "rewards/rejected": -11.858921931340145, "step": 655 }, { "epoch": 0.1641436256724634, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59663526.4, "logits/rejected": -74696000.0, "logps/chosen": -438.689990234375, "logps/rejected": -462.24654715401783, "loss": 0.041, "rewards/chosen": 4.823660278320313, "rewards/margins": 12.745981597900391, "rewards/rejected": -7.922321319580078, "step": 656 }, { "epoch": 0.16439384461403728, "grad_norm": 12.3125, "kl": 4.263171195983887, "learning_rate": 5e-06, "logits/chosen": -72709725.0909091, "logits/rejected": -45490875.07692308, "logps/chosen": -425.22727272727275, "logps/rejected": -546.3515625, "loss": 0.0142, "rewards/chosen": 6.606288563121449, "rewards/margins": 16.010276741081185, "rewards/rejected": -9.403988177959736, "step": 657 }, { "epoch": 0.16464406355561115, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39353725.333333336, "logits/rejected": -10432152.666666666, "logps/chosen": -515.0740559895834, "logps/rejected": -531.9388834635416, "loss": 0.0861, "rewards/chosen": 5.730404535929362, "rewards/margins": 14.719844182332356, "rewards/rejected": -8.989439646402994, "step": 658 }, { "epoch": 0.16489428249718505, "grad_norm": 16.25, "kl": 5.5067877769470215, "learning_rate": 5e-06, "logits/chosen": -83794325.33333333, "logits/rejected": -52759696.0, "logps/chosen": -523.13818359375, "logps/rejected": -510.0330403645833, "loss": 0.0727, "rewards/chosen": 9.032029469807943, "rewards/margins": 17.574310302734375, "rewards/rejected": -8.542280832926432, "step": 659 }, { "epoch": 0.16514450143875892, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67866263.27272727, "logits/rejected": -29991428.923076924, "logps/chosen": -511.46004971590907, "logps/rejected": -441.0225360576923, "loss": 0.0185, "rewards/chosen": 7.2500083229758525, "rewards/margins": 16.020787272419962, "rewards/rejected": -8.77077894944411, "step": 660 }, { "epoch": 0.1653947203803328, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41601848.0, "logits/rejected": -41465676.0, "logps/chosen": -405.6173400878906, "logps/rejected": -618.76025390625, "loss": 0.0353, "rewards/chosen": 6.1663007736206055, "rewards/margins": 15.854592323303223, "rewards/rejected": -9.688291549682617, "step": 661 }, { "epoch": 0.16564493932190666, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45884917.333333336, "logits/rejected": -56918109.86666667, "logps/chosen": -409.5426974826389, "logps/rejected": -568.8666015625, "loss": 0.0365, "rewards/chosen": 5.419274648030599, "rewards/margins": 16.57511672973633, "rewards/rejected": -11.15584208170573, "step": 662 }, { "epoch": 0.16589515826348056, "grad_norm": 19.125, "kl": 2.281391143798828, "learning_rate": 5e-06, "logits/chosen": -48919569.06666667, "logits/rejected": -74399658.66666667, "logps/chosen": -374.050390625, "logps/rejected": -561.4972330729166, "loss": 0.1097, "rewards/chosen": 5.25815684000651, "rewards/margins": 15.972480095757378, "rewards/rejected": -10.714323255750868, "step": 663 }, { "epoch": 0.16614537720505443, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66907374.54545455, "logits/rejected": -56381902.76923077, "logps/chosen": -425.4708806818182, "logps/rejected": -617.5872145432693, "loss": 0.0229, "rewards/chosen": 5.99124492298473, "rewards/margins": 17.961372909012375, "rewards/rejected": -11.970127986027645, "step": 664 }, { "epoch": 0.1663955961466283, "grad_norm": 5.28125, "kl": 2.266335964202881, "learning_rate": 5e-06, "logits/chosen": -58393786.18181818, "logits/rejected": -53815990.15384615, "logps/chosen": -579.6943803267045, "logps/rejected": -849.4885817307693, "loss": 0.0189, "rewards/chosen": 7.249734358354048, "rewards/margins": 20.1824865274496, "rewards/rejected": -12.932752169095552, "step": 665 }, { "epoch": 0.16664581508820217, "grad_norm": 6.9375, "kl": 9.461564064025879, "learning_rate": 5e-06, "logits/chosen": -57897604.571428575, "logits/rejected": -68688595.2, "logps/chosen": -535.4178641183036, "logps/rejected": -563.0220703125, "loss": 0.0671, "rewards/chosen": 6.964071001325335, "rewards/margins": 16.061434282575334, "rewards/rejected": -9.09736328125, "step": 666 }, { "epoch": 0.16689603402977604, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47103685.81818182, "logits/rejected": -60106491.07692308, "logps/chosen": -465.9483753551136, "logps/rejected": -720.1829176682693, "loss": 0.0385, "rewards/chosen": 7.5133056640625, "rewards/margins": 17.82909451998197, "rewards/rejected": -10.315788855919472, "step": 667 }, { "epoch": 0.16714625297134994, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42105920.0, "logits/rejected": -50973195.63636363, "logps/chosen": -368.02779447115387, "logps/rejected": -590.5321377840909, "loss": 0.0932, "rewards/chosen": 5.910755450908955, "rewards/margins": 15.882768057443046, "rewards/rejected": -9.972012606534092, "step": 668 }, { "epoch": 0.1673964719129238, "grad_norm": 7.09375, "kl": 4.374902248382568, "learning_rate": 5e-06, "logits/chosen": -71032672.0, "logits/rejected": -51098856.0, "logps/chosen": -572.2081909179688, "logps/rejected": -486.6452941894531, "loss": 0.0216, "rewards/chosen": 9.053348541259766, "rewards/margins": 17.3983154296875, "rewards/rejected": -8.344966888427734, "step": 669 }, { "epoch": 0.16764669085449768, "grad_norm": 6.125, "kl": 0.622650146484375, "learning_rate": 5e-06, "logits/chosen": -79201744.0, "logits/rejected": -58268224.0, "logps/chosen": -499.8522135416667, "logps/rejected": -579.6601969401041, "loss": 0.0105, "rewards/chosen": 6.753852208455403, "rewards/margins": 16.505355834960938, "rewards/rejected": -9.751503626505533, "step": 670 }, { "epoch": 0.16789690979607155, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87231334.4, "logits/rejected": -25305997.714285713, "logps/chosen": -454.819140625, "logps/rejected": -497.30782645089283, "loss": 0.0247, "rewards/chosen": 5.814561080932617, "rewards/margins": 13.032736696515766, "rewards/rejected": -7.218175615583148, "step": 671 }, { "epoch": 0.16814712873764545, "grad_norm": 10.5625, "kl": 3.3707733154296875, "learning_rate": 5e-06, "logits/chosen": -59407125.333333336, "logits/rejected": -67145016.8888889, "logps/chosen": -362.93359375, "logps/rejected": -701.4104275173611, "loss": 0.1211, "rewards/chosen": 5.802168273925782, "rewards/margins": 15.662977600097657, "rewards/rejected": -9.860809326171875, "step": 672 }, { "epoch": 0.16839734767921932, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77413526.58823529, "logits/rejected": -35157140.571428575, "logps/chosen": -406.3794519761029, "logps/rejected": -374.62744140625, "loss": 0.0617, "rewards/chosen": 5.624132941750919, "rewards/margins": 13.331869365788307, "rewards/rejected": -7.707736424037388, "step": 673 }, { "epoch": 0.1686475666207932, "grad_norm": 12.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74230120.0, "logits/rejected": -45749808.0, "logps/chosen": -275.2440185546875, "logps/rejected": -655.884033203125, "loss": 0.0775, "rewards/chosen": 3.579000473022461, "rewards/margins": 14.1707124710083, "rewards/rejected": -10.59171199798584, "step": 674 }, { "epoch": 0.16889778556236706, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52094637.71428572, "logits/rejected": -36853193.6, "logps/chosen": -221.60508510044642, "logps/rejected": -426.48662109375, "loss": 0.1137, "rewards/chosen": 3.56838253566197, "rewards/margins": 11.586692319597516, "rewards/rejected": -8.018309783935546, "step": 675 }, { "epoch": 0.16914800450394094, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70561402.66666667, "logits/rejected": -42819237.333333336, "logps/chosen": -472.2602132161458, "logps/rejected": -518.2571614583334, "loss": 0.0156, "rewards/chosen": 5.527693430582683, "rewards/margins": 14.173243204752605, "rewards/rejected": -8.645549774169922, "step": 676 }, { "epoch": 0.16939822344551483, "grad_norm": 8.625, "kl": 2.589200973510742, "learning_rate": 5e-06, "logits/chosen": -57153557.333333336, "logits/rejected": -19165306.666666668, "logps/chosen": -459.7823079427083, "logps/rejected": -440.3107096354167, "loss": 0.0543, "rewards/chosen": 5.891520818074544, "rewards/margins": 12.614496231079102, "rewards/rejected": -6.722975413004558, "step": 677 }, { "epoch": 0.1696484423870887, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58684000.0, "logits/rejected": -25339477.333333332, "logps/chosen": -278.4515380859375, "logps/rejected": -715.0196940104166, "loss": 0.1064, "rewards/chosen": 3.71870485941569, "rewards/margins": 16.13400713602702, "rewards/rejected": -12.415302276611328, "step": 678 }, { "epoch": 0.16989866132866258, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50479221.333333336, "logits/rejected": -52558997.333333336, "logps/chosen": -429.3028564453125, "logps/rejected": -698.0872395833334, "loss": 0.0482, "rewards/chosen": 5.871030171712239, "rewards/margins": 17.883778889973957, "rewards/rejected": -12.012748718261719, "step": 679 }, { "epoch": 0.17014888027023645, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70929932.8, "logits/rejected": -26153078.85714286, "logps/chosen": -363.9793212890625, "logps/rejected": -454.523681640625, "loss": 0.0352, "rewards/chosen": 4.510313415527344, "rewards/margins": 13.084225899832589, "rewards/rejected": -8.573912484305245, "step": 680 }, { "epoch": 0.17039909921181035, "grad_norm": 9.0625, "kl": 0.3682422637939453, "learning_rate": 5e-06, "logits/chosen": -69601413.33333333, "logits/rejected": -49324549.333333336, "logps/chosen": -375.6105143229167, "logps/rejected": -379.2108968098958, "loss": 0.083, "rewards/chosen": 3.9637940724690757, "rewards/margins": 11.69102923075358, "rewards/rejected": -7.727235158284505, "step": 681 }, { "epoch": 0.17064931815338422, "grad_norm": 5.375, "kl": 1.6558949947357178, "learning_rate": 5e-06, "logits/chosen": -42467827.2, "logits/rejected": -53473435.428571425, "logps/chosen": -373.6046875, "logps/rejected": -554.8095354352679, "loss": 0.0169, "rewards/chosen": 5.687347412109375, "rewards/margins": 15.629019601004464, "rewards/rejected": -9.941672188895089, "step": 682 }, { "epoch": 0.1708995370949581, "grad_norm": 10.125, "kl": 1.0112838745117188, "learning_rate": 5e-06, "logits/chosen": -75722122.66666667, "logits/rejected": -52346037.333333336, "logps/chosen": -422.0463460286458, "logps/rejected": -495.5492757161458, "loss": 0.0405, "rewards/chosen": 6.150388717651367, "rewards/margins": 14.395827611287435, "rewards/rejected": -8.245438893636068, "step": 683 }, { "epoch": 0.17114975603653196, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -106837376.0, "logits/rejected": -93661883.73333333, "logps/chosen": -507.72840711805554, "logps/rejected": -703.4063802083333, "loss": 0.0145, "rewards/chosen": 7.626553005642361, "rewards/margins": 20.393387518988717, "rewards/rejected": -12.766834513346355, "step": 684 }, { "epoch": 0.17139997497810583, "grad_norm": 8.0, "kl": 5.420981407165527, "learning_rate": 5e-06, "logits/chosen": -81585810.28571428, "logits/rejected": -18807579.2, "logps/chosen": -500.3882533482143, "logps/rejected": -639.95791015625, "loss": 0.0358, "rewards/chosen": 8.625656127929688, "rewards/margins": 20.069236755371094, "rewards/rejected": -11.443580627441406, "step": 685 }, { "epoch": 0.17165019391967973, "grad_norm": 13.875, "kl": 0.13960489630699158, "learning_rate": 5e-06, "logits/chosen": -26490086.85714286, "logits/rejected": -36049920.0, "logps/chosen": -351.08523995535717, "logps/rejected": -465.912890625, "loss": 0.0909, "rewards/chosen": 4.871974400111607, "rewards/margins": 11.220363071986608, "rewards/rejected": -6.348388671875, "step": 686 }, { "epoch": 0.1719004128612536, "grad_norm": 9.75, "kl": 2.86164927482605, "learning_rate": 5e-06, "logits/chosen": -66866652.44444445, "logits/rejected": -20687991.466666665, "logps/chosen": -465.76860894097223, "logps/rejected": -624.56953125, "loss": 0.0416, "rewards/chosen": 6.558584425184462, "rewards/margins": 15.554604932996961, "rewards/rejected": -8.9960205078125, "step": 687 }, { "epoch": 0.17215063180282747, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64529397.333333336, "logits/rejected": -34528013.333333336, "logps/chosen": -375.1025390625, "logps/rejected": -494.53125, "loss": 0.0311, "rewards/chosen": 6.433191299438477, "rewards/margins": 13.5333251953125, "rewards/rejected": -7.100133895874023, "step": 688 }, { "epoch": 0.17240085074440134, "grad_norm": 12.375, "kl": 0.8503507375717163, "learning_rate": 5e-06, "logits/chosen": -81551445.33333333, "logits/rejected": -32157968.0, "logps/chosen": -291.41636149088544, "logps/rejected": -418.5695393880208, "loss": 0.0583, "rewards/chosen": 6.001850128173828, "rewards/margins": 12.537049611409504, "rewards/rejected": -6.535199483235677, "step": 689 }, { "epoch": 0.17265106968597524, "grad_norm": 16.875, "kl": 0.7009134292602539, "learning_rate": 5e-06, "logits/chosen": -58566660.266666666, "logits/rejected": -48897628.44444445, "logps/chosen": -366.5291341145833, "logps/rejected": -325.05322265625, "loss": 0.1278, "rewards/chosen": 6.048930358886719, "rewards/margins": 11.004567125108508, "rewards/rejected": -4.955636766221788, "step": 690 }, { "epoch": 0.1729012886275491, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27105212.444444444, "logits/rejected": -56637614.93333333, "logps/chosen": -380.39198133680554, "logps/rejected": -702.0263020833333, "loss": 0.0742, "rewards/chosen": 5.317849900987413, "rewards/margins": 16.23008134629991, "rewards/rejected": -10.9122314453125, "step": 691 }, { "epoch": 0.17315150756912298, "grad_norm": 12.25, "kl": 10.227622032165527, "learning_rate": 5e-06, "logits/chosen": -50098432.0, "logits/rejected": -54675366.4, "logps/chosen": -349.3036411830357, "logps/rejected": -542.9171875, "loss": 0.1075, "rewards/chosen": 6.4098325456891745, "rewards/margins": 14.273331996372768, "rewards/rejected": -7.863499450683594, "step": 692 }, { "epoch": 0.17340172651069685, "grad_norm": 16.125, "kl": 6.429734230041504, "learning_rate": 5e-06, "logits/chosen": -45190384.941176474, "logits/rejected": -61296333.71428572, "logps/chosen": -354.80701401654414, "logps/rejected": -578.6396833147321, "loss": 0.106, "rewards/chosen": 5.8299547083237595, "rewards/margins": 14.019735897288603, "rewards/rejected": -8.189781188964844, "step": 693 }, { "epoch": 0.17365194545227075, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66619697.777777776, "logits/rejected": -19792582.4, "logps/chosen": -455.56206597222223, "logps/rejected": -498.78196614583334, "loss": 0.0392, "rewards/chosen": 5.388075510660808, "rewards/margins": 14.306572214762369, "rewards/rejected": -8.918496704101562, "step": 694 }, { "epoch": 0.17390216439384462, "grad_norm": 17.125, "kl": 8.696220397949219, "learning_rate": 5e-06, "logits/chosen": -38154048.0, "logits/rejected": -46748677.333333336, "logps/chosen": -422.6399739583333, "logps/rejected": -586.0579427083334, "loss": 0.0855, "rewards/chosen": 5.315046946207683, "rewards/margins": 14.517519632975262, "rewards/rejected": -9.202472686767578, "step": 695 }, { "epoch": 0.1741523833354185, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57893158.4, "logits/rejected": -44106176.0, "logps/chosen": -307.336083984375, "logps/rejected": -518.4172014508929, "loss": 0.0778, "rewards/chosen": 3.7311058044433594, "rewards/margins": 13.723721640450615, "rewards/rejected": -9.992615836007255, "step": 696 }, { "epoch": 0.17440260227699236, "grad_norm": 7.90625, "kl": 5.789601802825928, "learning_rate": 5e-06, "logits/chosen": -26980958.0, "logits/rejected": 4796307.5, "logps/chosen": -460.54522705078125, "logps/rejected": -577.863037109375, "loss": 0.1196, "rewards/chosen": 5.657016277313232, "rewards/margins": 14.294976711273193, "rewards/rejected": -8.637960433959961, "step": 697 }, { "epoch": 0.17465282121856623, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38760436.36363637, "logits/rejected": -36518616.615384616, "logps/chosen": -463.99320845170456, "logps/rejected": -431.96228966346155, "loss": 0.0509, "rewards/chosen": 5.864227294921875, "rewards/margins": 13.251133845402645, "rewards/rejected": -7.386906550480769, "step": 698 }, { "epoch": 0.17490304016014013, "grad_norm": 15.4375, "kl": 0.9401601552963257, "learning_rate": 5e-06, "logits/chosen": -93742813.0909091, "logits/rejected": -46093065.84615385, "logps/chosen": -428.4674627130682, "logps/rejected": -708.6716496394231, "loss": 0.0956, "rewards/chosen": 4.923358223655007, "rewards/margins": 14.918353340842508, "rewards/rejected": -9.9949951171875, "step": 699 }, { "epoch": 0.175153259101714, "grad_norm": 8.875, "kl": 2.8063995838165283, "learning_rate": 5e-06, "logits/chosen": -57266080.0, "logits/rejected": -71095024.0, "logps/chosen": -380.40972900390625, "logps/rejected": -743.252685546875, "loss": 0.0885, "rewards/chosen": 5.708252906799316, "rewards/margins": 19.88963508605957, "rewards/rejected": -14.181382179260254, "step": 700 }, { "epoch": 0.17540347804328787, "grad_norm": 9.375, "kl": 2.2932868003845215, "learning_rate": 5e-06, "logits/chosen": -58853480.72727273, "logits/rejected": -55137329.23076923, "logps/chosen": -390.6741388494318, "logps/rejected": -615.8079552283654, "loss": 0.0535, "rewards/chosen": 7.638673262162642, "rewards/margins": 18.623382781769013, "rewards/rejected": -10.98470951960637, "step": 701 }, { "epoch": 0.17565369698486175, "grad_norm": 17.75, "kl": 3.0111489295959473, "learning_rate": 5e-06, "logits/chosen": -57234824.53333333, "logits/rejected": -18644693.333333332, "logps/chosen": -356.08671875, "logps/rejected": -425.9432779947917, "loss": 0.0751, "rewards/chosen": 4.617676798502604, "rewards/margins": 15.072693040635851, "rewards/rejected": -10.455016242133247, "step": 702 }, { "epoch": 0.17590391592643564, "grad_norm": 5.0, "kl": 9.278034210205078, "learning_rate": 5e-06, "logits/chosen": -61575414.15384615, "logits/rejected": -38665376.0, "logps/chosen": -521.6234224759615, "logps/rejected": -612.9747869318181, "loss": 0.0611, "rewards/chosen": 7.4358684833233175, "rewards/margins": 16.680585474401088, "rewards/rejected": -9.24471699107777, "step": 703 }, { "epoch": 0.17615413486800952, "grad_norm": 11.5, "kl": 0.6015090942382812, "learning_rate": 5e-06, "logits/chosen": -53363524.92307692, "logits/rejected": -80383976.72727273, "logps/chosen": -452.48328575721155, "logps/rejected": -645.0536665482955, "loss": 0.0467, "rewards/chosen": 6.177543053260217, "rewards/margins": 13.943426065511638, "rewards/rejected": -7.765883012251421, "step": 704 }, { "epoch": 0.1764043538095834, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47379992.615384616, "logits/rejected": -81091473.45454545, "logps/chosen": -258.82470703125, "logps/rejected": -763.2136896306819, "loss": 0.0999, "rewards/chosen": 4.53782712496244, "rewards/margins": 16.743336657544116, "rewards/rejected": -12.205509532581676, "step": 705 }, { "epoch": 0.17665457275115726, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73657624.61538461, "logits/rejected": -1966561.4545454546, "logps/chosen": -528.8232797475962, "logps/rejected": -392.4462890625, "loss": 0.0231, "rewards/chosen": 6.44094731257512, "rewards/margins": 14.868203810044935, "rewards/rejected": -8.427256497469815, "step": 706 }, { "epoch": 0.17690479169273113, "grad_norm": 21.25, "kl": 33.04115295410156, "learning_rate": 5e-06, "logits/chosen": -55989918.315789476, "logits/rejected": -55486969.6, "logps/chosen": -499.5463610197368, "logps/rejected": -1032.14833984375, "loss": 0.1098, "rewards/chosen": 7.99026810495477, "rewards/margins": 23.005423134251643, "rewards/rejected": -15.015155029296874, "step": 707 }, { "epoch": 0.17715501063430503, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50918538.666666664, "logits/rejected": -27359146.666666668, "logps/chosen": -454.8282063802083, "logps/rejected": -698.5673828125, "loss": 0.0246, "rewards/chosen": 6.743810653686523, "rewards/margins": 16.029017130533852, "rewards/rejected": -9.28520647684733, "step": 708 }, { "epoch": 0.1774052295758789, "grad_norm": 15.625, "kl": 3.79986572265625, "learning_rate": 5e-06, "logits/chosen": -66966252.307692304, "logits/rejected": -58647947.63636363, "logps/chosen": -470.3007061298077, "logps/rejected": -517.6930930397727, "loss": 0.0913, "rewards/chosen": 7.029987041766827, "rewards/margins": 14.387965168986288, "rewards/rejected": -7.35797812721946, "step": 709 }, { "epoch": 0.17765544851745277, "grad_norm": 4.78125, "kl": 7.186982154846191, "learning_rate": 5e-06, "logits/chosen": -73866131.6923077, "logits/rejected": -36368529.45454545, "logps/chosen": -483.39066256009613, "logps/rejected": -474.87668678977275, "loss": 0.0766, "rewards/chosen": 6.209300114558293, "rewards/margins": 13.51670437259274, "rewards/rejected": -7.307404258034446, "step": 710 }, { "epoch": 0.17790566745902664, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31546496.0, "logits/rejected": -30095182.933333334, "logps/chosen": -575.1024305555555, "logps/rejected": -466.9169921875, "loss": 0.0292, "rewards/chosen": 6.828341166178386, "rewards/margins": 14.729346211751302, "rewards/rejected": -7.901005045572917, "step": 711 }, { "epoch": 0.17815588640060054, "grad_norm": 10.25, "kl": 5.903179168701172, "learning_rate": 5e-06, "logits/chosen": -61828789.333333336, "logits/rejected": -29850149.333333332, "logps/chosen": -393.2134195963542, "logps/rejected": -494.1728108723958, "loss": 0.027, "rewards/chosen": 7.188343048095703, "rewards/margins": 13.707970937093098, "rewards/rejected": -6.5196278889973955, "step": 712 }, { "epoch": 0.1784061053421744, "grad_norm": 16.125, "kl": 11.499292373657227, "learning_rate": 5e-06, "logits/chosen": -74213997.71428572, "logits/rejected": 124858931.2, "logps/chosen": -477.9955357142857, "logps/rejected": -592.252587890625, "loss": 0.1009, "rewards/chosen": 7.06403568812779, "rewards/margins": 14.65481480189732, "rewards/rejected": -7.590779113769531, "step": 713 }, { "epoch": 0.17865632428374828, "grad_norm": 7.875, "kl": 1.8934530019760132, "learning_rate": 5e-06, "logits/chosen": -62661139.692307696, "logits/rejected": -68402885.81818181, "logps/chosen": -419.9270207331731, "logps/rejected": -668.5748845880681, "loss": 0.053, "rewards/chosen": 7.242148766150842, "rewards/margins": 16.546153782130954, "rewards/rejected": -9.304005015980113, "step": 714 }, { "epoch": 0.17890654322532215, "grad_norm": 15.1875, "kl": 3.329894781112671, "learning_rate": 5e-06, "logits/chosen": -43639113.6, "logits/rejected": -49763670.85714286, "logps/chosen": -352.9053955078125, "logps/rejected": -552.7942940848214, "loss": 0.1092, "rewards/chosen": 5.797957229614258, "rewards/margins": 16.703293882097515, "rewards/rejected": -10.905336652483259, "step": 715 }, { "epoch": 0.17915676216689602, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74053873.77777778, "logits/rejected": -34832123.733333334, "logps/chosen": -470.9084201388889, "logps/rejected": -633.1209635416667, "loss": 0.0582, "rewards/chosen": 5.858901129828559, "rewards/margins": 14.033810085720486, "rewards/rejected": -8.174908955891928, "step": 716 }, { "epoch": 0.17940698110846992, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44742971.07692308, "logits/rejected": -30457152.0, "logps/chosen": -260.2517653245192, "logps/rejected": -557.5134943181819, "loss": 0.082, "rewards/chosen": 4.013749636136568, "rewards/margins": 13.470396642084722, "rewards/rejected": -9.456647005948154, "step": 717 }, { "epoch": 0.1796572000500438, "grad_norm": 12.8125, "kl": 2.5944085121154785, "learning_rate": 5e-06, "logits/chosen": -33767108.571428575, "logits/rejected": -52787110.4, "logps/chosen": -341.03853934151783, "logps/rejected": -668.7759765625, "loss": 0.0641, "rewards/chosen": 5.297382354736328, "rewards/margins": 14.50130386352539, "rewards/rejected": -9.203921508789062, "step": 718 }, { "epoch": 0.17990741899161766, "grad_norm": 8.125, "kl": 5.018739700317383, "learning_rate": 5e-06, "logits/chosen": -54394554.666666664, "logits/rejected": -74321024.0, "logps/chosen": -325.34425862630206, "logps/rejected": -537.8746744791666, "loss": 0.0702, "rewards/chosen": 5.580752054850261, "rewards/margins": 16.269168853759766, "rewards/rejected": -10.688416798909506, "step": 719 }, { "epoch": 0.18015763793319153, "grad_norm": 6.125, "kl": 7.915319442749023, "learning_rate": 5e-06, "logits/chosen": -83251652.92307693, "logits/rejected": -70421120.0, "logps/chosen": -388.1950871394231, "logps/rejected": -481.87397904829544, "loss": 0.0517, "rewards/chosen": 5.9322028526893025, "rewards/margins": 12.56598914086402, "rewards/rejected": -6.633786288174716, "step": 720 }, { "epoch": 0.18040785687476543, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69674013.0909091, "logits/rejected": -47242003.692307696, "logps/chosen": -371.03178267045456, "logps/rejected": -691.5302734375, "loss": 0.0752, "rewards/chosen": 5.602556055242365, "rewards/margins": 17.488119832285633, "rewards/rejected": -11.88556377704327, "step": 721 }, { "epoch": 0.1806580758163393, "grad_norm": 10.4375, "kl": 3.593658447265625, "learning_rate": 5e-06, "logits/chosen": -76687530.66666667, "logits/rejected": -54437056.0, "logps/chosen": -519.9554036458334, "logps/rejected": -457.9519856770833, "loss": 0.0333, "rewards/chosen": 8.330449422200521, "rewards/margins": 15.347049967447917, "rewards/rejected": -7.016600545247396, "step": 722 }, { "epoch": 0.18090829475791317, "grad_norm": 7.1875, "kl": 1.7823947668075562, "learning_rate": 5e-06, "logits/chosen": -93296168.72727273, "logits/rejected": -35667318.15384615, "logps/chosen": -426.74587180397725, "logps/rejected": -546.3592247596154, "loss": 0.0522, "rewards/chosen": 5.792817549272017, "rewards/margins": 14.118543398130189, "rewards/rejected": -8.325725848858173, "step": 723 }, { "epoch": 0.18115851369948704, "grad_norm": 8.0, "kl": 0.570186972618103, "learning_rate": 5e-06, "logits/chosen": -45147566.54545455, "logits/rejected": -29549462.153846152, "logps/chosen": -387.9017223011364, "logps/rejected": -420.05716646634613, "loss": 0.0424, "rewards/chosen": 5.388437444513494, "rewards/margins": 12.461523522863855, "rewards/rejected": -7.073086078350361, "step": 724 }, { "epoch": 0.18140873264106092, "grad_norm": 12.9375, "kl": 1.8238639831542969, "learning_rate": 5e-06, "logits/chosen": -49162180.92307692, "logits/rejected": -26878609.454545453, "logps/chosen": -414.88927283653845, "logps/rejected": -296.8208673650568, "loss": 0.0584, "rewards/chosen": 6.385125967172476, "rewards/margins": 12.43112913545195, "rewards/rejected": -6.046003168279475, "step": 725 }, { "epoch": 0.18165895158263481, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37551239.11111111, "logits/rejected": -79150412.8, "logps/chosen": -375.75146484375, "logps/rejected": -591.9458333333333, "loss": 0.0481, "rewards/chosen": 6.276812235514323, "rewards/margins": 16.622684224446616, "rewards/rejected": -10.345871988932291, "step": 726 }, { "epoch": 0.18190917052420869, "grad_norm": 6.75, "kl": 2.515467405319214, "learning_rate": 5e-06, "logits/chosen": -76964346.66666667, "logits/rejected": -94197440.0, "logps/chosen": -456.6259358723958, "logps/rejected": -456.8588460286458, "loss": 0.0236, "rewards/chosen": 6.5506032307942705, "rewards/margins": 13.756879170735676, "rewards/rejected": -7.206275939941406, "step": 727 }, { "epoch": 0.18215938946578256, "grad_norm": 2.234375, "kl": 2.5210318565368652, "learning_rate": 5e-06, "logits/chosen": -53339451.07692308, "logits/rejected": -48501076.36363637, "logps/chosen": -493.7118389423077, "logps/rejected": -848.0596590909091, "loss": 0.0054, "rewards/chosen": 8.017585167518028, "rewards/margins": 25.27701232483337, "rewards/rejected": -17.25942715731534, "step": 728 }, { "epoch": 0.18240960840735643, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48911010.90909091, "logits/rejected": -49852224.0, "logps/chosen": -361.5631214488636, "logps/rejected": -530.8541165865385, "loss": 0.0232, "rewards/chosen": 5.89506738836115, "rewards/margins": 15.495888076462112, "rewards/rejected": -9.600820688100962, "step": 729 }, { "epoch": 0.18265982734893033, "grad_norm": 19.375, "kl": 11.989924430847168, "learning_rate": 5e-06, "logits/chosen": -8390604.666666666, "logits/rejected": -47819898.666666664, "logps/chosen": -492.7035319010417, "logps/rejected": -470.9751383463542, "loss": 0.1241, "rewards/chosen": 6.317582448323567, "rewards/margins": 12.549383799235025, "rewards/rejected": -6.231801350911458, "step": 730 }, { "epoch": 0.1829100462905042, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67372736.0, "logits/rejected": -53511112.53333333, "logps/chosen": -542.2922634548611, "logps/rejected": -534.9907877604167, "loss": 0.041, "rewards/chosen": 7.221045600043403, "rewards/margins": 17.336568874782987, "rewards/rejected": -10.115523274739584, "step": 731 }, { "epoch": 0.18316026523207807, "grad_norm": 12.125, "kl": 4.440423965454102, "learning_rate": 5e-06, "logits/chosen": -35697900.307692304, "logits/rejected": -52358365.09090909, "logps/chosen": -384.01986929086536, "logps/rejected": -458.3294122869318, "loss": 0.0756, "rewards/chosen": 4.612891270564153, "rewards/margins": 13.66867532263269, "rewards/rejected": -9.055784052068537, "step": 732 }, { "epoch": 0.18341048417365194, "grad_norm": 6.59375, "kl": 3.787334442138672, "learning_rate": 5e-06, "logits/chosen": -105404205.1764706, "logits/rejected": -42533677.71428572, "logps/chosen": -420.24488740808823, "logps/rejected": -518.8389020647321, "loss": 0.0744, "rewards/chosen": 5.8443163703469665, "rewards/margins": 16.0253793411896, "rewards/rejected": -10.181062970842634, "step": 733 }, { "epoch": 0.1836607031152258, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22021725.714285713, "logits/rejected": -40388875.294117644, "logps/chosen": -348.66353934151783, "logps/rejected": -431.63683363970586, "loss": 0.0368, "rewards/chosen": 5.842098236083984, "rewards/margins": 13.277313905603744, "rewards/rejected": -7.435215669519761, "step": 734 }, { "epoch": 0.1839109220567997, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52535842.90909091, "logits/rejected": -26753092.923076924, "logps/chosen": -414.73291015625, "logps/rejected": -521.9594350961538, "loss": 0.0319, "rewards/chosen": 6.992141030051491, "rewards/margins": 16.080937792371202, "rewards/rejected": -9.088796762319712, "step": 735 }, { "epoch": 0.18416114099837358, "grad_norm": 24.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48058642.28571428, "logits/rejected": -23420648.0, "logps/chosen": -362.770263671875, "logps/rejected": -518.7287109375, "loss": 0.0708, "rewards/chosen": 3.8764964512416293, "rewards/margins": 10.613148062569755, "rewards/rejected": -6.736651611328125, "step": 736 }, { "epoch": 0.18441135993994745, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51263054.222222224, "logits/rejected": -35914214.4, "logps/chosen": -342.0602756076389, "logps/rejected": -459.39755859375, "loss": 0.0564, "rewards/chosen": 5.4303783840603295, "rewards/margins": 15.345145840115016, "rewards/rejected": -9.914767456054687, "step": 737 }, { "epoch": 0.18466157888152132, "grad_norm": 13.625, "kl": 10.84058952331543, "learning_rate": 5e-06, "logits/chosen": -71714112.0, "logits/rejected": -45106220.307692304, "logps/chosen": -532.7406338778409, "logps/rejected": -384.07474459134613, "loss": 0.0204, "rewards/chosen": 7.893820329145952, "rewards/margins": 16.849273468230987, "rewards/rejected": -8.955453139085035, "step": 738 }, { "epoch": 0.18491179782309522, "grad_norm": 1.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63248288.0, "logits/rejected": -52859536.0, "logps/chosen": -549.8051147460938, "logps/rejected": -500.8243103027344, "loss": 0.0035, "rewards/chosen": 7.644482612609863, "rewards/margins": 16.183485984802246, "rewards/rejected": -8.539003372192383, "step": 739 }, { "epoch": 0.1851620167646691, "grad_norm": 12.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83834936.8888889, "logits/rejected": -79412394.66666667, "logps/chosen": -474.4694010416667, "logps/rejected": -596.1455729166667, "loss": 0.031, "rewards/chosen": 7.703482733832465, "rewards/margins": 19.916092597113714, "rewards/rejected": -12.21260986328125, "step": 740 }, { "epoch": 0.18541223570624296, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19660963.692307692, "logits/rejected": -40386658.90909091, "logps/chosen": -297.5119816706731, "logps/rejected": -511.09525923295456, "loss": 0.0906, "rewards/chosen": 4.164818396935096, "rewards/margins": 13.506016017673733, "rewards/rejected": -9.341197620738637, "step": 741 }, { "epoch": 0.18566245464781683, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -104695656.72727273, "logits/rejected": -79931441.23076923, "logps/chosen": -542.4687056107955, "logps/rejected": -587.4605994591346, "loss": 0.0096, "rewards/chosen": 7.262121027166193, "rewards/margins": 18.767978054660183, "rewards/rejected": -11.50585702749399, "step": 742 }, { "epoch": 0.1859126735893907, "grad_norm": 7.40625, "kl": 0.5892280340194702, "learning_rate": 5e-06, "logits/chosen": -25681698.666666668, "logits/rejected": -36335941.333333336, "logps/chosen": -331.6855061848958, "logps/rejected": -488.4554443359375, "loss": 0.0403, "rewards/chosen": 5.152231852213542, "rewards/margins": 14.552221934000652, "rewards/rejected": -9.39999008178711, "step": 743 }, { "epoch": 0.1861628925309646, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77571288.0, "logits/rejected": -64650784.0, "logps/chosen": -470.01446533203125, "logps/rejected": -569.0632934570312, "loss": 0.0608, "rewards/chosen": 7.393571853637695, "rewards/margins": 17.142139434814453, "rewards/rejected": -9.748567581176758, "step": 744 }, { "epoch": 0.18641311147253847, "grad_norm": 16.625, "kl": 0.5460826754570007, "learning_rate": 5e-06, "logits/chosen": -7227328.0, "logits/rejected": -26068051.2, "logps/chosen": -490.69580078125, "logps/rejected": -464.62421875, "loss": 0.0381, "rewards/chosen": 6.061325920952691, "rewards/margins": 13.666175672743055, "rewards/rejected": -7.604849751790365, "step": 745 }, { "epoch": 0.18666333041411234, "grad_norm": 22.125, "kl": 1.6078147888183594, "learning_rate": 5e-06, "logits/chosen": -43218771.692307696, "logits/rejected": -50225640.72727273, "logps/chosen": -323.54830228365387, "logps/rejected": -555.5528231534091, "loss": 0.0997, "rewards/chosen": 4.106776310847356, "rewards/margins": 12.702093804632867, "rewards/rejected": -8.595317493785512, "step": 746 }, { "epoch": 0.18691354935568621, "grad_norm": 5.59375, "kl": 5.847883701324463, "learning_rate": 5e-06, "logits/chosen": -61582642.28571428, "logits/rejected": -46114960.0, "logps/chosen": -444.17173549107144, "logps/rejected": -591.34609375, "loss": 0.0394, "rewards/chosen": 6.704927716936384, "rewards/margins": 17.70850285121373, "rewards/rejected": -11.003575134277344, "step": 747 }, { "epoch": 0.1871637682972601, "grad_norm": 4.1875, "kl": 5.278030872344971, "learning_rate": 5e-06, "logits/chosen": -36747737.6, "logits/rejected": -70000704.0, "logps/chosen": -368.3920572916667, "logps/rejected": -311.98133680555554, "loss": 0.0477, "rewards/chosen": 6.172967529296875, "rewards/margins": 13.405596245659723, "rewards/rejected": -7.232628716362847, "step": 748 }, { "epoch": 0.18741398723883398, "grad_norm": 10.875, "kl": 1.9263179302215576, "learning_rate": 5e-06, "logits/chosen": -70863404.3076923, "logits/rejected": -22107310.545454547, "logps/chosen": -462.0105168269231, "logps/rejected": -458.2373046875, "loss": 0.0246, "rewards/chosen": 7.0742962176983175, "rewards/margins": 14.719641732169197, "rewards/rejected": -7.645345514470881, "step": 749 }, { "epoch": 0.18766420618040786, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -91789513.14285715, "logits/rejected": -72141683.2, "logps/chosen": -390.64554268973217, "logps/rejected": -771.376513671875, "loss": 0.0271, "rewards/chosen": 6.177424839564732, "rewards/margins": 19.402516392299106, "rewards/rejected": -13.225091552734375, "step": 750 }, { "epoch": 0.18791442512198173, "grad_norm": 22.875, "kl": 19.72315788269043, "learning_rate": 5e-06, "logits/chosen": -55871130.35294118, "logits/rejected": -70852123.42857143, "logps/chosen": -436.32117417279414, "logps/rejected": -483.63089425223217, "loss": 0.0678, "rewards/chosen": 6.746851303998162, "rewards/margins": 15.705984452191522, "rewards/rejected": -8.95913314819336, "step": 751 }, { "epoch": 0.18816464406355562, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47239680.0, "logits/rejected": -54592625.23076923, "logps/chosen": -603.2428977272727, "logps/rejected": -478.13003305288464, "loss": 0.0499, "rewards/chosen": 8.076557506214488, "rewards/margins": 16.09364734996449, "rewards/rejected": -8.01708984375, "step": 752 }, { "epoch": 0.1884148630051295, "grad_norm": 15.9375, "kl": 1.6756629943847656, "learning_rate": 5e-06, "logits/chosen": -60310000.0, "logits/rejected": -29489061.333333332, "logps/chosen": -258.2334798177083, "logps/rejected": -634.0535074869791, "loss": 0.0662, "rewards/chosen": 4.710054079691569, "rewards/margins": 15.058815320332844, "rewards/rejected": -10.348761240641275, "step": 753 }, { "epoch": 0.18866508194670337, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49030297.6, "logits/rejected": -40142065.777777776, "logps/chosen": -340.61272786458335, "logps/rejected": -400.89344618055554, "loss": 0.0465, "rewards/chosen": 6.660277811686198, "rewards/margins": 14.644435797797309, "rewards/rejected": -7.984157986111111, "step": 754 }, { "epoch": 0.18891530088827724, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23835105.6, "logits/rejected": -54121389.71428572, "logps/chosen": -292.4234375, "logps/rejected": -622.8983677455357, "loss": 0.0217, "rewards/chosen": 4.780410385131836, "rewards/margins": 17.66634875706264, "rewards/rejected": -12.885938371930804, "step": 755 }, { "epoch": 0.1891655198298511, "grad_norm": 19.75, "kl": 6.965249061584473, "learning_rate": 5e-06, "logits/chosen": -52097258.666666664, "logits/rejected": -41704900.266666666, "logps/chosen": -564.0428602430555, "logps/rejected": -433.09619140625, "loss": 0.0203, "rewards/chosen": 10.003687540690104, "rewards/margins": 17.34937744140625, "rewards/rejected": -7.345689900716146, "step": 756 }, { "epoch": 0.189415738771425, "grad_norm": 4.28125, "kl": 1.6121814250946045, "learning_rate": 5e-06, "logits/chosen": -56811072.0, "logits/rejected": -62204773.333333336, "logps/chosen": -426.9440104166667, "logps/rejected": -559.7593587239584, "loss": 0.0094, "rewards/chosen": 8.24566396077474, "rewards/margins": 18.42844835917155, "rewards/rejected": -10.18278439839681, "step": 757 }, { "epoch": 0.18966595771299888, "grad_norm": 13.4375, "kl": 7.0572829246521, "learning_rate": 5e-06, "logits/chosen": -84275536.0, "logits/rejected": -38835888.0, "logps/chosen": -473.0864664713542, "logps/rejected": -457.4165852864583, "loss": 0.0525, "rewards/chosen": 6.5260874430338545, "rewards/margins": 14.497683207194012, "rewards/rejected": -7.971595764160156, "step": 758 }, { "epoch": 0.18991617665457275, "grad_norm": 2.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 24208880.0, "logits/rejected": -44486724.0, "logps/chosen": -504.8377990722656, "logps/rejected": -723.7698974609375, "loss": 0.0133, "rewards/chosen": 7.936724662780762, "rewards/margins": 19.90409564971924, "rewards/rejected": -11.967370986938477, "step": 759 }, { "epoch": 0.19016639559614662, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52100419.2, "logits/rejected": -17151417.14285714, "logps/chosen": -317.670703125, "logps/rejected": -504.99581473214283, "loss": 0.1065, "rewards/chosen": 4.338721466064453, "rewards/margins": 12.845295824323383, "rewards/rejected": -8.506574358258929, "step": 760 }, { "epoch": 0.19041661453772052, "grad_norm": 20.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32805285.818181816, "logits/rejected": -3965060.923076923, "logps/chosen": -326.91410688920456, "logps/rejected": -358.51089242788464, "loss": 0.0618, "rewards/chosen": 5.743856950239702, "rewards/margins": 12.598762885673896, "rewards/rejected": -6.854905935434195, "step": 761 }, { "epoch": 0.1906668334792944, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51308214.85714286, "logits/rejected": -12822941.6, "logps/chosen": -343.5065220424107, "logps/rejected": -273.5468994140625, "loss": 0.0461, "rewards/chosen": 6.219408307756696, "rewards/margins": 12.495134626116071, "rewards/rejected": -6.275726318359375, "step": 762 }, { "epoch": 0.19091705242086826, "grad_norm": 10.4375, "kl": 1.5390746593475342, "learning_rate": 5e-06, "logits/chosen": -52308371.692307696, "logits/rejected": -32224605.09090909, "logps/chosen": -446.91458834134613, "logps/rejected": -300.5582386363636, "loss": 0.0579, "rewards/chosen": 7.066094031700721, "rewards/margins": 13.239156442922312, "rewards/rejected": -6.173062411221591, "step": 763 }, { "epoch": 0.19116727136244213, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54476643.55555555, "logits/rejected": -43014997.333333336, "logps/chosen": -340.44146050347223, "logps/rejected": -560.6063802083333, "loss": 0.0272, "rewards/chosen": 6.979727003309462, "rewards/margins": 18.29089135064019, "rewards/rejected": -11.31116434733073, "step": 764 }, { "epoch": 0.191417490304016, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55464677.333333336, "logits/rejected": -60452624.0, "logps/chosen": -387.4718831380208, "logps/rejected": -610.9570719401041, "loss": 0.0212, "rewards/chosen": 7.022189458211263, "rewards/margins": 15.729825337727863, "rewards/rejected": -8.707635879516602, "step": 765 }, { "epoch": 0.1916677092455899, "grad_norm": 11.75, "kl": 2.5066115856170654, "learning_rate": 5e-06, "logits/chosen": -31838448.0, "logits/rejected": -55355976.0, "logps/chosen": -367.44366455078125, "logps/rejected": -665.2879028320312, "loss": 0.0561, "rewards/chosen": 5.7261786460876465, "rewards/margins": 16.928192615509033, "rewards/rejected": -11.202013969421387, "step": 766 }, { "epoch": 0.19191792818716377, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48132042.666666664, "logits/rejected": -52281546.666666664, "logps/chosen": -415.7325846354167, "logps/rejected": -640.1173095703125, "loss": 0.0248, "rewards/chosen": 6.91850217183431, "rewards/margins": 17.803024927775066, "rewards/rejected": -10.884522755940756, "step": 767 }, { "epoch": 0.19216814712873764, "grad_norm": 11.4375, "kl": 4.991845607757568, "learning_rate": 5e-06, "logits/chosen": -41127569.23076923, "logits/rejected": -26718731.636363637, "logps/chosen": -391.3740985576923, "logps/rejected": -466.5599254261364, "loss": 0.0352, "rewards/chosen": 6.182608384352464, "rewards/margins": 13.746976198849978, "rewards/rejected": -7.564367814497515, "step": 768 }, { "epoch": 0.19241836607031151, "grad_norm": 14.0625, "kl": 0.30423229932785034, "learning_rate": 5e-06, "logits/chosen": -61034953.84615385, "logits/rejected": -47198824.72727273, "logps/chosen": -389.52892127403845, "logps/rejected": -490.61860795454544, "loss": 0.0388, "rewards/chosen": 6.54525639460637, "rewards/margins": 16.552333698406088, "rewards/rejected": -10.007077303799717, "step": 769 }, { "epoch": 0.1926685850118854, "grad_norm": 2.828125, "kl": 2.7039363384246826, "learning_rate": 5e-06, "logits/chosen": -69796817.45454545, "logits/rejected": -36781767.384615384, "logps/chosen": -494.58522727272725, "logps/rejected": -555.9032451923077, "loss": 0.0122, "rewards/chosen": 7.6953277587890625, "rewards/margins": 17.446314298189606, "rewards/rejected": -9.750986539400541, "step": 770 }, { "epoch": 0.19291880395345928, "grad_norm": 15.75, "kl": 4.258843898773193, "learning_rate": 5e-06, "logits/chosen": -81830108.44444445, "logits/rejected": -62735112.53333333, "logps/chosen": -397.0026584201389, "logps/rejected": -426.97307942708335, "loss": 0.0708, "rewards/chosen": 7.245611402723524, "rewards/margins": 13.92344750298394, "rewards/rejected": -6.677836100260417, "step": 771 }, { "epoch": 0.19316902289503315, "grad_norm": 8.75, "kl": 3.016787528991699, "learning_rate": 5e-06, "logits/chosen": -52523565.71428572, "logits/rejected": -56918009.6, "logps/chosen": -449.27828543526783, "logps/rejected": -550.47978515625, "loss": 0.0346, "rewards/chosen": 6.104246956961496, "rewards/margins": 16.66878934587751, "rewards/rejected": -10.564542388916015, "step": 772 }, { "epoch": 0.19341924183660703, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62468249.6, "logits/rejected": -61920635.428571425, "logps/chosen": -336.78134765625, "logps/rejected": -499.6768275669643, "loss": 0.0486, "rewards/chosen": 6.820289611816406, "rewards/margins": 14.544644492013113, "rewards/rejected": -7.7243548801967075, "step": 773 }, { "epoch": 0.1936694607781809, "grad_norm": 3.078125, "kl": 4.653754711151123, "learning_rate": 5e-06, "logits/chosen": -79820381.86666666, "logits/rejected": -68412416.0, "logps/chosen": -436.62353515625, "logps/rejected": -549.1727430555555, "loss": 0.018, "rewards/chosen": 7.988427734375, "rewards/margins": 16.950892808702257, "rewards/rejected": -8.962465074327257, "step": 774 }, { "epoch": 0.1939196797197548, "grad_norm": 15.625, "kl": 9.926856994628906, "learning_rate": 5e-06, "logits/chosen": -43630656.0, "logits/rejected": -62075379.2, "logps/chosen": -351.16245814732144, "logps/rejected": -721.505517578125, "loss": 0.0918, "rewards/chosen": 6.118412562779018, "rewards/margins": 20.750901576450893, "rewards/rejected": -14.632489013671876, "step": 775 }, { "epoch": 0.19416989866132867, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32805280.0, "logits/rejected": -48117952.0, "logps/chosen": -276.3290771484375, "logps/rejected": -588.8889508928571, "loss": 0.0706, "rewards/chosen": 4.932155609130859, "rewards/margins": 13.072474343436104, "rewards/rejected": -8.140318734305245, "step": 776 }, { "epoch": 0.19442011760290254, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56095260.44444445, "logits/rejected": -48964078.93333333, "logps/chosen": -285.20005967881946, "logps/rejected": -579.3704427083334, "loss": 0.0854, "rewards/chosen": 4.338704427083333, "rewards/margins": 11.526825968424479, "rewards/rejected": -7.188121541341146, "step": 777 }, { "epoch": 0.1946703365444764, "grad_norm": 4.96875, "kl": 2.4994025230407715, "learning_rate": 5e-06, "logits/chosen": -58249338.18181818, "logits/rejected": -21601095.384615384, "logps/chosen": -464.8508966619318, "logps/rejected": -497.72475961538464, "loss": 0.0369, "rewards/chosen": 6.807033192027699, "rewards/margins": 16.16709953254753, "rewards/rejected": -9.360066340519833, "step": 778 }, { "epoch": 0.1949205554860503, "grad_norm": 16.625, "kl": 5.22830867767334, "learning_rate": 5e-06, "logits/chosen": -48293481.14285714, "logits/rejected": -50497737.6, "logps/chosen": -400.02713448660717, "logps/rejected": -577.370654296875, "loss": 0.0892, "rewards/chosen": 6.610093252999442, "rewards/margins": 17.666199820382253, "rewards/rejected": -11.056106567382812, "step": 779 }, { "epoch": 0.19517077442762418, "grad_norm": 2.28125, "kl": 5.2460150718688965, "learning_rate": 5e-06, "logits/chosen": -48169334.85714286, "logits/rejected": -44441955.2, "logps/chosen": -369.29286411830356, "logps/rejected": -656.659619140625, "loss": 0.0308, "rewards/chosen": 7.134678431919643, "rewards/margins": 16.292231532505582, "rewards/rejected": -9.157553100585938, "step": 780 }, { "epoch": 0.19542099336919805, "grad_norm": 20.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 213684309.33333334, "logits/rejected": -59194485.333333336, "logps/chosen": -576.1879475911459, "logps/rejected": -682.8180338541666, "loss": 0.0548, "rewards/chosen": 7.890483856201172, "rewards/margins": 17.955281999376083, "rewards/rejected": -10.064798143174913, "step": 781 }, { "epoch": 0.19567121231077192, "grad_norm": 12.6875, "kl": 5.794898509979248, "learning_rate": 5e-06, "logits/chosen": -51934084.0, "logits/rejected": -59407036.0, "logps/chosen": -375.10528564453125, "logps/rejected": -476.72027587890625, "loss": 0.0459, "rewards/chosen": 5.85671329498291, "rewards/margins": 14.188584327697754, "rewards/rejected": -8.331871032714844, "step": 782 }, { "epoch": 0.1959214312523458, "grad_norm": 10.4375, "kl": 7.434493541717529, "learning_rate": 5e-06, "logits/chosen": -58841052.44444445, "logits/rejected": -36314103.46666667, "logps/chosen": -306.030517578125, "logps/rejected": -443.04791666666665, "loss": 0.0421, "rewards/chosen": 6.728176964653863, "rewards/margins": 14.62759713066949, "rewards/rejected": -7.899420166015625, "step": 783 }, { "epoch": 0.1961716501939197, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66068544.0, "logits/rejected": -36125330.823529415, "logps/chosen": -417.34116908482144, "logps/rejected": -447.0884650735294, "loss": 0.0157, "rewards/chosen": 6.3509014674595425, "rewards/margins": 14.345273667022962, "rewards/rejected": -7.994372199563419, "step": 784 }, { "epoch": 0.19642186913549356, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33594185.14285714, "logits/rejected": -37133347.2, "logps/chosen": -350.42518833705356, "logps/rejected": -527.660107421875, "loss": 0.0516, "rewards/chosen": 5.214516230991909, "rewards/margins": 14.731714412144253, "rewards/rejected": -9.517198181152343, "step": 785 }, { "epoch": 0.19667208807706743, "grad_norm": 15.5, "kl": 11.855327606201172, "learning_rate": 5e-06, "logits/chosen": -42866858.666666664, "logits/rejected": -66386104.88888889, "logps/chosen": -423.8452473958333, "logps/rejected": -528.5264756944445, "loss": 0.0424, "rewards/chosen": 7.475948079427083, "rewards/margins": 14.528540886773005, "rewards/rejected": -7.0525928073459205, "step": 786 }, { "epoch": 0.1969223070186413, "grad_norm": 13.3125, "kl": 8.092233657836914, "learning_rate": 5e-06, "logits/chosen": -50351599.15789474, "logits/rejected": -49429683.2, "logps/chosen": -443.9545641447368, "logps/rejected": -830.33525390625, "loss": 0.0466, "rewards/chosen": 6.621235094572368, "rewards/margins": 21.335682116056745, "rewards/rejected": -14.714447021484375, "step": 787 }, { "epoch": 0.1971725259602152, "grad_norm": 16.25, "kl": 20.436603546142578, "learning_rate": 5e-06, "logits/chosen": -64435602.28571428, "logits/rejected": -72926502.4, "logps/chosen": -473.6840122767857, "logps/rejected": -505.0876953125, "loss": 0.1178, "rewards/chosen": 7.37617438180106, "rewards/margins": 14.716451481410434, "rewards/rejected": -7.340277099609375, "step": 788 }, { "epoch": 0.19742274490178907, "grad_norm": 17.375, "kl": 1.1567704677581787, "learning_rate": 5e-06, "logits/chosen": -54946469.333333336, "logits/rejected": -37511346.666666664, "logps/chosen": -413.1483968098958, "logps/rejected": -540.9465738932291, "loss": 0.1048, "rewards/chosen": 5.627518971761067, "rewards/margins": 13.126118977864582, "rewards/rejected": -7.498600006103516, "step": 789 }, { "epoch": 0.19767296384336294, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62774502.4, "logits/rejected": -39964036.571428575, "logps/chosen": -463.8818359375, "logps/rejected": -584.0109514508929, "loss": 0.0218, "rewards/chosen": 7.480061340332031, "rewards/margins": 18.057569449288504, "rewards/rejected": -10.577508108956474, "step": 790 }, { "epoch": 0.1979231827849368, "grad_norm": 9.25, "kl": 8.550105094909668, "learning_rate": 5e-06, "logits/chosen": -55717208.615384616, "logits/rejected": -47519586.90909091, "logps/chosen": -513.7130033052885, "logps/rejected": -545.9212979403409, "loss": 0.0318, "rewards/chosen": 7.185725872333233, "rewards/margins": 15.249381378814057, "rewards/rejected": -8.063655506480824, "step": 791 }, { "epoch": 0.19817340172651068, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36282480.0, "logits/rejected": -39602952.0, "logps/chosen": -421.6878662109375, "logps/rejected": -653.6077880859375, "loss": 0.0519, "rewards/chosen": 8.16075325012207, "rewards/margins": 17.460349082946777, "rewards/rejected": -9.299595832824707, "step": 792 }, { "epoch": 0.19842362066808458, "grad_norm": 19.375, "kl": 3.8469817638397217, "learning_rate": 5e-06, "logits/chosen": -63231360.0, "logits/rejected": -30232147.692307692, "logps/chosen": -485.91104403409093, "logps/rejected": -524.6609074519231, "loss": 0.0929, "rewards/chosen": 8.293641523881393, "rewards/margins": 15.931041637500684, "rewards/rejected": -7.637400113619291, "step": 793 }, { "epoch": 0.19867383960965845, "grad_norm": 1.4375, "kl": 1.10211181640625, "learning_rate": 5e-06, "logits/chosen": -81161390.54545455, "logits/rejected": -41907657.84615385, "logps/chosen": -481.1915838068182, "logps/rejected": -525.1224834735577, "loss": 0.0039, "rewards/chosen": 6.877793051979759, "rewards/margins": 14.500239552317801, "rewards/rejected": -7.622446500338041, "step": 794 }, { "epoch": 0.19892405855123232, "grad_norm": 11.9375, "kl": 1.7197463512420654, "learning_rate": 5e-06, "logits/chosen": -57853036.307692304, "logits/rejected": -49966731.63636363, "logps/chosen": -400.5968674879808, "logps/rejected": -671.5992542613636, "loss": 0.0389, "rewards/chosen": 7.59848139836238, "rewards/margins": 17.708185155908545, "rewards/rejected": -10.109703757546164, "step": 795 }, { "epoch": 0.1991742774928062, "grad_norm": 1.9921875, "kl": 6.740237236022949, "learning_rate": 5e-06, "logits/chosen": -70098694.4, "logits/rejected": -53354130.28571428, "logps/chosen": -486.557275390625, "logps/rejected": -710.7572544642857, "loss": 0.0144, "rewards/chosen": 8.284804534912109, "rewards/margins": 21.42544915335519, "rewards/rejected": -13.14064461844308, "step": 796 }, { "epoch": 0.1994244964343801, "grad_norm": 9.0, "kl": 1.36517333984375, "learning_rate": 5e-06, "logits/chosen": -51640822.15384615, "logits/rejected": -39425210.18181818, "logps/chosen": -366.5441706730769, "logps/rejected": -533.6463068181819, "loss": 0.0431, "rewards/chosen": 6.539581298828125, "rewards/margins": 17.53961181640625, "rewards/rejected": -11.000030517578125, "step": 797 }, { "epoch": 0.19967471537595397, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48864339.692307696, "logits/rejected": -44865262.54545455, "logps/chosen": -343.4710036057692, "logps/rejected": -601.4108664772727, "loss": 0.0712, "rewards/chosen": 4.839988708496094, "rewards/margins": 13.800580804998225, "rewards/rejected": -8.96059209650213, "step": 798 }, { "epoch": 0.19992493431752784, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47303571.2, "logits/rejected": -47950340.571428575, "logps/chosen": -351.1532470703125, "logps/rejected": -664.36328125, "loss": 0.0467, "rewards/chosen": 6.578648376464844, "rewards/margins": 16.07303641183036, "rewards/rejected": -9.494388035365514, "step": 799 }, { "epoch": 0.2001751532591017, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69996309.33333333, "logits/rejected": -47352160.0, "logps/chosen": -408.6333414713542, "logps/rejected": -636.7174886067709, "loss": 0.0456, "rewards/chosen": 5.707289377848308, "rewards/margins": 13.65249252319336, "rewards/rejected": -7.945203145345052, "step": 800 }, { "epoch": 0.20042537220067558, "grad_norm": 13.6875, "kl": 0.8703645467758179, "learning_rate": 5e-06, "logits/chosen": -42387300.0, "logits/rejected": -18186752.0, "logps/chosen": -338.995361328125, "logps/rejected": -379.30133056640625, "loss": 0.0741, "rewards/chosen": 6.07996129989624, "rewards/margins": 13.174275398254395, "rewards/rejected": -7.094314098358154, "step": 801 }, { "epoch": 0.20067559114224948, "grad_norm": 5.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49454641.23076923, "logits/rejected": -45453466.18181818, "logps/chosen": -344.4498948317308, "logps/rejected": -545.4881924715909, "loss": 0.0363, "rewards/chosen": 5.379285959097055, "rewards/margins": 16.756572696712468, "rewards/rejected": -11.377286737615412, "step": 802 }, { "epoch": 0.20092581008382335, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59304434.28571428, "logits/rejected": -29780595.2, "logps/chosen": -328.17801339285717, "logps/rejected": -540.84912109375, "loss": 0.0697, "rewards/chosen": 5.914222717285156, "rewards/margins": 18.02959747314453, "rewards/rejected": -12.115374755859374, "step": 803 }, { "epoch": 0.20117602902539722, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66439744.0, "logits/rejected": -28480691.692307692, "logps/chosen": -414.01957563920456, "logps/rejected": -547.6895282451923, "loss": 0.0246, "rewards/chosen": 7.101613825017756, "rewards/margins": 18.80237723397208, "rewards/rejected": -11.700763408954327, "step": 804 }, { "epoch": 0.2014262479669711, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26977117.53846154, "logits/rejected": -35117533.09090909, "logps/chosen": -387.91650390625, "logps/rejected": -524.9378107244319, "loss": 0.081, "rewards/chosen": 5.092916048490084, "rewards/margins": 15.22620642602027, "rewards/rejected": -10.133290377530185, "step": 805 }, { "epoch": 0.201676466908545, "grad_norm": 15.1875, "kl": 0.31385931372642517, "learning_rate": 5e-06, "logits/chosen": -46441126.4, "logits/rejected": -43305436.44444445, "logps/chosen": -359.1406575520833, "logps/rejected": -717.1804470486111, "loss": 0.0661, "rewards/chosen": 6.0799204508463545, "rewards/margins": 16.787812127007378, "rewards/rejected": -10.707891676161024, "step": 806 }, { "epoch": 0.20192668585011886, "grad_norm": 6.375, "kl": 5.913142204284668, "learning_rate": 5e-06, "logits/chosen": -40712609.88235294, "logits/rejected": -85484982.85714285, "logps/chosen": -331.26220703125, "logps/rejected": -697.1421595982143, "loss": 0.0534, "rewards/chosen": 5.862836052389706, "rewards/margins": 15.491141760048745, "rewards/rejected": -9.62830570765904, "step": 807 }, { "epoch": 0.20217690479169273, "grad_norm": 13.125, "kl": 2.1432228088378906, "learning_rate": 5e-06, "logits/chosen": -35160153.14285714, "logits/rejected": -28466432.0, "logps/chosen": -454.322021484375, "logps/rejected": -456.8068359375, "loss": 0.0554, "rewards/chosen": 6.256374904087612, "rewards/margins": 15.419541713169643, "rewards/rejected": -9.163166809082032, "step": 808 }, { "epoch": 0.2024271237332666, "grad_norm": 7.3125, "kl": 2.2905538082122803, "learning_rate": 5e-06, "logits/chosen": -50495085.71428572, "logits/rejected": -33732294.4, "logps/chosen": -382.4754115513393, "logps/rejected": -485.15234375, "loss": 0.0485, "rewards/chosen": 6.194065638950893, "rewards/margins": 15.424795314243863, "rewards/rejected": -9.230729675292968, "step": 809 }, { "epoch": 0.20267734267484047, "grad_norm": 10.0625, "kl": 12.888921737670898, "learning_rate": 5e-06, "logits/chosen": -54909376.0, "logits/rejected": -93817309.0909091, "logps/chosen": -510.6948993389423, "logps/rejected": -764.3355823863636, "loss": 0.0188, "rewards/chosen": 7.7440032958984375, "rewards/margins": 19.01056601784446, "rewards/rejected": -11.266562721946023, "step": 810 }, { "epoch": 0.20292756161641437, "grad_norm": 11.875, "kl": 0.23932330310344696, "learning_rate": 5e-06, "logits/chosen": -44965632.0, "logits/rejected": -41088272.0, "logps/chosen": -286.18516322544644, "logps/rejected": -505.327978515625, "loss": 0.0831, "rewards/chosen": 4.751211983816964, "rewards/margins": 13.533659798758372, "rewards/rejected": -8.782447814941406, "step": 811 }, { "epoch": 0.20317778055798824, "grad_norm": 13.625, "kl": 5.660660743713379, "learning_rate": 5e-06, "logits/chosen": -43158245.05263158, "logits/rejected": -28269974.4, "logps/chosen": -368.97239925986844, "logps/rejected": -602.176611328125, "loss": 0.107, "rewards/chosen": 5.408772117213199, "rewards/margins": 14.234994346217107, "rewards/rejected": -8.826222229003907, "step": 812 }, { "epoch": 0.2034279994995621, "grad_norm": 19.125, "kl": 1.3985570669174194, "learning_rate": 5e-06, "logits/chosen": -13799566.666666666, "logits/rejected": -50878677.333333336, "logps/chosen": -307.6500244140625, "logps/rejected": -757.4918619791666, "loss": 0.0697, "rewards/chosen": 5.658947626749675, "rewards/margins": 14.656039555867512, "rewards/rejected": -8.997091929117838, "step": 813 }, { "epoch": 0.20367821844113598, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49396320.0, "logits/rejected": -29117410.666666668, "logps/chosen": -405.6623942057292, "logps/rejected": -581.714111328125, "loss": 0.0388, "rewards/chosen": 7.830986022949219, "rewards/margins": 16.280290603637695, "rewards/rejected": -8.449304580688477, "step": 814 }, { "epoch": 0.20392843738270988, "grad_norm": 21.25, "kl": 6.440698623657227, "learning_rate": 5e-06, "logits/chosen": -61339304.0, "logits/rejected": -34033212.0, "logps/chosen": -381.3726501464844, "logps/rejected": -487.78515625, "loss": 0.0597, "rewards/chosen": 6.295356750488281, "rewards/margins": 13.154322624206543, "rewards/rejected": -6.858965873718262, "step": 815 }, { "epoch": 0.20417865632428375, "grad_norm": 16.75, "kl": 0.2847709655761719, "learning_rate": 5e-06, "logits/chosen": -49366925.71428572, "logits/rejected": -53796761.6, "logps/chosen": -377.70595005580356, "logps/rejected": -507.280810546875, "loss": 0.058, "rewards/chosen": 5.664988926478794, "rewards/margins": 14.688199833461216, "rewards/rejected": -9.023210906982422, "step": 816 }, { "epoch": 0.20442887526585762, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51120056.88888889, "logits/rejected": -30452398.933333334, "logps/chosen": -351.32188585069446, "logps/rejected": -631.512890625, "loss": 0.0578, "rewards/chosen": 5.218953026665582, "rewards/margins": 12.556123436821832, "rewards/rejected": -7.33717041015625, "step": 817 }, { "epoch": 0.2046790942074315, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40052777.6, "logits/rejected": -55822555.428571425, "logps/chosen": -408.3108642578125, "logps/rejected": -545.50537109375, "loss": 0.0356, "rewards/chosen": 6.1724708557128904, "rewards/margins": 15.47464828491211, "rewards/rejected": -9.302177429199219, "step": 818 }, { "epoch": 0.2049293131490054, "grad_norm": 15.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59217819.428571425, "logits/rejected": -35902732.8, "logps/chosen": -402.3795689174107, "logps/rejected": -592.60673828125, "loss": 0.0565, "rewards/chosen": 6.2483640398297995, "rewards/margins": 16.120682307652064, "rewards/rejected": -9.872318267822266, "step": 819 }, { "epoch": 0.20517953209057926, "grad_norm": 9.1875, "kl": 3.18426775932312, "learning_rate": 5e-06, "logits/chosen": -48457554.28571428, "logits/rejected": -43792019.2, "logps/chosen": -375.62681361607144, "logps/rejected": -590.33076171875, "loss": 0.0856, "rewards/chosen": 6.395714351109096, "rewards/margins": 15.888672419956752, "rewards/rejected": -9.492958068847656, "step": 820 }, { "epoch": 0.20542975103215314, "grad_norm": 14.3125, "kl": 6.878249168395996, "learning_rate": 5e-06, "logits/chosen": -63146906.666666664, "logits/rejected": -19526541.333333332, "logps/chosen": -317.61997477213544, "logps/rejected": -612.055908203125, "loss": 0.0913, "rewards/chosen": 4.943337440490723, "rewards/margins": 15.065688769022623, "rewards/rejected": -10.1223513285319, "step": 821 }, { "epoch": 0.205679969973727, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54319974.4, "logits/rejected": -57623954.28571428, "logps/chosen": -406.001220703125, "logps/rejected": -621.3896484375, "loss": 0.0278, "rewards/chosen": 5.587949752807617, "rewards/margins": 13.978311538696289, "rewards/rejected": -8.390361785888672, "step": 822 }, { "epoch": 0.20593018891530088, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45855674.666666664, "logits/rejected": -53418816.0, "logps/chosen": -474.3279622395833, "logps/rejected": -620.4185384114584, "loss": 0.0533, "rewards/chosen": 5.96588134765625, "rewards/margins": 16.95417912801107, "rewards/rejected": -10.988297780354818, "step": 823 }, { "epoch": 0.20618040785687478, "grad_norm": 7.71875, "kl": 0.8577841520309448, "learning_rate": 5e-06, "logits/chosen": -41662215.11111111, "logits/rejected": -35238327.46666667, "logps/chosen": -383.3494466145833, "logps/rejected": -397.70712890625, "loss": 0.0346, "rewards/chosen": 7.946407741970486, "rewards/margins": 15.131324428982204, "rewards/rejected": -7.184916687011719, "step": 824 }, { "epoch": 0.20643062679844865, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65579238.4, "logits/rejected": -44685193.14285714, "logps/chosen": -377.4366943359375, "logps/rejected": -494.9213169642857, "loss": 0.0418, "rewards/chosen": 5.857852935791016, "rewards/margins": 13.956713540213448, "rewards/rejected": -8.098860604422432, "step": 825 }, { "epoch": 0.20668084574002252, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37624906.666666664, "logits/rejected": 91468125.86666666, "logps/chosen": -349.5646701388889, "logps/rejected": -633.4867838541667, "loss": 0.0468, "rewards/chosen": 5.625090705023871, "rewards/margins": 16.80655025906033, "rewards/rejected": -11.181459554036458, "step": 826 }, { "epoch": 0.2069310646815964, "grad_norm": 7.75, "kl": 0.3838348388671875, "learning_rate": 5e-06, "logits/chosen": -11999882.181818182, "logits/rejected": -29316740.923076924, "logps/chosen": -403.7110706676136, "logps/rejected": -361.85802283653845, "loss": 0.0359, "rewards/chosen": 5.75521503795277, "rewards/margins": 11.892577004599405, "rewards/rejected": -6.137361966646635, "step": 827 }, { "epoch": 0.2071812836231703, "grad_norm": 16.625, "kl": 7.2646613121032715, "learning_rate": 5e-06, "logits/chosen": -79085479.38461539, "logits/rejected": -53273629.09090909, "logps/chosen": -479.7726862980769, "logps/rejected": -563.8832563920455, "loss": 0.0301, "rewards/chosen": 7.126521770770733, "rewards/margins": 16.66407503781619, "rewards/rejected": -9.537553267045455, "step": 828 }, { "epoch": 0.20743150256474416, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74224710.4, "logits/rejected": -35041188.571428575, "logps/chosen": -335.867041015625, "logps/rejected": -397.70703125, "loss": 0.0439, "rewards/chosen": 5.500564193725586, "rewards/margins": 14.455432510375976, "rewards/rejected": -8.95486831665039, "step": 829 }, { "epoch": 0.20768172150631803, "grad_norm": 6.3125, "kl": 4.6465630531311035, "learning_rate": 5e-06, "logits/chosen": -42577053.538461536, "logits/rejected": -60188904.72727273, "logps/chosen": -517.484375, "logps/rejected": -514.5040838068181, "loss": 0.0474, "rewards/chosen": 5.921867370605469, "rewards/margins": 15.534251126376065, "rewards/rejected": -9.612383755770596, "step": 830 }, { "epoch": 0.2079319404478919, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63994416.0, "logits/rejected": -41208864.0, "logps/chosen": -432.72100830078125, "logps/rejected": -644.6954956054688, "loss": 0.0331, "rewards/chosen": 6.187349796295166, "rewards/margins": 21.06824827194214, "rewards/rejected": -14.880898475646973, "step": 831 }, { "epoch": 0.20818215938946577, "grad_norm": 4.59375, "kl": 3.961434841156006, "learning_rate": 5e-06, "logits/chosen": -33809644.307692304, "logits/rejected": -54601547.63636363, "logps/chosen": -448.92540564903845, "logps/rejected": -667.9743874289773, "loss": 0.0104, "rewards/chosen": 6.512371356670673, "rewards/margins": 21.6511167512907, "rewards/rejected": -15.13874539462003, "step": 832 }, { "epoch": 0.20843237833103967, "grad_norm": 10.375, "kl": 5.6155829429626465, "learning_rate": 5e-06, "logits/chosen": -70941508.26666667, "logits/rejected": -49239608.88888889, "logps/chosen": -432.57607421875, "logps/rejected": -613.7744140625, "loss": 0.0512, "rewards/chosen": 6.954598999023437, "rewards/margins": 16.32216033935547, "rewards/rejected": -9.367561340332031, "step": 833 }, { "epoch": 0.20868259727261354, "grad_norm": 8.4375, "kl": 1.0676867961883545, "learning_rate": 5e-06, "logits/chosen": -54695699.2, "logits/rejected": -44111465.14285714, "logps/chosen": -505.381494140625, "logps/rejected": -486.27535574776783, "loss": 0.0167, "rewards/chosen": 7.41577377319336, "rewards/margins": 16.328859710693358, "rewards/rejected": -8.9130859375, "step": 834 }, { "epoch": 0.2089328162141874, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36743656.0, "logits/rejected": -37833946.666666664, "logps/chosen": -445.3172200520833, "logps/rejected": -498.4071044921875, "loss": 0.0403, "rewards/chosen": 5.983867645263672, "rewards/margins": 16.58716901143392, "rewards/rejected": -10.603301366170248, "step": 835 }, { "epoch": 0.20918303515576128, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38590470.4, "logits/rejected": -47026797.71428572, "logps/chosen": -367.522900390625, "logps/rejected": -475.99379185267856, "loss": 0.0558, "rewards/chosen": 4.908218765258789, "rewards/margins": 16.21550897870745, "rewards/rejected": -11.307290213448661, "step": 836 }, { "epoch": 0.20943325409733518, "grad_norm": 16.25, "kl": 1.3447717428207397, "learning_rate": 5e-06, "logits/chosen": -33159101.09090909, "logits/rejected": -37479372.307692304, "logps/chosen": -363.07852450284093, "logps/rejected": -421.51615084134613, "loss": 0.0755, "rewards/chosen": 4.89566386829723, "rewards/margins": 13.38884729772181, "rewards/rejected": -8.49318342942458, "step": 837 }, { "epoch": 0.20968347303890905, "grad_norm": 7.875, "kl": 2.8279104232788086, "learning_rate": 5e-06, "logits/chosen": -63577109.333333336, "logits/rejected": -23843453.866666667, "logps/chosen": -535.0646701388889, "logps/rejected": -451.5923828125, "loss": 0.0314, "rewards/chosen": 9.165021260579428, "rewards/margins": 18.087744649251302, "rewards/rejected": -8.922723388671875, "step": 838 }, { "epoch": 0.20993369198048292, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72973677.71428572, "logits/rejected": -20600211.2, "logps/chosen": -393.15189034598217, "logps/rejected": -617.3955078125, "loss": 0.0299, "rewards/chosen": 6.266732352120536, "rewards/margins": 15.502330344063896, "rewards/rejected": -9.23559799194336, "step": 839 }, { "epoch": 0.2101839109220568, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64426212.571428575, "logits/rejected": -54891276.8, "logps/chosen": -400.4794921875, "logps/rejected": -513.963623046875, "loss": 0.0446, "rewards/chosen": 6.538800920758929, "rewards/margins": 15.272718157087054, "rewards/rejected": -8.733917236328125, "step": 840 }, { "epoch": 0.21043412986363066, "grad_norm": 13.5625, "kl": 8.454451560974121, "learning_rate": 5e-06, "logits/chosen": -91480469.33333333, "logits/rejected": -54089498.666666664, "logps/chosen": -552.0216471354166, "logps/rejected": -524.0720621744791, "loss": 0.0815, "rewards/chosen": 6.681446075439453, "rewards/margins": 15.04843266805013, "rewards/rejected": -8.366986592610678, "step": 841 }, { "epoch": 0.21068434880520456, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43597897.84615385, "logits/rejected": -50333736.72727273, "logps/chosen": -471.59119591346155, "logps/rejected": -776.5814098011364, "loss": 0.0171, "rewards/chosen": 7.979854290301983, "rewards/margins": 19.372585003192608, "rewards/rejected": -11.392730712890625, "step": 842 }, { "epoch": 0.21093456774677843, "grad_norm": 20.5, "kl": 1.9040069580078125, "learning_rate": 5e-06, "logits/chosen": -67600106.66666667, "logits/rejected": -49108629.333333336, "logps/chosen": -405.8778483072917, "logps/rejected": -359.3288981119792, "loss": 0.0445, "rewards/chosen": 6.412080764770508, "rewards/margins": 13.216399510701496, "rewards/rejected": -6.804318745930989, "step": 843 }, { "epoch": 0.2111847866883523, "grad_norm": 22.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53075444.36363637, "logits/rejected": -53297467.07692308, "logps/chosen": -466.5056818181818, "logps/rejected": -615.8974233774038, "loss": 0.0369, "rewards/chosen": 7.342480746182528, "rewards/margins": 17.85110430950885, "rewards/rejected": -10.508623563326323, "step": 844 }, { "epoch": 0.21143500562992618, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46289424.0, "logits/rejected": 122063264.0, "logps/chosen": -417.7291259765625, "logps/rejected": -580.0477294921875, "loss": 0.0336, "rewards/chosen": 4.884383678436279, "rewards/margins": 15.343745708465576, "rewards/rejected": -10.459362030029297, "step": 845 }, { "epoch": 0.21168522457150007, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76412326.4, "logits/rejected": -68620544.0, "logps/chosen": -512.96103515625, "logps/rejected": -427.666015625, "loss": 0.0469, "rewards/chosen": 5.886021423339844, "rewards/margins": 14.041544451032365, "rewards/rejected": -8.155523027692523, "step": 846 }, { "epoch": 0.21193544351307395, "grad_norm": 17.25, "kl": 16.615293502807617, "learning_rate": 5e-06, "logits/chosen": -68891504.94117647, "logits/rejected": -69892210.28571428, "logps/chosen": -387.6879308363971, "logps/rejected": -434.17006138392856, "loss": 0.1837, "rewards/chosen": 6.362054263844209, "rewards/margins": 15.139738066857602, "rewards/rejected": -8.777683803013392, "step": 847 }, { "epoch": 0.21218566245464782, "grad_norm": 11.125, "kl": 3.3448550701141357, "learning_rate": 5e-06, "logits/chosen": -45499528.53333333, "logits/rejected": -25999232.0, "logps/chosen": -401.82985026041666, "logps/rejected": -768.7527669270834, "loss": 0.0203, "rewards/chosen": 6.305695597330729, "rewards/margins": 20.066478474934897, "rewards/rejected": -13.760782877604166, "step": 848 }, { "epoch": 0.2124358813962217, "grad_norm": 4.78125, "kl": 0.7335942983627319, "learning_rate": 5e-06, "logits/chosen": -63022805.333333336, "logits/rejected": -44460765.333333336, "logps/chosen": -422.1632486979167, "logps/rejected": -604.6455891927084, "loss": 0.0242, "rewards/chosen": 7.583875020345052, "rewards/margins": 16.90160306294759, "rewards/rejected": -9.317728042602539, "step": 849 }, { "epoch": 0.21268610033779556, "grad_norm": 12.9375, "kl": 0.8178736567497253, "learning_rate": 5e-06, "logits/chosen": -32316640.0, "logits/rejected": -47145015.27272727, "logps/chosen": -303.7578312800481, "logps/rejected": -715.5393732244319, "loss": 0.0536, "rewards/chosen": 5.149880629319411, "rewards/margins": 14.817243189244838, "rewards/rejected": -9.667362559925426, "step": 850 }, { "epoch": 0.21293631927936946, "grad_norm": 11.4375, "kl": 2.490816831588745, "learning_rate": 5e-06, "logits/chosen": -50717082.666666664, "logits/rejected": -25829136.0, "logps/chosen": -440.0017496744792, "logps/rejected": -454.2618001302083, "loss": 0.0728, "rewards/chosen": 6.279614766438802, "rewards/margins": 14.91478157043457, "rewards/rejected": -8.635166803995768, "step": 851 }, { "epoch": 0.21318653822094333, "grad_norm": 9.5625, "kl": 1.5940736532211304, "learning_rate": 5e-06, "logits/chosen": -41784200.0, "logits/rejected": -48387576.0, "logps/chosen": -398.4791259765625, "logps/rejected": -440.3336181640625, "loss": 0.0336, "rewards/chosen": 6.844158172607422, "rewards/margins": 14.78076982498169, "rewards/rejected": -7.936611652374268, "step": 852 }, { "epoch": 0.2134367571625172, "grad_norm": 27.25, "kl": 15.438124656677246, "learning_rate": 5e-06, "logits/chosen": -38035357.86666667, "logits/rejected": 93770844.44444445, "logps/chosen": -443.3076497395833, "logps/rejected": -593.2223307291666, "loss": 0.1567, "rewards/chosen": 7.630831909179688, "rewards/margins": 14.684690772162543, "rewards/rejected": -7.053858862982856, "step": 853 }, { "epoch": 0.21368697610409107, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36225346.28571428, "logits/rejected": -45393803.294117644, "logps/chosen": -408.18858119419644, "logps/rejected": -557.3362821691177, "loss": 0.0229, "rewards/chosen": 8.157574244907924, "rewards/margins": 16.173671369793034, "rewards/rejected": -8.01609712488511, "step": 854 }, { "epoch": 0.21393719504566497, "grad_norm": 16.75, "kl": 1.0470657348632812, "learning_rate": 5e-06, "logits/chosen": -58416372.36363637, "logits/rejected": -51174183.384615384, "logps/chosen": -496.31503018465907, "logps/rejected": -636.4634915865385, "loss": 0.0516, "rewards/chosen": 7.375865589488637, "rewards/margins": 16.24125724739128, "rewards/rejected": -8.865391657902645, "step": 855 }, { "epoch": 0.21418741398723884, "grad_norm": 9.875, "kl": 6.740177154541016, "learning_rate": 5e-06, "logits/chosen": -61907012.92307692, "logits/rejected": -56968302.54545455, "logps/chosen": -440.6944110576923, "logps/rejected": -592.2461381392045, "loss": 0.035, "rewards/chosen": 7.263149554912861, "rewards/margins": 18.913061608801357, "rewards/rejected": -11.649912053888494, "step": 856 }, { "epoch": 0.2144376329288127, "grad_norm": 13.3125, "kl": 2.8779256343841553, "learning_rate": 5e-06, "logits/chosen": -29195834.181818184, "logits/rejected": -43359773.538461536, "logps/chosen": -240.44322620738637, "logps/rejected": -551.4405048076923, "loss": 0.0979, "rewards/chosen": 4.928837862881747, "rewards/margins": 12.499620450960172, "rewards/rejected": -7.570782588078425, "step": 857 }, { "epoch": 0.21468785187038658, "grad_norm": 21.375, "kl": 3.101168394088745, "learning_rate": 5e-06, "logits/chosen": -40155383.46666667, "logits/rejected": -41494887.11111111, "logps/chosen": -339.31598307291665, "logps/rejected": -500.6658528645833, "loss": 0.1539, "rewards/chosen": 4.952131144205729, "rewards/margins": 12.43179711235894, "rewards/rejected": -7.479665968153212, "step": 858 }, { "epoch": 0.21493807081196045, "grad_norm": 15.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37480608.0, "logits/rejected": -24109422.933333334, "logps/chosen": -476.09657118055554, "logps/rejected": -429.8720703125, "loss": 0.039, "rewards/chosen": 7.48978508843316, "rewards/margins": 15.800205824110243, "rewards/rejected": -8.310420735677083, "step": 859 }, { "epoch": 0.21518828975353435, "grad_norm": 6.09375, "kl": 1.8882615566253662, "learning_rate": 5e-06, "logits/chosen": -34047748.92307692, "logits/rejected": -49096768.0, "logps/chosen": -265.89954552283655, "logps/rejected": -702.7535955255681, "loss": 0.0673, "rewards/chosen": 4.728611872746394, "rewards/margins": 17.00692941092111, "rewards/rejected": -12.278317538174717, "step": 860 }, { "epoch": 0.21543850869510822, "grad_norm": 8.3125, "kl": 3.954415798187256, "learning_rate": 5e-06, "logits/chosen": -44509050.666666664, "logits/rejected": -62676245.333333336, "logps/chosen": -489.1415201822917, "logps/rejected": -654.4629720052084, "loss": 0.0385, "rewards/chosen": 8.263253529866537, "rewards/margins": 18.790645599365234, "rewards/rejected": -10.527392069498697, "step": 861 }, { "epoch": 0.2156887276366821, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35458925.333333336, "logits/rejected": -49140213.333333336, "logps/chosen": -287.7821451822917, "logps/rejected": -480.912841796875, "loss": 0.0332, "rewards/chosen": 5.380205154418945, "rewards/margins": 13.390286763509115, "rewards/rejected": -8.01008160909017, "step": 862 }, { "epoch": 0.21593894657825596, "grad_norm": 20.625, "kl": 11.090689659118652, "learning_rate": 5e-06, "logits/chosen": -69584402.28571428, "logits/rejected": -55375321.6, "logps/chosen": -552.8187081473214, "logps/rejected": -499.70009765625, "loss": 0.0499, "rewards/chosen": 8.731765747070312, "rewards/margins": 16.728807067871095, "rewards/rejected": -7.997041320800781, "step": 863 }, { "epoch": 0.21618916551982986, "grad_norm": 17.375, "kl": 10.584943771362305, "learning_rate": 5e-06, "logits/chosen": -66684266.666666664, "logits/rejected": -27548413.333333332, "logps/chosen": -357.3514811197917, "logps/rejected": -520.56103515625, "loss": 0.144, "rewards/chosen": 6.678855895996094, "rewards/margins": 15.041339874267578, "rewards/rejected": -8.362483978271484, "step": 864 }, { "epoch": 0.21643938446140373, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48434244.571428575, "logits/rejected": -45303168.0, "logps/chosen": -338.2197963169643, "logps/rejected": -618.5232421875, "loss": 0.0694, "rewards/chosen": 5.867035457066128, "rewards/margins": 15.076931544712611, "rewards/rejected": -9.209896087646484, "step": 865 }, { "epoch": 0.2166896034029776, "grad_norm": 20.125, "kl": 6.285125255584717, "learning_rate": 5e-06, "logits/chosen": -42835514.666666664, "logits/rejected": -60667349.333333336, "logps/chosen": -536.45751953125, "logps/rejected": -662.5782877604166, "loss": 0.0821, "rewards/chosen": 8.507424672444662, "rewards/margins": 17.059466679890953, "rewards/rejected": -8.552042007446289, "step": 866 }, { "epoch": 0.21693982234455148, "grad_norm": 19.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51094688.0, "logits/rejected": -14992970.666666666, "logps/chosen": -414.3006998697917, "logps/rejected": -399.5585123697917, "loss": 0.0872, "rewards/chosen": 5.1931107838948565, "rewards/margins": 11.766401290893555, "rewards/rejected": -6.573290506998698, "step": 867 }, { "epoch": 0.21719004128612535, "grad_norm": 6.0, "kl": 2.2430219650268555, "learning_rate": 5e-06, "logits/chosen": -55216610.90909091, "logits/rejected": -36635756.307692304, "logps/chosen": -386.10751065340907, "logps/rejected": -446.90685096153845, "loss": 0.0384, "rewards/chosen": 6.514386263760653, "rewards/margins": 12.784233253318947, "rewards/rejected": -6.269846989558293, "step": 868 }, { "epoch": 0.21744026022769924, "grad_norm": 8.5625, "kl": 7.486400127410889, "learning_rate": 5e-06, "logits/chosen": -52908590.54545455, "logits/rejected": -9853538.461538462, "logps/chosen": -357.14228959517044, "logps/rejected": -739.2422626201923, "loss": 0.0766, "rewards/chosen": 6.715452714399858, "rewards/margins": 18.655198103898055, "rewards/rejected": -11.939745389498198, "step": 869 }, { "epoch": 0.21769047916927312, "grad_norm": 2.296875, "kl": 7.726406097412109, "learning_rate": 5e-06, "logits/chosen": -68999123.6923077, "logits/rejected": -39331319.27272727, "logps/chosen": -524.8665114182693, "logps/rejected": -500.24636008522725, "loss": 0.005, "rewards/chosen": 8.946279672475962, "rewards/margins": 18.330212999890733, "rewards/rejected": -9.383933327414773, "step": 870 }, { "epoch": 0.217940698110847, "grad_norm": 8.3125, "kl": 2.492487668991089, "learning_rate": 5e-06, "logits/chosen": -39259072.0, "logits/rejected": -44220557.71428572, "logps/chosen": -421.684375, "logps/rejected": -674.5276227678571, "loss": 0.0204, "rewards/chosen": 7.777132415771485, "rewards/margins": 18.12573689052037, "rewards/rejected": -10.348604474748884, "step": 871 }, { "epoch": 0.21819091705242086, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70753427.2, "logits/rejected": -62200672.0, "logps/chosen": -350.109130859375, "logps/rejected": -551.241943359375, "loss": 0.039, "rewards/chosen": 5.932463073730469, "rewards/margins": 15.67813720703125, "rewards/rejected": -9.745674133300781, "step": 872 }, { "epoch": 0.21844113599399476, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57096768.0, "logits/rejected": -26418562.285714287, "logps/chosen": -392.160498046875, "logps/rejected": -668.8669084821429, "loss": 0.0061, "rewards/chosen": 6.697810363769531, "rewards/margins": 19.36263253348214, "rewards/rejected": -12.664822169712611, "step": 873 }, { "epoch": 0.21869135493556863, "grad_norm": 29.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32409620.363636363, "logits/rejected": -46671035.07692308, "logps/chosen": -299.3151189630682, "logps/rejected": -512.1787109375, "loss": 0.0735, "rewards/chosen": 5.699134826660156, "rewards/margins": 12.853279700646034, "rewards/rejected": -7.154144873985877, "step": 874 }, { "epoch": 0.2189415738771425, "grad_norm": 13.0625, "kl": 2.6314876079559326, "learning_rate": 5e-06, "logits/chosen": -23004838.85714286, "logits/rejected": -37200979.2, "logps/chosen": -337.72593470982144, "logps/rejected": -511.9251953125, "loss": 0.0794, "rewards/chosen": 5.1929811750139505, "rewards/margins": 14.745045689174106, "rewards/rejected": -9.552064514160156, "step": 875 }, { "epoch": 0.21919179281871637, "grad_norm": 5.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37902272.0, "logits/rejected": -31997494.153846152, "logps/chosen": -285.9669744318182, "logps/rejected": -628.6466346153846, "loss": 0.0439, "rewards/chosen": 4.861722772771662, "rewards/margins": 16.22666840453248, "rewards/rejected": -11.364945631760817, "step": 876 }, { "epoch": 0.21944201176029027, "grad_norm": 8.6875, "kl": 7.84165096282959, "learning_rate": 5e-06, "logits/chosen": -28804657.230769232, "logits/rejected": -28111429.818181816, "logps/chosen": -428.36099008413464, "logps/rejected": -415.67578125, "loss": 0.078, "rewards/chosen": 6.948799720177283, "rewards/margins": 15.052854257863718, "rewards/rejected": -8.104054537686435, "step": 877 }, { "epoch": 0.21969223070186414, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46892076.307692304, "logits/rejected": -50200855.27272727, "logps/chosen": -375.0295973557692, "logps/rejected": -546.7670010653409, "loss": 0.053, "rewards/chosen": 7.37348879300631, "rewards/margins": 14.722526656997788, "rewards/rejected": -7.3490378639914775, "step": 878 }, { "epoch": 0.219942449643438, "grad_norm": 3.015625, "kl": 0.28276318311691284, "learning_rate": 5e-06, "logits/chosen": -35701813.333333336, "logits/rejected": -33751249.06666667, "logps/chosen": -544.9024522569445, "logps/rejected": -461.6625, "loss": 0.0193, "rewards/chosen": 7.179185655381945, "rewards/margins": 16.952802191840277, "rewards/rejected": -9.773616536458333, "step": 879 }, { "epoch": 0.22019266858501188, "grad_norm": 10.5625, "kl": 5.377200126647949, "learning_rate": 5e-06, "logits/chosen": -51452160.0, "logits/rejected": -48970146.90909091, "logps/chosen": -417.5757587139423, "logps/rejected": -558.3232421875, "loss": 0.0537, "rewards/chosen": 7.008513817420373, "rewards/margins": 16.327812301529036, "rewards/rejected": -9.319298484108664, "step": 880 }, { "epoch": 0.22044288752658575, "grad_norm": 12.9375, "kl": 0.08029492944478989, "learning_rate": 5e-06, "logits/chosen": -69940356.92307693, "logits/rejected": -55195752.72727273, "logps/chosen": -351.60486778846155, "logps/rejected": -509.8441051136364, "loss": 0.0734, "rewards/chosen": 6.0399627685546875, "rewards/margins": 14.784964821555398, "rewards/rejected": -8.74500205300071, "step": 881 }, { "epoch": 0.22069310646815965, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61429606.4, "logits/rejected": -29648416.0, "logps/chosen": -308.7400390625, "logps/rejected": -384.663818359375, "loss": 0.0919, "rewards/chosen": 4.954191080729166, "rewards/margins": 13.243031480577258, "rewards/rejected": -8.288840399848091, "step": 882 }, { "epoch": 0.22094332540973352, "grad_norm": 10.25, "kl": 0.22380320727825165, "learning_rate": 5e-06, "logits/chosen": -46709013.333333336, "logits/rejected": -51381120.0, "logps/chosen": -436.3108723958333, "logps/rejected": -573.4994710286459, "loss": 0.0192, "rewards/chosen": 7.917387008666992, "rewards/margins": 16.842842737833656, "rewards/rejected": -8.925455729166666, "step": 883 }, { "epoch": 0.2211935443513074, "grad_norm": 15.6875, "kl": 7.116348743438721, "learning_rate": 5e-06, "logits/chosen": -100355675.42857143, "logits/rejected": -63054739.2, "logps/chosen": -393.7709263392857, "logps/rejected": -600.83974609375, "loss": 0.0716, "rewards/chosen": 6.167784009660993, "rewards/margins": 17.562579236711777, "rewards/rejected": -11.394795227050782, "step": 884 }, { "epoch": 0.22144376329288126, "grad_norm": 8.9375, "kl": 0.2906290888786316, "learning_rate": 5e-06, "logits/chosen": -49279701.333333336, "logits/rejected": -61520213.333333336, "logps/chosen": -433.4287109375, "logps/rejected": -671.3612630208333, "loss": 0.0261, "rewards/chosen": 8.179108513726128, "rewards/margins": 20.204544915093315, "rewards/rejected": -12.025436401367188, "step": 885 }, { "epoch": 0.22169398223445516, "grad_norm": 3.71875, "kl": 6.281719207763672, "learning_rate": 5e-06, "logits/chosen": -45588474.666666664, "logits/rejected": -14956497.333333334, "logps/chosen": -486.5383707682292, "logps/rejected": -812.0821126302084, "loss": 0.0052, "rewards/chosen": 8.805744171142578, "rewards/margins": 21.97202173868815, "rewards/rejected": -13.166277567545572, "step": 886 }, { "epoch": 0.22194420117602903, "grad_norm": 5.0625, "kl": 0.26970165967941284, "learning_rate": 5e-06, "logits/chosen": -52520635.07692308, "logits/rejected": -49551418.18181818, "logps/chosen": -440.88882211538464, "logps/rejected": -610.7002840909091, "loss": 0.0227, "rewards/chosen": 6.061122600848858, "rewards/margins": 18.8501601319213, "rewards/rejected": -12.789037531072443, "step": 887 }, { "epoch": 0.2221944201176029, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73012264.72727273, "logits/rejected": -51247315.692307696, "logps/chosen": -331.8717595880682, "logps/rejected": -552.4230769230769, "loss": 0.1024, "rewards/chosen": 4.700930508700284, "rewards/margins": 16.03623183457168, "rewards/rejected": -11.335301325871395, "step": 888 }, { "epoch": 0.22244463905917677, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39887114.666666664, "logits/rejected": -41164781.333333336, "logps/chosen": -298.58392333984375, "logps/rejected": -374.5581868489583, "loss": 0.1278, "rewards/chosen": 3.9807020823160806, "rewards/margins": 11.933468500773111, "rewards/rejected": -7.952766418457031, "step": 889 }, { "epoch": 0.22269485800075065, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65502884.571428575, "logits/rejected": -40199883.294117644, "logps/chosen": -483.1144321986607, "logps/rejected": -476.2304113051471, "loss": 0.014, "rewards/chosen": 8.052347455705915, "rewards/margins": 17.155099243676965, "rewards/rejected": -9.102751787971048, "step": 890 }, { "epoch": 0.22294507694232454, "grad_norm": 26.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54301969.06666667, "logits/rejected": -79465927.1111111, "logps/chosen": -337.78675130208336, "logps/rejected": -579.8811848958334, "loss": 0.0692, "rewards/chosen": 5.206717936197917, "rewards/margins": 13.366334194607205, "rewards/rejected": -8.159616258409288, "step": 891 }, { "epoch": 0.22319529588389841, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74470363.42857143, "logits/rejected": -10162648.470588235, "logps/chosen": -439.63553292410717, "logps/rejected": -639.7953814338235, "loss": 0.0697, "rewards/chosen": 5.8232863289969305, "rewards/margins": 17.13429904585125, "rewards/rejected": -11.31101271685432, "step": 892 }, { "epoch": 0.22344551482547229, "grad_norm": 9.8125, "kl": 4.602041721343994, "learning_rate": 5e-06, "logits/chosen": -68313344.0, "logits/rejected": -31684557.714285713, "logps/chosen": -404.32275390625, "logps/rejected": -411.10682896205356, "loss": 0.04, "rewards/chosen": 5.589457702636719, "rewards/margins": 14.57886701311384, "rewards/rejected": -8.98940931047712, "step": 893 }, { "epoch": 0.22369573376704616, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61931227.428571425, "logits/rejected": -43959251.2, "logps/chosen": -382.4054478236607, "logps/rejected": -673.0078125, "loss": 0.0844, "rewards/chosen": 4.8950685773577005, "rewards/margins": 16.836561802455357, "rewards/rejected": -11.941493225097656, "step": 894 }, { "epoch": 0.22394595270862006, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61059333.81818182, "logits/rejected": -53136452.92307692, "logps/chosen": -326.50106534090907, "logps/rejected": -629.8517127403846, "loss": 0.0338, "rewards/chosen": 5.8683554909446025, "rewards/margins": 18.607122674688593, "rewards/rejected": -12.73876718374399, "step": 895 }, { "epoch": 0.22419617165019393, "grad_norm": 1.7734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30159670.85714286, "logits/rejected": -59443648.0, "logps/chosen": -354.20894949776783, "logps/rejected": -581.8898782169117, "loss": 0.0175, "rewards/chosen": 5.0240968976702005, "rewards/margins": 19.113194762157793, "rewards/rejected": -14.089097864487591, "step": 896 }, { "epoch": 0.2244463905917678, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32034229.333333332, "logits/rejected": -59916186.666666664, "logps/chosen": -406.53369140625, "logps/rejected": -576.3888346354166, "loss": 0.0936, "rewards/chosen": 5.919286727905273, "rewards/margins": 16.343662897745766, "rewards/rejected": -10.424376169840494, "step": 897 }, { "epoch": 0.22469660953334167, "grad_norm": 3.109375, "kl": 1.7369754314422607, "learning_rate": 5e-06, "logits/chosen": -51159249.06666667, "logits/rejected": -34830954.666666664, "logps/chosen": -435.738671875, "logps/rejected": -599.8976236979166, "loss": 0.0213, "rewards/chosen": 6.416150919596354, "rewards/margins": 19.040545654296874, "rewards/rejected": -12.624394734700521, "step": 898 }, { "epoch": 0.22494682847491554, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41043462.4, "logits/rejected": -45398747.428571425, "logps/chosen": -479.7412109375, "logps/rejected": -527.4547293526786, "loss": 0.0568, "rewards/chosen": 4.900117492675781, "rewards/margins": 17.14068647112165, "rewards/rejected": -12.24056897844587, "step": 899 }, { "epoch": 0.22519704741648944, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39985076.0, "logits/rejected": -35510740.0, "logps/chosen": -324.8722229003906, "logps/rejected": -546.1305541992188, "loss": 0.078, "rewards/chosen": 4.132997989654541, "rewards/margins": 14.69203519821167, "rewards/rejected": -10.559037208557129, "step": 900 }, { "epoch": 0.2254472663580633, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48580533.333333336, "logits/rejected": -67845529.6, "logps/chosen": -359.9342447916667, "logps/rejected": -636.8192708333333, "loss": 0.0472, "rewards/chosen": 6.685011545817058, "rewards/margins": 21.08140640258789, "rewards/rejected": -14.396394856770833, "step": 901 }, { "epoch": 0.22569748529963718, "grad_norm": 19.125, "kl": 8.682260513305664, "learning_rate": 5e-06, "logits/chosen": -79313547.63636364, "logits/rejected": -39524388.92307692, "logps/chosen": -517.0273881392045, "logps/rejected": -398.0355694110577, "loss": 0.0731, "rewards/chosen": 6.073057001287287, "rewards/margins": 14.0310923969829, "rewards/rejected": -7.958035395695613, "step": 902 }, { "epoch": 0.22594770424121105, "grad_norm": 10.1875, "kl": 6.758369445800781, "learning_rate": 5e-06, "logits/chosen": 107983008.0, "logits/rejected": -21591917.333333332, "logps/chosen": -588.9263916015625, "logps/rejected": -442.9207763671875, "loss": 0.027, "rewards/chosen": 7.587039311726888, "rewards/margins": 14.34028434753418, "rewards/rejected": -6.753245035807292, "step": 903 }, { "epoch": 0.22619792318278495, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49310124.307692304, "logits/rejected": -48749812.36363637, "logps/chosen": -461.45004507211536, "logps/rejected": -566.53515625, "loss": 0.0274, "rewards/chosen": 7.519031231219952, "rewards/margins": 19.996614682924495, "rewards/rejected": -12.477583451704545, "step": 904 }, { "epoch": 0.22644814212435882, "grad_norm": 8.5, "kl": 4.311858177185059, "learning_rate": 5e-06, "logits/chosen": -44651034.666666664, "logits/rejected": 555985.3333333334, "logps/chosen": -516.0038655598959, "logps/rejected": -644.2854817708334, "loss": 0.0378, "rewards/chosen": 7.955541610717773, "rewards/margins": 18.99603716532389, "rewards/rejected": -11.04049555460612, "step": 905 }, { "epoch": 0.2266983610659327, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39005595.428571425, "logits/rejected": -50134841.6, "logps/chosen": -339.9086216517857, "logps/rejected": -593.230859375, "loss": 0.0857, "rewards/chosen": 4.672640664236886, "rewards/margins": 17.40224805559431, "rewards/rejected": -12.729607391357423, "step": 906 }, { "epoch": 0.22694858000750656, "grad_norm": 12.75, "kl": 13.265981674194336, "learning_rate": 5e-06, "logits/chosen": -76839330.46153846, "logits/rejected": -21663102.545454547, "logps/chosen": -380.22348257211536, "logps/rejected": -431.7615855823864, "loss": 0.0862, "rewards/chosen": 5.711649968073918, "rewards/margins": 13.704359734808648, "rewards/rejected": -7.99270976673473, "step": 907 }, { "epoch": 0.22719879894908043, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54960480.0, "logits/rejected": -42284340.0, "logps/chosen": -424.51910400390625, "logps/rejected": -657.2796630859375, "loss": 0.0289, "rewards/chosen": 7.0018310546875, "rewards/margins": 19.016210556030273, "rewards/rejected": -12.014379501342773, "step": 908 }, { "epoch": 0.22744901789065433, "grad_norm": 5.5625, "kl": 1.0971851348876953, "learning_rate": 5e-06, "logits/chosen": -51318132.705882356, "logits/rejected": -80645988.57142857, "logps/chosen": -408.4037511488971, "logps/rejected": -456.4017857142857, "loss": 0.0471, "rewards/chosen": 6.688805972828584, "rewards/margins": 13.419990796001017, "rewards/rejected": -6.731184823172433, "step": 909 }, { "epoch": 0.2276992368322282, "grad_norm": 10.3125, "kl": 6.374741554260254, "learning_rate": 5e-06, "logits/chosen": -70748445.53846154, "logits/rejected": -33107482.181818184, "logps/chosen": -465.82068810096155, "logps/rejected": -380.7496448863636, "loss": 0.054, "rewards/chosen": 6.5722527137169475, "rewards/margins": 14.515082192587686, "rewards/rejected": -7.942829478870738, "step": 910 }, { "epoch": 0.22794945577380207, "grad_norm": 12.6875, "kl": 15.299120903015137, "learning_rate": 5e-06, "logits/chosen": -80027810.46153846, "logits/rejected": -40516040.72727273, "logps/chosen": -503.6709735576923, "logps/rejected": -594.6040482954545, "loss": 0.0937, "rewards/chosen": 6.277316753680889, "rewards/margins": 17.76777723619154, "rewards/rejected": -11.490460482510654, "step": 911 }, { "epoch": 0.22819967471537594, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37288764.44444445, "logits/rejected": -15200901.333333334, "logps/chosen": -422.58867730034723, "logps/rejected": -550.1824869791667, "loss": 0.0301, "rewards/chosen": 7.073209126790364, "rewards/margins": 19.02741444905599, "rewards/rejected": -11.954205322265626, "step": 912 }, { "epoch": 0.22844989365694984, "grad_norm": 0.7578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63073976.88888889, "logits/rejected": -72308906.66666667, "logps/chosen": -467.6124674479167, "logps/rejected": -547.29296875, "loss": 0.002, "rewards/chosen": 6.477109273274739, "rewards/margins": 17.309584045410155, "rewards/rejected": -10.832474772135416, "step": 913 }, { "epoch": 0.2287001125985237, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47196632.615384616, "logits/rejected": -52435991.27272727, "logps/chosen": -355.9587214543269, "logps/rejected": -663.9367897727273, "loss": 0.0601, "rewards/chosen": 5.280397268442007, "rewards/margins": 19.18201137089229, "rewards/rejected": -13.901614102450283, "step": 914 }, { "epoch": 0.22895033154009758, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37041440.0, "logits/rejected": -47587936.0, "logps/chosen": -357.572021484375, "logps/rejected": -557.8770751953125, "loss": 0.0828, "rewards/chosen": 6.3396453857421875, "rewards/margins": 15.498096466064453, "rewards/rejected": -9.158451080322266, "step": 915 }, { "epoch": 0.22920055048167146, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41357060.0, "logits/rejected": -70418416.0, "logps/chosen": -546.7437133789062, "logps/rejected": -653.62451171875, "loss": 0.018, "rewards/chosen": 7.156656742095947, "rewards/margins": 19.993201732635498, "rewards/rejected": -12.83654499053955, "step": 916 }, { "epoch": 0.22945076942324533, "grad_norm": 5.21875, "kl": 6.046242713928223, "learning_rate": 5e-06, "logits/chosen": -49670478.222222224, "logits/rejected": -48864304.0, "logps/chosen": -411.0027126736111, "logps/rejected": -613.636962890625, "loss": 0.0537, "rewards/chosen": 6.228641086154514, "rewards/margins": 21.116777631971573, "rewards/rejected": -14.888136545817057, "step": 917 }, { "epoch": 0.22970098836481923, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47282772.0, "logits/rejected": -62508904.0, "logps/chosen": -373.3050842285156, "logps/rejected": -704.8584594726562, "loss": 0.024, "rewards/chosen": 6.87139892578125, "rewards/margins": 19.98058795928955, "rewards/rejected": -13.1091890335083, "step": 918 }, { "epoch": 0.2299512073063931, "grad_norm": 16.125, "kl": 3.070117950439453, "learning_rate": 5e-06, "logits/chosen": -43018977.88235294, "logits/rejected": -46294806.85714286, "logps/chosen": -356.7622931985294, "logps/rejected": -515.0734165736607, "loss": 0.0473, "rewards/chosen": 7.065644208122702, "rewards/margins": 15.731119236024488, "rewards/rejected": -8.665475027901786, "step": 919 }, { "epoch": 0.23020142624796697, "grad_norm": 8.0, "kl": 6.292427062988281, "learning_rate": 5e-06, "logits/chosen": -62030030.76923077, "logits/rejected": -56004939.63636363, "logps/chosen": -503.7373046875, "logps/rejected": -794.2305575284091, "loss": 0.0433, "rewards/chosen": 7.658596332256611, "rewards/margins": 21.78488500635107, "rewards/rejected": -14.12628867409446, "step": 920 }, { "epoch": 0.23045164518954084, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77693888.0, "logits/rejected": -34540572.0, "logps/chosen": -418.0924377441406, "logps/rejected": -568.9324951171875, "loss": 0.0102, "rewards/chosen": 5.300932884216309, "rewards/margins": 16.795610427856445, "rewards/rejected": -11.494677543640137, "step": 921 }, { "epoch": 0.23070186413111474, "grad_norm": 13.3125, "kl": 1.3069236278533936, "learning_rate": 5e-06, "logits/chosen": -50080742.4, "logits/rejected": -42146915.55555555, "logps/chosen": -327.55035807291665, "logps/rejected": -795.9128146701389, "loss": 0.0349, "rewards/chosen": 5.24858652750651, "rewards/margins": 22.489956834581164, "rewards/rejected": -17.241370307074654, "step": 922 }, { "epoch": 0.2309520830726886, "grad_norm": 11.375, "kl": 3.664954662322998, "learning_rate": 5e-06, "logits/chosen": -59110253.71428572, "logits/rejected": -43279382.4, "logps/chosen": -482.2735072544643, "logps/rejected": -578.491845703125, "loss": 0.0541, "rewards/chosen": 7.756150381905692, "rewards/margins": 17.201266043526786, "rewards/rejected": -9.445115661621093, "step": 923 }, { "epoch": 0.23120230201426248, "grad_norm": 21.75, "kl": 5.4658613204956055, "learning_rate": 5e-06, "logits/chosen": -37039584.0, "logits/rejected": -32755747.2, "logps/chosen": -325.296875, "logps/rejected": -422.929296875, "loss": 0.0793, "rewards/chosen": 6.870804922921317, "rewards/margins": 13.902114432198662, "rewards/rejected": -7.031309509277344, "step": 924 }, { "epoch": 0.23145252095583635, "grad_norm": 12.3125, "kl": 0.8870315551757812, "learning_rate": 5e-06, "logits/chosen": -68504384.0, "logits/rejected": -48437780.36363637, "logps/chosen": -473.87905649038464, "logps/rejected": -553.9419833096591, "loss": 0.0419, "rewards/chosen": 7.379144521859976, "rewards/margins": 14.978341909555288, "rewards/rejected": -7.5991973876953125, "step": 925 }, { "epoch": 0.23170273989741022, "grad_norm": 15.4375, "kl": 8.738922119140625, "learning_rate": 5e-06, "logits/chosen": -38038200.0, "logits/rejected": -61981349.333333336, "logps/chosen": -536.9969889322916, "logps/rejected": -495.327880859375, "loss": 0.0265, "rewards/chosen": 8.034940083821615, "rewards/margins": 18.238653818766277, "rewards/rejected": -10.203713734944662, "step": 926 }, { "epoch": 0.23195295883898412, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38799692.8, "logits/rejected": -62651832.88888889, "logps/chosen": -359.4570638020833, "logps/rejected": -483.93896484375, "loss": 0.0473, "rewards/chosen": 7.463754272460937, "rewards/margins": 17.33887464735243, "rewards/rejected": -9.875120374891493, "step": 927 }, { "epoch": 0.232203177780558, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36857338.18181818, "logits/rejected": -50456064.0, "logps/chosen": -367.7675115411932, "logps/rejected": -576.2898137019231, "loss": 0.038, "rewards/chosen": 6.119823109019887, "rewards/margins": 16.769954334605824, "rewards/rejected": -10.650131225585938, "step": 928 }, { "epoch": 0.23245339672213186, "grad_norm": 12.0625, "kl": 9.128301620483398, "learning_rate": 5e-06, "logits/chosen": -25711554.285714287, "logits/rejected": -51162745.6, "logps/chosen": -547.6895228794643, "logps/rejected": -526.092236328125, "loss": 0.0739, "rewards/chosen": 8.874160766601562, "rewards/margins": 20.233805847167968, "rewards/rejected": -11.359645080566406, "step": 929 }, { "epoch": 0.23270361566370573, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31915549.09090909, "logits/rejected": -56411136.0, "logps/chosen": -489.5325816761364, "logps/rejected": -482.1345402644231, "loss": 0.0405, "rewards/chosen": 7.253825794566762, "rewards/margins": 14.993141120963998, "rewards/rejected": -7.739315326397236, "step": 930 }, { "epoch": 0.23295383460527963, "grad_norm": 14.125, "kl": 0.4628610610961914, "learning_rate": 5e-06, "logits/chosen": -41247953.777777776, "logits/rejected": -62558429.86666667, "logps/chosen": -307.1012369791667, "logps/rejected": -483.59290364583336, "loss": 0.0932, "rewards/chosen": 5.360449896918403, "rewards/margins": 11.433313666449653, "rewards/rejected": -6.07286376953125, "step": 931 }, { "epoch": 0.2332040535468535, "grad_norm": 4.625, "kl": 3.119382619857788, "learning_rate": 5e-06, "logits/chosen": -37342084.0, "logits/rejected": -26470720.0, "logps/chosen": -395.2557678222656, "logps/rejected": -451.6656188964844, "loss": 0.056, "rewards/chosen": 6.482509136199951, "rewards/margins": 13.495808124542236, "rewards/rejected": -7.013298988342285, "step": 932 }, { "epoch": 0.23345427248842737, "grad_norm": 25.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56840908.8, "logits/rejected": 5934252.571428572, "logps/chosen": -299.0068115234375, "logps/rejected": -649.7470703125, "loss": 0.0555, "rewards/chosen": 6.617679595947266, "rewards/margins": 14.541585104806082, "rewards/rejected": -7.923905508858817, "step": 933 }, { "epoch": 0.23370449143000124, "grad_norm": 16.125, "kl": 2.8662619590759277, "learning_rate": 5e-06, "logits/chosen": -52624208.0, "logits/rejected": -57799962.666666664, "logps/chosen": -411.8450520833333, "logps/rejected": -528.4455973307291, "loss": 0.0467, "rewards/chosen": 7.910139719645183, "rewards/margins": 17.80528513590495, "rewards/rejected": -9.895145416259766, "step": 934 }, { "epoch": 0.23395471037157514, "grad_norm": 2.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41030153.6, "logits/rejected": -66448649.14285714, "logps/chosen": -330.2591064453125, "logps/rejected": -728.6985212053571, "loss": 0.0179, "rewards/chosen": 6.134889221191406, "rewards/margins": 19.477854483468192, "rewards/rejected": -13.342965262276786, "step": 935 }, { "epoch": 0.234204929313149, "grad_norm": 8.875, "kl": 0.379302978515625, "learning_rate": 5e-06, "logits/chosen": -54399656.72727273, "logits/rejected": -26842806.153846152, "logps/chosen": -366.34565873579544, "logps/rejected": -421.78463040865387, "loss": 0.0533, "rewards/chosen": 6.0342885797674, "rewards/margins": 13.81286535729895, "rewards/rejected": -7.77857677753155, "step": 936 }, { "epoch": 0.23445514825472288, "grad_norm": 7.6875, "kl": 6.206681728363037, "learning_rate": 5e-06, "logits/chosen": -44843712.0, "logits/rejected": -45088960.0, "logps/chosen": -400.8680889423077, "logps/rejected": -681.0548206676136, "loss": 0.0353, "rewards/chosen": 6.523436913123498, "rewards/margins": 15.86996123840759, "rewards/rejected": -9.346524325284092, "step": 937 }, { "epoch": 0.23470536719629675, "grad_norm": 9.5, "kl": 2.585693836212158, "learning_rate": 5e-06, "logits/chosen": -29395781.333333332, "logits/rejected": -54385114.666666664, "logps/chosen": -352.1145833333333, "logps/rejected": -508.311767578125, "loss": 0.0772, "rewards/chosen": 5.958161036173503, "rewards/margins": 14.334941228230793, "rewards/rejected": -8.376780192057291, "step": 938 }, { "epoch": 0.23495558613787063, "grad_norm": 13.625, "kl": 9.4812650680542, "learning_rate": 5e-06, "logits/chosen": -58789984.0, "logits/rejected": -37406240.0, "logps/chosen": -565.1275227864584, "logps/rejected": -404.2013753255208, "loss": 0.0299, "rewards/chosen": 8.59155527750651, "rewards/margins": 16.36270968119303, "rewards/rejected": -7.771154403686523, "step": 939 }, { "epoch": 0.23520580507944452, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46891365.333333336, "logits/rejected": -48070080.0, "logps/chosen": -480.721435546875, "logps/rejected": -499.0227864583333, "loss": 0.045, "rewards/chosen": 6.095334370930989, "rewards/margins": 17.481115976969402, "rewards/rejected": -11.385781606038412, "step": 940 }, { "epoch": 0.2354560240210184, "grad_norm": 14.875, "kl": 11.093153953552246, "learning_rate": 5e-06, "logits/chosen": -42544972.8, "logits/rejected": -45334599.11111111, "logps/chosen": -417.0918294270833, "logps/rejected": -573.8923611111111, "loss": 0.053, "rewards/chosen": 6.57138671875, "rewards/margins": 15.85594702826606, "rewards/rejected": -9.28456030951606, "step": 941 }, { "epoch": 0.23570624296259227, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43262704.0, "logits/rejected": -63748053.333333336, "logps/chosen": -323.8709716796875, "logps/rejected": -680.2295328776041, "loss": 0.0387, "rewards/chosen": 5.177082061767578, "rewards/margins": 17.591632843017578, "rewards/rejected": -12.41455078125, "step": 942 }, { "epoch": 0.23595646190416614, "grad_norm": 15.375, "kl": 1.3339195251464844, "learning_rate": 5e-06, "logits/chosen": -66854378.666666664, "logits/rejected": -38074235.733333334, "logps/chosen": -495.10205078125, "logps/rejected": -492.8537109375, "loss": 0.0346, "rewards/chosen": 8.158946567111546, "rewards/margins": 16.75756310356988, "rewards/rejected": -8.598616536458334, "step": 943 }, { "epoch": 0.23620668084574004, "grad_norm": 8.8125, "kl": 2.094184637069702, "learning_rate": 5e-06, "logits/chosen": -25042998.85714286, "logits/rejected": -41569836.8, "logps/chosen": -299.8716517857143, "logps/rejected": -558.7623046875, "loss": 0.0933, "rewards/chosen": 6.422407967703683, "rewards/margins": 16.13129163469587, "rewards/rejected": -9.708883666992188, "step": 944 }, { "epoch": 0.2364568997873139, "grad_norm": 13.625, "kl": 5.683592796325684, "learning_rate": 5e-06, "logits/chosen": -53061714.28571428, "logits/rejected": -51149849.6, "logps/chosen": -311.28086635044644, "logps/rejected": -545.0552734375, "loss": 0.1163, "rewards/chosen": 4.430931636265346, "rewards/margins": 14.698024531773157, "rewards/rejected": -10.267092895507812, "step": 945 }, { "epoch": 0.23670711872888778, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43230976.0, "logits/rejected": -70082609.77777778, "logps/chosen": -323.52900390625, "logps/rejected": -666.9175889756945, "loss": 0.0676, "rewards/chosen": 5.542985534667968, "rewards/margins": 15.466888258192274, "rewards/rejected": -9.923902723524305, "step": 946 }, { "epoch": 0.23695733767046165, "grad_norm": 3.65625, "kl": 5.355119228363037, "learning_rate": 5e-06, "logits/chosen": -53822139.07692308, "logits/rejected": -59451601.45454545, "logps/chosen": -516.5736177884615, "logps/rejected": -628.6519886363636, "loss": 0.0175, "rewards/chosen": 7.771853520320012, "rewards/margins": 18.58540915109061, "rewards/rejected": -10.813555630770596, "step": 947 }, { "epoch": 0.23720755661203552, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40924457.84615385, "logits/rejected": -57772125.09090909, "logps/chosen": -313.2546198918269, "logps/rejected": -609.9730557528409, "loss": 0.0595, "rewards/chosen": 6.1494627732497, "rewards/margins": 14.070502007757867, "rewards/rejected": -7.921039234508168, "step": 948 }, { "epoch": 0.23745777555360942, "grad_norm": 8.0, "kl": 7.103673458099365, "learning_rate": 5e-06, "logits/chosen": -69940736.0, "logits/rejected": -33472898.285714287, "logps/chosen": -363.793603515625, "logps/rejected": -411.0101841517857, "loss": 0.0329, "rewards/chosen": 5.684453964233398, "rewards/margins": 14.026391983032227, "rewards/rejected": -8.341938018798828, "step": 949 }, { "epoch": 0.2377079944951833, "grad_norm": 13.8125, "kl": 10.972240447998047, "learning_rate": 5e-06, "logits/chosen": -55315656.0, "logits/rejected": -37758380.0, "logps/chosen": -462.2991943359375, "logps/rejected": -372.3765869140625, "loss": 0.073, "rewards/chosen": 6.486313819885254, "rewards/margins": 12.432631492614746, "rewards/rejected": -5.946317672729492, "step": 950 }, { "epoch": 0.23795821343675716, "grad_norm": 11.1875, "kl": 4.412203788757324, "learning_rate": 5e-06, "logits/chosen": -75238116.57142857, "logits/rejected": -58091456.0, "logps/chosen": -400.944580078125, "logps/rejected": -584.079833984375, "loss": 0.0444, "rewards/chosen": 5.991706848144531, "rewards/margins": 15.89676513671875, "rewards/rejected": -9.905058288574219, "step": 951 }, { "epoch": 0.23820843237833103, "grad_norm": 6.5625, "kl": 9.460249900817871, "learning_rate": 5e-06, "logits/chosen": -40347648.0, "logits/rejected": -39134568.0, "logps/chosen": -421.4286295572917, "logps/rejected": -471.579833984375, "loss": 0.0255, "rewards/chosen": 7.644588470458984, "rewards/margins": 12.908722241719563, "rewards/rejected": -5.264133771260579, "step": 952 }, { "epoch": 0.23845865131990493, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77202478.54545455, "logits/rejected": -52970422.15384615, "logps/chosen": -390.42964311079544, "logps/rejected": -525.9840369591346, "loss": 0.0403, "rewards/chosen": 7.254411177201704, "rewards/margins": 16.002279641744973, "rewards/rejected": -8.74786846454327, "step": 953 }, { "epoch": 0.2387088702614788, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11673018.666666666, "logits/rejected": -45780957.86666667, "logps/chosen": -494.92192925347223, "logps/rejected": -638.621875, "loss": 0.0926, "rewards/chosen": 5.703413645426433, "rewards/margins": 13.848574574788412, "rewards/rejected": -8.14516092936198, "step": 954 }, { "epoch": 0.23895908920305267, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41096916.0, "logits/rejected": -38687588.0, "logps/chosen": -372.59002685546875, "logps/rejected": -688.5537109375, "loss": 0.0332, "rewards/chosen": 7.318997859954834, "rewards/margins": 19.08086061477661, "rewards/rejected": -11.761862754821777, "step": 955 }, { "epoch": 0.23920930814462654, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44092472.88888889, "logits/rejected": -30713898.666666668, "logps/chosen": -338.7389322916667, "logps/rejected": -479.07395833333334, "loss": 0.0326, "rewards/chosen": 7.433725145128038, "rewards/margins": 18.19242875840929, "rewards/rejected": -10.75870361328125, "step": 956 }, { "epoch": 0.2394595270862004, "grad_norm": 11.25, "kl": 1.5393956899642944, "learning_rate": 5e-06, "logits/chosen": -83178843.42857143, "logits/rejected": -85053644.8, "logps/chosen": -432.54561941964283, "logps/rejected": -618.793505859375, "loss": 0.0472, "rewards/chosen": 6.379090445382254, "rewards/margins": 14.241289847237724, "rewards/rejected": -7.862199401855468, "step": 957 }, { "epoch": 0.2397097460277743, "grad_norm": 14.625, "kl": 6.892382621765137, "learning_rate": 5e-06, "logits/chosen": -22198784.0, "logits/rejected": -37976570.18181818, "logps/chosen": -394.47964242788464, "logps/rejected": -592.0108753551136, "loss": 0.0754, "rewards/chosen": 6.815983698918269, "rewards/margins": 16.970387385441708, "rewards/rejected": -10.154403686523438, "step": 958 }, { "epoch": 0.23995996496934818, "grad_norm": 7.875, "kl": 1.3100789785385132, "learning_rate": 5e-06, "logits/chosen": -62880742.4, "logits/rejected": -81479222.85714285, "logps/chosen": -326.6181884765625, "logps/rejected": -519.4295828683036, "loss": 0.0941, "rewards/chosen": 3.6879344940185548, "rewards/margins": 11.727349581037249, "rewards/rejected": -8.039415087018694, "step": 959 }, { "epoch": 0.24021018391092205, "grad_norm": 12.8125, "kl": 1.117627501487732, "learning_rate": 5e-06, "logits/chosen": -39224738.461538464, "logits/rejected": -42938368.0, "logps/chosen": -300.0834209735577, "logps/rejected": -358.07601651278407, "loss": 0.0677, "rewards/chosen": 5.270392197829026, "rewards/margins": 10.915350000341455, "rewards/rejected": -5.644957802512429, "step": 960 }, { "epoch": 0.24046040285249592, "grad_norm": 7.09375, "kl": 0.7437850832939148, "learning_rate": 5e-06, "logits/chosen": -11991135.0, "logits/rejected": -60121304.0, "logps/chosen": -354.5306701660156, "logps/rejected": -524.0028076171875, "loss": 0.049, "rewards/chosen": 5.199479103088379, "rewards/margins": 12.830081939697266, "rewards/rejected": -7.630602836608887, "step": 961 }, { "epoch": 0.24071062179406982, "grad_norm": 6.34375, "kl": 5.288510799407959, "learning_rate": 5e-06, "logits/chosen": -52445209.6, "logits/rejected": -49654798.222222224, "logps/chosen": -358.82236328125, "logps/rejected": -596.2080620659722, "loss": 0.0272, "rewards/chosen": 6.363755798339843, "rewards/margins": 17.211896091037325, "rewards/rejected": -10.848140292697483, "step": 962 }, { "epoch": 0.2409608407356437, "grad_norm": 12.75, "kl": 2.5923068523406982, "learning_rate": 5e-06, "logits/chosen": -63866322.823529415, "logits/rejected": -71825179.42857143, "logps/chosen": -425.9490751378676, "logps/rejected": -766.2367466517857, "loss": 0.0681, "rewards/chosen": 6.591006559484145, "rewards/margins": 17.28947980864709, "rewards/rejected": -10.698473249162946, "step": 963 }, { "epoch": 0.24121105967721757, "grad_norm": 13.5, "kl": 7.148531436920166, "learning_rate": 5e-06, "logits/chosen": -67719542.85714285, "logits/rejected": -49603027.2, "logps/chosen": -408.22719029017856, "logps/rejected": -545.6841796875, "loss": 0.0519, "rewards/chosen": 6.195568084716797, "rewards/margins": 15.375952911376952, "rewards/rejected": -9.180384826660156, "step": 964 }, { "epoch": 0.24146127861879144, "grad_norm": 6.96875, "kl": 3.784701108932495, "learning_rate": 5e-06, "logits/chosen": -51663904.0, "logits/rejected": -51272873.14285714, "logps/chosen": -346.400732421875, "logps/rejected": -528.1700265066964, "loss": 0.0297, "rewards/chosen": 5.401966857910156, "rewards/margins": 15.662945556640626, "rewards/rejected": -10.260978698730469, "step": 965 }, { "epoch": 0.2417114975603653, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41541956.92307692, "logits/rejected": -50786996.36363637, "logps/chosen": -376.15271935096155, "logps/rejected": -710.5560635653409, "loss": 0.0984, "rewards/chosen": 5.761100182166467, "rewards/margins": 17.646116883604677, "rewards/rejected": -11.88501670143821, "step": 966 }, { "epoch": 0.2419617165019392, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67082682.18181818, "logits/rejected": -61389745.23076923, "logps/chosen": -396.5559747869318, "logps/rejected": -586.0338792067307, "loss": 0.0248, "rewards/chosen": 6.245370344682173, "rewards/margins": 16.267867668525323, "rewards/rejected": -10.02249732384315, "step": 967 }, { "epoch": 0.24221193544351308, "grad_norm": 6.21875, "kl": 2.1117992401123047, "learning_rate": 5e-06, "logits/chosen": -57475852.8, "logits/rejected": -41609577.14285714, "logps/chosen": -526.372119140625, "logps/rejected": -543.5379115513393, "loss": 0.0093, "rewards/chosen": 6.452223205566407, "rewards/margins": 16.720904323032926, "rewards/rejected": -10.268681117466517, "step": 968 }, { "epoch": 0.24246215438508695, "grad_norm": 5.15625, "kl": 5.931025505065918, "learning_rate": 5e-06, "logits/chosen": -25839696.0, "logits/rejected": -42132539.733333334, "logps/chosen": -365.1384548611111, "logps/rejected": -499.8753255208333, "loss": 0.0491, "rewards/chosen": 7.373534308539496, "rewards/margins": 16.471940273708768, "rewards/rejected": -9.09840596516927, "step": 969 }, { "epoch": 0.24271237332666082, "grad_norm": 7.78125, "kl": 5.186364650726318, "learning_rate": 5e-06, "logits/chosen": -46551428.0, "logits/rejected": -59868468.0, "logps/chosen": -436.8776550292969, "logps/rejected": -419.1877136230469, "loss": 0.0204, "rewards/chosen": 8.265854835510254, "rewards/margins": 16.00886869430542, "rewards/rejected": -7.743013858795166, "step": 970 }, { "epoch": 0.24296259226823472, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48635178.666666664, "logits/rejected": -22060261.333333332, "logps/chosen": -424.09661458333335, "logps/rejected": -465.74370659722223, "loss": 0.0268, "rewards/chosen": 7.468690999348959, "rewards/margins": 18.117595587836373, "rewards/rejected": -10.648904588487413, "step": 971 }, { "epoch": 0.2432128112098086, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26651632.0, "logits/rejected": -46666843.428571425, "logps/chosen": -379.4952880859375, "logps/rejected": -596.7696707589286, "loss": 0.0263, "rewards/chosen": 6.369148254394531, "rewards/margins": 15.411238534109932, "rewards/rejected": -9.042090279715401, "step": 972 }, { "epoch": 0.24346303015138246, "grad_norm": 7.5625, "kl": 0.9020862579345703, "learning_rate": 5e-06, "logits/chosen": -44098397.538461536, "logits/rejected": -56110161.45454545, "logps/chosen": -481.2684795673077, "logps/rejected": -599.7449396306819, "loss": 0.0115, "rewards/chosen": 8.506537804236778, "rewards/margins": 22.31245433033763, "rewards/rejected": -13.805916526100852, "step": 973 }, { "epoch": 0.24371324909295633, "grad_norm": 20.5, "kl": 11.748554229736328, "learning_rate": 5e-06, "logits/chosen": -48147471.058823526, "logits/rejected": -40631003.428571425, "logps/chosen": -420.59449678308823, "logps/rejected": -627.2489536830357, "loss": 0.0955, "rewards/chosen": 6.807598787195542, "rewards/margins": 15.587558329606257, "rewards/rejected": -8.779959542410714, "step": 974 }, { "epoch": 0.2439634680345302, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45298222.54545455, "logits/rejected": -45828617.84615385, "logps/chosen": -499.63423295454544, "logps/rejected": -673.6026141826923, "loss": 0.0526, "rewards/chosen": 7.542264764959162, "rewards/margins": 19.699298831966374, "rewards/rejected": -12.157034067007212, "step": 975 }, { "epoch": 0.2442136869761041, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22724881.6, "logits/rejected": -56636242.28571428, "logps/chosen": -303.7207275390625, "logps/rejected": -520.9016810825893, "loss": 0.0762, "rewards/chosen": 5.799948883056641, "rewards/margins": 14.588018471854074, "rewards/rejected": -8.788069588797432, "step": 976 }, { "epoch": 0.24446390591767797, "grad_norm": 19.5, "kl": 7.317101955413818, "learning_rate": 5e-06, "logits/chosen": -28845570.666666668, "logits/rejected": -38781898.666666664, "logps/chosen": -409.3150634765625, "logps/rejected": -512.3441569010416, "loss": 0.0447, "rewards/chosen": 6.162761688232422, "rewards/margins": 13.854595184326172, "rewards/rejected": -7.69183349609375, "step": 977 }, { "epoch": 0.24471412485925184, "grad_norm": 7.40625, "kl": 7.131377696990967, "learning_rate": 5e-06, "logits/chosen": -27759594.0, "logits/rejected": -38790868.0, "logps/chosen": -366.73529052734375, "logps/rejected": -505.4588928222656, "loss": 0.0912, "rewards/chosen": 7.375744819641113, "rewards/margins": 16.4173526763916, "rewards/rejected": -9.041607856750488, "step": 978 }, { "epoch": 0.2449643438008257, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33602082.461538464, "logits/rejected": 41118132.36363637, "logps/chosen": -392.90884164663464, "logps/rejected": -679.8631036931819, "loss": 0.0344, "rewards/chosen": 6.0534491905799275, "rewards/margins": 17.51038472635763, "rewards/rejected": -11.4569355357777, "step": 979 }, { "epoch": 0.2452145627423996, "grad_norm": 8.5625, "kl": 9.956182479858398, "learning_rate": 5e-06, "logits/chosen": -74135819.63636364, "logits/rejected": -37772140.307692304, "logps/chosen": -509.16313032670456, "logps/rejected": -541.5817307692307, "loss": 0.0229, "rewards/chosen": 8.883264021439986, "rewards/margins": 17.90471787886186, "rewards/rejected": -9.021453857421875, "step": 980 }, { "epoch": 0.24546478168397348, "grad_norm": 17.25, "kl": 9.704726219177246, "learning_rate": 5e-06, "logits/chosen": -33535570.285714287, "logits/rejected": -33441254.4, "logps/chosen": -355.86083984375, "logps/rejected": -568.32529296875, "loss": 0.062, "rewards/chosen": 7.664146423339844, "rewards/margins": 17.043216705322266, "rewards/rejected": -9.379070281982422, "step": 981 }, { "epoch": 0.24571500062554735, "grad_norm": 12.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41295132.8, "logits/rejected": -62507634.28571428, "logps/chosen": -471.283935546875, "logps/rejected": -480.21177455357144, "loss": 0.043, "rewards/chosen": 5.92628288269043, "rewards/margins": 12.615982873099192, "rewards/rejected": -6.689699990408761, "step": 982 }, { "epoch": 0.24596521956712122, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44087664.0, "logits/rejected": -52632604.0, "logps/chosen": -363.9759521484375, "logps/rejected": -392.74951171875, "loss": 0.0511, "rewards/chosen": 5.343209266662598, "rewards/margins": 12.991962909698486, "rewards/rejected": -7.648753643035889, "step": 983 }, { "epoch": 0.2462154385086951, "grad_norm": 4.28125, "kl": 5.336452007293701, "learning_rate": 5e-06, "logits/chosen": -45089450.666666664, "logits/rejected": -21235751.111111112, "logps/chosen": -386.187890625, "logps/rejected": -438.6044921875, "loss": 0.0102, "rewards/chosen": 8.136567179361979, "rewards/margins": 18.851971266004774, "rewards/rejected": -10.715404086642796, "step": 984 }, { "epoch": 0.246465657450269, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36906614.4, "logits/rejected": -16278251.42857143, "logps/chosen": -248.096484375, "logps/rejected": -478.1185825892857, "loss": 0.0484, "rewards/chosen": 4.976877212524414, "rewards/margins": 13.192564555576869, "rewards/rejected": -8.215687343052455, "step": 985 }, { "epoch": 0.24671587639184286, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34849632.0, "logits/rejected": -55342107.428571425, "logps/chosen": -430.87333984375, "logps/rejected": -626.6460658482143, "loss": 0.0197, "rewards/chosen": 8.463484191894532, "rewards/margins": 18.211029706682478, "rewards/rejected": -9.747545514787946, "step": 986 }, { "epoch": 0.24696609533341674, "grad_norm": 9.625, "kl": 7.418025970458984, "learning_rate": 5e-06, "logits/chosen": -72797735.38461539, "logits/rejected": -41053122.90909091, "logps/chosen": -391.8903245192308, "logps/rejected": -508.41463955965907, "loss": 0.0396, "rewards/chosen": 6.177619934082031, "rewards/margins": 15.243307633833451, "rewards/rejected": -9.06568769975142, "step": 987 }, { "epoch": 0.2472163142749906, "grad_norm": 25.875, "kl": 1.2753047943115234, "learning_rate": 5e-06, "logits/chosen": -44311702.85714286, "logits/rejected": -52756188.8, "logps/chosen": -274.71533203125, "logps/rejected": -559.870751953125, "loss": 0.0852, "rewards/chosen": 5.22882080078125, "rewards/margins": 11.39544219970703, "rewards/rejected": -6.166621398925781, "step": 988 }, { "epoch": 0.2474665332165645, "grad_norm": 8.6875, "kl": 10.789250373840332, "learning_rate": 5e-06, "logits/chosen": -26739306.0, "logits/rejected": 121936768.0, "logps/chosen": -412.46630859375, "logps/rejected": -691.3695068359375, "loss": 0.0366, "rewards/chosen": 7.12168025970459, "rewards/margins": 16.61695098876953, "rewards/rejected": -9.495270729064941, "step": 989 }, { "epoch": 0.24771675215813838, "grad_norm": 16.5, "kl": 5.047600746154785, "learning_rate": 5e-06, "logits/chosen": -55969934.222222224, "logits/rejected": -35237674.666666664, "logps/chosen": -477.73149956597223, "logps/rejected": -559.80078125, "loss": 0.0423, "rewards/chosen": 8.644964430067274, "rewards/margins": 15.533296542697482, "rewards/rejected": -6.888332112630208, "step": 990 }, { "epoch": 0.24796697109971225, "grad_norm": 13.125, "kl": 0.7679736018180847, "learning_rate": 5e-06, "logits/chosen": -31620014.545454547, "logits/rejected": -73518897.23076923, "logps/chosen": -361.9465997869318, "logps/rejected": -631.2200270432693, "loss": 0.0418, "rewards/chosen": 6.584184820001775, "rewards/margins": 16.191359619994266, "rewards/rejected": -9.607174799992489, "step": 991 }, { "epoch": 0.24821719004128612, "grad_norm": 9.9375, "kl": 8.253324508666992, "learning_rate": 5e-06, "logits/chosen": -38875401.14285714, "logits/rejected": 4182356.0, "logps/chosen": -480.73580496651783, "logps/rejected": -971.26484375, "loss": 0.028, "rewards/chosen": 7.2487335205078125, "rewards/margins": 24.113922119140625, "rewards/rejected": -16.865188598632812, "step": 992 }, { "epoch": 0.24846740898286002, "grad_norm": 8.5, "kl": 4.701239585876465, "learning_rate": 5e-06, "logits/chosen": -20693806.545454547, "logits/rejected": -50711187.692307696, "logps/chosen": -369.4903453480114, "logps/rejected": -411.2606670673077, "loss": 0.033, "rewards/chosen": 7.151483709161932, "rewards/margins": 14.639182190795044, "rewards/rejected": -7.487698481633113, "step": 993 }, { "epoch": 0.2487176279244339, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64155991.27272727, "logits/rejected": -56464275.692307696, "logps/chosen": -436.88645241477275, "logps/rejected": -492.15054086538464, "loss": 0.042, "rewards/chosen": 7.837338534268466, "rewards/margins": 18.271864244154283, "rewards/rejected": -10.434525709885817, "step": 994 }, { "epoch": 0.24896784686600776, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -93347315.2, "logits/rejected": -51931538.28571428, "logps/chosen": -581.70400390625, "logps/rejected": -634.4309430803571, "loss": 0.0142, "rewards/chosen": 9.399922180175782, "rewards/margins": 21.41703818184989, "rewards/rejected": -12.017116001674108, "step": 995 }, { "epoch": 0.24921806580758163, "grad_norm": 3.703125, "kl": 10.504900932312012, "learning_rate": 5e-06, "logits/chosen": -82139697.23076923, "logits/rejected": -36180968.72727273, "logps/chosen": -443.50210336538464, "logps/rejected": -578.3034002130681, "loss": 0.0079, "rewards/chosen": 7.701954768254207, "rewards/margins": 16.321860413451294, "rewards/rejected": -8.619905645197088, "step": 996 }, { "epoch": 0.2494682847491555, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63988465.777777776, "logits/rejected": 119205981.86666666, "logps/chosen": -383.4265950520833, "logps/rejected": -538.39052734375, "loss": 0.0528, "rewards/chosen": 6.038827260335286, "rewards/margins": 14.730064646402994, "rewards/rejected": -8.691237386067709, "step": 997 }, { "epoch": 0.2497185036907294, "grad_norm": 6.3125, "kl": 2.5585403442382812, "learning_rate": 5e-06, "logits/chosen": -45557655.27272727, "logits/rejected": -51408177.23076923, "logps/chosen": -368.10531338778407, "logps/rejected": -601.8615159254807, "loss": 0.0463, "rewards/chosen": 5.519921042702415, "rewards/margins": 16.688814710070204, "rewards/rejected": -11.168893667367788, "step": 998 }, { "epoch": 0.24996872263230327, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34937315.55555555, "logits/rejected": -75829555.2, "logps/chosen": -162.37904188368054, "logps/rejected": -554.6164713541667, "loss": 0.1019, "rewards/chosen": 1.883637958102756, "rewards/margins": 13.290573586357965, "rewards/rejected": -11.406935628255209, "step": 999 }, { "epoch": 0.25021894157387714, "grad_norm": 2.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35295445.333333336, "logits/rejected": -54430352.0, "logps/chosen": -434.4065755208333, "logps/rejected": -617.1285400390625, "loss": 0.0077, "rewards/chosen": 6.1912797292073565, "rewards/margins": 17.268178939819336, "rewards/rejected": -11.076899210611979, "step": 1000 }, { "epoch": 0.25046916051545104, "grad_norm": 12.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53168928.0, "logits/rejected": 80021893.33333333, "logps/chosen": -332.048095703125, "logps/rejected": -623.7146809895834, "loss": 0.058, "rewards/chosen": 6.199230194091797, "rewards/margins": 18.334878285725914, "rewards/rejected": -12.135648091634115, "step": 1001 }, { "epoch": 0.2507193794570249, "grad_norm": 3.015625, "kl": 8.039688110351562, "learning_rate": 5e-06, "logits/chosen": -46497472.0, "logits/rejected": -23026619.076923076, "logps/chosen": -455.58744673295456, "logps/rejected": -561.5847731370193, "loss": 0.0091, "rewards/chosen": 7.264249628240412, "rewards/margins": 19.195509957266854, "rewards/rejected": -11.931260329026442, "step": 1002 }, { "epoch": 0.2509695983985988, "grad_norm": 8.125, "kl": 1.5779623985290527, "learning_rate": 5e-06, "logits/chosen": -60927122.28571428, "logits/rejected": -17497033.6, "logps/chosen": -525.3565848214286, "logps/rejected": -385.39814453125, "loss": 0.0382, "rewards/chosen": 7.462730952671596, "rewards/margins": 13.477145167759488, "rewards/rejected": -6.014414215087891, "step": 1003 }, { "epoch": 0.2512198173401726, "grad_norm": 8.25, "kl": 1.5625652074813843, "learning_rate": 5e-06, "logits/chosen": -25108976.0, "logits/rejected": -20830339.2, "logps/chosen": -317.3733607700893, "logps/rejected": -578.748974609375, "loss": 0.0713, "rewards/chosen": 5.1822646004813055, "rewards/margins": 16.48181141444615, "rewards/rejected": -11.299546813964843, "step": 1004 }, { "epoch": 0.2514700362817465, "grad_norm": 8.8125, "kl": 6.138740539550781, "learning_rate": 5e-06, "logits/chosen": -43888369.23076923, "logits/rejected": -66328436.36363637, "logps/chosen": -357.99605618990387, "logps/rejected": -508.69522372159093, "loss": 0.0489, "rewards/chosen": 6.330251840444712, "rewards/margins": 17.637029794546272, "rewards/rejected": -11.306777954101562, "step": 1005 }, { "epoch": 0.2517202552233204, "grad_norm": 14.875, "kl": 2.96492338180542, "learning_rate": 5e-06, "logits/chosen": -67805671.38461539, "logits/rejected": -52245154.90909091, "logps/chosen": -408.27456430288464, "logps/rejected": -613.4556107954545, "loss": 0.0275, "rewards/chosen": 6.22849860558143, "rewards/margins": 17.653217982578944, "rewards/rejected": -11.424719376997514, "step": 1006 }, { "epoch": 0.25197047416489426, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46761316.0, "logits/rejected": -17286142.0, "logps/chosen": -488.9906311035156, "logps/rejected": -519.4513549804688, "loss": 0.0351, "rewards/chosen": 7.57394552230835, "rewards/margins": 16.839669704437256, "rewards/rejected": -9.265724182128906, "step": 1007 }, { "epoch": 0.25222069310646816, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47920457.14285714, "logits/rejected": -47777536.0, "logps/chosen": -347.48329380580356, "logps/rejected": -508.841015625, "loss": 0.0162, "rewards/chosen": 6.903561183384487, "rewards/margins": 17.68131844656808, "rewards/rejected": -10.777757263183593, "step": 1008 }, { "epoch": 0.25247091204804206, "grad_norm": 11.5, "kl": 1.9794502258300781, "learning_rate": 5e-06, "logits/chosen": -43209225.14285714, "logits/rejected": -35479430.4, "logps/chosen": -427.35062081473217, "logps/rejected": -489.30712890625, "loss": 0.0385, "rewards/chosen": 7.117788587297712, "rewards/margins": 20.615838514055525, "rewards/rejected": -13.498049926757812, "step": 1009 }, { "epoch": 0.2527211309896159, "grad_norm": 1.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60640464.0, "logits/rejected": -47734826.666666664, "logps/chosen": -486.995361328125, "logps/rejected": -481.5196940104167, "loss": 0.018, "rewards/chosen": 8.655935923258463, "rewards/margins": 19.78811772664388, "rewards/rejected": -11.132181803385416, "step": 1010 }, { "epoch": 0.2529713499311898, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54788080.0, "logits/rejected": -48225640.0, "logps/chosen": -419.5021057128906, "logps/rejected": -506.8221435546875, "loss": 0.0837, "rewards/chosen": 5.512420177459717, "rewards/margins": 16.09086561203003, "rewards/rejected": -10.578445434570312, "step": 1011 }, { "epoch": 0.25322156887276365, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36486714.18181818, "logits/rejected": -31619288.615384616, "logps/chosen": -299.0135609019886, "logps/rejected": -444.0518329326923, "loss": 0.0769, "rewards/chosen": 6.2726287841796875, "rewards/margins": 18.588010347806488, "rewards/rejected": -12.315381563626802, "step": 1012 }, { "epoch": 0.25347178781433755, "grad_norm": 8.4375, "kl": 6.553333282470703, "learning_rate": 5e-06, "logits/chosen": -25746574.769230768, "logits/rejected": -59588596.36363637, "logps/chosen": -334.5786884014423, "logps/rejected": -781.0011541193181, "loss": 0.0613, "rewards/chosen": 5.6049969012920675, "rewards/margins": 20.91585727504917, "rewards/rejected": -15.310860373757102, "step": 1013 }, { "epoch": 0.25372200675591144, "grad_norm": 15.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23167282.666666668, "logits/rejected": -52002032.0, "logps/chosen": -444.2410888671875, "logps/rejected": -652.1468098958334, "loss": 0.0225, "rewards/chosen": 6.999585469563802, "rewards/margins": 21.186602274576824, "rewards/rejected": -14.187016805013021, "step": 1014 }, { "epoch": 0.2539722256974853, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68860960.0, "logits/rejected": -53725508.0, "logps/chosen": -424.82183837890625, "logps/rejected": -647.5213623046875, "loss": 0.0549, "rewards/chosen": 7.0070271492004395, "rewards/margins": 16.34157419204712, "rewards/rejected": -9.33454704284668, "step": 1015 }, { "epoch": 0.2542224446390592, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52634100.0, "logits/rejected": -35175144.0, "logps/chosen": -412.65069580078125, "logps/rejected": -394.19256591796875, "loss": 0.0104, "rewards/chosen": 5.859254837036133, "rewards/margins": 14.669235229492188, "rewards/rejected": -8.809980392456055, "step": 1016 }, { "epoch": 0.25447266358063303, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42593801.14285714, "logits/rejected": -81095033.6, "logps/chosen": -365.44203404017856, "logps/rejected": -580.0462890625, "loss": 0.0514, "rewards/chosen": 5.486855643136161, "rewards/margins": 18.86259111676897, "rewards/rejected": -13.375735473632812, "step": 1017 }, { "epoch": 0.25472288252220693, "grad_norm": 1.6171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48885939.2, "logits/rejected": -58672932.571428575, "logps/chosen": -401.49375, "logps/rejected": -572.3987862723214, "loss": 0.0158, "rewards/chosen": 7.813041687011719, "rewards/margins": 19.08989061628069, "rewards/rejected": -11.276848929268974, "step": 1018 }, { "epoch": 0.2549731014637808, "grad_norm": 23.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42999747.2, "logits/rejected": -64734381.71428572, "logps/chosen": -353.2420654296875, "logps/rejected": -599.8684430803571, "loss": 0.0788, "rewards/chosen": 4.258368301391601, "rewards/margins": 14.02631481715611, "rewards/rejected": -9.767946515764509, "step": 1019 }, { "epoch": 0.25522332040535467, "grad_norm": 16.625, "kl": 4.18208122253418, "learning_rate": 5e-06, "logits/chosen": -39622581.333333336, "logits/rejected": -30896533.333333332, "logps/chosen": -456.6374918619792, "logps/rejected": -491.9557291666667, "loss": 0.0654, "rewards/chosen": 5.739707946777344, "rewards/margins": 15.960919698079428, "rewards/rejected": -10.221211751302084, "step": 1020 }, { "epoch": 0.25547353934692857, "grad_norm": 7.40625, "kl": 3.0132863521575928, "learning_rate": 5e-06, "logits/chosen": -62003580.0, "logits/rejected": -46525796.0, "logps/chosen": -523.8833618164062, "logps/rejected": -671.1158447265625, "loss": 0.0389, "rewards/chosen": 6.62756872177124, "rewards/margins": 20.664516925811768, "rewards/rejected": -14.036948204040527, "step": 1021 }, { "epoch": 0.25572375828850247, "grad_norm": 12.125, "kl": 4.427289962768555, "learning_rate": 5e-06, "logits/chosen": -66106632.53333333, "logits/rejected": -26632158.222222224, "logps/chosen": -408.7776692708333, "logps/rejected": -483.0852864583333, "loss": 0.031, "rewards/chosen": 7.477281188964843, "rewards/margins": 15.351595391167535, "rewards/rejected": -7.874314202202691, "step": 1022 }, { "epoch": 0.2559739772300763, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39644416.0, "logits/rejected": -23809483.2, "logps/chosen": -303.5419921875, "logps/rejected": -339.08544921875, "loss": 0.0668, "rewards/chosen": 5.307792118617466, "rewards/margins": 15.151117924281529, "rewards/rejected": -9.843325805664062, "step": 1023 }, { "epoch": 0.2562241961716502, "grad_norm": 3.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74494576.0, "logits/rejected": 7892221.5, "logps/chosen": -463.3726501464844, "logps/rejected": -538.4326171875, "loss": 0.0133, "rewards/chosen": 6.614655017852783, "rewards/margins": 16.026517391204834, "rewards/rejected": -9.41186237335205, "step": 1024 }, { "epoch": 0.25647441511322405, "grad_norm": 6.125, "kl": 6.1739325523376465, "learning_rate": 5e-06, "logits/chosen": -58289456.0, "logits/rejected": -74404805.33333333, "logps/chosen": -379.2535400390625, "logps/rejected": -536.9324544270834, "loss": 0.0592, "rewards/chosen": 6.860469818115234, "rewards/margins": 17.552379608154297, "rewards/rejected": -10.691909790039062, "step": 1025 }, { "epoch": 0.25672463405479795, "grad_norm": 14.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61612708.0, "logits/rejected": -46607440.0, "logps/chosen": -281.4748840332031, "logps/rejected": -813.1284790039062, "loss": 0.0743, "rewards/chosen": 2.569995164871216, "rewards/margins": 16.324273347854614, "rewards/rejected": -13.754278182983398, "step": 1026 }, { "epoch": 0.25697485299637185, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43085370.18181818, "logits/rejected": -6887155.692307692, "logps/chosen": -499.87748579545456, "logps/rejected": -691.3734975961538, "loss": 0.0338, "rewards/chosen": 7.815723072398793, "rewards/margins": 22.084843508847108, "rewards/rejected": -14.269120436448317, "step": 1027 }, { "epoch": 0.2572250719379457, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39086007.27272727, "logits/rejected": -39490820.92307692, "logps/chosen": -322.6455078125, "logps/rejected": -474.70132211538464, "loss": 0.0516, "rewards/chosen": 5.844175165349787, "rewards/margins": 15.344976251775568, "rewards/rejected": -9.500801086425781, "step": 1028 }, { "epoch": 0.2574752908795196, "grad_norm": 6.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42935284.36363637, "logits/rejected": -32859200.0, "logps/chosen": -307.62808504971593, "logps/rejected": -616.8149038461538, "loss": 0.0531, "rewards/chosen": 6.581899469549006, "rewards/margins": 18.02195707734648, "rewards/rejected": -11.440057607797476, "step": 1029 }, { "epoch": 0.25772550982109343, "grad_norm": 14.375, "kl": 0.05431016534566879, "learning_rate": 5e-06, "logits/chosen": -66931792.0, "logits/rejected": -46914684.0, "logps/chosen": -437.85186767578125, "logps/rejected": -441.6575927734375, "loss": 0.0729, "rewards/chosen": 7.840295791625977, "rewards/margins": 13.40229320526123, "rewards/rejected": -5.561997413635254, "step": 1030 }, { "epoch": 0.25797572876266733, "grad_norm": 11.6875, "kl": 4.4825873374938965, "learning_rate": 5e-06, "logits/chosen": -99679173.81818181, "logits/rejected": -65518897.23076923, "logps/chosen": -471.60933061079544, "logps/rejected": -570.8533653846154, "loss": 0.0261, "rewards/chosen": 8.747844349254262, "rewards/margins": 18.80046134895378, "rewards/rejected": -10.05261699969952, "step": 1031 }, { "epoch": 0.25822594770424123, "grad_norm": 11.625, "kl": 12.704346656799316, "learning_rate": 5e-06, "logits/chosen": -47932327.384615384, "logits/rejected": -22647476.363636363, "logps/chosen": -416.32189002403845, "logps/rejected": -610.4292436079545, "loss": 0.065, "rewards/chosen": 7.321493882399339, "rewards/margins": 19.031692024711127, "rewards/rejected": -11.71019814231179, "step": 1032 }, { "epoch": 0.2584761666458151, "grad_norm": 32.25, "kl": 10.949246406555176, "learning_rate": 5e-06, "logits/chosen": -16912742.4, "logits/rejected": -85911637.33333333, "logps/chosen": -460.91637369791664, "logps/rejected": -540.9468315972222, "loss": 0.0607, "rewards/chosen": 7.205246988932291, "rewards/margins": 16.048444281684027, "rewards/rejected": -8.843197292751736, "step": 1033 }, { "epoch": 0.258726385587389, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40004640.0, "logits/rejected": -54037641.84615385, "logps/chosen": -383.189453125, "logps/rejected": -512.6988055889423, "loss": 0.0548, "rewards/chosen": 5.053617304021662, "rewards/margins": 14.073775338126229, "rewards/rejected": -9.020158034104567, "step": 1034 }, { "epoch": 0.2589766045289628, "grad_norm": 2.9375, "kl": 3.688621997833252, "learning_rate": 5e-06, "logits/chosen": -47938101.333333336, "logits/rejected": -74513685.33333333, "logps/chosen": -408.0246175130208, "logps/rejected": -539.6314290364584, "loss": 0.0145, "rewards/chosen": 7.235884348551433, "rewards/margins": 17.9769287109375, "rewards/rejected": -10.741044362386068, "step": 1035 }, { "epoch": 0.2592268234705367, "grad_norm": 4.21875, "kl": 5.445687770843506, "learning_rate": 5e-06, "logits/chosen": -49607470.93333333, "logits/rejected": -53894122.666666664, "logps/chosen": -400.16998697916665, "logps/rejected": -577.62890625, "loss": 0.0184, "rewards/chosen": 7.655976867675781, "rewards/margins": 20.3657962375217, "rewards/rejected": -12.70981936984592, "step": 1036 }, { "epoch": 0.2594770424121106, "grad_norm": 13.8125, "kl": 15.88320255279541, "learning_rate": 5e-06, "logits/chosen": -60380708.0, "logits/rejected": -53676040.0, "logps/chosen": -373.0316162109375, "logps/rejected": -722.8099975585938, "loss": 0.0407, "rewards/chosen": 7.708272933959961, "rewards/margins": 22.90108299255371, "rewards/rejected": -15.19281005859375, "step": 1037 }, { "epoch": 0.25972726135368446, "grad_norm": 4.1875, "kl": 3.2029032707214355, "learning_rate": 5e-06, "logits/chosen": -43271523.2, "logits/rejected": -56064704.0, "logps/chosen": -435.09599609375, "logps/rejected": -612.1997767857143, "loss": 0.0274, "rewards/chosen": 6.463885498046875, "rewards/margins": 16.702281188964843, "rewards/rejected": -10.238395690917969, "step": 1038 }, { "epoch": 0.25997748029525836, "grad_norm": 3.546875, "kl": 0.4309489130973816, "learning_rate": 5e-06, "logits/chosen": -70473397.33333333, "logits/rejected": -17723012.0, "logps/chosen": -443.3926595052083, "logps/rejected": -425.0060221354167, "loss": 0.0089, "rewards/chosen": 8.649250666300455, "rewards/margins": 18.368024826049805, "rewards/rejected": -9.71877415974935, "step": 1039 }, { "epoch": 0.26022769923683226, "grad_norm": 14.6875, "kl": 4.056169033050537, "learning_rate": 5e-06, "logits/chosen": -20390875.733333334, "logits/rejected": -46449955.55555555, "logps/chosen": -313.95384114583334, "logps/rejected": -445.68866644965277, "loss": 0.1157, "rewards/chosen": 5.378410847981771, "rewards/margins": 11.495064714219835, "rewards/rejected": -6.116653866238064, "step": 1040 }, { "epoch": 0.2604779181784061, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37050550.15384615, "logits/rejected": -30389207.272727273, "logps/chosen": -322.15726412259613, "logps/rejected": -507.79567649147725, "loss": 0.0689, "rewards/chosen": 3.795181861290565, "rewards/margins": 15.502620937107327, "rewards/rejected": -11.707439075816762, "step": 1041 }, { "epoch": 0.26072813711998, "grad_norm": 10.5625, "kl": 6.8166961669921875, "learning_rate": 5e-06, "logits/chosen": -48090980.571428575, "logits/rejected": -9097024.0, "logps/chosen": -340.4025181361607, "logps/rejected": -489.7923828125, "loss": 0.0716, "rewards/chosen": 6.434259142194476, "rewards/margins": 15.65297862461635, "rewards/rejected": -9.218719482421875, "step": 1042 }, { "epoch": 0.26097835606155384, "grad_norm": 4.6875, "kl": 3.012301206588745, "learning_rate": 5e-06, "logits/chosen": -56313262.54545455, "logits/rejected": -38509769.84615385, "logps/chosen": -492.56840376420456, "logps/rejected": -687.3157301682693, "loss": 0.0123, "rewards/chosen": 8.033505526455967, "rewards/margins": 16.60095054786522, "rewards/rejected": -8.567445021409254, "step": 1043 }, { "epoch": 0.26122857500312774, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40478364.0, "logits/rejected": -50540128.0, "logps/chosen": -303.23431396484375, "logps/rejected": -593.4578857421875, "loss": 0.0307, "rewards/chosen": 5.1532745361328125, "rewards/margins": 15.232929229736328, "rewards/rejected": -10.079654693603516, "step": 1044 }, { "epoch": 0.26147879394470164, "grad_norm": 9.625, "kl": 3.0611419677734375, "learning_rate": 5e-06, "logits/chosen": -48648808.0, "logits/rejected": -36076716.0, "logps/chosen": -429.60162353515625, "logps/rejected": -429.864501953125, "loss": 0.0244, "rewards/chosen": 9.218293190002441, "rewards/margins": 16.823873043060303, "rewards/rejected": -7.605579853057861, "step": 1045 }, { "epoch": 0.2617290128862755, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41144442.666666664, "logits/rejected": -35357786.666666664, "logps/chosen": -361.2045084635417, "logps/rejected": -550.9252522786459, "loss": 0.0701, "rewards/chosen": 6.7840728759765625, "rewards/margins": 15.464081446329752, "rewards/rejected": -8.68000857035319, "step": 1046 }, { "epoch": 0.2619792318278494, "grad_norm": 10.625, "kl": 4.34995174407959, "learning_rate": 5e-06, "logits/chosen": -50915858.28571428, "logits/rejected": -40179104.0, "logps/chosen": -424.2108677455357, "logps/rejected": -601.500390625, "loss": 0.0304, "rewards/chosen": 7.509768894740513, "rewards/margins": 19.86227057320731, "rewards/rejected": -12.352501678466798, "step": 1047 }, { "epoch": 0.2622294507694232, "grad_norm": 2.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58716384.0, "logits/rejected": -53422901.333333336, "logps/chosen": -410.5199381510417, "logps/rejected": -452.4109700520833, "loss": 0.0276, "rewards/chosen": 7.995380401611328, "rewards/margins": 17.340922673543297, "rewards/rejected": -9.345542271931967, "step": 1048 }, { "epoch": 0.2624796697109971, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36150487.27272727, "logits/rejected": -85648285.53846154, "logps/chosen": -342.70869584517044, "logps/rejected": -437.68765024038464, "loss": 0.055, "rewards/chosen": 6.717924638227983, "rewards/margins": 14.550274562168788, "rewards/rejected": -7.832349923940805, "step": 1049 }, { "epoch": 0.262729888652571, "grad_norm": 4.8125, "kl": 6.073253631591797, "learning_rate": 5e-06, "logits/chosen": -54816570.18181818, "logits/rejected": -29863310.769230768, "logps/chosen": -425.4305308948864, "logps/rejected": -440.45015775240387, "loss": 0.0246, "rewards/chosen": 7.383992975408381, "rewards/margins": 15.853181372155676, "rewards/rejected": -8.469188396747295, "step": 1050 }, { "epoch": 0.26298010759414486, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24965732.0, "logits/rejected": -22566512.0, "logps/chosen": -395.2441101074219, "logps/rejected": -617.1180419921875, "loss": 0.0482, "rewards/chosen": 6.644718170166016, "rewards/margins": 19.166110038757324, "rewards/rejected": -12.521391868591309, "step": 1051 }, { "epoch": 0.26323032653571876, "grad_norm": 7.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66122944.0, "logits/rejected": -46419668.0, "logps/chosen": -561.966552734375, "logps/rejected": -351.14605712890625, "loss": 0.0287, "rewards/chosen": 8.718419075012207, "rewards/margins": 15.356658935546875, "rewards/rejected": -6.638239860534668, "step": 1052 }, { "epoch": 0.2634805454772926, "grad_norm": 16.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53186496.0, "logits/rejected": -39044332.307692304, "logps/chosen": -390.18678977272725, "logps/rejected": -504.33980618990387, "loss": 0.0645, "rewards/chosen": 6.3116982199928975, "rewards/margins": 16.40342701731862, "rewards/rejected": -10.091728797325722, "step": 1053 }, { "epoch": 0.2637307644188665, "grad_norm": 12.0, "kl": 1.1019856929779053, "learning_rate": 5e-06, "logits/chosen": -39770276.571428575, "logits/rejected": -51058931.2, "logps/chosen": -320.23158482142856, "logps/rejected": -565.858984375, "loss": 0.0543, "rewards/chosen": 5.458232334681919, "rewards/margins": 15.545704868861606, "rewards/rejected": -10.087472534179687, "step": 1054 }, { "epoch": 0.2639809833604404, "grad_norm": 21.875, "kl": 13.707293510437012, "learning_rate": 5e-06, "logits/chosen": -65360053.333333336, "logits/rejected": -24593626.666666668, "logps/chosen": -396.2681477864583, "logps/rejected": -519.4673258463541, "loss": 0.1107, "rewards/chosen": 6.441303253173828, "rewards/margins": 15.619555155436197, "rewards/rejected": -9.17825190226237, "step": 1055 }, { "epoch": 0.26423120230201425, "grad_norm": 6.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37330126.54545455, "logits/rejected": -35687133.538461536, "logps/chosen": -412.78493430397725, "logps/rejected": -454.18637319711536, "loss": 0.0199, "rewards/chosen": 6.795394203879616, "rewards/margins": 15.800334530276853, "rewards/rejected": -9.004940326397236, "step": 1056 }, { "epoch": 0.26448142124358814, "grad_norm": 3.703125, "kl": 1.5879911184310913, "learning_rate": 5e-06, "logits/chosen": -55692164.571428575, "logits/rejected": -42299350.4, "logps/chosen": -464.7675083705357, "logps/rejected": -541.408203125, "loss": 0.0171, "rewards/chosen": 8.523343222481865, "rewards/margins": 20.877652304513113, "rewards/rejected": -12.35430908203125, "step": 1057 }, { "epoch": 0.26473164018516204, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12121508.0, "logits/rejected": -40629152.0, "logps/chosen": -302.60715738932294, "logps/rejected": -526.6312934027778, "loss": 0.03, "rewards/chosen": 3.363840103149414, "rewards/margins": 15.145866605970594, "rewards/rejected": -11.78202650282118, "step": 1058 }, { "epoch": 0.2649818591267359, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33326417.066666666, "logits/rejected": -39347768.88888889, "logps/chosen": -460.6977213541667, "logps/rejected": -626.7797309027778, "loss": 0.051, "rewards/chosen": 6.554227701822916, "rewards/margins": 18.729056972927516, "rewards/rejected": -12.174829271104601, "step": 1059 }, { "epoch": 0.2652320780683098, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58585731.55555555, "logits/rejected": -53787080.53333333, "logps/chosen": -470.43901909722223, "logps/rejected": -597.9861979166667, "loss": 0.0431, "rewards/chosen": 8.312078687879774, "rewards/margins": 21.554023064507376, "rewards/rejected": -13.241944376627604, "step": 1060 }, { "epoch": 0.2654822970098836, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60873498.35294118, "logits/rejected": -43994779.428571425, "logps/chosen": -537.8562729779412, "logps/rejected": -504.58963448660717, "loss": 0.0407, "rewards/chosen": 6.962846194996553, "rewards/margins": 21.59422218899767, "rewards/rejected": -14.631375994001116, "step": 1061 }, { "epoch": 0.2657325159514575, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48586210.461538464, "logits/rejected": -52390033.45454545, "logps/chosen": -482.6952373798077, "logps/rejected": -470.1321910511364, "loss": 0.008, "rewards/chosen": 7.405115567720854, "rewards/margins": 18.144474643093723, "rewards/rejected": -10.73935907537287, "step": 1062 }, { "epoch": 0.2659827348930314, "grad_norm": 12.4375, "kl": 12.692652702331543, "learning_rate": 5e-06, "logits/chosen": -49571694.93333333, "logits/rejected": -33375488.0, "logps/chosen": -416.4013671875, "logps/rejected": -689.6477864583334, "loss": 0.0377, "rewards/chosen": 7.510750325520833, "rewards/margins": 24.81729770236545, "rewards/rejected": -17.30654737684462, "step": 1063 }, { "epoch": 0.26623295383460527, "grad_norm": 4.4375, "kl": 2.9872589111328125, "learning_rate": 5e-06, "logits/chosen": -30440634.181818184, "logits/rejected": -40265137.23076923, "logps/chosen": -372.70676491477275, "logps/rejected": -601.6565880408654, "loss": 0.0238, "rewards/chosen": 6.4228057861328125, "rewards/margins": 19.749731210561897, "rewards/rejected": -13.326925424429087, "step": 1064 }, { "epoch": 0.26648317277617917, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32604874.666666668, "logits/rejected": -22527709.333333332, "logps/chosen": -380.1713460286458, "logps/rejected": -573.5654296875, "loss": 0.0531, "rewards/chosen": 4.891154607137044, "rewards/margins": 16.450868606567383, "rewards/rejected": -11.559713999430338, "step": 1065 }, { "epoch": 0.266733391717753, "grad_norm": 5.4375, "kl": 0.3885812759399414, "learning_rate": 5e-06, "logits/chosen": -61228234.666666664, "logits/rejected": -74673984.0, "logps/chosen": -355.7219645182292, "logps/rejected": -584.5227864583334, "loss": 0.0605, "rewards/chosen": 4.82704480489095, "rewards/margins": 17.867760340372723, "rewards/rejected": -13.040715535481771, "step": 1066 }, { "epoch": 0.2669836106593269, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55856928.0, "logits/rejected": 46298482.28571428, "logps/chosen": -384.7314453125, "logps/rejected": -582.9377790178571, "loss": 0.0975, "rewards/chosen": 5.264641571044922, "rewards/margins": 16.17779835292271, "rewards/rejected": -10.91315678187779, "step": 1067 }, { "epoch": 0.2672338296009008, "grad_norm": 6.96875, "kl": 1.9263758659362793, "learning_rate": 5e-06, "logits/chosen": -49573779.692307696, "logits/rejected": -26875904.0, "logps/chosen": -344.23106971153845, "logps/rejected": -522.1237571022727, "loss": 0.1146, "rewards/chosen": 4.320041363055889, "rewards/margins": 15.965288388979184, "rewards/rejected": -11.645247025923295, "step": 1068 }, { "epoch": 0.26748404854247465, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41251515.428571425, "logits/rejected": -73711545.6, "logps/chosen": -323.14505440848217, "logps/rejected": -698.8765625, "loss": 0.0499, "rewards/chosen": 5.300013405936105, "rewards/margins": 17.88540235246931, "rewards/rejected": -12.585388946533204, "step": 1069 }, { "epoch": 0.26773426748404855, "grad_norm": 13.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76435219.2, "logits/rejected": -59747597.473684214, "logps/chosen": -412.98466796875, "logps/rejected": -595.5314555921053, "loss": 0.073, "rewards/chosen": 5.491269302368164, "rewards/margins": 16.93192626551578, "rewards/rejected": -11.440656963147616, "step": 1070 }, { "epoch": 0.2679844864256224, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70582061.71428572, "logits/rejected": -28251091.2, "logps/chosen": -448.77926199776783, "logps/rejected": -716.748291015625, "loss": 0.0508, "rewards/chosen": 6.677077157156808, "rewards/margins": 16.48286328996931, "rewards/rejected": -9.8057861328125, "step": 1071 }, { "epoch": 0.2682347053671963, "grad_norm": 13.4375, "kl": 6.2189483642578125, "learning_rate": 5e-06, "logits/chosen": -51928312.47058824, "logits/rejected": -746270.8571428572, "logps/chosen": -422.0572150735294, "logps/rejected": -480.60501534598217, "loss": 0.0432, "rewards/chosen": 7.531087538775275, "rewards/margins": 20.113306253898042, "rewards/rejected": -12.582218715122767, "step": 1072 }, { "epoch": 0.2684849243087702, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42603328.0, "logits/rejected": -48993929.14285714, "logps/chosen": -493.55888671875, "logps/rejected": -410.38058035714283, "loss": 0.0392, "rewards/chosen": 7.736805725097656, "rewards/margins": 16.214313180106025, "rewards/rejected": -8.47750745500837, "step": 1073 }, { "epoch": 0.26873514325034403, "grad_norm": 4.21875, "kl": 5.50734281539917, "learning_rate": 5e-06, "logits/chosen": -64664249.6, "logits/rejected": -36441474.28571428, "logps/chosen": -498.4115234375, "logps/rejected": -419.52852957589283, "loss": 0.0219, "rewards/chosen": 7.686170959472657, "rewards/margins": 15.24597396850586, "rewards/rejected": -7.559803009033203, "step": 1074 }, { "epoch": 0.26898536219191793, "grad_norm": 18.75, "kl": 1.8065681457519531, "learning_rate": 5e-06, "logits/chosen": -80071461.33333333, "logits/rejected": -58035962.666666664, "logps/chosen": -518.0250244140625, "logps/rejected": -723.7351888020834, "loss": 0.0535, "rewards/chosen": 6.1612599690755205, "rewards/margins": 19.24044672648112, "rewards/rejected": -13.0791867574056, "step": 1075 }, { "epoch": 0.26923558113349183, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37960340.36363637, "logits/rejected": -52759522.461538464, "logps/chosen": -390.21067116477275, "logps/rejected": -472.46142578125, "loss": 0.0286, "rewards/chosen": 6.200856295498935, "rewards/margins": 16.18916102055903, "rewards/rejected": -9.988304725060097, "step": 1076 }, { "epoch": 0.2694858000750657, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75481536.0, "logits/rejected": -38179432.0, "logps/chosen": -362.4228210449219, "logps/rejected": -484.3741149902344, "loss": 0.0905, "rewards/chosen": 5.8872971534729, "rewards/margins": 16.126540660858154, "rewards/rejected": -10.239243507385254, "step": 1077 }, { "epoch": 0.26973601901663957, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6303317.333333333, "logits/rejected": -40102544.0, "logps/chosen": -426.8374837239583, "logps/rejected": -507.0885009765625, "loss": 0.0307, "rewards/chosen": 6.030133565266927, "rewards/margins": 16.44853146870931, "rewards/rejected": -10.418397903442383, "step": 1078 }, { "epoch": 0.2699862379582134, "grad_norm": 12.5625, "kl": 1.014570951461792, "learning_rate": 5e-06, "logits/chosen": -30062451.2, "logits/rejected": -5822384.444444444, "logps/chosen": -414.942578125, "logps/rejected": -456.8699001736111, "loss": 0.0672, "rewards/chosen": 6.2477060953776045, "rewards/margins": 13.918084038628471, "rewards/rejected": -7.670377943250868, "step": 1079 }, { "epoch": 0.2702364568997873, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41193900.8, "logits/rejected": -54204164.571428575, "logps/chosen": -289.3846435546875, "logps/rejected": -428.8517368861607, "loss": 0.0635, "rewards/chosen": 6.360298156738281, "rewards/margins": 13.527463095528738, "rewards/rejected": -7.1671649387904575, "step": 1080 }, { "epoch": 0.2704866758413612, "grad_norm": 8.625, "kl": 8.937373161315918, "learning_rate": 5e-06, "logits/chosen": -68271445.33333333, "logits/rejected": -57289600.0, "logps/chosen": -467.50169270833334, "logps/rejected": -483.86094835069446, "loss": 0.0598, "rewards/chosen": 8.355076599121094, "rewards/margins": 16.197863599989148, "rewards/rejected": -7.842787000868055, "step": 1081 }, { "epoch": 0.27073689478293506, "grad_norm": 6.28125, "kl": 6.583486080169678, "learning_rate": 5e-06, "logits/chosen": -48324998.4, "logits/rejected": -64252214.85714286, "logps/chosen": -400.69033203125, "logps/rejected": -538.2389787946429, "loss": 0.08, "rewards/chosen": 8.198338317871094, "rewards/margins": 16.923764038085938, "rewards/rejected": -8.725425720214844, "step": 1082 }, { "epoch": 0.27098711372450895, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33818705.45454545, "logits/rejected": -37354188.307692304, "logps/chosen": -414.52778764204544, "logps/rejected": -622.1455453725962, "loss": 0.0267, "rewards/chosen": 6.887436606667259, "rewards/margins": 17.916585588788653, "rewards/rejected": -11.029148982121395, "step": 1083 }, { "epoch": 0.2712373326660828, "grad_norm": 11.1875, "kl": 2.986737012863159, "learning_rate": 5e-06, "logits/chosen": -28127161.14285714, "logits/rejected": -32415081.6, "logps/chosen": -407.00258091517856, "logps/rejected": -570.648828125, "loss": 0.0454, "rewards/chosen": 7.337047576904297, "rewards/margins": 18.68454818725586, "rewards/rejected": -11.347500610351563, "step": 1084 }, { "epoch": 0.2714875516076567, "grad_norm": 3.09375, "kl": 1.8498420715332031, "learning_rate": 5e-06, "logits/chosen": -51871883.63636363, "logits/rejected": -58128162.461538464, "logps/chosen": -399.17338423295456, "logps/rejected": -598.6005108173077, "loss": 0.0145, "rewards/chosen": 6.610595703125, "rewards/margins": 18.59569138746995, "rewards/rejected": -11.985095684344952, "step": 1085 }, { "epoch": 0.2717377705492306, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65536170.666666664, "logits/rejected": -60796364.8, "logps/chosen": -428.63525390625, "logps/rejected": -625.8037109375, "loss": 0.0286, "rewards/chosen": 7.138812594943577, "rewards/margins": 17.146891615125867, "rewards/rejected": -10.008079020182292, "step": 1086 }, { "epoch": 0.27198798949080444, "grad_norm": 1.9453125, "kl": 0.1335500180721283, "learning_rate": 5e-06, "logits/chosen": -67314986.66666667, "logits/rejected": -68208704.0, "logps/chosen": -448.6257731119792, "logps/rejected": -595.0028076171875, "loss": 0.0236, "rewards/chosen": 8.418085734049479, "rewards/margins": 19.364306131998696, "rewards/rejected": -10.946220397949219, "step": 1087 }, { "epoch": 0.27223820843237834, "grad_norm": 11.9375, "kl": 2.4781596660614014, "learning_rate": 5e-06, "logits/chosen": -32897909.333333332, "logits/rejected": -50726010.666666664, "logps/chosen": -341.4776611328125, "logps/rejected": -386.9042154947917, "loss": 0.0575, "rewards/chosen": 5.447121302286784, "rewards/margins": 12.173222223917644, "rewards/rejected": -6.726100921630859, "step": 1088 }, { "epoch": 0.27248842737395224, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65734038.85714286, "logits/rejected": -37282688.0, "logps/chosen": -460.634765625, "logps/rejected": -557.724462890625, "loss": 0.0652, "rewards/chosen": 6.275933946881976, "rewards/margins": 17.17930613926479, "rewards/rejected": -10.903372192382813, "step": 1089 }, { "epoch": 0.2727386463155261, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53089733.81818182, "logits/rejected": -52116263.384615384, "logps/chosen": -341.99471768465907, "logps/rejected": -592.1624474158654, "loss": 0.0477, "rewards/chosen": 5.92734805020419, "rewards/margins": 17.439280423251066, "rewards/rejected": -11.511932373046875, "step": 1090 }, { "epoch": 0.2729888652571, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25469912.0, "logits/rejected": -56716392.0, "logps/chosen": -281.33465576171875, "logps/rejected": -462.2007141113281, "loss": 0.0713, "rewards/chosen": 5.469954490661621, "rewards/margins": 14.639933586120605, "rewards/rejected": -9.169979095458984, "step": 1091 }, { "epoch": 0.2732390841986738, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42062903.46666667, "logits/rejected": -56085603.55555555, "logps/chosen": -268.9640299479167, "logps/rejected": -600.7552083333334, "loss": 0.0727, "rewards/chosen": 4.227912902832031, "rewards/margins": 18.390953063964844, "rewards/rejected": -14.163040161132812, "step": 1092 }, { "epoch": 0.2734893031402477, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69614899.2, "logits/rejected": -55605156.571428575, "logps/chosen": -459.2021484375, "logps/rejected": -469.44960239955356, "loss": 0.0215, "rewards/chosen": 6.567465209960938, "rewards/margins": 16.48907645089286, "rewards/rejected": -9.92161124093192, "step": 1093 }, { "epoch": 0.2737395220818216, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3467280.0, "logits/rejected": -56018474.666666664, "logps/chosen": -385.5160807291667, "logps/rejected": -762.1418185763889, "loss": 0.0375, "rewards/chosen": 7.350739542643229, "rewards/margins": 24.25840861002604, "rewards/rejected": -16.907669067382812, "step": 1094 }, { "epoch": 0.27398974102339546, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41002224.0, "logits/rejected": -46760886.85714286, "logps/chosen": -317.456640625, "logps/rejected": -442.14481026785717, "loss": 0.0619, "rewards/chosen": 5.158559036254883, "rewards/margins": 15.3337582724435, "rewards/rejected": -10.175199236188616, "step": 1095 }, { "epoch": 0.27423995996496936, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66423336.0, "logits/rejected": 15578939.0, "logps/chosen": -598.3119506835938, "logps/rejected": -538.156005859375, "loss": 0.0326, "rewards/chosen": 9.851755142211914, "rewards/margins": 20.736827850341797, "rewards/rejected": -10.885072708129883, "step": 1096 }, { "epoch": 0.2744901789065432, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56380012.0, "logits/rejected": -45252272.0, "logps/chosen": -360.52008056640625, "logps/rejected": -775.1851196289062, "loss": 0.0491, "rewards/chosen": 4.268970966339111, "rewards/margins": 20.097017765045166, "rewards/rejected": -15.828046798706055, "step": 1097 }, { "epoch": 0.2747403978481171, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42099539.2, "logits/rejected": -70175158.85714285, "logps/chosen": -432.8919921875, "logps/rejected": -546.2735072544643, "loss": 0.0328, "rewards/chosen": 6.726247406005859, "rewards/margins": 15.323533085414342, "rewards/rejected": -8.597285679408483, "step": 1098 }, { "epoch": 0.274990616789691, "grad_norm": 16.0, "kl": 1.2806282043457031, "learning_rate": 5e-06, "logits/chosen": -55991517.86666667, "logits/rejected": -13366439.111111112, "logps/chosen": -308.357421875, "logps/rejected": -559.4840494791666, "loss": 0.0504, "rewards/chosen": 5.396641540527344, "rewards/margins": 18.67631564670139, "rewards/rejected": -13.279674106174046, "step": 1099 }, { "epoch": 0.27524083573126484, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57582818.13333333, "logits/rejected": -673201.7777777778, "logps/chosen": -400.09137369791665, "logps/rejected": -696.7108289930555, "loss": 0.0349, "rewards/chosen": 6.75584716796875, "rewards/margins": 24.74800075954861, "rewards/rejected": -17.99215359157986, "step": 1100 }, { "epoch": 0.27549105467283874, "grad_norm": 30.625, "kl": 0.6924750208854675, "learning_rate": 5e-06, "logits/chosen": -27497136.0, "logits/rejected": -29163402.0, "logps/chosen": -348.93359375, "logps/rejected": -368.3362121582031, "loss": 0.1269, "rewards/chosen": 3.9706897735595703, "rewards/margins": 14.764211654663086, "rewards/rejected": -10.793521881103516, "step": 1101 }, { "epoch": 0.2757412736144126, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50458805.333333336, "logits/rejected": -67763530.66666667, "logps/chosen": -436.042724609375, "logps/rejected": -732.9457194010416, "loss": 0.0332, "rewards/chosen": 7.310827891031901, "rewards/margins": 22.272210439046223, "rewards/rejected": -14.961382548014322, "step": 1102 }, { "epoch": 0.2759914925559865, "grad_norm": 11.375, "kl": 9.376516342163086, "learning_rate": 5e-06, "logits/chosen": -47450645.333333336, "logits/rejected": -84347980.8, "logps/chosen": -406.51036241319446, "logps/rejected": -679.2341796875, "loss": 0.095, "rewards/chosen": 8.338723076714409, "rewards/margins": 21.0160159640842, "rewards/rejected": -12.677292887369791, "step": 1103 }, { "epoch": 0.2762417114975604, "grad_norm": 17.625, "kl": 0.4985329508781433, "learning_rate": 5e-06, "logits/chosen": -58128019.692307696, "logits/rejected": -50909597.09090909, "logps/chosen": -375.994140625, "logps/rejected": -634.8025568181819, "loss": 0.0619, "rewards/chosen": 4.677800692044771, "rewards/margins": 16.36170516647659, "rewards/rejected": -11.683904474431818, "step": 1104 }, { "epoch": 0.2764919304391342, "grad_norm": 20.875, "kl": 2.386707305908203, "learning_rate": 5e-06, "logits/chosen": -38971032.0, "logits/rejected": -35238500.0, "logps/chosen": -517.1376342773438, "logps/rejected": -600.728759765625, "loss": 0.048, "rewards/chosen": 5.9778876304626465, "rewards/margins": 14.879157543182373, "rewards/rejected": -8.901269912719727, "step": 1105 }, { "epoch": 0.2767421493807081, "grad_norm": 18.5, "kl": 1.0896530151367188, "learning_rate": 5e-06, "logits/chosen": -69047163.07692307, "logits/rejected": -33687758.54545455, "logps/chosen": -485.11527193509613, "logps/rejected": -600.8192471590909, "loss": 0.0497, "rewards/chosen": 7.161865234375, "rewards/margins": 18.86987165971236, "rewards/rejected": -11.708006425337357, "step": 1106 }, { "epoch": 0.276992368322282, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57469159.384615384, "logits/rejected": -63527133.09090909, "logps/chosen": -482.9043719951923, "logps/rejected": -649.0275213068181, "loss": 0.0247, "rewards/chosen": 7.538060701810396, "rewards/margins": 19.453976451100168, "rewards/rejected": -11.915915749289773, "step": 1107 }, { "epoch": 0.27724258726385587, "grad_norm": 25.5, "kl": 0.8959074020385742, "learning_rate": 5e-06, "logits/chosen": -63813358.93333333, "logits/rejected": -71857265.77777778, "logps/chosen": -440.74720052083336, "logps/rejected": -692.888671875, "loss": 0.0772, "rewards/chosen": 5.711973571777344, "rewards/margins": 17.683696492513022, "rewards/rejected": -11.971722920735678, "step": 1108 }, { "epoch": 0.27749280620542977, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41586064.0, "logits/rejected": -29834989.714285713, "logps/chosen": -456.718115234375, "logps/rejected": -563.10986328125, "loss": 0.0284, "rewards/chosen": 6.393958663940429, "rewards/margins": 16.333298437935966, "rewards/rejected": -9.939339773995536, "step": 1109 }, { "epoch": 0.2777430251470036, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83759024.0, "logits/rejected": -74702080.0, "logps/chosen": -425.6722005208333, "logps/rejected": -599.1381022135416, "loss": 0.0148, "rewards/chosen": 7.529047012329102, "rewards/margins": 18.031349182128906, "rewards/rejected": -10.502302169799805, "step": 1110 }, { "epoch": 0.2779932440885775, "grad_norm": 3.359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49951896.88888889, "logits/rejected": -30285772.8, "logps/chosen": -428.3562282986111, "logps/rejected": -510.3357421875, "loss": 0.0186, "rewards/chosen": 6.390851762559679, "rewards/margins": 17.234177568223743, "rewards/rejected": -10.843325805664062, "step": 1111 }, { "epoch": 0.2782434630301514, "grad_norm": 4.5, "kl": 3.721956253051758, "learning_rate": 5e-06, "logits/chosen": -71663616.0, "logits/rejected": -63565701.81818182, "logps/chosen": -577.8170823317307, "logps/rejected": -530.6829723011364, "loss": 0.0284, "rewards/chosen": 8.188385009765625, "rewards/margins": 16.593276283957742, "rewards/rejected": -8.404891274192117, "step": 1112 }, { "epoch": 0.27849368197172525, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57338761.14285714, "logits/rejected": -11805608.8, "logps/chosen": -384.8462611607143, "logps/rejected": -459.12490234375, "loss": 0.0088, "rewards/chosen": 6.566524505615234, "rewards/margins": 14.983558654785156, "rewards/rejected": -8.417034149169922, "step": 1113 }, { "epoch": 0.27874390091329915, "grad_norm": 11.4375, "kl": 10.931575775146484, "learning_rate": 5e-06, "logits/chosen": -64109453.71428572, "logits/rejected": -73537651.2, "logps/chosen": -525.2949916294643, "logps/rejected": -424.23046875, "loss": 0.0329, "rewards/chosen": 7.816885811941964, "rewards/margins": 15.793410164969309, "rewards/rejected": -7.976524353027344, "step": 1114 }, { "epoch": 0.278994119854873, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35641701.333333336, "logits/rejected": -41233922.666666664, "logps/chosen": -297.7326253255208, "logps/rejected": -481.4709065755208, "loss": 0.0485, "rewards/chosen": 5.158966064453125, "rewards/margins": 16.710957845052086, "rewards/rejected": -11.551991780598959, "step": 1115 }, { "epoch": 0.2792443387964469, "grad_norm": 1.0234375, "kl": 5.5768256187438965, "learning_rate": 5e-06, "logits/chosen": -55892608.0, "logits/rejected": -60655173.81818182, "logps/chosen": -539.2142052283654, "logps/rejected": -671.0494939630681, "loss": 0.0022, "rewards/chosen": 8.648090069110577, "rewards/margins": 21.435173088020377, "rewards/rejected": -12.7870830189098, "step": 1116 }, { "epoch": 0.2794945577380208, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60317222.4, "logits/rejected": -35313513.14285714, "logps/chosen": -341.6595703125, "logps/rejected": -375.9506138392857, "loss": 0.0929, "rewards/chosen": 5.1135601043701175, "rewards/margins": 12.600681686401368, "rewards/rejected": -7.48712158203125, "step": 1117 }, { "epoch": 0.27974477667959463, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64883498.666666664, "logits/rejected": -78907754.66666667, "logps/chosen": -443.2113037109375, "logps/rejected": -552.0526123046875, "loss": 0.018, "rewards/chosen": 7.160554885864258, "rewards/margins": 19.559404373168945, "rewards/rejected": -12.398849487304688, "step": 1118 }, { "epoch": 0.27999499562116853, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44881655.27272727, "logits/rejected": -8537550.76923077, "logps/chosen": -447.947265625, "logps/rejected": -672.7516526442307, "loss": 0.0229, "rewards/chosen": 6.872282548384233, "rewards/margins": 21.806531252560916, "rewards/rejected": -14.934248704176683, "step": 1119 }, { "epoch": 0.2802452145627424, "grad_norm": 8.875, "kl": 0.3929786682128906, "learning_rate": 5e-06, "logits/chosen": -45486376.72727273, "logits/rejected": -21242176.0, "logps/chosen": -390.81356534090907, "logps/rejected": -469.76900540865387, "loss": 0.0495, "rewards/chosen": 5.598920995538885, "rewards/margins": 15.924116308038885, "rewards/rejected": -10.3251953125, "step": 1120 }, { "epoch": 0.28049543350431627, "grad_norm": 23.75, "kl": 22.835674285888672, "learning_rate": 5e-06, "logits/chosen": -49082532.0, "logits/rejected": -42672644.0, "logps/chosen": -392.3042907714844, "logps/rejected": -348.0960998535156, "loss": 0.1745, "rewards/chosen": 6.786016464233398, "rewards/margins": 14.288293838500977, "rewards/rejected": -7.502277374267578, "step": 1121 }, { "epoch": 0.28074565244589017, "grad_norm": 13.5, "kl": 10.35714340209961, "learning_rate": 5e-06, "logits/chosen": -45470136.0, "logits/rejected": -23907364.0, "logps/chosen": -535.1232299804688, "logps/rejected": -396.4907531738281, "loss": 0.0315, "rewards/chosen": 6.495186805725098, "rewards/margins": 14.931267738342285, "rewards/rejected": -8.436080932617188, "step": 1122 }, { "epoch": 0.280995871387464, "grad_norm": 7.40625, "kl": 5.8620924949646, "learning_rate": 5e-06, "logits/chosen": -20827218.82352941, "logits/rejected": -55563227.428571425, "logps/chosen": -456.75178079044116, "logps/rejected": -754.9725167410714, "loss": 0.0192, "rewards/chosen": 7.6631308162913605, "rewards/margins": 21.92915908428801, "rewards/rejected": -14.266028267996651, "step": 1123 }, { "epoch": 0.2812460903290379, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28628236.307692308, "logits/rejected": -44597704.72727273, "logps/chosen": -425.4797175480769, "logps/rejected": -522.7269176136364, "loss": 0.0373, "rewards/chosen": 5.29712148813101, "rewards/margins": 14.073091947115385, "rewards/rejected": -8.775970458984375, "step": 1124 }, { "epoch": 0.2814963092706118, "grad_norm": 7.0, "kl": 1.6981037855148315, "learning_rate": 5e-06, "logits/chosen": -43891998.11764706, "logits/rejected": -55080987.428571425, "logps/chosen": -351.44617417279414, "logps/rejected": -472.8842075892857, "loss": 0.0655, "rewards/chosen": 5.352763905244715, "rewards/margins": 15.160261202259225, "rewards/rejected": -9.807497297014509, "step": 1125 }, { "epoch": 0.28174652821218565, "grad_norm": 5.78125, "kl": 4.999650478363037, "learning_rate": 5e-06, "logits/chosen": -52355825.23076923, "logits/rejected": -29107421.09090909, "logps/chosen": -442.4424579326923, "logps/rejected": -578.0192649147727, "loss": 0.0116, "rewards/chosen": 6.517998915452224, "rewards/margins": 17.675476554390436, "rewards/rejected": -11.15747763893821, "step": 1126 }, { "epoch": 0.28199674715375955, "grad_norm": 2.828125, "kl": 2.381016254425049, "learning_rate": 5e-06, "logits/chosen": -39125462.85714286, "logits/rejected": -65015564.8, "logps/chosen": -394.45511300223217, "logps/rejected": -309.7927978515625, "loss": 0.0367, "rewards/chosen": 7.08624267578125, "rewards/margins": 15.39000244140625, "rewards/rejected": -8.303759765625, "step": 1127 }, { "epoch": 0.2822469660953334, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42608813.333333336, "logits/rejected": -39809245.333333336, "logps/chosen": -308.09075927734375, "logps/rejected": -623.582763671875, "loss": 0.0235, "rewards/chosen": 6.037115097045898, "rewards/margins": 19.856931686401367, "rewards/rejected": -13.819816589355469, "step": 1128 }, { "epoch": 0.2824971850369073, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55244179.692307696, "logits/rejected": -33944610.90909091, "logps/chosen": -439.1571514423077, "logps/rejected": -503.52370383522725, "loss": 0.03, "rewards/chosen": 7.263636662409856, "rewards/margins": 15.412269752342386, "rewards/rejected": -8.14863308993253, "step": 1129 }, { "epoch": 0.2827474039784812, "grad_norm": 14.1875, "kl": 3.562600612640381, "learning_rate": 5e-06, "logits/chosen": -37833376.0, "logits/rejected": -37308688.0, "logps/chosen": -436.9351501464844, "logps/rejected": -576.4046630859375, "loss": 0.0425, "rewards/chosen": 6.774343013763428, "rewards/margins": 19.837724208831787, "rewards/rejected": -13.06338119506836, "step": 1130 }, { "epoch": 0.28299762292005504, "grad_norm": 6.9375, "kl": 0.6583760976791382, "learning_rate": 5e-06, "logits/chosen": -17543952.0, "logits/rejected": -43305002.666666664, "logps/chosen": -411.9682210286458, "logps/rejected": -449.27587890625, "loss": 0.0403, "rewards/chosen": 7.3061097462972, "rewards/margins": 15.605206807454426, "rewards/rejected": -8.299097061157227, "step": 1131 }, { "epoch": 0.28324784186162894, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11877948.0, "logits/rejected": -52788173.71428572, "logps/chosen": -280.8844482421875, "logps/rejected": -757.8016880580357, "loss": 0.0684, "rewards/chosen": 6.709333801269532, "rewards/margins": 18.47840292794364, "rewards/rejected": -11.769069126674108, "step": 1132 }, { "epoch": 0.2834980608032028, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24925646.222222224, "logits/rejected": -51881169.06666667, "logps/chosen": -198.14195421006946, "logps/rejected": -663.5908854166667, "loss": 0.0815, "rewards/chosen": 4.016237894694011, "rewards/margins": 16.9124506632487, "rewards/rejected": -12.896212768554687, "step": 1133 }, { "epoch": 0.2837482797447767, "grad_norm": 11.3125, "kl": 1.8512611389160156, "learning_rate": 5e-06, "logits/chosen": -41117060.266666666, "logits/rejected": -31486176.0, "logps/chosen": -387.18040364583334, "logps/rejected": -724.8056640625, "loss": 0.0513, "rewards/chosen": 7.161700439453125, "rewards/margins": 18.074291314019096, "rewards/rejected": -10.912590874565971, "step": 1134 }, { "epoch": 0.2839984986863506, "grad_norm": 19.125, "kl": 2.806396484375, "learning_rate": 5e-06, "logits/chosen": -72035584.0, "logits/rejected": -39650119.11111111, "logps/chosen": -373.69049479166665, "logps/rejected": -530.0896267361111, "loss": 0.095, "rewards/chosen": 6.401496887207031, "rewards/margins": 16.91127438015408, "rewards/rejected": -10.509777492947048, "step": 1135 }, { "epoch": 0.2842487176279244, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52554972.44444445, "logits/rejected": -63235443.2, "logps/chosen": -467.20838758680554, "logps/rejected": -588.9477864583333, "loss": 0.0425, "rewards/chosen": 8.019686381022135, "rewards/margins": 17.684647115071613, "rewards/rejected": -9.66496073404948, "step": 1136 }, { "epoch": 0.2844989365694983, "grad_norm": 4.0625, "kl": 14.149798393249512, "learning_rate": 5e-06, "logits/chosen": -75866186.66666667, "logits/rejected": -21808450.666666668, "logps/chosen": -492.2015380859375, "logps/rejected": -441.0258382161458, "loss": 0.0502, "rewards/chosen": 7.840506235758464, "rewards/margins": 17.053593317667644, "rewards/rejected": -9.21308708190918, "step": 1137 }, { "epoch": 0.28474915551107216, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16061280.0, "logits/rejected": -57771382.85714286, "logps/chosen": -292.27177734375, "logps/rejected": -742.0191127232143, "loss": 0.0383, "rewards/chosen": 5.383618927001953, "rewards/margins": 16.250819505964007, "rewards/rejected": -10.867200578962054, "step": 1138 }, { "epoch": 0.28499937445264606, "grad_norm": 10.0, "kl": 4.595436096191406, "learning_rate": 5e-06, "logits/chosen": -44831984.0, "logits/rejected": -38189504.0, "logps/chosen": -460.630908203125, "logps/rejected": -546.166015625, "loss": 0.043, "rewards/chosen": 7.146396636962891, "rewards/margins": 18.307462964739116, "rewards/rejected": -11.161066327776227, "step": 1139 }, { "epoch": 0.28524959339421996, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52034154.666666664, "logits/rejected": -58314090.666666664, "logps/chosen": -493.29443359375, "logps/rejected": -709.8943684895834, "loss": 0.0318, "rewards/chosen": 7.900133768717448, "rewards/margins": 19.16943868001302, "rewards/rejected": -11.269304911295572, "step": 1140 }, { "epoch": 0.2854998123357938, "grad_norm": 22.375, "kl": 1.6676957607269287, "learning_rate": 5e-06, "logits/chosen": 14674387.2, "logits/rejected": -42041792.0, "logps/chosen": -507.386865234375, "logps/rejected": -522.7941545758929, "loss": 0.047, "rewards/chosen": 8.240525817871093, "rewards/margins": 18.35200980050223, "rewards/rejected": -10.111483982631139, "step": 1141 }, { "epoch": 0.2857500312773677, "grad_norm": 6.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58925924.571428575, "logits/rejected": -47231235.76470588, "logps/chosen": -377.67654854910717, "logps/rejected": -561.3901654411765, "loss": 0.0308, "rewards/chosen": 6.969936915806362, "rewards/margins": 17.053445575617943, "rewards/rejected": -10.083508659811582, "step": 1142 }, { "epoch": 0.2860002502189416, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67890048.0, "logits/rejected": -37485654.15384615, "logps/chosen": -460.69588955965907, "logps/rejected": -652.5498798076923, "loss": 0.0125, "rewards/chosen": 8.55354863947088, "rewards/margins": 19.06226375553158, "rewards/rejected": -10.508715116060698, "step": 1143 }, { "epoch": 0.28625046916051544, "grad_norm": 8.1875, "kl": 0.6028093099594116, "learning_rate": 5e-06, "logits/chosen": -51619936.0, "logits/rejected": -47086843.428571425, "logps/chosen": -472.382421875, "logps/rejected": -444.4257114955357, "loss": 0.0216, "rewards/chosen": 7.286181640625, "rewards/margins": 16.01197466169085, "rewards/rejected": -8.725793021065849, "step": 1144 }, { "epoch": 0.28650068810208934, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77244224.0, "logits/rejected": -71108745.14285715, "logps/chosen": -501.0978515625, "logps/rejected": -574.0147879464286, "loss": 0.0769, "rewards/chosen": 7.306327819824219, "rewards/margins": 18.51054164341518, "rewards/rejected": -11.20421382359096, "step": 1145 }, { "epoch": 0.2867509070436632, "grad_norm": 23.375, "kl": 3.8275184631347656, "learning_rate": 5e-06, "logits/chosen": -48584009.14285714, "logits/rejected": -41102422.4, "logps/chosen": -307.35843331473217, "logps/rejected": -454.167822265625, "loss": 0.1518, "rewards/chosen": 4.919449397495815, "rewards/margins": 14.319527980259487, "rewards/rejected": -9.400078582763673, "step": 1146 }, { "epoch": 0.2870011259852371, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33572341.333333336, "logits/rejected": -56828341.333333336, "logps/chosen": -252.9339599609375, "logps/rejected": -638.4321695963541, "loss": 0.0473, "rewards/chosen": 5.4500579833984375, "rewards/margins": 17.980410257975258, "rewards/rejected": -12.530352274576822, "step": 1147 }, { "epoch": 0.287251344926811, "grad_norm": 4.34375, "kl": 6.115841865539551, "learning_rate": 5e-06, "logits/chosen": -46525792.0, "logits/rejected": -55102822.4, "logps/chosen": -290.5564662388393, "logps/rejected": -481.272119140625, "loss": 0.0699, "rewards/chosen": 5.91736820765904, "rewards/margins": 16.89659641810826, "rewards/rejected": -10.979228210449218, "step": 1148 }, { "epoch": 0.2875015638683848, "grad_norm": 3.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -86422912.0, "logits/rejected": -29018448.0, "logps/chosen": -457.31138392857144, "logps/rejected": -504.507421875, "loss": 0.0263, "rewards/chosen": 7.788644518171038, "rewards/margins": 18.48494197300502, "rewards/rejected": -10.696297454833985, "step": 1149 }, { "epoch": 0.2877517828099587, "grad_norm": 4.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35256182.15384615, "logits/rejected": -47389853.09090909, "logps/chosen": -410.7327223557692, "logps/rejected": -410.20725319602275, "loss": 0.0304, "rewards/chosen": 6.891871525691106, "rewards/margins": 14.435614312445367, "rewards/rejected": -7.543742786754262, "step": 1150 }, { "epoch": 0.28800200175153257, "grad_norm": 14.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42627355.428571425, "logits/rejected": -40488342.4, "logps/chosen": -534.7102399553571, "logps/rejected": -459.3259765625, "loss": 0.0259, "rewards/chosen": 8.4393310546875, "rewards/margins": 18.26767578125, "rewards/rejected": -9.8283447265625, "step": 1151 }, { "epoch": 0.28825222069310646, "grad_norm": 11.125, "kl": 1.3041725158691406, "learning_rate": 5e-06, "logits/chosen": -60510795.63636363, "logits/rejected": 453984.0, "logps/chosen": -397.06010298295456, "logps/rejected": -455.00439453125, "loss": 0.0311, "rewards/chosen": 6.105116410688921, "rewards/margins": 15.377516313032672, "rewards/rejected": -9.27239990234375, "step": 1152 }, { "epoch": 0.28850243963468036, "grad_norm": 14.5625, "kl": 5.840234279632568, "learning_rate": 5e-06, "logits/chosen": -65389233.23076923, "logits/rejected": -71592244.36363636, "logps/chosen": -345.34337439903845, "logps/rejected": -582.2925248579545, "loss": 0.0411, "rewards/chosen": 7.0215301513671875, "rewards/margins": 20.155201305042613, "rewards/rejected": -13.133671153675426, "step": 1153 }, { "epoch": 0.2887526585762542, "grad_norm": 25.0, "kl": 11.057903289794922, "learning_rate": 5e-06, "logits/chosen": -55171406.222222224, "logits/rejected": -102561578.66666667, "logps/chosen": -419.52362738715277, "logps/rejected": -601.3760579427084, "loss": 0.0741, "rewards/chosen": 6.599257998996311, "rewards/margins": 19.258938683403862, "rewards/rejected": -12.659680684407553, "step": 1154 }, { "epoch": 0.2890028775178281, "grad_norm": 11.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43943498.666666664, "logits/rejected": -38566464.0, "logps/chosen": -313.95220947265625, "logps/rejected": -437.9047037760417, "loss": 0.0373, "rewards/chosen": 5.543373107910156, "rewards/margins": 14.59536043802897, "rewards/rejected": -9.051987330118815, "step": 1155 }, { "epoch": 0.289253096459402, "grad_norm": 8.1875, "kl": 1.2169806957244873, "learning_rate": 5e-06, "logits/chosen": -32880795.076923076, "logits/rejected": -27842656.0, "logps/chosen": -305.5837965745192, "logps/rejected": -474.1767578125, "loss": 0.0405, "rewards/chosen": 6.181807884803185, "rewards/margins": 13.862743831181026, "rewards/rejected": -7.680935946377841, "step": 1156 }, { "epoch": 0.28950331540097585, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76837461.33333333, "logits/rejected": -55483170.13333333, "logps/chosen": -304.2406412760417, "logps/rejected": -763.5622395833333, "loss": 0.0402, "rewards/chosen": 5.299874623616536, "rewards/margins": 19.165586090087892, "rewards/rejected": -13.865711466471355, "step": 1157 }, { "epoch": 0.28975353434254975, "grad_norm": 7.71875, "kl": 11.241491317749023, "learning_rate": 5e-06, "logits/chosen": -53666258.28571428, "logits/rejected": -55073804.8, "logps/chosen": -441.96578543526783, "logps/rejected": -634.8125, "loss": 0.0181, "rewards/chosen": 7.33161871773856, "rewards/margins": 18.380584171840123, "rewards/rejected": -11.048965454101562, "step": 1158 }, { "epoch": 0.2900037532841236, "grad_norm": 8.625, "kl": 8.927696228027344, "learning_rate": 5e-06, "logits/chosen": 3694013.3333333335, "logits/rejected": -76322346.66666667, "logps/chosen": -448.2857666015625, "logps/rejected": -624.241943359375, "loss": 0.0476, "rewards/chosen": 7.467383702596028, "rewards/margins": 17.04286066691081, "rewards/rejected": -9.57547696431478, "step": 1159 }, { "epoch": 0.2902539722256975, "grad_norm": 20.25, "kl": 5.393974304199219, "learning_rate": 5e-06, "logits/chosen": -43851430.4, "logits/rejected": -55026546.28571428, "logps/chosen": -394.58173828125, "logps/rejected": -543.3825334821429, "loss": 0.0729, "rewards/chosen": 8.295943450927734, "rewards/margins": 16.72632250104632, "rewards/rejected": -8.430379050118583, "step": 1160 }, { "epoch": 0.2905041911672714, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27270582.85714286, "logits/rejected": -74103878.4, "logps/chosen": -331.34287806919644, "logps/rejected": -499.31845703125, "loss": 0.0486, "rewards/chosen": 6.761083330426898, "rewards/margins": 16.667295183454243, "rewards/rejected": -9.906211853027344, "step": 1161 }, { "epoch": 0.29075441010884523, "grad_norm": 7.0625, "kl": 11.803191184997559, "learning_rate": 5e-06, "logits/chosen": -48919748.266666666, "logits/rejected": -11062577.777777778, "logps/chosen": -406.24427083333336, "logps/rejected": -430.6764322916667, "loss": 0.0281, "rewards/chosen": 7.299162801106771, "rewards/margins": 15.235323418511285, "rewards/rejected": -7.936160617404514, "step": 1162 }, { "epoch": 0.29100462905041913, "grad_norm": 2.359375, "kl": 5.95670747756958, "learning_rate": 5e-06, "logits/chosen": -47983514.18181818, "logits/rejected": -58994697.84615385, "logps/chosen": -440.89888139204544, "logps/rejected": -572.2101111778846, "loss": 0.0056, "rewards/chosen": 8.965993707830256, "rewards/margins": 18.987236289711266, "rewards/rejected": -10.02124258188101, "step": 1163 }, { "epoch": 0.29125484799199297, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51669549.71428572, "logits/rejected": -54192704.0, "logps/chosen": -332.2840053013393, "logps/rejected": -748.29404296875, "loss": 0.047, "rewards/chosen": 7.1476576668875555, "rewards/margins": 23.29574268886021, "rewards/rejected": -16.148085021972655, "step": 1164 }, { "epoch": 0.29150506693356687, "grad_norm": 13.875, "kl": 7.269364833831787, "learning_rate": 5e-06, "logits/chosen": -39548300.8, "logits/rejected": -59502286.222222224, "logps/chosen": -378.7070638020833, "logps/rejected": -472.37651909722223, "loss": 0.1074, "rewards/chosen": 6.256101481119791, "rewards/margins": 14.99893069797092, "rewards/rejected": -8.742829216851128, "step": 1165 }, { "epoch": 0.29175528587514077, "grad_norm": 11.875, "kl": 10.952193260192871, "learning_rate": 5e-06, "logits/chosen": -42194747.07692308, "logits/rejected": -52604928.0, "logps/chosen": -386.5329777644231, "logps/rejected": -546.6471058238636, "loss": 0.0558, "rewards/chosen": 8.422494741586538, "rewards/margins": 16.069486818113525, "rewards/rejected": -7.646992076526988, "step": 1166 }, { "epoch": 0.2920055048167146, "grad_norm": 9.125, "kl": 10.713637351989746, "learning_rate": 5e-06, "logits/chosen": -35048576.0, "logits/rejected": -59720981.333333336, "logps/chosen": -449.3466796875, "logps/rejected": -522.2677951388889, "loss": 0.0232, "rewards/chosen": 8.380671183268229, "rewards/margins": 17.041798909505207, "rewards/rejected": -8.661127726236979, "step": 1167 }, { "epoch": 0.2922557237582885, "grad_norm": 8.0, "kl": 0.8730294704437256, "learning_rate": 5e-06, "logits/chosen": -32253632.0, "logits/rejected": -43578926.54545455, "logps/chosen": -340.8405198317308, "logps/rejected": -659.3392666903409, "loss": 0.074, "rewards/chosen": 5.842277526855469, "rewards/margins": 15.458738153631037, "rewards/rejected": -9.616460626775568, "step": 1168 }, { "epoch": 0.29250594269986235, "grad_norm": 11.1875, "kl": 6.464358329772949, "learning_rate": 5e-06, "logits/chosen": -18985476.0, "logits/rejected": -61411680.0, "logps/chosen": -442.2316080729167, "logps/rejected": -709.6593424479166, "loss": 0.046, "rewards/chosen": 6.438519795735677, "rewards/margins": 17.92563756306966, "rewards/rejected": -11.487117767333984, "step": 1169 }, { "epoch": 0.29275616164143625, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69223529.14285715, "logits/rejected": -55853152.0, "logps/chosen": -409.9129115513393, "logps/rejected": -782.859033203125, "loss": 0.086, "rewards/chosen": 7.345761980329241, "rewards/margins": 21.801278032575333, "rewards/rejected": -14.455516052246093, "step": 1170 }, { "epoch": 0.29300638058301015, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40741317.81818182, "logits/rejected": -50016777.84615385, "logps/chosen": -326.20478959517044, "logps/rejected": -678.9094050480769, "loss": 0.049, "rewards/chosen": 6.4317543723366475, "rewards/margins": 14.607239836579435, "rewards/rejected": -8.175485464242788, "step": 1171 }, { "epoch": 0.293256599524584, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43050307.2, "logits/rejected": -50048731.428571425, "logps/chosen": -315.14072265625, "logps/rejected": -391.5376674107143, "loss": 0.0469, "rewards/chosen": 5.647267913818359, "rewards/margins": 13.675783320835658, "rewards/rejected": -8.028515407017299, "step": 1172 }, { "epoch": 0.2935068184661579, "grad_norm": 25.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73290880.0, "logits/rejected": -41485750.85714286, "logps/chosen": -482.339599609375, "logps/rejected": -607.3962053571429, "loss": 0.0504, "rewards/chosen": 8.916798400878907, "rewards/margins": 17.531564113071987, "rewards/rejected": -8.61476571219308, "step": 1173 }, { "epoch": 0.2937570374077318, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33187360.0, "logits/rejected": -47261147.428571425, "logps/chosen": -487.595263671875, "logps/rejected": -420.5537109375, "loss": 0.0362, "rewards/chosen": 8.267884826660156, "rewards/margins": 16.589737919398715, "rewards/rejected": -8.32185309273856, "step": 1174 }, { "epoch": 0.29400725634930563, "grad_norm": 8.0625, "kl": 4.510001182556152, "learning_rate": 5e-06, "logits/chosen": -51537077.333333336, "logits/rejected": -5749093.333333333, "logps/chosen": -492.6954345703125, "logps/rejected": -454.897705078125, "loss": 0.0489, "rewards/chosen": 7.210700988769531, "rewards/margins": 15.760632832845053, "rewards/rejected": -8.549931844075521, "step": 1175 }, { "epoch": 0.29425747529087953, "grad_norm": 11.3125, "kl": 9.470272064208984, "learning_rate": 5e-06, "logits/chosen": -43001432.615384616, "logits/rejected": -45236808.72727273, "logps/chosen": -346.83559945913464, "logps/rejected": -406.42356178977275, "loss": 0.0954, "rewards/chosen": 5.944509652944712, "rewards/margins": 12.52782818987653, "rewards/rejected": -6.583318536931818, "step": 1176 }, { "epoch": 0.2945076942324534, "grad_norm": 14.1875, "kl": 2.1260085105895996, "learning_rate": 5e-06, "logits/chosen": -56938752.0, "logits/rejected": -35727936.0, "logps/chosen": -394.81507161458336, "logps/rejected": -559.5191514756945, "loss": 0.0552, "rewards/chosen": 6.616814676920573, "rewards/margins": 16.59097985161675, "rewards/rejected": -9.97416517469618, "step": 1177 }, { "epoch": 0.2947579131740273, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35963780.571428575, "logits/rejected": -40654284.8, "logps/chosen": -367.4828404017857, "logps/rejected": -555.4779296875, "loss": 0.0538, "rewards/chosen": 6.843018123081753, "rewards/margins": 18.400052424839565, "rewards/rejected": -11.557034301757813, "step": 1178 }, { "epoch": 0.2950081321156012, "grad_norm": 4.65625, "kl": 0.869391143321991, "learning_rate": 5e-06, "logits/chosen": -64680185.6, "logits/rejected": -46269709.71428572, "logps/chosen": -296.861865234375, "logps/rejected": -482.86293247767856, "loss": 0.0642, "rewards/chosen": 6.163364028930664, "rewards/margins": 16.856327765328544, "rewards/rejected": -10.69296373639788, "step": 1179 }, { "epoch": 0.295258351057175, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52071910.4, "logits/rejected": -62973454.222222224, "logps/chosen": -400.16155598958335, "logps/rejected": -759.7228732638889, "loss": 0.0406, "rewards/chosen": 6.897824605305989, "rewards/margins": 20.841357760959202, "rewards/rejected": -13.943533155653212, "step": 1180 }, { "epoch": 0.2955085699987489, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35593959.384615384, "logits/rejected": -50709364.36363637, "logps/chosen": -345.9950420673077, "logps/rejected": -656.3503639914773, "loss": 0.0783, "rewards/chosen": 5.690041175255408, "rewards/margins": 18.259755808156687, "rewards/rejected": -12.56971463290128, "step": 1181 }, { "epoch": 0.29575878894032276, "grad_norm": 10.375, "kl": 9.542585372924805, "learning_rate": 5e-06, "logits/chosen": -49999276.8, "logits/rejected": -56117622.85714286, "logps/chosen": -392.2707763671875, "logps/rejected": -645.2907366071429, "loss": 0.0812, "rewards/chosen": 7.23094482421875, "rewards/margins": 17.091972133091517, "rewards/rejected": -9.861027308872767, "step": 1182 }, { "epoch": 0.29600900788189666, "grad_norm": 7.53125, "kl": 1.3462190628051758, "learning_rate": 5e-06, "logits/chosen": -73396597.33333333, "logits/rejected": -58261290.666666664, "logps/chosen": -489.7920328776042, "logps/rejected": -587.763427734375, "loss": 0.0643, "rewards/chosen": 7.031253814697266, "rewards/margins": 16.520645141601562, "rewards/rejected": -9.489391326904297, "step": 1183 }, { "epoch": 0.29625922682347056, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44937866.666666664, "logits/rejected": -57909968.0, "logps/chosen": -295.5955810546875, "logps/rejected": -518.7527262369791, "loss": 0.0848, "rewards/chosen": 4.8132584889729815, "rewards/margins": 15.341567993164062, "rewards/rejected": -10.52830950419108, "step": 1184 }, { "epoch": 0.2965094457650444, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66403982.222222224, "logits/rejected": -68882841.6, "logps/chosen": -338.1589084201389, "logps/rejected": -577.3327473958333, "loss": 0.0652, "rewards/chosen": 4.731848822699653, "rewards/margins": 17.06589830186632, "rewards/rejected": -12.334049479166667, "step": 1185 }, { "epoch": 0.2967596647066183, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59508433.45454545, "logits/rejected": -60182759.384615384, "logps/chosen": -320.5403497869318, "logps/rejected": -695.4613131009615, "loss": 0.0464, "rewards/chosen": 5.647186972878196, "rewards/margins": 18.00622627951882, "rewards/rejected": -12.359039306640625, "step": 1186 }, { "epoch": 0.29700988364819214, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63789216.0, "logits/rejected": -67286698.66666667, "logps/chosen": -243.3569539388021, "logps/rejected": -369.5504150390625, "loss": 0.11, "rewards/chosen": 5.065320650736491, "rewards/margins": 12.966883023579916, "rewards/rejected": -7.901562372843425, "step": 1187 }, { "epoch": 0.29726010258976604, "grad_norm": 19.375, "kl": 3.0311450958251953, "learning_rate": 5e-06, "logits/chosen": -52727392.0, "logits/rejected": -45968368.0, "logps/chosen": -317.70501708984375, "logps/rejected": -490.6730651855469, "loss": 0.1226, "rewards/chosen": 4.367021560668945, "rewards/margins": 14.081737518310547, "rewards/rejected": -9.714715957641602, "step": 1188 }, { "epoch": 0.29751032153133994, "grad_norm": 9.0625, "kl": 0.4523735046386719, "learning_rate": 5e-06, "logits/chosen": -80183897.6, "logits/rejected": -49963616.0, "logps/chosen": -404.010986328125, "logps/rejected": -511.8562709263393, "loss": 0.0162, "rewards/chosen": 7.27861328125, "rewards/margins": 18.591876220703124, "rewards/rejected": -11.313262939453125, "step": 1189 }, { "epoch": 0.2977605404729138, "grad_norm": 2.5625, "kl": 11.088561058044434, "learning_rate": 5e-06, "logits/chosen": -60941792.0, "logits/rejected": -66404777.14285714, "logps/chosen": -494.26298828125, "logps/rejected": -569.3899274553571, "loss": 0.0119, "rewards/chosen": 8.699906921386718, "rewards/margins": 21.065955461774553, "rewards/rejected": -12.366048540387835, "step": 1190 }, { "epoch": 0.2980107594144877, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30597309.53846154, "logits/rejected": -50105530.18181818, "logps/chosen": -375.7626953125, "logps/rejected": -610.1363192471591, "loss": 0.027, "rewards/chosen": 6.560080895057092, "rewards/margins": 18.856131920447716, "rewards/rejected": -12.296051025390625, "step": 1191 }, { "epoch": 0.2982609783560616, "grad_norm": 1.9609375, "kl": 0.9644635915756226, "learning_rate": 5e-06, "logits/chosen": -65544477.538461536, "logits/rejected": -58500642.90909091, "logps/chosen": -427.7375676081731, "logps/rejected": -650.7820046164773, "loss": 0.0305, "rewards/chosen": 6.857108482947717, "rewards/margins": 18.837696528934934, "rewards/rejected": -11.980588045987217, "step": 1192 }, { "epoch": 0.2985111972976354, "grad_norm": 29.75, "kl": 10.394487380981445, "learning_rate": 5e-06, "logits/chosen": -26408361.14285714, "logits/rejected": -64922188.8, "logps/chosen": -380.1547154017857, "logps/rejected": -435.6400390625, "loss": 0.1729, "rewards/chosen": 4.728291102818081, "rewards/margins": 12.919478389195035, "rewards/rejected": -8.191187286376953, "step": 1193 }, { "epoch": 0.2987614162392093, "grad_norm": 6.75, "kl": 2.676608085632324, "learning_rate": 5e-06, "logits/chosen": -51292818.28571428, "logits/rejected": -45942102.4, "logps/chosen": -453.580810546875, "logps/rejected": -500.6263671875, "loss": 0.0254, "rewards/chosen": 7.770008632114956, "rewards/margins": 19.930053492954798, "rewards/rejected": -12.160044860839843, "step": 1194 }, { "epoch": 0.29901163518078316, "grad_norm": 6.34375, "kl": 10.00429916381836, "learning_rate": 5e-06, "logits/chosen": -49094005.333333336, "logits/rejected": -52719712.0, "logps/chosen": -404.1981201171875, "logps/rejected": -633.2005615234375, "loss": 0.0266, "rewards/chosen": 7.785235087076823, "rewards/margins": 20.702952067057293, "rewards/rejected": -12.917716979980469, "step": 1195 }, { "epoch": 0.29926185412235706, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32333948.444444444, "logits/rejected": -54345403.733333334, "logps/chosen": -315.71869574652777, "logps/rejected": -579.2310546875, "loss": 0.0464, "rewards/chosen": 5.535135904947917, "rewards/margins": 15.939619954427084, "rewards/rejected": -10.404484049479167, "step": 1196 }, { "epoch": 0.29951207306393096, "grad_norm": 6.59375, "kl": 2.8395156860351562, "learning_rate": 5e-06, "logits/chosen": -43056997.333333336, "logits/rejected": -29393584.0, "logps/chosen": -365.9730631510417, "logps/rejected": -471.6285400390625, "loss": 0.0421, "rewards/chosen": 6.637304306030273, "rewards/margins": 15.41088612874349, "rewards/rejected": -8.773581822713217, "step": 1197 }, { "epoch": 0.2997622920055048, "grad_norm": 4.8125, "kl": 4.63289213180542, "learning_rate": 5e-06, "logits/chosen": -38853734.4, "logits/rejected": -30732472.888888888, "logps/chosen": -456.22470703125, "logps/rejected": -479.3478732638889, "loss": 0.0166, "rewards/chosen": 7.875729370117187, "rewards/margins": 15.737787373860677, "rewards/rejected": -7.862058003743489, "step": 1198 }, { "epoch": 0.3000125109470787, "grad_norm": 10.0625, "kl": 11.848552703857422, "learning_rate": 5e-06, "logits/chosen": -64484292.571428575, "logits/rejected": -58322188.8, "logps/chosen": -385.57470703125, "logps/rejected": -657.2455078125, "loss": 0.0863, "rewards/chosen": 6.681431361607143, "rewards/margins": 17.403420802525112, "rewards/rejected": -10.72198944091797, "step": 1199 }, { "epoch": 0.30026272988865255, "grad_norm": 2.734375, "kl": 4.273982048034668, "learning_rate": 5e-06, "logits/chosen": -84237008.0, "logits/rejected": -52003640.0, "logps/chosen": -547.9490356445312, "logps/rejected": -717.3812866210938, "loss": 0.0038, "rewards/chosen": 10.082688331604004, "rewards/margins": 22.521946907043457, "rewards/rejected": -12.439258575439453, "step": 1200 }, { "epoch": 0.30051294883022645, "grad_norm": 5.8125, "kl": 2.1367499828338623, "learning_rate": 5e-06, "logits/chosen": -47991885.71428572, "logits/rejected": -37787616.0, "logps/chosen": -421.96212332589283, "logps/rejected": -646.25380859375, "loss": 0.062, "rewards/chosen": 7.607967921665737, "rewards/margins": 18.67392098563058, "rewards/rejected": -11.065953063964844, "step": 1201 }, { "epoch": 0.30076316777180034, "grad_norm": 19.875, "kl": 20.412845611572266, "learning_rate": 5e-06, "logits/chosen": -57176690.28571428, "logits/rejected": -41773651.2, "logps/chosen": -496.66183035714283, "logps/rejected": -485.713720703125, "loss": 0.0505, "rewards/chosen": 8.625023978097099, "rewards/margins": 16.44388918195452, "rewards/rejected": -7.818865203857422, "step": 1202 }, { "epoch": 0.3010133867133742, "grad_norm": 6.09375, "kl": 7.945102691650391, "learning_rate": 5e-06, "logits/chosen": -41518966.15384615, "logits/rejected": -54993536.0, "logps/chosen": -436.09761868990387, "logps/rejected": -662.8452592329545, "loss": 0.0415, "rewards/chosen": 7.948289724496695, "rewards/margins": 18.823940303775814, "rewards/rejected": -10.87565057927912, "step": 1203 }, { "epoch": 0.3012636056549481, "grad_norm": 13.1875, "kl": 12.051488876342773, "learning_rate": 5e-06, "logits/chosen": -47792244.0, "logits/rejected": -47254648.0, "logps/chosen": -327.1128234863281, "logps/rejected": -590.921142578125, "loss": 0.0755, "rewards/chosen": 6.375910758972168, "rewards/margins": 17.78371524810791, "rewards/rejected": -11.407804489135742, "step": 1204 }, { "epoch": 0.301513824596522, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43345376.0, "logits/rejected": -26328621.714285713, "logps/chosen": -332.8568115234375, "logps/rejected": -507.40869140625, "loss": 0.0445, "rewards/chosen": 5.435160827636719, "rewards/margins": 13.513526698521204, "rewards/rejected": -8.078365870884486, "step": 1205 }, { "epoch": 0.3017640435380958, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44591864.88888889, "logits/rejected": -82771959.46666667, "logps/chosen": -434.6588541666667, "logps/rejected": -862.0003255208334, "loss": 0.0452, "rewards/chosen": 8.864386664496529, "rewards/margins": 22.953476630316842, "rewards/rejected": -14.089089965820312, "step": 1206 }, { "epoch": 0.3020142624796697, "grad_norm": 7.90625, "kl": 5.905923366546631, "learning_rate": 5e-06, "logits/chosen": -46774262.15384615, "logits/rejected": -14435822.545454545, "logps/chosen": -339.00255408653845, "logps/rejected": -408.1629083806818, "loss": 0.0729, "rewards/chosen": 6.62992448073167, "rewards/margins": 13.8753461504316, "rewards/rejected": -7.245421669699929, "step": 1207 }, { "epoch": 0.30226448142124357, "grad_norm": 4.125, "kl": 5.0011420249938965, "learning_rate": 5e-06, "logits/chosen": -35347517.71428572, "logits/rejected": -54376166.4, "logps/chosen": -444.19088309151783, "logps/rejected": -581.9115234375, "loss": 0.0375, "rewards/chosen": 7.872530800955636, "rewards/margins": 17.909774453299388, "rewards/rejected": -10.03724365234375, "step": 1208 }, { "epoch": 0.30251470036281747, "grad_norm": 15.0, "kl": 11.214672088623047, "learning_rate": 5e-06, "logits/chosen": -54350061.176470585, "logits/rejected": -40607730.28571428, "logps/chosen": -414.0768037683824, "logps/rejected": -440.0709751674107, "loss": 0.0779, "rewards/chosen": 7.746686150045956, "rewards/margins": 15.269918201350364, "rewards/rejected": -7.523232051304409, "step": 1209 }, { "epoch": 0.30276491930439137, "grad_norm": 8.5, "kl": 14.319580078125, "learning_rate": 5e-06, "logits/chosen": -76032904.53333333, "logits/rejected": -63509973.333333336, "logps/chosen": -416.6420572916667, "logps/rejected": -630.6883680555555, "loss": 0.0843, "rewards/chosen": 8.988008626302083, "rewards/margins": 19.222793070475262, "rewards/rejected": -10.234784444173178, "step": 1210 }, { "epoch": 0.3030151382459652, "grad_norm": 13.8125, "kl": 4.323009490966797, "learning_rate": 5e-06, "logits/chosen": -40642581.333333336, "logits/rejected": -53383825.06666667, "logps/chosen": -470.5436197916667, "logps/rejected": -466.6597005208333, "loss": 0.1265, "rewards/chosen": 7.391743977864583, "rewards/margins": 16.222974650065105, "rewards/rejected": -8.83123067220052, "step": 1211 }, { "epoch": 0.3032653571875391, "grad_norm": 10.0, "kl": 15.987177848815918, "learning_rate": 5e-06, "logits/chosen": -86152557.71428572, "logits/rejected": -55593881.6, "logps/chosen": -579.7297712053571, "logps/rejected": -559.9326171875, "loss": 0.0304, "rewards/chosen": 9.981085641043526, "rewards/margins": 18.77410627092634, "rewards/rejected": -8.793020629882813, "step": 1212 }, { "epoch": 0.30351557612911295, "grad_norm": 8.5625, "kl": 0.21150970458984375, "learning_rate": 5e-06, "logits/chosen": -19375637.333333332, "logits/rejected": -34573491.2, "logps/chosen": -270.0732150607639, "logps/rejected": -572.109765625, "loss": 0.0528, "rewards/chosen": 6.671888139512804, "rewards/margins": 16.843167029486764, "rewards/rejected": -10.171278889973959, "step": 1213 }, { "epoch": 0.30376579507068685, "grad_norm": 26.25, "kl": 16.300756454467773, "learning_rate": 5e-06, "logits/chosen": -23889493.333333332, "logits/rejected": -58384629.333333336, "logps/chosen": -419.4922688802083, "logps/rejected": -507.2896321614583, "loss": 0.1428, "rewards/chosen": 7.208911895751953, "rewards/margins": 16.83298428853353, "rewards/rejected": -9.624072392781576, "step": 1214 }, { "epoch": 0.30401601401226075, "grad_norm": 18.5, "kl": 8.834426879882812, "learning_rate": 5e-06, "logits/chosen": -13152477.538461538, "logits/rejected": -12759269.818181818, "logps/chosen": -252.78532527043268, "logps/rejected": -529.7227450284091, "loss": 0.0963, "rewards/chosen": 5.433926508976863, "rewards/margins": 17.288959689907262, "rewards/rejected": -11.855033180930398, "step": 1215 }, { "epoch": 0.3042662329538346, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10205563.636363637, "logits/rejected": -54210569.84615385, "logps/chosen": -451.13188032670456, "logps/rejected": -535.5322641225962, "loss": 0.0081, "rewards/chosen": 7.300703568892046, "rewards/margins": 17.306140392810317, "rewards/rejected": -10.00543682391827, "step": 1216 }, { "epoch": 0.3045164518954085, "grad_norm": 16.875, "kl": 1.085489273071289, "learning_rate": 5e-06, "logits/chosen": -49543040.0, "logits/rejected": -44048296.72727273, "logps/chosen": -464.95361328125, "logps/rejected": -475.2381036931818, "loss": 0.018, "rewards/chosen": 8.724955045259916, "rewards/margins": 17.333094776927176, "rewards/rejected": -8.608139731667258, "step": 1217 }, { "epoch": 0.30476667083698233, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38865818.18181818, "logits/rejected": -57863758.76923077, "logps/chosen": -417.4181463068182, "logps/rejected": -690.7618689903846, "loss": 0.1122, "rewards/chosen": 5.963247819380327, "rewards/margins": 16.167141627598475, "rewards/rejected": -10.20389380821815, "step": 1218 }, { "epoch": 0.30501688977855623, "grad_norm": 6.53125, "kl": 0.8689308166503906, "learning_rate": 5e-06, "logits/chosen": -45907918.222222224, "logits/rejected": -29998685.866666667, "logps/chosen": -443.6657986111111, "logps/rejected": -493.95384114583334, "loss": 0.0222, "rewards/chosen": 8.326126098632812, "rewards/margins": 16.025186157226564, "rewards/rejected": -7.69906005859375, "step": 1219 }, { "epoch": 0.30526710872013013, "grad_norm": 22.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76317580.8, "logits/rejected": -40659858.28571428, "logps/chosen": -298.8114013671875, "logps/rejected": -468.56082589285717, "loss": 0.0571, "rewards/chosen": 5.860202407836914, "rewards/margins": 12.43331162588937, "rewards/rejected": -6.573109218052456, "step": 1220 }, { "epoch": 0.305517327661704, "grad_norm": 2.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36596786.28571428, "logits/rejected": -52418819.76470588, "logps/chosen": -482.5286342075893, "logps/rejected": -558.5536534926471, "loss": 0.0037, "rewards/chosen": 8.219329289027623, "rewards/margins": 19.81026987668847, "rewards/rejected": -11.590940587660846, "step": 1221 }, { "epoch": 0.3057675466032779, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57436570.666666664, "logits/rejected": -30109176.0, "logps/chosen": -336.4075113932292, "logps/rejected": -499.1826985677083, "loss": 0.0335, "rewards/chosen": 5.587828318277995, "rewards/margins": 13.55677096048991, "rewards/rejected": -7.968942642211914, "step": 1222 }, { "epoch": 0.30601776554485177, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31468336.0, "logits/rejected": -34517024.0, "logps/chosen": -333.1884521484375, "logps/rejected": -665.4081333705357, "loss": 0.0488, "rewards/chosen": 5.672737884521484, "rewards/margins": 16.932380349295478, "rewards/rejected": -11.259642464773995, "step": 1223 }, { "epoch": 0.3062679844864256, "grad_norm": 16.375, "kl": 14.873919486999512, "learning_rate": 5e-06, "logits/chosen": -65133782.85714286, "logits/rejected": -83509286.4, "logps/chosen": -487.99550083705356, "logps/rejected": -648.62421875, "loss": 0.0507, "rewards/chosen": 6.1974896022251675, "rewards/margins": 17.42685056413923, "rewards/rejected": -11.229360961914063, "step": 1224 }, { "epoch": 0.3065182034279995, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61670101.333333336, "logits/rejected": -48595835.733333334, "logps/chosen": -364.9957682291667, "logps/rejected": -590.0022135416667, "loss": 0.004, "rewards/chosen": 7.189697265625, "rewards/margins": 17.719127400716147, "rewards/rejected": -10.529430135091145, "step": 1225 }, { "epoch": 0.30676842236957336, "grad_norm": 7.5625, "kl": 11.055181503295898, "learning_rate": 5e-06, "logits/chosen": -16172059.0, "logits/rejected": -64466960.0, "logps/chosen": -669.9346313476562, "logps/rejected": -700.0983276367188, "loss": 0.0616, "rewards/chosen": 9.303380012512207, "rewards/margins": 23.678828239440918, "rewards/rejected": -14.375448226928711, "step": 1226 }, { "epoch": 0.30701864131114726, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14761928.727272727, "logits/rejected": -9460416.0, "logps/chosen": -414.7590997869318, "logps/rejected": -724.708984375, "loss": 0.0719, "rewards/chosen": 4.879152471368963, "rewards/margins": 22.25484029062978, "rewards/rejected": -17.37568781926082, "step": 1227 }, { "epoch": 0.30726886025272115, "grad_norm": 20.25, "kl": 6.07587194442749, "learning_rate": 5e-06, "logits/chosen": -34716292.571428575, "logits/rejected": -34900057.6, "logps/chosen": -449.8076171875, "logps/rejected": -492.19150390625, "loss": 0.0694, "rewards/chosen": 7.262049538748605, "rewards/margins": 14.568482644217355, "rewards/rejected": -7.30643310546875, "step": 1228 }, { "epoch": 0.307519079194295, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46084236.8, "logits/rejected": -53814720.0, "logps/chosen": -341.7791259765625, "logps/rejected": -642.7267020089286, "loss": 0.0381, "rewards/chosen": 4.39723014831543, "rewards/margins": 16.99350438799177, "rewards/rejected": -12.596274239676339, "step": 1229 }, { "epoch": 0.3077692981358689, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61462215.11111111, "logits/rejected": -17072117.333333332, "logps/chosen": -520.7877604166666, "logps/rejected": -565.2350260416666, "loss": 0.0088, "rewards/chosen": 6.520641326904297, "rewards/margins": 20.729000091552734, "rewards/rejected": -14.208358764648438, "step": 1230 }, { "epoch": 0.30801951707744274, "grad_norm": 11.8125, "kl": 3.8257193565368652, "learning_rate": 5e-06, "logits/chosen": -25244777.14285714, "logits/rejected": -35328236.8, "logps/chosen": -392.2129603794643, "logps/rejected": -542.0587890625, "loss": 0.0741, "rewards/chosen": 5.690225873674665, "rewards/margins": 19.81968754359654, "rewards/rejected": -14.129461669921875, "step": 1231 }, { "epoch": 0.30826973601901664, "grad_norm": 12.5625, "kl": 0.28105735778808594, "learning_rate": 5e-06, "logits/chosen": -53168320.0, "logits/rejected": -48682928.0, "logps/chosen": -321.15576171875, "logps/rejected": -432.2138264973958, "loss": 0.0512, "rewards/chosen": 4.481417338053386, "rewards/margins": 14.419905980428059, "rewards/rejected": -9.938488642374674, "step": 1232 }, { "epoch": 0.30851995496059054, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63650205.538461536, "logits/rejected": -12155309.090909092, "logps/chosen": -292.4852764423077, "logps/rejected": -369.77587890625, "loss": 0.0811, "rewards/chosen": 4.245010962853065, "rewards/margins": 13.588428737400296, "rewards/rejected": -9.34341777454723, "step": 1233 }, { "epoch": 0.3087701739021644, "grad_norm": 14.25, "kl": 10.198458671569824, "learning_rate": 5e-06, "logits/chosen": -80799793.23076923, "logits/rejected": -71043642.18181819, "logps/chosen": -403.5478515625, "logps/rejected": -562.3085049715909, "loss": 0.1501, "rewards/chosen": 5.712518545297476, "rewards/margins": 19.58325440733583, "rewards/rejected": -13.870735862038352, "step": 1234 }, { "epoch": 0.3090203928437383, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30094779.42857143, "logits/rejected": -30140227.2, "logps/chosen": -376.94144112723217, "logps/rejected": -618.9744140625, "loss": 0.0606, "rewards/chosen": 6.012803213936942, "rewards/margins": 18.650406973702566, "rewards/rejected": -12.637603759765625, "step": 1235 }, { "epoch": 0.3092706117853121, "grad_norm": 14.75, "kl": 1.3741252422332764, "learning_rate": 5e-06, "logits/chosen": -22974729.14285714, "logits/rejected": -47919587.2, "logps/chosen": -455.51576450892856, "logps/rejected": -509.818896484375, "loss": 0.0753, "rewards/chosen": 5.769430433000837, "rewards/margins": 17.118478284563338, "rewards/rejected": -11.3490478515625, "step": 1236 }, { "epoch": 0.309520830726886, "grad_norm": 2.5, "kl": 1.9214465618133545, "learning_rate": 5e-06, "logits/chosen": -50238396.44444445, "logits/rejected": -22097812.0, "logps/chosen": -313.0553385416667, "logps/rejected": -452.2313639322917, "loss": 0.0106, "rewards/chosen": 6.583102332221137, "rewards/margins": 16.858679241604275, "rewards/rejected": -10.275576909383139, "step": 1237 }, { "epoch": 0.3097710496684599, "grad_norm": 7.375, "kl": 8.763050079345703, "learning_rate": 5e-06, "logits/chosen": -67746020.57142857, "logits/rejected": -79124198.4, "logps/chosen": -539.0365862165179, "logps/rejected": -750.8107421875, "loss": 0.0644, "rewards/chosen": 8.34955324445452, "rewards/margins": 25.343193381173272, "rewards/rejected": -16.99364013671875, "step": 1238 }, { "epoch": 0.31002126861003376, "grad_norm": 7.28125, "kl": 7.915022850036621, "learning_rate": 5e-06, "logits/chosen": -56121408.0, "logits/rejected": -33696928.0, "logps/chosen": -342.4960611979167, "logps/rejected": -433.22862413194446, "loss": 0.089, "rewards/chosen": 7.548491414388021, "rewards/margins": 18.4423826429579, "rewards/rejected": -10.893891228569878, "step": 1239 }, { "epoch": 0.31027148755160766, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47854330.666666664, "logits/rejected": -79582746.66666667, "logps/chosen": -293.01316324869794, "logps/rejected": -660.5765787760416, "loss": 0.0559, "rewards/chosen": 4.899873733520508, "rewards/margins": 17.41712506612142, "rewards/rejected": -12.517251332600912, "step": 1240 }, { "epoch": 0.31052170649318156, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48317344.0, "logits/rejected": -19875664.0, "logps/chosen": -366.90098353794644, "logps/rejected": -399.4595703125, "loss": 0.0534, "rewards/chosen": 5.985648018973214, "rewards/margins": 15.232921273367747, "rewards/rejected": -9.247273254394532, "step": 1241 }, { "epoch": 0.3107719254347554, "grad_norm": 12.6875, "kl": 6.448036193847656, "learning_rate": 5e-06, "logits/chosen": -50070218.666666664, "logits/rejected": -14155168.0, "logps/chosen": -435.9420166015625, "logps/rejected": -522.2616373697916, "loss": 0.014, "rewards/chosen": 7.722878774007161, "rewards/margins": 17.908846537272137, "rewards/rejected": -10.185967763264975, "step": 1242 }, { "epoch": 0.3110221443763293, "grad_norm": 23.25, "kl": 10.553285598754883, "learning_rate": 5e-06, "logits/chosen": -58239488.0, "logits/rejected": -49623240.0, "logps/chosen": -314.5216979980469, "logps/rejected": -742.7978515625, "loss": 0.1889, "rewards/chosen": 5.35376501083374, "rewards/margins": 18.42606782913208, "rewards/rejected": -13.07230281829834, "step": 1243 }, { "epoch": 0.31127236331790314, "grad_norm": 13.5625, "kl": 18.329917907714844, "learning_rate": 5e-06, "logits/chosen": -69109981.86666666, "logits/rejected": -7965475.555555556, "logps/chosen": -481.5512369791667, "logps/rejected": -644.6374240451389, "loss": 0.1081, "rewards/chosen": 8.319614156087239, "rewards/margins": 22.451639133029513, "rewards/rejected": -14.132024976942274, "step": 1244 }, { "epoch": 0.31152258225947704, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41265280.0, "logits/rejected": -51238592.0, "logps/chosen": -421.37571022727275, "logps/rejected": -518.7028245192307, "loss": 0.0363, "rewards/chosen": 6.618546919389204, "rewards/margins": 17.33440244447935, "rewards/rejected": -10.715855525090145, "step": 1245 }, { "epoch": 0.31177280120105094, "grad_norm": 6.5625, "kl": 2.755945920944214, "learning_rate": 5e-06, "logits/chosen": -28064611.2, "logits/rejected": -56540539.428571425, "logps/chosen": -322.7212646484375, "logps/rejected": -618.4041573660714, "loss": 0.0591, "rewards/chosen": 6.259741973876953, "rewards/margins": 17.569922637939452, "rewards/rejected": -11.3101806640625, "step": 1246 }, { "epoch": 0.3120230201426248, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48707445.333333336, "logits/rejected": -15703682.666666666, "logps/chosen": -368.1058756510417, "logps/rejected": -552.9111735026041, "loss": 0.0501, "rewards/chosen": 5.385286966959636, "rewards/margins": 15.32118542989095, "rewards/rejected": -9.935898462931315, "step": 1247 }, { "epoch": 0.3122732390841987, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69974586.18181819, "logits/rejected": -38543259.07692308, "logps/chosen": -531.0395063920455, "logps/rejected": -658.4560546875, "loss": 0.0169, "rewards/chosen": 7.813090931285512, "rewards/margins": 17.791294337986233, "rewards/rejected": -9.978203406700722, "step": 1248 }, { "epoch": 0.3125234580257725, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57584999.11111111, "logits/rejected": -26344529.066666666, "logps/chosen": -350.635986328125, "logps/rejected": -667.6969401041666, "loss": 0.0268, "rewards/chosen": 5.984920077853733, "rewards/margins": 14.629159376356338, "rewards/rejected": -8.644239298502605, "step": 1249 }, { "epoch": 0.3127736769673464, "grad_norm": 21.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29123074.0, "logits/rejected": -55702496.0, "logps/chosen": -311.8891906738281, "logps/rejected": -502.1377258300781, "loss": 0.0634, "rewards/chosen": 6.006384372711182, "rewards/margins": 14.619926929473877, "rewards/rejected": -8.613542556762695, "step": 1250 }, { "epoch": 0.3130238959089203, "grad_norm": 1.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73254158.22222222, "logits/rejected": -55719364.266666666, "logps/chosen": -310.37744140625, "logps/rejected": -609.6881510416666, "loss": 0.0123, "rewards/chosen": 8.115142822265625, "rewards/margins": 21.34777018229167, "rewards/rejected": -13.232627360026042, "step": 1251 }, { "epoch": 0.31327411485049417, "grad_norm": 4.25, "kl": 0.9545091390609741, "learning_rate": 5e-06, "logits/chosen": -55141664.0, "logits/rejected": -63513642.666666664, "logps/chosen": -418.5477701822917, "logps/rejected": -597.7539876302084, "loss": 0.0221, "rewards/chosen": 8.29672114054362, "rewards/margins": 18.434499104817707, "rewards/rejected": -10.137777964274088, "step": 1252 }, { "epoch": 0.31352433379206807, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75923128.0, "logits/rejected": -44531580.0, "logps/chosen": -478.70660400390625, "logps/rejected": -534.2015380859375, "loss": 0.05, "rewards/chosen": 7.728018760681152, "rewards/margins": 15.276987552642822, "rewards/rejected": -7.54896879196167, "step": 1253 }, { "epoch": 0.3137745527336419, "grad_norm": 3.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83031749.33333333, "logits/rejected": -39417584.0, "logps/chosen": -331.9674886067708, "logps/rejected": -506.0525716145833, "loss": 0.0087, "rewards/chosen": 7.54632568359375, "rewards/margins": 17.604111353556313, "rewards/rejected": -10.057785669962565, "step": 1254 }, { "epoch": 0.3140247716752158, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74658396.44444445, "logits/rejected": -70124074.66666667, "logps/chosen": -329.97816297743054, "logps/rejected": -542.4194661458333, "loss": 0.0386, "rewards/chosen": 6.777015262179905, "rewards/margins": 16.719556766086157, "rewards/rejected": -9.94254150390625, "step": 1255 }, { "epoch": 0.3142749906167897, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -127000969.84615384, "logits/rejected": -55268328.72727273, "logps/chosen": -355.56088491586536, "logps/rejected": -578.9248046875, "loss": 0.0751, "rewards/chosen": 5.809100811298077, "rewards/margins": 13.339487302553405, "rewards/rejected": -7.530386491255327, "step": 1256 }, { "epoch": 0.31452520955836355, "grad_norm": 6.53125, "kl": 1.0577256679534912, "learning_rate": 5e-06, "logits/chosen": -43596028.44444445, "logits/rejected": -39182749.86666667, "logps/chosen": -442.22157118055554, "logps/rejected": -665.2127604166667, "loss": 0.05, "rewards/chosen": 7.25166490342882, "rewards/margins": 18.627539740668404, "rewards/rejected": -11.375874837239584, "step": 1257 }, { "epoch": 0.31477542849993745, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37669309.538461536, "logits/rejected": -59644497.45454545, "logps/chosen": -438.7571364182692, "logps/rejected": -543.0248135653409, "loss": 0.0566, "rewards/chosen": 7.409668555626502, "rewards/margins": 20.050626474660593, "rewards/rejected": -12.640957919034092, "step": 1258 }, { "epoch": 0.31502564744151135, "grad_norm": 15.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56054498.90909091, "logits/rejected": -54907465.84615385, "logps/chosen": -473.20210404829544, "logps/rejected": -503.11144080528845, "loss": 0.0287, "rewards/chosen": 6.974796641956676, "rewards/margins": 14.99104020979021, "rewards/rejected": -8.016243567833534, "step": 1259 }, { "epoch": 0.3152758663830852, "grad_norm": 20.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31206835.2, "logits/rejected": -49201581.71428572, "logps/chosen": -395.8020263671875, "logps/rejected": -494.59329659598217, "loss": 0.0588, "rewards/chosen": 5.384991836547852, "rewards/margins": 15.526380865914483, "rewards/rejected": -10.14138902936663, "step": 1260 }, { "epoch": 0.3155260853246591, "grad_norm": 8.375, "kl": 8.176740646362305, "learning_rate": 5e-06, "logits/chosen": -43392391.11111111, "logits/rejected": -82738480.0, "logps/chosen": -405.25946723090277, "logps/rejected": -850.5380859375, "loss": 0.0262, "rewards/chosen": 7.222204420301649, "rewards/margins": 25.75845294528537, "rewards/rejected": -18.536248524983723, "step": 1261 }, { "epoch": 0.31577630426623293, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32081225.846153848, "logits/rejected": -53937361.45454545, "logps/chosen": -298.4744215745192, "logps/rejected": -641.1859463778409, "loss": 0.0665, "rewards/chosen": 6.669564467210036, "rewards/margins": 18.06667583972424, "rewards/rejected": -11.397111372514205, "step": 1262 }, { "epoch": 0.31602652320780683, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46724680.0, "logits/rejected": -64328520.0, "logps/chosen": -299.5916748046875, "logps/rejected": -706.776123046875, "loss": 0.0399, "rewards/chosen": 6.258452892303467, "rewards/margins": 18.273942470550537, "rewards/rejected": -12.01548957824707, "step": 1263 }, { "epoch": 0.31627674214938073, "grad_norm": 23.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 28210421.333333332, "logits/rejected": -42209237.333333336, "logps/chosen": -392.4527180989583, "logps/rejected": -541.4264322916666, "loss": 0.1138, "rewards/chosen": 5.391928354899089, "rewards/margins": 15.481610616048176, "rewards/rejected": -10.089682261149088, "step": 1264 }, { "epoch": 0.3165269610909546, "grad_norm": 7.0, "kl": 10.140898704528809, "learning_rate": 5e-06, "logits/chosen": -18645608.727272727, "logits/rejected": -52932824.615384616, "logps/chosen": -387.9818004261364, "logps/rejected": -547.0643780048077, "loss": 0.0348, "rewards/chosen": 7.51687275279652, "rewards/margins": 17.709093587381858, "rewards/rejected": -10.192220834585337, "step": 1265 }, { "epoch": 0.31677718003252847, "grad_norm": 16.375, "kl": 4.632102966308594, "learning_rate": 5e-06, "logits/chosen": -12108572.307692308, "logits/rejected": -49853498.18181818, "logps/chosen": -359.57898888221155, "logps/rejected": -776.3927556818181, "loss": 0.0298, "rewards/chosen": 7.409845205453726, "rewards/margins": 22.61639094852901, "rewards/rejected": -15.206545743075283, "step": 1266 }, { "epoch": 0.3170273989741023, "grad_norm": 6.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40591907.2, "logits/rejected": -33658761.14285714, "logps/chosen": -386.29423828125, "logps/rejected": -718.2762974330357, "loss": 0.0177, "rewards/chosen": 7.0229850769042965, "rewards/margins": 21.472626713344027, "rewards/rejected": -14.449641636439733, "step": 1267 }, { "epoch": 0.3172776179156762, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45506995.2, "logits/rejected": -66656490.666666664, "logps/chosen": -377.16149088541664, "logps/rejected": -528.9047309027778, "loss": 0.0247, "rewards/chosen": 7.000936889648438, "rewards/margins": 20.10698716905382, "rewards/rejected": -13.106050279405382, "step": 1268 }, { "epoch": 0.3175278368572501, "grad_norm": 5.3125, "kl": 0.21482086181640625, "learning_rate": 5e-06, "logits/chosen": -43414668.8, "logits/rejected": -57985998.222222224, "logps/chosen": -408.27057291666665, "logps/rejected": -523.1936848958334, "loss": 0.0395, "rewards/chosen": 6.874470011393229, "rewards/margins": 16.336249287923177, "rewards/rejected": -9.461779276529947, "step": 1269 }, { "epoch": 0.31777805579882396, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45678677.333333336, "logits/rejected": -27435040.0, "logps/chosen": -421.4379475911458, "logps/rejected": -634.3431803385416, "loss": 0.0826, "rewards/chosen": 5.739796956380208, "rewards/margins": 16.778418223063152, "rewards/rejected": -11.038621266682943, "step": 1270 }, { "epoch": 0.31802827474039785, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45876445.09090909, "logits/rejected": -57674259.692307696, "logps/chosen": -382.40793678977275, "logps/rejected": -755.0516826923077, "loss": 0.0246, "rewards/chosen": 7.015948208895597, "rewards/margins": 22.097244689514586, "rewards/rejected": -15.08129648061899, "step": 1271 }, { "epoch": 0.31827849368197175, "grad_norm": 10.25, "kl": 9.881219863891602, "learning_rate": 5e-06, "logits/chosen": -63959099.733333334, "logits/rejected": -39135047.11111111, "logps/chosen": -504.5817057291667, "logps/rejected": -818.7438151041666, "loss": 0.0528, "rewards/chosen": 7.784451293945312, "rewards/margins": 21.66170111762153, "rewards/rejected": -13.877249823676216, "step": 1272 }, { "epoch": 0.3185287126235456, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48841749.333333336, "logits/rejected": -39677824.0, "logps/chosen": -304.23032633463544, "logps/rejected": -530.8593207465278, "loss": 0.0321, "rewards/chosen": 7.149608612060547, "rewards/margins": 19.449505700005425, "rewards/rejected": -12.299897087944878, "step": 1273 }, { "epoch": 0.3187789315651195, "grad_norm": 14.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52670457.6, "logits/rejected": -44969750.85714286, "logps/chosen": -371.2403564453125, "logps/rejected": -566.4298967633929, "loss": 0.0482, "rewards/chosen": 5.227872085571289, "rewards/margins": 17.715610558646066, "rewards/rejected": -12.487738473074776, "step": 1274 }, { "epoch": 0.31902915050669334, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29936220.444444444, "logits/rejected": -52257467.733333334, "logps/chosen": -255.77734375, "logps/rejected": -552.2119791666667, "loss": 0.05, "rewards/chosen": 4.643010033501519, "rewards/margins": 15.452336205376518, "rewards/rejected": -10.809326171875, "step": 1275 }, { "epoch": 0.31927936944826724, "grad_norm": 8.375, "kl": 1.7003517150878906, "learning_rate": 5e-06, "logits/chosen": -40648089.6, "logits/rejected": -78199409.77777778, "logps/chosen": -368.59485677083336, "logps/rejected": -622.3368598090278, "loss": 0.0234, "rewards/chosen": 6.860987854003906, "rewards/margins": 15.721263122558593, "rewards/rejected": -8.860275268554688, "step": 1276 }, { "epoch": 0.31952958838984113, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63605989.333333336, "logits/rejected": -36670560.0, "logps/chosen": -379.5790201822917, "logps/rejected": -568.8715006510416, "loss": 0.043, "rewards/chosen": 5.063301722208659, "rewards/margins": 18.589928309122723, "rewards/rejected": -13.526626586914062, "step": 1277 }, { "epoch": 0.319779807331415, "grad_norm": 6.90625, "kl": 0.7730096578598022, "learning_rate": 5e-06, "logits/chosen": -43640310.85714286, "logits/rejected": -55026489.6, "logps/chosen": -311.25924246651783, "logps/rejected": -493.325732421875, "loss": 0.0365, "rewards/chosen": 5.7841322762625555, "rewards/margins": 17.032314954485212, "rewards/rejected": -11.248182678222657, "step": 1278 }, { "epoch": 0.3200300262729889, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39818297.6, "logits/rejected": -48446802.28571428, "logps/chosen": -313.08740234375, "logps/rejected": -579.4892578125, "loss": 0.026, "rewards/chosen": 5.144179916381836, "rewards/margins": 17.348122351510185, "rewards/rejected": -12.203942435128349, "step": 1279 }, { "epoch": 0.3202802452145627, "grad_norm": 6.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48784283.428571425, "logits/rejected": -44368300.8, "logps/chosen": -504.41245814732144, "logps/rejected": -582.00107421875, "loss": 0.034, "rewards/chosen": 8.076739719935826, "rewards/margins": 22.640622166224887, "rewards/rejected": -14.563882446289062, "step": 1280 }, { "epoch": 0.3205304641561366, "grad_norm": 9.1875, "kl": 2.7007224559783936, "learning_rate": 5e-06, "logits/chosen": -105592797.0909091, "logits/rejected": -35596194.461538464, "logps/chosen": -453.2277166193182, "logps/rejected": -424.7365910456731, "loss": 0.0295, "rewards/chosen": 5.819885947487571, "rewards/margins": 17.609108077896224, "rewards/rejected": -11.789222130408653, "step": 1281 }, { "epoch": 0.3207806830977105, "grad_norm": 12.5625, "kl": 7.347240447998047, "learning_rate": 5e-06, "logits/chosen": -43336922.35294118, "logits/rejected": -40496996.571428575, "logps/chosen": -371.56399356617646, "logps/rejected": -655.4492885044643, "loss": 0.0357, "rewards/chosen": 7.157273685230928, "rewards/margins": 22.04531885996586, "rewards/rejected": -14.888045174734932, "step": 1282 }, { "epoch": 0.32103090203928436, "grad_norm": 24.0, "kl": 0.1550954282283783, "learning_rate": 5e-06, "logits/chosen": -31235501.333333332, "logits/rejected": -48669770.666666664, "logps/chosen": -353.2637939453125, "logps/rejected": -601.1693522135416, "loss": 0.065, "rewards/chosen": 5.363969802856445, "rewards/margins": 20.086745580037437, "rewards/rejected": -14.72277577718099, "step": 1283 }, { "epoch": 0.32128112098085826, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63968576.0, "logits/rejected": -36907298.461538464, "logps/chosen": -352.1075550426136, "logps/rejected": -633.4033203125, "loss": 0.0275, "rewards/chosen": 6.64397569136186, "rewards/margins": 21.92105065192376, "rewards/rejected": -15.2770749605619, "step": 1284 }, { "epoch": 0.3215313399224321, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49737393.777777776, "logits/rejected": -63062912.0, "logps/chosen": -274.05799696180554, "logps/rejected": -688.4330729166667, "loss": 0.0871, "rewards/chosen": 3.257261488172743, "rewards/margins": 16.115530734592014, "rewards/rejected": -12.85826924641927, "step": 1285 }, { "epoch": 0.321781558864006, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26646731.42857143, "logits/rejected": -47209379.2, "logps/chosen": -382.17529296875, "logps/rejected": -605.72880859375, "loss": 0.0555, "rewards/chosen": 4.946352277483259, "rewards/margins": 20.262341962541853, "rewards/rejected": -15.315989685058593, "step": 1286 }, { "epoch": 0.3220317778055799, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60115338.666666664, "logits/rejected": -55130805.333333336, "logps/chosen": -303.1997884114583, "logps/rejected": -608.5017496744791, "loss": 0.0459, "rewards/chosen": 5.368688583374023, "rewards/margins": 18.260388056437172, "rewards/rejected": -12.89169947306315, "step": 1287 }, { "epoch": 0.32228199674715374, "grad_norm": 18.625, "kl": 9.332144737243652, "learning_rate": 5e-06, "logits/chosen": -64099438.54545455, "logits/rejected": -63784659.692307696, "logps/chosen": -419.41787997159093, "logps/rejected": -504.9330303485577, "loss": 0.0656, "rewards/chosen": 7.781129316850142, "rewards/margins": 19.046220259232953, "rewards/rejected": -11.265090942382812, "step": 1288 }, { "epoch": 0.32253221568872764, "grad_norm": 7.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59888535.27272727, "logits/rejected": -46535222.15384615, "logps/chosen": -487.5755504261364, "logps/rejected": -631.5006760817307, "loss": 0.0171, "rewards/chosen": 7.721756675026634, "rewards/margins": 19.121367928031443, "rewards/rejected": -11.399611253004808, "step": 1289 }, { "epoch": 0.32278243463030154, "grad_norm": 11.375, "kl": 4.022454738616943, "learning_rate": 5e-06, "logits/chosen": -48174052.571428575, "logits/rejected": -36629676.8, "logps/chosen": -458.96323939732144, "logps/rejected": -695.745556640625, "loss": 0.0626, "rewards/chosen": 6.650062561035156, "rewards/margins": 23.885919189453126, "rewards/rejected": -17.23585662841797, "step": 1290 }, { "epoch": 0.3230326535718754, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33614868.36363637, "logits/rejected": -31564366.769230768, "logps/chosen": -394.28444602272725, "logps/rejected": -528.0996844951923, "loss": 0.0376, "rewards/chosen": 5.247021068226207, "rewards/margins": 15.810591530966592, "rewards/rejected": -10.563570462740385, "step": 1291 }, { "epoch": 0.3232828725134493, "grad_norm": 1.7734375, "kl": 6.965981483459473, "learning_rate": 5e-06, "logits/chosen": -74536950.85714285, "logits/rejected": -57493292.8, "logps/chosen": -439.77640206473217, "logps/rejected": -529.878515625, "loss": 0.0823, "rewards/chosen": 8.393458775111608, "rewards/margins": 20.22898428780692, "rewards/rejected": -11.835525512695312, "step": 1292 }, { "epoch": 0.3235330914550231, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56255319.27272727, "logits/rejected": -63900489.84615385, "logps/chosen": -322.28677645596593, "logps/rejected": -634.2214543269231, "loss": 0.0572, "rewards/chosen": 7.038822520862926, "rewards/margins": 17.64155994762074, "rewards/rejected": -10.602737426757812, "step": 1293 }, { "epoch": 0.323783310396597, "grad_norm": 14.625, "kl": 2.604123830795288, "learning_rate": 5e-06, "logits/chosen": -26121428.0, "logits/rejected": -64316448.0, "logps/chosen": -312.4049377441406, "logps/rejected": -732.2247314453125, "loss": 0.0732, "rewards/chosen": 5.839616775512695, "rewards/margins": 21.819026947021484, "rewards/rejected": -15.979410171508789, "step": 1294 }, { "epoch": 0.3240335293381709, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49314279.384615384, "logits/rejected": -30681736.727272727, "logps/chosen": -398.8065655048077, "logps/rejected": -743.0536665482955, "loss": 0.0368, "rewards/chosen": 6.979983990009014, "rewards/margins": 21.606319827633303, "rewards/rejected": -14.62633583762429, "step": 1295 }, { "epoch": 0.32428374827974477, "grad_norm": 5.90625, "kl": 10.46474838256836, "learning_rate": 5e-06, "logits/chosen": -33184354.285714287, "logits/rejected": 21539731.2, "logps/chosen": -512.68115234375, "logps/rejected": -608.69970703125, "loss": 0.0104, "rewards/chosen": 7.437153407505581, "rewards/margins": 18.65278069632394, "rewards/rejected": -11.215627288818359, "step": 1296 }, { "epoch": 0.32453396722131866, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55096283.428571425, "logits/rejected": -38106669.176470585, "logps/chosen": -387.54652622767856, "logps/rejected": -511.9735753676471, "loss": 0.0311, "rewards/chosen": 7.0351137433733255, "rewards/margins": 18.50823673280347, "rewards/rejected": -11.473122989430147, "step": 1297 }, { "epoch": 0.3247841861628925, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69361152.0, "logits/rejected": -27781085.53846154, "logps/chosen": -484.75284090909093, "logps/rejected": -587.9753605769231, "loss": 0.023, "rewards/chosen": 7.069455233487216, "rewards/margins": 19.2225278841032, "rewards/rejected": -12.153072650615986, "step": 1298 }, { "epoch": 0.3250344051044664, "grad_norm": 13.0, "kl": 4.947166442871094, "learning_rate": 5e-06, "logits/chosen": -48727114.666666664, "logits/rejected": -29651333.333333332, "logps/chosen": -423.2584635416667, "logps/rejected": -330.3903401692708, "loss": 0.0626, "rewards/chosen": 8.027955373128256, "rewards/margins": 15.964884440104168, "rewards/rejected": -7.936929066975911, "step": 1299 }, { "epoch": 0.3252846240460403, "grad_norm": 12.125, "kl": 2.5877013206481934, "learning_rate": 5e-06, "logits/chosen": -54991104.0, "logits/rejected": 3685610.6666666665, "logps/chosen": -431.54120551215277, "logps/rejected": -522.689453125, "loss": 0.0348, "rewards/chosen": 8.492072211371529, "rewards/margins": 20.74616156684028, "rewards/rejected": -12.25408935546875, "step": 1300 }, { "epoch": 0.32553484298761415, "grad_norm": 3.390625, "kl": 4.459981918334961, "learning_rate": 5e-06, "logits/chosen": -58122426.18181818, "logits/rejected": -51050353.23076923, "logps/chosen": -410.40016867897725, "logps/rejected": -652.0196063701923, "loss": 0.021, "rewards/chosen": 8.1951904296875, "rewards/margins": 22.66502615121695, "rewards/rejected": -14.469835721529448, "step": 1301 }, { "epoch": 0.32578506192918805, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57415301.333333336, "logits/rejected": -27322098.666666668, "logps/chosen": -507.4646809895833, "logps/rejected": -532.3552652994791, "loss": 0.0354, "rewards/chosen": 8.16363271077474, "rewards/margins": 18.853150685628258, "rewards/rejected": -10.689517974853516, "step": 1302 }, { "epoch": 0.3260352808707619, "grad_norm": 7.5, "kl": 2.8420791625976562, "learning_rate": 5e-06, "logits/chosen": -45678602.666666664, "logits/rejected": 47868314.666666664, "logps/chosen": -484.0874837239583, "logps/rejected": -394.8536376953125, "loss": 0.04, "rewards/chosen": 7.418270746866862, "rewards/margins": 16.138280232747395, "rewards/rejected": -8.720009485880533, "step": 1303 }, { "epoch": 0.3262854998123358, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37260436.571428575, "logits/rejected": -23889561.6, "logps/chosen": -387.35707310267856, "logps/rejected": -335.847265625, "loss": 0.0206, "rewards/chosen": 6.58929933820452, "rewards/margins": 15.41866182599749, "rewards/rejected": -8.829362487792968, "step": 1304 }, { "epoch": 0.3265357187539097, "grad_norm": 14.8125, "kl": 0.38400477170944214, "learning_rate": 5e-06, "logits/chosen": -54965115.07692308, "logits/rejected": -62456593.45454545, "logps/chosen": -400.7051532451923, "logps/rejected": -552.0832741477273, "loss": 0.0908, "rewards/chosen": 5.856508108285757, "rewards/margins": 16.283283953900103, "rewards/rejected": -10.426775845614346, "step": 1305 }, { "epoch": 0.32678593769548353, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49025269.333333336, "logits/rejected": -27415837.866666667, "logps/chosen": -413.7991536458333, "logps/rejected": -386.639453125, "loss": 0.0926, "rewards/chosen": 4.815151214599609, "rewards/margins": 12.762701161702473, "rewards/rejected": -7.947549947102865, "step": 1306 }, { "epoch": 0.32703615663705743, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46450272.0, "logits/rejected": -16916649.333333332, "logps/chosen": -543.2366129557291, "logps/rejected": -612.9325764973959, "loss": 0.0147, "rewards/chosen": 8.421676635742188, "rewards/margins": 22.21736399332682, "rewards/rejected": -13.795687357584635, "step": 1307 }, { "epoch": 0.32728637557863133, "grad_norm": 9.125, "kl": 2.1481730937957764, "learning_rate": 5e-06, "logits/chosen": -51507024.0, "logits/rejected": -69488650.66666667, "logps/chosen": -470.5382893880208, "logps/rejected": -731.5758463541666, "loss": 0.0317, "rewards/chosen": 8.839162190755209, "rewards/margins": 21.126419067382812, "rewards/rejected": -12.287256876627604, "step": 1308 }, { "epoch": 0.32753659452020517, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75509195.63636364, "logits/rejected": -28084519.384615384, "logps/chosen": -289.5700017755682, "logps/rejected": -491.72716346153845, "loss": 0.0505, "rewards/chosen": 4.795547832142223, "rewards/margins": 16.448959697376598, "rewards/rejected": -11.653411865234375, "step": 1309 }, { "epoch": 0.32778681346177907, "grad_norm": 11.25, "kl": 6.043050289154053, "learning_rate": 5e-06, "logits/chosen": -54099234.90909091, "logits/rejected": -108971204.92307693, "logps/chosen": -475.1676136363636, "logps/rejected": -531.2613807091346, "loss": 0.0373, "rewards/chosen": 8.551368019797586, "rewards/margins": 22.20765323238773, "rewards/rejected": -13.656285212590145, "step": 1310 }, { "epoch": 0.3280370324033529, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56028224.0, "logits/rejected": -43560963.2, "logps/chosen": -456.30496651785717, "logps/rejected": -610.762255859375, "loss": 0.0292, "rewards/chosen": 5.967198508126395, "rewards/margins": 19.150186484200614, "rewards/rejected": -13.18298797607422, "step": 1311 }, { "epoch": 0.3282872513449268, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -102815469.71428572, "logits/rejected": -61270885.64705882, "logps/chosen": -479.75205775669644, "logps/rejected": -652.5213694852941, "loss": 0.0417, "rewards/chosen": 8.19467544555664, "rewards/margins": 21.028763939352597, "rewards/rejected": -12.834088493795957, "step": 1312 }, { "epoch": 0.3285374702865007, "grad_norm": 5.59375, "kl": 2.733454942703247, "learning_rate": 5e-06, "logits/chosen": -37395322.18181818, "logits/rejected": -912665.8461538461, "logps/chosen": -374.05055930397725, "logps/rejected": -523.19482421875, "loss": 0.0427, "rewards/chosen": 8.183658253062855, "rewards/margins": 16.726570076042123, "rewards/rejected": -8.542911822979267, "step": 1313 }, { "epoch": 0.32878768922807455, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31023730.666666668, "logits/rejected": -52924389.333333336, "logps/chosen": -355.6796061197917, "logps/rejected": -574.7836507161459, "loss": 0.0706, "rewards/chosen": 6.032105763753255, "rewards/margins": 16.904056549072266, "rewards/rejected": -10.87195078531901, "step": 1314 }, { "epoch": 0.32903790816964845, "grad_norm": 12.9375, "kl": 14.697174072265625, "learning_rate": 5e-06, "logits/chosen": -48039571.692307696, "logits/rejected": -69240017.45454545, "logps/chosen": -455.8532151442308, "logps/rejected": -730.8930220170455, "loss": 0.0803, "rewards/chosen": 8.880028357872597, "rewards/margins": 23.880536059399585, "rewards/rejected": -15.000507701526988, "step": 1315 }, { "epoch": 0.3292881271112223, "grad_norm": 3.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57237326.76923077, "logits/rejected": -43537780.36363637, "logps/chosen": -445.71814903846155, "logps/rejected": -462.0006214488636, "loss": 0.0184, "rewards/chosen": 7.54583505483774, "rewards/margins": 16.45185201151388, "rewards/rejected": -8.906016956676137, "step": 1316 }, { "epoch": 0.3295383460527962, "grad_norm": 7.875, "kl": 3.6453094482421875, "learning_rate": 5e-06, "logits/chosen": -84937830.4, "logits/rejected": -63595181.71428572, "logps/chosen": -481.329296875, "logps/rejected": -652.0350167410714, "loss": 0.043, "rewards/chosen": 6.581423950195313, "rewards/margins": 18.190345546177454, "rewards/rejected": -11.608921595982142, "step": 1317 }, { "epoch": 0.3297885649943701, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71455808.0, "logits/rejected": -60674279.384615384, "logps/chosen": -270.35311612215907, "logps/rejected": -752.7180739182693, "loss": 0.0303, "rewards/chosen": 3.9281075217507104, "rewards/margins": 18.5446823226822, "rewards/rejected": -14.61657480093149, "step": 1318 }, { "epoch": 0.33003878393594394, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46047648.0, "logits/rejected": -70868557.71428572, "logps/chosen": -357.9293701171875, "logps/rejected": -740.3204520089286, "loss": 0.0259, "rewards/chosen": 5.3519329071044925, "rewards/margins": 22.76464173453195, "rewards/rejected": -17.412708827427455, "step": 1319 }, { "epoch": 0.33028900287751783, "grad_norm": 3.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66989794.461538464, "logits/rejected": -56064698.18181818, "logps/chosen": -458.98985877403845, "logps/rejected": -545.8565784801136, "loss": 0.0072, "rewards/chosen": 8.09774428147536, "rewards/margins": 17.084722985754482, "rewards/rejected": -8.98697870427912, "step": 1320 }, { "epoch": 0.3305392218190917, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69612960.0, "logits/rejected": -51901398.4, "logps/chosen": -402.77901785714283, "logps/rejected": -564.616845703125, "loss": 0.0586, "rewards/chosen": 5.751282828194754, "rewards/margins": 19.252323477608815, "rewards/rejected": -13.501040649414062, "step": 1321 }, { "epoch": 0.3307894407606656, "grad_norm": 14.5625, "kl": 12.04847240447998, "learning_rate": 5e-06, "logits/chosen": -34522457.6, "logits/rejected": -39687120.0, "logps/chosen": -476.9611328125, "logps/rejected": -897.1007080078125, "loss": 0.0823, "rewards/chosen": 8.806517791748046, "rewards/margins": 26.31516761779785, "rewards/rejected": -17.508649826049805, "step": 1322 }, { "epoch": 0.3310396597022395, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70662486.85714285, "logits/rejected": -63623667.2, "logps/chosen": -357.3809291294643, "logps/rejected": -658.059326171875, "loss": 0.0353, "rewards/chosen": 5.4811875479561945, "rewards/margins": 20.022956957135882, "rewards/rejected": -14.541769409179688, "step": 1323 }, { "epoch": 0.3312898786438133, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48801028.571428575, "logits/rejected": -69028928.0, "logps/chosen": -430.1524135044643, "logps/rejected": -755.635546875, "loss": 0.0404, "rewards/chosen": 8.647618430001396, "rewards/margins": 21.954083578927175, "rewards/rejected": -13.306465148925781, "step": 1324 }, { "epoch": 0.3315400975853872, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37805664.0, "logits/rejected": -49762391.27272727, "logps/chosen": -327.4065504807692, "logps/rejected": -559.6705433238636, "loss": 0.0252, "rewards/chosen": 6.329107431265024, "rewards/margins": 18.372063696801245, "rewards/rejected": -12.04295626553622, "step": 1325 }, { "epoch": 0.3317903165269611, "grad_norm": 10.625, "kl": 2.1260504722595215, "learning_rate": 5e-06, "logits/chosen": -58163997.538461536, "logits/rejected": -58095197.09090909, "logps/chosen": -354.61658653846155, "logps/rejected": -867.4247159090909, "loss": 0.0375, "rewards/chosen": 6.199777456430288, "rewards/margins": 21.506789841018357, "rewards/rejected": -15.307012384588068, "step": 1326 }, { "epoch": 0.33204053546853496, "grad_norm": 15.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27763982.0, "logits/rejected": -47246644.0, "logps/chosen": -394.0245666503906, "logps/rejected": -583.006103515625, "loss": 0.0318, "rewards/chosen": 6.902261734008789, "rewards/margins": 19.452731132507324, "rewards/rejected": -12.550469398498535, "step": 1327 }, { "epoch": 0.33229075441010886, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61280471.27272727, "logits/rejected": -29800585.846153848, "logps/chosen": -466.18918678977275, "logps/rejected": -373.14002403846155, "loss": 0.0337, "rewards/chosen": 7.508759932084517, "rewards/margins": 16.588247072446595, "rewards/rejected": -9.07948714036208, "step": 1328 }, { "epoch": 0.3325409733516827, "grad_norm": 5.5, "kl": 0.4970232844352722, "learning_rate": 5e-06, "logits/chosen": -33947829.333333336, "logits/rejected": -36240066.666666664, "logps/chosen": -382.8893229166667, "logps/rejected": -538.6937662760416, "loss": 0.0333, "rewards/chosen": 6.703197479248047, "rewards/margins": 18.959982554117836, "rewards/rejected": -12.256785074869791, "step": 1329 }, { "epoch": 0.3327911922932566, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37734368.0, "logits/rejected": 33503694.933333334, "logps/chosen": -349.6689181857639, "logps/rejected": -447.7561848958333, "loss": 0.0202, "rewards/chosen": 6.241354200575087, "rewards/margins": 16.77003156873915, "rewards/rejected": -10.528677368164063, "step": 1330 }, { "epoch": 0.3330414112348305, "grad_norm": 5.71875, "kl": 1.872991919517517, "learning_rate": 5e-06, "logits/chosen": -43453418.666666664, "logits/rejected": -6143753.333333333, "logps/chosen": -343.99696180555554, "logps/rejected": -605.0619710286459, "loss": 0.0732, "rewards/chosen": 5.6351267496744795, "rewards/margins": 16.732693990071613, "rewards/rejected": -11.097567240397135, "step": 1331 }, { "epoch": 0.33329163017640434, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54579840.0, "logits/rejected": -63119669.333333336, "logps/chosen": -438.6621500651042, "logps/rejected": -531.3933512369791, "loss": 0.0517, "rewards/chosen": 5.8907470703125, "rewards/margins": 15.888117472330729, "rewards/rejected": -9.997370402018229, "step": 1332 }, { "epoch": 0.33354184911797824, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39835549.09090909, "logits/rejected": -28802279.384615384, "logps/chosen": -275.9366344105114, "logps/rejected": -451.02249849759613, "loss": 0.1043, "rewards/chosen": 4.384413979270241, "rewards/margins": 14.761321327903055, "rewards/rejected": -10.376907348632812, "step": 1333 }, { "epoch": 0.3337920680595521, "grad_norm": 6.375, "kl": 0.39078569412231445, "learning_rate": 5e-06, "logits/chosen": -41747347.2, "logits/rejected": -48206258.28571428, "logps/chosen": -188.80257568359374, "logps/rejected": -517.7456752232143, "loss": 0.0396, "rewards/chosen": 4.530624771118164, "rewards/margins": 16.05058435712542, "rewards/rejected": -11.519959586007255, "step": 1334 }, { "epoch": 0.334042287001126, "grad_norm": 11.375, "kl": 1.87445068359375, "learning_rate": 5e-06, "logits/chosen": -51963562.666666664, "logits/rejected": -34073514.666666664, "logps/chosen": -411.3940755208333, "logps/rejected": -372.69349500868054, "loss": 0.0319, "rewards/chosen": 7.220335896809896, "rewards/margins": 18.33167928059896, "rewards/rejected": -11.111343383789062, "step": 1335 }, { "epoch": 0.3342925059426999, "grad_norm": 18.25, "kl": 1.205657958984375, "learning_rate": 5e-06, "logits/chosen": -7935102.4, "logits/rejected": -35353090.28571428, "logps/chosen": -305.5932373046875, "logps/rejected": -439.08461216517856, "loss": 0.0682, "rewards/chosen": 5.213975524902343, "rewards/margins": 17.195648629324776, "rewards/rejected": -11.981673104422432, "step": 1336 }, { "epoch": 0.3345427248842737, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50167117.71428572, "logits/rejected": -64593702.4, "logps/chosen": -365.00830078125, "logps/rejected": -658.148388671875, "loss": 0.0617, "rewards/chosen": 5.423768724714007, "rewards/margins": 18.849464525495257, "rewards/rejected": -13.42569580078125, "step": 1337 }, { "epoch": 0.3347929438258476, "grad_norm": 10.5, "kl": 2.0387485027313232, "learning_rate": 5e-06, "logits/chosen": -52057192.72727273, "logits/rejected": -22635057.230769232, "logps/chosen": -426.41792436079544, "logps/rejected": -537.6845327524038, "loss": 0.037, "rewards/chosen": 6.5159149169921875, "rewards/margins": 18.38060349684495, "rewards/rejected": -11.864688579852764, "step": 1338 }, { "epoch": 0.3350431627674215, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28478294.4, "logits/rejected": -22022157.714285713, "logps/chosen": -207.52890625, "logps/rejected": -577.8662109375, "loss": 0.0594, "rewards/chosen": 5.3212932586669925, "rewards/margins": 15.112489809308734, "rewards/rejected": -9.791196550641741, "step": 1339 }, { "epoch": 0.33529338170899536, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54632616.0, "logits/rejected": -46140208.0, "logps/chosen": -337.9712219238281, "logps/rejected": -574.8458251953125, "loss": 0.0476, "rewards/chosen": 5.999845504760742, "rewards/margins": 19.704029083251953, "rewards/rejected": -13.704183578491211, "step": 1340 }, { "epoch": 0.33554360065056926, "grad_norm": 13.0, "kl": 1.5396665334701538, "learning_rate": 5e-06, "logits/chosen": -67867834.18181819, "logits/rejected": -52158749.538461536, "logps/chosen": -307.18257279829544, "logps/rejected": -528.5794771634615, "loss": 0.0609, "rewards/chosen": 5.633566076105291, "rewards/margins": 17.671494730702648, "rewards/rejected": -12.037928654597355, "step": 1341 }, { "epoch": 0.3357938195921431, "grad_norm": 6.8125, "kl": 5.219085693359375, "learning_rate": 5e-06, "logits/chosen": -52744969.14285714, "logits/rejected": -25142401.6, "logps/chosen": -412.9884556361607, "logps/rejected": -497.02978515625, "loss": 0.046, "rewards/chosen": 7.841343470982143, "rewards/margins": 21.259319850376674, "rewards/rejected": -13.417976379394531, "step": 1342 }, { "epoch": 0.336044038533717, "grad_norm": 4.4375, "kl": 0.0262451171875, "learning_rate": 5e-06, "logits/chosen": -43297984.0, "logits/rejected": -48623338.666666664, "logps/chosen": -377.7265218098958, "logps/rejected": -585.1934000651041, "loss": 0.0504, "rewards/chosen": 6.501618067423503, "rewards/margins": 18.779150009155273, "rewards/rejected": -12.277531941731771, "step": 1343 }, { "epoch": 0.3362942574752909, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30450723.555555556, "logits/rejected": -31818880.0, "logps/chosen": -414.99799262152777, "logps/rejected": -767.8289713541667, "loss": 0.0398, "rewards/chosen": 5.126502143012153, "rewards/margins": 21.735421413845486, "rewards/rejected": -16.608919270833333, "step": 1344 }, { "epoch": 0.33654447641686475, "grad_norm": 11.0625, "kl": 4.947748184204102, "learning_rate": 5e-06, "logits/chosen": -51574592.0, "logits/rejected": -78118165.33333333, "logps/chosen": -442.65576171875, "logps/rejected": -623.1094156901041, "loss": 0.0512, "rewards/chosen": 7.760725657145183, "rewards/margins": 18.367804845174152, "rewards/rejected": -10.60707918802897, "step": 1345 }, { "epoch": 0.33679469535843864, "grad_norm": 12.8125, "kl": 7.564021110534668, "learning_rate": 5e-06, "logits/chosen": -67430976.0, "logits/rejected": -62796666.666666664, "logps/chosen": -436.0860188802083, "logps/rejected": -500.9079996744792, "loss": 0.0495, "rewards/chosen": 6.593201955159505, "rewards/margins": 15.753700256347656, "rewards/rejected": -9.16049830118815, "step": 1346 }, { "epoch": 0.3370449143000125, "grad_norm": 10.9375, "kl": 7.317770004272461, "learning_rate": 5e-06, "logits/chosen": -22696803.76470588, "logits/rejected": -70915177.14285715, "logps/chosen": -308.36764705882354, "logps/rejected": -582.8038504464286, "loss": 0.0669, "rewards/chosen": 6.396182789522059, "rewards/margins": 18.096769380970162, "rewards/rejected": -11.700586591448102, "step": 1347 }, { "epoch": 0.3372951332415864, "grad_norm": 7.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42279850.666666664, "logits/rejected": -68172637.86666666, "logps/chosen": -335.36371527777777, "logps/rejected": -596.8044270833333, "loss": 0.025, "rewards/chosen": 6.992486741807726, "rewards/margins": 18.569833543565537, "rewards/rejected": -11.577346801757812, "step": 1348 }, { "epoch": 0.3375453521831603, "grad_norm": 8.6875, "kl": 1.6140928268432617, "learning_rate": 5e-06, "logits/chosen": -44424864.0, "logits/rejected": -70485562.66666667, "logps/chosen": -360.2839762369792, "logps/rejected": -661.4430338541666, "loss": 0.082, "rewards/chosen": 6.851779937744141, "rewards/margins": 19.409959157307945, "rewards/rejected": -12.558179219563803, "step": 1349 }, { "epoch": 0.33779557112473413, "grad_norm": 20.125, "kl": 13.532404899597168, "learning_rate": 5e-06, "logits/chosen": -54110276.92307692, "logits/rejected": -44575662.54545455, "logps/chosen": -385.5871394230769, "logps/rejected": -538.53173828125, "loss": 0.0824, "rewards/chosen": 7.348847022423377, "rewards/margins": 15.594080731585308, "rewards/rejected": -8.245233709161932, "step": 1350 }, { "epoch": 0.338045790066308, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50688764.44444445, "logits/rejected": -46143872.0, "logps/chosen": -504.49370659722223, "logps/rejected": -485.072265625, "loss": 0.0401, "rewards/chosen": 8.675616794162327, "rewards/margins": 14.77322726779514, "rewards/rejected": -6.0976104736328125, "step": 1351 }, { "epoch": 0.33829600900788187, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36727572.571428575, "logits/rejected": -27164060.8, "logps/chosen": -303.04356166294644, "logps/rejected": -322.324267578125, "loss": 0.0498, "rewards/chosen": 6.981774466378348, "rewards/margins": 14.689091055733817, "rewards/rejected": -7.707316589355469, "step": 1352 }, { "epoch": 0.33854622794945577, "grad_norm": 11.0, "kl": 8.41501235961914, "learning_rate": 5e-06, "logits/chosen": -45846715.733333334, "logits/rejected": 32867413.333333332, "logps/chosen": -419.6502278645833, "logps/rejected": -599.8024088541666, "loss": 0.0923, "rewards/chosen": 6.2668706258138025, "rewards/margins": 16.30726996527778, "rewards/rejected": -10.040399339463976, "step": 1353 }, { "epoch": 0.33879644689102967, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51326234.666666664, "logits/rejected": -78415200.0, "logps/chosen": -353.623291015625, "logps/rejected": -734.7068684895834, "loss": 0.0288, "rewards/chosen": 7.545745849609375, "rewards/margins": 22.716283162434898, "rewards/rejected": -15.170537312825521, "step": 1354 }, { "epoch": 0.3390466658326035, "grad_norm": 3.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36534173.09090909, "logits/rejected": -51090215.384615384, "logps/chosen": -383.0226384943182, "logps/rejected": -695.0777493990385, "loss": 0.0118, "rewards/chosen": 6.529823303222656, "rewards/margins": 18.708175072303185, "rewards/rejected": -12.178351769080528, "step": 1355 }, { "epoch": 0.3392968847741774, "grad_norm": 8.125, "kl": 2.185840606689453, "learning_rate": 5e-06, "logits/chosen": -44884644.0, "logits/rejected": -39766516.0, "logps/chosen": -494.3297424316406, "logps/rejected": -685.2808837890625, "loss": 0.0089, "rewards/chosen": 9.92443561553955, "rewards/margins": 23.356390953063965, "rewards/rejected": -13.431955337524414, "step": 1356 }, { "epoch": 0.3395471037157513, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74740584.72727273, "logits/rejected": -66027347.692307696, "logps/chosen": -446.85293856534093, "logps/rejected": -647.4830979567307, "loss": 0.0253, "rewards/chosen": 7.935643282803622, "rewards/margins": 20.64057703618403, "rewards/rejected": -12.70493375338041, "step": 1357 }, { "epoch": 0.33979732265732515, "grad_norm": 8.625, "kl": 2.9340224266052246, "learning_rate": 5e-06, "logits/chosen": -27940229.333333332, "logits/rejected": -51206391.46666667, "logps/chosen": -440.60986328125, "logps/rejected": -557.525, "loss": 0.0655, "rewards/chosen": 6.054548051622179, "rewards/margins": 15.8319701300727, "rewards/rejected": -9.77742207845052, "step": 1358 }, { "epoch": 0.34004754159889905, "grad_norm": 12.3125, "kl": 0.7810115814208984, "learning_rate": 5e-06, "logits/chosen": -15250318.0, "logits/rejected": 2441615.0, "logps/chosen": -291.82830810546875, "logps/rejected": -410.85205078125, "loss": 0.0746, "rewards/chosen": 5.514744281768799, "rewards/margins": 12.880590915679932, "rewards/rejected": -7.365846633911133, "step": 1359 }, { "epoch": 0.3402977605404729, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31370933.333333332, "logits/rejected": -75846480.0, "logps/chosen": -366.5683186848958, "logps/rejected": -579.5812174479166, "loss": 0.0363, "rewards/chosen": 7.31094233194987, "rewards/margins": 19.48259989420573, "rewards/rejected": -12.17165756225586, "step": 1360 }, { "epoch": 0.3405479794820468, "grad_norm": 27.125, "kl": 0.7354120016098022, "learning_rate": 5e-06, "logits/chosen": -24336891.42857143, "logits/rejected": -88193536.0, "logps/chosen": -346.33517020089283, "logps/rejected": -600.51484375, "loss": 0.0647, "rewards/chosen": 4.871701921735491, "rewards/margins": 15.511420549665178, "rewards/rejected": -10.639718627929687, "step": 1361 }, { "epoch": 0.3407981984236207, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72564198.4, "logits/rejected": -81451520.0, "logps/chosen": -441.51897786458335, "logps/rejected": -681.1800130208334, "loss": 0.0256, "rewards/chosen": 6.898324584960937, "rewards/margins": 18.322556220160592, "rewards/rejected": -11.424231635199654, "step": 1362 }, { "epoch": 0.34104841736519453, "grad_norm": 17.125, "kl": 3.1553988456726074, "learning_rate": 5e-06, "logits/chosen": -64922200.615384616, "logits/rejected": -64769378.90909091, "logps/chosen": -389.02640474759613, "logps/rejected": -428.58065518465907, "loss": 0.079, "rewards/chosen": 5.469981266902043, "rewards/margins": 13.756486532571433, "rewards/rejected": -8.286505265669389, "step": 1363 }, { "epoch": 0.34129863630676843, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71494608.0, "logits/rejected": -45395173.333333336, "logps/chosen": -488.9459635416667, "logps/rejected": -692.489013671875, "loss": 0.0271, "rewards/chosen": 7.334481557210286, "rewards/margins": 19.616621653238933, "rewards/rejected": -12.282140096028646, "step": 1364 }, { "epoch": 0.3415488552483423, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46601742.54545455, "logits/rejected": -68418092.3076923, "logps/chosen": -440.64936967329544, "logps/rejected": -582.9760366586538, "loss": 0.0271, "rewards/chosen": 6.367367137562145, "rewards/margins": 18.072649495584983, "rewards/rejected": -11.705282358022837, "step": 1365 }, { "epoch": 0.3417990741899162, "grad_norm": 18.5, "kl": 18.15050506591797, "learning_rate": 5e-06, "logits/chosen": -50474990.93333333, "logits/rejected": -61366727.11111111, "logps/chosen": -523.4237630208333, "logps/rejected": -563.9654405381945, "loss": 0.1084, "rewards/chosen": 7.516913859049479, "rewards/margins": 21.16167229546441, "rewards/rejected": -13.64475843641493, "step": 1366 }, { "epoch": 0.3420492931314901, "grad_norm": 7.03125, "kl": 3.886951446533203, "learning_rate": 5e-06, "logits/chosen": -55835328.0, "logits/rejected": -13413665.454545455, "logps/chosen": -467.9079777644231, "logps/rejected": -525.4630681818181, "loss": 0.0581, "rewards/chosen": 7.6134209266075725, "rewards/margins": 16.18602501929223, "rewards/rejected": -8.572604092684658, "step": 1367 }, { "epoch": 0.3422995120730639, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12680450.461538462, "logits/rejected": -24374149.818181816, "logps/chosen": -323.2471454326923, "logps/rejected": -440.1715198863636, "loss": 0.0581, "rewards/chosen": 5.362631577711839, "rewards/margins": 15.90045950296042, "rewards/rejected": -10.53782792524858, "step": 1368 }, { "epoch": 0.3425497310146378, "grad_norm": 1.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21023002.666666668, "logits/rejected": -40075374.222222224, "logps/chosen": -363.3076171875, "logps/rejected": -516.2307942708334, "loss": 0.0028, "rewards/chosen": 6.823829650878906, "rewards/margins": 18.073543124728733, "rewards/rejected": -11.249713473849827, "step": 1369 }, { "epoch": 0.34279994995621166, "grad_norm": 10.875, "kl": 0.19117769598960876, "learning_rate": 5e-06, "logits/chosen": -38705467.07692308, "logits/rejected": -37777914.18181818, "logps/chosen": -411.40538611778845, "logps/rejected": -368.20805220170456, "loss": 0.0516, "rewards/chosen": 6.531492379995493, "rewards/margins": 13.993226111352026, "rewards/rejected": -7.461733731356534, "step": 1370 }, { "epoch": 0.34305016889778556, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55614997.333333336, "logits/rejected": -15105030.666666666, "logps/chosen": -421.9579264322917, "logps/rejected": -529.8504231770834, "loss": 0.0707, "rewards/chosen": 7.2065887451171875, "rewards/margins": 21.210697174072266, "rewards/rejected": -14.004108428955078, "step": 1371 }, { "epoch": 0.34330038783935946, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30513008.0, "logits/rejected": -28525005.333333332, "logps/chosen": -357.9259847005208, "logps/rejected": -402.765380859375, "loss": 0.0687, "rewards/chosen": 5.72294553120931, "rewards/margins": 14.270186742146809, "rewards/rejected": -8.5472412109375, "step": 1372 }, { "epoch": 0.3435506067809333, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59987328.0, "logits/rejected": -25641462.0, "logps/chosen": -263.6124267578125, "logps/rejected": -471.99639892578125, "loss": 0.0283, "rewards/chosen": 5.485619068145752, "rewards/margins": 15.866224765777588, "rewards/rejected": -10.380605697631836, "step": 1373 }, { "epoch": 0.3438008257225072, "grad_norm": 2.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10521260.8, "logits/rejected": -46716654.222222224, "logps/chosen": -480.3423177083333, "logps/rejected": -782.1529405381945, "loss": 0.0099, "rewards/chosen": 8.070053100585938, "rewards/margins": 22.473271687825523, "rewards/rejected": -14.403218587239584, "step": 1374 }, { "epoch": 0.3440510446640811, "grad_norm": 12.25, "kl": 8.94655704498291, "learning_rate": 5e-06, "logits/chosen": -63110028.8, "logits/rejected": -40571721.14285714, "logps/chosen": -489.54150390625, "logps/rejected": -442.15321568080356, "loss": 0.0692, "rewards/chosen": 8.175430297851562, "rewards/margins": 16.03187506539481, "rewards/rejected": -7.856444767543247, "step": 1375 }, { "epoch": 0.34430126360565494, "grad_norm": 7.59375, "kl": 3.9555816650390625, "learning_rate": 5e-06, "logits/chosen": -44090709.333333336, "logits/rejected": -53187781.333333336, "logps/chosen": -546.0808919270834, "logps/rejected": -703.1783040364584, "loss": 0.0249, "rewards/chosen": 7.2213389078776045, "rewards/margins": 20.84004847208659, "rewards/rejected": -13.618709564208984, "step": 1376 }, { "epoch": 0.34455148254722884, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40956295.11111111, "logits/rejected": -25625280.0, "logps/chosen": -354.81523980034723, "logps/rejected": -493.5981119791667, "loss": 0.0462, "rewards/chosen": 6.573668162027995, "rewards/margins": 14.77570826212565, "rewards/rejected": -8.202040100097657, "step": 1377 }, { "epoch": 0.3448017014888027, "grad_norm": 9.0625, "kl": 3.7113418579101562, "learning_rate": 5e-06, "logits/chosen": -50864117.333333336, "logits/rejected": -46956000.0, "logps/chosen": -318.4588623046875, "logps/rejected": -713.8990071614584, "loss": 0.0822, "rewards/chosen": 7.12844721476237, "rewards/margins": 20.45091374715169, "rewards/rejected": -13.322466532389322, "step": 1378 }, { "epoch": 0.3450519204303766, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87894656.0, "logits/rejected": -55881356.8, "logps/chosen": -324.99379185267856, "logps/rejected": -587.48818359375, "loss": 0.1286, "rewards/chosen": 5.334012712751116, "rewards/margins": 14.57075892857143, "rewards/rejected": -9.236746215820313, "step": 1379 }, { "epoch": 0.3453021393719505, "grad_norm": 6.40625, "kl": 2.066612958908081, "learning_rate": 5e-06, "logits/chosen": -46889063.384615384, "logits/rejected": -73043066.18181819, "logps/chosen": -361.35787259615387, "logps/rejected": -503.84645774147725, "loss": 0.0291, "rewards/chosen": 7.517757709209736, "rewards/margins": 19.83745430899667, "rewards/rejected": -12.319696599786932, "step": 1380 }, { "epoch": 0.3455523583135243, "grad_norm": 7.34375, "kl": 1.531408667564392, "learning_rate": 5e-06, "logits/chosen": -43154321.06666667, "logits/rejected": -27408366.222222224, "logps/chosen": -372.10631510416664, "logps/rejected": -567.4663628472222, "loss": 0.0163, "rewards/chosen": 6.417936197916666, "rewards/margins": 19.293846638997394, "rewards/rejected": -12.875910441080729, "step": 1381 }, { "epoch": 0.3458025772550982, "grad_norm": 1.1015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 19201792.0, "logits/rejected": -47947975.52941176, "logps/chosen": -455.1500767299107, "logps/rejected": -605.4234834558823, "loss": 0.0202, "rewards/chosen": 8.886889866420201, "rewards/margins": 19.985506522555312, "rewards/rejected": -11.09861665613511, "step": 1382 }, { "epoch": 0.34605279619667206, "grad_norm": 11.875, "kl": 8.238302230834961, "learning_rate": 5e-06, "logits/chosen": -57637897.84615385, "logits/rejected": -43387450.18181818, "logps/chosen": -364.1184269831731, "logps/rejected": -546.6664151278409, "loss": 0.094, "rewards/chosen": 6.448720491849459, "rewards/margins": 17.34112868942581, "rewards/rejected": -10.89240819757635, "step": 1383 }, { "epoch": 0.34630301513824596, "grad_norm": 2.28125, "kl": 2.4979376792907715, "learning_rate": 5e-06, "logits/chosen": -45941051.07692308, "logits/rejected": -45544192.0, "logps/chosen": -438.09510216346155, "logps/rejected": -649.1272194602273, "loss": 0.026, "rewards/chosen": 7.812551058255709, "rewards/margins": 19.802229181036246, "rewards/rejected": -11.98967812278054, "step": 1384 }, { "epoch": 0.34655323407981986, "grad_norm": 2.375, "kl": 4.510049343109131, "learning_rate": 5e-06, "logits/chosen": -40340452.571428575, "logits/rejected": -43033008.0, "logps/chosen": -414.4686802455357, "logps/rejected": -605.7154296875, "loss": 0.0272, "rewards/chosen": 8.496366228376116, "rewards/margins": 22.162793840680806, "rewards/rejected": -13.666427612304688, "step": 1385 }, { "epoch": 0.3468034530213937, "grad_norm": 0.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38219467.63636363, "logits/rejected": -43419072.0, "logps/chosen": -480.02787642045456, "logps/rejected": -548.1022761418269, "loss": 0.0032, "rewards/chosen": 7.689335909756747, "rewards/margins": 20.41747123878319, "rewards/rejected": -12.728135329026442, "step": 1386 }, { "epoch": 0.3470536719629676, "grad_norm": 8.0, "kl": 5.687331199645996, "learning_rate": 5e-06, "logits/chosen": -12470166.153846154, "logits/rejected": -52592384.0, "logps/chosen": -512.0307241586538, "logps/rejected": -661.3045099431819, "loss": 0.0638, "rewards/chosen": 8.254149216871996, "rewards/margins": 20.152106758597846, "rewards/rejected": -11.897957541725852, "step": 1387 }, { "epoch": 0.3473038909045415, "grad_norm": 4.71875, "kl": 4.2610015869140625, "learning_rate": 5e-06, "logits/chosen": -36584472.615384616, "logits/rejected": -68275642.18181819, "logps/chosen": -317.00931490384613, "logps/rejected": -678.169921875, "loss": 0.0463, "rewards/chosen": 7.358241741473858, "rewards/margins": 19.540428908554823, "rewards/rejected": -12.182187167080967, "step": 1388 }, { "epoch": 0.34755410984611534, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35182205.333333336, "logits/rejected": -48418435.55555555, "logps/chosen": -436.0703938802083, "logps/rejected": -698.2550998263889, "loss": 0.0854, "rewards/chosen": 8.249051411946615, "rewards/margins": 19.68235524495443, "rewards/rejected": -11.433303833007812, "step": 1389 }, { "epoch": 0.34780432878768924, "grad_norm": 15.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30450473.6, "logits/rejected": -30351821.714285713, "logps/chosen": -394.827099609375, "logps/rejected": -550.4891183035714, "loss": 0.033, "rewards/chosen": 8.100794219970703, "rewards/margins": 19.846066175188337, "rewards/rejected": -11.745271955217634, "step": 1390 }, { "epoch": 0.3480545477292631, "grad_norm": 6.34375, "kl": 3.1048903465270996, "learning_rate": 5e-06, "logits/chosen": -29222763.42857143, "logits/rejected": -61965824.0, "logps/chosen": -297.25547572544644, "logps/rejected": -538.98232421875, "loss": 0.0576, "rewards/chosen": 6.200111389160156, "rewards/margins": 16.02859573364258, "rewards/rejected": -9.828484344482423, "step": 1391 }, { "epoch": 0.348304766670837, "grad_norm": 13.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76160296.0, "logits/rejected": -70088784.0, "logps/chosen": -336.0784606933594, "logps/rejected": -760.227294921875, "loss": 0.04, "rewards/chosen": 6.1641387939453125, "rewards/margins": 19.47642707824707, "rewards/rejected": -13.312288284301758, "step": 1392 }, { "epoch": 0.3485549856124109, "grad_norm": 10.5, "kl": 1.8843917846679688, "learning_rate": 5e-06, "logits/chosen": -59755688.72727273, "logits/rejected": -46480329.84615385, "logps/chosen": -398.46462180397725, "logps/rejected": -546.4738581730769, "loss": 0.0191, "rewards/chosen": 7.625688726251775, "rewards/margins": 15.847116056855741, "rewards/rejected": -8.221427330603966, "step": 1393 }, { "epoch": 0.3488052045539847, "grad_norm": 9.125, "kl": 1.4063594341278076, "learning_rate": 5e-06, "logits/chosen": -43650157.71428572, "logits/rejected": -58462035.2, "logps/chosen": -339.45235770089283, "logps/rejected": -705.771240234375, "loss": 0.027, "rewards/chosen": 7.272457667759487, "rewards/margins": 22.102779933384486, "rewards/rejected": -14.830322265625, "step": 1394 }, { "epoch": 0.3490554234955586, "grad_norm": 19.0, "kl": 15.25861644744873, "learning_rate": 5e-06, "logits/chosen": -27900723.2, "logits/rejected": -37788544.0, "logps/chosen": -435.4731770833333, "logps/rejected": -534.9817708333334, "loss": 0.0844, "rewards/chosen": 8.663606770833333, "rewards/margins": 19.592518615722653, "rewards/rejected": -10.928911844889322, "step": 1395 }, { "epoch": 0.34930564243713247, "grad_norm": 4.3125, "kl": 6.739832878112793, "learning_rate": 5e-06, "logits/chosen": -48544906.666666664, "logits/rejected": -54690320.0, "logps/chosen": -399.119140625, "logps/rejected": -762.9268391927084, "loss": 0.0133, "rewards/chosen": 9.885173797607422, "rewards/margins": 25.321478525797524, "rewards/rejected": -15.436304728190104, "step": 1396 }, { "epoch": 0.34955586137870637, "grad_norm": 11.5, "kl": 2.9718384742736816, "learning_rate": 5e-06, "logits/chosen": -52491236.571428575, "logits/rejected": -53341286.4, "logps/chosen": -297.95186941964283, "logps/rejected": -592.1646484375, "loss": 0.0566, "rewards/chosen": 4.977242061070034, "rewards/margins": 18.4941289629255, "rewards/rejected": -13.516886901855468, "step": 1397 }, { "epoch": 0.34980608032028027, "grad_norm": 5.90625, "kl": 0.271176815032959, "learning_rate": 5e-06, "logits/chosen": -34071288.0, "logits/rejected": -64014901.333333336, "logps/chosen": -353.0765787760417, "logps/rejected": -691.5896809895834, "loss": 0.0642, "rewards/chosen": 5.96182378133138, "rewards/margins": 19.90054702758789, "rewards/rejected": -13.93872324625651, "step": 1398 }, { "epoch": 0.3500562992618541, "grad_norm": 1.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41278669.333333336, "logits/rejected": -2138555.5555555555, "logps/chosen": -366.9925130208333, "logps/rejected": -676.63037109375, "loss": 0.0137, "rewards/chosen": 6.664829254150391, "rewards/margins": 16.928256140814888, "rewards/rejected": -10.263426886664497, "step": 1399 }, { "epoch": 0.350306518203428, "grad_norm": 1.4140625, "kl": 12.511804580688477, "learning_rate": 5e-06, "logits/chosen": -36576179.2, "logits/rejected": -54901404.44444445, "logps/chosen": -551.7591796875, "logps/rejected": -816.2090928819445, "loss": 0.0023, "rewards/chosen": 10.2717041015625, "rewards/margins": 27.145997111002607, "rewards/rejected": -16.874293009440105, "step": 1400 }, { "epoch": 0.35055673714500185, "grad_norm": 0.74609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64965109.333333336, "logits/rejected": -46812032.0, "logps/chosen": -357.7642415364583, "logps/rejected": -564.5442708333334, "loss": 0.0051, "rewards/chosen": 8.211051940917969, "rewards/margins": 19.42884063720703, "rewards/rejected": -11.217788696289062, "step": 1401 }, { "epoch": 0.35080695608657575, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53030712.0, "logits/rejected": -48485420.0, "logps/chosen": -282.1611328125, "logps/rejected": -662.4784545898438, "loss": 0.0379, "rewards/chosen": 5.630801200866699, "rewards/margins": 17.286304473876953, "rewards/rejected": -11.655503273010254, "step": 1402 }, { "epoch": 0.35105717502814965, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -78118144.0, "logits/rejected": -61208064.0, "logps/chosen": -650.1380504261364, "logps/rejected": -775.9940655048077, "loss": 0.0062, "rewards/chosen": 10.618053783069957, "rewards/margins": 25.78856722958438, "rewards/rejected": -15.170513446514423, "step": 1403 }, { "epoch": 0.3513073939697235, "grad_norm": 12.6875, "kl": 2.596083402633667, "learning_rate": 5e-06, "logits/chosen": -68005970.28571428, "logits/rejected": 59958547.2, "logps/chosen": -431.27633231026783, "logps/rejected": -686.0212890625, "loss": 0.046, "rewards/chosen": 6.783527919224331, "rewards/margins": 16.459911128452845, "rewards/rejected": -9.676383209228515, "step": 1404 }, { "epoch": 0.3515576129112974, "grad_norm": 14.0625, "kl": 6.426671028137207, "learning_rate": 5e-06, "logits/chosen": -57330112.0, "logits/rejected": -31658176.0, "logps/chosen": -442.88025841346155, "logps/rejected": -523.8718039772727, "loss": 0.0662, "rewards/chosen": 7.974584726186899, "rewards/margins": 19.058880519200038, "rewards/rejected": -11.084295793013139, "step": 1405 }, { "epoch": 0.3518078318528713, "grad_norm": 1.6875, "kl": 7.128184795379639, "learning_rate": 5e-06, "logits/chosen": -39027270.4, "logits/rejected": -55688004.571428575, "logps/chosen": -426.928125, "logps/rejected": -450.0786830357143, "loss": 0.017, "rewards/chosen": 8.389739227294921, "rewards/margins": 18.13197468348912, "rewards/rejected": -9.742235456194196, "step": 1406 }, { "epoch": 0.35205805079444513, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58844288.0, "logits/rejected": -45988250.666666664, "logps/chosen": -382.8331705729167, "logps/rejected": -456.5674235026042, "loss": 0.0291, "rewards/chosen": 6.269336700439453, "rewards/margins": 15.390447616577148, "rewards/rejected": -9.121110916137695, "step": 1407 }, { "epoch": 0.35230826973601903, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34870579.2, "logits/rejected": -45762802.28571428, "logps/chosen": -375.8123291015625, "logps/rejected": -474.4597865513393, "loss": 0.0097, "rewards/chosen": 6.738951873779297, "rewards/margins": 18.88167495727539, "rewards/rejected": -12.142723083496094, "step": 1408 }, { "epoch": 0.3525584886775929, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56879675.07692308, "logits/rejected": -47576791.27272727, "logps/chosen": -470.84908353365387, "logps/rejected": -502.419921875, "loss": 0.0586, "rewards/chosen": 8.04593012883113, "rewards/margins": 18.539741996284967, "rewards/rejected": -10.493811867453836, "step": 1409 }, { "epoch": 0.3528087076191668, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36270729.84615385, "logits/rejected": -74471185.45454545, "logps/chosen": -474.7137920673077, "logps/rejected": -558.6563387784091, "loss": 0.0368, "rewards/chosen": 6.993660560021033, "rewards/margins": 19.025967577954273, "rewards/rejected": -12.032307017933238, "step": 1410 }, { "epoch": 0.35305892656074067, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25235098.181818184, "logits/rejected": 17158104.615384616, "logps/chosen": -281.726806640625, "logps/rejected": -545.2224308894231, "loss": 0.0833, "rewards/chosen": 4.301422812721946, "rewards/margins": 15.044850969648028, "rewards/rejected": -10.743428156926083, "step": 1411 }, { "epoch": 0.3533091455023145, "grad_norm": 14.9375, "kl": 5.812307357788086, "learning_rate": 5e-06, "logits/chosen": -65225700.571428575, "logits/rejected": -52220195.2, "logps/chosen": -439.56515066964283, "logps/rejected": -726.58310546875, "loss": 0.0469, "rewards/chosen": 8.212796892438616, "rewards/margins": 22.355350603376117, "rewards/rejected": -14.1425537109375, "step": 1412 }, { "epoch": 0.3535593644438884, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62327764.0, "logits/rejected": -24458312.0, "logps/chosen": -460.3680725097656, "logps/rejected": -596.10400390625, "loss": 0.0556, "rewards/chosen": 7.300900459289551, "rewards/margins": 19.922649383544922, "rewards/rejected": -12.621748924255371, "step": 1413 }, { "epoch": 0.35380958338546226, "grad_norm": 10.6875, "kl": 11.8612060546875, "learning_rate": 5e-06, "logits/chosen": -57664365.71428572, "logits/rejected": -29341267.2, "logps/chosen": -398.34326171875, "logps/rejected": -522.267919921875, "loss": 0.0721, "rewards/chosen": 5.677768162318638, "rewards/margins": 15.454412296840122, "rewards/rejected": -9.776644134521485, "step": 1414 }, { "epoch": 0.35405980232703615, "grad_norm": 1.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60141370.18181818, "logits/rejected": -57800438.15384615, "logps/chosen": -470.8170276988636, "logps/rejected": -560.9586463341346, "loss": 0.026, "rewards/chosen": 8.210657986727627, "rewards/margins": 21.072558903193972, "rewards/rejected": -12.861900916466347, "step": 1415 }, { "epoch": 0.35431002126861005, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32914932.0, "logits/rejected": -38338192.0, "logps/chosen": -173.03822326660156, "logps/rejected": -584.142333984375, "loss": 0.0764, "rewards/chosen": 3.7667694091796875, "rewards/margins": 15.68187141418457, "rewards/rejected": -11.915102005004883, "step": 1416 }, { "epoch": 0.3545602402101839, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77776715.63636364, "logits/rejected": -36113853.538461536, "logps/chosen": -396.49729225852275, "logps/rejected": -629.8743990384615, "loss": 0.0346, "rewards/chosen": 7.152030944824219, "rewards/margins": 21.61517862173227, "rewards/rejected": -14.463147676908052, "step": 1417 }, { "epoch": 0.3548104591517578, "grad_norm": 16.5, "kl": 7.717679500579834, "learning_rate": 5e-06, "logits/chosen": 17478704.0, "logits/rejected": -52612176.0, "logps/chosen": -429.768798828125, "logps/rejected": -281.666259765625, "loss": 0.0918, "rewards/chosen": 5.887630462646484, "rewards/margins": 13.455334186553955, "rewards/rejected": -7.567703723907471, "step": 1418 }, { "epoch": 0.35506067809333164, "grad_norm": 17.625, "kl": 11.13028335571289, "learning_rate": 5e-06, "logits/chosen": -28847306.666666668, "logits/rejected": -61549909.333333336, "logps/chosen": -463.72486979166666, "logps/rejected": -495.6037326388889, "loss": 0.13, "rewards/chosen": 7.3768056233723955, "rewards/margins": 19.00676981608073, "rewards/rejected": -11.629964192708334, "step": 1419 }, { "epoch": 0.35531089703490554, "grad_norm": 23.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74581996.3076923, "logits/rejected": -60997742.54545455, "logps/chosen": -365.0421799879808, "logps/rejected": -507.80229048295456, "loss": 0.0689, "rewards/chosen": 6.126974252554087, "rewards/margins": 14.834668566296985, "rewards/rejected": -8.707694313742898, "step": 1420 }, { "epoch": 0.35556111597647944, "grad_norm": 1.1953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70038394.18181819, "logits/rejected": -59319748.92307692, "logps/chosen": -489.3062855113636, "logps/rejected": -592.6949368990385, "loss": 0.0094, "rewards/chosen": 8.56694308194247, "rewards/margins": 24.725538240446078, "rewards/rejected": -16.158595158503605, "step": 1421 }, { "epoch": 0.3558113349180533, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55887858.28571428, "logits/rejected": 24521422.4, "logps/chosen": -500.5084751674107, "logps/rejected": -544.201953125, "loss": 0.034, "rewards/chosen": 8.7468626839774, "rewards/margins": 20.526266370500835, "rewards/rejected": -11.779403686523438, "step": 1422 }, { "epoch": 0.3560615538596272, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76280480.0, "logits/rejected": -67293340.44444445, "logps/chosen": -345.414306640625, "logps/rejected": -477.0716145833333, "loss": 0.0553, "rewards/chosen": 5.34048589070638, "rewards/margins": 14.012145572238499, "rewards/rejected": -8.671659681532118, "step": 1423 }, { "epoch": 0.3563117728012011, "grad_norm": 2.65625, "kl": 0.27488836646080017, "learning_rate": 5e-06, "logits/chosen": -34870131.692307696, "logits/rejected": -51258350.54545455, "logps/chosen": -399.97787710336536, "logps/rejected": -603.3857865767045, "loss": 0.0065, "rewards/chosen": 7.985346280611479, "rewards/margins": 20.77538118162355, "rewards/rejected": -12.790034901012074, "step": 1424 }, { "epoch": 0.3565619917427749, "grad_norm": 15.375, "kl": 3.0965240001678467, "learning_rate": 5e-06, "logits/chosen": -73984182.85714285, "logits/rejected": -50863641.6, "logps/chosen": -428.75948660714283, "logps/rejected": -346.171728515625, "loss": 0.0602, "rewards/chosen": 6.2626816885811945, "rewards/margins": 15.120948900495257, "rewards/rejected": -8.858267211914063, "step": 1425 }, { "epoch": 0.3568122106843488, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48847749.333333336, "logits/rejected": -41748872.0, "logps/chosen": -377.884521484375, "logps/rejected": -508.500244140625, "loss": 0.0284, "rewards/chosen": 6.642574310302734, "rewards/margins": 17.614665985107422, "rewards/rejected": -10.972091674804688, "step": 1426 }, { "epoch": 0.35706242962592266, "grad_norm": 12.6875, "kl": 1.6349167823791504, "learning_rate": 5e-06, "logits/chosen": -33113094.85714286, "logits/rejected": -52767555.2, "logps/chosen": -291.21182686941967, "logps/rejected": -551.41259765625, "loss": 0.072, "rewards/chosen": 5.402621677943638, "rewards/margins": 17.80347693307059, "rewards/rejected": -12.400855255126952, "step": 1427 }, { "epoch": 0.35731264856749656, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60668824.0, "logits/rejected": -72480448.0, "logps/chosen": -392.55889892578125, "logps/rejected": -691.7050170898438, "loss": 0.0189, "rewards/chosen": 6.731208324432373, "rewards/margins": 21.27409315109253, "rewards/rejected": -14.542884826660156, "step": 1428 }, { "epoch": 0.35756286750907046, "grad_norm": 12.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56924032.0, "logits/rejected": -57238551.27272727, "logps/chosen": -338.5987079326923, "logps/rejected": -551.2757013494319, "loss": 0.0453, "rewards/chosen": 6.334498478816106, "rewards/margins": 19.08776385967548, "rewards/rejected": -12.753265380859375, "step": 1429 }, { "epoch": 0.3578130864506443, "grad_norm": 2.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38227300.571428575, "logits/rejected": -13758476.8, "logps/chosen": -395.7186802455357, "logps/rejected": -662.495654296875, "loss": 0.0354, "rewards/chosen": 6.083441598074777, "rewards/margins": 20.663643319266182, "rewards/rejected": -14.580201721191406, "step": 1430 }, { "epoch": 0.3580633053922182, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50863402.666666664, "logits/rejected": -90437248.0, "logps/chosen": -437.3853352864583, "logps/rejected": -570.7180582682291, "loss": 0.0116, "rewards/chosen": 7.654365539550781, "rewards/margins": 18.859172821044922, "rewards/rejected": -11.20480728149414, "step": 1431 }, { "epoch": 0.35831352433379204, "grad_norm": 3.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44515515.428571425, "logits/rejected": -40173155.2, "logps/chosen": -348.9318150111607, "logps/rejected": -728.392578125, "loss": 0.0564, "rewards/chosen": 6.237430027553013, "rewards/margins": 19.374383762904575, "rewards/rejected": -13.136953735351563, "step": 1432 }, { "epoch": 0.35856374327536594, "grad_norm": 14.3125, "kl": 0.3510233759880066, "learning_rate": 5e-06, "logits/chosen": -78531214.22222222, "logits/rejected": -35573386.666666664, "logps/chosen": -477.5673828125, "logps/rejected": -664.309765625, "loss": 0.0403, "rewards/chosen": 8.29533216688368, "rewards/margins": 19.821913994683158, "rewards/rejected": -11.526581827799479, "step": 1433 }, { "epoch": 0.35881396221693984, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23483219.692307692, "logits/rejected": -37687714.90909091, "logps/chosen": -397.96206430288464, "logps/rejected": -571.9079367897727, "loss": 0.0246, "rewards/chosen": 5.6260516826923075, "rewards/margins": 22.129813667777533, "rewards/rejected": -16.503761985085227, "step": 1434 }, { "epoch": 0.3590641811585137, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42031988.0, "logits/rejected": -47345768.0, "logps/chosen": -401.73577880859375, "logps/rejected": -539.3719482421875, "loss": 0.0057, "rewards/chosen": 6.956500053405762, "rewards/margins": 18.414944648742676, "rewards/rejected": -11.458444595336914, "step": 1435 }, { "epoch": 0.3593144001000876, "grad_norm": 11.4375, "kl": 1.1771705150604248, "learning_rate": 5e-06, "logits/chosen": -75799488.0, "logits/rejected": -34032836.571428575, "logps/chosen": -351.0594970703125, "logps/rejected": -509.5528041294643, "loss": 0.0459, "rewards/chosen": 6.812472534179688, "rewards/margins": 17.59641353062221, "rewards/rejected": -10.783940996442523, "step": 1436 }, { "epoch": 0.3595646190416614, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30782788.57142857, "logits/rejected": -48426796.8, "logps/chosen": -337.0228794642857, "logps/rejected": -588.4873046875, "loss": 0.0353, "rewards/chosen": 5.464809962681362, "rewards/margins": 17.96720711844308, "rewards/rejected": -12.502397155761718, "step": 1437 }, { "epoch": 0.3598148379832353, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25957444.8, "logits/rejected": -49068164.571428575, "logps/chosen": -323.091943359375, "logps/rejected": -711.6405552455357, "loss": 0.0451, "rewards/chosen": 5.587612915039062, "rewards/margins": 16.85543954031808, "rewards/rejected": -11.267826625279017, "step": 1438 }, { "epoch": 0.3600650569248092, "grad_norm": 7.5, "kl": 11.932319641113281, "learning_rate": 5e-06, "logits/chosen": -67978737.23076923, "logits/rejected": -40119258.18181818, "logps/chosen": -507.05284705528845, "logps/rejected": -577.4150390625, "loss": 0.0532, "rewards/chosen": 8.382584205040565, "rewards/margins": 18.729393592247597, "rewards/rejected": -10.346809387207031, "step": 1439 }, { "epoch": 0.36031527586638307, "grad_norm": 2.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57670336.0, "logits/rejected": 85713389.71428572, "logps/chosen": -432.618798828125, "logps/rejected": -548.0721958705357, "loss": 0.0049, "rewards/chosen": 7.446672058105468, "rewards/margins": 18.45464368547712, "rewards/rejected": -11.007971627371651, "step": 1440 }, { "epoch": 0.36056549480795697, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55968552.72727273, "logits/rejected": -41064172.307692304, "logps/chosen": -332.743896484375, "logps/rejected": -533.4876802884615, "loss": 0.026, "rewards/chosen": 5.945619756525213, "rewards/margins": 16.623060693274013, "rewards/rejected": -10.677440936748798, "step": 1441 }, { "epoch": 0.36081571374953086, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49842715.428571425, "logits/rejected": -59495563.294117644, "logps/chosen": -563.5268903459821, "logps/rejected": -543.5722081801471, "loss": 0.0546, "rewards/chosen": 11.332850864955358, "rewards/margins": 21.585354203937435, "rewards/rejected": -10.252503338982077, "step": 1442 }, { "epoch": 0.3610659326911047, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40110734.222222224, "logits/rejected": -48277162.666666664, "logps/chosen": -431.4103732638889, "logps/rejected": -660.915625, "loss": 0.0195, "rewards/chosen": 9.167383829752604, "rewards/margins": 20.99742736816406, "rewards/rejected": -11.830043538411458, "step": 1443 }, { "epoch": 0.3613161516326786, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33238566.0, "logits/rejected": -67309864.0, "logps/chosen": -349.23663330078125, "logps/rejected": -633.099853515625, "loss": 0.0522, "rewards/chosen": 4.590353965759277, "rewards/margins": 20.492164611816406, "rewards/rejected": -15.901810646057129, "step": 1444 }, { "epoch": 0.36156637057425245, "grad_norm": 2.546875, "kl": 1.2477658987045288, "learning_rate": 5e-06, "logits/chosen": -38277891.2, "logits/rejected": -57382464.0, "logps/chosen": -445.97236328125, "logps/rejected": -766.3161969866071, "loss": 0.0105, "rewards/chosen": 9.916059875488282, "rewards/margins": 26.178626578194752, "rewards/rejected": -16.262566702706472, "step": 1445 }, { "epoch": 0.36181658951582635, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76586595.55555555, "logits/rejected": -60284834.13333333, "logps/chosen": -410.7468532986111, "logps/rejected": -590.7415364583334, "loss": 0.0632, "rewards/chosen": 6.82777574327257, "rewards/margins": 18.97667202419705, "rewards/rejected": -12.148896280924479, "step": 1446 }, { "epoch": 0.36206680845740025, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23338980.57142857, "logits/rejected": -47900617.6, "logps/chosen": -459.7919224330357, "logps/rejected": -697.594482421875, "loss": 0.048, "rewards/chosen": 5.634098052978516, "rewards/margins": 18.001346588134766, "rewards/rejected": -12.36724853515625, "step": 1447 }, { "epoch": 0.3623170273989741, "grad_norm": 27.875, "kl": 0.20477867126464844, "learning_rate": 5e-06, "logits/chosen": -45810074.666666664, "logits/rejected": -61941322.666666664, "logps/chosen": -356.6944580078125, "logps/rejected": -673.794677734375, "loss": 0.0901, "rewards/chosen": 5.975573221842448, "rewards/margins": 14.205772399902344, "rewards/rejected": -8.230199178059896, "step": 1448 }, { "epoch": 0.362567246340548, "grad_norm": 15.375, "kl": 5.429044246673584, "learning_rate": 5e-06, "logits/chosen": -27342520.615384616, "logits/rejected": -45290042.18181818, "logps/chosen": -355.38461538461536, "logps/rejected": -537.3761985085227, "loss": 0.1103, "rewards/chosen": 4.918288891132061, "rewards/margins": 17.5353820640724, "rewards/rejected": -12.617093172940342, "step": 1449 }, { "epoch": 0.36281746528212183, "grad_norm": 6.1875, "kl": 3.1310575008392334, "learning_rate": 5e-06, "logits/chosen": -50957627.07692308, "logits/rejected": -47080209.45454545, "logps/chosen": -381.6692457932692, "logps/rejected": -636.3359375, "loss": 0.0151, "rewards/chosen": 7.457385723407452, "rewards/margins": 20.24586561509779, "rewards/rejected": -12.788479891690342, "step": 1450 }, { "epoch": 0.36306768422369573, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63243426.90909091, "logits/rejected": -54973538.461538464, "logps/chosen": -327.92092063210225, "logps/rejected": -607.486328125, "loss": 0.0414, "rewards/chosen": 4.701632412997159, "rewards/margins": 18.160959523874563, "rewards/rejected": -13.459327110877403, "step": 1451 }, { "epoch": 0.36331790316526963, "grad_norm": 1.3984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40940501.333333336, "logits/rejected": -89388492.8, "logps/chosen": -549.4225260416666, "logps/rejected": -708.7557942708333, "loss": 0.003, "rewards/chosen": 10.054502699110243, "rewards/margins": 24.36369459364149, "rewards/rejected": -14.30919189453125, "step": 1452 }, { "epoch": 0.36356812210684347, "grad_norm": 8.875, "kl": 4.9395599365234375, "learning_rate": 5e-06, "logits/chosen": -59268502.85714286, "logits/rejected": -50867353.6, "logps/chosen": -370.87587193080356, "logps/rejected": -742.21689453125, "loss": 0.0269, "rewards/chosen": 6.457456861223493, "rewards/margins": 22.824256787981305, "rewards/rejected": -16.366799926757814, "step": 1453 }, { "epoch": 0.36381834104841737, "grad_norm": 15.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40006464.0, "logits/rejected": -43237564.0, "logps/chosen": -278.547607421875, "logps/rejected": -419.1848449707031, "loss": 0.0591, "rewards/chosen": 6.350396633148193, "rewards/margins": 15.127200603485107, "rewards/rejected": -8.776803970336914, "step": 1454 }, { "epoch": 0.36406855998999127, "grad_norm": 13.6875, "kl": 4.628375053405762, "learning_rate": 5e-06, "logits/chosen": -72887250.28571428, "logits/rejected": 51300352.0, "logps/chosen": -317.3553989955357, "logps/rejected": -668.96689453125, "loss": 0.0472, "rewards/chosen": 6.140809195382254, "rewards/margins": 17.672460501534598, "rewards/rejected": -11.531651306152344, "step": 1455 }, { "epoch": 0.3643187789315651, "grad_norm": 5.21875, "kl": 2.7030959129333496, "learning_rate": 5e-06, "logits/chosen": -70124012.8, "logits/rejected": -41078427.428571425, "logps/chosen": -454.91982421875, "logps/rejected": -442.45472935267856, "loss": 0.0413, "rewards/chosen": 8.184555053710938, "rewards/margins": 20.056630815778462, "rewards/rejected": -11.872075762067523, "step": 1456 }, { "epoch": 0.364568997873139, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62508608.0, "logits/rejected": -60690880.0, "logps/chosen": -412.9384358723958, "logps/rejected": -696.38525390625, "loss": 0.0444, "rewards/chosen": 6.140658696492513, "rewards/margins": 22.935712814331055, "rewards/rejected": -16.795054117838543, "step": 1457 }, { "epoch": 0.36481921681471285, "grad_norm": 3.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44248132.92307692, "logits/rejected": -107704354.9090909, "logps/chosen": -353.9615009014423, "logps/rejected": -618.1991743607955, "loss": 0.031, "rewards/chosen": 6.919779850886418, "rewards/margins": 21.778407170222355, "rewards/rejected": -14.858627319335938, "step": 1458 }, { "epoch": 0.36506943575628675, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79556812.8, "logits/rejected": -57513934.222222224, "logps/chosen": -408.33372395833334, "logps/rejected": -589.1618923611111, "loss": 0.0171, "rewards/chosen": 7.305982462565104, "rewards/margins": 21.13246290418837, "rewards/rejected": -13.826480441623264, "step": 1459 }, { "epoch": 0.36531965469786065, "grad_norm": 7.9375, "kl": 0.5344009399414062, "learning_rate": 5e-06, "logits/chosen": -21463386.666666668, "logits/rejected": -42706066.666666664, "logps/chosen": -252.90580240885416, "logps/rejected": -473.3464762369792, "loss": 0.0685, "rewards/chosen": 5.302000363667806, "rewards/margins": 16.691630999247234, "rewards/rejected": -11.389630635579428, "step": 1460 }, { "epoch": 0.3655698736394345, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63345425.45454545, "logits/rejected": -51707377.23076923, "logps/chosen": -364.1495472301136, "logps/rejected": -549.7163461538462, "loss": 0.0263, "rewards/chosen": 7.571725325150923, "rewards/margins": 20.361564849640107, "rewards/rejected": -12.789839524489183, "step": 1461 }, { "epoch": 0.3658200925810084, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44151470.222222224, "logits/rejected": -47670485.333333336, "logps/chosen": -297.3771158854167, "logps/rejected": -483.42666015625, "loss": 0.0522, "rewards/chosen": 5.755478752983941, "rewards/margins": 16.315081617567273, "rewards/rejected": -10.559602864583333, "step": 1462 }, { "epoch": 0.36607031152258224, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44405037.71428572, "logits/rejected": -45631990.4, "logps/chosen": -325.0625, "logps/rejected": -476.936181640625, "loss": 0.0603, "rewards/chosen": 4.838845934186663, "rewards/margins": 14.867067064557757, "rewards/rejected": -10.028221130371094, "step": 1463 }, { "epoch": 0.36632053046415614, "grad_norm": 16.5, "kl": 2.6466217041015625, "learning_rate": 5e-06, "logits/chosen": -79042285.71428572, "logits/rejected": -62999872.0, "logps/chosen": -406.38242885044644, "logps/rejected": -540.474072265625, "loss": 0.0348, "rewards/chosen": 6.694066728864398, "rewards/margins": 20.671233476911272, "rewards/rejected": -13.977166748046875, "step": 1464 }, { "epoch": 0.36657074940573003, "grad_norm": 7.59375, "kl": 3.3304905891418457, "learning_rate": 5e-06, "logits/chosen": -84168308.36363636, "logits/rejected": -38034441.84615385, "logps/chosen": -448.4808238636364, "logps/rejected": -532.1355543870193, "loss": 0.0385, "rewards/chosen": 8.67235079678622, "rewards/margins": 20.51585430865521, "rewards/rejected": -11.84350351186899, "step": 1465 }, { "epoch": 0.3668209683473039, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -90441744.0, "logits/rejected": -75981264.0, "logps/chosen": -386.78564453125, "logps/rejected": -691.1327514648438, "loss": 0.0395, "rewards/chosen": 6.249045372009277, "rewards/margins": 20.348111152648926, "rewards/rejected": -14.099065780639648, "step": 1466 }, { "epoch": 0.3670711872888778, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63987768.88888889, "logits/rejected": -36629307.733333334, "logps/chosen": -388.2990451388889, "logps/rejected": -582.2656901041667, "loss": 0.0059, "rewards/chosen": 8.126688639322916, "rewards/margins": 19.559955851236978, "rewards/rejected": -11.433267211914062, "step": 1467 }, { "epoch": 0.3673214062304516, "grad_norm": 2.46875, "kl": 3.54791522026062, "learning_rate": 5e-06, "logits/chosen": -58671863.46666667, "logits/rejected": -12699426.666666666, "logps/chosen": -363.3497721354167, "logps/rejected": -341.2548014322917, "loss": 0.0275, "rewards/chosen": 8.100952657063802, "rewards/margins": 16.662552388509113, "rewards/rejected": -8.561599731445312, "step": 1468 }, { "epoch": 0.3675716251720255, "grad_norm": 13.5625, "kl": 10.497701644897461, "learning_rate": 5e-06, "logits/chosen": -77485331.6923077, "logits/rejected": -38310112.0, "logps/chosen": -389.5580303485577, "logps/rejected": -478.53981711647725, "loss": 0.0393, "rewards/chosen": 7.283131526066707, "rewards/margins": 18.54624298735932, "rewards/rejected": -11.263111461292613, "step": 1469 }, { "epoch": 0.3678218441135994, "grad_norm": 9.9375, "kl": 8.398536682128906, "learning_rate": 5e-06, "logits/chosen": -45394133.333333336, "logits/rejected": -81482332.44444445, "logps/chosen": -295.29856770833334, "logps/rejected": -604.4596896701389, "loss": 0.0971, "rewards/chosen": 5.591945393880208, "rewards/margins": 18.995632765028212, "rewards/rejected": -13.403687371148003, "step": 1470 }, { "epoch": 0.36807206305517326, "grad_norm": 3.828125, "kl": 5.776141166687012, "learning_rate": 5e-06, "logits/chosen": -66453548.8, "logits/rejected": -35028992.0, "logps/chosen": -407.5990478515625, "logps/rejected": -544.4265834263393, "loss": 0.0081, "rewards/chosen": 8.373365783691407, "rewards/margins": 22.0897702898298, "rewards/rejected": -13.716404506138392, "step": 1471 }, { "epoch": 0.36832228199674716, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50171630.54545455, "logits/rejected": 51244342.15384615, "logps/chosen": -387.94797585227275, "logps/rejected": -515.2635591947115, "loss": 0.0606, "rewards/chosen": 5.185014204545454, "rewards/margins": 15.870903815422857, "rewards/rejected": -10.685889610877403, "step": 1472 }, { "epoch": 0.36857250093832106, "grad_norm": 11.25, "kl": 1.8079898357391357, "learning_rate": 5e-06, "logits/chosen": -50533488.0, "logits/rejected": -48172309.333333336, "logps/chosen": -366.69482421875, "logps/rejected": -727.1637369791666, "loss": 0.0531, "rewards/chosen": 5.371676762898763, "rewards/margins": 18.358312606811523, "rewards/rejected": -12.98663584391276, "step": 1473 }, { "epoch": 0.3688227198798949, "grad_norm": 2.25, "kl": 10.794523239135742, "learning_rate": 5e-06, "logits/chosen": -62364614.4, "logits/rejected": -61305526.85714286, "logps/chosen": -375.316259765625, "logps/rejected": -517.3069196428571, "loss": 0.005, "rewards/chosen": 9.397773742675781, "rewards/margins": 19.65023258754185, "rewards/rejected": -10.252458844866071, "step": 1474 }, { "epoch": 0.3690729388214688, "grad_norm": 12.125, "kl": 10.09415340423584, "learning_rate": 5e-06, "logits/chosen": -68816680.0, "logits/rejected": -38184508.0, "logps/chosen": -480.1839904785156, "logps/rejected": -519.7053833007812, "loss": 0.1193, "rewards/chosen": 6.878000259399414, "rewards/margins": 17.881831169128418, "rewards/rejected": -11.003830909729004, "step": 1475 }, { "epoch": 0.36932315776304264, "grad_norm": 3.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43772096.0, "logits/rejected": -38886904.615384616, "logps/chosen": -394.36874112215907, "logps/rejected": -581.1694711538462, "loss": 0.039, "rewards/chosen": 6.982310208407315, "rewards/margins": 19.27888120637907, "rewards/rejected": -12.296570997971754, "step": 1476 }, { "epoch": 0.36957337670461654, "grad_norm": 16.125, "kl": 1.2153505086898804, "learning_rate": 5e-06, "logits/chosen": -43910435.2, "logits/rejected": -61819936.0, "logps/chosen": -352.085205078125, "logps/rejected": -566.4048200334821, "loss": 0.0466, "rewards/chosen": 6.919402313232422, "rewards/margins": 17.703001076834543, "rewards/rejected": -10.78359876360212, "step": 1477 }, { "epoch": 0.36982359564619044, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47157385.84615385, "logits/rejected": -53794781.09090909, "logps/chosen": -350.62943209134613, "logps/rejected": -551.9283114346591, "loss": 0.0348, "rewards/chosen": 7.055459829477163, "rewards/margins": 20.27703409261637, "rewards/rejected": -13.221574263139205, "step": 1478 }, { "epoch": 0.3700738145877643, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41338093.333333336, "logits/rejected": -32626581.333333332, "logps/chosen": -298.90293375651044, "logps/rejected": -618.6534016927084, "loss": 0.0094, "rewards/chosen": 7.279170989990234, "rewards/margins": 21.39152399698893, "rewards/rejected": -14.112353006998697, "step": 1479 }, { "epoch": 0.3703240335293382, "grad_norm": 11.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59876096.0, "logits/rejected": -25147684.57142857, "logps/chosen": -453.362841796875, "logps/rejected": -436.4036342075893, "loss": 0.0418, "rewards/chosen": 9.705673980712891, "rewards/margins": 21.370231301443916, "rewards/rejected": -11.664557320731026, "step": 1480 }, { "epoch": 0.370574252470912, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37758555.428571425, "logits/rejected": 10056352.0, "logps/chosen": -354.9829799107143, "logps/rejected": -490.49040670955884, "loss": 0.0135, "rewards/chosen": 7.0408172607421875, "rewards/margins": 18.278410967658548, "rewards/rejected": -11.23759370691636, "step": 1481 }, { "epoch": 0.3708244714124859, "grad_norm": 18.0, "kl": 9.759722709655762, "learning_rate": 5e-06, "logits/chosen": 4218770.823529412, "logits/rejected": -11962620.57142857, "logps/chosen": -420.60285500919116, "logps/rejected": -449.01290457589283, "loss": 0.077, "rewards/chosen": 8.556619083180147, "rewards/margins": 20.04801203623539, "rewards/rejected": -11.491392953055245, "step": 1482 }, { "epoch": 0.3710746903540598, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43813546.666666664, "logits/rejected": -42713714.666666664, "logps/chosen": -413.4487711588542, "logps/rejected": -486.2548828125, "loss": 0.0343, "rewards/chosen": 8.300863265991211, "rewards/margins": 17.352370580037437, "rewards/rejected": -9.051507314046225, "step": 1483 }, { "epoch": 0.37132490929563366, "grad_norm": 12.8125, "kl": 2.8726329803466797, "learning_rate": 5e-06, "logits/chosen": -18753536.0, "logits/rejected": -83926448.0, "logps/chosen": -398.6054280598958, "logps/rejected": -629.783447265625, "loss": 0.0237, "rewards/chosen": 8.39282480875651, "rewards/margins": 18.916951497395832, "rewards/rejected": -10.524126688639322, "step": 1484 }, { "epoch": 0.37157512823720756, "grad_norm": 7.9375, "kl": 0.5384852290153503, "learning_rate": 5e-06, "logits/chosen": -47932476.0, "logits/rejected": -42761188.0, "logps/chosen": -448.7462158203125, "logps/rejected": -492.65087890625, "loss": 0.0382, "rewards/chosen": 7.764603137969971, "rewards/margins": 16.71304178237915, "rewards/rejected": -8.94843864440918, "step": 1485 }, { "epoch": 0.3718253471787814, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79898827.63636364, "logits/rejected": -28066569.846153848, "logps/chosen": -379.1720525568182, "logps/rejected": -700.5993088942307, "loss": 0.0246, "rewards/chosen": 6.93018271706321, "rewards/margins": 23.255096328842058, "rewards/rejected": -16.324913611778847, "step": 1486 }, { "epoch": 0.3720755661203553, "grad_norm": 7.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68727541.33333333, "logits/rejected": -39419306.666666664, "logps/chosen": -243.83072916666666, "logps/rejected": -586.9990641276041, "loss": 0.0425, "rewards/chosen": 5.1235246658325195, "rewards/margins": 20.250266710917153, "rewards/rejected": -15.126742045084635, "step": 1487 }, { "epoch": 0.3723257850619292, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73166112.0, "logits/rejected": -30268201.14285714, "logps/chosen": -385.7388916015625, "logps/rejected": -737.8534458705357, "loss": 0.0166, "rewards/chosen": 6.703889465332031, "rewards/margins": 19.759499032156807, "rewards/rejected": -13.055609566824776, "step": 1488 }, { "epoch": 0.37257600400350305, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76090630.4, "logits/rejected": -51500681.14285714, "logps/chosen": -552.44677734375, "logps/rejected": -731.6400669642857, "loss": 0.005, "rewards/chosen": 8.726946258544922, "rewards/margins": 25.98178983415876, "rewards/rejected": -17.25484357561384, "step": 1489 }, { "epoch": 0.37282622294507695, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55076461.71428572, "logits/rejected": -42433923.76470588, "logps/chosen": -453.6431361607143, "logps/rejected": -488.16457950367646, "loss": 0.0252, "rewards/chosen": 6.651004791259766, "rewards/margins": 16.44989215626436, "rewards/rejected": -9.798887365004596, "step": 1490 }, { "epoch": 0.37307644188665084, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29556128.0, "logits/rejected": -27918313.14285714, "logps/chosen": -291.852490234375, "logps/rejected": -406.06033761160717, "loss": 0.0543, "rewards/chosen": 7.14937744140625, "rewards/margins": 17.696335274832588, "rewards/rejected": -10.546957833426339, "step": 1491 }, { "epoch": 0.3733266608282247, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72913254.4, "logits/rejected": -20996069.333333332, "logps/chosen": -406.1290690104167, "logps/rejected": -469.84624565972223, "loss": 0.0519, "rewards/chosen": 7.254035949707031, "rewards/margins": 20.007662455240883, "rewards/rejected": -12.753626505533854, "step": 1492 }, { "epoch": 0.3735768797697986, "grad_norm": 13.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53002656.0, "logits/rejected": -42120474.666666664, "logps/chosen": -423.9876708984375, "logps/rejected": -655.2089436848959, "loss": 0.0703, "rewards/chosen": 6.128908793131511, "rewards/margins": 20.21557871500651, "rewards/rejected": -14.086669921875, "step": 1493 }, { "epoch": 0.37382709871137243, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30981002.666666668, "logits/rejected": -29381042.666666668, "logps/chosen": -450.0165201822917, "logps/rejected": -379.5830078125, "loss": 0.0389, "rewards/chosen": 7.783847808837891, "rewards/margins": 17.051981608072914, "rewards/rejected": -9.268133799235025, "step": 1494 }, { "epoch": 0.37407731765294633, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47246299.428571425, "logits/rejected": -55030256.941176474, "logps/chosen": -469.45664760044644, "logps/rejected": -728.8492647058823, "loss": 0.0078, "rewards/chosen": 9.222942897251674, "rewards/margins": 22.16458879999754, "rewards/rejected": -12.941645902745863, "step": 1495 }, { "epoch": 0.3743275365945202, "grad_norm": 14.375, "kl": 8.163302421569824, "learning_rate": 5e-06, "logits/chosen": -60342880.0, "logits/rejected": 60988160.0, "logps/chosen": -341.5933837890625, "logps/rejected": -642.232421875, "loss": 0.1069, "rewards/chosen": 5.002639452616374, "rewards/margins": 18.22743574778239, "rewards/rejected": -13.224796295166016, "step": 1496 }, { "epoch": 0.37457775553609407, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39287498.666666664, "logits/rejected": -53840477.86666667, "logps/chosen": -421.43299696180554, "logps/rejected": -626.4707682291667, "loss": 0.0259, "rewards/chosen": 7.583435906304254, "rewards/margins": 22.185001458062064, "rewards/rejected": -14.601565551757812, "step": 1497 }, { "epoch": 0.37482797447766797, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48876600.0, "logits/rejected": -52909204.0, "logps/chosen": -333.1134338378906, "logps/rejected": -550.0637817382812, "loss": 0.0451, "rewards/chosen": 6.283137321472168, "rewards/margins": 18.175339698791504, "rewards/rejected": -11.892202377319336, "step": 1498 }, { "epoch": 0.3750781934192418, "grad_norm": 9.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51574290.28571428, "logits/rejected": -60402867.2, "logps/chosen": -290.1018763950893, "logps/rejected": -598.34931640625, "loss": 0.0612, "rewards/chosen": 5.244940076555524, "rewards/margins": 17.886013684953962, "rewards/rejected": -12.641073608398438, "step": 1499 }, { "epoch": 0.3753284123608157, "grad_norm": 10.0, "kl": 9.629730224609375, "learning_rate": 5e-06, "logits/chosen": -86127396.57142857, "logits/rejected": -47186486.4, "logps/chosen": -359.7578822544643, "logps/rejected": -604.67939453125, "loss": 0.0445, "rewards/chosen": 7.190484183175223, "rewards/margins": 21.387906973702567, "rewards/rejected": -14.197422790527344, "step": 1500 }, { "epoch": 0.3755786313023896, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66806793.84615385, "logits/rejected": -43458059.63636363, "logps/chosen": -437.1399489182692, "logps/rejected": -411.3058416193182, "loss": 0.061, "rewards/chosen": 6.5617546668419475, "rewards/margins": 15.403611883416877, "rewards/rejected": -8.84185721657493, "step": 1501 }, { "epoch": 0.37582885024396345, "grad_norm": 11.5, "kl": 1.826680064201355, "learning_rate": 5e-06, "logits/chosen": -61711133.86666667, "logits/rejected": -82257905.77777778, "logps/chosen": -304.7396484375, "logps/rejected": -540.8516710069445, "loss": 0.0394, "rewards/chosen": 5.438275655110677, "rewards/margins": 16.29000006781684, "rewards/rejected": -10.851724412706163, "step": 1502 }, { "epoch": 0.37607906918553735, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34954262.4, "logits/rejected": -50900754.28571428, "logps/chosen": -395.3766357421875, "logps/rejected": -674.3683733258929, "loss": 0.0212, "rewards/chosen": 6.769110870361328, "rewards/margins": 21.44669919695173, "rewards/rejected": -14.677588326590401, "step": 1503 }, { "epoch": 0.37632928812711125, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58222069.333333336, "logits/rejected": -51208453.333333336, "logps/chosen": -371.5040283203125, "logps/rejected": -579.203125, "loss": 0.0455, "rewards/chosen": 4.879982630411784, "rewards/margins": 16.281888961791992, "rewards/rejected": -11.401906331380209, "step": 1504 }, { "epoch": 0.3765795070686851, "grad_norm": 1.3046875, "kl": 2.357339382171631, "learning_rate": 5e-06, "logits/chosen": -45106060.8, "logits/rejected": -55622848.0, "logps/chosen": -502.6572265625, "logps/rejected": -823.5059678819445, "loss": 0.0364, "rewards/chosen": 9.417509969075521, "rewards/margins": 26.391280110677084, "rewards/rejected": -16.973770141601562, "step": 1505 }, { "epoch": 0.376829726010259, "grad_norm": 17.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55083072.0, "logits/rejected": -45244051.2, "logps/chosen": -315.50167410714283, "logps/rejected": -541.287109375, "loss": 0.079, "rewards/chosen": 4.188118525913784, "rewards/margins": 16.441922542027065, "rewards/rejected": -12.253804016113282, "step": 1506 }, { "epoch": 0.37707994495183284, "grad_norm": 3.46875, "kl": 0.1512959897518158, "learning_rate": 5e-06, "logits/chosen": 2154250.285714286, "logits/rejected": -82293952.0, "logps/chosen": -369.66556222098217, "logps/rejected": -530.7470703125, "loss": 0.0154, "rewards/chosen": 6.587891714913504, "rewards/margins": 20.365275137765067, "rewards/rejected": -13.777383422851562, "step": 1507 }, { "epoch": 0.37733016389340673, "grad_norm": 14.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44232152.0, "logits/rejected": -50541624.0, "logps/chosen": -380.9112854003906, "logps/rejected": -488.965576171875, "loss": 0.0534, "rewards/chosen": 5.986564636230469, "rewards/margins": 15.534777641296387, "rewards/rejected": -9.548213005065918, "step": 1508 }, { "epoch": 0.37758038283498063, "grad_norm": 3.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40509820.0, "logits/rejected": -68375168.0, "logps/chosen": -392.9261474609375, "logps/rejected": -575.1669921875, "loss": 0.0321, "rewards/chosen": 6.878152847290039, "rewards/margins": 18.946308135986328, "rewards/rejected": -12.068155288696289, "step": 1509 }, { "epoch": 0.3778306017765545, "grad_norm": 9.375, "kl": 4.054653167724609, "learning_rate": 5e-06, "logits/chosen": -30676578.285714287, "logits/rejected": -48333776.0, "logps/chosen": -550.9013323102679, "logps/rejected": -822.26640625, "loss": 0.0075, "rewards/chosen": 8.581047058105469, "rewards/margins": 27.540748596191406, "rewards/rejected": -18.959701538085938, "step": 1510 }, { "epoch": 0.3780808207181284, "grad_norm": 3.21875, "kl": 2.9975523948669434, "learning_rate": 5e-06, "logits/chosen": -69100750.76923077, "logits/rejected": -41653134.54545455, "logps/chosen": -490.8140399639423, "logps/rejected": -612.9503284801136, "loss": 0.0054, "rewards/chosen": 8.579756516676683, "rewards/margins": 20.793590412273275, "rewards/rejected": -12.213833895596592, "step": 1511 }, { "epoch": 0.3783310396597022, "grad_norm": 5.4375, "kl": 16.88369369506836, "learning_rate": 5e-06, "logits/chosen": -54836566.85714286, "logits/rejected": -31125961.6, "logps/chosen": -430.0634068080357, "logps/rejected": -454.46572265625, "loss": 0.0706, "rewards/chosen": 10.103321620396205, "rewards/margins": 19.27356545584542, "rewards/rejected": -9.170243835449218, "step": 1512 }, { "epoch": 0.3785812586012761, "grad_norm": 3.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72190966.15384616, "logits/rejected": -66811345.45454545, "logps/chosen": -348.3506610576923, "logps/rejected": -591.2220348011364, "loss": 0.0289, "rewards/chosen": 6.97769282414363, "rewards/margins": 18.67318288096181, "rewards/rejected": -11.695490056818182, "step": 1513 }, { "epoch": 0.37883147754285, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79889938.28571428, "logits/rejected": -55504806.4, "logps/chosen": -397.7470005580357, "logps/rejected": -652.032080078125, "loss": 0.0574, "rewards/chosen": 7.618073599679129, "rewards/margins": 21.785721915108816, "rewards/rejected": -14.167648315429688, "step": 1514 }, { "epoch": 0.37908169648442386, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13003349.0, "logits/rejected": -42715532.0, "logps/chosen": -354.6552734375, "logps/rejected": -571.0947875976562, "loss": 0.0313, "rewards/chosen": 6.287143707275391, "rewards/margins": 18.31677532196045, "rewards/rejected": -12.029631614685059, "step": 1515 }, { "epoch": 0.37933191542599776, "grad_norm": 10.6875, "kl": 23.4040584564209, "learning_rate": 5e-06, "logits/chosen": -43846515.2, "logits/rejected": -33908547.55555555, "logps/chosen": -423.00283203125, "logps/rejected": -403.5305989583333, "loss": 0.1387, "rewards/chosen": 8.704026285807291, "rewards/margins": 16.69274664984809, "rewards/rejected": -7.988720364040798, "step": 1516 }, { "epoch": 0.3795821343675716, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -81527261.0909091, "logits/rejected": -26310112.0, "logps/chosen": -384.15988991477275, "logps/rejected": -470.4381760817308, "loss": 0.0291, "rewards/chosen": 7.699965043501421, "rewards/margins": 19.194301685253222, "rewards/rejected": -11.494336641751802, "step": 1517 }, { "epoch": 0.3798323533091455, "grad_norm": 15.8125, "kl": 4.488152980804443, "learning_rate": 5e-06, "logits/chosen": -59436473.6, "logits/rejected": 46565211.428571425, "logps/chosen": -411.4630859375, "logps/rejected": -614.6962890625, "loss": 0.0479, "rewards/chosen": 6.233036804199219, "rewards/margins": 17.670736258370535, "rewards/rejected": -11.437699454171318, "step": 1518 }, { "epoch": 0.3800825722507194, "grad_norm": 2.015625, "kl": 5.165832042694092, "learning_rate": 5e-06, "logits/chosen": -31114702.545454547, "logits/rejected": -73696659.6923077, "logps/chosen": -357.61172762784093, "logps/rejected": -678.9456129807693, "loss": 0.0079, "rewards/chosen": 8.33049149946733, "rewards/margins": 21.25602860884233, "rewards/rejected": -12.925537109375, "step": 1519 }, { "epoch": 0.38033279119229324, "grad_norm": 17.125, "kl": 11.158552169799805, "learning_rate": 5e-06, "logits/chosen": -57488808.72727273, "logits/rejected": -39432851.692307696, "logps/chosen": -456.2736150568182, "logps/rejected": -261.6331129807692, "loss": 0.1006, "rewards/chosen": 8.497854059392756, "rewards/margins": 14.644600181312828, "rewards/rejected": -6.1467461219200725, "step": 1520 }, { "epoch": 0.38058301013386714, "grad_norm": 21.375, "kl": 4.887245178222656, "learning_rate": 5e-06, "logits/chosen": -56543266.90909091, "logits/rejected": -90804214.15384616, "logps/chosen": -431.1031605113636, "logps/rejected": -582.3155423677885, "loss": 0.0892, "rewards/chosen": 6.0468361594460225, "rewards/margins": 14.944600512097765, "rewards/rejected": -8.897764352651743, "step": 1521 }, { "epoch": 0.38083322907544104, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19850392.727272727, "logits/rejected": -31027657.846153848, "logps/chosen": -475.45210404829544, "logps/rejected": -614.6077599158654, "loss": 0.0296, "rewards/chosen": 7.16021728515625, "rewards/margins": 20.232137826772835, "rewards/rejected": -13.071920541616587, "step": 1522 }, { "epoch": 0.3810834480170149, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72314741.33333333, "logits/rejected": -67724346.66666667, "logps/chosen": -454.7184651692708, "logps/rejected": -529.76220703125, "loss": 0.0629, "rewards/chosen": 7.364760716756185, "rewards/margins": 16.69577980041504, "rewards/rejected": -9.331019083658854, "step": 1523 }, { "epoch": 0.3813336669585888, "grad_norm": 14.5625, "kl": 0.7795896530151367, "learning_rate": 5e-06, "logits/chosen": -73304938.66666667, "logits/rejected": -56953344.0, "logps/chosen": -288.66514078776044, "logps/rejected": -507.5502522786458, "loss": 0.0685, "rewards/chosen": 5.534720102945964, "rewards/margins": 16.35481135050456, "rewards/rejected": -10.820091247558594, "step": 1524 }, { "epoch": 0.3815838859001626, "grad_norm": 5.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55282208.0, "logits/rejected": -75112214.85714285, "logps/chosen": -404.2345947265625, "logps/rejected": -568.1969866071429, "loss": 0.055, "rewards/chosen": 6.519377136230469, "rewards/margins": 14.721871076311384, "rewards/rejected": -8.202493940080915, "step": 1525 }, { "epoch": 0.3818341048417365, "grad_norm": 5.875, "kl": 2.9955215454101562, "learning_rate": 5e-06, "logits/chosen": -55246453.333333336, "logits/rejected": -42434261.333333336, "logps/chosen": -370.6536051432292, "logps/rejected": -461.8415934244792, "loss": 0.0289, "rewards/chosen": 7.182188669840495, "rewards/margins": 15.839283625284832, "rewards/rejected": -8.657094955444336, "step": 1526 }, { "epoch": 0.3820843237833104, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6866636.0, "logits/rejected": -72539656.0, "logps/chosen": -575.23779296875, "logps/rejected": -502.9908447265625, "loss": 0.0621, "rewards/chosen": 8.676311492919922, "rewards/margins": 18.104681968688965, "rewards/rejected": -9.428370475769043, "step": 1527 }, { "epoch": 0.38233454272488426, "grad_norm": 6.90625, "kl": 5.5987701416015625, "learning_rate": 5e-06, "logits/chosen": -80637710.22222222, "logits/rejected": -39810432.0, "logps/chosen": -490.1339518229167, "logps/rejected": -569.58125, "loss": 0.0175, "rewards/chosen": 9.917039659288195, "rewards/margins": 20.660138617621527, "rewards/rejected": -10.743098958333333, "step": 1528 }, { "epoch": 0.38258476166645816, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28269437.09090909, "logits/rejected": -36584093.538461536, "logps/chosen": -363.4459117542614, "logps/rejected": -454.58657602163464, "loss": 0.0532, "rewards/chosen": 6.719588539817116, "rewards/margins": 18.085076445466154, "rewards/rejected": -11.365487905649038, "step": 1529 }, { "epoch": 0.382834980608032, "grad_norm": 15.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53403863.27272727, "logits/rejected": -52322432.0, "logps/chosen": -292.2071644176136, "logps/rejected": -574.6057692307693, "loss": 0.0665, "rewards/chosen": 5.259834289550781, "rewards/margins": 16.10341057410607, "rewards/rejected": -10.843576284555288, "step": 1530 }, { "epoch": 0.3830851995496059, "grad_norm": 17.5, "kl": 7.816334247589111, "learning_rate": 5e-06, "logits/chosen": -48588228.0, "logits/rejected": -40557196.0, "logps/chosen": -403.4239807128906, "logps/rejected": -639.576904296875, "loss": 0.055, "rewards/chosen": 7.863036155700684, "rewards/margins": 16.942940711975098, "rewards/rejected": -9.079904556274414, "step": 1531 }, { "epoch": 0.3833354184911798, "grad_norm": 10.0625, "kl": 3.45674467086792, "learning_rate": 5e-06, "logits/chosen": -35228987.428571425, "logits/rejected": -67342969.6, "logps/chosen": -289.88779994419644, "logps/rejected": -502.24150390625, "loss": 0.0697, "rewards/chosen": 5.424467904227121, "rewards/margins": 17.334997994559153, "rewards/rejected": -11.910530090332031, "step": 1532 }, { "epoch": 0.38358563743275365, "grad_norm": 3.171875, "kl": 1.3790347576141357, "learning_rate": 5e-06, "logits/chosen": -50948342.85714286, "logits/rejected": -68044672.0, "logps/chosen": -345.66012137276783, "logps/rejected": -576.31328125, "loss": 0.0271, "rewards/chosen": 8.226844787597656, "rewards/margins": 18.942554473876953, "rewards/rejected": -10.715709686279297, "step": 1533 }, { "epoch": 0.38383585637432754, "grad_norm": 19.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4193448.0, "logits/rejected": -49205840.0, "logps/chosen": -679.2981567382812, "logps/rejected": -543.2861938476562, "loss": 0.0363, "rewards/chosen": 10.279638290405273, "rewards/margins": 19.407093048095703, "rewards/rejected": -9.12745475769043, "step": 1534 }, { "epoch": 0.3840860753159014, "grad_norm": 2.640625, "kl": 1.4770686626434326, "learning_rate": 5e-06, "logits/chosen": -61227688.72727273, "logits/rejected": -39253051.07692308, "logps/chosen": -537.5231711647727, "logps/rejected": -724.4465895432693, "loss": 0.0179, "rewards/chosen": 10.051686373623935, "rewards/margins": 21.17281533621408, "rewards/rejected": -11.121128962590145, "step": 1535 }, { "epoch": 0.3843362942574753, "grad_norm": 19.25, "kl": 5.299968719482422, "learning_rate": 5e-06, "logits/chosen": -27923964.8, "logits/rejected": -30241273.14285714, "logps/chosen": -473.3396484375, "logps/rejected": -485.83272879464283, "loss": 0.0215, "rewards/chosen": 9.987073516845703, "rewards/margins": 18.1262941632952, "rewards/rejected": -8.139220646449498, "step": 1536 }, { "epoch": 0.3845865131990492, "grad_norm": 4.15625, "kl": 3.274555206298828, "learning_rate": 5e-06, "logits/chosen": -49640285.09090909, "logits/rejected": -43750931.692307696, "logps/chosen": -379.07590553977275, "logps/rejected": -545.8899489182693, "loss": 0.0391, "rewards/chosen": 6.578675703568892, "rewards/margins": 17.362469226330308, "rewards/rejected": -10.783793522761417, "step": 1537 }, { "epoch": 0.38483673214062303, "grad_norm": 9.0625, "kl": 3.5699737071990967, "learning_rate": 5e-06, "logits/chosen": -35539602.28571428, "logits/rejected": -44411769.6, "logps/chosen": -301.36854771205356, "logps/rejected": -553.735205078125, "loss": 0.0625, "rewards/chosen": 6.414234706333706, "rewards/margins": 17.522111293247768, "rewards/rejected": -11.107876586914063, "step": 1538 }, { "epoch": 0.3850869510821969, "grad_norm": 22.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35209483.63636363, "logits/rejected": -42758852.92307692, "logps/chosen": -351.9339488636364, "logps/rejected": -648.3533653846154, "loss": 0.0815, "rewards/chosen": 5.24550385908647, "rewards/margins": 16.894082209447046, "rewards/rejected": -11.648578350360577, "step": 1539 }, { "epoch": 0.3853371700237708, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62190650.18181818, "logits/rejected": -78514953.84615384, "logps/chosen": -480.68599076704544, "logps/rejected": -568.4444861778846, "loss": 0.0074, "rewards/chosen": 7.456821788441051, "rewards/margins": 19.565300041145377, "rewards/rejected": -12.108478252704327, "step": 1540 }, { "epoch": 0.38558738896534467, "grad_norm": 15.0, "kl": 11.122676849365234, "learning_rate": 5e-06, "logits/chosen": -61903338.666666664, "logits/rejected": -62698112.0, "logps/chosen": -254.6910196940104, "logps/rejected": -789.6026204427084, "loss": 0.0958, "rewards/chosen": 4.186389287312825, "rewards/margins": 17.29101626078288, "rewards/rejected": -13.104626973470053, "step": 1541 }, { "epoch": 0.38583760790691857, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30149254.0, "logits/rejected": -19735254.0, "logps/chosen": -276.3680419921875, "logps/rejected": -648.9039916992188, "loss": 0.064, "rewards/chosen": 5.651928424835205, "rewards/margins": 20.514277935028076, "rewards/rejected": -14.862349510192871, "step": 1542 }, { "epoch": 0.3860878268484924, "grad_norm": 14.8125, "kl": 3.869978666305542, "learning_rate": 5e-06, "logits/chosen": -18704593.14285714, "logits/rejected": -53599376.0, "logps/chosen": -492.6671665736607, "logps/rejected": -642.15009765625, "loss": 0.0241, "rewards/chosen": 8.568756648472377, "rewards/margins": 22.675739070347376, "rewards/rejected": -14.106982421875, "step": 1543 }, { "epoch": 0.3863380457900663, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14169156.923076924, "logits/rejected": -39728826.18181818, "logps/chosen": -246.31049053485577, "logps/rejected": -568.0261008522727, "loss": 0.0615, "rewards/chosen": 6.015750591571514, "rewards/margins": 17.324505946019315, "rewards/rejected": -11.308755354447799, "step": 1544 }, { "epoch": 0.3865882647316402, "grad_norm": 11.9375, "kl": 1.039900541305542, "learning_rate": 5e-06, "logits/chosen": -69726037.33333333, "logits/rejected": -16867088.0, "logps/chosen": -360.1797200520833, "logps/rejected": -545.5159505208334, "loss": 0.0805, "rewards/chosen": 5.97146962483724, "rewards/margins": 14.771454874674479, "rewards/rejected": -8.79998524983724, "step": 1545 }, { "epoch": 0.38683848367321405, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49347886.54545455, "logits/rejected": -81052977.23076923, "logps/chosen": -301.373046875, "logps/rejected": -740.3809344951923, "loss": 0.0526, "rewards/chosen": 5.9335105202414775, "rewards/margins": 20.306566785265517, "rewards/rejected": -14.373056265024038, "step": 1546 }, { "epoch": 0.38708870261478795, "grad_norm": 1.4140625, "kl": 1.5461070537567139, "learning_rate": 5e-06, "logits/chosen": -48313552.0, "logits/rejected": -48347562.666666664, "logps/chosen": -379.9781901041667, "logps/rejected": -606.9390462239584, "loss": 0.0425, "rewards/chosen": 7.176903406778972, "rewards/margins": 18.952517827351887, "rewards/rejected": -11.775614420572916, "step": 1547 }, { "epoch": 0.3873389215563618, "grad_norm": 6.40625, "kl": 3.7011592388153076, "learning_rate": 5e-06, "logits/chosen": -87210321.45454545, "logits/rejected": -21078112.0, "logps/chosen": -483.9068714488636, "logps/rejected": -326.86609825721155, "loss": 0.0311, "rewards/chosen": 8.05739385431463, "rewards/margins": 15.626142915312226, "rewards/rejected": -7.568749060997596, "step": 1548 }, { "epoch": 0.3875891404979357, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -95598359.27272727, "logits/rejected": -24670084.923076924, "logps/chosen": -453.8734019886364, "logps/rejected": -477.9554912860577, "loss": 0.0272, "rewards/chosen": 7.100312666459517, "rewards/margins": 19.62544506579846, "rewards/rejected": -12.525132399338942, "step": 1549 }, { "epoch": 0.3878393594395096, "grad_norm": 10.8125, "kl": 7.050755977630615, "learning_rate": 5e-06, "logits/chosen": -38994102.85714286, "logits/rejected": -39168192.0, "logps/chosen": -521.2699497767857, "logps/rejected": -576.305419921875, "loss": 0.0396, "rewards/chosen": 9.492527553013392, "rewards/margins": 20.6130857195173, "rewards/rejected": -11.120558166503907, "step": 1550 }, { "epoch": 0.38808957838108343, "grad_norm": 17.75, "kl": 1.4493141174316406, "learning_rate": 5e-06, "logits/chosen": -92590517.33333333, "logits/rejected": -56608421.333333336, "logps/chosen": -466.5970052083333, "logps/rejected": -635.5377604166666, "loss": 0.0654, "rewards/chosen": 8.066460291544596, "rewards/margins": 20.011279424031574, "rewards/rejected": -11.944819132486979, "step": 1551 }, { "epoch": 0.38833979732265733, "grad_norm": 6.40625, "kl": 4.593206405639648, "learning_rate": 5e-06, "logits/chosen": -64595889.23076923, "logits/rejected": -34343517.09090909, "logps/chosen": -419.7858323317308, "logps/rejected": -520.7568359375, "loss": 0.0287, "rewards/chosen": 7.569098252516526, "rewards/margins": 17.65582504805985, "rewards/rejected": -10.086726795543324, "step": 1552 }, { "epoch": 0.3885900162642312, "grad_norm": 20.0, "kl": 6.921544075012207, "learning_rate": 5e-06, "logits/chosen": -37210870.15384615, "logits/rejected": -40107886.54545455, "logps/chosen": -337.2096980168269, "logps/rejected": -595.0050603693181, "loss": 0.0531, "rewards/chosen": 4.817118131197416, "rewards/margins": 18.157794178782645, "rewards/rejected": -13.340676047585227, "step": 1553 }, { "epoch": 0.3888402352058051, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32519335.111111112, "logits/rejected": -52503944.53333333, "logps/chosen": -461.6263020833333, "logps/rejected": -454.01689453125, "loss": 0.0394, "rewards/chosen": 7.8503528171115455, "rewards/margins": 17.58455488416884, "rewards/rejected": -9.734202067057291, "step": 1554 }, { "epoch": 0.389090454147379, "grad_norm": 10.625, "kl": 11.004191398620605, "learning_rate": 5e-06, "logits/chosen": -45119763.692307696, "logits/rejected": -34119418.18181818, "logps/chosen": -416.51307091346155, "logps/rejected": -470.24360795454544, "loss": 0.0213, "rewards/chosen": 8.634840745192308, "rewards/margins": 20.445210063374127, "rewards/rejected": -11.810369318181818, "step": 1555 }, { "epoch": 0.3893406730889528, "grad_norm": 8.5, "kl": 5.074847221374512, "learning_rate": 5e-06, "logits/chosen": -52541892.266666666, "logits/rejected": -58095320.88888889, "logps/chosen": -383.05052083333334, "logps/rejected": -496.83018663194446, "loss": 0.0573, "rewards/chosen": 7.852602640787761, "rewards/margins": 17.40517849392361, "rewards/rejected": -9.552575853135851, "step": 1556 }, { "epoch": 0.3895908920305267, "grad_norm": 13.625, "kl": 4.986838340759277, "learning_rate": 5e-06, "logits/chosen": -50742144.0, "logits/rejected": -45844102.4, "logps/chosen": -509.5894252232143, "logps/rejected": -486.414697265625, "loss": 0.018, "rewards/chosen": 8.469165257045201, "rewards/margins": 18.375714329310824, "rewards/rejected": -9.906549072265625, "step": 1557 }, { "epoch": 0.3898411109721006, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33524884.0, "logits/rejected": 15619832.0, "logps/chosen": -394.5790100097656, "logps/rejected": -772.2391357421875, "loss": 0.0224, "rewards/chosen": 6.39989709854126, "rewards/margins": 20.26790189743042, "rewards/rejected": -13.86800479888916, "step": 1558 }, { "epoch": 0.39009132991367446, "grad_norm": 20.25, "kl": 18.480480194091797, "learning_rate": 5e-06, "logits/chosen": -70890555.42857143, "logits/rejected": -61248192.0, "logps/chosen": -429.66688755580356, "logps/rejected": -614.1064453125, "loss": 0.047, "rewards/chosen": 6.514197758265904, "rewards/margins": 16.985174015590122, "rewards/rejected": -10.470976257324219, "step": 1559 }, { "epoch": 0.39034154885524835, "grad_norm": 15.75, "kl": 0.6127548217773438, "learning_rate": 5e-06, "logits/chosen": -59696005.333333336, "logits/rejected": -78712608.0, "logps/chosen": -377.5688883463542, "logps/rejected": -707.6808268229166, "loss": 0.0327, "rewards/chosen": 7.171212514241536, "rewards/margins": 22.457936604817707, "rewards/rejected": -15.286724090576172, "step": 1560 }, { "epoch": 0.3905917677968222, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44644579.2, "logits/rejected": -60546816.0, "logps/chosen": -398.78916015625, "logps/rejected": -696.2825055803571, "loss": 0.0285, "rewards/chosen": 7.975958251953125, "rewards/margins": 20.657434300013954, "rewards/rejected": -12.681476048060826, "step": 1561 }, { "epoch": 0.3908419867383961, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17875178.666666668, "logits/rejected": -44063910.4, "logps/chosen": -253.60259331597223, "logps/rejected": -494.1056640625, "loss": 0.022, "rewards/chosen": 6.223546769883898, "rewards/margins": 17.610119035508898, "rewards/rejected": -11.386572265625, "step": 1562 }, { "epoch": 0.39109220567997, "grad_norm": 11.5, "kl": 18.026988983154297, "learning_rate": 5e-06, "logits/chosen": -49660525.176470585, "logits/rejected": -91150619.42857143, "logps/chosen": -386.2071174172794, "logps/rejected": -676.1298130580357, "loss": 0.0718, "rewards/chosen": 8.575309304630055, "rewards/margins": 22.164142704811418, "rewards/rejected": -13.588833400181361, "step": 1563 }, { "epoch": 0.39134242462154384, "grad_norm": 14.1875, "kl": 0.5734914541244507, "learning_rate": 5e-06, "logits/chosen": -80333809.77777778, "logits/rejected": -34712507.733333334, "logps/chosen": -375.451171875, "logps/rejected": -593.5008463541667, "loss": 0.0577, "rewards/chosen": 7.233046637641059, "rewards/margins": 17.59545610215929, "rewards/rejected": -10.36240946451823, "step": 1564 }, { "epoch": 0.39159264356311774, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49640634.666666664, "logits/rejected": -33605261.333333336, "logps/chosen": -367.917236328125, "logps/rejected": -377.2064615885417, "loss": 0.0241, "rewards/chosen": 7.1159407297770185, "rewards/margins": 16.494548797607422, "rewards/rejected": -9.378608067830404, "step": 1565 }, { "epoch": 0.3918428625046916, "grad_norm": 4.4375, "kl": 6.361255645751953, "learning_rate": 5e-06, "logits/chosen": -37976981.333333336, "logits/rejected": -53967562.666666664, "logps/chosen": -368.8640950520833, "logps/rejected": -526.2738444010416, "loss": 0.0455, "rewards/chosen": 6.881423314412435, "rewards/margins": 18.78554089864095, "rewards/rejected": -11.904117584228516, "step": 1566 }, { "epoch": 0.3920930814462655, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51512531.692307696, "logits/rejected": -66421771.63636363, "logps/chosen": -433.203125, "logps/rejected": -584.4297762784091, "loss": 0.0361, "rewards/chosen": 7.266213637131911, "rewards/margins": 16.878148645787807, "rewards/rejected": -9.611935008655895, "step": 1567 }, { "epoch": 0.3923433003878394, "grad_norm": 3.234375, "kl": 8.539081573486328, "learning_rate": 5e-06, "logits/chosen": -50990528.0, "logits/rejected": -24223909.333333332, "logps/chosen": -413.7838948567708, "logps/rejected": -267.9441731770833, "loss": 0.0482, "rewards/chosen": 8.737911224365234, "rewards/margins": 14.086903254191082, "rewards/rejected": -5.348992029825847, "step": 1568 }, { "epoch": 0.3925935193294132, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63922569.84615385, "logits/rejected": -49597300.36363637, "logps/chosen": -439.88146033653845, "logps/rejected": -708.5866477272727, "loss": 0.0171, "rewards/chosen": 8.851475642277645, "rewards/margins": 23.01541745912779, "rewards/rejected": -14.163941816850143, "step": 1569 }, { "epoch": 0.3928437382709871, "grad_norm": 12.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56432226.461538464, "logits/rejected": -42487621.81818182, "logps/chosen": -294.70742563100964, "logps/rejected": -500.23002485795456, "loss": 0.0889, "rewards/chosen": 6.72278066781851, "rewards/margins": 17.206085471840172, "rewards/rejected": -10.483304804021662, "step": 1570 }, { "epoch": 0.393093957212561, "grad_norm": 16.0, "kl": 0.8531255722045898, "learning_rate": 5e-06, "logits/chosen": -43765696.0, "logits/rejected": -39412829.09090909, "logps/chosen": -377.3454777644231, "logps/rejected": -468.8181818181818, "loss": 0.124, "rewards/chosen": 5.567727309006911, "rewards/margins": 16.901978712815506, "rewards/rejected": -11.334251403808594, "step": 1571 }, { "epoch": 0.39334417615413486, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50214562.461538464, "logits/rejected": -47814362.18181818, "logps/chosen": -335.5585186298077, "logps/rejected": -620.6560724431819, "loss": 0.025, "rewards/chosen": 6.158630957970252, "rewards/margins": 18.306207029969542, "rewards/rejected": -12.14757607199929, "step": 1572 }, { "epoch": 0.39359439509570876, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35875925.333333336, "logits/rejected": -49550202.666666664, "logps/chosen": -386.1427001953125, "logps/rejected": -579.5741780598959, "loss": 0.0416, "rewards/chosen": 6.091965993245442, "rewards/margins": 16.14248212178548, "rewards/rejected": -10.050516128540039, "step": 1573 }, { "epoch": 0.3938446140372826, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22923065.6, "logits/rejected": -44994925.71428572, "logps/chosen": -257.373095703125, "logps/rejected": -603.9144810267857, "loss": 0.0741, "rewards/chosen": 6.081145858764648, "rewards/margins": 15.729956109183174, "rewards/rejected": -9.648810250418526, "step": 1574 }, { "epoch": 0.3940948329788565, "grad_norm": 6.65625, "kl": 2.7457356452941895, "learning_rate": 5e-06, "logits/chosen": -49475496.421052635, "logits/rejected": -43429244.8, "logps/chosen": -377.7696083470395, "logps/rejected": -1062.3294921875, "loss": 0.0844, "rewards/chosen": 6.2845410798725325, "rewards/margins": 22.17435736405222, "rewards/rejected": -15.889816284179688, "step": 1575 }, { "epoch": 0.3943450519204304, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43314865.45454545, "logits/rejected": -53678547.692307696, "logps/chosen": -419.62491122159093, "logps/rejected": -602.8793194110577, "loss": 0.0147, "rewards/chosen": 6.8964316628196025, "rewards/margins": 16.72953454931299, "rewards/rejected": -9.83310288649339, "step": 1576 }, { "epoch": 0.39459527086200424, "grad_norm": 17.375, "kl": 6.0239362716674805, "learning_rate": 5e-06, "logits/chosen": -39985053.86666667, "logits/rejected": -29856583.111111112, "logps/chosen": -423.2427083333333, "logps/rejected": -605.8662109375, "loss": 0.0905, "rewards/chosen": 6.987606811523437, "rewards/margins": 19.554048665364583, "rewards/rejected": -12.566441853841146, "step": 1577 }, { "epoch": 0.39484548980357814, "grad_norm": 0.73046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60815378.28571428, "logits/rejected": -55106153.4117647, "logps/chosen": -338.4697265625, "logps/rejected": -619.9574333639706, "loss": 0.0123, "rewards/chosen": 6.868181501116071, "rewards/margins": 19.0307504349396, "rewards/rejected": -12.162568933823529, "step": 1578 }, { "epoch": 0.395095708745152, "grad_norm": 5.625, "kl": 5.623414993286133, "learning_rate": 5e-06, "logits/chosen": -65370400.0, "logits/rejected": -44937066.666666664, "logps/chosen": -432.1836751302083, "logps/rejected": -439.462890625, "loss": 0.0311, "rewards/chosen": 8.802015940348307, "rewards/margins": 20.291122436523438, "rewards/rejected": -11.48910649617513, "step": 1579 }, { "epoch": 0.3953459276867259, "grad_norm": 7.625, "kl": 6.58470344543457, "learning_rate": 5e-06, "logits/chosen": -71539460.57142857, "logits/rejected": -62250611.2, "logps/chosen": -363.54649135044644, "logps/rejected": -646.99189453125, "loss": 0.043, "rewards/chosen": 6.553089686802456, "rewards/margins": 17.648787471226285, "rewards/rejected": -11.095697784423828, "step": 1580 }, { "epoch": 0.3955961466282998, "grad_norm": 7.78125, "kl": 7.161094665527344, "learning_rate": 5e-06, "logits/chosen": -36932496.0, "logits/rejected": -37177340.8, "logps/chosen": -284.66395786830356, "logps/rejected": -448.852099609375, "loss": 0.0578, "rewards/chosen": 6.408753531319754, "rewards/margins": 16.192396872384208, "rewards/rejected": -9.783643341064453, "step": 1581 }, { "epoch": 0.3958463655698736, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54379044.571428575, "logits/rejected": -68398223.05882353, "logps/chosen": -410.6493443080357, "logps/rejected": -682.7164522058823, "loss": 0.0215, "rewards/chosen": 8.095538548060826, "rewards/margins": 20.673114231654576, "rewards/rejected": -12.57757568359375, "step": 1582 }, { "epoch": 0.3960965845114475, "grad_norm": 10.9375, "kl": 3.7315454483032227, "learning_rate": 5e-06, "logits/chosen": -52928290.13333333, "logits/rejected": -44914595.55555555, "logps/chosen": -353.40113932291666, "logps/rejected": -450.56220160590277, "loss": 0.0447, "rewards/chosen": 8.047874450683594, "rewards/margins": 19.42325761583116, "rewards/rejected": -11.37538316514757, "step": 1583 }, { "epoch": 0.39634680345302137, "grad_norm": 5.46875, "kl": 2.293125867843628, "learning_rate": 5e-06, "logits/chosen": -47573681.777777776, "logits/rejected": -40983957.333333336, "logps/chosen": -372.53325737847223, "logps/rejected": -464.9585367838542, "loss": 0.0259, "rewards/chosen": 6.833401997884114, "rewards/margins": 14.92130406697591, "rewards/rejected": -8.087902069091797, "step": 1584 }, { "epoch": 0.39659702239459527, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74240585.84615384, "logits/rejected": -53382562.90909091, "logps/chosen": -361.5519831730769, "logps/rejected": -506.85751065340907, "loss": 0.0438, "rewards/chosen": 7.130145733173077, "rewards/margins": 19.17484566215035, "rewards/rejected": -12.044699928977273, "step": 1585 }, { "epoch": 0.39684724133616917, "grad_norm": 13.375, "kl": 7.100118637084961, "learning_rate": 5e-06, "logits/chosen": -66369619.692307696, "logits/rejected": 33732331.63636363, "logps/chosen": -336.21375450721155, "logps/rejected": -445.1492365056818, "loss": 0.037, "rewards/chosen": 7.199116633488582, "rewards/margins": 16.743858177345114, "rewards/rejected": -9.544741543856533, "step": 1586 }, { "epoch": 0.397097460277743, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52292499.692307696, "logits/rejected": -75604596.36363636, "logps/chosen": -377.1007737379808, "logps/rejected": -673.7362393465909, "loss": 0.0264, "rewards/chosen": 7.1330425555889425, "rewards/margins": 21.752958604505846, "rewards/rejected": -14.619916048916904, "step": 1587 }, { "epoch": 0.3973476792193169, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54069912.615384616, "logits/rejected": 23352887.272727273, "logps/chosen": -383.5125075120192, "logps/rejected": -674.1711647727273, "loss": 0.0264, "rewards/chosen": 7.9783806434044475, "rewards/margins": 19.841187483780867, "rewards/rejected": -11.86280684037642, "step": 1588 }, { "epoch": 0.3975978981608908, "grad_norm": 8.25, "kl": 4.784675121307373, "learning_rate": 5e-06, "logits/chosen": -46123640.0, "logits/rejected": -19507712.0, "logps/chosen": -290.67840576171875, "logps/rejected": -291.2193603515625, "loss": 0.0548, "rewards/chosen": 5.777564525604248, "rewards/margins": 12.735287189483643, "rewards/rejected": -6.9577226638793945, "step": 1589 }, { "epoch": 0.39784811710246465, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56030048.0, "logits/rejected": -58162738.28571428, "logps/chosen": -381.098388671875, "logps/rejected": -583.4241420200893, "loss": 0.0429, "rewards/chosen": 6.6435089111328125, "rewards/margins": 18.917938232421875, "rewards/rejected": -12.274429321289062, "step": 1590 }, { "epoch": 0.39809833604403855, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71870132.36363636, "logits/rejected": -76893745.23076923, "logps/chosen": -471.12038352272725, "logps/rejected": -576.6395733173077, "loss": 0.0485, "rewards/chosen": 7.715235623446378, "rewards/margins": 17.91817282296561, "rewards/rejected": -10.20293719951923, "step": 1591 }, { "epoch": 0.3983485549856124, "grad_norm": 14.9375, "kl": 1.3240079879760742, "learning_rate": 5e-06, "logits/chosen": -28912585.14285714, "logits/rejected": -46249523.2, "logps/chosen": -253.8704833984375, "logps/rejected": -468.809619140625, "loss": 0.0928, "rewards/chosen": 6.2960357666015625, "rewards/margins": 13.71209259033203, "rewards/rejected": -7.416056823730469, "step": 1592 }, { "epoch": 0.3985987739271863, "grad_norm": 12.0, "kl": 7.383831024169922, "learning_rate": 5e-06, "logits/chosen": -58332784.0, "logits/rejected": -51747120.0, "logps/chosen": -397.6625569661458, "logps/rejected": -534.5457356770834, "loss": 0.0551, "rewards/chosen": 8.02014414469401, "rewards/margins": 19.762802124023438, "rewards/rejected": -11.742657979329428, "step": 1593 }, { "epoch": 0.3988489928687602, "grad_norm": 12.1875, "kl": 3.77521014213562, "learning_rate": 5e-06, "logits/chosen": -49732534.85714286, "logits/rejected": -43392867.2, "logps/chosen": -327.02267020089283, "logps/rejected": -474.59541015625, "loss": 0.0922, "rewards/chosen": 7.127044677734375, "rewards/margins": 17.27719039916992, "rewards/rejected": -10.150145721435546, "step": 1594 }, { "epoch": 0.39909921181033403, "grad_norm": 11.75, "kl": 2.157099485397339, "learning_rate": 5e-06, "logits/chosen": -64320992.0, "logits/rejected": -66887384.0, "logps/chosen": -499.60137939453125, "logps/rejected": -731.0667114257812, "loss": 0.0301, "rewards/chosen": 8.41608715057373, "rewards/margins": 23.448493003845215, "rewards/rejected": -15.032405853271484, "step": 1595 }, { "epoch": 0.39934943075190793, "grad_norm": 11.875, "kl": 1.0198545455932617, "learning_rate": 5e-06, "logits/chosen": -22762653.866666667, "logits/rejected": -16213962.666666666, "logps/chosen": -408.17578125, "logps/rejected": -432.27001953125, "loss": 0.0705, "rewards/chosen": 7.218156941731771, "rewards/margins": 15.025240919325087, "rewards/rejected": -7.807083977593316, "step": 1596 }, { "epoch": 0.3995996496934818, "grad_norm": 2.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29325284.57142857, "logits/rejected": -14047376.0, "logps/chosen": -362.2606724330357, "logps/rejected": -356.903125, "loss": 0.054, "rewards/chosen": 7.910264151436942, "rewards/margins": 17.62156481061663, "rewards/rejected": -9.711300659179688, "step": 1597 }, { "epoch": 0.39984986863505567, "grad_norm": 3.734375, "kl": 1.9071954488754272, "learning_rate": 5e-06, "logits/chosen": -41418423.46666667, "logits/rejected": -52286851.55555555, "logps/chosen": -407.4845703125, "logps/rejected": -751.3637152777778, "loss": 0.0529, "rewards/chosen": 7.005551656087239, "rewards/margins": 20.37004682752821, "rewards/rejected": -13.364495171440971, "step": 1598 }, { "epoch": 0.40010008757662957, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61755477.333333336, "logits/rejected": -71222661.33333333, "logps/chosen": -470.6901041666667, "logps/rejected": -531.2591959635416, "loss": 0.1537, "rewards/chosen": 7.345523198445638, "rewards/margins": 16.743183135986328, "rewards/rejected": -9.39765993754069, "step": 1599 }, { "epoch": 0.4003503065182034, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43855260.0, "logits/rejected": -39946852.0, "logps/chosen": -390.2655029296875, "logps/rejected": -591.4718017578125, "loss": 0.1058, "rewards/chosen": 5.37814998626709, "rewards/margins": 16.34162425994873, "rewards/rejected": -10.96347427368164, "step": 1600 }, { "epoch": 0.4006005254597773, "grad_norm": 7.125, "kl": 2.5327582359313965, "learning_rate": 5e-06, "logits/chosen": -22454957.333333332, "logits/rejected": -42332994.666666664, "logps/chosen": -375.364501953125, "logps/rejected": -682.6041666666666, "loss": 0.03, "rewards/chosen": 6.137227376302083, "rewards/margins": 22.949923197428383, "rewards/rejected": -16.8126958211263, "step": 1601 }, { "epoch": 0.40085074440135116, "grad_norm": 5.65625, "kl": 5.81404447555542, "learning_rate": 5e-06, "logits/chosen": -72725269.33333333, "logits/rejected": -67427829.33333333, "logps/chosen": -362.2223714192708, "logps/rejected": -767.32275390625, "loss": 0.0651, "rewards/chosen": 7.319016774495442, "rewards/margins": 24.870738983154297, "rewards/rejected": -17.551722208658855, "step": 1602 }, { "epoch": 0.40110096334292505, "grad_norm": 1.9453125, "kl": 0.6826578974723816, "learning_rate": 5e-06, "logits/chosen": -62999207.384615384, "logits/rejected": -83008180.36363636, "logps/chosen": -566.6778094951923, "logps/rejected": -835.6917613636364, "loss": 0.002, "rewards/chosen": 8.83831552358774, "rewards/margins": 25.206550331382488, "rewards/rejected": -16.368234807794746, "step": 1603 }, { "epoch": 0.40135118228449895, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58847604.36363637, "logits/rejected": -91652381.53846154, "logps/chosen": -439.6197620738636, "logps/rejected": -647.1583533653846, "loss": 0.0168, "rewards/chosen": 8.827486905184658, "rewards/margins": 19.535945438838503, "rewards/rejected": -10.708458533653847, "step": 1604 }, { "epoch": 0.4016014012260728, "grad_norm": 16.75, "kl": 11.9111328125, "learning_rate": 5e-06, "logits/chosen": -46197800.72727273, "logits/rejected": -60817403.07692308, "logps/chosen": -457.00883345170456, "logps/rejected": -662.6548978365385, "loss": 0.0449, "rewards/chosen": 8.777660023082387, "rewards/margins": 19.03984357927229, "rewards/rejected": -10.262183556189903, "step": 1605 }, { "epoch": 0.4018516201676467, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47125570.90909091, "logits/rejected": 26224886.153846152, "logps/chosen": -336.74074485085225, "logps/rejected": -609.8313551682693, "loss": 0.0566, "rewards/chosen": 6.045225663618608, "rewards/margins": 18.470347264429908, "rewards/rejected": -12.425121600811298, "step": 1606 }, { "epoch": 0.4021018391092206, "grad_norm": 1.9296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34907016.0, "logits/rejected": -48890640.0, "logps/chosen": -337.1698303222656, "logps/rejected": -714.4163818359375, "loss": 0.0222, "rewards/chosen": 5.899375915527344, "rewards/margins": 20.677775382995605, "rewards/rejected": -14.778399467468262, "step": 1607 }, { "epoch": 0.40235205805079444, "grad_norm": 1.5703125, "kl": 5.206974983215332, "learning_rate": 5e-06, "logits/chosen": -92848256.0, "logits/rejected": -34442408.0, "logps/chosen": -633.7376302083334, "logps/rejected": -457.6949055989583, "loss": 0.0038, "rewards/chosen": 9.627501169840494, "rewards/margins": 20.351619720458984, "rewards/rejected": -10.72411855061849, "step": 1608 }, { "epoch": 0.40260227699236834, "grad_norm": 16.375, "kl": 0.4834175109863281, "learning_rate": 5e-06, "logits/chosen": -27103404.0, "logits/rejected": -56363736.0, "logps/chosen": -313.13079833984375, "logps/rejected": -731.333251953125, "loss": 0.0417, "rewards/chosen": 5.4009504318237305, "rewards/margins": 22.58290386199951, "rewards/rejected": -17.18195343017578, "step": 1609 }, { "epoch": 0.4028524959339422, "grad_norm": 1.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50550870.4, "logits/rejected": -43663108.571428575, "logps/chosen": -511.809228515625, "logps/rejected": -722.9559849330357, "loss": 0.0157, "rewards/chosen": 8.118496704101563, "rewards/margins": 23.238019888741633, "rewards/rejected": -15.119523184640068, "step": 1610 }, { "epoch": 0.4031027148755161, "grad_norm": 1.375, "kl": 2.832933187484741, "learning_rate": 5e-06, "logits/chosen": -67690344.72727273, "logits/rejected": -51849491.692307696, "logps/chosen": -452.74027876420456, "logps/rejected": -715.4265324519231, "loss": 0.0262, "rewards/chosen": 6.586632468483665, "rewards/margins": 24.123131491921164, "rewards/rejected": -17.5364990234375, "step": 1611 }, { "epoch": 0.40335293381709, "grad_norm": 14.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48614493.538461536, "logits/rejected": -75255394.9090909, "logps/chosen": -316.8092698317308, "logps/rejected": -580.0197531960227, "loss": 0.0488, "rewards/chosen": 5.223993741548979, "rewards/margins": 18.292078458345856, "rewards/rejected": -13.068084716796875, "step": 1612 }, { "epoch": 0.4036031527586638, "grad_norm": 4.8125, "kl": 7.0762939453125, "learning_rate": 5e-06, "logits/chosen": -49319916.0, "logits/rejected": -64383792.0, "logps/chosen": -453.5142822265625, "logps/rejected": -615.13037109375, "loss": 0.0563, "rewards/chosen": 7.7760701179504395, "rewards/margins": 22.334112644195557, "rewards/rejected": -14.558042526245117, "step": 1613 }, { "epoch": 0.4038533717002377, "grad_norm": 12.8125, "kl": 0.6714655756950378, "learning_rate": 5e-06, "logits/chosen": -54289314.461538464, "logits/rejected": -32443156.363636363, "logps/chosen": -347.29867788461536, "logps/rejected": -524.3008700284091, "loss": 0.0431, "rewards/chosen": 6.065758925217849, "rewards/margins": 18.662302670778928, "rewards/rejected": -12.59654374556108, "step": 1614 }, { "epoch": 0.40410359064181156, "grad_norm": 14.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16611586.461538462, "logits/rejected": -58961117.09090909, "logps/chosen": -291.66426908052887, "logps/rejected": -606.5534002130681, "loss": 0.032, "rewards/chosen": 6.913573631873498, "rewards/margins": 19.429420577896224, "rewards/rejected": -12.515846946022727, "step": 1615 }, { "epoch": 0.40435380958338546, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42278272.0, "logits/rejected": -67463104.0, "logps/chosen": -469.428271484375, "logps/rejected": -714.1205357142857, "loss": 0.0212, "rewards/chosen": 9.806190490722656, "rewards/margins": 22.932485307965962, "rewards/rejected": -13.126294817243304, "step": 1616 }, { "epoch": 0.40460402852495936, "grad_norm": 10.625, "kl": 10.973780632019043, "learning_rate": 5e-06, "logits/chosen": -49494793.14285714, "logits/rejected": -11222953.6, "logps/chosen": -471.3872767857143, "logps/rejected": -633.00107421875, "loss": 0.1014, "rewards/chosen": 7.461336408342634, "rewards/margins": 20.24663303920201, "rewards/rejected": -12.785296630859374, "step": 1617 }, { "epoch": 0.4048542474665332, "grad_norm": 12.5625, "kl": 4.824309825897217, "learning_rate": 5e-06, "logits/chosen": -20458724.923076924, "logits/rejected": -20059194.181818184, "logps/chosen": -284.76647010216345, "logps/rejected": -415.1149236505682, "loss": 0.1211, "rewards/chosen": 4.953625018780048, "rewards/margins": 14.63243860631556, "rewards/rejected": -9.678813587535512, "step": 1618 }, { "epoch": 0.4051044664081071, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64775936.0, "logits/rejected": -49273388.307692304, "logps/chosen": -456.61550071022725, "logps/rejected": -896.0818810096154, "loss": 0.0161, "rewards/chosen": 8.672219016335227, "rewards/margins": 23.0474807632553, "rewards/rejected": -14.375261746920073, "step": 1619 }, { "epoch": 0.40535468534968094, "grad_norm": 3.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41676608.0, "logits/rejected": -71040580.92307693, "logps/chosen": -391.15438565340907, "logps/rejected": -545.0922475961538, "loss": 0.0149, "rewards/chosen": 8.871658325195312, "rewards/margins": 19.90182847243089, "rewards/rejected": -11.030170147235577, "step": 1620 }, { "epoch": 0.40560490429125484, "grad_norm": 6.21875, "kl": 0.8442357778549194, "learning_rate": 5e-06, "logits/chosen": -71265072.0, "logits/rejected": -44288000.0, "logps/chosen": -338.06337483723956, "logps/rejected": -408.3837890625, "loss": 0.0662, "rewards/chosen": 6.6371409098307295, "rewards/margins": 16.58682696024577, "rewards/rejected": -9.949686050415039, "step": 1621 }, { "epoch": 0.40585512323282874, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35222400.0, "logits/rejected": -28654796.8, "logps/chosen": -484.07769097222223, "logps/rejected": -592.2565755208333, "loss": 0.0137, "rewards/chosen": 7.5685475667317705, "rewards/margins": 19.422107950846353, "rewards/rejected": -11.853560384114584, "step": 1622 }, { "epoch": 0.4061053421744026, "grad_norm": 12.75, "kl": 14.491351127624512, "learning_rate": 5e-06, "logits/chosen": -39417958.4, "logits/rejected": -70630243.55555555, "logps/chosen": -397.4002278645833, "logps/rejected": -593.7132703993055, "loss": 0.0673, "rewards/chosen": 7.868872578938802, "rewards/margins": 20.150497266981336, "rewards/rejected": -12.281624688042534, "step": 1623 }, { "epoch": 0.4063555611159765, "grad_norm": 16.5, "kl": 21.086952209472656, "learning_rate": 5e-06, "logits/chosen": -76505792.0, "logits/rejected": -49889544.0, "logps/chosen": -423.59661865234375, "logps/rejected": -575.0799560546875, "loss": 0.0381, "rewards/chosen": 9.007286071777344, "rewards/margins": 22.115436553955078, "rewards/rejected": -13.108150482177734, "step": 1624 }, { "epoch": 0.4066057800575504, "grad_norm": 23.875, "kl": 0.9055683016777039, "learning_rate": 5e-06, "logits/chosen": -70076640.0, "logits/rejected": -24935338.666666668, "logps/chosen": -345.0940348307292, "logps/rejected": -418.3577067057292, "loss": 0.0782, "rewards/chosen": 6.653207778930664, "rewards/margins": 12.748928705851238, "rewards/rejected": -6.095720926920573, "step": 1625 }, { "epoch": 0.4068559989991242, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11061960.0, "logits/rejected": -48311524.571428575, "logps/chosen": -407.00556640625, "logps/rejected": -803.0528041294643, "loss": 0.0331, "rewards/chosen": 7.969582366943359, "rewards/margins": 27.628164781842912, "rewards/rejected": -19.658582414899552, "step": 1626 }, { "epoch": 0.4071062179406981, "grad_norm": 6.28125, "kl": 1.2132682800292969, "learning_rate": 5e-06, "logits/chosen": -58869605.333333336, "logits/rejected": -61672352.0, "logps/chosen": -458.9873046875, "logps/rejected": -614.5347493489584, "loss": 0.0175, "rewards/chosen": 10.299263000488281, "rewards/margins": 23.059705098470054, "rewards/rejected": -12.760442097981771, "step": 1627 }, { "epoch": 0.40735643688227197, "grad_norm": 20.25, "kl": 6.165335655212402, "learning_rate": 5e-06, "logits/chosen": -50929296.0, "logits/rejected": -60310784.0, "logps/chosen": -383.41181640625, "logps/rejected": -456.39090401785717, "loss": 0.0709, "rewards/chosen": 6.504924774169922, "rewards/margins": 14.75384793962751, "rewards/rejected": -8.248923165457589, "step": 1628 }, { "epoch": 0.40760665582384586, "grad_norm": 8.25, "kl": 11.097431182861328, "learning_rate": 5e-06, "logits/chosen": -74142600.0, "logits/rejected": -26756688.0, "logps/chosen": -420.35455322265625, "logps/rejected": -513.0372314453125, "loss": 0.0941, "rewards/chosen": 8.567237854003906, "rewards/margins": 17.679146766662598, "rewards/rejected": -9.111908912658691, "step": 1629 }, { "epoch": 0.40785687476541976, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75845862.4, "logits/rejected": -54046678.85714286, "logps/chosen": -351.93271484375, "logps/rejected": -742.3795340401786, "loss": 0.0315, "rewards/chosen": 6.866731262207031, "rewards/margins": 20.888557870047432, "rewards/rejected": -14.021826607840401, "step": 1630 }, { "epoch": 0.4081070937069936, "grad_norm": 8.4375, "kl": 9.062006950378418, "learning_rate": 5e-06, "logits/chosen": -38657929.14285714, "logits/rejected": -57676588.8, "logps/chosen": -376.6824428013393, "logps/rejected": -612.24580078125, "loss": 0.0255, "rewards/chosen": 9.395347595214844, "rewards/margins": 20.158099365234374, "rewards/rejected": -10.76275177001953, "step": 1631 }, { "epoch": 0.4083573126485675, "grad_norm": 3.796875, "kl": 3.86767840385437, "learning_rate": 5e-06, "logits/chosen": -48052172.8, "logits/rejected": -49687328.0, "logps/chosen": -414.1460286458333, "logps/rejected": -582.6489800347222, "loss": 0.0389, "rewards/chosen": 8.310963948567709, "rewards/margins": 24.010028415256077, "rewards/rejected": -15.699064466688368, "step": 1632 }, { "epoch": 0.40860753159014135, "grad_norm": 13.75, "kl": 23.204315185546875, "learning_rate": 5e-06, "logits/chosen": -63010349.71428572, "logits/rejected": -94101990.4, "logps/chosen": -445.02322823660717, "logps/rejected": -709.43779296875, "loss": 0.0838, "rewards/chosen": 8.850147247314453, "rewards/margins": 25.801343536376955, "rewards/rejected": -16.9511962890625, "step": 1633 }, { "epoch": 0.40885775053171525, "grad_norm": 13.625, "kl": 13.236379623413086, "learning_rate": 5e-06, "logits/chosen": -78423835.42857143, "logits/rejected": -59609932.8, "logps/chosen": -480.70584542410717, "logps/rejected": -621.876708984375, "loss": 0.1102, "rewards/chosen": 7.837894984654018, "rewards/margins": 19.3686765398298, "rewards/rejected": -11.530781555175782, "step": 1634 }, { "epoch": 0.40910796947328915, "grad_norm": 6.84375, "kl": 10.435267448425293, "learning_rate": 5e-06, "logits/chosen": -51758596.571428575, "logits/rejected": -10071542.4, "logps/chosen": -363.88692801339283, "logps/rejected": -366.076904296875, "loss": 0.0477, "rewards/chosen": 6.55420902797154, "rewards/margins": 13.6871458871024, "rewards/rejected": -7.13293685913086, "step": 1635 }, { "epoch": 0.409358188414863, "grad_norm": 16.5, "kl": 33.143619537353516, "learning_rate": 5e-06, "logits/chosen": -42388710.4, "logits/rejected": -46028391.11111111, "logps/chosen": -451.69182942708335, "logps/rejected": -294.4460720486111, "loss": 0.1177, "rewards/chosen": 8.79393819173177, "rewards/margins": 11.47926762898763, "rewards/rejected": -2.6853294372558594, "step": 1636 }, { "epoch": 0.4096084073564369, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -78172153.6, "logits/rejected": -57518752.0, "logps/chosen": -461.49228515625, "logps/rejected": -512.8747907366071, "loss": 0.058, "rewards/chosen": 10.112064361572266, "rewards/margins": 21.680502210344585, "rewards/rejected": -11.568437848772321, "step": 1637 }, { "epoch": 0.4098586262980108, "grad_norm": 8.0625, "kl": 0.9550074338912964, "learning_rate": 5e-06, "logits/chosen": -37777479.384615384, "logits/rejected": -8331448.7272727275, "logps/chosen": -371.5591571514423, "logps/rejected": -548.6255326704545, "loss": 0.0555, "rewards/chosen": 6.262809166541467, "rewards/margins": 16.604760016594735, "rewards/rejected": -10.341950850053268, "step": 1638 }, { "epoch": 0.41010884523958463, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -96284252.44444445, "logits/rejected": -36917922.13333333, "logps/chosen": -450.6339518229167, "logps/rejected": -432.86627604166665, "loss": 0.0291, "rewards/chosen": 10.473131815592447, "rewards/margins": 18.828085327148436, "rewards/rejected": -8.354953511555989, "step": 1639 }, { "epoch": 0.41035906418115853, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58175668.36363637, "logits/rejected": -41718350.76923077, "logps/chosen": -377.9964488636364, "logps/rejected": -553.6769831730769, "loss": 0.0313, "rewards/chosen": 7.573543201793324, "rewards/margins": 16.668775651838395, "rewards/rejected": -9.095232450045073, "step": 1640 }, { "epoch": 0.41060928312273237, "grad_norm": 19.5, "kl": 4.1324872970581055, "learning_rate": 5e-06, "logits/chosen": -67599581.53846154, "logits/rejected": -45677355.63636363, "logps/chosen": -262.44119966947113, "logps/rejected": -637.9925426136364, "loss": 0.1267, "rewards/chosen": 6.753439683180589, "rewards/margins": 16.507864732008713, "rewards/rejected": -9.754425048828125, "step": 1641 }, { "epoch": 0.41085950206430627, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52601821.86666667, "logits/rejected": -96512782.22222222, "logps/chosen": -368.4419270833333, "logps/rejected": -725.8990885416666, "loss": 0.0775, "rewards/chosen": 6.6067352294921875, "rewards/margins": 19.85471513536241, "rewards/rejected": -13.247979905870226, "step": 1642 }, { "epoch": 0.41110972100588017, "grad_norm": 4.96875, "kl": 2.2518508434295654, "learning_rate": 5e-06, "logits/chosen": -39417658.18181818, "logits/rejected": -38469410.461538464, "logps/chosen": -358.3555353338068, "logps/rejected": -532.1684945913462, "loss": 0.0695, "rewards/chosen": 7.873641274192116, "rewards/margins": 19.963116465748605, "rewards/rejected": -12.08947519155649, "step": 1643 }, { "epoch": 0.411359939947454, "grad_norm": 4.53125, "kl": 2.1452293395996094, "learning_rate": 5e-06, "logits/chosen": -53285396.0, "logits/rejected": -36789344.0, "logps/chosen": -388.9821472167969, "logps/rejected": -507.4839782714844, "loss": 0.0338, "rewards/chosen": 7.502474308013916, "rewards/margins": 16.050246715545654, "rewards/rejected": -8.547772407531738, "step": 1644 }, { "epoch": 0.4116101588890279, "grad_norm": 1.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59529109.333333336, "logits/rejected": -43106700.8, "logps/chosen": -441.4747721354167, "logps/rejected": -614.9171875, "loss": 0.0019, "rewards/chosen": 7.371741400824653, "rewards/margins": 19.044385443793402, "rewards/rejected": -11.67264404296875, "step": 1645 }, { "epoch": 0.41186037783060175, "grad_norm": 5.59375, "kl": 6.3410139083862305, "learning_rate": 5e-06, "logits/chosen": -72831241.84615384, "logits/rejected": -40293742.54545455, "logps/chosen": -422.9115459735577, "logps/rejected": -579.6659712357955, "loss": 0.0452, "rewards/chosen": 8.884330749511719, "rewards/margins": 19.988499728116125, "rewards/rejected": -11.104168978604404, "step": 1646 }, { "epoch": 0.41211059677217565, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -106576910.22222222, "logits/rejected": -51364804.266666666, "logps/chosen": -362.53436957465277, "logps/rejected": -607.9970052083333, "loss": 0.0313, "rewards/chosen": 7.3232532077365455, "rewards/margins": 22.442031690809465, "rewards/rejected": -15.118778483072917, "step": 1647 }, { "epoch": 0.41236081571374955, "grad_norm": 1.1171875, "kl": 0.24424616992473602, "learning_rate": 5e-06, "logits/chosen": -33650903.27272727, "logits/rejected": -78140923.07692307, "logps/chosen": -324.45192649147725, "logps/rejected": -701.568359375, "loss": 0.0159, "rewards/chosen": 6.9267661354758525, "rewards/margins": 22.86611597021143, "rewards/rejected": -15.939349834735577, "step": 1648 }, { "epoch": 0.4126110346553234, "grad_norm": 5.46875, "kl": 10.869810104370117, "learning_rate": 5e-06, "logits/chosen": -78130243.76470588, "logits/rejected": -51810761.14285714, "logps/chosen": -393.2431640625, "logps/rejected": -599.7381417410714, "loss": 0.0693, "rewards/chosen": 7.30051466997932, "rewards/margins": 23.438502079298516, "rewards/rejected": -16.137987409319198, "step": 1649 }, { "epoch": 0.4128612535968973, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18088593.14285714, "logits/rejected": -36917987.2, "logps/chosen": -295.1450892857143, "logps/rejected": -672.609765625, "loss": 0.0635, "rewards/chosen": 5.77373286655971, "rewards/margins": 21.42665339878627, "rewards/rejected": -15.652920532226563, "step": 1650 }, { "epoch": 0.41311147253847114, "grad_norm": 2.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48716243.2, "logits/rejected": 64390491.428571425, "logps/chosen": -354.59130859375, "logps/rejected": -505.39439174107144, "loss": 0.0315, "rewards/chosen": 5.775858306884766, "rewards/margins": 15.95925805228097, "rewards/rejected": -10.183399745396205, "step": 1651 }, { "epoch": 0.41336169148004503, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59955066.18181818, "logits/rejected": -48774449.23076923, "logps/chosen": -418.4396306818182, "logps/rejected": -464.0920973557692, "loss": 0.0546, "rewards/chosen": 6.665956670587713, "rewards/margins": 15.543499019596126, "rewards/rejected": -8.877542349008413, "step": 1652 }, { "epoch": 0.41361191042161893, "grad_norm": 7.78125, "kl": 1.8725414276123047, "learning_rate": 5e-06, "logits/chosen": -90364060.44444445, "logits/rejected": -52314875.733333334, "logps/chosen": -370.7450900607639, "logps/rejected": -511.53444010416666, "loss": 0.0213, "rewards/chosen": 6.4102355109320746, "rewards/margins": 17.299754757351344, "rewards/rejected": -10.88951924641927, "step": 1653 }, { "epoch": 0.4138621293631928, "grad_norm": 10.8125, "kl": 15.272687911987305, "learning_rate": 5e-06, "logits/chosen": -67907895.46666667, "logits/rejected": -81382094.22222222, "logps/chosen": -556.381640625, "logps/rejected": -735.2584635416666, "loss": 0.0178, "rewards/chosen": 9.716265869140624, "rewards/margins": 28.006650797526042, "rewards/rejected": -18.290384928385418, "step": 1654 }, { "epoch": 0.4141123483047667, "grad_norm": 6.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59433717.333333336, "logits/rejected": -44394624.0, "logps/chosen": -374.85582139756946, "logps/rejected": -501.3428059895833, "loss": 0.0203, "rewards/chosen": 7.185723198784722, "rewards/margins": 20.551968722873266, "rewards/rejected": -13.366245524088542, "step": 1655 }, { "epoch": 0.4143625672463406, "grad_norm": 12.0625, "kl": 3.02698016166687, "learning_rate": 5e-06, "logits/chosen": -47742965.333333336, "logits/rejected": -45709952.0, "logps/chosen": -408.3413492838542, "logps/rejected": -440.6381022135417, "loss": 0.037, "rewards/chosen": 6.543295542399089, "rewards/margins": 18.316186269124348, "rewards/rejected": -11.77289072672526, "step": 1656 }, { "epoch": 0.4146127861879144, "grad_norm": 3.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39354192.0, "logits/rejected": -52151040.0, "logps/chosen": -374.6142985026042, "logps/rejected": -519.3289794921875, "loss": 0.0237, "rewards/chosen": 9.195215861002604, "rewards/margins": 20.375022888183594, "rewards/rejected": -11.17980702718099, "step": 1657 }, { "epoch": 0.4148630051294883, "grad_norm": 1.2109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34607261.09090909, "logits/rejected": -39773235.692307696, "logps/chosen": -256.5924183238636, "logps/rejected": -613.1406625600962, "loss": 0.0417, "rewards/chosen": 6.530414234508168, "rewards/margins": 22.39778249247091, "rewards/rejected": -15.86736825796274, "step": 1658 }, { "epoch": 0.41511322407106216, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33748395.428571425, "logits/rejected": -49825824.0, "logps/chosen": -406.86886160714283, "logps/rejected": -685.704931640625, "loss": 0.0535, "rewards/chosen": 6.1253525870186945, "rewards/margins": 21.165071214948384, "rewards/rejected": -15.039718627929688, "step": 1659 }, { "epoch": 0.41536344301263606, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44813175.46666667, "logits/rejected": -58289070.222222224, "logps/chosen": -314.68037109375, "logps/rejected": -754.6195746527778, "loss": 0.0667, "rewards/chosen": 4.75397694905599, "rewards/margins": 18.113082377115887, "rewards/rejected": -13.359105428059896, "step": 1660 }, { "epoch": 0.41561366195420996, "grad_norm": 3.734375, "kl": 8.254678726196289, "learning_rate": 5e-06, "logits/chosen": -45960979.692307696, "logits/rejected": -9046398.545454545, "logps/chosen": -351.25345552884613, "logps/rejected": -513.6558061079545, "loss": 0.0932, "rewards/chosen": 7.35093982403095, "rewards/margins": 19.912302350664472, "rewards/rejected": -12.561362526633523, "step": 1661 }, { "epoch": 0.4158638808957838, "grad_norm": 7.96875, "kl": 15.150541305541992, "learning_rate": 5e-06, "logits/chosen": -41836072.0, "logits/rejected": -40394258.666666664, "logps/chosen": -461.226318359375, "logps/rejected": -516.92333984375, "loss": 0.0686, "rewards/chosen": 8.58858871459961, "rewards/margins": 20.047770182291664, "rewards/rejected": -11.459181467692057, "step": 1662 }, { "epoch": 0.4161140998373577, "grad_norm": 4.15625, "kl": 2.2227554321289062, "learning_rate": 5e-06, "logits/chosen": -55155347.2, "logits/rejected": -43960896.0, "logps/chosen": -517.47265625, "logps/rejected": -523.0019182477679, "loss": 0.007, "rewards/chosen": 9.063607788085937, "rewards/margins": 20.95015651157924, "rewards/rejected": -11.886548723493304, "step": 1663 }, { "epoch": 0.41636431877893154, "grad_norm": 7.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27399669.333333332, "logits/rejected": -40272230.4, "logps/chosen": -259.3344455295139, "logps/rejected": -530.4388997395833, "loss": 0.0684, "rewards/chosen": 7.452676561143663, "rewards/margins": 19.821204800075954, "rewards/rejected": -12.368528238932292, "step": 1664 }, { "epoch": 0.41661453772050544, "grad_norm": 14.5, "kl": 2.8173513412475586, "learning_rate": 5e-06, "logits/chosen": -29017962.666666668, "logits/rejected": -96421717.33333333, "logps/chosen": -465.97352430555554, "logps/rejected": -787.6481770833333, "loss": 0.0728, "rewards/chosen": 6.756870693630642, "rewards/margins": 24.125079515245226, "rewards/rejected": -17.368208821614584, "step": 1665 }, { "epoch": 0.41686475666207934, "grad_norm": 21.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57357125.81818182, "logits/rejected": -57023867.07692308, "logps/chosen": -482.08540482954544, "logps/rejected": -676.1416015625, "loss": 0.0493, "rewards/chosen": 8.817036021839488, "rewards/margins": 25.959117649318454, "rewards/rejected": -17.142081627478966, "step": 1666 }, { "epoch": 0.4171149756036532, "grad_norm": 2.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32513943.272727273, "logits/rejected": -35667318.15384615, "logps/chosen": -353.6112171519886, "logps/rejected": -454.02403846153845, "loss": 0.0095, "rewards/chosen": 7.150106950239702, "rewards/margins": 17.545078357616504, "rewards/rejected": -10.394971407376802, "step": 1667 }, { "epoch": 0.4173651945452271, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14909945.6, "logits/rejected": -34631609.14285714, "logps/chosen": -326.430078125, "logps/rejected": -513.4900948660714, "loss": 0.053, "rewards/chosen": 7.146656036376953, "rewards/margins": 18.982521602085658, "rewards/rejected": -11.835865565708705, "step": 1668 }, { "epoch": 0.4176154134868009, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38462481.45454545, "logits/rejected": -21171271.384615384, "logps/chosen": -309.63822798295456, "logps/rejected": -474.3869441105769, "loss": 0.0396, "rewards/chosen": 6.054537686434659, "rewards/margins": 19.035671954388384, "rewards/rejected": -12.981134267953726, "step": 1669 }, { "epoch": 0.4178656324283748, "grad_norm": 14.625, "kl": 1.3895353078842163, "learning_rate": 5e-06, "logits/chosen": -38947202.90909091, "logits/rejected": -26649139.692307692, "logps/chosen": -319.7601873224432, "logps/rejected": -688.4923377403846, "loss": 0.0854, "rewards/chosen": 6.717070146040483, "rewards/margins": 19.209234170980388, "rewards/rejected": -12.492164024939903, "step": 1670 }, { "epoch": 0.4181158513699487, "grad_norm": 4.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52775790.222222224, "logits/rejected": -43850641.06666667, "logps/chosen": -549.8658854166666, "logps/rejected": -596.2606770833333, "loss": 0.011, "rewards/chosen": 8.97830539279514, "rewards/margins": 24.045494927300346, "rewards/rejected": -15.067189534505209, "step": 1671 }, { "epoch": 0.41836607031152256, "grad_norm": 1.1015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50244681.14285714, "logits/rejected": -51207126.5882353, "logps/chosen": -367.73060825892856, "logps/rejected": -696.1962316176471, "loss": 0.0021, "rewards/chosen": 8.864627293178014, "rewards/margins": 23.465502250094374, "rewards/rejected": -14.60087495691636, "step": 1672 }, { "epoch": 0.41861628925309646, "grad_norm": 8.4375, "kl": 2.4454410076141357, "learning_rate": 5e-06, "logits/chosen": -73214248.72727273, "logits/rejected": -24650116.923076924, "logps/chosen": -452.2507990056818, "logps/rejected": -524.0574669471154, "loss": 0.0495, "rewards/chosen": 8.795964327725498, "rewards/margins": 22.801457491788, "rewards/rejected": -14.0054931640625, "step": 1673 }, { "epoch": 0.41886650819467036, "grad_norm": 10.5625, "kl": 8.477943420410156, "learning_rate": 5e-06, "logits/chosen": -76193516.3076923, "logits/rejected": -40240570.18181818, "logps/chosen": -400.3032977764423, "logps/rejected": -570.1125710227273, "loss": 0.121, "rewards/chosen": 6.9015667255108175, "rewards/margins": 16.85194583706089, "rewards/rejected": -9.95037911155007, "step": 1674 }, { "epoch": 0.4191167271362442, "grad_norm": 5.96875, "kl": 0.4574432373046875, "learning_rate": 5e-06, "logits/chosen": -59518041.6, "logits/rejected": -59382976.0, "logps/chosen": -531.534423828125, "logps/rejected": -778.0603376116071, "loss": 0.0061, "rewards/chosen": 6.635273742675781, "rewards/margins": 21.527843148367744, "rewards/rejected": -14.892569405691964, "step": 1675 }, { "epoch": 0.4193669460778181, "grad_norm": 3.890625, "kl": 6.50621223449707, "learning_rate": 5e-06, "logits/chosen": -60430308.571428575, "logits/rejected": 66448550.4, "logps/chosen": -397.740234375, "logps/rejected": -513.68740234375, "loss": 0.0082, "rewards/chosen": 7.4650998796735495, "rewards/margins": 22.64015938895089, "rewards/rejected": -15.175059509277343, "step": 1676 }, { "epoch": 0.41961716501939195, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52323658.666666664, "logits/rejected": -35400945.06666667, "logps/chosen": -359.4195963541667, "logps/rejected": -426.90074869791664, "loss": 0.0252, "rewards/chosen": 9.082895067003038, "rewards/margins": 18.942035081651476, "rewards/rejected": -9.859140014648437, "step": 1677 }, { "epoch": 0.41986738396096585, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44130116.92307692, "logits/rejected": -55947496.72727273, "logps/chosen": -370.26558743990387, "logps/rejected": -570.3140980113636, "loss": 0.0344, "rewards/chosen": 6.565777118389423, "rewards/margins": 19.24940832178076, "rewards/rejected": -12.683631203391336, "step": 1678 }, { "epoch": 0.42011760290253974, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42849962.666666664, "logits/rejected": -47988442.666666664, "logps/chosen": -351.1490071614583, "logps/rejected": -748.9602864583334, "loss": 0.0417, "rewards/chosen": 5.431649525960286, "rewards/margins": 20.14191436767578, "rewards/rejected": -14.710264841715494, "step": 1679 }, { "epoch": 0.4203678218441136, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55463368.0, "logits/rejected": -58449936.0, "logps/chosen": -352.3590087890625, "logps/rejected": -500.0791015625, "loss": 0.0149, "rewards/chosen": 6.365176200866699, "rewards/margins": 17.255043029785156, "rewards/rejected": -10.889866828918457, "step": 1680 }, { "epoch": 0.4206180407856875, "grad_norm": 14.125, "kl": 2.4271063804626465, "learning_rate": 5e-06, "logits/chosen": -42911021.71428572, "logits/rejected": -66143411.2, "logps/chosen": -361.56539481026783, "logps/rejected": -820.8953125, "loss": 0.0301, "rewards/chosen": 6.682152884347098, "rewards/margins": 24.351752798897877, "rewards/rejected": -17.66959991455078, "step": 1681 }, { "epoch": 0.42086825972726133, "grad_norm": 10.0, "kl": 14.30746078491211, "learning_rate": 5e-06, "logits/chosen": -47561053.86666667, "logits/rejected": -16069086.222222222, "logps/chosen": -474.1516927083333, "logps/rejected": -803.1561957465278, "loss": 0.0429, "rewards/chosen": 8.564068603515626, "rewards/margins": 22.24999525282118, "rewards/rejected": -13.685926649305555, "step": 1682 }, { "epoch": 0.4211184786688352, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -90078016.0, "logits/rejected": -42238554.35294118, "logps/chosen": -424.76217215401783, "logps/rejected": -540.1424632352941, "loss": 0.0389, "rewards/chosen": 6.276822771344866, "rewards/margins": 18.797487659614628, "rewards/rejected": -12.520664888269762, "step": 1683 }, { "epoch": 0.4213686976104091, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45041888.0, "logits/rejected": -24466570.0, "logps/chosen": -393.0665283203125, "logps/rejected": -418.3067321777344, "loss": 0.043, "rewards/chosen": 5.994039535522461, "rewards/margins": 13.953697204589844, "rewards/rejected": -7.959657669067383, "step": 1684 }, { "epoch": 0.42161891655198297, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60438813.09090909, "logits/rejected": -59543236.92307692, "logps/chosen": -368.94462446732956, "logps/rejected": -607.21142578125, "loss": 0.0419, "rewards/chosen": 6.691439541903409, "rewards/margins": 20.946162723994757, "rewards/rejected": -14.254723182091347, "step": 1685 }, { "epoch": 0.42186913549355687, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46280585.14285714, "logits/rejected": -25187708.8, "logps/chosen": -412.6154087611607, "logps/rejected": -552.739453125, "loss": 0.0372, "rewards/chosen": 5.245063236781529, "rewards/margins": 13.877762821742465, "rewards/rejected": -8.632699584960937, "step": 1686 }, { "epoch": 0.42211935443513077, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38903095.27272727, "logits/rejected": -59422163.692307696, "logps/chosen": -370.1875, "logps/rejected": -626.2998046875, "loss": 0.0222, "rewards/chosen": 8.709320761940695, "rewards/margins": 23.908850556486968, "rewards/rejected": -15.199529794546274, "step": 1687 }, { "epoch": 0.4223695733767046, "grad_norm": 2.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44666617.6, "logits/rejected": -49186546.28571428, "logps/chosen": -456.327294921875, "logps/rejected": -634.3900669642857, "loss": 0.0149, "rewards/chosen": 8.881990051269531, "rewards/margins": 21.954424612862724, "rewards/rejected": -13.072434561593193, "step": 1688 }, { "epoch": 0.4226197923182785, "grad_norm": 2.375, "kl": 14.093932151794434, "learning_rate": 5e-06, "logits/chosen": -26737600.0, "logits/rejected": -51014341.333333336, "logps/chosen": -439.2331136067708, "logps/rejected": -560.5958658854166, "loss": 0.0632, "rewards/chosen": 9.209739685058594, "rewards/margins": 22.60663859049479, "rewards/rejected": -13.396898905436197, "step": 1689 }, { "epoch": 0.42287001125985235, "grad_norm": 3.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49018087.384615384, "logits/rejected": -55935738.18181818, "logps/chosen": -382.02974759615387, "logps/rejected": -668.0794566761364, "loss": 0.0379, "rewards/chosen": 6.03826669546274, "rewards/margins": 20.63727265471345, "rewards/rejected": -14.59900595925071, "step": 1690 }, { "epoch": 0.42312023020142625, "grad_norm": 4.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35056898.666666664, "logits/rejected": -40455616.0, "logps/chosen": -463.4138590494792, "logps/rejected": -415.2669270833333, "loss": 0.0331, "rewards/chosen": 7.657519022623698, "rewards/margins": 18.174386978149414, "rewards/rejected": -10.516867955525717, "step": 1691 }, { "epoch": 0.42337044914300015, "grad_norm": 7.8125, "kl": 2.8107411861419678, "learning_rate": 5e-06, "logits/chosen": -56132125.538461536, "logits/rejected": -44724215.27272727, "logps/chosen": -313.55093149038464, "logps/rejected": -675.9422940340909, "loss": 0.0471, "rewards/chosen": 7.035715543306791, "rewards/margins": 19.523497414755653, "rewards/rejected": -12.487781871448863, "step": 1692 }, { "epoch": 0.423620668084574, "grad_norm": 10.9375, "kl": 14.037200927734375, "learning_rate": 5e-06, "logits/chosen": -47686311.11111111, "logits/rejected": -44932746.666666664, "logps/chosen": -362.4853515625, "logps/rejected": -262.8715006510417, "loss": 0.0882, "rewards/chosen": 7.0972574022081165, "rewards/margins": 14.097117529975044, "rewards/rejected": -6.999860127766927, "step": 1693 }, { "epoch": 0.4238708870261479, "grad_norm": 7.71875, "kl": 0.16559919714927673, "learning_rate": 5e-06, "logits/chosen": -65217472.0, "logits/rejected": -51975142.4, "logps/chosen": -369.78770228794644, "logps/rejected": -568.06142578125, "loss": 0.0679, "rewards/chosen": 7.244997297014509, "rewards/margins": 19.16612003871373, "rewards/rejected": -11.921122741699218, "step": 1694 }, { "epoch": 0.42412110596772173, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42784176.0, "logits/rejected": 75429792.0, "logps/chosen": -325.5848388671875, "logps/rejected": -533.5114135742188, "loss": 0.0583, "rewards/chosen": 5.6966447830200195, "rewards/margins": 17.175299644470215, "rewards/rejected": -11.478654861450195, "step": 1695 }, { "epoch": 0.42437132490929563, "grad_norm": 1.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16292379.2, "logits/rejected": -46763830.85714286, "logps/chosen": -578.477978515625, "logps/rejected": -557.65380859375, "loss": 0.002, "rewards/chosen": 8.897708129882812, "rewards/margins": 22.26006840297154, "rewards/rejected": -13.362360273088727, "step": 1696 }, { "epoch": 0.42462154385086953, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56680969.14285714, "logits/rejected": -45608990.11764706, "logps/chosen": -307.37137276785717, "logps/rejected": -507.7455193014706, "loss": 0.0488, "rewards/chosen": 5.891669137137277, "rewards/margins": 15.873952785459888, "rewards/rejected": -9.98228364832261, "step": 1697 }, { "epoch": 0.4248717627924434, "grad_norm": 1.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51752352.0, "logits/rejected": -84680849.06666666, "logps/chosen": -445.4646809895833, "logps/rejected": -563.323046875, "loss": 0.0105, "rewards/chosen": 7.810881720648871, "rewards/margins": 19.430758836534288, "rewards/rejected": -11.619877115885417, "step": 1698 }, { "epoch": 0.4251219817340173, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58546265.6, "logits/rejected": -35943516.44444445, "logps/chosen": -314.38645833333334, "logps/rejected": -482.13525390625, "loss": 0.0876, "rewards/chosen": 4.787557983398438, "rewards/margins": 16.967413330078124, "rewards/rejected": -12.179855346679688, "step": 1699 }, { "epoch": 0.4253722006755911, "grad_norm": 3.28125, "kl": 2.7685952186584473, "learning_rate": 5e-06, "logits/chosen": -48780144.0, "logits/rejected": -62419082.666666664, "logps/chosen": -395.3228352864583, "logps/rejected": -498.8789469401042, "loss": 0.018, "rewards/chosen": 7.900559743245442, "rewards/margins": 19.62813949584961, "rewards/rejected": -11.727579752604166, "step": 1700 }, { "epoch": 0.425622419617165, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62224362.666666664, "logits/rejected": -64693660.44444445, "logps/chosen": -382.1701253255208, "logps/rejected": -582.4524197048611, "loss": 0.0214, "rewards/chosen": 6.262440999348958, "rewards/margins": 22.214392768012154, "rewards/rejected": -15.951951768663195, "step": 1701 }, { "epoch": 0.4258726385587389, "grad_norm": 18.375, "kl": 12.94474983215332, "learning_rate": 5e-06, "logits/chosen": -39644448.0, "logits/rejected": -38159281.45454545, "logps/chosen": -507.21739783653845, "logps/rejected": -441.9461558948864, "loss": 0.1084, "rewards/chosen": 7.649361243614783, "rewards/margins": 18.70295496587153, "rewards/rejected": -11.053593722256748, "step": 1702 }, { "epoch": 0.42612285750031276, "grad_norm": 2.953125, "kl": 7.729244232177734, "learning_rate": 5e-06, "logits/chosen": -56215893.333333336, "logits/rejected": 7993751.111111111, "logps/chosen": -485.69983723958336, "logps/rejected": -447.6650390625, "loss": 0.0038, "rewards/chosen": 7.767117309570312, "rewards/margins": 17.454815673828126, "rewards/rejected": -9.687698364257812, "step": 1703 }, { "epoch": 0.42637307644188666, "grad_norm": 13.9375, "kl": 11.646564483642578, "learning_rate": 5e-06, "logits/chosen": -45079652.571428575, "logits/rejected": -47878854.4, "logps/chosen": -329.24288504464283, "logps/rejected": -497.2576171875, "loss": 0.0542, "rewards/chosen": 6.367137908935547, "rewards/margins": 13.181292724609374, "rewards/rejected": -6.814154815673828, "step": 1704 }, { "epoch": 0.42662329538346055, "grad_norm": 12.3125, "kl": 15.663991928100586, "learning_rate": 5e-06, "logits/chosen": -81114393.6, "logits/rejected": -26551541.333333332, "logps/chosen": -433.26328125, "logps/rejected": -544.2171766493055, "loss": 0.0342, "rewards/chosen": 8.47106679280599, "rewards/margins": 16.079865349663628, "rewards/rejected": -7.608798556857639, "step": 1705 }, { "epoch": 0.4268735143250344, "grad_norm": 3.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54404027.428571425, "logits/rejected": -54776655.058823526, "logps/chosen": -346.2366420200893, "logps/rejected": -583.1027113970588, "loss": 0.0387, "rewards/chosen": 6.454103197370257, "rewards/margins": 17.682503035088548, "rewards/rejected": -11.22839983771829, "step": 1706 }, { "epoch": 0.4271237332666083, "grad_norm": 3.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47848426.666666664, "logits/rejected": -47116288.0, "logps/chosen": -406.0160725911458, "logps/rejected": -583.9917399088541, "loss": 0.0357, "rewards/chosen": 6.846551259358724, "rewards/margins": 20.443429311116535, "rewards/rejected": -13.596878051757812, "step": 1707 }, { "epoch": 0.42737395220818214, "grad_norm": 7.0625, "kl": 0.8099867701530457, "learning_rate": 5e-06, "logits/chosen": -52418409.14285714, "logits/rejected": -22942369.6, "logps/chosen": -359.94252232142856, "logps/rejected": -464.1509765625, "loss": 0.0537, "rewards/chosen": 6.338814871651786, "rewards/margins": 15.62763170514788, "rewards/rejected": -9.288816833496094, "step": 1708 }, { "epoch": 0.42762417114975604, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43965810.666666664, "logits/rejected": -29156752.0, "logps/chosen": -405.357666015625, "logps/rejected": -441.9420572916667, "loss": 0.0604, "rewards/chosen": 8.050973892211914, "rewards/margins": 18.299269994099937, "rewards/rejected": -10.248296101888021, "step": 1709 }, { "epoch": 0.42787439009132994, "grad_norm": 3.390625, "kl": 0.9258435964584351, "learning_rate": 5e-06, "logits/chosen": 9278760.727272727, "logits/rejected": -35297260.307692304, "logps/chosen": -430.04545454545456, "logps/rejected": -614.0436823918269, "loss": 0.0096, "rewards/chosen": 9.246497414328836, "rewards/margins": 22.055412772652154, "rewards/rejected": -12.808915358323317, "step": 1710 }, { "epoch": 0.4281246090329038, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83876472.0, "logits/rejected": -49734416.0, "logps/chosen": -481.97442626953125, "logps/rejected": -533.2142944335938, "loss": 0.0487, "rewards/chosen": 10.997098922729492, "rewards/margins": 24.313775062561035, "rewards/rejected": -13.316676139831543, "step": 1711 }, { "epoch": 0.4283748279744777, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35620320.0, "logits/rejected": -46986229.333333336, "logps/chosen": -318.46262613932294, "logps/rejected": -546.9024251302084, "loss": 0.0464, "rewards/chosen": 6.131862640380859, "rewards/margins": 18.097195943196617, "rewards/rejected": -11.965333302815756, "step": 1712 }, { "epoch": 0.4286250469160515, "grad_norm": 3.328125, "kl": 7.502655506134033, "learning_rate": 5e-06, "logits/chosen": -52188790.4, "logits/rejected": -48088932.571428575, "logps/chosen": -428.955322265625, "logps/rejected": -560.6171177455357, "loss": 0.0462, "rewards/chosen": 9.466432189941406, "rewards/margins": 20.565653555733817, "rewards/rejected": -11.099221365792411, "step": 1713 }, { "epoch": 0.4288752658576254, "grad_norm": 12.4375, "kl": 6.839764595031738, "learning_rate": 5e-06, "logits/chosen": -76820169.84615384, "logits/rejected": -33810708.36363637, "logps/chosen": -389.4198467548077, "logps/rejected": -687.6620649857955, "loss": 0.054, "rewards/chosen": 7.906393197866587, "rewards/margins": 22.690686632703233, "rewards/rejected": -14.784293434836648, "step": 1714 }, { "epoch": 0.4291254847991993, "grad_norm": 14.1875, "kl": 0.8903192281723022, "learning_rate": 5e-06, "logits/chosen": -67645902.76923077, "logits/rejected": -47539266.90909091, "logps/chosen": -364.6746168870192, "logps/rejected": -636.3119229403409, "loss": 0.0725, "rewards/chosen": 6.2124187762920675, "rewards/margins": 17.916070898096045, "rewards/rejected": -11.703652121803977, "step": 1715 }, { "epoch": 0.42937570374077316, "grad_norm": 13.875, "kl": 6.6375555992126465, "learning_rate": 5e-06, "logits/chosen": -47529417.84615385, "logits/rejected": -27196328.727272727, "logps/chosen": -345.6926457331731, "logps/rejected": -647.9904563210227, "loss": 0.1059, "rewards/chosen": 7.3469725388747, "rewards/margins": 19.345108192283792, "rewards/rejected": -11.998135653409092, "step": 1716 }, { "epoch": 0.42962592268234706, "grad_norm": 2.578125, "kl": 3.3616461753845215, "learning_rate": 5e-06, "logits/chosen": -51825301.333333336, "logits/rejected": -59047194.666666664, "logps/chosen": -380.3544921875, "logps/rejected": -611.5171712239584, "loss": 0.0645, "rewards/chosen": 7.639267603556315, "rewards/margins": 20.722444534301758, "rewards/rejected": -13.083176930745443, "step": 1717 }, { "epoch": 0.4298761416239209, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62654353.45454545, "logits/rejected": -59912256.0, "logps/chosen": -451.9142400568182, "logps/rejected": -615.9150015024038, "loss": 0.0032, "rewards/chosen": 8.970243280584162, "rewards/margins": 20.215569396119015, "rewards/rejected": -11.245326115534855, "step": 1718 }, { "epoch": 0.4301263605654948, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68114275.55555555, "logits/rejected": -59588885.333333336, "logps/chosen": -301.17290581597223, "logps/rejected": -670.4255859375, "loss": 0.0567, "rewards/chosen": 5.892020331488715, "rewards/margins": 17.536803860134548, "rewards/rejected": -11.644783528645833, "step": 1719 }, { "epoch": 0.4303765795070687, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24551925.333333332, "logits/rejected": -40537634.13333333, "logps/chosen": -388.76280381944446, "logps/rejected": -572.955859375, "loss": 0.0365, "rewards/chosen": 7.7253163655598955, "rewards/margins": 19.833175659179688, "rewards/rejected": -12.107859293619791, "step": 1720 }, { "epoch": 0.43062679844864254, "grad_norm": 14.1875, "kl": 6.8203125, "learning_rate": 5e-06, "logits/chosen": -42791654.4, "logits/rejected": -43397769.14285714, "logps/chosen": -394.9171630859375, "logps/rejected": -457.73325892857144, "loss": 0.0326, "rewards/chosen": 6.740348815917969, "rewards/margins": 15.547513689313616, "rewards/rejected": -8.807164873395648, "step": 1721 }, { "epoch": 0.43087701739021644, "grad_norm": 5.21875, "kl": 1.7184561491012573, "learning_rate": 5e-06, "logits/chosen": -44436499.692307696, "logits/rejected": -66164450.90909091, "logps/chosen": -395.1288311298077, "logps/rejected": -783.4544566761364, "loss": 0.0328, "rewards/chosen": 7.838365408090445, "rewards/margins": 20.7616872187261, "rewards/rejected": -12.923321810635654, "step": 1722 }, { "epoch": 0.43112723633179034, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40842295.46666667, "logits/rejected": -29641185.777777776, "logps/chosen": -359.4515625, "logps/rejected": -563.5245768229166, "loss": 0.0496, "rewards/chosen": 7.085377502441406, "rewards/margins": 19.073401896158853, "rewards/rejected": -11.988024393717447, "step": 1723 }, { "epoch": 0.4313774552733642, "grad_norm": 13.4375, "kl": 3.0812313556671143, "learning_rate": 5e-06, "logits/chosen": -62663808.0, "logits/rejected": -30666104.888888888, "logps/chosen": -366.15833333333336, "logps/rejected": -353.538818359375, "loss": 0.0747, "rewards/chosen": 8.160554504394531, "rewards/margins": 17.364532301161024, "rewards/rejected": -9.203977796766493, "step": 1724 }, { "epoch": 0.4316276742149381, "grad_norm": 2.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58860288.0, "logits/rejected": -46976465.06666667, "logps/chosen": -302.7123209635417, "logps/rejected": -783.8604166666667, "loss": 0.029, "rewards/chosen": 6.328499688042535, "rewards/margins": 22.725789727105035, "rewards/rejected": -16.3972900390625, "step": 1725 }, { "epoch": 0.4318778931565119, "grad_norm": 2.640625, "kl": 2.352138042449951, "learning_rate": 5e-06, "logits/chosen": -59568710.4, "logits/rejected": -44088978.28571428, "logps/chosen": -542.09404296875, "logps/rejected": -545.5778459821429, "loss": 0.0051, "rewards/chosen": 11.1889892578125, "rewards/margins": 21.552378409249442, "rewards/rejected": -10.363389151436943, "step": 1726 }, { "epoch": 0.4321281120980858, "grad_norm": 7.03125, "kl": 0.5862541198730469, "learning_rate": 5e-06, "logits/chosen": -70189672.72727273, "logits/rejected": -38422168.615384616, "logps/chosen": -467.0792347301136, "logps/rejected": -574.7234074519231, "loss": 0.0517, "rewards/chosen": 9.150310169566762, "rewards/margins": 17.625838593169526, "rewards/rejected": -8.475528423602764, "step": 1727 }, { "epoch": 0.4323783310396597, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22833303.272727273, "logits/rejected": -83639286.15384616, "logps/chosen": -270.73439719460225, "logps/rejected": -575.6094501201923, "loss": 0.0341, "rewards/chosen": 6.655555031516335, "rewards/margins": 20.536275903661767, "rewards/rejected": -13.880720872145433, "step": 1728 }, { "epoch": 0.43262854998123357, "grad_norm": 10.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66529584.0, "logits/rejected": -73453941.33333333, "logps/chosen": -408.705810546875, "logps/rejected": -602.7852376302084, "loss": 0.0243, "rewards/chosen": 7.105537414550781, "rewards/margins": 20.146995544433594, "rewards/rejected": -13.041458129882812, "step": 1729 }, { "epoch": 0.43287876892280747, "grad_norm": 9.75, "kl": 2.8327815532684326, "learning_rate": 5e-06, "logits/chosen": -64335988.36363637, "logits/rejected": -67166508.3076923, "logps/chosen": -368.0704456676136, "logps/rejected": -554.1681941105769, "loss": 0.019, "rewards/chosen": 6.490728204900568, "rewards/margins": 17.96502301409528, "rewards/rejected": -11.474294809194712, "step": 1730 }, { "epoch": 0.4331289878643813, "grad_norm": 9.6875, "kl": 3.4409854412078857, "learning_rate": 5e-06, "logits/chosen": -38359035.428571425, "logits/rejected": -75107936.0, "logps/chosen": -317.81326729910717, "logps/rejected": -622.18583984375, "loss": 0.0337, "rewards/chosen": 6.8846620832170755, "rewards/margins": 20.430414036342075, "rewards/rejected": -13.545751953125, "step": 1731 }, { "epoch": 0.4333792068059552, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66212064.0, "logits/rejected": -43982601.14285714, "logps/chosen": -486.63984375, "logps/rejected": -633.8462611607143, "loss": 0.0627, "rewards/chosen": 7.7811279296875, "rewards/margins": 19.61305454799107, "rewards/rejected": -11.831926618303571, "step": 1732 }, { "epoch": 0.4336294257475291, "grad_norm": 10.9375, "kl": 1.4858449697494507, "learning_rate": 5e-06, "logits/chosen": -49538408.72727273, "logits/rejected": -79389134.76923077, "logps/chosen": -253.55579723011363, "logps/rejected": -658.3462289663462, "loss": 0.0973, "rewards/chosen": 4.4752068953080615, "rewards/margins": 19.178653503631377, "rewards/rejected": -14.703446608323317, "step": 1733 }, { "epoch": 0.43387964468910295, "grad_norm": 5.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40054329.6, "logits/rejected": -23462521.14285714, "logps/chosen": -336.8068359375, "logps/rejected": -654.4439871651786, "loss": 0.0099, "rewards/chosen": 6.676496887207032, "rewards/margins": 21.427894156319756, "rewards/rejected": -14.751397269112724, "step": 1734 }, { "epoch": 0.43412986363067685, "grad_norm": 17.125, "kl": 12.384054183959961, "learning_rate": 5e-06, "logits/chosen": -32385212.0, "logits/rejected": -43103208.0, "logps/chosen": -339.69903564453125, "logps/rejected": -261.8519287109375, "loss": 0.1108, "rewards/chosen": 7.17822265625, "rewards/margins": 14.521745204925537, "rewards/rejected": -7.343522548675537, "step": 1735 }, { "epoch": 0.4343800825722507, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46461949.09090909, "logits/rejected": -27294097.230769232, "logps/chosen": -533.9335049715909, "logps/rejected": -426.25721153846155, "loss": 0.0221, "rewards/chosen": 8.784595142711293, "rewards/margins": 19.425809366719704, "rewards/rejected": -10.641214224008413, "step": 1736 }, { "epoch": 0.4346303015138246, "grad_norm": 3.609375, "kl": 0.41214117407798767, "learning_rate": 5e-06, "logits/chosen": -55834619.07692308, "logits/rejected": -68854365.0909091, "logps/chosen": -415.0004131610577, "logps/rejected": -511.69881924715907, "loss": 0.0244, "rewards/chosen": 7.425439100999099, "rewards/margins": 18.9243668242768, "rewards/rejected": -11.4989277232777, "step": 1737 }, { "epoch": 0.4348805204553985, "grad_norm": 7.53125, "kl": 5.037423610687256, "learning_rate": 5e-06, "logits/chosen": -18690935.466666665, "logits/rejected": -54017230.222222224, "logps/chosen": -460.46868489583335, "logps/rejected": -654.6276584201389, "loss": 0.0267, "rewards/chosen": 7.764060974121094, "rewards/margins": 24.822614034016926, "rewards/rejected": -17.058553059895832, "step": 1738 }, { "epoch": 0.43513073939697233, "grad_norm": 7.125, "kl": 3.7176432609558105, "learning_rate": 5e-06, "logits/chosen": -58166099.2, "logits/rejected": -66448493.71428572, "logps/chosen": -373.3819580078125, "logps/rejected": -531.9524972098214, "loss": 0.028, "rewards/chosen": 7.904003143310547, "rewards/margins": 19.751651763916016, "rewards/rejected": -11.847648620605469, "step": 1739 }, { "epoch": 0.43538095833854623, "grad_norm": 12.375, "kl": 10.342616081237793, "learning_rate": 5e-06, "logits/chosen": -62692829.538461536, "logits/rejected": 18649166.545454547, "logps/chosen": -457.5563777043269, "logps/rejected": -724.3130326704545, "loss": 0.0309, "rewards/chosen": 8.37613267164964, "rewards/margins": 21.733340923602764, "rewards/rejected": -13.357208251953125, "step": 1740 }, { "epoch": 0.43563117728012013, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83998246.4, "logits/rejected": -30332072.42105263, "logps/chosen": -425.038134765625, "logps/rejected": -657.7446032072369, "loss": 0.0778, "rewards/chosen": 7.995486450195313, "rewards/margins": 20.139522994192024, "rewards/rejected": -12.14403654399671, "step": 1741 }, { "epoch": 0.435881396221694, "grad_norm": 9.375, "kl": 0.16205660998821259, "learning_rate": 5e-06, "logits/chosen": -27679463.384615384, "logits/rejected": -61266594.90909091, "logps/chosen": -424.5254657451923, "logps/rejected": -452.01438210227275, "loss": 0.0425, "rewards/chosen": 8.55937018761268, "rewards/margins": 18.677565461272124, "rewards/rejected": -10.118195273659445, "step": 1742 }, { "epoch": 0.43613161516326787, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51511264.0, "logits/rejected": -43514203.428571425, "logps/chosen": -268.41259765625, "logps/rejected": -472.47994559151783, "loss": 0.0689, "rewards/chosen": 6.514250183105469, "rewards/margins": 16.722412763323103, "rewards/rejected": -10.208162580217634, "step": 1743 }, { "epoch": 0.4363818341048417, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65692685.71428572, "logits/rejected": -31745753.6, "logps/chosen": -535.0950055803571, "logps/rejected": -564.382568359375, "loss": 0.0377, "rewards/chosen": 8.480243137904576, "rewards/margins": 21.311725071498326, "rewards/rejected": -12.83148193359375, "step": 1744 }, { "epoch": 0.4366320530464156, "grad_norm": 17.25, "kl": 3.5959362983703613, "learning_rate": 5e-06, "logits/chosen": -79856679.38461539, "logits/rejected": -29814906.181818184, "logps/chosen": -371.0808293269231, "logps/rejected": -406.611328125, "loss": 0.0492, "rewards/chosen": 8.293344350961538, "rewards/margins": 17.26341770412205, "rewards/rejected": -8.970073353160512, "step": 1745 }, { "epoch": 0.4368822719879895, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55040028.44444445, "logits/rejected": -56867285.333333336, "logps/chosen": -387.8827853732639, "logps/rejected": -671.6983072916667, "loss": 0.0459, "rewards/chosen": 6.181253221299913, "rewards/margins": 18.776427374945747, "rewards/rejected": -12.595174153645834, "step": 1746 }, { "epoch": 0.43713249092956336, "grad_norm": 16.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75343680.0, "logits/rejected": -47932388.571428575, "logps/chosen": -468.927197265625, "logps/rejected": -587.1833147321429, "loss": 0.025, "rewards/chosen": 8.697145080566406, "rewards/margins": 20.991458783830915, "rewards/rejected": -12.294313703264509, "step": 1747 }, { "epoch": 0.43738270987113725, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68261081.6, "logits/rejected": -71557147.42857143, "logps/chosen": -519.54384765625, "logps/rejected": -674.0514090401786, "loss": 0.0202, "rewards/chosen": 10.38585205078125, "rewards/margins": 22.809675816127232, "rewards/rejected": -12.423823765345983, "step": 1748 }, { "epoch": 0.4376329288127111, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38987557.81818182, "logits/rejected": -54131160.615384616, "logps/chosen": -403.91703657670456, "logps/rejected": -666.29296875, "loss": 0.0205, "rewards/chosen": 6.777026089754972, "rewards/margins": 19.322722641738146, "rewards/rejected": -12.545696551983173, "step": 1749 }, { "epoch": 0.437883147754285, "grad_norm": 7.0625, "kl": 1.476178526878357, "learning_rate": 5e-06, "logits/chosen": -55180957.09090909, "logits/rejected": -51466151.384615384, "logps/chosen": -324.15292080965907, "logps/rejected": -597.16162109375, "loss": 0.0463, "rewards/chosen": 5.6106719970703125, "rewards/margins": 18.544174194335938, "rewards/rejected": -12.933502197265625, "step": 1750 }, { "epoch": 0.4381333666958589, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56126813.538461536, "logits/rejected": -13130280.727272727, "logps/chosen": -385.82361778846155, "logps/rejected": -577.9657315340909, "loss": 0.0182, "rewards/chosen": 7.892554063063401, "rewards/margins": 20.405593391898627, "rewards/rejected": -12.513039328835227, "step": 1751 }, { "epoch": 0.43838358563743274, "grad_norm": 13.375, "kl": 6.674837589263916, "learning_rate": 5e-06, "logits/chosen": -45477993.4117647, "logits/rejected": -43217435.428571425, "logps/chosen": -318.71852022058823, "logps/rejected": -625.76416015625, "loss": 0.0928, "rewards/chosen": 6.334524266860065, "rewards/margins": 18.87705269180426, "rewards/rejected": -12.542528424944196, "step": 1752 }, { "epoch": 0.43863380457900664, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57025522.28571428, "logits/rejected": -48281494.5882353, "logps/chosen": -410.1220703125, "logps/rejected": -442.19694967830884, "loss": 0.0111, "rewards/chosen": 7.877737317766462, "rewards/margins": 17.02038365852933, "rewards/rejected": -9.142646340762868, "step": 1753 }, { "epoch": 0.43888402352058054, "grad_norm": 5.03125, "kl": 4.741658687591553, "learning_rate": 5e-06, "logits/chosen": -49010226.28571428, "logits/rejected": -71180588.8, "logps/chosen": -407.03414481026783, "logps/rejected": -599.71064453125, "loss": 0.0188, "rewards/chosen": 8.332615443638392, "rewards/margins": 17.753481837681363, "rewards/rejected": -9.420866394042969, "step": 1754 }, { "epoch": 0.4391342424621544, "grad_norm": 4.90625, "kl": 3.335146903991699, "learning_rate": 5e-06, "logits/chosen": -58319461.64705882, "logits/rejected": -46217056.0, "logps/chosen": -473.1337028952206, "logps/rejected": -482.73521205357144, "loss": 0.0667, "rewards/chosen": 7.7722625732421875, "rewards/margins": 19.922548566545757, "rewards/rejected": -12.150285993303571, "step": 1755 }, { "epoch": 0.4393844614037283, "grad_norm": 14.5, "kl": 16.141101837158203, "learning_rate": 5e-06, "logits/chosen": -39241558.85714286, "logits/rejected": -51333401.6, "logps/chosen": -471.42567661830356, "logps/rejected": -525.24345703125, "loss": 0.0672, "rewards/chosen": 8.030807495117188, "rewards/margins": 18.558697509765626, "rewards/rejected": -10.527890014648438, "step": 1756 }, { "epoch": 0.4396346803453021, "grad_norm": 2.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46667596.8, "logits/rejected": -36808502.85714286, "logps/chosen": -321.97822265625, "logps/rejected": -578.2374790736607, "loss": 0.0262, "rewards/chosen": 6.563163757324219, "rewards/margins": 20.536683218819753, "rewards/rejected": -13.973519461495536, "step": 1757 }, { "epoch": 0.439884899286876, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69243565.71428572, "logits/rejected": -40564086.4, "logps/chosen": -357.99930245535717, "logps/rejected": -574.6998046875, "loss": 0.0534, "rewards/chosen": 5.898641313825335, "rewards/margins": 17.70332271030971, "rewards/rejected": -11.804681396484375, "step": 1758 }, { "epoch": 0.4401351182284499, "grad_norm": 13.875, "kl": 9.868629455566406, "learning_rate": 5e-06, "logits/chosen": -45267879.384615384, "logits/rejected": -36426493.09090909, "logps/chosen": -461.3607647235577, "logps/rejected": -499.91530539772725, "loss": 0.0538, "rewards/chosen": 7.333030700683594, "rewards/margins": 19.2925893610174, "rewards/rejected": -11.959558660333807, "step": 1759 }, { "epoch": 0.44038533717002376, "grad_norm": 8.6875, "kl": 0.6621112823486328, "learning_rate": 5e-06, "logits/chosen": -34314087.11111111, "logits/rejected": -28137774.933333334, "logps/chosen": -300.6897243923611, "logps/rejected": -517.94169921875, "loss": 0.0386, "rewards/chosen": 7.055417378743489, "rewards/margins": 18.887442525227865, "rewards/rejected": -11.832025146484375, "step": 1760 }, { "epoch": 0.44063555611159766, "grad_norm": 4.1875, "kl": 0.675749659538269, "learning_rate": 5e-06, "logits/chosen": -36809636.92307692, "logits/rejected": -52435642.18181818, "logps/chosen": -443.8221905048077, "logps/rejected": -527.2193714488636, "loss": 0.0455, "rewards/chosen": 7.598936814528245, "rewards/margins": 18.248710845733857, "rewards/rejected": -10.649774031205611, "step": 1761 }, { "epoch": 0.4408857750531715, "grad_norm": 3.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39313902.76923077, "logits/rejected": -68325777.45454545, "logps/chosen": -298.28048001802887, "logps/rejected": -781.1985973011364, "loss": 0.0225, "rewards/chosen": 6.934457632211538, "rewards/margins": 22.028290861970063, "rewards/rejected": -15.093833229758523, "step": 1762 }, { "epoch": 0.4411359939947454, "grad_norm": 7.0, "kl": 1.7169456481933594, "learning_rate": 5e-06, "logits/chosen": -43266948.92307692, "logits/rejected": -37215383.27272727, "logps/chosen": -406.02647986778845, "logps/rejected": -463.85329367897725, "loss": 0.038, "rewards/chosen": 7.118249746469351, "rewards/margins": 18.285575546584763, "rewards/rejected": -11.167325800115412, "step": 1763 }, { "epoch": 0.4413862129363193, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6265684.0, "logits/rejected": -48570961.777777776, "logps/chosen": -318.15797932942706, "logps/rejected": -699.9150390625, "loss": 0.051, "rewards/chosen": 5.018633206685384, "rewards/margins": 17.75141281551785, "rewards/rejected": -12.732779608832466, "step": 1764 }, { "epoch": 0.44163643187789314, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60129910.85714286, "logits/rejected": -50563392.0, "logps/chosen": -600.7783203125, "logps/rejected": -711.9457720588235, "loss": 0.0118, "rewards/chosen": 9.88724844796317, "rewards/margins": 19.984983684635964, "rewards/rejected": -10.097735236672793, "step": 1765 }, { "epoch": 0.44188665081946704, "grad_norm": 1.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49175216.0, "logits/rejected": -58110432.0, "logps/chosen": -486.8152669270833, "logps/rejected": -722.115478515625, "loss": 0.0041, "rewards/chosen": 7.803700764973958, "rewards/margins": 26.00478744506836, "rewards/rejected": -18.201086680094402, "step": 1766 }, { "epoch": 0.4421368697610409, "grad_norm": 11.1875, "kl": 10.043951034545898, "learning_rate": 5e-06, "logits/chosen": -40265856.0, "logits/rejected": -39411280.0, "logps/chosen": -367.2900085449219, "logps/rejected": -505.8384094238281, "loss": 0.0362, "rewards/chosen": 7.767824172973633, "rewards/margins": 19.260985374450684, "rewards/rejected": -11.49316120147705, "step": 1767 }, { "epoch": 0.4423870887026148, "grad_norm": 12.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49708817.45454545, "logits/rejected": -40033376.0, "logps/chosen": -396.56143465909093, "logps/rejected": -536.3824744591346, "loss": 0.0709, "rewards/chosen": 7.377537120472301, "rewards/margins": 17.013385532619235, "rewards/rejected": -9.635848412146935, "step": 1768 }, { "epoch": 0.4426373076441887, "grad_norm": 20.625, "kl": 5.363088130950928, "learning_rate": 5e-06, "logits/chosen": -37162451.2, "logits/rejected": -38869120.0, "logps/chosen": -326.704345703125, "logps/rejected": -619.7370954241071, "loss": 0.056, "rewards/chosen": 7.3760833740234375, "rewards/margins": 18.15894971575056, "rewards/rejected": -10.78286634172712, "step": 1769 }, { "epoch": 0.4428875265857625, "grad_norm": 15.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20699465.6, "logits/rejected": -68717814.85714285, "logps/chosen": -282.4941650390625, "logps/rejected": -557.0023716517857, "loss": 0.0551, "rewards/chosen": 6.090250778198242, "rewards/margins": 15.743239756992885, "rewards/rejected": -9.652988978794642, "step": 1770 }, { "epoch": 0.4431377455273364, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30522121.6, "logits/rejected": -35764368.0, "logps/chosen": -464.6640625, "logps/rejected": -369.3518763950893, "loss": 0.0424, "rewards/chosen": 8.982410430908203, "rewards/margins": 17.81627219063895, "rewards/rejected": -8.833861759730748, "step": 1771 }, { "epoch": 0.4433879644689103, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58716864.0, "logits/rejected": -49143922.28571428, "logps/chosen": -370.426318359375, "logps/rejected": -640.2614397321429, "loss": 0.0345, "rewards/chosen": 6.778099822998047, "rewards/margins": 21.858746882847377, "rewards/rejected": -15.08064705984933, "step": 1772 }, { "epoch": 0.44363818341048417, "grad_norm": 1.5078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32047373.333333332, "logits/rejected": -50361162.666666664, "logps/chosen": -454.6949462890625, "logps/rejected": -595.2628038194445, "loss": 0.0134, "rewards/chosen": 8.387965520222982, "rewards/margins": 23.626933415730797, "rewards/rejected": -15.238967895507812, "step": 1773 }, { "epoch": 0.44388840235205806, "grad_norm": 12.25, "kl": 3.587136745452881, "learning_rate": 5e-06, "logits/chosen": -45023188.36363637, "logits/rejected": -36717735.384615384, "logps/chosen": -365.07870205965907, "logps/rejected": -587.0178786057693, "loss": 0.0695, "rewards/chosen": 7.220082369717685, "rewards/margins": 24.02255969280963, "rewards/rejected": -16.802477323091946, "step": 1774 }, { "epoch": 0.4441386212936319, "grad_norm": 13.375, "kl": 1.2241935729980469, "learning_rate": 5e-06, "logits/chosen": -55480352.0, "logits/rejected": -48889317.333333336, "logps/chosen": -340.57798258463544, "logps/rejected": -552.3624674479166, "loss": 0.1119, "rewards/chosen": 4.243961334228516, "rewards/margins": 15.498896280924479, "rewards/rejected": -11.254934946695963, "step": 1775 }, { "epoch": 0.4443888402352058, "grad_norm": 10.0625, "kl": 2.493149518966675, "learning_rate": 5e-06, "logits/chosen": -56879364.92307692, "logits/rejected": -41980506.18181818, "logps/chosen": -408.6760066105769, "logps/rejected": -454.79940518465907, "loss": 0.0347, "rewards/chosen": 7.321468646709736, "rewards/margins": 17.16368791273424, "rewards/rejected": -9.842219266024502, "step": 1776 }, { "epoch": 0.4446390591767797, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15926152.888888888, "logits/rejected": -50662801.06666667, "logps/chosen": -366.1369900173611, "logps/rejected": -563.3386067708333, "loss": 0.0188, "rewards/chosen": 7.333340115017361, "rewards/margins": 21.80038825141059, "rewards/rejected": -14.46704813639323, "step": 1777 }, { "epoch": 0.44488927811835355, "grad_norm": 8.6875, "kl": 14.138043403625488, "learning_rate": 5e-06, "logits/chosen": -45838133.333333336, "logits/rejected": -31965178.666666668, "logps/chosen": -433.3512369791667, "logps/rejected": -413.8290201822917, "loss": 0.1089, "rewards/chosen": 8.491605970594618, "rewards/margins": 21.06764687432183, "rewards/rejected": -12.576040903727213, "step": 1778 }, { "epoch": 0.44513949705992745, "grad_norm": 9.875, "kl": 13.0823392868042, "learning_rate": 5e-06, "logits/chosen": -95805723.42857143, "logits/rejected": -85909708.8, "logps/chosen": -522.8612583705357, "logps/rejected": -555.069873046875, "loss": 0.0318, "rewards/chosen": 9.090159824916295, "rewards/margins": 21.459762028285436, "rewards/rejected": -12.36960220336914, "step": 1779 }, { "epoch": 0.4453897160015013, "grad_norm": 0.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53376621.71428572, "logits/rejected": -52830809.6, "logps/chosen": -453.19224330357144, "logps/rejected": -573.626513671875, "loss": 0.0083, "rewards/chosen": 8.938412257603236, "rewards/margins": 22.412557765415734, "rewards/rejected": -13.4741455078125, "step": 1780 }, { "epoch": 0.4456399349430752, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75237403.42857143, "logits/rejected": -57445455.058823526, "logps/chosen": -391.70424107142856, "logps/rejected": -581.9053883272059, "loss": 0.0643, "rewards/chosen": 7.588176182338169, "rewards/margins": 21.664555397354253, "rewards/rejected": -14.076379215016084, "step": 1781 }, { "epoch": 0.4458901538846491, "grad_norm": 10.875, "kl": 9.098955154418945, "learning_rate": 5e-06, "logits/chosen": -67559760.0, "logits/rejected": -68833472.0, "logps/chosen": -430.5329284667969, "logps/rejected": -575.4144897460938, "loss": 0.0708, "rewards/chosen": 7.655349254608154, "rewards/margins": 22.26174306869507, "rewards/rejected": -14.606393814086914, "step": 1782 }, { "epoch": 0.44614037282622293, "grad_norm": 3.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47084064.0, "logits/rejected": -61674714.666666664, "logps/chosen": -322.6530354817708, "logps/rejected": -705.037109375, "loss": 0.0311, "rewards/chosen": 5.979930241902669, "rewards/margins": 22.179810206095375, "rewards/rejected": -16.199879964192707, "step": 1783 }, { "epoch": 0.44639059176779683, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75594232.0, "logits/rejected": -48111724.0, "logps/chosen": -513.6490478515625, "logps/rejected": -690.1429443359375, "loss": 0.0126, "rewards/chosen": 7.206121921539307, "rewards/margins": 23.410858631134033, "rewards/rejected": -16.204736709594727, "step": 1784 }, { "epoch": 0.4466408107093707, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21882183.272727273, "logits/rejected": -36222857.84615385, "logps/chosen": -404.6328125, "logps/rejected": -652.1051682692307, "loss": 0.0149, "rewards/chosen": 8.209370006214488, "rewards/margins": 21.10542244010872, "rewards/rejected": -12.89605243389423, "step": 1785 }, { "epoch": 0.44689102965094457, "grad_norm": 1.2578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36897752.0, "logits/rejected": -64548512.0, "logps/chosen": -373.8805338541667, "logps/rejected": -730.1560872395834, "loss": 0.0193, "rewards/chosen": 8.859169006347656, "rewards/margins": 26.301392873128254, "rewards/rejected": -17.442223866780598, "step": 1786 }, { "epoch": 0.44714124859251847, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44956263.11111111, "logits/rejected": -33566075.733333334, "logps/chosen": -519.7521701388889, "logps/rejected": -534.9561197916667, "loss": 0.0373, "rewards/chosen": 9.603086683485243, "rewards/margins": 24.427266777886285, "rewards/rejected": -14.824180094401042, "step": 1787 }, { "epoch": 0.4473914675340923, "grad_norm": 5.59375, "kl": 2.685976028442383, "learning_rate": 5e-06, "logits/chosen": -51929436.0, "logits/rejected": -47917304.0, "logps/chosen": -360.30389404296875, "logps/rejected": -464.84906005859375, "loss": 0.0398, "rewards/chosen": 7.607850551605225, "rewards/margins": 18.70877981185913, "rewards/rejected": -11.100929260253906, "step": 1788 }, { "epoch": 0.4476416864756662, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36835847.11111111, "logits/rejected": -47320776.53333333, "logps/chosen": -329.20030381944446, "logps/rejected": -704.5416666666666, "loss": 0.0214, "rewards/chosen": 6.065415700276692, "rewards/margins": 21.08590316772461, "rewards/rejected": -15.020487467447916, "step": 1789 }, { "epoch": 0.4478919054172401, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64112128.0, "logits/rejected": -35477700.0, "logps/chosen": -364.1282958984375, "logps/rejected": -641.5343627929688, "loss": 0.0527, "rewards/chosen": 5.406822204589844, "rewards/margins": 18.312036514282227, "rewards/rejected": -12.905214309692383, "step": 1790 }, { "epoch": 0.44814212435881395, "grad_norm": 8.5625, "kl": 5.4232497215271, "learning_rate": 5e-06, "logits/chosen": -72321787.07692307, "logits/rejected": -51623325.09090909, "logps/chosen": -506.74132361778845, "logps/rejected": -666.6501686789773, "loss": 0.0111, "rewards/chosen": 8.528812115009014, "rewards/margins": 26.629510412683018, "rewards/rejected": -18.100698297674004, "step": 1791 }, { "epoch": 0.44839234330038785, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -78324648.0, "logits/rejected": -60532900.0, "logps/chosen": -416.6427307128906, "logps/rejected": -571.3350830078125, "loss": 0.0517, "rewards/chosen": 6.130279541015625, "rewards/margins": 15.392738342285156, "rewards/rejected": -9.262458801269531, "step": 1792 }, { "epoch": 0.4486425622419617, "grad_norm": 1.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66610035.2, "logits/rejected": -42949188.571428575, "logps/chosen": -285.47431640625, "logps/rejected": -665.2876674107143, "loss": 0.0286, "rewards/chosen": 5.702185440063476, "rewards/margins": 20.401476124354772, "rewards/rejected": -14.699290684291295, "step": 1793 }, { "epoch": 0.4488927811835356, "grad_norm": 10.9375, "kl": 2.5778894424438477, "learning_rate": 5e-06, "logits/chosen": -61384870.4, "logits/rejected": -14482617.0, "logps/chosen": -316.8839111328125, "logps/rejected": -399.7562255859375, "loss": 0.0695, "rewards/chosen": 6.8824920654296875, "rewards/margins": 15.578229904174805, "rewards/rejected": -8.695737838745117, "step": 1794 }, { "epoch": 0.4491430001251095, "grad_norm": 2.75, "kl": 9.455293655395508, "learning_rate": 5e-06, "logits/chosen": -89607488.0, "logits/rejected": -49099850.666666664, "logps/chosen": -495.9684244791667, "logps/rejected": -651.7939046223959, "loss": 0.0635, "rewards/chosen": 8.225971857706705, "rewards/margins": 22.130024592081703, "rewards/rejected": -13.904052734375, "step": 1795 }, { "epoch": 0.44939321906668334, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60593043.2, "logits/rejected": -46339040.0, "logps/chosen": -514.023095703125, "logps/rejected": -592.2505580357143, "loss": 0.0023, "rewards/chosen": 9.517355346679688, "rewards/margins": 23.239915684291297, "rewards/rejected": -13.722560337611608, "step": 1796 }, { "epoch": 0.44964343800825723, "grad_norm": 14.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45215930.666666664, "logits/rejected": -53579840.0, "logps/chosen": -301.9062093098958, "logps/rejected": -616.9141031901041, "loss": 0.0495, "rewards/chosen": 5.078006744384766, "rewards/margins": 17.52708943684896, "rewards/rejected": -12.449082692464193, "step": 1797 }, { "epoch": 0.4498936569498311, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31808290.0, "logits/rejected": -54355768.0, "logps/chosen": -321.48297119140625, "logps/rejected": -664.0029296875, "loss": 0.0436, "rewards/chosen": 7.363469123840332, "rewards/margins": 20.268009185791016, "rewards/rejected": -12.904540061950684, "step": 1798 }, { "epoch": 0.450143875891405, "grad_norm": 7.34375, "kl": 10.426305770874023, "learning_rate": 5e-06, "logits/chosen": -33242500.266666666, "logits/rejected": -63433386.666666664, "logps/chosen": -433.0022786458333, "logps/rejected": -594.8330078125, "loss": 0.0549, "rewards/chosen": 8.395646158854166, "rewards/margins": 19.698702663845488, "rewards/rejected": -11.30305650499132, "step": 1799 }, { "epoch": 0.4503940948329789, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40816760.0, "logits/rejected": -56599258.666666664, "logps/chosen": -426.3206380208333, "logps/rejected": -724.4078776041666, "loss": 0.0301, "rewards/chosen": 7.383562723795573, "rewards/margins": 24.089753468831383, "rewards/rejected": -16.70619074503581, "step": 1800 }, { "epoch": 0.4506443137745527, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20709545.6, "logits/rejected": -50458459.428571425, "logps/chosen": -476.54697265625, "logps/rejected": -513.1675502232143, "loss": 0.0118, "rewards/chosen": 7.179499816894531, "rewards/margins": 21.05487474714007, "rewards/rejected": -13.875374930245536, "step": 1801 }, { "epoch": 0.4508945327161266, "grad_norm": 1.2421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64026577.45454545, "logits/rejected": -60840531.692307696, "logps/chosen": -403.55153586647725, "logps/rejected": -685.3254957932693, "loss": 0.0033, "rewards/chosen": 7.443070151589134, "rewards/margins": 22.113689235873988, "rewards/rejected": -14.670619084284855, "step": 1802 }, { "epoch": 0.4511447516577005, "grad_norm": 22.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27333273.6, "logits/rejected": -44268041.14285714, "logps/chosen": -275.53232421875, "logps/rejected": -603.9342912946429, "loss": 0.043, "rewards/chosen": 6.4478302001953125, "rewards/margins": 19.355772835867747, "rewards/rejected": -12.907942635672432, "step": 1803 }, { "epoch": 0.45139497059927436, "grad_norm": 9.875, "kl": 1.1471812725067139, "learning_rate": 5e-06, "logits/chosen": -48002949.333333336, "logits/rejected": -25026784.0, "logps/chosen": -305.0901692708333, "logps/rejected": -675.329345703125, "loss": 0.0739, "rewards/chosen": 4.8794816335042315, "rewards/margins": 16.949525833129883, "rewards/rejected": -12.07004419962565, "step": 1804 }, { "epoch": 0.45164518954084826, "grad_norm": 12.3125, "kl": 4.332741737365723, "learning_rate": 5e-06, "logits/chosen": -46563840.0, "logits/rejected": -37039238.4, "logps/chosen": -333.8687220982143, "logps/rejected": -557.331640625, "loss": 0.0833, "rewards/chosen": 7.277859279087612, "rewards/margins": 19.7707273210798, "rewards/rejected": -12.492868041992187, "step": 1805 }, { "epoch": 0.4518954084824221, "grad_norm": 11.8125, "kl": 1.997132658958435, "learning_rate": 5e-06, "logits/chosen": -69630794.66666667, "logits/rejected": -27983048.0, "logps/chosen": -350.4193522135417, "logps/rejected": -779.975341796875, "loss": 0.0386, "rewards/chosen": 6.767127354939778, "rewards/margins": 21.988503138224285, "rewards/rejected": -15.221375783284506, "step": 1806 }, { "epoch": 0.452145627423996, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44652521.14285714, "logits/rejected": -246348.0, "logps/chosen": -434.12656947544644, "logps/rejected": -463.12412109375, "loss": 0.0401, "rewards/chosen": 6.641373770577567, "rewards/margins": 19.031978934151784, "rewards/rejected": -12.390605163574218, "step": 1807 }, { "epoch": 0.4523958463655699, "grad_norm": 8.3125, "kl": 4.250385284423828, "learning_rate": 5e-06, "logits/chosen": -85655376.0, "logits/rejected": -54388688.0, "logps/chosen": -565.3590087890625, "logps/rejected": -555.5823364257812, "loss": 0.0244, "rewards/chosen": 8.35127067565918, "rewards/margins": 21.970748901367188, "rewards/rejected": -13.619478225708008, "step": 1808 }, { "epoch": 0.45264606530714374, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50558666.666666664, "logits/rejected": -35236309.333333336, "logps/chosen": -439.0404459635417, "logps/rejected": -722.1485188802084, "loss": 0.0162, "rewards/chosen": 10.310356140136719, "rewards/margins": 24.486577351888023, "rewards/rejected": -14.176221211751303, "step": 1809 }, { "epoch": 0.45289628424871764, "grad_norm": 9.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38164470.85714286, "logits/rejected": -41198288.0, "logps/chosen": -296.66531808035717, "logps/rejected": -588.27236328125, "loss": 0.0451, "rewards/chosen": 5.797191074916294, "rewards/margins": 17.81886465890067, "rewards/rejected": -12.021673583984375, "step": 1810 }, { "epoch": 0.4531465031902915, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83609144.0, "logits/rejected": -77866840.0, "logps/chosen": -398.7728576660156, "logps/rejected": -602.8576049804688, "loss": 0.0382, "rewards/chosen": 5.854246139526367, "rewards/margins": 18.571319580078125, "rewards/rejected": -12.717073440551758, "step": 1811 }, { "epoch": 0.4533967221318654, "grad_norm": 8.4375, "kl": 1.8036067485809326, "learning_rate": 5e-06, "logits/chosen": -38730414.54545455, "logits/rejected": -24260081.230769232, "logps/chosen": -348.451904296875, "logps/rejected": -568.3101712740385, "loss": 0.0289, "rewards/chosen": 6.234905589710582, "rewards/margins": 18.411818337607215, "rewards/rejected": -12.176912747896635, "step": 1812 }, { "epoch": 0.4536469410734393, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56824786.823529415, "logits/rejected": -35912292.571428575, "logps/chosen": -316.43212890625, "logps/rejected": -647.2610909598214, "loss": 0.0356, "rewards/chosen": 6.748531117158778, "rewards/margins": 21.882483666684447, "rewards/rejected": -15.13395254952567, "step": 1813 }, { "epoch": 0.4538971600150131, "grad_norm": 14.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51591952.0, "logits/rejected": -52059477.333333336, "logps/chosen": -439.0169270833333, "logps/rejected": -651.3352864583334, "loss": 0.0485, "rewards/chosen": 7.812395731608073, "rewards/margins": 20.4191411336263, "rewards/rejected": -12.606745402018229, "step": 1814 }, { "epoch": 0.454147378956587, "grad_norm": 1.5078125, "kl": 2.6219139099121094, "learning_rate": 5e-06, "logits/chosen": -44669671.384615384, "logits/rejected": -47210091.63636363, "logps/chosen": -430.2340745192308, "logps/rejected": -666.4497514204545, "loss": 0.0048, "rewards/chosen": 8.787156325120192, "rewards/margins": 25.280309290319053, "rewards/rejected": -16.493152965198863, "step": 1815 }, { "epoch": 0.45439759789816087, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51352977.45454545, "logits/rejected": -50888871.384615384, "logps/chosen": -328.2664683948864, "logps/rejected": -746.3853665865385, "loss": 0.0305, "rewards/chosen": 5.955451965332031, "rewards/margins": 18.627301142765926, "rewards/rejected": -12.671849177433895, "step": 1816 }, { "epoch": 0.45464781683973476, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53398856.53333333, "logits/rejected": -6795163.555555556, "logps/chosen": -349.4905598958333, "logps/rejected": -419.307861328125, "loss": 0.0706, "rewards/chosen": 6.24637451171875, "rewards/margins": 16.863036431206595, "rewards/rejected": -10.616661919487846, "step": 1817 }, { "epoch": 0.45489803578130866, "grad_norm": 20.375, "kl": 22.622695922851562, "learning_rate": 5e-06, "logits/chosen": -50894040.0, "logits/rejected": -48293024.0, "logps/chosen": -471.05316162109375, "logps/rejected": -431.5513916015625, "loss": 0.1078, "rewards/chosen": 9.585517883300781, "rewards/margins": 17.194531440734863, "rewards/rejected": -7.609013557434082, "step": 1818 }, { "epoch": 0.4551482547228825, "grad_norm": 11.875, "kl": 4.875029563903809, "learning_rate": 5e-06, "logits/chosen": -94279488.0, "logits/rejected": -17058228.0, "logps/chosen": -494.9310709635417, "logps/rejected": -496.8284912109375, "loss": 0.0217, "rewards/chosen": 8.994813919067383, "rewards/margins": 19.084096908569336, "rewards/rejected": -10.089282989501953, "step": 1819 }, { "epoch": 0.4553984736644564, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30042866.666666668, "logits/rejected": -35827592.0, "logps/chosen": -368.60546875, "logps/rejected": -585.975830078125, "loss": 0.0432, "rewards/chosen": 7.8612721761067705, "rewards/margins": 20.163007100423176, "rewards/rejected": -12.301734924316406, "step": 1820 }, { "epoch": 0.4556486926060303, "grad_norm": 16.375, "kl": 2.000948667526245, "learning_rate": 5e-06, "logits/chosen": -60658585.6, "logits/rejected": -47144320.0, "logps/chosen": -441.2140625, "logps/rejected": -569.6512276785714, "loss": 0.0269, "rewards/chosen": 11.016093444824218, "rewards/margins": 19.586375972202845, "rewards/rejected": -8.570282527378627, "step": 1821 }, { "epoch": 0.45589891154760415, "grad_norm": 7.71875, "kl": 10.435153007507324, "learning_rate": 5e-06, "logits/chosen": -51845412.571428575, "logits/rejected": -73719699.2, "logps/chosen": -439.9320591517857, "logps/rejected": -717.6755859375, "loss": 0.0421, "rewards/chosen": 8.85178702218192, "rewards/margins": 23.841270664760046, "rewards/rejected": -14.989483642578126, "step": 1822 }, { "epoch": 0.45614913048917805, "grad_norm": 10.75, "kl": 8.552907943725586, "learning_rate": 5e-06, "logits/chosen": -52741897.14285714, "logits/rejected": -20062328.470588237, "logps/chosen": -328.42299107142856, "logps/rejected": -548.6960592830883, "loss": 0.0499, "rewards/chosen": 6.935404096330915, "rewards/margins": 18.203003763150768, "rewards/rejected": -11.267599666819853, "step": 1823 }, { "epoch": 0.4563993494307519, "grad_norm": 2.8125, "kl": 0.040269218385219574, "learning_rate": 5e-06, "logits/chosen": -67590842.18181819, "logits/rejected": -56723367.384615384, "logps/chosen": -393.86177201704544, "logps/rejected": -754.8354116586538, "loss": 0.0232, "rewards/chosen": 8.580128756436435, "rewards/margins": 21.49782989075134, "rewards/rejected": -12.917701134314903, "step": 1824 }, { "epoch": 0.4566495683723258, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33807068.8, "logits/rejected": -45807533.71428572, "logps/chosen": -485.830078125, "logps/rejected": -507.83042689732144, "loss": 0.0486, "rewards/chosen": 7.4284523010253904, "rewards/margins": 18.39190968104771, "rewards/rejected": -10.963457380022321, "step": 1825 }, { "epoch": 0.4568997873138997, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57021216.0, "logits/rejected": -56780730.666666664, "logps/chosen": -446.1334635416667, "logps/rejected": -643.5774739583334, "loss": 0.0423, "rewards/chosen": 8.803810119628906, "rewards/margins": 22.35092798868815, "rewards/rejected": -13.547117869059244, "step": 1826 }, { "epoch": 0.45715000625547353, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74425924.57142857, "logits/rejected": -42103687.52941176, "logps/chosen": -190.69562639508928, "logps/rejected": -580.9008501838235, "loss": 0.0695, "rewards/chosen": 4.26662472316197, "rewards/margins": 17.304901844313164, "rewards/rejected": -13.038277121151195, "step": 1827 }, { "epoch": 0.4574002251970474, "grad_norm": 34.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22310152.0, "logits/rejected": -50640636.0, "logps/chosen": -317.6463623046875, "logps/rejected": -506.959228515625, "loss": 0.0788, "rewards/chosen": 7.503783226013184, "rewards/margins": 16.289525985717773, "rewards/rejected": -8.78574275970459, "step": 1828 }, { "epoch": 0.45765044413862127, "grad_norm": 8.6875, "kl": 3.1323599815368652, "learning_rate": 5e-06, "logits/chosen": -55205690.666666664, "logits/rejected": -47330010.666666664, "logps/chosen": -364.3751627604167, "logps/rejected": -527.6316731770834, "loss": 0.041, "rewards/chosen": 6.439053217569987, "rewards/margins": 19.272939682006836, "rewards/rejected": -12.83388646443685, "step": 1829 }, { "epoch": 0.45790066308019517, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38996614.4, "logits/rejected": -44130907.428571425, "logps/chosen": -374.4339111328125, "logps/rejected": -483.86328125, "loss": 0.0613, "rewards/chosen": 8.183837890625, "rewards/margins": 19.407094682965962, "rewards/rejected": -11.22325679234096, "step": 1830 }, { "epoch": 0.45815088202176907, "grad_norm": 19.625, "kl": 0.17576441168785095, "learning_rate": 5e-06, "logits/chosen": -43133277.86666667, "logits/rejected": -29485104.0, "logps/chosen": -349.8498046875, "logps/rejected": -437.47422960069446, "loss": 0.0462, "rewards/chosen": 7.840579732259115, "rewards/margins": 18.363396708170573, "rewards/rejected": -10.522816975911459, "step": 1831 }, { "epoch": 0.4584011009633429, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29273773.714285713, "logits/rejected": -40448441.6, "logps/chosen": -322.5550013950893, "logps/rejected": -662.237890625, "loss": 0.0233, "rewards/chosen": 6.986337389264788, "rewards/margins": 19.343897356305803, "rewards/rejected": -12.357559967041016, "step": 1832 }, { "epoch": 0.4586513199049168, "grad_norm": 0.8359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40992857.6, "logits/rejected": -43540553.14285714, "logps/chosen": -511.662841796875, "logps/rejected": -554.0796595982143, "loss": 0.0025, "rewards/chosen": 9.118090057373047, "rewards/margins": 22.390919385637556, "rewards/rejected": -13.272829328264509, "step": 1833 }, { "epoch": 0.45890153884649065, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58321179.428571425, "logits/rejected": -48664041.4117647, "logps/chosen": -367.0762416294643, "logps/rejected": -664.0877757352941, "loss": 0.0071, "rewards/chosen": 5.043979099818638, "rewards/margins": 21.837468443798418, "rewards/rejected": -16.79348934397978, "step": 1834 }, { "epoch": 0.45915175778806455, "grad_norm": 9.3125, "kl": 6.099122524261475, "learning_rate": 5e-06, "logits/chosen": -101802730.66666667, "logits/rejected": -64098026.666666664, "logps/chosen": -469.34619140625, "logps/rejected": -743.6424153645834, "loss": 0.0242, "rewards/chosen": 9.84646733601888, "rewards/margins": 24.594205220540367, "rewards/rejected": -14.747737884521484, "step": 1835 }, { "epoch": 0.45940197672963845, "grad_norm": 6.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57251728.0, "logits/rejected": -73646560.0, "logps/chosen": -357.1741129557292, "logps/rejected": -686.609130859375, "loss": 0.0526, "rewards/chosen": 5.172396977742513, "rewards/margins": 18.528693517049152, "rewards/rejected": -13.35629653930664, "step": 1836 }, { "epoch": 0.4596521956712123, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50931328.0, "logits/rejected": -39900739.2, "logps/chosen": -399.63978794642856, "logps/rejected": -420.067626953125, "loss": 0.0513, "rewards/chosen": 7.734361921037946, "rewards/margins": 16.77532719203404, "rewards/rejected": -9.040965270996093, "step": 1837 }, { "epoch": 0.4599024146127862, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32564619.636363637, "logits/rejected": -31510833.230769232, "logps/chosen": -398.4993341619318, "logps/rejected": -636.8825871394231, "loss": 0.0172, "rewards/chosen": 7.572857943448153, "rewards/margins": 20.044916753168707, "rewards/rejected": -12.472058809720552, "step": 1838 }, { "epoch": 0.4601526335543601, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -71099704.8888889, "logits/rejected": -92642628.26666667, "logps/chosen": -349.7503255208333, "logps/rejected": -630.3300130208333, "loss": 0.0282, "rewards/chosen": 5.275037553575304, "rewards/margins": 20.080352698432073, "rewards/rejected": -14.80531514485677, "step": 1839 }, { "epoch": 0.46040285249593393, "grad_norm": 1.7578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73448704.0, "logits/rejected": -54727868.44444445, "logps/chosen": -553.7563802083333, "logps/rejected": -602.5112847222222, "loss": 0.0033, "rewards/chosen": 9.24544677734375, "rewards/margins": 20.629786851671007, "rewards/rejected": -11.384340074327257, "step": 1840 }, { "epoch": 0.46065307143750783, "grad_norm": 2.21875, "kl": 1.5725047588348389, "learning_rate": 5e-06, "logits/chosen": -70169097.84615384, "logits/rejected": -32701984.0, "logps/chosen": -378.1858473557692, "logps/rejected": -668.8263050426136, "loss": 0.0063, "rewards/chosen": 7.302620520958533, "rewards/margins": 22.445614794751148, "rewards/rejected": -15.142994273792613, "step": 1841 }, { "epoch": 0.4609032903790817, "grad_norm": 1.4765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60732713.14285714, "logits/rejected": -54094234.35294118, "logps/chosen": -664.0159040178571, "logps/rejected": -549.5121208639706, "loss": 0.0022, "rewards/chosen": 8.691974094935826, "rewards/margins": 22.68019341220375, "rewards/rejected": -13.988219317267923, "step": 1842 }, { "epoch": 0.4611535093206556, "grad_norm": 3.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24814605.333333332, "logits/rejected": -26440317.333333332, "logps/chosen": -340.81390380859375, "logps/rejected": -412.0260009765625, "loss": 0.026, "rewards/chosen": 5.618303934733073, "rewards/margins": 19.399237314860027, "rewards/rejected": -13.780933380126953, "step": 1843 }, { "epoch": 0.4614037282622295, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35719153.777777776, "logits/rejected": -43465365.333333336, "logps/chosen": -383.1078287760417, "logps/rejected": -853.1214192708334, "loss": 0.0474, "rewards/chosen": 7.898061964246962, "rewards/margins": 25.20655271742079, "rewards/rejected": -17.308490753173828, "step": 1844 }, { "epoch": 0.4616539472038033, "grad_norm": 12.0625, "kl": 0.6608712077140808, "learning_rate": 5e-06, "logits/chosen": -68035444.36363636, "logits/rejected": -53438946.461538464, "logps/chosen": -543.0992542613636, "logps/rejected": -709.5197566105769, "loss": 0.0233, "rewards/chosen": 8.443614612926137, "rewards/margins": 26.665301342944165, "rewards/rejected": -18.22168673001803, "step": 1845 }, { "epoch": 0.4619041661453772, "grad_norm": 2.59375, "kl": 6.9179487228393555, "learning_rate": 5e-06, "logits/chosen": -66434648.615384616, "logits/rejected": -59711185.45454545, "logps/chosen": -411.5178786057692, "logps/rejected": -587.4454456676136, "loss": 0.0269, "rewards/chosen": 9.518671475923979, "rewards/margins": 25.435133767294715, "rewards/rejected": -15.916462291370738, "step": 1846 }, { "epoch": 0.46215438508695106, "grad_norm": 12.1875, "kl": 2.4767978191375732, "learning_rate": 5e-06, "logits/chosen": -63028083.2, "logits/rejected": -107057557.33333333, "logps/chosen": -471.79798177083336, "logps/rejected": -783.8649631076389, "loss": 0.0242, "rewards/chosen": 8.118845113118489, "rewards/margins": 22.72263709174262, "rewards/rejected": -14.603791978624132, "step": 1847 }, { "epoch": 0.46240460402852496, "grad_norm": 20.5, "kl": 3.8112666606903076, "learning_rate": 5e-06, "logits/chosen": -23998481.454545453, "logits/rejected": -36624743.384615384, "logps/chosen": -471.82936789772725, "logps/rejected": -597.1053936298077, "loss": 0.0631, "rewards/chosen": 6.276128595525568, "rewards/margins": 21.98370916193182, "rewards/rejected": -15.70758056640625, "step": 1848 }, { "epoch": 0.46265482297009886, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 143868.0, "logits/rejected": -42434264.615384616, "logps/chosen": -356.240966796875, "logps/rejected": -528.7065054086538, "loss": 0.0753, "rewards/chosen": 6.244314713911577, "rewards/margins": 23.212322902012538, "rewards/rejected": -16.96800818810096, "step": 1849 }, { "epoch": 0.4629050419116727, "grad_norm": 11.375, "kl": 4.972392559051514, "learning_rate": 5e-06, "logits/chosen": -56734173.86666667, "logits/rejected": -76248184.8888889, "logps/chosen": -349.71363932291666, "logps/rejected": -667.8041449652778, "loss": 0.0627, "rewards/chosen": 7.361461385091146, "rewards/margins": 24.032373385959204, "rewards/rejected": -16.670912000868057, "step": 1850 }, { "epoch": 0.4631552608532466, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29821978.666666668, "logits/rejected": -29113394.666666668, "logps/chosen": -413.2227376302083, "logps/rejected": -732.7853190104166, "loss": 0.0349, "rewards/chosen": 6.3348337809244795, "rewards/margins": 22.94544219970703, "rewards/rejected": -16.61060841878255, "step": 1851 }, { "epoch": 0.46340547979482044, "grad_norm": 14.9375, "kl": 8.013051986694336, "learning_rate": 5e-06, "logits/chosen": -53378953.14285714, "logits/rejected": -40966544.0, "logps/chosen": -435.09486607142856, "logps/rejected": -620.365869140625, "loss": 0.0757, "rewards/chosen": 7.492950439453125, "rewards/margins": 19.725806427001952, "rewards/rejected": -12.232855987548827, "step": 1852 }, { "epoch": 0.46365569873639434, "grad_norm": 10.0625, "kl": 2.2254798412323, "learning_rate": 5e-06, "logits/chosen": -63753856.0, "logits/rejected": -41146998.4, "logps/chosen": -391.5137416294643, "logps/rejected": -473.37041015625, "loss": 0.0417, "rewards/chosen": 7.20820563180106, "rewards/margins": 20.717333439418248, "rewards/rejected": -13.509127807617187, "step": 1853 }, { "epoch": 0.46390591767796824, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56538990.54545455, "logits/rejected": -34862993.23076923, "logps/chosen": -447.75874467329544, "logps/rejected": -457.05799278846155, "loss": 0.0332, "rewards/chosen": 8.099980441006748, "rewards/margins": 22.02635561002718, "rewards/rejected": -13.926375169020433, "step": 1854 }, { "epoch": 0.4641561366195421, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38590696.0, "logits/rejected": -57440704.0, "logps/chosen": -256.9342447916667, "logps/rejected": -734.7691243489584, "loss": 0.042, "rewards/chosen": 5.344825744628906, "rewards/margins": 21.600176493326824, "rewards/rejected": -16.255350748697918, "step": 1855 }, { "epoch": 0.464406355561116, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72811689.14285715, "logits/rejected": -45411296.0, "logps/chosen": -425.64310128348217, "logps/rejected": -619.601611328125, "loss": 0.0276, "rewards/chosen": 7.353938511439732, "rewards/margins": 18.278093937465123, "rewards/rejected": -10.92415542602539, "step": 1856 }, { "epoch": 0.4646565745026899, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -94020234.66666667, "logits/rejected": -44557224.0, "logps/chosen": -435.763916015625, "logps/rejected": -684.8262532552084, "loss": 0.0587, "rewards/chosen": 8.143967946370443, "rewards/margins": 21.18489201863607, "rewards/rejected": -13.040924072265625, "step": 1857 }, { "epoch": 0.4649067934442637, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27914455.272727273, "logits/rejected": -42928768.0, "logps/chosen": -230.5587713068182, "logps/rejected": -575.9972956730769, "loss": 0.0404, "rewards/chosen": 6.096482016823509, "rewards/margins": 22.01834293178745, "rewards/rejected": -15.921860914963942, "step": 1858 }, { "epoch": 0.4651570123858376, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57102912.0, "logits/rejected": -64864760.0, "logps/chosen": -341.9047546386719, "logps/rejected": -598.0948486328125, "loss": 0.0262, "rewards/chosen": 8.634907722473145, "rewards/margins": 23.6636381149292, "rewards/rejected": -15.028730392456055, "step": 1859 }, { "epoch": 0.46540723132741146, "grad_norm": 2.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48335844.571428575, "logits/rejected": -44650640.0, "logps/chosen": -367.60630580357144, "logps/rejected": -638.570654296875, "loss": 0.0325, "rewards/chosen": 7.441713605608259, "rewards/margins": 23.476595197405132, "rewards/rejected": -16.034881591796875, "step": 1860 }, { "epoch": 0.46565745026898536, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34463650.90909091, "logits/rejected": -65186673.23076923, "logps/chosen": -380.15354225852275, "logps/rejected": -790.3853665865385, "loss": 0.0351, "rewards/chosen": 6.1895058371803975, "rewards/margins": 18.395468971946023, "rewards/rejected": -12.205963134765625, "step": 1861 }, { "epoch": 0.46590766921055926, "grad_norm": 3.03125, "kl": 4.996633052825928, "learning_rate": 5e-06, "logits/chosen": -52812416.0, "logits/rejected": -33194109.333333332, "logps/chosen": -372.7600911458333, "logps/rejected": -539.4217122395834, "loss": 0.0733, "rewards/chosen": 6.45749028523763, "rewards/margins": 16.92386245727539, "rewards/rejected": -10.46637217203776, "step": 1862 }, { "epoch": 0.4661578881521331, "grad_norm": 6.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53601541.81818182, "logits/rejected": -53796081.23076923, "logps/chosen": -394.84810014204544, "logps/rejected": -628.0455228365385, "loss": 0.005, "rewards/chosen": 8.558661027388139, "rewards/margins": 20.497366471724078, "rewards/rejected": -11.938705444335938, "step": 1863 }, { "epoch": 0.466408107093707, "grad_norm": 13.625, "kl": 3.878955841064453, "learning_rate": 5e-06, "logits/chosen": -57396326.4, "logits/rejected": -59677105.777777776, "logps/chosen": -344.63603515625, "logps/rejected": -523.7227647569445, "loss": 0.068, "rewards/chosen": 7.1005126953125, "rewards/margins": 18.26181165907118, "rewards/rejected": -11.16129896375868, "step": 1864 }, { "epoch": 0.46665832603528085, "grad_norm": 10.0, "kl": 6.232662677764893, "learning_rate": 5e-06, "logits/chosen": -59377280.0, "logits/rejected": -14085264.0, "logps/chosen": -408.01468599759613, "logps/rejected": -692.0333806818181, "loss": 0.0541, "rewards/chosen": 6.857640193058894, "rewards/margins": 18.85916191047722, "rewards/rejected": -12.001521717418324, "step": 1865 }, { "epoch": 0.46690854497685474, "grad_norm": 14.9375, "kl": 2.0003116130828857, "learning_rate": 5e-06, "logits/chosen": -41547254.85714286, "logits/rejected": -31813203.2, "logps/chosen": -468.09915597098217, "logps/rejected": -420.550048828125, "loss": 0.0377, "rewards/chosen": 6.655948093959263, "rewards/margins": 13.600762394496371, "rewards/rejected": -6.944814300537109, "step": 1866 }, { "epoch": 0.46715876391842864, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37455090.666666664, "logits/rejected": -29891192.888888888, "logps/chosen": -406.0683186848958, "logps/rejected": -695.1059027777778, "loss": 0.0578, "rewards/chosen": 5.127570470174153, "rewards/margins": 17.73722775777181, "rewards/rejected": -12.609657287597656, "step": 1867 }, { "epoch": 0.4674089828600025, "grad_norm": 3.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -84758952.0, "logits/rejected": -48354772.0, "logps/chosen": -250.43667602539062, "logps/rejected": -575.507568359375, "loss": 0.0373, "rewards/chosen": 6.526072025299072, "rewards/margins": 19.323596477508545, "rewards/rejected": -12.797524452209473, "step": 1868 }, { "epoch": 0.4676592018015764, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44745929.14285714, "logits/rejected": -40820201.4117647, "logps/chosen": -338.78086635044644, "logps/rejected": -567.5758846507352, "loss": 0.0272, "rewards/chosen": 4.962835039411273, "rewards/margins": 17.51401930095769, "rewards/rejected": -12.551184261546416, "step": 1869 }, { "epoch": 0.4679094207431503, "grad_norm": 5.84375, "kl": 1.981398344039917, "learning_rate": 5e-06, "logits/chosen": -25068525.714285713, "logits/rejected": -62433164.8, "logps/chosen": -508.60477120535717, "logps/rejected": -596.7794921875, "loss": 0.049, "rewards/chosen": 7.745635986328125, "rewards/margins": 21.255525207519533, "rewards/rejected": -13.509889221191406, "step": 1870 }, { "epoch": 0.4681596396847241, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47142089.84615385, "logits/rejected": -44194301.09090909, "logps/chosen": -412.22543569711536, "logps/rejected": -575.4862393465909, "loss": 0.0244, "rewards/chosen": 8.527489295372597, "rewards/margins": 20.028929170195042, "rewards/rejected": -11.501439874822443, "step": 1871 }, { "epoch": 0.468409858626298, "grad_norm": 5.53125, "kl": 2.188936948776245, "learning_rate": 5e-06, "logits/chosen": -27342520.0, "logits/rejected": -44271114.666666664, "logps/chosen": -328.7531331380208, "logps/rejected": -629.1684977213541, "loss": 0.0848, "rewards/chosen": 6.353212992350261, "rewards/margins": 18.740184783935547, "rewards/rejected": -12.386971791585287, "step": 1872 }, { "epoch": 0.46866007756787187, "grad_norm": 4.125, "kl": 2.983301877975464, "learning_rate": 5e-06, "logits/chosen": -54742592.0, "logits/rejected": -32201208.888888888, "logps/chosen": -280.91328125, "logps/rejected": -617.9126519097222, "loss": 0.0137, "rewards/chosen": 7.596306355794271, "rewards/margins": 21.487479824490016, "rewards/rejected": -13.891173468695747, "step": 1873 }, { "epoch": 0.46891029650944577, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31042284.8, "logits/rejected": -43856484.571428575, "logps/chosen": -247.82734375, "logps/rejected": -454.7202845982143, "loss": 0.0427, "rewards/chosen": 5.980234527587891, "rewards/margins": 16.585199737548827, "rewards/rejected": -10.604965209960938, "step": 1874 }, { "epoch": 0.46916051545101967, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38059214.222222224, "logits/rejected": 5302540.8, "logps/chosen": -288.4533962673611, "logps/rejected": -601.8875651041667, "loss": 0.0418, "rewards/chosen": 6.521678924560547, "rewards/margins": 17.7122927347819, "rewards/rejected": -11.190613810221354, "step": 1875 }, { "epoch": 0.4694107343925935, "grad_norm": 14.5, "kl": 4.526261806488037, "learning_rate": 5e-06, "logits/chosen": -60578013.09090909, "logits/rejected": -12723532.307692308, "logps/chosen": -361.40431906960225, "logps/rejected": -453.7809495192308, "loss": 0.0452, "rewards/chosen": 7.557790582830256, "rewards/margins": 15.759252961698946, "rewards/rejected": -8.20146237886869, "step": 1876 }, { "epoch": 0.4696609533341674, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40500568.0, "logits/rejected": -2112438.5, "logps/chosen": -408.1853332519531, "logps/rejected": -595.2637939453125, "loss": 0.0512, "rewards/chosen": 7.1687774658203125, "rewards/margins": 19.724483489990234, "rewards/rejected": -12.555706024169922, "step": 1877 }, { "epoch": 0.46991117227574125, "grad_norm": 26.375, "kl": 4.4974212646484375, "learning_rate": 5e-06, "logits/chosen": -54849408.0, "logits/rejected": -43774193.23076923, "logps/chosen": -391.17116477272725, "logps/rejected": -587.0826697716346, "loss": 0.0498, "rewards/chosen": 8.184157631613992, "rewards/margins": 21.01902936388563, "rewards/rejected": -12.834871732271635, "step": 1878 }, { "epoch": 0.47016139121731515, "grad_norm": 7.96875, "kl": 4.693212032318115, "learning_rate": 5e-06, "logits/chosen": -56772992.0, "logits/rejected": -60292502.85714286, "logps/chosen": -472.859521484375, "logps/rejected": -686.2666015625, "loss": 0.0282, "rewards/chosen": 10.363054656982422, "rewards/margins": 23.50813740321568, "rewards/rejected": -13.145082746233259, "step": 1879 }, { "epoch": 0.47041161015888905, "grad_norm": 6.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49364411.733333334, "logits/rejected": -77578567.1111111, "logps/chosen": -355.53063151041664, "logps/rejected": -746.5803493923611, "loss": 0.0287, "rewards/chosen": 8.154911295572917, "rewards/margins": 26.00656534830729, "rewards/rejected": -17.851654052734375, "step": 1880 }, { "epoch": 0.4706618291004629, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73884384.0, "logits/rejected": -22105738.0, "logps/chosen": -463.17236328125, "logps/rejected": -565.255859375, "loss": 0.0494, "rewards/chosen": 9.413679122924805, "rewards/margins": 21.580289840698242, "rewards/rejected": -12.166610717773438, "step": 1881 }, { "epoch": 0.4709120480420368, "grad_norm": 16.75, "kl": 1.2115046977996826, "learning_rate": 5e-06, "logits/chosen": -55446464.0, "logits/rejected": -32496893.714285713, "logps/chosen": -374.7081787109375, "logps/rejected": -509.57986886160717, "loss": 0.0249, "rewards/chosen": 6.915530395507813, "rewards/margins": 16.933541216169086, "rewards/rejected": -10.018010820661273, "step": 1882 }, { "epoch": 0.47116226698361063, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36450931.692307696, "logits/rejected": -37335895.27272727, "logps/chosen": -320.76370943509613, "logps/rejected": -441.98561789772725, "loss": 0.0274, "rewards/chosen": 6.821241525503305, "rewards/margins": 16.461928254240874, "rewards/rejected": -9.64068672873757, "step": 1883 }, { "epoch": 0.47141248592518453, "grad_norm": 3.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45219225.6, "logits/rejected": -49079730.28571428, "logps/chosen": -255.88115234375, "logps/rejected": -709.9956752232143, "loss": 0.0333, "rewards/chosen": 5.742989730834961, "rewards/margins": 21.973569652012415, "rewards/rejected": -16.230579921177455, "step": 1884 }, { "epoch": 0.47166270486675843, "grad_norm": 1.3828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28300208.0, "logits/rejected": -48176540.44444445, "logps/chosen": -422.6548665364583, "logps/rejected": -536.7003038194445, "loss": 0.0066, "rewards/chosen": 9.325508117675781, "rewards/margins": 23.113004048665367, "rewards/rejected": -13.787495930989584, "step": 1885 }, { "epoch": 0.4719129238083323, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46124534.15384615, "logits/rejected": -43310941.09090909, "logps/chosen": -380.56456580528845, "logps/rejected": -496.02956321022725, "loss": 0.0626, "rewards/chosen": 6.922862126277043, "rewards/margins": 18.361029351507867, "rewards/rejected": -11.438167225230824, "step": 1886 }, { "epoch": 0.4721631427499062, "grad_norm": 6.09375, "kl": 2.2335257530212402, "learning_rate": 5e-06, "logits/chosen": -32178237.714285713, "logits/rejected": -47386048.0, "logps/chosen": -407.6123046875, "logps/rejected": -578.853466796875, "loss": 0.0341, "rewards/chosen": 7.5116473606654575, "rewards/margins": 18.918174307686943, "rewards/rejected": -11.406526947021485, "step": 1887 }, { "epoch": 0.47241336169148007, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59809984.0, "logits/rejected": -48518595.2, "logps/chosen": -290.95765904017856, "logps/rejected": -556.815576171875, "loss": 0.0624, "rewards/chosen": 5.166106632777622, "rewards/margins": 18.504383305140905, "rewards/rejected": -13.338276672363282, "step": 1888 }, { "epoch": 0.4726635806330539, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 7047771.636363637, "logits/rejected": -35451377.23076923, "logps/chosen": -515.8789950284091, "logps/rejected": -563.4643179086538, "loss": 0.0227, "rewards/chosen": 8.004874489524148, "rewards/margins": 20.31007844084626, "rewards/rejected": -12.305203951322115, "step": 1889 }, { "epoch": 0.4729137995746278, "grad_norm": 19.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51887072.0, "logits/rejected": -28549458.285714287, "logps/chosen": -453.22509765625, "logps/rejected": -713.9408482142857, "loss": 0.0455, "rewards/chosen": 9.762258148193359, "rewards/margins": 22.559568023681642, "rewards/rejected": -12.797309875488281, "step": 1890 }, { "epoch": 0.47316401851620166, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12954548.363636363, "logits/rejected": -30745540.923076924, "logps/chosen": -402.88725142045456, "logps/rejected": -431.87289663461536, "loss": 0.037, "rewards/chosen": 6.04262958873402, "rewards/margins": 16.06061265852068, "rewards/rejected": -10.01798306978666, "step": 1891 }, { "epoch": 0.47341423745777556, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36608760.88888889, "logits/rejected": -33135014.4, "logps/chosen": -548.8203125, "logps/rejected": -496.35914713541666, "loss": 0.0943, "rewards/chosen": 6.881097581651476, "rewards/margins": 18.23773210313585, "rewards/rejected": -11.356634521484375, "step": 1892 }, { "epoch": 0.47366445639934945, "grad_norm": 3.96875, "kl": 2.8082737922668457, "learning_rate": 5e-06, "logits/chosen": -57141832.53333333, "logits/rejected": -31109738.666666668, "logps/chosen": -393.179296875, "logps/rejected": -535.7318250868055, "loss": 0.0214, "rewards/chosen": 7.1835683186848955, "rewards/margins": 17.892664591471355, "rewards/rejected": -10.709096272786459, "step": 1893 }, { "epoch": 0.4739146753409233, "grad_norm": 11.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52637640.0, "logits/rejected": -40420796.0, "logps/chosen": -436.98614501953125, "logps/rejected": -508.92376708984375, "loss": 0.0351, "rewards/chosen": 9.695694923400879, "rewards/margins": 20.36480140686035, "rewards/rejected": -10.669106483459473, "step": 1894 }, { "epoch": 0.4741648942824972, "grad_norm": 11.875, "kl": 6.959571361541748, "learning_rate": 5e-06, "logits/chosen": -43281910.85714286, "logits/rejected": -77116787.2, "logps/chosen": -422.927490234375, "logps/rejected": -582.909228515625, "loss": 0.0391, "rewards/chosen": 8.081659589494977, "rewards/margins": 23.404788861955915, "rewards/rejected": -15.323129272460937, "step": 1895 }, { "epoch": 0.47441511322407104, "grad_norm": 10.75, "kl": 2.177305221557617, "learning_rate": 5e-06, "logits/chosen": -40252228.266666666, "logits/rejected": -53974318.222222224, "logps/chosen": -380.80462239583335, "logps/rejected": -474.67540147569446, "loss": 0.0515, "rewards/chosen": 5.971632893880209, "rewards/margins": 16.391041056315103, "rewards/rejected": -10.419408162434896, "step": 1896 }, { "epoch": 0.47466533216564494, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72755219.6923077, "logits/rejected": -53077538.90909091, "logps/chosen": -424.85873647836536, "logps/rejected": -617.0567294034091, "loss": 0.0568, "rewards/chosen": 6.177848229041467, "rewards/margins": 19.813805426750985, "rewards/rejected": -13.635957197709518, "step": 1897 }, { "epoch": 0.47491555110721884, "grad_norm": 17.375, "kl": 18.653339385986328, "learning_rate": 5e-06, "logits/chosen": -42761600.0, "logits/rejected": -22444636.444444444, "logps/chosen": -318.1982747395833, "logps/rejected": -753.7469618055555, "loss": 0.1651, "rewards/chosen": 6.748421732584635, "rewards/margins": 20.402789815266928, "rewards/rejected": -13.654368082682291, "step": 1898 }, { "epoch": 0.4751657700487927, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73855634.28571428, "logits/rejected": -37419958.4, "logps/chosen": -394.91148158482144, "logps/rejected": -360.6666748046875, "loss": 0.0334, "rewards/chosen": 6.17314202444894, "rewards/margins": 17.077735682896204, "rewards/rejected": -10.904593658447265, "step": 1899 }, { "epoch": 0.4754159889903666, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -91424755.2, "logits/rejected": -48867734.85714286, "logps/chosen": -462.37705078125, "logps/rejected": -773.2659040178571, "loss": 0.0199, "rewards/chosen": 8.088202667236327, "rewards/margins": 25.200553131103515, "rewards/rejected": -17.112350463867188, "step": 1900 }, { "epoch": 0.4756662079319404, "grad_norm": 7.4375, "kl": 1.6007335186004639, "learning_rate": 5e-06, "logits/chosen": -47928109.71428572, "logits/rejected": -50502979.2, "logps/chosen": -445.2078334263393, "logps/rejected": -491.758447265625, "loss": 0.0278, "rewards/chosen": 8.313377380371094, "rewards/margins": 18.96755142211914, "rewards/rejected": -10.654174041748046, "step": 1901 }, { "epoch": 0.4759164268735143, "grad_norm": 13.5625, "kl": 11.83216667175293, "learning_rate": 5e-06, "logits/chosen": -43428271.15789474, "logits/rejected": -113903936.0, "logps/chosen": -448.0248252467105, "logps/rejected": -848.2865234375, "loss": 0.0647, "rewards/chosen": 8.114122892680921, "rewards/margins": 28.32719357139186, "rewards/rejected": -20.213070678710938, "step": 1902 }, { "epoch": 0.4761666458150882, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88298419.2, "logits/rejected": -45715977.14285714, "logps/chosen": -356.9390625, "logps/rejected": -617.0769391741071, "loss": 0.0544, "rewards/chosen": 7.975172424316407, "rewards/margins": 21.806593322753905, "rewards/rejected": -13.8314208984375, "step": 1903 }, { "epoch": 0.47641686475666206, "grad_norm": 12.5, "kl": 5.7578444480896, "learning_rate": 5e-06, "logits/chosen": -45758160.84210526, "logits/rejected": -69828940.8, "logps/chosen": -365.9189967105263, "logps/rejected": -615.685595703125, "loss": 0.1247, "rewards/chosen": 6.058263678299753, "rewards/margins": 21.061179632889598, "rewards/rejected": -15.002915954589843, "step": 1904 }, { "epoch": 0.47666708369823596, "grad_norm": 10.5625, "kl": 0.9122282862663269, "learning_rate": 5e-06, "logits/chosen": -55956032.0, "logits/rejected": -26714816.0, "logps/chosen": -449.4697265625, "logps/rejected": -427.7788837139423, "loss": 0.0222, "rewards/chosen": 7.670865145596591, "rewards/margins": 17.5321463204764, "rewards/rejected": -9.861281174879808, "step": 1905 }, { "epoch": 0.47691730263980986, "grad_norm": 7.4375, "kl": 2.0044784545898438, "learning_rate": 5e-06, "logits/chosen": -58131189.333333336, "logits/rejected": -44498549.333333336, "logps/chosen": -353.2483723958333, "logps/rejected": -664.9111735026041, "loss": 0.0781, "rewards/chosen": 6.507879892985026, "rewards/margins": 25.566814422607422, "rewards/rejected": -19.058934529622395, "step": 1906 }, { "epoch": 0.4771675215813837, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62429032.0, "logits/rejected": -59947692.0, "logps/chosen": -297.3627624511719, "logps/rejected": -789.9147338867188, "loss": 0.0745, "rewards/chosen": 6.011932849884033, "rewards/margins": 22.72270441055298, "rewards/rejected": -16.710771560668945, "step": 1907 }, { "epoch": 0.4774177405229576, "grad_norm": 9.375, "kl": 6.982539176940918, "learning_rate": 5e-06, "logits/chosen": -48386732.307692304, "logits/rejected": -75093789.0909091, "logps/chosen": -390.5595703125, "logps/rejected": -592.1262428977273, "loss": 0.0503, "rewards/chosen": 7.237253042367788, "rewards/margins": 19.164231413727876, "rewards/rejected": -11.926978371360086, "step": 1908 }, { "epoch": 0.47766795946453144, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -130208654.22222222, "logits/rejected": -48411485.86666667, "logps/chosen": -421.73076714409723, "logps/rejected": -586.3192057291667, "loss": 0.0604, "rewards/chosen": 6.790916866726345, "rewards/margins": 15.566306474473741, "rewards/rejected": -8.775389607747396, "step": 1909 }, { "epoch": 0.47791817840610534, "grad_norm": 20.625, "kl": 21.40979766845703, "learning_rate": 5e-06, "logits/chosen": -45132957.86666667, "logits/rejected": -54183658.666666664, "logps/chosen": -405.50390625, "logps/rejected": -939.6733940972222, "loss": 0.0741, "rewards/chosen": 9.526466878255208, "rewards/margins": 26.72743191189236, "rewards/rejected": -17.200965033637154, "step": 1910 }, { "epoch": 0.47816839734767924, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46105760.0, "logits/rejected": -64403349.333333336, "logps/chosen": -359.3991292317708, "logps/rejected": -711.50830078125, "loss": 0.0376, "rewards/chosen": 6.146434783935547, "rewards/margins": 22.985132853190105, "rewards/rejected": -16.83869806925456, "step": 1911 }, { "epoch": 0.4784186162892531, "grad_norm": 2.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10282146.0, "logits/rejected": -39385432.0, "logps/chosen": -229.341552734375, "logps/rejected": -561.3908081054688, "loss": 0.0374, "rewards/chosen": 6.122142791748047, "rewards/margins": 19.519729614257812, "rewards/rejected": -13.397586822509766, "step": 1912 }, { "epoch": 0.478668835230827, "grad_norm": 10.8125, "kl": 0.15233168005943298, "learning_rate": 5e-06, "logits/chosen": -38517292.8, "logits/rejected": -32517408.0, "logps/chosen": -593.337158203125, "logps/rejected": -496.22506277901783, "loss": 0.1167, "rewards/chosen": 6.516319274902344, "rewards/margins": 17.675260271344868, "rewards/rejected": -11.158940996442523, "step": 1913 }, { "epoch": 0.4789190541724008, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49710976.0, "logits/rejected": -45232374.15384615, "logps/chosen": -313.15525124289775, "logps/rejected": -847.2007962740385, "loss": 0.0566, "rewards/chosen": 5.561199881813743, "rewards/margins": 25.460400321266867, "rewards/rejected": -19.899200439453125, "step": 1914 }, { "epoch": 0.4791692731139747, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27284509.09090909, "logits/rejected": -36924512.0, "logps/chosen": -373.5554865056818, "logps/rejected": -441.73568960336536, "loss": 0.0395, "rewards/chosen": 7.741558421741832, "rewards/margins": 19.427527847823562, "rewards/rejected": -11.68596942608173, "step": 1915 }, { "epoch": 0.4794194920555486, "grad_norm": 11.75, "kl": 14.188946723937988, "learning_rate": 5e-06, "logits/chosen": -61077848.0, "logits/rejected": -46737404.0, "logps/chosen": -467.231689453125, "logps/rejected": -599.38818359375, "loss": 0.0728, "rewards/chosen": 7.95274019241333, "rewards/margins": 22.40977907180786, "rewards/rejected": -14.457038879394531, "step": 1916 }, { "epoch": 0.47966971099712247, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60779982.222222224, "logits/rejected": -35502632.53333333, "logps/chosen": -519.5782877604166, "logps/rejected": -544.8016927083333, "loss": 0.0119, "rewards/chosen": 9.962235344780815, "rewards/margins": 22.010817294650607, "rewards/rejected": -12.048581949869792, "step": 1917 }, { "epoch": 0.47991992993869637, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57390165.333333336, "logits/rejected": -41360977.06666667, "logps/chosen": -362.00208875868054, "logps/rejected": -663.4918619791666, "loss": 0.0588, "rewards/chosen": 6.329730987548828, "rewards/margins": 23.30272699991862, "rewards/rejected": -16.97299601236979, "step": 1918 }, { "epoch": 0.4801701488802702, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58262745.6, "logits/rejected": -53890171.428571425, "logps/chosen": -216.6924560546875, "logps/rejected": -697.7385602678571, "loss": 0.0723, "rewards/chosen": 4.11395263671875, "rewards/margins": 17.292550223214285, "rewards/rejected": -13.178597586495536, "step": 1919 }, { "epoch": 0.4804203678218441, "grad_norm": 13.375, "kl": 1.8547430038452148, "learning_rate": 5e-06, "logits/chosen": -26991511.272727273, "logits/rejected": -39955524.92307692, "logps/chosen": -337.51376065340907, "logps/rejected": -720.0249399038462, "loss": 0.053, "rewards/chosen": 5.6490395285866475, "rewards/margins": 21.79117397495083, "rewards/rejected": -16.14213444636418, "step": 1920 }, { "epoch": 0.480670586763418, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25023563.636363637, "logits/rejected": -48466638.76923077, "logps/chosen": -238.816650390625, "logps/rejected": -501.7196514423077, "loss": 0.052, "rewards/chosen": 5.699712579900568, "rewards/margins": 17.415908493362107, "rewards/rejected": -11.716195913461538, "step": 1921 }, { "epoch": 0.48092080570499185, "grad_norm": 2.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42263622.4, "logits/rejected": -74208214.85714285, "logps/chosen": -415.3212890625, "logps/rejected": -669.0244140625, "loss": 0.0232, "rewards/chosen": 7.46143798828125, "rewards/margins": 23.398624093191962, "rewards/rejected": -15.937186104910714, "step": 1922 }, { "epoch": 0.48117102464656575, "grad_norm": 13.5, "kl": 6.482516288757324, "learning_rate": 5e-06, "logits/chosen": -69249092.26666667, "logits/rejected": -44832960.0, "logps/chosen": -429.90647786458334, "logps/rejected": -549.9962565104166, "loss": 0.0255, "rewards/chosen": 8.572227986653646, "rewards/margins": 17.960567559136287, "rewards/rejected": -9.38833957248264, "step": 1923 }, { "epoch": 0.48142124358813965, "grad_norm": 12.8125, "kl": 23.512584686279297, "learning_rate": 5e-06, "logits/chosen": -56614925.71428572, "logits/rejected": -76465100.8, "logps/chosen": -505.8662806919643, "logps/rejected": -481.674072265625, "loss": 0.1131, "rewards/chosen": 8.634522574288505, "rewards/margins": 19.620542471749445, "rewards/rejected": -10.986019897460938, "step": 1924 }, { "epoch": 0.4816714625297135, "grad_norm": 5.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43761178.666666664, "logits/rejected": -35434016.0, "logps/chosen": -325.2525634765625, "logps/rejected": -597.0359700520834, "loss": 0.0201, "rewards/chosen": 6.605962753295898, "rewards/margins": 19.074846267700195, "rewards/rejected": -12.468883514404297, "step": 1925 }, { "epoch": 0.4819216814712874, "grad_norm": 5.4375, "kl": 11.590279579162598, "learning_rate": 5e-06, "logits/chosen": -81882971.42857143, "logits/rejected": -62991494.4, "logps/chosen": -440.77469308035717, "logps/rejected": -564.36279296875, "loss": 0.041, "rewards/chosen": 8.702153887067523, "rewards/margins": 19.08459025791713, "rewards/rejected": -10.382436370849609, "step": 1926 }, { "epoch": 0.48217190041286123, "grad_norm": 11.0625, "kl": 3.052659749984741, "learning_rate": 5e-06, "logits/chosen": -65402677.333333336, "logits/rejected": -70461210.66666667, "logps/chosen": -457.2631022135417, "logps/rejected": -650.5652669270834, "loss": 0.0342, "rewards/chosen": 7.065879185994466, "rewards/margins": 20.82480812072754, "rewards/rejected": -13.758928934733072, "step": 1927 }, { "epoch": 0.48242211935443513, "grad_norm": 13.9375, "kl": 4.076589107513428, "learning_rate": 5e-06, "logits/chosen": -70541120.0, "logits/rejected": -81504716.8, "logps/chosen": -409.54227120535717, "logps/rejected": -619.988818359375, "loss": 0.0663, "rewards/chosen": 6.9412705557686945, "rewards/margins": 18.212137712751115, "rewards/rejected": -11.270867156982423, "step": 1928 }, { "epoch": 0.48267233829600903, "grad_norm": 10.25, "kl": 0.19109058380126953, "learning_rate": 5e-06, "logits/chosen": -74265878.85714285, "logits/rejected": -42046764.8, "logps/chosen": -437.51778738839283, "logps/rejected": -462.47861328125, "loss": 0.0436, "rewards/chosen": 7.423160552978516, "rewards/margins": 16.733378601074218, "rewards/rejected": -9.310218048095702, "step": 1929 }, { "epoch": 0.4829225572375829, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61009093.81818182, "logits/rejected": -52788814.76923077, "logps/chosen": -591.8905806107955, "logps/rejected": -755.0114182692307, "loss": 0.0513, "rewards/chosen": 8.715604608709162, "rewards/margins": 21.440873632897862, "rewards/rejected": -12.725269024188702, "step": 1930 }, { "epoch": 0.48317277617915677, "grad_norm": 10.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33286280.727272727, "logits/rejected": -64459854.76923077, "logps/chosen": -426.6640625, "logps/rejected": -761.5040564903846, "loss": 0.0265, "rewards/chosen": 7.638193303888494, "rewards/margins": 24.445725981172146, "rewards/rejected": -16.807532677283653, "step": 1931 }, { "epoch": 0.4834229951207306, "grad_norm": 13.375, "kl": 11.200294494628906, "learning_rate": 5e-06, "logits/chosen": -48218752.0, "logits/rejected": -51606330.18181818, "logps/chosen": -480.3818359375, "logps/rejected": -669.4682173295455, "loss": 0.0557, "rewards/chosen": 8.839434697077824, "rewards/margins": 22.166814790739046, "rewards/rejected": -13.32738009366122, "step": 1932 }, { "epoch": 0.4836732140623045, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25823662.4, "logits/rejected": -57090230.85714286, "logps/chosen": -273.687939453125, "logps/rejected": -592.97265625, "loss": 0.0456, "rewards/chosen": 6.709458923339843, "rewards/margins": 20.446639360700335, "rewards/rejected": -13.737180437360491, "step": 1933 }, { "epoch": 0.4839234330038784, "grad_norm": 1.3515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56556868.92307692, "logits/rejected": -39687138.90909091, "logps/chosen": -465.2841045673077, "logps/rejected": -508.5901988636364, "loss": 0.0144, "rewards/chosen": 9.207007774939903, "rewards/margins": 21.131935919915044, "rewards/rejected": -11.924928144975143, "step": 1934 }, { "epoch": 0.48417365194545225, "grad_norm": 5.9375, "kl": 8.011520385742188, "learning_rate": 5e-06, "logits/chosen": -48447872.0, "logits/rejected": -43923984.0, "logps/chosen": -401.1868489583333, "logps/rejected": -565.0577799479166, "loss": 0.0684, "rewards/chosen": 9.283824920654297, "rewards/margins": 22.88791275024414, "rewards/rejected": -13.604087829589844, "step": 1935 }, { "epoch": 0.48442387088702615, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48362850.90909091, "logits/rejected": -81754210.46153846, "logps/chosen": -409.84033203125, "logps/rejected": -850.5164513221154, "loss": 0.03, "rewards/chosen": 8.232659773393111, "rewards/margins": 26.631907162966428, "rewards/rejected": -18.39924738957332, "step": 1936 }, { "epoch": 0.48467408982860005, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34768073.84615385, "logits/rejected": -44530725.81818182, "logps/chosen": -314.6162672776442, "logps/rejected": -560.1938920454545, "loss": 0.0829, "rewards/chosen": 5.6413726806640625, "rewards/margins": 16.828032753684305, "rewards/rejected": -11.186660073020242, "step": 1937 }, { "epoch": 0.4849243087701739, "grad_norm": 13.5, "kl": 3.664531707763672, "learning_rate": 5e-06, "logits/chosen": -29624058.181818184, "logits/rejected": -32789031.384615384, "logps/chosen": -472.16938920454544, "logps/rejected": -403.0988957331731, "loss": 0.0436, "rewards/chosen": 9.345515858043324, "rewards/margins": 17.692828198412915, "rewards/rejected": -8.34731234036959, "step": 1938 }, { "epoch": 0.4851745277117478, "grad_norm": 8.6875, "kl": 10.540792465209961, "learning_rate": 5e-06, "logits/chosen": -61500834.461538464, "logits/rejected": -38500794.18181818, "logps/chosen": -353.49752103365387, "logps/rejected": -520.1374289772727, "loss": 0.1117, "rewards/chosen": 6.553978553185096, "rewards/margins": 19.465388798213503, "rewards/rejected": -12.911410245028408, "step": 1939 }, { "epoch": 0.48542474665332164, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39404688.0, "logits/rejected": -45555992.0, "logps/chosen": -258.1829528808594, "logps/rejected": -603.4166870117188, "loss": 0.0594, "rewards/chosen": 5.2762451171875, "rewards/margins": 18.531664848327637, "rewards/rejected": -13.255419731140137, "step": 1940 }, { "epoch": 0.48567496559489554, "grad_norm": 2.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44179373.333333336, "logits/rejected": -59915802.666666664, "logps/chosen": -283.96409098307294, "logps/rejected": -467.8128255208333, "loss": 0.023, "rewards/chosen": 6.528441747029622, "rewards/margins": 17.747181574503582, "rewards/rejected": -11.218739827473959, "step": 1941 }, { "epoch": 0.48592518453646943, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63616739.55555555, "logits/rejected": -43949721.6, "logps/chosen": -373.0949978298611, "logps/rejected": -636.1695963541666, "loss": 0.0255, "rewards/chosen": 7.699903700086805, "rewards/margins": 21.558041720920137, "rewards/rejected": -13.858138020833334, "step": 1942 }, { "epoch": 0.4861754034780433, "grad_norm": 1.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56281728.0, "logits/rejected": -75405026.46153846, "logps/chosen": -366.9915660511364, "logps/rejected": -641.7111628605769, "loss": 0.0184, "rewards/chosen": 6.836582530628551, "rewards/margins": 23.438864541220497, "rewards/rejected": -16.602282010591946, "step": 1943 }, { "epoch": 0.4864256224196172, "grad_norm": 5.84375, "kl": 6.260614395141602, "learning_rate": 5e-06, "logits/chosen": -87920914.28571428, "logits/rejected": -46432240.0, "logps/chosen": -464.88804408482144, "logps/rejected": -541.467724609375, "loss": 0.0225, "rewards/chosen": 8.071257999965123, "rewards/margins": 19.90644018990653, "rewards/rejected": -11.835182189941406, "step": 1944 }, { "epoch": 0.486675841361191, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56832442.666666664, "logits/rejected": -58913509.333333336, "logps/chosen": -320.0589192708333, "logps/rejected": -650.1945393880209, "loss": 0.0383, "rewards/chosen": 5.34004275004069, "rewards/margins": 17.321073532104492, "rewards/rejected": -11.981030782063803, "step": 1945 }, { "epoch": 0.4869260603027649, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66299306.666666664, "logits/rejected": -54337945.6, "logps/chosen": -361.333984375, "logps/rejected": -777.0619140625, "loss": 0.047, "rewards/chosen": 6.708567725287543, "rewards/margins": 22.973034074571398, "rewards/rejected": -16.264466349283854, "step": 1946 }, { "epoch": 0.4871762792443388, "grad_norm": 18.75, "kl": 4.923226833343506, "learning_rate": 5e-06, "logits/chosen": -37663815.11111111, "logits/rejected": -41341252.266666666, "logps/chosen": -500.55805121527777, "logps/rejected": -843.296875, "loss": 0.0495, "rewards/chosen": 9.026662190755209, "rewards/margins": 28.73782755533854, "rewards/rejected": -19.711165364583334, "step": 1947 }, { "epoch": 0.48742649818591266, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34642035.2, "logits/rejected": 14962974.857142856, "logps/chosen": -329.342529296875, "logps/rejected": -708.0181361607143, "loss": 0.056, "rewards/chosen": 7.9686744689941404, "rewards/margins": 20.091521889822822, "rewards/rejected": -12.122847420828682, "step": 1948 }, { "epoch": 0.48767671712748656, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59433674.666666664, "logits/rejected": -60323498.666666664, "logps/chosen": -375.8107096354167, "logps/rejected": -669.0751953125, "loss": 0.0325, "rewards/chosen": 5.9443003336588545, "rewards/margins": 25.546641031901043, "rewards/rejected": -19.602340698242188, "step": 1949 }, { "epoch": 0.4879269360690604, "grad_norm": 7.03125, "kl": 7.669111251831055, "learning_rate": 5e-06, "logits/chosen": -63687348.0, "logits/rejected": -67990032.0, "logps/chosen": -385.15252685546875, "logps/rejected": -499.18048095703125, "loss": 0.0626, "rewards/chosen": 7.924552917480469, "rewards/margins": 18.80202293395996, "rewards/rejected": -10.877470016479492, "step": 1950 }, { "epoch": 0.4881771550106343, "grad_norm": 14.6875, "kl": 8.706321716308594, "learning_rate": 5e-06, "logits/chosen": -24244727.466666665, "logits/rejected": -81248618.66666667, "logps/chosen": -469.5697265625, "logps/rejected": -524.0747612847222, "loss": 0.0694, "rewards/chosen": 8.054677836100261, "rewards/margins": 19.121971638997394, "rewards/rejected": -11.067293802897135, "step": 1951 }, { "epoch": 0.4884273739522082, "grad_norm": 0.96484375, "kl": 3.2150962352752686, "learning_rate": 5e-06, "logits/chosen": -54106986.666666664, "logits/rejected": -39224746.666666664, "logps/chosen": -362.5927734375, "logps/rejected": -606.013427734375, "loss": 0.0175, "rewards/chosen": 8.120898564656576, "rewards/margins": 23.28376579284668, "rewards/rejected": -15.162867228190104, "step": 1952 }, { "epoch": 0.48867759289378204, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66858542.54545455, "logits/rejected": -44173307.07692308, "logps/chosen": -385.7620738636364, "logps/rejected": -519.1761568509615, "loss": 0.0137, "rewards/chosen": 7.75482871315696, "rewards/margins": 20.86042422848148, "rewards/rejected": -13.10559551532452, "step": 1953 }, { "epoch": 0.48892781183535594, "grad_norm": 1.625, "kl": 0.40084776282310486, "learning_rate": 5e-06, "logits/chosen": -53397001.14285714, "logits/rejected": -53505478.4, "logps/chosen": -381.13065011160717, "logps/rejected": -505.727734375, "loss": 0.019, "rewards/chosen": 7.013322012765067, "rewards/margins": 20.214681570870535, "rewards/rejected": -13.20135955810547, "step": 1954 }, { "epoch": 0.48917803077692984, "grad_norm": 14.75, "kl": 18.17488670349121, "learning_rate": 5e-06, "logits/chosen": -67558816.0, "logits/rejected": -40184600.0, "logps/chosen": -519.1560872395834, "logps/rejected": -438.814453125, "loss": 0.036, "rewards/chosen": 8.646723429361979, "rewards/margins": 18.27422841389974, "rewards/rejected": -9.62750498453776, "step": 1955 }, { "epoch": 0.4894282497185037, "grad_norm": 6.5625, "kl": 3.30061674118042, "learning_rate": 5e-06, "logits/chosen": -57174459.07692308, "logits/rejected": -76117876.36363636, "logps/chosen": -348.33289513221155, "logps/rejected": -892.8329190340909, "loss": 0.0264, "rewards/chosen": 9.003382756159855, "rewards/margins": 29.31856275438429, "rewards/rejected": -20.315179998224433, "step": 1956 }, { "epoch": 0.4896784686600776, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66881460.0, "logits/rejected": -45749436.0, "logps/chosen": -453.5660400390625, "logps/rejected": -703.0601196289062, "loss": 0.0557, "rewards/chosen": 7.74298620223999, "rewards/margins": 25.60486364364624, "rewards/rejected": -17.86187744140625, "step": 1957 }, { "epoch": 0.4899286876016514, "grad_norm": 6.40625, "kl": 0.15939585864543915, "learning_rate": 5e-06, "logits/chosen": -11980749.714285715, "logits/rejected": -40948902.4, "logps/chosen": -394.297119140625, "logps/rejected": -625.00654296875, "loss": 0.0275, "rewards/chosen": 7.305687495640346, "rewards/margins": 18.470462581089564, "rewards/rejected": -11.164775085449218, "step": 1958 }, { "epoch": 0.4901789065432253, "grad_norm": 3.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47199291.07692308, "logits/rejected": -78185326.54545455, "logps/chosen": -354.5652043269231, "logps/rejected": -527.8115234375, "loss": 0.0135, "rewards/chosen": 6.791297912597656, "rewards/margins": 18.08171844482422, "rewards/rejected": -11.290420532226562, "step": 1959 }, { "epoch": 0.4904291254847992, "grad_norm": 2.421875, "kl": 3.4636759757995605, "learning_rate": 5e-06, "logits/chosen": -80187410.28571428, "logits/rejected": -47529510.4, "logps/chosen": -383.0457240513393, "logps/rejected": -501.22373046875, "loss": 0.0223, "rewards/chosen": 7.857319423130581, "rewards/margins": 18.69271981375558, "rewards/rejected": -10.835400390625, "step": 1960 }, { "epoch": 0.49067934442637307, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25855681.6, "logits/rejected": -37850109.71428572, "logps/chosen": -209.073876953125, "logps/rejected": -435.82167271205356, "loss": 0.0441, "rewards/chosen": 5.124552154541016, "rewards/margins": 14.8501097542899, "rewards/rejected": -9.725557599748884, "step": 1961 }, { "epoch": 0.49092956336794696, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63210777.6, "logits/rejected": -67468355.36842105, "logps/chosen": -318.7895263671875, "logps/rejected": -604.8444181743421, "loss": 0.0188, "rewards/chosen": 7.164218902587891, "rewards/margins": 21.044686367637233, "rewards/rejected": -13.880467465049342, "step": 1962 }, { "epoch": 0.4911797823095208, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88177973.33333333, "logits/rejected": -39010042.666666664, "logps/chosen": -372.9269205729167, "logps/rejected": -445.6918538411458, "loss": 0.0318, "rewards/chosen": 7.533425649007161, "rewards/margins": 16.910661061604817, "rewards/rejected": -9.377235412597656, "step": 1963 }, { "epoch": 0.4914300012510947, "grad_norm": 6.46875, "kl": 5.331838130950928, "learning_rate": 5e-06, "logits/chosen": -44472232.0, "logits/rejected": -54512184.0, "logps/chosen": -345.8945007324219, "logps/rejected": -556.1705932617188, "loss": 0.029, "rewards/chosen": 7.838643550872803, "rewards/margins": 19.6385817527771, "rewards/rejected": -11.799938201904297, "step": 1964 }, { "epoch": 0.4916802201926686, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61190560.0, "logits/rejected": -77380697.6, "logps/chosen": -363.66971261160717, "logps/rejected": -760.0359375, "loss": 0.0576, "rewards/chosen": 6.642707824707031, "rewards/margins": 22.61170196533203, "rewards/rejected": -15.968994140625, "step": 1965 }, { "epoch": 0.49193043913424245, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -93126902.85714285, "logits/rejected": -34372385.88235294, "logps/chosen": -489.9926060267857, "logps/rejected": -647.1927849264706, "loss": 0.0144, "rewards/chosen": 7.906374250139509, "rewards/margins": 24.71354264972591, "rewards/rejected": -16.8071683995864, "step": 1966 }, { "epoch": 0.49218065807581635, "grad_norm": 2.234375, "kl": 11.527244567871094, "learning_rate": 5e-06, "logits/chosen": -49295921.23076923, "logits/rejected": -51179287.27272727, "logps/chosen": -473.7185246394231, "logps/rejected": -741.1678355823864, "loss": 0.0572, "rewards/chosen": 8.49020268366887, "rewards/margins": 25.79953440419444, "rewards/rejected": -17.309331720525567, "step": 1967 }, { "epoch": 0.4924308770173902, "grad_norm": 6.4375, "kl": 7.9568562507629395, "learning_rate": 5e-06, "logits/chosen": -44239261.538461536, "logits/rejected": -90056104.72727273, "logps/chosen": -401.8523512620192, "logps/rejected": -591.1574041193181, "loss": 0.0349, "rewards/chosen": 9.188364469088041, "rewards/margins": 19.93035360483023, "rewards/rejected": -10.741989135742188, "step": 1968 }, { "epoch": 0.4926810959589641, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40718944.0, "logits/rejected": -55282658.13333333, "logps/chosen": -358.61957465277777, "logps/rejected": -620.6891927083333, "loss": 0.0287, "rewards/chosen": 6.3852255079481335, "rewards/margins": 19.769775475396052, "rewards/rejected": -13.384549967447917, "step": 1969 }, { "epoch": 0.492931314900538, "grad_norm": 3.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50345029.81818182, "logits/rejected": -38168100.92307692, "logps/chosen": -392.61714311079544, "logps/rejected": -634.2292668269231, "loss": 0.0216, "rewards/chosen": 7.754757274280895, "rewards/margins": 21.897193375167312, "rewards/rejected": -14.142436100886417, "step": 1970 }, { "epoch": 0.49318153384211183, "grad_norm": 14.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68432320.0, "logits/rejected": -63216464.0, "logps/chosen": -475.3986002604167, "logps/rejected": -652.4186197916666, "loss": 0.0765, "rewards/chosen": 8.245744705200195, "rewards/margins": 20.585823694864906, "rewards/rejected": -12.340078989664713, "step": 1971 }, { "epoch": 0.49343175278368573, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42423616.0, "logits/rejected": -74559064.61538461, "logps/chosen": -392.2444957386364, "logps/rejected": -526.3907752403846, "loss": 0.0742, "rewards/chosen": 6.848864468661222, "rewards/margins": 17.008814138132376, "rewards/rejected": -10.159949669471153, "step": 1972 }, { "epoch": 0.4936819717252596, "grad_norm": 14.3125, "kl": 0.4036343991756439, "learning_rate": 5e-06, "logits/chosen": -60969107.692307696, "logits/rejected": -863456.0, "logps/chosen": -399.7034254807692, "logps/rejected": -692.6775568181819, "loss": 0.0413, "rewards/chosen": 7.455442575307993, "rewards/margins": 21.191575110375464, "rewards/rejected": -13.73613253506747, "step": 1973 }, { "epoch": 0.49393219066683347, "grad_norm": 7.375, "kl": 0.2901446223258972, "learning_rate": 5e-06, "logits/chosen": -35197733.333333336, "logits/rejected": 598925.3333333334, "logps/chosen": -291.9503173828125, "logps/rejected": -550.6531982421875, "loss": 0.0565, "rewards/chosen": 5.831199010213216, "rewards/margins": 19.764504114786785, "rewards/rejected": -13.933305104573568, "step": 1974 }, { "epoch": 0.49418240960840737, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42601444.571428575, "logits/rejected": -20320771.2, "logps/chosen": -435.77406529017856, "logps/rejected": -835.53564453125, "loss": 0.0453, "rewards/chosen": 7.779127938406808, "rewards/margins": 24.131999642508372, "rewards/rejected": -16.352871704101563, "step": 1975 }, { "epoch": 0.4944326285499812, "grad_norm": 2.078125, "kl": 13.637643814086914, "learning_rate": 5e-06, "logits/chosen": -29151579.42857143, "logits/rejected": -50622313.6, "logps/chosen": -624.2164481026786, "logps/rejected": -493.77509765625, "loss": 0.0042, "rewards/chosen": 8.846729278564453, "rewards/margins": 23.465697479248046, "rewards/rejected": -14.618968200683593, "step": 1976 }, { "epoch": 0.4946828474915551, "grad_norm": 6.34375, "kl": 0.8876008987426758, "learning_rate": 5e-06, "logits/chosen": -67830213.81818181, "logits/rejected": -42516332.307692304, "logps/chosen": -292.06906960227275, "logps/rejected": -532.1829552283654, "loss": 0.051, "rewards/chosen": 6.310279846191406, "rewards/margins": 18.150384169358475, "rewards/rejected": -11.840104323167067, "step": 1977 }, { "epoch": 0.494933066433129, "grad_norm": 5.375, "kl": 3.130004405975342, "learning_rate": 5e-06, "logits/chosen": -40128618.666666664, "logits/rejected": -32163418.666666668, "logps/chosen": -316.1538899739583, "logps/rejected": -486.29296875, "loss": 0.0323, "rewards/chosen": 6.9225114186604815, "rewards/margins": 17.675873438517254, "rewards/rejected": -10.753362019856771, "step": 1978 }, { "epoch": 0.49518328537470285, "grad_norm": 14.6875, "kl": 5.886826992034912, "learning_rate": 5e-06, "logits/chosen": -20914996.57142857, "logits/rejected": -45919126.4, "logps/chosen": -492.88302176339283, "logps/rejected": -595.14013671875, "loss": 0.0303, "rewards/chosen": 8.33594730922154, "rewards/margins": 21.781900678362163, "rewards/rejected": -13.445953369140625, "step": 1979 }, { "epoch": 0.49543350431627675, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19213648.0, "logits/rejected": -53516053.333333336, "logps/chosen": -290.16554768880206, "logps/rejected": -396.4081759982639, "loss": 0.0548, "rewards/chosen": 6.331467310587565, "rewards/margins": 16.151762008666992, "rewards/rejected": -9.820294698079428, "step": 1980 }, { "epoch": 0.4956837232578506, "grad_norm": 2.125, "kl": 0.9412371516227722, "learning_rate": 5e-06, "logits/chosen": -33133102.545454547, "logits/rejected": -42555091.692307696, "logps/chosen": -383.1585582386364, "logps/rejected": -778.8347355769231, "loss": 0.0211, "rewards/chosen": 7.866940585049716, "rewards/margins": 28.230656123661493, "rewards/rejected": -20.36371553861178, "step": 1981 }, { "epoch": 0.4959339421994245, "grad_norm": 21.125, "kl": 5.886059761047363, "learning_rate": 5e-06, "logits/chosen": -44564475.733333334, "logits/rejected": -56324736.0, "logps/chosen": -362.37526041666666, "logps/rejected": -527.3142903645834, "loss": 0.0896, "rewards/chosen": 7.147821044921875, "rewards/margins": 16.558714803059896, "rewards/rejected": -9.410893758138021, "step": 1982 }, { "epoch": 0.4961841611409984, "grad_norm": 4.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63586801.777777776, "logits/rejected": -47936507.733333334, "logps/chosen": -450.5864529079861, "logps/rejected": -594.8279947916667, "loss": 0.0085, "rewards/chosen": 8.634791056315104, "rewards/margins": 20.617007446289062, "rewards/rejected": -11.982216389973958, "step": 1983 }, { "epoch": 0.49643438008257224, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17682774.0, "logits/rejected": -31746654.0, "logps/chosen": -301.0936279296875, "logps/rejected": -722.8466796875, "loss": 0.0425, "rewards/chosen": 5.871728897094727, "rewards/margins": 22.531436920166016, "rewards/rejected": -16.65970802307129, "step": 1984 }, { "epoch": 0.49668459902414613, "grad_norm": 9.625, "kl": 1.0102704763412476, "learning_rate": 5e-06, "logits/chosen": -61187015.11111111, "logits/rejected": -22196563.2, "logps/chosen": -425.5768229166667, "logps/rejected": -422.61656901041664, "loss": 0.0347, "rewards/chosen": 6.393941667344835, "rewards/margins": 16.379167090521918, "rewards/rejected": -9.985225423177083, "step": 1985 }, { "epoch": 0.49693481796572003, "grad_norm": 20.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38365064.0, "logits/rejected": -44917064.0, "logps/chosen": -369.53790283203125, "logps/rejected": -490.88641357421875, "loss": 0.0978, "rewards/chosen": 6.231435775756836, "rewards/margins": 18.247509956359863, "rewards/rejected": -12.016074180603027, "step": 1986 }, { "epoch": 0.4971850369072939, "grad_norm": 9.1875, "kl": 14.78561019897461, "learning_rate": 5e-06, "logits/chosen": -59467207.52941176, "logits/rejected": -30995396.57142857, "logps/chosen": -438.0150505514706, "logps/rejected": -405.97715541294644, "loss": 0.0343, "rewards/chosen": 8.787280811982995, "rewards/margins": 20.649233457421055, "rewards/rejected": -11.861952645438057, "step": 1987 }, { "epoch": 0.4974352558488678, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 35594452.571428575, "logits/rejected": -53441200.0, "logps/chosen": -405.2622767857143, "logps/rejected": -757.37607421875, "loss": 0.067, "rewards/chosen": 6.157471793038504, "rewards/margins": 25.20892137799944, "rewards/rejected": -19.051449584960938, "step": 1988 }, { "epoch": 0.4976854747904416, "grad_norm": 4.65625, "kl": 7.945725440979004, "learning_rate": 5e-06, "logits/chosen": -59762077.538461536, "logits/rejected": -71713675.63636364, "logps/chosen": -399.8107346754808, "logps/rejected": -514.9894797585227, "loss": 0.0225, "rewards/chosen": 9.071091871995192, "rewards/margins": 22.92500838699874, "rewards/rejected": -13.85391651500355, "step": 1989 }, { "epoch": 0.4979356937320155, "grad_norm": 18.875, "kl": 17.173019409179688, "learning_rate": 5e-06, "logits/chosen": -52575830.5882353, "logits/rejected": -241204.2857142857, "logps/chosen": -356.5748506433824, "logps/rejected": -499.3059779575893, "loss": 0.1246, "rewards/chosen": 8.117691040039062, "rewards/margins": 21.96437726702009, "rewards/rejected": -13.846686226981026, "step": 1990 }, { "epoch": 0.4981859126735894, "grad_norm": 0.58203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64148534.15384615, "logits/rejected": -47892320.0, "logps/chosen": -441.45718149038464, "logps/rejected": -647.1665482954545, "loss": 0.0017, "rewards/chosen": 8.416517404409555, "rewards/margins": 21.908062648106288, "rewards/rejected": -13.491545243696732, "step": 1991 }, { "epoch": 0.49843613161516326, "grad_norm": 4.46875, "kl": 5.377076148986816, "learning_rate": 5e-06, "logits/chosen": -42565051.07692308, "logits/rejected": -17316741.818181816, "logps/chosen": -358.4328425480769, "logps/rejected": -471.4782049005682, "loss": 0.0728, "rewards/chosen": 8.14966055063101, "rewards/margins": 17.532327398553598, "rewards/rejected": -9.382666847922586, "step": 1992 }, { "epoch": 0.49868635055673716, "grad_norm": 5.375, "kl": 5.9898295402526855, "learning_rate": 5e-06, "logits/chosen": -35864762.18181818, "logits/rejected": -42256571.07692308, "logps/chosen": -440.42063210227275, "logps/rejected": -526.6383338341346, "loss": 0.0277, "rewards/chosen": 9.14466580477628, "rewards/margins": 23.007597276380846, "rewards/rejected": -13.862931471604567, "step": 1993 }, { "epoch": 0.498936569498311, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46151726.54545455, "logits/rejected": -47048753.23076923, "logps/chosen": -355.19797585227275, "logps/rejected": -576.1336388221154, "loss": 0.04, "rewards/chosen": 7.001333063299006, "rewards/margins": 21.493592162232297, "rewards/rejected": -14.492259098933292, "step": 1994 }, { "epoch": 0.4991867884398849, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51479978.666666664, "logits/rejected": -61658325.333333336, "logps/chosen": -606.735595703125, "logps/rejected": -779.0166829427084, "loss": 0.0287, "rewards/chosen": 11.5701904296875, "rewards/margins": 31.145165761311848, "rewards/rejected": -19.574975331624348, "step": 1995 }, { "epoch": 0.4994370073814588, "grad_norm": 7.4375, "kl": 8.853483200073242, "learning_rate": 5e-06, "logits/chosen": -51958464.0, "logits/rejected": -60710789.333333336, "logps/chosen": -515.9022216796875, "logps/rejected": -586.2477213541666, "loss": 0.0234, "rewards/chosen": 9.4551633199056, "rewards/margins": 20.219174702962242, "rewards/rejected": -10.76401138305664, "step": 1996 }, { "epoch": 0.49968722632303264, "grad_norm": 2.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32874407.384615384, "logits/rejected": -75908282.18181819, "logps/chosen": -456.2880108173077, "logps/rejected": -907.9139737215909, "loss": 0.019, "rewards/chosen": 7.912331214317908, "rewards/margins": 30.723405211121886, "rewards/rejected": -22.811073996803977, "step": 1997 }, { "epoch": 0.49993744526460654, "grad_norm": 6.34375, "kl": 2.842404842376709, "learning_rate": 5e-06, "logits/chosen": -47341802.666666664, "logits/rejected": -58670986.666666664, "logps/chosen": -395.780517578125, "logps/rejected": -681.2534586588541, "loss": 0.0434, "rewards/chosen": 6.299989700317383, "rewards/margins": 21.99212328592936, "rewards/rejected": -15.692133585611979, "step": 1998 }, { "epoch": 0.5001876642061804, "grad_norm": 8.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35002752.0, "logits/rejected": -44018730.666666664, "logps/chosen": -334.6203884548611, "logps/rejected": -600.12734375, "loss": 0.0395, "rewards/chosen": 7.109553866916233, "rewards/margins": 18.406961907280817, "rewards/rejected": -11.297408040364584, "step": 1999 }, { "epoch": 0.5004378831477543, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28822954.666666668, "logits/rejected": -49734112.0, "logps/chosen": -418.2074381510417, "logps/rejected": -654.93212890625, "loss": 0.0391, "rewards/chosen": 6.591269810994466, "rewards/margins": 22.38782564798991, "rewards/rejected": -15.796555836995443, "step": 2000 }, { "epoch": 0.5006881020893281, "grad_norm": 10.875, "kl": 8.29981803894043, "learning_rate": 5e-06, "logits/chosen": -62043316.0, "logits/rejected": -36093300.0, "logps/chosen": -356.12384033203125, "logps/rejected": -431.7716979980469, "loss": 0.1011, "rewards/chosen": 6.4548139572143555, "rewards/margins": 15.90286922454834, "rewards/rejected": -9.448055267333984, "step": 2001 }, { "epoch": 0.5009383210309021, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88491146.66666667, "logits/rejected": -59119781.333333336, "logps/chosen": -416.1300862630208, "logps/rejected": -584.5294189453125, "loss": 0.0329, "rewards/chosen": 5.221451123555501, "rewards/margins": 15.168587048848469, "rewards/rejected": -9.947135925292969, "step": 2002 }, { "epoch": 0.5011885399724759, "grad_norm": 13.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27908538.666666668, "logits/rejected": -31083162.666666668, "logps/chosen": -452.5336507161458, "logps/rejected": -594.95458984375, "loss": 0.0497, "rewards/chosen": 8.167304992675781, "rewards/margins": 20.021286010742188, "rewards/rejected": -11.853981018066406, "step": 2003 }, { "epoch": 0.5014387589140498, "grad_norm": 10.5, "kl": 6.473819732666016, "learning_rate": 5e-06, "logits/chosen": -16932506.666666668, "logits/rejected": 21848204.0, "logps/chosen": -271.5944010416667, "logps/rejected": -641.4624837239584, "loss": 0.0809, "rewards/chosen": 5.018838564554851, "rewards/margins": 18.212578137715656, "rewards/rejected": -13.193739573160807, "step": 2004 }, { "epoch": 0.5016889778556237, "grad_norm": 15.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32873629.714285713, "logits/rejected": -43406836.705882356, "logps/chosen": -282.13436453683033, "logps/rejected": -462.07327090992646, "loss": 0.0798, "rewards/chosen": 6.133634294782366, "rewards/margins": 17.492008497735032, "rewards/rejected": -11.358374202952666, "step": 2005 }, { "epoch": 0.5019391967971976, "grad_norm": 1.1640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34588810.666666664, "logits/rejected": -50462997.333333336, "logps/chosen": -363.9585367838542, "logps/rejected": -590.8803168402778, "loss": 0.0121, "rewards/chosen": 6.077334721883138, "rewards/margins": 20.481845219930012, "rewards/rejected": -14.404510498046875, "step": 2006 }, { "epoch": 0.5021894157387714, "grad_norm": 4.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63939432.72727273, "logits/rejected": -69679330.46153846, "logps/chosen": -363.16397372159093, "logps/rejected": -631.08203125, "loss": 0.0172, "rewards/chosen": 9.501785278320312, "rewards/margins": 21.453001755934494, "rewards/rejected": -11.951216477614183, "step": 2007 }, { "epoch": 0.5024396346803452, "grad_norm": 16.125, "kl": 17.85320472717285, "learning_rate": 5e-06, "logits/chosen": -38214148.266666666, "logits/rejected": -57787456.0, "logps/chosen": -459.0707682291667, "logps/rejected": -340.9117838541667, "loss": 0.0921, "rewards/chosen": 9.660264078776041, "rewards/margins": 17.345203993055556, "rewards/rejected": -7.684939914279514, "step": 2008 }, { "epoch": 0.5026898536219192, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68243192.8888889, "logits/rejected": -56420445.86666667, "logps/chosen": -362.98084852430554, "logps/rejected": -614.2248697916667, "loss": 0.0592, "rewards/chosen": 7.397639804416233, "rewards/margins": 19.512221103244357, "rewards/rejected": -12.114581298828124, "step": 2009 }, { "epoch": 0.502940072563493, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48743476.0, "logits/rejected": -53746816.0, "logps/chosen": -371.12890625, "logps/rejected": -726.1782836914062, "loss": 0.0079, "rewards/chosen": 7.997684478759766, "rewards/margins": 22.762825965881348, "rewards/rejected": -14.765141487121582, "step": 2010 }, { "epoch": 0.5031902915050669, "grad_norm": 6.90625, "kl": 2.2826151847839355, "learning_rate": 5e-06, "logits/chosen": -40138432.0, "logits/rejected": -69950112.0, "logps/chosen": -360.39334542410717, "logps/rejected": -735.17119140625, "loss": 0.0229, "rewards/chosen": 7.797163827078683, "rewards/margins": 21.45153089250837, "rewards/rejected": -13.654367065429687, "step": 2011 }, { "epoch": 0.5034405104466408, "grad_norm": 2.0625, "kl": 1.4323346614837646, "learning_rate": 5e-06, "logits/chosen": -34315081.14285714, "logits/rejected": -32883216.0, "logps/chosen": -356.52650669642856, "logps/rejected": -497.866796875, "loss": 0.0327, "rewards/chosen": 6.861260550362723, "rewards/margins": 18.47960695539202, "rewards/rejected": -11.618346405029296, "step": 2012 }, { "epoch": 0.5036907293882147, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60717030.4, "logits/rejected": -24762134.85714286, "logps/chosen": -274.72998046875, "logps/rejected": -653.2473493303571, "loss": 0.0301, "rewards/chosen": 6.775099182128907, "rewards/margins": 18.330494035993304, "rewards/rejected": -11.555394853864398, "step": 2013 }, { "epoch": 0.5039409483297885, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35934493.09090909, "logits/rejected": -35348482.461538464, "logps/chosen": -417.06889204545456, "logps/rejected": -524.5434945913462, "loss": 0.0204, "rewards/chosen": 8.025335138494318, "rewards/margins": 19.958689442881337, "rewards/rejected": -11.93335430438702, "step": 2014 }, { "epoch": 0.5041911672713625, "grad_norm": 1.4453125, "kl": 3.996346950531006, "learning_rate": 5e-06, "logits/chosen": -48889696.0, "logits/rejected": -64489328.0, "logps/chosen": -447.9976399739583, "logps/rejected": -877.7978515625, "loss": 0.0152, "rewards/chosen": 10.134397506713867, "rewards/margins": 28.927006403605144, "rewards/rejected": -18.792608896891277, "step": 2015 }, { "epoch": 0.5044413862129363, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61865528.88888889, "logits/rejected": -29935398.4, "logps/chosen": -311.564697265625, "logps/rejected": -534.73193359375, "loss": 0.0648, "rewards/chosen": 5.945961422390408, "rewards/margins": 17.955091264512802, "rewards/rejected": -12.009129842122396, "step": 2016 }, { "epoch": 0.5046916051545102, "grad_norm": 28.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33599274.666666664, "logits/rejected": 91955317.33333333, "logps/chosen": -422.0048828125, "logps/rejected": -523.9694010416666, "loss": 0.0798, "rewards/chosen": 6.966581344604492, "rewards/margins": 19.22781308492025, "rewards/rejected": -12.261231740315756, "step": 2017 }, { "epoch": 0.5049418240960841, "grad_norm": 25.25, "kl": 1.4314804077148438, "learning_rate": 5e-06, "logits/chosen": -63897024.0, "logits/rejected": -61581915.428571425, "logps/chosen": -302.193115234375, "logps/rejected": -598.8289620535714, "loss": 0.0957, "rewards/chosen": 4.997477722167969, "rewards/margins": 17.78181871686663, "rewards/rejected": -12.784340994698661, "step": 2018 }, { "epoch": 0.505192043037658, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49644432.0, "logits/rejected": -66965136.0, "logps/chosen": -383.0497131347656, "logps/rejected": -719.7156982421875, "loss": 0.0449, "rewards/chosen": 7.8711838722229, "rewards/margins": 24.239969730377197, "rewards/rejected": -16.368785858154297, "step": 2019 }, { "epoch": 0.5054422619792318, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74944413.0909091, "logits/rejected": -52082097.23076923, "logps/chosen": -351.8477672230114, "logps/rejected": -565.2352764423077, "loss": 0.0367, "rewards/chosen": 6.84285458651456, "rewards/margins": 18.091562284456266, "rewards/rejected": -11.248707697941708, "step": 2020 }, { "epoch": 0.5056924809208057, "grad_norm": 10.125, "kl": 4.104727745056152, "learning_rate": 5e-06, "logits/chosen": -42825546.666666664, "logits/rejected": -36853813.333333336, "logps/chosen": -332.12904866536456, "logps/rejected": -645.9530029296875, "loss": 0.042, "rewards/chosen": 7.933237075805664, "rewards/margins": 23.382646560668945, "rewards/rejected": -15.449409484863281, "step": 2021 }, { "epoch": 0.5059426998623796, "grad_norm": 6.375, "kl": 10.881041526794434, "learning_rate": 5e-06, "logits/chosen": -80109138.28571428, "logits/rejected": -19778137.6, "logps/chosen": -488.80772181919644, "logps/rejected": -511.5826171875, "loss": 0.03, "rewards/chosen": 9.987566266741071, "rewards/margins": 20.486856733049663, "rewards/rejected": -10.499290466308594, "step": 2022 }, { "epoch": 0.5061929188039535, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53890227.2, "logits/rejected": -51967369.14285714, "logps/chosen": -367.3782958984375, "logps/rejected": -646.3998325892857, "loss": 0.0366, "rewards/chosen": 7.980458068847656, "rewards/margins": 21.58571537562779, "rewards/rejected": -13.605257306780134, "step": 2023 }, { "epoch": 0.5064431377455273, "grad_norm": 5.5, "kl": 3.007319211959839, "learning_rate": 5e-06, "logits/chosen": -55539352.615384616, "logits/rejected": -48158848.0, "logps/chosen": -310.32376802884613, "logps/rejected": -535.9944957386364, "loss": 0.0474, "rewards/chosen": 7.036426250751202, "rewards/margins": 18.452961368160647, "rewards/rejected": -11.416535117409445, "step": 2024 }, { "epoch": 0.5066933566871012, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28783595.636363637, "logits/rejected": -41406532.92307692, "logps/chosen": -171.83851207386363, "logps/rejected": -435.20350060096155, "loss": 0.1057, "rewards/chosen": 4.301420038396662, "rewards/margins": 13.661863153631037, "rewards/rejected": -9.360443115234375, "step": 2025 }, { "epoch": 0.5069435756286751, "grad_norm": 3.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39268667.428571425, "logits/rejected": -67537318.4, "logps/chosen": -432.4606236049107, "logps/rejected": -710.16943359375, "loss": 0.0307, "rewards/chosen": 9.236327035086495, "rewards/margins": 24.188023812430245, "rewards/rejected": -14.95169677734375, "step": 2026 }, { "epoch": 0.5071937945702489, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53462573.71428572, "logits/rejected": -63127747.76470588, "logps/chosen": -372.2268763950893, "logps/rejected": -638.5193014705883, "loss": 0.0411, "rewards/chosen": 8.204526628766741, "rewards/margins": 23.67129837164358, "rewards/rejected": -15.466771742876839, "step": 2027 }, { "epoch": 0.5074440135118229, "grad_norm": 2.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35760066.28571428, "logits/rejected": -26253728.0, "logps/chosen": -440.37890625, "logps/rejected": -667.30751953125, "loss": 0.0023, "rewards/chosen": 8.752478463309151, "rewards/margins": 27.178850228445867, "rewards/rejected": -18.426371765136718, "step": 2028 }, { "epoch": 0.5076942324533967, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 821124.8, "logits/rejected": -56179086.222222224, "logps/chosen": -506.10341796875, "logps/rejected": -800.9060329861111, "loss": 0.0724, "rewards/chosen": 8.281163533528646, "rewards/margins": 29.901548597547745, "rewards/rejected": -21.620385064019096, "step": 2029 }, { "epoch": 0.5079444513949706, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66115121.23076923, "logits/rejected": -50619438.54545455, "logps/chosen": -347.91068209134613, "logps/rejected": -673.9528142755681, "loss": 0.0366, "rewards/chosen": 6.4918694129356975, "rewards/margins": 18.84308650943783, "rewards/rejected": -12.35121709650213, "step": 2030 }, { "epoch": 0.5081946703365445, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55228066.461538464, "logits/rejected": -52335022.54545455, "logps/chosen": -339.15076622596155, "logps/rejected": -522.8496537642045, "loss": 0.0248, "rewards/chosen": 6.22974865253155, "rewards/margins": 17.15734639201131, "rewards/rejected": -10.927597739479758, "step": 2031 }, { "epoch": 0.5084448892781184, "grad_norm": 3.765625, "kl": 12.808765411376953, "learning_rate": 5e-06, "logits/chosen": -40207765.333333336, "logits/rejected": 86596992.0, "logps/chosen": -354.2416015625, "logps/rejected": -518.2223307291666, "loss": 0.0224, "rewards/chosen": 7.719915771484375, "rewards/margins": 17.23291965060764, "rewards/rejected": -9.513003879123264, "step": 2032 }, { "epoch": 0.5086951082196922, "grad_norm": 7.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -420356.92307692306, "logits/rejected": -34414941.09090909, "logps/chosen": -429.4865910456731, "logps/rejected": -723.2416548295455, "loss": 0.0427, "rewards/chosen": 8.53240732046274, "rewards/margins": 23.635215492515297, "rewards/rejected": -15.102808172052557, "step": 2033 }, { "epoch": 0.5089453271612661, "grad_norm": 7.8125, "kl": 2.299046516418457, "learning_rate": 5e-06, "logits/chosen": -51760022.85714286, "logits/rejected": -32765296.0, "logps/chosen": -376.28501674107144, "logps/rejected": -745.83251953125, "loss": 0.0844, "rewards/chosen": 6.571990966796875, "rewards/margins": 18.112261962890624, "rewards/rejected": -11.54027099609375, "step": 2034 }, { "epoch": 0.50919554610284, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67609898.66666667, "logits/rejected": -70024391.1111111, "logps/chosen": -399.21741536458336, "logps/rejected": -579.0771484375, "loss": 0.0729, "rewards/chosen": 5.416941833496094, "rewards/margins": 17.47944607204861, "rewards/rejected": -12.062504238552517, "step": 2035 }, { "epoch": 0.5094457650444139, "grad_norm": 17.125, "kl": 0.2859668731689453, "learning_rate": 5e-06, "logits/chosen": -40583288.615384616, "logits/rejected": -39466955.63636363, "logps/chosen": -307.37235201322113, "logps/rejected": -400.85591264204544, "loss": 0.0692, "rewards/chosen": 7.486266502967248, "rewards/margins": 17.037972296868173, "rewards/rejected": -9.551705793900924, "step": 2036 }, { "epoch": 0.5096959839859877, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43549829.81818182, "logits/rejected": -62685607.384615384, "logps/chosen": -398.8631036931818, "logps/rejected": -847.6011117788462, "loss": 0.0233, "rewards/chosen": 8.451771129261363, "rewards/margins": 31.233131462043815, "rewards/rejected": -22.78136033278245, "step": 2037 }, { "epoch": 0.5099462029275617, "grad_norm": 8.25, "kl": 19.794681549072266, "learning_rate": 5e-06, "logits/chosen": -35493848.0, "logits/rejected": -40526560.0, "logps/chosen": -458.5340881347656, "logps/rejected": -448.6490478515625, "loss": 0.0208, "rewards/chosen": 9.440528869628906, "rewards/margins": 20.14570426940918, "rewards/rejected": -10.705175399780273, "step": 2038 }, { "epoch": 0.5101964218691355, "grad_norm": 3.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46224173.71428572, "logits/rejected": -58699584.0, "logps/chosen": -246.23092215401786, "logps/rejected": -530.59921875, "loss": 0.0311, "rewards/chosen": 5.910852704729352, "rewards/margins": 18.63238285609654, "rewards/rejected": -12.721530151367187, "step": 2039 }, { "epoch": 0.5104466408107093, "grad_norm": 14.875, "kl": 5.487802028656006, "learning_rate": 5e-06, "logits/chosen": -25693824.0, "logits/rejected": -40154648.0, "logps/chosen": -240.2078653971354, "logps/rejected": -496.9247639973958, "loss": 0.0797, "rewards/chosen": 5.557026545206706, "rewards/margins": 18.78697395324707, "rewards/rejected": -13.229947408040365, "step": 2040 }, { "epoch": 0.5106968597522833, "grad_norm": 1.0234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32680176.0, "logits/rejected": -56139400.0, "logps/chosen": -373.472412109375, "logps/rejected": -556.17333984375, "loss": 0.0157, "rewards/chosen": 8.535799026489258, "rewards/margins": 19.874250411987305, "rewards/rejected": -11.338451385498047, "step": 2041 }, { "epoch": 0.5109470786938571, "grad_norm": 7.03125, "kl": 7.283808708190918, "learning_rate": 5e-06, "logits/chosen": -64955293.538461536, "logits/rejected": -45175784.72727273, "logps/chosen": -460.93716195913464, "logps/rejected": -663.1183416193181, "loss": 0.0513, "rewards/chosen": 8.791025015024038, "rewards/margins": 24.588980614722193, "rewards/rejected": -15.797955599698154, "step": 2042 }, { "epoch": 0.511197297635431, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51452258.90909091, "logits/rejected": -54850702.76923077, "logps/chosen": -410.3224431818182, "logps/rejected": -798.9809194711538, "loss": 0.0201, "rewards/chosen": 7.662064292214134, "rewards/margins": 26.145699827821105, "rewards/rejected": -18.48363553560697, "step": 2043 }, { "epoch": 0.5114475165770049, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54354225.23076923, "logits/rejected": -38532581.81818182, "logps/chosen": -351.4650691105769, "logps/rejected": -525.7906161221591, "loss": 0.0583, "rewards/chosen": 7.728520320012019, "rewards/margins": 19.008498985450583, "rewards/rejected": -11.279978665438565, "step": 2044 }, { "epoch": 0.5116977355185788, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74954264.61538461, "logits/rejected": -51075781.81818182, "logps/chosen": -371.7453425480769, "logps/rejected": -639.94384765625, "loss": 0.0676, "rewards/chosen": 6.025587228628305, "rewards/margins": 21.92118659386268, "rewards/rejected": -15.895599365234375, "step": 2045 }, { "epoch": 0.5119479544601526, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54220776.72727273, "logits/rejected": -43516169.84615385, "logps/chosen": -443.47305575284093, "logps/rejected": -526.1463341346154, "loss": 0.0471, "rewards/chosen": 8.625660289417613, "rewards/margins": 20.94533661528901, "rewards/rejected": -12.319676325871395, "step": 2046 }, { "epoch": 0.5121981734017265, "grad_norm": 8.5625, "kl": 4.593562126159668, "learning_rate": 5e-06, "logits/chosen": -69933260.8, "logits/rejected": -38521984.0, "logps/chosen": -486.873388671875, "logps/rejected": -424.69332449776783, "loss": 0.0224, "rewards/chosen": 8.199452972412109, "rewards/margins": 18.57932379586356, "rewards/rejected": -10.379870823451451, "step": 2047 }, { "epoch": 0.5124483923433004, "grad_norm": 6.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42196930.90909091, "logits/rejected": -44479606.15384615, "logps/chosen": -394.0731090198864, "logps/rejected": -615.8425105168269, "loss": 0.0194, "rewards/chosen": 7.7432098388671875, "rewards/margins": 20.490415132962738, "rewards/rejected": -12.747205294095552, "step": 2048 }, { "epoch": 0.5126986112848743, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48264870.4, "logits/rejected": -24333981.714285713, "logps/chosen": -422.91513671875, "logps/rejected": -516.4100516183036, "loss": 0.0331, "rewards/chosen": 6.503352355957031, "rewards/margins": 19.403905378069197, "rewards/rejected": -12.900553022112165, "step": 2049 }, { "epoch": 0.5129488302264481, "grad_norm": 12.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36412550.4, "logits/rejected": -49880891.428571425, "logps/chosen": -313.1418701171875, "logps/rejected": -665.1412527901786, "loss": 0.0547, "rewards/chosen": 5.760654449462891, "rewards/margins": 20.082562582833425, "rewards/rejected": -14.321908133370536, "step": 2050 }, { "epoch": 0.5131990491680221, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64515987.2, "logits/rejected": -50961792.0, "logps/chosen": -337.335400390625, "logps/rejected": -521.5563616071429, "loss": 0.044, "rewards/chosen": 6.767932891845703, "rewards/margins": 21.28472409929548, "rewards/rejected": -14.516791207449776, "step": 2051 }, { "epoch": 0.5134492681095959, "grad_norm": 1.3359375, "kl": 3.0731735229492188, "learning_rate": 5e-06, "logits/chosen": -38545834.666666664, "logits/rejected": -61840931.55555555, "logps/chosen": -508.8123046875, "logps/rejected": -941.9448784722222, "loss": 0.0076, "rewards/chosen": 8.936802164713542, "rewards/margins": 27.5049072265625, "rewards/rejected": -18.568105061848957, "step": 2052 }, { "epoch": 0.5136994870511697, "grad_norm": 12.8125, "kl": 11.261308670043945, "learning_rate": 5e-06, "logits/chosen": -43652386.461538464, "logits/rejected": -56759604.36363637, "logps/chosen": -282.27768179086536, "logps/rejected": -532.0268110795455, "loss": 0.0636, "rewards/chosen": 6.440862215482271, "rewards/margins": 18.121812033486535, "rewards/rejected": -11.680949818004262, "step": 2053 }, { "epoch": 0.5139497059927437, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41187464.0, "logits/rejected": -71249920.0, "logps/chosen": -351.71087646484375, "logps/rejected": -545.562255859375, "loss": 0.0532, "rewards/chosen": 6.041684627532959, "rewards/margins": 18.375075817108154, "rewards/rejected": -12.333391189575195, "step": 2054 }, { "epoch": 0.5141999249343175, "grad_norm": 6.625, "kl": 3.3547236919403076, "learning_rate": 5e-06, "logits/chosen": -41794619.07692308, "logits/rejected": -45772352.0, "logps/chosen": -351.91793118990387, "logps/rejected": -344.22509765625, "loss": 0.0198, "rewards/chosen": 7.891488882211538, "rewards/margins": 18.213418813852165, "rewards/rejected": -10.321929931640625, "step": 2055 }, { "epoch": 0.5144501438758914, "grad_norm": 4.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67769017.6, "logits/rejected": -76123876.57142857, "logps/chosen": -490.85673828125, "logps/rejected": -585.3030831473214, "loss": 0.0453, "rewards/chosen": 9.409085083007813, "rewards/margins": 20.580262538364956, "rewards/rejected": -11.171177455357142, "step": 2056 }, { "epoch": 0.5147003628174652, "grad_norm": 7.25, "kl": 9.187297821044922, "learning_rate": 5e-06, "logits/chosen": -45276402.28571428, "logits/rejected": -56935846.4, "logps/chosen": -340.0748814174107, "logps/rejected": -610.36640625, "loss": 0.0604, "rewards/chosen": 6.934781210763114, "rewards/margins": 23.833253805977957, "rewards/rejected": -16.898472595214844, "step": 2057 }, { "epoch": 0.5149505817590392, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23492884.363636363, "logits/rejected": -53470119.384615384, "logps/chosen": -313.67891068892044, "logps/rejected": -657.5831580528846, "loss": 0.0174, "rewards/chosen": 8.40437039462003, "rewards/margins": 25.54604942481835, "rewards/rejected": -17.14167903019832, "step": 2058 }, { "epoch": 0.515200800700613, "grad_norm": 7.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47798482.28571428, "logits/rejected": -100592416.0, "logps/chosen": -387.53651646205356, "logps/rejected": -547.970703125, "loss": 0.0578, "rewards/chosen": 5.625618525913784, "rewards/margins": 19.87854058401925, "rewards/rejected": -14.252922058105469, "step": 2059 }, { "epoch": 0.5154510196421869, "grad_norm": 1.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49639776.0, "logits/rejected": -62037486.93333333, "logps/chosen": -511.66807725694446, "logps/rejected": -579.0899739583333, "loss": 0.003, "rewards/chosen": 9.693017747667101, "rewards/margins": 23.81736060248481, "rewards/rejected": -14.124342854817709, "step": 2060 }, { "epoch": 0.5157012385837608, "grad_norm": 8.5, "kl": 6.546737194061279, "learning_rate": 5e-06, "logits/chosen": -51830784.0, "logits/rejected": -30175987.2, "logps/chosen": -367.98458426339283, "logps/rejected": -424.18037109375, "loss": 0.0702, "rewards/chosen": 6.4643434797014505, "rewards/margins": 15.797923496791295, "rewards/rejected": -9.333580017089844, "step": 2061 }, { "epoch": 0.5159514575253347, "grad_norm": 8.8125, "kl": 5.988561153411865, "learning_rate": 5e-06, "logits/chosen": -79898926.54545455, "logits/rejected": -58028150.15384615, "logps/chosen": -494.3445490056818, "logps/rejected": -751.4121844951923, "loss": 0.0427, "rewards/chosen": 9.317793412642045, "rewards/margins": 24.091482095785075, "rewards/rejected": -14.773688683143028, "step": 2062 }, { "epoch": 0.5162016764669085, "grad_norm": 16.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76869869.71428572, "logits/rejected": -19193912.0, "logps/chosen": -540.5588727678571, "logps/rejected": -628.03740234375, "loss": 0.0497, "rewards/chosen": 8.155299595424108, "rewards/margins": 20.52025854928153, "rewards/rejected": -12.364958953857421, "step": 2063 }, { "epoch": 0.5164518954084825, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33451402.666666668, "logits/rejected": 2564219.111111111, "logps/chosen": -269.3422037760417, "logps/rejected": -494.13335503472223, "loss": 0.0402, "rewards/chosen": 6.611663818359375, "rewards/margins": 20.561937967936196, "rewards/rejected": -13.950274149576822, "step": 2064 }, { "epoch": 0.5167021143500563, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62068270.54545455, "logits/rejected": -36287640.615384616, "logps/chosen": -492.9459339488636, "logps/rejected": -502.58484825721155, "loss": 0.0232, "rewards/chosen": 9.643128828568893, "rewards/margins": 21.2342930506993, "rewards/rejected": -11.59116422213041, "step": 2065 }, { "epoch": 0.5169523332916302, "grad_norm": 7.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33438892.8, "logits/rejected": -54511712.0, "logps/chosen": -324.513671875, "logps/rejected": -589.3586077008929, "loss": 0.0342, "rewards/chosen": 6.082374191284179, "rewards/margins": 20.959342575073244, "rewards/rejected": -14.876968383789062, "step": 2066 }, { "epoch": 0.5172025522332041, "grad_norm": 1.421875, "kl": 1.1260770559310913, "learning_rate": 5e-06, "logits/chosen": -45328661.333333336, "logits/rejected": -50588261.333333336, "logps/chosen": -423.48388671875, "logps/rejected": -680.9405110677084, "loss": 0.0136, "rewards/chosen": 9.108111911349827, "rewards/margins": 22.621267530653213, "rewards/rejected": -13.513155619303385, "step": 2067 }, { "epoch": 0.517452771174778, "grad_norm": 2.28125, "kl": 10.033530235290527, "learning_rate": 5e-06, "logits/chosen": -33700582.4, "logits/rejected": -26727598.222222224, "logps/chosen": -370.86516927083335, "logps/rejected": -451.0963541666667, "loss": 0.0587, "rewards/chosen": 7.3603159586588545, "rewards/margins": 20.19067552354601, "rewards/rejected": -12.830359564887154, "step": 2068 }, { "epoch": 0.5177029901163518, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46369206.85714286, "logits/rejected": -30676254.11764706, "logps/chosen": -451.41678292410717, "logps/rejected": -511.03044577205884, "loss": 0.0365, "rewards/chosen": 7.5332810538155695, "rewards/margins": 17.88381493191759, "rewards/rejected": -10.350533878102022, "step": 2069 }, { "epoch": 0.5179532090579256, "grad_norm": 16.25, "kl": 14.29393196105957, "learning_rate": 5e-06, "logits/chosen": -81056467.2, "logits/rejected": 44746034.28571428, "logps/chosen": -462.827685546875, "logps/rejected": -653.1022600446429, "loss": 0.0368, "rewards/chosen": 8.668092346191406, "rewards/margins": 22.96399187360491, "rewards/rejected": -14.295899527413505, "step": 2070 }, { "epoch": 0.5182034279994996, "grad_norm": 26.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38648620.8, "logits/rejected": -28611117.714285713, "logps/chosen": -303.7293212890625, "logps/rejected": -453.70078822544644, "loss": 0.084, "rewards/chosen": 5.249457550048828, "rewards/margins": 13.974338858468194, "rewards/rejected": -8.724881308419365, "step": 2071 }, { "epoch": 0.5184536469410734, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40474459.428571425, "logits/rejected": -40719462.4, "logps/chosen": -307.95113699776783, "logps/rejected": -543.447802734375, "loss": 0.0574, "rewards/chosen": 7.5041689191545755, "rewards/margins": 18.16812918526786, "rewards/rejected": -10.663960266113282, "step": 2072 }, { "epoch": 0.5187038658826473, "grad_norm": 19.25, "kl": 8.714259147644043, "learning_rate": 5e-06, "logits/chosen": -54971684.571428575, "logits/rejected": -59505945.6, "logps/chosen": -436.32693917410717, "logps/rejected": -552.689794921875, "loss": 0.0456, "rewards/chosen": 8.786645071847099, "rewards/margins": 20.13851732526507, "rewards/rejected": -11.35187225341797, "step": 2073 }, { "epoch": 0.5189540848242212, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54045696.0, "logits/rejected": -58517474.461538464, "logps/chosen": -356.51045365767044, "logps/rejected": -623.7421123798077, "loss": 0.0339, "rewards/chosen": 8.108573219992898, "rewards/margins": 21.41727260776333, "rewards/rejected": -13.308699387770433, "step": 2074 }, { "epoch": 0.5192043037657951, "grad_norm": 4.59375, "kl": 6.460572719573975, "learning_rate": 5e-06, "logits/chosen": -45779033.6, "logits/rejected": -28082402.285714287, "logps/chosen": -470.540869140625, "logps/rejected": -618.8392857142857, "loss": 0.0274, "rewards/chosen": 8.77569808959961, "rewards/margins": 19.769512830461775, "rewards/rejected": -10.993814740862165, "step": 2075 }, { "epoch": 0.5194545227073689, "grad_norm": 5.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72792339.6923077, "logits/rejected": -69264535.27272727, "logps/chosen": -430.19884314903845, "logps/rejected": -590.5677379261364, "loss": 0.0235, "rewards/chosen": 8.495459336500902, "rewards/margins": 20.33566236162519, "rewards/rejected": -11.84020302512429, "step": 2076 }, { "epoch": 0.5197047416489429, "grad_norm": 17.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24482441.6, "logits/rejected": -43688827.428571425, "logps/chosen": -282.1775390625, "logps/rejected": -944.1690848214286, "loss": 0.0398, "rewards/chosen": 7.059893798828125, "rewards/margins": 26.580968366350447, "rewards/rejected": -19.521074567522323, "step": 2077 }, { "epoch": 0.5199549605905167, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40506048.0, "logits/rejected": -49761493.333333336, "logps/chosen": -414.41216362847223, "logps/rejected": -496.12151692708335, "loss": 0.0422, "rewards/chosen": 7.368762546115452, "rewards/margins": 17.844663831922745, "rewards/rejected": -10.475901285807291, "step": 2078 }, { "epoch": 0.5202051795320906, "grad_norm": 6.84375, "kl": 0.05200608819723129, "learning_rate": 5e-06, "logits/chosen": -90009856.0, "logits/rejected": -40311389.538461536, "logps/chosen": -425.7457386363636, "logps/rejected": -548.8005183293269, "loss": 0.0169, "rewards/chosen": 7.362829728560015, "rewards/margins": 21.22670255007444, "rewards/rejected": -13.863872821514423, "step": 2079 }, { "epoch": 0.5204553984736645, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32624028.444444444, "logits/rejected": -37109038.93333333, "logps/chosen": -282.2387424045139, "logps/rejected": -639.5348958333333, "loss": 0.0205, "rewards/chosen": 7.308046976725261, "rewards/margins": 22.494525655110678, "rewards/rejected": -15.186478678385416, "step": 2080 }, { "epoch": 0.5207056174152384, "grad_norm": 9.125, "kl": 3.9720003604888916, "learning_rate": 5e-06, "logits/chosen": -55842612.0, "logits/rejected": -56257104.0, "logps/chosen": -394.1968994140625, "logps/rejected": -588.953857421875, "loss": 0.0374, "rewards/chosen": 7.986486434936523, "rewards/margins": 19.265053749084473, "rewards/rejected": -11.27856731414795, "step": 2081 }, { "epoch": 0.5209558363568122, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34274884.0, "logits/rejected": -49217208.0, "logps/chosen": -346.99847412109375, "logps/rejected": -617.9710083007812, "loss": 0.029, "rewards/chosen": 6.280170440673828, "rewards/margins": 23.859987258911133, "rewards/rejected": -17.579816818237305, "step": 2082 }, { "epoch": 0.521206055298386, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42358688.0, "logits/rejected": -23891298.666666668, "logps/chosen": -344.8580729166667, "logps/rejected": -586.412841796875, "loss": 0.0234, "rewards/chosen": 7.743685404459636, "rewards/margins": 22.881924947102863, "rewards/rejected": -15.138239542643229, "step": 2083 }, { "epoch": 0.52145627423996, "grad_norm": 5.46875, "kl": 2.381242275238037, "learning_rate": 5e-06, "logits/chosen": -53220164.92307692, "logits/rejected": -64389312.0, "logps/chosen": -442.71225210336536, "logps/rejected": -647.2072088068181, "loss": 0.0319, "rewards/chosen": 8.017402062049278, "rewards/margins": 21.902012324833372, "rewards/rejected": -13.884610262784092, "step": 2084 }, { "epoch": 0.5217064931815338, "grad_norm": 1.828125, "kl": 1.9587072134017944, "learning_rate": 5e-06, "logits/chosen": -68653096.72727273, "logits/rejected": -55428913.23076923, "logps/chosen": -413.34419389204544, "logps/rejected": -612.1852463942307, "loss": 0.0213, "rewards/chosen": 8.338054310191762, "rewards/margins": 22.31072229772181, "rewards/rejected": -13.972667987530048, "step": 2085 }, { "epoch": 0.5219567121231077, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28742156.0, "logits/rejected": -40883200.0, "logps/chosen": -403.5440673828125, "logps/rejected": -475.2159729003906, "loss": 0.0597, "rewards/chosen": 6.060738563537598, "rewards/margins": 17.26920223236084, "rewards/rejected": -11.208463668823242, "step": 2086 }, { "epoch": 0.5222069310646816, "grad_norm": 10.8125, "kl": 13.573837280273438, "learning_rate": 5e-06, "logits/chosen": -80758272.0, "logits/rejected": -26096030.4, "logps/chosen": -418.25362723214283, "logps/rejected": -697.20859375, "loss": 0.0468, "rewards/chosen": 7.9937270028250555, "rewards/margins": 23.341607557024275, "rewards/rejected": -15.347880554199218, "step": 2087 }, { "epoch": 0.5224571500062555, "grad_norm": 8.4375, "kl": 4.225133419036865, "learning_rate": 5e-06, "logits/chosen": -54701484.307692304, "logits/rejected": -28954170.181818184, "logps/chosen": -413.9116962139423, "logps/rejected": -425.50905539772725, "loss": 0.025, "rewards/chosen": 8.783909724308895, "rewards/margins": 19.46287035108446, "rewards/rejected": -10.678960626775568, "step": 2088 }, { "epoch": 0.5227073689478293, "grad_norm": 16.125, "kl": 6.540717601776123, "learning_rate": 5e-06, "logits/chosen": -29450290.0, "logits/rejected": -33228566.0, "logps/chosen": -310.2848205566406, "logps/rejected": -503.5518493652344, "loss": 0.1057, "rewards/chosen": 5.003633499145508, "rewards/margins": 18.72876739501953, "rewards/rejected": -13.725133895874023, "step": 2089 }, { "epoch": 0.5229575878894033, "grad_norm": 18.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47189081.6, "logits/rejected": -51009380.571428575, "logps/chosen": -434.4208984375, "logps/rejected": -621.5802176339286, "loss": 0.0627, "rewards/chosen": 8.424483489990234, "rewards/margins": 23.781891196114678, "rewards/rejected": -15.357407706124443, "step": 2090 }, { "epoch": 0.5232078068309771, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29001192.727272727, "logits/rejected": -37672637.538461536, "logps/chosen": -318.8658558238636, "logps/rejected": -454.56201171875, "loss": 0.0326, "rewards/chosen": 5.821513782848012, "rewards/margins": 17.469102659425538, "rewards/rejected": -11.647588876577524, "step": 2091 }, { "epoch": 0.523458025772551, "grad_norm": 4.59375, "kl": 2.3009250164031982, "learning_rate": 5e-06, "logits/chosen": -61417113.6, "logits/rejected": -58015021.71428572, "logps/chosen": -589.58125, "logps/rejected": -683.4371512276786, "loss": 0.0264, "rewards/chosen": 9.70254898071289, "rewards/margins": 26.999474116734095, "rewards/rejected": -17.296925136021205, "step": 2092 }, { "epoch": 0.5237082447141248, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 36963522.666666664, "logits/rejected": -38203008.0, "logps/chosen": -427.2663981119792, "logps/rejected": -546.7637261284722, "loss": 0.0367, "rewards/chosen": 7.949426015218099, "rewards/margins": 22.361718495686848, "rewards/rejected": -14.41229248046875, "step": 2093 }, { "epoch": 0.5239584636556988, "grad_norm": 11.6875, "kl": 8.629111289978027, "learning_rate": 5e-06, "logits/chosen": -34492838.85714286, "logits/rejected": -5663905.6, "logps/chosen": -411.48789760044644, "logps/rejected": -557.04755859375, "loss": 0.0369, "rewards/chosen": 8.349563598632812, "rewards/margins": 22.080616760253907, "rewards/rejected": -13.731053161621094, "step": 2094 }, { "epoch": 0.5242086825972726, "grad_norm": 3.953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51616777.84615385, "logits/rejected": -7341293.818181818, "logps/chosen": -420.189453125, "logps/rejected": -432.1419122869318, "loss": 0.0299, "rewards/chosen": 8.60259775015024, "rewards/margins": 21.33293055821132, "rewards/rejected": -12.73033280806108, "step": 2095 }, { "epoch": 0.5244589015388464, "grad_norm": 16.625, "kl": 4.806304931640625, "learning_rate": 5e-06, "logits/chosen": -62126126.54545455, "logits/rejected": -53115431.384615384, "logps/chosen": -375.15371981534093, "logps/rejected": -758.2982271634615, "loss": 0.0542, "rewards/chosen": 7.729522011496804, "rewards/margins": 22.659683707710744, "rewards/rejected": -14.930161696213942, "step": 2096 }, { "epoch": 0.5247091204804204, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32007594.666666668, "logits/rejected": -26637034.666666668, "logps/chosen": -370.8730061848958, "logps/rejected": -709.4248860677084, "loss": 0.0556, "rewards/chosen": 6.582632700602214, "rewards/margins": 18.42068862915039, "rewards/rejected": -11.838055928548178, "step": 2097 }, { "epoch": 0.5249593394219942, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53731357.09090909, "logits/rejected": -60406360.615384616, "logps/chosen": -444.51100852272725, "logps/rejected": -551.7612680288462, "loss": 0.0258, "rewards/chosen": 6.653153159401634, "rewards/margins": 18.796559954023028, "rewards/rejected": -12.143406794621395, "step": 2098 }, { "epoch": 0.5252095583635681, "grad_norm": 6.125, "kl": 14.444297790527344, "learning_rate": 5e-06, "logits/chosen": -58547131.733333334, "logits/rejected": -2612326.222222222, "logps/chosen": -520.5162109375, "logps/rejected": -502.16948784722223, "loss": 0.0158, "rewards/chosen": 9.365028889973958, "rewards/margins": 25.255394490559894, "rewards/rejected": -15.890365600585938, "step": 2099 }, { "epoch": 0.525459777305142, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49314140.44444445, "logits/rejected": -52897715.2, "logps/chosen": -287.5767415364583, "logps/rejected": -596.5087890625, "loss": 0.024, "rewards/chosen": 7.003359476725261, "rewards/margins": 21.92371368408203, "rewards/rejected": -14.920354207356771, "step": 2100 }, { "epoch": 0.5257099962467159, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39520059.07692308, "logits/rejected": -45963450.18181818, "logps/chosen": -376.18130258413464, "logps/rejected": -704.6801313920455, "loss": 0.0243, "rewards/chosen": 7.614232576810396, "rewards/margins": 24.37515232112858, "rewards/rejected": -16.760919744318183, "step": 2101 }, { "epoch": 0.5259602151882897, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32568985.6, "logits/rejected": -50713101.71428572, "logps/chosen": -357.2099609375, "logps/rejected": -492.99239676339283, "loss": 0.0231, "rewards/chosen": 7.878208923339844, "rewards/margins": 20.923835972377233, "rewards/rejected": -13.045627049037389, "step": 2102 }, { "epoch": 0.5262104341298637, "grad_norm": 3.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88828380.44444445, "logits/rejected": -65719266.13333333, "logps/chosen": -535.8365885416666, "logps/rejected": -583.1625, "loss": 0.0066, "rewards/chosen": 9.50605689154731, "rewards/margins": 24.21681942409939, "rewards/rejected": -14.710762532552083, "step": 2103 }, { "epoch": 0.5264606530714375, "grad_norm": 8.4375, "kl": 3.560891628265381, "learning_rate": 5e-06, "logits/chosen": -68230980.92307693, "logits/rejected": -55327778.90909091, "logps/chosen": -428.39321664663464, "logps/rejected": -760.7004616477273, "loss": 0.012, "rewards/chosen": 7.89062734750601, "rewards/margins": 26.79240374798541, "rewards/rejected": -18.901776400479402, "step": 2104 }, { "epoch": 0.5267108720130114, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53851953.23076923, "logits/rejected": -63369972.36363637, "logps/chosen": -437.61245492788464, "logps/rejected": -507.2228338068182, "loss": 0.0748, "rewards/chosen": 7.3579277625450725, "rewards/margins": 16.9631828894982, "rewards/rejected": -9.605255126953125, "step": 2105 }, { "epoch": 0.5269610909545852, "grad_norm": 4.78125, "kl": 2.536426544189453, "learning_rate": 5e-06, "logits/chosen": -46098952.72727273, "logits/rejected": -53464743.384615384, "logps/chosen": -401.6966441761364, "logps/rejected": -516.7514272836538, "loss": 0.0437, "rewards/chosen": 9.960128090598367, "rewards/margins": 21.041033711466756, "rewards/rejected": -11.08090562086839, "step": 2106 }, { "epoch": 0.5272113098961592, "grad_norm": 9.75, "kl": 8.824483871459961, "learning_rate": 5e-06, "logits/chosen": -36492864.0, "logits/rejected": -19210400.0, "logps/chosen": -263.2241962139423, "logps/rejected": -480.88671875, "loss": 0.0876, "rewards/chosen": 5.928091195913462, "rewards/margins": 19.284822263917725, "rewards/rejected": -13.356731068004262, "step": 2107 }, { "epoch": 0.527461528837733, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50913319.384615384, "logits/rejected": -48499389.09090909, "logps/chosen": -446.46304086538464, "logps/rejected": -612.26416015625, "loss": 0.0142, "rewards/chosen": 8.980013333834135, "rewards/margins": 24.598332758550043, "rewards/rejected": -15.618319424715908, "step": 2108 }, { "epoch": 0.5277117477793069, "grad_norm": 3.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35225565.538461536, "logits/rejected": -44093003.63636363, "logps/chosen": -348.47329477163464, "logps/rejected": -610.2667347301136, "loss": 0.0254, "rewards/chosen": 6.221956693209135, "rewards/margins": 20.824971552495356, "rewards/rejected": -14.60301485928622, "step": 2109 }, { "epoch": 0.5279619667208808, "grad_norm": 3.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37615616.0, "logits/rejected": -54655044.92307692, "logps/chosen": -352.99740323153407, "logps/rejected": -589.56201171875, "loss": 0.0119, "rewards/chosen": 7.750507701526988, "rewards/margins": 24.41873115592903, "rewards/rejected": -16.668223454402042, "step": 2110 }, { "epoch": 0.5282121856624546, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65892876.8, "logits/rejected": -14716937.142857144, "logps/chosen": -400.0557373046875, "logps/rejected": -519.3572126116071, "loss": 0.0102, "rewards/chosen": 7.787289428710937, "rewards/margins": 22.163644191196987, "rewards/rejected": -14.376354762486049, "step": 2111 }, { "epoch": 0.5284624046040285, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47895084.8, "logits/rejected": -67631305.14285715, "logps/chosen": -436.317236328125, "logps/rejected": -631.9432198660714, "loss": 0.0339, "rewards/chosen": 8.643710327148437, "rewards/margins": 20.720758492606024, "rewards/rejected": -12.077048165457589, "step": 2112 }, { "epoch": 0.5287126235456024, "grad_norm": 3.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87472378.66666667, "logits/rejected": -67988101.33333333, "logps/chosen": -490.9977213541667, "logps/rejected": -532.859375, "loss": 0.0173, "rewards/chosen": 7.319589614868164, "rewards/margins": 19.909045537312828, "rewards/rejected": -12.589455922444662, "step": 2113 }, { "epoch": 0.5289628424871763, "grad_norm": 4.8125, "kl": 2.0825538635253906, "learning_rate": 5e-06, "logits/chosen": -90405730.46153846, "logits/rejected": -44715182.54545455, "logps/chosen": -370.0554762620192, "logps/rejected": -631.4421164772727, "loss": 0.0501, "rewards/chosen": 7.491507897010217, "rewards/margins": 21.72311134605141, "rewards/rejected": -14.231603449041193, "step": 2114 }, { "epoch": 0.5292130614287501, "grad_norm": 5.84375, "kl": 10.327430725097656, "learning_rate": 5e-06, "logits/chosen": -29686839.272727273, "logits/rejected": -44614148.92307692, "logps/chosen": -419.46883877840907, "logps/rejected": -643.4575570913462, "loss": 0.0388, "rewards/chosen": 7.985938332297585, "rewards/margins": 25.45036305247487, "rewards/rejected": -17.464424720177284, "step": 2115 }, { "epoch": 0.5294632803703241, "grad_norm": 15.875, "kl": 3.125221014022827, "learning_rate": 5e-06, "logits/chosen": -46369974.85714286, "logits/rejected": -46014256.0, "logps/chosen": -378.63487025669644, "logps/rejected": -567.995361328125, "loss": 0.0922, "rewards/chosen": 6.825752803257534, "rewards/margins": 17.87595727103097, "rewards/rejected": -11.050204467773437, "step": 2116 }, { "epoch": 0.5297134993118979, "grad_norm": 3.734375, "kl": 4.1598219871521, "learning_rate": 5e-06, "logits/chosen": -58447701.333333336, "logits/rejected": -57366624.0, "logps/chosen": -499.395751953125, "logps/rejected": -631.832763671875, "loss": 0.0102, "rewards/chosen": 10.0084228515625, "rewards/margins": 22.379816691080727, "rewards/rejected": -12.371393839518229, "step": 2117 }, { "epoch": 0.5299637182534718, "grad_norm": 12.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60711925.333333336, "logits/rejected": -35127509.333333336, "logps/chosen": -308.4695231119792, "logps/rejected": -558.070068359375, "loss": 0.0547, "rewards/chosen": 5.445240656534831, "rewards/margins": 17.939631779988606, "rewards/rejected": -12.494391123453775, "step": 2118 }, { "epoch": 0.5302139371950456, "grad_norm": 6.59375, "kl": 12.382843017578125, "learning_rate": 5e-06, "logits/chosen": -63010341.64705882, "logits/rejected": -38518948.571428575, "logps/chosen": -473.96814682904414, "logps/rejected": -618.8705357142857, "loss": 0.0613, "rewards/chosen": 9.446267520680147, "rewards/margins": 17.345983489220885, "rewards/rejected": -7.899715968540737, "step": 2119 }, { "epoch": 0.5304641561366196, "grad_norm": 3.234375, "kl": 3.022669553756714, "learning_rate": 5e-06, "logits/chosen": -54565216.0, "logits/rejected": -42595736.0, "logps/chosen": -461.761962890625, "logps/rejected": -585.8907877604166, "loss": 0.0418, "rewards/chosen": 8.013600667317709, "rewards/margins": 21.019312540690105, "rewards/rejected": -13.005711873372396, "step": 2120 }, { "epoch": 0.5307143750781934, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42197494.15384615, "logits/rejected": -61500887.27272727, "logps/chosen": -307.20389498197113, "logps/rejected": -498.00204190340907, "loss": 0.0233, "rewards/chosen": 7.171900822566106, "rewards/margins": 18.587839460039472, "rewards/rejected": -11.415938637473367, "step": 2121 }, { "epoch": 0.5309645940197673, "grad_norm": 17.0, "kl": 0.9585012197494507, "learning_rate": 5e-06, "logits/chosen": -54077088.0, "logits/rejected": -87668117.33333333, "logps/chosen": -399.3264973958333, "logps/rejected": -712.0403645833334, "loss": 0.0416, "rewards/chosen": 8.252445220947266, "rewards/margins": 21.95688501993815, "rewards/rejected": -13.704439798990885, "step": 2122 }, { "epoch": 0.5312148129613412, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33616712.53333333, "logits/rejected": -57114951.11111111, "logps/chosen": -349.10813802083334, "logps/rejected": -464.7936740451389, "loss": 0.0367, "rewards/chosen": 7.003118896484375, "rewards/margins": 17.87576904296875, "rewards/rejected": -10.872650146484375, "step": 2123 }, { "epoch": 0.531465031902915, "grad_norm": 9.0625, "kl": 11.809114456176758, "learning_rate": 5e-06, "logits/chosen": -58904052.0, "logits/rejected": -52542116.0, "logps/chosen": -504.20068359375, "logps/rejected": -678.1909790039062, "loss": 0.0602, "rewards/chosen": 8.911550521850586, "rewards/margins": 26.381433486938477, "rewards/rejected": -17.46988296508789, "step": 2124 }, { "epoch": 0.5317152508444889, "grad_norm": 7.0, "kl": 2.268538236618042, "learning_rate": 5e-06, "logits/chosen": -75820424.53333333, "logits/rejected": -39155392.0, "logps/chosen": -520.5255208333333, "logps/rejected": -482.20909288194446, "loss": 0.0396, "rewards/chosen": 9.869484456380208, "rewards/margins": 21.559503851996528, "rewards/rejected": -11.69001939561632, "step": 2125 }, { "epoch": 0.5319654697860629, "grad_norm": 8.9375, "kl": 1.367227554321289, "learning_rate": 5e-06, "logits/chosen": -34305932.8, "logits/rejected": -86356096.0, "logps/chosen": -327.6140625, "logps/rejected": -626.7463727678571, "loss": 0.0558, "rewards/chosen": 6.918932342529297, "rewards/margins": 19.63777389526367, "rewards/rejected": -12.718841552734375, "step": 2126 }, { "epoch": 0.5322156887276367, "grad_norm": 14.5625, "kl": 6.1609206199646, "learning_rate": 5e-06, "logits/chosen": -55220216.47058824, "logits/rejected": -35235504.0, "logps/chosen": -418.03406479779414, "logps/rejected": -498.31717354910717, "loss": 0.074, "rewards/chosen": 8.814028571633731, "rewards/margins": 20.84622025690159, "rewards/rejected": -12.032191685267858, "step": 2127 }, { "epoch": 0.5324659076692105, "grad_norm": 15.8125, "kl": 10.758535385131836, "learning_rate": 5e-06, "logits/chosen": -45146057.14285714, "logits/rejected": -54262873.6, "logps/chosen": -473.0800083705357, "logps/rejected": -707.765234375, "loss": 0.1254, "rewards/chosen": 7.696138109479632, "rewards/margins": 23.422972215924943, "rewards/rejected": -15.726834106445313, "step": 2128 }, { "epoch": 0.5327161266107845, "grad_norm": 9.0625, "kl": 5.758484840393066, "learning_rate": 5e-06, "logits/chosen": -56538137.6, "logits/rejected": -34462044.44444445, "logps/chosen": -363.94755859375, "logps/rejected": -356.225341796875, "loss": 0.0426, "rewards/chosen": 7.233360290527344, "rewards/margins": 18.888608296712242, "rewards/rejected": -11.655248006184896, "step": 2129 }, { "epoch": 0.5329663455523583, "grad_norm": 6.78125, "kl": 9.110494613647461, "learning_rate": 5e-06, "logits/chosen": -66562820.92307692, "logits/rejected": -30190085.818181816, "logps/chosen": -462.06186147836536, "logps/rejected": -368.6480158025568, "loss": 0.0159, "rewards/chosen": 8.03597435584435, "rewards/margins": 16.912475265823044, "rewards/rejected": -8.876500909978693, "step": 2130 }, { "epoch": 0.5332165644939322, "grad_norm": 12.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25514594.46153846, "logits/rejected": -76707072.0, "logps/chosen": -285.0158128004808, "logps/rejected": -612.3947975852273, "loss": 0.0366, "rewards/chosen": 5.488927987905649, "rewards/margins": 21.16722555093832, "rewards/rejected": -15.67829756303267, "step": 2131 }, { "epoch": 0.533466783435506, "grad_norm": 15.75, "kl": 23.703855514526367, "learning_rate": 5e-06, "logits/chosen": -59613158.4, "logits/rejected": -39719881.14285714, "logps/chosen": -522.08369140625, "logps/rejected": -673.2114955357143, "loss": 0.166, "rewards/chosen": 7.790958404541016, "rewards/margins": 20.600690024239675, "rewards/rejected": -12.809731619698661, "step": 2132 }, { "epoch": 0.53371700237708, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51006016.0, "logits/rejected": -33163864.615384616, "logps/chosen": -287.73530717329544, "logps/rejected": -489.1514423076923, "loss": 0.033, "rewards/chosen": 6.132544777610085, "rewards/margins": 18.318372179578233, "rewards/rejected": -12.18582740196815, "step": 2133 }, { "epoch": 0.5339672213186538, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43452890.666666664, "logits/rejected": -73550229.33333333, "logps/chosen": -432.9000651041667, "logps/rejected": -658.461669921875, "loss": 0.0247, "rewards/chosen": 8.371060053507486, "rewards/margins": 21.972912470499672, "rewards/rejected": -13.601852416992188, "step": 2134 }, { "epoch": 0.5342174402602277, "grad_norm": 8.875, "kl": 3.3601222038269043, "learning_rate": 5e-06, "logits/chosen": -53444373.333333336, "logits/rejected": -24643194.666666668, "logps/chosen": -351.6663818359375, "logps/rejected": -373.1070556640625, "loss": 0.0538, "rewards/chosen": 7.32926877339681, "rewards/margins": 18.887587865193684, "rewards/rejected": -11.558319091796875, "step": 2135 }, { "epoch": 0.5344676592018016, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46256667.428571425, "logits/rejected": -48986227.2, "logps/chosen": -430.87642996651783, "logps/rejected": -626.4486328125, "loss": 0.026, "rewards/chosen": 9.192671639578682, "rewards/margins": 22.69137769426618, "rewards/rejected": -13.4987060546875, "step": 2136 }, { "epoch": 0.5347178781433755, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41488512.0, "logits/rejected": -37580633.6, "logps/chosen": -430.07457139756946, "logps/rejected": -513.7670572916667, "loss": 0.0027, "rewards/chosen": 8.642894321017796, "rewards/margins": 20.497755771213107, "rewards/rejected": -11.854861450195312, "step": 2137 }, { "epoch": 0.5349680970849493, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45453704.72727273, "logits/rejected": -51846611.692307696, "logps/chosen": -357.63032670454544, "logps/rejected": -463.6939227764423, "loss": 0.0593, "rewards/chosen": 6.423076282848012, "rewards/margins": 18.061006345948975, "rewards/rejected": -11.637930063100962, "step": 2138 }, { "epoch": 0.5352183160265233, "grad_norm": 11.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63992453.81818182, "logits/rejected": -55921826.461538464, "logps/chosen": -327.09965376420456, "logps/rejected": -543.8126878004807, "loss": 0.0435, "rewards/chosen": 5.947715759277344, "rewards/margins": 20.829772362342247, "rewards/rejected": -14.882056603064903, "step": 2139 }, { "epoch": 0.5354685349680971, "grad_norm": 11.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53123708.0, "logits/rejected": -54756760.0, "logps/chosen": -531.7557373046875, "logps/rejected": -624.1727905273438, "loss": 0.0235, "rewards/chosen": 11.134231567382812, "rewards/margins": 25.82841968536377, "rewards/rejected": -14.694188117980957, "step": 2140 }, { "epoch": 0.5357187539096709, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67625314.13333334, "logits/rejected": -20012430.222222224, "logps/chosen": -463.2603515625, "logps/rejected": -495.0603841145833, "loss": 0.0073, "rewards/chosen": 8.080677795410157, "rewards/margins": 20.32905019124349, "rewards/rejected": -12.248372395833334, "step": 2141 }, { "epoch": 0.5359689728512448, "grad_norm": 11.4375, "kl": 9.354574203491211, "learning_rate": 5e-06, "logits/chosen": -49029770.666666664, "logits/rejected": -78046650.66666667, "logps/chosen": -325.4270833333333, "logps/rejected": -441.2404378255208, "loss": 0.1817, "rewards/chosen": 6.097594579060872, "rewards/margins": 16.70362917582194, "rewards/rejected": -10.606034596761068, "step": 2142 }, { "epoch": 0.5362191917928187, "grad_norm": 7.90625, "kl": 8.570608139038086, "learning_rate": 5e-06, "logits/chosen": -80895488.0, "logits/rejected": -75579944.72727273, "logps/chosen": -548.6659029447115, "logps/rejected": -657.7452503551136, "loss": 0.021, "rewards/chosen": 8.808912423940805, "rewards/margins": 21.949294670478448, "rewards/rejected": -13.140382246537643, "step": 2143 }, { "epoch": 0.5364694107343926, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47609930.666666664, "logits/rejected": -39024976.0, "logps/chosen": -342.5165201822917, "logps/rejected": -695.5323893229166, "loss": 0.0323, "rewards/chosen": 6.115297953287761, "rewards/margins": 24.136516571044922, "rewards/rejected": -18.02121861775716, "step": 2144 }, { "epoch": 0.5367196296759664, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -106418378.66666667, "logits/rejected": -28702352.0, "logps/chosen": -473.4376220703125, "logps/rejected": -562.3690592447916, "loss": 0.0202, "rewards/chosen": 9.191539764404297, "rewards/margins": 23.011844635009766, "rewards/rejected": -13.820304870605469, "step": 2145 }, { "epoch": 0.5369698486175404, "grad_norm": 1.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53861416.0, "logits/rejected": -50869288.0, "logps/chosen": -405.7999267578125, "logps/rejected": -489.3377685546875, "loss": 0.0031, "rewards/chosen": 7.776082515716553, "rewards/margins": 20.25640630722046, "rewards/rejected": -12.480323791503906, "step": 2146 }, { "epoch": 0.5372200675591142, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57994464.0, "logits/rejected": -70563122.28571428, "logps/chosen": -364.7872314453125, "logps/rejected": -768.9342912946429, "loss": 0.019, "rewards/chosen": 8.0123046875, "rewards/margins": 27.013536289760047, "rewards/rejected": -19.001231602260045, "step": 2147 }, { "epoch": 0.5374702865006881, "grad_norm": 10.1875, "kl": 8.6414213180542, "learning_rate": 5e-06, "logits/chosen": -50472853.333333336, "logits/rejected": -39408810.666666664, "logps/chosen": -373.7132568359375, "logps/rejected": -545.390625, "loss": 0.0892, "rewards/chosen": 8.135077158610025, "rewards/margins": 21.166544596354164, "rewards/rejected": -13.03146743774414, "step": 2148 }, { "epoch": 0.537720505442262, "grad_norm": 1.7578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52872763.428571425, "logits/rejected": -68879194.35294117, "logps/chosen": -373.35208565848217, "logps/rejected": -737.7532169117648, "loss": 0.0134, "rewards/chosen": 8.416925157819476, "rewards/margins": 25.080181634726642, "rewards/rejected": -16.66325647690717, "step": 2149 }, { "epoch": 0.5379707243838359, "grad_norm": 10.375, "kl": 0.1011962890625, "learning_rate": 5e-06, "logits/chosen": -60058504.53333333, "logits/rejected": -42145770.666666664, "logps/chosen": -348.3949869791667, "logps/rejected": -438.9693196614583, "loss": 0.0585, "rewards/chosen": 6.946583557128906, "rewards/margins": 16.400017971462674, "rewards/rejected": -9.453434414333767, "step": 2150 }, { "epoch": 0.5382209433254097, "grad_norm": 1.4296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52368337.45454545, "logits/rejected": -45927241.84615385, "logps/chosen": -413.10080788352275, "logps/rejected": -624.5305739182693, "loss": 0.0027, "rewards/chosen": 9.568672180175781, "rewards/margins": 24.63751396766076, "rewards/rejected": -15.068841787484976, "step": 2151 }, { "epoch": 0.5384711622669837, "grad_norm": 1.1875, "kl": 0.8620737791061401, "learning_rate": 5e-06, "logits/chosen": -42608977.45454545, "logits/rejected": -67789380.92307693, "logps/chosen": -507.92258522727275, "logps/rejected": -734.6612830528846, "loss": 0.0147, "rewards/chosen": 9.632912375710227, "rewards/margins": 28.646574860686187, "rewards/rejected": -19.01366248497596, "step": 2152 }, { "epoch": 0.5387213812085575, "grad_norm": 15.8125, "kl": 1.8593229055404663, "learning_rate": 5e-06, "logits/chosen": -57384992.0, "logits/rejected": -61207456.0, "logps/chosen": -381.95528738839283, "logps/rejected": -688.5935546875, "loss": 0.0667, "rewards/chosen": 6.4124025617327005, "rewards/margins": 25.52994864327567, "rewards/rejected": -19.11754608154297, "step": 2153 }, { "epoch": 0.5389716001501313, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43190568.0, "logits/rejected": -56134984.0, "logps/chosen": -485.2317810058594, "logps/rejected": -690.8965454101562, "loss": 0.031, "rewards/chosen": 8.927902221679688, "rewards/margins": 23.43411636352539, "rewards/rejected": -14.506214141845703, "step": 2154 }, { "epoch": 0.5392218190917052, "grad_norm": 15.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53580931.2, "logits/rejected": -59117750.85714286, "logps/chosen": -444.608740234375, "logps/rejected": -624.9909319196429, "loss": 0.0224, "rewards/chosen": 8.351038360595703, "rewards/margins": 23.65227519444057, "rewards/rejected": -15.301236833844866, "step": 2155 }, { "epoch": 0.5394720380332791, "grad_norm": 9.8125, "kl": 1.611695647239685, "learning_rate": 5e-06, "logits/chosen": -22153156.0, "logits/rejected": -39491300.0, "logps/chosen": -451.125244140625, "logps/rejected": -558.5101318359375, "loss": 0.0338, "rewards/chosen": 6.9938836097717285, "rewards/margins": 19.404642581939697, "rewards/rejected": -12.410758972167969, "step": 2156 }, { "epoch": 0.539722256974853, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45607395.2, "logits/rejected": -49520960.0, "logps/chosen": -386.5181396484375, "logps/rejected": -675.9732142857143, "loss": 0.0414, "rewards/chosen": 6.091012573242187, "rewards/margins": 21.779198128836494, "rewards/rejected": -15.688185555594307, "step": 2157 }, { "epoch": 0.5399724759164268, "grad_norm": 6.59375, "kl": 0.5099624395370483, "learning_rate": 5e-06, "logits/chosen": -61208426.666666664, "logits/rejected": -60301669.333333336, "logps/chosen": -385.8745930989583, "logps/rejected": -540.2327067057291, "loss": 0.0136, "rewards/chosen": 8.855892817179361, "rewards/margins": 20.134509404500324, "rewards/rejected": -11.278616587320963, "step": 2158 }, { "epoch": 0.5402226948580008, "grad_norm": 10.8125, "kl": 10.076713562011719, "learning_rate": 5e-06, "logits/chosen": -45392088.615384616, "logits/rejected": -63322426.18181818, "logps/chosen": -582.8646709735577, "logps/rejected": -591.1146573153409, "loss": 0.0426, "rewards/chosen": 11.54007075383113, "rewards/margins": 25.564244837194053, "rewards/rejected": -14.024174083362926, "step": 2159 }, { "epoch": 0.5404729137995746, "grad_norm": 11.625, "kl": 1.472161054611206, "learning_rate": 5e-06, "logits/chosen": -42339500.307692304, "logits/rejected": -22776603.636363637, "logps/chosen": -437.17086087740387, "logps/rejected": -827.0696022727273, "loss": 0.0557, "rewards/chosen": 8.695955716646635, "rewards/margins": 23.84162860150104, "rewards/rejected": -15.145672884854404, "step": 2160 }, { "epoch": 0.5407231327411485, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43561584.0, "logits/rejected": -60191984.0, "logps/chosen": -316.0899251302083, "logps/rejected": -607.5961100260416, "loss": 0.0707, "rewards/chosen": 7.730888366699219, "rewards/margins": 21.68212000528971, "rewards/rejected": -13.951231638590494, "step": 2161 }, { "epoch": 0.5409733516827224, "grad_norm": 18.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66656418.90909091, "logits/rejected": -39606528.0, "logps/chosen": -345.62397904829544, "logps/rejected": -516.9219876802885, "loss": 0.0408, "rewards/chosen": 6.436214793812145, "rewards/margins": 18.246508138163108, "rewards/rejected": -11.810293344350962, "step": 2162 }, { "epoch": 0.5412235706242963, "grad_norm": 4.125, "kl": 3.618314266204834, "learning_rate": 5e-06, "logits/chosen": -59207461.333333336, "logits/rejected": -43887906.666666664, "logps/chosen": -455.6234944661458, "logps/rejected": -595.201171875, "loss": 0.0574, "rewards/chosen": 8.622791290283203, "rewards/margins": 21.570638020833336, "rewards/rejected": -12.94784673055013, "step": 2163 }, { "epoch": 0.5414737895658701, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44189516.0, "logits/rejected": -47208928.0, "logps/chosen": -331.9921875, "logps/rejected": -601.8104248046875, "loss": 0.0447, "rewards/chosen": 6.838160514831543, "rewards/margins": 19.5409574508667, "rewards/rejected": -12.702796936035156, "step": 2164 }, { "epoch": 0.5417240085074441, "grad_norm": 1.90625, "kl": 0.47515934705734253, "learning_rate": 5e-06, "logits/chosen": -39371702.85714286, "logits/rejected": -62783500.8, "logps/chosen": -283.30032784598217, "logps/rejected": -601.6263671875, "loss": 0.0152, "rewards/chosen": 6.797151838030134, "rewards/margins": 22.176578412737165, "rewards/rejected": -15.379426574707031, "step": 2165 }, { "epoch": 0.5419742274490179, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41829029.333333336, "logits/rejected": -68100096.0, "logps/chosen": -319.9404296875, "logps/rejected": -767.939453125, "loss": 0.0216, "rewards/chosen": 7.023010889689128, "rewards/margins": 27.498478571573894, "rewards/rejected": -20.475467681884766, "step": 2166 }, { "epoch": 0.5422244463905918, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15970482.666666666, "logits/rejected": -46390517.333333336, "logps/chosen": -251.555419921875, "logps/rejected": -647.8133951822916, "loss": 0.0699, "rewards/chosen": 4.927940050760905, "rewards/margins": 18.13168430328369, "rewards/rejected": -13.203744252522787, "step": 2167 }, { "epoch": 0.5424746653321656, "grad_norm": 14.0625, "kl": 8.705866813659668, "learning_rate": 5e-06, "logits/chosen": -12845481.142857144, "logits/rejected": -63345945.6, "logps/chosen": -418.20389229910717, "logps/rejected": -671.7306640625, "loss": 0.0845, "rewards/chosen": 7.635802132742746, "rewards/margins": 19.62263913835798, "rewards/rejected": -11.986837005615234, "step": 2168 }, { "epoch": 0.5427248842737395, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47088011.63636363, "logits/rejected": -37711163.07692308, "logps/chosen": -392.6071111505682, "logps/rejected": -616.1868239182693, "loss": 0.0147, "rewards/chosen": 8.458906693892045, "rewards/margins": 26.749160553191928, "rewards/rejected": -18.29025385929988, "step": 2169 }, { "epoch": 0.5429751032153134, "grad_norm": 4.875, "kl": 10.860645294189453, "learning_rate": 5e-06, "logits/chosen": -30952322.90909091, "logits/rejected": -67502080.0, "logps/chosen": -431.0446111505682, "logps/rejected": -577.9402794471154, "loss": 0.0405, "rewards/chosen": 8.814153497869318, "rewards/margins": 22.05920965021307, "rewards/rejected": -13.24505615234375, "step": 2170 }, { "epoch": 0.5432253221568872, "grad_norm": 2.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22608361.6, "logits/rejected": -51221572.571428575, "logps/chosen": -380.4486328125, "logps/rejected": -528.7574637276786, "loss": 0.0164, "rewards/chosen": 7.079256439208985, "rewards/margins": 19.047522517613004, "rewards/rejected": -11.968266078404017, "step": 2171 }, { "epoch": 0.5434755410984612, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28102499.2, "logits/rejected": -52547044.571428575, "logps/chosen": -486.08076171875, "logps/rejected": -541.9791782924107, "loss": 0.0484, "rewards/chosen": 9.59075469970703, "rewards/margins": 21.111326817103794, "rewards/rejected": -11.520572117396764, "step": 2172 }, { "epoch": 0.543725760040035, "grad_norm": 8.5625, "kl": 10.987865447998047, "learning_rate": 5e-06, "logits/chosen": -67112864.0, "logits/rejected": -65872614.4, "logps/chosen": -431.044921875, "logps/rejected": -687.9337890625, "loss": 0.0151, "rewards/chosen": 8.814708164760045, "rewards/margins": 21.962219456263952, "rewards/rejected": -13.147511291503907, "step": 2173 }, { "epoch": 0.5439759789816089, "grad_norm": 11.4375, "kl": 21.365909576416016, "learning_rate": 5e-06, "logits/chosen": -47334392.0, "logits/rejected": -18644942.0, "logps/chosen": -484.3040771484375, "logps/rejected": -564.1136474609375, "loss": 0.0232, "rewards/chosen": 9.497515678405762, "rewards/margins": 23.27571964263916, "rewards/rejected": -13.778203964233398, "step": 2174 }, { "epoch": 0.5442261979231828, "grad_norm": 0.66796875, "kl": 1.4529647827148438, "learning_rate": 5e-06, "logits/chosen": -55848036.571428575, "logits/rejected": -36699685.64705882, "logps/chosen": -414.24155970982144, "logps/rejected": -457.40900735294116, "loss": 0.0015, "rewards/chosen": 7.498930794852121, "rewards/margins": 17.28485536976021, "rewards/rejected": -9.785924574908089, "step": 2175 }, { "epoch": 0.5444764168647567, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40653533.09090909, "logits/rejected": -10402633.846153846, "logps/chosen": -294.27321555397725, "logps/rejected": -510.5788010817308, "loss": 0.0579, "rewards/chosen": 6.40006533536044, "rewards/margins": 17.556719106394095, "rewards/rejected": -11.156653771033653, "step": 2176 }, { "epoch": 0.5447266358063305, "grad_norm": 14.6875, "kl": 4.0885114669799805, "learning_rate": 5e-06, "logits/chosen": -22234309.333333332, "logits/rejected": -37483502.93333333, "logps/chosen": -311.2684733072917, "logps/rejected": -506.48675130208335, "loss": 0.072, "rewards/chosen": 6.507117801242405, "rewards/margins": 18.170707278781467, "rewards/rejected": -11.663589477539062, "step": 2177 }, { "epoch": 0.5449768547479045, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55770001.45454545, "logits/rejected": -66267047.384615384, "logps/chosen": -436.98100142045456, "logps/rejected": -713.3965594951923, "loss": 0.0081, "rewards/chosen": 9.160180525346236, "rewards/margins": 26.000577113011502, "rewards/rejected": -16.840396587665264, "step": 2178 }, { "epoch": 0.5452270736894783, "grad_norm": 15.625, "kl": 8.537248611450195, "learning_rate": 5e-06, "logits/chosen": -70146816.0, "logits/rejected": -54325971.2, "logps/chosen": -413.77378627232144, "logps/rejected": -544.406640625, "loss": 0.0542, "rewards/chosen": 7.999961308070591, "rewards/margins": 25.58484900338309, "rewards/rejected": -17.5848876953125, "step": 2179 }, { "epoch": 0.5454772926310522, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 31091122.285714287, "logits/rejected": -42545980.23529412, "logps/chosen": -529.1649693080357, "logps/rejected": -518.1779067095588, "loss": 0.0438, "rewards/chosen": 6.9560121808733255, "rewards/margins": 19.215088467638033, "rewards/rejected": -12.259076286764707, "step": 2180 }, { "epoch": 0.545727511572626, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37047468.307692304, "logits/rejected": -47433314.90909091, "logps/chosen": -430.4957932692308, "logps/rejected": -835.9010120738636, "loss": 0.0506, "rewards/chosen": 7.492636460524339, "rewards/margins": 25.811034676078314, "rewards/rejected": -18.318398215553977, "step": 2181 }, { "epoch": 0.5459777305142, "grad_norm": 7.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66010784.0, "logits/rejected": -30941896.0, "logps/chosen": -403.0221761067708, "logps/rejected": -526.24560546875, "loss": 0.0329, "rewards/chosen": 8.192264556884766, "rewards/margins": 19.355918884277344, "rewards/rejected": -11.163654327392578, "step": 2182 }, { "epoch": 0.5462279494557738, "grad_norm": 18.0, "kl": 5.3979034423828125, "learning_rate": 5e-06, "logits/chosen": -70337253.33333333, "logits/rejected": -27239736.0, "logps/chosen": -393.8075358072917, "logps/rejected": -520.8450520833334, "loss": 0.0544, "rewards/chosen": 7.347586313883464, "rewards/margins": 17.980595270792644, "rewards/rejected": -10.63300895690918, "step": 2183 }, { "epoch": 0.5464781683973476, "grad_norm": 1.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62514629.333333336, "logits/rejected": 52181093.333333336, "logps/chosen": -504.5513509114583, "logps/rejected": -918.9811197916666, "loss": 0.0023, "rewards/chosen": 8.76574452718099, "rewards/margins": 26.832117716471352, "rewards/rejected": -18.066373189290363, "step": 2184 }, { "epoch": 0.5467283873389216, "grad_norm": 7.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54562797.71428572, "logits/rejected": -53581171.2, "logps/chosen": -405.04453822544644, "logps/rejected": -700.098095703125, "loss": 0.0442, "rewards/chosen": 8.844518389020648, "rewards/margins": 28.08423854282924, "rewards/rejected": -19.239720153808594, "step": 2185 }, { "epoch": 0.5469786062804954, "grad_norm": 4.46875, "kl": 0.8447456359863281, "learning_rate": 5e-06, "logits/chosen": -82862120.72727273, "logits/rejected": -47114525.538461536, "logps/chosen": -393.25727982954544, "logps/rejected": -686.8629807692307, "loss": 0.0107, "rewards/chosen": 8.094938104802912, "rewards/margins": 23.540986547936924, "rewards/rejected": -15.446048443134014, "step": 2186 }, { "epoch": 0.5472288252220693, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10448516.0, "logits/rejected": -59344832.0, "logps/chosen": -282.8724365234375, "logps/rejected": -626.5808919270834, "loss": 0.0531, "rewards/chosen": 6.767660776774089, "rewards/margins": 23.443347930908203, "rewards/rejected": -16.675687154134113, "step": 2187 }, { "epoch": 0.5474790441636432, "grad_norm": 2.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68673338.18181819, "logits/rejected": -53837105.23076923, "logps/chosen": -409.21932705965907, "logps/rejected": -652.3308293269231, "loss": 0.0239, "rewards/chosen": 9.701058127663352, "rewards/margins": 27.989755590478858, "rewards/rejected": -18.288697462815506, "step": 2188 }, { "epoch": 0.5477292631052171, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40026880.0, "logits/rejected": -35643818.666666664, "logps/chosen": -308.7258029513889, "logps/rejected": -565.1641927083333, "loss": 0.048, "rewards/chosen": 6.5353198581271705, "rewards/margins": 17.64164310031467, "rewards/rejected": -11.1063232421875, "step": 2189 }, { "epoch": 0.5479794820467909, "grad_norm": 1.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69521701.33333333, "logits/rejected": -47572458.666666664, "logps/chosen": -541.7156575520834, "logps/rejected": -681.5446370442709, "loss": 0.0226, "rewards/chosen": 10.571956634521484, "rewards/margins": 24.806456247965492, "rewards/rejected": -14.23449961344401, "step": 2190 }, { "epoch": 0.5482297009883648, "grad_norm": 2.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3081955.2, "logits/rejected": -42496416.0, "logps/chosen": -416.4873046875, "logps/rejected": -522.1173270089286, "loss": 0.0048, "rewards/chosen": 6.674021148681641, "rewards/margins": 21.582796478271483, "rewards/rejected": -14.908775329589844, "step": 2191 }, { "epoch": 0.5484799199299387, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58454080.0, "logits/rejected": -64791889.06666667, "logps/chosen": -320.16875542534723, "logps/rejected": -707.8028645833333, "loss": 0.0334, "rewards/chosen": 5.107824113633898, "rewards/margins": 20.67294455634223, "rewards/rejected": -15.565120442708333, "step": 2192 }, { "epoch": 0.5487301388715126, "grad_norm": 7.625, "kl": 2.7421507835388184, "learning_rate": 5e-06, "logits/chosen": -21347787.636363637, "logits/rejected": -36012430.76923077, "logps/chosen": -315.5296519886364, "logps/rejected": -480.9157902644231, "loss": 0.064, "rewards/chosen": 6.043200406161222, "rewards/margins": 18.098869163673243, "rewards/rejected": -12.05566875751202, "step": 2193 }, { "epoch": 0.5489803578130864, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65170368.0, "logits/rejected": 40674552.47058824, "logps/chosen": -574.1031668526786, "logps/rejected": -624.6243106617648, "loss": 0.0041, "rewards/chosen": 9.291656494140625, "rewards/margins": 25.288093118106616, "rewards/rejected": -15.996436623965993, "step": 2194 }, { "epoch": 0.5492305767546604, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36908963.2, "logits/rejected": -49748050.28571428, "logps/chosen": -312.22421875, "logps/rejected": -614.9411969866071, "loss": 0.0063, "rewards/chosen": 6.91546630859375, "rewards/margins": 20.638973781040736, "rewards/rejected": -13.723507472446986, "step": 2195 }, { "epoch": 0.5494807956962342, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15953094.666666666, "logits/rejected": -41879858.666666664, "logps/chosen": -337.8475341796875, "logps/rejected": -522.1038411458334, "loss": 0.0664, "rewards/chosen": 6.207684834798177, "rewards/margins": 19.16211191813151, "rewards/rejected": -12.954427083333334, "step": 2196 }, { "epoch": 0.549731014637808, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56167214.54545455, "logits/rejected": -66680300.307692304, "logps/chosen": -452.09565873579544, "logps/rejected": -561.7049654447115, "loss": 0.0305, "rewards/chosen": 8.521127874200994, "rewards/margins": 25.107314209838016, "rewards/rejected": -16.58618633563702, "step": 2197 }, { "epoch": 0.549981233579382, "grad_norm": 14.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27972622.222222224, "logits/rejected": -43543189.333333336, "logps/chosen": -382.10218641493054, "logps/rejected": -666.6999348958333, "loss": 0.024, "rewards/chosen": 9.928428649902344, "rewards/margins": 22.646010843912762, "rewards/rejected": -12.717582194010417, "step": 2198 }, { "epoch": 0.5502314525209558, "grad_norm": 2.5625, "kl": 0.5684560537338257, "learning_rate": 5e-06, "logits/chosen": -45048960.0, "logits/rejected": -64665073.23076923, "logps/chosen": -371.20450106534093, "logps/rejected": -554.3855168269231, "loss": 0.0175, "rewards/chosen": 6.731409939852628, "rewards/margins": 19.395872689627268, "rewards/rejected": -12.66446274977464, "step": 2199 }, { "epoch": 0.5504816714625297, "grad_norm": 1.6328125, "kl": 1.7726819515228271, "learning_rate": 5e-06, "logits/chosen": -62568110.54545455, "logits/rejected": -38419904.0, "logps/chosen": -365.2747913707386, "logps/rejected": -581.5771108774038, "loss": 0.0044, "rewards/chosen": 7.271180586381392, "rewards/margins": 20.96724780956348, "rewards/rejected": -13.69606722318209, "step": 2200 }, { "epoch": 0.5507318904041036, "grad_norm": 15.25, "kl": 0.7482325434684753, "learning_rate": 5e-06, "logits/chosen": -46166922.666666664, "logits/rejected": -48369476.266666666, "logps/chosen": -315.99565972222223, "logps/rejected": -600.784375, "loss": 0.0668, "rewards/chosen": 6.669189453125, "rewards/margins": 21.257918294270834, "rewards/rejected": -14.588728841145834, "step": 2201 }, { "epoch": 0.5509821093456775, "grad_norm": 3.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63079524.571428575, "logits/rejected": -57004198.4, "logps/chosen": -358.98824637276783, "logps/rejected": -801.42880859375, "loss": 0.0316, "rewards/chosen": 8.077665056501116, "rewards/margins": 26.286652483258926, "rewards/rejected": -18.208987426757812, "step": 2202 }, { "epoch": 0.5512323282872513, "grad_norm": 7.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60866688.0, "logits/rejected": -29806309.333333332, "logps/chosen": -435.8663330078125, "logps/rejected": -610.6060791015625, "loss": 0.0358, "rewards/chosen": 7.366757074991862, "rewards/margins": 23.363701502482098, "rewards/rejected": -15.996944427490234, "step": 2203 }, { "epoch": 0.5514825472288252, "grad_norm": 14.0, "kl": 10.557289123535156, "learning_rate": 5e-06, "logits/chosen": -29518647.111111112, "logits/rejected": -39547400.53333333, "logps/chosen": -324.73822699652777, "logps/rejected": -566.5731119791667, "loss": 0.0644, "rewards/chosen": 7.264499240451389, "rewards/margins": 19.48249240451389, "rewards/rejected": -12.2179931640625, "step": 2204 }, { "epoch": 0.5517327661703991, "grad_norm": 6.59375, "kl": 6.861140251159668, "learning_rate": 5e-06, "logits/chosen": -50567394.90909091, "logits/rejected": -69264000.0, "logps/chosen": -330.64599609375, "logps/rejected": -432.14599609375, "loss": 0.0193, "rewards/chosen": 7.343974720348012, "rewards/margins": 15.957654459493144, "rewards/rejected": -8.613679739145132, "step": 2205 }, { "epoch": 0.551982985111973, "grad_norm": 3.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44249888.0, "logits/rejected": -53415596.307692304, "logps/chosen": -366.6449085582386, "logps/rejected": -648.1543719951923, "loss": 0.0292, "rewards/chosen": 7.450180747292259, "rewards/margins": 20.49393121679346, "rewards/rejected": -13.043750469501202, "step": 2206 }, { "epoch": 0.5522332040535468, "grad_norm": 20.625, "kl": 15.272483825683594, "learning_rate": 5e-06, "logits/chosen": -50977494.85714286, "logits/rejected": -48314016.0, "logps/chosen": -431.32212611607144, "logps/rejected": -694.90078125, "loss": 0.086, "rewards/chosen": 9.32796151297433, "rewards/margins": 24.871801539829796, "rewards/rejected": -15.543840026855468, "step": 2207 }, { "epoch": 0.5524834229951208, "grad_norm": 4.40625, "kl": 1.089684247970581, "learning_rate": 5e-06, "logits/chosen": -31994034.666666668, "logits/rejected": -61946890.666666664, "logps/chosen": -368.3271484375, "logps/rejected": -583.7980143229166, "loss": 0.0198, "rewards/chosen": 8.164920171101889, "rewards/margins": 19.484787623087566, "rewards/rejected": -11.319867451985678, "step": 2208 }, { "epoch": 0.5527336419366946, "grad_norm": 1.0859375, "kl": 1.9735896587371826, "learning_rate": 5e-06, "logits/chosen": -45969210.18181818, "logits/rejected": -48982660.92307692, "logps/chosen": -535.576171875, "logps/rejected": -486.1553485576923, "loss": 0.009, "rewards/chosen": 9.057313398881393, "rewards/margins": 20.035731602381993, "rewards/rejected": -10.9784182035006, "step": 2209 }, { "epoch": 0.5529838608782685, "grad_norm": 4.3125, "kl": 6.8734588623046875, "learning_rate": 5e-06, "logits/chosen": -69413984.0, "logits/rejected": -37256765.71428572, "logps/chosen": -995.30712890625, "logps/rejected": -512.4062848772321, "loss": 0.0591, "rewards/chosen": 14.148481750488282, "rewards/margins": 25.39093736921038, "rewards/rejected": -11.242455618722099, "step": 2210 }, { "epoch": 0.5532340798198424, "grad_norm": 1.25, "kl": 8.407530784606934, "learning_rate": 5e-06, "logits/chosen": -44320438.85714286, "logits/rejected": -64208281.6, "logps/chosen": -360.787109375, "logps/rejected": -606.973681640625, "loss": 0.0141, "rewards/chosen": 7.3471205575125555, "rewards/margins": 22.719944654192243, "rewards/rejected": -15.372824096679688, "step": 2211 }, { "epoch": 0.5534842987614162, "grad_norm": 1.53125, "kl": 9.036863327026367, "learning_rate": 5e-06, "logits/chosen": -32719901.866666667, "logits/rejected": -46798400.0, "logps/chosen": -408.28974609375, "logps/rejected": -757.9505750868055, "loss": 0.0834, "rewards/chosen": 8.70982157389323, "rewards/margins": 21.787941487630206, "rewards/rejected": -13.078119913736979, "step": 2212 }, { "epoch": 0.5537345177029901, "grad_norm": 10.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46420381.86666667, "logits/rejected": -12633033.777777778, "logps/chosen": -281.65673828125, "logps/rejected": -463.3147243923611, "loss": 0.0664, "rewards/chosen": 4.480144246419271, "rewards/margins": 14.702235073513457, "rewards/rejected": -10.222090827094185, "step": 2213 }, { "epoch": 0.553984736644564, "grad_norm": 10.3125, "kl": 2.701261043548584, "learning_rate": 5e-06, "logits/chosen": -90977545.14285715, "logits/rejected": -36695337.6, "logps/chosen": -460.1714564732143, "logps/rejected": -545.32685546875, "loss": 0.0514, "rewards/chosen": 8.387596675327845, "rewards/margins": 22.643401118687223, "rewards/rejected": -14.255804443359375, "step": 2214 }, { "epoch": 0.5542349555861379, "grad_norm": 8.8125, "kl": 2.6284308433532715, "learning_rate": 5e-06, "logits/chosen": -102740549.81818181, "logits/rejected": -48862444.307692304, "logps/chosen": -478.66317471590907, "logps/rejected": -716.7600661057693, "loss": 0.0174, "rewards/chosen": 10.435284701260654, "rewards/margins": 22.260294560785894, "rewards/rejected": -11.82500985952524, "step": 2215 }, { "epoch": 0.5544851745277117, "grad_norm": 1.5390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66416279.27272727, "logits/rejected": -67693838.76923077, "logps/chosen": -378.44651100852275, "logps/rejected": -532.3795072115385, "loss": 0.0159, "rewards/chosen": 7.745054765181108, "rewards/margins": 24.40701976856152, "rewards/rejected": -16.66196500338041, "step": 2216 }, { "epoch": 0.5547353934692856, "grad_norm": 9.0, "kl": 8.4419584274292, "learning_rate": 5e-06, "logits/chosen": -63761844.705882356, "logits/rejected": -31694966.85714286, "logps/chosen": -453.46452780330884, "logps/rejected": -529.6546107700893, "loss": 0.0169, "rewards/chosen": 9.233721564797793, "rewards/margins": 21.631866967978596, "rewards/rejected": -12.398145403180804, "step": 2217 }, { "epoch": 0.5549856124108595, "grad_norm": 3.609375, "kl": 4.949073791503906, "learning_rate": 5e-06, "logits/chosen": -52177148.44444445, "logits/rejected": -26743616.0, "logps/chosen": -405.1030544704861, "logps/rejected": -449.1039713541667, "loss": 0.0219, "rewards/chosen": 9.607464260525173, "rewards/margins": 19.095936754014758, "rewards/rejected": -9.488472493489583, "step": 2218 }, { "epoch": 0.5552358313524334, "grad_norm": 15.0625, "kl": 7.9950079917907715, "learning_rate": 5e-06, "logits/chosen": -27427912.533333335, "logits/rejected": -27206906.666666668, "logps/chosen": -481.2649739583333, "logps/rejected": -358.31982421875, "loss": 0.0733, "rewards/chosen": 7.714934285481771, "rewards/margins": 18.35647396511502, "rewards/rejected": -10.641539679633247, "step": 2219 }, { "epoch": 0.5554860502940072, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -80972597.33333333, "logits/rejected": -65675939.55555555, "logps/chosen": -559.4909261067709, "logps/rejected": -752.3308376736111, "loss": 0.0103, "rewards/chosen": 11.664863586425781, "rewards/margins": 28.364136589898003, "rewards/rejected": -16.69927300347222, "step": 2220 }, { "epoch": 0.5557362692355812, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40739845.333333336, "logits/rejected": -55055258.666666664, "logps/chosen": -434.4303792317708, "logps/rejected": -586.0794270833334, "loss": 0.0229, "rewards/chosen": 9.84492047627767, "rewards/margins": 22.0826104482015, "rewards/rejected": -12.237689971923828, "step": 2221 }, { "epoch": 0.555986488177155, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16062915.0, "logits/rejected": -36711800.0, "logps/chosen": -584.0648193359375, "logps/rejected": -610.6727294921875, "loss": 0.0186, "rewards/chosen": 9.46000862121582, "rewards/margins": 21.026348114013672, "rewards/rejected": -11.566339492797852, "step": 2222 }, { "epoch": 0.5562367071187289, "grad_norm": 14.8125, "kl": 12.567930221557617, "learning_rate": 5e-06, "logits/chosen": -43576085.333333336, "logits/rejected": -46109168.0, "logps/chosen": -422.2206624348958, "logps/rejected": -593.015869140625, "loss": 0.0956, "rewards/chosen": 7.731258392333984, "rewards/margins": 21.115056355794273, "rewards/rejected": -13.383797963460287, "step": 2223 }, { "epoch": 0.5564869260603028, "grad_norm": 16.125, "kl": 24.70016098022461, "learning_rate": 5e-06, "logits/chosen": -17751926.588235293, "logits/rejected": -24687488.0, "logps/chosen": -412.2907284007353, "logps/rejected": -771.4623325892857, "loss": 0.1531, "rewards/chosen": 8.532025505514707, "rewards/margins": 25.56789019929261, "rewards/rejected": -17.035864693777903, "step": 2224 }, { "epoch": 0.5567371450018767, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45025179.428571425, "logits/rejected": -76541312.0, "logps/chosen": -479.84622628348217, "logps/rejected": -755.481201171875, "loss": 0.014, "rewards/chosen": 9.731000627790179, "rewards/margins": 29.588976396833147, "rewards/rejected": -19.85797576904297, "step": 2225 }, { "epoch": 0.5569873639434505, "grad_norm": 9.375, "kl": 8.877699851989746, "learning_rate": 5e-06, "logits/chosen": -75386581.33333333, "logits/rejected": -55426234.666666664, "logps/chosen": -414.4938151041667, "logps/rejected": -497.8795572916667, "loss": 0.043, "rewards/chosen": 7.745774586995442, "rewards/margins": 20.341101328531902, "rewards/rejected": -12.595326741536459, "step": 2226 }, { "epoch": 0.5572375828850245, "grad_norm": 18.25, "kl": 4.1519880294799805, "learning_rate": 5e-06, "logits/chosen": -47674968.88888889, "logits/rejected": -40126660.266666666, "logps/chosen": -383.1032443576389, "logps/rejected": -431.6559244791667, "loss": 0.0426, "rewards/chosen": 5.5805859035915795, "rewards/margins": 17.098009406195747, "rewards/rejected": -11.517423502604167, "step": 2227 }, { "epoch": 0.5574878018265983, "grad_norm": 7.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67344796.44444445, "logits/rejected": -15390999.466666667, "logps/chosen": -315.55967881944446, "logps/rejected": -518.8408854166667, "loss": 0.037, "rewards/chosen": 5.751074473063151, "rewards/margins": 19.282000986735028, "rewards/rejected": -13.530926513671876, "step": 2228 }, { "epoch": 0.5577380207681721, "grad_norm": 3.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -88550865.45454545, "logits/rejected": -66570003.692307696, "logps/chosen": -426.7345525568182, "logps/rejected": -759.4130108173077, "loss": 0.0142, "rewards/chosen": 8.13846241344105, "rewards/margins": 28.333091789192252, "rewards/rejected": -20.1946293757512, "step": 2229 }, { "epoch": 0.557988239709746, "grad_norm": 12.9375, "kl": 10.263299942016602, "learning_rate": 5e-06, "logits/chosen": -57979204.571428575, "logits/rejected": -60437286.4, "logps/chosen": -343.8016880580357, "logps/rejected": -545.359912109375, "loss": 0.1117, "rewards/chosen": 6.547127314976284, "rewards/margins": 17.194928523472377, "rewards/rejected": -10.647801208496094, "step": 2230 }, { "epoch": 0.5582384586513199, "grad_norm": 8.0625, "kl": 23.013729095458984, "learning_rate": 5e-06, "logits/chosen": -50072849.06666667, "logits/rejected": -31788206.222222224, "logps/chosen": -498.35, "logps/rejected": -485.59971788194446, "loss": 0.0256, "rewards/chosen": 10.879056803385417, "rewards/margins": 20.03604261610243, "rewards/rejected": -9.156985812717014, "step": 2231 }, { "epoch": 0.5584886775928938, "grad_norm": 11.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38280023.27272727, "logits/rejected": -61903876.92307692, "logps/chosen": -269.60666725852275, "logps/rejected": -451.9533128004808, "loss": 0.0647, "rewards/chosen": 7.179168007590554, "rewards/margins": 21.69106399429428, "rewards/rejected": -14.511895986703726, "step": 2232 }, { "epoch": 0.5587388965344676, "grad_norm": 4.71875, "kl": 4.89161491394043, "learning_rate": 5e-06, "logits/chosen": -57280913.45454545, "logits/rejected": -800019.6923076923, "logps/chosen": -372.40576171875, "logps/rejected": -595.7172475961538, "loss": 0.0349, "rewards/chosen": 8.683320478959518, "rewards/margins": 21.70396327305507, "rewards/rejected": -13.020642794095552, "step": 2233 }, { "epoch": 0.5589891154760416, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9705457.142857144, "logits/rejected": -27140576.0, "logps/chosen": -554.3320661272321, "logps/rejected": -518.534375, "loss": 0.0182, "rewards/chosen": 9.368435450962611, "rewards/margins": 19.627776881626673, "rewards/rejected": -10.259341430664062, "step": 2234 }, { "epoch": 0.5592393344176154, "grad_norm": 2.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49784336.0, "logits/rejected": -48793508.571428575, "logps/chosen": -345.030859375, "logps/rejected": -768.3096400669643, "loss": 0.0059, "rewards/chosen": 7.549298095703125, "rewards/margins": 24.989733232770647, "rewards/rejected": -17.44043513706752, "step": 2235 }, { "epoch": 0.5594895533591893, "grad_norm": 1.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39050102.85714286, "logits/rejected": -66494712.47058824, "logps/chosen": -387.23538643973217, "logps/rejected": -627.134765625, "loss": 0.0108, "rewards/chosen": 6.802137102399554, "rewards/margins": 21.69599786325663, "rewards/rejected": -14.893860760857077, "step": 2236 }, { "epoch": 0.5597397723007632, "grad_norm": 3.78125, "kl": 12.960908889770508, "learning_rate": 5e-06, "logits/chosen": -51196296.53333333, "logits/rejected": -38265194.666666664, "logps/chosen": -466.35797526041665, "logps/rejected": -513.1847330729166, "loss": 0.0904, "rewards/chosen": 10.151260375976562, "rewards/margins": 26.23682149251302, "rewards/rejected": -16.085561116536457, "step": 2237 }, { "epoch": 0.5599899912423371, "grad_norm": 11.9375, "kl": 3.411080837249756, "learning_rate": 5e-06, "logits/chosen": -47128536.615384616, "logits/rejected": -82026944.0, "logps/chosen": -386.1357421875, "logps/rejected": -653.4439808238636, "loss": 0.049, "rewards/chosen": 7.54829582801232, "rewards/margins": 18.915140698839735, "rewards/rejected": -11.366844870827414, "step": 2238 }, { "epoch": 0.5602402101839109, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16029644.8, "logits/rejected": 23448230.85714286, "logps/chosen": -326.1718505859375, "logps/rejected": -642.9763532366071, "loss": 0.059, "rewards/chosen": 6.172935867309571, "rewards/margins": 19.733556856427874, "rewards/rejected": -13.560620989118304, "step": 2239 }, { "epoch": 0.5604904291254847, "grad_norm": 3.578125, "kl": 15.72818660736084, "learning_rate": 5e-06, "logits/chosen": -84745885.53846154, "logits/rejected": -49870301.09090909, "logps/chosen": -469.5951397235577, "logps/rejected": -788.8330078125, "loss": 0.0514, "rewards/chosen": 10.115003145658052, "rewards/margins": 31.00092150281359, "rewards/rejected": -20.88591835715554, "step": 2240 }, { "epoch": 0.5607406480670587, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37805011.2, "logits/rejected": -52818386.28571428, "logps/chosen": -430.92646484375, "logps/rejected": -647.7152622767857, "loss": 0.007, "rewards/chosen": 8.791746520996094, "rewards/margins": 21.727995300292967, "rewards/rejected": -12.936248779296875, "step": 2241 }, { "epoch": 0.5609908670086325, "grad_norm": 6.21875, "kl": 5.519371032714844, "learning_rate": 5e-06, "logits/chosen": -37944688.0, "logits/rejected": -67067061.333333336, "logps/chosen": -328.9994710286458, "logps/rejected": -977.7801106770834, "loss": 0.0422, "rewards/chosen": 6.542582194010417, "rewards/margins": 28.493682861328125, "rewards/rejected": -21.951100667317707, "step": 2242 }, { "epoch": 0.5612410859502064, "grad_norm": 15.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43252977.777777776, "logits/rejected": -28378161.066666666, "logps/chosen": -303.33661566840277, "logps/rejected": -625.6641927083333, "loss": 0.0777, "rewards/chosen": 6.276815626356337, "rewards/margins": 18.42606489393446, "rewards/rejected": -12.149249267578124, "step": 2243 }, { "epoch": 0.5614913048917803, "grad_norm": 0.76953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60453504.0, "logits/rejected": -45960704.0, "logps/chosen": -487.5872395833333, "logps/rejected": -815.9900716145834, "loss": 0.0089, "rewards/chosen": 10.017840067545572, "rewards/margins": 31.05964914957682, "rewards/rejected": -21.04180908203125, "step": 2244 }, { "epoch": 0.5617415238333542, "grad_norm": 5.78125, "kl": 2.5044188499450684, "learning_rate": 5e-06, "logits/chosen": -102826496.0, "logits/rejected": -51530368.0, "logps/chosen": -454.8592998798077, "logps/rejected": -688.3220880681819, "loss": 0.0303, "rewards/chosen": 7.2289252647986775, "rewards/margins": 22.817442887312883, "rewards/rejected": -15.588517622514205, "step": 2245 }, { "epoch": 0.561991742774928, "grad_norm": 19.375, "kl": 17.250429153442383, "learning_rate": 5e-06, "logits/chosen": -52552736.0, "logits/rejected": -47524304.0, "logps/chosen": -346.87109375, "logps/rejected": -444.7727355957031, "loss": 0.1162, "rewards/chosen": 8.528627395629883, "rewards/margins": 18.407371520996094, "rewards/rejected": -9.878744125366211, "step": 2246 }, { "epoch": 0.562241961716502, "grad_norm": 10.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29423392.0, "logits/rejected": -50943181.71428572, "logps/chosen": -287.190234375, "logps/rejected": -511.52828543526783, "loss": 0.0405, "rewards/chosen": 7.7221923828125, "rewards/margins": 21.86156507219587, "rewards/rejected": -14.13937268938337, "step": 2247 }, { "epoch": 0.5624921806580758, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6685648.0, "logits/rejected": -41198904.0, "logps/chosen": -436.6796875, "logps/rejected": -508.891357421875, "loss": 0.0346, "rewards/chosen": 7.142804463704427, "rewards/margins": 18.437917073567707, "rewards/rejected": -11.295112609863281, "step": 2248 }, { "epoch": 0.5627423995996497, "grad_norm": 20.375, "kl": 9.449443817138672, "learning_rate": 5e-06, "logits/chosen": -41812965.333333336, "logits/rejected": -63191936.0, "logps/chosen": -447.0713704427083, "logps/rejected": -592.46240234375, "loss": 0.0395, "rewards/chosen": 10.017625172932943, "rewards/margins": 24.833875020345054, "rewards/rejected": -14.81624984741211, "step": 2249 }, { "epoch": 0.5629926185412236, "grad_norm": 4.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22586656.0, "logits/rejected": -42727008.0, "logps/chosen": -285.0314534505208, "logps/rejected": -650.9852701822916, "loss": 0.0235, "rewards/chosen": 7.072900136311849, "rewards/margins": 23.525263468424477, "rewards/rejected": -16.45236333211263, "step": 2250 }, { "epoch": 0.5632428374827975, "grad_norm": 2.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23110421.333333332, "logits/rejected": -57703594.666666664, "logps/chosen": -288.81858317057294, "logps/rejected": -640.3797607421875, "loss": 0.0522, "rewards/chosen": 7.162334442138672, "rewards/margins": 21.836200714111328, "rewards/rejected": -14.673866271972656, "step": 2251 }, { "epoch": 0.5634930564243713, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64644294.4, "logits/rejected": -3635798.8571428573, "logps/chosen": -406.491259765625, "logps/rejected": -549.751220703125, "loss": 0.0518, "rewards/chosen": 6.840129852294922, "rewards/margins": 20.174532754080637, "rewards/rejected": -13.334402901785714, "step": 2252 }, { "epoch": 0.5637432753659452, "grad_norm": 6.53125, "kl": 2.65033221244812, "learning_rate": 5e-06, "logits/chosen": -57903829.333333336, "logits/rejected": -49826912.0, "logps/chosen": -427.9501139322917, "logps/rejected": -532.7457682291666, "loss": 0.0408, "rewards/chosen": 7.336427052815755, "rewards/margins": 23.527210235595703, "rewards/rejected": -16.19078318277995, "step": 2253 }, { "epoch": 0.5639934943075191, "grad_norm": 129.0, "kl": 1.3301570415496826, "learning_rate": 5e-06, "logits/chosen": -47594416.0, "logits/rejected": 6942874.666666667, "logps/chosen": -466.6420084635417, "logps/rejected": -727.72021484375, "loss": 0.0315, "rewards/chosen": 8.58200454711914, "rewards/margins": 25.605364481608074, "rewards/rejected": -17.023359934488933, "step": 2254 }, { "epoch": 0.564243713249093, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53714150.4, "logits/rejected": -26692571.42857143, "logps/chosen": -344.93427734375, "logps/rejected": -452.2932826450893, "loss": 0.0123, "rewards/chosen": 7.925592041015625, "rewards/margins": 17.71711883544922, "rewards/rejected": -9.791526794433594, "step": 2255 }, { "epoch": 0.5644939321906668, "grad_norm": 12.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44354074.666666664, "logits/rejected": -52597472.0, "logps/chosen": -406.4167887369792, "logps/rejected": -682.864990234375, "loss": 0.0173, "rewards/chosen": 6.738356272379558, "rewards/margins": 22.1699956258138, "rewards/rejected": -15.431639353434244, "step": 2256 }, { "epoch": 0.5647441511322407, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37157949.538461536, "logits/rejected": -53536436.36363637, "logps/chosen": -465.2844801682692, "logps/rejected": -761.7194602272727, "loss": 0.0345, "rewards/chosen": 7.041176429161658, "rewards/margins": 27.200528124829273, "rewards/rejected": -20.159351695667613, "step": 2257 }, { "epoch": 0.5649943700738146, "grad_norm": 2.765625, "kl": 0.17107200622558594, "learning_rate": 5e-06, "logits/chosen": -25873620.0, "logits/rejected": -35479256.0, "logps/chosen": -325.31622314453125, "logps/rejected": -640.5003051757812, "loss": 0.034, "rewards/chosen": 6.871766090393066, "rewards/margins": 24.039748191833496, "rewards/rejected": -17.16798210144043, "step": 2258 }, { "epoch": 0.5652445890153884, "grad_norm": 7.1875, "kl": 4.9744062423706055, "learning_rate": 5e-06, "logits/chosen": -51242870.15384615, "logits/rejected": -67932130.9090909, "logps/chosen": -460.6453200120192, "logps/rejected": -574.5331587357955, "loss": 0.0174, "rewards/chosen": 8.406485924353966, "rewards/margins": 22.717206194684223, "rewards/rejected": -14.310720270330256, "step": 2259 }, { "epoch": 0.5654948079569624, "grad_norm": 11.0625, "kl": 6.214234828948975, "learning_rate": 5e-06, "logits/chosen": -45167150.222222224, "logits/rejected": -43874649.6, "logps/chosen": -401.38330078125, "logps/rejected": -592.7317057291667, "loss": 0.0725, "rewards/chosen": 7.067115359836155, "rewards/margins": 20.638645511203343, "rewards/rejected": -13.571530151367188, "step": 2260 }, { "epoch": 0.5657450268985362, "grad_norm": 21.375, "kl": 13.548751831054688, "learning_rate": 5e-06, "logits/chosen": -63644416.0, "logits/rejected": -60329924.92307692, "logps/chosen": -466.36177201704544, "logps/rejected": -749.6899038461538, "loss": 0.0852, "rewards/chosen": 8.91021728515625, "rewards/margins": 29.64050762469952, "rewards/rejected": -20.73029033954327, "step": 2261 }, { "epoch": 0.5659952458401101, "grad_norm": 13.25, "kl": 0.7518247365951538, "learning_rate": 5e-06, "logits/chosen": -50342486.85714286, "logits/rejected": 3173688.8, "logps/chosen": -325.92124720982144, "logps/rejected": -668.8755859375, "loss": 0.0497, "rewards/chosen": 6.620638166155134, "rewards/margins": 27.647838483537946, "rewards/rejected": -21.027200317382814, "step": 2262 }, { "epoch": 0.566245464781684, "grad_norm": 12.75, "kl": 5.172573089599609, "learning_rate": 5e-06, "logits/chosen": -36157840.0, "logits/rejected": -28679082.0, "logps/chosen": -407.583740234375, "logps/rejected": -578.8125, "loss": 0.0667, "rewards/chosen": 8.070967674255371, "rewards/margins": 19.281394004821777, "rewards/rejected": -11.210426330566406, "step": 2263 }, { "epoch": 0.5664956837232579, "grad_norm": 2.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41992298.666666664, "logits/rejected": -61205274.666666664, "logps/chosen": -416.782958984375, "logps/rejected": -635.7954508463541, "loss": 0.0243, "rewards/chosen": 7.226793924967448, "rewards/margins": 23.551854451497398, "rewards/rejected": -16.32506052652995, "step": 2264 }, { "epoch": 0.5667459026648317, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57836979.2, "logits/rejected": -18393582.222222224, "logps/chosen": -392.0521484375, "logps/rejected": -589.2864583333334, "loss": 0.0579, "rewards/chosen": 6.7623138427734375, "rewards/margins": 22.924292670355904, "rewards/rejected": -16.161978827582466, "step": 2265 }, { "epoch": 0.5669961216064056, "grad_norm": 3.0625, "kl": 0.1782754361629486, "learning_rate": 5e-06, "logits/chosen": -91771520.0, "logits/rejected": -49431142.4, "logps/chosen": -460.9553920200893, "logps/rejected": -653.052294921875, "loss": 0.0075, "rewards/chosen": 7.638459341866629, "rewards/margins": 23.203597586495537, "rewards/rejected": -15.565138244628907, "step": 2266 }, { "epoch": 0.5672463405479795, "grad_norm": 14.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63439189.333333336, "logits/rejected": -64442760.53333333, "logps/chosen": -421.9326171875, "logps/rejected": -575.6893229166667, "loss": 0.0815, "rewards/chosen": 7.174935234917535, "rewards/margins": 21.671965874565974, "rewards/rejected": -14.497030639648438, "step": 2267 }, { "epoch": 0.5674965594895534, "grad_norm": 3.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75037466.66666667, "logits/rejected": -46718848.0, "logps/chosen": -411.4923095703125, "logps/rejected": -543.7148030598959, "loss": 0.0219, "rewards/chosen": 8.324209849039713, "rewards/margins": 22.459842681884766, "rewards/rejected": -14.135632832845053, "step": 2268 }, { "epoch": 0.5677467784311272, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47134817.88235294, "logits/rejected": -71357952.0, "logps/chosen": -385.07482192095586, "logps/rejected": -720.5118582589286, "loss": 0.0249, "rewards/chosen": 7.466412263758042, "rewards/margins": 23.845067833652013, "rewards/rejected": -16.378655569893972, "step": 2269 }, { "epoch": 0.5679969973727012, "grad_norm": 21.625, "kl": 20.086185455322266, "learning_rate": 5e-06, "logits/chosen": -44130812.23529412, "logits/rejected": -54799168.0, "logps/chosen": -421.8890739889706, "logps/rejected": -365.50840541294644, "loss": 0.0685, "rewards/chosen": 9.60176176183364, "rewards/margins": 21.46457870868074, "rewards/rejected": -11.862816946847099, "step": 2270 }, { "epoch": 0.568247216314275, "grad_norm": 2.546875, "kl": 3.88920259475708, "learning_rate": 5e-06, "logits/chosen": -50329141.333333336, "logits/rejected": -29379850.666666668, "logps/chosen": -508.6871744791667, "logps/rejected": -567.8767496744791, "loss": 0.0045, "rewards/chosen": 9.266108194986979, "rewards/margins": 20.555709838867188, "rewards/rejected": -11.289601643880209, "step": 2271 }, { "epoch": 0.5684974352558488, "grad_norm": 12.9375, "kl": 10.461141586303711, "learning_rate": 5e-06, "logits/chosen": -43328233.4117647, "logits/rejected": -50061458.28571428, "logps/chosen": -471.9319278492647, "logps/rejected": -512.3748604910714, "loss": 0.0838, "rewards/chosen": 9.13576103659237, "rewards/margins": 22.897634554310002, "rewards/rejected": -13.761873517717634, "step": 2272 }, { "epoch": 0.5687476541974228, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28839686.4, "logits/rejected": -51925398.85714286, "logps/chosen": -383.85751953125, "logps/rejected": -589.2259347098214, "loss": 0.0453, "rewards/chosen": 6.904610443115234, "rewards/margins": 22.751034000941686, "rewards/rejected": -15.846423557826451, "step": 2273 }, { "epoch": 0.5689978731389966, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42989643.63636363, "logits/rejected": -69651588.92307693, "logps/chosen": -341.5509144176136, "logps/rejected": -743.3466045673077, "loss": 0.0185, "rewards/chosen": 6.822572187943892, "rewards/margins": 26.617094986922258, "rewards/rejected": -19.794522798978367, "step": 2274 }, { "epoch": 0.5692480920805705, "grad_norm": 15.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63942844.0, "logits/rejected": -47546712.0, "logps/chosen": -581.1844482421875, "logps/rejected": -597.9588623046875, "loss": 0.0158, "rewards/chosen": 8.496853828430176, "rewards/margins": 23.93095302581787, "rewards/rejected": -15.434099197387695, "step": 2275 }, { "epoch": 0.5694983110221443, "grad_norm": 13.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30842835.2, "logits/rejected": -49663547.428571425, "logps/chosen": -314.755029296875, "logps/rejected": -655.4777483258929, "loss": 0.0277, "rewards/chosen": 7.239714050292969, "rewards/margins": 24.36540069580078, "rewards/rejected": -17.125686645507812, "step": 2276 }, { "epoch": 0.5697485299637183, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42923501.333333336, "logits/rejected": -69472762.66666667, "logps/chosen": -351.4805501302083, "logps/rejected": -741.1748046875, "loss": 0.0219, "rewards/chosen": 8.200290044148764, "rewards/margins": 28.485682805379234, "rewards/rejected": -20.28539276123047, "step": 2277 }, { "epoch": 0.5699987489052921, "grad_norm": 5.8125, "kl": 8.19485092163086, "learning_rate": 5e-06, "logits/chosen": -75436749.71428572, "logits/rejected": -46003561.6, "logps/chosen": -489.2398158482143, "logps/rejected": -551.798046875, "loss": 0.0475, "rewards/chosen": 8.35165296282087, "rewards/margins": 20.130722481863838, "rewards/rejected": -11.779069519042968, "step": 2278 }, { "epoch": 0.570248967846866, "grad_norm": 4.78125, "kl": 1.142919898033142, "learning_rate": 5e-06, "logits/chosen": -31582481.454545453, "logits/rejected": 16276025.846153846, "logps/chosen": -417.77241654829544, "logps/rejected": -572.8281625600962, "loss": 0.0164, "rewards/chosen": 7.47433818470348, "rewards/margins": 22.97373957067103, "rewards/rejected": -15.499401385967548, "step": 2279 }, { "epoch": 0.5704991867884399, "grad_norm": 17.0, "kl": 0.2522227168083191, "learning_rate": 5e-06, "logits/chosen": -16189352.0, "logits/rejected": -52041101.71428572, "logps/chosen": -393.3282470703125, "logps/rejected": -647.5641741071429, "loss": 0.0367, "rewards/chosen": 7.314389038085937, "rewards/margins": 20.963177490234376, "rewards/rejected": -13.648788452148438, "step": 2280 }, { "epoch": 0.5707494057300138, "grad_norm": 14.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41014466.90909091, "logits/rejected": -60996224.0, "logps/chosen": -366.21147017045456, "logps/rejected": -788.369140625, "loss": 0.0412, "rewards/chosen": 7.448481473055753, "rewards/margins": 29.98891491656537, "rewards/rejected": -22.540433443509617, "step": 2281 }, { "epoch": 0.5709996246715876, "grad_norm": 19.875, "kl": 2.0576376914978027, "learning_rate": 5e-06, "logits/chosen": -53515623.384615384, "logits/rejected": -36472029.09090909, "logps/chosen": -364.4727313701923, "logps/rejected": -479.9509943181818, "loss": 0.031, "rewards/chosen": 7.516398503230168, "rewards/margins": 19.44329225766909, "rewards/rejected": -11.92689375443892, "step": 2282 }, { "epoch": 0.5712498436131616, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52971182.54545455, "logits/rejected": -45969875.692307696, "logps/chosen": -511.8017578125, "logps/rejected": -724.3255709134615, "loss": 0.021, "rewards/chosen": 8.459647438742898, "rewards/margins": 26.759215028135927, "rewards/rejected": -18.29956758939303, "step": 2283 }, { "epoch": 0.5715000625547354, "grad_norm": 0.8046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42200253.09090909, "logits/rejected": -71571209.84615384, "logps/chosen": -429.8780628551136, "logps/rejected": -558.73974609375, "loss": 0.0086, "rewards/chosen": 9.033871737393467, "rewards/margins": 21.062743713805727, "rewards/rejected": -12.02887197641226, "step": 2284 }, { "epoch": 0.5717502814963092, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51366452.36363637, "logits/rejected": -53055330.461538464, "logps/chosen": -336.97853781960225, "logps/rejected": -628.9108323317307, "loss": 0.0652, "rewards/chosen": 7.31946494362571, "rewards/margins": 25.069990784971864, "rewards/rejected": -17.750525841346153, "step": 2285 }, { "epoch": 0.5720005004378832, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33740840.72727273, "logits/rejected": 35637774.76923077, "logps/chosen": -286.12655362215907, "logps/rejected": -729.4314903846154, "loss": 0.0265, "rewards/chosen": 6.3400490500710225, "rewards/margins": 26.374036242078233, "rewards/rejected": -20.03398719200721, "step": 2286 }, { "epoch": 0.572250719379457, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45299958.4, "logits/rejected": -72508598.85714285, "logps/chosen": -401.4803466796875, "logps/rejected": -816.1028180803571, "loss": 0.0221, "rewards/chosen": 7.455699157714844, "rewards/margins": 24.361087036132812, "rewards/rejected": -16.90538787841797, "step": 2287 }, { "epoch": 0.5725009383210309, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47703718.4, "logits/rejected": -26681656.888888888, "logps/chosen": -282.70921223958334, "logps/rejected": -482.5910915798611, "loss": 0.0558, "rewards/chosen": 7.9832616170247395, "rewards/margins": 20.777765570746528, "rewards/rejected": -12.794503953721788, "step": 2288 }, { "epoch": 0.5727511572626047, "grad_norm": 2.234375, "kl": 6.874081611633301, "learning_rate": 5e-06, "logits/chosen": -65971829.333333336, "logits/rejected": -25103488.0, "logps/chosen": -421.4246419270833, "logps/rejected": -391.2778727213542, "loss": 0.0174, "rewards/chosen": 7.569124221801758, "rewards/margins": 19.661526362101235, "rewards/rejected": -12.092402140299479, "step": 2289 }, { "epoch": 0.5730013762041787, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39596844.8, "logits/rejected": -47378532.571428575, "logps/chosen": -339.8724365234375, "logps/rejected": -373.8902064732143, "loss": 0.0323, "rewards/chosen": 6.276803207397461, "rewards/margins": 16.567570332118443, "rewards/rejected": -10.290767124720983, "step": 2290 }, { "epoch": 0.5732515951457525, "grad_norm": 1.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1105492.111111111, "logits/rejected": -67873117.86666666, "logps/chosen": -359.83184136284723, "logps/rejected": -799.3625651041667, "loss": 0.0104, "rewards/chosen": 7.2234242757161455, "rewards/margins": 26.39148457845052, "rewards/rejected": -19.168060302734375, "step": 2291 }, { "epoch": 0.5735018140873264, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19529256.727272727, "logits/rejected": -38796155.07692308, "logps/chosen": -303.0553533380682, "logps/rejected": -492.66639122596155, "loss": 0.0382, "rewards/chosen": 6.486169988458807, "rewards/margins": 18.597635442560367, "rewards/rejected": -12.111465454101562, "step": 2292 }, { "epoch": 0.5737520330289003, "grad_norm": 7.125, "kl": 3.7763266563415527, "learning_rate": 5e-06, "logits/chosen": -40520439.46666667, "logits/rejected": -61952881.777777776, "logps/chosen": -465.9578125, "logps/rejected": -743.494140625, "loss": 0.0479, "rewards/chosen": 9.458258056640625, "rewards/margins": 22.14397447374132, "rewards/rejected": -12.685716417100695, "step": 2293 }, { "epoch": 0.5740022519704742, "grad_norm": 17.375, "kl": 16.234222412109375, "learning_rate": 5e-06, "logits/chosen": -47367781.333333336, "logits/rejected": -46033674.666666664, "logps/chosen": -459.7914225260417, "logps/rejected": -744.4580078125, "loss": 0.0346, "rewards/chosen": 8.892217636108398, "rewards/margins": 26.569321314493816, "rewards/rejected": -17.677103678385418, "step": 2294 }, { "epoch": 0.574252470912048, "grad_norm": 3.78125, "kl": 0.07759666442871094, "learning_rate": 5e-06, "logits/chosen": -39857252.92307692, "logits/rejected": -51110074.18181818, "logps/chosen": -453.6886643629808, "logps/rejected": -593.8200461647727, "loss": 0.0091, "rewards/chosen": 11.505894587590145, "rewards/margins": 27.143361018254208, "rewards/rejected": -15.637466430664062, "step": 2295 }, { "epoch": 0.574502689853622, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35104108.8, "logits/rejected": -26908363.42857143, "logps/chosen": -318.3389404296875, "logps/rejected": -506.01932198660717, "loss": 0.0386, "rewards/chosen": 6.543833923339844, "rewards/margins": 20.49883793422154, "rewards/rejected": -13.955004010881696, "step": 2296 }, { "epoch": 0.5747529087951958, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62856859.428571425, "logits/rejected": -27532281.6, "logps/chosen": -404.59165736607144, "logps/rejected": -509.885595703125, "loss": 0.0498, "rewards/chosen": 7.6663616725376675, "rewards/margins": 20.925429970877513, "rewards/rejected": -13.259068298339844, "step": 2297 }, { "epoch": 0.5750031277367696, "grad_norm": 9.5, "kl": 8.954851150512695, "learning_rate": 5e-06, "logits/chosen": -43359940.266666666, "logits/rejected": -47932316.44444445, "logps/chosen": -369.38837890625, "logps/rejected": -556.41455078125, "loss": 0.0362, "rewards/chosen": 8.315555826822917, "rewards/margins": 22.065664333767362, "rewards/rejected": -13.750108506944445, "step": 2298 }, { "epoch": 0.5752533466783436, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41919448.0, "logits/rejected": -70777680.0, "logps/chosen": -322.4796142578125, "logps/rejected": -495.68450927734375, "loss": 0.0605, "rewards/chosen": 6.857255458831787, "rewards/margins": 17.698379039764404, "rewards/rejected": -10.841123580932617, "step": 2299 }, { "epoch": 0.5755035656199174, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50534951.384615384, "logits/rejected": -25931054.545454547, "logps/chosen": -476.46495643028845, "logps/rejected": -493.37917258522725, "loss": 0.0119, "rewards/chosen": 8.330653850848858, "rewards/margins": 23.061831440958944, "rewards/rejected": -14.731177590110086, "step": 2300 }, { "epoch": 0.5757537845614913, "grad_norm": 10.125, "kl": 2.183321714401245, "learning_rate": 5e-06, "logits/chosen": -71195081.84615384, "logits/rejected": -51092573.09090909, "logps/chosen": -361.46694711538464, "logps/rejected": -881.6136363636364, "loss": 0.0395, "rewards/chosen": 7.034294715294471, "rewards/margins": 26.224754920372597, "rewards/rejected": -19.190460205078125, "step": 2301 }, { "epoch": 0.5760040035030651, "grad_norm": 17.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46676027.733333334, "logits/rejected": -62618652.44444445, "logps/chosen": -360.320703125, "logps/rejected": -529.2957899305555, "loss": 0.0475, "rewards/chosen": 7.676609802246094, "rewards/margins": 21.768238830566407, "rewards/rejected": -14.091629028320312, "step": 2302 }, { "epoch": 0.5762542224446391, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58230199.46666667, "logits/rejected": -50741600.0, "logps/chosen": -337.98089192708335, "logps/rejected": -681.3541666666666, "loss": 0.044, "rewards/chosen": 6.27715098063151, "rewards/margins": 21.290702480740016, "rewards/rejected": -15.013551500108507, "step": 2303 }, { "epoch": 0.5765044413862129, "grad_norm": 5.90625, "kl": 2.960909605026245, "learning_rate": 5e-06, "logits/chosen": -36897605.333333336, "logits/rejected": -62755520.0, "logps/chosen": -510.5487467447917, "logps/rejected": -478.6233723958333, "loss": 0.0182, "rewards/chosen": 10.688741048177084, "rewards/margins": 23.81709416707357, "rewards/rejected": -13.128353118896484, "step": 2304 }, { "epoch": 0.5767546603277868, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56548474.18181818, "logits/rejected": 8557757.538461538, "logps/chosen": -420.0267223011364, "logps/rejected": -586.5718149038462, "loss": 0.024, "rewards/chosen": 9.094587846235795, "rewards/margins": 23.38417853508796, "rewards/rejected": -14.289590688852163, "step": 2305 }, { "epoch": 0.5770048792693607, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25933668.57142857, "logits/rejected": -48031028.705882356, "logps/chosen": -438.46146065848217, "logps/rejected": -676.0977711397059, "loss": 0.0093, "rewards/chosen": 8.443802969796318, "rewards/margins": 24.630044568486575, "rewards/rejected": -16.18624159869026, "step": 2306 }, { "epoch": 0.5772550982109346, "grad_norm": 7.28125, "kl": 20.78204917907715, "learning_rate": 5e-06, "logits/chosen": -36962432.0, "logits/rejected": -68374198.85714285, "logps/chosen": -385.8329216452206, "logps/rejected": -482.29541015625, "loss": 0.1205, "rewards/chosen": 8.741358139935661, "rewards/margins": 23.503920482988114, "rewards/rejected": -14.762562343052455, "step": 2307 }, { "epoch": 0.5775053171525084, "grad_norm": 7.625, "kl": 4.505006313323975, "learning_rate": 5e-06, "logits/chosen": -80094016.0, "logits/rejected": -58483433.14285714, "logps/chosen": -571.523193359375, "logps/rejected": -697.9043666294643, "loss": 0.0124, "rewards/chosen": 11.276296997070313, "rewards/margins": 27.76383492606027, "rewards/rejected": -16.487537928989955, "step": 2308 }, { "epoch": 0.5777555360940824, "grad_norm": 18.0, "kl": 30.700368881225586, "learning_rate": 5e-06, "logits/chosen": -59101469.86666667, "logits/rejected": -41340650.666666664, "logps/chosen": -406.6466796875, "logps/rejected": -592.9188910590278, "loss": 0.0957, "rewards/chosen": 8.333720397949218, "rewards/margins": 23.376862080891925, "rewards/rejected": -15.043141682942709, "step": 2309 }, { "epoch": 0.5780057550356562, "grad_norm": 11.6875, "kl": 1.2578620910644531, "learning_rate": 5e-06, "logits/chosen": -37653874.28571428, "logits/rejected": -37874464.0, "logps/chosen": -328.96561104910717, "logps/rejected": -542.91455078125, "loss": 0.0367, "rewards/chosen": 7.210897718157087, "rewards/margins": 18.00687473842076, "rewards/rejected": -10.795977020263672, "step": 2310 }, { "epoch": 0.57825597397723, "grad_norm": 18.125, "kl": 6.460975646972656, "learning_rate": 5e-06, "logits/chosen": -29729111.466666665, "logits/rejected": -42319658.666666664, "logps/chosen": -358.3953125, "logps/rejected": -772.3819444444445, "loss": 0.0671, "rewards/chosen": 6.726534525553386, "rewards/margins": 20.716919793023003, "rewards/rejected": -13.990385267469618, "step": 2311 }, { "epoch": 0.578506192918804, "grad_norm": 3.390625, "kl": 3.4758002758026123, "learning_rate": 5e-06, "logits/chosen": -65347541.333333336, "logits/rejected": -110578150.4, "logps/chosen": -407.6927083333333, "logps/rejected": -624.1333333333333, "loss": 0.0105, "rewards/chosen": 9.210013495551216, "rewards/margins": 24.197727118598092, "rewards/rejected": -14.987713623046876, "step": 2312 }, { "epoch": 0.5787564118603779, "grad_norm": 23.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54779601.06666667, "logits/rejected": -20419100.444444444, "logps/chosen": -371.87805989583336, "logps/rejected": -575.3224826388889, "loss": 0.0499, "rewards/chosen": 7.25235850016276, "rewards/margins": 21.78782670762804, "rewards/rejected": -14.535468207465279, "step": 2313 }, { "epoch": 0.5790066308019517, "grad_norm": 4.15625, "kl": 1.7258212566375732, "learning_rate": 5e-06, "logits/chosen": -40238919.384615384, "logits/rejected": -17427005.09090909, "logps/chosen": -315.3366135817308, "logps/rejected": -572.0110085227273, "loss": 0.0453, "rewards/chosen": 6.10852285531851, "rewards/margins": 19.383652693741805, "rewards/rejected": -13.275129838423295, "step": 2314 }, { "epoch": 0.5792568497435255, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52799317.333333336, "logits/rejected": -55749427.2, "logps/chosen": -340.44053819444446, "logps/rejected": -631.1590494791667, "loss": 0.0223, "rewards/chosen": 7.548498365614149, "rewards/margins": 22.93874189588759, "rewards/rejected": -15.390243530273438, "step": 2315 }, { "epoch": 0.5795070686850995, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59717447.11111111, "logits/rejected": 80916309.33333333, "logps/chosen": -406.0830078125, "logps/rejected": -557.1138671875, "loss": 0.0606, "rewards/chosen": 7.304797702365452, "rewards/margins": 19.755439588758684, "rewards/rejected": -12.45064188639323, "step": 2316 }, { "epoch": 0.5797572876266733, "grad_norm": 2.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66924160.0, "logits/rejected": -55256029.09090909, "logps/chosen": -370.81531700721155, "logps/rejected": -694.0400834517045, "loss": 0.0179, "rewards/chosen": 7.445796086237981, "rewards/margins": 24.705102560403464, "rewards/rejected": -17.259306474165484, "step": 2317 }, { "epoch": 0.5800075065682472, "grad_norm": 3.921875, "kl": 7.178936004638672, "learning_rate": 5e-06, "logits/chosen": -35259230.11764706, "logits/rejected": -75841558.85714285, "logps/chosen": -348.5132697610294, "logps/rejected": -588.1996372767857, "loss": 0.054, "rewards/chosen": 8.185930139878216, "rewards/margins": 23.521547205307904, "rewards/rejected": -15.335617065429688, "step": 2318 }, { "epoch": 0.5802577255098211, "grad_norm": 4.59375, "kl": 0.5528386831283569, "learning_rate": 5e-06, "logits/chosen": -44329285.81818182, "logits/rejected": -41249750.15384615, "logps/chosen": -385.97745028409093, "logps/rejected": -608.0238131009615, "loss": 0.0389, "rewards/chosen": 7.100744767622515, "rewards/margins": 20.40601210160689, "rewards/rejected": -13.305267333984375, "step": 2319 }, { "epoch": 0.580507944451395, "grad_norm": 17.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39376568.88888889, "logits/rejected": -47785655.46666667, "logps/chosen": -308.1414388020833, "logps/rejected": -603.6977213541667, "loss": 0.0487, "rewards/chosen": 7.80446031358507, "rewards/margins": 22.919541083441842, "rewards/rejected": -15.115080769856771, "step": 2320 }, { "epoch": 0.5807581633929688, "grad_norm": 6.59375, "kl": 2.173941135406494, "learning_rate": 5e-06, "logits/chosen": -35229341.09090909, "logits/rejected": -52741656.615384616, "logps/chosen": -323.38973721590907, "logps/rejected": -391.2898137019231, "loss": 0.0612, "rewards/chosen": 6.656245144930753, "rewards/margins": 15.936848407025103, "rewards/rejected": -9.28060326209435, "step": 2321 }, { "epoch": 0.5810083823345428, "grad_norm": 17.75, "kl": 2.079366683959961, "learning_rate": 5e-06, "logits/chosen": -43393614.76923077, "logits/rejected": -29509021.09090909, "logps/chosen": -425.3112980769231, "logps/rejected": -515.6060014204545, "loss": 0.0665, "rewards/chosen": 7.763340289776142, "rewards/margins": 19.565709841001286, "rewards/rejected": -11.802369551225143, "step": 2322 }, { "epoch": 0.5812586012761166, "grad_norm": 5.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45164361.84615385, "logits/rejected": -53843572.36363637, "logps/chosen": -385.7033503605769, "logps/rejected": -464.95383522727275, "loss": 0.0289, "rewards/chosen": 7.802693293644832, "rewards/margins": 20.309301736471536, "rewards/rejected": -12.506608442826705, "step": 2323 }, { "epoch": 0.5815088202176905, "grad_norm": 6.5625, "kl": 2.5723178386688232, "learning_rate": 5e-06, "logits/chosen": -80881424.0, "logits/rejected": -38137626.666666664, "logps/chosen": -407.3980305989583, "logps/rejected": -510.5245768229167, "loss": 0.0256, "rewards/chosen": 8.806017557779947, "rewards/margins": 21.405681610107422, "rewards/rejected": -12.599664052327475, "step": 2324 }, { "epoch": 0.5817590391592643, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51974768.0, "logits/rejected": -51535410.28571428, "logps/chosen": -494.88447265625, "logps/rejected": -586.1663643973214, "loss": 0.0165, "rewards/chosen": 7.506608581542968, "rewards/margins": 22.468240356445314, "rewards/rejected": -14.961631774902344, "step": 2325 }, { "epoch": 0.5820092581008383, "grad_norm": 4.09375, "kl": 3.0930585861206055, "learning_rate": 5e-06, "logits/chosen": -80370515.6923077, "logits/rejected": -49963973.81818182, "logps/chosen": -444.57192758413464, "logps/rejected": -448.8917791193182, "loss": 0.0144, "rewards/chosen": 9.81182626577524, "rewards/margins": 20.518366460199957, "rewards/rejected": -10.706540194424717, "step": 2326 }, { "epoch": 0.5822594770424121, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34812373.333333336, "logits/rejected": -40465093.333333336, "logps/chosen": -284.8806559244792, "logps/rejected": -546.6531575520834, "loss": 0.0501, "rewards/chosen": 6.006937662760417, "rewards/margins": 16.237061818440754, "rewards/rejected": -10.230124155680338, "step": 2327 }, { "epoch": 0.5825096959839859, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28590181.333333332, "logits/rejected": -31854685.333333332, "logps/chosen": -383.0777180989583, "logps/rejected": -689.6525065104166, "loss": 0.0474, "rewards/chosen": 9.496591567993164, "rewards/margins": 22.173229853312172, "rewards/rejected": -12.67663828531901, "step": 2328 }, { "epoch": 0.5827599149255599, "grad_norm": 2.578125, "kl": 2.7513957023620605, "learning_rate": 5e-06, "logits/chosen": -40206442.666666664, "logits/rejected": -31748517.333333332, "logps/chosen": -459.46099175347223, "logps/rejected": -444.5061442057292, "loss": 0.0025, "rewards/chosen": 8.752086215549046, "rewards/margins": 21.083858066134983, "rewards/rejected": -12.331771850585938, "step": 2329 }, { "epoch": 0.5830101338671337, "grad_norm": 14.75, "kl": 1.5212924480438232, "learning_rate": 5e-06, "logits/chosen": -31102740.363636363, "logits/rejected": -26216091.076923076, "logps/chosen": -429.8864080255682, "logps/rejected": -406.771484375, "loss": 0.0801, "rewards/chosen": 8.48611103404652, "rewards/margins": 17.807268636209983, "rewards/rejected": -9.321157602163462, "step": 2330 }, { "epoch": 0.5832603528087076, "grad_norm": 10.8125, "kl": 8.701894760131836, "learning_rate": 5e-06, "logits/chosen": -61887707.428571425, "logits/rejected": -63169836.8, "logps/chosen": -467.74857003348217, "logps/rejected": -619.932275390625, "loss": 0.0271, "rewards/chosen": 10.399968828473773, "rewards/margins": 24.550969805036274, "rewards/rejected": -14.1510009765625, "step": 2331 }, { "epoch": 0.5835105717502815, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50183545.6, "logits/rejected": -47917339.428571425, "logps/chosen": -295.367578125, "logps/rejected": -619.44189453125, "loss": 0.0254, "rewards/chosen": 6.958013916015625, "rewards/margins": 21.994104221888954, "rewards/rejected": -15.036090305873326, "step": 2332 }, { "epoch": 0.5837607906918554, "grad_norm": 6.15625, "kl": 1.5171712636947632, "learning_rate": 5e-06, "logits/chosen": -29808851.692307692, "logits/rejected": -29682260.363636363, "logps/chosen": -252.19031700721155, "logps/rejected": -423.02534623579544, "loss": 0.055, "rewards/chosen": 5.100712702824519, "rewards/margins": 13.521801875187801, "rewards/rejected": -8.421089172363281, "step": 2333 }, { "epoch": 0.5840110096334292, "grad_norm": 5.84375, "kl": 0.5418338775634766, "learning_rate": 5e-06, "logits/chosen": -28300811.636363637, "logits/rejected": -21666141.53846154, "logps/chosen": -236.75379527698863, "logps/rejected": -626.2431640625, "loss": 0.0249, "rewards/chosen": 7.111954428932884, "rewards/margins": 18.212664784251395, "rewards/rejected": -11.10071035531851, "step": 2334 }, { "epoch": 0.5842612285750032, "grad_norm": 8.75, "kl": 4.8423614501953125, "learning_rate": 5e-06, "logits/chosen": -32823321.14285714, "logits/rejected": -67425433.6, "logps/chosen": -401.18324497767856, "logps/rejected": -653.69970703125, "loss": 0.0724, "rewards/chosen": 8.084582192557198, "rewards/margins": 19.8562504359654, "rewards/rejected": -11.771668243408204, "step": 2335 }, { "epoch": 0.584511447516577, "grad_norm": 6.15625, "kl": 1.9281806945800781, "learning_rate": 5e-06, "logits/chosen": -50715002.666666664, "logits/rejected": -47137184.0, "logps/chosen": -372.6903889973958, "logps/rejected": -650.6675618489584, "loss": 0.0067, "rewards/chosen": 8.786565144856771, "rewards/margins": 19.775737762451172, "rewards/rejected": -10.9891726175944, "step": 2336 }, { "epoch": 0.5847616664581509, "grad_norm": 6.0, "kl": 0.9779180288314819, "learning_rate": 5e-06, "logits/chosen": -39393252.571428575, "logits/rejected": -16356969.6, "logps/chosen": -270.75538853236606, "logps/rejected": -515.21845703125, "loss": 0.0353, "rewards/chosen": 6.694460187639509, "rewards/margins": 16.414755902971542, "rewards/rejected": -9.720295715332032, "step": 2337 }, { "epoch": 0.5850118853997247, "grad_norm": 7.1875, "kl": 15.422025680541992, "learning_rate": 5e-06, "logits/chosen": -52898892.8, "logits/rejected": -54560483.55555555, "logps/chosen": -437.89283854166666, "logps/rejected": -632.8269856770834, "loss": 0.0295, "rewards/chosen": 10.498996988932292, "rewards/margins": 24.237966579861112, "rewards/rejected": -13.73896959092882, "step": 2338 }, { "epoch": 0.5852621043412987, "grad_norm": 6.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58453056.0, "logits/rejected": -53630117.333333336, "logps/chosen": -236.92940266927084, "logps/rejected": -625.7169189453125, "loss": 0.0986, "rewards/chosen": 4.669447898864746, "rewards/margins": 18.24466609954834, "rewards/rejected": -13.575218200683594, "step": 2339 }, { "epoch": 0.5855123232828725, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49086752.0, "logits/rejected": -32979685.333333332, "logps/chosen": -353.5267740885417, "logps/rejected": -514.0420735677084, "loss": 0.0327, "rewards/chosen": 7.4302927652994795, "rewards/margins": 21.044390360514324, "rewards/rejected": -13.614097595214844, "step": 2340 }, { "epoch": 0.5857625422244463, "grad_norm": 3.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73090597.33333333, "logits/rejected": -22302168.0, "logps/chosen": -459.6234537760417, "logps/rejected": -480.1915690104167, "loss": 0.0038, "rewards/chosen": 11.436658223470053, "rewards/margins": 24.05647913614909, "rewards/rejected": -12.619820912679037, "step": 2341 }, { "epoch": 0.5860127611660203, "grad_norm": 12.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74611878.4, "logits/rejected": 14204768.0, "logps/chosen": -425.77529296875, "logps/rejected": -620.8494698660714, "loss": 0.0419, "rewards/chosen": 8.55035858154297, "rewards/margins": 19.32414267403739, "rewards/rejected": -10.77378409249442, "step": 2342 }, { "epoch": 0.5862629801075941, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31113291.636363637, "logits/rejected": -49016083.692307696, "logps/chosen": -299.64186789772725, "logps/rejected": -457.4909855769231, "loss": 0.0478, "rewards/chosen": 6.267315951260653, "rewards/margins": 17.4171079622282, "rewards/rejected": -11.149792010967548, "step": 2343 }, { "epoch": 0.586513199049168, "grad_norm": 2.59375, "kl": 5.915771007537842, "learning_rate": 5e-06, "logits/chosen": -67885033.14285715, "logits/rejected": 34766118.4, "logps/chosen": -475.1763392857143, "logps/rejected": -405.063427734375, "loss": 0.0066, "rewards/chosen": 10.952953883579799, "rewards/margins": 22.23340508597238, "rewards/rejected": -11.280451202392578, "step": 2344 }, { "epoch": 0.5867634179907419, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 23058416.0, "logits/rejected": -53011131.07692308, "logps/chosen": -438.82572798295456, "logps/rejected": -582.2438025841346, "loss": 0.0151, "rewards/chosen": 7.608842329545454, "rewards/margins": 22.095529622964925, "rewards/rejected": -14.486687293419472, "step": 2345 }, { "epoch": 0.5870136369323158, "grad_norm": 1.5, "kl": 2.4681575298309326, "learning_rate": 5e-06, "logits/chosen": -49283852.8, "logits/rejected": -51654646.85714286, "logps/chosen": -380.2482421875, "logps/rejected": -551.385009765625, "loss": 0.0168, "rewards/chosen": 8.080470275878906, "rewards/margins": 24.979332842145645, "rewards/rejected": -16.89886256626674, "step": 2346 }, { "epoch": 0.5872638558738896, "grad_norm": 6.84375, "kl": 8.959607124328613, "learning_rate": 5e-06, "logits/chosen": -38703020.307692304, "logits/rejected": -45666970.18181818, "logps/chosen": -330.8844651442308, "logps/rejected": -463.72567471590907, "loss": 0.0403, "rewards/chosen": 7.360662020169771, "rewards/margins": 20.19087331278341, "rewards/rejected": -12.830211292613637, "step": 2347 }, { "epoch": 0.5875140748154636, "grad_norm": 6.75, "kl": 7.949047088623047, "learning_rate": 5e-06, "logits/chosen": -55599172.92307692, "logits/rejected": -25030033.454545453, "logps/chosen": -332.3239933894231, "logps/rejected": -586.4134854403409, "loss": 0.0662, "rewards/chosen": 8.06453646146334, "rewards/margins": 21.62874683300098, "rewards/rejected": -13.564210371537643, "step": 2348 }, { "epoch": 0.5877642937570374, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49726147.2, "logits/rejected": -54683328.0, "logps/chosen": -477.727099609375, "logps/rejected": -651.9635881696429, "loss": 0.0488, "rewards/chosen": 8.894602203369141, "rewards/margins": 23.578703853062223, "rewards/rejected": -14.68410164969308, "step": 2349 }, { "epoch": 0.5880145126986113, "grad_norm": 3.828125, "kl": 6.803206443786621, "learning_rate": 5e-06, "logits/chosen": -79640055.46666667, "logits/rejected": -17586549.333333332, "logps/chosen": -457.3422526041667, "logps/rejected": -530.5589192708334, "loss": 0.043, "rewards/chosen": 8.495550028483073, "rewards/margins": 21.1801262749566, "rewards/rejected": -12.684576246473524, "step": 2350 }, { "epoch": 0.5882647316401851, "grad_norm": 3.03125, "kl": 2.3641486167907715, "learning_rate": 5e-06, "logits/chosen": -54434888.53333333, "logits/rejected": -30501553.777777776, "logps/chosen": -345.55244140625, "logps/rejected": -662.8038736979166, "loss": 0.0385, "rewards/chosen": 6.620048014322917, "rewards/margins": 22.77589619954427, "rewards/rejected": -16.155848185221355, "step": 2351 }, { "epoch": 0.5885149505817591, "grad_norm": 3.53125, "kl": 8.91375732421875, "learning_rate": 5e-06, "logits/chosen": -86271360.0, "logits/rejected": -17100435.2, "logps/chosen": -415.5478515625, "logps/rejected": -400.473583984375, "loss": 0.0471, "rewards/chosen": 8.089793613978795, "rewards/margins": 15.29258520943778, "rewards/rejected": -7.202791595458985, "step": 2352 }, { "epoch": 0.5887651695233329, "grad_norm": 2.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40121428.0, "logits/rejected": -38563424.0, "logps/chosen": -412.6603698730469, "logps/rejected": -589.955322265625, "loss": 0.0065, "rewards/chosen": 6.609074115753174, "rewards/margins": 20.483824253082275, "rewards/rejected": -13.874750137329102, "step": 2353 }, { "epoch": 0.5890153884649068, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24293006.222222224, "logits/rejected": -49893504.0, "logps/chosen": -466.72081163194446, "logps/rejected": -831.2415364583334, "loss": 0.0148, "rewards/chosen": 7.70664299858941, "rewards/margins": 28.887809583875868, "rewards/rejected": -21.18116658528646, "step": 2354 }, { "epoch": 0.5892656074064807, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51724388.571428575, "logits/rejected": -28558041.6, "logps/chosen": -245.27549525669642, "logps/rejected": -560.195556640625, "loss": 0.0746, "rewards/chosen": 5.0652618408203125, "rewards/margins": 21.977728271484374, "rewards/rejected": -16.91246643066406, "step": 2355 }, { "epoch": 0.5895158263480546, "grad_norm": 14.8125, "kl": 5.259764671325684, "learning_rate": 5e-06, "logits/chosen": -40742892.8, "logits/rejected": -36664939.428571425, "logps/chosen": -388.2680419921875, "logps/rejected": -471.5330287388393, "loss": 0.0514, "rewards/chosen": 7.806633758544922, "rewards/margins": 19.28615537370954, "rewards/rejected": -11.47952161516462, "step": 2356 }, { "epoch": 0.5897660452896284, "grad_norm": 19.75, "kl": 5.268315315246582, "learning_rate": 5e-06, "logits/chosen": -51075328.0, "logits/rejected": -32064652.8, "logps/chosen": -437.79136439732144, "logps/rejected": -598.233984375, "loss": 0.0429, "rewards/chosen": 8.469888959612165, "rewards/margins": 19.467304120744977, "rewards/rejected": -10.997415161132812, "step": 2357 }, { "epoch": 0.5900162642312023, "grad_norm": 4.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61702896.0, "logits/rejected": -20033510.0, "logps/chosen": -521.8365478515625, "logps/rejected": -794.1048583984375, "loss": 0.0163, "rewards/chosen": 8.349980354309082, "rewards/margins": 27.56750202178955, "rewards/rejected": -19.21752166748047, "step": 2358 }, { "epoch": 0.5902664831727762, "grad_norm": 3.71875, "kl": 2.400516986846924, "learning_rate": 5e-06, "logits/chosen": -47857424.0, "logits/rejected": -90854368.0, "logps/chosen": -490.83148193359375, "logps/rejected": -583.340087890625, "loss": 0.0251, "rewards/chosen": 9.487022399902344, "rewards/margins": 23.997264862060547, "rewards/rejected": -14.510242462158203, "step": 2359 }, { "epoch": 0.59051670211435, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21943089.777777776, "logits/rejected": -47499844.266666666, "logps/chosen": -199.89720323350696, "logps/rejected": -712.91171875, "loss": 0.0437, "rewards/chosen": 4.608177608913845, "rewards/margins": 24.00356742011176, "rewards/rejected": -19.395389811197916, "step": 2360 }, { "epoch": 0.590766921055924, "grad_norm": 1.4296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18764491.636363637, "logits/rejected": -74306215.38461539, "logps/chosen": -348.9466441761364, "logps/rejected": -771.5981069711538, "loss": 0.0018, "rewards/chosen": 7.720674688165838, "rewards/margins": 28.64992085703603, "rewards/rejected": -20.929246168870193, "step": 2361 }, { "epoch": 0.5910171399974978, "grad_norm": 14.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22710183.111111112, "logits/rejected": -46495360.0, "logps/chosen": -358.9510091145833, "logps/rejected": -542.7887369791666, "loss": 0.0223, "rewards/chosen": 9.019293891059029, "rewards/margins": 22.741844346788195, "rewards/rejected": -13.722550455729166, "step": 2362 }, { "epoch": 0.5912673589390717, "grad_norm": 3.890625, "kl": 1.390639066696167, "learning_rate": 5e-06, "logits/chosen": -35027630.93333333, "logits/rejected": -34755271.11111111, "logps/chosen": -402.98932291666665, "logps/rejected": -619.4079861111111, "loss": 0.0184, "rewards/chosen": 7.972222900390625, "rewards/margins": 21.53596649169922, "rewards/rejected": -13.563743591308594, "step": 2363 }, { "epoch": 0.5915175778806455, "grad_norm": 11.4375, "kl": 0.1906595230102539, "learning_rate": 5e-06, "logits/chosen": -22757603.2, "logits/rejected": -10655867.42857143, "logps/chosen": -391.236669921875, "logps/rejected": -559.3989955357143, "loss": 0.0304, "rewards/chosen": 6.823162841796875, "rewards/margins": 21.849928937639508, "rewards/rejected": -15.026766095842634, "step": 2364 }, { "epoch": 0.5917677968222195, "grad_norm": 14.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36494300.8, "logits/rejected": -76514889.14285715, "logps/chosen": -411.500634765625, "logps/rejected": -566.7976771763393, "loss": 0.049, "rewards/chosen": 8.659796142578125, "rewards/margins": 21.45975559779576, "rewards/rejected": -12.799959455217634, "step": 2365 }, { "epoch": 0.5920180157637933, "grad_norm": 1.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37681920.0, "logits/rejected": -44096776.53333333, "logps/chosen": -393.09478081597223, "logps/rejected": -580.926953125, "loss": 0.0116, "rewards/chosen": 9.123224046495226, "rewards/margins": 24.13021664089627, "rewards/rejected": -15.006992594401042, "step": 2366 }, { "epoch": 0.5922682347053672, "grad_norm": 20.625, "kl": 13.989477157592773, "learning_rate": 5e-06, "logits/chosen": -43417972.0, "logits/rejected": -39127656.0, "logps/chosen": -448.8272705078125, "logps/rejected": -435.0310363769531, "loss": 0.0635, "rewards/chosen": 9.904816627502441, "rewards/margins": 17.954041481018066, "rewards/rejected": -8.049224853515625, "step": 2367 }, { "epoch": 0.5925184536469411, "grad_norm": 21.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39725589.333333336, "logits/rejected": -29751944.533333335, "logps/chosen": -309.08558485243054, "logps/rejected": -485.8118489583333, "loss": 0.0553, "rewards/chosen": 8.301888359917534, "rewards/margins": 20.836744520399307, "rewards/rejected": -12.534856160481771, "step": 2368 }, { "epoch": 0.592768672588515, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67594128.0, "logits/rejected": -26614237.333333332, "logps/chosen": -439.4596354166667, "logps/rejected": -446.2477213541667, "loss": 0.0141, "rewards/chosen": 8.851855595906576, "rewards/margins": 19.0233097076416, "rewards/rejected": -10.171454111735025, "step": 2369 }, { "epoch": 0.5930188915300888, "grad_norm": 3.640625, "kl": 13.0173978805542, "learning_rate": 5e-06, "logits/chosen": -70688616.72727273, "logits/rejected": -30222365.53846154, "logps/chosen": -463.54243607954544, "logps/rejected": -449.85802283653845, "loss": 0.0409, "rewards/chosen": 10.731794877485795, "rewards/margins": 21.18584714235959, "rewards/rejected": -10.454052264873798, "step": 2370 }, { "epoch": 0.5932691104716628, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47644742.4, "logits/rejected": -49471588.571428575, "logps/chosen": -453.912744140625, "logps/rejected": -553.0492815290179, "loss": 0.0513, "rewards/chosen": 9.097515869140626, "rewards/margins": 20.913455636160712, "rewards/rejected": -11.815939767020089, "step": 2371 }, { "epoch": 0.5935193294132366, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87971066.66666667, "logits/rejected": -54626704.0, "logps/chosen": -413.75537109375, "logps/rejected": -801.7137044270834, "loss": 0.0108, "rewards/chosen": 9.622482299804688, "rewards/margins": 27.10281244913737, "rewards/rejected": -17.480330149332683, "step": 2372 }, { "epoch": 0.5937695483548104, "grad_norm": 12.625, "kl": 8.559420585632324, "learning_rate": 5e-06, "logits/chosen": -70359904.0, "logits/rejected": -11195393.333333334, "logps/chosen": -462.3204345703125, "logps/rejected": -881.8788248697916, "loss": 0.0107, "rewards/chosen": 9.035934448242188, "rewards/margins": 28.72942606608073, "rewards/rejected": -19.693491617838543, "step": 2373 }, { "epoch": 0.5940197672963843, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53238970.666666664, "logits/rejected": -64142517.333333336, "logps/chosen": -408.2602132161458, "logps/rejected": -722.5432942708334, "loss": 0.0467, "rewards/chosen": 6.665255228678386, "rewards/margins": 24.710252126057945, "rewards/rejected": -18.04499689737956, "step": 2374 }, { "epoch": 0.5942699862379582, "grad_norm": 1.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46869397.333333336, "logits/rejected": -48368053.333333336, "logps/chosen": -386.7445882161458, "logps/rejected": -465.9615885416667, "loss": 0.017, "rewards/chosen": 7.617603302001953, "rewards/margins": 22.618732452392578, "rewards/rejected": -15.001129150390625, "step": 2375 }, { "epoch": 0.5945202051795321, "grad_norm": 12.0625, "kl": 6.916736125946045, "learning_rate": 5e-06, "logits/chosen": -60444125.86666667, "logits/rejected": -41100003.55555555, "logps/chosen": -356.05865885416665, "logps/rejected": -420.3400607638889, "loss": 0.0793, "rewards/chosen": 8.587648518880208, "rewards/margins": 19.842449951171872, "rewards/rejected": -11.254801432291666, "step": 2376 }, { "epoch": 0.5947704241211059, "grad_norm": 3.78125, "kl": 9.687932014465332, "learning_rate": 5e-06, "logits/chosen": -42852541.09090909, "logits/rejected": -48107421.538461536, "logps/chosen": -390.78657670454544, "logps/rejected": -530.0025916466346, "loss": 0.0365, "rewards/chosen": 10.144217057661576, "rewards/margins": 24.942312760786578, "rewards/rejected": -14.798095703125, "step": 2377 }, { "epoch": 0.5950206430626799, "grad_norm": 1.7421875, "kl": 9.12623405456543, "learning_rate": 5e-06, "logits/chosen": -41042037.333333336, "logits/rejected": -53942768.0, "logps/chosen": -491.999755859375, "logps/rejected": -519.3555908203125, "loss": 0.046, "rewards/chosen": 9.226099650065104, "rewards/margins": 23.725377400716145, "rewards/rejected": -14.499277750651041, "step": 2378 }, { "epoch": 0.5952708620042537, "grad_norm": 13.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34590809.6, "logits/rejected": -27609572.57142857, "logps/chosen": -366.021240234375, "logps/rejected": -540.4691685267857, "loss": 0.0432, "rewards/chosen": 8.072207641601562, "rewards/margins": 20.365573120117187, "rewards/rejected": -12.293365478515625, "step": 2379 }, { "epoch": 0.5955210809458276, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34217766.4, "logits/rejected": -34494971.428571425, "logps/chosen": -384.1599609375, "logps/rejected": -610.5228097098214, "loss": 0.0159, "rewards/chosen": 8.138725280761719, "rewards/margins": 21.475396292550222, "rewards/rejected": -13.336671011788505, "step": 2380 }, { "epoch": 0.5957712998874015, "grad_norm": 9.875, "kl": 3.3857040405273438, "learning_rate": 5e-06, "logits/chosen": -47664741.333333336, "logits/rejected": -78805706.66666667, "logps/chosen": -278.1958821614583, "logps/rejected": -652.6418863932291, "loss": 0.0346, "rewards/chosen": 7.988106409708659, "rewards/margins": 21.763859430948894, "rewards/rejected": -13.775753021240234, "step": 2381 }, { "epoch": 0.5960215188289754, "grad_norm": 7.15625, "kl": 3.2921533584594727, "learning_rate": 5e-06, "logits/chosen": -62678441.14285714, "logits/rejected": -67503417.6, "logps/chosen": -427.75927734375, "logps/rejected": -502.67421875, "loss": 0.0266, "rewards/chosen": 8.397334507533483, "rewards/margins": 24.30509970528739, "rewards/rejected": -15.907765197753907, "step": 2382 }, { "epoch": 0.5962717377705492, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31261051.076923076, "logits/rejected": 53720802.90909091, "logps/chosen": -316.45274939903845, "logps/rejected": -548.5925958806819, "loss": 0.0436, "rewards/chosen": 6.960424569936899, "rewards/margins": 20.04407186441488, "rewards/rejected": -13.083647294477982, "step": 2383 }, { "epoch": 0.5965219567121232, "grad_norm": 8.6875, "kl": 5.020663261413574, "learning_rate": 5e-06, "logits/chosen": -44647310.54545455, "logits/rejected": -43354958.76923077, "logps/chosen": -408.08935546875, "logps/rejected": -521.2127028245193, "loss": 0.0428, "rewards/chosen": 9.437430641867898, "rewards/margins": 22.75570550665155, "rewards/rejected": -13.318274864783653, "step": 2384 }, { "epoch": 0.596772175653697, "grad_norm": 16.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37436140.8, "logits/rejected": -44108365.71428572, "logps/chosen": -484.88544921875, "logps/rejected": -676.9610072544643, "loss": 0.0227, "rewards/chosen": 7.37677001953125, "rewards/margins": 21.942423139299663, "rewards/rejected": -14.565653119768415, "step": 2385 }, { "epoch": 0.5970223945952708, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25179603.2, "logits/rejected": -53577717.89473684, "logps/chosen": -253.9442626953125, "logps/rejected": -580.9646381578947, "loss": 0.0242, "rewards/chosen": 4.778381729125977, "rewards/margins": 18.633599913747688, "rewards/rejected": -13.85521818462171, "step": 2386 }, { "epoch": 0.5972726135368447, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39877098.666666664, "logits/rejected": -747470.6666666666, "logps/chosen": -348.7401529947917, "logps/rejected": -670.2941487630209, "loss": 0.0532, "rewards/chosen": 7.556076685587565, "rewards/margins": 21.446739832560223, "rewards/rejected": -13.890663146972656, "step": 2387 }, { "epoch": 0.5975228324784186, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45286792.72727273, "logits/rejected": -34509961.84615385, "logps/chosen": -624.4852627840909, "logps/rejected": -654.6381460336538, "loss": 0.0073, "rewards/chosen": 9.596846147017045, "rewards/margins": 25.538557572798297, "rewards/rejected": -15.94171142578125, "step": 2388 }, { "epoch": 0.5977730514199925, "grad_norm": 5.46875, "kl": 3.6199450492858887, "learning_rate": 5e-06, "logits/chosen": -27591977.14285714, "logits/rejected": -43189808.941176474, "logps/chosen": -314.359375, "logps/rejected": -548.7756204044117, "loss": 0.0468, "rewards/chosen": 6.42853764125279, "rewards/margins": 17.902344583463268, "rewards/rejected": -11.473806942210478, "step": 2389 }, { "epoch": 0.5980232703615663, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54329068.307692304, "logits/rejected": -54851037.09090909, "logps/chosen": -318.0179912860577, "logps/rejected": -440.83447265625, "loss": 0.0246, "rewards/chosen": 6.879795367901142, "rewards/margins": 16.53691373171506, "rewards/rejected": -9.65711836381392, "step": 2390 }, { "epoch": 0.5982734893031403, "grad_norm": 3.546875, "kl": 5.364129543304443, "learning_rate": 5e-06, "logits/chosen": -59773737.14285714, "logits/rejected": -85714598.4, "logps/chosen": -403.46732003348217, "logps/rejected": -550.05908203125, "loss": 0.0119, "rewards/chosen": 8.814078194754464, "rewards/margins": 23.40273938860212, "rewards/rejected": -14.588661193847656, "step": 2391 }, { "epoch": 0.5985237082447141, "grad_norm": 1.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52597987.55555555, "logits/rejected": -42430549.333333336, "logps/chosen": -376.4436306423611, "logps/rejected": -476.2775065104167, "loss": 0.0149, "rewards/chosen": 7.1332346598307295, "rewards/margins": 20.005042521158856, "rewards/rejected": -12.871807861328126, "step": 2392 }, { "epoch": 0.598773927186288, "grad_norm": 6.65625, "kl": 11.070616722106934, "learning_rate": 5e-06, "logits/chosen": -42674517.333333336, "logits/rejected": -38052237.333333336, "logps/chosen": -274.5467122395833, "logps/rejected": -583.68212890625, "loss": 0.0519, "rewards/chosen": 7.428246180216472, "rewards/margins": 20.249821345011394, "rewards/rejected": -12.821575164794922, "step": 2393 }, { "epoch": 0.5990241461278619, "grad_norm": 2.515625, "kl": 1.195264220237732, "learning_rate": 5e-06, "logits/chosen": -62449211.733333334, "logits/rejected": -65416832.0, "logps/chosen": -374.1488932291667, "logps/rejected": -691.6035698784722, "loss": 0.0362, "rewards/chosen": 8.672393798828125, "rewards/margins": 26.20056660970052, "rewards/rejected": -17.528172810872395, "step": 2394 }, { "epoch": 0.5992743650694358, "grad_norm": 2.609375, "kl": 0.2982266843318939, "learning_rate": 5e-06, "logits/chosen": -34465389.333333336, "logits/rejected": -37882362.666666664, "logps/chosen": -298.3574625651042, "logps/rejected": -399.8765869140625, "loss": 0.086, "rewards/chosen": 6.713270823160808, "rewards/margins": 15.534657796223957, "rewards/rejected": -8.82138697306315, "step": 2395 }, { "epoch": 0.5995245840110096, "grad_norm": 9.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66737117.86666667, "logits/rejected": -82213269.33333333, "logps/chosen": -421.7874348958333, "logps/rejected": -645.4722222222222, "loss": 0.0278, "rewards/chosen": 9.091615804036458, "rewards/margins": 25.339442274305554, "rewards/rejected": -16.247826470269096, "step": 2396 }, { "epoch": 0.5997748029525836, "grad_norm": 12.0625, "kl": 1.8417282104492188, "learning_rate": 5e-06, "logits/chosen": -50820676.92307692, "logits/rejected": -87421410.9090909, "logps/chosen": -421.36147836538464, "logps/rejected": -442.84969815340907, "loss": 0.062, "rewards/chosen": 7.634193420410156, "rewards/margins": 19.366353121670812, "rewards/rejected": -11.732159701260654, "step": 2397 }, { "epoch": 0.6000250218941574, "grad_norm": 2.3125, "kl": 11.575826644897461, "learning_rate": 5e-06, "logits/chosen": -38024016.0, "logits/rejected": -21999800.0, "logps/chosen": -330.56685965401783, "logps/rejected": -492.84580078125, "loss": 0.0302, "rewards/chosen": 7.060295104980469, "rewards/margins": 21.7000732421875, "rewards/rejected": -14.639778137207031, "step": 2398 }, { "epoch": 0.6002752408357312, "grad_norm": 2.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55031305.84615385, "logits/rejected": -47352904.72727273, "logps/chosen": -364.91323617788464, "logps/rejected": -586.2329545454545, "loss": 0.0151, "rewards/chosen": 7.9844231238731975, "rewards/margins": 23.317619590492516, "rewards/rejected": -15.333196466619318, "step": 2399 }, { "epoch": 0.6005254597773051, "grad_norm": 7.375, "kl": 4.64101505279541, "learning_rate": 5e-06, "logits/chosen": -35348868.266666666, "logits/rejected": -19450042.666666668, "logps/chosen": -428.7259765625, "logps/rejected": -307.71869574652777, "loss": 0.0202, "rewards/chosen": 8.424148559570312, "rewards/margins": 16.440743340386284, "rewards/rejected": -8.016594780815971, "step": 2400 }, { "epoch": 0.600775678718879, "grad_norm": 21.0, "kl": 23.017948150634766, "learning_rate": 5e-06, "logits/chosen": -55597296.0, "logits/rejected": -64285008.0, "logps/chosen": -445.5265808105469, "logps/rejected": -633.8148193359375, "loss": 0.1226, "rewards/chosen": 7.5456647872924805, "rewards/margins": 21.292704582214355, "rewards/rejected": -13.747039794921875, "step": 2401 }, { "epoch": 0.6010258976604529, "grad_norm": 1.71875, "kl": 0.34704622626304626, "learning_rate": 5e-06, "logits/chosen": -50560704.0, "logits/rejected": -22315698.285714287, "logps/chosen": -459.680126953125, "logps/rejected": -848.0197405133929, "loss": 0.029, "rewards/chosen": 8.405497741699218, "rewards/margins": 27.843612452915735, "rewards/rejected": -19.438114711216517, "step": 2402 }, { "epoch": 0.6012761166020267, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52865641.14285714, "logits/rejected": -29246608.94117647, "logps/chosen": -368.95424107142856, "logps/rejected": -536.3024471507352, "loss": 0.0285, "rewards/chosen": 6.893147059849331, "rewards/margins": 21.018906408999147, "rewards/rejected": -14.125759349149817, "step": 2403 }, { "epoch": 0.6015263355436007, "grad_norm": 4.25, "kl": 2.937960386276245, "learning_rate": 5e-06, "logits/chosen": -56956458.666666664, "logits/rejected": -33671740.44444445, "logps/chosen": -348.36982421875, "logps/rejected": -599.9561631944445, "loss": 0.0322, "rewards/chosen": 7.268742370605469, "rewards/margins": 23.20854983859592, "rewards/rejected": -15.939807467990452, "step": 2404 }, { "epoch": 0.6017765544851745, "grad_norm": 8.5, "kl": 4.8682684898376465, "learning_rate": 5e-06, "logits/chosen": -48482880.0, "logits/rejected": -49103861.333333336, "logps/chosen": -370.2366536458333, "logps/rejected": -486.220703125, "loss": 0.027, "rewards/chosen": 8.454511006673178, "rewards/margins": 18.8785883585612, "rewards/rejected": -10.424077351888021, "step": 2405 }, { "epoch": 0.6020267734267484, "grad_norm": 0.8046875, "kl": 1.1547292470932007, "learning_rate": 5e-06, "logits/chosen": -26156336.0, "logits/rejected": -36712634.666666664, "logps/chosen": -338.4447021484375, "logps/rejected": -650.2509765625, "loss": 0.0021, "rewards/chosen": 8.808040618896484, "rewards/margins": 24.82712173461914, "rewards/rejected": -16.019081115722656, "step": 2406 }, { "epoch": 0.6022769923683223, "grad_norm": 2.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59162297.6, "logits/rejected": -62267382.85714286, "logps/chosen": -361.38203125, "logps/rejected": -621.1392299107143, "loss": 0.0142, "rewards/chosen": 7.983201599121093, "rewards/margins": 23.500381905691963, "rewards/rejected": -15.51718030657087, "step": 2407 }, { "epoch": 0.6025272113098962, "grad_norm": 10.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12246744.0, "logits/rejected": -9252053.714285715, "logps/chosen": -232.0011962890625, "logps/rejected": -590.6630859375, "loss": 0.068, "rewards/chosen": 4.520191955566406, "rewards/margins": 17.950900704520087, "rewards/rejected": -13.430708748953682, "step": 2408 }, { "epoch": 0.60277743025147, "grad_norm": 4.625, "kl": 0.40271252393722534, "learning_rate": 5e-06, "logits/chosen": -20809976.615384616, "logits/rejected": -32241288.727272727, "logps/chosen": -344.3565204326923, "logps/rejected": -693.5885120738636, "loss": 0.037, "rewards/chosen": 7.826763446514423, "rewards/margins": 20.470651053048513, "rewards/rejected": -12.643887606534092, "step": 2409 }, { "epoch": 0.603027649193044, "grad_norm": 7.09375, "kl": 0.712755560874939, "learning_rate": 5e-06, "logits/chosen": -88567308.8, "logits/rejected": -46087204.571428575, "logps/chosen": -496.2703125, "logps/rejected": -594.9705636160714, "loss": 0.0096, "rewards/chosen": 10.420065307617188, "rewards/margins": 24.948672267368863, "rewards/rejected": -14.528606959751674, "step": 2410 }, { "epoch": 0.6032778681346178, "grad_norm": 8.25, "kl": 3.6644248962402344, "learning_rate": 5e-06, "logits/chosen": -36913136.0, "logits/rejected": -29025656.0, "logps/chosen": -340.74871826171875, "logps/rejected": -600.736572265625, "loss": 0.036, "rewards/chosen": 6.268530527750651, "rewards/margins": 19.442816416422527, "rewards/rejected": -13.174285888671875, "step": 2411 }, { "epoch": 0.6035280870761917, "grad_norm": 5.53125, "kl": 3.2745590209960938, "learning_rate": 5e-06, "logits/chosen": -105621208.61538461, "logits/rejected": -33022385.454545453, "logps/chosen": -373.1397235576923, "logps/rejected": -554.189453125, "loss": 0.0375, "rewards/chosen": 8.13821293757512, "rewards/margins": 20.124294147624838, "rewards/rejected": -11.986081210049717, "step": 2412 }, { "epoch": 0.6037783060177655, "grad_norm": 4.59375, "kl": 4.226690292358398, "learning_rate": 5e-06, "logits/chosen": -7347224.533333333, "logits/rejected": -42911473.777777776, "logps/chosen": -443.68919270833334, "logps/rejected": -458.14702690972223, "loss": 0.0674, "rewards/chosen": 8.987929280598959, "rewards/margins": 17.01618347167969, "rewards/rejected": -8.028254191080729, "step": 2413 }, { "epoch": 0.6040285249593395, "grad_norm": 16.875, "kl": 3.228240489959717, "learning_rate": 5e-06, "logits/chosen": -48205045.333333336, "logits/rejected": -29698229.333333332, "logps/chosen": -508.65087890625, "logps/rejected": -514.1309407552084, "loss": 0.0352, "rewards/chosen": 9.594586690266928, "rewards/margins": 20.49249521891276, "rewards/rejected": -10.897908528645834, "step": 2414 }, { "epoch": 0.6042787439009133, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50839985.23076923, "logits/rejected": -70307921.45454545, "logps/chosen": -398.29041466346155, "logps/rejected": -813.1967329545455, "loss": 0.1023, "rewards/chosen": 9.176375169020433, "rewards/margins": 24.501223690859923, "rewards/rejected": -15.324848521839488, "step": 2415 }, { "epoch": 0.6045289628424871, "grad_norm": 6.25, "kl": 8.880748748779297, "learning_rate": 5e-06, "logits/chosen": -36583276.307692304, "logits/rejected": -41477521.45454545, "logps/chosen": -364.60238882211536, "logps/rejected": -479.5604137073864, "loss": 0.0264, "rewards/chosen": 8.5194091796875, "rewards/margins": 21.022756403142758, "rewards/rejected": -12.503347223455256, "step": 2416 }, { "epoch": 0.6047791817840611, "grad_norm": 6.9375, "kl": 28.100996017456055, "learning_rate": 5e-06, "logits/chosen": -57067653.81818182, "logits/rejected": -57354525.538461536, "logps/chosen": -464.3948863636364, "logps/rejected": -540.2642352764423, "loss": 0.0195, "rewards/chosen": 10.887361699884588, "rewards/margins": 20.227166716035427, "rewards/rejected": -9.33980501615084, "step": 2417 }, { "epoch": 0.6050294007256349, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31781384.727272727, "logits/rejected": -42848029.538461536, "logps/chosen": -273.0755504261364, "logps/rejected": -551.7503756009615, "loss": 0.031, "rewards/chosen": 7.55059120871804, "rewards/margins": 18.972071827708426, "rewards/rejected": -11.421480618990385, "step": 2418 }, { "epoch": 0.6052796196672088, "grad_norm": 2.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46424944.0, "logits/rejected": -40721178.666666664, "logps/chosen": -462.5451253255208, "logps/rejected": -552.1821695963541, "loss": 0.0095, "rewards/chosen": 8.919522603352865, "rewards/margins": 19.730603535970054, "rewards/rejected": -10.811080932617188, "step": 2419 }, { "epoch": 0.6055298386087827, "grad_norm": 10.0, "kl": 2.8398702144622803, "learning_rate": 5e-06, "logits/chosen": -40781049.6, "logits/rejected": -36981837.71428572, "logps/chosen": -434.67626953125, "logps/rejected": -445.69095284598217, "loss": 0.0863, "rewards/chosen": 10.223684692382813, "rewards/margins": 20.374068777901787, "rewards/rejected": -10.150384085518974, "step": 2420 }, { "epoch": 0.6057800575503566, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32788406.85714286, "logits/rejected": 15457953.6, "logps/chosen": -369.71212332589283, "logps/rejected": -601.98037109375, "loss": 0.0277, "rewards/chosen": 8.378007071358818, "rewards/margins": 21.968011038643972, "rewards/rejected": -13.590003967285156, "step": 2421 }, { "epoch": 0.6060302764919304, "grad_norm": 3.4375, "kl": 7.394895076751709, "learning_rate": 5e-06, "logits/chosen": -46620725.333333336, "logits/rejected": -59502272.0, "logps/chosen": -375.7605794270833, "logps/rejected": -538.0983479817709, "loss": 0.0076, "rewards/chosen": 8.641480763753256, "rewards/margins": 21.585071563720703, "rewards/rejected": -12.943590799967447, "step": 2422 }, { "epoch": 0.6062804954335043, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37054887.11111111, "logits/rejected": -32116309.333333332, "logps/chosen": -278.7237141927083, "logps/rejected": -511.403515625, "loss": 0.06, "rewards/chosen": 5.3554361131456165, "rewards/margins": 19.576768917507597, "rewards/rejected": -14.22133280436198, "step": 2423 }, { "epoch": 0.6065307143750782, "grad_norm": 4.5, "kl": 11.455562591552734, "learning_rate": 5e-06, "logits/chosen": -85890417.77777778, "logits/rejected": -44022749.86666667, "logps/chosen": -613.96826171875, "logps/rejected": -574.2421875, "loss": 0.0041, "rewards/chosen": 13.403506808810764, "rewards/margins": 27.422169325086806, "rewards/rejected": -14.018662516276041, "step": 2424 }, { "epoch": 0.6067809333166521, "grad_norm": 3.734375, "kl": 12.522576332092285, "learning_rate": 5e-06, "logits/chosen": -53866658.13333333, "logits/rejected": -62558535.11111111, "logps/chosen": -455.5142578125, "logps/rejected": -703.6638454861111, "loss": 0.048, "rewards/chosen": 7.8383433024088545, "rewards/margins": 23.481965806749134, "rewards/rejected": -15.643622504340279, "step": 2425 }, { "epoch": 0.6070311522582259, "grad_norm": 12.875, "kl": 21.92177963256836, "learning_rate": 5e-06, "logits/chosen": -44981472.0, "logits/rejected": -51809229.71428572, "logps/chosen": -527.33193359375, "logps/rejected": -510.62025669642856, "loss": 0.1277, "rewards/chosen": 11.091026306152344, "rewards/margins": 25.567866734095983, "rewards/rejected": -14.476840427943639, "step": 2426 }, { "epoch": 0.6072813711997999, "grad_norm": 4.9375, "kl": 10.216106414794922, "learning_rate": 5e-06, "logits/chosen": -41974728.53333333, "logits/rejected": -46294620.44444445, "logps/chosen": -483.81959635416666, "logps/rejected": -805.7077907986111, "loss": 0.0035, "rewards/chosen": 10.639893595377604, "rewards/margins": 24.071036614312064, "rewards/rejected": -13.431143018934462, "step": 2427 }, { "epoch": 0.6075315901413737, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43851381.333333336, "logits/rejected": -33717002.666666664, "logps/chosen": -310.73622639973956, "logps/rejected": -472.8647054036458, "loss": 0.0197, "rewards/chosen": 6.667568842569987, "rewards/margins": 18.671311060587566, "rewards/rejected": -12.003742218017578, "step": 2428 }, { "epoch": 0.6077818090829475, "grad_norm": 15.3125, "kl": 4.264209747314453, "learning_rate": 5e-06, "logits/chosen": -50845204.0, "logits/rejected": -42699856.0, "logps/chosen": -406.4794616699219, "logps/rejected": -742.0474853515625, "loss": 0.0252, "rewards/chosen": 8.561019897460938, "rewards/margins": 26.82114028930664, "rewards/rejected": -18.260120391845703, "step": 2429 }, { "epoch": 0.6080320280245215, "grad_norm": 7.34375, "kl": 5.061368465423584, "learning_rate": 5e-06, "logits/chosen": -60793048.615384616, "logits/rejected": -46542839.27272727, "logps/chosen": -390.46957632211536, "logps/rejected": -542.5413707386364, "loss": 0.0246, "rewards/chosen": 9.265636737530048, "rewards/margins": 22.809200580303486, "rewards/rejected": -13.543563842773438, "step": 2430 }, { "epoch": 0.6082822469660953, "grad_norm": 1.5546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41714481.23076923, "logits/rejected": -58365870.54545455, "logps/chosen": -272.9983097956731, "logps/rejected": -764.9367009943181, "loss": 0.0219, "rewards/chosen": 7.229895958533654, "rewards/margins": 21.655509361853966, "rewards/rejected": -14.425613403320312, "step": 2431 }, { "epoch": 0.6085324659076692, "grad_norm": 10.75, "kl": 13.515531539916992, "learning_rate": 5e-06, "logits/chosen": -18470566.0, "logits/rejected": -60277912.0, "logps/chosen": -303.9547119140625, "logps/rejected": -810.8048095703125, "loss": 0.0936, "rewards/chosen": 7.259609222412109, "rewards/margins": 21.473526000976562, "rewards/rejected": -14.213916778564453, "step": 2432 }, { "epoch": 0.6087826848492431, "grad_norm": 8.1875, "kl": 4.744281768798828, "learning_rate": 5e-06, "logits/chosen": -45561118.11764706, "logits/rejected": -58099625.14285714, "logps/chosen": -348.9482421875, "logps/rejected": -702.8111746651786, "loss": 0.0401, "rewards/chosen": 8.127595789292279, "rewards/margins": 25.08680250664719, "rewards/rejected": -16.95920671735491, "step": 2433 }, { "epoch": 0.609032903790817, "grad_norm": 3.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32406775.272727273, "logits/rejected": -20146226.46153846, "logps/chosen": -304.24538352272725, "logps/rejected": -521.6920072115385, "loss": 0.0179, "rewards/chosen": 5.514634219082919, "rewards/margins": 15.77713468858412, "rewards/rejected": -10.262500469501202, "step": 2434 }, { "epoch": 0.6092831227323908, "grad_norm": 10.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52801644.8, "logits/rejected": -38254160.0, "logps/chosen": -314.5013671875, "logps/rejected": -645.61669921875, "loss": 0.0351, "rewards/chosen": 6.669560241699219, "rewards/margins": 21.186737278529577, "rewards/rejected": -14.517177036830358, "step": 2435 }, { "epoch": 0.6095333416739647, "grad_norm": 5.0625, "kl": 0.36060842871665955, "learning_rate": 5e-06, "logits/chosen": -54979804.0, "logits/rejected": -60203936.0, "logps/chosen": -372.57769775390625, "logps/rejected": -679.0423583984375, "loss": 0.0322, "rewards/chosen": 7.684247970581055, "rewards/margins": 26.054086685180664, "rewards/rejected": -18.36983871459961, "step": 2436 }, { "epoch": 0.6097835606155386, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -99905104.0, "logits/rejected": -48196732.0, "logps/chosen": -586.1228637695312, "logps/rejected": -653.0353393554688, "loss": 0.0099, "rewards/chosen": 11.60792350769043, "rewards/margins": 28.319639205932617, "rewards/rejected": -16.711715698242188, "step": 2437 }, { "epoch": 0.6100337795571125, "grad_norm": 3.859375, "kl": 2.946535110473633, "learning_rate": 5e-06, "logits/chosen": -51221321.14285714, "logits/rejected": -54871884.8, "logps/chosen": -268.7633579799107, "logps/rejected": -512.336376953125, "loss": 0.0084, "rewards/chosen": 6.748202732631138, "rewards/margins": 20.13454153878348, "rewards/rejected": -13.386338806152343, "step": 2438 }, { "epoch": 0.6102839984986863, "grad_norm": 14.5, "kl": 28.076358795166016, "learning_rate": 5e-06, "logits/chosen": -72316978.28571428, "logits/rejected": -35913097.6, "logps/chosen": -413.25620814732144, "logps/rejected": -340.6446533203125, "loss": 0.0684, "rewards/chosen": 9.59689440046038, "rewards/margins": 19.18056139264788, "rewards/rejected": -9.5836669921875, "step": 2439 }, { "epoch": 0.6105342174402603, "grad_norm": 8.75, "kl": 1.1614367961883545, "learning_rate": 5e-06, "logits/chosen": -65940112.0, "logits/rejected": -46362378.666666664, "logps/chosen": -416.9586588541667, "logps/rejected": -478.8248697916667, "loss": 0.0596, "rewards/chosen": 7.538974761962891, "rewards/margins": 21.200618743896484, "rewards/rejected": -13.661643981933594, "step": 2440 }, { "epoch": 0.6107844363818341, "grad_norm": 4.46875, "kl": 4.290702819824219, "learning_rate": 5e-06, "logits/chosen": -73597882.66666667, "logits/rejected": -52487381.333333336, "logps/chosen": -409.5906168619792, "logps/rejected": -652.3874104817709, "loss": 0.0439, "rewards/chosen": 8.501913706461588, "rewards/margins": 22.486363728841145, "rewards/rejected": -13.984450022379557, "step": 2441 }, { "epoch": 0.611034655323408, "grad_norm": 16.25, "kl": 1.9676513671875, "learning_rate": 5e-06, "logits/chosen": -52199968.0, "logits/rejected": -76611436.8, "logps/chosen": -332.74979073660717, "logps/rejected": -559.1935546875, "loss": 0.0425, "rewards/chosen": 7.0919189453125, "rewards/margins": 20.077818298339842, "rewards/rejected": -12.985899353027344, "step": 2442 }, { "epoch": 0.6112848742649819, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47147748.571428575, "logits/rejected": -62141782.5882353, "logps/chosen": -414.872802734375, "logps/rejected": -639.9566865808823, "loss": 0.0522, "rewards/chosen": 8.221305302211217, "rewards/margins": 23.338525114940996, "rewards/rejected": -15.117219812729779, "step": 2443 }, { "epoch": 0.6115350932065557, "grad_norm": 6.0, "kl": 0.7596168518066406, "learning_rate": 5e-06, "logits/chosen": -51223074.90909091, "logits/rejected": -49544546.461538464, "logps/chosen": -384.83957741477275, "logps/rejected": -490.2123272235577, "loss": 0.0166, "rewards/chosen": 7.436017816716975, "rewards/margins": 18.96030863515147, "rewards/rejected": -11.524290818434496, "step": 2444 }, { "epoch": 0.6117853121481296, "grad_norm": 12.3125, "kl": 2.671675682067871, "learning_rate": 5e-06, "logits/chosen": -45810327.27272727, "logits/rejected": -53568546.461538464, "logps/chosen": -436.48428622159093, "logps/rejected": -408.6998948317308, "loss": 0.0492, "rewards/chosen": 9.302181590687145, "rewards/margins": 19.53797331056395, "rewards/rejected": -10.235791719876802, "step": 2445 }, { "epoch": 0.6120355310897035, "grad_norm": 4.34375, "kl": 2.4231221675872803, "learning_rate": 5e-06, "logits/chosen": -38487882.666666664, "logits/rejected": -35711266.666666664, "logps/chosen": -376.9884440104167, "logps/rejected": -500.9215901692708, "loss": 0.03, "rewards/chosen": 8.16119130452474, "rewards/margins": 21.751017252604168, "rewards/rejected": -13.589825948079428, "step": 2446 }, { "epoch": 0.6122857500312774, "grad_norm": 9.1875, "kl": 17.13314437866211, "learning_rate": 5e-06, "logits/chosen": -63262121.14285714, "logits/rejected": -70119187.2, "logps/chosen": -448.94482421875, "logps/rejected": -606.637109375, "loss": 0.0927, "rewards/chosen": 9.646490914481026, "rewards/margins": 24.02421003069196, "rewards/rejected": -14.377719116210937, "step": 2447 }, { "epoch": 0.6125359689728512, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65378331.428571425, "logits/rejected": -49179664.0, "logps/chosen": -348.97984095982144, "logps/rejected": -680.881982421875, "loss": 0.059, "rewards/chosen": 6.142981392996652, "rewards/margins": 27.057210213797433, "rewards/rejected": -20.91422882080078, "step": 2448 }, { "epoch": 0.6127861879144251, "grad_norm": 6.25, "kl": 10.972585678100586, "learning_rate": 5e-06, "logits/chosen": -51959687.11111111, "logits/rejected": -24469619.2, "logps/chosen": -536.43212890625, "logps/rejected": -582.3038411458333, "loss": 0.0621, "rewards/chosen": 8.81045193142361, "rewards/margins": 25.85059577094184, "rewards/rejected": -17.04014383951823, "step": 2449 }, { "epoch": 0.613036406855999, "grad_norm": 4.3125, "kl": 7.541066646575928, "learning_rate": 5e-06, "logits/chosen": -39188133.64705882, "logits/rejected": -24990640.0, "logps/chosen": -368.9016544117647, "logps/rejected": -484.90304129464283, "loss": 0.052, "rewards/chosen": 7.752206241383272, "rewards/margins": 19.715713757426798, "rewards/rejected": -11.963507516043526, "step": 2450 }, { "epoch": 0.6132866257975729, "grad_norm": 1.671875, "kl": 4.85919713973999, "learning_rate": 5e-06, "logits/chosen": -63278698.666666664, "logits/rejected": -44978922.666666664, "logps/chosen": -490.8492431640625, "logps/rejected": -544.3570963541666, "loss": 0.0045, "rewards/chosen": 9.208826065063477, "rewards/margins": 21.66894849141439, "rewards/rejected": -12.460122426350912, "step": 2451 }, { "epoch": 0.6135368447391467, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34878801.06666667, "logits/rejected": -37617475.55555555, "logps/chosen": -344.06484375, "logps/rejected": -510.19330512152777, "loss": 0.0574, "rewards/chosen": 8.14933573404948, "rewards/margins": 21.36514163547092, "rewards/rejected": -13.21580590142144, "step": 2452 }, { "epoch": 0.6137870636807207, "grad_norm": 1.71875, "kl": 9.40426254272461, "learning_rate": 5e-06, "logits/chosen": -60463862.85714286, "logits/rejected": -72060851.2, "logps/chosen": -464.28348214285717, "logps/rejected": -580.93271484375, "loss": 0.0163, "rewards/chosen": 9.715233939034599, "rewards/margins": 21.630648367745536, "rewards/rejected": -11.915414428710937, "step": 2453 }, { "epoch": 0.6140372826222945, "grad_norm": 1.5390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31084241.454545453, "logits/rejected": -57091475.692307696, "logps/chosen": -422.03324751420456, "logps/rejected": -523.0791766826923, "loss": 0.0186, "rewards/chosen": 8.77844931862571, "rewards/margins": 22.452993432958642, "rewards/rejected": -13.674544114332933, "step": 2454 }, { "epoch": 0.6142875015638684, "grad_norm": 20.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79238136.0, "logits/rejected": -32143772.0, "logps/chosen": -448.37542724609375, "logps/rejected": -695.6185302734375, "loss": 0.0477, "rewards/chosen": 8.31430721282959, "rewards/margins": 21.36212730407715, "rewards/rejected": -13.047820091247559, "step": 2455 }, { "epoch": 0.6145377205054423, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52256960.0, "logits/rejected": -42362570.666666664, "logps/chosen": -231.5125935872396, "logps/rejected": -511.3548177083333, "loss": 0.0773, "rewards/chosen": 5.250890413920085, "rewards/margins": 18.313175519307453, "rewards/rejected": -13.06228510538737, "step": 2456 }, { "epoch": 0.6147879394470162, "grad_norm": 10.8125, "kl": 9.245050430297852, "learning_rate": 5e-06, "logits/chosen": -62399394.461538464, "logits/rejected": -65150312.72727273, "logps/chosen": -451.30814302884613, "logps/rejected": -457.6490589488636, "loss": 0.0846, "rewards/chosen": 10.647026648888222, "rewards/margins": 22.951458910962085, "rewards/rejected": -12.304432262073863, "step": 2457 }, { "epoch": 0.61503815838859, "grad_norm": 3.84375, "kl": 3.278815984725952, "learning_rate": 5e-06, "logits/chosen": -33754066.28571428, "logits/rejected": -45747792.0, "logps/chosen": -333.0765904017857, "logps/rejected": -418.51083984375, "loss": 0.0372, "rewards/chosen": 7.9637603759765625, "rewards/margins": 20.561287689208985, "rewards/rejected": -12.597527313232423, "step": 2458 }, { "epoch": 0.6152883773301638, "grad_norm": 9.9375, "kl": 6.308222770690918, "learning_rate": 5e-06, "logits/chosen": -33344029.09090909, "logits/rejected": -83905063.38461539, "logps/chosen": -333.86496803977275, "logps/rejected": -616.4159780649038, "loss": 0.0781, "rewards/chosen": 6.912535233931108, "rewards/margins": 23.219422347062117, "rewards/rejected": -16.30688711313101, "step": 2459 }, { "epoch": 0.6155385962717378, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45487392.0, "logits/rejected": -53263286.85714286, "logps/chosen": -296.945703125, "logps/rejected": -664.4881417410714, "loss": 0.0391, "rewards/chosen": 6.355614471435547, "rewards/margins": 22.244737570626395, "rewards/rejected": -15.889123099190849, "step": 2460 }, { "epoch": 0.6157888152133116, "grad_norm": 5.8125, "kl": 1.4031486511230469, "learning_rate": 5e-06, "logits/chosen": -41949592.0, "logits/rejected": -43703981.333333336, "logps/chosen": -360.7398274739583, "logps/rejected": -548.024169921875, "loss": 0.0111, "rewards/chosen": 8.542339324951172, "rewards/margins": 19.77811050415039, "rewards/rejected": -11.235771179199219, "step": 2461 }, { "epoch": 0.6160390341548855, "grad_norm": 7.28125, "kl": 18.769794464111328, "learning_rate": 5e-06, "logits/chosen": -53391062.5882353, "logits/rejected": -92306688.0, "logps/chosen": -495.6667049632353, "logps/rejected": -788.7859933035714, "loss": 0.019, "rewards/chosen": 10.059327069450827, "rewards/margins": 29.820295958959754, "rewards/rejected": -19.760968889508927, "step": 2462 }, { "epoch": 0.6162892530964594, "grad_norm": 18.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59000476.0, "logits/rejected": -100337216.0, "logps/chosen": -388.73785400390625, "logps/rejected": -687.3478393554688, "loss": 0.0868, "rewards/chosen": 6.633993625640869, "rewards/margins": 22.363086223602295, "rewards/rejected": -15.729092597961426, "step": 2463 }, { "epoch": 0.6165394720380333, "grad_norm": 5.46875, "kl": 8.836140632629395, "learning_rate": 5e-06, "logits/chosen": -70604873.84615384, "logits/rejected": -50963042.90909091, "logps/chosen": -364.3679762620192, "logps/rejected": -679.8227982954545, "loss": 0.0646, "rewards/chosen": 8.853098355806791, "rewards/margins": 26.630627985600825, "rewards/rejected": -17.777529629794035, "step": 2464 }, { "epoch": 0.6167896909796071, "grad_norm": 15.375, "kl": 11.596526145935059, "learning_rate": 5e-06, "logits/chosen": -23251889.454545453, "logits/rejected": -40303864.615384616, "logps/chosen": -306.76717862215907, "logps/rejected": -687.3375901442307, "loss": 0.0789, "rewards/chosen": 6.736112421209162, "rewards/margins": 21.345324269541493, "rewards/rejected": -14.609211848332333, "step": 2465 }, { "epoch": 0.6170399099211811, "grad_norm": 9.375, "kl": 0.3309618830680847, "learning_rate": 5e-06, "logits/chosen": -12473240.0, "logits/rejected": -35457562.666666664, "logps/chosen": -304.7659098307292, "logps/rejected": -481.67529296875, "loss": 0.019, "rewards/chosen": 8.140413920084635, "rewards/margins": 19.698087056477863, "rewards/rejected": -11.557673136393229, "step": 2466 }, { "epoch": 0.6172901288627549, "grad_norm": 1.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -112141448.0, "logits/rejected": -61328432.0, "logps/chosen": -484.5457763671875, "logps/rejected": -661.2355346679688, "loss": 0.0017, "rewards/chosen": 8.396095275878906, "rewards/margins": 24.701457977294922, "rewards/rejected": -16.305362701416016, "step": 2467 }, { "epoch": 0.6175403478043288, "grad_norm": 15.0, "kl": 8.004132270812988, "learning_rate": 5e-06, "logits/chosen": -21644186.666666668, "logits/rejected": -56096176.0, "logps/chosen": -439.7254231770833, "logps/rejected": -733.9973958333334, "loss": 0.0253, "rewards/chosen": 10.076651255289713, "rewards/margins": 27.724920908610024, "rewards/rejected": -17.648269653320312, "step": 2468 }, { "epoch": 0.6177905667459027, "grad_norm": 7.71875, "kl": 17.530385971069336, "learning_rate": 5e-06, "logits/chosen": -29314292.70588235, "logits/rejected": -55182496.0, "logps/chosen": -364.0469324448529, "logps/rejected": -604.5481305803571, "loss": 0.1596, "rewards/chosen": 8.174032772288603, "rewards/margins": 23.394158243131237, "rewards/rejected": -15.220125470842634, "step": 2469 }, { "epoch": 0.6180407856874766, "grad_norm": 12.75, "kl": 4.381776332855225, "learning_rate": 5e-06, "logits/chosen": -64036770.461538464, "logits/rejected": -37433041.45454545, "logps/chosen": -462.76844200721155, "logps/rejected": -608.7482688210227, "loss": 0.0526, "rewards/chosen": 9.074207012469952, "rewards/margins": 21.898144168453616, "rewards/rejected": -12.823937155983664, "step": 2470 }, { "epoch": 0.6182910046290504, "grad_norm": 10.75, "kl": 14.730803489685059, "learning_rate": 5e-06, "logits/chosen": -25790142.0, "logits/rejected": -43818560.0, "logps/chosen": -324.1304931640625, "logps/rejected": -773.86669921875, "loss": 0.1078, "rewards/chosen": 6.997044563293457, "rewards/margins": 22.20711040496826, "rewards/rejected": -15.210065841674805, "step": 2471 }, { "epoch": 0.6185412235706242, "grad_norm": 2.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46968938.666666664, "logits/rejected": -17848408.0, "logps/chosen": -368.6352132161458, "logps/rejected": -356.0834147135417, "loss": 0.0119, "rewards/chosen": 8.279898325602213, "rewards/margins": 17.227439244588215, "rewards/rejected": -8.947540918986002, "step": 2472 }, { "epoch": 0.6187914425121982, "grad_norm": 5.03125, "kl": 6.900914669036865, "learning_rate": 5e-06, "logits/chosen": -38755913.14285714, "logits/rejected": -58604390.4, "logps/chosen": -386.2473842075893, "logps/rejected": -770.933447265625, "loss": 0.0531, "rewards/chosen": 9.07250486101423, "rewards/margins": 23.436220986502512, "rewards/rejected": -14.363716125488281, "step": 2473 }, { "epoch": 0.619041661453772, "grad_norm": 1.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30218818.90909091, "logits/rejected": -48422843.07692308, "logps/chosen": -411.69770951704544, "logps/rejected": -610.9829852764423, "loss": 0.0277, "rewards/chosen": 7.69169131192294, "rewards/margins": 20.412414390724024, "rewards/rejected": -12.720723078801083, "step": 2474 }, { "epoch": 0.6192918803953459, "grad_norm": 23.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21350089.846153848, "logits/rejected": -36298280.72727273, "logps/chosen": -344.0696364182692, "logps/rejected": -569.771484375, "loss": 0.0838, "rewards/chosen": 7.918998718261719, "rewards/margins": 18.992122303355824, "rewards/rejected": -11.073123585094105, "step": 2475 }, { "epoch": 0.6195420993369198, "grad_norm": 2.5, "kl": 16.14555549621582, "learning_rate": 5e-06, "logits/chosen": -62235485.09090909, "logits/rejected": -44912443.07692308, "logps/chosen": -502.25874467329544, "logps/rejected": -682.666015625, "loss": 0.0401, "rewards/chosen": 10.093490600585938, "rewards/margins": 24.65542485163762, "rewards/rejected": -14.561934251051683, "step": 2476 }, { "epoch": 0.6197923182784937, "grad_norm": 1.875, "kl": 5.006343364715576, "learning_rate": 5e-06, "logits/chosen": -18527432.888888888, "logits/rejected": -2498426.933333333, "logps/chosen": -368.03358289930554, "logps/rejected": -451.92535807291665, "loss": 0.0354, "rewards/chosen": 9.618890550401476, "rewards/margins": 20.628965420193143, "rewards/rejected": -11.010074869791667, "step": 2477 }, { "epoch": 0.6200425372200675, "grad_norm": 2.78125, "kl": 1.7069120407104492, "learning_rate": 5e-06, "logits/chosen": -21469846.4, "logits/rejected": -60583264.0, "logps/chosen": -365.257470703125, "logps/rejected": -533.4583565848214, "loss": 0.0225, "rewards/chosen": 8.044914245605469, "rewards/margins": 20.574822562081472, "rewards/rejected": -12.529908316476005, "step": 2478 }, { "epoch": 0.6202927561616415, "grad_norm": 14.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57225426.28571428, "logits/rejected": -51871066.35294118, "logps/chosen": -339.60365513392856, "logps/rejected": -571.7243795955883, "loss": 0.052, "rewards/chosen": 8.281544276646205, "rewards/margins": 21.38254309902672, "rewards/rejected": -13.100998822380514, "step": 2479 }, { "epoch": 0.6205429751032153, "grad_norm": 4.875, "kl": 17.004467010498047, "learning_rate": 5e-06, "logits/chosen": -59037905.777777776, "logits/rejected": -61594197.333333336, "logps/chosen": -465.43787977430554, "logps/rejected": -530.793701171875, "loss": 0.0462, "rewards/chosen": 8.951393975151909, "rewards/margins": 23.374537997775604, "rewards/rejected": -14.423144022623697, "step": 2480 }, { "epoch": 0.6207931940447892, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26451060.363636363, "logits/rejected": -39794791.384615384, "logps/chosen": -311.2567027698864, "logps/rejected": -461.65024038461536, "loss": 0.0245, "rewards/chosen": 6.418903004039418, "rewards/margins": 20.62016525802079, "rewards/rejected": -14.20126225398137, "step": 2481 }, { "epoch": 0.6210434129863631, "grad_norm": 6.34375, "kl": 3.008413314819336, "learning_rate": 5e-06, "logits/chosen": -84831718.4, "logits/rejected": -45997353.14285714, "logps/chosen": -361.48603515625, "logps/rejected": -619.49755859375, "loss": 0.0516, "rewards/chosen": 7.368170166015625, "rewards/margins": 24.998214285714283, "rewards/rejected": -17.63004411969866, "step": 2482 }, { "epoch": 0.621293631927937, "grad_norm": 1.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58083524.266666666, "logits/rejected": -31263235.555555556, "logps/chosen": -343.8017578125, "logps/rejected": -748.0422634548611, "loss": 0.0118, "rewards/chosen": 8.820060221354167, "rewards/margins": 29.721329074435765, "rewards/rejected": -20.901268853081596, "step": 2483 }, { "epoch": 0.6215438508695108, "grad_norm": 3.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63418248.0, "logits/rejected": -53321612.0, "logps/chosen": -376.13330078125, "logps/rejected": -692.8074951171875, "loss": 0.0051, "rewards/chosen": 7.652372360229492, "rewards/margins": 25.042146682739258, "rewards/rejected": -17.389774322509766, "step": 2484 }, { "epoch": 0.6217940698110846, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38241448.0, "logits/rejected": -28476076.0, "logps/chosen": -260.7626953125, "logps/rejected": -537.1641845703125, "loss": 0.0444, "rewards/chosen": 6.6749114990234375, "rewards/margins": 19.1830472946167, "rewards/rejected": -12.508135795593262, "step": 2485 }, { "epoch": 0.6220442887526586, "grad_norm": 1.328125, "kl": 2.4345602989196777, "learning_rate": 5e-06, "logits/chosen": -60949216.0, "logits/rejected": -59230170.666666664, "logps/chosen": -423.033935546875, "logps/rejected": -798.23193359375, "loss": 0.0181, "rewards/chosen": 9.869501113891602, "rewards/margins": 32.317672093709305, "rewards/rejected": -22.448170979817707, "step": 2486 }, { "epoch": 0.6222945076942324, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46324434.28571428, "logits/rejected": -28065388.8, "logps/chosen": -367.89571707589283, "logps/rejected": -351.865234375, "loss": 0.0054, "rewards/chosen": 8.259424482073102, "rewards/margins": 18.873622022356308, "rewards/rejected": -10.614197540283204, "step": 2487 }, { "epoch": 0.6225447266358063, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34725280.0, "logits/rejected": -40811972.571428575, "logps/chosen": -453.8267578125, "logps/rejected": -401.27357700892856, "loss": 0.0188, "rewards/chosen": 6.439046478271484, "rewards/margins": 17.217401668003625, "rewards/rejected": -10.778355189732142, "step": 2488 }, { "epoch": 0.6227949455773802, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66070272.0, "logits/rejected": -42478464.0, "logps/chosen": -421.36971768465907, "logps/rejected": -511.40707632211536, "loss": 0.0148, "rewards/chosen": 8.068936434659092, "rewards/margins": 23.190863549292505, "rewards/rejected": -15.121927114633413, "step": 2489 }, { "epoch": 0.6230451645189541, "grad_norm": 0.2216796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40115976.72727273, "logits/rejected": -49449723.07692308, "logps/chosen": -345.27397017045456, "logps/rejected": -666.0147235576923, "loss": 0.0005, "rewards/chosen": 9.405317826704545, "rewards/margins": 27.67402510209517, "rewards/rejected": -18.268707275390625, "step": 2490 }, { "epoch": 0.6232953834605279, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39178251.63636363, "logits/rejected": -45704359.384615384, "logps/chosen": -385.79048295454544, "logps/rejected": -517.8949819711538, "loss": 0.0346, "rewards/chosen": 7.300200028852983, "rewards/margins": 24.56292585893111, "rewards/rejected": -17.262725830078125, "step": 2491 }, { "epoch": 0.6235456024021019, "grad_norm": 2.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79184029.53846154, "logits/rejected": -79509975.27272727, "logps/chosen": -393.5075871394231, "logps/rejected": -714.5269442471591, "loss": 0.0437, "rewards/chosen": 8.704150860126202, "rewards/margins": 28.042769745513276, "rewards/rejected": -19.338618885387074, "step": 2492 }, { "epoch": 0.6237958213436757, "grad_norm": 18.125, "kl": 0.7218529582023621, "learning_rate": 5e-06, "logits/chosen": -50356185.6, "logits/rejected": -34987830.85714286, "logps/chosen": -432.276904296875, "logps/rejected": -427.4755859375, "loss": 0.0588, "rewards/chosen": 8.515191650390625, "rewards/margins": 20.437356567382814, "rewards/rejected": -11.922164916992188, "step": 2493 }, { "epoch": 0.6240460402852496, "grad_norm": 11.0625, "kl": 0.5556501150131226, "learning_rate": 5e-06, "logits/chosen": -30307352.615384616, "logits/rejected": -61299514.18181818, "logps/chosen": -480.69027944711536, "logps/rejected": -737.0223721590909, "loss": 0.0174, "rewards/chosen": 9.201239365797777, "rewards/margins": 29.963564305872353, "rewards/rejected": -20.762324940074574, "step": 2494 }, { "epoch": 0.6242962592268235, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60971520.0, "logits/rejected": -68579484.44444445, "logps/chosen": -315.2081705729167, "logps/rejected": -794.966796875, "loss": 0.0907, "rewards/chosen": 6.537539672851563, "rewards/margins": 27.292520819769965, "rewards/rejected": -20.754981146918404, "step": 2495 }, { "epoch": 0.6245464781683974, "grad_norm": 16.125, "kl": 13.783313751220703, "learning_rate": 5e-06, "logits/chosen": -32887274.666666668, "logits/rejected": -54998522.666666664, "logps/chosen": -318.7431640625, "logps/rejected": -424.4005940755208, "loss": 0.0441, "rewards/chosen": 6.299693425496419, "rewards/margins": 17.912112553914387, "rewards/rejected": -11.612419128417969, "step": 2496 }, { "epoch": 0.6247966971099712, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40515432.72727273, "logits/rejected": -48312324.92307692, "logps/chosen": -408.87650923295456, "logps/rejected": -655.9814453125, "loss": 0.033, "rewards/chosen": 6.664486971768466, "rewards/margins": 20.35750595839707, "rewards/rejected": -13.693018986628605, "step": 2497 }, { "epoch": 0.625046916051545, "grad_norm": 13.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34850458.666666664, "logits/rejected": -33612352.0, "logps/chosen": -375.4722086588542, "logps/rejected": -523.0933024088541, "loss": 0.0571, "rewards/chosen": 8.046129862467447, "rewards/margins": 19.1494140625, "rewards/rejected": -11.103284200032553, "step": 2498 }, { "epoch": 0.625297134993119, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28030903.111111112, "logits/rejected": -83402333.86666666, "logps/chosen": -300.59087456597223, "logps/rejected": -915.5887369791667, "loss": 0.0236, "rewards/chosen": 8.18606906467014, "rewards/margins": 31.845948621961806, "rewards/rejected": -23.659879557291667, "step": 2499 }, { "epoch": 0.6255473539346929, "grad_norm": 17.375, "kl": 8.784231185913086, "learning_rate": 5e-06, "logits/chosen": -43492371.2, "logits/rejected": -30408621.714285713, "logps/chosen": -364.315185546875, "logps/rejected": -654.0004185267857, "loss": 0.0395, "rewards/chosen": 7.9260498046875, "rewards/margins": 19.746459306989397, "rewards/rejected": -11.820409502301898, "step": 2500 }, { "epoch": 0.6257975728762667, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74444794.18181819, "logits/rejected": -42855985.23076923, "logps/chosen": -345.7245427911932, "logps/rejected": -527.3418344350962, "loss": 0.0611, "rewards/chosen": 6.343276283957741, "rewards/margins": 21.61644029950762, "rewards/rejected": -15.27316401554988, "step": 2501 }, { "epoch": 0.6260477918178406, "grad_norm": 15.5, "kl": 0.7056414484977722, "learning_rate": 5e-06, "logits/chosen": -35859671.27272727, "logits/rejected": -39802035.692307696, "logps/chosen": -433.1237127130682, "logps/rejected": -500.4910231370192, "loss": 0.0382, "rewards/chosen": 8.305159135298295, "rewards/margins": 21.35317384946596, "rewards/rejected": -13.048014714167667, "step": 2502 }, { "epoch": 0.6262980107594145, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42946251.63636363, "logits/rejected": -58232157.538461536, "logps/chosen": -397.76411576704544, "logps/rejected": -716.6336388221154, "loss": 0.0272, "rewards/chosen": 8.896286010742188, "rewards/margins": 25.1898686335637, "rewards/rejected": -16.293582622821514, "step": 2503 }, { "epoch": 0.6265482297009883, "grad_norm": 7.59375, "kl": 4.407509803771973, "learning_rate": 5e-06, "logits/chosen": -45189897.14285714, "logits/rejected": -52229900.8, "logps/chosen": -298.3546665736607, "logps/rejected": -561.384033203125, "loss": 0.0589, "rewards/chosen": 7.529937744140625, "rewards/margins": 23.666929626464842, "rewards/rejected": -16.136991882324217, "step": 2504 }, { "epoch": 0.6267984486425623, "grad_norm": 3.84375, "kl": 2.7851524353027344, "learning_rate": 5e-06, "logits/chosen": -62565482.666666664, "logits/rejected": -46043829.333333336, "logps/chosen": -386.6598307291667, "logps/rejected": -665.7399495442709, "loss": 0.0282, "rewards/chosen": 8.263753255208334, "rewards/margins": 24.091209411621094, "rewards/rejected": -15.82745615641276, "step": 2505 }, { "epoch": 0.6270486675841361, "grad_norm": 10.1875, "kl": 5.4167609214782715, "learning_rate": 5e-06, "logits/chosen": -63304192.0, "logits/rejected": -24828538.666666668, "logps/chosen": -532.7721354166666, "logps/rejected": -567.024658203125, "loss": 0.0221, "rewards/chosen": 11.037769317626953, "rewards/margins": 23.651137034098305, "rewards/rejected": -12.613367716471354, "step": 2506 }, { "epoch": 0.62729888652571, "grad_norm": 1.1953125, "kl": 4.852348327636719, "learning_rate": 5e-06, "logits/chosen": -55810683.428571425, "logits/rejected": -32385465.6, "logps/chosen": -412.30001395089283, "logps/rejected": -574.458154296875, "loss": 0.0033, "rewards/chosen": 8.256307329450335, "rewards/margins": 20.34444362095424, "rewards/rejected": -12.088136291503906, "step": 2507 }, { "epoch": 0.6275491054672838, "grad_norm": 2.203125, "kl": 3.1178557872772217, "learning_rate": 5e-06, "logits/chosen": -50093008.0, "logits/rejected": -43463205.333333336, "logps/chosen": -401.2214762369792, "logps/rejected": -603.3190511067709, "loss": 0.0232, "rewards/chosen": 7.737422943115234, "rewards/margins": 19.691775004069008, "rewards/rejected": -11.954352060953775, "step": 2508 }, { "epoch": 0.6277993244088578, "grad_norm": 1.7734375, "kl": 0.3774452209472656, "learning_rate": 5e-06, "logits/chosen": -26624032.0, "logits/rejected": -48069425.23076923, "logps/chosen": -339.2908824573864, "logps/rejected": -887.7693058894231, "loss": 0.0207, "rewards/chosen": 7.492101495916193, "rewards/margins": 27.953081251024365, "rewards/rejected": -20.460979755108173, "step": 2509 }, { "epoch": 0.6280495433504316, "grad_norm": 9.9375, "kl": 5.894001007080078, "learning_rate": 5e-06, "logits/chosen": -9593362.0, "logits/rejected": -71870272.0, "logps/chosen": -458.2365417480469, "logps/rejected": -589.8665161132812, "loss": 0.084, "rewards/chosen": 7.839481830596924, "rewards/margins": 20.48694658279419, "rewards/rejected": -12.647464752197266, "step": 2510 }, { "epoch": 0.6282997622920055, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62652595.2, "logits/rejected": -48598084.571428575, "logps/chosen": -431.997509765625, "logps/rejected": -589.3092912946429, "loss": 0.045, "rewards/chosen": 10.407089996337891, "rewards/margins": 24.767578887939454, "rewards/rejected": -14.360488891601562, "step": 2511 }, { "epoch": 0.6285499812335794, "grad_norm": 8.625, "kl": 6.148695945739746, "learning_rate": 5e-06, "logits/chosen": -43121116.8, "logits/rejected": -1173808.0, "logps/chosen": -362.309619140625, "logps/rejected": -693.05029296875, "loss": 0.0231, "rewards/chosen": 7.629971313476562, "rewards/margins": 17.48228803362165, "rewards/rejected": -9.852316720145089, "step": 2512 }, { "epoch": 0.6288002001751533, "grad_norm": 2.390625, "kl": 5.250231742858887, "learning_rate": 5e-06, "logits/chosen": -19799986.285714287, "logits/rejected": -29860492.8, "logps/chosen": -300.18265206473217, "logps/rejected": -535.036669921875, "loss": 0.0479, "rewards/chosen": 7.145606449672154, "rewards/margins": 21.082914188929966, "rewards/rejected": -13.937307739257813, "step": 2513 }, { "epoch": 0.6290504191167271, "grad_norm": 11.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46877904.0, "logits/rejected": -47118160.0, "logps/chosen": -327.307861328125, "logps/rejected": -722.0896606445312, "loss": 0.0315, "rewards/chosen": 7.658298015594482, "rewards/margins": 24.424087047576904, "rewards/rejected": -16.765789031982422, "step": 2514 }, { "epoch": 0.629300638058301, "grad_norm": 21.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43921142.15384615, "logits/rejected": -37598231.27272727, "logps/chosen": -418.68997896634613, "logps/rejected": -645.4795365767045, "loss": 0.0382, "rewards/chosen": 7.209188608022837, "rewards/margins": 22.47498561619045, "rewards/rejected": -15.265797008167613, "step": 2515 }, { "epoch": 0.6295508569998749, "grad_norm": 12.3125, "kl": 5.2679290771484375, "learning_rate": 5e-06, "logits/chosen": -41798291.2, "logits/rejected": -56350637.71428572, "logps/chosen": -331.806494140625, "logps/rejected": -551.3922642299107, "loss": 0.0502, "rewards/chosen": 7.527032470703125, "rewards/margins": 20.825257873535158, "rewards/rejected": -13.298225402832031, "step": 2516 }, { "epoch": 0.6298010759414487, "grad_norm": 2.6875, "kl": 0.6283696889877319, "learning_rate": 5e-06, "logits/chosen": 5085506.285714285, "logits/rejected": -74583667.2, "logps/chosen": -434.1474609375, "logps/rejected": -533.02958984375, "loss": 0.0474, "rewards/chosen": 7.849430629185268, "rewards/margins": 20.395107051304407, "rewards/rejected": -12.54567642211914, "step": 2517 }, { "epoch": 0.6300512948830227, "grad_norm": 1.609375, "kl": 0.6448568105697632, "learning_rate": 5e-06, "logits/chosen": -43250286.54545455, "logits/rejected": -49213267.692307696, "logps/chosen": -457.32563920454544, "logps/rejected": -600.2562725360577, "loss": 0.0016, "rewards/chosen": 10.613924893465908, "rewards/margins": 22.10376958246831, "rewards/rejected": -11.489844689002403, "step": 2518 }, { "epoch": 0.6303015138245965, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -92552778.66666667, "logits/rejected": -56945728.0, "logps/chosen": -439.2107747395833, "logps/rejected": -492.5219319661458, "loss": 0.0218, "rewards/chosen": 9.691757202148438, "rewards/margins": 23.174907684326172, "rewards/rejected": -13.483150482177734, "step": 2519 }, { "epoch": 0.6305517327661704, "grad_norm": 20.5, "kl": 12.489814758300781, "learning_rate": 5e-06, "logits/chosen": -37469034.666666664, "logits/rejected": -17186919.111111112, "logps/chosen": -427.62705078125, "logps/rejected": -379.3385959201389, "loss": 0.0545, "rewards/chosen": 7.884296671549479, "rewards/margins": 16.83777770996094, "rewards/rejected": -8.953481038411459, "step": 2520 }, { "epoch": 0.6308019517077442, "grad_norm": 7.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42540660.36363637, "logits/rejected": -26577824.0, "logps/chosen": -376.55446555397725, "logps/rejected": -493.6002854567308, "loss": 0.0155, "rewards/chosen": 7.891424005681818, "rewards/margins": 20.473610191078453, "rewards/rejected": -12.582186185396635, "step": 2521 }, { "epoch": 0.6310521706493182, "grad_norm": 12.6875, "kl": 0.2251453399658203, "learning_rate": 5e-06, "logits/chosen": -55376464.0, "logits/rejected": -47450712.0, "logps/chosen": -388.3652648925781, "logps/rejected": -641.0706176757812, "loss": 0.0316, "rewards/chosen": 9.333907127380371, "rewards/margins": 24.025349617004395, "rewards/rejected": -14.691442489624023, "step": 2522 }, { "epoch": 0.631302389590892, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35947622.4, "logits/rejected": -40286435.55555555, "logps/chosen": -455.69114583333334, "logps/rejected": -501.96088324652777, "loss": 0.0415, "rewards/chosen": 8.207101440429687, "rewards/margins": 23.05621575249566, "rewards/rejected": -14.849114312065971, "step": 2523 }, { "epoch": 0.6315526085324659, "grad_norm": 1.03125, "kl": 1.0947158336639404, "learning_rate": 5e-06, "logits/chosen": -53513393.23076923, "logits/rejected": -65191709.09090909, "logps/chosen": -472.59848257211536, "logps/rejected": -646.4961825284091, "loss": 0.0115, "rewards/chosen": 10.214528010441708, "rewards/margins": 27.20546983838915, "rewards/rejected": -16.99094182794744, "step": 2524 }, { "epoch": 0.6318028274740398, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59435739.428571425, "logits/rejected": -73352371.2, "logps/chosen": -448.9432896205357, "logps/rejected": -827.290234375, "loss": 0.0169, "rewards/chosen": 8.524293082101005, "rewards/margins": 27.832834952218192, "rewards/rejected": -19.30854187011719, "step": 2525 }, { "epoch": 0.6320530464156137, "grad_norm": 1.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32835399.529411763, "logits/rejected": -51940672.0, "logps/chosen": -375.8563878676471, "logps/rejected": -784.8980887276786, "loss": 0.0244, "rewards/chosen": 7.951719396254596, "rewards/margins": 28.77206023400571, "rewards/rejected": -20.820340837751115, "step": 2526 }, { "epoch": 0.6323032653571875, "grad_norm": 16.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40449703.384615384, "logits/rejected": -51994827.63636363, "logps/chosen": -373.72205528846155, "logps/rejected": -765.6644176136364, "loss": 0.058, "rewards/chosen": 4.469235053429236, "rewards/margins": 26.93773675131631, "rewards/rejected": -22.468501697887074, "step": 2527 }, { "epoch": 0.6325534842987615, "grad_norm": 1.25, "kl": 0.026159923523664474, "learning_rate": 5e-06, "logits/chosen": -39475293.333333336, "logits/rejected": -61318085.333333336, "logps/chosen": -438.5337320963542, "logps/rejected": -610.8920084635416, "loss": 0.0053, "rewards/chosen": 9.895198186238607, "rewards/margins": 27.74160067240397, "rewards/rejected": -17.846402486165363, "step": 2528 }, { "epoch": 0.6328037032403353, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -90446624.0, "logits/rejected": -25616828.444444444, "logps/chosen": -465.4545084635417, "logps/rejected": -445.6344401041667, "loss": 0.0468, "rewards/chosen": 6.079484939575195, "rewards/margins": 19.90537537468804, "rewards/rejected": -13.825890435112846, "step": 2529 }, { "epoch": 0.6330539221819091, "grad_norm": 3.28125, "kl": 5.753346920013428, "learning_rate": 5e-06, "logits/chosen": -34063037.333333336, "logits/rejected": 33536949.333333332, "logps/chosen": -304.8963623046875, "logps/rejected": -720.7193196614584, "loss": 0.0281, "rewards/chosen": 7.834623336791992, "rewards/margins": 25.427427291870117, "rewards/rejected": -17.592803955078125, "step": 2530 }, { "epoch": 0.6333041411234831, "grad_norm": 12.5625, "kl": 9.607837677001953, "learning_rate": 5e-06, "logits/chosen": -84769389.71428572, "logits/rejected": -63669670.4, "logps/chosen": -522.7791573660714, "logps/rejected": -661.33623046875, "loss": 0.0554, "rewards/chosen": 8.110807691301618, "rewards/margins": 25.702882276262557, "rewards/rejected": -17.592074584960937, "step": 2531 }, { "epoch": 0.6335543600650569, "grad_norm": 9.875, "kl": 11.240137100219727, "learning_rate": 5e-06, "logits/chosen": -43457474.666666664, "logits/rejected": -40157626.666666664, "logps/chosen": -417.1097005208333, "logps/rejected": -453.2268880208333, "loss": 0.0649, "rewards/chosen": 7.763933817545573, "rewards/margins": 21.987224578857422, "rewards/rejected": -14.22329076131185, "step": 2532 }, { "epoch": 0.6338045790066308, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11426691.555555556, "logits/rejected": -54092083.2, "logps/chosen": -383.9098849826389, "logps/rejected": -463.32643229166666, "loss": 0.0147, "rewards/chosen": 9.147364298502604, "rewards/margins": 21.69699198404948, "rewards/rejected": -12.549627685546875, "step": 2533 }, { "epoch": 0.6340547979482046, "grad_norm": 17.125, "kl": 5.874564170837402, "learning_rate": 5e-06, "logits/chosen": -66065712.0, "logits/rejected": -48915084.0, "logps/chosen": -411.32861328125, "logps/rejected": -392.4386901855469, "loss": 0.0732, "rewards/chosen": 7.4850945472717285, "rewards/margins": 19.656890392303467, "rewards/rejected": -12.171795845031738, "step": 2534 }, { "epoch": 0.6343050168897786, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73429000.0, "logits/rejected": -31296168.0, "logps/chosen": -354.84466552734375, "logps/rejected": -570.5792846679688, "loss": 0.0206, "rewards/chosen": 7.279775619506836, "rewards/margins": 23.8420467376709, "rewards/rejected": -16.562271118164062, "step": 2535 }, { "epoch": 0.6345552358313524, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38264850.666666664, "logits/rejected": -67447808.0, "logps/chosen": -371.488037109375, "logps/rejected": -774.5887044270834, "loss": 0.0236, "rewards/chosen": 7.023232777913411, "rewards/margins": 24.775208791097004, "rewards/rejected": -17.751976013183594, "step": 2536 }, { "epoch": 0.6348054547729263, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17698210.46153846, "logits/rejected": -55249338.18181818, "logps/chosen": -248.60697115384616, "logps/rejected": -647.9096235795455, "loss": 0.054, "rewards/chosen": 6.410416236290565, "rewards/margins": 26.86635162780335, "rewards/rejected": -20.455935391512785, "step": 2537 }, { "epoch": 0.6350556737145002, "grad_norm": 7.875, "kl": 4.359576225280762, "learning_rate": 5e-06, "logits/chosen": -82343429.81818181, "logits/rejected": -27947869.53846154, "logps/chosen": -425.4182794744318, "logps/rejected": -543.4133112980769, "loss": 0.0244, "rewards/chosen": 7.895267833362926, "rewards/margins": 22.897347723687446, "rewards/rejected": -15.00207989032452, "step": 2538 }, { "epoch": 0.6353058926560741, "grad_norm": 3.4375, "kl": 1.2204082012176514, "learning_rate": 5e-06, "logits/chosen": -72080561.23076923, "logits/rejected": -59333690.18181818, "logps/chosen": -473.35509314903845, "logps/rejected": -680.5055930397727, "loss": 0.0076, "rewards/chosen": 9.993110069861778, "rewards/margins": 29.56628567355496, "rewards/rejected": -19.573175603693183, "step": 2539 }, { "epoch": 0.6355561115976479, "grad_norm": 0.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64160458.666666664, "logits/rejected": -83578677.33333333, "logps/chosen": -426.4730224609375, "logps/rejected": -705.80517578125, "loss": 0.0092, "rewards/chosen": 9.952310562133789, "rewards/margins": 28.4623228708903, "rewards/rejected": -18.51001230875651, "step": 2540 }, { "epoch": 0.6358063305392219, "grad_norm": 9.625, "kl": 3.2951273918151855, "learning_rate": 5e-06, "logits/chosen": -25205796.57142857, "logits/rejected": -73031590.4, "logps/chosen": -410.46718052455356, "logps/rejected": -429.57119140625, "loss": 0.0345, "rewards/chosen": 7.556738172258649, "rewards/margins": 21.21426685878209, "rewards/rejected": -13.657528686523438, "step": 2541 }, { "epoch": 0.6360565494807957, "grad_norm": 13.8125, "kl": 6.436875820159912, "learning_rate": 5e-06, "logits/chosen": -44706156.307692304, "logits/rejected": -45259485.09090909, "logps/chosen": -469.2804987980769, "logps/rejected": -649.1965553977273, "loss": 0.0252, "rewards/chosen": 10.109105036808895, "rewards/margins": 30.280660989401223, "rewards/rejected": -20.17155595259233, "step": 2542 }, { "epoch": 0.6363067684223696, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43531886.54545455, "logits/rejected": -6025697.230769231, "logps/chosen": -367.37300248579544, "logps/rejected": -682.2079326923077, "loss": 0.0077, "rewards/chosen": 8.827782370827414, "rewards/margins": 26.245462377588233, "rewards/rejected": -17.41768000676082, "step": 2543 }, { "epoch": 0.6365569873639435, "grad_norm": 13.125, "kl": 1.0708554983139038, "learning_rate": 5e-06, "logits/chosen": -91921429.33333333, "logits/rejected": -16459538.666666666, "logps/chosen": -534.0583089192709, "logps/rejected": -550.5040283203125, "loss": 0.0805, "rewards/chosen": 10.274063110351562, "rewards/margins": 21.978111267089844, "rewards/rejected": -11.704048156738281, "step": 2544 }, { "epoch": 0.6368072063055173, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46336632.0, "logits/rejected": -31558392.0, "logps/chosen": -293.7705078125, "logps/rejected": -542.282958984375, "loss": 0.0161, "rewards/chosen": 7.196949481964111, "rewards/margins": 19.90035581588745, "rewards/rejected": -12.70340633392334, "step": 2545 }, { "epoch": 0.6370574252470912, "grad_norm": 1.515625, "kl": 2.2637782096862793, "learning_rate": 5e-06, "logits/chosen": -39512566.4, "logits/rejected": -68732182.85714285, "logps/chosen": -424.83955078125, "logps/rejected": -586.6111886160714, "loss": 0.0101, "rewards/chosen": 8.650949096679687, "rewards/margins": 24.075657871791293, "rewards/rejected": -15.424708775111608, "step": 2546 }, { "epoch": 0.637307644188665, "grad_norm": 1.7421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 40297088.0, "logits/rejected": -62848903.11111111, "logps/chosen": -545.55546875, "logps/rejected": -671.9635416666666, "loss": 0.0227, "rewards/chosen": 8.619359334309896, "rewards/margins": 26.290486653645836, "rewards/rejected": -17.671127319335938, "step": 2547 }, { "epoch": 0.637557863130239, "grad_norm": 1.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37049280.0, "logits/rejected": -32221056.0, "logps/chosen": -372.17744584517044, "logps/rejected": -431.46424278846155, "loss": 0.0268, "rewards/chosen": 7.39745400168679, "rewards/margins": 18.247184273246287, "rewards/rejected": -10.849730271559496, "step": 2548 }, { "epoch": 0.6378080820718128, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46406762.666666664, "logits/rejected": -32089821.866666667, "logps/chosen": -321.1032443576389, "logps/rejected": -628.3783203125, "loss": 0.0422, "rewards/chosen": 5.7843475341796875, "rewards/margins": 23.312832641601563, "rewards/rejected": -17.528485107421876, "step": 2549 }, { "epoch": 0.6380583010133867, "grad_norm": 16.875, "kl": 1.0286548137664795, "learning_rate": 5e-06, "logits/chosen": -42210924.307692304, "logits/rejected": -15148734.545454545, "logps/chosen": -412.9252178485577, "logps/rejected": -406.46835049715907, "loss": 0.0766, "rewards/chosen": 8.12040064885066, "rewards/margins": 17.835243011688018, "rewards/rejected": -9.714842362837357, "step": 2550 }, { "epoch": 0.6383085199549606, "grad_norm": 14.75, "kl": 1.0867408514022827, "learning_rate": 5e-06, "logits/chosen": -60261650.28571428, "logits/rejected": -39730502.4, "logps/chosen": -365.9883510044643, "logps/rejected": -697.97705078125, "loss": 0.057, "rewards/chosen": 6.718558175223214, "rewards/margins": 25.85977063860212, "rewards/rejected": -19.141212463378906, "step": 2551 }, { "epoch": 0.6385587388965345, "grad_norm": 0.93359375, "kl": 2.429473876953125, "learning_rate": 5e-06, "logits/chosen": -53061090.13333333, "logits/rejected": -48269105.777777776, "logps/chosen": -460.12275390625, "logps/rejected": -900.0715603298611, "loss": 0.0027, "rewards/chosen": 8.950199381510417, "rewards/margins": 29.09427286783854, "rewards/rejected": -20.144073486328125, "step": 2552 }, { "epoch": 0.6388089578381083, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36669571.2, "logits/rejected": -64002157.71428572, "logps/chosen": -335.831396484375, "logps/rejected": -828.0129743303571, "loss": 0.0498, "rewards/chosen": 7.682994842529297, "rewards/margins": 25.14322934831892, "rewards/rejected": -17.46023450578962, "step": 2553 }, { "epoch": 0.6390591767796823, "grad_norm": 3.390625, "kl": 6.784902095794678, "learning_rate": 5e-06, "logits/chosen": -43633937.45454545, "logits/rejected": -72084475.07692307, "logps/chosen": -364.6028497869318, "logps/rejected": -729.9070763221154, "loss": 0.0309, "rewards/chosen": 8.639443137428977, "rewards/margins": 19.99520297817417, "rewards/rejected": -11.355759840745192, "step": 2554 }, { "epoch": 0.6393093957212561, "grad_norm": 5.71875, "kl": 1.9058170318603516, "learning_rate": 5e-06, "logits/chosen": -33871562.666666664, "logits/rejected": -59478357.333333336, "logps/chosen": -397.8621419270833, "logps/rejected": -467.5590006510417, "loss": 0.0272, "rewards/chosen": 7.556326548258464, "rewards/margins": 20.83548863728841, "rewards/rejected": -13.279162089029947, "step": 2555 }, { "epoch": 0.63955961466283, "grad_norm": 3.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69160774.4, "logits/rejected": -57966518.85714286, "logps/chosen": -419.158984375, "logps/rejected": -613.0360630580357, "loss": 0.0149, "rewards/chosen": 7.633789825439453, "rewards/margins": 22.48558120727539, "rewards/rejected": -14.851791381835938, "step": 2556 }, { "epoch": 0.6398098336044038, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34730304.0, "logits/rejected": -50187769.6, "logps/chosen": -348.24246651785717, "logps/rejected": -544.58701171875, "loss": 0.0341, "rewards/chosen": 7.814400809151786, "rewards/margins": 20.10708956037249, "rewards/rejected": -12.292688751220703, "step": 2557 }, { "epoch": 0.6400600525459778, "grad_norm": 1.953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29714656.0, "logits/rejected": -48258982.4, "logps/chosen": -420.725830078125, "logps/rejected": -581.8076171875, "loss": 0.0136, "rewards/chosen": 8.815721299913195, "rewards/margins": 22.09341803656684, "rewards/rejected": -13.277696736653645, "step": 2558 }, { "epoch": 0.6403102714875516, "grad_norm": 4.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30447932.444444444, "logits/rejected": -49111176.53333333, "logps/chosen": -383.51207139756946, "logps/rejected": -623.5916666666667, "loss": 0.0194, "rewards/chosen": 9.108394198947483, "rewards/margins": 21.439952426486546, "rewards/rejected": -12.331558227539062, "step": 2559 }, { "epoch": 0.6405604904291254, "grad_norm": 15.375, "kl": 13.128257751464844, "learning_rate": 5e-06, "logits/chosen": -56417424.0, "logits/rejected": -38172380.0, "logps/chosen": -374.05157470703125, "logps/rejected": -694.3133544921875, "loss": 0.0496, "rewards/chosen": 7.839700698852539, "rewards/margins": 23.434576988220215, "rewards/rejected": -15.594876289367676, "step": 2560 }, { "epoch": 0.6408107093706994, "grad_norm": 6.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53447286.4, "logits/rejected": -39319300.571428575, "logps/chosen": -350.4548095703125, "logps/rejected": -612.0949358258929, "loss": 0.0107, "rewards/chosen": 6.983589172363281, "rewards/margins": 20.900030953543528, "rewards/rejected": -13.916441781180245, "step": 2561 }, { "epoch": 0.6410609283122732, "grad_norm": 6.0, "kl": 7.286403656005859, "learning_rate": 5e-06, "logits/chosen": -56404580.571428575, "logits/rejected": -7504136.0, "logps/chosen": -516.9685407366071, "logps/rejected": -596.980810546875, "loss": 0.0477, "rewards/chosen": 8.52756336757115, "rewards/margins": 22.6569212777274, "rewards/rejected": -14.12935791015625, "step": 2562 }, { "epoch": 0.6413111472538471, "grad_norm": 7.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55910715.07692308, "logits/rejected": -37080794.18181818, "logps/chosen": -336.80130709134613, "logps/rejected": -506.1842151988636, "loss": 0.0286, "rewards/chosen": 7.195357689490685, "rewards/margins": 19.788637441355032, "rewards/rejected": -12.593279751864346, "step": 2563 }, { "epoch": 0.641561366195421, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45254087.11111111, "logits/rejected": -55797064.53333333, "logps/chosen": -465.9475368923611, "logps/rejected": -476.24462890625, "loss": 0.0127, "rewards/chosen": 9.473346286349827, "rewards/margins": 22.86783481174045, "rewards/rejected": -13.394488525390624, "step": 2564 }, { "epoch": 0.6418115851369949, "grad_norm": 10.5625, "kl": 7.268516540527344, "learning_rate": 5e-06, "logits/chosen": -48703084.0, "logits/rejected": -51075600.0, "logps/chosen": -421.0494384765625, "logps/rejected": -463.1352233886719, "loss": 0.0502, "rewards/chosen": 8.408331871032715, "rewards/margins": 21.744495391845703, "rewards/rejected": -13.336163520812988, "step": 2565 }, { "epoch": 0.6420618040785687, "grad_norm": 15.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30143416.0, "logits/rejected": -16358773.0, "logps/chosen": -408.65142822265625, "logps/rejected": -688.597412109375, "loss": 0.0465, "rewards/chosen": 7.362784385681152, "rewards/margins": 27.470458030700684, "rewards/rejected": -20.10767364501953, "step": 2566 }, { "epoch": 0.6423120230201427, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54584772.0, "logits/rejected": -59398960.0, "logps/chosen": -325.2305603027344, "logps/rejected": -780.05419921875, "loss": 0.0364, "rewards/chosen": 6.76908540725708, "rewards/margins": 22.965229511260986, "rewards/rejected": -16.196144104003906, "step": 2567 }, { "epoch": 0.6425622419617165, "grad_norm": 9.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18299721.846153848, "logits/rejected": -44874088.72727273, "logps/chosen": -230.95872145432693, "logps/rejected": -744.8631036931819, "loss": 0.0441, "rewards/chosen": 5.834239666278545, "rewards/margins": 24.437835746711784, "rewards/rejected": -18.60359608043324, "step": 2568 }, { "epoch": 0.6428124609032904, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -78592526.22222222, "logits/rejected": -35172352.0, "logps/chosen": -445.94135199652777, "logps/rejected": -443.7384765625, "loss": 0.0207, "rewards/chosen": 9.715159098307291, "rewards/margins": 20.093860880533853, "rewards/rejected": -10.378701782226562, "step": 2569 }, { "epoch": 0.6430626798448642, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69234278.4, "logits/rejected": -37025926.85714286, "logps/chosen": -354.488525390625, "logps/rejected": -576.2324916294643, "loss": 0.0696, "rewards/chosen": 7.260047912597656, "rewards/margins": 21.37125723702567, "rewards/rejected": -14.111209324428014, "step": 2570 }, { "epoch": 0.6433128987864382, "grad_norm": 9.0625, "kl": 8.20822525024414, "learning_rate": 5e-06, "logits/chosen": -61277644.8, "logits/rejected": -47898055.11111111, "logps/chosen": -368.7809244791667, "logps/rejected": -461.22645399305554, "loss": 0.0355, "rewards/chosen": 7.210947672526042, "rewards/margins": 16.96502685546875, "rewards/rejected": -9.754079182942709, "step": 2571 }, { "epoch": 0.643563117728012, "grad_norm": 1.9140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 15208137.142857144, "logits/rejected": -30714652.23529412, "logps/chosen": -434.591552734375, "logps/rejected": -570.2740119485294, "loss": 0.0091, "rewards/chosen": 7.537310464041574, "rewards/margins": 21.761845340247916, "rewards/rejected": -14.224534876206341, "step": 2572 }, { "epoch": 0.6438133366695858, "grad_norm": 7.8125, "kl": 8.893774032592773, "learning_rate": 5e-06, "logits/chosen": -62331833.6, "logits/rejected": -63623762.28571428, "logps/chosen": -494.33310546875, "logps/rejected": -761.7556501116071, "loss": 0.0109, "rewards/chosen": 8.482386779785156, "rewards/margins": 27.89463566371373, "rewards/rejected": -19.412248883928573, "step": 2573 }, { "epoch": 0.6440635556111598, "grad_norm": 4.3125, "kl": 0.5777009725570679, "learning_rate": 5e-06, "logits/chosen": -60537403.428571425, "logits/rejected": -36284515.2, "logps/chosen": -414.9247349330357, "logps/rejected": -562.340380859375, "loss": 0.0374, "rewards/chosen": 9.768253871372767, "rewards/margins": 27.31368691580636, "rewards/rejected": -17.545433044433594, "step": 2574 }, { "epoch": 0.6443137745527336, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35045336.0, "logits/rejected": -54628672.0, "logps/chosen": -523.97021484375, "logps/rejected": -754.6220703125, "loss": 0.0091, "rewards/chosen": 8.328710556030273, "rewards/margins": 27.313872655232746, "rewards/rejected": -18.985162099202473, "step": 2575 }, { "epoch": 0.6445639934943075, "grad_norm": 29.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73135059.2, "logits/rejected": -73045385.14285715, "logps/chosen": -470.2392578125, "logps/rejected": -774.4449637276786, "loss": 0.0203, "rewards/chosen": 8.414547729492188, "rewards/margins": 28.71714041573661, "rewards/rejected": -20.30259268624442, "step": 2576 }, { "epoch": 0.6448142124358814, "grad_norm": 0.8203125, "kl": 6.586106777191162, "learning_rate": 5e-06, "logits/chosen": -53333492.36363637, "logits/rejected": -83585860.92307693, "logps/chosen": -338.6180308948864, "logps/rejected": -767.6663912259615, "loss": 0.0091, "rewards/chosen": 8.525799837979404, "rewards/margins": 30.99727529245657, "rewards/rejected": -22.471475454477165, "step": 2577 }, { "epoch": 0.6450644313774553, "grad_norm": 21.25, "kl": 11.7147855758667, "learning_rate": 5e-06, "logits/chosen": -37827764.705882356, "logits/rejected": -49800978.28571428, "logps/chosen": -289.60305606617646, "logps/rejected": -701.91845703125, "loss": 0.1187, "rewards/chosen": 6.395715152516084, "rewards/margins": 22.958223871824117, "rewards/rejected": -16.562508719308035, "step": 2578 }, { "epoch": 0.6453146503190291, "grad_norm": 11.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41999506.28571428, "logits/rejected": -69891960.47058824, "logps/chosen": -296.8842075892857, "logps/rejected": -640.5083869485294, "loss": 0.0226, "rewards/chosen": 7.108986445835659, "rewards/margins": 26.319243583358634, "rewards/rejected": -19.210257137522976, "step": 2579 }, { "epoch": 0.6455648692606031, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 18499828.0, "logits/rejected": -41649420.0, "logps/chosen": -309.2397155761719, "logps/rejected": -451.593017578125, "loss": 0.0302, "rewards/chosen": 6.208970069885254, "rewards/margins": 19.026702880859375, "rewards/rejected": -12.817732810974121, "step": 2580 }, { "epoch": 0.6458150882021769, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18156586.666666668, "logits/rejected": -21711742.666666668, "logps/chosen": -380.1309407552083, "logps/rejected": -575.7657063802084, "loss": 0.0244, "rewards/chosen": 9.304210662841797, "rewards/margins": 25.16513442993164, "rewards/rejected": -15.860923767089844, "step": 2581 }, { "epoch": 0.6460653071437508, "grad_norm": 0.412109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57841184.0, "logits/rejected": -71727394.13333334, "logps/chosen": -327.78602430555554, "logps/rejected": -674.1600911458333, "loss": 0.0011, "rewards/chosen": 8.342170715332031, "rewards/margins": 26.129833475748697, "rewards/rejected": -17.787662760416666, "step": 2582 }, { "epoch": 0.6463155260853246, "grad_norm": 14.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65526245.333333336, "logits/rejected": -41805144.0, "logps/chosen": -394.0609130859375, "logps/rejected": -636.5171305338541, "loss": 0.0529, "rewards/chosen": 7.537298202514648, "rewards/margins": 24.29124387105306, "rewards/rejected": -16.75394566853841, "step": 2583 }, { "epoch": 0.6465657450268986, "grad_norm": 27.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41662989.71428572, "logits/rejected": -55637952.0, "logps/chosen": -524.7522670200893, "logps/rejected": -603.49169921875, "loss": 0.0438, "rewards/chosen": 8.390419006347656, "rewards/margins": 22.677143859863282, "rewards/rejected": -14.286724853515626, "step": 2584 }, { "epoch": 0.6468159639684724, "grad_norm": 0.9453125, "kl": 1.057313323020935, "learning_rate": 5e-06, "logits/chosen": -49415648.0, "logits/rejected": -75937717.33333333, "logps/chosen": -539.6743977864584, "logps/rejected": -685.6486002604166, "loss": 0.0026, "rewards/chosen": 9.297260284423828, "rewards/margins": 25.318923950195312, "rewards/rejected": -16.021663665771484, "step": 2585 }, { "epoch": 0.6470661829100463, "grad_norm": 2.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51800115.2, "logits/rejected": -64162779.428571425, "logps/chosen": -408.997265625, "logps/rejected": -641.6016322544643, "loss": 0.0075, "rewards/chosen": 8.410508728027343, "rewards/margins": 27.354600524902345, "rewards/rejected": -18.944091796875, "step": 2586 }, { "epoch": 0.6473164018516202, "grad_norm": 6.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41831580.44444445, "logits/rejected": -66965619.2, "logps/chosen": -400.2749837239583, "logps/rejected": -724.1975260416667, "loss": 0.021, "rewards/chosen": 7.043175591362847, "rewards/margins": 26.89600355360243, "rewards/rejected": -19.852827962239584, "step": 2587 }, { "epoch": 0.647566620793194, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74825216.0, "logits/rejected": -52825144.0, "logps/chosen": -315.0334167480469, "logps/rejected": -582.433349609375, "loss": 0.0316, "rewards/chosen": 8.182498931884766, "rewards/margins": 24.657867431640625, "rewards/rejected": -16.47536849975586, "step": 2588 }, { "epoch": 0.6478168397347679, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37116824.615384616, "logits/rejected": -41007755.63636363, "logps/chosen": -270.18075796274036, "logps/rejected": -716.0687144886364, "loss": 0.0857, "rewards/chosen": 4.2695943392240086, "rewards/margins": 22.22346563272543, "rewards/rejected": -17.95387129350142, "step": 2589 }, { "epoch": 0.6480670586763418, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38460898.666666664, "logits/rejected": -43403832.88888889, "logps/chosen": -541.3479817708334, "logps/rejected": -611.3713650173611, "loss": 0.0083, "rewards/chosen": 8.231866836547852, "rewards/margins": 23.593986723158096, "rewards/rejected": -15.362119886610243, "step": 2590 }, { "epoch": 0.6483172776179157, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38085765.333333336, "logits/rejected": -42131280.0, "logps/chosen": -418.1512858072917, "logps/rejected": -584.90966796875, "loss": 0.0307, "rewards/chosen": 8.975790659586588, "rewards/margins": 23.90011723836263, "rewards/rejected": -14.924326578776041, "step": 2591 }, { "epoch": 0.6485674965594895, "grad_norm": 20.25, "kl": 2.9637413024902344, "learning_rate": 5e-06, "logits/chosen": -58572352.0, "logits/rejected": 85506803.2, "logps/chosen": -247.87142508370536, "logps/rejected": -681.45712890625, "loss": 0.0784, "rewards/chosen": 5.94172123500279, "rewards/margins": 19.759313092912947, "rewards/rejected": -13.817591857910156, "step": 2592 }, { "epoch": 0.6488177155010635, "grad_norm": 4.53125, "kl": 0.24083614349365234, "learning_rate": 5e-06, "logits/chosen": -66703488.0, "logits/rejected": -36864576.0, "logps/chosen": -408.09033203125, "logps/rejected": -499.81069711538464, "loss": 0.0575, "rewards/chosen": 7.743243824351918, "rewards/margins": 22.046770482630166, "rewards/rejected": -14.303526658278246, "step": 2593 }, { "epoch": 0.6490679344426373, "grad_norm": 11.875, "kl": 0.5005773305892944, "learning_rate": 5e-06, "logits/chosen": -68663125.33333333, "logits/rejected": -60389319.11111111, "logps/chosen": -393.26490885416666, "logps/rejected": -795.6804470486111, "loss": 0.0527, "rewards/chosen": 7.1790308634440105, "rewards/margins": 27.201057773166234, "rewards/rejected": -20.02202690972222, "step": 2594 }, { "epoch": 0.6493181533842112, "grad_norm": 22.75, "kl": 1.237609624862671, "learning_rate": 5e-06, "logits/chosen": -38587463.11111111, "logits/rejected": -44111364.266666666, "logps/chosen": -355.4597981770833, "logps/rejected": -542.9458984375, "loss": 0.0505, "rewards/chosen": 8.114183213975695, "rewards/margins": 21.827253892686635, "rewards/rejected": -13.713070678710938, "step": 2595 }, { "epoch": 0.649568372325785, "grad_norm": 6.875, "kl": 4.719232082366943, "learning_rate": 5e-06, "logits/chosen": -63919808.0, "logits/rejected": -58837194.666666664, "logps/chosen": -337.81231689453125, "logps/rejected": -636.07421875, "loss": 0.0752, "rewards/chosen": 7.54776128133138, "rewards/margins": 19.845111846923828, "rewards/rejected": -12.297350565592447, "step": 2596 }, { "epoch": 0.649818591267359, "grad_norm": 18.125, "kl": 11.642326354980469, "learning_rate": 5e-06, "logits/chosen": -39975397.64705882, "logits/rejected": -49494884.571428575, "logps/chosen": -366.72435087316177, "logps/rejected": -570.8751395089286, "loss": 0.0718, "rewards/chosen": 8.154209810144762, "rewards/margins": 21.989785458861277, "rewards/rejected": -13.835575648716517, "step": 2597 }, { "epoch": 0.6500688102089328, "grad_norm": 5.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41449570.90909091, "logits/rejected": -60956957.538461536, "logps/chosen": -380.1803533380682, "logps/rejected": -663.2719350961538, "loss": 0.039, "rewards/chosen": 8.132714705033736, "rewards/margins": 27.300188131265706, "rewards/rejected": -19.16747342623197, "step": 2598 }, { "epoch": 0.6503190291505067, "grad_norm": 11.6875, "kl": 3.1423709392547607, "learning_rate": 5e-06, "logits/chosen": -36936561.23076923, "logits/rejected": -81594624.0, "logps/chosen": -331.15767728365387, "logps/rejected": -920.0326704545455, "loss": 0.0523, "rewards/chosen": 7.3363811786358175, "rewards/margins": 34.22248125409747, "rewards/rejected": -26.88610007546165, "step": 2599 }, { "epoch": 0.6505692480920806, "grad_norm": 3.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53727578.666666664, "logits/rejected": -41584160.0, "logps/chosen": -281.21006266276044, "logps/rejected": -617.6715901692709, "loss": 0.0219, "rewards/chosen": 7.237630208333333, "rewards/margins": 23.007659912109375, "rewards/rejected": -15.770029703776041, "step": 2600 }, { "epoch": 0.6508194670336545, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22713586.90909091, "logits/rejected": -54916745.84615385, "logps/chosen": -363.50972123579544, "logps/rejected": -601.8341346153846, "loss": 0.0154, "rewards/chosen": 7.86307109485973, "rewards/margins": 24.438617359508168, "rewards/rejected": -16.575546264648438, "step": 2601 }, { "epoch": 0.6510696859752283, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39878627.2, "logits/rejected": -44275725.71428572, "logps/chosen": -452.378662109375, "logps/rejected": -631.1036551339286, "loss": 0.007, "rewards/chosen": 9.500386810302734, "rewards/margins": 25.22063762119838, "rewards/rejected": -15.720250810895648, "step": 2602 }, { "epoch": 0.6513199049168023, "grad_norm": 2.578125, "kl": 6.551934719085693, "learning_rate": 5e-06, "logits/chosen": -37560320.0, "logits/rejected": -32953948.8, "logps/chosen": -380.8405064174107, "logps/rejected": -537.02451171875, "loss": 0.0408, "rewards/chosen": 9.16228267124721, "rewards/margins": 22.43976069859096, "rewards/rejected": -13.27747802734375, "step": 2603 }, { "epoch": 0.6515701238583761, "grad_norm": 10.75, "kl": 2.7489497661590576, "learning_rate": 5e-06, "logits/chosen": -48473190.4, "logits/rejected": -48778688.0, "logps/chosen": -272.7330729166667, "logps/rejected": -877.6409505208334, "loss": 0.0483, "rewards/chosen": 7.909400431315104, "rewards/margins": 34.88739047580295, "rewards/rejected": -26.977990044487846, "step": 2604 }, { "epoch": 0.6518203427999499, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33295990.4, "logits/rejected": -77323437.71428572, "logps/chosen": -345.87158203125, "logps/rejected": -832.4060407366071, "loss": 0.0222, "rewards/chosen": 7.453889465332031, "rewards/margins": 29.814868818010602, "rewards/rejected": -22.360979352678573, "step": 2605 }, { "epoch": 0.6520705617415238, "grad_norm": 1.8515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34417036.307692304, "logits/rejected": -48681489.45454545, "logps/chosen": -199.28910006009616, "logps/rejected": -614.1506125710227, "loss": 0.0605, "rewards/chosen": 5.568749060997596, "rewards/margins": 22.44240351323481, "rewards/rejected": -16.873654452237215, "step": 2606 }, { "epoch": 0.6523207806830977, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62808541.09090909, "logits/rejected": -29936561.230769232, "logps/chosen": -391.92649147727275, "logps/rejected": -619.8834134615385, "loss": 0.0287, "rewards/chosen": 9.098356767134232, "rewards/margins": 23.90413820493471, "rewards/rejected": -14.80578143780048, "step": 2607 }, { "epoch": 0.6525709996246716, "grad_norm": 1.0234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45054949.333333336, "logits/rejected": -37143936.0, "logps/chosen": -422.9926350911458, "logps/rejected": -481.450927734375, "loss": 0.0235, "rewards/chosen": 9.71894391377767, "rewards/margins": 23.11368497212728, "rewards/rejected": -13.39474105834961, "step": 2608 }, { "epoch": 0.6528212185662454, "grad_norm": 4.09375, "kl": 4.311176776885986, "learning_rate": 5e-06, "logits/chosen": -68310714.66666667, "logits/rejected": -42685448.0, "logps/chosen": -415.047607421875, "logps/rejected": -511.8459065755208, "loss": 0.0219, "rewards/chosen": 9.17621103922526, "rewards/margins": 22.7979736328125, "rewards/rejected": -13.62176259358724, "step": 2609 }, { "epoch": 0.6530714375078194, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66322656.0, "logits/rejected": -69696725.33333333, "logps/chosen": -421.3140869140625, "logps/rejected": -757.9734700520834, "loss": 0.044, "rewards/chosen": 8.70077641805013, "rewards/margins": 23.71949640909831, "rewards/rejected": -15.018719991048178, "step": 2610 }, { "epoch": 0.6533216564493932, "grad_norm": 2.59375, "kl": 5.4555487632751465, "learning_rate": 5e-06, "logits/chosen": -44096768.0, "logits/rejected": -18919473.6, "logps/chosen": -390.1876743861607, "logps/rejected": -573.32109375, "loss": 0.006, "rewards/chosen": 9.067035130092076, "rewards/margins": 20.599814060756138, "rewards/rejected": -11.532778930664062, "step": 2611 }, { "epoch": 0.6535718753909671, "grad_norm": 2.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19577036.0, "logits/rejected": -49247404.0, "logps/chosen": -321.8989562988281, "logps/rejected": -562.6668701171875, "loss": 0.0041, "rewards/chosen": 7.552963733673096, "rewards/margins": 23.31929063796997, "rewards/rejected": -15.766326904296875, "step": 2612 }, { "epoch": 0.653822094332541, "grad_norm": 3.921875, "kl": 6.442388534545898, "learning_rate": 5e-06, "logits/chosen": -74628342.15384616, "logits/rejected": -41449364.36363637, "logps/chosen": -438.7626953125, "logps/rejected": -608.5917524857955, "loss": 0.0099, "rewards/chosen": 9.71207486666166, "rewards/margins": 22.128789621633253, "rewards/rejected": -12.416714754971592, "step": 2613 }, { "epoch": 0.6540723132741149, "grad_norm": 2.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51834160.0, "logits/rejected": -33132392.0, "logps/chosen": -382.906005859375, "logps/rejected": -702.3878580729166, "loss": 0.0201, "rewards/chosen": 8.743253707885742, "rewards/margins": 25.969911575317383, "rewards/rejected": -17.22665786743164, "step": 2614 }, { "epoch": 0.6543225322156887, "grad_norm": 13.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41921804.8, "logits/rejected": -35741430.85714286, "logps/chosen": -424.86083984375, "logps/rejected": -463.12461635044644, "loss": 0.0334, "rewards/chosen": 8.369960021972656, "rewards/margins": 19.491855948311944, "rewards/rejected": -11.121895926339286, "step": 2615 }, { "epoch": 0.6545727511572627, "grad_norm": 3.8125, "kl": 4.10398006439209, "learning_rate": 5e-06, "logits/chosen": -41749198.54545455, "logits/rejected": -45669518.76923077, "logps/chosen": -484.33615944602275, "logps/rejected": -586.2886117788462, "loss": 0.0316, "rewards/chosen": 10.513512351296164, "rewards/margins": 24.996315696022727, "rewards/rejected": -14.482803344726562, "step": 2616 }, { "epoch": 0.6548229700988365, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40621316.92307692, "logits/rejected": -50951941.81818182, "logps/chosen": -308.72742638221155, "logps/rejected": -660.2762340198864, "loss": 0.0441, "rewards/chosen": 7.26693608210637, "rewards/margins": 22.049863161740603, "rewards/rejected": -14.782927079634232, "step": 2617 }, { "epoch": 0.6550731890404103, "grad_norm": 13.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39337940.0, "logits/rejected": -63302864.0, "logps/chosen": -480.95306396484375, "logps/rejected": -740.02587890625, "loss": 0.0204, "rewards/chosen": 10.122875213623047, "rewards/margins": 25.432799339294434, "rewards/rejected": -15.309924125671387, "step": 2618 }, { "epoch": 0.6553234079819842, "grad_norm": 11.1875, "kl": 8.538969993591309, "learning_rate": 5e-06, "logits/chosen": -16812114.666666668, "logits/rejected": -33431994.666666668, "logps/chosen": -322.13547770182294, "logps/rejected": -572.052978515625, "loss": 0.0858, "rewards/chosen": 7.46705436706543, "rewards/margins": 22.789865493774414, "rewards/rejected": -15.322811126708984, "step": 2619 }, { "epoch": 0.6555736269235581, "grad_norm": 5.46875, "kl": 16.768688201904297, "learning_rate": 5e-06, "logits/chosen": -32138568.0, "logits/rejected": -38580504.0, "logps/chosen": -355.4034423828125, "logps/rejected": -440.1414794921875, "loss": 0.0931, "rewards/chosen": 8.695669174194336, "rewards/margins": 19.525391578674316, "rewards/rejected": -10.82972240447998, "step": 2620 }, { "epoch": 0.655823845865132, "grad_norm": 7.5625, "kl": 3.1007308959960938, "learning_rate": 5e-06, "logits/chosen": -77448777.14285715, "logits/rejected": -38874566.4, "logps/chosen": -450.631591796875, "logps/rejected": -582.0001953125, "loss": 0.0213, "rewards/chosen": 8.787045070103236, "rewards/margins": 21.606362697056362, "rewards/rejected": -12.819317626953126, "step": 2621 }, { "epoch": 0.6560740648067058, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44684787.2, "logits/rejected": -42661673.14285714, "logps/chosen": -328.97255859375, "logps/rejected": -541.1925223214286, "loss": 0.0477, "rewards/chosen": 7.321540069580078, "rewards/margins": 18.90313208443778, "rewards/rejected": -11.581592014857701, "step": 2622 }, { "epoch": 0.6563242837482798, "grad_norm": 7.21875, "kl": 8.728328704833984, "learning_rate": 5e-06, "logits/chosen": -56993664.0, "logits/rejected": -47757222.4, "logps/chosen": -450.7101353236607, "logps/rejected": -482.53330078125, "loss": 0.0386, "rewards/chosen": 9.207982744489398, "rewards/margins": 20.876314653669084, "rewards/rejected": -11.668331909179688, "step": 2623 }, { "epoch": 0.6565745026898536, "grad_norm": 9.75, "kl": 3.767920970916748, "learning_rate": 5e-06, "logits/chosen": -64791844.571428575, "logits/rejected": -11844357.6, "logps/chosen": -320.60986328125, "logps/rejected": -582.237744140625, "loss": 0.075, "rewards/chosen": 6.097769056047712, "rewards/margins": 23.326014600481308, "rewards/rejected": -17.228245544433594, "step": 2624 }, { "epoch": 0.6568247216314275, "grad_norm": 4.8125, "kl": 7.098320007324219, "learning_rate": 5e-06, "logits/chosen": -74443781.81818181, "logits/rejected": -34297223.384615384, "logps/chosen": -432.7678888494318, "logps/rejected": -487.2122145432692, "loss": 0.0092, "rewards/chosen": 10.14233467795632, "rewards/margins": 25.001911563473147, "rewards/rejected": -14.859576885516827, "step": 2625 }, { "epoch": 0.6570749405730014, "grad_norm": 8.0625, "kl": 2.5036659240722656, "learning_rate": 5e-06, "logits/chosen": -28657366.85714286, "logits/rejected": -61180857.6, "logps/chosen": -384.55381556919644, "logps/rejected": -636.5068359375, "loss": 0.0547, "rewards/chosen": 8.068157741001674, "rewards/margins": 22.446116420200894, "rewards/rejected": -14.377958679199219, "step": 2626 }, { "epoch": 0.6573251595145753, "grad_norm": 12.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -912398.4, "logits/rejected": -47735149.71428572, "logps/chosen": -331.27373046875, "logps/rejected": -594.779296875, "loss": 0.0435, "rewards/chosen": 7.6825714111328125, "rewards/margins": 22.75413077218192, "rewards/rejected": -15.071559361049108, "step": 2627 }, { "epoch": 0.6575753784561491, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33419757.714285713, "logits/rejected": -50945536.0, "logps/chosen": -279.91859654017856, "logps/rejected": -494.073681640625, "loss": 0.0531, "rewards/chosen": 6.421715872628348, "rewards/margins": 18.4920649937221, "rewards/rejected": -12.07034912109375, "step": 2628 }, { "epoch": 0.6578255973977231, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21487148.8, "logits/rejected": 105332103.1111111, "logps/chosen": -407.3789388020833, "logps/rejected": -415.9631076388889, "loss": 0.026, "rewards/chosen": 7.703874715169271, "rewards/margins": 22.485836791992188, "rewards/rejected": -14.781962076822916, "step": 2629 }, { "epoch": 0.6580758163392969, "grad_norm": 8.0, "kl": 5.744670867919922, "learning_rate": 5e-06, "logits/chosen": -48434564.92307692, "logits/rejected": -43658842.18181818, "logps/chosen": -294.0241511418269, "logps/rejected": -720.3905362215909, "loss": 0.0465, "rewards/chosen": 7.145712045522837, "rewards/margins": 27.20128732961375, "rewards/rejected": -20.05557528409091, "step": 2630 }, { "epoch": 0.6583260352808707, "grad_norm": 2.859375, "kl": 10.360231399536133, "learning_rate": 5e-06, "logits/chosen": -31647706.181818184, "logits/rejected": -59683968.0, "logps/chosen": -467.26265092329544, "logps/rejected": -525.9380258413462, "loss": 0.0044, "rewards/chosen": 10.719039223410867, "rewards/margins": 23.136323675409066, "rewards/rejected": -12.417284451998198, "step": 2631 }, { "epoch": 0.6585762542224446, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53001244.8, "logits/rejected": -24031844.57142857, "logps/chosen": -525.626904296875, "logps/rejected": -836.4875837053571, "loss": 0.0043, "rewards/chosen": 10.104798126220704, "rewards/margins": 30.257202911376954, "rewards/rejected": -20.15240478515625, "step": 2632 }, { "epoch": 0.6588264731640185, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44530180.0, "logits/rejected": -46058580.0, "logps/chosen": -293.4967346191406, "logps/rejected": -844.9837646484375, "loss": 0.0231, "rewards/chosen": 7.778436183929443, "rewards/margins": 28.70551824569702, "rewards/rejected": -20.927082061767578, "step": 2633 }, { "epoch": 0.6590766921055924, "grad_norm": 22.75, "kl": 6.236053466796875, "learning_rate": 5e-06, "logits/chosen": -63010933.333333336, "logits/rejected": -37121733.333333336, "logps/chosen": -423.2620035807292, "logps/rejected": -611.0253092447916, "loss": 0.0658, "rewards/chosen": 7.813900629679362, "rewards/margins": 25.391097386678062, "rewards/rejected": -17.5771967569987, "step": 2634 }, { "epoch": 0.6593269110471662, "grad_norm": 5.625, "kl": 17.54743194580078, "learning_rate": 5e-06, "logits/chosen": -51618589.86666667, "logits/rejected": -53489447.11111111, "logps/chosen": -367.08470052083334, "logps/rejected": -655.8415798611111, "loss": 0.0585, "rewards/chosen": 8.570930989583333, "rewards/margins": 22.192786831325954, "rewards/rejected": -13.621855841742622, "step": 2635 }, { "epoch": 0.6595771299887402, "grad_norm": 10.6875, "kl": 10.473540306091309, "learning_rate": 5e-06, "logits/chosen": -34977575.384615384, "logits/rejected": -54290164.36363637, "logps/chosen": -367.66372445913464, "logps/rejected": -696.6633522727273, "loss": 0.0769, "rewards/chosen": 8.727881798377403, "rewards/margins": 25.78831215171547, "rewards/rejected": -17.060430353338067, "step": 2636 }, { "epoch": 0.659827348930314, "grad_norm": 2.65625, "kl": 18.969594955444336, "learning_rate": 5e-06, "logits/chosen": -34904277.333333336, "logits/rejected": -50602343.11111111, "logps/chosen": -443.44622395833335, "logps/rejected": -727.5763346354166, "loss": 0.1004, "rewards/chosen": 9.874061075846354, "rewards/margins": 30.702250501844617, "rewards/rejected": -20.828189425998264, "step": 2637 }, { "epoch": 0.6600775678718879, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56556136.72727273, "logits/rejected": -51331032.615384616, "logps/chosen": -404.1983753551136, "logps/rejected": -582.3425480769231, "loss": 0.031, "rewards/chosen": 8.684429515491832, "rewards/margins": 25.402517518797122, "rewards/rejected": -16.71808800330529, "step": 2638 }, { "epoch": 0.6603277868134618, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -95580561.45454545, "logits/rejected": -25735182.769230768, "logps/chosen": -395.36337002840907, "logps/rejected": -523.5632136418269, "loss": 0.021, "rewards/chosen": 10.181863264604049, "rewards/margins": 24.509445990715825, "rewards/rejected": -14.327582726111778, "step": 2639 }, { "epoch": 0.6605780057550357, "grad_norm": 10.375, "kl": 1.0356299877166748, "learning_rate": 5e-06, "logits/chosen": -60958498.461538464, "logits/rejected": -71077952.0, "logps/chosen": -494.8004807692308, "logps/rejected": -475.3634144176136, "loss": 0.0552, "rewards/chosen": 9.590360788198618, "rewards/margins": 24.334648452438675, "rewards/rejected": -14.744287664240057, "step": 2640 }, { "epoch": 0.6608282246966095, "grad_norm": 6.875, "kl": 1.8426475524902344, "learning_rate": 5e-06, "logits/chosen": -53475032.615384616, "logits/rejected": -38617565.09090909, "logps/chosen": -306.72472205528845, "logps/rejected": -636.8140092329545, "loss": 0.047, "rewards/chosen": 7.489544795109675, "rewards/margins": 24.836225869772317, "rewards/rejected": -17.34668107466264, "step": 2641 }, { "epoch": 0.6610784436381834, "grad_norm": 0.70703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67584972.8, "logits/rejected": -34447282.28571428, "logps/chosen": -416.89189453125, "logps/rejected": -725.1089564732143, "loss": 0.0012, "rewards/chosen": 9.577384948730469, "rewards/margins": 27.50143323625837, "rewards/rejected": -17.924048287527903, "step": 2642 }, { "epoch": 0.6613286625797573, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65972309.333333336, "logits/rejected": -31151413.333333332, "logps/chosen": -431.37787543402777, "logps/rejected": -526.2116536458333, "loss": 0.0088, "rewards/chosen": 7.559441460503472, "rewards/margins": 23.66524692111545, "rewards/rejected": -16.10580546061198, "step": 2643 }, { "epoch": 0.6615788815213312, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35267704.0, "logits/rejected": -77879048.0, "logps/chosen": -368.94427490234375, "logps/rejected": -641.0972290039062, "loss": 0.0259, "rewards/chosen": 7.364168167114258, "rewards/margins": 24.97359848022461, "rewards/rejected": -17.60943031311035, "step": 2644 }, { "epoch": 0.661829100462905, "grad_norm": 18.75, "kl": 12.506287574768066, "learning_rate": 5e-06, "logits/chosen": -79812040.0, "logits/rejected": -57834392.0, "logps/chosen": -451.6444396972656, "logps/rejected": -684.90283203125, "loss": 0.0774, "rewards/chosen": 8.937474250793457, "rewards/margins": 30.96648120880127, "rewards/rejected": -22.029006958007812, "step": 2645 }, { "epoch": 0.662079319404479, "grad_norm": 6.46875, "kl": 5.260871887207031, "learning_rate": 5e-06, "logits/chosen": -62451155.692307696, "logits/rejected": -79673402.18181819, "logps/chosen": -331.97506009615387, "logps/rejected": -604.9599609375, "loss": 0.0453, "rewards/chosen": 7.36358642578125, "rewards/margins": 23.182022094726562, "rewards/rejected": -15.818435668945312, "step": 2646 }, { "epoch": 0.6623295383460528, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75212317.53846154, "logits/rejected": -59983901.09090909, "logps/chosen": -367.33067908653845, "logps/rejected": -610.1803533380681, "loss": 0.0153, "rewards/chosen": 7.38426501934345, "rewards/margins": 25.03890313635339, "rewards/rejected": -17.65463811700994, "step": 2647 }, { "epoch": 0.6625797572876266, "grad_norm": 7.25, "kl": 5.380631923675537, "learning_rate": 5e-06, "logits/chosen": -59594596.0, "logits/rejected": -58787132.0, "logps/chosen": -439.33392333984375, "logps/rejected": -561.7689208984375, "loss": 0.0424, "rewards/chosen": 8.779102325439453, "rewards/margins": 26.367700576782227, "rewards/rejected": -17.588598251342773, "step": 2648 }, { "epoch": 0.6628299762292006, "grad_norm": 8.625, "kl": 3.855585813522339, "learning_rate": 5e-06, "logits/chosen": -43827140.266666666, "logits/rejected": -73567402.66666667, "logps/chosen": -382.0679036458333, "logps/rejected": -598.9692925347222, "loss": 0.0373, "rewards/chosen": 8.954344685872396, "rewards/margins": 22.907244194878473, "rewards/rejected": -13.952899509006077, "step": 2649 }, { "epoch": 0.6630801951707744, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77913609.14285715, "logits/rejected": -39956329.4117647, "logps/chosen": -359.77748325892856, "logps/rejected": -775.2999195772059, "loss": 0.0164, "rewards/chosen": 7.372807094029018, "rewards/margins": 26.760089521648503, "rewards/rejected": -19.387282427619486, "step": 2650 }, { "epoch": 0.6633304141123483, "grad_norm": 0.8515625, "kl": 2.476182460784912, "learning_rate": 5e-06, "logits/chosen": -50669813.333333336, "logits/rejected": -49240709.333333336, "logps/chosen": -430.9484049479167, "logps/rejected": -511.4052327473958, "loss": 0.0013, "rewards/chosen": 10.269240697224935, "rewards/margins": 25.08985201517741, "rewards/rejected": -14.820611317952475, "step": 2651 }, { "epoch": 0.6635806330539222, "grad_norm": 6.15625, "kl": 5.9361891746521, "learning_rate": 5e-06, "logits/chosen": -19159848.727272727, "logits/rejected": 128537875.6923077, "logps/chosen": -547.6073774857955, "logps/rejected": -489.01254507211536, "loss": 0.0666, "rewards/chosen": 9.78666132146662, "rewards/margins": 23.22036775175508, "rewards/rejected": -13.433706430288462, "step": 2652 }, { "epoch": 0.6638308519954961, "grad_norm": 8.8125, "kl": 0.20814132690429688, "learning_rate": 5e-06, "logits/chosen": -62108322.13333333, "logits/rejected": -56376504.88888889, "logps/chosen": -351.02565104166666, "logps/rejected": -807.0352647569445, "loss": 0.0903, "rewards/chosen": 6.760453287760416, "rewards/margins": 26.850464884440104, "rewards/rejected": -20.090011596679688, "step": 2653 }, { "epoch": 0.6640810709370699, "grad_norm": 1.0, "kl": 4.038382530212402, "learning_rate": 5e-06, "logits/chosen": -28677050.181818184, "logits/rejected": -70025127.38461539, "logps/chosen": -431.50483842329544, "logps/rejected": -928.5794771634615, "loss": 0.0165, "rewards/chosen": 7.8884499289772725, "rewards/margins": 29.425836309686407, "rewards/rejected": -21.537386380709133, "step": 2654 }, { "epoch": 0.6643312898786438, "grad_norm": 15.0625, "kl": 7.838709831237793, "learning_rate": 5e-06, "logits/chosen": -51181216.0, "logits/rejected": -20514586.666666668, "logps/chosen": -425.201171875, "logps/rejected": -587.861328125, "loss": 0.0611, "rewards/chosen": 10.227081298828125, "rewards/margins": 24.616216023763023, "rewards/rejected": -14.389134724934896, "step": 2655 }, { "epoch": 0.6645815088202177, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44999498.666666664, "logits/rejected": -32765482.666666668, "logps/chosen": -475.5964626736111, "logps/rejected": -602.8986979166667, "loss": 0.0044, "rewards/chosen": 10.809618631998697, "rewards/margins": 25.595509338378903, "rewards/rejected": -14.785890706380208, "step": 2656 }, { "epoch": 0.6648317277617916, "grad_norm": 15.625, "kl": 6.830526351928711, "learning_rate": 5e-06, "logits/chosen": -60633049.6, "logits/rejected": -46874701.71428572, "logps/chosen": -407.192236328125, "logps/rejected": -644.5550362723214, "loss": 0.0633, "rewards/chosen": 9.8758544921875, "rewards/margins": 23.820674351283483, "rewards/rejected": -13.944819859095983, "step": 2657 }, { "epoch": 0.6650819467033654, "grad_norm": 9.1875, "kl": 4.8259053230285645, "learning_rate": 5e-06, "logits/chosen": -45385262.93333333, "logits/rejected": -40165621.333333336, "logps/chosen": -409.7986328125, "logps/rejected": -661.9758029513889, "loss": 0.0248, "rewards/chosen": 9.176368204752604, "rewards/margins": 22.446957397460938, "rewards/rejected": -13.270589192708334, "step": 2658 }, { "epoch": 0.6653321656449394, "grad_norm": 2.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30742952.727272727, "logits/rejected": -51226121.84615385, "logps/chosen": -293.54150390625, "logps/rejected": -695.4519230769231, "loss": 0.0261, "rewards/chosen": 8.150332364169033, "rewards/margins": 27.299421297086703, "rewards/rejected": -19.149088932917667, "step": 2659 }, { "epoch": 0.6655823845865132, "grad_norm": 4.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60711330.90909091, "logits/rejected": -63298180.92307692, "logps/chosen": -467.28462357954544, "logps/rejected": -653.1494891826923, "loss": 0.0241, "rewards/chosen": 8.606345436789773, "rewards/margins": 25.339166387811407, "rewards/rejected": -16.732820951021633, "step": 2660 }, { "epoch": 0.665832603528087, "grad_norm": 2.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41454778.18181818, "logits/rejected": -42790395.07692308, "logps/chosen": -254.54814009232953, "logps/rejected": -736.71875, "loss": 0.0461, "rewards/chosen": 4.777910405939275, "rewards/margins": 21.677552176522205, "rewards/rejected": -16.89964177058293, "step": 2661 }, { "epoch": 0.666082822469661, "grad_norm": 0.78515625, "kl": 8.616865158081055, "learning_rate": 5e-06, "logits/chosen": -59803061.333333336, "logits/rejected": -49838549.333333336, "logps/chosen": -396.5406901041667, "logps/rejected": -802.1786295572916, "loss": 0.0015, "rewards/chosen": 10.592288970947266, "rewards/margins": 28.317846934000652, "rewards/rejected": -17.725557963053387, "step": 2662 }, { "epoch": 0.6663330414112348, "grad_norm": 6.75, "kl": 7.64814567565918, "learning_rate": 5e-06, "logits/chosen": -19633602.46153846, "logits/rejected": -25652712.727272727, "logps/chosen": -439.44193209134613, "logps/rejected": -697.4636008522727, "loss": 0.0293, "rewards/chosen": 8.857303325946514, "rewards/margins": 22.599200908954327, "rewards/rejected": -13.741897583007812, "step": 2663 }, { "epoch": 0.6665832603528087, "grad_norm": 1.515625, "kl": 15.040311813354492, "learning_rate": 5e-06, "logits/chosen": -62735976.0, "logits/rejected": -42569292.0, "logps/chosen": -442.345458984375, "logps/rejected": -526.3292846679688, "loss": 0.0387, "rewards/chosen": 10.63322639465332, "rewards/margins": 24.691940307617188, "rewards/rejected": -14.058713912963867, "step": 2664 }, { "epoch": 0.6668334792943826, "grad_norm": 15.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49731203.2, "logits/rejected": -43272740.571428575, "logps/chosen": -365.801904296875, "logps/rejected": -525.6661202566964, "loss": 0.0205, "rewards/chosen": 9.561897277832031, "rewards/margins": 21.923970903669087, "rewards/rejected": -12.362073625837054, "step": 2665 }, { "epoch": 0.6670836982359565, "grad_norm": 13.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60050741.333333336, "logits/rejected": -69448346.66666667, "logps/chosen": -385.7237955729167, "logps/rejected": -648.8827311197916, "loss": 0.0205, "rewards/chosen": 8.75260861714681, "rewards/margins": 24.330956141153973, "rewards/rejected": -15.578347524007162, "step": 2666 }, { "epoch": 0.6673339171775303, "grad_norm": 13.125, "kl": 23.260469436645508, "learning_rate": 5e-06, "logits/chosen": -49006668.8, "logits/rejected": -84739384.8888889, "logps/chosen": -329.62659505208336, "logps/rejected": -616.0418294270834, "loss": 0.1564, "rewards/chosen": 7.906956481933594, "rewards/margins": 19.115918816460503, "rewards/rejected": -11.208962334526909, "step": 2667 }, { "epoch": 0.6675841361191042, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74539680.0, "logits/rejected": -66564555.294117644, "logps/chosen": -542.4593680245536, "logps/rejected": -537.1803768382352, "loss": 0.0355, "rewards/chosen": 12.349825177873884, "rewards/margins": 27.5173523205669, "rewards/rejected": -15.167527142693014, "step": 2668 }, { "epoch": 0.6678343550606781, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32113155.555555556, "logits/rejected": -53042282.666666664, "logps/chosen": -316.02392578125, "logps/rejected": -573.2503255208334, "loss": 0.0305, "rewards/chosen": 9.00972154405382, "rewards/margins": 21.600206163194443, "rewards/rejected": -12.590484619140625, "step": 2669 }, { "epoch": 0.668084574002252, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36011229.538461536, "logits/rejected": -40687918.54545455, "logps/chosen": -357.0746319110577, "logps/rejected": -556.7743252840909, "loss": 0.0204, "rewards/chosen": 9.447377718411959, "rewards/margins": 20.97209140804264, "rewards/rejected": -11.524713689630682, "step": 2670 }, { "epoch": 0.6683347929438258, "grad_norm": 16.5, "kl": 5.842411041259766, "learning_rate": 5e-06, "logits/chosen": -50390926.222222224, "logits/rejected": -50245691.733333334, "logps/chosen": -372.9076877170139, "logps/rejected": -652.6373697916666, "loss": 0.0202, "rewards/chosen": 8.231065538194445, "rewards/margins": 23.907410346137155, "rewards/rejected": -15.676344807942709, "step": 2671 }, { "epoch": 0.6685850118853998, "grad_norm": 3.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46119812.571428575, "logits/rejected": -52400243.2, "logps/chosen": -438.19559151785717, "logps/rejected": -553.90595703125, "loss": 0.021, "rewards/chosen": 10.678777422223773, "rewards/margins": 24.0945794241769, "rewards/rejected": -13.415802001953125, "step": 2672 }, { "epoch": 0.6688352308269736, "grad_norm": 5.0, "kl": 9.487467765808105, "learning_rate": 5e-06, "logits/chosen": -37758852.571428575, "logits/rejected": -32018579.2, "logps/chosen": -344.4291294642857, "logps/rejected": -624.848828125, "loss": 0.0246, "rewards/chosen": 8.148284912109375, "rewards/margins": 22.366650390625, "rewards/rejected": -14.218365478515626, "step": 2673 }, { "epoch": 0.6690854497685474, "grad_norm": 7.71875, "kl": 6.337097644805908, "learning_rate": 5e-06, "logits/chosen": -70173829.33333333, "logits/rejected": -38445720.0, "logps/chosen": -487.927978515625, "logps/rejected": -467.0347900390625, "loss": 0.0239, "rewards/chosen": 12.398569742838541, "rewards/margins": 26.636576334635414, "rewards/rejected": -14.238006591796875, "step": 2674 }, { "epoch": 0.6693356687101214, "grad_norm": 12.3125, "kl": 15.159149169921875, "learning_rate": 5e-06, "logits/chosen": -46320230.4, "logits/rejected": -44069749.333333336, "logps/chosen": -480.1328125, "logps/rejected": -767.9574652777778, "loss": 0.0643, "rewards/chosen": 10.066046142578125, "rewards/margins": 25.87298346625434, "rewards/rejected": -15.806937323676216, "step": 2675 }, { "epoch": 0.6695858876516952, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17475469.333333332, "logits/rejected": -31419997.333333332, "logps/chosen": -403.9899088541667, "logps/rejected": -749.6282552083334, "loss": 0.0067, "rewards/chosen": 8.17645009358724, "rewards/margins": 21.987686157226562, "rewards/rejected": -13.811236063639322, "step": 2676 }, { "epoch": 0.6698361065932691, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27400892.444444444, "logits/rejected": -20485478.4, "logps/chosen": -394.02362738715277, "logps/rejected": -595.7180989583334, "loss": 0.0108, "rewards/chosen": 8.875926547580296, "rewards/margins": 21.959499952528212, "rewards/rejected": -13.083573404947916, "step": 2677 }, { "epoch": 0.670086325534843, "grad_norm": 7.65625, "kl": 7.0587158203125, "learning_rate": 5e-06, "logits/chosen": -39545305.6, "logits/rejected": -50497549.71428572, "logps/chosen": -387.5333984375, "logps/rejected": -535.0796595982143, "loss": 0.048, "rewards/chosen": 7.473659515380859, "rewards/margins": 19.049288613455637, "rewards/rejected": -11.575629098074776, "step": 2678 }, { "epoch": 0.6703365444764169, "grad_norm": 5.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30359620.0, "logits/rejected": -41586768.0, "logps/chosen": -425.899169921875, "logps/rejected": -637.9644775390625, "loss": 0.0064, "rewards/chosen": 8.680935859680176, "rewards/margins": 25.36396884918213, "rewards/rejected": -16.683032989501953, "step": 2679 }, { "epoch": 0.6705867634179907, "grad_norm": 16.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57830132.36363637, "logits/rejected": -40725115.07692308, "logps/chosen": -362.08180930397725, "logps/rejected": -680.9657451923077, "loss": 0.0201, "rewards/chosen": 9.598480918190695, "rewards/margins": 26.013969528091536, "rewards/rejected": -16.41548860990084, "step": 2680 }, { "epoch": 0.6708369823595646, "grad_norm": 13.1875, "kl": 15.36419677734375, "learning_rate": 5e-06, "logits/chosen": -46010268.0, "logits/rejected": -53946824.0, "logps/chosen": -314.49957275390625, "logps/rejected": -643.9691772460938, "loss": 0.0992, "rewards/chosen": 7.324263572692871, "rewards/margins": 28.42088031768799, "rewards/rejected": -21.096616744995117, "step": 2681 }, { "epoch": 0.6710872013011385, "grad_norm": 0.8125, "kl": 1.7444674968719482, "learning_rate": 5e-06, "logits/chosen": -39192219.428571425, "logits/rejected": -33029305.6, "logps/chosen": -286.675048828125, "logps/rejected": -538.48330078125, "loss": 0.0502, "rewards/chosen": 7.213796888078962, "rewards/margins": 21.582511792864118, "rewards/rejected": -14.368714904785156, "step": 2682 }, { "epoch": 0.6713374202427124, "grad_norm": 0.73046875, "kl": 7.757453918457031, "learning_rate": 5e-06, "logits/chosen": -61587110.4, "logits/rejected": -28352715.42857143, "logps/chosen": -519.221923828125, "logps/rejected": -551.8696637834821, "loss": 0.04, "rewards/chosen": 12.053076934814452, "rewards/margins": 24.486699022565567, "rewards/rejected": -12.433622087751116, "step": 2683 }, { "epoch": 0.6715876391842862, "grad_norm": 7.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53596712.72727273, "logits/rejected": -60716844.307692304, "logps/chosen": -378.34969815340907, "logps/rejected": -598.3013070913462, "loss": 0.0238, "rewards/chosen": 10.16517500443892, "rewards/margins": 23.35914942601344, "rewards/rejected": -13.19397442157452, "step": 2684 }, { "epoch": 0.6718378581258602, "grad_norm": 2.75, "kl": 4.350074768066406, "learning_rate": 5e-06, "logits/chosen": -49889644.307692304, "logits/rejected": -62265774.54545455, "logps/chosen": -350.44437349759613, "logps/rejected": -568.1552290482955, "loss": 0.0496, "rewards/chosen": 7.237172640286959, "rewards/margins": 24.26373957920741, "rewards/rejected": -17.026566938920453, "step": 2685 }, { "epoch": 0.672088077067434, "grad_norm": 15.125, "kl": 20.56463623046875, "learning_rate": 5e-06, "logits/chosen": -61212128.0, "logits/rejected": -34170332.0, "logps/chosen": -344.35498046875, "logps/rejected": -810.1317749023438, "loss": 0.1279, "rewards/chosen": 7.638521575927735, "rewards/margins": 33.524467849731444, "rewards/rejected": -25.88594627380371, "step": 2686 }, { "epoch": 0.6723382960090079, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43737909.333333336, "logits/rejected": -55531195.733333334, "logps/chosen": -399.9912380642361, "logps/rejected": -653.2701822916666, "loss": 0.0617, "rewards/chosen": 6.207358890109592, "rewards/margins": 23.761277347140844, "rewards/rejected": -17.55391845703125, "step": 2687 }, { "epoch": 0.6725885149505818, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70385504.0, "logits/rejected": -54044954.666666664, "logps/chosen": -392.708984375, "logps/rejected": -590.4505208333334, "loss": 0.0182, "rewards/chosen": 8.729240417480469, "rewards/margins": 25.801105499267578, "rewards/rejected": -17.07186508178711, "step": 2688 }, { "epoch": 0.6728387338921556, "grad_norm": 1.7421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23533370.181818184, "logits/rejected": -48382611.692307696, "logps/chosen": -245.6910067471591, "logps/rejected": -484.8810847355769, "loss": 0.0267, "rewards/chosen": 6.251744357022372, "rewards/margins": 19.462623169372133, "rewards/rejected": -13.21087881234976, "step": 2689 }, { "epoch": 0.6730889528337295, "grad_norm": 2.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59945721.6, "logits/rejected": -25135565.714285713, "logps/chosen": -553.769580078125, "logps/rejected": -587.7325613839286, "loss": 0.0177, "rewards/chosen": 13.16700439453125, "rewards/margins": 29.27313145228795, "rewards/rejected": -16.106127057756698, "step": 2690 }, { "epoch": 0.6733391717753033, "grad_norm": 7.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26749800.0, "logits/rejected": -60415077.333333336, "logps/chosen": -319.173583984375, "logps/rejected": -525.3067626953125, "loss": 0.045, "rewards/chosen": 6.673659642537435, "rewards/margins": 23.84557278951009, "rewards/rejected": -17.171913146972656, "step": 2691 }, { "epoch": 0.6735893907168773, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37493735.384615384, "logits/rejected": -53041326.54545455, "logps/chosen": -393.5027418870192, "logps/rejected": -657.62255859375, "loss": 0.0135, "rewards/chosen": 8.530552790715145, "rewards/margins": 29.029251632156907, "rewards/rejected": -20.49869884144176, "step": 2692 }, { "epoch": 0.6738396096584511, "grad_norm": 14.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55177660.44444445, "logits/rejected": -66466641.06666667, "logps/chosen": -421.4803873697917, "logps/rejected": -626.990234375, "loss": 0.0324, "rewards/chosen": 8.147815280490452, "rewards/margins": 23.37712944878472, "rewards/rejected": -15.229314168294271, "step": 2693 }, { "epoch": 0.674089828600025, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34918650.666666664, "logits/rejected": -54391125.333333336, "logps/chosen": -304.37522379557294, "logps/rejected": -627.1848551432291, "loss": 0.0376, "rewards/chosen": 7.261203765869141, "rewards/margins": 22.953077952067055, "rewards/rejected": -15.691874186197916, "step": 2694 }, { "epoch": 0.6743400475415989, "grad_norm": 8.8125, "kl": 7.613304138183594, "learning_rate": 5e-06, "logits/chosen": -32470618.181818184, "logits/rejected": -54132199.384615384, "logps/chosen": -296.56977982954544, "logps/rejected": -462.1472355769231, "loss": 0.0441, "rewards/chosen": 6.696493668989702, "rewards/margins": 18.86148108635749, "rewards/rejected": -12.164987417367788, "step": 2695 }, { "epoch": 0.6745902664831728, "grad_norm": 0.94140625, "kl": 0.8937060236930847, "learning_rate": 5e-06, "logits/chosen": -62771224.0, "logits/rejected": -45940344.0, "logps/chosen": -489.0162658691406, "logps/rejected": -734.91943359375, "loss": 0.0121, "rewards/chosen": 8.029244422912598, "rewards/margins": 26.41169834136963, "rewards/rejected": -18.38245391845703, "step": 2696 }, { "epoch": 0.6748404854247466, "grad_norm": 8.8125, "kl": 5.343951225280762, "learning_rate": 5e-06, "logits/chosen": -78941514.66666667, "logits/rejected": -38618293.333333336, "logps/chosen": -383.6469319661458, "logps/rejected": -751.5849609375, "loss": 0.1127, "rewards/chosen": 7.094513575236003, "rewards/margins": 26.123745600382488, "rewards/rejected": -19.029232025146484, "step": 2697 }, { "epoch": 0.6750907043663206, "grad_norm": 5.4375, "kl": 9.451787948608398, "learning_rate": 5e-06, "logits/chosen": -48769685.333333336, "logits/rejected": -48950410.666666664, "logps/chosen": -365.4896240234375, "logps/rejected": -604.207275390625, "loss": 0.0841, "rewards/chosen": 7.637644449869792, "rewards/margins": 22.695269266764324, "rewards/rejected": -15.057624816894531, "step": 2698 }, { "epoch": 0.6753409233078944, "grad_norm": 6.875, "kl": 2.1126933097839355, "learning_rate": 5e-06, "logits/chosen": -24267421.333333332, "logits/rejected": -53028144.0, "logps/chosen": -394.5754801432292, "logps/rejected": -569.8325602213541, "loss": 0.0161, "rewards/chosen": 8.710105895996094, "rewards/margins": 25.147602081298828, "rewards/rejected": -16.437496185302734, "step": 2699 }, { "epoch": 0.6755911422494683, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52480612.571428575, "logits/rejected": -57639262.11764706, "logps/chosen": -374.4794921875, "logps/rejected": -658.4264705882352, "loss": 0.0313, "rewards/chosen": 7.2998777117047995, "rewards/margins": 23.097203294770058, "rewards/rejected": -15.797325583065257, "step": 2700 }, { "epoch": 0.6758413611910422, "grad_norm": 6.71875, "kl": 10.380943298339844, "learning_rate": 5e-06, "logits/chosen": -24314040.0, "logits/rejected": -51167120.0, "logps/chosen": -360.02691650390625, "logps/rejected": -629.2841796875, "loss": 0.0884, "rewards/chosen": 6.238309383392334, "rewards/margins": 23.19775152206421, "rewards/rejected": -16.959442138671875, "step": 2701 }, { "epoch": 0.676091580132616, "grad_norm": 11.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40188800.0, "logits/rejected": -40446240.0, "logps/chosen": -330.7169189453125, "logps/rejected": -554.795654296875, "loss": 0.0451, "rewards/chosen": 7.702105712890625, "rewards/margins": 21.11817910330636, "rewards/rejected": -13.416073390415736, "step": 2702 }, { "epoch": 0.6763417990741899, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37575772.44444445, "logits/rejected": -13623056.0, "logps/chosen": -431.8217502170139, "logps/rejected": -703.9953776041667, "loss": 0.0131, "rewards/chosen": 8.958772447374132, "rewards/margins": 23.674991522894963, "rewards/rejected": -14.716219075520833, "step": 2703 }, { "epoch": 0.6765920180157637, "grad_norm": 11.0, "kl": 5.6611785888671875, "learning_rate": 5e-06, "logits/chosen": -38218434.28571428, "logits/rejected": -56203916.8, "logps/chosen": -345.87050083705356, "logps/rejected": -693.977734375, "loss": 0.0319, "rewards/chosen": 7.908814566476004, "rewards/margins": 24.418912833077567, "rewards/rejected": -16.510098266601563, "step": 2704 }, { "epoch": 0.6768422369573377, "grad_norm": 5.34375, "kl": 8.256711959838867, "learning_rate": 5e-06, "logits/chosen": -43704461.71428572, "logits/rejected": -38097846.4, "logps/chosen": -411.94796316964283, "logps/rejected": -432.823828125, "loss": 0.0488, "rewards/chosen": 7.941775730678013, "rewards/margins": 20.05404990059989, "rewards/rejected": -12.112274169921875, "step": 2705 }, { "epoch": 0.6770924558989115, "grad_norm": 0.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50813346.90909091, "logits/rejected": -72436007.38461539, "logps/chosen": -415.6368963068182, "logps/rejected": -768.1403245192307, "loss": 0.0129, "rewards/chosen": 9.202072143554688, "rewards/margins": 28.58419682429387, "rewards/rejected": -19.38212468073918, "step": 2706 }, { "epoch": 0.6773426748404854, "grad_norm": 8.4375, "kl": 5.019961833953857, "learning_rate": 5e-06, "logits/chosen": -22201600.0, "logits/rejected": -44908142.222222224, "logps/chosen": -321.37428385416666, "logps/rejected": -540.7162543402778, "loss": 0.0379, "rewards/chosen": 8.673769124348958, "rewards/margins": 25.391561211480035, "rewards/rejected": -16.717792087131077, "step": 2707 }, { "epoch": 0.6775928937820593, "grad_norm": 1.6015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46433158.4, "logits/rejected": -47900009.14285714, "logps/chosen": -394.57080078125, "logps/rejected": -799.8161272321429, "loss": 0.0036, "rewards/chosen": 8.446973419189453, "rewards/margins": 29.12840042114258, "rewards/rejected": -20.681427001953125, "step": 2708 }, { "epoch": 0.6778431127236332, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47518240.0, "logits/rejected": -36227538.666666664, "logps/chosen": -294.8662923177083, "logps/rejected": -536.8291422526041, "loss": 0.0224, "rewards/chosen": 7.184755961100261, "rewards/margins": 23.859935760498047, "rewards/rejected": -16.675179799397785, "step": 2709 }, { "epoch": 0.678093331665207, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32002519.272727273, "logits/rejected": -30734468.923076924, "logps/chosen": -271.3680974786932, "logps/rejected": -520.1676307091346, "loss": 0.0437, "rewards/chosen": 5.882674477317116, "rewards/margins": 15.740388616815313, "rewards/rejected": -9.857714139498198, "step": 2710 }, { "epoch": 0.678343550606781, "grad_norm": 7.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56590912.0, "logits/rejected": 52505133.71428572, "logps/chosen": -414.9375, "logps/rejected": -540.3111049107143, "loss": 0.0132, "rewards/chosen": 9.511048126220704, "rewards/margins": 23.792784227643693, "rewards/rejected": -14.281736101422991, "step": 2711 }, { "epoch": 0.6785937695483548, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50736374.15384615, "logits/rejected": -44572462.54545455, "logps/chosen": -350.21326622596155, "logps/rejected": -713.8496981534091, "loss": 0.0535, "rewards/chosen": 8.04671360896184, "rewards/margins": 25.778265732985275, "rewards/rejected": -17.731552124023438, "step": 2712 }, { "epoch": 0.6788439884899287, "grad_norm": 10.75, "kl": 7.07819128036499, "learning_rate": 5e-06, "logits/chosen": -37680578.666666664, "logits/rejected": -34412413.333333336, "logps/chosen": -412.0034993489583, "logps/rejected": -493.3621012369792, "loss": 0.0659, "rewards/chosen": 8.202827453613281, "rewards/margins": 20.922884623209633, "rewards/rejected": -12.720057169596354, "step": 2713 }, { "epoch": 0.6790942074315026, "grad_norm": 6.5625, "kl": 5.205258846282959, "learning_rate": 5e-06, "logits/chosen": -18819800.0, "logits/rejected": -54122649.6, "logps/chosen": -335.9092494419643, "logps/rejected": -764.4919921875, "loss": 0.0284, "rewards/chosen": 8.238954271589007, "rewards/margins": 27.04671031406948, "rewards/rejected": -18.80775604248047, "step": 2714 }, { "epoch": 0.6793444263730765, "grad_norm": 7.25, "kl": 0.8505653142929077, "learning_rate": 5e-06, "logits/chosen": -55554368.0, "logits/rejected": 20625708.307692308, "logps/chosen": -369.5399280894886, "logps/rejected": -424.6378831129808, "loss": 0.027, "rewards/chosen": 7.830690557306463, "rewards/margins": 20.60636053552161, "rewards/rejected": -12.775669978215145, "step": 2715 }, { "epoch": 0.6795946453146503, "grad_norm": 8.0, "kl": 10.472179412841797, "learning_rate": 5e-06, "logits/chosen": -44820272.0, "logits/rejected": -48400645.333333336, "logps/chosen": -350.8748372395833, "logps/rejected": -573.3435872395834, "loss": 0.0648, "rewards/chosen": 8.194288889567057, "rewards/margins": 21.978532155354817, "rewards/rejected": -13.78424326578776, "step": 2716 }, { "epoch": 0.6798448642562241, "grad_norm": 5.875, "kl": 6.006850242614746, "learning_rate": 5e-06, "logits/chosen": -49800218.666666664, "logits/rejected": 151990048.0, "logps/chosen": -381.375732421875, "logps/rejected": -508.2392171223958, "loss": 0.0178, "rewards/chosen": 8.715984980265299, "rewards/margins": 21.662726720174152, "rewards/rejected": -12.946741739908854, "step": 2717 }, { "epoch": 0.6800950831977981, "grad_norm": 13.375, "kl": 19.644508361816406, "learning_rate": 5e-06, "logits/chosen": -61825723.733333334, "logits/rejected": -81608156.44444445, "logps/chosen": -349.43792317708335, "logps/rejected": -827.3313802083334, "loss": 0.0903, "rewards/chosen": 7.528719075520834, "rewards/margins": 28.689302571614583, "rewards/rejected": -21.16058349609375, "step": 2718 }, { "epoch": 0.6803453021393719, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -10236976.8, "logits/rejected": 50484251.428571425, "logps/chosen": -436.89658203125, "logps/rejected": -622.4107840401786, "loss": 0.0081, "rewards/chosen": 12.109484100341797, "rewards/margins": 21.3252564566476, "rewards/rejected": -9.215772356305804, "step": 2719 }, { "epoch": 0.6805955210809458, "grad_norm": 10.5625, "kl": 12.579024314880371, "learning_rate": 5e-06, "logits/chosen": -100175302.4, "logits/rejected": -44429449.14285714, "logps/chosen": -420.9041015625, "logps/rejected": -625.2189592633929, "loss": 0.0305, "rewards/chosen": 9.574626159667968, "rewards/margins": 22.764858572823663, "rewards/rejected": -13.190232413155693, "step": 2720 }, { "epoch": 0.6808457400225197, "grad_norm": 6.0, "kl": 4.617887020111084, "learning_rate": 5e-06, "logits/chosen": -43769736.53333333, "logits/rejected": 9159583.111111112, "logps/chosen": -281.06640625, "logps/rejected": -411.7307942708333, "loss": 0.0231, "rewards/chosen": 7.520711263020833, "rewards/margins": 17.84979502360026, "rewards/rejected": -10.329083760579428, "step": 2721 }, { "epoch": 0.6810959589640936, "grad_norm": 2.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40860058.18181818, "logits/rejected": -37956662.15384615, "logps/chosen": -395.5813654119318, "logps/rejected": -549.4824969951923, "loss": 0.0403, "rewards/chosen": 8.768547058105469, "rewards/margins": 19.37740443303035, "rewards/rejected": -10.60885737492488, "step": 2722 }, { "epoch": 0.6813461779056674, "grad_norm": 14.25, "kl": 1.7867101430892944, "learning_rate": 5e-06, "logits/chosen": -33256780.8, "logits/rejected": -51795227.428571425, "logps/chosen": -356.86904296875, "logps/rejected": -715.3819056919643, "loss": 0.0278, "rewards/chosen": 7.858525085449219, "rewards/margins": 24.646076965332032, "rewards/rejected": -16.787551879882812, "step": 2723 }, { "epoch": 0.6815963968472414, "grad_norm": 13.4375, "kl": 14.948885917663574, "learning_rate": 5e-06, "logits/chosen": -46877282.461538464, "logits/rejected": -44689230.54545455, "logps/chosen": -405.7613055889423, "logps/rejected": -635.93115234375, "loss": 0.033, "rewards/chosen": 9.542803250826323, "rewards/margins": 21.895449124849762, "rewards/rejected": -12.352645874023438, "step": 2724 }, { "epoch": 0.6818466157888152, "grad_norm": 2.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48140885.333333336, "logits/rejected": -27085709.333333332, "logps/chosen": -359.6389973958333, "logps/rejected": -782.0182291666666, "loss": 0.0044, "rewards/chosen": 8.134845097859701, "rewards/margins": 21.349106470743816, "rewards/rejected": -13.214261372884115, "step": 2725 }, { "epoch": 0.6820968347303891, "grad_norm": 3.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20440115.692307692, "logits/rejected": -30883706.181818184, "logps/chosen": -304.34560922475964, "logps/rejected": -613.7752130681819, "loss": 0.0406, "rewards/chosen": 7.754799476036658, "rewards/margins": 21.875210742016773, "rewards/rejected": -14.120411265980113, "step": 2726 }, { "epoch": 0.682347053671963, "grad_norm": 5.625, "kl": 8.461029052734375, "learning_rate": 5e-06, "logits/chosen": -2989726.8571428573, "logits/rejected": -75557305.6, "logps/chosen": -403.99658203125, "logps/rejected": -622.13828125, "loss": 0.0698, "rewards/chosen": 9.867722647530693, "rewards/margins": 22.36346849714007, "rewards/rejected": -12.495745849609374, "step": 2727 }, { "epoch": 0.6825972726135369, "grad_norm": 8.3125, "kl": 4.878762722015381, "learning_rate": 5e-06, "logits/chosen": -47422841.6, "logits/rejected": -36688630.85714286, "logps/chosen": -421.531103515625, "logps/rejected": -552.42138671875, "loss": 0.0097, "rewards/chosen": 10.10202407836914, "rewards/margins": 22.43823405674526, "rewards/rejected": -12.336209978376116, "step": 2728 }, { "epoch": 0.6828474915551107, "grad_norm": 14.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31755552.0, "logits/rejected": -30465378.666666668, "logps/chosen": -410.1354573567708, "logps/rejected": -638.93603515625, "loss": 0.0654, "rewards/chosen": 8.828057607014975, "rewards/margins": 22.31747817993164, "rewards/rejected": -13.489420572916666, "step": 2729 }, { "epoch": 0.6830977104966846, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49744180.0, "logits/rejected": -33026046.0, "logps/chosen": -342.1230163574219, "logps/rejected": -556.806884765625, "loss": 0.031, "rewards/chosen": 8.25494384765625, "rewards/margins": 22.288183212280273, "rewards/rejected": -14.033239364624023, "step": 2730 }, { "epoch": 0.6833479294382585, "grad_norm": 7.625, "kl": 7.060115814208984, "learning_rate": 5e-06, "logits/chosen": -60570116.92307692, "logits/rejected": -40945297.45454545, "logps/chosen": -464.50826322115387, "logps/rejected": -661.791015625, "loss": 0.0528, "rewards/chosen": 8.06749021089994, "rewards/margins": 20.925928876116558, "rewards/rejected": -12.85843866521662, "step": 2731 }, { "epoch": 0.6835981483798323, "grad_norm": 2.84375, "kl": 14.406478881835938, "learning_rate": 5e-06, "logits/chosen": -75973021.0909091, "logits/rejected": -51239133.538461536, "logps/chosen": -477.0870472301136, "logps/rejected": -853.2333233173077, "loss": 0.0037, "rewards/chosen": 11.890788685191762, "rewards/margins": 29.41362052864128, "rewards/rejected": -17.52283184344952, "step": 2732 }, { "epoch": 0.6838483673214062, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36164214.15384615, "logits/rejected": -47110929.45454545, "logps/chosen": -330.17664513221155, "logps/rejected": -703.9414950284091, "loss": 0.0364, "rewards/chosen": 7.195610633263221, "rewards/margins": 22.52943964604731, "rewards/rejected": -15.333829012784092, "step": 2733 }, { "epoch": 0.6840985862629801, "grad_norm": 13.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47301571.2, "logits/rejected": -47998121.14285714, "logps/chosen": -359.32333984375, "logps/rejected": -562.8724888392857, "loss": 0.0311, "rewards/chosen": 8.07234878540039, "rewards/margins": 18.828605978829522, "rewards/rejected": -10.75625719342913, "step": 2734 }, { "epoch": 0.684348805204554, "grad_norm": 2.46875, "kl": 17.213876724243164, "learning_rate": 5e-06, "logits/chosen": -51389038.54545455, "logits/rejected": -25589176.615384616, "logps/chosen": -485.08860085227275, "logps/rejected": -564.5911959134615, "loss": 0.0057, "rewards/chosen": 13.2022441517223, "rewards/margins": 24.74373045167723, "rewards/rejected": -11.541486299954927, "step": 2735 }, { "epoch": 0.6845990241461278, "grad_norm": 5.25, "kl": 18.86608123779297, "learning_rate": 5e-06, "logits/chosen": -60093224.72727273, "logits/rejected": -33090503.384615384, "logps/chosen": -399.2390802556818, "logps/rejected": -602.8638070913462, "loss": 0.048, "rewards/chosen": 9.964379744096236, "rewards/margins": 25.582895932497678, "rewards/rejected": -15.618516188401442, "step": 2736 }, { "epoch": 0.6848492430877018, "grad_norm": 13.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40937245.09090909, "logits/rejected": -54273068.307692304, "logps/chosen": -410.59889914772725, "logps/rejected": -668.7396334134615, "loss": 0.0362, "rewards/chosen": 8.760176225142045, "rewards/margins": 23.91406111283736, "rewards/rejected": -15.153884887695312, "step": 2737 }, { "epoch": 0.6850994620292756, "grad_norm": 7.09375, "kl": 5.138020992279053, "learning_rate": 5e-06, "logits/chosen": -57330530.461538464, "logits/rejected": -44201844.36363637, "logps/chosen": -371.63393930288464, "logps/rejected": -476.93581321022725, "loss": 0.059, "rewards/chosen": 8.582878699669472, "rewards/margins": 22.593139541732683, "rewards/rejected": -14.01026084206321, "step": 2738 }, { "epoch": 0.6853496809708495, "grad_norm": 14.5, "kl": 4.2702860832214355, "learning_rate": 5e-06, "logits/chosen": -55323509.333333336, "logits/rejected": -42431802.666666664, "logps/chosen": -425.566162109375, "logps/rejected": -580.7478841145834, "loss": 0.0264, "rewards/chosen": 9.561581293741861, "rewards/margins": 21.445405960083008, "rewards/rejected": -11.883824666341146, "step": 2739 }, { "epoch": 0.6855998999124233, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18671196.307692308, "logits/rejected": -81193216.0, "logps/chosen": -249.00026292067307, "logps/rejected": -791.5316051136364, "loss": 0.0473, "rewards/chosen": 6.072866586538462, "rewards/margins": 24.81621098018193, "rewards/rejected": -18.743344393643465, "step": 2740 }, { "epoch": 0.6858501188539973, "grad_norm": 13.3125, "kl": 0.9031311869621277, "learning_rate": 5e-06, "logits/chosen": -48370715.428571425, "logits/rejected": -32564150.4, "logps/chosen": -427.9457310267857, "logps/rejected": -520.09013671875, "loss": 0.0439, "rewards/chosen": 9.08301762172154, "rewards/margins": 22.744878278459822, "rewards/rejected": -13.66186065673828, "step": 2741 }, { "epoch": 0.6861003377955711, "grad_norm": 2.90625, "kl": 7.487807273864746, "learning_rate": 5e-06, "logits/chosen": -29795060.363636363, "logits/rejected": -38814144.0, "logps/chosen": -363.45174893465907, "logps/rejected": -528.5535606971154, "loss": 0.0531, "rewards/chosen": 9.169233842329545, "rewards/margins": 21.515407642284472, "rewards/rejected": -12.346173799954927, "step": 2742 }, { "epoch": 0.686350556737145, "grad_norm": 8.8125, "kl": 17.490646362304688, "learning_rate": 5e-06, "logits/chosen": -43463378.28571428, "logits/rejected": -46273891.2, "logps/chosen": -373.85215541294644, "logps/rejected": -520.4099609375, "loss": 0.0546, "rewards/chosen": 9.47555650983538, "rewards/margins": 21.29911989484515, "rewards/rejected": -11.823563385009766, "step": 2743 }, { "epoch": 0.6866007756787189, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46575138.461538464, "logits/rejected": -95614830.54545455, "logps/chosen": -410.03267728365387, "logps/rejected": -570.6364524147727, "loss": 0.0393, "rewards/chosen": 8.750277005709135, "rewards/margins": 21.594545804537262, "rewards/rejected": -12.844268798828125, "step": 2744 }, { "epoch": 0.6868509946202928, "grad_norm": 2.9375, "kl": 3.0914268493652344, "learning_rate": 5e-06, "logits/chosen": -65610368.0, "logits/rejected": -44630304.0, "logps/chosen": -309.8809407552083, "logps/rejected": -646.1754557291666, "loss": 0.014, "rewards/chosen": 8.729279200236002, "rewards/margins": 25.35164451599121, "rewards/rejected": -16.622365315755207, "step": 2745 }, { "epoch": 0.6871012135618666, "grad_norm": 6.75, "kl": 9.335000991821289, "learning_rate": 5e-06, "logits/chosen": -51554248.0, "logits/rejected": -72196352.0, "logps/chosen": -386.39996337890625, "logps/rejected": -755.353271484375, "loss": 0.0896, "rewards/chosen": 8.532389640808105, "rewards/margins": 27.59816265106201, "rewards/rejected": -19.065773010253906, "step": 2746 }, { "epoch": 0.6873514325034406, "grad_norm": 7.8125, "kl": 9.820812225341797, "learning_rate": 5e-06, "logits/chosen": -55165445.81818182, "logits/rejected": -36148957.538461536, "logps/chosen": -395.64035866477275, "logps/rejected": -653.5318509615385, "loss": 0.0145, "rewards/chosen": 7.758229342373935, "rewards/margins": 25.054785081556627, "rewards/rejected": -17.296555739182693, "step": 2747 }, { "epoch": 0.6876016514450144, "grad_norm": 1.2265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41252877.333333336, "logits/rejected": -57992848.0, "logps/chosen": -497.4013671875, "logps/rejected": -600.0843912760416, "loss": 0.0034, "rewards/chosen": 9.65780258178711, "rewards/margins": 27.494061787923176, "rewards/rejected": -17.836259206136067, "step": 2748 }, { "epoch": 0.6878518703865882, "grad_norm": 0.55859375, "kl": 10.487970352172852, "learning_rate": 5e-06, "logits/chosen": -58947251.2, "logits/rejected": -46053888.0, "logps/chosen": -419.1844075520833, "logps/rejected": -577.2306857638889, "loss": 0.001, "rewards/chosen": 11.291046142578125, "rewards/margins": 27.883182101779514, "rewards/rejected": -16.59213595920139, "step": 2749 }, { "epoch": 0.6881020893281622, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28783881.14285714, "logits/rejected": -25884744.0, "logps/chosen": -337.6700962611607, "logps/rejected": -520.97412109375, "loss": 0.0434, "rewards/chosen": 5.522030966622489, "rewards/margins": 23.388629477364674, "rewards/rejected": -17.866598510742186, "step": 2750 }, { "epoch": 0.688352308269736, "grad_norm": 16.5, "kl": 2.799732208251953, "learning_rate": 5e-06, "logits/chosen": -54002960.0, "logits/rejected": -52608565.333333336, "logps/chosen": -388.6888020833333, "logps/rejected": -634.1927083333334, "loss": 0.0706, "rewards/chosen": 8.436592737833658, "rewards/margins": 18.89079984029134, "rewards/rejected": -10.454207102457682, "step": 2751 }, { "epoch": 0.6886025272113099, "grad_norm": 8.75, "kl": 10.986352920532227, "learning_rate": 5e-06, "logits/chosen": -46295213.71428572, "logits/rejected": -34509945.6, "logps/chosen": -336.02127511160717, "logps/rejected": -552.241748046875, "loss": 0.0562, "rewards/chosen": 8.029896327427455, "rewards/margins": 22.419825526646207, "rewards/rejected": -14.38992919921875, "step": 2752 }, { "epoch": 0.6888527461528837, "grad_norm": 5.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37182549.333333336, "logits/rejected": -52441898.666666664, "logps/chosen": -291.1960720486111, "logps/rejected": -711.1141927083333, "loss": 0.0306, "rewards/chosen": 7.662573072645399, "rewards/margins": 22.63199039035373, "rewards/rejected": -14.969417317708333, "step": 2753 }, { "epoch": 0.6891029650944577, "grad_norm": 2.125, "kl": 1.9093310832977295, "learning_rate": 5e-06, "logits/chosen": -71006865.45454545, "logits/rejected": -30945504.0, "logps/chosen": -373.26242897727275, "logps/rejected": -785.4478665865385, "loss": 0.0225, "rewards/chosen": 9.412776600230824, "rewards/margins": 29.724180314924332, "rewards/rejected": -20.31140371469351, "step": 2754 }, { "epoch": 0.6893531840360315, "grad_norm": 1.5078125, "kl": 0.45543450117111206, "learning_rate": 5e-06, "logits/chosen": -34418460.8, "logits/rejected": -37040880.0, "logps/chosen": -415.85166015625, "logps/rejected": -450.4140625, "loss": 0.0044, "rewards/chosen": 10.450902557373047, "rewards/margins": 20.677615465436663, "rewards/rejected": -10.226712908063616, "step": 2755 }, { "epoch": 0.6896034029776054, "grad_norm": 3.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32289712.0, "logits/rejected": -49966096.0, "logps/chosen": -334.87583414713544, "logps/rejected": -663.6419270833334, "loss": 0.0388, "rewards/chosen": 8.549997329711914, "rewards/margins": 23.470539728800453, "rewards/rejected": -14.920542399088541, "step": 2756 }, { "epoch": 0.6898536219191793, "grad_norm": 13.4375, "kl": 2.6487064361572266, "learning_rate": 5e-06, "logits/chosen": -39437602.13333333, "logits/rejected": -52504408.88888889, "logps/chosen": -296.668359375, "logps/rejected": -807.7300347222222, "loss": 0.0736, "rewards/chosen": 6.305873107910156, "rewards/margins": 26.28632320827908, "rewards/rejected": -19.980450100368923, "step": 2757 }, { "epoch": 0.6901038408607532, "grad_norm": 8.1875, "kl": 4.755486488342285, "learning_rate": 5e-06, "logits/chosen": -45840162.90909091, "logits/rejected": -78345708.3076923, "logps/chosen": -315.24995561079544, "logps/rejected": -722.7267127403846, "loss": 0.1037, "rewards/chosen": 7.522139115767046, "rewards/margins": 26.40235762162642, "rewards/rejected": -18.880218505859375, "step": 2758 }, { "epoch": 0.690354059802327, "grad_norm": 3.75, "kl": 3.564873695373535, "learning_rate": 5e-06, "logits/chosen": -35138190.76923077, "logits/rejected": -24876442.181818184, "logps/chosen": -311.42001577524036, "logps/rejected": -491.4080699573864, "loss": 0.0484, "rewards/chosen": 7.846774761493389, "rewards/margins": 21.933150604888276, "rewards/rejected": -14.086375843394887, "step": 2759 }, { "epoch": 0.690604278743901, "grad_norm": 3.546875, "kl": 10.656320571899414, "learning_rate": 5e-06, "logits/chosen": -30395949.714285713, "logits/rejected": -28982515.2, "logps/chosen": -337.8095005580357, "logps/rejected": -498.26650390625, "loss": 0.0615, "rewards/chosen": 8.413687569754464, "rewards/margins": 20.957042367117744, "rewards/rejected": -12.543354797363282, "step": 2760 }, { "epoch": 0.6908544976854748, "grad_norm": 5.0625, "kl": 5.239138603210449, "learning_rate": 5e-06, "logits/chosen": -54257216.0, "logits/rejected": -1798210.4, "logps/chosen": -394.43491908482144, "logps/rejected": -637.791259765625, "loss": 0.0366, "rewards/chosen": 9.841561453683036, "rewards/margins": 23.799882071358816, "rewards/rejected": -13.958320617675781, "step": 2761 }, { "epoch": 0.6911047166270486, "grad_norm": 25.5, "kl": 9.283768653869629, "learning_rate": 5e-06, "logits/chosen": -16548397.866666667, "logits/rejected": -46309724.44444445, "logps/chosen": -343.87194010416664, "logps/rejected": -521.0581597222222, "loss": 0.0976, "rewards/chosen": 6.753886922200521, "rewards/margins": 21.572693888346354, "rewards/rejected": -14.818806966145834, "step": 2762 }, { "epoch": 0.6913549355686226, "grad_norm": 9.5625, "kl": 1.26590096950531, "learning_rate": 5e-06, "logits/chosen": -45286950.4, "logits/rejected": 39504205.71428572, "logps/chosen": -368.120263671875, "logps/rejected": -511.7056361607143, "loss": 0.0513, "rewards/chosen": 8.549004364013673, "rewards/margins": 19.419579315185548, "rewards/rejected": -10.870574951171875, "step": 2763 }, { "epoch": 0.6916051545101964, "grad_norm": 3.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45966784.0, "logits/rejected": -32754284.0, "logps/chosen": -372.1645812988281, "logps/rejected": -514.0689697265625, "loss": 0.005, "rewards/chosen": 10.397870063781738, "rewards/margins": 24.037297248840332, "rewards/rejected": -13.639427185058594, "step": 2764 }, { "epoch": 0.6918553734517703, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32034685.09090909, "logits/rejected": -59331165.538461536, "logps/chosen": -334.6779119318182, "logps/rejected": -704.5286959134615, "loss": 0.0375, "rewards/chosen": 8.87028642134233, "rewards/margins": 26.241751077291852, "rewards/rejected": -17.37146465594952, "step": 2765 }, { "epoch": 0.6921055923933441, "grad_norm": 3.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32987436.8, "logits/rejected": -44203186.28571428, "logps/chosen": -313.8713134765625, "logps/rejected": -605.2614397321429, "loss": 0.0175, "rewards/chosen": 7.5282035827636715, "rewards/margins": 22.637847791399274, "rewards/rejected": -15.109644208635602, "step": 2766 }, { "epoch": 0.6923558113349181, "grad_norm": 5.875, "kl": 0.09024810791015625, "learning_rate": 5e-06, "logits/chosen": -70322333.0909091, "logits/rejected": -2981575.3846153845, "logps/chosen": -403.05708451704544, "logps/rejected": -724.8869441105769, "loss": 0.0094, "rewards/chosen": 8.918341203169389, "rewards/margins": 27.123020252147754, "rewards/rejected": -18.204679048978367, "step": 2767 }, { "epoch": 0.6926060302764919, "grad_norm": 3.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76205718.85714285, "logits/rejected": -45852544.0, "logps/chosen": -386.57223074776783, "logps/rejected": -641.0960477941177, "loss": 0.0222, "rewards/chosen": 7.447065080915179, "rewards/margins": 24.392646853663344, "rewards/rejected": -16.945581772748163, "step": 2768 }, { "epoch": 0.6928562492180658, "grad_norm": 1.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38298542.54545455, "logits/rejected": -68176374.15384616, "logps/chosen": -384.20143821022725, "logps/rejected": -841.4265324519231, "loss": 0.0018, "rewards/chosen": 8.810086337002842, "rewards/margins": 33.44242229328289, "rewards/rejected": -24.63233595628005, "step": 2769 }, { "epoch": 0.6931064681596397, "grad_norm": 6.125, "kl": 4.731883525848389, "learning_rate": 5e-06, "logits/chosen": -73563834.18181819, "logits/rejected": -50649875.692307696, "logps/chosen": -431.53715376420456, "logps/rejected": -483.5956280048077, "loss": 0.0359, "rewards/chosen": 9.157230723987926, "rewards/margins": 20.216028707010764, "rewards/rejected": -11.058797983022837, "step": 2770 }, { "epoch": 0.6933566871012136, "grad_norm": 2.828125, "kl": 1.4986610412597656, "learning_rate": 5e-06, "logits/chosen": -39752548.92307692, "logits/rejected": -58983778.90909091, "logps/chosen": -363.0218975360577, "logps/rejected": -634.2046786221591, "loss": 0.0629, "rewards/chosen": 7.548504169170673, "rewards/margins": 22.409459147419962, "rewards/rejected": -14.86095497824929, "step": 2771 }, { "epoch": 0.6936069060427874, "grad_norm": 12.375, "kl": 12.104829788208008, "learning_rate": 5e-06, "logits/chosen": -16362048.0, "logits/rejected": -51085898.666666664, "logps/chosen": -395.4920247395833, "logps/rejected": -705.8592122395834, "loss": 0.0344, "rewards/chosen": 8.538567225138346, "rewards/margins": 27.92913373311361, "rewards/rejected": -19.39056650797526, "step": 2772 }, { "epoch": 0.6938571249843614, "grad_norm": 0.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57311668.36363637, "logits/rejected": -34101346.461538464, "logps/chosen": -480.12167080965907, "logps/rejected": -674.0183293269231, "loss": 0.0016, "rewards/chosen": 11.783626209605824, "rewards/margins": 27.319266045843804, "rewards/rejected": -15.53563983623798, "step": 2773 }, { "epoch": 0.6941073439259352, "grad_norm": 20.375, "kl": 4.801861763000488, "learning_rate": 5e-06, "logits/chosen": -18392203.29411765, "logits/rejected": -26590281.14285714, "logps/chosen": -395.09349149816177, "logps/rejected": -518.1199776785714, "loss": 0.0869, "rewards/chosen": 7.881542430204504, "rewards/margins": 22.602851547112987, "rewards/rejected": -14.721309116908483, "step": 2774 }, { "epoch": 0.694357562867509, "grad_norm": 1.71875, "kl": 7.473282814025879, "learning_rate": 5e-06, "logits/chosen": -30776266.666666668, "logits/rejected": -11937741.333333334, "logps/chosen": -416.1825764973958, "logps/rejected": -637.2902425130209, "loss": 0.0343, "rewards/chosen": 8.383967081705729, "rewards/margins": 21.363797505696613, "rewards/rejected": -12.979830423990885, "step": 2775 }, { "epoch": 0.694607781809083, "grad_norm": 11.625, "kl": 7.142127990722656, "learning_rate": 5e-06, "logits/chosen": -35835084.8, "logits/rejected": -50254346.666666664, "logps/chosen": -370.1675130208333, "logps/rejected": -735.6273328993055, "loss": 0.0695, "rewards/chosen": 7.596914672851563, "rewards/margins": 26.12161187065972, "rewards/rejected": -18.52469719780816, "step": 2776 }, { "epoch": 0.6948580007506568, "grad_norm": 4.0, "kl": 3.1983802318573, "learning_rate": 5e-06, "logits/chosen": -23777403.076923076, "logits/rejected": -33873576.72727273, "logps/chosen": -355.99643179086536, "logps/rejected": -508.69753196022725, "loss": 0.0225, "rewards/chosen": 8.205616290752705, "rewards/margins": 24.125349511633388, "rewards/rejected": -15.919733220880682, "step": 2777 }, { "epoch": 0.6951082196922307, "grad_norm": 14.875, "kl": 12.318202018737793, "learning_rate": 5e-06, "logits/chosen": -55467181.71428572, "logits/rejected": -53353241.6, "logps/chosen": -401.0322963169643, "logps/rejected": -762.9701171875, "loss": 0.123, "rewards/chosen": 7.3664736066545755, "rewards/margins": 22.87324698311942, "rewards/rejected": -15.506773376464844, "step": 2778 }, { "epoch": 0.6953584386338045, "grad_norm": 2.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -104850915.55555555, "logits/rejected": -60264029.86666667, "logps/chosen": -479.99370659722223, "logps/rejected": -569.1280598958333, "loss": 0.0073, "rewards/chosen": 8.949535793728298, "rewards/margins": 25.47959967719184, "rewards/rejected": -16.53006388346354, "step": 2779 }, { "epoch": 0.6956086575753785, "grad_norm": 13.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27184192.0, "logits/rejected": -38193448.0, "logps/chosen": -237.35748291015625, "logps/rejected": -626.9056396484375, "loss": 0.0513, "rewards/chosen": 4.784920692443848, "rewards/margins": 22.4048490524292, "rewards/rejected": -17.61992835998535, "step": 2780 }, { "epoch": 0.6958588765169523, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35130940.44444445, "logits/rejected": -24819801.6, "logps/chosen": -376.9291720920139, "logps/rejected": -480.51100260416666, "loss": 0.0206, "rewards/chosen": 8.273162841796875, "rewards/margins": 19.74463907877604, "rewards/rejected": -11.471476236979166, "step": 2781 }, { "epoch": 0.6961090954585262, "grad_norm": 3.078125, "kl": 3.083484649658203, "learning_rate": 5e-06, "logits/chosen": -56633291.294117644, "logits/rejected": 67303872.0, "logps/chosen": -352.5002010569853, "logps/rejected": -534.7555454799107, "loss": 0.0573, "rewards/chosen": 7.823635325712316, "rewards/margins": 19.71793551404937, "rewards/rejected": -11.894300188337054, "step": 2782 }, { "epoch": 0.6963593144001001, "grad_norm": 4.40625, "kl": 17.497352600097656, "learning_rate": 5e-06, "logits/chosen": -61135563.294117644, "logits/rejected": -67860630.85714285, "logps/chosen": -427.1989315257353, "logps/rejected": -648.0020926339286, "loss": 0.0742, "rewards/chosen": 8.893754846909466, "rewards/margins": 25.07725486434808, "rewards/rejected": -16.183500017438615, "step": 2783 }, { "epoch": 0.696609533341674, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30141705.846153848, "logits/rejected": 1587419.6363636365, "logps/chosen": -351.3662860576923, "logps/rejected": -696.5416370738636, "loss": 0.0336, "rewards/chosen": 7.115438608022837, "rewards/margins": 24.61346744990849, "rewards/rejected": -17.498028841885652, "step": 2784 }, { "epoch": 0.6968597522832478, "grad_norm": 10.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31959434.666666668, "logits/rejected": -57100437.333333336, "logps/chosen": -362.876220703125, "logps/rejected": -546.749267578125, "loss": 0.0633, "rewards/chosen": 9.253252029418945, "rewards/margins": 21.924666722615562, "rewards/rejected": -12.671414693196615, "step": 2785 }, { "epoch": 0.6971099712248218, "grad_norm": 10.375, "kl": 3.1979804039001465, "learning_rate": 5e-06, "logits/chosen": -44869642.666666664, "logits/rejected": -17174326.666666668, "logps/chosen": -325.86204020182294, "logps/rejected": -797.40087890625, "loss": 0.0255, "rewards/chosen": 8.714005788167318, "rewards/margins": 26.19175593058268, "rewards/rejected": -17.477750142415363, "step": 2786 }, { "epoch": 0.6973601901663956, "grad_norm": 3.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28893401.6, "logits/rejected": -68220626.28571428, "logps/chosen": -277.379443359375, "logps/rejected": -609.3088727678571, "loss": 0.0576, "rewards/chosen": 7.005789184570313, "rewards/margins": 20.82567116873605, "rewards/rejected": -13.819881984165736, "step": 2787 }, { "epoch": 0.6976104091079695, "grad_norm": 5.75, "kl": 14.049592971801758, "learning_rate": 5e-06, "logits/chosen": -63613696.0, "logits/rejected": -32537450.666666668, "logps/chosen": -485.440673828125, "logps/rejected": -557.6419270833334, "loss": 0.0418, "rewards/chosen": 9.490630467732748, "rewards/margins": 22.89941469828288, "rewards/rejected": -13.40878423055013, "step": 2788 }, { "epoch": 0.6978606280495433, "grad_norm": 5.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21913449.14285714, "logits/rejected": -50411008.0, "logps/chosen": -279.99154227120533, "logps/rejected": -496.7777458639706, "loss": 0.0432, "rewards/chosen": 7.580116271972656, "rewards/margins": 19.15657896154067, "rewards/rejected": -11.576462689568014, "step": 2789 }, { "epoch": 0.6981108469911173, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39148781.71428572, "logits/rejected": -35425564.8, "logps/chosen": -305.31260463169644, "logps/rejected": -505.523681640625, "loss": 0.0261, "rewards/chosen": 7.639545440673828, "rewards/margins": 21.49573745727539, "rewards/rejected": -13.856192016601563, "step": 2790 }, { "epoch": 0.6983610659326911, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58891893.333333336, "logits/rejected": -44858269.86666667, "logps/chosen": -420.4375, "logps/rejected": -623.8641276041667, "loss": 0.0251, "rewards/chosen": 8.816352844238281, "rewards/margins": 24.295028177897137, "rewards/rejected": -15.478675333658854, "step": 2791 }, { "epoch": 0.6986112848742649, "grad_norm": 2.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30401292.307692308, "logits/rejected": -48266513.45454545, "logps/chosen": -341.9787034254808, "logps/rejected": -786.1349431818181, "loss": 0.0451, "rewards/chosen": 7.697232759915865, "rewards/margins": 25.9851813683143, "rewards/rejected": -18.287948608398438, "step": 2792 }, { "epoch": 0.6988615038158389, "grad_norm": 2.515625, "kl": 3.184396266937256, "learning_rate": 5e-06, "logits/chosen": -46040352.0, "logits/rejected": -38574549.333333336, "logps/chosen": -359.7793375651042, "logps/rejected": -671.8191324869791, "loss": 0.0309, "rewards/chosen": 7.303638458251953, "rewards/margins": 22.310812632242836, "rewards/rejected": -15.007174173990885, "step": 2793 }, { "epoch": 0.6991117227574127, "grad_norm": 2.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48646491.428571425, "logits/rejected": -54450356.705882356, "logps/chosen": -448.410400390625, "logps/rejected": -623.2508042279412, "loss": 0.0022, "rewards/chosen": 10.737269810267858, "rewards/margins": 26.486329150800948, "rewards/rejected": -15.749059340533089, "step": 2794 }, { "epoch": 0.6993619416989866, "grad_norm": 2.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37559544.615384616, "logits/rejected": -52700590.54545455, "logps/chosen": -390.9215745192308, "logps/rejected": -542.5998757102273, "loss": 0.0203, "rewards/chosen": 8.133526141826923, "rewards/margins": 19.37596749259042, "rewards/rejected": -11.242441350763494, "step": 2795 }, { "epoch": 0.6996121606405605, "grad_norm": 0.98828125, "kl": 2.039057493209839, "learning_rate": 5e-06, "logits/chosen": -65946586.666666664, "logits/rejected": -57084096.0, "logps/chosen": -507.5934651692708, "logps/rejected": -535.1720377604166, "loss": 0.0072, "rewards/chosen": 10.154741923014322, "rewards/margins": 25.142923990885414, "rewards/rejected": -14.988182067871094, "step": 2796 }, { "epoch": 0.6998623795821344, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38848039.11111111, "logits/rejected": -32067827.2, "logps/chosen": -368.95551215277777, "logps/rejected": -610.6512369791667, "loss": 0.0177, "rewards/chosen": 7.760469224717882, "rewards/margins": 22.18584459092882, "rewards/rejected": -14.425375366210938, "step": 2797 }, { "epoch": 0.7001125985237082, "grad_norm": 6.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77517914.66666667, "logits/rejected": -37977052.44444445, "logps/chosen": -541.1085205078125, "logps/rejected": -499.93408203125, "loss": 0.0254, "rewards/chosen": 10.468310674031576, "rewards/margins": 25.19971699184842, "rewards/rejected": -14.731406317816841, "step": 2798 }, { "epoch": 0.7003628174652822, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40252268.0, "logits/rejected": -20511608.0, "logps/chosen": -361.3359680175781, "logps/rejected": -609.9810791015625, "loss": 0.037, "rewards/chosen": 8.580954551696777, "rewards/margins": 24.40458393096924, "rewards/rejected": -15.823629379272461, "step": 2799 }, { "epoch": 0.700613036406856, "grad_norm": 6.34375, "kl": 5.255013942718506, "learning_rate": 5e-06, "logits/chosen": -54295344.0, "logits/rejected": -62893952.0, "logps/chosen": -399.2161458333333, "logps/rejected": -408.4585774739583, "loss": 0.05, "rewards/chosen": 8.990501403808594, "rewards/margins": 18.69763946533203, "rewards/rejected": -9.707138061523438, "step": 2800 }, { "epoch": 0.7008632553484299, "grad_norm": 2.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45838513.777777776, "logits/rejected": -53645141.333333336, "logps/chosen": -316.9411349826389, "logps/rejected": -561.6852864583333, "loss": 0.0262, "rewards/chosen": 7.044909159342448, "rewards/margins": 22.372471618652344, "rewards/rejected": -15.327562459309895, "step": 2801 }, { "epoch": 0.7011134742900037, "grad_norm": 19.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54299002.18181818, "logits/rejected": -70709356.3076923, "logps/chosen": -415.04350142045456, "logps/rejected": -630.3205378605769, "loss": 0.0201, "rewards/chosen": 7.9148476340553975, "rewards/margins": 29.25633933327415, "rewards/rejected": -21.34149169921875, "step": 2802 }, { "epoch": 0.7013636932315777, "grad_norm": 3.671875, "kl": 3.8550758361816406, "learning_rate": 5e-06, "logits/chosen": -23494822.4, "logits/rejected": -7775202.285714285, "logps/chosen": -441.475927734375, "logps/rejected": -652.6600864955357, "loss": 0.0186, "rewards/chosen": 9.215643310546875, "rewards/margins": 27.017345537458148, "rewards/rejected": -17.80170222691127, "step": 2803 }, { "epoch": 0.7016139121731515, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36134299.428571425, "logits/rejected": -32621203.2, "logps/chosen": -406.44419642857144, "logps/rejected": -666.3396484375, "loss": 0.0123, "rewards/chosen": 8.483455113002233, "rewards/margins": 28.799202183314733, "rewards/rejected": -20.3157470703125, "step": 2804 }, { "epoch": 0.7018641311147253, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63067738.666666664, "logits/rejected": -34078133.333333336, "logps/chosen": -322.76373291015625, "logps/rejected": -553.343505859375, "loss": 0.0741, "rewards/chosen": 7.6857039133707685, "rewards/margins": 22.321938196818035, "rewards/rejected": -14.636234283447266, "step": 2805 }, { "epoch": 0.7021143500562993, "grad_norm": 19.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44109376.0, "logits/rejected": -29337509.333333332, "logps/chosen": -421.6363118489583, "logps/rejected": -515.2259521484375, "loss": 0.0546, "rewards/chosen": 9.611385345458984, "rewards/margins": 25.6060422261556, "rewards/rejected": -15.994656880696615, "step": 2806 }, { "epoch": 0.7023645689978731, "grad_norm": 3.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42214477.71428572, "logits/rejected": -48400134.4, "logps/chosen": -466.38783482142856, "logps/rejected": -835.85546875, "loss": 0.0163, "rewards/chosen": 9.405502319335938, "rewards/margins": 37.369345092773436, "rewards/rejected": -27.9638427734375, "step": 2807 }, { "epoch": 0.702614787939447, "grad_norm": 34.5, "kl": 8.427021026611328, "learning_rate": 5e-06, "logits/chosen": -47083320.0, "logits/rejected": -68001616.0, "logps/chosen": -334.8960266113281, "logps/rejected": -616.2701416015625, "loss": 0.1235, "rewards/chosen": 6.149234771728516, "rewards/margins": 29.302621841430664, "rewards/rejected": -23.15338706970215, "step": 2808 }, { "epoch": 0.7028650068810209, "grad_norm": 1.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -90080265.84615384, "logits/rejected": -29269917.09090909, "logps/chosen": -428.4971454326923, "logps/rejected": -566.1219371448864, "loss": 0.0147, "rewards/chosen": 10.236552311823917, "rewards/margins": 26.204106577626476, "rewards/rejected": -15.967554265802557, "step": 2809 }, { "epoch": 0.7031152258225948, "grad_norm": 18.375, "kl": 4.1148200035095215, "learning_rate": 5e-06, "logits/chosen": -126290662.4, "logits/rejected": -68087488.0, "logps/chosen": -294.8810791015625, "logps/rejected": -619.4635881696429, "loss": 0.1103, "rewards/chosen": 5.11029167175293, "rewards/margins": 21.478977802821568, "rewards/rejected": -16.36868613106864, "step": 2810 }, { "epoch": 0.7033654447641686, "grad_norm": 5.875, "kl": 13.6532564163208, "learning_rate": 5e-06, "logits/chosen": -48643486.315789476, "logits/rejected": -68821568.0, "logps/chosen": -405.7858244243421, "logps/rejected": -1019.23828125, "loss": 0.1497, "rewards/chosen": 7.513634932668586, "rewards/margins": 40.27934843364515, "rewards/rejected": -32.76571350097656, "step": 2811 }, { "epoch": 0.7036156637057426, "grad_norm": 7.75, "kl": 2.8656413555145264, "learning_rate": 5e-06, "logits/chosen": -49761417.14285714, "logits/rejected": -92548108.8, "logps/chosen": -401.373046875, "logps/rejected": -702.535498046875, "loss": 0.0992, "rewards/chosen": 7.878729684012277, "rewards/margins": 28.049011666434154, "rewards/rejected": -20.170281982421876, "step": 2812 }, { "epoch": 0.7038658826473164, "grad_norm": 19.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -79458713.6, "logits/rejected": -57691712.0, "logps/chosen": -494.002099609375, "logps/rejected": -784.9517299107143, "loss": 0.0348, "rewards/chosen": 10.21259002685547, "rewards/margins": 27.619886343819754, "rewards/rejected": -17.407296316964285, "step": 2813 }, { "epoch": 0.7041161015888903, "grad_norm": 2.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32719906.46153846, "logits/rejected": -54949422.54545455, "logps/chosen": -481.7707331730769, "logps/rejected": -680.7263849431819, "loss": 0.013, "rewards/chosen": 9.475159865159254, "rewards/margins": 28.795902385578287, "rewards/rejected": -19.320742520419035, "step": 2814 }, { "epoch": 0.7043663205304641, "grad_norm": 1.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46511658.666666664, "logits/rejected": -21299357.333333332, "logps/chosen": -339.7562662760417, "logps/rejected": -710.3235677083334, "loss": 0.0151, "rewards/chosen": 8.8230349222819, "rewards/margins": 30.170014699300133, "rewards/rejected": -21.34697977701823, "step": 2815 }, { "epoch": 0.7046165394720381, "grad_norm": 3.4375, "kl": 12.012417793273926, "learning_rate": 5e-06, "logits/chosen": -54868428.0, "logits/rejected": -31207144.0, "logps/chosen": -501.846435546875, "logps/rejected": -500.8480224609375, "loss": 0.0049, "rewards/chosen": 9.695928573608398, "rewards/margins": 20.061039924621582, "rewards/rejected": -10.365111351013184, "step": 2816 }, { "epoch": 0.7048667584136119, "grad_norm": 23.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19373475.2, "logits/rejected": -29106800.0, "logps/chosen": -404.86142578125, "logps/rejected": -522.0775669642857, "loss": 0.0468, "rewards/chosen": 6.328115844726563, "rewards/margins": 17.23578600202288, "rewards/rejected": -10.907670157296318, "step": 2817 }, { "epoch": 0.7051169773551857, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73160306.28571428, "logits/rejected": -55067571.2, "logps/chosen": -356.1717006138393, "logps/rejected": -728.42509765625, "loss": 0.0379, "rewards/chosen": 8.683749607631139, "rewards/margins": 30.957196262904578, "rewards/rejected": -22.27344665527344, "step": 2818 }, { "epoch": 0.7053671962967597, "grad_norm": 13.5, "kl": 13.465093612670898, "learning_rate": 5e-06, "logits/chosen": -64261696.0, "logits/rejected": -94827925.33333333, "logps/chosen": -481.3118082682292, "logps/rejected": -738.933837890625, "loss": 0.0297, "rewards/chosen": 10.634162267049154, "rewards/margins": 35.111388524373375, "rewards/rejected": -24.47722625732422, "step": 2819 }, { "epoch": 0.7056174152383335, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16878382.85714286, "logits/rejected": -30856640.0, "logps/chosen": -216.86164202008928, "logps/rejected": -618.92216796875, "loss": 0.0971, "rewards/chosen": 5.142184666224888, "rewards/margins": 22.473379734584263, "rewards/rejected": -17.331195068359374, "step": 2820 }, { "epoch": 0.7058676341799074, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44171987.692307696, "logits/rejected": -50515688.72727273, "logps/chosen": -376.4453125, "logps/rejected": -679.4142844460227, "loss": 0.0516, "rewards/chosen": 6.357829167292668, "rewards/margins": 23.69655486420318, "rewards/rejected": -17.33872569691051, "step": 2821 }, { "epoch": 0.7061178531214813, "grad_norm": 12.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35233440.0, "logits/rejected": -50717508.92307692, "logps/chosen": -368.59738991477275, "logps/rejected": -476.78309044471155, "loss": 0.019, "rewards/chosen": 7.991248390891335, "rewards/margins": 19.655959976302995, "rewards/rejected": -11.66471158541166, "step": 2822 }, { "epoch": 0.7063680720630552, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64454069.333333336, "logits/rejected": -58710229.333333336, "logps/chosen": -388.794921875, "logps/rejected": -781.9348958333334, "loss": 0.0823, "rewards/chosen": 7.467843373616536, "rewards/margins": 27.562594095865883, "rewards/rejected": -20.094750722249348, "step": 2823 }, { "epoch": 0.706618291004629, "grad_norm": 12.0, "kl": 11.920164108276367, "learning_rate": 5e-06, "logits/chosen": -46198409.14285714, "logits/rejected": -27320323.2, "logps/chosen": -344.45455496651783, "logps/rejected": -607.85703125, "loss": 0.0706, "rewards/chosen": 7.8573777335030695, "rewards/margins": 25.617419542585104, "rewards/rejected": -17.760041809082033, "step": 2824 }, { "epoch": 0.706868509946203, "grad_norm": 2.34375, "kl": 0.28626760840415955, "learning_rate": 5e-06, "logits/chosen": -34663232.0, "logits/rejected": -12738889.142857144, "logps/chosen": -433.88935546875, "logps/rejected": -719.2066127232143, "loss": 0.0054, "rewards/chosen": 9.68294677734375, "rewards/margins": 26.907541983468192, "rewards/rejected": -17.22459520612444, "step": 2825 }, { "epoch": 0.7071187288877768, "grad_norm": 0.94140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33976740.571428575, "logits/rejected": -11835712.0, "logps/chosen": -401.70186941964283, "logps/rejected": -674.78681640625, "loss": 0.0107, "rewards/chosen": 9.433323451450892, "rewards/margins": 27.1864988054548, "rewards/rejected": -17.753175354003908, "step": 2826 }, { "epoch": 0.7073689478293507, "grad_norm": 3.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58853845.333333336, "logits/rejected": -46815978.666666664, "logps/chosen": -407.94407552083334, "logps/rejected": -877.8915473090278, "loss": 0.0094, "rewards/chosen": 10.130316162109375, "rewards/margins": 35.291428629557295, "rewards/rejected": -25.161112467447918, "step": 2827 }, { "epoch": 0.7076191667709245, "grad_norm": 11.1875, "kl": 38.14599609375, "learning_rate": 5e-06, "logits/chosen": -73945910.85714285, "logits/rejected": -30161379.2, "logps/chosen": -551.8123604910714, "logps/rejected": -843.4265625, "loss": 0.0845, "rewards/chosen": 12.412297930036273, "rewards/margins": 35.16861506870815, "rewards/rejected": -22.756317138671875, "step": 2828 }, { "epoch": 0.7078693857124985, "grad_norm": 1.3203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51770572.8, "logits/rejected": -41191488.0, "logps/chosen": -376.494580078125, "logps/rejected": -555.8494001116071, "loss": 0.0282, "rewards/chosen": 9.79822006225586, "rewards/margins": 25.67746854509626, "rewards/rejected": -15.879248482840401, "step": 2829 }, { "epoch": 0.7081196046540723, "grad_norm": 3.96875, "kl": 0.36625418066978455, "learning_rate": 5e-06, "logits/chosen": -75758091.63636364, "logits/rejected": -29314171.076923076, "logps/chosen": -423.9033203125, "logps/rejected": -551.8607271634615, "loss": 0.0126, "rewards/chosen": 9.765291387384588, "rewards/margins": 21.93703257954204, "rewards/rejected": -12.171741192157452, "step": 2830 }, { "epoch": 0.7083698235956462, "grad_norm": 5.0625, "kl": 0.8606275320053101, "learning_rate": 5e-06, "logits/chosen": -71126853.81818181, "logits/rejected": -40138993.23076923, "logps/chosen": -414.7760120738636, "logps/rejected": -591.3296649639423, "loss": 0.0283, "rewards/chosen": 7.694145202636719, "rewards/margins": 21.260935269869293, "rewards/rejected": -13.566790067232573, "step": 2831 }, { "epoch": 0.7086200425372201, "grad_norm": 6.84375, "kl": 13.946002960205078, "learning_rate": 5e-06, "logits/chosen": -42008210.28571428, "logits/rejected": -41951542.4, "logps/chosen": -389.9298618861607, "logps/rejected": -679.4955078125, "loss": 0.038, "rewards/chosen": 9.412314278738839, "rewards/margins": 27.424315316336497, "rewards/rejected": -18.012001037597656, "step": 2832 }, { "epoch": 0.708870261478794, "grad_norm": 14.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45796509.09090909, "logits/rejected": -16990257.230769232, "logps/chosen": -465.1394708806818, "logps/rejected": -522.7392953725962, "loss": 0.0364, "rewards/chosen": 6.38507773659446, "rewards/margins": 16.770491273253114, "rewards/rejected": -10.385413536658653, "step": 2833 }, { "epoch": 0.7091204804203678, "grad_norm": 1.59375, "kl": 3.3121135234832764, "learning_rate": 5e-06, "logits/chosen": -57254997.333333336, "logits/rejected": -28750023.111111112, "logps/chosen": -503.9241536458333, "logps/rejected": -719.1770833333334, "loss": 0.0017, "rewards/chosen": 10.296244303385416, "rewards/margins": 28.75234815809462, "rewards/rejected": -18.456103854709202, "step": 2834 }, { "epoch": 0.7093706993619417, "grad_norm": 9.375, "kl": 6.719563007354736, "learning_rate": 5e-06, "logits/chosen": -33529088.0, "logits/rejected": -86171313.77777778, "logps/chosen": -342.90686848958336, "logps/rejected": -591.7131618923611, "loss": 0.0694, "rewards/chosen": 6.963863627115885, "rewards/margins": 20.23766564263238, "rewards/rejected": -13.273802015516493, "step": 2835 }, { "epoch": 0.7096209183035156, "grad_norm": 21.25, "kl": 26.440372467041016, "learning_rate": 5e-06, "logits/chosen": -29052413.866666667, "logits/rejected": -35388757.333333336, "logps/chosen": -502.05970052083336, "logps/rejected": -560.2633463541666, "loss": 0.0565, "rewards/chosen": 9.2646240234375, "rewards/margins": 22.778560384114584, "rewards/rejected": -13.513936360677084, "step": 2836 }, { "epoch": 0.7098711372450894, "grad_norm": 9.375, "kl": 6.822956085205078, "learning_rate": 5e-06, "logits/chosen": -24766629.818181816, "logits/rejected": -44858008.615384616, "logps/chosen": -417.67631392045456, "logps/rejected": -623.0213716947115, "loss": 0.0405, "rewards/chosen": 9.356787941672586, "rewards/margins": 25.505620996435205, "rewards/rejected": -16.14883305476262, "step": 2837 }, { "epoch": 0.7101213561866633, "grad_norm": 2.703125, "kl": 15.25387954711914, "learning_rate": 5e-06, "logits/chosen": -41985713.23076923, "logits/rejected": -67380706.9090909, "logps/chosen": -490.36470853365387, "logps/rejected": -548.91748046875, "loss": 0.0669, "rewards/chosen": 10.400360107421875, "rewards/margins": 21.486043063077062, "rewards/rejected": -11.085682955655185, "step": 2838 }, { "epoch": 0.7103715751282372, "grad_norm": 3.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43332888.0, "logits/rejected": -51703712.0, "logps/chosen": -389.4402669270833, "logps/rejected": -580.8695068359375, "loss": 0.0587, "rewards/chosen": 8.225232442220053, "rewards/margins": 18.301397959391277, "rewards/rejected": -10.076165517171225, "step": 2839 }, { "epoch": 0.7106217940698111, "grad_norm": 18.875, "kl": 23.652053833007812, "learning_rate": 5e-06, "logits/chosen": -57816952.0, "logits/rejected": 8809010.0, "logps/chosen": -501.9508056640625, "logps/rejected": -412.52691650390625, "loss": 0.0464, "rewards/chosen": 11.180964469909668, "rewards/margins": 19.88382911682129, "rewards/rejected": -8.702864646911621, "step": 2840 }, { "epoch": 0.7108720130113849, "grad_norm": 4.8125, "kl": 3.7574310302734375, "learning_rate": 5e-06, "logits/chosen": -41451766.15384615, "logits/rejected": -39796308.36363637, "logps/chosen": -331.85216346153845, "logps/rejected": -427.05149147727275, "loss": 0.0224, "rewards/chosen": 8.715910104604868, "rewards/margins": 21.05279087520146, "rewards/rejected": -12.336880770596592, "step": 2841 }, { "epoch": 0.7111222319529589, "grad_norm": 15.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45107368.72727273, "logits/rejected": -47414321.23076923, "logps/chosen": -467.9940074573864, "logps/rejected": -653.5563401442307, "loss": 0.0197, "rewards/chosen": 10.463801297274502, "rewards/margins": 21.725147967571978, "rewards/rejected": -11.261346670297476, "step": 2842 }, { "epoch": 0.7113724508945327, "grad_norm": 0.59375, "kl": 3.7489001750946045, "learning_rate": 5e-06, "logits/chosen": -23924918.85714286, "logits/rejected": -47244656.0, "logps/chosen": -380.5603724888393, "logps/rejected": -654.917333984375, "loss": 0.0016, "rewards/chosen": 10.75871821812221, "rewards/margins": 25.424906267438615, "rewards/rejected": -14.666188049316407, "step": 2843 }, { "epoch": 0.7116226698361066, "grad_norm": 14.3125, "kl": 9.8784818649292, "learning_rate": 5e-06, "logits/chosen": -24410053.333333332, "logits/rejected": -23943712.0, "logps/chosen": -399.3997802734375, "logps/rejected": -380.7164713541667, "loss": 0.0837, "rewards/chosen": 9.614611307779947, "rewards/margins": 18.097569783528645, "rewards/rejected": -8.482958475748697, "step": 2844 }, { "epoch": 0.7118728887776805, "grad_norm": 2.984375, "kl": 2.053798198699951, "learning_rate": 5e-06, "logits/chosen": -26502024.0, "logits/rejected": -53121080.0, "logps/chosen": -351.9802551269531, "logps/rejected": -660.3380126953125, "loss": 0.037, "rewards/chosen": 9.009044647216797, "rewards/margins": 21.41208267211914, "rewards/rejected": -12.403038024902344, "step": 2845 }, { "epoch": 0.7121231077192544, "grad_norm": 10.25, "kl": 0.039526622742414474, "learning_rate": 5e-06, "logits/chosen": -60147957.333333336, "logits/rejected": -41260280.0, "logps/chosen": -286.2404378255208, "logps/rejected": -620.2919921875, "loss": 0.0645, "rewards/chosen": 7.060587565104167, "rewards/margins": 23.003092447916668, "rewards/rejected": -15.9425048828125, "step": 2846 }, { "epoch": 0.7123733266608282, "grad_norm": 6.4375, "kl": 15.201155662536621, "learning_rate": 5e-06, "logits/chosen": -33218082.133333333, "logits/rejected": -20480615.111111112, "logps/chosen": -369.7563151041667, "logps/rejected": -514.876953125, "loss": 0.0641, "rewards/chosen": 9.411383056640625, "rewards/margins": 18.668719991048178, "rewards/rejected": -9.257336934407553, "step": 2847 }, { "epoch": 0.7126235456024022, "grad_norm": 3.6875, "kl": 5.470212459564209, "learning_rate": 5e-06, "logits/chosen": -22100365.333333332, "logits/rejected": -39568728.0, "logps/chosen": -362.3927408854167, "logps/rejected": -728.1795247395834, "loss": 0.0657, "rewards/chosen": 8.579461415608725, "rewards/margins": 23.26226806640625, "rewards/rejected": -14.682806650797525, "step": 2848 }, { "epoch": 0.712873764543976, "grad_norm": 12.875, "kl": 10.43774127960205, "learning_rate": 5e-06, "logits/chosen": -58420804.92307692, "logits/rejected": -60946234.18181818, "logps/chosen": -414.2282902644231, "logps/rejected": -629.3176491477273, "loss": 0.0313, "rewards/chosen": 9.766668466421274, "rewards/margins": 24.70909705528846, "rewards/rejected": -14.942428588867188, "step": 2849 }, { "epoch": 0.7131239834855498, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18501408.0, "logits/rejected": -33000492.0, "logps/chosen": -354.88360595703125, "logps/rejected": -531.7135009765625, "loss": 0.0327, "rewards/chosen": 8.246360778808594, "rewards/margins": 19.713088989257812, "rewards/rejected": -11.466728210449219, "step": 2850 }, { "epoch": 0.7133742024271237, "grad_norm": 2.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34843936.0, "logits/rejected": -51467786.666666664, "logps/chosen": -333.20371500651044, "logps/rejected": -581.0220947265625, "loss": 0.0127, "rewards/chosen": 8.187694549560547, "rewards/margins": 20.489177703857422, "rewards/rejected": -12.301483154296875, "step": 2851 }, { "epoch": 0.7136244213686976, "grad_norm": 12.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53458572.0, "logits/rejected": -36250980.0, "logps/chosen": -236.44578552246094, "logps/rejected": -642.029296875, "loss": 0.0315, "rewards/chosen": 5.8505144119262695, "rewards/margins": 20.897113800048828, "rewards/rejected": -15.046599388122559, "step": 2852 }, { "epoch": 0.7138746403102715, "grad_norm": 29.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45227052.307692304, "logits/rejected": -24763938.90909091, "logps/chosen": -372.0153620793269, "logps/rejected": -594.9982688210227, "loss": 0.0829, "rewards/chosen": 8.906301645132212, "rewards/margins": 18.668860322111968, "rewards/rejected": -9.762558676979758, "step": 2853 }, { "epoch": 0.7141248592518453, "grad_norm": 8.5625, "kl": 9.637690544128418, "learning_rate": 5e-06, "logits/chosen": -63842805.333333336, "logits/rejected": -53839456.0, "logps/chosen": -393.1923828125, "logps/rejected": -597.5767415364584, "loss": 0.025, "rewards/chosen": 9.196970621744791, "rewards/margins": 23.25041325887044, "rewards/rejected": -14.05344263712565, "step": 2854 }, { "epoch": 0.7143750781934193, "grad_norm": 1.6796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37782163.692307696, "logits/rejected": -47577643.63636363, "logps/chosen": -422.95748197115387, "logps/rejected": -727.5739524147727, "loss": 0.0164, "rewards/chosen": 8.972540635329027, "rewards/margins": 25.31112622881269, "rewards/rejected": -16.338585593483664, "step": 2855 }, { "epoch": 0.7146252971349931, "grad_norm": 5.3125, "kl": 1.1548964977264404, "learning_rate": 5e-06, "logits/chosen": -15816248.888888888, "logits/rejected": -36990732.8, "logps/chosen": -374.362548828125, "logps/rejected": -602.8686197916667, "loss": 0.0122, "rewards/chosen": 9.278047349717882, "rewards/margins": 23.323512437608507, "rewards/rejected": -14.045465087890625, "step": 2856 }, { "epoch": 0.714875516076567, "grad_norm": 12.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57050950.4, "logits/rejected": -42847936.0, "logps/chosen": -395.9795654296875, "logps/rejected": -666.8650948660714, "loss": 0.0225, "rewards/chosen": 8.862783813476563, "rewards/margins": 23.772857230050224, "rewards/rejected": -14.910073416573661, "step": 2857 }, { "epoch": 0.7151257350181409, "grad_norm": 1.1171875, "kl": 0.5791168212890625, "learning_rate": 5e-06, "logits/chosen": -43719726.93333333, "logits/rejected": -50477834.666666664, "logps/chosen": -446.62115885416665, "logps/rejected": -709.5394965277778, "loss": 0.0024, "rewards/chosen": 9.364790852864584, "rewards/margins": 27.542259046766493, "rewards/rejected": -18.17746819390191, "step": 2858 }, { "epoch": 0.7153759539597148, "grad_norm": 8.4375, "kl": 3.7966461181640625, "learning_rate": 5e-06, "logits/chosen": -41097989.81818182, "logits/rejected": -82372278.15384616, "logps/chosen": -337.95634321732956, "logps/rejected": -551.2882737379807, "loss": 0.0309, "rewards/chosen": 7.855382052334872, "rewards/margins": 24.443642409531385, "rewards/rejected": -16.588260357196514, "step": 2859 }, { "epoch": 0.7156261729012886, "grad_norm": 3.921875, "kl": 10.750984191894531, "learning_rate": 5e-06, "logits/chosen": -40742112.0, "logits/rejected": -17511212.0, "logps/chosen": -378.3724670410156, "logps/rejected": -400.796630859375, "loss": 0.0849, "rewards/chosen": 8.816256523132324, "rewards/margins": 18.769189834594727, "rewards/rejected": -9.952933311462402, "step": 2860 }, { "epoch": 0.7158763918428626, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41644750.222222224, "logits/rejected": -32165075.2, "logps/chosen": -354.6389973958333, "logps/rejected": -449.7046875, "loss": 0.0435, "rewards/chosen": 8.696370442708334, "rewards/margins": 23.544317626953124, "rewards/rejected": -14.847947184244791, "step": 2861 }, { "epoch": 0.7161266107844364, "grad_norm": 13.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36305749.333333336, "logits/rejected": -55204616.53333333, "logps/chosen": -324.2687174479167, "logps/rejected": -687.24140625, "loss": 0.0506, "rewards/chosen": 7.352183024088542, "rewards/margins": 25.028904215494794, "rewards/rejected": -17.67672119140625, "step": 2862 }, { "epoch": 0.7163768297260102, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57934048.0, "logits/rejected": -43297910.85714286, "logps/chosen": -397.777490234375, "logps/rejected": -461.37806919642856, "loss": 0.0408, "rewards/chosen": 6.858859252929688, "rewards/margins": 22.43976571219308, "rewards/rejected": -15.580906459263392, "step": 2863 }, { "epoch": 0.7166270486675841, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62099130.666666664, "logits/rejected": -48201440.0, "logps/chosen": -506.9086507161458, "logps/rejected": -579.0641276041666, "loss": 0.0068, "rewards/chosen": 10.800952911376953, "rewards/margins": 26.549596150716148, "rewards/rejected": -15.748643239339193, "step": 2864 }, { "epoch": 0.716877267609158, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59406997.333333336, "logits/rejected": -70401912.8888889, "logps/chosen": -347.2501220703125, "logps/rejected": -516.6752387152778, "loss": 0.0152, "rewards/chosen": 7.224907557169597, "rewards/margins": 25.40914895799425, "rewards/rejected": -18.184241400824654, "step": 2865 }, { "epoch": 0.7171274865507319, "grad_norm": 1.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76357426.28571428, "logits/rejected": -37111518.11764706, "logps/chosen": -386.46707589285717, "logps/rejected": -570.9237706801471, "loss": 0.0041, "rewards/chosen": 7.317531040736607, "rewards/margins": 23.490377217781646, "rewards/rejected": -16.172846177045038, "step": 2866 }, { "epoch": 0.7173777054923057, "grad_norm": 14.375, "kl": 11.494989395141602, "learning_rate": 5e-06, "logits/chosen": -33947692.307692304, "logits/rejected": -12712884.363636363, "logps/chosen": -457.49767127403845, "logps/rejected": -557.5231267755681, "loss": 0.0966, "rewards/chosen": 7.947304358849158, "rewards/margins": 25.671394935021034, "rewards/rejected": -17.724090576171875, "step": 2867 }, { "epoch": 0.7176279244338797, "grad_norm": 1.3359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37782504.0, "logits/rejected": -68783272.0, "logps/chosen": -343.82275390625, "logps/rejected": -653.927001953125, "loss": 0.0515, "rewards/chosen": 7.150485515594482, "rewards/margins": 28.440362453460693, "rewards/rejected": -21.28987693786621, "step": 2868 }, { "epoch": 0.7178781433754535, "grad_norm": 4.78125, "kl": 2.913024425506592, "learning_rate": 5e-06, "logits/chosen": -72402656.0, "logits/rejected": -58573580.8, "logps/chosen": -392.57742745535717, "logps/rejected": -824.3365234375, "loss": 0.0426, "rewards/chosen": 7.126686096191406, "rewards/margins": 29.11225280761719, "rewards/rejected": -21.985566711425783, "step": 2869 }, { "epoch": 0.7181283623170274, "grad_norm": 7.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37613065.14285714, "logits/rejected": -84125056.0, "logps/chosen": -255.48566545758928, "logps/rejected": -897.02607421875, "loss": 0.0637, "rewards/chosen": 5.523496900285993, "rewards/margins": 38.19929951259068, "rewards/rejected": -32.675802612304686, "step": 2870 }, { "epoch": 0.7183785812586013, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51668209.23076923, "logits/rejected": -41462021.81818182, "logps/chosen": -398.46634615384613, "logps/rejected": -623.2795632102273, "loss": 0.0195, "rewards/chosen": 7.699086115910457, "rewards/margins": 26.738180520651223, "rewards/rejected": -19.039094404740766, "step": 2871 }, { "epoch": 0.7186288002001752, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -85770595.55555555, "logits/rejected": -63380795.733333334, "logps/chosen": -346.42529296875, "logps/rejected": -563.9303385416666, "loss": 0.0722, "rewards/chosen": 4.952493455674913, "rewards/margins": 25.018030971950957, "rewards/rejected": -20.06553751627604, "step": 2872 }, { "epoch": 0.718879019141749, "grad_norm": 20.375, "kl": 9.96059513092041, "learning_rate": 5e-06, "logits/chosen": -58355080.53333333, "logits/rejected": -54580568.88888889, "logps/chosen": -421.45078125, "logps/rejected": -629.4325629340278, "loss": 0.0907, "rewards/chosen": 6.894812520345052, "rewards/margins": 29.980572001139322, "rewards/rejected": -23.08575948079427, "step": 2873 }, { "epoch": 0.7191292380833229, "grad_norm": 3.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47940499.692307696, "logits/rejected": -41416011.63636363, "logps/chosen": -294.9191706730769, "logps/rejected": -583.0656960227273, "loss": 0.0615, "rewards/chosen": 6.7944159874549275, "rewards/margins": 22.63686482889669, "rewards/rejected": -15.842448841441762, "step": 2874 }, { "epoch": 0.7193794570248968, "grad_norm": 31.875, "kl": 1.58340585231781, "learning_rate": 5e-06, "logits/chosen": -50031296.0, "logits/rejected": -64277553.777777776, "logps/chosen": -407.28229166666665, "logps/rejected": -543.6356336805555, "loss": 0.0293, "rewards/chosen": 7.493740844726562, "rewards/margins": 24.173035007052952, "rewards/rejected": -16.67929416232639, "step": 2875 }, { "epoch": 0.7196296759664706, "grad_norm": 2.21875, "kl": 8.623088836669922, "learning_rate": 5e-06, "logits/chosen": -49215817.14285714, "logits/rejected": 5751075.2, "logps/chosen": -445.29659598214283, "logps/rejected": -832.73798828125, "loss": 0.0068, "rewards/chosen": 9.496289934430804, "rewards/margins": 30.461179460797993, "rewards/rejected": -20.964889526367188, "step": 2876 }, { "epoch": 0.7198798949080445, "grad_norm": 13.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36594481.777777776, "logits/rejected": -62171072.0, "logps/chosen": -498.19281684027777, "logps/rejected": -788.6013671875, "loss": 0.0148, "rewards/chosen": 8.378968980577257, "rewards/margins": 32.96138678656684, "rewards/rejected": -24.582417805989582, "step": 2877 }, { "epoch": 0.7201301138496184, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45305348.266666666, "logits/rejected": -64915150.222222224, "logps/chosen": -260.76728515625, "logps/rejected": -700.75048828125, "loss": 0.0495, "rewards/chosen": 7.225716145833333, "rewards/margins": 25.928771633572048, "rewards/rejected": -18.703055487738716, "step": 2878 }, { "epoch": 0.7203803327911923, "grad_norm": 1.3515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16714944.0, "logits/rejected": -52116539.428571425, "logps/chosen": -416.816259765625, "logps/rejected": -678.0104631696429, "loss": 0.0021, "rewards/chosen": 8.776738739013672, "rewards/margins": 25.370780835832868, "rewards/rejected": -16.594042096819198, "step": 2879 }, { "epoch": 0.7206305517327661, "grad_norm": 7.71875, "kl": 4.956021308898926, "learning_rate": 5e-06, "logits/chosen": -50827929.6, "logits/rejected": 19882609.777777776, "logps/chosen": -325.528125, "logps/rejected": -596.7471245659722, "loss": 0.0479, "rewards/chosen": 7.604216003417969, "rewards/margins": 23.96589677598741, "rewards/rejected": -16.361680772569443, "step": 2880 }, { "epoch": 0.7208807706743401, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64226803.2, "logits/rejected": -46863401.14285714, "logps/chosen": -466.429150390625, "logps/rejected": -833.7985491071429, "loss": 0.0249, "rewards/chosen": 6.062162017822265, "rewards/margins": 27.55497076851981, "rewards/rejected": -21.492808750697545, "step": 2881 }, { "epoch": 0.7211309896159139, "grad_norm": 3.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39330888.0, "logits/rejected": -11901704.0, "logps/chosen": -245.7134246826172, "logps/rejected": -690.5980224609375, "loss": 0.0276, "rewards/chosen": 6.096774101257324, "rewards/margins": 22.59409809112549, "rewards/rejected": -16.497323989868164, "step": 2882 }, { "epoch": 0.7213812085574878, "grad_norm": 4.65625, "kl": 11.910772323608398, "learning_rate": 5e-06, "logits/chosen": -79897280.0, "logits/rejected": -32291056.0, "logps/chosen": -469.0347377232143, "logps/rejected": -709.92685546875, "loss": 0.0226, "rewards/chosen": 10.987845284598214, "rewards/margins": 27.889459664481024, "rewards/rejected": -16.901614379882812, "step": 2883 }, { "epoch": 0.7216314274990617, "grad_norm": 7.9375, "kl": 0.6433385610580444, "learning_rate": 5e-06, "logits/chosen": -68300771.55555555, "logits/rejected": -36462280.53333333, "logps/chosen": -559.6072048611111, "logps/rejected": -500.1732421875, "loss": 0.032, "rewards/chosen": 11.133505079481337, "rewards/margins": 22.753039381239148, "rewards/rejected": -11.619534301757813, "step": 2884 }, { "epoch": 0.7218816464406356, "grad_norm": 1.3828125, "kl": 1.8706579208374023, "learning_rate": 5e-06, "logits/chosen": -46206504.72727273, "logits/rejected": -69710498.46153846, "logps/chosen": -437.296875, "logps/rejected": -772.2077824519231, "loss": 0.0209, "rewards/chosen": 8.938297618519176, "rewards/margins": 27.36773276162314, "rewards/rejected": -18.429435143103966, "step": 2885 }, { "epoch": 0.7221318653822094, "grad_norm": 9.3125, "kl": 9.468201637268066, "learning_rate": 5e-06, "logits/chosen": -48625846.15384615, "logits/rejected": -26920116.363636363, "logps/chosen": -390.6511418269231, "logps/rejected": -548.4027432528409, "loss": 0.0554, "rewards/chosen": 9.134733346792368, "rewards/margins": 22.46151834768015, "rewards/rejected": -13.326785000887783, "step": 2886 }, { "epoch": 0.7223820843237833, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36453335.27272727, "logits/rejected": -41300775.384615384, "logps/chosen": -377.9205877130682, "logps/rejected": -880.5459735576923, "loss": 0.0152, "rewards/chosen": 9.597073641690342, "rewards/margins": 27.431426575133848, "rewards/rejected": -17.83435293344351, "step": 2887 }, { "epoch": 0.7226323032653572, "grad_norm": 4.875, "kl": 10.460177421569824, "learning_rate": 5e-06, "logits/chosen": -39939202.461538464, "logits/rejected": -40217064.72727273, "logps/chosen": -356.84581580528845, "logps/rejected": -521.3598188920455, "loss": 0.0429, "rewards/chosen": 9.2763671875, "rewards/margins": 22.18054337935014, "rewards/rejected": -12.904176191850143, "step": 2888 }, { "epoch": 0.722882522206931, "grad_norm": 12.625, "kl": 3.3482789993286133, "learning_rate": 5e-06, "logits/chosen": -26261496.615384616, "logits/rejected": -49188683.63636363, "logps/chosen": -312.43795072115387, "logps/rejected": -714.7829367897727, "loss": 0.0747, "rewards/chosen": 6.7061033982497, "rewards/margins": 21.515174332198562, "rewards/rejected": -14.809070933948863, "step": 2889 }, { "epoch": 0.7231327411485049, "grad_norm": 1.8828125, "kl": 4.55334997177124, "learning_rate": 5e-06, "logits/chosen": -39667737.6, "logits/rejected": -44198741.333333336, "logps/chosen": -354.7807942708333, "logps/rejected": -396.05121527777777, "loss": 0.0395, "rewards/chosen": 8.337481689453124, "rewards/margins": 20.03710649278429, "rewards/rejected": -11.699624803331163, "step": 2890 }, { "epoch": 0.7233829600900789, "grad_norm": 9.625, "kl": 8.35892391204834, "learning_rate": 5e-06, "logits/chosen": -61173485.71428572, "logits/rejected": -32798848.0, "logps/chosen": -321.08349609375, "logps/rejected": -460.376220703125, "loss": 0.0435, "rewards/chosen": 7.999434334891183, "rewards/margins": 20.414312417166574, "rewards/rejected": -12.41487808227539, "step": 2891 }, { "epoch": 0.7236331790316527, "grad_norm": 3.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63597937.23076923, "logits/rejected": -39373425.45454545, "logps/chosen": -381.63656850961536, "logps/rejected": -496.32080078125, "loss": 0.015, "rewards/chosen": 8.831263028658354, "rewards/margins": 21.526453458345856, "rewards/rejected": -12.6951904296875, "step": 2892 }, { "epoch": 0.7238833979732265, "grad_norm": 7.65625, "kl": 3.565218687057495, "learning_rate": 5e-06, "logits/chosen": -51561120.0, "logits/rejected": -36241408.0, "logps/chosen": -394.6404622395833, "logps/rejected": -693.6927897135416, "loss": 0.012, "rewards/chosen": 10.486139933268229, "rewards/margins": 28.127011617024742, "rewards/rejected": -17.64087168375651, "step": 2893 }, { "epoch": 0.7241336169148005, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87192391.1111111, "logits/rejected": -55685811.2, "logps/chosen": -362.45068359375, "logps/rejected": -680.239453125, "loss": 0.073, "rewards/chosen": 9.25250244140625, "rewards/margins": 25.208761596679686, "rewards/rejected": -15.956259155273438, "step": 2894 }, { "epoch": 0.7243838358563743, "grad_norm": 15.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23695948.8, "logits/rejected": -59597645.71428572, "logps/chosen": -290.9193359375, "logps/rejected": -537.29052734375, "loss": 0.0409, "rewards/chosen": 6.676039123535157, "rewards/margins": 18.117690604073662, "rewards/rejected": -11.441651480538505, "step": 2895 }, { "epoch": 0.7246340547979482, "grad_norm": 14.4375, "kl": 11.795345306396484, "learning_rate": 5e-06, "logits/chosen": -29616284.0, "logits/rejected": -37607488.0, "logps/chosen": -346.43328857421875, "logps/rejected": -1024.3037109375, "loss": 0.0634, "rewards/chosen": 7.913320541381836, "rewards/margins": 28.349504470825195, "rewards/rejected": -20.43618392944336, "step": 2896 }, { "epoch": 0.7248842737395221, "grad_norm": 6.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46718300.8, "logits/rejected": -36814902.85714286, "logps/chosen": -446.039697265625, "logps/rejected": -496.92550223214283, "loss": 0.0155, "rewards/chosen": 9.608113098144532, "rewards/margins": 23.49878605433873, "rewards/rejected": -13.890672956194196, "step": 2897 }, { "epoch": 0.725134492681096, "grad_norm": 10.0625, "kl": 1.258618712425232, "learning_rate": 5e-06, "logits/chosen": -40302336.0, "logits/rejected": -56714074.666666664, "logps/chosen": -303.92909749348956, "logps/rejected": -598.5936279296875, "loss": 0.0448, "rewards/chosen": 6.980060577392578, "rewards/margins": 23.430469512939453, "rewards/rejected": -16.450408935546875, "step": 2898 }, { "epoch": 0.7253847116226698, "grad_norm": 9.375, "kl": 6.403668403625488, "learning_rate": 5e-06, "logits/chosen": -36249843.2, "logits/rejected": -26060048.0, "logps/chosen": -318.43525390625, "logps/rejected": -415.39976671006946, "loss": 0.0527, "rewards/chosen": 8.134361775716146, "rewards/margins": 18.383056301540798, "rewards/rejected": -10.248694525824654, "step": 2899 }, { "epoch": 0.7256349305642437, "grad_norm": 5.0, "kl": 7.78309965133667, "learning_rate": 5e-06, "logits/chosen": -31293245.866666667, "logits/rejected": -56980593.777777776, "logps/chosen": -383.75579427083335, "logps/rejected": -852.9083116319445, "loss": 0.0147, "rewards/chosen": 9.44459737141927, "rewards/margins": 33.332663981119794, "rewards/rejected": -23.88806660970052, "step": 2900 }, { "epoch": 0.7258851495058176, "grad_norm": 37.5, "kl": 6.944684982299805, "learning_rate": 5e-06, "logits/chosen": -47888352.0, "logits/rejected": -52359280.0, "logps/chosen": -378.3653971354167, "logps/rejected": -714.9058430989584, "loss": 0.0296, "rewards/chosen": 8.003308614095053, "rewards/margins": 26.430039723714195, "rewards/rejected": -18.42673110961914, "step": 2901 }, { "epoch": 0.7261353684473915, "grad_norm": 0.6171875, "kl": 3.27380633354187, "learning_rate": 5e-06, "logits/chosen": -52830562.461538464, "logits/rejected": -31542510.545454547, "logps/chosen": -425.10336538461536, "logps/rejected": -505.30397727272725, "loss": 0.0163, "rewards/chosen": 10.095457810621996, "rewards/margins": 23.388698444499838, "rewards/rejected": -13.293240633877842, "step": 2902 }, { "epoch": 0.7263855873889653, "grad_norm": 4.53125, "kl": 1.1286303997039795, "learning_rate": 5e-06, "logits/chosen": -54536413.09090909, "logits/rejected": -25687020.307692308, "logps/chosen": -338.84275124289775, "logps/rejected": -559.2696439302885, "loss": 0.07, "rewards/chosen": 8.217640269886363, "rewards/margins": 22.982396692662803, "rewards/rejected": -14.764756422776442, "step": 2903 }, { "epoch": 0.7266358063305393, "grad_norm": 1.9609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 11740802.0, "logits/rejected": -39963104.0, "logps/chosen": -300.1917724609375, "logps/rejected": -460.0611877441406, "loss": 0.0292, "rewards/chosen": 6.3447675704956055, "rewards/margins": 21.09045124053955, "rewards/rejected": -14.745683670043945, "step": 2904 }, { "epoch": 0.7268860252721131, "grad_norm": 5.75, "kl": 3.5030932426452637, "learning_rate": 5e-06, "logits/chosen": -42804072.0, "logits/rejected": -78302928.0, "logps/chosen": -347.8034973144531, "logps/rejected": -612.3585815429688, "loss": 0.0311, "rewards/chosen": 7.102024555206299, "rewards/margins": 24.9579758644104, "rewards/rejected": -17.8559513092041, "step": 2905 }, { "epoch": 0.7271362442136869, "grad_norm": 6.84375, "kl": 17.318065643310547, "learning_rate": 5e-06, "logits/chosen": -37570128.0, "logits/rejected": 63001360.0, "logps/chosen": -460.0954182942708, "logps/rejected": -580.6607259114584, "loss": 0.025, "rewards/chosen": 10.975963592529297, "rewards/margins": 26.392869313557945, "rewards/rejected": -15.416905721028646, "step": 2906 }, { "epoch": 0.7273864631552609, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30255782.4, "logits/rejected": -4995613.714285715, "logps/chosen": -315.675830078125, "logps/rejected": -617.6026785714286, "loss": 0.0235, "rewards/chosen": 6.311699295043946, "rewards/margins": 21.539633996146065, "rewards/rejected": -15.22793470110212, "step": 2907 }, { "epoch": 0.7276366820968347, "grad_norm": 0.64453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63655027.2, "logits/rejected": -60351698.28571428, "logps/chosen": -339.6093994140625, "logps/rejected": -635.181640625, "loss": 0.0087, "rewards/chosen": 9.819187927246094, "rewards/margins": 25.303001621791296, "rewards/rejected": -15.483813694545201, "step": 2908 }, { "epoch": 0.7278869010384086, "grad_norm": 2.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22752053.333333332, "logits/rejected": -29683916.8, "logps/chosen": -304.55479600694446, "logps/rejected": -501.7289713541667, "loss": 0.0464, "rewards/chosen": 5.375161912706163, "rewards/margins": 16.369777594672307, "rewards/rejected": -10.994615681966145, "step": 2909 }, { "epoch": 0.7281371199799825, "grad_norm": 13.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31534196.57142857, "logits/rejected": -26489886.11764706, "logps/chosen": -209.28681291852678, "logps/rejected": -603.5287224264706, "loss": 0.0391, "rewards/chosen": 5.497153690883091, "rewards/margins": 17.27068812907243, "rewards/rejected": -11.773534438189339, "step": 2910 }, { "epoch": 0.7283873389215564, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50321296.0, "logits/rejected": -43530176.0, "logps/chosen": -400.8604736328125, "logps/rejected": -787.0475463867188, "loss": 0.0023, "rewards/chosen": 11.169492721557617, "rewards/margins": 29.428300857543945, "rewards/rejected": -18.258808135986328, "step": 2911 }, { "epoch": 0.7286375578631302, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25860368.0, "logits/rejected": -32112621.333333332, "logps/chosen": -372.0572916666667, "logps/rejected": -504.5743815104167, "loss": 0.0306, "rewards/chosen": 10.26059341430664, "rewards/margins": 23.600494384765625, "rewards/rejected": -13.339900970458984, "step": 2912 }, { "epoch": 0.7288877768047041, "grad_norm": 5.0, "kl": 8.947591781616211, "learning_rate": 5e-06, "logits/chosen": -48572749.71428572, "logits/rejected": -78966572.8, "logps/chosen": -355.59873744419644, "logps/rejected": -349.98388671875, "loss": 0.0393, "rewards/chosen": 7.420417240687779, "rewards/margins": 17.522190311976843, "rewards/rejected": -10.101773071289063, "step": 2913 }, { "epoch": 0.729137995746278, "grad_norm": 8.0, "kl": 8.74381160736084, "learning_rate": 5e-06, "logits/chosen": -27487990.85714286, "logits/rejected": -56077926.4, "logps/chosen": -466.0755092075893, "logps/rejected": -608.82744140625, "loss": 0.022, "rewards/chosen": 9.35600825718471, "rewards/margins": 25.218787493024553, "rewards/rejected": -15.862779235839843, "step": 2914 }, { "epoch": 0.7293882146878519, "grad_norm": 7.8125, "kl": 1.9821374416351318, "learning_rate": 5e-06, "logits/chosen": -42463597.333333336, "logits/rejected": -55375717.333333336, "logps/chosen": -326.7588297526042, "logps/rejected": -620.8380533854166, "loss": 0.0586, "rewards/chosen": 9.717333475748697, "rewards/margins": 22.147659301757812, "rewards/rejected": -12.430325826009115, "step": 2915 }, { "epoch": 0.7296384336294257, "grad_norm": 1.984375, "kl": 8.246866226196289, "learning_rate": 5e-06, "logits/chosen": -61584128.0, "logits/rejected": -11450625.333333334, "logps/chosen": -390.3129611545139, "logps/rejected": -757.2203776041666, "loss": 0.0272, "rewards/chosen": 10.31535169813368, "rewards/margins": 25.350894504123264, "rewards/rejected": -15.035542805989584, "step": 2916 }, { "epoch": 0.7298886525709997, "grad_norm": 14.0, "kl": 0.8370288610458374, "learning_rate": 5e-06, "logits/chosen": -60769780.36363637, "logits/rejected": -53257511.384615384, "logps/chosen": -349.55111416903407, "logps/rejected": -661.6254507211538, "loss": 0.0263, "rewards/chosen": 8.101445978338068, "rewards/margins": 25.340078180486508, "rewards/rejected": -17.238632202148438, "step": 2917 }, { "epoch": 0.7301388715125735, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36238523.07692308, "logits/rejected": -37300701.09090909, "logps/chosen": -283.8169508713942, "logps/rejected": -484.40926846590907, "loss": 0.0436, "rewards/chosen": 7.3023552527794475, "rewards/margins": 19.44380977603939, "rewards/rejected": -12.141454523259943, "step": 2918 }, { "epoch": 0.7303890904541473, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46183744.0, "logits/rejected": -51360104.0, "logps/chosen": -398.9495544433594, "logps/rejected": -430.7995300292969, "loss": 0.0303, "rewards/chosen": 9.59018325805664, "rewards/margins": 21.647807121276855, "rewards/rejected": -12.057623863220215, "step": 2919 }, { "epoch": 0.7306393093957213, "grad_norm": 1.84375, "kl": 2.0275282859802246, "learning_rate": 5e-06, "logits/chosen": -17465004.8, "logits/rejected": -69597417.14285715, "logps/chosen": -374.5014404296875, "logps/rejected": -804.4541713169643, "loss": 0.038, "rewards/chosen": 6.34039077758789, "rewards/margins": 29.302790941510878, "rewards/rejected": -22.96240016392299, "step": 2920 }, { "epoch": 0.7308895283372951, "grad_norm": 5.71875, "kl": 3.5363919734954834, "learning_rate": 5e-06, "logits/chosen": -58141568.0, "logits/rejected": -27343834.0, "logps/chosen": -347.19744873046875, "logps/rejected": -661.2326049804688, "loss": 0.0625, "rewards/chosen": 8.154752731323242, "rewards/margins": 28.26039695739746, "rewards/rejected": -20.10564422607422, "step": 2921 }, { "epoch": 0.731139747278869, "grad_norm": 4.625, "kl": 6.7801408767700195, "learning_rate": 5e-06, "logits/chosen": 29274835.555555556, "logits/rejected": -33269518.933333334, "logps/chosen": -418.17095269097223, "logps/rejected": -412.1693359375, "loss": 0.0887, "rewards/chosen": 6.2674755520290795, "rewards/margins": 19.734837002224392, "rewards/rejected": -13.467361450195312, "step": 2922 }, { "epoch": 0.7313899662204428, "grad_norm": 6.5625, "kl": 4.34232234954834, "learning_rate": 5e-06, "logits/chosen": -43058336.0, "logits/rejected": -60684091.07692308, "logps/chosen": -432.8409534801136, "logps/rejected": -534.7397085336538, "loss": 0.0248, "rewards/chosen": 8.46261943470348, "rewards/margins": 22.34682165826117, "rewards/rejected": -13.884202223557692, "step": 2923 }, { "epoch": 0.7316401851620168, "grad_norm": 9.8125, "kl": 3.0046985149383545, "learning_rate": 5e-06, "logits/chosen": -54814560.0, "logits/rejected": -31381462.85714286, "logps/chosen": -354.921044921875, "logps/rejected": -564.3184988839286, "loss": 0.013, "rewards/chosen": 7.616036987304687, "rewards/margins": 23.745526341029574, "rewards/rejected": -16.12948935372489, "step": 2924 }, { "epoch": 0.7318904041035906, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47689638.4, "logits/rejected": -55047698.28571428, "logps/chosen": -428.25849609375, "logps/rejected": -640.8875558035714, "loss": 0.0356, "rewards/chosen": 10.024781799316406, "rewards/margins": 25.258214024135043, "rewards/rejected": -15.233432224818639, "step": 2925 }, { "epoch": 0.7321406230451645, "grad_norm": 2.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35587772.44444445, "logits/rejected": -27798566.4, "logps/chosen": -388.9150119357639, "logps/rejected": -520.5255533854166, "loss": 0.0121, "rewards/chosen": 9.299229092068142, "rewards/margins": 27.011639573838977, "rewards/rejected": -17.712410481770835, "step": 2926 }, { "epoch": 0.7323908419867384, "grad_norm": 3.09375, "kl": 2.576671600341797, "learning_rate": 5e-06, "logits/chosen": -78720736.0, "logits/rejected": -73070960.0, "logps/chosen": -496.5307210286458, "logps/rejected": -702.7225748697916, "loss": 0.0132, "rewards/chosen": 7.878293355305989, "rewards/margins": 26.099018096923828, "rewards/rejected": -18.22072474161784, "step": 2927 }, { "epoch": 0.7326410609283123, "grad_norm": 16.0, "kl": 6.576563835144043, "learning_rate": 5e-06, "logits/chosen": -13577972.0, "logits/rejected": -47587978.666666664, "logps/chosen": -381.9519856770833, "logps/rejected": -708.8515625, "loss": 0.0432, "rewards/chosen": 8.774944305419922, "rewards/margins": 28.3846918741862, "rewards/rejected": -19.609747568766277, "step": 2928 }, { "epoch": 0.7328912798698861, "grad_norm": 2.75, "kl": 0.8835741877555847, "learning_rate": 5e-06, "logits/chosen": -29939008.0, "logits/rejected": -59251974.4, "logps/chosen": -390.05130440848217, "logps/rejected": -686.340869140625, "loss": 0.0437, "rewards/chosen": 9.324417114257812, "rewards/margins": 31.848109436035156, "rewards/rejected": -22.523692321777343, "step": 2929 }, { "epoch": 0.7331414988114601, "grad_norm": 1.7109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -89944221.53846154, "logits/rejected": -72079773.0909091, "logps/chosen": -435.64310396634613, "logps/rejected": -723.6170099431819, "loss": 0.015, "rewards/chosen": 8.803032508263222, "rewards/margins": 31.64544528347629, "rewards/rejected": -22.842412775213067, "step": 2930 }, { "epoch": 0.7333917177530339, "grad_norm": 1.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58497408.0, "logits/rejected": -35028264.0, "logps/chosen": -510.8992106119792, "logps/rejected": -672.0108235677084, "loss": 0.0016, "rewards/chosen": 9.429258346557617, "rewards/margins": 30.553325017293293, "rewards/rejected": -21.124066670735676, "step": 2931 }, { "epoch": 0.7336419366946078, "grad_norm": 1.5625, "kl": 0.9634284973144531, "learning_rate": 5e-06, "logits/chosen": -42695528.72727273, "logits/rejected": -45854798.76923077, "logps/chosen": -410.88059303977275, "logps/rejected": -517.5489783653846, "loss": 0.0022, "rewards/chosen": 9.711400812322443, "rewards/margins": 23.893248017851292, "rewards/rejected": -14.181847205528847, "step": 2932 }, { "epoch": 0.7338921556361817, "grad_norm": 13.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69670574.54545455, "logits/rejected": -44802038.15384615, "logps/chosen": -401.51669034090907, "logps/rejected": -515.5558894230769, "loss": 0.0206, "rewards/chosen": 9.228928305886008, "rewards/margins": 23.7736283415681, "rewards/rejected": -14.54470003568209, "step": 2933 }, { "epoch": 0.7341423745777556, "grad_norm": 5.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45046835.2, "logits/rejected": -50042244.571428575, "logps/chosen": -329.166845703125, "logps/rejected": -646.5944475446429, "loss": 0.0198, "rewards/chosen": 7.034855651855469, "rewards/margins": 23.91043439592634, "rewards/rejected": -16.87557874407087, "step": 2934 }, { "epoch": 0.7343925935193294, "grad_norm": 1.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34650413.71428572, "logits/rejected": -50884422.4, "logps/chosen": -338.978759765625, "logps/rejected": -596.565283203125, "loss": 0.0268, "rewards/chosen": 8.019074031284877, "rewards/margins": 29.139529963902064, "rewards/rejected": -21.120455932617187, "step": 2935 }, { "epoch": 0.7346428124609032, "grad_norm": 1.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54149321.84615385, "logits/rejected": -50369792.0, "logps/chosen": -406.34867037259613, "logps/rejected": -593.7598544034091, "loss": 0.0291, "rewards/chosen": 9.055301372821514, "rewards/margins": 27.88132065993089, "rewards/rejected": -18.826019287109375, "step": 2936 }, { "epoch": 0.7348930314024772, "grad_norm": 6.4375, "kl": 0.33437079191207886, "learning_rate": 5e-06, "logits/chosen": -72834122.66666667, "logits/rejected": -76415898.66666667, "logps/chosen": -329.2901611328125, "logps/rejected": -690.5845540364584, "loss": 0.1164, "rewards/chosen": 6.727304458618164, "rewards/margins": 28.634594599405926, "rewards/rejected": -21.90729014078776, "step": 2937 }, { "epoch": 0.735143250344051, "grad_norm": 2.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63678432.0, "logits/rejected": -70501248.0, "logps/chosen": -435.1626383463542, "logps/rejected": -518.6283365885416, "loss": 0.0021, "rewards/chosen": 9.546963373819986, "rewards/margins": 23.129985173543293, "rewards/rejected": -13.583021799723307, "step": 2938 }, { "epoch": 0.7353934692856249, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67074065.45454545, "logits/rejected": -48072832.0, "logps/chosen": -296.30171342329544, "logps/rejected": -751.7020733173077, "loss": 0.0588, "rewards/chosen": 4.480669888583097, "rewards/margins": 23.95682979130245, "rewards/rejected": -19.476159902719353, "step": 2939 }, { "epoch": 0.7356436882271988, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -87865914.18181819, "logits/rejected": -88435101.53846154, "logps/chosen": -387.6380060369318, "logps/rejected": -876.8295522836538, "loss": 0.0209, "rewards/chosen": 7.457321860573509, "rewards/margins": 30.319434052580718, "rewards/rejected": -22.86211219200721, "step": 2940 }, { "epoch": 0.7358939071687727, "grad_norm": 2.640625, "kl": 3.472078323364258, "learning_rate": 5e-06, "logits/chosen": -1899634.6666666667, "logits/rejected": -76914805.33333333, "logps/chosen": -258.3375651041667, "logps/rejected": -655.2002766927084, "loss": 0.0496, "rewards/chosen": 7.267150243123372, "rewards/margins": 24.603605270385742, "rewards/rejected": -17.33645502726237, "step": 2941 }, { "epoch": 0.7361441261103465, "grad_norm": 0.62109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49618426.18181818, "logits/rejected": -47196440.615384616, "logps/chosen": -395.06063565340907, "logps/rejected": -590.3279371995193, "loss": 0.0014, "rewards/chosen": 9.711784362792969, "rewards/margins": 29.8503905076247, "rewards/rejected": -20.13860614483173, "step": 2942 }, { "epoch": 0.7363943450519205, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51202466.90909091, "logits/rejected": -60918606.76923077, "logps/chosen": -473.97567471590907, "logps/rejected": -622.8345853365385, "loss": 0.0151, "rewards/chosen": 8.358546170321377, "rewards/margins": 25.67778116506296, "rewards/rejected": -17.319234994741585, "step": 2943 }, { "epoch": 0.7366445639934943, "grad_norm": 6.71875, "kl": 1.7836418151855469, "learning_rate": 5e-06, "logits/chosen": -27899979.636363637, "logits/rejected": -35651584.0, "logps/chosen": -388.77689985795456, "logps/rejected": -888.5279447115385, "loss": 0.0358, "rewards/chosen": 8.221864180131393, "rewards/margins": 37.125407505702306, "rewards/rejected": -28.903543325570915, "step": 2944 }, { "epoch": 0.7368947829350682, "grad_norm": 10.125, "kl": 4.3756208419799805, "learning_rate": 5e-06, "logits/chosen": -49095974.4, "logits/rejected": -55985464.88888889, "logps/chosen": -358.934375, "logps/rejected": -759.603515625, "loss": 0.032, "rewards/chosen": 6.598445129394531, "rewards/margins": 25.893936665852863, "rewards/rejected": -19.295491536458332, "step": 2945 }, { "epoch": 0.7371450018766421, "grad_norm": 10.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -77458002.28571428, "logits/rejected": -36475494.4, "logps/chosen": -438.35982840401783, "logps/rejected": -576.227734375, "loss": 0.0342, "rewards/chosen": 7.649029867989676, "rewards/margins": 26.945779745919364, "rewards/rejected": -19.296749877929688, "step": 2946 }, { "epoch": 0.737395220818216, "grad_norm": 3.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25656273.454545453, "logits/rejected": -26174281.846153848, "logps/chosen": -385.1710759943182, "logps/rejected": -515.7978140024038, "loss": 0.0184, "rewards/chosen": 6.387058604847301, "rewards/margins": 22.33079123330283, "rewards/rejected": -15.943732628455528, "step": 2947 }, { "epoch": 0.7376454397597898, "grad_norm": 11.0625, "kl": 2.1957292556762695, "learning_rate": 5e-06, "logits/chosen": -24279021.714285713, "logits/rejected": -72876326.4, "logps/chosen": -350.51820591517856, "logps/rejected": -663.02265625, "loss": 0.0872, "rewards/chosen": 7.170877729143415, "rewards/margins": 23.55446079799107, "rewards/rejected": -16.383583068847656, "step": 2948 }, { "epoch": 0.7378956587013636, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83296090.66666667, "logits/rejected": -43526138.666666664, "logps/chosen": -461.2036539713542, "logps/rejected": -671.3147786458334, "loss": 0.014, "rewards/chosen": 9.120169321695963, "rewards/margins": 27.264695485432945, "rewards/rejected": -18.14452616373698, "step": 2949 }, { "epoch": 0.7381458776429376, "grad_norm": 2.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55780235.63636363, "logits/rejected": -22258387.692307692, "logps/chosen": -493.10014204545456, "logps/rejected": -507.0874774639423, "loss": 0.0201, "rewards/chosen": 8.55664756081321, "rewards/margins": 21.278862479683404, "rewards/rejected": -12.722214918870192, "step": 2950 }, { "epoch": 0.7383960965845114, "grad_norm": 5.6875, "kl": 0.29381608963012695, "learning_rate": 5e-06, "logits/chosen": -50227168.0, "logits/rejected": -74008890.66666667, "logps/chosen": -332.99159071180554, "logps/rejected": -817.2894694010416, "loss": 0.0255, "rewards/chosen": 7.9797253078884545, "rewards/margins": 28.087435828314888, "rewards/rejected": -20.107710520426433, "step": 2951 }, { "epoch": 0.7386463155260853, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40485405.538461536, "logits/rejected": -62266839.27272727, "logps/chosen": -300.7621319110577, "logps/rejected": -791.8264382102273, "loss": 0.0629, "rewards/chosen": 7.577552208533654, "rewards/margins": 28.469418719098282, "rewards/rejected": -20.89186651056463, "step": 2952 }, { "epoch": 0.7388965344676592, "grad_norm": 5.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50676877.71428572, "logits/rejected": -62268723.2, "logps/chosen": -325.12894112723217, "logps/rejected": -737.095068359375, "loss": 0.037, "rewards/chosen": 7.825897216796875, "rewards/margins": 29.293589782714843, "rewards/rejected": -21.467692565917968, "step": 2953 }, { "epoch": 0.7391467534092331, "grad_norm": 12.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55917870.54545455, "logits/rejected": -43009624.615384616, "logps/chosen": -307.96388938210225, "logps/rejected": -596.3366135817307, "loss": 0.0296, "rewards/chosen": 6.905091025612571, "rewards/margins": 19.052472147908244, "rewards/rejected": -12.147381122295673, "step": 2954 }, { "epoch": 0.7393969723508069, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35241242.666666664, "logits/rejected": -49745301.333333336, "logps/chosen": -412.0762125651042, "logps/rejected": -644.2266845703125, "loss": 0.017, "rewards/chosen": 8.869913101196289, "rewards/margins": 27.08336067199707, "rewards/rejected": -18.21344757080078, "step": 2955 }, { "epoch": 0.7396471912923809, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24585198.545454547, "logits/rejected": -56450372.92307692, "logps/chosen": -466.87056107954544, "logps/rejected": -762.9330679086538, "loss": 0.0306, "rewards/chosen": 9.220947265625, "rewards/margins": 28.915724534254807, "rewards/rejected": -19.694777268629807, "step": 2956 }, { "epoch": 0.7398974102339547, "grad_norm": 6.25, "kl": 2.567213773727417, "learning_rate": 5e-06, "logits/chosen": -32204025.6, "logits/rejected": -36917560.88888889, "logps/chosen": -370.80201822916666, "logps/rejected": -656.787109375, "loss": 0.0251, "rewards/chosen": 8.222299702962239, "rewards/margins": 22.904616800944012, "rewards/rejected": -14.682317097981771, "step": 2957 }, { "epoch": 0.7401476291755286, "grad_norm": 3.875, "kl": 9.299044609069824, "learning_rate": 5e-06, "logits/chosen": -52585413.81818182, "logits/rejected": -38020558.76923077, "logps/chosen": -390.34428267045456, "logps/rejected": -632.4707782451923, "loss": 0.0744, "rewards/chosen": 9.027475530450994, "rewards/margins": 28.22293944458861, "rewards/rejected": -19.19546391413762, "step": 2958 }, { "epoch": 0.7403978481171025, "grad_norm": 0.9765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45409446.4, "logits/rejected": -16195092.57142857, "logps/chosen": -291.617626953125, "logps/rejected": -841.8005022321429, "loss": 0.0163, "rewards/chosen": 7.878179931640625, "rewards/margins": 28.662653895786832, "rewards/rejected": -20.784473964146205, "step": 2959 }, { "epoch": 0.7406480670586764, "grad_norm": 4.90625, "kl": 10.160751342773438, "learning_rate": 5e-06, "logits/chosen": -61351449.6, "logits/rejected": -42780306.28571428, "logps/chosen": -563.00634765625, "logps/rejected": -519.4230608258929, "loss": 0.0064, "rewards/chosen": 11.598554229736328, "rewards/margins": 23.98524682181222, "rewards/rejected": -12.386692592075892, "step": 2960 }, { "epoch": 0.7408982860002502, "grad_norm": 19.5, "kl": 9.81986141204834, "learning_rate": 5e-06, "logits/chosen": -12199661.47368421, "logits/rejected": -25652560.0, "logps/chosen": -345.3108552631579, "logps/rejected": -449.408447265625, "loss": 0.0786, "rewards/chosen": 7.412800035978618, "rewards/margins": 22.00080357601768, "rewards/rejected": -14.588003540039063, "step": 2961 }, { "epoch": 0.741148504941824, "grad_norm": 14.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60431061.333333336, "logits/rejected": -25901303.466666665, "logps/chosen": -406.03138563368054, "logps/rejected": -545.0197265625, "loss": 0.0625, "rewards/chosen": 9.985044691297743, "rewards/margins": 22.85797356499566, "rewards/rejected": -12.872928873697917, "step": 2962 }, { "epoch": 0.741398723883398, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37519982.93333333, "logits/rejected": -32425891.555555556, "logps/chosen": -352.4009114583333, "logps/rejected": -728.9539388020834, "loss": 0.0217, "rewards/chosen": 9.596675618489583, "rewards/margins": 25.51347452799479, "rewards/rejected": -15.916798909505209, "step": 2963 }, { "epoch": 0.7416489428249718, "grad_norm": 0.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43942478.76923077, "logits/rejected": -24516846.545454547, "logps/chosen": -449.21146334134613, "logps/rejected": -455.12619850852275, "loss": 0.0021, "rewards/chosen": 9.571009709284855, "rewards/margins": 22.927006968251476, "rewards/rejected": -13.35599725896662, "step": 2964 }, { "epoch": 0.7418991617665457, "grad_norm": 9.4375, "kl": 0.03052012249827385, "learning_rate": 5e-06, "logits/chosen": -41680084.0, "logits/rejected": -21839496.0, "logps/chosen": -286.52325439453125, "logps/rejected": -364.37060546875, "loss": 0.0692, "rewards/chosen": 6.339506149291992, "rewards/margins": 20.323403358459473, "rewards/rejected": -13.98389720916748, "step": 2965 }, { "epoch": 0.7421493807081196, "grad_norm": 3.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -81365592.0, "logits/rejected": -44765304.0, "logps/chosen": -330.15960693359375, "logps/rejected": -563.4014892578125, "loss": 0.0408, "rewards/chosen": 6.771461009979248, "rewards/margins": 20.151944637298584, "rewards/rejected": -13.380483627319336, "step": 2966 }, { "epoch": 0.7423995996496935, "grad_norm": 11.625, "kl": 9.78278923034668, "learning_rate": 5e-06, "logits/chosen": -43358208.0, "logits/rejected": -27435324.444444444, "logps/chosen": -406.5504557291667, "logps/rejected": -589.2847222222222, "loss": 0.0783, "rewards/chosen": 8.017154947916667, "rewards/margins": 22.23306342230903, "rewards/rejected": -14.21590847439236, "step": 2967 }, { "epoch": 0.7426498185912673, "grad_norm": 21.5, "kl": 8.140419006347656, "learning_rate": 5e-06, "logits/chosen": -40114585.6, "logits/rejected": -13866734.222222222, "logps/chosen": -368.9765299479167, "logps/rejected": -728.7565104166666, "loss": 0.0326, "rewards/chosen": 8.143738810221354, "rewards/margins": 25.86013895670573, "rewards/rejected": -17.716400146484375, "step": 2968 }, { "epoch": 0.7429000375328413, "grad_norm": 6.15625, "kl": 1.3197168111801147, "learning_rate": 5e-06, "logits/chosen": -58030759.384615384, "logits/rejected": -55027019.63636363, "logps/chosen": -352.02095853365387, "logps/rejected": -688.3469460227273, "loss": 0.0379, "rewards/chosen": 6.4862518310546875, "rewards/margins": 21.586529818448156, "rewards/rejected": -15.100277987393467, "step": 2969 }, { "epoch": 0.7431502564744151, "grad_norm": 1.609375, "kl": 17.18589973449707, "learning_rate": 5e-06, "logits/chosen": -49485732.0, "logits/rejected": -104854720.0, "logps/chosen": -339.5189514160156, "logps/rejected": -717.4119873046875, "loss": 0.095, "rewards/chosen": 8.353102684020996, "rewards/margins": 24.882023811340332, "rewards/rejected": -16.528921127319336, "step": 2970 }, { "epoch": 0.743400475415989, "grad_norm": 3.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42508022.15384615, "logits/rejected": -113613067.63636364, "logps/chosen": -347.6548602764423, "logps/rejected": -687.1920276988636, "loss": 0.0211, "rewards/chosen": 8.665538494403545, "rewards/margins": 27.968358873487354, "rewards/rejected": -19.30282037908381, "step": 2971 }, { "epoch": 0.7436506943575628, "grad_norm": 5.4375, "kl": 11.856978416442871, "learning_rate": 5e-06, "logits/chosen": -67924996.57142857, "logits/rejected": -22442825.6, "logps/chosen": -357.82589285714283, "logps/rejected": -481.322216796875, "loss": 0.0256, "rewards/chosen": 8.960338592529297, "rewards/margins": 21.452692413330077, "rewards/rejected": -12.492353820800782, "step": 2972 }, { "epoch": 0.7439009132991368, "grad_norm": 1.7734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38896640.0, "logits/rejected": -72241326.54545455, "logps/chosen": -437.0188551682692, "logps/rejected": -694.5090553977273, "loss": 0.0023, "rewards/chosen": 10.424429086538462, "rewards/margins": 26.08788539646389, "rewards/rejected": -15.663456309925426, "step": 2973 }, { "epoch": 0.7441511322407106, "grad_norm": 6.71875, "kl": 0.38140615820884705, "learning_rate": 5e-06, "logits/chosen": -48950171.428571425, "logits/rejected": -72763993.6, "logps/chosen": -342.34273856026783, "logps/rejected": -750.2884765625, "loss": 0.0324, "rewards/chosen": 7.345164707728794, "rewards/margins": 23.47940237862723, "rewards/rejected": -16.134237670898436, "step": 2974 }, { "epoch": 0.7444013511822845, "grad_norm": 12.4375, "kl": 3.899538278579712, "learning_rate": 5e-06, "logits/chosen": -37282176.0, "logits/rejected": 1396354.4, "logps/chosen": -363.71212332589283, "logps/rejected": -597.833251953125, "loss": 0.0477, "rewards/chosen": 9.398117065429688, "rewards/margins": 22.14468002319336, "rewards/rejected": -12.746562957763672, "step": 2975 }, { "epoch": 0.7446515701238584, "grad_norm": 3.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34198372.0, "logits/rejected": -40790200.0, "logps/chosen": -274.5306701660156, "logps/rejected": -589.3925170898438, "loss": 0.0369, "rewards/chosen": 7.223034858703613, "rewards/margins": 22.51541233062744, "rewards/rejected": -15.292377471923828, "step": 2976 }, { "epoch": 0.7449017890654323, "grad_norm": 6.1875, "kl": 1.7482306957244873, "learning_rate": 5e-06, "logits/chosen": -48254997.333333336, "logits/rejected": -82607473.77777778, "logps/chosen": -434.34983723958334, "logps/rejected": -483.54969618055554, "loss": 0.0391, "rewards/chosen": 8.5630615234375, "rewards/margins": 19.870081922743054, "rewards/rejected": -11.307020399305555, "step": 2977 }, { "epoch": 0.7451520080070061, "grad_norm": 3.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33927266.666666664, "logits/rejected": -37798794.666666664, "logps/chosen": -283.9439697265625, "logps/rejected": -504.4635416666667, "loss": 0.0343, "rewards/chosen": 7.0659230550130205, "rewards/margins": 22.59581247965495, "rewards/rejected": -15.529889424641928, "step": 2978 }, { "epoch": 0.74540222694858, "grad_norm": 11.6875, "kl": 8.553287506103516, "learning_rate": 5e-06, "logits/chosen": -40960466.28571428, "logits/rejected": -65892723.2, "logps/chosen": -427.33443777901783, "logps/rejected": -676.719482421875, "loss": 0.0517, "rewards/chosen": 8.275485447474889, "rewards/margins": 23.598262241908483, "rewards/rejected": -15.322776794433594, "step": 2979 }, { "epoch": 0.7456524458901539, "grad_norm": 2.34375, "kl": 0.613861083984375, "learning_rate": 5e-06, "logits/chosen": -14989211.076923076, "logits/rejected": -51753105.45454545, "logps/chosen": -253.10486778846155, "logps/rejected": -691.1922940340909, "loss": 0.035, "rewards/chosen": 6.399891779972957, "rewards/margins": 25.441764671485743, "rewards/rejected": -19.041872891512785, "step": 2980 }, { "epoch": 0.7459026648317277, "grad_norm": 8.25, "kl": 2.1561267375946045, "learning_rate": 5e-06, "logits/chosen": -37089109.333333336, "logits/rejected": -54329349.333333336, "logps/chosen": -347.764892578125, "logps/rejected": -759.8035481770834, "loss": 0.0291, "rewards/chosen": 8.871681849161783, "rewards/margins": 23.838351567586262, "rewards/rejected": -14.966669718424479, "step": 2981 }, { "epoch": 0.7461528837733017, "grad_norm": 3.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72515507.2, "logits/rejected": -60621284.571428575, "logps/chosen": -533.13037109375, "logps/rejected": -499.085693359375, "loss": 0.0196, "rewards/chosen": 12.613286590576172, "rewards/margins": 27.533389391217913, "rewards/rejected": -14.920102800641741, "step": 2982 }, { "epoch": 0.7464031027148755, "grad_norm": 1.2109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29078661.333333332, "logits/rejected": -48593381.333333336, "logps/chosen": -346.7162679036458, "logps/rejected": -597.641357421875, "loss": 0.0132, "rewards/chosen": 9.14849853515625, "rewards/margins": 24.08013153076172, "rewards/rejected": -14.931632995605469, "step": 2983 }, { "epoch": 0.7466533216564494, "grad_norm": 3.015625, "kl": 11.07688045501709, "learning_rate": 5e-06, "logits/chosen": -49433834.666666664, "logits/rejected": -71199367.1111111, "logps/chosen": -412.8169921875, "logps/rejected": -430.26915147569446, "loss": 0.0049, "rewards/chosen": 10.532106526692708, "rewards/margins": 21.990336439344617, "rewards/rejected": -11.458229912651909, "step": 2984 }, { "epoch": 0.7469035405980232, "grad_norm": 10.9375, "kl": 9.502801895141602, "learning_rate": 5e-06, "logits/chosen": -34099876.0, "logits/rejected": -48263896.0, "logps/chosen": -320.6567687988281, "logps/rejected": -480.9551086425781, "loss": 0.0763, "rewards/chosen": 8.094695091247559, "rewards/margins": 23.50810146331787, "rewards/rejected": -15.413406372070312, "step": 2985 }, { "epoch": 0.7471537595395972, "grad_norm": 14.9375, "kl": 22.94472885131836, "learning_rate": 5e-06, "logits/chosen": -59953979.07692308, "logits/rejected": -35467269.81818182, "logps/chosen": -392.5456730769231, "logps/rejected": -715.5534002130681, "loss": 0.08, "rewards/chosen": 10.32413541353666, "rewards/margins": 28.636009803185097, "rewards/rejected": -18.311874389648438, "step": 2986 }, { "epoch": 0.747403978481171, "grad_norm": 11.625, "kl": 1.1233184337615967, "learning_rate": 5e-06, "logits/chosen": -66068166.4, "logits/rejected": -43138884.571428575, "logps/chosen": -396.4162109375, "logps/rejected": -550.4259556361607, "loss": 0.0541, "rewards/chosen": 9.839991760253906, "rewards/margins": 20.280921718052454, "rewards/rejected": -10.440929957798549, "step": 2987 }, { "epoch": 0.7476541974227449, "grad_norm": 2.734375, "kl": 3.221097469329834, "learning_rate": 5e-06, "logits/chosen": -18861942.153846152, "logits/rejected": -24949486.545454547, "logps/chosen": -313.9387958233173, "logps/rejected": -503.43110795454544, "loss": 0.0625, "rewards/chosen": 6.365314190204327, "rewards/margins": 21.060625303041682, "rewards/rejected": -14.695311112837357, "step": 2988 }, { "epoch": 0.7479044163643188, "grad_norm": 8.125, "kl": 11.11878776550293, "learning_rate": 5e-06, "logits/chosen": -50350862.76923077, "logits/rejected": -34689320.72727273, "logps/chosen": -479.5060847355769, "logps/rejected": -444.9208984375, "loss": 0.0126, "rewards/chosen": 10.012110783503605, "rewards/margins": 21.642028968650976, "rewards/rejected": -11.629918185147373, "step": 2989 }, { "epoch": 0.7481546353058927, "grad_norm": 6.0, "kl": 11.338481903076172, "learning_rate": 5e-06, "logits/chosen": -43079992.0, "logits/rejected": -66883792.0, "logps/chosen": -469.964111328125, "logps/rejected": -660.2730712890625, "loss": 0.0147, "rewards/chosen": 10.641860008239746, "rewards/margins": 25.54690170288086, "rewards/rejected": -14.905041694641113, "step": 2990 }, { "epoch": 0.7484048542474665, "grad_norm": 22.875, "kl": 7.835423946380615, "learning_rate": 5e-06, "logits/chosen": -32104174.933333334, "logits/rejected": -10760193.777777778, "logps/chosen": -394.1509114583333, "logps/rejected": -656.6400824652778, "loss": 0.0681, "rewards/chosen": 8.584361775716145, "rewards/margins": 23.711491224500868, "rewards/rejected": -15.127129448784721, "step": 2991 }, { "epoch": 0.7486550731890405, "grad_norm": 6.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32586333.333333332, "logits/rejected": -11311970.666666666, "logps/chosen": -402.3231608072917, "logps/rejected": -730.009765625, "loss": 0.0494, "rewards/chosen": 9.422327677408854, "rewards/margins": 29.532058715820312, "rewards/rejected": -20.109731038411457, "step": 2992 }, { "epoch": 0.7489052921306143, "grad_norm": 5.90625, "kl": 1.0760133266448975, "learning_rate": 5e-06, "logits/chosen": -52827292.8, "logits/rejected": -16611166.857142856, "logps/chosen": -443.4484375, "logps/rejected": -631.5638253348214, "loss": 0.0192, "rewards/chosen": 9.146554565429687, "rewards/margins": 26.21226545061384, "rewards/rejected": -17.065710885184153, "step": 2993 }, { "epoch": 0.7491555110721881, "grad_norm": 4.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62099514.18181818, "logits/rejected": -64894946.461538464, "logps/chosen": -415.0498046875, "logps/rejected": -728.9068509615385, "loss": 0.0612, "rewards/chosen": 9.971081126819957, "rewards/margins": 25.58747634354171, "rewards/rejected": -15.616395216721754, "step": 2994 }, { "epoch": 0.7494057300137621, "grad_norm": 10.25, "kl": 3.3944811820983887, "learning_rate": 5e-06, "logits/chosen": -82164760.61538461, "logits/rejected": -49706496.0, "logps/chosen": -397.4314152644231, "logps/rejected": -761.3055752840909, "loss": 0.0506, "rewards/chosen": 7.565390953650842, "rewards/margins": 31.46845880254999, "rewards/rejected": -23.90306784889915, "step": 2995 }, { "epoch": 0.7496559489553359, "grad_norm": 8.875, "kl": 1.0490138530731201, "learning_rate": 5e-06, "logits/chosen": -38914734.54545455, "logits/rejected": -27044667.076923076, "logps/chosen": -413.35373757102275, "logps/rejected": -360.2961989182692, "loss": 0.0566, "rewards/chosen": 8.48189267245206, "rewards/margins": 18.102880784681627, "rewards/rejected": -9.620988112229567, "step": 2996 }, { "epoch": 0.7499061678969098, "grad_norm": 2.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62856738.90909091, "logits/rejected": -13626338.461538462, "logps/chosen": -485.50834517045456, "logps/rejected": -637.4113581730769, "loss": 0.0045, "rewards/chosen": 8.025883067737926, "rewards/margins": 26.146242048356918, "rewards/rejected": -18.12035898061899, "step": 2997 }, { "epoch": 0.7501563868384836, "grad_norm": 1.8203125, "kl": 18.09365463256836, "learning_rate": 5e-06, "logits/chosen": -50235293.86666667, "logits/rejected": -52473194.666666664, "logps/chosen": -394.6966145833333, "logps/rejected": -527.9289279513889, "loss": 0.0283, "rewards/chosen": 9.308648681640625, "rewards/margins": 24.848753865559896, "rewards/rejected": -15.540105183919271, "step": 2998 }, { "epoch": 0.7504066057800576, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22388499.692307692, "logits/rejected": -69936779.63636364, "logps/chosen": -402.82962740384613, "logps/rejected": -725.9158380681819, "loss": 0.028, "rewards/chosen": 7.967340909517729, "rewards/margins": 25.986092574112902, "rewards/rejected": -18.01875166459517, "step": 2999 }, { "epoch": 0.7506568247216314, "grad_norm": 1.7734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -76999744.0, "logits/rejected": -63018724.571428575, "logps/chosen": -417.80234375, "logps/rejected": -796.9739118303571, "loss": 0.0131, "rewards/chosen": 9.845954895019531, "rewards/margins": 33.757940019880024, "rewards/rejected": -23.91198512486049, "step": 3000 }, { "epoch": 0.7509070436632053, "grad_norm": 10.125, "kl": 1.8281219005584717, "learning_rate": 5e-06, "logits/chosen": -54846262.85714286, "logits/rejected": -44065548.8, "logps/chosen": -475.9663783482143, "logps/rejected": -626.612744140625, "loss": 0.0161, "rewards/chosen": 10.405157906668526, "rewards/margins": 29.78905988420759, "rewards/rejected": -19.38390197753906, "step": 3001 }, { "epoch": 0.7511572626047792, "grad_norm": 1.3515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54012590.54545455, "logits/rejected": -38756187.07692308, "logps/chosen": -453.06582919034093, "logps/rejected": -579.7786207932693, "loss": 0.0016, "rewards/chosen": 10.06178144975142, "rewards/margins": 24.92873889416248, "rewards/rejected": -14.866957444411058, "step": 3002 }, { "epoch": 0.7514074815463531, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58223310.222222224, "logits/rejected": -50867575.46666667, "logps/chosen": -405.23155381944446, "logps/rejected": -686.0877604166667, "loss": 0.0196, "rewards/chosen": 8.472842746310764, "rewards/margins": 27.707767062717014, "rewards/rejected": -19.23492431640625, "step": 3003 }, { "epoch": 0.7516577004879269, "grad_norm": 12.0, "kl": 0.04863230511546135, "learning_rate": 5e-06, "logits/chosen": -50753301.333333336, "logits/rejected": -28677317.333333332, "logps/chosen": -337.3025716145833, "logps/rejected": -470.729248046875, "loss": 0.0479, "rewards/chosen": 8.101856231689453, "rewards/margins": 23.561314900716148, "rewards/rejected": -15.459458669026693, "step": 3004 }, { "epoch": 0.7519079194295009, "grad_norm": 0.126953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61957233.777777776, "logits/rejected": -46115251.2, "logps/chosen": -405.4639485677083, "logps/rejected": -768.2690104166667, "loss": 0.0003, "rewards/chosen": 8.119927300347221, "rewards/margins": 33.370000542534726, "rewards/rejected": -25.2500732421875, "step": 3005 }, { "epoch": 0.7521581383710747, "grad_norm": 5.71875, "kl": 1.2892125844955444, "learning_rate": 5e-06, "logits/chosen": -67372501.33333333, "logits/rejected": -68092364.8, "logps/chosen": -448.887451171875, "logps/rejected": -645.5022786458334, "loss": 0.0134, "rewards/chosen": 8.635713365342882, "rewards/margins": 27.38247850206163, "rewards/rejected": -18.74676513671875, "step": 3006 }, { "epoch": 0.7524083573126485, "grad_norm": 0.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30905120.0, "logits/rejected": -46393743.058823526, "logps/chosen": -329.446533203125, "logps/rejected": -614.9485294117648, "loss": 0.0013, "rewards/chosen": 8.632052285330635, "rewards/margins": 27.0705539799538, "rewards/rejected": -18.438501694623163, "step": 3007 }, { "epoch": 0.7526585762542225, "grad_norm": 3.125, "kl": 2.888256072998047, "learning_rate": 5e-06, "logits/chosen": -32075693.333333332, "logits/rejected": -20852284.0, "logps/chosen": -382.2194010416667, "logps/rejected": -442.8057047526042, "loss": 0.0606, "rewards/chosen": 10.252699534098307, "rewards/margins": 19.899840037027992, "rewards/rejected": -9.647140502929688, "step": 3008 }, { "epoch": 0.7529087951957963, "grad_norm": 9.9375, "kl": 7.660856246948242, "learning_rate": 5e-06, "logits/chosen": -35853926.4, "logits/rejected": -41917385.14285714, "logps/chosen": -342.877197265625, "logps/rejected": -409.59898158482144, "loss": 0.0914, "rewards/chosen": 6.096869659423828, "rewards/margins": 18.62328153337751, "rewards/rejected": -12.526411873953682, "step": 3009 }, { "epoch": 0.7531590141373702, "grad_norm": 1.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69457749.33333333, "logits/rejected": -50875912.53333333, "logps/chosen": -339.67206488715277, "logps/rejected": -554.9678385416667, "loss": 0.0301, "rewards/chosen": 6.507584889729817, "rewards/margins": 23.42615585327148, "rewards/rejected": -16.918570963541665, "step": 3010 }, { "epoch": 0.753409233078944, "grad_norm": 0.65234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41114216.0, "logits/rejected": -31974850.666666668, "logps/chosen": -439.2692057291667, "logps/rejected": -519.2866617838541, "loss": 0.0012, "rewards/chosen": 10.649532953898111, "rewards/margins": 24.9608097076416, "rewards/rejected": -14.31127675374349, "step": 3011 }, { "epoch": 0.753659452020518, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43821417.6, "logits/rejected": -43697280.0, "logps/chosen": -471.659619140625, "logps/rejected": -650.6641322544643, "loss": 0.0233, "rewards/chosen": 7.995932769775391, "rewards/margins": 25.316506849016463, "rewards/rejected": -17.320574079241073, "step": 3012 }, { "epoch": 0.7539096709620918, "grad_norm": 10.3125, "kl": 8.64933967590332, "learning_rate": 5e-06, "logits/chosen": -49860694.85714286, "logits/rejected": -62711168.0, "logps/chosen": -389.60777064732144, "logps/rejected": -680.565234375, "loss": 0.0629, "rewards/chosen": 8.606111798967634, "rewards/margins": 23.759310041155132, "rewards/rejected": -15.1531982421875, "step": 3013 }, { "epoch": 0.7541598899036657, "grad_norm": 2.328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49506026.666666664, "logits/rejected": -61228134.4, "logps/chosen": -308.4492458767361, "logps/rejected": -723.5415364583333, "loss": 0.0386, "rewards/chosen": 6.334142896864149, "rewards/margins": 24.827406650119357, "rewards/rejected": -18.49326375325521, "step": 3014 }, { "epoch": 0.7544101088452396, "grad_norm": 3.5625, "kl": 0.3426574170589447, "learning_rate": 5e-06, "logits/chosen": -31510262.153846152, "logits/rejected": -39597812.36363637, "logps/chosen": -381.72787710336536, "logps/rejected": -633.2786754261364, "loss": 0.0375, "rewards/chosen": 8.678029573880709, "rewards/margins": 26.689569379899886, "rewards/rejected": -18.011539806019176, "step": 3015 }, { "epoch": 0.7546603277868135, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43553898.666666664, "logits/rejected": -40713237.333333336, "logps/chosen": -401.7740071614583, "logps/rejected": -637.66845703125, "loss": 0.0173, "rewards/chosen": 8.446009318033854, "rewards/margins": 28.855069478352867, "rewards/rejected": -20.40906016031901, "step": 3016 }, { "epoch": 0.7549105467283873, "grad_norm": 2.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46425720.0, "logits/rejected": -42713692.0, "logps/chosen": -377.6973571777344, "logps/rejected": -679.115966796875, "loss": 0.0034, "rewards/chosen": 6.997097492218018, "rewards/margins": 27.958326816558838, "rewards/rejected": -20.96122932434082, "step": 3017 }, { "epoch": 0.7551607656699613, "grad_norm": 12.125, "kl": 10.50784683227539, "learning_rate": 5e-06, "logits/chosen": -42278912.0, "logits/rejected": -57328137.14285714, "logps/chosen": -383.8205997242647, "logps/rejected": -636.2880161830357, "loss": 0.0577, "rewards/chosen": 8.499478508444394, "rewards/margins": 28.35625515464975, "rewards/rejected": -19.856776646205358, "step": 3018 }, { "epoch": 0.7554109846115351, "grad_norm": 2.953125, "kl": 7.741909027099609, "learning_rate": 5e-06, "logits/chosen": -43221589.333333336, "logits/rejected": -54980288.0, "logps/chosen": -424.30237630208336, "logps/rejected": -761.2146809895834, "loss": 0.0102, "rewards/chosen": 10.029179890950521, "rewards/margins": 28.80080329047309, "rewards/rejected": -18.771623399522568, "step": 3019 }, { "epoch": 0.755661203553109, "grad_norm": 2.4375, "kl": 7.2004523277282715, "learning_rate": 5e-06, "logits/chosen": -55501452.8, "logits/rejected": -50906510.222222224, "logps/chosen": -310.1562825520833, "logps/rejected": -520.9491644965278, "loss": 0.0664, "rewards/chosen": 7.485165913899739, "rewards/margins": 22.12284257676866, "rewards/rejected": -14.637676662868923, "step": 3020 }, { "epoch": 0.7559114224946828, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41625974.85714286, "logits/rejected": -34001360.0, "logps/chosen": -381.34116908482144, "logps/rejected": -541.88271484375, "loss": 0.0079, "rewards/chosen": 7.873631068638393, "rewards/margins": 25.508109828404017, "rewards/rejected": -17.634478759765624, "step": 3021 }, { "epoch": 0.7561616414362567, "grad_norm": 2.640625, "kl": 8.345202445983887, "learning_rate": 5e-06, "logits/chosen": -49595253.333333336, "logits/rejected": -25759706.666666668, "logps/chosen": -371.2119954427083, "logps/rejected": -448.7355143229167, "loss": 0.0892, "rewards/chosen": 7.154866536458333, "rewards/margins": 18.299564361572266, "rewards/rejected": -11.144697825113932, "step": 3022 }, { "epoch": 0.7564118603778306, "grad_norm": 2.09375, "kl": 2.554166316986084, "learning_rate": 5e-06, "logits/chosen": -47801490.28571428, "logits/rejected": -37197817.6, "logps/chosen": -378.62465122767856, "logps/rejected": -571.64599609375, "loss": 0.0061, "rewards/chosen": 9.706957135881696, "rewards/margins": 30.476961408342632, "rewards/rejected": -20.770004272460938, "step": 3023 }, { "epoch": 0.7566620793194044, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -1643535.3846153845, "logits/rejected": -43807025.45454545, "logps/chosen": -206.16231595552884, "logps/rejected": -744.5750177556819, "loss": 0.1165, "rewards/chosen": 4.0923746549166164, "rewards/margins": 23.066243285065767, "rewards/rejected": -18.97386863014915, "step": 3024 }, { "epoch": 0.7569122982609784, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36291168.0, "logits/rejected": -64379228.0, "logps/chosen": -256.9881286621094, "logps/rejected": -678.6770629882812, "loss": 0.0831, "rewards/chosen": 5.054719924926758, "rewards/margins": 21.543819427490234, "rewards/rejected": -16.489099502563477, "step": 3025 }, { "epoch": 0.7571625172025522, "grad_norm": 1.4375, "kl": 6.154253959655762, "learning_rate": 5e-06, "logits/chosen": -46648011.63636363, "logits/rejected": -66373287.384615384, "logps/chosen": -405.66432883522725, "logps/rejected": -530.7264873798077, "loss": 0.0212, "rewards/chosen": 11.731084650213068, "rewards/margins": 22.837119149161385, "rewards/rejected": -11.106034498948317, "step": 3026 }, { "epoch": 0.7574127361441261, "grad_norm": 26.5, "kl": 11.78526496887207, "learning_rate": 5e-06, "logits/chosen": -18465640.615384616, "logits/rejected": -54347217.45454545, "logps/chosen": -448.1644756610577, "logps/rejected": -620.7837357954545, "loss": 0.0364, "rewards/chosen": 10.522921048677885, "rewards/margins": 20.262144182111832, "rewards/rejected": -9.73922313343395, "step": 3027 }, { "epoch": 0.7576629550857, "grad_norm": 3.171875, "kl": 8.21436882019043, "learning_rate": 5e-06, "logits/chosen": -40990001.23076923, "logits/rejected": -40971485.09090909, "logps/chosen": -370.52798227163464, "logps/rejected": -569.7867098721591, "loss": 0.0316, "rewards/chosen": 8.740723830003004, "rewards/margins": 24.976121168870193, "rewards/rejected": -16.235397338867188, "step": 3028 }, { "epoch": 0.7579131740272739, "grad_norm": 1.28125, "kl": 8.874314308166504, "learning_rate": 5e-06, "logits/chosen": -57312118.15384615, "logits/rejected": -66919883.63636363, "logps/chosen": -432.7990159254808, "logps/rejected": -703.6788441051136, "loss": 0.002, "rewards/chosen": 10.246924767127403, "rewards/margins": 27.889031256829107, "rewards/rejected": -17.642106489701703, "step": 3029 }, { "epoch": 0.7581633929688477, "grad_norm": 4.65625, "kl": 12.677990913391113, "learning_rate": 5e-06, "logits/chosen": -45911792.0, "logits/rejected": -55012660.0, "logps/chosen": -386.392333984375, "logps/rejected": -766.695556640625, "loss": 0.0434, "rewards/chosen": 9.33446216583252, "rewards/margins": 26.673619270324707, "rewards/rejected": -17.339157104492188, "step": 3030 }, { "epoch": 0.7584136119104217, "grad_norm": 3.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13912113.333333334, "logits/rejected": -61974108.44444445, "logps/chosen": -446.6190592447917, "logps/rejected": -581.9754774305555, "loss": 0.0162, "rewards/chosen": 7.01948356628418, "rewards/margins": 22.12471495734321, "rewards/rejected": -15.105231391059029, "step": 3031 }, { "epoch": 0.7586638308519955, "grad_norm": 3.953125, "kl": 10.596351623535156, "learning_rate": 5e-06, "logits/chosen": -58056000.0, "logits/rejected": -78050905.6, "logps/chosen": -398.57847377232144, "logps/rejected": -758.4318359375, "loss": 0.0444, "rewards/chosen": 8.558523450578962, "rewards/margins": 28.943778882707868, "rewards/rejected": -20.385255432128908, "step": 3032 }, { "epoch": 0.7589140497935694, "grad_norm": 5.4375, "kl": 16.013896942138672, "learning_rate": 5e-06, "logits/chosen": -29352867.76470588, "logits/rejected": -40619245.71428572, "logps/chosen": -407.60431985294116, "logps/rejected": -683.6116768973214, "loss": 0.0886, "rewards/chosen": 10.113013772403493, "rewards/margins": 28.51347197204077, "rewards/rejected": -18.400458199637278, "step": 3033 }, { "epoch": 0.7591642687351432, "grad_norm": 11.6875, "kl": 5.124140739440918, "learning_rate": 5e-06, "logits/chosen": -42533597.538461536, "logits/rejected": -18246498.90909091, "logps/chosen": -338.30235877403845, "logps/rejected": -369.61669921875, "loss": 0.0875, "rewards/chosen": 7.537200927734375, "rewards/margins": 16.929589011452414, "rewards/rejected": -9.39238808371804, "step": 3034 }, { "epoch": 0.7594144876767172, "grad_norm": 3.03125, "kl": 5.172099590301514, "learning_rate": 5e-06, "logits/chosen": -44596512.0, "logits/rejected": -52321866.666666664, "logps/chosen": -328.06646728515625, "logps/rejected": -729.0531412760416, "loss": 0.0416, "rewards/chosen": 9.115330378214518, "rewards/margins": 26.910765965779625, "rewards/rejected": -17.795435587565105, "step": 3035 }, { "epoch": 0.759664706618291, "grad_norm": 2.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47723562.666666664, "logits/rejected": -42728584.53333333, "logps/chosen": -382.64173719618054, "logps/rejected": -655.56875, "loss": 0.0042, "rewards/chosen": 9.192427741156685, "rewards/margins": 25.117186652289497, "rewards/rejected": -15.924758911132812, "step": 3036 }, { "epoch": 0.7599149255598648, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 138294001.7777778, "logits/rejected": -66285602.13333333, "logps/chosen": -305.75792100694446, "logps/rejected": -527.13994140625, "loss": 0.0561, "rewards/chosen": 6.09983656141493, "rewards/margins": 17.994315931532118, "rewards/rejected": -11.894479370117187, "step": 3037 }, { "epoch": 0.7601651445014388, "grad_norm": 4.40625, "kl": 5.025340557098389, "learning_rate": 5e-06, "logits/chosen": -37846953.14285714, "logits/rejected": -33823040.0, "logps/chosen": -394.14697265625, "logps/rejected": -618.312646484375, "loss": 0.0107, "rewards/chosen": 10.457151140485491, "rewards/margins": 24.93178492954799, "rewards/rejected": -14.4746337890625, "step": 3038 }, { "epoch": 0.7604153634430126, "grad_norm": 0.58984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30467680.0, "logits/rejected": -32711118.769230768, "logps/chosen": -445.6549183238636, "logps/rejected": -558.3936298076923, "loss": 0.0031, "rewards/chosen": 11.27105712890625, "rewards/margins": 24.782386192908653, "rewards/rejected": -13.511329064002403, "step": 3039 }, { "epoch": 0.7606655823845865, "grad_norm": 1.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42281235.2, "logits/rejected": -43926523.428571425, "logps/chosen": -447.6833984375, "logps/rejected": -599.5888671875, "loss": 0.0021, "rewards/chosen": 9.945236206054688, "rewards/margins": 25.887191772460938, "rewards/rejected": -15.94195556640625, "step": 3040 }, { "epoch": 0.7609158013261604, "grad_norm": 3.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24406550.153846152, "logits/rejected": -13871012.363636363, "logps/chosen": -311.2875225360577, "logps/rejected": -659.4736328125, "loss": 0.037, "rewards/chosen": 7.30583249605619, "rewards/margins": 21.446686377892128, "rewards/rejected": -14.140853881835938, "step": 3041 }, { "epoch": 0.7611660202677343, "grad_norm": 9.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49932899.55555555, "logits/rejected": -56336179.2, "logps/chosen": -355.6164279513889, "logps/rejected": -730.251171875, "loss": 0.0297, "rewards/chosen": 9.774457295735678, "rewards/margins": 26.44930674235026, "rewards/rejected": -16.674849446614584, "step": 3042 }, { "epoch": 0.7614162392093081, "grad_norm": 31.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56807040.0, "logits/rejected": 25440816.0, "logps/chosen": -326.1539306640625, "logps/rejected": -484.55615234375, "loss": 0.064, "rewards/chosen": 9.176647186279297, "rewards/margins": 20.138504573277064, "rewards/rejected": -10.961857386997767, "step": 3043 }, { "epoch": 0.7616664581508821, "grad_norm": 16.75, "kl": 9.875935554504395, "learning_rate": 5e-06, "logits/chosen": -19702426.0, "logits/rejected": -42361112.0, "logps/chosen": -368.391845703125, "logps/rejected": -434.53009033203125, "loss": 0.0393, "rewards/chosen": 9.276948928833008, "rewards/margins": 21.175695419311523, "rewards/rejected": -11.898746490478516, "step": 3044 }, { "epoch": 0.7619166770924559, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26994219.42857143, "logits/rejected": -37305974.4, "logps/chosen": -236.5732421875, "logps/rejected": -551.793017578125, "loss": 0.1203, "rewards/chosen": 5.238282884870257, "rewards/margins": 24.706700243268692, "rewards/rejected": -19.468417358398437, "step": 3045 }, { "epoch": 0.7621668960340298, "grad_norm": 2.453125, "kl": 8.878522872924805, "learning_rate": 5e-06, "logits/chosen": -19218248.888888888, "logits/rejected": -52900645.333333336, "logps/chosen": -335.7138943142361, "logps/rejected": -598.9007568359375, "loss": 0.0771, "rewards/chosen": 8.101626925998264, "rewards/margins": 20.776271396213108, "rewards/rejected": -12.674644470214844, "step": 3046 }, { "epoch": 0.7624171149756036, "grad_norm": 2.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36713486.54545455, "logits/rejected": -33648337.23076923, "logps/chosen": -337.3917125355114, "logps/rejected": -675.029296875, "loss": 0.0113, "rewards/chosen": 7.587115201083097, "rewards/margins": 25.822898704688868, "rewards/rejected": -18.23578350360577, "step": 3047 }, { "epoch": 0.7626673339171776, "grad_norm": 1.0078125, "kl": 8.387125968933105, "learning_rate": 5e-06, "logits/chosen": -64260174.76923077, "logits/rejected": -55776744.72727273, "logps/chosen": -521.5092022235577, "logps/rejected": -526.8179598721591, "loss": 0.013, "rewards/chosen": 9.756273709810698, "rewards/margins": 24.27290301556354, "rewards/rejected": -14.516629305752842, "step": 3048 }, { "epoch": 0.7629175528587514, "grad_norm": 2.859375, "kl": 0.2631978988647461, "learning_rate": 5e-06, "logits/chosen": -61607914.666666664, "logits/rejected": -41820837.333333336, "logps/chosen": -395.9764811197917, "logps/rejected": -643.2744140625, "loss": 0.0137, "rewards/chosen": 8.586085637410482, "rewards/margins": 25.19300397237142, "rewards/rejected": -16.606918334960938, "step": 3049 }, { "epoch": 0.7631677718003252, "grad_norm": 8.3125, "kl": 7.226790428161621, "learning_rate": 5e-06, "logits/chosen": -32808349.714285713, "logits/rejected": -23309464.0, "logps/chosen": -471.83192661830356, "logps/rejected": -536.179931640625, "loss": 0.0541, "rewards/chosen": 9.304013933454241, "rewards/margins": 24.11996852329799, "rewards/rejected": -14.81595458984375, "step": 3050 }, { "epoch": 0.7634179907418992, "grad_norm": 9.125, "kl": 2.3386073112487793, "learning_rate": 5e-06, "logits/chosen": -48919982.222222224, "logits/rejected": -55310980.266666666, "logps/chosen": -358.27210828993054, "logps/rejected": -517.2578776041667, "loss": 0.0248, "rewards/chosen": 7.991654290093316, "rewards/margins": 25.360184563530815, "rewards/rejected": -17.3685302734375, "step": 3051 }, { "epoch": 0.763668209683473, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62282101.333333336, "logits/rejected": -61297914.666666664, "logps/chosen": -302.46498616536456, "logps/rejected": -788.9519856770834, "loss": 0.0314, "rewards/chosen": 7.059399922688802, "rewards/margins": 31.05707295735677, "rewards/rejected": -23.99767303466797, "step": 3052 }, { "epoch": 0.7639184286250469, "grad_norm": 0.83203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26611738.181818184, "logits/rejected": -82299731.6923077, "logps/chosen": -367.4422496448864, "logps/rejected": -626.7731745793269, "loss": 0.0172, "rewards/chosen": 9.286956787109375, "rewards/margins": 26.46184363731971, "rewards/rejected": -17.174886850210335, "step": 3053 }, { "epoch": 0.7641686475666208, "grad_norm": 7.28125, "kl": 5.8607611656188965, "learning_rate": 5e-06, "logits/chosen": -54126361.6, "logits/rejected": -40858716.44444445, "logps/chosen": -369.61435546875, "logps/rejected": -678.5112847222222, "loss": 0.0375, "rewards/chosen": 9.48133544921875, "rewards/margins": 32.53721483018663, "rewards/rejected": -23.05587938096788, "step": 3054 }, { "epoch": 0.7644188665081947, "grad_norm": 5.5625, "kl": 9.623043060302734, "learning_rate": 5e-06, "logits/chosen": -21914794.666666668, "logits/rejected": -39560439.46666667, "logps/chosen": -467.1070963541667, "logps/rejected": -564.4861328125, "loss": 0.0068, "rewards/chosen": 9.64678700764974, "rewards/margins": 25.613659159342447, "rewards/rejected": -15.966872151692709, "step": 3055 }, { "epoch": 0.7646690854497685, "grad_norm": 6.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49372183.27272727, "logits/rejected": -38091492.92307692, "logps/chosen": -408.87721946022725, "logps/rejected": -671.3844651442307, "loss": 0.0656, "rewards/chosen": 7.8071511008522725, "rewards/margins": 25.322459187540975, "rewards/rejected": -17.5153080866887, "step": 3056 }, { "epoch": 0.7649193043913424, "grad_norm": 0.89453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66731180.307692304, "logits/rejected": -75456011.63636364, "logps/chosen": -495.62222055288464, "logps/rejected": -721.4599609375, "loss": 0.0055, "rewards/chosen": 11.044830322265625, "rewards/margins": 32.5546181418679, "rewards/rejected": -21.509787819602273, "step": 3057 }, { "epoch": 0.7651695233329163, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32681523.2, "logits/rejected": -29462532.57142857, "logps/chosen": -362.75908203125, "logps/rejected": -625.5394810267857, "loss": 0.0482, "rewards/chosen": 7.629109191894531, "rewards/margins": 24.026051330566407, "rewards/rejected": -16.396942138671875, "step": 3058 }, { "epoch": 0.7654197422744902, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50180228.571428575, "logits/rejected": 1073312.8, "logps/chosen": -286.12744140625, "logps/rejected": -738.143603515625, "loss": 0.0341, "rewards/chosen": 7.529172624860491, "rewards/margins": 27.828955950055807, "rewards/rejected": -20.299783325195314, "step": 3059 }, { "epoch": 0.765669961216064, "grad_norm": 6.65625, "kl": 23.394771575927734, "learning_rate": 5e-06, "logits/chosen": -10551692.57142857, "logits/rejected": 11470419.2, "logps/chosen": -500.5634765625, "logps/rejected": -804.8654296875, "loss": 0.0498, "rewards/chosen": 11.067224775041852, "rewards/margins": 35.635501752580915, "rewards/rejected": -24.568276977539064, "step": 3060 }, { "epoch": 0.765920180157638, "grad_norm": 6.375, "kl": 0.1891765594482422, "learning_rate": 5e-06, "logits/chosen": -28096485.333333332, "logits/rejected": -61432170.666666664, "logps/chosen": -229.6199747721354, "logps/rejected": -589.7267252604166, "loss": 0.0444, "rewards/chosen": 5.316739400227864, "rewards/margins": 19.791669209798176, "rewards/rejected": -14.474929809570312, "step": 3061 }, { "epoch": 0.7661703990992118, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34077216.0, "logits/rejected": -21878425.14285714, "logps/chosen": -262.027392578125, "logps/rejected": -810.7154715401786, "loss": 0.0842, "rewards/chosen": 4.591363906860352, "rewards/margins": 30.45322390965053, "rewards/rejected": -25.861860002790177, "step": 3062 }, { "epoch": 0.7664206180407857, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28481168.0, "logits/rejected": -64276522.666666664, "logps/chosen": -275.35487874348956, "logps/rejected": -526.0913899739584, "loss": 0.0444, "rewards/chosen": 5.518931706746419, "rewards/margins": 21.4695618947347, "rewards/rejected": -15.950630187988281, "step": 3063 }, { "epoch": 0.7666708369823596, "grad_norm": 10.125, "kl": 17.648319244384766, "learning_rate": 5e-06, "logits/chosen": -65821134.76923077, "logits/rejected": -45717364.36363637, "logps/chosen": -479.2180739182692, "logps/rejected": -574.78955078125, "loss": 0.052, "rewards/chosen": 10.21160419170673, "rewards/margins": 23.36019011810943, "rewards/rejected": -13.1485859264027, "step": 3064 }, { "epoch": 0.7669210559239334, "grad_norm": 3.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52856357.333333336, "logits/rejected": -21393248.0, "logps/chosen": -266.5519612630208, "logps/rejected": -430.0556233723958, "loss": 0.036, "rewards/chosen": 8.488605499267578, "rewards/margins": 21.123452504475914, "rewards/rejected": -12.634847005208334, "step": 3065 }, { "epoch": 0.7671712748655073, "grad_norm": 3.765625, "kl": 9.52427864074707, "learning_rate": 5e-06, "logits/chosen": -3910475.4285714286, "logits/rejected": -94303776.0, "logps/chosen": -313.5557338169643, "logps/rejected": -828.91220703125, "loss": 0.1083, "rewards/chosen": 6.9978822980608255, "rewards/margins": 27.82281210763114, "rewards/rejected": -20.824929809570314, "step": 3066 }, { "epoch": 0.7674214938070812, "grad_norm": 4.4375, "kl": 1.3731441497802734, "learning_rate": 5e-06, "logits/chosen": -53589184.0, "logits/rejected": -40116995.2, "logps/chosen": -427.18941824776783, "logps/rejected": -529.79384765625, "loss": 0.0218, "rewards/chosen": 9.828845432826451, "rewards/margins": 28.84438956124442, "rewards/rejected": -19.015544128417968, "step": 3067 }, { "epoch": 0.7676717127486551, "grad_norm": 2.890625, "kl": 6.76815128326416, "learning_rate": 5e-06, "logits/chosen": -44723237.64705882, "logits/rejected": -39660292.571428575, "logps/chosen": -354.8489774816176, "logps/rejected": -585.0604073660714, "loss": 0.0156, "rewards/chosen": 8.474680283490349, "rewards/margins": 25.959602420069587, "rewards/rejected": -17.48492213657924, "step": 3068 }, { "epoch": 0.7679219316902289, "grad_norm": 14.5, "kl": 24.758806228637695, "learning_rate": 5e-06, "logits/chosen": -71399323.42857143, "logits/rejected": -38435932.8, "logps/chosen": -440.54080636160717, "logps/rejected": -705.109130859375, "loss": 0.0961, "rewards/chosen": 11.009863717215401, "rewards/margins": 29.299367196219308, "rewards/rejected": -18.289503479003905, "step": 3069 }, { "epoch": 0.7681721506318028, "grad_norm": 15.375, "kl": 4.885580062866211, "learning_rate": 5e-06, "logits/chosen": -64806386.28571428, "logits/rejected": -69796761.6, "logps/chosen": -375.8914271763393, "logps/rejected": -688.26025390625, "loss": 0.0533, "rewards/chosen": 7.988880702427456, "rewards/margins": 23.265810939243863, "rewards/rejected": -15.276930236816407, "step": 3070 }, { "epoch": 0.7684223695733767, "grad_norm": 8.4375, "kl": 9.01988697052002, "learning_rate": 5e-06, "logits/chosen": -68952805.33333333, "logits/rejected": -42101514.666666664, "logps/chosen": -387.4458414713542, "logps/rejected": -446.4722493489583, "loss": 0.049, "rewards/chosen": 8.355372746785482, "rewards/margins": 23.46181297302246, "rewards/rejected": -15.106440226236979, "step": 3071 }, { "epoch": 0.7686725885149506, "grad_norm": 5.09375, "kl": 0.5832252502441406, "learning_rate": 5e-06, "logits/chosen": -32751240.0, "logits/rejected": -54761664.0, "logps/chosen": -371.4450276692708, "logps/rejected": -670.0087076822916, "loss": 0.0259, "rewards/chosen": 9.327786127726236, "rewards/margins": 24.54293886820475, "rewards/rejected": -15.215152740478516, "step": 3072 }, { "epoch": 0.7689228074565244, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56793488.0, "logits/rejected": -22600864.0, "logps/chosen": -376.9311930338542, "logps/rejected": -867.9537760416666, "loss": 0.0214, "rewards/chosen": 9.333326975504557, "rewards/margins": 31.87760798136393, "rewards/rejected": -22.544281005859375, "step": 3073 }, { "epoch": 0.7691730263980984, "grad_norm": 3.078125, "kl": 5.481792449951172, "learning_rate": 5e-06, "logits/chosen": -36319121.06666667, "logits/rejected": -46409752.88888889, "logps/chosen": -295.51474609375, "logps/rejected": -734.0636393229166, "loss": 0.0634, "rewards/chosen": 6.762885538736979, "rewards/margins": 26.27461107042101, "rewards/rejected": -19.51172553168403, "step": 3074 }, { "epoch": 0.7694232453396722, "grad_norm": 8.3125, "kl": 5.888150691986084, "learning_rate": 5e-06, "logits/chosen": -27564006.4, "logits/rejected": -78485902.22222222, "logps/chosen": -401.3704427083333, "logps/rejected": -651.6440972222222, "loss": 0.0392, "rewards/chosen": 8.369480387369792, "rewards/margins": 24.624107191297746, "rewards/rejected": -16.254626803927952, "step": 3075 }, { "epoch": 0.7696734642812461, "grad_norm": 8.75, "kl": 8.82068920135498, "learning_rate": 5e-06, "logits/chosen": -33014621.714285713, "logits/rejected": -69246464.0, "logps/chosen": -337.24393136160717, "logps/rejected": -803.382373046875, "loss": 0.0436, "rewards/chosen": 7.301273345947266, "rewards/margins": 27.211426544189454, "rewards/rejected": -19.910153198242188, "step": 3076 }, { "epoch": 0.76992368322282, "grad_norm": 1.6796875, "kl": 1.0160974264144897, "learning_rate": 5e-06, "logits/chosen": -15504145.23076923, "logits/rejected": -43200189.09090909, "logps/chosen": -188.97115384615384, "logps/rejected": -655.0973011363636, "loss": 0.0711, "rewards/chosen": 6.333577669583834, "rewards/margins": 20.95682429600429, "rewards/rejected": -14.623246626420455, "step": 3077 }, { "epoch": 0.7701739021643939, "grad_norm": 6.875, "kl": 9.873197555541992, "learning_rate": 5e-06, "logits/chosen": -86418609.23076923, "logits/rejected": -37327906.90909091, "logps/chosen": -383.6145582932692, "logps/rejected": -461.5814098011364, "loss": 0.0889, "rewards/chosen": 8.161186805138222, "rewards/margins": 20.167687049278847, "rewards/rejected": -12.006500244140625, "step": 3078 }, { "epoch": 0.7704241211059677, "grad_norm": 6.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46142144.0, "logits/rejected": -50391689.84615385, "logps/chosen": -335.21182528409093, "logps/rejected": -615.4658578725962, "loss": 0.0272, "rewards/chosen": 8.337737343528055, "rewards/margins": 24.37740667383154, "rewards/rejected": -16.039669330303486, "step": 3079 }, { "epoch": 0.7706743400475417, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57657137.23076923, "logits/rejected": -25353588.363636363, "logps/chosen": -360.4960186298077, "logps/rejected": -401.8840997869318, "loss": 0.0623, "rewards/chosen": 8.738087580754208, "rewards/margins": 20.63450675911003, "rewards/rejected": -11.896419178355824, "step": 3080 }, { "epoch": 0.7709245589891155, "grad_norm": 1.4140625, "kl": 0.7843869924545288, "learning_rate": 5e-06, "logits/chosen": -45583477.333333336, "logits/rejected": -48008938.666666664, "logps/chosen": -278.1892903645833, "logps/rejected": -621.3359781901041, "loss": 0.0134, "rewards/chosen": 8.486738840738932, "rewards/margins": 23.02573013305664, "rewards/rejected": -14.538991292317709, "step": 3081 }, { "epoch": 0.7711747779306893, "grad_norm": 5.875, "kl": 8.27616024017334, "learning_rate": 5e-06, "logits/chosen": -59423910.4, "logits/rejected": -33561108.571428575, "logps/chosen": -470.000537109375, "logps/rejected": -549.7661481584821, "loss": 0.0059, "rewards/chosen": 11.532557678222656, "rewards/margins": 23.156912449428013, "rewards/rejected": -11.624354771205358, "step": 3082 }, { "epoch": 0.7714249968722632, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34163730.666666664, "logits/rejected": -29745805.333333332, "logps/chosen": -241.61580403645834, "logps/rejected": -576.1265462239584, "loss": 0.0202, "rewards/chosen": 7.229747772216797, "rewards/margins": 20.808292388916016, "rewards/rejected": -13.578544616699219, "step": 3083 }, { "epoch": 0.7716752158138371, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34087445.333333336, "logits/rejected": -38345971.2, "logps/chosen": -309.2471516927083, "logps/rejected": -577.5503255208333, "loss": 0.0487, "rewards/chosen": 6.995212978786892, "rewards/margins": 20.664888678656684, "rewards/rejected": -13.669675699869792, "step": 3084 }, { "epoch": 0.771925434755411, "grad_norm": 16.875, "kl": 14.930333137512207, "learning_rate": 5e-06, "logits/chosen": -49902144.0, "logits/rejected": -78610773.33333333, "logps/chosen": -446.7172526041667, "logps/rejected": -538.17919921875, "loss": 0.0841, "rewards/chosen": 9.231494140625, "rewards/margins": 21.77554711235894, "rewards/rejected": -12.54405297173394, "step": 3085 }, { "epoch": 0.7721756536969848, "grad_norm": 3.5625, "kl": 6.327107906341553, "learning_rate": 5e-06, "logits/chosen": -63635909.81818182, "logits/rejected": -16798806.153846152, "logps/chosen": -415.92569247159093, "logps/rejected": -452.12939453125, "loss": 0.0051, "rewards/chosen": 9.078102111816406, "rewards/margins": 20.25029050386869, "rewards/rejected": -11.172188392052284, "step": 3086 }, { "epoch": 0.7724258726385588, "grad_norm": 1.1796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23788008.0, "logits/rejected": -46431353.2631579, "logps/chosen": -276.1698974609375, "logps/rejected": -551.7350945723684, "loss": 0.0104, "rewards/chosen": 8.102255249023438, "rewards/margins": 21.810096017937912, "rewards/rejected": -13.707840768914474, "step": 3087 }, { "epoch": 0.7726760915801326, "grad_norm": 10.125, "kl": 9.914198875427246, "learning_rate": 5e-06, "logits/chosen": -9470930.0, "logits/rejected": -34718650.666666664, "logps/chosen": -283.0377604166667, "logps/rejected": -587.2843831380209, "loss": 0.0341, "rewards/chosen": 7.970082600911458, "rewards/margins": 18.455219904581707, "rewards/rejected": -10.485137303670248, "step": 3088 }, { "epoch": 0.7729263105217065, "grad_norm": 13.625, "kl": 13.809676170349121, "learning_rate": 5e-06, "logits/chosen": -41840140.0, "logits/rejected": -41442824.0, "logps/chosen": -381.11895751953125, "logps/rejected": -629.78125, "loss": 0.0475, "rewards/chosen": 9.175348281860352, "rewards/margins": 26.86263084411621, "rewards/rejected": -17.68728256225586, "step": 3089 }, { "epoch": 0.7731765294632804, "grad_norm": 7.5, "kl": 13.262168884277344, "learning_rate": 5e-06, "logits/chosen": -50675222.85714286, "logits/rejected": -39873110.4, "logps/chosen": -385.2784946986607, "logps/rejected": -613.212451171875, "loss": 0.0333, "rewards/chosen": 8.78684561593192, "rewards/margins": 20.94713156563895, "rewards/rejected": -12.160285949707031, "step": 3090 }, { "epoch": 0.7734267484048543, "grad_norm": 12.125, "kl": 5.643848419189453, "learning_rate": 5e-06, "logits/chosen": -60358961.777777776, "logits/rejected": -7662541.866666666, "logps/chosen": -468.90614149305554, "logps/rejected": -510.54713541666666, "loss": 0.0288, "rewards/chosen": 12.079994201660156, "rewards/margins": 23.5205073038737, "rewards/rejected": -11.440513102213542, "step": 3091 }, { "epoch": 0.7736769673464281, "grad_norm": 3.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9278665.6, "logits/rejected": -39063861.333333336, "logps/chosen": -370.7568033854167, "logps/rejected": -783.9172092013889, "loss": 0.0234, "rewards/chosen": 8.525145975748698, "rewards/margins": 26.53505401611328, "rewards/rejected": -18.009908040364582, "step": 3092 }, { "epoch": 0.7739271862880021, "grad_norm": 3.75, "kl": 11.267451286315918, "learning_rate": 5e-06, "logits/chosen": -44673128.0, "logits/rejected": -48458848.0, "logps/chosen": -424.0133463541667, "logps/rejected": -521.7499593098959, "loss": 0.0489, "rewards/chosen": 9.116305033365885, "rewards/margins": 26.136512756347656, "rewards/rejected": -17.02020772298177, "step": 3093 }, { "epoch": 0.7741774052295759, "grad_norm": 11.625, "kl": 17.3372859954834, "learning_rate": 5e-06, "logits/chosen": -39754872.0, "logits/rejected": -42002376.0, "logps/chosen": -395.74884033203125, "logps/rejected": -574.26220703125, "loss": 0.1015, "rewards/chosen": 9.162534713745117, "rewards/margins": 23.3634090423584, "rewards/rejected": -14.200874328613281, "step": 3094 }, { "epoch": 0.7744276241711497, "grad_norm": 12.9375, "kl": 11.118032455444336, "learning_rate": 5e-06, "logits/chosen": -35086284.8, "logits/rejected": -54647250.28571428, "logps/chosen": -300.6271728515625, "logps/rejected": -690.5827287946429, "loss": 0.0768, "rewards/chosen": 9.241445159912109, "rewards/margins": 25.625485992431642, "rewards/rejected": -16.38404083251953, "step": 3095 }, { "epoch": 0.7746778431127236, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52246890.666666664, "logits/rejected": -64625978.666666664, "logps/chosen": -482.2469482421875, "logps/rejected": -643.9549153645834, "loss": 0.0216, "rewards/chosen": 10.69256846110026, "rewards/margins": 32.527732849121094, "rewards/rejected": -21.835164388020832, "step": 3096 }, { "epoch": 0.7749280620542975, "grad_norm": 25.125, "kl": 7.89694881439209, "learning_rate": 5e-06, "logits/chosen": -45727389.09090909, "logits/rejected": -63346141.538461536, "logps/chosen": -383.74636008522725, "logps/rejected": -608.9039963942307, "loss": 0.0616, "rewards/chosen": 8.890484896573154, "rewards/margins": 22.43183856243854, "rewards/rejected": -13.541353665865385, "step": 3097 }, { "epoch": 0.7751782809958714, "grad_norm": 10.9375, "kl": 11.805471420288086, "learning_rate": 5e-06, "logits/chosen": -27859584.0, "logits/rejected": -22996124.8, "logps/chosen": -384.5111607142857, "logps/rejected": -631.20576171875, "loss": 0.049, "rewards/chosen": 9.24264417375837, "rewards/margins": 25.303004891531806, "rewards/rejected": -16.060360717773438, "step": 3098 }, { "epoch": 0.7754284999374452, "grad_norm": 4.40625, "kl": 8.93884563446045, "learning_rate": 5e-06, "logits/chosen": -27946980.266666666, "logits/rejected": -75413781.33333333, "logps/chosen": -396.44397786458336, "logps/rejected": -336.07823350694446, "loss": 0.0687, "rewards/chosen": 9.180886840820312, "rewards/margins": 21.545633782280817, "rewards/rejected": -12.364746941460503, "step": 3099 }, { "epoch": 0.7756787188790192, "grad_norm": 7.1875, "kl": 4.656963348388672, "learning_rate": 5e-06, "logits/chosen": -75081130.66666667, "logits/rejected": 52358668.8, "logps/chosen": -484.15386284722223, "logps/rejected": -548.3673177083333, "loss": 0.0147, "rewards/chosen": 11.95937008327908, "rewards/margins": 28.105831061469182, "rewards/rejected": -16.146460978190103, "step": 3100 }, { "epoch": 0.775928937820593, "grad_norm": 7.53125, "kl": 3.4970359802246094, "learning_rate": 5e-06, "logits/chosen": -32969216.0, "logits/rejected": -35889627.07692308, "logps/chosen": -270.04201438210225, "logps/rejected": -518.2101862980769, "loss": 0.0497, "rewards/chosen": 7.812269731001421, "rewards/margins": 26.015748030655867, "rewards/rejected": -18.203478299654446, "step": 3101 }, { "epoch": 0.7761791567621669, "grad_norm": 4.4375, "kl": 2.7799766063690186, "learning_rate": 5e-06, "logits/chosen": -90191795.2, "logits/rejected": -51659190.85714286, "logps/chosen": -384.98662109375, "logps/rejected": -820.3825334821429, "loss": 0.0065, "rewards/chosen": 9.093458557128907, "rewards/margins": 31.868055725097655, "rewards/rejected": -22.77459716796875, "step": 3102 }, { "epoch": 0.7764293757037408, "grad_norm": 11.0625, "kl": 0.007950624451041222, "learning_rate": 5e-06, "logits/chosen": -77214375.38461539, "logits/rejected": -19624429.09090909, "logps/chosen": -492.4860276442308, "logps/rejected": -658.0683149857955, "loss": 0.0283, "rewards/chosen": 12.044015737680288, "rewards/margins": 32.13958708222929, "rewards/rejected": -20.095571344549004, "step": 3103 }, { "epoch": 0.7766795946453147, "grad_norm": 13.8125, "kl": 3.122584819793701, "learning_rate": 5e-06, "logits/chosen": -42618348.307692304, "logits/rejected": -90994926.54545455, "logps/chosen": -366.75732421875, "logps/rejected": -955.8269708806819, "loss": 0.0521, "rewards/chosen": 8.889064495380108, "rewards/margins": 30.71318336966988, "rewards/rejected": -21.824118874289773, "step": 3104 }, { "epoch": 0.7769298135868885, "grad_norm": 11.5, "kl": 0.9228464961051941, "learning_rate": 5e-06, "logits/chosen": -65750243.55555555, "logits/rejected": -35108989.86666667, "logps/chosen": -456.89198133680554, "logps/rejected": -598.6826171875, "loss": 0.0494, "rewards/chosen": 9.414031982421875, "rewards/margins": 25.554949951171874, "rewards/rejected": -16.14091796875, "step": 3105 }, { "epoch": 0.7771800325284624, "grad_norm": 7.9375, "kl": 8.075639724731445, "learning_rate": 5e-06, "logits/chosen": -30144912.0, "logits/rejected": -3574462.6666666665, "logps/chosen": -381.9097900390625, "logps/rejected": -754.1100260416666, "loss": 0.0982, "rewards/chosen": 9.778003692626953, "rewards/margins": 29.296606699625652, "rewards/rejected": -19.5186030069987, "step": 3106 }, { "epoch": 0.7774302514700363, "grad_norm": 1.546875, "kl": 4.305239677429199, "learning_rate": 5e-06, "logits/chosen": -37979470.54545455, "logits/rejected": -44418043.07692308, "logps/chosen": -338.29097123579544, "logps/rejected": -758.5072115384615, "loss": 0.0276, "rewards/chosen": 8.82777266068892, "rewards/margins": 32.933959747527865, "rewards/rejected": -24.106187086838943, "step": 3107 }, { "epoch": 0.7776804704116101, "grad_norm": 3.078125, "kl": 3.9497880935668945, "learning_rate": 5e-06, "logits/chosen": -31301408.0, "logits/rejected": -78377792.0, "logps/chosen": -315.6142252604167, "logps/rejected": -464.1765407986111, "loss": 0.0525, "rewards/chosen": 7.751452128092448, "rewards/margins": 23.058428446451824, "rewards/rejected": -15.306976318359375, "step": 3108 }, { "epoch": 0.777930689353184, "grad_norm": 5.4375, "kl": 0.13927333056926727, "learning_rate": 5e-06, "logits/chosen": -54686027.63636363, "logits/rejected": -39946604.307692304, "logps/chosen": -442.53901811079544, "logps/rejected": -539.1669546274038, "loss": 0.0116, "rewards/chosen": 9.94445107199929, "rewards/margins": 29.21253273703835, "rewards/rejected": -19.268081665039062, "step": 3109 }, { "epoch": 0.778180908294758, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75545294.76923077, "logits/rejected": -55890967.27272727, "logps/chosen": -446.8544170673077, "logps/rejected": -774.443359375, "loss": 0.0488, "rewards/chosen": 7.956207275390625, "rewards/margins": 29.564389315518465, "rewards/rejected": -21.60818204012784, "step": 3110 }, { "epoch": 0.7784311272363318, "grad_norm": 11.0, "kl": 1.6312549114227295, "learning_rate": 5e-06, "logits/chosen": -44586120.53333333, "logits/rejected": -88642609.77777778, "logps/chosen": -297.5338541666667, "logps/rejected": -436.4309895833333, "loss": 0.0514, "rewards/chosen": 5.938922627766927, "rewards/margins": 18.871905178493925, "rewards/rejected": -12.932982550726997, "step": 3111 }, { "epoch": 0.7786813461779056, "grad_norm": 2.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50263400.0, "logits/rejected": -59647528.0, "logps/chosen": -308.05780029296875, "logps/rejected": -699.5264282226562, "loss": 0.0574, "rewards/chosen": 6.598553657531738, "rewards/margins": 29.065373420715332, "rewards/rejected": -22.466819763183594, "step": 3112 }, { "epoch": 0.7789315651194796, "grad_norm": 12.75, "kl": 10.12739372253418, "learning_rate": 5e-06, "logits/chosen": -37394070.85714286, "logits/rejected": -58530144.0, "logps/chosen": -422.79317801339283, "logps/rejected": -633.23759765625, "loss": 0.073, "rewards/chosen": 9.65329088483538, "rewards/margins": 31.724039786202567, "rewards/rejected": -22.07074890136719, "step": 3113 }, { "epoch": 0.7791817840610534, "grad_norm": 8.1875, "kl": 2.6220130920410156, "learning_rate": 5e-06, "logits/chosen": -83382976.0, "logits/rejected": -55216288.0, "logps/chosen": -518.0105794270834, "logps/rejected": -643.44091796875, "loss": 0.0144, "rewards/chosen": 9.889227549235025, "rewards/margins": 31.907403310139976, "rewards/rejected": -22.01817576090495, "step": 3114 }, { "epoch": 0.7794320030026273, "grad_norm": 2.6875, "kl": 0.8821039199829102, "learning_rate": 5e-06, "logits/chosen": -59582747.428571425, "logits/rejected": -57116378.35294118, "logps/chosen": -532.8210797991071, "logps/rejected": -786.5436006433823, "loss": 0.0029, "rewards/chosen": 10.168629237583705, "rewards/margins": 37.42045131651293, "rewards/rejected": -27.251822078929226, "step": 3115 }, { "epoch": 0.7796822219442012, "grad_norm": 1.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50331685.333333336, "logits/rejected": -48506197.333333336, "logps/chosen": -348.7652994791667, "logps/rejected": -460.1316731770833, "loss": 0.017, "rewards/chosen": 8.298028310139975, "rewards/margins": 22.982758839925133, "rewards/rejected": -14.684730529785156, "step": 3116 }, { "epoch": 0.7799324408857751, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22882520.888888888, "logits/rejected": -52187848.53333333, "logps/chosen": -249.39287651909723, "logps/rejected": -643.231640625, "loss": 0.0311, "rewards/chosen": 6.587571038140191, "rewards/margins": 26.55279320610894, "rewards/rejected": -19.96522216796875, "step": 3117 }, { "epoch": 0.7801826598273489, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40380794.666666664, "logits/rejected": -24098288.0, "logps/chosen": -371.811767578125, "logps/rejected": -525.6431477864584, "loss": 0.0314, "rewards/chosen": 6.868104934692383, "rewards/margins": 21.64151827494303, "rewards/rejected": -14.77341334025065, "step": 3118 }, { "epoch": 0.7804328787689228, "grad_norm": 2.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66292410.18181818, "logits/rejected": -69150700.3076923, "logps/chosen": -462.85697798295456, "logps/rejected": -893.6654146634615, "loss": 0.0178, "rewards/chosen": 7.685485146262429, "rewards/margins": 37.58231225713983, "rewards/rejected": -29.896827110877403, "step": 3119 }, { "epoch": 0.7806830977104967, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 6013406.4, "logits/rejected": -34066429.71428572, "logps/chosen": -487.156982421875, "logps/rejected": -592.1492047991071, "loss": 0.0256, "rewards/chosen": 7.14148178100586, "rewards/margins": 26.187391117640907, "rewards/rejected": -19.045909336635045, "step": 3120 }, { "epoch": 0.7809333166520706, "grad_norm": 0.4296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83720152.61538461, "logits/rejected": -63632488.72727273, "logps/chosen": -469.3756760817308, "logps/rejected": -589.5751509232955, "loss": 0.0008, "rewards/chosen": 8.813344515286959, "rewards/margins": 28.600082130698894, "rewards/rejected": -19.786737615411933, "step": 3121 }, { "epoch": 0.7811835355936444, "grad_norm": 15.625, "kl": 0.1891886442899704, "learning_rate": 5e-06, "logits/chosen": -61324842.666666664, "logits/rejected": -50074901.333333336, "logps/chosen": -426.0803629557292, "logps/rejected": -516.2776692708334, "loss": 0.034, "rewards/chosen": 10.313753763834635, "rewards/margins": 25.528283437093098, "rewards/rejected": -15.214529673258463, "step": 3122 }, { "epoch": 0.7814337545352184, "grad_norm": 6.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46754535.11111111, "logits/rejected": -47186380.8, "logps/chosen": -542.6650933159722, "logps/rejected": -666.4768229166667, "loss": 0.0049, "rewards/chosen": 10.556115044487846, "rewards/margins": 28.656094699435762, "rewards/rejected": -18.099979654947916, "step": 3123 }, { "epoch": 0.7816839734767922, "grad_norm": 6.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54529442.461538464, "logits/rejected": -58733416.72727273, "logps/chosen": -423.11899038461536, "logps/rejected": -744.0970348011364, "loss": 0.0239, "rewards/chosen": 10.640434852013222, "rewards/margins": 36.29103632573481, "rewards/rejected": -25.65060147372159, "step": 3124 }, { "epoch": 0.781934192418366, "grad_norm": 3.859375, "kl": 3.689861297607422, "learning_rate": 5e-06, "logits/chosen": -42082944.0, "logits/rejected": -57787525.81818182, "logps/chosen": -320.47269381009613, "logps/rejected": -523.4486860795455, "loss": 0.0577, "rewards/chosen": 7.987174400916467, "rewards/margins": 22.344153771033653, "rewards/rejected": -14.356979370117188, "step": 3125 }, { "epoch": 0.78218441135994, "grad_norm": 1.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46319450.666666664, "logits/rejected": -58136485.333333336, "logps/chosen": -396.3977457682292, "logps/rejected": -629.2072347005209, "loss": 0.0116, "rewards/chosen": 9.46846071879069, "rewards/margins": 31.06225903828939, "rewards/rejected": -21.5937983194987, "step": 3126 }, { "epoch": 0.7824346303015138, "grad_norm": 19.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28529194.666666668, "logits/rejected": -42638664.0, "logps/chosen": -515.0146484375, "logps/rejected": -600.2367350260416, "loss": 0.0716, "rewards/chosen": 8.847700754801432, "rewards/margins": 28.76372400919596, "rewards/rejected": -19.91602325439453, "step": 3127 }, { "epoch": 0.7826848492430877, "grad_norm": 6.59375, "kl": 0.9355100393295288, "learning_rate": 5e-06, "logits/chosen": -81510170.66666667, "logits/rejected": -87762389.33333333, "logps/chosen": -420.345703125, "logps/rejected": -632.9634602864584, "loss": 0.0103, "rewards/chosen": 9.198383331298828, "rewards/margins": 28.13521957397461, "rewards/rejected": -18.93683624267578, "step": 3128 }, { "epoch": 0.7829350681846616, "grad_norm": 4.3125, "kl": 0.5725581049919128, "learning_rate": 5e-06, "logits/chosen": -58936743.384615384, "logits/rejected": -89005998.54545455, "logps/chosen": -412.79405799278845, "logps/rejected": -746.6052911931819, "loss": 0.0128, "rewards/chosen": 8.69496859036959, "rewards/margins": 33.89529984480851, "rewards/rejected": -25.20033125443892, "step": 3129 }, { "epoch": 0.7831852871262355, "grad_norm": 9.875, "kl": 8.888015747070312, "learning_rate": 5e-06, "logits/chosen": -67455899.42857143, "logits/rejected": -39160233.6, "logps/chosen": -408.77737862723217, "logps/rejected": -444.98759765625, "loss": 0.0499, "rewards/chosen": 7.465677533830915, "rewards/margins": 20.73968418666295, "rewards/rejected": -13.274006652832032, "step": 3130 }, { "epoch": 0.7834355060678093, "grad_norm": 10.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42953516.307692304, "logits/rejected": -55263092.36363637, "logps/chosen": -403.3117487980769, "logps/rejected": -689.1675248579545, "loss": 0.0572, "rewards/chosen": 7.689930842472957, "rewards/margins": 34.877375355967274, "rewards/rejected": -27.187444513494317, "step": 3131 }, { "epoch": 0.7836857250093832, "grad_norm": 1.3828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38362858.666666664, "logits/rejected": -47737888.0, "logps/chosen": -362.32727864583336, "logps/rejected": -577.8590494791666, "loss": 0.0224, "rewards/chosen": 8.77501729329427, "rewards/margins": 23.562255520290798, "rewards/rejected": -14.787238226996529, "step": 3132 }, { "epoch": 0.7839359439509571, "grad_norm": 19.5, "kl": 16.044431686401367, "learning_rate": 5e-06, "logits/chosen": -38804069.64705882, "logits/rejected": -50074605.71428572, "logps/chosen": -418.4707605698529, "logps/rejected": -370.35672433035717, "loss": 0.0825, "rewards/chosen": 10.449409933651195, "rewards/margins": 20.519178566812467, "rewards/rejected": -10.069768633161273, "step": 3133 }, { "epoch": 0.784186162892531, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40449085.333333336, "logits/rejected": -35129957.333333336, "logps/chosen": -450.840087890625, "logps/rejected": -584.0757242838541, "loss": 0.03, "rewards/chosen": 10.264256159464518, "rewards/margins": 28.018547693888344, "rewards/rejected": -17.754291534423828, "step": 3134 }, { "epoch": 0.7844363818341048, "grad_norm": 2.453125, "kl": 3.6843173503875732, "learning_rate": 5e-06, "logits/chosen": -31152984.615384616, "logits/rejected": -62613899.63636363, "logps/chosen": -386.9408428485577, "logps/rejected": -664.0089222301136, "loss": 0.0232, "rewards/chosen": 8.866382305438702, "rewards/margins": 26.396075908954327, "rewards/rejected": -17.529693603515625, "step": 3135 }, { "epoch": 0.7846866007756788, "grad_norm": 1.3828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32743926.4, "logits/rejected": -60384649.14285714, "logps/chosen": -249.809716796875, "logps/rejected": -660.89794921875, "loss": 0.0214, "rewards/chosen": 6.709999084472656, "rewards/margins": 30.02502986363002, "rewards/rejected": -23.315030779157365, "step": 3136 }, { "epoch": 0.7849368197172526, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -80641664.0, "logits/rejected": -41814842.18181818, "logps/chosen": -346.5217848557692, "logps/rejected": -689.0806107954545, "loss": 0.0184, "rewards/chosen": 8.534469017615685, "rewards/margins": 27.653948110300345, "rewards/rejected": -19.11947909268466, "step": 3137 }, { "epoch": 0.7851870386588264, "grad_norm": 9.0625, "kl": 9.07091999053955, "learning_rate": 5e-06, "logits/chosen": -52289996.8, "logits/rejected": -54227637.333333336, "logps/chosen": -397.72376302083336, "logps/rejected": -604.8625217013889, "loss": 0.0441, "rewards/chosen": 8.233914693196615, "rewards/margins": 28.718403286404083, "rewards/rejected": -20.484488593207466, "step": 3138 }, { "epoch": 0.7854372576004004, "grad_norm": 3.859375, "kl": 4.453624248504639, "learning_rate": 5e-06, "logits/chosen": -54622132.36363637, "logits/rejected": -49797159.384615384, "logps/chosen": -443.37362393465907, "logps/rejected": -521.3897986778846, "loss": 0.0153, "rewards/chosen": 9.03527762673118, "rewards/margins": 23.135544303413873, "rewards/rejected": -14.100266676682692, "step": 3139 }, { "epoch": 0.7856874765419742, "grad_norm": 7.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30233636.923076924, "logits/rejected": -27408424.727272727, "logps/chosen": -433.4579326923077, "logps/rejected": -614.9102450284091, "loss": 0.032, "rewards/chosen": 9.076451228215145, "rewards/margins": 26.53073867050918, "rewards/rejected": -17.454287442294035, "step": 3140 }, { "epoch": 0.7859376954835481, "grad_norm": 7.84375, "kl": 3.48591685295105, "learning_rate": 5e-06, "logits/chosen": -56627895.46666667, "logits/rejected": -21182545.777777776, "logps/chosen": -275.36438802083336, "logps/rejected": -787.6069878472222, "loss": 0.0459, "rewards/chosen": 7.977870686848958, "rewards/margins": 27.95308363172743, "rewards/rejected": -19.97521294487847, "step": 3141 }, { "epoch": 0.786187914425122, "grad_norm": 6.4375, "kl": 12.190521240234375, "learning_rate": 5e-06, "logits/chosen": -61239764.0, "logits/rejected": -63715416.0, "logps/chosen": -425.7151794433594, "logps/rejected": -549.8751831054688, "loss": 0.0131, "rewards/chosen": 9.285855293273926, "rewards/margins": 25.927979469299316, "rewards/rejected": -16.64212417602539, "step": 3142 }, { "epoch": 0.7864381333666959, "grad_norm": 6.34375, "kl": 2.870297908782959, "learning_rate": 5e-06, "logits/chosen": -57561028.571428575, "logits/rejected": -58605926.4, "logps/chosen": -488.75223214285717, "logps/rejected": -777.88671875, "loss": 0.0088, "rewards/chosen": 10.573829650878906, "rewards/margins": 31.817106628417967, "rewards/rejected": -21.24327697753906, "step": 3143 }, { "epoch": 0.7866883523082697, "grad_norm": 5.15625, "kl": 6.198540687561035, "learning_rate": 5e-06, "logits/chosen": -62383247.058823526, "logits/rejected": -26992896.0, "logps/chosen": -415.0096220128676, "logps/rejected": -659.2712053571429, "loss": 0.0511, "rewards/chosen": 9.577201394473805, "rewards/margins": 25.989549460531286, "rewards/rejected": -16.41234806605748, "step": 3144 }, { "epoch": 0.7869385712498436, "grad_norm": 5.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24877188.923076924, "logits/rejected": -45688320.0, "logps/chosen": -270.0003004807692, "logps/rejected": -729.8998579545455, "loss": 0.0207, "rewards/chosen": 8.143423227163462, "rewards/margins": 30.31115615951431, "rewards/rejected": -22.16773293235085, "step": 3145 }, { "epoch": 0.7871887901914175, "grad_norm": 15.0, "kl": 11.193626403808594, "learning_rate": 5e-06, "logits/chosen": -46192749.71428572, "logits/rejected": -69040646.4, "logps/chosen": -382.97042410714283, "logps/rejected": -654.82109375, "loss": 0.0907, "rewards/chosen": 7.881475176130023, "rewards/margins": 22.018438066755024, "rewards/rejected": -14.136962890625, "step": 3146 }, { "epoch": 0.7874390091329914, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51012667.428571425, "logits/rejected": -60799750.4, "logps/chosen": -254.10777064732142, "logps/rejected": -646.07841796875, "loss": 0.0547, "rewards/chosen": 6.962978907993862, "rewards/margins": 27.776849147251674, "rewards/rejected": -20.813870239257813, "step": 3147 }, { "epoch": 0.7876892280745652, "grad_norm": 1.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33126227.2, "logits/rejected": -86313709.71428572, "logps/chosen": -304.51005859375, "logps/rejected": -799.0270647321429, "loss": 0.0252, "rewards/chosen": 7.89716796875, "rewards/margins": 33.42469482421875, "rewards/rejected": -25.52752685546875, "step": 3148 }, { "epoch": 0.7879394470161392, "grad_norm": 7.15625, "kl": 0.5937080383300781, "learning_rate": 5e-06, "logits/chosen": -44233964.307692304, "logits/rejected": -24101265.454545453, "logps/chosen": -334.3811598557692, "logps/rejected": -729.2784090909091, "loss": 0.012, "rewards/chosen": 7.730343158428486, "rewards/margins": 26.94199504718914, "rewards/rejected": -19.211651888760652, "step": 3149 }, { "epoch": 0.788189665957713, "grad_norm": 5.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68449989.81818181, "logits/rejected": -55347456.0, "logps/chosen": -467.30752840909093, "logps/rejected": -669.2489483173077, "loss": 0.0059, "rewards/chosen": 7.826829390092329, "rewards/margins": 27.49265486710555, "rewards/rejected": -19.66582547701322, "step": 3150 }, { "epoch": 0.7884398848992868, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58427510.15384615, "logits/rejected": -50430807.27272727, "logps/chosen": -346.02005709134613, "logps/rejected": -730.6151012073864, "loss": 0.0414, "rewards/chosen": 7.546366764948918, "rewards/margins": 28.723646150602328, "rewards/rejected": -21.17727938565341, "step": 3151 }, { "epoch": 0.7886901038408608, "grad_norm": 3.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47160512.0, "logits/rejected": -42045126.4, "logps/chosen": -417.5836704799107, "logps/rejected": -576.8109375, "loss": 0.0056, "rewards/chosen": 8.983966282435826, "rewards/margins": 30.452170017787388, "rewards/rejected": -21.468203735351562, "step": 3152 }, { "epoch": 0.7889403227824346, "grad_norm": 5.96875, "kl": 18.41874122619629, "learning_rate": 5e-06, "logits/chosen": -59820896.0, "logits/rejected": -42821110.85714286, "logps/chosen": -321.6694091796875, "logps/rejected": -618.1948939732143, "loss": 0.094, "rewards/chosen": 7.6344970703125, "rewards/margins": 25.744823564801898, "rewards/rejected": -18.110326494489396, "step": 3153 }, { "epoch": 0.7891905417240085, "grad_norm": 1.6015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51972928.0, "logits/rejected": -63745559.27272727, "logps/chosen": -431.43370643028845, "logps/rejected": -476.5553089488636, "loss": 0.016, "rewards/chosen": 8.034474886380709, "rewards/margins": 21.89949286400855, "rewards/rejected": -13.865017977627842, "step": 3154 }, { "epoch": 0.7894407606655823, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11057011.692307692, "logits/rejected": -86633629.0909091, "logps/chosen": -333.7978515625, "logps/rejected": -775.5379083806819, "loss": 0.0289, "rewards/chosen": 7.483803969163161, "rewards/margins": 29.959417649916002, "rewards/rejected": -22.47561368075284, "step": 3155 }, { "epoch": 0.7896909796071563, "grad_norm": 8.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52886726.4, "logits/rejected": -57080114.28571428, "logps/chosen": -320.0166748046875, "logps/rejected": -604.2458147321429, "loss": 0.0293, "rewards/chosen": 6.805620574951172, "rewards/margins": 25.07688914707729, "rewards/rejected": -18.271268572126115, "step": 3156 }, { "epoch": 0.7899411985487301, "grad_norm": 6.09375, "kl": 15.784521102905273, "learning_rate": 5e-06, "logits/chosen": -77469696.0, "logits/rejected": -37947859.2, "logps/chosen": -393.7251674107143, "logps/rejected": -607.499267578125, "loss": 0.0314, "rewards/chosen": 9.636716570172991, "rewards/margins": 27.675237383161274, "rewards/rejected": -18.03852081298828, "step": 3157 }, { "epoch": 0.790191417490304, "grad_norm": 1.1796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62118282.666666664, "logits/rejected": -49192293.333333336, "logps/chosen": -334.06882731119794, "logps/rejected": -749.6505533854166, "loss": 0.0189, "rewards/chosen": 8.312831242879232, "rewards/margins": 30.779495875040688, "rewards/rejected": -22.466664632161457, "step": 3158 }, { "epoch": 0.7904416364318779, "grad_norm": 3.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9216285.6, "logits/rejected": -62547483.428571425, "logps/chosen": -248.832421875, "logps/rejected": -673.5, "loss": 0.0298, "rewards/chosen": 6.885089874267578, "rewards/margins": 26.526974814278738, "rewards/rejected": -19.64188494001116, "step": 3159 }, { "epoch": 0.7906918553734518, "grad_norm": 2.453125, "kl": 12.48333740234375, "learning_rate": 5e-06, "logits/chosen": -63101454.76923077, "logits/rejected": -22890221.09090909, "logps/chosen": -463.44057992788464, "logps/rejected": -677.2645596590909, "loss": 0.0137, "rewards/chosen": 11.52263934795673, "rewards/margins": 36.74104810594679, "rewards/rejected": -25.21840875799006, "step": 3160 }, { "epoch": 0.7909420743150256, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28679890.666666668, "logits/rejected": -9501543.333333334, "logps/chosen": -436.7552897135417, "logps/rejected": -517.740478515625, "loss": 0.0047, "rewards/chosen": 9.497055053710938, "rewards/margins": 27.30694071451823, "rewards/rejected": -17.809885660807293, "step": 3161 }, { "epoch": 0.7911922932565996, "grad_norm": 11.0, "kl": 1.8298888206481934, "learning_rate": 5e-06, "logits/chosen": -11951538.285714285, "logits/rejected": -46614064.0, "logps/chosen": -318.5646275111607, "logps/rejected": -527.307373046875, "loss": 0.08, "rewards/chosen": 6.056464603969029, "rewards/margins": 18.988440922328405, "rewards/rejected": -12.931976318359375, "step": 3162 }, { "epoch": 0.7914425121981734, "grad_norm": 7.9375, "kl": 6.409518241882324, "learning_rate": 5e-06, "logits/chosen": -43496846.76923077, "logits/rejected": -54040238.54545455, "logps/chosen": -332.86624849759613, "logps/rejected": -560.8447265625, "loss": 0.0341, "rewards/chosen": 8.752815833458534, "rewards/margins": 19.82694372430548, "rewards/rejected": -11.074127890846945, "step": 3163 }, { "epoch": 0.7916927311397473, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69198865.45454545, "logits/rejected": -45574537.84615385, "logps/chosen": -270.3463023792614, "logps/rejected": -603.3871694711538, "loss": 0.0611, "rewards/chosen": 7.215987465598366, "rewards/margins": 26.000185933146444, "rewards/rejected": -18.784198467548077, "step": 3164 }, { "epoch": 0.7919429500813212, "grad_norm": 6.28125, "kl": 3.796109676361084, "learning_rate": 5e-06, "logits/chosen": -31222180.57142857, "logits/rejected": -34780358.4, "logps/chosen": -415.83189174107144, "logps/rejected": -544.44375, "loss": 0.0285, "rewards/chosen": 10.159109933035714, "rewards/margins": 26.318927437918525, "rewards/rejected": -16.159817504882813, "step": 3165 }, { "epoch": 0.792193169022895, "grad_norm": 3.9375, "kl": 14.186848640441895, "learning_rate": 5e-06, "logits/chosen": -45562816.0, "logits/rejected": -42035481.6, "logps/chosen": -358.0990513392857, "logps/rejected": -576.576123046875, "loss": 0.0866, "rewards/chosen": 9.054125104631696, "rewards/margins": 24.360303388323103, "rewards/rejected": -15.306178283691406, "step": 3166 }, { "epoch": 0.7924433879644689, "grad_norm": 3.203125, "kl": 1.0441970825195312, "learning_rate": 5e-06, "logits/chosen": -47982981.81818182, "logits/rejected": -54493080.615384616, "logps/chosen": -328.297607421875, "logps/rejected": -744.0870643028846, "loss": 0.0172, "rewards/chosen": 8.062478498979049, "rewards/margins": 28.275263486208615, "rewards/rejected": -20.21278498722957, "step": 3167 }, { "epoch": 0.7926936069060427, "grad_norm": 11.25, "kl": 0.26417669653892517, "learning_rate": 5e-06, "logits/chosen": -44968453.81818182, "logits/rejected": -54137462.15384615, "logps/chosen": -365.7479137073864, "logps/rejected": -635.6334134615385, "loss": 0.0186, "rewards/chosen": 8.448943398215555, "rewards/margins": 26.123384222284063, "rewards/rejected": -17.67444082406851, "step": 3168 }, { "epoch": 0.7929438258476167, "grad_norm": 7.6875, "kl": 14.424945831298828, "learning_rate": 5e-06, "logits/chosen": -89681005.71428572, "logits/rejected": -31658153.6, "logps/chosen": -498.68896484375, "logps/rejected": -617.385107421875, "loss": 0.042, "rewards/chosen": 12.469615391322545, "rewards/margins": 27.250058201381137, "rewards/rejected": -14.780442810058593, "step": 3169 }, { "epoch": 0.7931940447891905, "grad_norm": 4.75, "kl": 9.40216064453125, "learning_rate": 5e-06, "logits/chosen": -41881044.0, "logits/rejected": -32860892.0, "logps/chosen": -440.55438232421875, "logps/rejected": -714.7567138671875, "loss": 0.0176, "rewards/chosen": 9.913310050964355, "rewards/margins": 27.049975395202637, "rewards/rejected": -17.13666534423828, "step": 3170 }, { "epoch": 0.7934442637307644, "grad_norm": 34.5, "kl": 0.2805735468864441, "learning_rate": 5e-06, "logits/chosen": -35806854.4, "logits/rejected": -24823442.285714287, "logps/chosen": -358.1268798828125, "logps/rejected": -717.6179547991071, "loss": 0.054, "rewards/chosen": 9.4863037109375, "rewards/margins": 21.878467668805804, "rewards/rejected": -12.392163957868304, "step": 3171 }, { "epoch": 0.7936944826723383, "grad_norm": 24.75, "kl": 5.164003372192383, "learning_rate": 5e-06, "logits/chosen": -55440917.333333336, "logits/rejected": -45847984.0, "logps/chosen": -348.71240234375, "logps/rejected": -492.47998046875, "loss": 0.0474, "rewards/chosen": 8.108353932698568, "rewards/margins": 20.167166392008465, "rewards/rejected": -12.058812459309896, "step": 3172 }, { "epoch": 0.7939447016139122, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39877715.692307696, "logits/rejected": -42870679.27272727, "logps/chosen": -432.9982722355769, "logps/rejected": -536.3340287642045, "loss": 0.0435, "rewards/chosen": 7.3421501746544475, "rewards/margins": 23.575875982537973, "rewards/rejected": -16.233725807883523, "step": 3173 }, { "epoch": 0.794194920555486, "grad_norm": 21.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40508986.18181818, "logits/rejected": -49383940.92307692, "logps/chosen": -287.96604225852275, "logps/rejected": -744.3415715144231, "loss": 0.0429, "rewards/chosen": 7.526300603693182, "rewards/margins": 26.27119947313429, "rewards/rejected": -18.744898869441105, "step": 3174 }, { "epoch": 0.79444513949706, "grad_norm": 3.609375, "kl": 4.230663299560547, "learning_rate": 5e-06, "logits/chosen": -53834279.384615384, "logits/rejected": -37132552.72727273, "logps/chosen": -315.91597806490387, "logps/rejected": -515.4821999289773, "loss": 0.0441, "rewards/chosen": 7.125238858736479, "rewards/margins": 21.325274647532645, "rewards/rejected": -14.200035788796164, "step": 3175 }, { "epoch": 0.7946953584386338, "grad_norm": 18.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57633701.333333336, "logits/rejected": -44814215.11111111, "logps/chosen": -387.0567220052083, "logps/rejected": -545.6187065972222, "loss": 0.057, "rewards/chosen": 8.452921549479166, "rewards/margins": 19.92859395345052, "rewards/rejected": -11.475672403971354, "step": 3176 }, { "epoch": 0.7949455773802077, "grad_norm": 0.83203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64562823.11111111, "logits/rejected": -66326694.4, "logps/chosen": -392.9104817708333, "logps/rejected": -787.056640625, "loss": 0.0008, "rewards/chosen": 9.614667256673178, "rewards/margins": 28.884310404459633, "rewards/rejected": -19.269643147786457, "step": 3177 }, { "epoch": 0.7951957963217816, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23798093.333333332, "logits/rejected": -50254560.0, "logps/chosen": -361.1700846354167, "logps/rejected": -621.4038899739584, "loss": 0.0674, "rewards/chosen": 7.391970952351888, "rewards/margins": 22.86704953511556, "rewards/rejected": -15.475078582763672, "step": 3178 }, { "epoch": 0.7954460152633555, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49946013.538461536, "logits/rejected": -38336584.72727273, "logps/chosen": -270.4592472956731, "logps/rejected": -349.99922318892044, "loss": 0.0369, "rewards/chosen": 7.067205575796274, "rewards/margins": 19.71114312018548, "rewards/rejected": -12.643937544389205, "step": 3179 }, { "epoch": 0.7956962342049293, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63100236.8, "logits/rejected": -61981284.571428575, "logps/chosen": -485.782275390625, "logps/rejected": -611.1044921875, "loss": 0.0221, "rewards/chosen": 10.09509506225586, "rewards/margins": 28.49079033987863, "rewards/rejected": -18.395695277622767, "step": 3180 }, { "epoch": 0.7959464531465031, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52217526.15384615, "logits/rejected": -50430789.81818182, "logps/chosen": -373.61087740384613, "logps/rejected": -551.7339754971591, "loss": 0.0199, "rewards/chosen": 9.113879864032452, "rewards/margins": 25.32080152818373, "rewards/rejected": -16.206921664151277, "step": 3181 }, { "epoch": 0.7961966720880771, "grad_norm": 2.796875, "kl": 12.237716674804688, "learning_rate": 5e-06, "logits/chosen": -44968992.0, "logits/rejected": -53136330.666666664, "logps/chosen": -419.9730224609375, "logps/rejected": -650.16650390625, "loss": 0.0136, "rewards/chosen": 10.710688273111979, "rewards/margins": 28.072784423828125, "rewards/rejected": -17.362096150716145, "step": 3182 }, { "epoch": 0.7964468910296509, "grad_norm": 1.9296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45695716.571428575, "logits/rejected": -62012280.47058824, "logps/chosen": -569.3485630580357, "logps/rejected": -672.0313074448529, "loss": 0.0411, "rewards/chosen": 12.5328369140625, "rewards/margins": 31.355357450597428, "rewards/rejected": -18.822520536534928, "step": 3183 }, { "epoch": 0.7966971099712248, "grad_norm": 12.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40817491.692307696, "logits/rejected": -46814949.81818182, "logps/chosen": -421.5954402043269, "logps/rejected": -630.8254616477273, "loss": 0.0247, "rewards/chosen": 8.455179654634916, "rewards/margins": 24.46166602714912, "rewards/rejected": -16.006486372514203, "step": 3184 }, { "epoch": 0.7969473289127987, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -74645229.71428572, "logits/rejected": -50018996.705882356, "logps/chosen": -275.17051478794644, "logps/rejected": -653.041015625, "loss": 0.0153, "rewards/chosen": 6.632861001150949, "rewards/margins": 27.125556529069147, "rewards/rejected": -20.492695527918197, "step": 3185 }, { "epoch": 0.7971975478543726, "grad_norm": 2.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34934528.0, "logits/rejected": -48389464.0, "logps/chosen": -409.9518737792969, "logps/rejected": -725.6411743164062, "loss": 0.022, "rewards/chosen": 8.54737377166748, "rewards/margins": 27.53396701812744, "rewards/rejected": -18.98659324645996, "step": 3186 }, { "epoch": 0.7974477667959464, "grad_norm": 7.625, "kl": 11.701041221618652, "learning_rate": 5e-06, "logits/chosen": -38962309.333333336, "logits/rejected": -34656282.666666664, "logps/chosen": -365.2467447916667, "logps/rejected": -584.3806559244791, "loss": 0.0704, "rewards/chosen": 7.8193613688151045, "rewards/margins": 25.920926411946617, "rewards/rejected": -18.10156504313151, "step": 3187 }, { "epoch": 0.7976979857375204, "grad_norm": 14.6875, "kl": 9.727387428283691, "learning_rate": 5e-06, "logits/chosen": -83727034.66666667, "logits/rejected": -36145797.333333336, "logps/chosen": -452.2635091145833, "logps/rejected": -394.3990071614583, "loss": 0.1001, "rewards/chosen": 9.262088775634766, "rewards/margins": 21.685614267985024, "rewards/rejected": -12.42352549235026, "step": 3188 }, { "epoch": 0.7979482046790942, "grad_norm": 0.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34321337.6, "logits/rejected": -30197581.714285713, "logps/chosen": -428.90302734375, "logps/rejected": -725.2661830357143, "loss": 0.0009, "rewards/chosen": 9.975694274902343, "rewards/margins": 31.05088086809431, "rewards/rejected": -21.075186593191965, "step": 3189 }, { "epoch": 0.7981984236206681, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55778993.777777776, "logits/rejected": -39186107.733333334, "logps/chosen": -451.8445638020833, "logps/rejected": -636.5614583333333, "loss": 0.0047, "rewards/chosen": 10.112894694010416, "rewards/margins": 28.589029947916664, "rewards/rejected": -18.47613525390625, "step": 3190 }, { "epoch": 0.798448642562242, "grad_norm": 4.5625, "kl": 3.388685941696167, "learning_rate": 5e-06, "logits/chosen": -31397070.222222224, "logits/rejected": -78006732.8, "logps/chosen": -399.66314019097223, "logps/rejected": -739.7107421875, "loss": 0.0359, "rewards/chosen": 9.233257717556423, "rewards/margins": 27.453185696072048, "rewards/rejected": -18.219927978515624, "step": 3191 }, { "epoch": 0.7986988615038159, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60730909.538461536, "logits/rejected": -48511467.63636363, "logps/chosen": -462.8356370192308, "logps/rejected": -628.9865056818181, "loss": 0.0076, "rewards/chosen": 9.333608774038462, "rewards/margins": 28.806861503974538, "rewards/rejected": -19.47325272993608, "step": 3192 }, { "epoch": 0.7989490804453897, "grad_norm": 1.9140625, "kl": 7.800597190856934, "learning_rate": 5e-06, "logits/chosen": -79365836.8, "logits/rejected": -42586829.71428572, "logps/chosen": -415.9365234375, "logps/rejected": -691.2517438616071, "loss": 0.016, "rewards/chosen": 9.905123138427735, "rewards/margins": 30.883564649309434, "rewards/rejected": -20.978441510881698, "step": 3193 }, { "epoch": 0.7991992993869635, "grad_norm": 23.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20476076.307692308, "logits/rejected": -61493434.18181818, "logps/chosen": -286.3638258713942, "logps/rejected": -686.7190163352273, "loss": 0.0603, "rewards/chosen": 6.751426696777344, "rewards/margins": 26.39008192582564, "rewards/rejected": -19.638655229048297, "step": 3194 }, { "epoch": 0.7994495183285375, "grad_norm": 20.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20343374.222222224, "logits/rejected": -59409493.333333336, "logps/chosen": -451.0512966579861, "logps/rejected": -641.4867838541667, "loss": 0.0386, "rewards/chosen": 8.523438347710503, "rewards/margins": 28.037704298231336, "rewards/rejected": -19.514265950520834, "step": 3195 }, { "epoch": 0.7996997372701113, "grad_norm": 4.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40899112.0, "logits/rejected": -66037236.0, "logps/chosen": -407.66290283203125, "logps/rejected": -555.2177124023438, "loss": 0.0152, "rewards/chosen": 11.925619125366211, "rewards/margins": 27.827353477478027, "rewards/rejected": -15.901734352111816, "step": 3196 }, { "epoch": 0.7999499562116852, "grad_norm": 9.125, "kl": 5.743766784667969, "learning_rate": 5e-06, "logits/chosen": -58228992.0, "logits/rejected": -54248864.0, "logps/chosen": -528.6583658854166, "logps/rejected": -888.0361328125, "loss": 0.024, "rewards/chosen": 11.198453267415365, "rewards/margins": 34.903709411621094, "rewards/rejected": -23.70525614420573, "step": 3197 }, { "epoch": 0.8002001751532591, "grad_norm": 11.0625, "kl": 13.101539611816406, "learning_rate": 5e-06, "logits/chosen": -95874520.0, "logits/rejected": -51234328.0, "logps/chosen": -322.5673828125, "logps/rejected": -633.8896484375, "loss": 0.0934, "rewards/chosen": 7.500769138336182, "rewards/margins": 24.063400745391846, "rewards/rejected": -16.562631607055664, "step": 3198 }, { "epoch": 0.800450394094833, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62050077.538461536, "logits/rejected": -47988221.09090909, "logps/chosen": -348.57237830528845, "logps/rejected": -814.7113813920455, "loss": 0.0615, "rewards/chosen": 6.292000990647536, "rewards/margins": 30.196763952295264, "rewards/rejected": -23.904762961647727, "step": 3199 }, { "epoch": 0.8007006130364068, "grad_norm": 5.96875, "kl": 0.19045767188072205, "learning_rate": 5e-06, "logits/chosen": -18914081.14285714, "logits/rejected": -84356044.8, "logps/chosen": -396.59852818080356, "logps/rejected": -802.9658203125, "loss": 0.0303, "rewards/chosen": 7.647272382463727, "rewards/margins": 27.783457074846538, "rewards/rejected": -20.136184692382812, "step": 3200 }, { "epoch": 0.8009508319779808, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28635168.0, "logits/rejected": -40962779.428571425, "logps/chosen": -395.7276611328125, "logps/rejected": -565.0685686383929, "loss": 0.055, "rewards/chosen": 7.526345825195312, "rewards/margins": 25.824136352539064, "rewards/rejected": -18.29779052734375, "step": 3201 }, { "epoch": 0.8012010509195546, "grad_norm": 2.046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39366330.666666664, "logits/rejected": 10287048.0, "logps/chosen": -329.6221516927083, "logps/rejected": -630.472412109375, "loss": 0.0138, "rewards/chosen": 7.517525990804036, "rewards/margins": 20.54009755452474, "rewards/rejected": -13.022571563720703, "step": 3202 }, { "epoch": 0.8014512698611285, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39989789.09090909, "logits/rejected": -62233816.615384616, "logps/chosen": -483.79563210227275, "logps/rejected": -765.8640324519231, "loss": 0.0032, "rewards/chosen": 10.0062255859375, "rewards/margins": 30.44920935997596, "rewards/rejected": -20.44298377403846, "step": 3203 }, { "epoch": 0.8017014888027023, "grad_norm": 1.1171875, "kl": 3.607430934906006, "learning_rate": 5e-06, "logits/chosen": -54108136.72727273, "logits/rejected": -65516130.461538464, "logps/chosen": -388.0989879261364, "logps/rejected": -770.2540564903846, "loss": 0.0133, "rewards/chosen": 8.890317049893467, "rewards/margins": 30.706448641690343, "rewards/rejected": -21.816131591796875, "step": 3204 }, { "epoch": 0.8019517077442763, "grad_norm": 1.2421875, "kl": 4.821498870849609, "learning_rate": 5e-06, "logits/chosen": -33921461.333333336, "logits/rejected": -53273552.0, "logps/chosen": -361.0171712239583, "logps/rejected": -718.2425944010416, "loss": 0.0259, "rewards/chosen": 8.330461502075195, "rewards/margins": 28.48835055033366, "rewards/rejected": -20.157889048258465, "step": 3205 }, { "epoch": 0.8022019266858501, "grad_norm": 7.65625, "kl": 6.568589210510254, "learning_rate": 5e-06, "logits/chosen": -36829276.0, "logits/rejected": -22510008.0, "logps/chosen": -309.2214660644531, "logps/rejected": -341.2691650390625, "loss": 0.1108, "rewards/chosen": 7.454017639160156, "rewards/margins": 22.788188934326172, "rewards/rejected": -15.334171295166016, "step": 3206 }, { "epoch": 0.802452145627424, "grad_norm": 3.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40064986.18181818, "logits/rejected": -58998306.461538464, "logps/chosen": -413.9158380681818, "logps/rejected": -929.0920973557693, "loss": 0.0637, "rewards/chosen": 9.797701748934658, "rewards/margins": 36.92608685260053, "rewards/rejected": -27.128385103665867, "step": 3207 }, { "epoch": 0.8027023645689979, "grad_norm": 9.375, "kl": 21.51288414001465, "learning_rate": 5e-06, "logits/chosen": -39970710.85714286, "logits/rejected": -52221651.2, "logps/chosen": -407.622314453125, "logps/rejected": -560.9509765625, "loss": 0.0342, "rewards/chosen": 10.670211791992188, "rewards/margins": 27.144992065429687, "rewards/rejected": -16.4747802734375, "step": 3208 }, { "epoch": 0.8029525835105717, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44512540.0, "logits/rejected": -32697496.0, "logps/chosen": -550.233642578125, "logps/rejected": -752.481201171875, "loss": 0.0225, "rewards/chosen": 10.652582168579102, "rewards/margins": 32.21308708190918, "rewards/rejected": -21.560504913330078, "step": 3209 }, { "epoch": 0.8032028024521456, "grad_norm": 13.75, "kl": 10.17340087890625, "learning_rate": 5e-06, "logits/chosen": -80191890.28571428, "logits/rejected": -28133888.0, "logps/chosen": -376.01827566964283, "logps/rejected": -538.31103515625, "loss": 0.0652, "rewards/chosen": 8.905079432896205, "rewards/margins": 24.338126918247767, "rewards/rejected": -15.433047485351562, "step": 3210 }, { "epoch": 0.8034530213937195, "grad_norm": 10.625, "kl": 5.313276767730713, "learning_rate": 5e-06, "logits/chosen": -67854232.0, "logits/rejected": -19706378.0, "logps/chosen": -303.0367431640625, "logps/rejected": -380.15625, "loss": 0.0371, "rewards/chosen": 7.967641830444336, "rewards/margins": 16.66538143157959, "rewards/rejected": -8.697739601135254, "step": 3211 }, { "epoch": 0.8037032403352934, "grad_norm": 3.609375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -4038902.153846154, "logits/rejected": -29617297.454545453, "logps/chosen": -453.6779221754808, "logps/rejected": -737.0744406960227, "loss": 0.0175, "rewards/chosen": 9.394469627967247, "rewards/margins": 32.474164896078044, "rewards/rejected": -23.079695268110797, "step": 3212 }, { "epoch": 0.8039534592768672, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49809746.28571428, "logits/rejected": -57381990.4, "logps/chosen": -385.96358816964283, "logps/rejected": -738.7212890625, "loss": 0.0169, "rewards/chosen": 8.417591094970703, "rewards/margins": 30.885486602783203, "rewards/rejected": -22.4678955078125, "step": 3213 }, { "epoch": 0.8042036782184412, "grad_norm": 22.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -96051424.0, "logits/rejected": -38469760.0, "logps/chosen": -542.183837890625, "logps/rejected": -511.01953125, "loss": 0.0226, "rewards/chosen": 12.348125457763672, "rewards/margins": 25.98112678527832, "rewards/rejected": -13.633001327514648, "step": 3214 }, { "epoch": 0.804453897160015, "grad_norm": 16.125, "kl": 16.870433807373047, "learning_rate": 5e-06, "logits/chosen": -62498221.71428572, "logits/rejected": -62844435.2, "logps/chosen": -412.41563197544644, "logps/rejected": -716.18056640625, "loss": 0.0659, "rewards/chosen": 9.586285182407924, "rewards/margins": 30.03198983328683, "rewards/rejected": -20.445704650878906, "step": 3215 }, { "epoch": 0.8047041161015889, "grad_norm": 1.34375, "kl": 8.126016616821289, "learning_rate": 5e-06, "logits/chosen": -31591793.230769232, "logits/rejected": -43586085.81818182, "logps/chosen": -394.19858022836536, "logps/rejected": -690.0055930397727, "loss": 0.0294, "rewards/chosen": 9.639727665827824, "rewards/margins": 27.84536065588464, "rewards/rejected": -18.205632990056817, "step": 3216 }, { "epoch": 0.8049543350431627, "grad_norm": 2.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2161596.2, "logits/rejected": -39407762.28571428, "logps/chosen": -185.5008544921875, "logps/rejected": -675.2261439732143, "loss": 0.0715, "rewards/chosen": 5.34935302734375, "rewards/margins": 25.01038556780134, "rewards/rejected": -19.66103254045759, "step": 3217 }, { "epoch": 0.8052045539847367, "grad_norm": 4.21875, "kl": 0.5277456045150757, "learning_rate": 5e-06, "logits/chosen": -72121187.55555555, "logits/rejected": -48000268.8, "logps/chosen": -341.22479926215277, "logps/rejected": -510.85325520833334, "loss": 0.0434, "rewards/chosen": 8.134419759114584, "rewards/margins": 21.283841959635417, "rewards/rejected": -13.149422200520833, "step": 3218 }, { "epoch": 0.8054547729263105, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43047302.4, "logits/rejected": -36442925.71428572, "logps/chosen": -259.7948486328125, "logps/rejected": -585.0173688616071, "loss": 0.0494, "rewards/chosen": 8.889262390136718, "rewards/margins": 21.944705200195312, "rewards/rejected": -13.055442810058594, "step": 3219 }, { "epoch": 0.8057049918678844, "grad_norm": 8.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37111076.92307692, "logits/rejected": -58050914.90909091, "logps/chosen": -197.52497746394232, "logps/rejected": -549.6422230113636, "loss": 0.0939, "rewards/chosen": 5.346764784592849, "rewards/margins": 14.974215900981342, "rewards/rejected": -9.627451116388494, "step": 3220 }, { "epoch": 0.8059552108094583, "grad_norm": 1.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21275764.363636363, "logits/rejected": -61063670.15384615, "logps/chosen": -293.0260120738636, "logps/rejected": -485.2038762019231, "loss": 0.0363, "rewards/chosen": 7.96350791237571, "rewards/margins": 21.04931395203917, "rewards/rejected": -13.085806039663462, "step": 3221 }, { "epoch": 0.8062054297510322, "grad_norm": 1.8203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65545780.0, "logits/rejected": -28291148.0, "logps/chosen": -224.6514129638672, "logps/rejected": -554.32568359375, "loss": 0.0317, "rewards/chosen": 7.293304443359375, "rewards/margins": 24.57693862915039, "rewards/rejected": -17.283634185791016, "step": 3222 }, { "epoch": 0.806455648692606, "grad_norm": 3.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33249570.90909091, "logits/rejected": -3843030.153846154, "logps/chosen": -325.9200550426136, "logps/rejected": -759.0063100961538, "loss": 0.0213, "rewards/chosen": 7.941747492009943, "rewards/margins": 31.17902870444985, "rewards/rejected": -23.237281212439903, "step": 3223 }, { "epoch": 0.80670586763418, "grad_norm": 6.375, "kl": 14.324531555175781, "learning_rate": 5e-06, "logits/chosen": -40541888.0, "logits/rejected": -40822154.666666664, "logps/chosen": -454.834765625, "logps/rejected": -706.8444552951389, "loss": 0.0206, "rewards/chosen": 9.189556884765626, "rewards/margins": 24.148826090494794, "rewards/rejected": -14.959269205729166, "step": 3224 }, { "epoch": 0.8069560865757538, "grad_norm": 1.96875, "kl": 10.10914421081543, "learning_rate": 5e-06, "logits/chosen": -21419680.0, "logits/rejected": -103418123.63636364, "logps/chosen": -340.9619140625, "logps/rejected": -835.7579012784091, "loss": 0.0594, "rewards/chosen": 9.025763878455528, "rewards/margins": 34.824569808853255, "rewards/rejected": -25.798805930397727, "step": 3225 }, { "epoch": 0.8072063055173276, "grad_norm": 1.765625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35815560.0, "logits/rejected": -59639690.666666664, "logps/chosen": -393.3280843098958, "logps/rejected": -713.4117838541666, "loss": 0.0452, "rewards/chosen": 8.315086364746094, "rewards/margins": 31.43829091389974, "rewards/rejected": -23.123204549153645, "step": 3226 }, { "epoch": 0.8074565244589016, "grad_norm": 6.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44187520.0, "logits/rejected": -34528192.0, "logps/chosen": -325.03604403409093, "logps/rejected": -541.2082707331731, "loss": 0.0506, "rewards/chosen": 7.812973716042259, "rewards/margins": 25.548450870113772, "rewards/rejected": -17.735477154071514, "step": 3227 }, { "epoch": 0.8077067434004754, "grad_norm": 25.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37858638.54545455, "logits/rejected": -39233314.461538464, "logps/chosen": -451.9734552556818, "logps/rejected": -544.1219200721154, "loss": 0.0473, "rewards/chosen": 8.544602134011008, "rewards/margins": 22.521569578797667, "rewards/rejected": -13.97696744478666, "step": 3228 }, { "epoch": 0.8079569623420493, "grad_norm": 21.625, "kl": 6.388509750366211, "learning_rate": 5e-06, "logits/chosen": -32603664.0, "logits/rejected": -58856056.0, "logps/chosen": -378.42742919921875, "logps/rejected": -782.7501220703125, "loss": 0.0972, "rewards/chosen": 6.66088342666626, "rewards/margins": 26.88251543045044, "rewards/rejected": -20.22163200378418, "step": 3229 }, { "epoch": 0.8082071812836231, "grad_norm": 2.28125, "kl": 0.2792040705680847, "learning_rate": 5e-06, "logits/chosen": -82100403.2, "logits/rejected": -54392981.333333336, "logps/chosen": -467.4521484375, "logps/rejected": -704.4173177083334, "loss": 0.0034, "rewards/chosen": 9.284032185872396, "rewards/margins": 30.229310777452255, "rewards/rejected": -20.94527859157986, "step": 3230 }, { "epoch": 0.8084574002251971, "grad_norm": 3.59375, "kl": 10.97339153289795, "learning_rate": 5e-06, "logits/chosen": -56906090.666666664, "logits/rejected": -84931813.33333333, "logps/chosen": -405.3321126302083, "logps/rejected": -620.7721354166666, "loss": 0.0065, "rewards/chosen": 10.468449910481771, "rewards/margins": 29.229356129964195, "rewards/rejected": -18.760906219482422, "step": 3231 }, { "epoch": 0.8087076191667709, "grad_norm": 3.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36412997.81818182, "logits/rejected": -49609107.692307696, "logps/chosen": -453.7958984375, "logps/rejected": -657.4012169471154, "loss": 0.0081, "rewards/chosen": 10.475078235973012, "rewards/margins": 33.40958047079873, "rewards/rejected": -22.93450223482572, "step": 3232 }, { "epoch": 0.8089578381083448, "grad_norm": 3.6875, "kl": 5.184110164642334, "learning_rate": 5e-06, "logits/chosen": -51557554.28571428, "logits/rejected": -25176941.17647059, "logps/chosen": -412.2357700892857, "logps/rejected": -462.0173770680147, "loss": 0.0087, "rewards/chosen": 9.480167933872767, "rewards/margins": 23.669512452197676, "rewards/rejected": -14.189344518324909, "step": 3233 }, { "epoch": 0.8092080570499187, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68717392.0, "logits/rejected": -60180245.333333336, "logps/chosen": -293.19236246744794, "logps/rejected": -691.4656575520834, "loss": 0.0318, "rewards/chosen": 7.548010508219401, "rewards/margins": 28.72453816731771, "rewards/rejected": -21.17652765909831, "step": 3234 }, { "epoch": 0.8094582759914926, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47510276.571428575, "logits/rejected": -36143558.4, "logps/chosen": -468.8828822544643, "logps/rejected": -526.086767578125, "loss": 0.0164, "rewards/chosen": 9.206066676548549, "rewards/margins": 29.325936671665737, "rewards/rejected": -20.119869995117188, "step": 3235 }, { "epoch": 0.8097084949330664, "grad_norm": 6.5, "kl": 1.6692924499511719, "learning_rate": 5e-06, "logits/chosen": -66954352.0, "logits/rejected": -66944880.0, "logps/chosen": -544.189453125, "logps/rejected": -838.9501953125, "loss": 0.0107, "rewards/chosen": 12.539039611816406, "rewards/margins": 32.93532943725586, "rewards/rejected": -20.396289825439453, "step": 3236 }, { "epoch": 0.8099587138746404, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47749657.6, "logits/rejected": -31216786.285714287, "logps/chosen": -496.055712890625, "logps/rejected": -565.6720842633929, "loss": 0.0733, "rewards/chosen": 9.69854736328125, "rewards/margins": 25.811283656529017, "rewards/rejected": -16.112736293247767, "step": 3237 }, { "epoch": 0.8102089328162142, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28576819.2, "logits/rejected": -48359725.71428572, "logps/chosen": -522.9619140625, "logps/rejected": -560.15869140625, "loss": 0.0255, "rewards/chosen": 10.574693298339843, "rewards/margins": 25.84295697893415, "rewards/rejected": -15.268263680594307, "step": 3238 }, { "epoch": 0.810459151757788, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39782400.0, "logits/rejected": -38877149.86666667, "logps/chosen": -459.24110243055554, "logps/rejected": -706.6247395833333, "loss": 0.0672, "rewards/chosen": 7.131050957573785, "rewards/margins": 29.361438327365452, "rewards/rejected": -22.230387369791668, "step": 3239 }, { "epoch": 0.8107093706993619, "grad_norm": 22.375, "kl": 4.354381561279297, "learning_rate": 5e-06, "logits/chosen": -39876340.0, "logits/rejected": -42674240.0, "logps/chosen": -346.433837890625, "logps/rejected": -809.4393310546875, "loss": 0.1068, "rewards/chosen": 6.876967906951904, "rewards/margins": 28.745389461517334, "rewards/rejected": -21.86842155456543, "step": 3240 }, { "epoch": 0.8109595896409358, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50067578.666666664, "logits/rejected": -29203608.0, "logps/chosen": -438.6898600260417, "logps/rejected": -556.4170735677084, "loss": 0.0178, "rewards/chosen": 10.146671295166016, "rewards/margins": 25.847323099772133, "rewards/rejected": -15.70065180460612, "step": 3241 }, { "epoch": 0.8112098085825097, "grad_norm": 3.828125, "kl": 1.0155551433563232, "learning_rate": 5e-06, "logits/chosen": -26810105.6, "logits/rejected": -69103405.71428572, "logps/chosen": -350.107958984375, "logps/rejected": -680.0505719866071, "loss": 0.0112, "rewards/chosen": 6.955030822753907, "rewards/margins": 29.35159367152623, "rewards/rejected": -22.396562848772323, "step": 3242 }, { "epoch": 0.8114600275240835, "grad_norm": 18.125, "kl": 3.195341110229492, "learning_rate": 5e-06, "logits/chosen": -36207146.666666664, "logits/rejected": -8706114.666666666, "logps/chosen": -350.7351888020833, "logps/rejected": -613.4280598958334, "loss": 0.102, "rewards/chosen": 7.70967165629069, "rewards/margins": 24.492033004760742, "rewards/rejected": -16.78236134847005, "step": 3243 }, { "epoch": 0.8117102464656575, "grad_norm": 1.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68851750.4, "logits/rejected": -48645083.428571425, "logps/chosen": -385.51201171875, "logps/rejected": -648.0101143973214, "loss": 0.0138, "rewards/chosen": 8.814155578613281, "rewards/margins": 26.67057364327567, "rewards/rejected": -17.85641806466239, "step": 3244 }, { "epoch": 0.8119604654072313, "grad_norm": 10.6875, "kl": 2.8299102783203125, "learning_rate": 5e-06, "logits/chosen": 2463671.272727273, "logits/rejected": -59553073.23076923, "logps/chosen": -449.62819602272725, "logps/rejected": -575.9292367788462, "loss": 0.0135, "rewards/chosen": 8.949037725275213, "rewards/margins": 25.196340440870166, "rewards/rejected": -16.24730271559495, "step": 3245 }, { "epoch": 0.8122106843488052, "grad_norm": 2.15625, "kl": 2.45910906791687, "learning_rate": 5e-06, "logits/chosen": -55036659.2, "logits/rejected": -44434855.11111111, "logps/chosen": -408.84723307291665, "logps/rejected": -646.9089626736111, "loss": 0.0121, "rewards/chosen": 9.530467732747395, "rewards/margins": 30.797013346354163, "rewards/rejected": -21.26654561360677, "step": 3246 }, { "epoch": 0.8124609032903791, "grad_norm": 4.46875, "kl": 20.378314971923828, "learning_rate": 5e-06, "logits/chosen": -44641394.28571428, "logits/rejected": -48323488.0, "logps/chosen": -497.87489536830356, "logps/rejected": -546.97685546875, "loss": 0.0496, "rewards/chosen": 10.162787301199776, "rewards/margins": 26.279068429129467, "rewards/rejected": -16.11628112792969, "step": 3247 }, { "epoch": 0.812711122231953, "grad_norm": 6.0, "kl": 4.379368782043457, "learning_rate": 5e-06, "logits/chosen": -47339720.72727273, "logits/rejected": -48503212.307692304, "logps/chosen": -505.10635653409093, "logps/rejected": -551.5313251201923, "loss": 0.036, "rewards/chosen": 10.930896412242543, "rewards/margins": 23.613938498330285, "rewards/rejected": -12.68304208608774, "step": 3248 }, { "epoch": 0.8129613411735268, "grad_norm": 7.21875, "kl": 10.234516143798828, "learning_rate": 5e-06, "logits/chosen": -39130976.0, "logits/rejected": -47825973.333333336, "logps/chosen": -356.4173990885417, "logps/rejected": -462.5713704427083, "loss": 0.0138, "rewards/chosen": 9.284433364868164, "rewards/margins": 23.537143071492515, "rewards/rejected": -14.25270970662435, "step": 3249 }, { "epoch": 0.8132115601151008, "grad_norm": 1.09375, "kl": 1.483258605003357, "learning_rate": 5e-06, "logits/chosen": -30850737.777777776, "logits/rejected": -45101401.6, "logps/chosen": -389.65654839409723, "logps/rejected": -492.66009114583335, "loss": 0.0119, "rewards/chosen": 9.28047349717882, "rewards/margins": 27.335340033637152, "rewards/rejected": -18.054866536458334, "step": 3250 }, { "epoch": 0.8134617790566746, "grad_norm": 11.0625, "kl": 4.855903148651123, "learning_rate": 5e-06, "logits/chosen": -47246967.46666667, "logits/rejected": -46602801.777777776, "logps/chosen": -375.49485677083334, "logps/rejected": -601.0052083333334, "loss": 0.0163, "rewards/chosen": 8.45121815999349, "rewards/margins": 25.867638990614147, "rewards/rejected": -17.41642083062066, "step": 3251 }, { "epoch": 0.8137119979982484, "grad_norm": 1.1484375, "kl": 1.8343722820281982, "learning_rate": 5e-06, "logits/chosen": -35914066.666666664, "logits/rejected": -45876213.333333336, "logps/chosen": -307.6388753255208, "logps/rejected": -483.3712158203125, "loss": 0.0328, "rewards/chosen": 7.840925216674805, "rewards/margins": 22.493195215861, "rewards/rejected": -14.652269999186197, "step": 3252 }, { "epoch": 0.8139622169398223, "grad_norm": 2.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61106872.88888889, "logits/rejected": -53231906.13333333, "logps/chosen": -486.6623806423611, "logps/rejected": -607.91875, "loss": 0.0422, "rewards/chosen": 11.241613599989149, "rewards/margins": 32.34177432590061, "rewards/rejected": -21.100160725911458, "step": 3253 }, { "epoch": 0.8142124358813962, "grad_norm": 1.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29107756.8, "logits/rejected": -27536690.285714287, "logps/chosen": -404.698046875, "logps/rejected": -659.7548828125, "loss": 0.0172, "rewards/chosen": 9.935633087158203, "rewards/margins": 30.259422411237445, "rewards/rejected": -20.32378932407924, "step": 3254 }, { "epoch": 0.8144626548229701, "grad_norm": 3.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32092966.0, "logits/rejected": -60488304.0, "logps/chosen": -271.8189697265625, "logps/rejected": -721.5950927734375, "loss": 0.036, "rewards/chosen": 7.4692535400390625, "rewards/margins": 24.52656364440918, "rewards/rejected": -17.057310104370117, "step": 3255 }, { "epoch": 0.8147128737645439, "grad_norm": 1.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39123392.0, "logits/rejected": -72306713.6, "logps/chosen": -341.19130161830356, "logps/rejected": -825.44111328125, "loss": 0.0024, "rewards/chosen": 9.079664502825056, "rewards/margins": 33.07671345302037, "rewards/rejected": -23.997048950195314, "step": 3256 }, { "epoch": 0.8149630927061179, "grad_norm": 5.625, "kl": 0.46414631605148315, "learning_rate": 5e-06, "logits/chosen": -32562400.0, "logits/rejected": -11155862.545454545, "logps/chosen": -305.12710336538464, "logps/rejected": -573.8989701704545, "loss": 0.0916, "rewards/chosen": 7.564332815317007, "rewards/margins": 22.24373039832482, "rewards/rejected": -14.679397583007812, "step": 3257 }, { "epoch": 0.8152133116476917, "grad_norm": 5.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70715847.1111111, "logits/rejected": -49782912.0, "logps/chosen": -277.22422960069446, "logps/rejected": -589.951171875, "loss": 0.0343, "rewards/chosen": 7.242077297634548, "rewards/margins": 25.8834469265408, "rewards/rejected": -18.64136962890625, "step": 3258 }, { "epoch": 0.8154635305892656, "grad_norm": 9.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32437645.333333332, "logits/rejected": -38410805.333333336, "logps/chosen": -294.24135335286456, "logps/rejected": -404.7406819661458, "loss": 0.0316, "rewards/chosen": 8.5135129292806, "rewards/margins": 19.373694101969402, "rewards/rejected": -10.860181172688803, "step": 3259 }, { "epoch": 0.8157137495308395, "grad_norm": 2.296875, "kl": 7.911231517791748, "learning_rate": 5e-06, "logits/chosen": -29033680.0, "logits/rejected": -51246264.0, "logps/chosen": -346.0654296875, "logps/rejected": -705.3738403320312, "loss": 0.0116, "rewards/chosen": 8.201961517333984, "rewards/margins": 26.028879165649414, "rewards/rejected": -17.82691764831543, "step": 3260 }, { "epoch": 0.8159639684724134, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37950232.88888889, "logits/rejected": -31082359.466666665, "logps/chosen": -302.5338541666667, "logps/rejected": -454.92652994791666, "loss": 0.0225, "rewards/chosen": 8.03598361545139, "rewards/margins": 22.091924370659722, "rewards/rejected": -14.055940755208333, "step": 3261 }, { "epoch": 0.8162141874139872, "grad_norm": 8.1875, "kl": 5.003114223480225, "learning_rate": 5e-06, "logits/chosen": -43110766.54545455, "logits/rejected": -38709777.23076923, "logps/chosen": -385.88778409090907, "logps/rejected": -631.2416616586538, "loss": 0.0514, "rewards/chosen": 9.170000943270596, "rewards/margins": 26.297740482783816, "rewards/rejected": -17.12773953951322, "step": 3262 }, { "epoch": 0.8164644063555612, "grad_norm": 0.91796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30791382.85714286, "logits/rejected": -36504538.35294118, "logps/chosen": -368.93446568080356, "logps/rejected": -720.671875, "loss": 0.0024, "rewards/chosen": 9.268190656389509, "rewards/margins": 32.190150028517266, "rewards/rejected": -22.92195937212776, "step": 3263 }, { "epoch": 0.816714625297135, "grad_norm": 5.65625, "kl": 1.8144557476043701, "learning_rate": 5e-06, "logits/chosen": -52122326.85714286, "logits/rejected": -49306038.4, "logps/chosen": -364.64334542410717, "logps/rejected": -656.7091796875, "loss": 0.0436, "rewards/chosen": 8.470391954694476, "rewards/margins": 24.853457750592916, "rewards/rejected": -16.38306579589844, "step": 3264 }, { "epoch": 0.8169648442387089, "grad_norm": 2.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40355571.692307696, "logits/rejected": -43003421.09090909, "logps/chosen": -393.0487530048077, "logps/rejected": -616.2797407670455, "loss": 0.0182, "rewards/chosen": 9.44766822228065, "rewards/margins": 26.41180313217056, "rewards/rejected": -16.964134909889914, "step": 3265 }, { "epoch": 0.8172150631802827, "grad_norm": 8.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57840475.428571425, "logits/rejected": -46496044.8, "logps/chosen": -360.45438058035717, "logps/rejected": -669.81591796875, "loss": 0.0499, "rewards/chosen": 7.632750374930246, "rewards/margins": 25.15670667375837, "rewards/rejected": -17.523956298828125, "step": 3266 }, { "epoch": 0.8174652821218567, "grad_norm": 3.125, "kl": 10.741095542907715, "learning_rate": 5e-06, "logits/chosen": -46431884.8, "logits/rejected": -40464809.14285714, "logps/chosen": -338.8407958984375, "logps/rejected": -607.5833565848214, "loss": 0.0721, "rewards/chosen": 6.633848571777344, "rewards/margins": 22.60595877511161, "rewards/rejected": -15.972110203334264, "step": 3267 }, { "epoch": 0.8177155010634305, "grad_norm": 1.9453125, "kl": 2.784249782562256, "learning_rate": 5e-06, "logits/chosen": -24806392.0, "logits/rejected": -47479482.666666664, "logps/chosen": -222.7137451171875, "logps/rejected": -727.4908854166666, "loss": 0.0389, "rewards/chosen": 7.228570938110352, "rewards/margins": 24.578962326049805, "rewards/rejected": -17.350391387939453, "step": 3268 }, { "epoch": 0.8179657200050043, "grad_norm": 15.4375, "kl": 29.407833099365234, "learning_rate": 5e-06, "logits/chosen": -63059456.0, "logits/rejected": -56472234.666666664, "logps/chosen": -436.94867621527777, "logps/rejected": -604.643798828125, "loss": 0.1904, "rewards/chosen": 9.55215115017361, "rewards/margins": 27.006140814887154, "rewards/rejected": -17.453989664713543, "step": 3269 }, { "epoch": 0.8182159389465783, "grad_norm": 6.90625, "kl": 8.866788864135742, "learning_rate": 5e-06, "logits/chosen": -34011643.733333334, "logits/rejected": -38723395.55555555, "logps/chosen": -360.43776041666666, "logps/rejected": -523.2466362847222, "loss": 0.0365, "rewards/chosen": 8.127863566080729, "rewards/margins": 21.33568149142795, "rewards/rejected": -13.207817925347221, "step": 3270 }, { "epoch": 0.8184661578881521, "grad_norm": 8.4375, "kl": 10.106219291687012, "learning_rate": 5e-06, "logits/chosen": -27812226.285714287, "logits/rejected": -75996083.2, "logps/chosen": -356.5025111607143, "logps/rejected": -681.22666015625, "loss": 0.0337, "rewards/chosen": 8.201512472970146, "rewards/margins": 29.149597494942803, "rewards/rejected": -20.948085021972656, "step": 3271 }, { "epoch": 0.818716376829726, "grad_norm": 7.375, "kl": 9.702817916870117, "learning_rate": 5e-06, "logits/chosen": -10994260.57142857, "logits/rejected": -51708723.2, "logps/chosen": -410.0978306361607, "logps/rejected": -899.335546875, "loss": 0.054, "rewards/chosen": 9.463233947753906, "rewards/margins": 26.436311340332033, "rewards/rejected": -16.973077392578126, "step": 3272 }, { "epoch": 0.8189665957712999, "grad_norm": 10.5625, "kl": 5.2907633781433105, "learning_rate": 5e-06, "logits/chosen": -16715988.363636363, "logits/rejected": -60775227.07692308, "logps/chosen": -357.24092240767044, "logps/rejected": -624.3419846754807, "loss": 0.0155, "rewards/chosen": 8.487605701793324, "rewards/margins": 27.185264534049935, "rewards/rejected": -18.69765883225661, "step": 3273 }, { "epoch": 0.8192168147128738, "grad_norm": 3.03125, "kl": 6.142941951751709, "learning_rate": 5e-06, "logits/chosen": -36122069.333333336, "logits/rejected": -35315765.333333336, "logps/chosen": -356.3988037109375, "logps/rejected": -631.3761393229166, "loss": 0.0473, "rewards/chosen": 9.049164454142252, "rewards/margins": 24.87175178527832, "rewards/rejected": -15.822587331136068, "step": 3274 }, { "epoch": 0.8194670336544476, "grad_norm": 7.59375, "kl": 9.817859649658203, "learning_rate": 5e-06, "logits/chosen": -32533915.733333334, "logits/rejected": -47578282.666666664, "logps/chosen": -504.8352864583333, "logps/rejected": -815.2511393229166, "loss": 0.0196, "rewards/chosen": 9.454236857096355, "rewards/margins": 27.86946072048611, "rewards/rejected": -18.415223863389755, "step": 3275 }, { "epoch": 0.8197172525960216, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24456814.0, "logits/rejected": -62693396.0, "logps/chosen": -240.20492553710938, "logps/rejected": -638.2871704101562, "loss": 0.0152, "rewards/chosen": 6.531996726989746, "rewards/margins": 21.37136173248291, "rewards/rejected": -14.839365005493164, "step": 3276 }, { "epoch": 0.8199674715375954, "grad_norm": 6.34375, "kl": 15.615396499633789, "learning_rate": 5e-06, "logits/chosen": -49494661.333333336, "logits/rejected": -47888373.333333336, "logps/chosen": -528.053955078125, "logps/rejected": -580.2891438802084, "loss": 0.0236, "rewards/chosen": 10.40066655476888, "rewards/margins": 24.22950871785482, "rewards/rejected": -13.828842163085938, "step": 3277 }, { "epoch": 0.8202176904791693, "grad_norm": 2.90625, "kl": 7.928256511688232, "learning_rate": 5e-06, "logits/chosen": -38740041.84615385, "logits/rejected": -52866065.45454545, "logps/chosen": -381.10501802884613, "logps/rejected": -514.5059925426136, "loss": 0.0588, "rewards/chosen": 10.038317166841948, "rewards/margins": 23.389040433443512, "rewards/rejected": -13.350723266601562, "step": 3278 }, { "epoch": 0.8204679094207431, "grad_norm": 3.0, "kl": 17.93801498413086, "learning_rate": 5e-06, "logits/chosen": -42899246.93333333, "logits/rejected": -60386901.333333336, "logps/chosen": -420.3256510416667, "logps/rejected": -681.9513888888889, "loss": 0.0409, "rewards/chosen": 11.525728352864583, "rewards/margins": 28.91897447374132, "rewards/rejected": -17.393246120876736, "step": 3279 }, { "epoch": 0.8207181283623171, "grad_norm": 5.0, "kl": 15.330894470214844, "learning_rate": 5e-06, "logits/chosen": -28183748.923076924, "logits/rejected": -62560034.90909091, "logps/chosen": -404.55464993990387, "logps/rejected": -541.0329367897727, "loss": 0.0603, "rewards/chosen": 10.17655005821815, "rewards/margins": 20.944676112461757, "rewards/rejected": -10.768126054243607, "step": 3280 }, { "epoch": 0.8209683473038909, "grad_norm": 8.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33816252.8, "logits/rejected": -28825092.57142857, "logps/chosen": -371.99404296875, "logps/rejected": -441.69032505580356, "loss": 0.0787, "rewards/chosen": 8.090084838867188, "rewards/margins": 18.765856279645647, "rewards/rejected": -10.67577144077846, "step": 3281 }, { "epoch": 0.8212185662454647, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39737920.0, "logits/rejected": -31501730.133333333, "logps/chosen": -429.51475694444446, "logps/rejected": -684.5255859375, "loss": 0.0285, "rewards/chosen": 11.348898145887587, "rewards/margins": 28.945937940809465, "rewards/rejected": -17.597039794921876, "step": 3282 }, { "epoch": 0.8214687851870387, "grad_norm": 2.46875, "kl": 8.799793243408203, "learning_rate": 5e-06, "logits/chosen": -38584037.81818182, "logits/rejected": -32298087.384615384, "logps/chosen": -397.57177734375, "logps/rejected": -482.0186298076923, "loss": 0.0452, "rewards/chosen": 10.044185985218395, "rewards/margins": 20.596177374566352, "rewards/rejected": -10.551991389347958, "step": 3283 }, { "epoch": 0.8217190041286125, "grad_norm": 17.0, "kl": 1.331412672996521, "learning_rate": 5e-06, "logits/chosen": -70933166.54545455, "logits/rejected": -29697784.615384616, "logps/chosen": -362.0716441761364, "logps/rejected": -517.3513371394231, "loss": 0.0795, "rewards/chosen": 7.550413651899858, "rewards/margins": 23.08391491016308, "rewards/rejected": -15.533501258263222, "step": 3284 }, { "epoch": 0.8219692230701864, "grad_norm": 1.6796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29337613.714285713, "logits/rejected": 10111699.2, "logps/chosen": -303.90938895089283, "logps/rejected": -407.0588134765625, "loss": 0.0326, "rewards/chosen": 7.239796229771206, "rewards/margins": 21.7516355242048, "rewards/rejected": -14.511839294433594, "step": 3285 }, { "epoch": 0.8222194420117603, "grad_norm": 0.1455078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43280188.0, "logits/rejected": -75093040.0, "logps/chosen": -348.45892333984375, "logps/rejected": -879.1758422851562, "loss": 0.0004, "rewards/chosen": 8.547391891479492, "rewards/margins": 35.47361946105957, "rewards/rejected": -26.926227569580078, "step": 3286 }, { "epoch": 0.8224696609533342, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34922350.76923077, "logits/rejected": -64050624.0, "logps/chosen": -325.1035907451923, "logps/rejected": -515.2407670454545, "loss": 0.0304, "rewards/chosen": 9.146081190842848, "rewards/margins": 25.368271354195123, "rewards/rejected": -16.222190163352273, "step": 3287 }, { "epoch": 0.822719879894908, "grad_norm": 13.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53542378.666666664, "logits/rejected": -35869512.53333333, "logps/chosen": -330.1077473958333, "logps/rejected": -633.656640625, "loss": 0.0291, "rewards/chosen": 9.096694946289062, "rewards/margins": 25.020217895507812, "rewards/rejected": -15.92352294921875, "step": 3288 }, { "epoch": 0.8229700988364819, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64330201.6, "logits/rejected": -55948329.14285714, "logps/chosen": -296.5574951171875, "logps/rejected": -714.0093470982143, "loss": 0.0255, "rewards/chosen": 7.901792907714844, "rewards/margins": 29.353411647251676, "rewards/rejected": -21.45161873953683, "step": 3289 }, { "epoch": 0.8232203177780558, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37806274.90909091, "logits/rejected": -14778756.923076924, "logps/chosen": -241.27183948863637, "logps/rejected": -735.8997896634615, "loss": 0.0723, "rewards/chosen": 7.361351013183594, "rewards/margins": 26.64551778940054, "rewards/rejected": -19.284166776216946, "step": 3290 }, { "epoch": 0.8234705367196297, "grad_norm": 4.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31221064.533333335, "logits/rejected": -41358069.333333336, "logps/chosen": -357.7890950520833, "logps/rejected": -595.1922200520834, "loss": 0.0351, "rewards/chosen": 8.875649007161458, "rewards/margins": 27.893011813693576, "rewards/rejected": -19.01736280653212, "step": 3291 }, { "epoch": 0.8237207556612035, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43049591.46666667, "logits/rejected": -34886833.777777776, "logps/chosen": -340.0924479166667, "logps/rejected": -665.4007161458334, "loss": 0.0346, "rewards/chosen": 8.422756958007813, "rewards/margins": 25.453506808810765, "rewards/rejected": -17.030749850802952, "step": 3292 }, { "epoch": 0.8239709746027775, "grad_norm": 8.25, "kl": 2.592266082763672, "learning_rate": 5e-06, "logits/chosen": -44962308.266666666, "logits/rejected": 31386215.111111112, "logps/chosen": -428.7900390625, "logps/rejected": -777.6287977430555, "loss": 0.0242, "rewards/chosen": 9.709370930989584, "rewards/margins": 31.827592637803818, "rewards/rejected": -22.118221706814236, "step": 3293 }, { "epoch": 0.8242211935443513, "grad_norm": 12.75, "kl": 2.3202362060546875, "learning_rate": 5e-06, "logits/chosen": 11429590.857142856, "logits/rejected": -56721280.0, "logps/chosen": -496.2706821986607, "logps/rejected": -655.50537109375, "loss": 0.0321, "rewards/chosen": 9.576468331473214, "rewards/margins": 31.49947248186384, "rewards/rejected": -21.923004150390625, "step": 3294 }, { "epoch": 0.8244714124859251, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28976835.2, "logits/rejected": -50273275.428571425, "logps/chosen": -441.313427734375, "logps/rejected": -646.2372349330357, "loss": 0.0226, "rewards/chosen": 8.246953582763672, "rewards/margins": 28.33501205444336, "rewards/rejected": -20.088058471679688, "step": 3295 }, { "epoch": 0.8247216314274991, "grad_norm": 22.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50227406.76923077, "logits/rejected": -44265649.45454545, "logps/chosen": -351.11910306490387, "logps/rejected": -732.3539595170455, "loss": 0.1026, "rewards/chosen": 6.3480072021484375, "rewards/margins": 28.694734053178266, "rewards/rejected": -22.34672685102983, "step": 3296 }, { "epoch": 0.824971850369073, "grad_norm": 10.625, "kl": 1.0299034118652344, "learning_rate": 5e-06, "logits/chosen": -57792118.85714286, "logits/rejected": -45347328.0, "logps/chosen": -379.41531808035717, "logps/rejected": -751.85859375, "loss": 0.0143, "rewards/chosen": 9.028009687151227, "rewards/margins": 27.01486576625279, "rewards/rejected": -17.986856079101564, "step": 3297 }, { "epoch": 0.8252220693106468, "grad_norm": 1.1015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59544060.0, "logits/rejected": -50084268.8, "logps/chosen": -387.0478515625, "logps/rejected": -575.128271484375, "loss": 0.0021, "rewards/chosen": 8.25259780883789, "rewards/margins": 26.847887420654295, "rewards/rejected": -18.595289611816405, "step": 3298 }, { "epoch": 0.8254722882522207, "grad_norm": 7.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26314806.4, "logits/rejected": -52175844.571428575, "logps/chosen": -310.2552490234375, "logps/rejected": -691.3191266741071, "loss": 0.0562, "rewards/chosen": 7.238521575927734, "rewards/margins": 25.701719556535995, "rewards/rejected": -18.46319798060826, "step": 3299 }, { "epoch": 0.8257225071937946, "grad_norm": 0.63671875, "kl": 3.3166141510009766, "learning_rate": 5e-06, "logits/chosen": -41219080.0, "logits/rejected": -53791388.0, "logps/chosen": -512.0276489257812, "logps/rejected": -684.2368774414062, "loss": 0.0149, "rewards/chosen": 11.076394081115723, "rewards/margins": 32.61976146697998, "rewards/rejected": -21.543367385864258, "step": 3300 }, { "epoch": 0.8259727261353684, "grad_norm": 1.46875, "kl": 4.814750671386719, "learning_rate": 5e-06, "logits/chosen": -52724790.15384615, "logits/rejected": -34189489.45454545, "logps/chosen": -457.7012469951923, "logps/rejected": -797.9524147727273, "loss": 0.0025, "rewards/chosen": 10.689822857196514, "rewards/margins": 38.36709946852464, "rewards/rejected": -27.677276611328125, "step": 3301 }, { "epoch": 0.8262229450769423, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53446202.18181818, "logits/rejected": -63393516.307692304, "logps/chosen": -308.81716086647725, "logps/rejected": -732.3592998798077, "loss": 0.0196, "rewards/chosen": 8.789416920055043, "rewards/margins": 35.041717475944466, "rewards/rejected": -26.252300555889423, "step": 3302 }, { "epoch": 0.8264731640185162, "grad_norm": 0.65234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34691204.92307692, "logits/rejected": -34995328.0, "logps/chosen": -400.47408353365387, "logps/rejected": -773.9894353693181, "loss": 0.0076, "rewards/chosen": 9.53081805889423, "rewards/margins": 32.0947357391144, "rewards/rejected": -22.56391768022017, "step": 3303 }, { "epoch": 0.8267233829600901, "grad_norm": 7.46875, "kl": 1.9938793182373047, "learning_rate": 5e-06, "logits/chosen": -46698871.46666667, "logits/rejected": -35383630.222222224, "logps/chosen": -388.05442708333334, "logps/rejected": -416.1911349826389, "loss": 0.0413, "rewards/chosen": 9.964243570963541, "rewards/margins": 23.497785780164932, "rewards/rejected": -13.53354220920139, "step": 3304 }, { "epoch": 0.8269736019016639, "grad_norm": 10.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42376290.461538464, "logits/rejected": -54592482.90909091, "logps/chosen": -307.40576171875, "logps/rejected": -571.3518732244319, "loss": 0.0621, "rewards/chosen": 7.484063955453726, "rewards/margins": 26.982453459626313, "rewards/rejected": -19.498389504172586, "step": 3305 }, { "epoch": 0.8272238208432379, "grad_norm": 3.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27703528.727272727, "logits/rejected": -51822247.384615384, "logps/chosen": -380.71830610795456, "logps/rejected": -585.3508864182693, "loss": 0.005, "rewards/chosen": 7.5115273215553975, "rewards/margins": 25.16945696210528, "rewards/rejected": -17.65792964054988, "step": 3306 }, { "epoch": 0.8274740397848117, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22414901.333333332, "logits/rejected": -39540861.333333336, "logps/chosen": -434.964599609375, "logps/rejected": -518.8721923828125, "loss": 0.0132, "rewards/chosen": 9.05865224202474, "rewards/margins": 22.37799835205078, "rewards/rejected": -13.319346110026041, "step": 3307 }, { "epoch": 0.8277242587263856, "grad_norm": 23.0, "kl": 36.345985412597656, "learning_rate": 5e-06, "logits/chosen": -79350621.86666666, "logits/rejected": -21845425.777777776, "logps/chosen": -441.016796875, "logps/rejected": -597.7423502604166, "loss": 0.14, "rewards/chosen": 9.299156697591146, "rewards/margins": 24.28133273654514, "rewards/rejected": -14.982176038953993, "step": 3308 }, { "epoch": 0.8279744776679595, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34273088.0, "logits/rejected": -41227766.85714286, "logps/chosen": -478.27998046875, "logps/rejected": -744.6792689732143, "loss": 0.0096, "rewards/chosen": 9.446966552734375, "rewards/margins": 31.937083217075894, "rewards/rejected": -22.490116664341517, "step": 3309 }, { "epoch": 0.8282246966095334, "grad_norm": 1.4375, "kl": 1.5874879360198975, "learning_rate": 5e-06, "logits/chosen": -46285351.384615384, "logits/rejected": -42127645.09090909, "logps/chosen": -352.20714393028845, "logps/rejected": -598.8568892045455, "loss": 0.0289, "rewards/chosen": 7.323147113506611, "rewards/margins": 25.105370901681326, "rewards/rejected": -17.782223788174715, "step": 3310 }, { "epoch": 0.8284749155511072, "grad_norm": 5.59375, "kl": 3.4230384826660156, "learning_rate": 5e-06, "logits/chosen": -36454779.07692308, "logits/rejected": -29546030.545454547, "logps/chosen": -310.10146859975964, "logps/rejected": -498.4625799005682, "loss": 0.0701, "rewards/chosen": 8.706908005934496, "rewards/margins": 23.89776632669089, "rewards/rejected": -15.190858320756393, "step": 3311 }, { "epoch": 0.8287251344926811, "grad_norm": 5.875, "kl": 10.114866256713867, "learning_rate": 5e-06, "logits/chosen": -43295266.13333333, "logits/rejected": -69831509.33333333, "logps/chosen": -410.04254557291665, "logps/rejected": -695.9557291666666, "loss": 0.0157, "rewards/chosen": 9.840065511067708, "rewards/margins": 24.528421698676215, "rewards/rejected": -14.688356187608507, "step": 3312 }, { "epoch": 0.828975353434255, "grad_norm": 8.3125, "kl": 2.105194091796875, "learning_rate": 5e-06, "logits/chosen": -23932745.14285714, "logits/rejected": -52005756.8, "logps/chosen": -324.17822265625, "logps/rejected": -902.579296875, "loss": 0.0331, "rewards/chosen": 8.256120954241071, "rewards/margins": 35.128264508928574, "rewards/rejected": -26.8721435546875, "step": 3313 }, { "epoch": 0.8292255723758288, "grad_norm": 2.265625, "kl": 15.011377334594727, "learning_rate": 5e-06, "logits/chosen": -69736925.86666666, "logits/rejected": -41390293.333333336, "logps/chosen": -452.0376302083333, "logps/rejected": -798.68408203125, "loss": 0.0918, "rewards/chosen": 10.94212137858073, "rewards/margins": 34.27952033148871, "rewards/rejected": -23.337398952907986, "step": 3314 }, { "epoch": 0.8294757913174027, "grad_norm": 7.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24229200.0, "logits/rejected": -39475280.0, "logps/chosen": -267.7308756510417, "logps/rejected": -497.00390625, "loss": 0.0477, "rewards/chosen": 5.999849955240886, "rewards/margins": 22.282530466715496, "rewards/rejected": -16.28268051147461, "step": 3315 }, { "epoch": 0.8297260102589766, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33809752.0, "logits/rejected": 30859330.0, "logps/chosen": -240.4337921142578, "logps/rejected": -716.928466796875, "loss": 0.0553, "rewards/chosen": 6.703101634979248, "rewards/margins": 23.37804365158081, "rewards/rejected": -16.674942016601562, "step": 3316 }, { "epoch": 0.8299762292005505, "grad_norm": 9.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43669309.333333336, "logits/rejected": -34923746.666666664, "logps/chosen": -374.477294921875, "logps/rejected": -609.7667643229166, "loss": 0.0178, "rewards/chosen": 10.217585881551107, "rewards/margins": 29.231819788614906, "rewards/rejected": -19.0142339070638, "step": 3317 }, { "epoch": 0.8302264481421243, "grad_norm": 24.375, "kl": 31.539608001708984, "learning_rate": 5e-06, "logits/chosen": -59401009.777777776, "logits/rejected": -37880693.333333336, "logps/chosen": -515.4853515625, "logps/rejected": -567.8069661458334, "loss": 0.1098, "rewards/chosen": 11.087861802842882, "rewards/margins": 32.558865017361114, "rewards/rejected": -21.47100321451823, "step": 3318 }, { "epoch": 0.8304766670836983, "grad_norm": 7.75, "kl": 1.0870425701141357, "learning_rate": 5e-06, "logits/chosen": -64401036.8, "logits/rejected": -10171996.57142857, "logps/chosen": -409.0129150390625, "logps/rejected": -568.3106863839286, "loss": 0.016, "rewards/chosen": 10.63565444946289, "rewards/margins": 27.179124559674943, "rewards/rejected": -16.543470110212052, "step": 3319 }, { "epoch": 0.8307268860252721, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43862707.2, "logits/rejected": -41969664.0, "logps/chosen": -347.829443359375, "logps/rejected": -663.58984375, "loss": 0.0341, "rewards/chosen": 8.182230377197266, "rewards/margins": 23.074542563302177, "rewards/rejected": -14.892312186104911, "step": 3320 }, { "epoch": 0.830977104966846, "grad_norm": 7.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57621382.4, "logits/rejected": -25528660.57142857, "logps/chosen": -497.491796875, "logps/rejected": -515.17041015625, "loss": 0.007, "rewards/chosen": 10.971384429931641, "rewards/margins": 25.543521445138115, "rewards/rejected": -14.572137015206474, "step": 3321 }, { "epoch": 0.8312273239084199, "grad_norm": 0.019287109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26781942.153846152, "logits/rejected": -42990711.27272727, "logps/chosen": -468.1041917067308, "logps/rejected": -740.1134588068181, "loss": 0.0, "rewards/chosen": 12.621317936823917, "rewards/margins": 37.125490522051194, "rewards/rejected": -24.504172585227273, "step": 3322 }, { "epoch": 0.8314775428499938, "grad_norm": 20.375, "kl": 4.926285743713379, "learning_rate": 5e-06, "logits/chosen": -40254930.28571428, "logits/rejected": -41583564.8, "logps/chosen": -410.936767578125, "logps/rejected": -550.081103515625, "loss": 0.049, "rewards/chosen": 8.16054698399135, "rewards/margins": 24.419534410749165, "rewards/rejected": -16.258987426757812, "step": 3323 }, { "epoch": 0.8317277617915676, "grad_norm": 6.09375, "kl": 16.31495475769043, "learning_rate": 5e-06, "logits/chosen": -49415808.0, "logits/rejected": -54033913.6, "logps/chosen": -516.3753836495536, "logps/rejected": -743.772509765625, "loss": 0.055, "rewards/chosen": 9.727398463657924, "rewards/margins": 27.271038600376674, "rewards/rejected": -17.54364013671875, "step": 3324 }, { "epoch": 0.8319779807331416, "grad_norm": 3.15625, "kl": 0.9522311091423035, "learning_rate": 5e-06, "logits/chosen": -35586806.85714286, "logits/rejected": -50276800.0, "logps/chosen": -338.73228236607144, "logps/rejected": -652.16865234375, "loss": 0.0332, "rewards/chosen": 9.390834263392858, "rewards/margins": 26.146693638392858, "rewards/rejected": -16.755859375, "step": 3325 }, { "epoch": 0.8322281996747154, "grad_norm": 1.6953125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37868125.09090909, "logits/rejected": -54182272.0, "logps/chosen": -447.33274147727275, "logps/rejected": -841.9423828125, "loss": 0.0197, "rewards/chosen": 10.815548983487217, "rewards/margins": 32.36075255920837, "rewards/rejected": -21.545203575721153, "step": 3326 }, { "epoch": 0.8324784186162892, "grad_norm": 10.1875, "kl": 13.414175033569336, "learning_rate": 5e-06, "logits/chosen": -20029216.0, "logits/rejected": -37251008.0, "logps/chosen": -477.2794494628906, "logps/rejected": -511.9940185546875, "loss": 0.0664, "rewards/chosen": 10.704218864440918, "rewards/margins": 22.965049743652344, "rewards/rejected": -12.260830879211426, "step": 3327 }, { "epoch": 0.8327286375578631, "grad_norm": 1.7421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -9723276.307692308, "logits/rejected": -44815924.36363637, "logps/chosen": -329.5674579326923, "logps/rejected": -628.5750177556819, "loss": 0.0366, "rewards/chosen": 7.459197998046875, "rewards/margins": 20.49805519797585, "rewards/rejected": -13.038857199928977, "step": 3328 }, { "epoch": 0.832978856499437, "grad_norm": 2.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48634698.666666664, "logits/rejected": -43427450.666666664, "logps/chosen": -320.67331949869794, "logps/rejected": -709.500244140625, "loss": 0.0227, "rewards/chosen": 8.579489390055338, "rewards/margins": 27.103089650472008, "rewards/rejected": -18.523600260416668, "step": 3329 }, { "epoch": 0.8332290754410109, "grad_norm": 0.66796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47702697.6, "logits/rejected": -47301728.0, "logps/chosen": -344.5995849609375, "logps/rejected": -777.52490234375, "loss": 0.0077, "rewards/chosen": 7.826660919189453, "rewards/margins": 28.068726348876954, "rewards/rejected": -20.2420654296875, "step": 3330 }, { "epoch": 0.8334792943825847, "grad_norm": 10.75, "kl": 17.64159393310547, "learning_rate": 5e-06, "logits/chosen": -31657228.8, "logits/rejected": -32322094.222222224, "logps/chosen": -365.816015625, "logps/rejected": -521.9873046875, "loss": 0.0778, "rewards/chosen": 9.595201619466145, "rewards/margins": 22.148414103190106, "rewards/rejected": -12.553212483723959, "step": 3331 }, { "epoch": 0.8337295133241587, "grad_norm": 5.46875, "kl": 1.3476537466049194, "learning_rate": 5e-06, "logits/chosen": -76895522.9090909, "logits/rejected": -31535502.769230768, "logps/chosen": -406.1941583806818, "logps/rejected": -609.6141826923077, "loss": 0.0081, "rewards/chosen": 9.512120333584873, "rewards/margins": 26.848871244417204, "rewards/rejected": -17.336750910832333, "step": 3332 }, { "epoch": 0.8339797322657325, "grad_norm": 6.625, "kl": 8.00233268737793, "learning_rate": 5e-06, "logits/chosen": -41232122.18181818, "logits/rejected": -28681048.615384616, "logps/chosen": -482.28413529829544, "logps/rejected": -523.8064903846154, "loss": 0.0118, "rewards/chosen": 10.490389043634588, "rewards/margins": 28.5664658446412, "rewards/rejected": -18.07607680100661, "step": 3333 }, { "epoch": 0.8342299512073064, "grad_norm": 8.0, "kl": 2.8511955738067627, "learning_rate": 5e-06, "logits/chosen": -40877555.2, "logits/rejected": -32739520.0, "logps/chosen": -513.68232421875, "logps/rejected": -589.7804129464286, "loss": 0.0132, "rewards/chosen": 10.946174621582031, "rewards/margins": 27.19280787876674, "rewards/rejected": -16.24663325718471, "step": 3334 }, { "epoch": 0.8344801701488803, "grad_norm": 2.046875, "kl": 3.5403175354003906, "learning_rate": 5e-06, "logits/chosen": -53730816.0, "logits/rejected": -46655581.09090909, "logps/chosen": -442.96048677884613, "logps/rejected": -580.0924183238636, "loss": 0.0038, "rewards/chosen": 10.939537635216347, "rewards/margins": 29.880799620301573, "rewards/rejected": -18.941261985085227, "step": 3335 }, { "epoch": 0.8347303890904542, "grad_norm": 22.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33175725.333333332, "logits/rejected": -50051552.0, "logps/chosen": -336.46811930338544, "logps/rejected": -826.3871256510416, "loss": 0.0654, "rewards/chosen": 8.239480336507162, "rewards/margins": 28.658510843912758, "rewards/rejected": -20.419030507405598, "step": 3336 }, { "epoch": 0.834980608032028, "grad_norm": 1.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14426797.0, "logits/rejected": -27603104.0, "logps/chosen": -346.4061279296875, "logps/rejected": -578.5408935546875, "loss": 0.0235, "rewards/chosen": 8.377764701843262, "rewards/margins": 24.47706890106201, "rewards/rejected": -16.09930419921875, "step": 3337 }, { "epoch": 0.8352308269736018, "grad_norm": 15.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50160246.85714286, "logits/rejected": -58618969.6, "logps/chosen": -464.0360630580357, "logps/rejected": -649.570458984375, "loss": 0.0548, "rewards/chosen": 9.315637860979352, "rewards/margins": 29.90846165248326, "rewards/rejected": -20.592823791503907, "step": 3338 }, { "epoch": 0.8354810459151758, "grad_norm": 36.75, "kl": 7.0385589599609375, "learning_rate": 5e-06, "logits/chosen": -57311556.571428575, "logits/rejected": 50120304.0, "logps/chosen": -441.13473074776783, "logps/rejected": -584.90263671875, "loss": 0.0488, "rewards/chosen": 10.004616873604911, "rewards/margins": 29.11538805280413, "rewards/rejected": -19.11077117919922, "step": 3339 }, { "epoch": 0.8357312648567496, "grad_norm": 17.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34005351.11111111, "logits/rejected": -45461350.4, "logps/chosen": -309.23627387152777, "logps/rejected": -551.9491536458333, "loss": 0.0439, "rewards/chosen": 7.5518747965494795, "rewards/margins": 22.887745157877603, "rewards/rejected": -15.335870361328125, "step": 3340 }, { "epoch": 0.8359814837983235, "grad_norm": 3.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25305896.727272727, "logits/rejected": -42966370.461538464, "logps/chosen": -373.67196377840907, "logps/rejected": -696.3227914663462, "loss": 0.0219, "rewards/chosen": 7.287936123934659, "rewards/margins": 31.948669860413027, "rewards/rejected": -24.660733736478367, "step": 3341 }, { "epoch": 0.8362317027398974, "grad_norm": 3.0625, "kl": 2.4891600608825684, "learning_rate": 5e-06, "logits/chosen": -64469094.4, "logits/rejected": -36810346.666666664, "logps/chosen": -505.6770833333333, "logps/rejected": -658.6105143229166, "loss": 0.0076, "rewards/chosen": 10.70615946451823, "rewards/margins": 29.716394721137153, "rewards/rejected": -19.010235256618923, "step": 3342 }, { "epoch": 0.8364819216814713, "grad_norm": 0.48046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45972971.63636363, "logits/rejected": -66599616.0, "logps/chosen": -531.6958895596591, "logps/rejected": -798.5167518028846, "loss": 0.0009, "rewards/chosen": 10.915121598677201, "rewards/margins": 36.45509493100893, "rewards/rejected": -25.53997333233173, "step": 3343 }, { "epoch": 0.8367321406230451, "grad_norm": 19.75, "kl": 0.16862361133098602, "learning_rate": 5e-06, "logits/chosen": -68221984.0, "logits/rejected": -33289794.666666668, "logps/chosen": -403.5641682942708, "logps/rejected": -794.8441569010416, "loss": 0.0522, "rewards/chosen": 10.30563227335612, "rewards/margins": 32.17124048868815, "rewards/rejected": -21.86560821533203, "step": 3344 }, { "epoch": 0.8369823595646191, "grad_norm": 2.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3602723.5555555555, "logits/rejected": -29795498.666666668, "logps/chosen": -257.42442491319446, "logps/rejected": -762.309765625, "loss": 0.0267, "rewards/chosen": 7.404542711046007, "rewards/margins": 32.21650763617622, "rewards/rejected": -24.81196492513021, "step": 3345 }, { "epoch": 0.8372325785061929, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14663041.0, "logits/rejected": -62652836.0, "logps/chosen": -424.8878173828125, "logps/rejected": -506.0917663574219, "loss": 0.0257, "rewards/chosen": 8.07442855834961, "rewards/margins": 26.42715835571289, "rewards/rejected": -18.35272979736328, "step": 3346 }, { "epoch": 0.8374827974477668, "grad_norm": 10.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -101051221.33333333, "logits/rejected": -22954131.2, "logps/chosen": -468.30810546875, "logps/rejected": -543.856640625, "loss": 0.0904, "rewards/chosen": 10.01444583468967, "rewards/margins": 26.01239505343967, "rewards/rejected": -15.99794921875, "step": 3347 }, { "epoch": 0.8377330163893407, "grad_norm": 6.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59324985.6, "logits/rejected": -64914322.28571428, "logps/chosen": -403.358642578125, "logps/rejected": -753.4827008928571, "loss": 0.0201, "rewards/chosen": 8.923326110839843, "rewards/margins": 34.99824894496373, "rewards/rejected": -26.074922834123885, "step": 3348 }, { "epoch": 0.8379832353309146, "grad_norm": 17.0, "kl": 3.4943671226501465, "learning_rate": 5e-06, "logits/chosen": -30604476.0, "logits/rejected": -36369264.0, "logps/chosen": -358.021240234375, "logps/rejected": -798.4456176757812, "loss": 0.0402, "rewards/chosen": 9.310311317443848, "rewards/margins": 33.51455783843994, "rewards/rejected": -24.204246520996094, "step": 3349 }, { "epoch": 0.8382334542724884, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29860728.615384616, "logits/rejected": -57623877.81818182, "logps/chosen": -369.6553485576923, "logps/rejected": -734.7033025568181, "loss": 0.0263, "rewards/chosen": 9.437604464017427, "rewards/margins": 34.07962772396061, "rewards/rejected": -24.642023259943183, "step": 3350 }, { "epoch": 0.8384836732140623, "grad_norm": 7.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56266896.0, "logits/rejected": -22380741.333333332, "logps/chosen": -386.4521484375, "logps/rejected": -566.9683024088541, "loss": 0.0261, "rewards/chosen": 7.103212356567383, "rewards/margins": 22.67793083190918, "rewards/rejected": -15.574718475341797, "step": 3351 }, { "epoch": 0.8387338921556362, "grad_norm": 40.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31348704.0, "logits/rejected": -30208700.8, "logps/chosen": -413.88490513392856, "logps/rejected": -876.33857421875, "loss": 0.0184, "rewards/chosen": 9.861567905970983, "rewards/margins": 25.928238133021765, "rewards/rejected": -16.066670227050782, "step": 3352 }, { "epoch": 0.83898411109721, "grad_norm": 5.84375, "kl": 10.686391830444336, "learning_rate": 5e-06, "logits/chosen": -52970804.705882356, "logits/rejected": -31700002.285714287, "logps/chosen": -482.0576746323529, "logps/rejected": -525.08447265625, "loss": 0.0167, "rewards/chosen": 8.789918787339154, "rewards/margins": 22.76984694024094, "rewards/rejected": -13.979928152901786, "step": 3353 }, { "epoch": 0.8392343300387839, "grad_norm": 22.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32388425.6, "logits/rejected": -43144608.0, "logps/chosen": -539.532958984375, "logps/rejected": -510.23880440848217, "loss": 0.0445, "rewards/chosen": 9.477702331542968, "rewards/margins": 22.517141941615513, "rewards/rejected": -13.039439610072545, "step": 3354 }, { "epoch": 0.8394845489803578, "grad_norm": 3.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54359680.0, "logits/rejected": 99626029.71428572, "logps/chosen": -478.42607421875, "logps/rejected": -521.8127092633929, "loss": 0.0061, "rewards/chosen": 8.03902816772461, "rewards/margins": 25.62876946585519, "rewards/rejected": -17.58974129813058, "step": 3355 }, { "epoch": 0.8397347679219317, "grad_norm": 11.625, "kl": 6.405513763427734, "learning_rate": 5e-06, "logits/chosen": -56226693.81818182, "logits/rejected": -52516263.384615384, "logps/chosen": -366.49003462357956, "logps/rejected": -756.3671875, "loss": 0.1104, "rewards/chosen": 8.124447215687145, "rewards/margins": 32.67180345441912, "rewards/rejected": -24.54735623873197, "step": 3356 }, { "epoch": 0.8399849868635055, "grad_norm": 12.875, "kl": 1.3067309856414795, "learning_rate": 5e-06, "logits/chosen": -54273348.0, "logits/rejected": -39804508.0, "logps/chosen": -447.93408203125, "logps/rejected": -651.233642578125, "loss": 0.028, "rewards/chosen": 7.712504863739014, "rewards/margins": 29.240417003631592, "rewards/rejected": -21.527912139892578, "step": 3357 }, { "epoch": 0.8402352058050795, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63352152.615384616, "logits/rejected": -54577326.54545455, "logps/chosen": -305.91128305288464, "logps/rejected": -590.3220880681819, "loss": 0.0141, "rewards/chosen": 7.0193657508263225, "rewards/margins": 26.883401617303598, "rewards/rejected": -19.864035866477273, "step": 3358 }, { "epoch": 0.8404854247466533, "grad_norm": 7.375, "kl": 23.675823211669922, "learning_rate": 5e-06, "logits/chosen": -52558448.0, "logits/rejected": -57646680.0, "logps/chosen": -410.6289978027344, "logps/rejected": -712.3125, "loss": 0.0149, "rewards/chosen": 10.158866882324219, "rewards/margins": 31.347515106201172, "rewards/rejected": -21.188648223876953, "step": 3359 }, { "epoch": 0.8407356436882272, "grad_norm": 1.03125, "kl": 0.8565292358398438, "learning_rate": 5e-06, "logits/chosen": -62042185.84615385, "logits/rejected": -52227031.27272727, "logps/chosen": -445.56651893028845, "logps/rejected": -682.1604225852273, "loss": 0.0017, "rewards/chosen": 9.523953951322115, "rewards/margins": 32.92604246339598, "rewards/rejected": -23.402088512073863, "step": 3360 }, { "epoch": 0.8409858626298011, "grad_norm": 0.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32252270.933333334, "logits/rejected": -33166065.777777776, "logps/chosen": -471.1957682291667, "logps/rejected": -712.1775173611111, "loss": 0.0001, "rewards/chosen": 11.72981465657552, "rewards/margins": 33.282666015625, "rewards/rejected": -21.55285135904948, "step": 3361 }, { "epoch": 0.841236081571375, "grad_norm": 2.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53902926.76923077, "logits/rejected": -25050996.363636363, "logps/chosen": -419.5774113581731, "logps/rejected": -580.96484375, "loss": 0.0124, "rewards/chosen": 8.75354473407452, "rewards/margins": 27.086298695811024, "rewards/rejected": -18.332753961736504, "step": 3362 }, { "epoch": 0.8414863005129488, "grad_norm": 6.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34067204.571428575, "logits/rejected": -43753324.8, "logps/chosen": -305.95242745535717, "logps/rejected": -628.813671875, "loss": 0.0857, "rewards/chosen": 7.177069527762277, "rewards/margins": 28.553522164481027, "rewards/rejected": -21.37645263671875, "step": 3363 }, { "epoch": 0.8417365194545227, "grad_norm": 2.171875, "kl": 3.4250407218933105, "learning_rate": 5e-06, "logits/chosen": -37947548.8, "logits/rejected": -47395757.71428572, "logps/chosen": -432.73818359375, "logps/rejected": -605.6768275669643, "loss": 0.0626, "rewards/chosen": 8.802084350585938, "rewards/margins": 25.125789751325335, "rewards/rejected": -16.323705400739396, "step": 3364 }, { "epoch": 0.8419867383960966, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33304403.2, "logits/rejected": -42468320.0, "logps/chosen": -393.924658203125, "logps/rejected": -630.7925502232143, "loss": 0.017, "rewards/chosen": 9.261844635009766, "rewards/margins": 27.929920196533203, "rewards/rejected": -18.668075561523438, "step": 3365 }, { "epoch": 0.8422369573376705, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33040642.666666668, "logits/rejected": -35234941.333333336, "logps/chosen": -391.0833333333333, "logps/rejected": -708.1373697916666, "loss": 0.0306, "rewards/chosen": 9.782527923583984, "rewards/margins": 27.579129536946613, "rewards/rejected": -17.79660161336263, "step": 3366 }, { "epoch": 0.8424871762792443, "grad_norm": 1.3203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20161572.0, "logits/rejected": -30887076.0, "logps/chosen": -376.81903076171875, "logps/rejected": -645.9494018554688, "loss": 0.0132, "rewards/chosen": 9.124106407165527, "rewards/margins": 29.376858711242676, "rewards/rejected": -20.25275230407715, "step": 3367 }, { "epoch": 0.8427373952208183, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26727586.46153846, "logits/rejected": -43080910.54545455, "logps/chosen": -378.07752403846155, "logps/rejected": -636.8332297585227, "loss": 0.0262, "rewards/chosen": 8.189727783203125, "rewards/margins": 30.29148448597301, "rewards/rejected": -22.101756702769887, "step": 3368 }, { "epoch": 0.8429876141623921, "grad_norm": 1.25, "kl": 5.531114101409912, "learning_rate": 5e-06, "logits/chosen": -55001181.09090909, "logits/rejected": -28199010.46153846, "logps/chosen": -337.08194247159093, "logps/rejected": -561.1319861778846, "loss": 0.0319, "rewards/chosen": 8.170166015625, "rewards/margins": 27.628819392277645, "rewards/rejected": -19.458653376652645, "step": 3369 }, { "epoch": 0.8432378331039659, "grad_norm": 10.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41164069.81818182, "logits/rejected": -49529654.15384615, "logps/chosen": -345.78830788352275, "logps/rejected": -517.7381310096154, "loss": 0.0311, "rewards/chosen": 7.5902488014914775, "rewards/margins": 25.039951137729457, "rewards/rejected": -17.44970233623798, "step": 3370 }, { "epoch": 0.8434880520455399, "grad_norm": 5.9375, "kl": 1.780255675315857, "learning_rate": 5e-06, "logits/chosen": -45028153.6, "logits/rejected": -46323634.28571428, "logps/chosen": -371.44658203125, "logps/rejected": -651.23291015625, "loss": 0.0148, "rewards/chosen": 8.251386260986328, "rewards/margins": 25.5736453465053, "rewards/rejected": -17.322259085518972, "step": 3371 }, { "epoch": 0.8437382709871137, "grad_norm": 7.75, "kl": 13.699926376342773, "learning_rate": 5e-06, "logits/chosen": -49671157.333333336, "logits/rejected": -51886485.333333336, "logps/chosen": -279.90625, "logps/rejected": -688.4078776041666, "loss": 0.1188, "rewards/chosen": 6.317263921101888, "rewards/margins": 24.756689071655273, "rewards/rejected": -18.439425150553387, "step": 3372 }, { "epoch": 0.8439884899286876, "grad_norm": 14.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31615602.666666668, "logits/rejected": -44985776.0, "logps/chosen": -339.19956461588544, "logps/rejected": -548.2075602213541, "loss": 0.0997, "rewards/chosen": 6.550669352213542, "rewards/margins": 21.682912190755207, "rewards/rejected": -15.132242838541666, "step": 3373 }, { "epoch": 0.8442387088702615, "grad_norm": 23.625, "kl": 6.391028881072998, "learning_rate": 5e-06, "logits/chosen": -32915116.307692308, "logits/rejected": -35180805.81818182, "logps/chosen": -429.98343599759613, "logps/rejected": -518.6487926136364, "loss": 0.0525, "rewards/chosen": 9.470690800593449, "rewards/margins": 23.83433826153095, "rewards/rejected": -14.3636474609375, "step": 3374 }, { "epoch": 0.8444889278118354, "grad_norm": 7.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21346904.0, "logits/rejected": -49103285.333333336, "logps/chosen": -340.59547932942706, "logps/rejected": -570.878662109375, "loss": 0.0209, "rewards/chosen": 8.27093251546224, "rewards/margins": 20.73095703125, "rewards/rejected": -12.46002451578776, "step": 3375 }, { "epoch": 0.8447391467534092, "grad_norm": 8.0, "kl": 8.937817573547363, "learning_rate": 5e-06, "logits/chosen": -41573956.0, "logits/rejected": -57145512.0, "logps/chosen": -313.9154052734375, "logps/rejected": -934.1132202148438, "loss": 0.0285, "rewards/chosen": 8.866175651550293, "rewards/margins": 34.44503688812256, "rewards/rejected": -25.578861236572266, "step": 3376 }, { "epoch": 0.8449893656949831, "grad_norm": 2.046875, "kl": 5.882508277893066, "learning_rate": 5e-06, "logits/chosen": -41217820.44444445, "logits/rejected": -34376661.333333336, "logps/chosen": -386.70448133680554, "logps/rejected": -639.5324869791667, "loss": 0.0047, "rewards/chosen": 10.393179999457466, "rewards/margins": 27.323064846462675, "rewards/rejected": -16.92988484700521, "step": 3377 }, { "epoch": 0.845239584636557, "grad_norm": 5.6875, "kl": 1.9702622890472412, "learning_rate": 5e-06, "logits/chosen": -34272320.0, "logits/rejected": -75658624.0, "logps/chosen": -372.7003728693182, "logps/rejected": -683.201171875, "loss": 0.0371, "rewards/chosen": 8.34747314453125, "rewards/margins": 25.32803696852464, "rewards/rejected": -16.98056382399339, "step": 3378 }, { "epoch": 0.8454898035781309, "grad_norm": 6.78125, "kl": 3.3111572265625, "learning_rate": 5e-06, "logits/chosen": -33296197.818181816, "logits/rejected": -50877838.76923077, "logps/chosen": -414.98073508522725, "logps/rejected": -443.0466496394231, "loss": 0.0119, "rewards/chosen": 8.852550159801137, "rewards/margins": 20.650368857217003, "rewards/rejected": -11.797818697415865, "step": 3379 }, { "epoch": 0.8457400225197047, "grad_norm": 4.125, "kl": 2.5250658988952637, "learning_rate": 5e-06, "logits/chosen": -15815747.2, "logits/rejected": -33770395.428571425, "logps/chosen": -271.12080078125, "logps/rejected": -629.4550083705357, "loss": 0.0613, "rewards/chosen": 6.656166076660156, "rewards/margins": 23.83917781284877, "rewards/rejected": -17.183011736188615, "step": 3380 }, { "epoch": 0.8459902414612787, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12792324.57142857, "logits/rejected": -47730063.058823526, "logps/chosen": -353.47412109375, "logps/rejected": -730.3363396139706, "loss": 0.0043, "rewards/chosen": 7.810611724853516, "rewards/margins": 23.37103832469267, "rewards/rejected": -15.560426599839154, "step": 3381 }, { "epoch": 0.8462404604028525, "grad_norm": 5.09375, "kl": 0.27969712018966675, "learning_rate": 5e-06, "logits/chosen": -20665609.846153848, "logits/rejected": -19446404.363636363, "logps/chosen": -351.96446814903845, "logps/rejected": -438.06107954545456, "loss": 0.0495, "rewards/chosen": 9.330155005821815, "rewards/margins": 20.972813586255054, "rewards/rejected": -11.642658580433238, "step": 3382 }, { "epoch": 0.8464906793444263, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43759411.2, "logits/rejected": -63589394.28571428, "logps/chosen": -334.468701171875, "logps/rejected": -642.8381696428571, "loss": 0.0424, "rewards/chosen": 5.877351760864258, "rewards/margins": 20.14314787728446, "rewards/rejected": -14.265796116420201, "step": 3383 }, { "epoch": 0.8467408982860003, "grad_norm": 1.4140625, "kl": 1.0901451110839844, "learning_rate": 5e-06, "logits/chosen": -13172873.6, "logits/rejected": -29516626.285714287, "logps/chosen": -436.950927734375, "logps/rejected": -640.8581194196429, "loss": 0.0203, "rewards/chosen": 9.508102416992188, "rewards/margins": 26.058019365583146, "rewards/rejected": -16.54991694859096, "step": 3384 }, { "epoch": 0.8469911172275741, "grad_norm": 3.09375, "kl": 4.3086981773376465, "learning_rate": 5e-06, "logits/chosen": -17548857.14285714, "logits/rejected": -56037600.0, "logps/chosen": -300.152587890625, "logps/rejected": -795.25048828125, "loss": 0.0629, "rewards/chosen": 6.721431732177734, "rewards/margins": 28.70584945678711, "rewards/rejected": -21.984417724609376, "step": 3385 }, { "epoch": 0.847241336169148, "grad_norm": 10.6875, "kl": 3.5021045207977295, "learning_rate": 5e-06, "logits/chosen": -36030464.0, "logits/rejected": -28365174.153846152, "logps/chosen": -275.283447265625, "logps/rejected": -428.3333082932692, "loss": 0.0399, "rewards/chosen": 5.708704861727628, "rewards/margins": 17.09005251797763, "rewards/rejected": -11.38134765625, "step": 3386 }, { "epoch": 0.8474915551107218, "grad_norm": 5.1875, "kl": 6.609493255615234, "learning_rate": 5e-06, "logits/chosen": -33888103.384615384, "logits/rejected": -38353646.54545455, "logps/chosen": -352.31971153846155, "logps/rejected": -579.8142755681819, "loss": 0.0757, "rewards/chosen": 8.765191298264723, "rewards/margins": 24.54052643675904, "rewards/rejected": -15.775335138494318, "step": 3387 }, { "epoch": 0.8477417740522958, "grad_norm": 3.484375, "kl": 0.01494344137609005, "learning_rate": 5e-06, "logits/chosen": -30109107.692307692, "logits/rejected": -76994112.0, "logps/chosen": -401.13326322115387, "logps/rejected": -718.1029829545455, "loss": 0.0119, "rewards/chosen": 10.662051861102764, "rewards/margins": 32.06918687086839, "rewards/rejected": -21.407135009765625, "step": 3388 }, { "epoch": 0.8479919929938696, "grad_norm": 1.3203125, "kl": 2.4547300338745117, "learning_rate": 5e-06, "logits/chosen": -27921094.85714286, "logits/rejected": -31077004.8, "logps/chosen": -344.03037806919644, "logps/rejected": -806.986376953125, "loss": 0.0346, "rewards/chosen": 8.57152611868722, "rewards/margins": 29.660779353550502, "rewards/rejected": -21.089253234863282, "step": 3389 }, { "epoch": 0.8482422119354435, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31753132.307692308, "logits/rejected": -40929751.27272727, "logps/chosen": -286.71518179086536, "logps/rejected": -587.9979580965909, "loss": 0.0266, "rewards/chosen": 8.309947674091045, "rewards/margins": 24.60431607119687, "rewards/rejected": -16.294368397105824, "step": 3390 }, { "epoch": 0.8484924308770174, "grad_norm": 14.875, "kl": 6.918422698974609, "learning_rate": 5e-06, "logits/chosen": 10752606.666666666, "logits/rejected": -23392357.333333332, "logps/chosen": -355.0339762369792, "logps/rejected": -480.7571207682292, "loss": 0.0768, "rewards/chosen": 8.461181640625, "rewards/margins": 21.773256937662758, "rewards/rejected": -13.31207529703776, "step": 3391 }, { "epoch": 0.8487426498185913, "grad_norm": 11.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41282416.0, "logits/rejected": -68693257.14285715, "logps/chosen": -282.015087890625, "logps/rejected": -579.0231584821429, "loss": 0.0365, "rewards/chosen": 7.52685775756836, "rewards/margins": 23.913140542166573, "rewards/rejected": -16.386282784598215, "step": 3392 }, { "epoch": 0.8489928687601651, "grad_norm": 2.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51253725.86666667, "logits/rejected": -31536117.333333332, "logps/chosen": -369.29479166666664, "logps/rejected": -550.6883680555555, "loss": 0.0211, "rewards/chosen": 8.388052368164063, "rewards/margins": 28.02307908799913, "rewards/rejected": -19.635026719835068, "step": 3393 }, { "epoch": 0.8492430877017391, "grad_norm": 2.796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48584016.0, "logits/rejected": -41139082.666666664, "logps/chosen": -389.5467936197917, "logps/rejected": -644.7683512369791, "loss": 0.0161, "rewards/chosen": 9.44810676574707, "rewards/margins": 29.582793553670246, "rewards/rejected": -20.134686787923176, "step": 3394 }, { "epoch": 0.8494933066433129, "grad_norm": 17.75, "kl": 12.753987312316895, "learning_rate": 5e-06, "logits/chosen": -32396240.0, "logits/rejected": -61361912.0, "logps/chosen": -282.0979309082031, "logps/rejected": -349.08209228515625, "loss": 0.177, "rewards/chosen": 7.218841075897217, "rewards/margins": 19.01440668106079, "rewards/rejected": -11.795565605163574, "step": 3395 }, { "epoch": 0.8497435255848867, "grad_norm": 13.0625, "kl": 6.300783157348633, "learning_rate": 5e-06, "logits/chosen": -33034992.0, "logits/rejected": -63137706.666666664, "logps/chosen": -344.4127604166667, "logps/rejected": -694.491943359375, "loss": 0.0579, "rewards/chosen": 6.93502934773763, "rewards/margins": 27.62053553263346, "rewards/rejected": -20.685506184895832, "step": 3396 }, { "epoch": 0.8499937445264607, "grad_norm": 9.125, "kl": 3.336113691329956, "learning_rate": 5e-06, "logits/chosen": -59140253.538461536, "logits/rejected": -51570513.45454545, "logps/chosen": -490.04244290865387, "logps/rejected": -738.6499467329545, "loss": 0.0529, "rewards/chosen": 10.333992591271034, "rewards/margins": 34.03190463406223, "rewards/rejected": -23.69791204279119, "step": 3397 }, { "epoch": 0.8502439634680345, "grad_norm": 5.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64041898.666666664, "logits/rejected": -48903648.0, "logps/chosen": -340.8223063151042, "logps/rejected": -436.5948893229167, "loss": 0.0511, "rewards/chosen": 8.212958653767904, "rewards/margins": 22.943217595418297, "rewards/rejected": -14.73025894165039, "step": 3398 }, { "epoch": 0.8504941824096084, "grad_norm": 2.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63139532.8, "logits/rejected": -38169323.78947368, "logps/chosen": -431.964404296875, "logps/rejected": -585.9048108552631, "loss": 0.002, "rewards/chosen": 11.419216918945313, "rewards/margins": 27.988580161646794, "rewards/rejected": -16.56936324270148, "step": 3399 }, { "epoch": 0.8507444013511822, "grad_norm": 4.625, "kl": 2.2456088066101074, "learning_rate": 5e-06, "logits/chosen": -80927916.8, "logits/rejected": -52125974.85714286, "logps/chosen": -325.201123046875, "logps/rejected": -772.0938895089286, "loss": 0.0356, "rewards/chosen": 8.067665100097656, "rewards/margins": 31.990841456821986, "rewards/rejected": -23.92317635672433, "step": 3400 }, { "epoch": 0.8509946202927562, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69306117.81818181, "logits/rejected": -77617427.6923077, "logps/chosen": -365.25270774147725, "logps/rejected": -747.5733924278846, "loss": 0.0195, "rewards/chosen": 9.313447432084518, "rewards/margins": 30.280741978358556, "rewards/rejected": -20.96729454627404, "step": 3401 }, { "epoch": 0.85124483923433, "grad_norm": 13.5, "kl": 1.60513436794281, "learning_rate": 5e-06, "logits/chosen": -25966421.333333332, "logits/rejected": -63326672.0, "logps/chosen": -387.8897298177083, "logps/rejected": -760.96142578125, "loss": 0.0304, "rewards/chosen": 9.741454442342123, "rewards/margins": 30.113690058390297, "rewards/rejected": -20.372235616048176, "step": 3402 }, { "epoch": 0.8514950581759039, "grad_norm": 6.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23154827.2, "logits/rejected": -68172278.85714285, "logps/chosen": -280.952978515625, "logps/rejected": -561.969970703125, "loss": 0.0331, "rewards/chosen": 6.843350219726562, "rewards/margins": 26.077489798409598, "rewards/rejected": -19.234139578683035, "step": 3403 }, { "epoch": 0.8517452771174778, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38404334.54545455, "logits/rejected": -26405550.769230768, "logps/chosen": -353.20339133522725, "logps/rejected": -450.88326322115387, "loss": 0.021, "rewards/chosen": 9.237851229580967, "rewards/margins": 22.709310251516065, "rewards/rejected": -13.471459021935097, "step": 3404 }, { "epoch": 0.8519954960590517, "grad_norm": 18.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41984470.4, "logits/rejected": -51389417.14285714, "logps/chosen": -359.0143310546875, "logps/rejected": -633.2073800223214, "loss": 0.0097, "rewards/chosen": 9.553411102294922, "rewards/margins": 27.156695665631972, "rewards/rejected": -17.603284563337052, "step": 3405 }, { "epoch": 0.8522457150006255, "grad_norm": 3.46875, "kl": 0.5354986190795898, "learning_rate": 5e-06, "logits/chosen": -34911616.0, "logits/rejected": -46378156.8, "logps/chosen": -362.75516183035717, "logps/rejected": -739.907421875, "loss": 0.0337, "rewards/chosen": 9.54271480015346, "rewards/margins": 30.447087969098774, "rewards/rejected": -20.904373168945312, "step": 3406 }, { "epoch": 0.8524959339421995, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56427827.2, "logits/rejected": -52261645.71428572, "logps/chosen": -345.6505126953125, "logps/rejected": -699.6164899553571, "loss": 0.0114, "rewards/chosen": 9.564096069335937, "rewards/margins": 31.107489885602675, "rewards/rejected": -21.54339381626674, "step": 3407 }, { "epoch": 0.8527461528837733, "grad_norm": 11.375, "kl": 10.46023178100586, "learning_rate": 5e-06, "logits/chosen": -53238020.266666666, "logits/rejected": -39660728.88888889, "logps/chosen": -449.196875, "logps/rejected": -782.0788845486111, "loss": 0.023, "rewards/chosen": 9.95607401529948, "rewards/margins": 35.84249437120226, "rewards/rejected": -25.88642035590278, "step": 3408 }, { "epoch": 0.8529963718253472, "grad_norm": 1.5, "kl": 0.7935384511947632, "learning_rate": 5e-06, "logits/chosen": -47553068.0, "logits/rejected": -28442418.0, "logps/chosen": -393.3840637207031, "logps/rejected": -804.2012329101562, "loss": 0.0039, "rewards/chosen": 9.483860969543457, "rewards/margins": 32.58375644683838, "rewards/rejected": -23.099895477294922, "step": 3409 }, { "epoch": 0.8532465907669211, "grad_norm": 1.9140625, "kl": 1.9362802505493164, "learning_rate": 5e-06, "logits/chosen": -60796621.71428572, "logits/rejected": -35164256.0, "logps/chosen": -384.19485909598217, "logps/rejected": -679.923779296875, "loss": 0.0292, "rewards/chosen": 7.987306867327009, "rewards/margins": 24.53445042201451, "rewards/rejected": -16.5471435546875, "step": 3410 }, { "epoch": 0.853496809708495, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40565142.15384615, "logits/rejected": -60314461.09090909, "logps/chosen": -429.44722806490387, "logps/rejected": -852.6524325284091, "loss": 0.0157, "rewards/chosen": 9.53851083608774, "rewards/margins": 35.177055092124675, "rewards/rejected": -25.638544256036933, "step": 3411 }, { "epoch": 0.8537470286500688, "grad_norm": 1.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41066076.44444445, "logits/rejected": -18401723.733333334, "logps/chosen": -360.92323133680554, "logps/rejected": -516.06015625, "loss": 0.0418, "rewards/chosen": 7.561635335286458, "rewards/margins": 24.195581054687498, "rewards/rejected": -16.63394571940104, "step": 3412 }, { "epoch": 0.8539972475916426, "grad_norm": 5.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38734702.222222224, "logits/rejected": -46688665.6, "logps/chosen": -371.51898871527777, "logps/rejected": -629.53125, "loss": 0.011, "rewards/chosen": 9.22223154703776, "rewards/margins": 30.369574483235674, "rewards/rejected": -21.147342936197916, "step": 3413 }, { "epoch": 0.8542474665332166, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46984132.92307692, "logits/rejected": -31063697.454545453, "logps/chosen": -345.4401292067308, "logps/rejected": -598.7138671875, "loss": 0.0561, "rewards/chosen": 8.10449453500601, "rewards/margins": 23.045280723305016, "rewards/rejected": -14.940786188299006, "step": 3414 }, { "epoch": 0.8544976854747904, "grad_norm": 3.015625, "kl": 5.003227233886719, "learning_rate": 5e-06, "logits/chosen": -38164480.0, "logits/rejected": -52871031.46666667, "logps/chosen": -609.7924262152778, "logps/rejected": -728.512109375, "loss": 0.0073, "rewards/chosen": 12.935448540581596, "rewards/margins": 36.59801974826389, "rewards/rejected": -23.66257120768229, "step": 3415 }, { "epoch": 0.8547479044163643, "grad_norm": 6.8125, "kl": 0.5025972127914429, "learning_rate": 5e-06, "logits/chosen": -32358964.363636363, "logits/rejected": -45221154.461538464, "logps/chosen": -381.77920809659093, "logps/rejected": -751.0123197115385, "loss": 0.0565, "rewards/chosen": 6.736111727627841, "rewards/margins": 30.62912179373361, "rewards/rejected": -23.89301006610577, "step": 3416 }, { "epoch": 0.8549981233579382, "grad_norm": 2.6875, "kl": 11.569003105163574, "learning_rate": 5e-06, "logits/chosen": -56561984.0, "logits/rejected": -30339544.0, "logps/chosen": -430.8636881510417, "logps/rejected": -846.4625651041666, "loss": 0.017, "rewards/chosen": 9.835779190063477, "rewards/margins": 37.82223192850749, "rewards/rejected": -27.98645273844401, "step": 3417 }, { "epoch": 0.8552483422995121, "grad_norm": 1.4140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18342086.153846152, "logits/rejected": -46345451.63636363, "logps/chosen": -289.98140775240387, "logps/rejected": -770.1831498579545, "loss": 0.0118, "rewards/chosen": 7.640285198505108, "rewards/margins": 27.565271604311217, "rewards/rejected": -19.92498640580611, "step": 3418 }, { "epoch": 0.8554985612410859, "grad_norm": 0.08251953125, "kl": 3.4349327087402344, "learning_rate": 5e-06, "logits/chosen": -24507884.8, "logits/rejected": -63311753.14285714, "logps/chosen": -380.1271240234375, "logps/rejected": -602.6160016741071, "loss": 0.0002, "rewards/chosen": 10.58471221923828, "rewards/margins": 28.48644343784877, "rewards/rejected": -17.90173121861049, "step": 3419 }, { "epoch": 0.8557487801826599, "grad_norm": 7.6875, "kl": 12.221990585327148, "learning_rate": 5e-06, "logits/chosen": -58555200.0, "logits/rejected": -40902781.333333336, "logps/chosen": -410.7368570963542, "logps/rejected": -572.3693033854166, "loss": 0.098, "rewards/chosen": 9.691134770711264, "rewards/margins": 27.45423698425293, "rewards/rejected": -17.763102213541668, "step": 3420 }, { "epoch": 0.8559989991242337, "grad_norm": 1.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32140489.14285714, "logits/rejected": -58306841.6, "logps/chosen": -332.3261021205357, "logps/rejected": -577.804833984375, "loss": 0.0262, "rewards/chosen": 7.202910831996372, "rewards/margins": 24.342908695765903, "rewards/rejected": -17.13999786376953, "step": 3421 }, { "epoch": 0.8562492180658076, "grad_norm": 5.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20116464.0, "logits/rejected": -55928763.733333334, "logps/chosen": -344.44981553819446, "logps/rejected": -658.9731770833333, "loss": 0.0396, "rewards/chosen": 8.593739827473959, "rewards/margins": 24.407590738932292, "rewards/rejected": -15.813850911458333, "step": 3422 }, { "epoch": 0.8564994370073814, "grad_norm": 8.6875, "kl": 4.1583757400512695, "learning_rate": 5e-06, "logits/chosen": -28254267.42857143, "logits/rejected": -32502748.8, "logps/chosen": -273.6908656529018, "logps/rejected": -513.40341796875, "loss": 0.0496, "rewards/chosen": 7.446807861328125, "rewards/margins": 23.383950805664064, "rewards/rejected": -15.937142944335937, "step": 3423 }, { "epoch": 0.8567496559489554, "grad_norm": 2.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29456314.181818184, "logits/rejected": -3746817.230769231, "logps/chosen": -454.02543501420456, "logps/rejected": -617.2699068509615, "loss": 0.0036, "rewards/chosen": 11.79313798384233, "rewards/margins": 30.350104912177663, "rewards/rejected": -18.556966928335335, "step": 3424 }, { "epoch": 0.8569998748905292, "grad_norm": 7.34375, "kl": 24.83080291748047, "learning_rate": 5e-06, "logits/chosen": -48454746.35294118, "logits/rejected": -46828608.0, "logps/chosen": -475.0335477941176, "logps/rejected": -519.8251953125, "loss": 0.0881, "rewards/chosen": 10.413762709673714, "rewards/margins": 23.49734054693655, "rewards/rejected": -13.083577837262835, "step": 3425 }, { "epoch": 0.857250093832103, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19395606.4, "logits/rejected": -84248384.0, "logps/chosen": -199.56685791015624, "logps/rejected": -775.6018415178571, "loss": 0.0922, "rewards/chosen": 4.519903564453125, "rewards/margins": 28.649723161969867, "rewards/rejected": -24.12981959751674, "step": 3426 }, { "epoch": 0.857500312773677, "grad_norm": 7.625, "kl": 4.2523722648620605, "learning_rate": 5e-06, "logits/chosen": -58089545.14285714, "logits/rejected": -26250043.2, "logps/chosen": -298.81734793526783, "logps/rejected": -717.31923828125, "loss": 0.0633, "rewards/chosen": 7.013817923409598, "rewards/margins": 25.99911455426897, "rewards/rejected": -18.985296630859374, "step": 3427 }, { "epoch": 0.8577505317152508, "grad_norm": 13.0625, "kl": 9.870464324951172, "learning_rate": 5e-06, "logits/chosen": -9495052.307692308, "logits/rejected": -60285585.45454545, "logps/chosen": -353.49083533653845, "logps/rejected": -628.0487393465909, "loss": 0.079, "rewards/chosen": 8.568535437950722, "rewards/margins": 24.379702374651714, "rewards/rejected": -15.811166936700994, "step": 3428 }, { "epoch": 0.8580007506568247, "grad_norm": 20.5, "kl": 0.5861492156982422, "learning_rate": 5e-06, "logits/chosen": -31791704.0, "logits/rejected": -34544837.333333336, "logps/chosen": -378.1505126953125, "logps/rejected": -515.7323404947916, "loss": 0.0199, "rewards/chosen": 9.098532358805338, "rewards/margins": 23.284446716308594, "rewards/rejected": -14.185914357503256, "step": 3429 }, { "epoch": 0.8582509695983986, "grad_norm": 20.25, "kl": 2.6534667015075684, "learning_rate": 5e-06, "logits/chosen": -73893261.71428572, "logits/rejected": -35865728.0, "logps/chosen": -473.5694056919643, "logps/rejected": -513.09697265625, "loss": 0.0314, "rewards/chosen": 11.509350367954799, "rewards/margins": 22.890966578892296, "rewards/rejected": -11.3816162109375, "step": 3430 }, { "epoch": 0.8585011885399725, "grad_norm": 9.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16895837.53846154, "logits/rejected": -65064750.54545455, "logps/chosen": -263.77355018028845, "logps/rejected": -745.2493785511364, "loss": 0.071, "rewards/chosen": 7.381386976975661, "rewards/margins": 24.90931888393589, "rewards/rejected": -17.527931906960227, "step": 3431 }, { "epoch": 0.8587514074815463, "grad_norm": 3.75, "kl": 3.6137466430664062, "learning_rate": 5e-06, "logits/chosen": -36982184.0, "logits/rejected": -20813888.0, "logps/chosen": -402.0264587402344, "logps/rejected": -739.8248291015625, "loss": 0.0098, "rewards/chosen": 9.474686622619629, "rewards/margins": 32.261887550354004, "rewards/rejected": -22.787200927734375, "step": 3432 }, { "epoch": 0.8590016264231203, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46333397.333333336, "logits/rejected": -42859933.86666667, "logps/chosen": -484.78461371527777, "logps/rejected": -531.9123046875, "loss": 0.0202, "rewards/chosen": 12.38681369357639, "rewards/margins": 30.41376478407118, "rewards/rejected": -18.02695109049479, "step": 3433 }, { "epoch": 0.8592518453646941, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27609774.545454547, "logits/rejected": -47873969.23076923, "logps/chosen": -288.84237393465907, "logps/rejected": -832.9306640625, "loss": 0.0253, "rewards/chosen": 7.297658053311435, "rewards/margins": 29.15584052025855, "rewards/rejected": -21.858182466947117, "step": 3434 }, { "epoch": 0.859502064306268, "grad_norm": 20.75, "kl": 9.483383178710938, "learning_rate": 5e-06, "logits/chosen": -41271122.666666664, "logits/rejected": -48698330.666666664, "logps/chosen": -409.0347493489583, "logps/rejected": -744.7373046875, "loss": 0.1465, "rewards/chosen": 9.222151438395182, "rewards/margins": 30.983539581298828, "rewards/rejected": -21.761388142903645, "step": 3435 }, { "epoch": 0.8597522832478418, "grad_norm": 6.28125, "kl": 6.486830711364746, "learning_rate": 5e-06, "logits/chosen": -26766176.0, "logits/rejected": -20768977.6, "logps/chosen": -380.53170340401783, "logps/rejected": -700.786328125, "loss": 0.024, "rewards/chosen": 10.49974605015346, "rewards/margins": 26.462389482770647, "rewards/rejected": -15.962643432617188, "step": 3436 }, { "epoch": 0.8600025021894158, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46137204.36363637, "logits/rejected": -76598921.84615384, "logps/chosen": -409.7512872869318, "logps/rejected": -795.2806490384615, "loss": 0.0635, "rewards/chosen": 10.477496060458096, "rewards/margins": 31.27915293019968, "rewards/rejected": -20.801656869741585, "step": 3437 }, { "epoch": 0.8602527211309896, "grad_norm": 7.03125, "kl": 11.06386947631836, "learning_rate": 5e-06, "logits/chosen": -41409408.0, "logits/rejected": -27813930.666666668, "logps/chosen": -292.15944010416666, "logps/rejected": -589.1636284722222, "loss": 0.0667, "rewards/chosen": 7.225406392415365, "rewards/margins": 18.824005296495226, "rewards/rejected": -11.59859890407986, "step": 3438 }, { "epoch": 0.8605029400725634, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58881270.85714286, "logits/rejected": -40243420.8, "logps/chosen": -376.37852260044644, "logps/rejected": -586.9955078125, "loss": 0.0211, "rewards/chosen": 8.889854431152344, "rewards/margins": 25.967770385742188, "rewards/rejected": -17.077915954589844, "step": 3439 }, { "epoch": 0.8607531590141374, "grad_norm": 6.8125, "kl": 0.8614501953125, "learning_rate": 5e-06, "logits/chosen": -54089728.0, "logits/rejected": -37478691.2, "logps/chosen": -388.50830078125, "logps/rejected": -545.693994140625, "loss": 0.0311, "rewards/chosen": 7.5315737043108255, "rewards/margins": 24.173086765834263, "rewards/rejected": -16.641513061523437, "step": 3440 }, { "epoch": 0.8610033779557112, "grad_norm": 9.3125, "kl": 4.9176764488220215, "learning_rate": 5e-06, "logits/chosen": -46591466.666666664, "logits/rejected": -46553765.333333336, "logps/chosen": -395.4475911458333, "logps/rejected": -537.458740234375, "loss": 0.0564, "rewards/chosen": 9.691715240478516, "rewards/margins": 23.488632202148438, "rewards/rejected": -13.796916961669922, "step": 3441 }, { "epoch": 0.8612535968972851, "grad_norm": 6.53125, "kl": 4.45770788192749, "learning_rate": 5e-06, "logits/chosen": -9623728.0, "logits/rejected": -27709010.666666668, "logps/chosen": -503.5404459635417, "logps/rejected": -586.5941975911459, "loss": 0.0218, "rewards/chosen": 10.355766932169596, "rewards/margins": 25.699543635050453, "rewards/rejected": -15.34377670288086, "step": 3442 }, { "epoch": 0.861503815838859, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -75128938.66666667, "logits/rejected": -47218250.666666664, "logps/chosen": -546.0396321614584, "logps/rejected": -645.4585367838541, "loss": 0.0031, "rewards/chosen": 11.731885274251303, "rewards/margins": 31.201409657796226, "rewards/rejected": -19.469524383544922, "step": 3443 }, { "epoch": 0.8617540347804329, "grad_norm": 6.8125, "kl": 24.890928268432617, "learning_rate": 5e-06, "logits/chosen": -49410232.88888889, "logits/rejected": -34724994.666666664, "logps/chosen": -408.79058159722223, "logps/rejected": -742.62060546875, "loss": 0.0831, "rewards/chosen": 9.84573703342014, "rewards/margins": 30.377961052788628, "rewards/rejected": -20.53222401936849, "step": 3444 }, { "epoch": 0.8620042537220067, "grad_norm": 4.96875, "kl": 18.95811653137207, "learning_rate": 5e-06, "logits/chosen": -45254224.0, "logits/rejected": -18760269.714285713, "logps/chosen": -525.060791015625, "logps/rejected": -695.8046875, "loss": 0.0443, "rewards/chosen": 10.7411376953125, "rewards/margins": 29.170136369977676, "rewards/rejected": -18.428998674665177, "step": 3445 }, { "epoch": 0.8622544726635807, "grad_norm": 3.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6674628.8, "logits/rejected": -26047872.0, "logps/chosen": -308.3902099609375, "logps/rejected": -612.9534040178571, "loss": 0.0226, "rewards/chosen": 7.993011474609375, "rewards/margins": 30.906912667410715, "rewards/rejected": -22.91390119280134, "step": 3446 }, { "epoch": 0.8625046916051545, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -25127037.866666667, "logits/rejected": -40818865.777777776, "logps/chosen": -322.5530598958333, "logps/rejected": -571.09423828125, "loss": 0.0587, "rewards/chosen": 7.622718811035156, "rewards/margins": 24.154997931586372, "rewards/rejected": -16.532279120551216, "step": 3447 }, { "epoch": 0.8627549105467284, "grad_norm": 5.21875, "kl": 3.7288360595703125, "learning_rate": 5e-06, "logits/chosen": -59984074.666666664, "logits/rejected": -47882800.0, "logps/chosen": -496.1824544270833, "logps/rejected": -712.8194986979166, "loss": 0.031, "rewards/chosen": 10.47537104288737, "rewards/margins": 31.121027628580727, "rewards/rejected": -20.64565658569336, "step": 3448 }, { "epoch": 0.8630051294883022, "grad_norm": 9.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31467463.111111112, "logits/rejected": -57018077.86666667, "logps/chosen": -357.94447157118054, "logps/rejected": -568.282421875, "loss": 0.0328, "rewards/chosen": 7.756232367621528, "rewards/margins": 25.909349229600696, "rewards/rejected": -18.153116861979168, "step": 3449 }, { "epoch": 0.8632553484298762, "grad_norm": 2.75, "kl": 6.968736171722412, "learning_rate": 5e-06, "logits/chosen": -60291858.28571428, "logits/rejected": -87581926.4, "logps/chosen": -356.03721400669644, "logps/rejected": -668.813037109375, "loss": 0.1175, "rewards/chosen": 10.131209237234932, "rewards/margins": 33.115295846121654, "rewards/rejected": -22.98408660888672, "step": 3450 }, { "epoch": 0.86350556737145, "grad_norm": 2.546875, "kl": 3.6997318267822266, "learning_rate": 5e-06, "logits/chosen": -35512466.28571428, "logits/rejected": -45896595.2, "logps/chosen": -488.79638671875, "logps/rejected": -553.56044921875, "loss": 0.0576, "rewards/chosen": 9.47921861921038, "rewards/margins": 29.508159964425225, "rewards/rejected": -20.028941345214843, "step": 3451 }, { "epoch": 0.8637557863130239, "grad_norm": 16.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41302523.428571425, "logits/rejected": -66977882.35294118, "logps/chosen": -324.8150111607143, "logps/rejected": -735.2728630514706, "loss": 0.0238, "rewards/chosen": 6.967808859688895, "rewards/margins": 27.979588644845144, "rewards/rejected": -21.01177978515625, "step": 3452 }, { "epoch": 0.8640060052545978, "grad_norm": 8.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21832736.0, "logits/rejected": -34254585.6, "logps/chosen": -272.3030482700893, "logps/rejected": -682.8955078125, "loss": 0.0674, "rewards/chosen": 7.860586983816964, "rewards/margins": 24.07650800432478, "rewards/rejected": -16.215921020507814, "step": 3453 }, { "epoch": 0.8642562241961717, "grad_norm": 5.96875, "kl": 1.8852272033691406, "learning_rate": 5e-06, "logits/chosen": -48892112.0, "logits/rejected": -28840530.666666668, "logps/chosen": -393.8746337890625, "logps/rejected": -534.0498046875, "loss": 0.0398, "rewards/chosen": 8.52026621500651, "rewards/margins": 26.391554514567055, "rewards/rejected": -17.871288299560547, "step": 3454 }, { "epoch": 0.8645064431377455, "grad_norm": 2.109375, "kl": 5.647876739501953, "learning_rate": 5e-06, "logits/chosen": -37952635.733333334, "logits/rejected": -40269312.0, "logps/chosen": -342.5437825520833, "logps/rejected": -656.0493706597222, "loss": 0.0501, "rewards/chosen": 8.22558135986328, "rewards/margins": 31.050113762749564, "rewards/rejected": -22.824532402886284, "step": 3455 }, { "epoch": 0.8647566620793194, "grad_norm": 0.94140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40074224.0, "logits/rejected": -58969914.666666664, "logps/chosen": -318.78940836588544, "logps/rejected": -624.6955973307291, "loss": 0.0094, "rewards/chosen": 8.334267298380533, "rewards/margins": 27.249377568562828, "rewards/rejected": -18.915110270182293, "step": 3456 }, { "epoch": 0.8650068810208933, "grad_norm": 2.328125, "kl": 6.171908855438232, "learning_rate": 5e-06, "logits/chosen": -44496172.8, "logits/rejected": 32292772.57142857, "logps/chosen": -424.0076171875, "logps/rejected": -738.0862862723214, "loss": 0.0035, "rewards/chosen": 11.169922637939454, "rewards/margins": 35.951418958391464, "rewards/rejected": -24.78149632045201, "step": 3457 }, { "epoch": 0.8652570999624671, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49062764.8, "logits/rejected": -57530112.0, "logps/chosen": -461.909375, "logps/rejected": -610.2184709821429, "loss": 0.0337, "rewards/chosen": 9.562892150878906, "rewards/margins": 29.530404009137833, "rewards/rejected": -19.967511858258927, "step": 3458 }, { "epoch": 0.8655073189040411, "grad_norm": 0.828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 126569176.0, "logits/rejected": -42944152.0, "logps/chosen": -533.9003295898438, "logps/rejected": -570.6421508789062, "loss": 0.0008, "rewards/chosen": 10.0740385055542, "rewards/margins": 28.47630786895752, "rewards/rejected": -18.40226936340332, "step": 3459 }, { "epoch": 0.8657575378456149, "grad_norm": 1.140625, "kl": 7.083930015563965, "learning_rate": 5e-06, "logits/chosen": -27082567.111111112, "logits/rejected": -2558617.066666667, "logps/chosen": -425.09776475694446, "logps/rejected": -825.0471354166667, "loss": 0.0348, "rewards/chosen": 9.886474609375, "rewards/margins": 28.73097127278646, "rewards/rejected": -18.84449666341146, "step": 3460 }, { "epoch": 0.8660077567871888, "grad_norm": 2.0, "kl": 4.610023498535156, "learning_rate": 5e-06, "logits/chosen": -72936487.38461539, "logits/rejected": -18594686.545454547, "logps/chosen": -501.4353215144231, "logps/rejected": -622.3713156960227, "loss": 0.0028, "rewards/chosen": 10.840529221754808, "rewards/margins": 28.86564380138904, "rewards/rejected": -18.025114579634234, "step": 3461 }, { "epoch": 0.8662579757287626, "grad_norm": 16.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64318257.23076923, "logits/rejected": -48642257.45454545, "logps/chosen": -433.83199368990387, "logps/rejected": -735.3864080255681, "loss": 0.0643, "rewards/chosen": 9.543318528395433, "rewards/margins": 31.57085093251475, "rewards/rejected": -22.027532404119317, "step": 3462 }, { "epoch": 0.8665081946703366, "grad_norm": 19.5, "kl": 10.288519859313965, "learning_rate": 5e-06, "logits/chosen": -48177984.0, "logits/rejected": -74726542.76923077, "logps/chosen": -387.11629971590907, "logps/rejected": -597.6875751201923, "loss": 0.1029, "rewards/chosen": 8.570510864257812, "rewards/margins": 22.266805795522835, "rewards/rejected": -13.696294931265024, "step": 3463 }, { "epoch": 0.8667584136119104, "grad_norm": 0.9140625, "kl": 12.010282516479492, "learning_rate": 5e-06, "logits/chosen": -38682761.84615385, "logits/rejected": -43303889.45454545, "logps/chosen": -457.85103665865387, "logps/rejected": -416.4244939630682, "loss": 0.0134, "rewards/chosen": 11.001606867863583, "rewards/margins": 24.626482023225797, "rewards/rejected": -13.624875155362217, "step": 3464 }, { "epoch": 0.8670086325534843, "grad_norm": 19.0, "kl": 0.9963874816894531, "learning_rate": 5e-06, "logits/chosen": -52735607.46666667, "logits/rejected": -30933425.777777776, "logps/chosen": -358.932421875, "logps/rejected": -819.0618489583334, "loss": 0.0686, "rewards/chosen": 7.7281646728515625, "rewards/margins": 28.729359944661457, "rewards/rejected": -21.001195271809895, "step": 3465 }, { "epoch": 0.8672588514950582, "grad_norm": 1.0078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54212768.0, "logits/rejected": -49032936.0, "logps/chosen": -401.12689208984375, "logps/rejected": -649.1390380859375, "loss": 0.0061, "rewards/chosen": 9.850912094116211, "rewards/margins": 27.142953872680664, "rewards/rejected": -17.292041778564453, "step": 3466 }, { "epoch": 0.8675090704366321, "grad_norm": 7.9375, "kl": 11.11578369140625, "learning_rate": 5e-06, "logits/chosen": -36691840.0, "logits/rejected": -34298195.2, "logps/chosen": -298.06717354910717, "logps/rejected": -778.43212890625, "loss": 0.0602, "rewards/chosen": 6.480438777378628, "rewards/margins": 30.410916682652065, "rewards/rejected": -23.930477905273438, "step": 3467 }, { "epoch": 0.8677592893782059, "grad_norm": 2.515625, "kl": 7.467310905456543, "learning_rate": 5e-06, "logits/chosen": -66754692.92307692, "logits/rejected": -33492797.09090909, "logps/chosen": -423.9967698317308, "logps/rejected": -490.36421342329544, "loss": 0.0039, "rewards/chosen": 10.781780536358173, "rewards/margins": 24.545684067519396, "rewards/rejected": -13.76390353116122, "step": 3468 }, { "epoch": 0.8680095083197799, "grad_norm": 5.875, "kl": 4.220156669616699, "learning_rate": 5e-06, "logits/chosen": -32290999.466666665, "logits/rejected": -35008462.222222224, "logps/chosen": -318.0935546875, "logps/rejected": -517.5664605034722, "loss": 0.0498, "rewards/chosen": 8.685135904947916, "rewards/margins": 25.74177992078993, "rewards/rejected": -17.056644015842014, "step": 3469 }, { "epoch": 0.8682597272613537, "grad_norm": 3.140625, "kl": 10.112049102783203, "learning_rate": 5e-06, "logits/chosen": -49526680.0, "logits/rejected": -55523500.0, "logps/chosen": -403.1163635253906, "logps/rejected": -870.3939208984375, "loss": 0.0204, "rewards/chosen": 7.833972930908203, "rewards/margins": 37.72007942199707, "rewards/rejected": -29.886106491088867, "step": 3470 }, { "epoch": 0.8685099462029275, "grad_norm": 11.0, "kl": 12.619620323181152, "learning_rate": 5e-06, "logits/chosen": -43282796.307692304, "logits/rejected": -37682106.18181818, "logps/chosen": -458.3948317307692, "logps/rejected": -528.6837713068181, "loss": 0.0741, "rewards/chosen": 10.180793175330528, "rewards/margins": 29.60916564514587, "rewards/rejected": -19.42837246981534, "step": 3471 }, { "epoch": 0.8687601651445014, "grad_norm": 5.8125, "kl": 4.687972545623779, "learning_rate": 5e-06, "logits/chosen": -31926302.11764706, "logits/rejected": -50416045.71428572, "logps/chosen": -399.3623621323529, "logps/rejected": -591.3120814732143, "loss": 0.0526, "rewards/chosen": 9.734816607306986, "rewards/margins": 24.717736572778527, "rewards/rejected": -14.98291996547154, "step": 3472 }, { "epoch": 0.8690103840860753, "grad_norm": 11.375, "kl": 9.203490257263184, "learning_rate": 5e-06, "logits/chosen": -67601427.2, "logits/rejected": -34972205.71428572, "logps/chosen": -488.0810546875, "logps/rejected": -576.010986328125, "loss": 0.077, "rewards/chosen": 9.368679809570313, "rewards/margins": 29.90518973214286, "rewards/rejected": -20.536509922572545, "step": 3473 }, { "epoch": 0.8692606030276492, "grad_norm": 8.25, "kl": 0.7488810420036316, "learning_rate": 5e-06, "logits/chosen": -41686637.71428572, "logits/rejected": -57045267.2, "logps/chosen": -348.14773995535717, "logps/rejected": -590.3755859375, "loss": 0.0376, "rewards/chosen": 9.239700317382812, "rewards/margins": 27.616017150878907, "rewards/rejected": -18.376316833496094, "step": 3474 }, { "epoch": 0.869510821969223, "grad_norm": 7.21875, "kl": 11.378637313842773, "learning_rate": 5e-06, "logits/chosen": -33037804.0, "logits/rejected": -27630972.0, "logps/chosen": -359.6537170410156, "logps/rejected": -581.793701171875, "loss": 0.0199, "rewards/chosen": 9.954888343811035, "rewards/margins": 25.091553688049316, "rewards/rejected": -15.136665344238281, "step": 3475 }, { "epoch": 0.869761040910797, "grad_norm": 0.5078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41297801.6, "logits/rejected": -62777778.28571428, "logps/chosen": -507.1861328125, "logps/rejected": -577.0569893973214, "loss": 0.0009, "rewards/chosen": 10.858907318115234, "rewards/margins": 30.113482775006972, "rewards/rejected": -19.25457545689174, "step": 3476 }, { "epoch": 0.8700112598523708, "grad_norm": 4.65625, "kl": 10.955331802368164, "learning_rate": 5e-06, "logits/chosen": -57302702.54545455, "logits/rejected": -34351734.15384615, "logps/chosen": -450.57426313920456, "logps/rejected": -592.2168719951923, "loss": 0.0103, "rewards/chosen": 10.468408064408736, "rewards/margins": 26.55772597306258, "rewards/rejected": -16.089317908653847, "step": 3477 }, { "epoch": 0.8702614787939447, "grad_norm": 2.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23608845.333333332, "logits/rejected": -21870264.0, "logps/chosen": -302.7200520833333, "logps/rejected": -719.738037109375, "loss": 0.0227, "rewards/chosen": 8.484755833943685, "rewards/margins": 30.38633155822754, "rewards/rejected": -21.901575724283855, "step": 3478 }, { "epoch": 0.8705116977355186, "grad_norm": 11.125, "kl": 1.7276370525360107, "learning_rate": 5e-06, "logits/chosen": -45869765.81818182, "logits/rejected": -29070237.53846154, "logps/chosen": -346.1038263494318, "logps/rejected": -552.7812124399038, "loss": 0.0182, "rewards/chosen": 8.338917818936435, "rewards/margins": 22.2284840937261, "rewards/rejected": -13.889566274789663, "step": 3479 }, { "epoch": 0.8707619166770925, "grad_norm": 1.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49231209.14285714, "logits/rejected": -53019574.4, "logps/chosen": -364.5970982142857, "logps/rejected": -710.037060546875, "loss": 0.039, "rewards/chosen": 7.858834947858538, "rewards/margins": 28.742112840924946, "rewards/rejected": -20.883277893066406, "step": 3480 }, { "epoch": 0.8710121356186663, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 4741275.428571428, "logits/rejected": -28629722.352941178, "logps/chosen": -299.351806640625, "logps/rejected": -393.0396369485294, "loss": 0.0164, "rewards/chosen": 7.616309574672154, "rewards/margins": 17.23380673833254, "rewards/rejected": -9.617497163660387, "step": 3481 }, { "epoch": 0.8712623545602403, "grad_norm": 8.0625, "kl": 8.709150314331055, "learning_rate": 5e-06, "logits/chosen": -16687712.0, "logits/rejected": -53757797.333333336, "logps/chosen": -301.19580078125, "logps/rejected": -922.07666015625, "loss": 0.0651, "rewards/chosen": 7.220244513617621, "rewards/margins": 29.746329413519966, "rewards/rejected": -22.526084899902344, "step": 3482 }, { "epoch": 0.8715125735018141, "grad_norm": 0.15234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44238770.666666664, "logits/rejected": -44029120.0, "logps/chosen": -521.3534749348959, "logps/rejected": -530.3155517578125, "loss": 0.0003, "rewards/chosen": 12.447312672932943, "rewards/margins": 31.768147786458336, "rewards/rejected": -19.32083511352539, "step": 3483 }, { "epoch": 0.871762792443388, "grad_norm": 6.53125, "kl": 17.40579605102539, "learning_rate": 5e-06, "logits/chosen": -60392605.86666667, "logits/rejected": -66556309.333333336, "logps/chosen": -448.0633138020833, "logps/rejected": -569.46337890625, "loss": 0.078, "rewards/chosen": 9.13824462890625, "rewards/margins": 23.99889458550347, "rewards/rejected": -14.860649956597221, "step": 3484 }, { "epoch": 0.8720130113849618, "grad_norm": 24.125, "kl": 5.751862049102783, "learning_rate": 5e-06, "logits/chosen": -32837833.14285714, "logits/rejected": -13199528.0, "logps/chosen": -319.37869698660717, "logps/rejected": -428.903466796875, "loss": 0.0546, "rewards/chosen": 5.712645939418247, "rewards/margins": 17.216526249476843, "rewards/rejected": -11.503880310058594, "step": 3485 }, { "epoch": 0.8722632303265357, "grad_norm": 11.9375, "kl": 4.841195583343506, "learning_rate": 5e-06, "logits/chosen": -36043133.09090909, "logits/rejected": -61274338.461538464, "logps/chosen": -384.2327769886364, "logps/rejected": -634.3969350961538, "loss": 0.0671, "rewards/chosen": 8.661341580477627, "rewards/margins": 27.08565329171561, "rewards/rejected": -18.42431171123798, "step": 3486 }, { "epoch": 0.8725134492681096, "grad_norm": 2.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68651826.28571428, "logits/rejected": -19265931.2, "logps/chosen": -477.54268973214283, "logps/rejected": -629.8080078125, "loss": 0.0378, "rewards/chosen": 9.687103271484375, "rewards/margins": 28.023788452148438, "rewards/rejected": -18.336685180664062, "step": 3487 }, { "epoch": 0.8727636682096834, "grad_norm": 12.75, "kl": 2.053499221801758, "learning_rate": 5e-06, "logits/chosen": -28121618.666666668, "logits/rejected": -34491381.333333336, "logps/chosen": -414.3540852864583, "logps/rejected": -740.4046223958334, "loss": 0.0469, "rewards/chosen": 8.724291483561197, "rewards/margins": 29.117375691731773, "rewards/rejected": -20.393084208170574, "step": 3488 }, { "epoch": 0.8730138871512574, "grad_norm": 7.71875, "kl": 0.3010028302669525, "learning_rate": 5e-06, "logits/chosen": -19811253.333333332, "logits/rejected": -32091890.666666668, "logps/chosen": -361.798583984375, "logps/rejected": -564.289794921875, "loss": 0.0236, "rewards/chosen": 7.995500564575195, "rewards/margins": 28.277644475301106, "rewards/rejected": -20.28214391072591, "step": 3489 }, { "epoch": 0.8732641060928312, "grad_norm": 13.8125, "kl": 16.190898895263672, "learning_rate": 5e-06, "logits/chosen": -3217014.153846154, "logits/rejected": -45372791.27272727, "logps/chosen": -367.3820612980769, "logps/rejected": -555.1054243607955, "loss": 0.1296, "rewards/chosen": 9.128287095289965, "rewards/margins": 23.462135528351045, "rewards/rejected": -14.33384843306108, "step": 3490 }, { "epoch": 0.8735143250344051, "grad_norm": 5.03125, "kl": 9.775385856628418, "learning_rate": 5e-06, "logits/chosen": -51088753.23076923, "logits/rejected": -55027776.0, "logps/chosen": -298.17003455528845, "logps/rejected": -584.8488103693181, "loss": 0.0397, "rewards/chosen": 7.636959956242488, "rewards/margins": 23.450544717428567, "rewards/rejected": -15.81358476118608, "step": 3491 }, { "epoch": 0.873764543975979, "grad_norm": 3.484375, "kl": 8.500158309936523, "learning_rate": 5e-06, "logits/chosen": -30177106.666666668, "logits/rejected": -31627458.666666668, "logps/chosen": -375.7908935546875, "logps/rejected": -891.4656575520834, "loss": 0.0038, "rewards/chosen": 11.170824686686197, "rewards/margins": 32.858350118001304, "rewards/rejected": -21.687525431315105, "step": 3492 }, { "epoch": 0.8740147629175529, "grad_norm": 2.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30430137.6, "logits/rejected": -34492251.428571425, "logps/chosen": -369.1553466796875, "logps/rejected": -519.3116629464286, "loss": 0.0333, "rewards/chosen": 7.280263519287109, "rewards/margins": 23.454499271937777, "rewards/rejected": -16.17423575265067, "step": 3493 }, { "epoch": 0.8742649818591267, "grad_norm": 14.875, "kl": 13.738447189331055, "learning_rate": 5e-06, "logits/chosen": -87834368.0, "logits/rejected": -67089960.72727273, "logps/chosen": -479.5993840144231, "logps/rejected": -517.4357244318181, "loss": 0.0949, "rewards/chosen": 11.719855675330528, "rewards/margins": 26.882969356083372, "rewards/rejected": -15.163113680752842, "step": 3494 }, { "epoch": 0.8745152008007007, "grad_norm": 2.53125, "kl": 5.4975104331970215, "learning_rate": 5e-06, "logits/chosen": -44753385.14285714, "logits/rejected": -26740112.0, "logps/chosen": -390.4558803013393, "logps/rejected": -551.2099609375, "loss": 0.0325, "rewards/chosen": 8.827649797712054, "rewards/margins": 25.071894182477678, "rewards/rejected": -16.244244384765626, "step": 3495 }, { "epoch": 0.8747654197422745, "grad_norm": 4.875, "kl": 5.394972801208496, "learning_rate": 5e-06, "logits/chosen": -23658592.0, "logits/rejected": -38754269.538461536, "logps/chosen": -362.89200106534093, "logps/rejected": -475.9330303485577, "loss": 0.0113, "rewards/chosen": 9.438323974609375, "rewards/margins": 23.45937758225661, "rewards/rejected": -14.021053607647236, "step": 3496 }, { "epoch": 0.8750156386838484, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52933876.36363637, "logits/rejected": -33611803.07692308, "logps/chosen": -319.91184303977275, "logps/rejected": -695.7135667067307, "loss": 0.008, "rewards/chosen": 7.543406399813565, "rewards/margins": 27.01269163118376, "rewards/rejected": -19.469285231370193, "step": 3497 }, { "epoch": 0.8752658576254222, "grad_norm": 18.375, "kl": 5.734790802001953, "learning_rate": 5e-06, "logits/chosen": -58964072.72727273, "logits/rejected": -46720585.84615385, "logps/chosen": -409.29225852272725, "logps/rejected": -672.2512019230769, "loss": 0.061, "rewards/chosen": 10.009342540394176, "rewards/margins": 24.840697708663406, "rewards/rejected": -14.83135516826923, "step": 3498 }, { "epoch": 0.8755160765669961, "grad_norm": 5.5, "kl": 14.852598190307617, "learning_rate": 5e-06, "logits/chosen": -43296871.384615384, "logits/rejected": -24105266.90909091, "logps/chosen": -419.0271183894231, "logps/rejected": -506.16264204545456, "loss": 0.0331, "rewards/chosen": 10.60536898099459, "rewards/margins": 25.504981407752403, "rewards/rejected": -14.899612426757812, "step": 3499 }, { "epoch": 0.87576629550857, "grad_norm": 21.875, "kl": 27.28453826904297, "learning_rate": 5e-06, "logits/chosen": -39126731.294117644, "logits/rejected": -33457817.14285714, "logps/chosen": -359.83800551470586, "logps/rejected": -516.1396484375, "loss": 0.2804, "rewards/chosen": 8.156498628504137, "rewards/margins": 20.419150344463958, "rewards/rejected": -12.262651715959821, "step": 3500 }, { "epoch": 0.8760165144501438, "grad_norm": 3.53125, "kl": 0.01238250732421875, "learning_rate": 5e-06, "logits/chosen": -39314901.333333336, "logits/rejected": -69900167.1111111, "logps/chosen": -401.12291666666664, "logps/rejected": -793.1433919270834, "loss": 0.0313, "rewards/chosen": 9.82814229329427, "rewards/margins": 31.292555406358506, "rewards/rejected": -21.464413113064236, "step": 3501 }, { "epoch": 0.8762667333917178, "grad_norm": 0.7421875, "kl": 0.26519775390625, "learning_rate": 5e-06, "logits/chosen": -64505255.384615384, "logits/rejected": -38148221.09090909, "logps/chosen": -458.3414963942308, "logps/rejected": -520.1825727982955, "loss": 0.0157, "rewards/chosen": 10.279526930588942, "rewards/margins": 27.63161100374235, "rewards/rejected": -17.35208407315341, "step": 3502 }, { "epoch": 0.8765169523332916, "grad_norm": 20.25, "kl": 5.348047256469727, "learning_rate": 5e-06, "logits/chosen": 37508404.36363637, "logits/rejected": -45389602.461538464, "logps/chosen": -431.95938387784093, "logps/rejected": -620.1787860576923, "loss": 0.0327, "rewards/chosen": 9.256132646040482, "rewards/margins": 30.644907811304904, "rewards/rejected": -21.388775165264423, "step": 3503 }, { "epoch": 0.8767671712748655, "grad_norm": 0.25, "kl": 0.12527689337730408, "learning_rate": 5e-06, "logits/chosen": -40986042.666666664, "logits/rejected": -19338796.0, "logps/chosen": -526.5663655598959, "logps/rejected": -789.27685546875, "loss": 0.0004, "rewards/chosen": 10.52834383646647, "rewards/margins": 30.603637059529625, "rewards/rejected": -20.075293223063152, "step": 3504 }, { "epoch": 0.8770173902164394, "grad_norm": 1.8203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32284522.666666668, "logits/rejected": -16291065.6, "logps/chosen": -385.85582139756946, "logps/rejected": -654.5609375, "loss": 0.0126, "rewards/chosen": 9.295000712076822, "rewards/margins": 27.57289377848307, "rewards/rejected": -18.27789306640625, "step": 3505 }, { "epoch": 0.8772676091580133, "grad_norm": 2.6875, "kl": 5.3422088623046875, "learning_rate": 5e-06, "logits/chosen": -46049078.85714286, "logits/rejected": -19758204.8, "logps/chosen": -407.06996372767856, "logps/rejected": -439.0158203125, "loss": 0.0077, "rewards/chosen": 9.821343558175224, "rewards/margins": 25.3222407749721, "rewards/rejected": -15.500897216796876, "step": 3506 }, { "epoch": 0.8775178280995871, "grad_norm": 6.84375, "kl": 3.8835322856903076, "learning_rate": 5e-06, "logits/chosen": -55850555.07692308, "logits/rejected": 80364218.18181819, "logps/chosen": -431.5446213942308, "logps/rejected": -591.2766335227273, "loss": 0.0044, "rewards/chosen": 12.38507314828726, "rewards/margins": 29.922022999583426, "rewards/rejected": -17.536949851296164, "step": 3507 }, { "epoch": 0.8777680470411611, "grad_norm": 5.5, "kl": 0.9364904165267944, "learning_rate": 5e-06, "logits/chosen": -42539872.0, "logits/rejected": -18547186.285714287, "logps/chosen": -446.951611328125, "logps/rejected": -405.44754464285717, "loss": 0.0034, "rewards/chosen": 10.959182739257812, "rewards/margins": 23.51131875174386, "rewards/rejected": -12.552136012486049, "step": 3508 }, { "epoch": 0.8780182659827349, "grad_norm": 9.625, "kl": 3.875466823577881, "learning_rate": 5e-06, "logits/chosen": -47632085.333333336, "logits/rejected": -42960853.333333336, "logps/chosen": -407.4012044270833, "logps/rejected": -695.9214680989584, "loss": 0.0313, "rewards/chosen": 9.362682342529297, "rewards/margins": 25.973944346110027, "rewards/rejected": -16.61126200358073, "step": 3509 }, { "epoch": 0.8782684849243088, "grad_norm": 22.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22777596.444444444, "logits/rejected": -55068125.86666667, "logps/chosen": -395.72089301215277, "logps/rejected": -720.6970052083333, "loss": 0.0335, "rewards/chosen": 10.574483235677084, "rewards/margins": 29.378271484375, "rewards/rejected": -18.803788248697916, "step": 3510 }, { "epoch": 0.8785187038658826, "grad_norm": 9.1875, "kl": 6.7372355461120605, "learning_rate": 5e-06, "logits/chosen": -64550128.0, "logits/rejected": -17351972.0, "logps/chosen": -512.9631958007812, "logps/rejected": -474.2779235839844, "loss": 0.0146, "rewards/chosen": 11.224201202392578, "rewards/margins": 27.438823699951172, "rewards/rejected": -16.214622497558594, "step": 3511 }, { "epoch": 0.8787689228074566, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53243881.6, "logits/rejected": -7391868.0, "logps/chosen": -438.39765625, "logps/rejected": -726.9963030133929, "loss": 0.0225, "rewards/chosen": 10.342338562011719, "rewards/margins": 25.175048828125, "rewards/rejected": -14.832710266113281, "step": 3512 }, { "epoch": 0.8790191417490304, "grad_norm": 3.4375, "kl": 4.361793041229248, "learning_rate": 5e-06, "logits/chosen": -50016885.333333336, "logits/rejected": -32127608.0, "logps/chosen": -329.2644449869792, "logps/rejected": -565.9168701171875, "loss": 0.0163, "rewards/chosen": 6.939074198404948, "rewards/margins": 26.60351816813151, "rewards/rejected": -19.664443969726562, "step": 3513 }, { "epoch": 0.8792693606906042, "grad_norm": 0.9296875, "kl": 3.7996115684509277, "learning_rate": 5e-06, "logits/chosen": -18059968.0, "logits/rejected": -32363150.545454547, "logps/chosen": -366.8538161057692, "logps/rejected": -578.0814541903409, "loss": 0.0227, "rewards/chosen": 7.9247612586388225, "rewards/margins": 23.690773277015953, "rewards/rejected": -15.76601201837713, "step": 3514 }, { "epoch": 0.8795195796321782, "grad_norm": 5.5, "kl": 23.02729606628418, "learning_rate": 5e-06, "logits/chosen": -26394752.0, "logits/rejected": -56730240.0, "logps/chosen": -518.8658272879464, "logps/rejected": -791.687451171875, "loss": 0.0095, "rewards/chosen": 11.83224596296038, "rewards/margins": 34.590635245186945, "rewards/rejected": -22.758389282226563, "step": 3515 }, { "epoch": 0.879769798573752, "grad_norm": 3.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39892590.54545455, "logits/rejected": -53648802.461538464, "logps/chosen": -451.50577059659093, "logps/rejected": -789.908203125, "loss": 0.046, "rewards/chosen": 7.973960876464844, "rewards/margins": 34.760912968562195, "rewards/rejected": -26.786952092097355, "step": 3516 }, { "epoch": 0.8800200175153259, "grad_norm": 0.65234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48603776.0, "logits/rejected": -40172689.45454545, "logps/chosen": -444.33687650240387, "logps/rejected": -568.6198508522727, "loss": 0.0047, "rewards/chosen": 10.5233400785006, "rewards/margins": 29.807641836313103, "rewards/rejected": -19.2843017578125, "step": 3517 }, { "epoch": 0.8802702364568998, "grad_norm": 2.140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -64918848.0, "logits/rejected": -57443345.45454545, "logps/chosen": -423.8599384014423, "logps/rejected": -683.4215198863636, "loss": 0.0236, "rewards/chosen": 8.508799039400541, "rewards/margins": 29.895269487287614, "rewards/rejected": -21.386470447887074, "step": 3518 }, { "epoch": 0.8805204553984737, "grad_norm": 4.40625, "kl": 6.827731132507324, "learning_rate": 5e-06, "logits/chosen": -51399088.0, "logits/rejected": -46543749.333333336, "logps/chosen": -443.26904296875, "logps/rejected": -540.4437255859375, "loss": 0.048, "rewards/chosen": 9.395888010660807, "rewards/margins": 28.04160181681315, "rewards/rejected": -18.645713806152344, "step": 3519 }, { "epoch": 0.8807706743400475, "grad_norm": 1.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49925192.0, "logits/rejected": -42542104.0, "logps/chosen": -425.741455078125, "logps/rejected": -565.3547973632812, "loss": 0.0301, "rewards/chosen": 9.724347114562988, "rewards/margins": 25.833613395690918, "rewards/rejected": -16.10926628112793, "step": 3520 }, { "epoch": 0.8810208932816214, "grad_norm": 1.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42885288.0, "logits/rejected": -45824368.0, "logps/chosen": -340.09661865234375, "logps/rejected": -695.1806030273438, "loss": 0.0056, "rewards/chosen": 9.08309268951416, "rewards/margins": 32.04166507720947, "rewards/rejected": -22.958572387695312, "step": 3521 }, { "epoch": 0.8812711122231953, "grad_norm": 2.859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20576410.666666668, "logits/rejected": -43797282.666666664, "logps/chosen": -430.1531982421875, "logps/rejected": -861.8927408854166, "loss": 0.0414, "rewards/chosen": 8.912598927815756, "rewards/margins": 35.784036000569664, "rewards/rejected": -26.871437072753906, "step": 3522 }, { "epoch": 0.8815213311647692, "grad_norm": 11.0, "kl": 2.283547878265381, "learning_rate": 5e-06, "logits/chosen": -18514584.0, "logits/rejected": -51793033.14285714, "logps/chosen": -368.028857421875, "logps/rejected": -529.9784458705357, "loss": 0.052, "rewards/chosen": 8.050103759765625, "rewards/margins": 24.89565756661551, "rewards/rejected": -16.84555380684989, "step": 3523 }, { "epoch": 0.881771550106343, "grad_norm": 4.03125, "kl": 2.735687255859375, "learning_rate": 5e-06, "logits/chosen": -60683537.06666667, "logits/rejected": -25922286.222222224, "logps/chosen": -280.44306640625, "logps/rejected": -628.4701063368055, "loss": 0.0196, "rewards/chosen": 7.375937906901042, "rewards/margins": 23.75389472113715, "rewards/rejected": -16.37795681423611, "step": 3524 }, { "epoch": 0.882021769047917, "grad_norm": 18.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19665780.363636363, "logits/rejected": -50662971.07692308, "logps/chosen": -337.1357421875, "logps/rejected": -738.5169020432693, "loss": 0.0216, "rewards/chosen": 8.446251609108664, "rewards/margins": 27.382603678669962, "rewards/rejected": -18.9363520695613, "step": 3525 }, { "epoch": 0.8822719879894908, "grad_norm": 1.96875, "kl": 2.773669719696045, "learning_rate": 5e-06, "logits/chosen": -45895060.36363637, "logits/rejected": -30374545.230769232, "logps/chosen": -380.40047940340907, "logps/rejected": -613.1191030649038, "loss": 0.0028, "rewards/chosen": 7.972742254083807, "rewards/margins": 30.488106680916736, "rewards/rejected": -22.51536442683293, "step": 3526 }, { "epoch": 0.8825222069310646, "grad_norm": 11.0, "kl": 11.179986953735352, "learning_rate": 5e-06, "logits/chosen": -39017080.47058824, "logits/rejected": 5681417.142857143, "logps/chosen": -395.7303251378676, "logps/rejected": -808.7589285714286, "loss": 0.0566, "rewards/chosen": 9.184193330652574, "rewards/margins": 34.77713025517824, "rewards/rejected": -25.59293692452567, "step": 3527 }, { "epoch": 0.8827724258726386, "grad_norm": 8.625, "kl": 12.569405555725098, "learning_rate": 5e-06, "logits/chosen": -41951995.07692308, "logits/rejected": -52011659.63636363, "logps/chosen": -350.5563777043269, "logps/rejected": -626.0881569602273, "loss": 0.0356, "rewards/chosen": 9.90627699631911, "rewards/margins": 24.86199460329709, "rewards/rejected": -14.955717606977982, "step": 3528 }, { "epoch": 0.8830226448142124, "grad_norm": 3.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -63248659.692307696, "logits/rejected": -40183808.0, "logps/chosen": -396.86868990384613, "logps/rejected": -861.9009232954545, "loss": 0.0316, "rewards/chosen": 8.992201585036058, "rewards/margins": 33.6723357514068, "rewards/rejected": -24.68013416637074, "step": 3529 }, { "epoch": 0.8832728637557863, "grad_norm": 1.921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24235644.444444444, "logits/rejected": -48798442.666666664, "logps/chosen": -405.89794921875, "logps/rejected": -753.6374348958333, "loss": 0.0181, "rewards/chosen": 9.289745754665798, "rewards/margins": 30.007999335394963, "rewards/rejected": -20.718253580729165, "step": 3530 }, { "epoch": 0.8835230826973602, "grad_norm": 1.0390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -69412869.81818181, "logits/rejected": -61747584.0, "logps/chosen": -339.38077059659093, "logps/rejected": -760.5519831730769, "loss": 0.0029, "rewards/chosen": 8.985397338867188, "rewards/margins": 32.52979102501502, "rewards/rejected": -23.544393686147835, "step": 3531 }, { "epoch": 0.8837733016389341, "grad_norm": 11.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20123254.153846152, "logits/rejected": -43051490.90909091, "logps/chosen": -351.21987680288464, "logps/rejected": -685.9308860085227, "loss": 0.0298, "rewards/chosen": 8.055490347055288, "rewards/margins": 27.828644865876313, "rewards/rejected": -19.773154518821023, "step": 3532 }, { "epoch": 0.8840235205805079, "grad_norm": 0.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35598186.666666664, "logits/rejected": -48570629.333333336, "logps/chosen": -507.4412434895833, "logps/rejected": -647.8389485677084, "loss": 0.0025, "rewards/chosen": 11.869539896647135, "rewards/margins": 28.91759490966797, "rewards/rejected": -17.048055013020832, "step": 3533 }, { "epoch": 0.8842737395220818, "grad_norm": 4.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30526798.0, "logits/rejected": -30781508.0, "logps/chosen": -360.6224365234375, "logps/rejected": -460.099365234375, "loss": 0.0595, "rewards/chosen": 9.262928009033203, "rewards/margins": 20.12716293334961, "rewards/rejected": -10.864234924316406, "step": 3534 }, { "epoch": 0.8845239584636557, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -83971916.8, "logits/rejected": -62708585.14285714, "logps/chosen": -368.1653076171875, "logps/rejected": -715.0520368303571, "loss": 0.0453, "rewards/chosen": 9.799618530273438, "rewards/margins": 27.228944178989956, "rewards/rejected": -17.429325648716517, "step": 3535 }, { "epoch": 0.8847741774052296, "grad_norm": 2.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 96781124.26666667, "logits/rejected": -56679288.88888889, "logps/chosen": -363.45813802083336, "logps/rejected": -670.8628472222222, "loss": 0.0183, "rewards/chosen": 8.462307230631511, "rewards/margins": 27.609039137098527, "rewards/rejected": -19.146731906467014, "step": 3536 }, { "epoch": 0.8850243963468034, "grad_norm": 0.73046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20385854.222222224, "logits/rejected": -53205521.06666667, "logps/chosen": -347.93402777777777, "logps/rejected": -600.1393880208333, "loss": 0.0163, "rewards/chosen": 8.680105421278212, "rewards/margins": 25.979535759819875, "rewards/rejected": -17.299430338541665, "step": 3537 }, { "epoch": 0.8852746152883774, "grad_norm": 13.4375, "kl": 0.5519479513168335, "learning_rate": 5e-06, "logits/chosen": -39253520.0, "logits/rejected": -51883792.0, "logps/chosen": -343.6383463541667, "logps/rejected": -595.2027994791666, "loss": 0.0467, "rewards/chosen": 7.444177627563477, "rewards/margins": 23.721469243367512, "rewards/rejected": -16.277291615804035, "step": 3538 }, { "epoch": 0.8855248342299512, "grad_norm": 3.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42127385.6, "logits/rejected": -84539235.55555555, "logps/chosen": -285.89026692708336, "logps/rejected": -841.9453667534722, "loss": 0.0469, "rewards/chosen": 7.388307189941406, "rewards/margins": 26.546664598253038, "rewards/rejected": -19.15835740831163, "step": 3539 }, { "epoch": 0.885775053171525, "grad_norm": 15.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12394200.0, "logits/rejected": -41413717.333333336, "logps/chosen": -234.55257161458334, "logps/rejected": -639.273193359375, "loss": 0.0622, "rewards/chosen": 6.5384572347005205, "rewards/margins": 24.415119171142578, "rewards/rejected": -17.87666193644206, "step": 3540 }, { "epoch": 0.886025272113099, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41635814.4, "logits/rejected": 47349489.777777776, "logps/chosen": -347.8814453125, "logps/rejected": -865.4913194444445, "loss": 0.0444, "rewards/chosen": 9.341150919596354, "rewards/margins": 35.84440782335069, "rewards/rejected": -26.50325690375434, "step": 3541 }, { "epoch": 0.8862754910546728, "grad_norm": 1.53125, "kl": 0.7327525019645691, "learning_rate": 5e-06, "logits/chosen": -45679222.85714286, "logits/rejected": -48611312.0, "logps/chosen": -455.0830775669643, "logps/rejected": -745.722314453125, "loss": 0.0049, "rewards/chosen": 9.598833356584821, "rewards/margins": 29.53981083461216, "rewards/rejected": -19.940977478027342, "step": 3542 }, { "epoch": 0.8865257099962467, "grad_norm": 3.734375, "kl": 11.181112289428711, "learning_rate": 5e-06, "logits/chosen": 1327896.0, "logits/rejected": -50682245.81818182, "logps/chosen": -469.84033203125, "logps/rejected": -636.2815163352273, "loss": 0.0835, "rewards/chosen": 9.44761481651893, "rewards/margins": 26.801549364636827, "rewards/rejected": -17.3539345481179, "step": 3543 }, { "epoch": 0.8867759289378206, "grad_norm": 5.03125, "kl": 6.331469535827637, "learning_rate": 5e-06, "logits/chosen": -39545188.571428575, "logits/rejected": -26938828.8, "logps/chosen": -308.0029296875, "logps/rejected": -528.78740234375, "loss": 0.0408, "rewards/chosen": 9.005165100097656, "rewards/margins": 26.53498077392578, "rewards/rejected": -17.529815673828125, "step": 3544 }, { "epoch": 0.8870261478793945, "grad_norm": 25.5, "kl": 2.7045936584472656, "learning_rate": 5e-06, "logits/chosen": -16624669.333333334, "logits/rejected": -40340477.333333336, "logps/chosen": -354.5863037109375, "logps/rejected": -678.650390625, "loss": 0.094, "rewards/chosen": 7.201658248901367, "rewards/margins": 24.39222780863444, "rewards/rejected": -17.190569559733074, "step": 3545 }, { "epoch": 0.8872763668209683, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43928469.333333336, "logits/rejected": -64286490.666666664, "logps/chosen": -418.9471842447917, "logps/rejected": -750.6517740885416, "loss": 0.0283, "rewards/chosen": 9.349239349365234, "rewards/margins": 34.52497227986653, "rewards/rejected": -25.1757329305013, "step": 3546 }, { "epoch": 0.8875265857625422, "grad_norm": 15.0625, "kl": 8.711920738220215, "learning_rate": 5e-06, "logits/chosen": -46442736.0, "logits/rejected": -6817747.333333333, "logps/chosen": -373.9108072916667, "logps/rejected": -582.3116861979166, "loss": 0.0337, "rewards/chosen": 8.72821299235026, "rewards/margins": 23.87250264485677, "rewards/rejected": -15.14428965250651, "step": 3547 }, { "epoch": 0.8877768047041161, "grad_norm": 4.21875, "kl": 2.5194449424743652, "learning_rate": 5e-06, "logits/chosen": -34197618.666666664, "logits/rejected": -55995338.666666664, "logps/chosen": -332.61171468098956, "logps/rejected": -837.3741861979166, "loss": 0.0124, "rewards/chosen": 9.442577362060547, "rewards/margins": 31.84194819132487, "rewards/rejected": -22.399370829264324, "step": 3548 }, { "epoch": 0.88802702364569, "grad_norm": 4.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20156258.90909091, "logits/rejected": -44403633.23076923, "logps/chosen": -340.86661044034093, "logps/rejected": -470.1190655048077, "loss": 0.0191, "rewards/chosen": 8.664235201748935, "rewards/margins": 21.325013540841482, "rewards/rejected": -12.660778339092548, "step": 3549 }, { "epoch": 0.8882772425872638, "grad_norm": 7.84375, "kl": 17.180910110473633, "learning_rate": 5e-06, "logits/chosen": -29982848.0, "logits/rejected": -86504721.45454545, "logps/chosen": -279.7267503004808, "logps/rejected": -479.09419389204544, "loss": 0.0581, "rewards/chosen": 7.192154517540565, "rewards/margins": 21.77524001114852, "rewards/rejected": -14.583085493607955, "step": 3550 }, { "epoch": 0.8885274615288378, "grad_norm": 14.8125, "kl": 18.139162063598633, "learning_rate": 5e-06, "logits/chosen": -7143572.0, "logits/rejected": -57583088.0, "logps/chosen": -487.1918029785156, "logps/rejected": -633.14794921875, "loss": 0.0577, "rewards/chosen": 10.979626655578613, "rewards/margins": 28.057339668273926, "rewards/rejected": -17.077713012695312, "step": 3551 }, { "epoch": 0.8887776804704116, "grad_norm": 7.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53758528.0, "logits/rejected": -56177733.81818182, "logps/chosen": -363.1078350360577, "logps/rejected": -664.7646484375, "loss": 0.0224, "rewards/chosen": 9.80609365609976, "rewards/margins": 26.052277571671496, "rewards/rejected": -16.246183915571734, "step": 3552 }, { "epoch": 0.8890278994119855, "grad_norm": 8.5625, "kl": 31.09537696838379, "learning_rate": 5e-06, "logits/chosen": -55373048.0, "logits/rejected": -34496304.0, "logps/chosen": -459.2705078125, "logps/rejected": -495.7012634277344, "loss": 0.028, "rewards/chosen": 11.879775047302246, "rewards/margins": 24.93155002593994, "rewards/rejected": -13.051774978637695, "step": 3553 }, { "epoch": 0.8892781183535594, "grad_norm": 4.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -3185761.5, "logits/rejected": 2810681.0, "logps/chosen": -312.50732421875, "logps/rejected": -653.3818969726562, "loss": 0.0183, "rewards/chosen": 7.304194927215576, "rewards/margins": 25.907958507537842, "rewards/rejected": -18.603763580322266, "step": 3554 }, { "epoch": 0.8895283372951333, "grad_norm": 6.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 100725650.28571428, "logits/rejected": -40439401.6, "logps/chosen": -331.2613002232143, "logps/rejected": -714.6546875, "loss": 0.0283, "rewards/chosen": 8.577002934047155, "rewards/margins": 23.27026312691825, "rewards/rejected": -14.693260192871094, "step": 3555 }, { "epoch": 0.8897785562367071, "grad_norm": 24.625, "kl": 9.007562637329102, "learning_rate": 5e-06, "logits/chosen": -49211048.0, "logits/rejected": -60229344.0, "logps/chosen": -506.52044677734375, "logps/rejected": -826.952392578125, "loss": 0.0216, "rewards/chosen": 13.77773380279541, "rewards/margins": 33.69819355010986, "rewards/rejected": -19.920459747314453, "step": 3556 }, { "epoch": 0.890028775178281, "grad_norm": 21.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27839630.545454547, "logits/rejected": -60112452.92307692, "logps/chosen": -361.5335138494318, "logps/rejected": -689.3671875, "loss": 0.036, "rewards/chosen": 9.827920393510299, "rewards/margins": 26.64744178398506, "rewards/rejected": -16.81952139047476, "step": 3557 }, { "epoch": 0.8902789941198549, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41981043.2, "logits/rejected": 29721277.714285713, "logps/chosen": -326.7423828125, "logps/rejected": -613.1862444196429, "loss": 0.0722, "rewards/chosen": 7.103857421875, "rewards/margins": 21.988111877441405, "rewards/rejected": -14.884254455566406, "step": 3558 }, { "epoch": 0.8905292130614287, "grad_norm": 3.296875, "kl": 16.763235092163086, "learning_rate": 5e-06, "logits/chosen": -37970861.71428572, "logits/rejected": -48721548.8, "logps/chosen": -318.97792271205356, "logps/rejected": -733.8240234375, "loss": 0.0205, "rewards/chosen": 9.213544573102679, "rewards/margins": 30.55724269321987, "rewards/rejected": -21.34369812011719, "step": 3559 }, { "epoch": 0.8907794320030026, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39386720.0, "logits/rejected": -24556263.384615384, "logps/chosen": -439.65083451704544, "logps/rejected": -691.5878155048077, "loss": 0.0295, "rewards/chosen": 10.615115772594105, "rewards/margins": 27.964985720761174, "rewards/rejected": -17.34986994816707, "step": 3560 }, { "epoch": 0.8910296509445765, "grad_norm": 5.8125, "kl": 11.556530952453613, "learning_rate": 5e-06, "logits/chosen": -44487637.333333336, "logits/rejected": -41096234.666666664, "logps/chosen": -378.9966145833333, "logps/rejected": -513.0971137152778, "loss": 0.0847, "rewards/chosen": 8.752685546875, "rewards/margins": 21.65960015190972, "rewards/rejected": -12.906914605034721, "step": 3561 }, { "epoch": 0.8912798698861504, "grad_norm": 1.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35319248.0, "logits/rejected": -48383016.0, "logps/chosen": -411.6524658203125, "logps/rejected": -601.8757934570312, "loss": 0.0146, "rewards/chosen": 10.573699951171875, "rewards/margins": 29.511112213134766, "rewards/rejected": -18.93741226196289, "step": 3562 }, { "epoch": 0.8915300888277242, "grad_norm": 0.78125, "kl": 12.529175758361816, "learning_rate": 5e-06, "logits/chosen": -34441112.0, "logits/rejected": -50513808.0, "logps/chosen": -394.88262939453125, "logps/rejected": -753.8955078125, "loss": 0.0368, "rewards/chosen": 10.936822891235352, "rewards/margins": 29.312332153320312, "rewards/rejected": -18.37550926208496, "step": 3563 }, { "epoch": 0.8917803077692982, "grad_norm": 9.75, "kl": 5.645042419433594, "learning_rate": 5e-06, "logits/chosen": -90000665.6, "logits/rejected": -73292814.22222222, "logps/chosen": -316.68834635416664, "logps/rejected": -689.0061306423611, "loss": 0.0605, "rewards/chosen": 7.524369303385416, "rewards/margins": 28.736654663085936, "rewards/rejected": -21.21228535970052, "step": 3564 }, { "epoch": 0.892030526710872, "grad_norm": 7.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57538554.18181818, "logits/rejected": -50735606.15384615, "logps/chosen": -433.54545454545456, "logps/rejected": -576.8485576923077, "loss": 0.0165, "rewards/chosen": 9.260657570578836, "rewards/margins": 24.33347838075011, "rewards/rejected": -15.072820810171274, "step": 3565 }, { "epoch": 0.8922807456524459, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48765479.384615384, "logits/rejected": -26493460.363636363, "logps/chosen": -471.4169170673077, "logps/rejected": -756.3002485795455, "loss": 0.0264, "rewards/chosen": 11.327290461613583, "rewards/margins": 31.671401123900516, "rewards/rejected": -20.344110662286933, "step": 3566 }, { "epoch": 0.8925309645940198, "grad_norm": 10.6875, "kl": 0.8661238551139832, "learning_rate": 5e-06, "logits/chosen": -43011545.6, "logits/rejected": -69104333.71428572, "logps/chosen": -395.7413330078125, "logps/rejected": -597.3536551339286, "loss": 0.0272, "rewards/chosen": 13.063336181640626, "rewards/margins": 34.13588082449777, "rewards/rejected": -21.072544642857142, "step": 3567 }, { "epoch": 0.8927811835355937, "grad_norm": 0.326171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39511264.0, "logits/rejected": -70710256.0, "logps/chosen": -516.6149291992188, "logps/rejected": -776.2039184570312, "loss": 0.0004, "rewards/chosen": 11.192928314208984, "rewards/margins": 42.36050605773926, "rewards/rejected": -31.167577743530273, "step": 3568 }, { "epoch": 0.8930314024771675, "grad_norm": 14.25, "kl": 8.936076164245605, "learning_rate": 5e-06, "logits/chosen": -20995629.714285713, "logits/rejected": -4384881.2, "logps/chosen": -499.23733956473217, "logps/rejected": -761.3767578125, "loss": 0.0212, "rewards/chosen": 11.105767386300224, "rewards/margins": 37.356707327706474, "rewards/rejected": -26.25093994140625, "step": 3569 }, { "epoch": 0.8932816214187413, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29882135.272727273, "logits/rejected": -77923224.61538461, "logps/chosen": -323.5667613636364, "logps/rejected": -657.11328125, "loss": 0.0203, "rewards/chosen": 8.35981542413885, "rewards/margins": 29.99526182588164, "rewards/rejected": -21.63544640174279, "step": 3570 }, { "epoch": 0.8935318403603153, "grad_norm": 1.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42454236.8, "logits/rejected": -31103620.57142857, "logps/chosen": -406.72548828125, "logps/rejected": -761.6640625, "loss": 0.0096, "rewards/chosen": 9.425423431396485, "rewards/margins": 32.22746244158064, "rewards/rejected": -22.802039010184153, "step": 3571 }, { "epoch": 0.8937820593018891, "grad_norm": 4.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -82150376.72727273, "logits/rejected": -64283421.538461536, "logps/chosen": -489.23655007102275, "logps/rejected": -552.8284254807693, "loss": 0.0407, "rewards/chosen": 8.74586209383878, "rewards/margins": 26.95527371493253, "rewards/rejected": -18.20941162109375, "step": 3572 }, { "epoch": 0.894032278243463, "grad_norm": 2.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38546268.0, "logits/rejected": -58857044.0, "logps/chosen": -389.99212646484375, "logps/rejected": -955.5507202148438, "loss": 0.0109, "rewards/chosen": 9.822789192199707, "rewards/margins": 35.163357734680176, "rewards/rejected": -25.34056854248047, "step": 3573 }, { "epoch": 0.8942824971850369, "grad_norm": 7.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42028868.92307692, "logits/rejected": -60457873.45454545, "logps/chosen": -288.2484600360577, "logps/rejected": -868.5490944602273, "loss": 0.0416, "rewards/chosen": 7.456160912146935, "rewards/margins": 34.21430078253046, "rewards/rejected": -26.758139870383523, "step": 3574 }, { "epoch": 0.8945327161266108, "grad_norm": 39.5, "kl": 2.4006075859069824, "learning_rate": 5e-06, "logits/chosen": -37943563.294117644, "logits/rejected": -67948278.85714285, "logps/chosen": -330.2010282628676, "logps/rejected": -942.3878348214286, "loss": 0.0512, "rewards/chosen": 8.659520766314339, "rewards/margins": 39.58336197027639, "rewards/rejected": -30.923841203962052, "step": 3575 }, { "epoch": 0.8947829350681846, "grad_norm": 1.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65529949.09090909, "logits/rejected": -55677528.615384616, "logps/chosen": -417.38041548295456, "logps/rejected": -721.0374098557693, "loss": 0.0038, "rewards/chosen": 9.235323125665838, "rewards/margins": 33.71860184035935, "rewards/rejected": -24.48327871469351, "step": 3576 }, { "epoch": 0.8950331540097586, "grad_norm": 6.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35827665.23076923, "logits/rejected": -20044024.727272727, "logps/chosen": -326.26476111778845, "logps/rejected": -913.6676136363636, "loss": 0.0198, "rewards/chosen": 9.159064659705528, "rewards/margins": 39.16170026872541, "rewards/rejected": -30.002635609019887, "step": 3577 }, { "epoch": 0.8952833729513324, "grad_norm": 0.69140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40076157.09090909, "logits/rejected": -33894230.15384615, "logps/chosen": -497.5905095880682, "logps/rejected": -931.4921123798077, "loss": 0.012, "rewards/chosen": 9.310053045099432, "rewards/margins": 39.167453792545345, "rewards/rejected": -29.857400747445915, "step": 3578 }, { "epoch": 0.8955335918929063, "grad_norm": 2.0, "kl": 6.568772792816162, "learning_rate": 5e-06, "logits/chosen": -50336547.55555555, "logits/rejected": -63288506.666666664, "logps/chosen": -393.36089409722223, "logps/rejected": -699.6407877604166, "loss": 0.0245, "rewards/chosen": 9.618316650390625, "rewards/margins": 34.642303466796875, "rewards/rejected": -25.02398681640625, "step": 3579 }, { "epoch": 0.8957838108344802, "grad_norm": 4.03125, "kl": 4.129493236541748, "learning_rate": 5e-06, "logits/chosen": -45677894.4, "logits/rejected": -70438509.71428572, "logps/chosen": -387.7941162109375, "logps/rejected": -742.3577008928571, "loss": 0.0255, "rewards/chosen": 8.383448791503906, "rewards/margins": 33.648493957519534, "rewards/rejected": -25.265045166015625, "step": 3580 }, { "epoch": 0.8960340297760541, "grad_norm": 19.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24682363.42857143, "logits/rejected": -60028320.0, "logps/chosen": -314.90164620535717, "logps/rejected": -712.51005859375, "loss": 0.0827, "rewards/chosen": 7.976354326520648, "rewards/margins": 27.764229692731583, "rewards/rejected": -19.787875366210937, "step": 3581 }, { "epoch": 0.8962842487176279, "grad_norm": 5.625, "kl": 1.534576416015625, "learning_rate": 5e-06, "logits/chosen": -27526957.333333332, "logits/rejected": -45549317.333333336, "logps/chosen": -326.50661214192706, "logps/rejected": -665.3781331380209, "loss": 0.0389, "rewards/chosen": 9.024898529052734, "rewards/margins": 29.259749094645183, "rewards/rejected": -20.23485056559245, "step": 3582 }, { "epoch": 0.8965344676592018, "grad_norm": 19.25, "kl": 12.660791397094727, "learning_rate": 5e-06, "logits/chosen": -39787291.428571425, "logits/rejected": -68445964.8, "logps/chosen": -455.52308872767856, "logps/rejected": -910.52529296875, "loss": 0.0194, "rewards/chosen": 9.003792354038783, "rewards/margins": 40.383705684116904, "rewards/rejected": -31.379913330078125, "step": 3583 }, { "epoch": 0.8967846866007757, "grad_norm": 3.890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47908964.571428575, "logits/rejected": -68801305.6, "logps/chosen": -394.45689174107144, "logps/rejected": -709.46591796875, "loss": 0.0098, "rewards/chosen": 7.821954454694476, "rewards/margins": 29.60352488926479, "rewards/rejected": -21.781570434570312, "step": 3584 }, { "epoch": 0.8970349055423495, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20332908.307692308, "logits/rejected": -64738897.45454545, "logps/chosen": -291.37503756009613, "logps/rejected": -594.1895419034091, "loss": 0.0366, "rewards/chosen": 7.813441936786358, "rewards/margins": 26.79250132954204, "rewards/rejected": -18.979059392755683, "step": 3585 }, { "epoch": 0.8972851244839234, "grad_norm": 1.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29363378.666666668, "logits/rejected": -66764000.0, "logps/chosen": -345.7313639322917, "logps/rejected": -869.4364420572916, "loss": 0.0187, "rewards/chosen": 9.394649505615234, "rewards/margins": 34.39015324910481, "rewards/rejected": -24.995503743489582, "step": 3586 }, { "epoch": 0.8975353434254973, "grad_norm": 3.203125, "kl": 1.6825002431869507, "learning_rate": 5e-06, "logits/chosen": -21098100.363636363, "logits/rejected": 52558508.307692304, "logps/chosen": -415.849609375, "logps/rejected": -396.3076171875, "loss": 0.0509, "rewards/chosen": 5.612681302157315, "rewards/margins": 20.47586865191693, "rewards/rejected": -14.863187349759615, "step": 3587 }, { "epoch": 0.8977855623670712, "grad_norm": 1.6640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36833483.63636363, "logits/rejected": -37698227.692307696, "logps/chosen": -515.1987748579545, "logps/rejected": -506.27163461538464, "loss": 0.003, "rewards/chosen": 7.8318398215553975, "rewards/margins": 25.51888323163653, "rewards/rejected": -17.68704341008113, "step": 3588 }, { "epoch": 0.898035781308645, "grad_norm": 2.09375, "kl": 2.24967360496521, "learning_rate": 5e-06, "logits/chosen": -63733666.90909091, "logits/rejected": -58205312.0, "logps/chosen": -333.69247159090907, "logps/rejected": -712.125, "loss": 0.0535, "rewards/chosen": 8.564024491743607, "rewards/margins": 35.431791825727984, "rewards/rejected": -26.867767333984375, "step": 3589 }, { "epoch": 0.898286000250219, "grad_norm": 2.171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36968456.72727273, "logits/rejected": -48292470.15384615, "logps/chosen": -331.12051669034093, "logps/rejected": -718.1705228365385, "loss": 0.0432, "rewards/chosen": 10.336235046386719, "rewards/margins": 33.70757293701172, "rewards/rejected": -23.371337890625, "step": 3590 }, { "epoch": 0.8985362191917928, "grad_norm": 9.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48777437.09090909, "logits/rejected": -52432630.15384615, "logps/chosen": -436.1507457386364, "logps/rejected": -638.8423978365385, "loss": 0.0144, "rewards/chosen": 8.619108720259232, "rewards/margins": 28.07390514453808, "rewards/rejected": -19.454796424278847, "step": 3591 }, { "epoch": 0.8987864381333667, "grad_norm": 2.78125, "kl": 4.412869453430176, "learning_rate": 5e-06, "logits/chosen": -37410978.90909091, "logits/rejected": -45998050.461538464, "logps/chosen": -371.9358575994318, "logps/rejected": -709.0854867788462, "loss": 0.0106, "rewards/chosen": 9.744924371892756, "rewards/margins": 27.605226636766552, "rewards/rejected": -17.8603022648738, "step": 3592 }, { "epoch": 0.8990366570749406, "grad_norm": 4.40625, "kl": 1.3186264038085938, "learning_rate": 5e-06, "logits/chosen": -30307997.333333332, "logits/rejected": -55313328.0, "logps/chosen": -453.0516764322917, "logps/rejected": -718.5865071614584, "loss": 0.0064, "rewards/chosen": 11.269307454427084, "rewards/margins": 31.798019409179688, "rewards/rejected": -20.528711954752605, "step": 3593 }, { "epoch": 0.8992868760165145, "grad_norm": 7.34375, "kl": 26.341228485107422, "learning_rate": 5e-06, "logits/chosen": -43001449.4117647, "logits/rejected": -77435117.71428572, "logps/chosen": -420.95812270220586, "logps/rejected": -828.7267020089286, "loss": 0.0592, "rewards/chosen": 10.025647331686582, "rewards/margins": 30.83181929387966, "rewards/rejected": -20.80617196219308, "step": 3594 }, { "epoch": 0.8995370949580883, "grad_norm": 1.0859375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35851840.0, "logits/rejected": -52953745.45454545, "logps/chosen": -353.4519230769231, "logps/rejected": -652.9990234375, "loss": 0.0207, "rewards/chosen": 10.44244854266827, "rewards/margins": 34.528566373811735, "rewards/rejected": -24.086117831143465, "step": 3595 }, { "epoch": 0.8997873138996622, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48907904.0, "logits/rejected": -51206298.666666664, "logps/chosen": -322.0104573567708, "logps/rejected": -597.0435384114584, "loss": 0.0321, "rewards/chosen": 8.561424255371094, "rewards/margins": 23.536322275797524, "rewards/rejected": -14.974898020426432, "step": 3596 }, { "epoch": 0.9000375328412361, "grad_norm": 2.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19482838.4, "logits/rejected": -42198464.0, "logps/chosen": -295.7649658203125, "logps/rejected": -571.9587053571429, "loss": 0.0164, "rewards/chosen": 7.997134399414063, "rewards/margins": 26.359061976841517, "rewards/rejected": -18.361927577427455, "step": 3597 }, { "epoch": 0.90028775178281, "grad_norm": 20.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22118802.285714287, "logits/rejected": -25385037.17647059, "logps/chosen": -319.59158761160717, "logps/rejected": -596.5837545955883, "loss": 0.0479, "rewards/chosen": 9.640521458217076, "rewards/margins": 24.789113341259355, "rewards/rejected": -15.148591883042279, "step": 3598 }, { "epoch": 0.9005379707243838, "grad_norm": 8.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67340470.85714285, "logits/rejected": -51692054.5882353, "logps/chosen": -446.910400390625, "logps/rejected": -569.0407284007352, "loss": 0.0115, "rewards/chosen": 10.921763828822545, "rewards/margins": 28.21607355710839, "rewards/rejected": -17.294309728285846, "step": 3599 }, { "epoch": 0.9007881896659578, "grad_norm": 1.671875, "kl": 9.163370132446289, "learning_rate": 5e-06, "logits/chosen": -48403441.777777776, "logits/rejected": -46045120.0, "logps/chosen": -366.2111545138889, "logps/rejected": -452.1435139973958, "loss": 0.0358, "rewards/chosen": 9.702921549479166, "rewards/margins": 21.955594380696613, "rewards/rejected": -12.252672831217447, "step": 3600 }, { "epoch": 0.9010384086075316, "grad_norm": 10.125, "kl": 8.149924278259277, "learning_rate": 5e-06, "logits/chosen": -58110934.85714286, "logits/rejected": -25523259.2, "logps/chosen": -431.52779715401783, "logps/rejected": -434.67451171875, "loss": 0.0813, "rewards/chosen": 10.335129874093193, "rewards/margins": 25.767835562569758, "rewards/rejected": -15.432705688476563, "step": 3601 }, { "epoch": 0.9012886275491054, "grad_norm": 6.96875, "kl": 4.318479061126709, "learning_rate": 5e-06, "logits/chosen": -43018368.0, "logits/rejected": -69879901.0909091, "logps/chosen": -302.0524338942308, "logps/rejected": -671.2958984375, "loss": 0.0408, "rewards/chosen": 6.9910137469951925, "rewards/margins": 26.914056737939795, "rewards/rejected": -19.9230429909446, "step": 3602 }, { "epoch": 0.9015388464906794, "grad_norm": 9.375, "kl": 5.109602928161621, "learning_rate": 5e-06, "logits/chosen": -33619271.384615384, "logits/rejected": -52030784.0, "logps/chosen": -464.6823167067308, "logps/rejected": -725.1413352272727, "loss": 0.071, "rewards/chosen": 11.125636174128605, "rewards/margins": 28.06567745608883, "rewards/rejected": -16.940041281960227, "step": 3603 }, { "epoch": 0.9017890654322532, "grad_norm": 7.3125, "kl": 6.47227668762207, "learning_rate": 5e-06, "logits/chosen": -30454247.384615384, "logits/rejected": -37461268.36363637, "logps/chosen": -390.27749399038464, "logps/rejected": -544.8722034801136, "loss": 0.0068, "rewards/chosen": 9.12210669884315, "rewards/margins": 30.323131534603093, "rewards/rejected": -21.20102483575994, "step": 3604 }, { "epoch": 0.9020392843738271, "grad_norm": 18.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42158925.71428572, "logits/rejected": -47638304.0, "logps/chosen": -404.50732421875, "logps/rejected": -644.544140625, "loss": 0.0294, "rewards/chosen": 9.399901253836495, "rewards/margins": 24.1706547328404, "rewards/rejected": -14.770753479003906, "step": 3605 }, { "epoch": 0.902289503315401, "grad_norm": 0.2021484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65949144.615384616, "logits/rejected": -76457146.18181819, "logps/chosen": -559.2515399639423, "logps/rejected": -533.3188920454545, "loss": 0.0003, "rewards/chosen": 12.600909893329327, "rewards/margins": 33.308497395548784, "rewards/rejected": -20.70758750221946, "step": 3606 }, { "epoch": 0.9025397222569749, "grad_norm": 9.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15942484.0, "logits/rejected": -30826478.0, "logps/chosen": -398.357177734375, "logps/rejected": -585.0400390625, "loss": 0.0098, "rewards/chosen": 10.620777130126953, "rewards/margins": 30.243654251098633, "rewards/rejected": -19.62287712097168, "step": 3607 }, { "epoch": 0.9027899411985487, "grad_norm": 9.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48321320.72727273, "logits/rejected": -41513088.0, "logps/chosen": -285.69229403409093, "logps/rejected": -766.4764122596154, "loss": 0.0299, "rewards/chosen": 7.378281333229759, "rewards/margins": 33.396251491733366, "rewards/rejected": -26.017970158503605, "step": 3608 }, { "epoch": 0.9030401601401226, "grad_norm": 3.140625, "kl": 6.833434104919434, "learning_rate": 5e-06, "logits/chosen": -39711660.0, "logits/rejected": -41721856.0, "logps/chosen": -372.8978271484375, "logps/rejected": -607.3831176757812, "loss": 0.0199, "rewards/chosen": 10.83343505859375, "rewards/margins": 33.85621643066406, "rewards/rejected": -23.022781372070312, "step": 3609 }, { "epoch": 0.9032903790816965, "grad_norm": 17.5, "kl": 15.860231399536133, "learning_rate": 5e-06, "logits/chosen": -48352465.45454545, "logits/rejected": -46532169.84615385, "logps/chosen": -476.14626242897725, "logps/rejected": -625.5582181490385, "loss": 0.0862, "rewards/chosen": 9.75843672318892, "rewards/margins": 31.126220489715365, "rewards/rejected": -21.367783766526443, "step": 3610 }, { "epoch": 0.9035405980232704, "grad_norm": 10.625, "kl": 13.490285873413086, "learning_rate": 5e-06, "logits/chosen": -47593170.28571428, "logits/rejected": -23428704.0, "logps/chosen": -378.02894810267856, "logps/rejected": -619.52763671875, "loss": 0.0982, "rewards/chosen": 7.996178763253348, "rewards/margins": 25.96215624128069, "rewards/rejected": -17.965977478027344, "step": 3611 }, { "epoch": 0.9037908169648442, "grad_norm": 5.9375, "kl": 17.10900115966797, "learning_rate": 5e-06, "logits/chosen": -46087322.666666664, "logits/rejected": -43144218.666666664, "logps/chosen": -512.732666015625, "logps/rejected": -537.7214762369791, "loss": 0.0398, "rewards/chosen": 12.541951497395834, "rewards/margins": 30.999038696289062, "rewards/rejected": -18.45708719889323, "step": 3612 }, { "epoch": 0.9040410359064182, "grad_norm": 2.796875, "kl": 1.5060895681381226, "learning_rate": 5e-06, "logits/chosen": -41801600.0, "logits/rejected": -42875473.06666667, "logps/chosen": -473.3396267361111, "logps/rejected": -767.7358072916667, "loss": 0.0233, "rewards/chosen": 9.908234490288628, "rewards/margins": 32.38747846815321, "rewards/rejected": -22.479243977864584, "step": 3613 }, { "epoch": 0.904291254847992, "grad_norm": 4.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42385776.0, "logits/rejected": -24973545.14285714, "logps/chosen": -435.598681640625, "logps/rejected": -756.2899693080357, "loss": 0.0376, "rewards/chosen": 11.386499786376953, "rewards/margins": 30.774070848737445, "rewards/rejected": -19.38757106236049, "step": 3614 }, { "epoch": 0.9045414737895658, "grad_norm": 0.400390625, "kl": 2.726717710494995, "learning_rate": 5e-06, "logits/chosen": -47092642.461538464, "logits/rejected": -23503534.545454547, "logps/chosen": -455.4560546875, "logps/rejected": -566.7881303267045, "loss": 0.0006, "rewards/chosen": 10.898423414963942, "rewards/margins": 28.542838143302006, "rewards/rejected": -17.644414728338067, "step": 3615 }, { "epoch": 0.9047916927311398, "grad_norm": 2.546875, "kl": 1.0237910747528076, "learning_rate": 5e-06, "logits/chosen": -69458221.71428572, "logits/rejected": -25794326.4, "logps/chosen": -339.08778599330356, "logps/rejected": -546.679150390625, "loss": 0.0385, "rewards/chosen": 7.639298575265067, "rewards/margins": 28.26591600690569, "rewards/rejected": -20.626617431640625, "step": 3616 }, { "epoch": 0.9050419116727136, "grad_norm": 5.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36360762.18181818, "logits/rejected": -19271931.076923076, "logps/chosen": -287.3035777698864, "logps/rejected": -632.6897536057693, "loss": 0.0172, "rewards/chosen": 7.67533389004794, "rewards/margins": 23.723748853990248, "rewards/rejected": -16.048414963942307, "step": 3617 }, { "epoch": 0.9052921306142875, "grad_norm": 4.1875, "kl": 8.492765426635742, "learning_rate": 5e-06, "logits/chosen": -33959506.28571428, "logits/rejected": -41394275.2, "logps/chosen": -408.01803152901783, "logps/rejected": -793.92158203125, "loss": 0.0117, "rewards/chosen": 9.264268057686943, "rewards/margins": 34.13870348249163, "rewards/rejected": -24.874435424804688, "step": 3618 }, { "epoch": 0.9055423495558613, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30358646.4, "logits/rejected": -36534326.85714286, "logps/chosen": -358.475146484375, "logps/rejected": -636.1419503348214, "loss": 0.038, "rewards/chosen": 6.734801483154297, "rewards/margins": 26.761783381870814, "rewards/rejected": -20.026981898716517, "step": 3619 }, { "epoch": 0.9057925684974353, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27859737.6, "logits/rejected": -28766560.0, "logps/chosen": -361.9039794921875, "logps/rejected": -576.7251674107143, "loss": 0.0262, "rewards/chosen": 8.998316192626953, "rewards/margins": 26.857804543631417, "rewards/rejected": -17.859488351004465, "step": 3620 }, { "epoch": 0.9060427874390091, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60639701.333333336, "logits/rejected": -6386794.666666667, "logps/chosen": -278.45196533203125, "logps/rejected": -621.6043294270834, "loss": 0.0205, "rewards/chosen": 7.352033615112305, "rewards/margins": 28.877785364786785, "rewards/rejected": -21.52575174967448, "step": 3621 }, { "epoch": 0.906293006380583, "grad_norm": 10.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49173093.333333336, "logits/rejected": 612194.6666666666, "logps/chosen": -446.3715006510417, "logps/rejected": -596.6970621744791, "loss": 0.0289, "rewards/chosen": 11.425132751464844, "rewards/margins": 30.438565572102863, "rewards/rejected": -19.01343282063802, "step": 3622 }, { "epoch": 0.9065432253221569, "grad_norm": 0.734375, "kl": 5.22381591796875, "learning_rate": 5e-06, "logits/chosen": -55532432.0, "logits/rejected": -41445882.666666664, "logps/chosen": -523.9429117838541, "logps/rejected": -603.30078125, "loss": 0.0013, "rewards/chosen": 10.956832885742188, "rewards/margins": 30.03661855061849, "rewards/rejected": -19.0797856648763, "step": 3623 }, { "epoch": 0.9067934442637308, "grad_norm": 12.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20152804.923076924, "logits/rejected": 75928913.45454545, "logps/chosen": -331.36485877403845, "logps/rejected": -551.1948686079545, "loss": 0.0582, "rewards/chosen": 6.948644197904146, "rewards/margins": 23.60828164907602, "rewards/rejected": -16.659637451171875, "step": 3624 }, { "epoch": 0.9070436632053046, "grad_norm": 6.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22142537.333333332, "logits/rejected": -51502736.0, "logps/chosen": -224.48514811197916, "logps/rejected": -558.330078125, "loss": 0.1004, "rewards/chosen": 5.383036295572917, "rewards/margins": 20.931939442952473, "rewards/rejected": -15.548903147379557, "step": 3625 }, { "epoch": 0.9072938821468786, "grad_norm": 1.5390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15878730.666666666, "logits/rejected": -16955534.222222224, "logps/chosen": -256.0196533203125, "logps/rejected": -492.54448784722223, "loss": 0.0031, "rewards/chosen": 6.986980438232422, "rewards/margins": 21.04261144002279, "rewards/rejected": -14.055631001790365, "step": 3626 }, { "epoch": 0.9075441010884524, "grad_norm": 21.75, "kl": 4.2932658195495605, "learning_rate": 5e-06, "logits/chosen": -31792569.6, "logits/rejected": -40306788.571428575, "logps/chosen": -498.42197265625, "logps/rejected": -399.437255859375, "loss": 0.0294, "rewards/chosen": 12.09811019897461, "rewards/margins": 24.81177466256278, "rewards/rejected": -12.71366446358817, "step": 3627 }, { "epoch": 0.9077943200300262, "grad_norm": 6.28125, "kl": 3.3068695068359375, "learning_rate": 5e-06, "logits/chosen": -56337810.28571428, "logits/rejected": -48774720.0, "logps/chosen": -329.12081473214283, "logps/rejected": -639.88837890625, "loss": 0.0778, "rewards/chosen": 9.627698625837054, "rewards/margins": 25.88537837437221, "rewards/rejected": -16.257679748535157, "step": 3628 }, { "epoch": 0.9080445389716002, "grad_norm": 4.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 2210708.3636363638, "logits/rejected": -56477572.92307692, "logps/chosen": -281.25390625, "logps/rejected": -781.3757512019231, "loss": 0.0444, "rewards/chosen": 9.01875097101385, "rewards/margins": 32.74205043766048, "rewards/rejected": -23.723299466646633, "step": 3629 }, { "epoch": 0.908294757913174, "grad_norm": 0.33203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30613158.4, "logits/rejected": -40937609.14285714, "logps/chosen": -337.1273681640625, "logps/rejected": -829.2527901785714, "loss": 0.0015, "rewards/chosen": 9.510231018066406, "rewards/margins": 30.49420928955078, "rewards/rejected": -20.983978271484375, "step": 3630 }, { "epoch": 0.9085449768547479, "grad_norm": 4.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17137554.0, "logits/rejected": -48536776.0, "logps/chosen": -310.9695129394531, "logps/rejected": -741.855712890625, "loss": 0.0262, "rewards/chosen": 6.718559265136719, "rewards/margins": 34.259769439697266, "rewards/rejected": -27.541210174560547, "step": 3631 }, { "epoch": 0.9087951957963217, "grad_norm": 4.46875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44472952.0, "logits/rejected": -1821066.0, "logps/chosen": -459.8052673339844, "logps/rejected": -720.4002685546875, "loss": 0.0114, "rewards/chosen": 10.881375312805176, "rewards/margins": 29.489386558532715, "rewards/rejected": -18.60801124572754, "step": 3632 }, { "epoch": 0.9090454147378957, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12176939.733333332, "logits/rejected": -29999893.333333332, "logps/chosen": -397.63564453125, "logps/rejected": -648.1135525173611, "loss": 0.0778, "rewards/chosen": 10.032937622070312, "rewards/margins": 29.20462103949653, "rewards/rejected": -19.171683417426216, "step": 3633 }, { "epoch": 0.9092956336794695, "grad_norm": 3.046875, "kl": 3.945934295654297, "learning_rate": 5e-06, "logits/chosen": -32710739.692307692, "logits/rejected": -44079232.0, "logps/chosen": -370.9562800480769, "logps/rejected": -650.1444424715909, "loss": 0.0096, "rewards/chosen": 10.378493088942308, "rewards/margins": 29.036490913871283, "rewards/rejected": -18.657997824928977, "step": 3634 }, { "epoch": 0.9095458526210434, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32668514.133333333, "logits/rejected": 91630087.1111111, "logps/chosen": -373.2248046875, "logps/rejected": -770.1028103298611, "loss": 0.0371, "rewards/chosen": 9.946636962890626, "rewards/margins": 27.15061984592014, "rewards/rejected": -17.203982883029514, "step": 3635 }, { "epoch": 0.9097960715626173, "grad_norm": 3.390625, "kl": 4.988373279571533, "learning_rate": 5e-06, "logits/chosen": -59898325.333333336, "logits/rejected": -29777077.333333332, "logps/chosen": -449.71912977430554, "logps/rejected": -564.2541666666667, "loss": 0.0401, "rewards/chosen": 9.938756306966146, "rewards/margins": 25.367634073893228, "rewards/rejected": -15.428877766927084, "step": 3636 }, { "epoch": 0.9100462905041912, "grad_norm": 3.09375, "kl": 32.518924713134766, "learning_rate": 5e-06, "logits/chosen": 13307831.466666667, "logits/rejected": -47695146.666666664, "logps/chosen": -521.7168619791667, "logps/rejected": -459.9450412326389, "loss": 0.0912, "rewards/chosen": 10.861112467447917, "rewards/margins": 23.13874020046658, "rewards/rejected": -12.277627733018663, "step": 3637 }, { "epoch": 0.910296509445765, "grad_norm": 0.31640625, "kl": 10.693359375, "learning_rate": 5e-06, "logits/chosen": 28154107.076923076, "logits/rejected": -61624128.0, "logps/chosen": -443.9314152644231, "logps/rejected": -802.2942116477273, "loss": 0.0005, "rewards/chosen": 11.090174748347355, "rewards/margins": 35.4824310516144, "rewards/rejected": -24.392256303267047, "step": 3638 }, { "epoch": 0.910546728387339, "grad_norm": 15.0625, "kl": 20.485065460205078, "learning_rate": 5e-06, "logits/chosen": -11660695.466666667, "logits/rejected": -2228178.6666666665, "logps/chosen": -396.9975911458333, "logps/rejected": -476.83251953125, "loss": 0.113, "rewards/chosen": 10.477537027994792, "rewards/margins": 22.139443800184463, "rewards/rejected": -11.66190677218967, "step": 3639 }, { "epoch": 0.9107969473289128, "grad_norm": 20.625, "kl": 18.392608642578125, "learning_rate": 5e-06, "logits/chosen": -41422512.0, "logits/rejected": -67094613.333333336, "logps/chosen": -462.5016276041667, "logps/rejected": -658.1920572916666, "loss": 0.0966, "rewards/chosen": 11.088214874267578, "rewards/margins": 26.0107790629069, "rewards/rejected": -14.922564188639322, "step": 3640 }, { "epoch": 0.9110471662704867, "grad_norm": 9.125, "kl": 11.363704681396484, "learning_rate": 5e-06, "logits/chosen": -27223797.333333332, "logits/rejected": -51653594.666666664, "logps/chosen": -359.3583984375, "logps/rejected": -554.2525227864584, "loss": 0.0687, "rewards/chosen": 8.567693710327148, "rewards/margins": 24.871999740600586, "rewards/rejected": -16.304306030273438, "step": 3641 }, { "epoch": 0.9112973852120606, "grad_norm": 1.6796875, "kl": 9.7670259475708, "learning_rate": 5e-06, "logits/chosen": -38314385.06666667, "logits/rejected": -72142933.33333333, "logps/chosen": -385.62766927083334, "logps/rejected": -861.6956380208334, "loss": 0.0274, "rewards/chosen": 9.697329711914062, "rewards/margins": 41.79235466851128, "rewards/rejected": -32.09502495659722, "step": 3642 }, { "epoch": 0.9115476041536344, "grad_norm": 7.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23347741.714285713, "logits/rejected": -44701702.4, "logps/chosen": -305.405029296875, "logps/rejected": -761.87587890625, "loss": 0.0246, "rewards/chosen": 8.298958369663783, "rewards/margins": 34.73727624075754, "rewards/rejected": -26.43831787109375, "step": 3643 }, { "epoch": 0.9117978230952083, "grad_norm": 0.057373046875, "kl": 7.651492118835449, "learning_rate": 5e-06, "logits/chosen": -41963942.4, "logits/rejected": -73286011.42857143, "logps/chosen": -520.45634765625, "logps/rejected": -744.5422712053571, "loss": 0.0002, "rewards/chosen": 13.898173522949218, "rewards/margins": 36.88585096086774, "rewards/rejected": -22.987677437918528, "step": 3644 }, { "epoch": 0.9120480420367821, "grad_norm": 1.9921875, "kl": 0.6961174011230469, "learning_rate": 5e-06, "logits/chosen": -55555860.0, "logits/rejected": -77042208.0, "logps/chosen": -403.628662109375, "logps/rejected": -990.4042358398438, "loss": 0.0065, "rewards/chosen": 9.038322448730469, "rewards/margins": 34.12835693359375, "rewards/rejected": -25.09003448486328, "step": 3645 }, { "epoch": 0.9122982609783561, "grad_norm": 0.89453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49404169.84615385, "logits/rejected": -39282385.45454545, "logps/chosen": -375.45346304086536, "logps/rejected": -630.9327947443181, "loss": 0.0245, "rewards/chosen": 9.067580003004808, "rewards/margins": 33.58444235208151, "rewards/rejected": -24.516862349076703, "step": 3646 }, { "epoch": 0.9125484799199299, "grad_norm": 3.53125, "kl": 4.686428070068359, "learning_rate": 5e-06, "logits/chosen": -37559077.64705882, "logits/rejected": -34224626.28571428, "logps/chosen": -362.3977481617647, "logps/rejected": -644.6658761160714, "loss": 0.0142, "rewards/chosen": 9.443285773782168, "rewards/margins": 29.15513880112592, "rewards/rejected": -19.71185302734375, "step": 3647 }, { "epoch": 0.9127986988615038, "grad_norm": 13.0, "kl": 12.220671653747559, "learning_rate": 5e-06, "logits/chosen": -32977894.85714286, "logits/rejected": -55531724.8, "logps/chosen": -408.61819893973217, "logps/rejected": -652.091796875, "loss": 0.0407, "rewards/chosen": 11.029583522251674, "rewards/margins": 30.54897286551339, "rewards/rejected": -19.519389343261718, "step": 3648 }, { "epoch": 0.9130489178030777, "grad_norm": 5.71875, "kl": 5.093777656555176, "learning_rate": 5e-06, "logits/chosen": -67049166.76923077, "logits/rejected": -28293629.09090909, "logps/chosen": -356.7760667067308, "logps/rejected": -433.81196732954544, "loss": 0.0505, "rewards/chosen": 10.035098736102764, "rewards/margins": 26.78887875430234, "rewards/rejected": -16.753780018199574, "step": 3649 }, { "epoch": 0.9132991367446516, "grad_norm": 15.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -6953966.4, "logits/rejected": -33446166.85714286, "logps/chosen": -397.96494140625, "logps/rejected": -638.7155412946429, "loss": 0.0553, "rewards/chosen": 9.630237579345703, "rewards/margins": 26.14644459315709, "rewards/rejected": -16.516207013811385, "step": 3650 }, { "epoch": 0.9135493556862254, "grad_norm": 0.16015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2294672.0, "logits/rejected": -40321971.692307696, "logps/chosen": -470.29305752840907, "logps/rejected": -534.0484525240385, "loss": 0.0004, "rewards/chosen": 11.673589533025568, "rewards/margins": 27.810924503353093, "rewards/rejected": -16.137334970327522, "step": 3651 }, { "epoch": 0.9137995746277994, "grad_norm": 1.484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37783133.538461536, "logits/rejected": -62245154.90909091, "logps/chosen": -385.44771634615387, "logps/rejected": -664.2159978693181, "loss": 0.0154, "rewards/chosen": 9.045796907865084, "rewards/margins": 28.51209024282602, "rewards/rejected": -19.466293334960938, "step": 3652 }, { "epoch": 0.9140497935693732, "grad_norm": 9.1875, "kl": 7.514174461364746, "learning_rate": 5e-06, "logits/chosen": -34458900.0, "logits/rejected": -33113544.0, "logps/chosen": -403.6245422363281, "logps/rejected": -457.1943359375, "loss": 0.034, "rewards/chosen": 8.788313865661621, "rewards/margins": 21.996371269226074, "rewards/rejected": -13.208057403564453, "step": 3653 }, { "epoch": 0.9143000125109471, "grad_norm": 4.1875, "kl": 13.300031661987305, "learning_rate": 5e-06, "logits/chosen": -34659676.0, "logits/rejected": -69080264.0, "logps/chosen": -438.6255187988281, "logps/rejected": -728.2763671875, "loss": 0.0963, "rewards/chosen": 8.88072395324707, "rewards/margins": 28.099050521850586, "rewards/rejected": -19.218326568603516, "step": 3654 }, { "epoch": 0.9145502314525209, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 459860.36363636365, "logits/rejected": -41979648.0, "logps/chosen": -318.0738636363636, "logps/rejected": -575.5450345552885, "loss": 0.0649, "rewards/chosen": 5.489539059725675, "rewards/margins": 21.010960052063414, "rewards/rejected": -15.52142099233774, "step": 3655 }, { "epoch": 0.9148004503940949, "grad_norm": 18.5, "kl": 12.720632553100586, "learning_rate": 5e-06, "logits/chosen": -64066588.0, "logits/rejected": -31259294.0, "logps/chosen": -398.6112976074219, "logps/rejected": -696.6396484375, "loss": 0.0334, "rewards/chosen": 10.866814613342285, "rewards/margins": 29.42197895050049, "rewards/rejected": -18.555164337158203, "step": 3656 }, { "epoch": 0.9150506693356687, "grad_norm": 11.4375, "kl": 0.30351513624191284, "learning_rate": 5e-06, "logits/chosen": -27128407.272727273, "logits/rejected": -51034756.92307692, "logps/chosen": -329.41579367897725, "logps/rejected": -642.1020132211538, "loss": 0.0728, "rewards/chosen": 9.168805902654475, "rewards/margins": 24.575102852774666, "rewards/rejected": -15.406296950120192, "step": 3657 }, { "epoch": 0.9153008882772425, "grad_norm": 2.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17003632.0, "logits/rejected": 42312621.71428572, "logps/chosen": -330.2636474609375, "logps/rejected": -583.6966727120536, "loss": 0.0417, "rewards/chosen": 8.304690551757812, "rewards/margins": 26.36024453299386, "rewards/rejected": -18.05555398123605, "step": 3658 }, { "epoch": 0.9155511072188165, "grad_norm": 8.1875, "kl": 21.391815185546875, "learning_rate": 5e-06, "logits/chosen": -56729629.538461536, "logits/rejected": -32502045.09090909, "logps/chosen": -407.1477238581731, "logps/rejected": -594.32861328125, "loss": 0.0268, "rewards/chosen": 10.405545748197115, "rewards/margins": 28.40753771375109, "rewards/rejected": -18.001991965553977, "step": 3659 }, { "epoch": 0.9158013261603903, "grad_norm": 2.984375, "kl": 14.207000732421875, "learning_rate": 5e-06, "logits/chosen": -33471081.846153848, "logits/rejected": -42789594.18181818, "logps/chosen": -380.37015474759613, "logps/rejected": -564.4909889914773, "loss": 0.0164, "rewards/chosen": 10.338812021108774, "rewards/margins": 27.60155727146389, "rewards/rejected": -17.262745250355113, "step": 3660 }, { "epoch": 0.9160515451019642, "grad_norm": 2.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55015842.461538464, "logits/rejected": -43462557.09090909, "logps/chosen": -493.2356520432692, "logps/rejected": -748.7498224431819, "loss": 0.0021, "rewards/chosen": 10.249497633713942, "rewards/margins": 33.723676988294905, "rewards/rejected": -23.474179354580965, "step": 3661 }, { "epoch": 0.9163017640435381, "grad_norm": 18.25, "kl": 1.6144975423812866, "learning_rate": 5e-06, "logits/chosen": -56993228.0, "logits/rejected": -75634536.0, "logps/chosen": -485.0689697265625, "logps/rejected": -588.8642578125, "loss": 0.0369, "rewards/chosen": 10.926384925842285, "rewards/margins": 27.836487770080566, "rewards/rejected": -16.91010284423828, "step": 3662 }, { "epoch": 0.916551982985112, "grad_norm": 1.4921875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50506960.0, "logits/rejected": -17406766.666666668, "logps/chosen": -356.5105387369792, "logps/rejected": -578.3367513020834, "loss": 0.0153, "rewards/chosen": 9.124540328979492, "rewards/margins": 31.49957338968913, "rewards/rejected": -22.375033060709637, "step": 3663 }, { "epoch": 0.9168022019266858, "grad_norm": 12.0625, "kl": 21.990711212158203, "learning_rate": 5e-06, "logits/chosen": -41240800.0, "logits/rejected": 37801856.0, "logps/chosen": -377.4229736328125, "logps/rejected": -658.3776041666666, "loss": 0.0683, "rewards/chosen": 9.736204783121744, "rewards/margins": 30.072887420654297, "rewards/rejected": -20.33668263753255, "step": 3664 }, { "epoch": 0.9170524208682598, "grad_norm": 3.703125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61251483.428571425, "logits/rejected": -72784569.6, "logps/chosen": -363.5951450892857, "logps/rejected": -754.01552734375, "loss": 0.0098, "rewards/chosen": 9.216442653111049, "rewards/margins": 28.60182168143136, "rewards/rejected": -19.385379028320312, "step": 3665 }, { "epoch": 0.9173026398098336, "grad_norm": 2.8125, "kl": 3.1362476348876953, "learning_rate": 5e-06, "logits/chosen": 10919082.666666666, "logits/rejected": -53927205.333333336, "logps/chosen": -386.0754801432292, "logps/rejected": -849.43017578125, "loss": 0.0256, "rewards/chosen": 9.0714480082194, "rewards/margins": 33.74643325805664, "rewards/rejected": -24.67498524983724, "step": 3666 }, { "epoch": 0.9175528587514075, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37247976.0, "logits/rejected": -30369148.0, "logps/chosen": -330.9925231933594, "logps/rejected": -710.7865600585938, "loss": 0.0113, "rewards/chosen": 9.274223327636719, "rewards/margins": 33.214176177978516, "rewards/rejected": -23.939952850341797, "step": 3667 }, { "epoch": 0.9178030776929813, "grad_norm": 13.5, "kl": 16.243453979492188, "learning_rate": 5e-06, "logits/chosen": -52579952.0, "logits/rejected": -50920408.0, "logps/chosen": -476.9180908203125, "logps/rejected": -500.33154296875, "loss": 0.039, "rewards/chosen": 10.705076217651367, "rewards/margins": 27.96270179748535, "rewards/rejected": -17.257625579833984, "step": 3668 }, { "epoch": 0.9180532966345553, "grad_norm": 1.1484375, "kl": 2.0773468017578125, "learning_rate": 5e-06, "logits/chosen": -46412266.666666664, "logits/rejected": -48617420.8, "logps/chosen": -418.4727376302083, "logps/rejected": -855.7354166666667, "loss": 0.0015, "rewards/chosen": 11.34043460422092, "rewards/margins": 41.99564700656467, "rewards/rejected": -30.65521240234375, "step": 3669 }, { "epoch": 0.9183035155761291, "grad_norm": 7.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18294505.333333332, "logits/rejected": -40784709.333333336, "logps/chosen": -211.52020263671875, "logps/rejected": -669.8411458333334, "loss": 0.0697, "rewards/chosen": 5.355113983154297, "rewards/margins": 31.322254180908203, "rewards/rejected": -25.967140197753906, "step": 3670 }, { "epoch": 0.918553734517703, "grad_norm": 13.0, "kl": 13.475977897644043, "learning_rate": 5e-06, "logits/chosen": -49750765.71428572, "logits/rejected": -34440809.6, "logps/chosen": -364.71895926339283, "logps/rejected": -726.482177734375, "loss": 0.0522, "rewards/chosen": 7.901856558663504, "rewards/margins": 27.06439470563616, "rewards/rejected": -19.162538146972658, "step": 3671 }, { "epoch": 0.9188039534592769, "grad_norm": 1.2578125, "kl": 0.7221651077270508, "learning_rate": 5e-06, "logits/chosen": -28355012.57142857, "logits/rejected": -76347520.0, "logps/chosen": -431.0949009486607, "logps/rejected": -698.064599609375, "loss": 0.0225, "rewards/chosen": 9.873589651925224, "rewards/margins": 31.129368155343194, "rewards/rejected": -21.255778503417968, "step": 3672 }, { "epoch": 0.9190541724008507, "grad_norm": 0.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51378492.8, "logits/rejected": -56532251.428571425, "logps/chosen": -415.995556640625, "logps/rejected": -850.5020926339286, "loss": 0.0008, "rewards/chosen": 10.853074645996093, "rewards/margins": 39.044664001464845, "rewards/rejected": -28.19158935546875, "step": 3673 }, { "epoch": 0.9193043913424246, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73830877.0909091, "logits/rejected": -54255222.15384615, "logps/chosen": -464.34623579545456, "logps/rejected": -622.6720252403846, "loss": 0.0473, "rewards/chosen": 10.302597739479758, "rewards/margins": 34.7514361935062, "rewards/rejected": -24.448838454026443, "step": 3674 }, { "epoch": 0.9195546102839985, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37260390.4, "logits/rejected": -96066510.22222222, "logps/chosen": -298.26689453125, "logps/rejected": -1054.7430555555557, "loss": 0.035, "rewards/chosen": 8.145692952473958, "rewards/margins": 45.22034233940972, "rewards/rejected": -37.07464938693576, "step": 3675 }, { "epoch": 0.9198048292255724, "grad_norm": 1.6796875, "kl": 0.7751471400260925, "learning_rate": 5e-06, "logits/chosen": -60191772.44444445, "logits/rejected": -68596509.86666666, "logps/chosen": -461.78716362847223, "logps/rejected": -842.7582682291667, "loss": 0.015, "rewards/chosen": 10.981557210286459, "rewards/margins": 37.68365275065104, "rewards/rejected": -26.702095540364585, "step": 3676 }, { "epoch": 0.9200550481671462, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15956401.23076923, "logits/rejected": -68831394.9090909, "logps/chosen": -286.7046461838942, "logps/rejected": -594.1253995028409, "loss": 0.0493, "rewards/chosen": 5.6898029033954325, "rewards/margins": 25.393933596310916, "rewards/rejected": -19.704130692915484, "step": 3677 }, { "epoch": 0.9203052671087202, "grad_norm": 5.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -68665557.33333333, "logits/rejected": -24939266.666666668, "logps/chosen": -363.361328125, "logps/rejected": -567.293212890625, "loss": 0.0232, "rewards/chosen": 11.097783406575521, "rewards/margins": 26.18262608846029, "rewards/rejected": -15.084842681884766, "step": 3678 }, { "epoch": 0.920555486050294, "grad_norm": 4.125, "kl": 4.9551544189453125, "learning_rate": 5e-06, "logits/chosen": -15177004.307692308, "logits/rejected": -51948096.0, "logps/chosen": -459.3257587139423, "logps/rejected": -756.7235440340909, "loss": 0.0556, "rewards/chosen": 9.360626220703125, "rewards/margins": 29.90814208984375, "rewards/rejected": -20.547515869140625, "step": 3679 }, { "epoch": 0.9208057049918679, "grad_norm": 5.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43145842.666666664, "logits/rejected": -25372608.0, "logps/chosen": -328.8467610677083, "logps/rejected": -435.873291015625, "loss": 0.0494, "rewards/chosen": 8.137037913004557, "rewards/margins": 22.385472615559895, "rewards/rejected": -14.248434702555338, "step": 3680 }, { "epoch": 0.9210559239334417, "grad_norm": 8.5, "kl": 2.4481773376464844, "learning_rate": 5e-06, "logits/chosen": -38081241.14285714, "logits/rejected": -45931785.6, "logps/chosen": -295.50069754464283, "logps/rejected": -714.18046875, "loss": 0.0583, "rewards/chosen": 7.420160566057477, "rewards/margins": 29.984195600237165, "rewards/rejected": -22.56403503417969, "step": 3681 }, { "epoch": 0.9213061428750157, "grad_norm": 1.453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40405696.0, "logits/rejected": -41863079.384615384, "logps/chosen": -265.56733842329544, "logps/rejected": -695.0422175480769, "loss": 0.024, "rewards/chosen": 7.97051308371804, "rewards/margins": 34.750042361812994, "rewards/rejected": -26.77952927809495, "step": 3682 }, { "epoch": 0.9215563618165895, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65520640.0, "logits/rejected": -46573804.307692304, "logps/chosen": -350.3289905894886, "logps/rejected": -611.9503455528846, "loss": 0.0386, "rewards/chosen": 7.926854220303622, "rewards/margins": 29.376481436349295, "rewards/rejected": -21.449627216045673, "step": 3683 }, { "epoch": 0.9218065807581634, "grad_norm": 5.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26620567.272727273, "logits/rejected": -51018712.615384616, "logps/chosen": -228.464599609375, "logps/rejected": -729.2767427884615, "loss": 0.0377, "rewards/chosen": 6.084580854936079, "rewards/margins": 31.388066431859155, "rewards/rejected": -25.303485576923077, "step": 3684 }, { "epoch": 0.9220567996997373, "grad_norm": 10.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23915882.666666668, "logits/rejected": -67065632.0, "logps/chosen": -255.76859537760416, "logps/rejected": -553.4192301432291, "loss": 0.0687, "rewards/chosen": 6.421606699625651, "rewards/margins": 26.695067087809246, "rewards/rejected": -20.273460388183594, "step": 3685 }, { "epoch": 0.9223070186413111, "grad_norm": 7.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -91014048.0, "logits/rejected": -25845864.0, "logps/chosen": -346.2259114583333, "logps/rejected": -615.8801676432291, "loss": 0.0414, "rewards/chosen": 9.272621154785156, "rewards/margins": 27.147443135579426, "rewards/rejected": -17.87482198079427, "step": 3686 }, { "epoch": 0.922557237582885, "grad_norm": 2.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51465553.45454545, "logits/rejected": -94073787.07692307, "logps/chosen": -375.5769708806818, "logps/rejected": -956.8509615384615, "loss": 0.0119, "rewards/chosen": 7.712018099698153, "rewards/margins": 35.508726426771474, "rewards/rejected": -27.79670832707332, "step": 3687 }, { "epoch": 0.922807456524459, "grad_norm": 2.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30606674.285714287, "logits/rejected": -24823450.352941178, "logps/chosen": -272.11033412388394, "logps/rejected": -824.8774126838235, "loss": 0.024, "rewards/chosen": 8.445656912667411, "rewards/margins": 35.98136606937697, "rewards/rejected": -27.535709156709558, "step": 3688 }, { "epoch": 0.9230576754660328, "grad_norm": 5.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41163616.0, "logits/rejected": -69796608.0, "logps/chosen": -358.54775390625, "logps/rejected": -701.0075334821429, "loss": 0.0364, "rewards/chosen": 8.676412963867188, "rewards/margins": 28.909262084960936, "rewards/rejected": -20.23284912109375, "step": 3689 }, { "epoch": 0.9233078944076066, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35831644.44444445, "logits/rejected": -62976179.2, "logps/chosen": -262.1901041666667, "logps/rejected": -950.980078125, "loss": 0.0439, "rewards/chosen": 8.096754286024305, "rewards/margins": 41.010157606336804, "rewards/rejected": -32.9134033203125, "step": 3690 }, { "epoch": 0.9235581133491806, "grad_norm": 5.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34080085.333333336, "logits/rejected": -38613273.6, "logps/chosen": -429.8879665798611, "logps/rejected": -636.77265625, "loss": 0.0352, "rewards/chosen": 10.336007859971788, "rewards/margins": 31.821125454372833, "rewards/rejected": -21.485117594401043, "step": 3691 }, { "epoch": 0.9238083322907544, "grad_norm": 4.65625, "kl": 5.238432884216309, "learning_rate": 5e-06, "logits/chosen": -48587785.84615385, "logits/rejected": -18208693.818181816, "logps/chosen": -373.20169771634613, "logps/rejected": -531.3069069602273, "loss": 0.0401, "rewards/chosen": 9.612536503718449, "rewards/margins": 24.545536548107656, "rewards/rejected": -14.933000044389205, "step": 3692 }, { "epoch": 0.9240585512323283, "grad_norm": 7.5, "kl": 5.6325225830078125, "learning_rate": 5e-06, "logits/chosen": -47413248.0, "logits/rejected": -62455408.0, "logps/chosen": -387.6510416666667, "logps/rejected": -579.4986165364584, "loss": 0.0837, "rewards/chosen": 9.310820897420248, "rewards/margins": 25.36291058858236, "rewards/rejected": -16.05208969116211, "step": 3693 }, { "epoch": 0.9243087701739021, "grad_norm": 4.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50646572.0, "logits/rejected": -9453488.0, "logps/chosen": -275.2099609375, "logps/rejected": -644.7911376953125, "loss": 0.0252, "rewards/chosen": 8.35729694366455, "rewards/margins": 29.634264945983887, "rewards/rejected": -21.276968002319336, "step": 3694 }, { "epoch": 0.9245589891154761, "grad_norm": 6.875, "kl": 0.6097742915153503, "learning_rate": 5e-06, "logits/chosen": -36478080.0, "logits/rejected": -73992153.6, "logps/chosen": -395.43844168526783, "logps/rejected": -754.53779296875, "loss": 0.0219, "rewards/chosen": 8.53553227015904, "rewards/margins": 30.57603977748326, "rewards/rejected": -22.040507507324218, "step": 3695 }, { "epoch": 0.9248092080570499, "grad_norm": 1.28125, "kl": 1.1071523427963257, "learning_rate": 5e-06, "logits/chosen": -29321816.0, "logits/rejected": -73368826.66666667, "logps/chosen": -344.7670084635417, "logps/rejected": -877.466552734375, "loss": 0.0209, "rewards/chosen": 8.654857635498047, "rewards/margins": 34.738338470458984, "rewards/rejected": -26.083480834960938, "step": 3696 }, { "epoch": 0.9250594269986238, "grad_norm": 4.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51040896.0, "logits/rejected": -40428378.666666664, "logps/chosen": -256.4623209635417, "logps/rejected": -702.9195149739584, "loss": 0.0369, "rewards/chosen": 7.106784820556641, "rewards/margins": 29.26878484090169, "rewards/rejected": -22.16200002034505, "step": 3697 }, { "epoch": 0.9253096459401977, "grad_norm": 8.0, "kl": 2.357769012451172, "learning_rate": 5e-06, "logits/chosen": -45492257.88235294, "logits/rejected": -75074098.28571428, "logps/chosen": -371.2243221507353, "logps/rejected": -962.2689732142857, "loss": 0.0354, "rewards/chosen": 10.685268626493567, "rewards/margins": 50.217939873703386, "rewards/rejected": -39.53267124720982, "step": 3698 }, { "epoch": 0.9255598648817716, "grad_norm": 7.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61758887.384615384, "logits/rejected": -37671226.18181818, "logps/chosen": -402.8660231370192, "logps/rejected": -543.7151544744319, "loss": 0.0133, "rewards/chosen": 9.769400963416466, "rewards/margins": 28.42345092346618, "rewards/rejected": -18.654049960049715, "step": 3699 }, { "epoch": 0.9258100838233454, "grad_norm": 0.33984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54026995.2, "logits/rejected": -46542221.71428572, "logps/chosen": -490.822265625, "logps/rejected": -820.87060546875, "loss": 0.0007, "rewards/chosen": 12.331735229492187, "rewards/margins": 41.802253069196425, "rewards/rejected": -29.47051783970424, "step": 3700 }, { "epoch": 0.9260603027649194, "grad_norm": 10.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -96762604.8, "logits/rejected": -36792221.71428572, "logps/chosen": -336.268505859375, "logps/rejected": -574.1924176897321, "loss": 0.0475, "rewards/chosen": 6.6514404296875, "rewards/margins": 25.226182338169643, "rewards/rejected": -18.574741908482142, "step": 3701 }, { "epoch": 0.9263105217064932, "grad_norm": 1.0859375, "kl": 5.960305690765381, "learning_rate": 5e-06, "logits/chosen": -66248536.615384616, "logits/rejected": 3029041.4545454546, "logps/chosen": -454.0021784855769, "logps/rejected": -846.1086647727273, "loss": 0.0021, "rewards/chosen": 10.19219501201923, "rewards/margins": 40.507014347956726, "rewards/rejected": -30.3148193359375, "step": 3702 }, { "epoch": 0.926560740648067, "grad_norm": 23.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36604312.615384616, "logits/rejected": -37822048.0, "logps/chosen": -295.26639498197113, "logps/rejected": -471.5929509943182, "loss": 0.0603, "rewards/chosen": 7.948537386380709, "rewards/margins": 21.292259643127867, "rewards/rejected": -13.343722256747158, "step": 3703 }, { "epoch": 0.9268109595896409, "grad_norm": 7.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33856205.333333336, "logits/rejected": -48558485.333333336, "logps/chosen": -297.96803792317706, "logps/rejected": -774.0730794270834, "loss": 0.0423, "rewards/chosen": 7.227203369140625, "rewards/margins": 34.115529378255204, "rewards/rejected": -26.888326009114582, "step": 3704 }, { "epoch": 0.9270611785312148, "grad_norm": 6.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47229600.0, "logits/rejected": -62766129.23076923, "logps/chosen": -386.07692649147725, "logps/rejected": -801.9137620192307, "loss": 0.0138, "rewards/chosen": 9.968229814009232, "rewards/margins": 34.25131556370875, "rewards/rejected": -24.28308574969952, "step": 3705 }, { "epoch": 0.9273113974727887, "grad_norm": 4.5, "kl": 9.980598449707031, "learning_rate": 5e-06, "logits/chosen": -61470674.28571428, "logits/rejected": -20664419.2, "logps/chosen": -482.77260044642856, "logps/rejected": -643.902490234375, "loss": 0.0143, "rewards/chosen": 10.542388916015625, "rewards/margins": 26.01171875, "rewards/rejected": -15.469329833984375, "step": 3706 }, { "epoch": 0.9275616164143625, "grad_norm": 5.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -17888912.0, "logits/rejected": -62861909.333333336, "logps/chosen": -340.6318088107639, "logps/rejected": -722.9623046875, "loss": 0.0095, "rewards/chosen": 7.426587422688802, "rewards/margins": 32.73001556396484, "rewards/rejected": -25.30342814127604, "step": 3707 }, { "epoch": 0.9278118353559365, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56353472.0, "logits/rejected": -31678795.636363637, "logps/chosen": -328.4323167067308, "logps/rejected": -417.66592684659093, "loss": 0.0114, "rewards/chosen": 8.44217036320613, "rewards/margins": 23.704353812691217, "rewards/rejected": -15.262183449485086, "step": 3708 }, { "epoch": 0.9280620542975103, "grad_norm": 0.64453125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55930984.72727273, "logits/rejected": -70537737.84615384, "logps/chosen": -492.87375710227275, "logps/rejected": -946.8109975961538, "loss": 0.0055, "rewards/chosen": 11.676055908203125, "rewards/margins": 46.53683941180889, "rewards/rejected": -34.86078350360577, "step": 3709 }, { "epoch": 0.9283122732390842, "grad_norm": 4.6875, "kl": 9.946484565734863, "learning_rate": 5e-06, "logits/chosen": -30531054.769230768, "logits/rejected": -55394978.90909091, "logps/chosen": -399.21567007211536, "logps/rejected": -547.6901189630681, "loss": 0.0153, "rewards/chosen": 9.143834627591646, "rewards/margins": 23.717856400496476, "rewards/rejected": -14.57402177290483, "step": 3710 }, { "epoch": 0.9285624921806581, "grad_norm": 3.34375, "kl": 1.688489317893982, "learning_rate": 5e-06, "logits/chosen": -36479645.538461536, "logits/rejected": -64894952.72727273, "logps/chosen": -425.63172325721155, "logps/rejected": -746.2611860795455, "loss": 0.0406, "rewards/chosen": 10.068436842698317, "rewards/margins": 34.220758398096045, "rewards/rejected": -24.152321555397727, "step": 3711 }, { "epoch": 0.928812711122232, "grad_norm": 8.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38882144.0, "logits/rejected": -33640730.18181818, "logps/chosen": -400.70120943509613, "logps/rejected": -835.4429154829545, "loss": 0.0281, "rewards/chosen": 9.450376657339243, "rewards/margins": 36.150899340222765, "rewards/rejected": -26.700522682883523, "step": 3712 }, { "epoch": 0.9290629300638058, "grad_norm": 20.5, "kl": 4.710544586181641, "learning_rate": 5e-06, "logits/chosen": -87594067.2, "logits/rejected": -32256434.285714287, "logps/chosen": -576.19033203125, "logps/rejected": -617.7624162946429, "loss": 0.0332, "rewards/chosen": 11.363107299804687, "rewards/margins": 27.951182120186942, "rewards/rejected": -16.588074820382253, "step": 3713 }, { "epoch": 0.9293131490053798, "grad_norm": 6.78125, "kl": 8.423606872558594, "learning_rate": 5e-06, "logits/chosen": -47330408.72727273, "logits/rejected": -27152969.846153848, "logps/chosen": -365.7396129261364, "logps/rejected": -658.6613581730769, "loss": 0.071, "rewards/chosen": 7.067854447798296, "rewards/margins": 29.247117049210555, "rewards/rejected": -22.17926260141226, "step": 3714 }, { "epoch": 0.9295633679469536, "grad_norm": 11.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47833331.2, "logits/rejected": -45388032.0, "logps/chosen": -304.074267578125, "logps/rejected": -640.4561244419643, "loss": 0.0226, "rewards/chosen": 6.993989562988281, "rewards/margins": 29.67812826974051, "rewards/rejected": -22.684138706752233, "step": 3715 }, { "epoch": 0.9298135868885274, "grad_norm": 3.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47868002.461538464, "logits/rejected": -32947729.454545453, "logps/chosen": -365.99673227163464, "logps/rejected": -587.7440962357955, "loss": 0.0407, "rewards/chosen": 9.957493708683895, "rewards/margins": 34.6107085968231, "rewards/rejected": -24.653214888139203, "step": 3716 }, { "epoch": 0.9300638058301013, "grad_norm": 1.3125, "kl": 13.263101577758789, "learning_rate": 5e-06, "logits/chosen": -56243035.428571425, "logits/rejected": -46777529.6, "logps/chosen": -412.14571707589283, "logps/rejected": -647.4697265625, "loss": 0.0293, "rewards/chosen": 8.966896057128906, "rewards/margins": 30.687646484375, "rewards/rejected": -21.720750427246095, "step": 3717 }, { "epoch": 0.9303140247716752, "grad_norm": 1.359375, "kl": 8.214506149291992, "learning_rate": 5e-06, "logits/chosen": -45975008.0, "logits/rejected": -43574688.0, "logps/chosen": -447.02845982142856, "logps/rejected": -699.840380859375, "loss": 0.0011, "rewards/chosen": 10.724009922572545, "rewards/margins": 31.35533621651786, "rewards/rejected": -20.631326293945314, "step": 3718 }, { "epoch": 0.9305642437132491, "grad_norm": 3.65625, "kl": 4.150592803955078, "learning_rate": 5e-06, "logits/chosen": -51912950.15384615, "logits/rejected": -53771502.54545455, "logps/chosen": -489.05258413461536, "logps/rejected": -893.3158735795455, "loss": 0.0126, "rewards/chosen": 9.8624267578125, "rewards/margins": 38.26237349076705, "rewards/rejected": -28.399946732954547, "step": 3719 }, { "epoch": 0.9308144626548229, "grad_norm": 1.3046875, "kl": 0.007616996765136719, "learning_rate": 5e-06, "logits/chosen": -26754808.0, "logits/rejected": -74242970.66666667, "logps/chosen": -282.1256103515625, "logps/rejected": -735.69189453125, "loss": 0.0322, "rewards/chosen": 7.477465311686198, "rewards/margins": 33.24460093180338, "rewards/rejected": -25.767135620117188, "step": 3720 }, { "epoch": 0.9310646815963969, "grad_norm": 11.9375, "kl": 3.1887454986572266, "learning_rate": 5e-06, "logits/chosen": -57972976.0, "logits/rejected": -65607861.333333336, "logps/chosen": -445.2245686848958, "logps/rejected": -745.7523600260416, "loss": 0.0187, "rewards/chosen": 9.01176643371582, "rewards/margins": 36.364722569783524, "rewards/rejected": -27.352956136067707, "step": 3721 }, { "epoch": 0.9313149005379707, "grad_norm": 3.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51586737.777777776, "logits/rejected": -36257745.06666667, "logps/chosen": -512.1311306423611, "logps/rejected": -490.779296875, "loss": 0.0055, "rewards/chosen": 11.203750610351562, "rewards/margins": 29.190721638997395, "rewards/rejected": -17.986971028645833, "step": 3722 }, { "epoch": 0.9315651194795446, "grad_norm": 2.390625, "kl": 0.46158599853515625, "learning_rate": 5e-06, "logits/chosen": -49872608.0, "logits/rejected": -58409088.0, "logps/chosen": -417.1673177083333, "logps/rejected": -623.9833170572916, "loss": 0.035, "rewards/chosen": 10.196973164876303, "rewards/margins": 32.75976053873698, "rewards/rejected": -22.562787373860676, "step": 3723 }, { "epoch": 0.9318153384211185, "grad_norm": 6.0, "kl": 0.8521296381950378, "learning_rate": 5e-06, "logits/chosen": -27727817.846153848, "logits/rejected": -10215963.636363637, "logps/chosen": -397.4178936298077, "logps/rejected": -429.93794389204544, "loss": 0.0449, "rewards/chosen": 8.25039555476262, "rewards/margins": 23.550047654371994, "rewards/rejected": -15.299652099609375, "step": 3724 }, { "epoch": 0.9320655573626924, "grad_norm": 11.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23437777.777777776, "logits/rejected": -28600893.866666667, "logps/chosen": -393.53911675347223, "logps/rejected": -622.2699869791667, "loss": 0.0587, "rewards/chosen": 8.319374932183159, "rewards/margins": 26.646492852105034, "rewards/rejected": -18.327117919921875, "step": 3725 }, { "epoch": 0.9323157763042662, "grad_norm": 9.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27401349.333333332, "logits/rejected": -52528728.88888889, "logps/chosen": -284.50889078776044, "logps/rejected": -671.14794921875, "loss": 0.0351, "rewards/chosen": 7.721078236897786, "rewards/margins": 30.882311079237194, "rewards/rejected": -23.16123284233941, "step": 3726 }, { "epoch": 0.9325659952458402, "grad_norm": 6.46875, "kl": 0.6262067556381226, "learning_rate": 5e-06, "logits/chosen": -49242885.81818182, "logits/rejected": -61143522.461538464, "logps/chosen": -408.6328125, "logps/rejected": -773.1281550480769, "loss": 0.0188, "rewards/chosen": 10.500030517578125, "rewards/margins": 40.4487046461839, "rewards/rejected": -29.94867412860577, "step": 3727 }, { "epoch": 0.932816214187414, "grad_norm": 10.5, "kl": 1.0335826873779297, "learning_rate": 5e-06, "logits/chosen": -59183572.0, "logits/rejected": -72408160.0, "logps/chosen": -386.2140808105469, "logps/rejected": -688.2503051757812, "loss": 0.0433, "rewards/chosen": 9.209988594055176, "rewards/margins": 26.654969215393066, "rewards/rejected": -17.44498062133789, "step": 3728 }, { "epoch": 0.9330664331289878, "grad_norm": 21.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72591808.0, "logits/rejected": -45603655.52941176, "logps/chosen": -368.183837890625, "logps/rejected": -809.3220358455883, "loss": 0.0346, "rewards/chosen": 6.652472904750279, "rewards/margins": 33.504270569617006, "rewards/rejected": -26.851797664866726, "step": 3729 }, { "epoch": 0.9333166520705617, "grad_norm": 7.96875, "kl": 4.339824676513672, "learning_rate": 5e-06, "logits/chosen": -58916240.0, "logits/rejected": 52773594.666666664, "logps/chosen": -267.23638916015625, "logps/rejected": -931.5350748697916, "loss": 0.0157, "rewards/chosen": 8.109144846598307, "rewards/margins": 35.10693232218424, "rewards/rejected": -26.997787475585938, "step": 3730 }, { "epoch": 0.9335668710121356, "grad_norm": 11.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36150306.461538464, "logits/rejected": 9082745.454545455, "logps/chosen": -388.4695387620192, "logps/rejected": -666.1118607954545, "loss": 0.04, "rewards/chosen": 6.263761667104868, "rewards/margins": 30.762843365435835, "rewards/rejected": -24.499081698330965, "step": 3731 }, { "epoch": 0.9338170899537095, "grad_norm": 6.3125, "kl": 4.371379852294922, "learning_rate": 5e-06, "logits/chosen": -46694469.81818182, "logits/rejected": -28355153.230769232, "logps/chosen": -458.1736949573864, "logps/rejected": -431.35160006009613, "loss": 0.0201, "rewards/chosen": 11.083661166104404, "rewards/margins": 29.901189523977003, "rewards/rejected": -18.817528357872597, "step": 3732 }, { "epoch": 0.9340673088952833, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -8285928.0, "logits/rejected": -24230909.714285713, "logps/chosen": -437.706982421875, "logps/rejected": -631.3773716517857, "loss": 0.0107, "rewards/chosen": 9.065396881103515, "rewards/margins": 33.61173782348633, "rewards/rejected": -24.546340942382812, "step": 3733 }, { "epoch": 0.9343175278368573, "grad_norm": 2.515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38453646.76923077, "logits/rejected": -74616424.72727273, "logps/chosen": -290.60263296274036, "logps/rejected": -844.4512606534091, "loss": 0.0815, "rewards/chosen": 7.211082458496094, "rewards/margins": 40.52576307816939, "rewards/rejected": -33.3146806196733, "step": 3734 }, { "epoch": 0.9345677467784311, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36598186.666666664, "logits/rejected": -41392834.666666664, "logps/chosen": -324.5240478515625, "logps/rejected": -814.3972981770834, "loss": 0.0125, "rewards/chosen": 9.42100461324056, "rewards/margins": 37.825896581014, "rewards/rejected": -28.404891967773438, "step": 3735 }, { "epoch": 0.934817965720005, "grad_norm": 2.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -67296723.2, "logits/rejected": -42944714.10526316, "logps/chosen": -427.38935546875, "logps/rejected": -637.7658305921053, "loss": 0.0074, "rewards/chosen": 10.53394012451172, "rewards/margins": 35.62255353425678, "rewards/rejected": -25.088613409745065, "step": 3736 }, { "epoch": 0.9350681846615789, "grad_norm": 2.390625, "kl": 4.419887542724609, "learning_rate": 5e-06, "logits/chosen": -41882102.15384615, "logits/rejected": 76265431.27272727, "logps/chosen": -520.6777719350962, "logps/rejected": -668.8429509943181, "loss": 0.0028, "rewards/chosen": 12.890312781700722, "rewards/margins": 40.335275716714925, "rewards/rejected": -27.444962935014203, "step": 3737 }, { "epoch": 0.9353184036031528, "grad_norm": 1.3046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60143084.307692304, "logits/rejected": -50749952.0, "logps/chosen": -351.4051983173077, "logps/rejected": -680.7304243607955, "loss": 0.0026, "rewards/chosen": 9.971133892352764, "rewards/margins": 35.6921976796397, "rewards/rejected": -25.721063787286933, "step": 3738 }, { "epoch": 0.9355686225447266, "grad_norm": 18.25, "kl": 3.025336265563965, "learning_rate": 5e-06, "logits/chosen": -31696558.769230768, "logits/rejected": -33734301.09090909, "logps/chosen": -426.8505108173077, "logps/rejected": -553.9435813210227, "loss": 0.0402, "rewards/chosen": 10.397782545823317, "rewards/margins": 28.228139490514366, "rewards/rejected": -17.83035694469105, "step": 3739 }, { "epoch": 0.9358188414863006, "grad_norm": 1.1484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47182820.571428575, "logits/rejected": -59330508.8, "logps/chosen": -420.0761021205357, "logps/rejected": -798.369140625, "loss": 0.003, "rewards/chosen": 9.45977783203125, "rewards/margins": 36.832254028320314, "rewards/rejected": -27.372476196289064, "step": 3740 }, { "epoch": 0.9360690604278744, "grad_norm": 8.5625, "kl": 9.53818416595459, "learning_rate": 5e-06, "logits/chosen": -49429549.71428572, "logits/rejected": -36252153.6, "logps/chosen": -319.5391322544643, "logps/rejected": -710.951904296875, "loss": 0.0725, "rewards/chosen": 6.999656677246094, "rewards/margins": 23.896250915527343, "rewards/rejected": -16.89659423828125, "step": 3741 }, { "epoch": 0.9363192793694483, "grad_norm": 2.65625, "kl": 2.960111141204834, "learning_rate": 5e-06, "logits/chosen": -32384626.285714287, "logits/rejected": -39933443.2, "logps/chosen": -424.60518973214283, "logps/rejected": -655.7033203125, "loss": 0.0248, "rewards/chosen": 8.663845607212611, "rewards/margins": 31.505135890415737, "rewards/rejected": -22.841290283203126, "step": 3742 }, { "epoch": 0.9365694983110221, "grad_norm": 1.1015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19679116.307692308, "logits/rejected": -48105879.27272727, "logps/chosen": -381.67202524038464, "logps/rejected": -661.2387251420455, "loss": 0.0179, "rewards/chosen": 10.64020244891827, "rewards/margins": 32.50675462842821, "rewards/rejected": -21.86655217950994, "step": 3743 }, { "epoch": 0.936819717252596, "grad_norm": 2.015625, "kl": 6.166493892669678, "learning_rate": 5e-06, "logits/chosen": -23343220.363636363, "logits/rejected": -22544152.615384616, "logps/chosen": -278.97270063920456, "logps/rejected": -705.8221153846154, "loss": 0.0635, "rewards/chosen": 8.095463145862926, "rewards/margins": 30.49448986987134, "rewards/rejected": -22.399026724008415, "step": 3744 }, { "epoch": 0.9370699361941699, "grad_norm": 7.5, "kl": 7.7577619552612305, "learning_rate": 5e-06, "logits/chosen": -43230653.333333336, "logits/rejected": -28837216.0, "logps/chosen": -458.3128255208333, "logps/rejected": -679.1248372395834, "loss": 0.0099, "rewards/chosen": 11.211034138997396, "rewards/margins": 34.11735280354818, "rewards/rejected": -22.90631866455078, "step": 3745 }, { "epoch": 0.9373201551357437, "grad_norm": 4.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60484676.571428575, "logits/rejected": -49555020.8, "logps/chosen": -321.01942661830356, "logps/rejected": -772.556396484375, "loss": 0.0155, "rewards/chosen": 9.171854291643415, "rewards/margins": 34.971402631487166, "rewards/rejected": -25.79954833984375, "step": 3746 }, { "epoch": 0.9375703740773177, "grad_norm": 8.0625, "kl": 4.670871734619141, "learning_rate": 5e-06, "logits/chosen": -59696077.71428572, "logits/rejected": -20999092.8, "logps/chosen": -470.66552734375, "logps/rejected": -695.096484375, "loss": 0.0129, "rewards/chosen": 10.515856061662946, "rewards/margins": 30.583046613420755, "rewards/rejected": -20.06719055175781, "step": 3747 }, { "epoch": 0.9378205930188915, "grad_norm": 0.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65790236.44444445, "logits/rejected": 33705531.733333334, "logps/chosen": -409.43028428819446, "logps/rejected": -638.7223958333333, "loss": 0.0325, "rewards/chosen": 9.500165303548178, "rewards/margins": 30.179209899902347, "rewards/rejected": -20.679044596354167, "step": 3748 }, { "epoch": 0.9380708119604654, "grad_norm": 0.7578125, "kl": 0.13533911108970642, "learning_rate": 5e-06, "logits/chosen": -46760029.538461536, "logits/rejected": -21868519.272727273, "logps/chosen": -359.56107271634613, "logps/rejected": -947.2347301136364, "loss": 0.0064, "rewards/chosen": 9.611349252554087, "rewards/margins": 37.564108041616585, "rewards/rejected": -27.9527587890625, "step": 3749 }, { "epoch": 0.9383210309020393, "grad_norm": 8.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16086424.0, "logits/rejected": -54801546.10526316, "logps/chosen": -328.6706298828125, "logps/rejected": -545.116365131579, "loss": 0.0265, "rewards/chosen": 6.712380218505859, "rewards/margins": 24.638511617560138, "rewards/rejected": -17.926131399054277, "step": 3750 }, { "epoch": 0.9385712498436132, "grad_norm": 6.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34635584.0, "logits/rejected": -25193022.0, "logps/chosen": -425.52740478515625, "logps/rejected": -719.9127807617188, "loss": 0.0233, "rewards/chosen": 10.331106185913086, "rewards/margins": 31.494319915771484, "rewards/rejected": -21.1632137298584, "step": 3751 }, { "epoch": 0.938821468785187, "grad_norm": 3.265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -2163763.2, "logits/rejected": -28687675.42857143, "logps/chosen": -312.575244140625, "logps/rejected": -690.8864397321429, "loss": 0.0665, "rewards/chosen": 6.536237335205078, "rewards/margins": 30.743754686628066, "rewards/rejected": -24.20751735142299, "step": 3752 }, { "epoch": 0.9390716877267609, "grad_norm": 1.2578125, "kl": 4.232607841491699, "learning_rate": 5e-06, "logits/chosen": -23613708.8, "logits/rejected": -54907004.44444445, "logps/chosen": -430.9554036458333, "logps/rejected": -627.6022677951389, "loss": 0.0027, "rewards/chosen": 8.968729654947916, "rewards/margins": 27.106306287977432, "rewards/rejected": -18.137576633029514, "step": 3753 }, { "epoch": 0.9393219066683348, "grad_norm": 5.375, "kl": 12.011497497558594, "learning_rate": 5e-06, "logits/chosen": -55577384.0, "logits/rejected": -48588024.0, "logps/chosen": -386.369873046875, "logps/rejected": -437.56298828125, "loss": 0.0289, "rewards/chosen": 9.366955757141113, "rewards/margins": 24.25215244293213, "rewards/rejected": -14.885196685791016, "step": 3754 }, { "epoch": 0.9395721256099087, "grad_norm": 17.5, "kl": 3.341841459274292, "learning_rate": 5e-06, "logits/chosen": -46243566.54545455, "logits/rejected": -41828952.615384616, "logps/chosen": -371.03848544034093, "logps/rejected": -585.87451171875, "loss": 0.1075, "rewards/chosen": 9.532931241122158, "rewards/margins": 26.69799729994127, "rewards/rejected": -17.16506605881911, "step": 3755 }, { "epoch": 0.9398223445514825, "grad_norm": 13.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32987854.769230768, "logits/rejected": -24177597.09090909, "logps/chosen": -402.4352463942308, "logps/rejected": -724.7319779829545, "loss": 0.0527, "rewards/chosen": 9.85379145695613, "rewards/margins": 30.4667023345307, "rewards/rejected": -20.612910877574574, "step": 3756 }, { "epoch": 0.9400725634930565, "grad_norm": 7.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30146646.85714286, "logits/rejected": -35832256.0, "logps/chosen": -294.4813755580357, "logps/rejected": -815.4399701286765, "loss": 0.0155, "rewards/chosen": 7.019434247698102, "rewards/margins": 30.104617784003253, "rewards/rejected": -23.08518353630515, "step": 3757 }, { "epoch": 0.9403227824346303, "grad_norm": 14.0625, "kl": 8.94985580444336, "learning_rate": 5e-06, "logits/chosen": -32803754.666666668, "logits/rejected": -49997429.333333336, "logps/chosen": -354.2955729166667, "logps/rejected": -758.4219563802084, "loss": 0.1008, "rewards/chosen": 9.271331151326498, "rewards/margins": 30.976027806599937, "rewards/rejected": -21.704696655273438, "step": 3758 }, { "epoch": 0.9405730013762041, "grad_norm": 1.4140625, "kl": 7.016010284423828, "learning_rate": 5e-06, "logits/chosen": -3993308.923076923, "logits/rejected": -17077384.727272727, "logps/chosen": -403.1871995192308, "logps/rejected": -758.7692649147727, "loss": 0.0112, "rewards/chosen": 10.312451876126802, "rewards/margins": 34.32435373159555, "rewards/rejected": -24.01190185546875, "step": 3759 }, { "epoch": 0.9408232203177781, "grad_norm": 1.109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43842537.6, "logits/rejected": -28131154.285714287, "logps/chosen": -349.61201171875, "logps/rejected": -533.9872349330357, "loss": 0.0103, "rewards/chosen": 10.215606689453125, "rewards/margins": 29.725306919642858, "rewards/rejected": -19.509700230189733, "step": 3760 }, { "epoch": 0.9410734392593519, "grad_norm": 5.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -86298861.71428572, "logits/rejected": -51930057.6, "logps/chosen": -425.345703125, "logps/rejected": -621.291845703125, "loss": 0.0216, "rewards/chosen": 9.118870326450892, "rewards/margins": 26.624976893833704, "rewards/rejected": -17.506106567382812, "step": 3761 }, { "epoch": 0.9413236582009258, "grad_norm": 14.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49416675.55555555, "logits/rejected": -63749529.6, "logps/chosen": -444.2258572048611, "logps/rejected": -730.9154296875, "loss": 0.0564, "rewards/chosen": 8.821803622775608, "rewards/margins": 33.69162784152561, "rewards/rejected": -24.86982421875, "step": 3762 }, { "epoch": 0.9415738771424997, "grad_norm": 11.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40426088.72727273, "logits/rejected": -47342759.384615384, "logps/chosen": -449.34596946022725, "logps/rejected": -553.3724834735577, "loss": 0.0403, "rewards/chosen": 10.6955302845348, "rewards/margins": 25.70827606841401, "rewards/rejected": -15.012745783879208, "step": 3763 }, { "epoch": 0.9418240960840736, "grad_norm": 8.9375, "kl": 8.958549499511719, "learning_rate": 5e-06, "logits/chosen": -22657958.0, "logits/rejected": -23025836.0, "logps/chosen": -419.4216613769531, "logps/rejected": -569.431640625, "loss": 0.0746, "rewards/chosen": 9.36103343963623, "rewards/margins": 23.90878963470459, "rewards/rejected": -14.54775619506836, "step": 3764 }, { "epoch": 0.9420743150256474, "grad_norm": 0.0262451171875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65783824.0, "logits/rejected": -46325248.0, "logps/chosen": -520.4666748046875, "logps/rejected": -705.7529296875, "loss": 0.0, "rewards/chosen": 13.508212089538574, "rewards/margins": 37.70999240875244, "rewards/rejected": -24.201780319213867, "step": 3765 }, { "epoch": 0.9423245339672213, "grad_norm": 1.046875, "kl": 3.8747966289520264, "learning_rate": 5e-06, "logits/chosen": -53797978.666666664, "logits/rejected": -42411189.333333336, "logps/chosen": -458.496337890625, "logps/rejected": -627.2190755208334, "loss": 0.0108, "rewards/chosen": 10.138379414876303, "rewards/margins": 31.657127380371094, "rewards/rejected": -21.518747965494793, "step": 3766 }, { "epoch": 0.9425747529087952, "grad_norm": 4.28125, "kl": 7.057338237762451, "learning_rate": 5e-06, "logits/chosen": -48369382.4, "logits/rejected": -76089528.8888889, "logps/chosen": -437.45091145833334, "logps/rejected": -643.6836480034722, "loss": 0.0201, "rewards/chosen": 8.683896891276042, "rewards/margins": 31.425699530707465, "rewards/rejected": -22.741802639431423, "step": 3767 }, { "epoch": 0.9428249718503691, "grad_norm": 8.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33344704.0, "logits/rejected": -40090994.28571428, "logps/chosen": -311.484326171875, "logps/rejected": -692.8228236607143, "loss": 0.0447, "rewards/chosen": 7.7175453186035154, "rewards/margins": 29.06184027535575, "rewards/rejected": -21.344294956752233, "step": 3768 }, { "epoch": 0.9430751907919429, "grad_norm": 1.8515625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39092280.0, "logits/rejected": -60036408.0, "logps/chosen": -319.80633544921875, "logps/rejected": -735.3450927734375, "loss": 0.0412, "rewards/chosen": 8.660524368286133, "rewards/margins": 34.95990562438965, "rewards/rejected": -26.299381256103516, "step": 3769 }, { "epoch": 0.9433254097335169, "grad_norm": 0.640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40973193.6, "logits/rejected": -55989330.28571428, "logps/chosen": -374.49833984375, "logps/rejected": -715.5750558035714, "loss": 0.0135, "rewards/chosen": 10.01400146484375, "rewards/margins": 31.73235909598214, "rewards/rejected": -21.718357631138392, "step": 3770 }, { "epoch": 0.9435756286750907, "grad_norm": 2.890625, "kl": 1.5402755737304688, "learning_rate": 5e-06, "logits/chosen": -41408905.84615385, "logits/rejected": -26084264.727272727, "logps/chosen": -401.9069260817308, "logps/rejected": -871.4486860795455, "loss": 0.045, "rewards/chosen": 8.598973787747896, "rewards/margins": 35.644078768216644, "rewards/rejected": -27.04510498046875, "step": 3771 }, { "epoch": 0.9438258476166645, "grad_norm": 33.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11237077.818181818, "logits/rejected": -14647158.153846154, "logps/chosen": -513.8740678267045, "logps/rejected": -755.6401742788462, "loss": 0.0299, "rewards/chosen": 8.47186348655007, "rewards/margins": 27.952313456501994, "rewards/rejected": -19.480449969951923, "step": 3772 }, { "epoch": 0.9440760665582385, "grad_norm": 0.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37644730.18181818, "logits/rejected": -59118217.84615385, "logps/chosen": -383.2120472301136, "logps/rejected": -738.6766826923077, "loss": 0.0095, "rewards/chosen": 9.43123418634588, "rewards/margins": 36.4097052087317, "rewards/rejected": -26.97847102238582, "step": 3773 }, { "epoch": 0.9443262854998123, "grad_norm": 2.796875, "kl": 11.554250717163086, "learning_rate": 5e-06, "logits/chosen": -53918005.333333336, "logits/rejected": -53164677.333333336, "logps/chosen": -406.2027994791667, "logps/rejected": -762.990966796875, "loss": 0.0909, "rewards/chosen": 10.787008921305338, "rewards/margins": 33.900404612223305, "rewards/rejected": -23.11339569091797, "step": 3774 }, { "epoch": 0.9445765044413862, "grad_norm": 5.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53935409.777777776, "logits/rejected": -52801352.53333333, "logps/chosen": -302.1626790364583, "logps/rejected": -569.7979166666667, "loss": 0.013, "rewards/chosen": 7.197200351291233, "rewards/margins": 25.57388492160373, "rewards/rejected": -18.3766845703125, "step": 3775 }, { "epoch": 0.9448267233829601, "grad_norm": 8.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55364387.55555555, "logits/rejected": -38565789.86666667, "logps/chosen": -368.0413411458333, "logps/rejected": -511.10911458333334, "loss": 0.0366, "rewards/chosen": 7.864317152235243, "rewards/margins": 27.399892510308156, "rewards/rejected": -19.535575358072915, "step": 3776 }, { "epoch": 0.945076942324534, "grad_norm": 3.546875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40192880.0, "logits/rejected": -91861208.0, "logps/chosen": -410.23138427734375, "logps/rejected": -1197.84423828125, "loss": 0.0111, "rewards/chosen": 10.929574966430664, "rewards/margins": 52.52945518493652, "rewards/rejected": -41.59988021850586, "step": 3777 }, { "epoch": 0.9453271612661078, "grad_norm": 6.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40051108.571428575, "logits/rejected": -39784614.4, "logps/chosen": -316.51559012276783, "logps/rejected": -648.75126953125, "loss": 0.0468, "rewards/chosen": 7.9384662083217075, "rewards/margins": 29.15797914777483, "rewards/rejected": -21.219512939453125, "step": 3778 }, { "epoch": 0.9455773802076817, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40503266.13333333, "logits/rejected": -53907648.0, "logps/chosen": -390.42766927083335, "logps/rejected": -595.6471354166666, "loss": 0.0354, "rewards/chosen": 8.561541748046874, "rewards/margins": 30.970933363172744, "rewards/rejected": -22.40939161512587, "step": 3779 }, { "epoch": 0.9458275991492556, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59208885.333333336, "logits/rejected": -58841787.733333334, "logps/chosen": -327.1113009982639, "logps/rejected": -759.5977864583333, "loss": 0.0244, "rewards/chosen": 9.334747314453125, "rewards/margins": 33.580501302083334, "rewards/rejected": -24.24575398763021, "step": 3780 }, { "epoch": 0.9460778180908295, "grad_norm": 0.58203125, "kl": 5.253135681152344, "learning_rate": 5e-06, "logits/chosen": -74490830.76923077, "logits/rejected": -33292805.818181816, "logps/chosen": -414.23035606971155, "logps/rejected": -427.10964133522725, "loss": 0.0013, "rewards/chosen": 11.294388991135817, "rewards/margins": 26.21016463699874, "rewards/rejected": -14.915775645862926, "step": 3781 }, { "epoch": 0.9463280370324033, "grad_norm": 8.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20576890.666666668, "logits/rejected": -14607856.0, "logps/chosen": -334.4853515625, "logps/rejected": -800.1109212239584, "loss": 0.0361, "rewards/chosen": 6.976909637451172, "rewards/margins": 31.142009735107422, "rewards/rejected": -24.16510009765625, "step": 3782 }, { "epoch": 0.9465782559739773, "grad_norm": 1.09375, "kl": 11.445440292358398, "learning_rate": 5e-06, "logits/chosen": -54333213.538461536, "logits/rejected": -38881413.81818182, "logps/chosen": -430.8821364182692, "logps/rejected": -674.87841796875, "loss": 0.034, "rewards/chosen": 10.138096736027645, "rewards/margins": 35.76309673602765, "rewards/rejected": -25.625, "step": 3783 }, { "epoch": 0.9468284749155511, "grad_norm": 3.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -59167424.0, "logits/rejected": -27012451.2, "logps/chosen": -332.4685756138393, "logps/rejected": -661.6291015625, "loss": 0.0441, "rewards/chosen": 8.515264238630023, "rewards/margins": 26.76898433140346, "rewards/rejected": -18.253720092773438, "step": 3784 }, { "epoch": 0.947078693857125, "grad_norm": 5.40625, "kl": 8.7257080078125, "learning_rate": 5e-06, "logits/chosen": -47117354.666666664, "logits/rejected": 75006890.66666667, "logps/chosen": -394.72958984375, "logps/rejected": -714.5469835069445, "loss": 0.0134, "rewards/chosen": 10.296828206380209, "rewards/margins": 31.86348876953125, "rewards/rejected": -21.566660563151043, "step": 3785 }, { "epoch": 0.9473289127986989, "grad_norm": 4.03125, "kl": 5.014748573303223, "learning_rate": 5e-06, "logits/chosen": -52380571.428571425, "logits/rejected": -35608057.6, "logps/chosen": -378.90073939732144, "logps/rejected": -600.69052734375, "loss": 0.0804, "rewards/chosen": 9.719670976911273, "rewards/margins": 28.280777849469864, "rewards/rejected": -18.561106872558593, "step": 3786 }, { "epoch": 0.9475791317402728, "grad_norm": 0.47265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -50876924.44444445, "logits/rejected": -48648499.2, "logps/chosen": -385.52745225694446, "logps/rejected": -671.1296223958333, "loss": 0.0018, "rewards/chosen": 9.264128790961372, "rewards/margins": 32.44622717963325, "rewards/rejected": -23.182098388671875, "step": 3787 }, { "epoch": 0.9478293506818466, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65908036.92307692, "logits/rejected": -36172296.72727273, "logps/chosen": -477.62267127403845, "logps/rejected": -635.6352982954545, "loss": 0.0481, "rewards/chosen": 8.829953120304989, "rewards/margins": 32.68953064605073, "rewards/rejected": -23.85957752574574, "step": 3788 }, { "epoch": 0.9480795696234205, "grad_norm": 1.28125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32335712.0, "logits/rejected": -48316750.76923077, "logps/chosen": -364.0311390269886, "logps/rejected": -651.7323467548077, "loss": 0.0566, "rewards/chosen": 9.95227744362571, "rewards/margins": 37.16128529368581, "rewards/rejected": -27.209007850060097, "step": 3789 }, { "epoch": 0.9483297885649944, "grad_norm": 1.5859375, "kl": 8.490150451660156, "learning_rate": 5e-06, "logits/chosen": -19082315.42857143, "logits/rejected": -39224995.2, "logps/chosen": -345.82603236607144, "logps/rejected": -895.23544921875, "loss": 0.0599, "rewards/chosen": 7.360896519252232, "rewards/margins": 36.67867911202567, "rewards/rejected": -29.317782592773437, "step": 3790 }, { "epoch": 0.9485800075065682, "grad_norm": 3.328125, "kl": 5.8852858543396, "learning_rate": 5e-06, "logits/chosen": -58795672.615384616, "logits/rejected": -63537384.72727273, "logps/chosen": -418.384765625, "logps/rejected": -713.4153497869319, "loss": 0.0087, "rewards/chosen": 9.715853177584135, "rewards/margins": 34.93949986171056, "rewards/rejected": -25.22364668412642, "step": 3791 }, { "epoch": 0.9488302264481421, "grad_norm": 5.25, "kl": 4.169834136962891, "learning_rate": 5e-06, "logits/chosen": -31484921.14285714, "logits/rejected": -78941401.6, "logps/chosen": -295.61781529017856, "logps/rejected": -879.0400390625, "loss": 0.0431, "rewards/chosen": 7.6694199698311945, "rewards/margins": 31.34405681065151, "rewards/rejected": -23.674636840820312, "step": 3792 }, { "epoch": 0.949080445389716, "grad_norm": 0.91015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53337629.538461536, "logits/rejected": -77459421.0909091, "logps/chosen": -377.2835036057692, "logps/rejected": -823.4643998579545, "loss": 0.0127, "rewards/chosen": 9.11517333984375, "rewards/margins": 36.03184925426136, "rewards/rejected": -26.916675914417613, "step": 3793 }, { "epoch": 0.9493306643312899, "grad_norm": 4.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -61373172.36363637, "logits/rejected": -51723874.461538464, "logps/chosen": -417.7513316761364, "logps/rejected": -696.4618389423077, "loss": 0.022, "rewards/chosen": 11.831304376775568, "rewards/margins": 36.82902729594624, "rewards/rejected": -24.997722919170673, "step": 3794 }, { "epoch": 0.9495808832728637, "grad_norm": 7.25, "kl": 12.1416015625, "learning_rate": 5e-06, "logits/chosen": -67905614.76923077, "logits/rejected": -23353488.0, "logps/chosen": -387.5446965144231, "logps/rejected": -621.1163441051136, "loss": 0.0166, "rewards/chosen": 9.56540738619291, "rewards/margins": 28.625521466448593, "rewards/rejected": -19.060114080255683, "step": 3795 }, { "epoch": 0.9498311022144377, "grad_norm": 5.46875, "kl": 1.939288854598999, "learning_rate": 5e-06, "logits/chosen": -50235898.666666664, "logits/rejected": -58083562.666666664, "logps/chosen": -372.0367431640625, "logps/rejected": -579.6534830729166, "loss": 0.0178, "rewards/chosen": 8.998250325520834, "rewards/margins": 28.534975687662758, "rewards/rejected": -19.536725362141926, "step": 3796 }, { "epoch": 0.9500813211560115, "grad_norm": 7.34375, "kl": 0.25363922119140625, "learning_rate": 5e-06, "logits/chosen": -52697370.666666664, "logits/rejected": -69250197.33333333, "logps/chosen": -565.8408610026041, "logps/rejected": -509.6327311197917, "loss": 0.042, "rewards/chosen": 10.185548146565756, "rewards/margins": 27.066181182861328, "rewards/rejected": -16.880633036295574, "step": 3797 }, { "epoch": 0.9503315400975854, "grad_norm": 0.953125, "kl": 8.429555892944336, "learning_rate": 5e-06, "logits/chosen": -55832167.384615384, "logits/rejected": -27408093.09090909, "logps/chosen": -468.6623347355769, "logps/rejected": -892.8915127840909, "loss": 0.0021, "rewards/chosen": 11.061791053185097, "rewards/margins": 35.336796046970605, "rewards/rejected": -24.27500499378551, "step": 3798 }, { "epoch": 0.9505817590391593, "grad_norm": 2.234375, "kl": 11.407859802246094, "learning_rate": 5e-06, "logits/chosen": -51186924.307692304, "logits/rejected": -30782376.727272727, "logps/chosen": -343.0295973557692, "logps/rejected": -504.5138050426136, "loss": 0.0032, "rewards/chosen": 9.838184650127705, "rewards/margins": 28.28085044380668, "rewards/rejected": -18.442665793678977, "step": 3799 }, { "epoch": 0.9508319779807332, "grad_norm": 2.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31134425.6, "logits/rejected": -63519701.333333336, "logps/chosen": -409.1526692708333, "logps/rejected": -691.6558159722222, "loss": 0.0372, "rewards/chosen": 9.865034993489584, "rewards/margins": 32.599343872070314, "rewards/rejected": -22.73430887858073, "step": 3800 }, { "epoch": 0.951082196922307, "grad_norm": 5.15625, "kl": 5.55655574798584, "learning_rate": 5e-06, "logits/chosen": -72932181.33333333, "logits/rejected": -76402396.44444445, "logps/chosen": -426.03297526041666, "logps/rejected": -944.8736979166666, "loss": 0.0183, "rewards/chosen": 9.49543965657552, "rewards/margins": 31.519039916992188, "rewards/rejected": -22.023600260416668, "step": 3801 }, { "epoch": 0.9513324158638808, "grad_norm": 4.5, "kl": 3.1226768493652344, "learning_rate": 5e-06, "logits/chosen": -76730135.27272727, "logits/rejected": -68339692.3076923, "logps/chosen": -429.6572265625, "logps/rejected": -692.8033353365385, "loss": 0.0322, "rewards/chosen": 10.942499334161932, "rewards/margins": 33.83553757033982, "rewards/rejected": -22.893038236177883, "step": 3802 }, { "epoch": 0.9515826348054548, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56076018.28571428, "logits/rejected": -26332581.647058822, "logps/chosen": -426.9267578125, "logps/rejected": -758.7770565257352, "loss": 0.0126, "rewards/chosen": 9.774431501116071, "rewards/margins": 34.82515171595982, "rewards/rejected": -25.05072021484375, "step": 3803 }, { "epoch": 0.9518328537470286, "grad_norm": 19.375, "kl": 3.488145351409912, "learning_rate": 5e-06, "logits/chosen": -26593225.14285714, "logits/rejected": -56995757.176470585, "logps/chosen": -342.9404296875, "logps/rejected": -663.2612591911765, "loss": 0.0544, "rewards/chosen": 10.389007568359375, "rewards/margins": 28.248809814453125, "rewards/rejected": -17.85980224609375, "step": 3804 }, { "epoch": 0.9520830726886025, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40587483.428571425, "logits/rejected": -62314528.0, "logps/chosen": -357.28909737723217, "logps/rejected": -698.70244140625, "loss": 0.0321, "rewards/chosen": 9.805108206612724, "rewards/margins": 30.95259050641741, "rewards/rejected": -21.147482299804686, "step": 3805 }, { "epoch": 0.9523332916301764, "grad_norm": 3.203125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12591857.6, "logits/rejected": -67513694.31578948, "logps/chosen": -259.782861328125, "logps/rejected": -598.796875, "loss": 0.0277, "rewards/chosen": 5.91339111328125, "rewards/margins": 23.642562063116777, "rewards/rejected": -17.729170949835527, "step": 3806 }, { "epoch": 0.9525835105717503, "grad_norm": 8.625, "kl": 2.2471747398376465, "learning_rate": 5e-06, "logits/chosen": -43714816.0, "logits/rejected": -64175654.4, "logps/chosen": -370.87869698660717, "logps/rejected": -622.55576171875, "loss": 0.0565, "rewards/chosen": 8.174957820347377, "rewards/margins": 26.869446345738, "rewards/rejected": -18.694488525390625, "step": 3807 }, { "epoch": 0.9528337295133241, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24678897.777777776, "logits/rejected": -34937514.666666664, "logps/chosen": -460.6534830729167, "logps/rejected": -655.025, "loss": 0.0253, "rewards/chosen": 9.071512858072916, "rewards/margins": 27.85586954752604, "rewards/rejected": -18.784356689453126, "step": 3808 }, { "epoch": 0.9530839484548981, "grad_norm": 4.1875, "kl": 6.0159077644348145, "learning_rate": 5e-06, "logits/chosen": -74684583.38461539, "logits/rejected": -57846109.09090909, "logps/chosen": -385.26355919471155, "logps/rejected": -629.7611416903409, "loss": 0.0534, "rewards/chosen": 9.353405292217548, "rewards/margins": 31.57022436181982, "rewards/rejected": -22.216819069602273, "step": 3809 }, { "epoch": 0.9533341673964719, "grad_norm": 3.09375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -41737565.09090909, "logits/rejected": -43249604.92307692, "logps/chosen": -319.38645241477275, "logps/rejected": -720.6691706730769, "loss": 0.0231, "rewards/chosen": 8.204455982555043, "rewards/margins": 29.226794849742543, "rewards/rejected": -21.0223388671875, "step": 3810 }, { "epoch": 0.9535843863380458, "grad_norm": 2.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -66808120.88888889, "logits/rejected": -51564514.13333333, "logps/chosen": -367.369384765625, "logps/rejected": -730.6117838541667, "loss": 0.009, "rewards/chosen": 8.886767069498697, "rewards/margins": 31.945116678873696, "rewards/rejected": -23.058349609375, "step": 3811 }, { "epoch": 0.9538346052796197, "grad_norm": 6.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29688746.666666668, "logits/rejected": -80733664.0, "logps/chosen": -341.32375081380206, "logps/rejected": -684.650390625, "loss": 0.0485, "rewards/chosen": 7.714158376057942, "rewards/margins": 28.267192840576172, "rewards/rejected": -20.55303446451823, "step": 3812 }, { "epoch": 0.9540848242211936, "grad_norm": 10.9375, "kl": 9.035867691040039, "learning_rate": 5e-06, "logits/chosen": -36842096.0, "logits/rejected": -73049141.33333333, "logps/chosen": -370.1439615885417, "logps/rejected": -683.66357421875, "loss": 0.11, "rewards/chosen": 8.83652114868164, "rewards/margins": 27.136847178141277, "rewards/rejected": -18.300326029459637, "step": 3813 }, { "epoch": 0.9543350431627674, "grad_norm": 5.84375, "kl": 10.510923385620117, "learning_rate": 5e-06, "logits/chosen": -33939623.384615384, "logits/rejected": -47484384.0, "logps/chosen": -375.95838341346155, "logps/rejected": -537.7414328835227, "loss": 0.0416, "rewards/chosen": 9.93990501990685, "rewards/margins": 24.21783970119236, "rewards/rejected": -14.277934681285512, "step": 3814 }, { "epoch": 0.9545852621043412, "grad_norm": 1.5078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21632276.8, "logits/rejected": -25065737.14285714, "logps/chosen": -351.5560302734375, "logps/rejected": -533.1035853794643, "loss": 0.0298, "rewards/chosen": 9.866575622558594, "rewards/margins": 32.62342027936663, "rewards/rejected": -22.756844656808035, "step": 3815 }, { "epoch": 0.9548354810459152, "grad_norm": 0.99609375, "kl": 0.12196986377239227, "learning_rate": 5e-06, "logits/chosen": -39988608.0, "logits/rejected": -54998028.8, "logps/chosen": -421.6563197544643, "logps/rejected": -693.07021484375, "loss": 0.0191, "rewards/chosen": 9.631364004952568, "rewards/margins": 36.0542979649135, "rewards/rejected": -26.422933959960936, "step": 3816 }, { "epoch": 0.955085699987489, "grad_norm": 1.7890625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -54795264.0, "logits/rejected": 41952544.0, "logps/chosen": -471.416259765625, "logps/rejected": -682.9248046875, "loss": 0.0039, "rewards/chosen": 9.791301727294922, "rewards/margins": 31.806022099086217, "rewards/rejected": -22.014720371791295, "step": 3817 }, { "epoch": 0.9553359189290629, "grad_norm": 4.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -13535209.333333334, "logits/rejected": -36767306.666666664, "logps/chosen": -404.706298828125, "logps/rejected": -541.2242838541666, "loss": 0.0229, "rewards/chosen": 9.475298563639322, "rewards/margins": 29.997957865397133, "rewards/rejected": -20.522659301757812, "step": 3818 }, { "epoch": 0.9555861378706368, "grad_norm": 9.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44265088.0, "logits/rejected": -61911642.666666664, "logps/chosen": -340.550537109375, "logps/rejected": -591.4286702473959, "loss": 0.0644, "rewards/chosen": 9.378558476765951, "rewards/margins": 28.47223472595215, "rewards/rejected": -19.0936762491862, "step": 3819 }, { "epoch": 0.9558363568122107, "grad_norm": 4.15625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38584206.54545455, "logits/rejected": -30558355.692307692, "logps/chosen": -275.25337357954544, "logps/rejected": -556.8131009615385, "loss": 0.0237, "rewards/chosen": 8.516375454989346, "rewards/margins": 29.08074876478502, "rewards/rejected": -20.564373309795673, "step": 3820 }, { "epoch": 0.9560865757537845, "grad_norm": 14.9375, "kl": 0.35327786207199097, "learning_rate": 5e-06, "logits/chosen": -40974506.666666664, "logits/rejected": -68638620.44444445, "logps/chosen": -398.1684895833333, "logps/rejected": -648.5995551215278, "loss": 0.0447, "rewards/chosen": 9.299566650390625, "rewards/margins": 27.868817816840277, "rewards/rejected": -18.569251166449654, "step": 3821 }, { "epoch": 0.9563367946953585, "grad_norm": 3.84375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -49520516.92307692, "logits/rejected": -29289876.363636363, "logps/chosen": -325.5168644831731, "logps/rejected": -375.64901455965907, "loss": 0.0525, "rewards/chosen": 8.785682091346153, "rewards/margins": 22.543178318263767, "rewards/rejected": -13.757496226917613, "step": 3822 }, { "epoch": 0.9565870136369323, "grad_norm": 0.19140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -72421932.8, "logits/rejected": -73482477.71428572, "logps/chosen": -474.848388671875, "logps/rejected": -772.5800083705357, "loss": 0.0004, "rewards/chosen": 9.096759796142578, "rewards/margins": 35.16759981427874, "rewards/rejected": -26.07084001813616, "step": 3823 }, { "epoch": 0.9568372325785062, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20718141.53846154, "logits/rejected": -41214807.27272727, "logps/chosen": -260.9956805889423, "logps/rejected": -487.42764559659093, "loss": 0.0386, "rewards/chosen": 7.06999030480018, "rewards/margins": 24.937384492033843, "rewards/rejected": -17.867394187233664, "step": 3824 }, { "epoch": 0.9570874515200801, "grad_norm": 3.390625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -73609013.33333333, "logits/rejected": -47146618.666666664, "logps/chosen": -382.869384765625, "logps/rejected": -720.1292317708334, "loss": 0.0136, "rewards/chosen": 8.288688659667969, "rewards/margins": 35.175534566243485, "rewards/rejected": -26.88684590657552, "step": 3825 }, { "epoch": 0.957337670461654, "grad_norm": 33.25, "kl": 1.5505365133285522, "learning_rate": 5e-06, "logits/chosen": -44374230.4, "logits/rejected": -60039355.428571425, "logps/chosen": -270.5136474609375, "logps/rejected": -780.1568080357143, "loss": 0.0542, "rewards/chosen": 7.985226440429687, "rewards/margins": 33.07666582380022, "rewards/rejected": -25.091439383370535, "step": 3826 }, { "epoch": 0.9575878894032278, "grad_norm": 3.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38941168.0, "logits/rejected": -46705189.333333336, "logps/chosen": -336.78476969401044, "logps/rejected": -744.940673828125, "loss": 0.0538, "rewards/chosen": 8.69411849975586, "rewards/margins": 38.29082107543945, "rewards/rejected": -29.596702575683594, "step": 3827 }, { "epoch": 0.9578381083448017, "grad_norm": 0.5234375, "kl": 11.557807922363281, "learning_rate": 5e-06, "logits/chosen": -52912496.0, "logits/rejected": -35287282.666666664, "logps/chosen": -402.6027425130208, "logps/rejected": -813.48095703125, "loss": 0.043, "rewards/chosen": 11.748739878336588, "rewards/margins": 39.648398081461586, "rewards/rejected": -27.899658203125, "step": 3828 }, { "epoch": 0.9580883272863756, "grad_norm": 8.4375, "kl": 15.501982688903809, "learning_rate": 5e-06, "logits/chosen": -56329719.46666667, "logits/rejected": -23905813.333333332, "logps/chosen": -341.6076171875, "logps/rejected": -762.4013671875, "loss": 0.0324, "rewards/chosen": 8.641023763020833, "rewards/margins": 30.250497097439236, "rewards/rejected": -21.609473334418404, "step": 3829 }, { "epoch": 0.9583385462279495, "grad_norm": 1.1328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20670630.666666668, "logits/rejected": -68291701.33333333, "logps/chosen": -314.68979899088544, "logps/rejected": -653.5166015625, "loss": 0.0022, "rewards/chosen": 8.386489868164062, "rewards/margins": 28.009854634602863, "rewards/rejected": -19.6233647664388, "step": 3830 }, { "epoch": 0.9585887651695233, "grad_norm": 12.8125, "kl": 9.518377304077148, "learning_rate": 5e-06, "logits/chosen": -49091000.0, "logits/rejected": -85204000.0, "logps/chosen": -320.4339599609375, "logps/rejected": -593.9530029296875, "loss": 0.0517, "rewards/chosen": 7.857075214385986, "rewards/margins": 26.012341022491455, "rewards/rejected": -18.15526580810547, "step": 3831 }, { "epoch": 0.9588389841110972, "grad_norm": 2.828125, "kl": 7.422418117523193, "learning_rate": 5e-06, "logits/chosen": -16919313.14285714, "logits/rejected": -58099008.0, "logps/chosen": -360.50048828125, "logps/rejected": -800.7333984375, "loss": 0.026, "rewards/chosen": 9.73052978515625, "rewards/margins": 43.454653930664065, "rewards/rejected": -33.724124145507815, "step": 3832 }, { "epoch": 0.9590892030526711, "grad_norm": 2.015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65289045.333333336, "logits/rejected": -64758741.333333336, "logps/chosen": -509.7870686848958, "logps/rejected": -574.7178548177084, "loss": 0.0015, "rewards/chosen": 11.338635762532553, "rewards/margins": 30.155426025390625, "rewards/rejected": -18.816790262858074, "step": 3833 }, { "epoch": 0.9593394219942449, "grad_norm": 2.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28756937.6, "logits/rejected": -58084909.71428572, "logps/chosen": -351.7574462890625, "logps/rejected": -762.8815569196429, "loss": 0.0092, "rewards/chosen": 7.608365631103515, "rewards/margins": 30.5991580418178, "rewards/rejected": -22.990792410714285, "step": 3834 }, { "epoch": 0.9595896409358189, "grad_norm": 8.875, "kl": 3.9756338596343994, "learning_rate": 5e-06, "logits/chosen": -23455826.285714287, "logits/rejected": -32992726.4, "logps/chosen": -335.25118582589283, "logps/rejected": -818.69130859375, "loss": 0.037, "rewards/chosen": 8.70469502040318, "rewards/margins": 29.234492383684433, "rewards/rejected": -20.52979736328125, "step": 3835 }, { "epoch": 0.9598398598773927, "grad_norm": 3.671875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37097795.55555555, "logits/rejected": -49031654.4, "logps/chosen": -376.55889214409723, "logps/rejected": -617.6861979166666, "loss": 0.0154, "rewards/chosen": 9.517410278320312, "rewards/margins": 30.474990844726562, "rewards/rejected": -20.95758056640625, "step": 3836 }, { "epoch": 0.9600900788189666, "grad_norm": 12.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -15814643.2, "logits/rejected": -35996900.571428575, "logps/chosen": -298.586328125, "logps/rejected": -706.1287667410714, "loss": 0.0528, "rewards/chosen": 6.01234130859375, "rewards/margins": 25.44663260323661, "rewards/rejected": -19.434291294642858, "step": 3837 }, { "epoch": 0.9603402977605404, "grad_norm": 0.1865234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37997083.428571425, "logits/rejected": -23202057.6, "logps/chosen": -374.73154994419644, "logps/rejected": -478.361669921875, "loss": 0.0004, "rewards/chosen": 10.213801792689733, "rewards/margins": 25.345128086635043, "rewards/rejected": -15.131326293945312, "step": 3838 }, { "epoch": 0.9605905167021144, "grad_norm": 4.28125, "kl": 4.487115383148193, "learning_rate": 5e-06, "logits/chosen": -38783844.571428575, "logits/rejected": -74535673.6, "logps/chosen": -363.5967494419643, "logps/rejected": -838.803515625, "loss": 0.0202, "rewards/chosen": 8.72662843976702, "rewards/margins": 37.61637758527483, "rewards/rejected": -28.889749145507814, "step": 3839 }, { "epoch": 0.9608407356436882, "grad_norm": 2.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28871888.0, "logits/rejected": -38986828.0, "logps/chosen": -309.4356689453125, "logps/rejected": -595.8763427734375, "loss": 0.0161, "rewards/chosen": 7.95991849899292, "rewards/margins": 27.782046794891357, "rewards/rejected": -19.822128295898438, "step": 3840 }, { "epoch": 0.9610909545852621, "grad_norm": 5.28125, "kl": 12.016315460205078, "learning_rate": 5e-06, "logits/chosen": -54934240.0, "logits/rejected": -46925210.666666664, "logps/chosen": -405.2626546223958, "logps/rejected": -692.5524088541666, "loss": 0.0225, "rewards/chosen": 10.991649627685547, "rewards/margins": 35.87665430704753, "rewards/rejected": -24.88500467936198, "step": 3841 }, { "epoch": 0.961341173526836, "grad_norm": 6.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34089619.2, "logits/rejected": -49401321.14285714, "logps/chosen": -452.6861328125, "logps/rejected": -719.708984375, "loss": 0.0087, "rewards/chosen": 11.733026123046875, "rewards/margins": 31.139311000279015, "rewards/rejected": -19.406284877232142, "step": 3842 }, { "epoch": 0.9615913924684099, "grad_norm": 10.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -11760846.4, "logits/rejected": -37050210.28571428, "logps/chosen": -257.75185546875, "logps/rejected": -594.1285574776786, "loss": 0.0682, "rewards/chosen": 6.774432373046875, "rewards/margins": 24.45775146484375, "rewards/rejected": -17.683319091796875, "step": 3843 }, { "epoch": 0.9618416114099837, "grad_norm": 1.1328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -18305284.363636363, "logits/rejected": -57329984.0, "logps/chosen": -473.75803444602275, "logps/rejected": -720.1396484375, "loss": 0.0026, "rewards/chosen": 11.285125038840555, "rewards/margins": 34.99987019358815, "rewards/rejected": -23.714745154747597, "step": 3844 }, { "epoch": 0.9620918303515577, "grad_norm": 44.75, "kl": 12.56527328491211, "learning_rate": 5e-06, "logits/chosen": -55231540.36363637, "logits/rejected": -42694680.615384616, "logps/chosen": -408.63108132102275, "logps/rejected": -642.4805438701923, "loss": 0.0399, "rewards/chosen": 11.480680985884232, "rewards/margins": 28.094380091953944, "rewards/rejected": -16.61369910606971, "step": 3845 }, { "epoch": 0.9623420492931315, "grad_norm": 2.296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40305469.09090909, "logits/rejected": -54132647.384615384, "logps/chosen": -389.8980823863636, "logps/rejected": -569.9939152644231, "loss": 0.0108, "rewards/chosen": 9.961592934348367, "rewards/margins": 31.403055524492597, "rewards/rejected": -21.44146259014423, "step": 3846 }, { "epoch": 0.9625922682347053, "grad_norm": 2.734375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -48310518.15384615, "logits/rejected": -45653006.54545455, "logps/chosen": -386.89547025240387, "logps/rejected": -671.8385564630681, "loss": 0.0083, "rewards/chosen": 10.33529545710637, "rewards/margins": 33.40339543269231, "rewards/rejected": -23.068099975585938, "step": 3847 }, { "epoch": 0.9628424871762793, "grad_norm": 6.1875, "kl": 10.920272827148438, "learning_rate": 5e-06, "logits/chosen": -31669776.0, "logits/rejected": -24730280.0, "logps/chosen": -358.03662109375, "logps/rejected": -574.1449584960938, "loss": 0.0829, "rewards/chosen": 9.713077545166016, "rewards/margins": 25.816761016845703, "rewards/rejected": -16.103683471679688, "step": 3848 }, { "epoch": 0.9630927061178531, "grad_norm": 19.25, "kl": 19.09150505065918, "learning_rate": 5e-06, "logits/chosen": -26879507.692307692, "logits/rejected": -51334557.09090909, "logps/chosen": -380.67566856971155, "logps/rejected": -692.3667436079545, "loss": 0.0324, "rewards/chosen": 9.16677034818209, "rewards/margins": 30.5244432996203, "rewards/rejected": -21.35767295143821, "step": 3849 }, { "epoch": 0.963342925059427, "grad_norm": 1.140625, "kl": 8.34872055053711, "learning_rate": 5e-06, "logits/chosen": -37483531.428571425, "logits/rejected": -33689395.2, "logps/chosen": -421.77322823660717, "logps/rejected": -548.781103515625, "loss": 0.0485, "rewards/chosen": 10.449754987444196, "rewards/margins": 27.008191571916853, "rewards/rejected": -16.558436584472656, "step": 3850 }, { "epoch": 0.9635931440010008, "grad_norm": 6.5625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28091118.545454547, "logits/rejected": -34631202.461538464, "logps/chosen": -322.37690873579544, "logps/rejected": -469.8505859375, "loss": 0.0589, "rewards/chosen": 6.198031338778409, "rewards/margins": 25.14949195701759, "rewards/rejected": -18.95146061823918, "step": 3851 }, { "epoch": 0.9638433629425748, "grad_norm": 0.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -58119040.0, "logits/rejected": -43992969.14285714, "logps/chosen": -559.505126953125, "logps/rejected": -675.2760881696429, "loss": 0.0007, "rewards/chosen": 14.47528076171875, "rewards/margins": 37.27697099958147, "rewards/rejected": -22.801690237862722, "step": 3852 }, { "epoch": 0.9640935818841486, "grad_norm": 0.08056640625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40678481.45454545, "logits/rejected": -9905734.153846154, "logps/chosen": -371.78946200284093, "logps/rejected": -723.5649038461538, "loss": 0.0001, "rewards/chosen": 11.309200633655895, "rewards/margins": 34.0964086039083, "rewards/rejected": -22.787207970252403, "step": 3853 }, { "epoch": 0.9643438008257225, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55221164.307692304, "logits/rejected": -56022690.90909091, "logps/chosen": -383.1820537860577, "logps/rejected": -838.6635298295455, "loss": 0.066, "rewards/chosen": 9.849053016075722, "rewards/margins": 33.36286083301464, "rewards/rejected": -23.51380781693892, "step": 3854 }, { "epoch": 0.9645940197672964, "grad_norm": 3.21875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27051552.0, "logits/rejected": -59936955.733333334, "logps/chosen": -302.7287326388889, "logps/rejected": -613.5576171875, "loss": 0.0737, "rewards/chosen": 8.684432135687935, "rewards/margins": 28.053491380479603, "rewards/rejected": -19.369059244791668, "step": 3855 }, { "epoch": 0.9648442387088703, "grad_norm": 1.2109375, "kl": 4.168708801269531, "learning_rate": 5e-06, "logits/chosen": -32282508.8, "logits/rejected": -37779138.28571428, "logps/chosen": -463.106494140625, "logps/rejected": -662.2996651785714, "loss": 0.0049, "rewards/chosen": 10.612116241455078, "rewards/margins": 34.56391154697963, "rewards/rejected": -23.951795305524552, "step": 3856 }, { "epoch": 0.9650944576504441, "grad_norm": 3.328125, "kl": 11.12745475769043, "learning_rate": 5e-06, "logits/chosen": -32653936.0, "logits/rejected": 32110586.0, "logps/chosen": -425.4703369140625, "logps/rejected": -483.9393005371094, "loss": 0.0174, "rewards/chosen": 9.707277297973633, "rewards/margins": 24.246158599853516, "rewards/rejected": -14.538881301879883, "step": 3857 }, { "epoch": 0.9653446765920181, "grad_norm": 4.625, "kl": 8.11190414428711, "learning_rate": 5e-06, "logits/chosen": -31433381.647058822, "logits/rejected": -70687241.14285715, "logps/chosen": -364.31198299632354, "logps/rejected": -484.31856863839283, "loss": 0.0387, "rewards/chosen": 9.439571605009192, "rewards/margins": 23.662127582966782, "rewards/rejected": -14.222555977957589, "step": 3858 }, { "epoch": 0.9655948955335919, "grad_norm": 0.1298828125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56802035.2, "logits/rejected": -58600685.71428572, "logps/chosen": -420.225146484375, "logps/rejected": -973.6275809151786, "loss": 0.0004, "rewards/chosen": 11.7372802734375, "rewards/margins": 44.23220127650669, "rewards/rejected": -32.494921003069194, "step": 3859 }, { "epoch": 0.9658451144751657, "grad_norm": 12.5625, "kl": 5.907201290130615, "learning_rate": 5e-06, "logits/chosen": -23306130.285714287, "logits/rejected": -53998930.823529415, "logps/chosen": -324.04282924107144, "logps/rejected": -726.5144186580883, "loss": 0.0515, "rewards/chosen": 8.301190512520927, "rewards/margins": 33.49217131157883, "rewards/rejected": -25.190980799057904, "step": 3860 }, { "epoch": 0.9660953334167397, "grad_norm": 6.15625, "kl": 2.182804822921753, "learning_rate": 5e-06, "logits/chosen": -32028797.333333332, "logits/rejected": -8360502.666666667, "logps/chosen": -348.4102783203125, "logps/rejected": -620.0096842447916, "loss": 0.0432, "rewards/chosen": 8.913494110107422, "rewards/margins": 33.54720687866211, "rewards/rejected": -24.633712768554688, "step": 3861 }, { "epoch": 0.9663455523583135, "grad_norm": 2.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30353171.2, "logits/rejected": -52253929.14285714, "logps/chosen": -399.001708984375, "logps/rejected": -662.9752371651786, "loss": 0.0075, "rewards/chosen": 9.050270080566406, "rewards/margins": 27.36557115827288, "rewards/rejected": -18.315301077706472, "step": 3862 }, { "epoch": 0.9665957712998874, "grad_norm": 6.28125, "kl": 0.22182178497314453, "learning_rate": 5e-06, "logits/chosen": -57446326.85714286, "logits/rejected": -40376044.8, "logps/chosen": -392.30831473214283, "logps/rejected": -765.30146484375, "loss": 0.0139, "rewards/chosen": 9.939987182617188, "rewards/margins": 34.026486206054685, "rewards/rejected": -24.0864990234375, "step": 3863 }, { "epoch": 0.9668459902414612, "grad_norm": 12.3125, "kl": 23.338483810424805, "learning_rate": 5e-06, "logits/chosen": -40988644.571428575, "logits/rejected": -47086624.0, "logps/chosen": -399.99672154017856, "logps/rejected": -596.09453125, "loss": 0.089, "rewards/chosen": 9.718152727399554, "rewards/margins": 30.955344499860495, "rewards/rejected": -21.23719177246094, "step": 3864 }, { "epoch": 0.9670962091830352, "grad_norm": 13.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22877720.615384616, "logits/rejected": -46798528.0, "logps/chosen": -319.18299278846155, "logps/rejected": -564.1554509943181, "loss": 0.038, "rewards/chosen": 8.825396024263823, "rewards/margins": 29.192720853365387, "rewards/rejected": -20.367324829101562, "step": 3865 }, { "epoch": 0.967346428124609, "grad_norm": 1.6015625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -33983051.63636363, "logits/rejected": -50839020.307692304, "logps/chosen": -432.88858309659093, "logps/rejected": -716.7330979567307, "loss": 0.0159, "rewards/chosen": 11.987903941761363, "rewards/margins": 35.22811249419526, "rewards/rejected": -23.240208552433895, "step": 3866 }, { "epoch": 0.9675966470661829, "grad_norm": 3.6875, "kl": 0.5152873992919922, "learning_rate": 5e-06, "logits/chosen": -37303378.28571428, "logits/rejected": -38571161.6, "logps/chosen": -291.78201729910717, "logps/rejected": -690.518896484375, "loss": 0.024, "rewards/chosen": 8.523482186453682, "rewards/margins": 34.44908643450056, "rewards/rejected": -25.925604248046874, "step": 3867 }, { "epoch": 0.9678468660077568, "grad_norm": 1.078125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35910432.0, "logits/rejected": -69161115.42857143, "logps/chosen": -371.057666015625, "logps/rejected": -814.048828125, "loss": 0.0203, "rewards/chosen": 9.005654907226562, "rewards/margins": 35.64681178501674, "rewards/rejected": -26.641156877790177, "step": 3868 }, { "epoch": 0.9680970849493307, "grad_norm": 6.84375, "kl": 1.2629725933074951, "learning_rate": 5e-06, "logits/chosen": -46466568.0, "logits/rejected": -53591456.0, "logps/chosen": -415.37957763671875, "logps/rejected": -570.9844360351562, "loss": 0.0328, "rewards/chosen": 9.832305908203125, "rewards/margins": 26.621980667114258, "rewards/rejected": -16.789674758911133, "step": 3869 }, { "epoch": 0.9683473038909045, "grad_norm": 8.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47434880.0, "logits/rejected": -37783603.692307696, "logps/chosen": -287.724609375, "logps/rejected": -608.3063777043269, "loss": 0.0359, "rewards/chosen": 6.878135681152344, "rewards/margins": 32.91757612961989, "rewards/rejected": -26.03944044846755, "step": 3870 }, { "epoch": 0.9685975228324785, "grad_norm": 0.333984375, "kl": 2.0322751998901367, "learning_rate": 5e-06, "logits/chosen": -46891330.90909091, "logits/rejected": -53090422.15384615, "logps/chosen": -413.53178267045456, "logps/rejected": -651.9038461538462, "loss": 0.0409, "rewards/chosen": 10.19251181862571, "rewards/margins": 32.75693207854158, "rewards/rejected": -22.564420259915867, "step": 3871 }, { "epoch": 0.9688477417740523, "grad_norm": 6.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28443712.0, "logits/rejected": -38963366.4, "logps/chosen": -245.46820746527777, "logps/rejected": -587.5257161458334, "loss": 0.0472, "rewards/chosen": 6.622306399875217, "rewards/margins": 26.38343921237522, "rewards/rejected": -19.7611328125, "step": 3872 }, { "epoch": 0.9690979607156261, "grad_norm": 18.0, "kl": 9.448108673095703, "learning_rate": 5e-06, "logits/chosen": -25862889.411764707, "logits/rejected": -42743067.428571425, "logps/chosen": -322.56970932904414, "logps/rejected": -724.9907924107143, "loss": 0.0868, "rewards/chosen": 8.542974135454964, "rewards/margins": 33.611475199210545, "rewards/rejected": -25.06850106375558, "step": 3873 }, { "epoch": 0.9693481796572001, "grad_norm": 3.578125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21654961.230769232, "logits/rejected": -52294481.45454545, "logps/chosen": -327.39415564903845, "logps/rejected": -871.4437144886364, "loss": 0.0675, "rewards/chosen": 8.409715505746695, "rewards/margins": 34.32598914299812, "rewards/rejected": -25.91627363725142, "step": 3874 }, { "epoch": 0.969598398598774, "grad_norm": 2.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40782469.81818182, "logits/rejected": -33015707.076923076, "logps/chosen": -392.9646661931818, "logps/rejected": -594.09228515625, "loss": 0.0186, "rewards/chosen": 9.928966175426137, "rewards/margins": 28.39434280929032, "rewards/rejected": -18.46537663386418, "step": 3875 }, { "epoch": 0.9698486175403478, "grad_norm": 2.15625, "kl": 7.827136516571045, "learning_rate": 5e-06, "logits/chosen": -39313274.666666664, "logits/rejected": -41018488.0, "logps/chosen": -305.38478597005206, "logps/rejected": -684.7875162760416, "loss": 0.0141, "rewards/chosen": 8.029731750488281, "rewards/margins": 26.312808990478516, "rewards/rejected": -18.283077239990234, "step": 3876 }, { "epoch": 0.9700988364819216, "grad_norm": 21.375, "kl": 3.491133451461792, "learning_rate": 5e-06, "logits/chosen": -29397490.285714287, "logits/rejected": -62098118.4, "logps/chosen": -413.77476283482144, "logps/rejected": -873.984765625, "loss": 0.0394, "rewards/chosen": 10.881811959402901, "rewards/margins": 36.18558545793806, "rewards/rejected": -25.303773498535158, "step": 3877 }, { "epoch": 0.9703490554234956, "grad_norm": 11.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34417018.18181818, "logits/rejected": -27988539.076923076, "logps/chosen": -330.96395596590907, "logps/rejected": -464.8365009014423, "loss": 0.0591, "rewards/chosen": 9.189322731711648, "rewards/margins": 23.313269875266336, "rewards/rejected": -14.123947143554688, "step": 3878 }, { "epoch": 0.9705992743650694, "grad_norm": 6.96875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30721472.0, "logits/rejected": -62366771.2, "logps/chosen": -406.15001085069446, "logps/rejected": -579.9911458333333, "loss": 0.0339, "rewards/chosen": 11.351082695855034, "rewards/margins": 28.347450086805555, "rewards/rejected": -16.99636739095052, "step": 3879 }, { "epoch": 0.9708494933066433, "grad_norm": 8.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36149027.2, "logits/rejected": -75715451.42857143, "logps/chosen": -391.9965576171875, "logps/rejected": -921.5121372767857, "loss": 0.055, "rewards/chosen": 8.701179504394531, "rewards/margins": 40.30095018659319, "rewards/rejected": -31.59977068219866, "step": 3880 }, { "epoch": 0.9710997122482172, "grad_norm": 1.859375, "kl": 2.748319149017334, "learning_rate": 5e-06, "logits/chosen": -25719410.285714287, "logits/rejected": -53659865.6, "logps/chosen": -418.26572963169644, "logps/rejected": -591.8138671875, "loss": 0.0212, "rewards/chosen": 10.856331961495536, "rewards/margins": 30.9186519077846, "rewards/rejected": -20.062319946289062, "step": 3881 }, { "epoch": 0.9713499311897911, "grad_norm": 10.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65178725.333333336, "logits/rejected": -22346822.666666668, "logps/chosen": -329.09877522786456, "logps/rejected": -509.6519368489583, "loss": 0.0251, "rewards/chosen": 7.801107406616211, "rewards/margins": 24.29829216003418, "rewards/rejected": -16.49718475341797, "step": 3882 }, { "epoch": 0.9716001501313649, "grad_norm": 12.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30783202.46153846, "logits/rejected": -22821837.09090909, "logps/chosen": -384.0134840745192, "logps/rejected": -869.0830965909091, "loss": 0.0201, "rewards/chosen": 9.980858435997597, "rewards/margins": 30.422841745656687, "rewards/rejected": -20.44198330965909, "step": 3883 }, { "epoch": 0.9718503690729389, "grad_norm": 9.75, "kl": 21.838794708251953, "learning_rate": 5e-06, "logits/chosen": -32046446.933333334, "logits/rejected": -37591107.55555555, "logps/chosen": -435.53118489583335, "logps/rejected": -476.92670355902777, "loss": 0.101, "rewards/chosen": 9.048858642578125, "rewards/margins": 24.97481960720486, "rewards/rejected": -15.925960964626736, "step": 3884 }, { "epoch": 0.9721005880145127, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30191610.666666668, "logits/rejected": -39990896.0, "logps/chosen": -335.1617024739583, "logps/rejected": -777.8806966145834, "loss": 0.036, "rewards/chosen": 7.712012608846028, "rewards/margins": 29.219130833943684, "rewards/rejected": -21.507118225097656, "step": 3885 }, { "epoch": 0.9723508069560866, "grad_norm": 7.65625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56472936.0, "logits/rejected": -46947752.0, "logps/chosen": -413.96075439453125, "logps/rejected": -467.5013732910156, "loss": 0.0204, "rewards/chosen": 10.486227035522461, "rewards/margins": 28.210168838500977, "rewards/rejected": -17.723941802978516, "step": 3886 }, { "epoch": 0.9726010258976604, "grad_norm": 12.6875, "kl": 1.9150289297103882, "learning_rate": 5e-06, "logits/chosen": -35636491.63636363, "logits/rejected": -31300672.0, "logps/chosen": -275.99447354403407, "logps/rejected": -558.6902043269231, "loss": 0.0894, "rewards/chosen": 7.320476878773082, "rewards/margins": 24.61486821741491, "rewards/rejected": -17.294391338641827, "step": 3887 }, { "epoch": 0.9728512448392344, "grad_norm": 1.4296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37074065.23076923, "logits/rejected": -14339342.545454545, "logps/chosen": -360.14340444711536, "logps/rejected": -624.1553178267045, "loss": 0.0033, "rewards/chosen": 9.248656052809496, "rewards/margins": 27.727687702312338, "rewards/rejected": -18.47903164950284, "step": 3888 }, { "epoch": 0.9731014637808082, "grad_norm": 1.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55604992.0, "logits/rejected": -60936850.28571428, "logps/chosen": -310.69658203125, "logps/rejected": -687.2011021205357, "loss": 0.0265, "rewards/chosen": 9.678225708007812, "rewards/margins": 32.72256774902344, "rewards/rejected": -23.044342041015625, "step": 3889 }, { "epoch": 0.973351682722382, "grad_norm": 8.75, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -23144411.636363637, "logits/rejected": -60289777.23076923, "logps/chosen": -309.78635475852275, "logps/rejected": -668.6854717548077, "loss": 0.0509, "rewards/chosen": 7.271749323064631, "rewards/margins": 31.78096328415237, "rewards/rejected": -24.50921396108774, "step": 3890 }, { "epoch": 0.973601901663956, "grad_norm": 13.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42042023.384615384, "logits/rejected": -47910059.63636363, "logps/chosen": -419.7848557692308, "logps/rejected": -693.4315074573864, "loss": 0.0206, "rewards/chosen": 10.259602473332333, "rewards/margins": 33.03318146392182, "rewards/rejected": -22.77357899058949, "step": 3891 }, { "epoch": 0.9738521206055298, "grad_norm": 2.5, "kl": 2.666719436645508, "learning_rate": 5e-06, "logits/chosen": -40335667.2, "logits/rejected": -25222595.555555556, "logps/chosen": -388.22294921875, "logps/rejected": -792.2281901041666, "loss": 0.03, "rewards/chosen": 10.124489339192708, "rewards/margins": 34.141579182942706, "rewards/rejected": -24.01708984375, "step": 3892 }, { "epoch": 0.9741023395471037, "grad_norm": 6.25, "kl": 2.3579535484313965, "learning_rate": 5e-06, "logits/chosen": -9998268.666666666, "logits/rejected": -43604661.333333336, "logps/chosen": -529.3503011067709, "logps/rejected": -491.5458984375, "loss": 0.0069, "rewards/chosen": 12.51663589477539, "rewards/margins": 29.207677205403645, "rewards/rejected": -16.691041310628254, "step": 3893 }, { "epoch": 0.9743525584886776, "grad_norm": 11.4375, "kl": 1.738030195236206, "learning_rate": 5e-06, "logits/chosen": -70412578.46153846, "logits/rejected": -11246549.818181818, "logps/chosen": -367.69936899038464, "logps/rejected": -460.8216441761364, "loss": 0.046, "rewards/chosen": 8.68379387488732, "rewards/margins": 21.446622355000954, "rewards/rejected": -12.762828480113637, "step": 3894 }, { "epoch": 0.9746027774302515, "grad_norm": 18.625, "kl": 2.904693841934204, "learning_rate": 5e-06, "logits/chosen": -54031064.0, "logits/rejected": -35670252.0, "logps/chosen": -474.55517578125, "logps/rejected": -795.219970703125, "loss": 0.0179, "rewards/chosen": 10.20977783203125, "rewards/margins": 27.902626037597656, "rewards/rejected": -17.692848205566406, "step": 3895 }, { "epoch": 0.9748529963718253, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37931451.428571425, "logits/rejected": -41357702.4, "logps/chosen": -309.04091099330356, "logps/rejected": -837.5548828125, "loss": 0.0072, "rewards/chosen": 8.340540749686104, "rewards/margins": 37.24057126726423, "rewards/rejected": -28.900030517578124, "step": 3896 }, { "epoch": 0.9751032153133993, "grad_norm": 2.0, "kl": 5.703559875488281, "learning_rate": 5e-06, "logits/chosen": -23395910.4, "logits/rejected": -52158144.0, "logps/chosen": -377.62623697916666, "logps/rejected": -832.6595594618055, "loss": 0.0294, "rewards/chosen": 9.456494140625, "rewards/margins": 33.989771864149304, "rewards/rejected": -24.533277723524307, "step": 3897 }, { "epoch": 0.9753534342549731, "grad_norm": 12.8125, "kl": 4.150864601135254, "learning_rate": 5e-06, "logits/chosen": -40384648.53333333, "logits/rejected": -59752149.333333336, "logps/chosen": -387.2018229166667, "logps/rejected": -725.9015299479166, "loss": 0.0694, "rewards/chosen": 7.4949900309244795, "rewards/margins": 32.71932813856337, "rewards/rejected": -25.22433810763889, "step": 3898 }, { "epoch": 0.975603653196547, "grad_norm": 1.3828125, "kl": 13.705760955810547, "learning_rate": 5e-06, "logits/chosen": -62480768.0, "logits/rejected": -27682688.0, "logps/chosen": -411.98057338169644, "logps/rejected": -552.70419921875, "loss": 0.0303, "rewards/chosen": 9.918835231236049, "rewards/margins": 30.28584267752511, "rewards/rejected": -20.367007446289062, "step": 3899 }, { "epoch": 0.9758538721381208, "grad_norm": 2.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -44139254.85714286, "logits/rejected": -78279788.8, "logps/chosen": -419.72544642857144, "logps/rejected": -546.863525390625, "loss": 0.0423, "rewards/chosen": 11.096158708844866, "rewards/margins": 30.186109270368306, "rewards/rejected": -19.089950561523438, "step": 3900 }, { "epoch": 0.9761040910796948, "grad_norm": 0.94140625, "kl": 5.185084819793701, "learning_rate": 5e-06, "logits/chosen": -41667108.0, "logits/rejected": -48074820.0, "logps/chosen": -320.8394775390625, "logps/rejected": -763.2186889648438, "loss": 0.0221, "rewards/chosen": 9.117944717407227, "rewards/margins": 27.3349666595459, "rewards/rejected": -18.217021942138672, "step": 3901 }, { "epoch": 0.9763543100212686, "grad_norm": 12.75, "kl": 0.5974782705307007, "learning_rate": 5e-06, "logits/chosen": -40336996.92307692, "logits/rejected": -26020381.09090909, "logps/chosen": -335.4240534855769, "logps/rejected": -677.9532137784091, "loss": 0.0297, "rewards/chosen": 8.151806171123798, "rewards/margins": 30.634008874426357, "rewards/rejected": -22.48220270330256, "step": 3902 }, { "epoch": 0.9766045289628424, "grad_norm": 12.9375, "kl": 24.269527435302734, "learning_rate": 5e-06, "logits/chosen": -43479261.86666667, "logits/rejected": 18193614.222222224, "logps/chosen": -381.3014322916667, "logps/rejected": -502.84190538194446, "loss": 0.0774, "rewards/chosen": 9.11646728515625, "rewards/margins": 23.820362006293404, "rewards/rejected": -14.703894721137154, "step": 3903 }, { "epoch": 0.9768547479044164, "grad_norm": 0.73046875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45116666.18181818, "logits/rejected": -45631261.538461536, "logps/chosen": -353.5690252130682, "logps/rejected": -862.0533353365385, "loss": 0.0134, "rewards/chosen": 9.191687150435014, "rewards/margins": 31.392621927328044, "rewards/rejected": -22.20093477689303, "step": 3904 }, { "epoch": 0.9771049668459902, "grad_norm": 1.3359375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -52801926.4, "logits/rejected": -35263286.85714286, "logps/chosen": -283.116552734375, "logps/rejected": -716.5625697544643, "loss": 0.044, "rewards/chosen": 7.934872436523437, "rewards/margins": 32.24507053920201, "rewards/rejected": -24.310198102678573, "step": 3905 }, { "epoch": 0.9773551857875641, "grad_norm": 4.25, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -57245705.84615385, "logits/rejected": -44357844.36363637, "logps/chosen": -545.3950946514423, "logps/rejected": -695.5963245738636, "loss": 0.0078, "rewards/chosen": 12.219673743614784, "rewards/margins": 35.781211052741206, "rewards/rejected": -23.56153730912642, "step": 3906 }, { "epoch": 0.977605404729138, "grad_norm": 24.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27104283.076923076, "logits/rejected": -55196392.72727273, "logps/chosen": -344.4867412860577, "logps/rejected": -601.3670987215909, "loss": 0.0694, "rewards/chosen": 8.338092510516827, "rewards/margins": 28.400448245602053, "rewards/rejected": -20.062355735085227, "step": 3907 }, { "epoch": 0.9778556236707119, "grad_norm": 4.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -39075859.2, "logits/rejected": -49144667.428571425, "logps/chosen": -371.681787109375, "logps/rejected": -664.4563337053571, "loss": 0.0437, "rewards/chosen": 8.994275665283203, "rewards/margins": 27.77451858520508, "rewards/rejected": -18.780242919921875, "step": 3908 }, { "epoch": 0.9781058426122857, "grad_norm": 1.1328125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32540375.272727273, "logits/rejected": -63902168.615384616, "logps/chosen": -351.01700106534093, "logps/rejected": -977.1657902644231, "loss": 0.0083, "rewards/chosen": 9.339064858176492, "rewards/margins": 40.9016165033087, "rewards/rejected": -31.56255164513221, "step": 3909 }, { "epoch": 0.9783560615538597, "grad_norm": 1.3515625, "kl": 3.9817867279052734, "learning_rate": 5e-06, "logits/chosen": -30538598.4, "logits/rejected": -56442843.428571425, "logps/chosen": -390.345556640625, "logps/rejected": -614.03173828125, "loss": 0.0154, "rewards/chosen": 9.918637084960938, "rewards/margins": 26.445374843052456, "rewards/rejected": -16.526737758091517, "step": 3910 }, { "epoch": 0.9786062804954335, "grad_norm": 14.0, "kl": 14.499471664428711, "learning_rate": 5e-06, "logits/chosen": -49904075.63636363, "logits/rejected": 8439950.76923077, "logps/chosen": -375.97469815340907, "logps/rejected": -803.7215294471154, "loss": 0.0565, "rewards/chosen": 11.516175703568893, "rewards/margins": 39.51285163505928, "rewards/rejected": -27.996675931490383, "step": 3911 }, { "epoch": 0.9788564994370074, "grad_norm": 2.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42368813.333333336, "logits/rejected": -66052773.333333336, "logps/chosen": -330.16859944661456, "logps/rejected": -848.3619791666666, "loss": 0.0058, "rewards/chosen": 8.415872573852539, "rewards/margins": 37.10623613993327, "rewards/rejected": -28.69036356608073, "step": 3912 }, { "epoch": 0.9791067183785812, "grad_norm": 5.71875, "kl": 4.7139410972595215, "learning_rate": 5e-06, "logits/chosen": -28923680.0, "logits/rejected": -36269680.0, "logps/chosen": -341.94698660714283, "logps/rejected": -634.24462890625, "loss": 0.0419, "rewards/chosen": 8.72720227922712, "rewards/margins": 33.89705396379743, "rewards/rejected": -25.16985168457031, "step": 3913 }, { "epoch": 0.9793569373201552, "grad_norm": 5.78125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -40997098.666666664, "logits/rejected": -52620080.0, "logps/chosen": -400.3782552083333, "logps/rejected": -768.3256022135416, "loss": 0.0252, "rewards/chosen": 10.45553716023763, "rewards/margins": 33.093939463297524, "rewards/rejected": -22.638402303059895, "step": 3914 }, { "epoch": 0.979607156261729, "grad_norm": 10.5, "kl": 20.873537063598633, "learning_rate": 5e-06, "logits/chosen": -50235899.07692308, "logits/rejected": 24579252.363636363, "logps/chosen": -438.91744290865387, "logps/rejected": -625.6250887784091, "loss": 0.1335, "rewards/chosen": 9.434246356670673, "rewards/margins": 26.384742683463998, "rewards/rejected": -16.950496326793324, "step": 3915 }, { "epoch": 0.9798573752033028, "grad_norm": 8.25, "kl": 1.583831787109375, "learning_rate": 5e-06, "logits/chosen": -52324489.14285714, "logits/rejected": -56690886.4, "logps/chosen": -384.96470424107144, "logps/rejected": -531.48134765625, "loss": 0.0524, "rewards/chosen": 9.045016697474889, "rewards/margins": 28.605673435756138, "rewards/rejected": -19.56065673828125, "step": 3916 }, { "epoch": 0.9801075941448768, "grad_norm": 1.609375, "kl": 10.239280700683594, "learning_rate": 5e-06, "logits/chosen": -45535824.0, "logits/rejected": -44611008.0, "logps/chosen": -447.91259765625, "logps/rejected": -914.749755859375, "loss": 0.0034, "rewards/chosen": 12.477359771728516, "rewards/margins": 36.78392219543457, "rewards/rejected": -24.306562423706055, "step": 3917 }, { "epoch": 0.9803578130864506, "grad_norm": 21.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": 1225360.0, "logits/rejected": -31190221.714285713, "logps/chosen": -282.1539306640625, "logps/rejected": -556.0284598214286, "loss": 0.0114, "rewards/chosen": 7.905995178222656, "rewards/margins": 25.902686200823105, "rewards/rejected": -17.996691022600448, "step": 3918 }, { "epoch": 0.9806080320280245, "grad_norm": 14.8125, "kl": 20.088218688964844, "learning_rate": 5e-06, "logits/chosen": -17907869.714285713, "logits/rejected": -44431916.8, "logps/chosen": -367.9154575892857, "logps/rejected": -637.282177734375, "loss": 0.073, "rewards/chosen": 7.657320840018136, "rewards/margins": 28.288558632986888, "rewards/rejected": -20.63123779296875, "step": 3919 }, { "epoch": 0.9808582509695984, "grad_norm": 2.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32359758.222222224, "logits/rejected": -40579891.2, "logps/chosen": -370.8440212673611, "logps/rejected": -567.5607421875, "loss": 0.0211, "rewards/chosen": 11.505711873372396, "rewards/margins": 29.10923563639323, "rewards/rejected": -17.603523763020835, "step": 3920 }, { "epoch": 0.9811084699111723, "grad_norm": 5.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16098978.909090908, "logits/rejected": -35136462.76923077, "logps/chosen": -456.0098987926136, "logps/rejected": -618.2701322115385, "loss": 0.0755, "rewards/chosen": 12.162471424449574, "rewards/margins": 28.333258355414117, "rewards/rejected": -16.170786930964542, "step": 3921 }, { "epoch": 0.9813586888527461, "grad_norm": 8.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -19941942.85714286, "logits/rejected": -51920018.823529415, "logps/chosen": -263.69559151785717, "logps/rejected": -710.2635569852941, "loss": 0.0139, "rewards/chosen": 8.911597115652901, "rewards/margins": 24.146363458713562, "rewards/rejected": -15.234766343060661, "step": 3922 }, { "epoch": 0.9816089077943201, "grad_norm": 15.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29590700.307692308, "logits/rejected": -24099643.636363637, "logps/chosen": -379.6750300480769, "logps/rejected": -720.9930752840909, "loss": 0.0308, "rewards/chosen": 10.717064490685097, "rewards/margins": 30.015029800521745, "rewards/rejected": -19.29796530983665, "step": 3923 }, { "epoch": 0.9818591267358939, "grad_norm": 4.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -46414090.666666664, "logits/rejected": -19778786.666666668, "logps/chosen": -463.7242024739583, "logps/rejected": -541.4361979166666, "loss": 0.0068, "rewards/chosen": 11.843819936116537, "rewards/margins": 27.001853942871094, "rewards/rejected": -15.158034006754557, "step": 3924 }, { "epoch": 0.9821093456774678, "grad_norm": 1.78125, "kl": 25.73027801513672, "learning_rate": 5e-06, "logits/chosen": -41849339.428571425, "logits/rejected": 490687.6, "logps/chosen": -412.93603515625, "logps/rejected": -559.054052734375, "loss": 0.0052, "rewards/chosen": 12.351627894810267, "rewards/margins": 30.023913356236047, "rewards/rejected": -17.67228546142578, "step": 3925 }, { "epoch": 0.9823595646190416, "grad_norm": 35.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -38446700.0, "logits/rejected": -36006440.0, "logps/chosen": -453.05517578125, "logps/rejected": -479.0089416503906, "loss": 0.1024, "rewards/chosen": 10.47886848449707, "rewards/margins": 21.55883026123047, "rewards/rejected": -11.079961776733398, "step": 3926 }, { "epoch": 0.9826097835606156, "grad_norm": 2.609375, "kl": 4.016099452972412, "learning_rate": 5e-06, "logits/chosen": -35379386.666666664, "logits/rejected": -29712288.0, "logps/chosen": -424.4717610677083, "logps/rejected": -672.4393717447916, "loss": 0.0037, "rewards/chosen": 11.027623494466146, "rewards/margins": 29.700346628824867, "rewards/rejected": -18.672723134358723, "step": 3927 }, { "epoch": 0.9828600025021894, "grad_norm": 3.59375, "kl": 5.973641395568848, "learning_rate": 5e-06, "logits/chosen": -34025917.333333336, "logits/rejected": -6558381.333333333, "logps/chosen": -381.3688557942708, "logps/rejected": -845.018798828125, "loss": 0.0515, "rewards/chosen": 9.177377065022787, "rewards/margins": 33.514400482177734, "rewards/rejected": -24.33702341715495, "step": 3928 }, { "epoch": 0.9831102214437633, "grad_norm": 16.625, "kl": 19.98219108581543, "learning_rate": 5e-06, "logits/chosen": -38877200.0, "logits/rejected": -34899512.0, "logps/chosen": -369.6253967285156, "logps/rejected": -837.4268798828125, "loss": 0.0471, "rewards/chosen": 8.4776611328125, "rewards/margins": 35.82998085021973, "rewards/rejected": -27.352319717407227, "step": 3929 }, { "epoch": 0.9833604403853372, "grad_norm": 9.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -65620986.666666664, "logits/rejected": -42909477.333333336, "logps/chosen": -499.5901692708333, "logps/rejected": -730.4905598958334, "loss": 0.0364, "rewards/chosen": 13.0157839457194, "rewards/margins": 36.617977142333984, "rewards/rejected": -23.602193196614582, "step": 3930 }, { "epoch": 0.983610659326911, "grad_norm": 12.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -56220197.333333336, "logits/rejected": -53672437.333333336, "logps/chosen": -245.3282674153646, "logps/rejected": -603.2243923611111, "loss": 0.0502, "rewards/chosen": 7.396557490030925, "rewards/margins": 26.250307083129883, "rewards/rejected": -18.853749593098957, "step": 3931 }, { "epoch": 0.9838608782684849, "grad_norm": 12.375, "kl": 8.447867393493652, "learning_rate": 5e-06, "logits/chosen": -23518124.0, "logits/rejected": -29392884.0, "logps/chosen": -415.67431640625, "logps/rejected": -655.8687744140625, "loss": 0.0207, "rewards/chosen": 10.663930892944336, "rewards/margins": 27.354080200195312, "rewards/rejected": -16.690149307250977, "step": 3932 }, { "epoch": 0.9841110972100588, "grad_norm": 4.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37172004.92307692, "logits/rejected": -31579182.545454547, "logps/chosen": -215.589599609375, "logps/rejected": -777.7120028409091, "loss": 0.0334, "rewards/chosen": 7.376846900353065, "rewards/margins": 34.59776364839994, "rewards/rejected": -27.220916748046875, "step": 3933 }, { "epoch": 0.9843613161516327, "grad_norm": 17.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -16495175.272727273, "logits/rejected": -34304839.384615384, "logps/chosen": -328.09086470170456, "logps/rejected": -702.5994591346154, "loss": 0.0614, "rewards/chosen": 7.887223677201704, "rewards/margins": 29.041191901360357, "rewards/rejected": -21.153968224158653, "step": 3934 }, { "epoch": 0.9846115350932065, "grad_norm": 6.28125, "kl": 2.333127975463867, "learning_rate": 5e-06, "logits/chosen": -49624749.176470585, "logits/rejected": -49470162.28571428, "logps/chosen": -320.70726102941177, "logps/rejected": -690.4435686383929, "loss": 0.0567, "rewards/chosen": 8.330571791704964, "rewards/margins": 28.948365283613448, "rewards/rejected": -20.617793491908483, "step": 3935 }, { "epoch": 0.9848617540347804, "grad_norm": 6.34375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -27207965.09090909, "logits/rejected": -72611357.53846154, "logps/chosen": -364.87564364346593, "logps/rejected": -817.0207331730769, "loss": 0.0128, "rewards/chosen": 8.27528936212713, "rewards/margins": 33.014716195059826, "rewards/rejected": -24.739426832932693, "step": 3936 }, { "epoch": 0.9851119729763543, "grad_norm": 0.984375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -22068662.4, "logits/rejected": -29371730.285714287, "logps/chosen": -471.294189453125, "logps/rejected": -708.8239397321429, "loss": 0.0163, "rewards/chosen": 10.50101547241211, "rewards/margins": 36.74225692749023, "rewards/rejected": -26.241241455078125, "step": 3937 }, { "epoch": 0.9853621919179282, "grad_norm": 14.375, "kl": 0.10368029773235321, "learning_rate": 5e-06, "logits/chosen": -48363296.0, "logits/rejected": -45868937.6, "logps/chosen": -331.83921595982144, "logps/rejected": -943.74765625, "loss": 0.0734, "rewards/chosen": 7.449642726353237, "rewards/margins": 43.6378415788923, "rewards/rejected": -36.18819885253906, "step": 3938 }, { "epoch": 0.985612410859502, "grad_norm": 9.875, "kl": 3.3190131187438965, "learning_rate": 5e-06, "logits/chosen": -46969557.333333336, "logits/rejected": -36867586.666666664, "logps/chosen": -377.8224690755208, "logps/rejected": -687.1456705729166, "loss": 0.0845, "rewards/chosen": 11.234382629394531, "rewards/margins": 32.21994908650716, "rewards/rejected": -20.98556645711263, "step": 3939 }, { "epoch": 0.985862629801076, "grad_norm": 14.4375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -45813632.0, "logits/rejected": -35051516.0, "logps/chosen": -415.10235595703125, "logps/rejected": -678.3410034179688, "loss": 0.0195, "rewards/chosen": 9.014588356018066, "rewards/margins": 30.950024604797363, "rewards/rejected": -21.935436248779297, "step": 3940 }, { "epoch": 0.9861128487426498, "grad_norm": 6.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31144051.2, "logits/rejected": -62658733.71428572, "logps/chosen": -285.29208984375, "logps/rejected": -733.9418247767857, "loss": 0.1087, "rewards/chosen": 4.349193572998047, "rewards/margins": 28.8267457144601, "rewards/rejected": -24.477552141462052, "step": 3941 }, { "epoch": 0.9863630676842237, "grad_norm": 10.4375, "kl": 9.000922203063965, "learning_rate": 5e-06, "logits/chosen": -58100829.86666667, "logits/rejected": -36515790.222222224, "logps/chosen": -447.3507486979167, "logps/rejected": -619.4696723090278, "loss": 0.0661, "rewards/chosen": 9.913232421875, "rewards/margins": 29.73987053765191, "rewards/rejected": -19.82663811577691, "step": 3942 }, { "epoch": 0.9866132866257976, "grad_norm": 10.0, "kl": 5.4238386154174805, "learning_rate": 5e-06, "logits/chosen": -29101499.733333334, "logits/rejected": -75156103.1111111, "logps/chosen": -363.73313802083334, "logps/rejected": -890.9756944444445, "loss": 0.0891, "rewards/chosen": 7.556283569335937, "rewards/margins": 39.38298102484809, "rewards/rejected": -31.826697455512154, "step": 3943 }, { "epoch": 0.9868635055673715, "grad_norm": 13.4375, "kl": 2.114741802215576, "learning_rate": 5e-06, "logits/chosen": -42494712.88888889, "logits/rejected": -54532599.46666667, "logps/chosen": -531.55712890625, "logps/rejected": -703.9520833333333, "loss": 0.0335, "rewards/chosen": 12.468458387586805, "rewards/margins": 38.12098931206597, "rewards/rejected": -25.652530924479166, "step": 3944 }, { "epoch": 0.9871137245089453, "grad_norm": 0.96484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -53907702.15384615, "logits/rejected": -57878446.54545455, "logps/chosen": -391.70616736778845, "logps/rejected": -626.0774591619319, "loss": 0.0014, "rewards/chosen": 8.92250706599309, "rewards/margins": 32.94486535345758, "rewards/rejected": -24.02235828746449, "step": 3945 }, { "epoch": 0.9873639434505193, "grad_norm": 1.9296875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -29963148.8, "logits/rejected": -19879853.714285713, "logps/chosen": -417.868017578125, "logps/rejected": -664.2801339285714, "loss": 0.0137, "rewards/chosen": 10.50127716064453, "rewards/margins": 31.27513972691127, "rewards/rejected": -20.77386256626674, "step": 3946 }, { "epoch": 0.9876141623920931, "grad_norm": 0.72265625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -51812809.14285714, "logits/rejected": -58584884.705882356, "logps/chosen": -393.245849609375, "logps/rejected": -762.5880055147059, "loss": 0.0023, "rewards/chosen": 10.252015250069755, "rewards/margins": 39.32556979395762, "rewards/rejected": -29.073554543887866, "step": 3947 }, { "epoch": 0.9878643813336669, "grad_norm": 3.8125, "kl": 1.9092109203338623, "learning_rate": 5e-06, "logits/chosen": -28000529.230769232, "logits/rejected": -69654946.9090909, "logps/chosen": -329.1393479567308, "logps/rejected": -607.4361239346591, "loss": 0.0223, "rewards/chosen": 8.90884047288161, "rewards/margins": 27.891068145111724, "rewards/rejected": -18.982227672230113, "step": 3948 }, { "epoch": 0.9881146002752408, "grad_norm": 7.125, "kl": 2.0589828491210938, "learning_rate": 5e-06, "logits/chosen": -45497109.333333336, "logits/rejected": -76610640.0, "logps/chosen": -333.2631564670139, "logps/rejected": -322.0388997395833, "loss": 0.0683, "rewards/chosen": 8.318917168511284, "rewards/margins": 16.71895429823134, "rewards/rejected": -8.400037129720053, "step": 3949 }, { "epoch": 0.9883648192168147, "grad_norm": 1.6796875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47325410.461538464, "logits/rejected": -45818845.09090909, "logps/chosen": -321.19542518028845, "logps/rejected": -651.3072620738636, "loss": 0.019, "rewards/chosen": 8.775801438551683, "rewards/margins": 33.89396550105168, "rewards/rejected": -25.1181640625, "step": 3950 }, { "epoch": 0.9886150381583886, "grad_norm": 0.130859375, "kl": 4.493025302886963, "learning_rate": 5e-06, "logits/chosen": -47206528.0, "logits/rejected": -26520050.666666668, "logps/chosen": -453.1259765625, "logps/rejected": -566.3223470052084, "loss": 0.0003, "rewards/chosen": 12.524405161539713, "rewards/margins": 33.82267506917318, "rewards/rejected": -21.298269907633465, "step": 3951 }, { "epoch": 0.9888652570999624, "grad_norm": 1.09375, "kl": 2.585041046142578, "learning_rate": 5e-06, "logits/chosen": -32735785.14285714, "logits/rejected": 131204300.8, "logps/chosen": -406.31703404017856, "logps/rejected": -747.648388671875, "loss": 0.0015, "rewards/chosen": 9.250094822474889, "rewards/margins": 32.4478273664202, "rewards/rejected": -23.197732543945314, "step": 3952 }, { "epoch": 0.9891154760415364, "grad_norm": 4.03125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -20406521.846153848, "logits/rejected": -43217384.72727273, "logps/chosen": -308.88037109375, "logps/rejected": -609.6330788352273, "loss": 0.0343, "rewards/chosen": 7.953879136305589, "rewards/margins": 26.958494226415674, "rewards/rejected": -19.004615090110086, "step": 3953 }, { "epoch": 0.9893656949831102, "grad_norm": 2.625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -32097112.0, "logits/rejected": -38692576.0, "logps/chosen": -309.71502685546875, "logps/rejected": -725.3766276041666, "loss": 0.0342, "rewards/chosen": 8.32809321085612, "rewards/margins": 28.434861501057945, "rewards/rejected": -20.106768290201824, "step": 3954 }, { "epoch": 0.9896159139246841, "grad_norm": 3.71875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -43478304.0, "logits/rejected": -56707520.0, "logps/chosen": -361.45478515625, "logps/rejected": -623.1678292410714, "loss": 0.0142, "rewards/chosen": 7.855131530761719, "rewards/margins": 30.855000741141183, "rewards/rejected": -22.999869210379465, "step": 3955 }, { "epoch": 0.989866132866258, "grad_norm": 3.796875, "kl": 5.011376857757568, "learning_rate": 5e-06, "logits/chosen": -30940270.222222224, "logits/rejected": -64041518.93333333, "logps/chosen": -415.21533203125, "logps/rejected": -659.2696614583333, "loss": 0.0382, "rewards/chosen": 10.734215630425346, "rewards/margins": 34.966053602430556, "rewards/rejected": -24.23183797200521, "step": 3956 }, { "epoch": 0.9901163518078319, "grad_norm": 12.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -34440978.666666664, "logits/rejected": -43845645.333333336, "logps/chosen": -334.89255777994794, "logps/rejected": -638.4994710286459, "loss": 0.0406, "rewards/chosen": 8.733612696329752, "rewards/margins": 29.712447484334312, "rewards/rejected": -20.97883478800456, "step": 3957 }, { "epoch": 0.9903665707494057, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30342094.0, "logits/rejected": -5984259.0, "logps/chosen": -271.0609436035156, "logps/rejected": -572.8228149414062, "loss": 0.0851, "rewards/chosen": 6.186017036437988, "rewards/margins": 30.027365684509277, "rewards/rejected": -23.84134864807129, "step": 3958 }, { "epoch": 0.9906167896909797, "grad_norm": 7.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21171751.384615384, "logits/rejected": -50809890.90909091, "logps/chosen": -336.90981820913464, "logps/rejected": -546.9691051136364, "loss": 0.0215, "rewards/chosen": 8.231268075796274, "rewards/margins": 28.721452512941163, "rewards/rejected": -20.490184437144887, "step": 3959 }, { "epoch": 0.9908670086325535, "grad_norm": 2.234375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37973939.2, "logits/rejected": -38702560.0, "logps/chosen": -346.8701904296875, "logps/rejected": -716.72900390625, "loss": 0.0497, "rewards/chosen": 8.475537109375, "rewards/margins": 34.29376133510045, "rewards/rejected": -25.818224225725448, "step": 3960 }, { "epoch": 0.9911172275741273, "grad_norm": 9.3125, "kl": 4.190396785736084, "learning_rate": 5e-06, "logits/chosen": -6810541.6, "logits/rejected": -45286345.14285714, "logps/chosen": -205.781982421875, "logps/rejected": -556.1872907366071, "loss": 0.0575, "rewards/chosen": 7.310523986816406, "rewards/margins": 27.405501229422434, "rewards/rejected": -20.094977242606028, "step": 3961 }, { "epoch": 0.9913674465157012, "grad_norm": 18.0, "kl": 1.38067626953125, "learning_rate": 5e-06, "logits/chosen": -76666453.33333333, "logits/rejected": -48690165.333333336, "logps/chosen": -442.7099202473958, "logps/rejected": -673.058349609375, "loss": 0.0531, "rewards/chosen": 11.765276590983072, "rewards/margins": 31.92256418863932, "rewards/rejected": -20.15728759765625, "step": 3962 }, { "epoch": 0.9916176654572751, "grad_norm": 19.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36729674.666666664, "logits/rejected": -47037354.666666664, "logps/chosen": -320.2273356119792, "logps/rejected": -807.5828450520834, "loss": 0.0877, "rewards/chosen": 7.414924621582031, "rewards/margins": 32.027104695638016, "rewards/rejected": -24.61218007405599, "step": 3963 }, { "epoch": 0.991867884398849, "grad_norm": 4.15625, "kl": 15.500907897949219, "learning_rate": 5e-06, "logits/chosen": -49859464.0, "logits/rejected": -37097800.0, "logps/chosen": -449.969482421875, "logps/rejected": -564.3818969726562, "loss": 0.0132, "rewards/chosen": 10.481056213378906, "rewards/margins": 26.549829483032227, "rewards/rejected": -16.06877326965332, "step": 3964 }, { "epoch": 0.9921181033404228, "grad_norm": 15.8125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37106672.0, "logits/rejected": -27229385.14285714, "logps/chosen": -383.689111328125, "logps/rejected": -489.959716796875, "loss": 0.0707, "rewards/chosen": 10.878845977783204, "rewards/margins": 26.086871228899277, "rewards/rejected": -15.208025251116071, "step": 3965 }, { "epoch": 0.9923683222819968, "grad_norm": 7.75, "kl": 4.65280294418335, "learning_rate": 5e-06, "logits/chosen": -15188214.153846154, "logits/rejected": -47066257.45454545, "logps/chosen": -324.2204777644231, "logps/rejected": -857.1129261363636, "loss": 0.0595, "rewards/chosen": 8.03616685133714, "rewards/margins": 36.50703643585419, "rewards/rejected": -28.470869584517047, "step": 3966 }, { "epoch": 0.9926185412235706, "grad_norm": 8.5625, "kl": 12.692632675170898, "learning_rate": 5e-06, "logits/chosen": -24829265.454545453, "logits/rejected": -45168477.538461536, "logps/chosen": -431.27028586647725, "logps/rejected": -725.1931340144231, "loss": 0.0318, "rewards/chosen": 11.118859724564986, "rewards/margins": 37.536889509721235, "rewards/rejected": -26.41802978515625, "step": 3967 }, { "epoch": 0.9928687601651445, "grad_norm": 2.90625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -62926003.2, "logits/rejected": -35090912.0, "logps/chosen": -567.55556640625, "logps/rejected": -787.8756277901786, "loss": 0.017, "rewards/chosen": 10.352887725830078, "rewards/margins": 36.05899908883231, "rewards/rejected": -25.706111363002233, "step": 3968 }, { "epoch": 0.9931189791067184, "grad_norm": 1.7734375, "kl": 6.0283002853393555, "learning_rate": 5e-06, "logits/chosen": -29337501.53846154, "logits/rejected": -62914682.18181818, "logps/chosen": -418.9519230769231, "logps/rejected": -839.6400035511364, "loss": 0.0214, "rewards/chosen": 11.14601839505709, "rewards/margins": 42.964877148608224, "rewards/rejected": -31.818858753551137, "step": 3969 }, { "epoch": 0.9933691980482923, "grad_norm": 5.875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42198946.461538464, "logits/rejected": -34953393.45454545, "logps/chosen": -377.6365309495192, "logps/rejected": -593.7579900568181, "loss": 0.0239, "rewards/chosen": 7.9718757042518025, "rewards/margins": 24.810108958424387, "rewards/rejected": -16.838233254172586, "step": 3970 }, { "epoch": 0.9936194169898661, "grad_norm": 14.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -24040375.272727273, "logits/rejected": -46528280.615384616, "logps/chosen": -339.61372514204544, "logps/rejected": -547.3669621394231, "loss": 0.0402, "rewards/chosen": 8.295546791770242, "rewards/margins": 27.063933632590555, "rewards/rejected": -18.768386840820312, "step": 3971 }, { "epoch": 0.9938696359314401, "grad_norm": 5.53125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30519040.0, "logits/rejected": -20677053.333333332, "logps/chosen": -312.00702582465277, "logps/rejected": -689.5808919270834, "loss": 0.0631, "rewards/chosen": 8.195038689507378, "rewards/margins": 28.54941134982639, "rewards/rejected": -20.35437266031901, "step": 3972 }, { "epoch": 0.9941198548730139, "grad_norm": 3.0625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -21888375.466666665, "logits/rejected": -29557779.555555556, "logps/chosen": -297.1052734375, "logps/rejected": -721.9598524305555, "loss": 0.0315, "rewards/chosen": 8.641639200846354, "rewards/margins": 31.512203979492185, "rewards/rejected": -22.870564778645832, "step": 3973 }, { "epoch": 0.9943700738145878, "grad_norm": 8.5, "kl": 14.943087577819824, "learning_rate": 5e-06, "logits/chosen": -42088554.666666664, "logits/rejected": -31933837.333333332, "logps/chosen": -424.4392903645833, "logps/rejected": -615.1873779296875, "loss": 0.0385, "rewards/chosen": 9.567827860514322, "rewards/margins": 28.13157399495443, "rewards/rejected": -18.563746134440105, "step": 3974 }, { "epoch": 0.9946202927561616, "grad_norm": 0.87109375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -31922220.307692308, "logits/rejected": -40276116.36363637, "logps/chosen": -333.86902794471155, "logps/rejected": -487.08354048295456, "loss": 0.0154, "rewards/chosen": 9.25815171461839, "rewards/margins": 27.50727225350333, "rewards/rejected": -18.24912053888494, "step": 3975 }, { "epoch": 0.9948705116977355, "grad_norm": 5.0, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -36897230.222222224, "logits/rejected": -27221506.133333333, "logps/chosen": -465.48480902777777, "logps/rejected": -529.3318033854167, "loss": 0.015, "rewards/chosen": 10.384301079644096, "rewards/margins": 27.93383314344618, "rewards/rejected": -17.549532063802083, "step": 3976 }, { "epoch": 0.9951207306393094, "grad_norm": 2.40625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -14568724.0, "logits/rejected": -60527620.0, "logps/chosen": -558.1046142578125, "logps/rejected": -587.919921875, "loss": 0.0008, "rewards/chosen": 11.181229591369629, "rewards/margins": 34.777085304260254, "rewards/rejected": -23.595855712890625, "step": 3977 }, { "epoch": 0.9953709495808832, "grad_norm": 5.28125, "kl": 9.59811019897461, "learning_rate": 5e-06, "logits/chosen": -25749104.0, "logits/rejected": -69065280.0, "logps/chosen": -326.09388950892856, "logps/rejected": -835.42861328125, "loss": 0.0574, "rewards/chosen": 8.926060812813896, "rewards/margins": 36.27694582257952, "rewards/rejected": -27.350885009765626, "step": 3978 }, { "epoch": 0.9956211685224572, "grad_norm": 0.51171875, "kl": 6.3562469482421875, "learning_rate": 5e-06, "logits/chosen": -42641000.72727273, "logits/rejected": -62957952.0, "logps/chosen": -383.52681107954544, "logps/rejected": -746.2035006009615, "loss": 0.0009, "rewards/chosen": 10.290686867453836, "rewards/margins": 36.80545972277234, "rewards/rejected": -26.51477285531851, "step": 3979 }, { "epoch": 0.995871387464031, "grad_norm": 9.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -47848285.09090909, "logits/rejected": -14248665.846153846, "logps/chosen": -513.8664772727273, "logps/rejected": -508.95639272836536, "loss": 0.0402, "rewards/chosen": 13.651870727539062, "rewards/margins": 28.735745943509613, "rewards/rejected": -15.083875215970552, "step": 3980 }, { "epoch": 0.9961216064056049, "grad_norm": 5.21875, "kl": 6.232570648193359, "learning_rate": 5e-06, "logits/chosen": -27180532.363636363, "logits/rejected": -34019874.461538464, "logps/chosen": -338.5, "logps/rejected": -470.90831580528845, "loss": 0.0186, "rewards/chosen": 8.257475419477982, "rewards/margins": 23.03724296943291, "rewards/rejected": -14.779767549954927, "step": 3981 }, { "epoch": 0.9963718253471788, "grad_norm": 0.8515625, "kl": 0.5155544281005859, "learning_rate": 5e-06, "logits/chosen": -51104442.666666664, "logits/rejected": -61470432.0, "logps/chosen": -466.1374104817708, "logps/rejected": -1086.4558919270833, "loss": 0.012, "rewards/chosen": 10.900699615478516, "rewards/margins": 49.879258473714195, "rewards/rejected": -38.97855885823568, "step": 3982 }, { "epoch": 0.9966220442887527, "grad_norm": 1.046875, "kl": 15.152315139770508, "learning_rate": 5e-06, "logits/chosen": -29074664.0, "logits/rejected": -44868956.0, "logps/chosen": -527.9139404296875, "logps/rejected": -740.472412109375, "loss": 0.015, "rewards/chosen": 11.776090621948242, "rewards/margins": 39.6702995300293, "rewards/rejected": -27.894208908081055, "step": 3983 }, { "epoch": 0.9968722632303265, "grad_norm": 13.375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -42194412.307692304, "logits/rejected": -28314778.181818184, "logps/chosen": -429.6138446514423, "logps/rejected": -571.6382723721591, "loss": 0.0144, "rewards/chosen": 10.024604210486778, "rewards/margins": 26.796969433764477, "rewards/rejected": -16.7723652232777, "step": 3984 }, { "epoch": 0.9971224821719004, "grad_norm": 11.4375, "kl": 10.635231971740723, "learning_rate": 5e-06, "logits/chosen": -30552768.0, "logits/rejected": -47285680.0, "logps/chosen": -429.8095296223958, "logps/rejected": -707.5057779947916, "loss": 0.0734, "rewards/chosen": 10.152160008748373, "rewards/margins": 37.80118497212728, "rewards/rejected": -27.649024963378906, "step": 3985 }, { "epoch": 0.9973727011134743, "grad_norm": 5.59375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -12818886.222222222, "logits/rejected": -69408196.26666667, "logps/chosen": -341.0488552517361, "logps/rejected": -722.6271484375, "loss": 0.0364, "rewards/chosen": 8.758168538411459, "rewards/margins": 36.78118896484375, "rewards/rejected": -28.023020426432293, "step": 3986 }, { "epoch": 0.9976229200550482, "grad_norm": 4.6875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -37982288.0, "logits/rejected": -10794621.333333334, "logps/chosen": -451.7815755208333, "logps/rejected": -728.7311197916666, "loss": 0.0133, "rewards/chosen": 9.61706797281901, "rewards/margins": 34.14792124430338, "rewards/rejected": -24.530853271484375, "step": 3987 }, { "epoch": 0.997873138996622, "grad_norm": 0.021484375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60591674.18181818, "logits/rejected": -40643539.692307696, "logps/chosen": -449.9423828125, "logps/rejected": -545.2093599759615, "loss": 0.0, "rewards/chosen": 11.91179032759233, "rewards/margins": 32.02961400172094, "rewards/rejected": -20.117823674128605, "step": 3988 }, { "epoch": 0.998123357938196, "grad_norm": 12.9375, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -55121693.09090909, "logits/rejected": -44548253.538461536, "logps/chosen": -414.1126598011364, "logps/rejected": -662.9449368990385, "loss": 0.0455, "rewards/chosen": 6.894778164950284, "rewards/margins": 30.14309030812937, "rewards/rejected": -23.248312143179085, "step": 3989 }, { "epoch": 0.9983735768797698, "grad_norm": 1.421875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -28642167.272727273, "logits/rejected": -41489836.307692304, "logps/chosen": -458.8709161931818, "logps/rejected": -600.6727764423077, "loss": 0.0092, "rewards/chosen": 8.172753073952414, "rewards/margins": 31.718522398621886, "rewards/rejected": -23.54576932466947, "step": 3990 }, { "epoch": 0.9986237958213436, "grad_norm": 22.125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -26861908.0, "logits/rejected": -46560456.0, "logps/chosen": -420.3039245605469, "logps/rejected": -522.4829711914062, "loss": 0.0198, "rewards/chosen": 9.867569923400879, "rewards/margins": 29.49724292755127, "rewards/rejected": -19.62967300415039, "step": 3991 }, { "epoch": 0.9988740147629176, "grad_norm": 3.15625, "kl": 12.076005935668945, "learning_rate": 5e-06, "logits/chosen": -49633680.0, "logits/rejected": -35480872.0, "logps/chosen": -440.7423400878906, "logps/rejected": -484.1937255859375, "loss": 0.0153, "rewards/chosen": 9.369802474975586, "rewards/margins": 28.970916748046875, "rewards/rejected": -19.60111427307129, "step": 3992 }, { "epoch": 0.9991242337044914, "grad_norm": 1.3125, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -60890150.4, "logits/rejected": -52861408.0, "logps/chosen": -319.358544921875, "logps/rejected": -835.9989536830357, "loss": 0.0092, "rewards/chosen": 7.933580017089843, "rewards/margins": 42.47269483293806, "rewards/rejected": -34.539114815848215, "step": 3993 }, { "epoch": 0.9993744526460653, "grad_norm": 6.1875, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -70344584.53333333, "logits/rejected": -60286464.0, "logps/chosen": -259.4136067708333, "logps/rejected": -998.4607204861111, "loss": 0.076, "rewards/chosen": 6.816383870442708, "rewards/margins": 44.20916544596354, "rewards/rejected": -37.392781575520836, "step": 3994 }, { "epoch": 0.9996246715876392, "grad_norm": 11.9375, "kl": 5.460507869720459, "learning_rate": 5e-06, "logits/chosen": -43662973.09090909, "logits/rejected": -57438281.84615385, "logps/chosen": -341.693115234375, "logps/rejected": -897.6317608173077, "loss": 0.0627, "rewards/chosen": 8.85528564453125, "rewards/margins": 38.00040377103365, "rewards/rejected": -29.145118126502403, "step": 3995 }, { "epoch": 0.9998748905292131, "grad_norm": 4.5, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -30524667.076923076, "logits/rejected": -32380634.181818184, "logps/chosen": -347.27786959134613, "logps/rejected": -583.8941761363636, "loss": 0.0694, "rewards/chosen": 7.406897324782151, "rewards/margins": 29.70618043912874, "rewards/rejected": -22.29928311434659, "step": 3996 }, { "epoch": 1.0, "grad_norm": 0.06494140625, "kl": 0.0, "learning_rate": 5e-06, "logits/chosen": -35362256.0, "logits/rejected": -50782952.0, "logps/chosen": -445.2106119791667, "logps/rejected": -768.860107421875, "loss": 0.0001, "rewards/chosen": 9.19583829243978, "rewards/margins": 48.01045163472494, "rewards/rejected": -38.814613342285156, "step": 3997 } ], "logging_steps": 1, "max_steps": 3997, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 1999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }