diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,59989 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3997, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00025021894157387716, + "grad_norm": 88.5, + "kl": 0.0, + "learning_rate": 1.4285714285714287e-07, + "logits/chosen": -59541664.0, + "logits/rejected": -25616944.0, + "logps/chosen": -488.6276448567708, + "logps/rejected": -316.3020833333333, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0005004378831477543, + "grad_norm": 98.0, + "kl": 0.0, + "learning_rate": 2.8571428571428575e-07, + "logits/chosen": -56574988.8, + "logits/rejected": -65667410.28571428, + "logps/chosen": -472.650927734375, + "logps/rejected": -482.46128627232144, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0007506568247216314, + "grad_norm": 76.0, + "kl": 0.0, + "learning_rate": 4.285714285714286e-07, + "logits/chosen": -16256480.0, + "logits/rejected": -27352000.0, + "logps/chosen": -367.8279622395833, + "logps/rejected": -355.7858072916667, + "loss": 0.5002, + "rewards/chosen": 0.003980398178100586, + "rewards/margins": -0.0015324751536051435, + "rewards/rejected": 0.0055128733317057295, + "step": 3 + }, + { + "epoch": 0.0010008757662955086, + "grad_norm": 93.0, + "kl": 0.08193652331829071, + "learning_rate": 5.714285714285715e-07, + "logits/chosen": -85845922.9090909, + "logits/rejected": -38409316.92307692, + "logps/chosen": -430.83473899147725, + "logps/rejected": -493.1432542067308, + "loss": 0.4964, + "rewards/chosen": 0.008217828517610376, + "rewards/margins": 0.026016472363388624, + "rewards/rejected": -0.017798643845778245, + "step": 4 + }, + { + "epoch": 0.0012510947078693856, + "grad_norm": 86.5, + "kl": 0.052356719970703125, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": -63304915.2, + "logits/rejected": -19663620.57142857, + "logps/chosen": -542.8798828125, + "logps/rejected": -435.6636439732143, + "loss": 0.5014, + "rewards/chosen": -0.020672836899757387, + "rewards/margins": -0.018652871676853727, + "rewards/rejected": -0.0020199652229036602, + "step": 5 + }, + { + "epoch": 0.0015013136494432628, + "grad_norm": 94.0, + "kl": 0.04557546228170395, + "learning_rate": 8.571428571428572e-07, + "logits/chosen": -45416180.0, + "logits/rejected": -37515092.0, + "logps/chosen": -473.4654235839844, + "logps/rejected": -438.4719543457031, + "loss": 0.5, + "rewards/chosen": 0.006817246787250042, + "rewards/margins": 0.001284862868487835, + "rewards/rejected": 0.005532383918762207, + "step": 6 + }, + { + "epoch": 0.00175153259101714, + "grad_norm": 112.0, + "kl": 0.0428212508559227, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": -80972763.42857143, + "logits/rejected": -25930512.0, + "logps/chosen": -587.2008928571429, + "logps/rejected": -637.29287109375, + "loss": 0.4812, + "rewards/chosen": 0.06468875067574638, + "rewards/margins": 0.15806072013718742, + "rewards/rejected": -0.09337196946144104, + "step": 7 + }, + { + "epoch": 0.0020017515325910173, + "grad_norm": 101.0, + "kl": 0.0089480085298419, + "learning_rate": 1.142857142857143e-06, + "logits/chosen": -58541333.333333336, + "logits/rejected": -18188266.666666668, + "logps/chosen": -537.5132378472222, + "logps/rejected": -592.4221354166667, + "loss": 0.5031, + "rewards/chosen": -0.04628287421332465, + "rewards/margins": -0.039186521536774106, + "rewards/rejected": -0.007096352676550548, + "step": 8 + }, + { + "epoch": 0.0022519704741648943, + "grad_norm": 81.0, + "kl": 0.1360289305448532, + "learning_rate": 1.2857142857142856e-06, + "logits/chosen": -30847316.57142857, + "logits/rejected": -29223027.2, + "logps/chosen": -365.27615792410717, + "logps/rejected": -316.3841796875, + "loss": 0.4918, + "rewards/chosen": 0.01241051937852587, + "rewards/margins": 0.08507478024278368, + "rewards/rejected": -0.07266426086425781, + "step": 9 + }, + { + "epoch": 0.0025021894157387712, + "grad_norm": 80.0, + "kl": 0.05688444897532463, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -41497478.4, + "logits/rejected": -35174166.85714286, + "logps/chosen": -405.414208984375, + "logps/rejected": -407.22471400669644, + "loss": 0.4902, + "rewards/chosen": -0.002581767737865448, + "rewards/margins": 0.06627030351332255, + "rewards/rejected": -0.068852071251188, + "step": 10 + }, + { + "epoch": 0.0027524083573126487, + "grad_norm": 85.5, + "kl": 0.0, + "learning_rate": 1.5714285714285714e-06, + "logits/chosen": 4115368.727272727, + "logits/rejected": -23189848.615384616, + "logps/chosen": -608.0187766335227, + "logps/rejected": -409.3933293269231, + "loss": 0.4965, + "rewards/chosen": -0.04436756805940108, + "rewards/margins": 0.01926962437329592, + "rewards/rejected": -0.063637192432697, + "step": 11 + }, + { + "epoch": 0.0030026272988865257, + "grad_norm": 76.5, + "kl": 0.005174319259822369, + "learning_rate": 1.7142857142857145e-06, + "logits/chosen": -36525562.18181818, + "logits/rejected": -59515072.0, + "logps/chosen": -335.98237748579544, + "logps/rejected": -383.279296875, + "loss": 0.4785, + "rewards/chosen": 0.026898888024416836, + "rewards/margins": 0.16427097829071793, + "rewards/rejected": -0.1373720902663011, + "step": 12 + }, + { + "epoch": 0.0032528462404604027, + "grad_norm": 90.5, + "kl": 0.0, + "learning_rate": 1.8571428571428573e-06, + "logits/chosen": -61187712.0, + "logits/rejected": -30487753.14285714, + "logps/chosen": -554.416796875, + "logps/rejected": -470.5059291294643, + "loss": 0.4635, + "rewards/chosen": 0.04775131344795227, + "rewards/margins": 0.2668748813016074, + "rewards/rejected": -0.21912356785365514, + "step": 13 + }, + { + "epoch": 0.00350306518203428, + "grad_norm": 87.0, + "kl": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -82098724.57142857, + "logits/rejected": -23819310.4, + "logps/chosen": -455.3506556919643, + "logps/rejected": -330.45283203125, + "loss": 0.4759, + "rewards/chosen": 0.030299640127590725, + "rewards/margins": 0.2239126924957548, + "rewards/rejected": -0.19361305236816406, + "step": 14 + }, + { + "epoch": 0.003753284123608157, + "grad_norm": 90.0, + "kl": 0.0, + "learning_rate": 2.1428571428571427e-06, + "logits/chosen": -91756363.63636364, + "logits/rejected": -43028002.461538464, + "logps/chosen": -455.7848011363636, + "logps/rejected": -466.22325721153845, + "loss": 0.4504, + "rewards/chosen": 0.07364630699157715, + "rewards/margins": 0.3964222211104173, + "rewards/rejected": -0.32277591411884016, + "step": 15 + }, + { + "epoch": 0.0040035030651820345, + "grad_norm": 87.0, + "kl": 0.0, + "learning_rate": 2.285714285714286e-06, + "logits/chosen": -9058512.0, + "logits/rejected": -45979446.85714286, + "logps/chosen": -545.936767578125, + "logps/rejected": -541.0837751116071, + "loss": 0.4381, + "rewards/chosen": 0.058348214626312254, + "rewards/margins": 0.4579805663653782, + "rewards/rejected": -0.39963235173906597, + "step": 16 + }, + { + "epoch": 0.0042537220067559115, + "grad_norm": 81.0, + "kl": 0.019661586731672287, + "learning_rate": 2.428571428571429e-06, + "logits/chosen": -5214717.090909091, + "logits/rejected": -40260457.84615385, + "logps/chosen": -561.8737571022727, + "logps/rejected": -403.4113957331731, + "loss": 0.4358, + "rewards/chosen": 0.1934277577833696, + "rewards/margins": 0.5138672065067958, + "rewards/rejected": -0.3204394487234262, + "step": 17 + }, + { + "epoch": 0.0045039409483297885, + "grad_norm": 78.5, + "kl": 0.0, + "learning_rate": 2.571428571428571e-06, + "logits/chosen": -51075559.384615384, + "logits/rejected": -69050926.54545455, + "logps/chosen": -383.6446063701923, + "logps/rejected": -542.7177734375, + "loss": 0.435, + "rewards/chosen": 0.054797039582179144, + "rewards/margins": 0.5791361086018436, + "rewards/rejected": -0.5243390690196644, + "step": 18 + }, + { + "epoch": 0.0047541598899036655, + "grad_norm": 84.0, + "kl": 0.28668975830078125, + "learning_rate": 2.7142857142857144e-06, + "logits/chosen": -84931226.66666667, + "logits/rejected": -37310128.0, + "logps/chosen": -557.7818603515625, + "logps/rejected": -442.5623372395833, + "loss": 0.4194, + "rewards/chosen": 0.20295224587122598, + "rewards/margins": 0.7280211249987284, + "rewards/rejected": -0.5250688791275024, + "step": 19 + }, + { + "epoch": 0.0050043788314775425, + "grad_norm": 76.5, + "kl": 0.022904079407453537, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": -30446306.285714287, + "logits/rejected": -60657049.6, + "logps/chosen": -415.9957798549107, + "logps/rejected": -440.586328125, + "loss": 0.4335, + "rewards/chosen": 0.11211512769971575, + "rewards/margins": 0.6526112862995693, + "rewards/rejected": -0.5404961585998536, + "step": 20 + }, + { + "epoch": 0.00525459777305142, + "grad_norm": 77.0, + "kl": 0.0, + "learning_rate": 3e-06, + "logits/chosen": -47356508.0, + "logits/rejected": -38769844.0, + "logps/chosen": -540.397705078125, + "logps/rejected": -476.3580322265625, + "loss": 0.3676, + "rewards/chosen": 0.2587902247905731, + "rewards/margins": 1.0264058411121368, + "rewards/rejected": -0.7676156163215637, + "step": 21 + }, + { + "epoch": 0.005504816714625297, + "grad_norm": 78.5, + "kl": 0.0, + "learning_rate": 3.142857142857143e-06, + "logits/chosen": -44589024.0, + "logits/rejected": -50823984.0, + "logps/chosen": -392.33087158203125, + "logps/rejected": -431.8802185058594, + "loss": 0.4085, + "rewards/chosen": 0.20118574798107147, + "rewards/margins": 0.9541785567998886, + "rewards/rejected": -0.7529928088188171, + "step": 22 + }, + { + "epoch": 0.005755035656199174, + "grad_norm": 73.5, + "kl": 0.2415180206298828, + "learning_rate": 3.285714285714286e-06, + "logits/chosen": -29925634.285714287, + "logits/rejected": -40795820.8, + "logps/chosen": -434.2471400669643, + "logps/rejected": -425.832421875, + "loss": 0.3729, + "rewards/chosen": 0.42611227716718403, + "rewards/margins": 1.3466090270451136, + "rewards/rejected": -0.9204967498779297, + "step": 23 + }, + { + "epoch": 0.006005254597773051, + "grad_norm": 67.0, + "kl": 0.8058691024780273, + "learning_rate": 3.428571428571429e-06, + "logits/chosen": -54340292.571428575, + "logits/rejected": -37819292.8, + "logps/chosen": -382.73416573660717, + "logps/rejected": -552.03896484375, + "loss": 0.3819, + "rewards/chosen": 0.33219385147094727, + "rewards/margins": 1.3029109001159669, + "rewards/rejected": -0.9707170486450195, + "step": 24 + }, + { + "epoch": 0.006255473539346928, + "grad_norm": 62.0, + "kl": 0.8036088943481445, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": -59392403.692307696, + "logits/rejected": -47422577.45454545, + "logps/chosen": -459.494140625, + "logps/rejected": -587.9924538352273, + "loss": 0.3155, + "rewards/chosen": 0.5894884696373572, + "rewards/margins": 2.0235542017263133, + "rewards/rejected": -1.434065732088956, + "step": 25 + }, + { + "epoch": 0.006505692480920805, + "grad_norm": 52.0, + "kl": 0.0, + "learning_rate": 3.7142857142857146e-06, + "logits/chosen": -41334444.307692304, + "logits/rejected": -39218932.36363637, + "logps/chosen": -381.20650540865387, + "logps/rejected": -425.54092684659093, + "loss": 0.3401, + "rewards/chosen": 0.5716312848604642, + "rewards/margins": 1.5807496250926198, + "rewards/rejected": -1.0091183402321555, + "step": 26 + }, + { + "epoch": 0.006755911422494683, + "grad_norm": 65.5, + "kl": 0.18149185180664062, + "learning_rate": 3.857142857142858e-06, + "logits/chosen": -45916352.0, + "logits/rejected": -43130663.384615384, + "logps/chosen": -632.1180308948864, + "logps/rejected": -468.31385216346155, + "loss": 0.2855, + "rewards/chosen": 0.8257727189497515, + "rewards/margins": 2.0523407942765246, + "rewards/rejected": -1.226568075326773, + "step": 27 + }, + { + "epoch": 0.00700613036406856, + "grad_norm": 60.75, + "kl": 2.256681442260742, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -52474614.15384615, + "logits/rejected": -35245515.63636363, + "logps/chosen": -576.4675856370193, + "logps/rejected": -331.1305042613636, + "loss": 0.297, + "rewards/chosen": 1.36091555081881, + "rewards/margins": 2.2001000517731777, + "rewards/rejected": -0.8391845009543679, + "step": 28 + }, + { + "epoch": 0.007256349305642437, + "grad_norm": 46.75, + "kl": 0.9984372854232788, + "learning_rate": 4.1428571428571435e-06, + "logits/chosen": -46850481.777777776, + "logits/rejected": -56521501.86666667, + "logps/chosen": -426.9078776041667, + "logps/rejected": -384.9435546875, + "loss": 0.2869, + "rewards/chosen": 1.1934963862101238, + "rewards/margins": 2.367296028137207, + "rewards/rejected": -1.1737996419270833, + "step": 29 + }, + { + "epoch": 0.007506568247216314, + "grad_norm": 66.5, + "kl": 2.4080123901367188, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": -49813892.92307692, + "logits/rejected": -23674167.272727273, + "logps/chosen": -528.6504657451923, + "logps/rejected": -403.7041015625, + "loss": 0.3331, + "rewards/chosen": 0.9016431661752554, + "rewards/margins": 1.9492643729790107, + "rewards/rejected": -1.0476212068037554, + "step": 30 + }, + { + "epoch": 0.007756787188790191, + "grad_norm": 46.75, + "kl": 0.9658675193786621, + "learning_rate": 4.428571428571429e-06, + "logits/chosen": -48486181.333333336, + "logits/rejected": -28836466.666666668, + "logps/chosen": -339.1027018229167, + "logps/rejected": -356.8131510416667, + "loss": 0.2965, + "rewards/chosen": 1.1228853861490886, + "rewards/margins": 2.131355444590251, + "rewards/rejected": -1.008470058441162, + "step": 31 + }, + { + "epoch": 0.008007006130364069, + "grad_norm": 66.5, + "kl": 5.168900012969971, + "learning_rate": 4.571428571428572e-06, + "logits/chosen": -51749624.47058824, + "logits/rejected": -26386843.42857143, + "logps/chosen": -440.3121553308824, + "logps/rejected": -363.31431361607144, + "loss": 0.3235, + "rewards/chosen": 1.1607482012580423, + "rewards/margins": 2.905660212540827, + "rewards/rejected": -1.7449120112827845, + "step": 32 + }, + { + "epoch": 0.008257225071937945, + "grad_norm": 49.25, + "kl": 5.057428359985352, + "learning_rate": 4.714285714285715e-06, + "logits/chosen": -42186612.705882356, + "logits/rejected": -63770464.0, + "logps/chosen": -444.23133042279414, + "logps/rejected": -495.78916713169644, + "loss": 0.2656, + "rewards/chosen": 1.668528388528263, + "rewards/margins": 3.3087558425775097, + "rewards/rejected": -1.6402274540492467, + "step": 33 + }, + { + "epoch": 0.008507444013511823, + "grad_norm": 49.0, + "kl": 5.5824666023254395, + "learning_rate": 4.857142857142858e-06, + "logits/chosen": -9483001.846153846, + "logits/rejected": -43397184.0, + "logps/chosen": -628.7256234975962, + "logps/rejected": -417.3606622869318, + "loss": 0.2009, + "rewards/chosen": 2.539976560152494, + "rewards/margins": 3.367193662203275, + "rewards/rejected": -0.8272171020507812, + "step": 34 + }, + { + "epoch": 0.0087576629550857, + "grad_norm": 38.5, + "kl": 6.245134353637695, + "learning_rate": 5e-06, + "logits/chosen": -44029211.428571425, + "logits/rejected": -69293568.0, + "logps/chosen": -381.298828125, + "logps/rejected": -482.6978515625, + "loss": 0.253, + "rewards/chosen": 1.7988497870309013, + "rewards/margins": 2.8967734473092213, + "rewards/rejected": -1.0979236602783202, + "step": 35 + }, + { + "epoch": 0.009007881896659577, + "grad_norm": 45.5, + "kl": 4.012667655944824, + "learning_rate": 5e-06, + "logits/chosen": -46534710.15384615, + "logits/rejected": -37244465.45454545, + "logps/chosen": -459.57083834134613, + "logps/rejected": -457.34614701704544, + "loss": 0.2657, + "rewards/chosen": 1.8993977766770582, + "rewards/margins": 3.029151496353683, + "rewards/rejected": -1.1297537196766247, + "step": 36 + }, + { + "epoch": 0.009258100838233455, + "grad_norm": 44.25, + "kl": 4.958836078643799, + "learning_rate": 5e-06, + "logits/chosen": -54757964.8, + "logits/rejected": -41609389.71428572, + "logps/chosen": -385.8251953125, + "logps/rejected": -402.88065011160717, + "loss": 0.2729, + "rewards/chosen": 2.25915412902832, + "rewards/margins": 3.121587835039411, + "rewards/rejected": -0.8624337060110909, + "step": 37 + }, + { + "epoch": 0.009508319779807331, + "grad_norm": 46.0, + "kl": 7.350357532501221, + "learning_rate": 5e-06, + "logits/chosen": -68632512.0, + "logits/rejected": -33406549.333333332, + "logps/chosen": -517.378662109375, + "logps/rejected": -517.5869140625, + "loss": 0.2025, + "rewards/chosen": 2.505164623260498, + "rewards/margins": 3.9591061274210615, + "rewards/rejected": -1.4539415041605632, + "step": 38 + }, + { + "epoch": 0.009758538721381209, + "grad_norm": 30.75, + "kl": 7.688513278961182, + "learning_rate": 5e-06, + "logits/chosen": -63052647.384615384, + "logits/rejected": -40342722.90909091, + "logps/chosen": -486.09878305288464, + "logps/rejected": -492.6912286931818, + "loss": 0.1482, + "rewards/chosen": 3.5016696636493387, + "rewards/margins": 5.5918673135183905, + "rewards/rejected": -2.0901976498690518, + "step": 39 + }, + { + "epoch": 0.010008757662955085, + "grad_norm": 52.75, + "kl": 0.9967638850212097, + "learning_rate": 5e-06, + "logits/chosen": -45448392.0, + "logits/rejected": -21358310.0, + "logps/chosen": -449.9379577636719, + "logps/rejected": -443.4837646484375, + "loss": 0.2033, + "rewards/chosen": 3.020130157470703, + "rewards/margins": 4.729620933532715, + "rewards/rejected": -1.7094907760620117, + "step": 40 + }, + { + "epoch": 0.010258976604528963, + "grad_norm": 39.0, + "kl": 3.558825969696045, + "learning_rate": 5e-06, + "logits/chosen": -78859318.15384616, + "logits/rejected": -52826565.81818182, + "logps/chosen": -418.4805438701923, + "logps/rejected": -416.9651988636364, + "loss": 0.1979, + "rewards/chosen": 2.3863435891958384, + "rewards/margins": 4.1012449064454835, + "rewards/rejected": -1.7149013172496448, + "step": 41 + }, + { + "epoch": 0.01050919554610284, + "grad_norm": 36.5, + "kl": 5.200307369232178, + "learning_rate": 5e-06, + "logits/chosen": -49897130.666666664, + "logits/rejected": -51501777.777777776, + "logps/chosen": -446.23912760416664, + "logps/rejected": -361.51161024305554, + "loss": 0.2222, + "rewards/chosen": 2.500630187988281, + "rewards/margins": 3.2733071857028535, + "rewards/rejected": -0.7726769977145724, + "step": 42 + }, + { + "epoch": 0.010759414487676717, + "grad_norm": 34.0, + "kl": 5.424537181854248, + "learning_rate": 5e-06, + "logits/chosen": -66925262.76923077, + "logits/rejected": -28641832.727272727, + "logps/chosen": -444.0276442307692, + "logps/rejected": -439.81986860795456, + "loss": 0.262, + "rewards/chosen": 2.707141582782452, + "rewards/margins": 4.80719745075786, + "rewards/rejected": -2.100055867975408, + "step": 43 + }, + { + "epoch": 0.011009633429250595, + "grad_norm": 27.875, + "kl": 4.094232082366943, + "learning_rate": 5e-06, + "logits/chosen": -30213911.466666665, + "logits/rejected": -60019619.55555555, + "logps/chosen": -424.7192708333333, + "logps/rejected": -568.1768120659722, + "loss": 0.1609, + "rewards/chosen": 2.563251241048177, + "rewards/margins": 5.800806850857205, + "rewards/rejected": -3.2375556098090277, + "step": 44 + }, + { + "epoch": 0.01125985237082447, + "grad_norm": 37.0, + "kl": 3.9221110343933105, + "learning_rate": 5e-06, + "logits/chosen": -32619204.923076924, + "logits/rejected": -81111790.54545455, + "logps/chosen": -342.2945087139423, + "logps/rejected": -644.5071910511364, + "loss": 0.1991, + "rewards/chosen": 1.9459634927602916, + "rewards/margins": 3.8989322635677315, + "rewards/rejected": -1.9529687708074397, + "step": 45 + }, + { + "epoch": 0.011510071312398349, + "grad_norm": 35.5, + "kl": 5.3883137702941895, + "learning_rate": 5e-06, + "logits/chosen": -23232284.0, + "logits/rejected": -24105454.0, + "logps/chosen": -347.0953369140625, + "logps/rejected": -546.0194091796875, + "loss": 0.2393, + "rewards/chosen": 2.0438919067382812, + "rewards/margins": 4.417301893234253, + "rewards/rejected": -2.3734099864959717, + "step": 46 + }, + { + "epoch": 0.011760290253972227, + "grad_norm": 29.25, + "kl": 5.374369144439697, + "learning_rate": 5e-06, + "logits/chosen": -61469664.0, + "logits/rejected": -81658696.0, + "logps/chosen": -452.052490234375, + "logps/rejected": -572.4196166992188, + "loss": 0.175, + "rewards/chosen": 2.912391185760498, + "rewards/margins": 6.923405170440674, + "rewards/rejected": -4.011013984680176, + "step": 47 + }, + { + "epoch": 0.012010509195546103, + "grad_norm": 42.0, + "kl": 3.7215304374694824, + "learning_rate": 5e-06, + "logits/chosen": -60276736.0, + "logits/rejected": -55375522.90909091, + "logps/chosen": -453.3477313701923, + "logps/rejected": -561.0490500710227, + "loss": 0.2311, + "rewards/chosen": 2.7364683884840746, + "rewards/margins": 4.453294393899558, + "rewards/rejected": -1.716826005415483, + "step": 48 + }, + { + "epoch": 0.01226072813711998, + "grad_norm": 26.0, + "kl": 3.1391959190368652, + "learning_rate": 5e-06, + "logits/chosen": -53692666.18181818, + "logits/rejected": -52693277.538461536, + "logps/chosen": -464.56005859375, + "logps/rejected": -463.7102614182692, + "loss": 0.1819, + "rewards/chosen": 2.6378198103471235, + "rewards/margins": 5.295058990691926, + "rewards/rejected": -2.657239180344802, + "step": 49 + }, + { + "epoch": 0.012510947078693857, + "grad_norm": 34.25, + "kl": 2.9572463035583496, + "learning_rate": 5e-06, + "logits/chosen": -36200530.28571428, + "logits/rejected": -24690896.0, + "logps/chosen": -300.24166434151783, + "logps/rejected": -411.090625, + "loss": 0.2121, + "rewards/chosen": 2.0287959235055104, + "rewards/margins": 4.210692514692034, + "rewards/rejected": -2.1818965911865233, + "step": 50 + }, + { + "epoch": 0.012761166020267735, + "grad_norm": 35.5, + "kl": 3.992845058441162, + "learning_rate": 5e-06, + "logits/chosen": -43658215.384615384, + "logits/rejected": -38654816.0, + "logps/chosen": -418.28568209134613, + "logps/rejected": -510.7462713068182, + "loss": 0.1616, + "rewards/chosen": 2.639216789832482, + "rewards/margins": 5.9850447594702665, + "rewards/rejected": -3.345827969637784, + "step": 51 + }, + { + "epoch": 0.01301138496184161, + "grad_norm": 35.75, + "kl": 4.593269348144531, + "learning_rate": 5e-06, + "logits/chosen": -53449028.266666666, + "logits/rejected": -42102737.777777776, + "logps/chosen": -324.2541015625, + "logps/rejected": -351.76752387152777, + "loss": 0.2912, + "rewards/chosen": 1.6974297841389974, + "rewards/margins": 3.6081525166829427, + "rewards/rejected": -1.9107227325439453, + "step": 52 + }, + { + "epoch": 0.013261603903415489, + "grad_norm": 24.25, + "kl": 2.7101986408233643, + "learning_rate": 5e-06, + "logits/chosen": -70743861.33333333, + "logits/rejected": -23802826.666666668, + "logps/chosen": -488.441162109375, + "logps/rejected": -642.853515625, + "loss": 0.1279, + "rewards/chosen": 3.102839152018229, + "rewards/margins": 7.0102189381917315, + "rewards/rejected": -3.9073797861735025, + "step": 53 + }, + { + "epoch": 0.013511822844989366, + "grad_norm": 33.75, + "kl": 1.6181539297103882, + "learning_rate": 5e-06, + "logits/chosen": -79371884.3076923, + "logits/rejected": -19914660.363636363, + "logps/chosen": -541.4880558894231, + "logps/rejected": -677.6415127840909, + "loss": 0.1736, + "rewards/chosen": 3.6615911630483775, + "rewards/margins": 7.590517110757895, + "rewards/rejected": -3.928925947709517, + "step": 54 + }, + { + "epoch": 0.013762041786563243, + "grad_norm": 34.0, + "kl": 17.674467086791992, + "learning_rate": 5e-06, + "logits/chosen": -65001088.0, + "logits/rejected": -17387204.0, + "logps/chosen": -424.58978271484375, + "logps/rejected": -450.2849426269531, + "loss": 0.3174, + "rewards/chosen": 3.732567310333252, + "rewards/margins": 6.20811915397644, + "rewards/rejected": -2.4755518436431885, + "step": 55 + }, + { + "epoch": 0.01401226072813712, + "grad_norm": 23.5, + "kl": 11.61301040649414, + "learning_rate": 5e-06, + "logits/chosen": -72500420.26666667, + "logits/rejected": -57343502.222222224, + "logps/chosen": -424.5728515625, + "logps/rejected": -460.6795247395833, + "loss": 0.1971, + "rewards/chosen": 3.042181905110677, + "rewards/margins": 6.413038084242078, + "rewards/rejected": -3.370856179131402, + "step": 56 + }, + { + "epoch": 0.014262479669710997, + "grad_norm": 29.75, + "kl": 10.680805206298828, + "learning_rate": 5e-06, + "logits/chosen": -58818313.84615385, + "logits/rejected": -38190298.18181818, + "logps/chosen": -472.00473257211536, + "logps/rejected": -425.52903053977275, + "loss": 0.2436, + "rewards/chosen": 3.4778641920823317, + "rewards/margins": 5.632143033967985, + "rewards/rejected": -2.1542788418856533, + "step": 57 + }, + { + "epoch": 0.014512698611284874, + "grad_norm": 19.5, + "kl": 5.999025821685791, + "learning_rate": 5e-06, + "logits/chosen": -80188544.0, + "logits/rejected": -60439285.333333336, + "logps/chosen": -452.6884358723958, + "logps/rejected": -470.2325032552083, + "loss": 0.1122, + "rewards/chosen": 4.041547139485677, + "rewards/margins": 6.5902516047159825, + "rewards/rejected": -2.548704465230306, + "step": 58 + }, + { + "epoch": 0.014762917552858752, + "grad_norm": 23.75, + "kl": 6.040742874145508, + "learning_rate": 5e-06, + "logits/chosen": -56422065.23076923, + "logits/rejected": -51661288.72727273, + "logps/chosen": -452.8036358173077, + "logps/rejected": -417.02823153409093, + "loss": 0.1324, + "rewards/chosen": 3.348560333251953, + "rewards/margins": 5.20868561484597, + "rewards/rejected": -1.8601252815940164, + "step": 59 + }, + { + "epoch": 0.015013136494432628, + "grad_norm": 26.375, + "kl": 2.8226964473724365, + "learning_rate": 5e-06, + "logits/chosen": -35815285.333333336, + "logits/rejected": -34013146.666666664, + "logps/chosen": -462.9361979166667, + "logps/rejected": -472.0421549479167, + "loss": 0.1239, + "rewards/chosen": 3.104276657104492, + "rewards/margins": 6.3904050191243496, + "rewards/rejected": -3.286128362019857, + "step": 60 + }, + { + "epoch": 0.015263355436006506, + "grad_norm": 28.375, + "kl": 6.3608245849609375, + "learning_rate": 5e-06, + "logits/chosen": -35580477.333333336, + "logits/rejected": -49281386.666666664, + "logps/chosen": -451.8302408854167, + "logps/rejected": -531.75537109375, + "loss": 0.1522, + "rewards/chosen": 3.9960447947184243, + "rewards/margins": 8.040406862894693, + "rewards/rejected": -4.0443620681762695, + "step": 61 + }, + { + "epoch": 0.015513574377580382, + "grad_norm": 38.75, + "kl": 15.220528602600098, + "learning_rate": 5e-06, + "logits/chosen": -61638628.571428575, + "logits/rejected": -35349504.0, + "logps/chosen": -506.88438197544644, + "logps/rejected": -523.688134765625, + "loss": 0.1738, + "rewards/chosen": 3.3980004446847096, + "rewards/margins": 6.051696504865374, + "rewards/rejected": -2.653696060180664, + "step": 62 + }, + { + "epoch": 0.01576379331915426, + "grad_norm": 31.125, + "kl": 6.477090835571289, + "learning_rate": 5e-06, + "logits/chosen": -46088345.6, + "logits/rejected": -20498917.333333332, + "logps/chosen": -412.12242838541664, + "logps/rejected": -664.1638454861111, + "loss": 0.1967, + "rewards/chosen": 3.4296048482259116, + "rewards/margins": 7.458029429117838, + "rewards/rejected": -4.028424580891927, + "step": 63 + }, + { + "epoch": 0.016014012260728138, + "grad_norm": 30.625, + "kl": 8.509873390197754, + "learning_rate": 5e-06, + "logits/chosen": -40442308.0, + "logits/rejected": -56075968.0, + "logps/chosen": -437.4285888671875, + "logps/rejected": -570.4381713867188, + "loss": 0.1208, + "rewards/chosen": 3.924570083618164, + "rewards/margins": 7.112115859985352, + "rewards/rejected": -3.1875457763671875, + "step": 64 + }, + { + "epoch": 0.016264231202302016, + "grad_norm": 24.5, + "kl": 0.34754371643066406, + "learning_rate": 5e-06, + "logits/chosen": -84957613.71428572, + "logits/rejected": -46729106.823529415, + "logps/chosen": -515.1071428571429, + "logps/rejected": -652.2908432904412, + "loss": 0.0864, + "rewards/chosen": 4.909284319196429, + "rewards/margins": 9.977046806271337, + "rewards/rejected": -5.0677624870749085, + "step": 65 + }, + { + "epoch": 0.01651445014387589, + "grad_norm": 30.875, + "kl": 6.232099533081055, + "learning_rate": 5e-06, + "logits/chosen": -4933568.0, + "logits/rejected": -36256624.0, + "logps/chosen": -379.7061767578125, + "logps/rejected": -517.9809919084821, + "loss": 0.1841, + "rewards/chosen": 3.948345184326172, + "rewards/margins": 6.235989706856864, + "rewards/rejected": -2.287644522530692, + "step": 66 + }, + { + "epoch": 0.016764669085449768, + "grad_norm": 27.5, + "kl": 7.00858211517334, + "learning_rate": 5e-06, + "logits/chosen": -69562180.57142857, + "logits/rejected": 14068864.0, + "logps/chosen": -482.22202845982144, + "logps/rejected": -514.0224609375, + "loss": 0.158, + "rewards/chosen": 3.3606959751674106, + "rewards/margins": 6.84954103742327, + "rewards/rejected": -3.4888450622558596, + "step": 67 + }, + { + "epoch": 0.017014888027023646, + "grad_norm": 28.5, + "kl": 2.7451655864715576, + "learning_rate": 5e-06, + "logits/chosen": -53321006.54545455, + "logits/rejected": -56115938.461538464, + "logps/chosen": -363.21928267045456, + "logps/rejected": -505.3498347355769, + "loss": 0.1439, + "rewards/chosen": 2.931535547429865, + "rewards/margins": 6.898419573590472, + "rewards/rejected": -3.966884026160607, + "step": 68 + }, + { + "epoch": 0.017265106968597524, + "grad_norm": 38.25, + "kl": 4.104753017425537, + "learning_rate": 5e-06, + "logits/chosen": -28209125.818181816, + "logits/rejected": -44421026.461538464, + "logps/chosen": -415.62442294034093, + "logps/rejected": -705.1533203125, + "loss": 0.1929, + "rewards/chosen": 4.027130473743785, + "rewards/margins": 7.716762596077018, + "rewards/rejected": -3.6896321223332333, + "step": 69 + }, + { + "epoch": 0.0175153259101714, + "grad_norm": 28.5, + "kl": 11.870689392089844, + "learning_rate": 5e-06, + "logits/chosen": 13527932.57142857, + "logits/rejected": -30350771.2, + "logps/chosen": -632.6715262276786, + "logps/rejected": -554.3529296875, + "loss": 0.2055, + "rewards/chosen": 4.735430036272321, + "rewards/margins": 7.6904417855399, + "rewards/rejected": -2.955011749267578, + "step": 70 + }, + { + "epoch": 0.017765544851745276, + "grad_norm": 37.0, + "kl": 22.042312622070312, + "learning_rate": 5e-06, + "logits/chosen": -66415018.666666664, + "logits/rejected": -37259733.333333336, + "logps/chosen": -532.311328125, + "logps/rejected": -393.3572591145833, + "loss": 0.1746, + "rewards/chosen": 5.246820068359375, + "rewards/margins": 7.600402450561523, + "rewards/rejected": -2.3535823822021484, + "step": 71 + }, + { + "epoch": 0.018015763793319154, + "grad_norm": 34.0, + "kl": 8.992449760437012, + "learning_rate": 5e-06, + "logits/chosen": -44114397.538461536, + "logits/rejected": -35993207.27272727, + "logps/chosen": -402.97408353365387, + "logps/rejected": -380.97194602272725, + "loss": 0.1486, + "rewards/chosen": 4.046647585355318, + "rewards/margins": 6.243044846541398, + "rewards/rejected": -2.1963972611860796, + "step": 72 + }, + { + "epoch": 0.018265982734893032, + "grad_norm": 31.125, + "kl": 3.5951037406921387, + "learning_rate": 5e-06, + "logits/chosen": -44106713.6, + "logits/rejected": -29727241.14285714, + "logps/chosen": -367.2716796875, + "logps/rejected": -408.199951171875, + "loss": 0.1656, + "rewards/chosen": 2.72470645904541, + "rewards/margins": 5.160595566885812, + "rewards/rejected": -2.435889107840402, + "step": 73 + }, + { + "epoch": 0.01851620167646691, + "grad_norm": 24.75, + "kl": 0.0681304931640625, + "learning_rate": 5e-06, + "logits/chosen": -42291921.45454545, + "logits/rejected": -5568534.153846154, + "logps/chosen": -345.49429598721593, + "logps/rejected": -627.5030423677885, + "loss": 0.1198, + "rewards/chosen": 2.857915011319247, + "rewards/margins": 7.511329810936134, + "rewards/rejected": -4.653414799616887, + "step": 74 + }, + { + "epoch": 0.018766420618040784, + "grad_norm": 29.125, + "kl": 3.71746826171875, + "learning_rate": 5e-06, + "logits/chosen": -77624949.33333333, + "logits/rejected": -32048949.333333332, + "logps/chosen": -498.4450276692708, + "logps/rejected": -601.1195068359375, + "loss": 0.1168, + "rewards/chosen": 4.614773432413737, + "rewards/margins": 7.70643170674642, + "rewards/rejected": -3.091658274332682, + "step": 75 + }, + { + "epoch": 0.019016639559614662, + "grad_norm": 23.375, + "kl": 9.963424682617188, + "learning_rate": 5e-06, + "logits/chosen": -28286677.333333332, + "logits/rejected": -48645717.333333336, + "logps/chosen": -356.578125, + "logps/rejected": -511.0622151692708, + "loss": 0.1213, + "rewards/chosen": 4.0463972091674805, + "rewards/margins": 8.232327461242676, + "rewards/rejected": -4.185930252075195, + "step": 76 + }, + { + "epoch": 0.01926685850118854, + "grad_norm": 23.125, + "kl": 6.5861496925354, + "learning_rate": 5e-06, + "logits/chosen": -65989980.0, + "logits/rejected": -39129820.0, + "logps/chosen": -412.86334228515625, + "logps/rejected": -353.802001953125, + "loss": 0.1798, + "rewards/chosen": 3.7884206771850586, + "rewards/margins": 7.3194260597229, + "rewards/rejected": -3.531005382537842, + "step": 77 + }, + { + "epoch": 0.019517077442762418, + "grad_norm": 25.125, + "kl": 2.9897258281707764, + "learning_rate": 5e-06, + "logits/chosen": -48174040.0, + "logits/rejected": -41698836.0, + "logps/chosen": -446.5084533691406, + "logps/rejected": -447.56201171875, + "loss": 0.1329, + "rewards/chosen": 3.867509365081787, + "rewards/margins": 7.806653261184692, + "rewards/rejected": -3.9391438961029053, + "step": 78 + }, + { + "epoch": 0.019767296384336296, + "grad_norm": 20.375, + "kl": 6.163200855255127, + "learning_rate": 5e-06, + "logits/chosen": -50068060.44444445, + "logits/rejected": -58402713.6, + "logps/chosen": -468.9856228298611, + "logps/rejected": -660.4854166666667, + "loss": 0.1099, + "rewards/chosen": 3.648191663953993, + "rewards/margins": 8.640795220269098, + "rewards/rejected": -4.992603556315104, + "step": 79 + }, + { + "epoch": 0.02001751532591017, + "grad_norm": 14.625, + "kl": 1.4809026718139648, + "learning_rate": 5e-06, + "logits/chosen": -54611097.6, + "logits/rejected": -22446710.85714286, + "logps/chosen": -528.61337890625, + "logps/rejected": -472.2774135044643, + "loss": 0.0852, + "rewards/chosen": 5.32177963256836, + "rewards/margins": 9.697051021030973, + "rewards/rejected": -4.375271388462612, + "step": 80 + }, + { + "epoch": 0.020267734267484048, + "grad_norm": 31.0, + "kl": 7.566961288452148, + "learning_rate": 5e-06, + "logits/chosen": -29653430.85714286, + "logits/rejected": -33127379.2, + "logps/chosen": -489.04771205357144, + "logps/rejected": -366.3197509765625, + "loss": 0.1507, + "rewards/chosen": 3.8084632328578403, + "rewards/margins": 8.1127623966762, + "rewards/rejected": -4.304299163818359, + "step": 81 + }, + { + "epoch": 0.020517953209057926, + "grad_norm": 21.75, + "kl": 4.328423976898193, + "learning_rate": 5e-06, + "logits/chosen": -63439066.666666664, + "logits/rejected": -61543376.0, + "logps/chosen": -424.947265625, + "logps/rejected": -386.8924560546875, + "loss": 0.1485, + "rewards/chosen": 3.516907056172689, + "rewards/margins": 6.636496225992839, + "rewards/rejected": -3.11958916982015, + "step": 82 + }, + { + "epoch": 0.020768172150631804, + "grad_norm": 21.0, + "kl": 5.679565906524658, + "learning_rate": 5e-06, + "logits/chosen": -43479936.0, + "logits/rejected": -64066611.2, + "logps/chosen": -403.6904296875, + "logps/rejected": -428.721630859375, + "loss": 0.132, + "rewards/chosen": 3.5876573835100447, + "rewards/margins": 7.082850864955358, + "rewards/rejected": -3.4951934814453125, + "step": 83 + }, + { + "epoch": 0.02101839109220568, + "grad_norm": 40.25, + "kl": 0.9630905985832214, + "learning_rate": 5e-06, + "logits/chosen": -47370304.0, + "logits/rejected": -22394812.0, + "logps/chosen": -311.91552734375, + "logps/rejected": -397.6983337402344, + "loss": 0.1971, + "rewards/chosen": 2.817434787750244, + "rewards/margins": 5.503911256790161, + "rewards/rejected": -2.686476469039917, + "step": 84 + }, + { + "epoch": 0.021268610033779556, + "grad_norm": 27.375, + "kl": 2.1226773262023926, + "learning_rate": 5e-06, + "logits/chosen": -63888085.333333336, + "logits/rejected": -20810644.0, + "logps/chosen": -368.785400390625, + "logps/rejected": -496.9632975260417, + "loss": 0.1262, + "rewards/chosen": 3.5727866490681968, + "rewards/margins": 8.387812932332357, + "rewards/rejected": -4.81502628326416, + "step": 85 + }, + { + "epoch": 0.021518828975353434, + "grad_norm": 25.0, + "kl": 4.572732448577881, + "learning_rate": 5e-06, + "logits/chosen": -15606040.615384616, + "logits/rejected": -42046190.54545455, + "logps/chosen": -381.45601712740387, + "logps/rejected": -488.03151633522725, + "loss": 0.1288, + "rewards/chosen": 3.206278287447416, + "rewards/margins": 6.891124298522522, + "rewards/rejected": -3.6848460110751065, + "step": 86 + }, + { + "epoch": 0.02176904791692731, + "grad_norm": 20.625, + "kl": 4.554980278015137, + "learning_rate": 5e-06, + "logits/chosen": -39270776.615384616, + "logits/rejected": -32091194.181818184, + "logps/chosen": -354.12777944711536, + "logps/rejected": -391.29092684659093, + "loss": 0.1579, + "rewards/chosen": 3.6647494389460635, + "rewards/margins": 8.14450275981343, + "rewards/rejected": -4.479753320867365, + "step": 87 + }, + { + "epoch": 0.02201926685850119, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43021610.666666664, + "logits/rejected": -37736665.6, + "logps/chosen": -382.052490234375, + "logps/rejected": -483.31591796875, + "loss": 0.0882, + "rewards/chosen": 3.0966402689615884, + "rewards/margins": 7.5184684753417965, + "rewards/rejected": -4.421828206380209, + "step": 88 + }, + { + "epoch": 0.022269485800075067, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25497083.2, + "logits/rejected": -52602468.571428575, + "logps/chosen": -394.87197265625, + "logps/rejected": -571.9961286272321, + "loss": 0.1, + "rewards/chosen": 4.237030792236328, + "rewards/margins": 9.36163624354771, + "rewards/rejected": -5.124605451311384, + "step": 89 + }, + { + "epoch": 0.02251970474164894, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67350298.66666667, + "logits/rejected": -45903715.55555555, + "logps/chosen": -454.5413818359375, + "logps/rejected": -465.64171006944446, + "loss": 0.1194, + "rewards/chosen": 3.133005142211914, + "rewards/margins": 7.996957355075413, + "rewards/rejected": -4.863952212863499, + "step": 90 + }, + { + "epoch": 0.02276992368322282, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54561536.0, + "logits/rejected": -43982166.85714286, + "logps/chosen": -463.916015625, + "logps/rejected": -568.4403599330357, + "loss": 0.065, + "rewards/chosen": 3.3763851165771483, + "rewards/margins": 9.043676485334124, + "rewards/rejected": -5.667291368756976, + "step": 91 + }, + { + "epoch": 0.023020142624796697, + "grad_norm": 29.75, + "kl": 3.878533363342285, + "learning_rate": 5e-06, + "logits/chosen": -78604160.0, + "logits/rejected": -49953117.86666667, + "logps/chosen": -414.2492947048611, + "logps/rejected": -565.402734375, + "loss": 0.1407, + "rewards/chosen": 3.157175064086914, + "rewards/margins": 7.866324742635091, + "rewards/rejected": -4.709149678548177, + "step": 92 + }, + { + "epoch": 0.023270361566370575, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -115785036.8, + "logits/rejected": -50407865.2631579, + "logps/chosen": -614.140771484375, + "logps/rejected": -535.7399773848684, + "loss": 0.1498, + "rewards/chosen": 5.492605209350586, + "rewards/margins": 10.425421564202558, + "rewards/rejected": -4.9328163548519735, + "step": 93 + }, + { + "epoch": 0.023520580507944453, + "grad_norm": 18.0, + "kl": 2.0050878524780273, + "learning_rate": 5e-06, + "logits/chosen": -63961902.54545455, + "logits/rejected": -33704019.692307696, + "logps/chosen": -452.275390625, + "logps/rejected": -492.4416691706731, + "loss": 0.1261, + "rewards/chosen": 3.503257404674183, + "rewards/margins": 9.732444416392934, + "rewards/rejected": -6.22918701171875, + "step": 94 + }, + { + "epoch": 0.023770799449518328, + "grad_norm": 30.125, + "kl": 0.7020899653434753, + "learning_rate": 5e-06, + "logits/chosen": -72260728.8888889, + "logits/rejected": -33154007.466666665, + "logps/chosen": -533.3898654513889, + "logps/rejected": -419.801171875, + "loss": 0.1758, + "rewards/chosen": 3.7349586486816406, + "rewards/margins": 7.236947886149089, + "rewards/rejected": -3.501989237467448, + "step": 95 + }, + { + "epoch": 0.024021018391092205, + "grad_norm": 33.25, + "kl": 0.9786033630371094, + "learning_rate": 5e-06, + "logits/chosen": -65529914.18181818, + "logits/rejected": -19485442.46153846, + "logps/chosen": -422.83536044034093, + "logps/rejected": -591.4449368990385, + "loss": 0.1803, + "rewards/chosen": 1.8292180841619319, + "rewards/margins": 8.98130712975989, + "rewards/rejected": -7.152089045597957, + "step": 96 + }, + { + "epoch": 0.024271237332666083, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64227723.63636363, + "logits/rejected": -39207089.23076923, + "logps/chosen": -403.1671697443182, + "logps/rejected": -562.7894381009615, + "loss": 0.095, + "rewards/chosen": 2.459473870017312, + "rewards/margins": 10.224654604504993, + "rewards/rejected": -7.76518073448768, + "step": 97 + }, + { + "epoch": 0.02452145627423996, + "grad_norm": 29.375, + "kl": 1.1252658367156982, + "learning_rate": 5e-06, + "logits/chosen": -44272933.333333336, + "logits/rejected": -40071653.333333336, + "logps/chosen": -422.0517985026042, + "logps/rejected": -501.0906982421875, + "loss": 0.133, + "rewards/chosen": 2.880974769592285, + "rewards/margins": 8.332698504130047, + "rewards/rejected": -5.451723734537761, + "step": 98 + }, + { + "epoch": 0.024771675215813835, + "grad_norm": 29.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71466904.61538461, + "logits/rejected": -54177931.63636363, + "logps/chosen": -490.6721379206731, + "logps/rejected": -404.0007990056818, + "loss": 0.1428, + "rewards/chosen": 3.21109859759991, + "rewards/margins": 8.094061604746571, + "rewards/rejected": -4.882963007146662, + "step": 99 + }, + { + "epoch": 0.025021894157387713, + "grad_norm": 28.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61428824.0, + "logits/rejected": -60440752.0, + "logps/chosen": -516.803955078125, + "logps/rejected": -664.438720703125, + "loss": 0.1351, + "rewards/chosen": 2.4329707622528076, + "rewards/margins": 10.272232294082642, + "rewards/rejected": -7.839261531829834, + "step": 100 + }, + { + "epoch": 0.02527211309896159, + "grad_norm": 15.5, + "kl": 2.8989791870117188, + "learning_rate": 5e-06, + "logits/chosen": -56807532.307692304, + "logits/rejected": -33420832.0, + "logps/chosen": -545.9090670072115, + "logps/rejected": -378.61714311079544, + "loss": 0.109, + "rewards/chosen": 3.6259260911207933, + "rewards/margins": 7.566817356989934, + "rewards/rejected": -3.9408912658691406, + "step": 101 + }, + { + "epoch": 0.02552233204053547, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52177293.71428572, + "logits/rejected": -83865396.70588236, + "logps/chosen": -437.81612723214283, + "logps/rejected": -750.4253216911765, + "loss": 0.0519, + "rewards/chosen": 4.108769825526646, + "rewards/margins": 12.032810675997695, + "rewards/rejected": -7.924040850471048, + "step": 102 + }, + { + "epoch": 0.025772550982109347, + "grad_norm": 24.5, + "kl": 2.3272581100463867, + "learning_rate": 5e-06, + "logits/chosen": -27030333.09090909, + "logits/rejected": -46858318.76923077, + "logps/chosen": -335.79365678267044, + "logps/rejected": -514.9300255408654, + "loss": 0.1732, + "rewards/chosen": 2.424674294211648, + "rewards/margins": 7.565785227955638, + "rewards/rejected": -5.14111093374399, + "step": 103 + }, + { + "epoch": 0.02602276992368322, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50505687.27272727, + "logits/rejected": -51117435.07692308, + "logps/chosen": -499.69588955965907, + "logps/rejected": -542.9323918269231, + "loss": 0.0748, + "rewards/chosen": 4.681109341708097, + "rewards/margins": 11.172604914311762, + "rewards/rejected": -6.491495572603666, + "step": 104 + }, + { + "epoch": 0.0262729888652571, + "grad_norm": 21.375, + "kl": 5.133856296539307, + "learning_rate": 5e-06, + "logits/chosen": 14268272.0, + "logits/rejected": -63890316.8, + "logps/chosen": -495.60836356026783, + "logps/rejected": -505.9212890625, + "loss": 0.1042, + "rewards/chosen": 3.7813717978341237, + "rewards/margins": 9.263994325910296, + "rewards/rejected": -5.482622528076172, + "step": 105 + }, + { + "epoch": 0.026523207806830977, + "grad_norm": 20.0, + "kl": 3.6473140716552734, + "learning_rate": 5e-06, + "logits/chosen": -54861888.0, + "logits/rejected": -48034137.6, + "logps/chosen": -366.1411830357143, + "logps/rejected": -525.676171875, + "loss": 0.1435, + "rewards/chosen": 3.1206065586635043, + "rewards/margins": 6.874971335274832, + "rewards/rejected": -3.754364776611328, + "step": 106 + }, + { + "epoch": 0.026773426748404855, + "grad_norm": 28.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72654528.0, + "logits/rejected": -55476534.85714286, + "logps/chosen": -372.630517578125, + "logps/rejected": -451.62901088169644, + "loss": 0.1135, + "rewards/chosen": 1.8628992080688476, + "rewards/margins": 6.851275280543735, + "rewards/rejected": -4.988376072474888, + "step": 107 + }, + { + "epoch": 0.027023645689978733, + "grad_norm": 30.25, + "kl": 7.377291679382324, + "learning_rate": 5e-06, + "logits/chosen": -60453529.6, + "logits/rejected": -3780721.3333333335, + "logps/chosen": -439.65693359375, + "logps/rejected": -445.7779134114583, + "loss": 0.1769, + "rewards/chosen": 3.740906270345052, + "rewards/margins": 7.646404774983724, + "rewards/rejected": -3.905498504638672, + "step": 108 + }, + { + "epoch": 0.027273864631552607, + "grad_norm": 23.125, + "kl": 7.282306671142578, + "learning_rate": 5e-06, + "logits/chosen": -33562976.0, + "logits/rejected": -81358826.66666667, + "logps/chosen": -396.25547960069446, + "logps/rejected": -531.6099039713541, + "loss": 0.1514, + "rewards/chosen": 3.3797940148247614, + "rewards/margins": 8.526789559258354, + "rewards/rejected": -5.146995544433594, + "step": 109 + }, + { + "epoch": 0.027524083573126485, + "grad_norm": 18.5, + "kl": 0.6033732295036316, + "learning_rate": 5e-06, + "logits/chosen": -11015164.666666666, + "logits/rejected": -43291568.0, + "logps/chosen": -376.6101481119792, + "logps/rejected": -580.6302897135416, + "loss": 0.116, + "rewards/chosen": 4.113841374715169, + "rewards/margins": 10.078737258911133, + "rewards/rejected": -5.964895884195964, + "step": 110 + }, + { + "epoch": 0.027774302514700363, + "grad_norm": 19.375, + "kl": 8.607128143310547, + "learning_rate": 5e-06, + "logits/chosen": -80056891.07692307, + "logits/rejected": -8103525.818181818, + "logps/chosen": -391.80014272836536, + "logps/rejected": -500.23606178977275, + "loss": 0.1769, + "rewards/chosen": 3.5198141244741588, + "rewards/margins": 6.782989395248307, + "rewards/rejected": -3.263175270774148, + "step": 111 + }, + { + "epoch": 0.02802452145627424, + "grad_norm": 23.25, + "kl": 6.622926712036133, + "learning_rate": 5e-06, + "logits/chosen": -68026336.0, + "logits/rejected": -53684298.666666664, + "logps/chosen": -458.5303141276042, + "logps/rejected": -464.957763671875, + "loss": 0.1257, + "rewards/chosen": 3.841912269592285, + "rewards/margins": 7.030071258544922, + "rewards/rejected": -3.1881589889526367, + "step": 112 + }, + { + "epoch": 0.02827474039784812, + "grad_norm": 13.5625, + "kl": 2.5748486518859863, + "learning_rate": 5e-06, + "logits/chosen": -50560486.4, + "logits/rejected": -22140749.714285713, + "logps/chosen": -516.96865234375, + "logps/rejected": -348.55751255580356, + "loss": 0.0899, + "rewards/chosen": 5.615151977539062, + "rewards/margins": 9.130515943254743, + "rewards/rejected": -3.515363965715681, + "step": 113 + }, + { + "epoch": 0.028524959339421993, + "grad_norm": 22.25, + "kl": 7.2938947677612305, + "learning_rate": 5e-06, + "logits/chosen": -36971982.76923077, + "logits/rejected": -34339246.54545455, + "logps/chosen": -370.43306790865387, + "logps/rejected": -484.18075284090907, + "loss": 0.1932, + "rewards/chosen": 3.307170867919922, + "rewards/margins": 7.3687522194602275, + "rewards/rejected": -4.061581351540306, + "step": 114 + }, + { + "epoch": 0.02877517828099587, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70304418.9090909, + "logits/rejected": -68940150.15384616, + "logps/chosen": -339.2457386363636, + "logps/rejected": -573.2331730769231, + "loss": 0.1163, + "rewards/chosen": 3.3149663751775567, + "rewards/margins": 8.222295054188976, + "rewards/rejected": -4.907328679011418, + "step": 115 + }, + { + "epoch": 0.02902539722256975, + "grad_norm": 16.5, + "kl": 5.182840347290039, + "learning_rate": 5e-06, + "logits/chosen": -32940020.363636363, + "logits/rejected": -31526279.384615384, + "logps/chosen": -338.5106756036932, + "logps/rejected": -444.4393780048077, + "loss": 0.1589, + "rewards/chosen": 3.36935112693093, + "rewards/margins": 7.280587523133605, + "rewards/rejected": -3.9112363962026744, + "step": 116 + }, + { + "epoch": 0.029275616164143627, + "grad_norm": 84.5, + "kl": 2.374370574951172, + "learning_rate": 5e-06, + "logits/chosen": -41273824.0, + "logits/rejected": -9672752.0, + "logps/chosen": -451.16943359375, + "logps/rejected": -619.67578125, + "loss": 0.114, + "rewards/chosen": 3.3866065979003905, + "rewards/margins": 8.341240583147322, + "rewards/rejected": -4.9546339852469305, + "step": 117 + }, + { + "epoch": 0.029525835105717504, + "grad_norm": 14.6875, + "kl": 2.604111433029175, + "learning_rate": 5e-06, + "logits/chosen": -53653376.0, + "logits/rejected": -59455158.15384615, + "logps/chosen": -418.7755681818182, + "logps/rejected": -505.4073016826923, + "loss": 0.082, + "rewards/chosen": 4.632992137562145, + "rewards/margins": 9.54545422534009, + "rewards/rejected": -4.912462087777945, + "step": 118 + }, + { + "epoch": 0.02977605404729138, + "grad_norm": 13.5, + "kl": 0.20227432250976562, + "learning_rate": 5e-06, + "logits/chosen": -32224620.0, + "logits/rejected": -41155708.0, + "logps/chosen": -416.0848083496094, + "logps/rejected": -377.306396484375, + "loss": 0.0932, + "rewards/chosen": 3.985408306121826, + "rewards/margins": 8.498955726623535, + "rewards/rejected": -4.513547420501709, + "step": 119 + }, + { + "epoch": 0.030026272988865257, + "grad_norm": 11.0625, + "kl": 0.35961565375328064, + "learning_rate": 5e-06, + "logits/chosen": -52238199.46666667, + "logits/rejected": -55250368.0, + "logps/chosen": -430.0063151041667, + "logps/rejected": -558.1921657986111, + "loss": 0.0457, + "rewards/chosen": 4.434163411458333, + "rewards/margins": 10.700937059190537, + "rewards/rejected": -6.2667736477322045, + "step": 120 + }, + { + "epoch": 0.030276491930439135, + "grad_norm": 22.875, + "kl": 10.833108901977539, + "learning_rate": 5e-06, + "logits/chosen": -55598536.0, + "logits/rejected": -36607028.0, + "logps/chosen": -454.464111328125, + "logps/rejected": -471.4989318847656, + "loss": 0.0801, + "rewards/chosen": 5.157410621643066, + "rewards/margins": 10.901457786560059, + "rewards/rejected": -5.744047164916992, + "step": 121 + }, + { + "epoch": 0.030526710872013012, + "grad_norm": 17.0, + "kl": 7.9545464515686035, + "learning_rate": 5e-06, + "logits/chosen": -49838222.222222224, + "logits/rejected": -50103086.93333333, + "logps/chosen": -415.40961371527777, + "logps/rejected": -517.4078776041666, + "loss": 0.0859, + "rewards/chosen": 5.3493804931640625, + "rewards/margins": 9.173910522460938, + "rewards/rejected": -3.824530029296875, + "step": 122 + }, + { + "epoch": 0.030776929813586887, + "grad_norm": 20.625, + "kl": 1.4038188457489014, + "learning_rate": 5e-06, + "logits/chosen": -40942124.307692304, + "logits/rejected": -46390813.09090909, + "logps/chosen": -450.25863882211536, + "logps/rejected": -705.0929509943181, + "loss": 0.0696, + "rewards/chosen": 4.544291276198167, + "rewards/margins": 12.617362842693197, + "rewards/rejected": -8.07307156649503, + "step": 123 + }, + { + "epoch": 0.031027148755160765, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39217398.15384615, + "logits/rejected": -54239610.18181818, + "logps/chosen": -331.6940354567308, + "logps/rejected": -562.7880415482955, + "loss": 0.1387, + "rewards/chosen": 3.295840336726262, + "rewards/margins": 9.500584155529529, + "rewards/rejected": -6.204743818803267, + "step": 124 + }, + { + "epoch": 0.03127736769673464, + "grad_norm": 20.125, + "kl": 8.88840103149414, + "learning_rate": 5e-06, + "logits/chosen": -75248842.66666667, + "logits/rejected": -27488213.333333332, + "logps/chosen": -512.0596516927084, + "logps/rejected": -431.65234375, + "loss": 0.1034, + "rewards/chosen": 6.4001725514729815, + "rewards/margins": 10.636566162109375, + "rewards/rejected": -4.2363936106363935, + "step": 125 + }, + { + "epoch": 0.03152758663830852, + "grad_norm": 10.4375, + "kl": 0.7614803314208984, + "learning_rate": 5e-06, + "logits/chosen": -52280902.4, + "logits/rejected": -44913298.28571428, + "logps/chosen": -436.962109375, + "logps/rejected": -555.4670061383929, + "loss": 0.062, + "rewards/chosen": 5.041194152832031, + "rewards/margins": 10.586859348842076, + "rewards/rejected": -5.545665196010044, + "step": 126 + }, + { + "epoch": 0.0317778055798824, + "grad_norm": 22.25, + "kl": 10.628379821777344, + "learning_rate": 5e-06, + "logits/chosen": -64422680.615384616, + "logits/rejected": -74492311.27272727, + "logps/chosen": -516.8713942307693, + "logps/rejected": -438.9396306818182, + "loss": 0.1222, + "rewards/chosen": 5.40548588679387, + "rewards/margins": 9.113758994149162, + "rewards/rejected": -3.708273107355291, + "step": 127 + }, + { + "epoch": 0.032028024521456276, + "grad_norm": 20.875, + "kl": 16.492826461791992, + "learning_rate": 5e-06, + "logits/chosen": -96028784.0, + "logits/rejected": -46677208.0, + "logps/chosen": -388.1448059082031, + "logps/rejected": -429.96466064453125, + "loss": 0.1979, + "rewards/chosen": 4.420779228210449, + "rewards/margins": 9.198101997375488, + "rewards/rejected": -4.777322769165039, + "step": 128 + }, + { + "epoch": 0.032278243463030154, + "grad_norm": 20.125, + "kl": 2.5171051025390625, + "learning_rate": 5e-06, + "logits/chosen": -78527726.54545455, + "logits/rejected": -58059150.76923077, + "logps/chosen": -589.0059925426136, + "logps/rejected": -637.1475360576923, + "loss": 0.0816, + "rewards/chosen": 6.104273015802557, + "rewards/margins": 11.774602183095226, + "rewards/rejected": -5.670329167292668, + "step": 129 + }, + { + "epoch": 0.03252846240460403, + "grad_norm": 24.75, + "kl": 6.977313041687012, + "learning_rate": 5e-06, + "logits/chosen": -45330215.384615384, + "logits/rejected": -52318446.54545455, + "logps/chosen": -332.63683143028845, + "logps/rejected": -449.8136541193182, + "loss": 0.1588, + "rewards/chosen": 3.3289369436410756, + "rewards/margins": 8.708844031487311, + "rewards/rejected": -5.379907087846235, + "step": 130 + }, + { + "epoch": 0.0327786813461779, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23178550.4, + "logits/rejected": -45325394.28571428, + "logps/chosen": -428.61552734375, + "logps/rejected": -615.2171456473214, + "loss": 0.0438, + "rewards/chosen": 4.332117080688477, + "rewards/margins": 10.99391692025321, + "rewards/rejected": -6.661799839564732, + "step": 131 + }, + { + "epoch": 0.03302890028775178, + "grad_norm": 19.25, + "kl": 8.29294204711914, + "learning_rate": 5e-06, + "logits/chosen": -46093674.666666664, + "logits/rejected": -22097666.666666668, + "logps/chosen": -460.8732096354167, + "logps/rejected": -280.8202311197917, + "loss": 0.1448, + "rewards/chosen": 4.674725850423177, + "rewards/margins": 6.884338537851969, + "rewards/rejected": -2.2096126874287925, + "step": 132 + }, + { + "epoch": 0.03327911922932566, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63178402.90909091, + "logits/rejected": -24557321.846153848, + "logps/chosen": -361.7850452769886, + "logps/rejected": -507.1363055889423, + "loss": 0.0875, + "rewards/chosen": 3.1080398559570312, + "rewards/margins": 9.310384310208834, + "rewards/rejected": -6.2023444542518025, + "step": 133 + }, + { + "epoch": 0.033529338170899536, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69576857.6, + "logits/rejected": -41444192.0, + "logps/chosen": -365.8089599609375, + "logps/rejected": -486.6082240513393, + "loss": 0.0939, + "rewards/chosen": 3.5887569427490233, + "rewards/margins": 8.99369032723563, + "rewards/rejected": -5.404933384486607, + "step": 134 + }, + { + "epoch": 0.033779557112473414, + "grad_norm": 24.125, + "kl": 6.132481575012207, + "learning_rate": 5e-06, + "logits/chosen": -34672068.571428575, + "logits/rejected": -64537472.0, + "logps/chosen": -431.24354771205356, + "logps/rejected": -673.63056640625, + "loss": 0.1251, + "rewards/chosen": 3.8119286128452847, + "rewards/margins": 11.959659630911691, + "rewards/rejected": -8.147731018066406, + "step": 135 + }, + { + "epoch": 0.03402977605404729, + "grad_norm": 23.125, + "kl": 15.4842529296875, + "learning_rate": 5e-06, + "logits/chosen": -100979224.0, + "logits/rejected": -17410722.0, + "logps/chosen": -524.205322265625, + "logps/rejected": -326.41387939453125, + "loss": 0.1419, + "rewards/chosen": 5.6238112449646, + "rewards/margins": 10.311150550842285, + "rewards/rejected": -4.6873393058776855, + "step": 136 + }, + { + "epoch": 0.03427999499562117, + "grad_norm": 28.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30986018.46153846, + "logits/rejected": -26399616.0, + "logps/chosen": -356.25304236778845, + "logps/rejected": -426.40380859375, + "loss": 0.1682, + "rewards/chosen": 2.9559669494628906, + "rewards/margins": 7.164258089932528, + "rewards/rejected": -4.208291140469638, + "step": 137 + }, + { + "epoch": 0.03453021393719505, + "grad_norm": 23.75, + "kl": 10.649872779846191, + "learning_rate": 5e-06, + "logits/chosen": -74135760.0, + "logits/rejected": -38775632.0, + "logps/chosen": -489.68231201171875, + "logps/rejected": -550.9609375, + "loss": 0.0558, + "rewards/chosen": 4.875946521759033, + "rewards/margins": 9.918260097503662, + "rewards/rejected": -5.042313575744629, + "step": 138 + }, + { + "epoch": 0.034780432878768926, + "grad_norm": 24.875, + "kl": 4.290389060974121, + "learning_rate": 5e-06, + "logits/chosen": -43735931.428571425, + "logits/rejected": -33822678.4, + "logps/chosen": -338.83150809151783, + "logps/rejected": -483.46162109375, + "loss": 0.1309, + "rewards/chosen": 3.1358509063720703, + "rewards/margins": 9.684260940551757, + "rewards/rejected": -6.548410034179687, + "step": 139 + }, + { + "epoch": 0.0350306518203428, + "grad_norm": 18.75, + "kl": 5.464059829711914, + "learning_rate": 5e-06, + "logits/chosen": -59807507.692307696, + "logits/rejected": -62122222.54545455, + "logps/chosen": -364.8303786057692, + "logps/rejected": -498.06516335227275, + "loss": 0.1168, + "rewards/chosen": 3.21218255849985, + "rewards/margins": 8.490869775518672, + "rewards/rejected": -5.278687217018821, + "step": 140 + }, + { + "epoch": 0.035280870761916674, + "grad_norm": 25.75, + "kl": 5.874354362487793, + "learning_rate": 5e-06, + "logits/chosen": -80815156.70588236, + "logits/rejected": -99578194.28571428, + "logps/chosen": -403.36790556066177, + "logps/rejected": -484.09176199776783, + "loss": 0.1625, + "rewards/chosen": 4.2788184670841, + "rewards/margins": 10.05712951531931, + "rewards/rejected": -5.778311048235212, + "step": 141 + }, + { + "epoch": 0.03553108970349055, + "grad_norm": 23.0, + "kl": 1.0059306621551514, + "learning_rate": 5e-06, + "logits/chosen": -50342840.0, + "logits/rejected": -55544748.0, + "logps/chosen": -416.4798583984375, + "logps/rejected": -632.0045166015625, + "loss": 0.1058, + "rewards/chosen": 3.6286752223968506, + "rewards/margins": 8.574209451675415, + "rewards/rejected": -4.9455342292785645, + "step": 142 + }, + { + "epoch": 0.03578130864506443, + "grad_norm": 21.75, + "kl": 8.205643653869629, + "learning_rate": 5e-06, + "logits/chosen": -59896755.2, + "logits/rejected": -28854793.14285714, + "logps/chosen": -527.664453125, + "logps/rejected": -552.7942592075893, + "loss": 0.0741, + "rewards/chosen": 5.277662658691407, + "rewards/margins": 10.381992994035993, + "rewards/rejected": -5.104330335344587, + "step": 143 + }, + { + "epoch": 0.03603152758663831, + "grad_norm": 21.25, + "kl": 0.39021429419517517, + "learning_rate": 5e-06, + "logits/chosen": -54364202.666666664, + "logits/rejected": -43710101.333333336, + "logps/chosen": -355.3279215494792, + "logps/rejected": -591.236328125, + "loss": 0.1382, + "rewards/chosen": 3.4100462595621743, + "rewards/margins": 9.961926142374674, + "rewards/rejected": -6.5518798828125, + "step": 144 + }, + { + "epoch": 0.036281746528212186, + "grad_norm": 19.625, + "kl": 3.4919161796569824, + "learning_rate": 5e-06, + "logits/chosen": -44140093.09090909, + "logits/rejected": -60244603.07692308, + "logps/chosen": -431.66645951704544, + "logps/rejected": -479.35659555288464, + "loss": 0.0717, + "rewards/chosen": 6.132303411310369, + "rewards/margins": 10.587619514732094, + "rewards/rejected": -4.455316103421724, + "step": 145 + }, + { + "epoch": 0.036531965469786064, + "grad_norm": 28.125, + "kl": 9.083163261413574, + "learning_rate": 5e-06, + "logits/chosen": -27821803.42857143, + "logits/rejected": -15193225.6, + "logps/chosen": -330.38779994419644, + "logps/rejected": -567.6158203125, + "loss": 0.2525, + "rewards/chosen": 4.288521902901786, + "rewards/margins": 6.976574461800711, + "rewards/rejected": -2.6880525588989257, + "step": 146 + }, + { + "epoch": 0.03678218441135994, + "grad_norm": 16.625, + "kl": 1.34625244140625, + "learning_rate": 5e-06, + "logits/chosen": -55421824.0, + "logits/rejected": -7098027.636363637, + "logps/chosen": -323.37161959134613, + "logps/rejected": -678.7225230823864, + "loss": 0.0908, + "rewards/chosen": 3.7793003962590146, + "rewards/margins": 8.31683389623682, + "rewards/rejected": -4.537533499977806, + "step": 147 + }, + { + "epoch": 0.03703240335293382, + "grad_norm": 20.625, + "kl": 6.829098701477051, + "learning_rate": 5e-06, + "logits/chosen": -38491163.07692308, + "logits/rejected": -31447584.0, + "logps/chosen": -433.2761793870192, + "logps/rejected": -421.71799538352275, + "loss": 0.1703, + "rewards/chosen": 4.942574134239783, + "rewards/margins": 8.83587601134827, + "rewards/rejected": -3.8933018771084873, + "step": 148 + }, + { + "epoch": 0.0372826222945077, + "grad_norm": 20.0, + "kl": 3.9278724193573, + "learning_rate": 5e-06, + "logits/chosen": -52982498.461538464, + "logits/rejected": -51161460.36363637, + "logps/chosen": -428.42202524038464, + "logps/rejected": -591.8447265625, + "loss": 0.0763, + "rewards/chosen": 4.8494253892164965, + "rewards/margins": 10.34067967554906, + "rewards/rejected": -5.4912542863325635, + "step": 149 + }, + { + "epoch": 0.03753284123608157, + "grad_norm": 19.375, + "kl": 6.239824295043945, + "learning_rate": 5e-06, + "logits/chosen": -48351232.0, + "logits/rejected": -20709194.0, + "logps/chosen": -283.2247314453125, + "logps/rejected": -323.7087097167969, + "loss": 0.1278, + "rewards/chosen": 4.198636054992676, + "rewards/margins": 7.8641743659973145, + "rewards/rejected": -3.6655383110046387, + "step": 150 + }, + { + "epoch": 0.037783060177655446, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54331776.0, + "logits/rejected": 35268820.571428575, + "logps/chosen": -556.68505859375, + "logps/rejected": -581.5312848772321, + "loss": 0.0675, + "rewards/chosen": 6.455515289306641, + "rewards/margins": 11.3932131086077, + "rewards/rejected": -4.93769781930106, + "step": 151 + }, + { + "epoch": 0.038033279119229324, + "grad_norm": 14.375, + "kl": 4.6379618644714355, + "learning_rate": 5e-06, + "logits/chosen": -44820281.6, + "logits/rejected": -57210107.428571425, + "logps/chosen": -297.274853515625, + "logps/rejected": -502.84061104910717, + "loss": 0.075, + "rewards/chosen": 3.327361297607422, + "rewards/margins": 8.49720960344587, + "rewards/rejected": -5.169848305838449, + "step": 152 + }, + { + "epoch": 0.0382834980608032, + "grad_norm": 22.625, + "kl": 1.0584895610809326, + "learning_rate": 5e-06, + "logits/chosen": -81029024.0, + "logits/rejected": -52466168.0, + "logps/chosen": -656.5376586914062, + "logps/rejected": -481.95355224609375, + "loss": 0.0681, + "rewards/chosen": 6.785458087921143, + "rewards/margins": 12.110761165618896, + "rewards/rejected": -5.325303077697754, + "step": 153 + }, + { + "epoch": 0.03853371700237708, + "grad_norm": 27.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49451008.0, + "logits/rejected": -97331114.66666667, + "logps/chosen": -333.88014729817706, + "logps/rejected": -486.0155029296875, + "loss": 0.1643, + "rewards/chosen": 3.8546034495035806, + "rewards/margins": 8.570431391398111, + "rewards/rejected": -4.715827941894531, + "step": 154 + }, + { + "epoch": 0.03878393594395096, + "grad_norm": 19.0, + "kl": 8.156567573547363, + "learning_rate": 5e-06, + "logits/chosen": -40026600.0, + "logits/rejected": -12795388.0, + "logps/chosen": -357.08233642578125, + "logps/rejected": -397.34918212890625, + "loss": 0.1987, + "rewards/chosen": 3.712439775466919, + "rewards/margins": 7.8179771900177, + "rewards/rejected": -4.105537414550781, + "step": 155 + }, + { + "epoch": 0.039034154885524835, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18985547.42857143, + "logits/rejected": -4583904.0, + "logps/chosen": -363.89854213169644, + "logps/rejected": -481.7215360753676, + "loss": 0.126, + "rewards/chosen": 4.135683332170759, + "rewards/margins": 8.639876806435463, + "rewards/rejected": -4.504193474264706, + "step": 156 + }, + { + "epoch": 0.03928437382709871, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40897821.86666667, + "logits/rejected": -43241447.11111111, + "logps/chosen": -368.06103515625, + "logps/rejected": -526.2424045138889, + "loss": 0.1085, + "rewards/chosen": 4.275669860839844, + "rewards/margins": 10.76724616156684, + "rewards/rejected": -6.491576300726996, + "step": 157 + }, + { + "epoch": 0.03953459276867259, + "grad_norm": 16.5, + "kl": 0.36798352003097534, + "learning_rate": 5e-06, + "logits/chosen": -57061719.27272727, + "logits/rejected": -39024620.307692304, + "logps/chosen": -391.3818359375, + "logps/rejected": -388.5891676682692, + "loss": 0.0616, + "rewards/chosen": 4.3716558976606885, + "rewards/margins": 9.17924667571808, + "rewards/rejected": -4.807590778057392, + "step": 158 + }, + { + "epoch": 0.03978481171024647, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28181875.2, + "logits/rejected": -31613298.285714287, + "logps/chosen": -428.174462890625, + "logps/rejected": -416.0806361607143, + "loss": 0.0532, + "rewards/chosen": 4.767366790771485, + "rewards/margins": 10.949200330461775, + "rewards/rejected": -6.18183353969029, + "step": 159 + }, + { + "epoch": 0.04003503065182034, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -97138384.0, + "logits/rejected": -46946064.0, + "logps/chosen": -571.2589111328125, + "logps/rejected": -585.5206298828125, + "loss": 0.0944, + "rewards/chosen": 6.3586201667785645, + "rewards/margins": 11.417516708374023, + "rewards/rejected": -5.058896541595459, + "step": 160 + }, + { + "epoch": 0.04028524959339422, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45920308.36363637, + "logits/rejected": -27952059.076923076, + "logps/chosen": -352.62990500710225, + "logps/rejected": -485.89554537259613, + "loss": 0.1199, + "rewards/chosen": 4.034776167436079, + "rewards/margins": 10.131546820793952, + "rewards/rejected": -6.096770653357873, + "step": 161 + }, + { + "epoch": 0.040535468534968096, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37612216.0, + "logits/rejected": -30060588.0, + "logps/chosen": -300.6482238769531, + "logps/rejected": -473.1891174316406, + "loss": 0.1184, + "rewards/chosen": 3.2378549575805664, + "rewards/margins": 8.858850479125977, + "rewards/rejected": -5.62099552154541, + "step": 162 + }, + { + "epoch": 0.040785687476541974, + "grad_norm": 18.5, + "kl": 4.293770790100098, + "learning_rate": 5e-06, + "logits/chosen": -45873765.333333336, + "logits/rejected": -55967301.333333336, + "logps/chosen": -384.5272623697917, + "logps/rejected": -594.3080647786459, + "loss": 0.1583, + "rewards/chosen": 2.8155123392740884, + "rewards/margins": 8.81442387898763, + "rewards/rejected": -5.998911539713542, + "step": 163 + }, + { + "epoch": 0.04103590641811585, + "grad_norm": 21.5, + "kl": 7.401231288909912, + "learning_rate": 5e-06, + "logits/chosen": -52780744.0, + "logits/rejected": -56251888.0, + "logps/chosen": -586.0512084960938, + "logps/rejected": -395.9722595214844, + "loss": 0.0694, + "rewards/chosen": 5.403119087219238, + "rewards/margins": 11.10162353515625, + "rewards/rejected": -5.698504447937012, + "step": 164 + }, + { + "epoch": 0.04128612535968973, + "grad_norm": 18.75, + "kl": 4.575628280639648, + "learning_rate": 5e-06, + "logits/chosen": -46542098.28571428, + "logits/rejected": -65141612.8, + "logps/chosen": -417.9397670200893, + "logps/rejected": -495.366015625, + "loss": 0.076, + "rewards/chosen": 3.4580459594726562, + "rewards/margins": 9.116856384277344, + "rewards/rejected": -5.658810424804687, + "step": 165 + }, + { + "epoch": 0.04153634430126361, + "grad_norm": 26.625, + "kl": 2.5202600955963135, + "learning_rate": 5e-06, + "logits/chosen": -50662144.0, + "logits/rejected": -33581750.4, + "logps/chosen": -390.7208775111607, + "logps/rejected": -318.3043701171875, + "loss": 0.1234, + "rewards/chosen": 4.517326354980469, + "rewards/margins": 7.158951377868652, + "rewards/rejected": -2.6416250228881837, + "step": 166 + }, + { + "epoch": 0.041786563242837485, + "grad_norm": 24.125, + "kl": 3.483273983001709, + "learning_rate": 5e-06, + "logits/chosen": -59899664.0, + "logits/rejected": -17329224.0, + "logps/chosen": -469.6291097005208, + "logps/rejected": -388.012939453125, + "loss": 0.0926, + "rewards/chosen": 5.202668190002441, + "rewards/margins": 10.415954271952312, + "rewards/rejected": -5.21328608194987, + "step": 167 + }, + { + "epoch": 0.04203678218441136, + "grad_norm": 13.875, + "kl": 0.2973499298095703, + "learning_rate": 5e-06, + "logits/chosen": -71792883.2, + "logits/rejected": -51374930.28571428, + "logps/chosen": -482.91533203125, + "logps/rejected": -594.4489397321429, + "loss": 0.0549, + "rewards/chosen": 4.24849853515625, + "rewards/margins": 11.637181854248047, + "rewards/rejected": -7.388683319091797, + "step": 168 + }, + { + "epoch": 0.042287001125985234, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19547772.0, + "logits/rejected": -43603864.0, + "logps/chosen": -274.3015543619792, + "logps/rejected": -582.5530598958334, + "loss": 0.0915, + "rewards/chosen": 2.4779138565063477, + "rewards/margins": 9.535144488016766, + "rewards/rejected": -7.057230631510417, + "step": 169 + }, + { + "epoch": 0.04253722006755911, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22624244.57142857, + "logits/rejected": -46519928.47058824, + "logps/chosen": -293.7974330357143, + "logps/rejected": -620.6377527573529, + "loss": 0.0856, + "rewards/chosen": 3.239545004708426, + "rewards/margins": 11.214790760969915, + "rewards/rejected": -7.975245756261489, + "step": 170 + }, + { + "epoch": 0.04278743900913299, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47868133.81818182, + "logits/rejected": 57006483.692307696, + "logps/chosen": -385.4287109375, + "logps/rejected": -448.4245793269231, + "loss": 0.1033, + "rewards/chosen": 3.897959275679155, + "rewards/margins": 10.785419837578193, + "rewards/rejected": -6.887460561899038, + "step": 171 + }, + { + "epoch": 0.04303765795070687, + "grad_norm": 14.6875, + "kl": 0.7472852468490601, + "learning_rate": 5e-06, + "logits/chosen": -40471397.333333336, + "logits/rejected": -77715264.0, + "logps/chosen": -422.4617513020833, + "logps/rejected": -555.0777994791666, + "loss": 0.0723, + "rewards/chosen": 4.81877326965332, + "rewards/margins": 11.356734593709309, + "rewards/rejected": -6.537961324055989, + "step": 172 + }, + { + "epoch": 0.043287876892280745, + "grad_norm": 15.6875, + "kl": 4.088824272155762, + "learning_rate": 5e-06, + "logits/chosen": -52978038.15384615, + "logits/rejected": 90874722.9090909, + "logps/chosen": -365.61767578125, + "logps/rejected": -551.6926491477273, + "loss": 0.0937, + "rewards/chosen": 4.703894981971154, + "rewards/margins": 12.280280480018028, + "rewards/rejected": -7.576385498046875, + "step": 173 + }, + { + "epoch": 0.04353809583385462, + "grad_norm": 12.875, + "kl": 1.2338712215423584, + "learning_rate": 5e-06, + "logits/chosen": -32180448.0, + "logits/rejected": -40426395.428571425, + "logps/chosen": -308.8005126953125, + "logps/rejected": -429.81005859375, + "loss": 0.0771, + "rewards/chosen": 3.661548614501953, + "rewards/margins": 9.841023581368582, + "rewards/rejected": -6.179474966866629, + "step": 174 + }, + { + "epoch": 0.0437883147754285, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57538949.81818182, + "logits/rejected": -34008918.15384615, + "logps/chosen": -347.4867498224432, + "logps/rejected": -562.6441556490385, + "loss": 0.0613, + "rewards/chosen": 4.080998854203657, + "rewards/margins": 11.374724674891759, + "rewards/rejected": -7.293725820688101, + "step": 175 + }, + { + "epoch": 0.04403853371700238, + "grad_norm": 18.375, + "kl": 5.854638576507568, + "learning_rate": 5e-06, + "logits/chosen": -47022602.666666664, + "logits/rejected": -48280176.0, + "logps/chosen": -415.4212239583333, + "logps/rejected": -594.5871175130209, + "loss": 0.0913, + "rewards/chosen": 5.363189697265625, + "rewards/margins": 13.202273050944012, + "rewards/rejected": -7.839083353678386, + "step": 176 + }, + { + "epoch": 0.04428875265857626, + "grad_norm": 13.25, + "kl": 1.109082579612732, + "learning_rate": 5e-06, + "logits/chosen": -46307733.333333336, + "logits/rejected": -40083797.333333336, + "logps/chosen": -273.48581949869794, + "logps/rejected": -353.5043131510417, + "loss": 0.1185, + "rewards/chosen": 3.213988939921061, + "rewards/margins": 7.801450411478678, + "rewards/rejected": -4.587461471557617, + "step": 177 + }, + { + "epoch": 0.044538971600150135, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26819675.42857143, + "logits/rejected": -42344256.0, + "logps/chosen": -211.00730678013392, + "logps/rejected": -606.9449103860294, + "loss": 0.109, + "rewards/chosen": 2.6112401144845143, + "rewards/margins": 8.805287032568154, + "rewards/rejected": -6.1940469180836395, + "step": 178 + }, + { + "epoch": 0.044789190541724005, + "grad_norm": 20.25, + "kl": 1.9853570461273193, + "learning_rate": 5e-06, + "logits/chosen": -49304214.4, + "logits/rejected": -75421686.85714285, + "logps/chosen": -552.79306640625, + "logps/rejected": -567.9908621651786, + "loss": 0.0845, + "rewards/chosen": 4.3198188781738285, + "rewards/margins": 10.149826158796039, + "rewards/rejected": -5.83000728062221, + "step": 179 + }, + { + "epoch": 0.04503940948329788, + "grad_norm": 15.375, + "kl": 1.1453670263290405, + "learning_rate": 5e-06, + "logits/chosen": -50428322.461538464, + "logits/rejected": -59052404.36363637, + "logps/chosen": -387.38439002403845, + "logps/rejected": -631.0325816761364, + "loss": 0.0927, + "rewards/chosen": 4.393623938927283, + "rewards/margins": 12.610069168197526, + "rewards/rejected": -8.216445229270242, + "step": 180 + }, + { + "epoch": 0.04528962842487176, + "grad_norm": 31.0, + "kl": 9.287070274353027, + "learning_rate": 5e-06, + "logits/chosen": -72337800.53333333, + "logits/rejected": -54460508.44444445, + "logps/chosen": -430.4421875, + "logps/rejected": -463.251953125, + "loss": 0.1243, + "rewards/chosen": 4.164467112223307, + "rewards/margins": 8.997049374050565, + "rewards/rejected": -4.832582261827257, + "step": 181 + }, + { + "epoch": 0.04553984736644564, + "grad_norm": 26.5, + "kl": 7.495224952697754, + "learning_rate": 5e-06, + "logits/chosen": -68060749.71428572, + "logits/rejected": -37737881.6, + "logps/chosen": -436.6040736607143, + "logps/rejected": -428.13115234375, + "loss": 0.108, + "rewards/chosen": 4.3831939697265625, + "rewards/margins": 10.0071533203125, + "rewards/rejected": -5.623959350585937, + "step": 182 + }, + { + "epoch": 0.04579006630801952, + "grad_norm": 16.375, + "kl": 8.493086814880371, + "learning_rate": 5e-06, + "logits/chosen": -47477799.384615384, + "logits/rejected": -58968791.27272727, + "logps/chosen": -429.14663461538464, + "logps/rejected": -332.81716086647725, + "loss": 0.1017, + "rewards/chosen": 5.500474783090445, + "rewards/margins": 10.639228500686325, + "rewards/rejected": -5.138753717595881, + "step": 183 + }, + { + "epoch": 0.046040285249593395, + "grad_norm": 12.25, + "kl": 4.708995342254639, + "learning_rate": 5e-06, + "logits/chosen": -44649705.14285714, + "logits/rejected": -67075948.8, + "logps/chosen": -523.5865304129464, + "logps/rejected": -622.985791015625, + "loss": 0.0371, + "rewards/chosen": 5.529398236955915, + "rewards/margins": 13.241002546037947, + "rewards/rejected": -7.711604309082031, + "step": 184 + }, + { + "epoch": 0.04629050419116727, + "grad_norm": 17.0, + "kl": 2.1218771934509277, + "learning_rate": 5e-06, + "logits/chosen": -50696034.461538464, + "logits/rejected": -49664046.54545455, + "logps/chosen": -322.49057241586536, + "logps/rejected": -529.3594193892045, + "loss": 0.1227, + "rewards/chosen": 3.6770201462965746, + "rewards/margins": 9.060277925504671, + "rewards/rejected": -5.383257779208097, + "step": 185 + }, + { + "epoch": 0.04654072313274115, + "grad_norm": 11.6875, + "kl": 4.476684093475342, + "learning_rate": 5e-06, + "logits/chosen": -53033125.333333336, + "logits/rejected": -43685322.666666664, + "logps/chosen": -444.3708089192708, + "logps/rejected": -532.5503743489584, + "loss": 0.0768, + "rewards/chosen": 4.921902974446614, + "rewards/margins": 10.608036041259766, + "rewards/rejected": -5.686133066813151, + "step": 186 + }, + { + "epoch": 0.04679094207431503, + "grad_norm": 19.125, + "kl": 2.0418787002563477, + "learning_rate": 5e-06, + "logits/chosen": -50350249.6, + "logits/rejected": -37511499.428571425, + "logps/chosen": -381.4614990234375, + "logps/rejected": -589.4267578125, + "loss": 0.1072, + "rewards/chosen": 4.3655342102050785, + "rewards/margins": 10.5390745980399, + "rewards/rejected": -6.173540387834821, + "step": 187 + }, + { + "epoch": 0.047041161015888906, + "grad_norm": 10.4375, + "kl": 8.362052917480469, + "learning_rate": 5e-06, + "logits/chosen": -56223128.0, + "logits/rejected": -49902480.0, + "logps/chosen": -370.11468505859375, + "logps/rejected": -875.49365234375, + "loss": 0.1507, + "rewards/chosen": 4.879919052124023, + "rewards/margins": 12.756749629974365, + "rewards/rejected": -7.876830577850342, + "step": 188 + }, + { + "epoch": 0.04729137995746278, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34136556.307692304, + "logits/rejected": -44233218.90909091, + "logps/chosen": -375.6189528245192, + "logps/rejected": -477.4869939630682, + "loss": 0.0834, + "rewards/chosen": 4.037762275108924, + "rewards/margins": 8.266784908054593, + "rewards/rejected": -4.229022632945668, + "step": 189 + }, + { + "epoch": 0.047541598899036655, + "grad_norm": 16.375, + "kl": 1.9649031162261963, + "learning_rate": 5e-06, + "logits/chosen": -31002606.769230768, + "logits/rejected": -41109626.18181818, + "logps/chosen": -327.640625, + "logps/rejected": -631.2844460227273, + "loss": 0.1154, + "rewards/chosen": 3.483311286339393, + "rewards/margins": 11.118699747365671, + "rewards/rejected": -7.635388461026278, + "step": 190 + }, + { + "epoch": 0.04779181784061053, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61394141.09090909, + "logits/rejected": -37747611.07692308, + "logps/chosen": -453.96484375, + "logps/rejected": -758.7204777644231, + "loss": 0.0453, + "rewards/chosen": 5.382943933660334, + "rewards/margins": 12.17320659610775, + "rewards/rejected": -6.790262662447416, + "step": 191 + }, + { + "epoch": 0.04804203678218441, + "grad_norm": 9.8125, + "kl": 8.697744369506836, + "learning_rate": 5e-06, + "logits/chosen": -29078030.769230768, + "logits/rejected": -53254888.72727273, + "logps/chosen": -411.91165865384613, + "logps/rejected": -551.6779119318181, + "loss": 0.0816, + "rewards/chosen": 5.3462360088641825, + "rewards/margins": 11.406404882044225, + "rewards/rejected": -6.060168873180043, + "step": 192 + }, + { + "epoch": 0.04829225572375829, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72034519.27272727, + "logits/rejected": -31024893.53846154, + "logps/chosen": -347.4210094105114, + "logps/rejected": -458.4655198317308, + "loss": 0.0981, + "rewards/chosen": 3.841581171209162, + "rewards/margins": 10.548764422223286, + "rewards/rejected": -6.707183251014123, + "step": 193 + }, + { + "epoch": 0.048542474665332166, + "grad_norm": 17.375, + "kl": 3.349236249923706, + "learning_rate": 5e-06, + "logits/chosen": -59783954.28571428, + "logits/rejected": 59085209.6, + "logps/chosen": -360.4351283482143, + "logps/rejected": -575.68037109375, + "loss": 0.0747, + "rewards/chosen": 4.300768171037946, + "rewards/margins": 9.780179868425641, + "rewards/rejected": -5.479411697387695, + "step": 194 + }, + { + "epoch": 0.048792693606906044, + "grad_norm": 17.125, + "kl": 6.444109916687012, + "learning_rate": 5e-06, + "logits/chosen": -32805256.533333335, + "logits/rejected": -63079182.222222224, + "logps/chosen": -363.3408528645833, + "logps/rejected": -466.26019965277777, + "loss": 0.1606, + "rewards/chosen": 3.773480987548828, + "rewards/margins": 9.127889251708984, + "rewards/rejected": -5.354408264160156, + "step": 195 + }, + { + "epoch": 0.04904291254847992, + "grad_norm": 14.625, + "kl": 12.910234451293945, + "learning_rate": 5e-06, + "logits/chosen": -11549870.933333334, + "logits/rejected": -54725589.333333336, + "logps/chosen": -429.72649739583335, + "logps/rejected": -525.4338650173611, + "loss": 0.1118, + "rewards/chosen": 4.961282857259115, + "rewards/margins": 9.972749413384332, + "rewards/rejected": -5.011466556125217, + "step": 196 + }, + { + "epoch": 0.0492931314900538, + "grad_norm": 27.375, + "kl": 4.634339809417725, + "learning_rate": 5e-06, + "logits/chosen": -25323805.866666667, + "logits/rejected": -45791729.777777776, + "logps/chosen": -450.1834309895833, + "logps/rejected": -538.20947265625, + "loss": 0.0775, + "rewards/chosen": 5.164319356282552, + "rewards/margins": 11.035110473632812, + "rewards/rejected": -5.870791117350261, + "step": 197 + }, + { + "epoch": 0.04954335043162767, + "grad_norm": 15.8125, + "kl": 16.27667999267578, + "learning_rate": 5e-06, + "logits/chosen": -59654941.09090909, + "logits/rejected": -59627126.15384615, + "logps/chosen": -378.24147727272725, + "logps/rejected": -449.65981820913464, + "loss": 0.1025, + "rewards/chosen": 5.658745158802379, + "rewards/margins": 9.16282013579682, + "rewards/rejected": -3.504074976994441, + "step": 198 + }, + { + "epoch": 0.04979356937320155, + "grad_norm": 11.9375, + "kl": 3.600116014480591, + "learning_rate": 5e-06, + "logits/chosen": -33377289.6, + "logits/rejected": -31516605.714285713, + "logps/chosen": -322.9662353515625, + "logps/rejected": -415.9325474330357, + "loss": 0.0971, + "rewards/chosen": 4.778628540039063, + "rewards/margins": 9.22843393598284, + "rewards/rejected": -4.449805395943778, + "step": 199 + }, + { + "epoch": 0.05004378831477543, + "grad_norm": 13.125, + "kl": 4.291494369506836, + "learning_rate": 5e-06, + "logits/chosen": -63218432.0, + "logits/rejected": -69397736.0, + "logps/chosen": -384.4539489746094, + "logps/rejected": -540.909423828125, + "loss": 0.0536, + "rewards/chosen": 4.746562957763672, + "rewards/margins": 10.028171062469482, + "rewards/rejected": -5.2816081047058105, + "step": 200 + }, + { + "epoch": 0.050294007256349305, + "grad_norm": 20.125, + "kl": 4.140326499938965, + "learning_rate": 5e-06, + "logits/chosen": -26370669.714285713, + "logits/rejected": -55841625.6, + "logps/chosen": -383.97140066964283, + "logps/rejected": -646.44501953125, + "loss": 0.1091, + "rewards/chosen": 4.063012531825474, + "rewards/margins": 10.766155079432895, + "rewards/rejected": -6.703142547607422, + "step": 201 + }, + { + "epoch": 0.05054422619792318, + "grad_norm": 19.625, + "kl": 3.0051372051239014, + "learning_rate": 5e-06, + "logits/chosen": -124939721.14285715, + "logits/rejected": -46234996.705882356, + "logps/chosen": -421.8641880580357, + "logps/rejected": -554.2309857536765, + "loss": 0.0387, + "rewards/chosen": 5.835538591657366, + "rewards/margins": 11.872191100561318, + "rewards/rejected": -6.036652508903952, + "step": 202 + }, + { + "epoch": 0.05079444513949706, + "grad_norm": 7.28125, + "kl": 7.439305782318115, + "learning_rate": 5e-06, + "logits/chosen": -66490786.461538464, + "logits/rejected": -65816610.90909091, + "logps/chosen": -453.8348858173077, + "logps/rejected": -551.6194513494319, + "loss": 0.0628, + "rewards/chosen": 6.551427401029146, + "rewards/margins": 10.499076229709011, + "rewards/rejected": -3.947648828679865, + "step": 203 + }, + { + "epoch": 0.05104466408107094, + "grad_norm": 20.25, + "kl": 5.730766773223877, + "learning_rate": 5e-06, + "logits/chosen": -72196405.33333333, + "logits/rejected": -51363594.666666664, + "logps/chosen": -440.599365234375, + "logps/rejected": -537.4098307291666, + "loss": 0.095, + "rewards/chosen": 4.975851694742839, + "rewards/margins": 10.486460367838543, + "rewards/rejected": -5.510608673095703, + "step": 204 + }, + { + "epoch": 0.051294883022644816, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70233012.36363636, + "logits/rejected": -45291643.07692308, + "logps/chosen": -519.3923117897727, + "logps/rejected": -454.42078575721155, + "loss": 0.0654, + "rewards/chosen": 5.860253767533735, + "rewards/margins": 11.40235217968067, + "rewards/rejected": -5.542098412146935, + "step": 205 + }, + { + "epoch": 0.051545101964218694, + "grad_norm": 21.375, + "kl": 18.074310302734375, + "learning_rate": 5e-06, + "logits/chosen": -71379493.64705883, + "logits/rejected": -60079908.571428575, + "logps/chosen": -548.6579733455883, + "logps/rejected": -558.7813197544643, + "loss": 0.1183, + "rewards/chosen": 6.103434394387638, + "rewards/margins": 11.75511711184718, + "rewards/rejected": -5.6516827174595425, + "step": 206 + }, + { + "epoch": 0.05179532090579257, + "grad_norm": 25.5, + "kl": 18.960453033447266, + "learning_rate": 5e-06, + "logits/chosen": -70417160.0, + "logits/rejected": -59043764.0, + "logps/chosen": -387.33612060546875, + "logps/rejected": -598.7929077148438, + "loss": 0.1292, + "rewards/chosen": 4.7016496658325195, + "rewards/margins": 11.90509033203125, + "rewards/rejected": -7.2034406661987305, + "step": 207 + }, + { + "epoch": 0.05204553984736644, + "grad_norm": 10.125, + "kl": 0.08770434558391571, + "learning_rate": 5e-06, + "logits/chosen": -53707336.0, + "logits/rejected": -58144924.0, + "logps/chosen": -316.13916015625, + "logps/rejected": -602.5560302734375, + "loss": 0.0665, + "rewards/chosen": 4.102164268493652, + "rewards/margins": 11.597392082214355, + "rewards/rejected": -7.495227813720703, + "step": 208 + }, + { + "epoch": 0.05229575878894032, + "grad_norm": 17.75, + "kl": 5.174756050109863, + "learning_rate": 5e-06, + "logits/chosen": -74477824.0, + "logits/rejected": -45505221.81818182, + "logps/chosen": -460.4802809495192, + "logps/rejected": -369.1199396306818, + "loss": 0.1004, + "rewards/chosen": 5.138316814716045, + "rewards/margins": 9.534788705252268, + "rewards/rejected": -4.396471890536222, + "step": 209 + }, + { + "epoch": 0.0525459777305142, + "grad_norm": 13.8125, + "kl": 0.1292479932308197, + "learning_rate": 5e-06, + "logits/chosen": -69940625.45454545, + "logits/rejected": -42426825.84615385, + "logps/chosen": -371.20749733664775, + "logps/rejected": -638.8646334134615, + "loss": 0.0675, + "rewards/chosen": 4.9522316672585225, + "rewards/margins": 13.068748954292776, + "rewards/rejected": -8.116517287034254, + "step": 210 + }, + { + "epoch": 0.052796196672088076, + "grad_norm": 21.25, + "kl": 6.5166192054748535, + "learning_rate": 5e-06, + "logits/chosen": -70400464.0, + "logits/rejected": -35769348.0, + "logps/chosen": -424.12115478515625, + "logps/rejected": -300.89501953125, + "loss": 0.0704, + "rewards/chosen": 4.73396110534668, + "rewards/margins": 9.076435565948486, + "rewards/rejected": -4.342474460601807, + "step": 211 + }, + { + "epoch": 0.053046415613661954, + "grad_norm": 13.0, + "kl": 4.748558044433594, + "learning_rate": 5e-06, + "logits/chosen": -59821738.666666664, + "logits/rejected": -64486704.0, + "logps/chosen": -477.7695719401042, + "logps/rejected": -744.5475260416666, + "loss": 0.0284, + "rewards/chosen": 6.058443705240886, + "rewards/margins": 14.159394582112629, + "rewards/rejected": -8.100950876871744, + "step": 212 + }, + { + "epoch": 0.05329663455523583, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52458066.28571428, + "logits/rejected": -69845586.8235294, + "logps/chosen": -502.26499720982144, + "logps/rejected": -596.6792279411765, + "loss": 0.0869, + "rewards/chosen": 5.950331551688058, + "rewards/margins": 10.853909468450468, + "rewards/rejected": -4.9035779167624085, + "step": 213 + }, + { + "epoch": 0.05354685349680971, + "grad_norm": 20.875, + "kl": 9.591470718383789, + "learning_rate": 5e-06, + "logits/chosen": -61181298.28571428, + "logits/rejected": -76267212.8, + "logps/chosen": -359.708740234375, + "logps/rejected": -512.524658203125, + "loss": 0.1162, + "rewards/chosen": 4.980873107910156, + "rewards/margins": 11.222312927246094, + "rewards/rejected": -6.2414398193359375, + "step": 214 + }, + { + "epoch": 0.05379707243838359, + "grad_norm": 18.75, + "kl": 2.8346781730651855, + "learning_rate": 5e-06, + "logits/chosen": -58141400.615384616, + "logits/rejected": -33345233.454545453, + "logps/chosen": -434.98084435096155, + "logps/rejected": -382.52738813920456, + "loss": 0.0696, + "rewards/chosen": 4.866139045128455, + "rewards/margins": 8.685153507686161, + "rewards/rejected": -3.819014462557706, + "step": 215 + }, + { + "epoch": 0.054047291379957466, + "grad_norm": 15.8125, + "kl": 10.940942764282227, + "learning_rate": 5e-06, + "logits/chosen": -50631176.0, + "logits/rejected": -55207144.0, + "logps/chosen": -439.8216247558594, + "logps/rejected": -594.8463134765625, + "loss": 0.0943, + "rewards/chosen": 5.669807434082031, + "rewards/margins": 13.193309783935547, + "rewards/rejected": -7.523502349853516, + "step": 216 + }, + { + "epoch": 0.054297510321531336, + "grad_norm": 33.75, + "kl": 3.2548866271972656, + "learning_rate": 5e-06, + "logits/chosen": -53489436.44444445, + "logits/rejected": 4758829.333333333, + "logps/chosen": -395.04747178819446, + "logps/rejected": -472.0696614583333, + "loss": 0.0891, + "rewards/chosen": 4.646226671006945, + "rewards/margins": 8.308933427598742, + "rewards/rejected": -3.662706756591797, + "step": 217 + }, + { + "epoch": 0.054547729263105214, + "grad_norm": 13.6875, + "kl": 4.568079948425293, + "learning_rate": 5e-06, + "logits/chosen": -60876618.666666664, + "logits/rejected": -21642705.333333332, + "logps/chosen": -503.5177408854167, + "logps/rejected": -483.5083414713542, + "loss": 0.0767, + "rewards/chosen": 5.721874872843425, + "rewards/margins": 11.690620422363281, + "rewards/rejected": -5.9687455495198565, + "step": 218 + }, + { + "epoch": 0.05479794820467909, + "grad_norm": 8.9375, + "kl": 5.388765811920166, + "learning_rate": 5e-06, + "logits/chosen": -76596437.33333333, + "logits/rejected": -47909795.55555555, + "logps/chosen": -510.7514973958333, + "logps/rejected": -329.8972981770833, + "loss": 0.0261, + "rewards/chosen": 5.45499267578125, + "rewards/margins": 9.806046125623915, + "rewards/rejected": -4.351053449842665, + "step": 219 + }, + { + "epoch": 0.05504816714625297, + "grad_norm": 23.25, + "kl": 8.13897705078125, + "learning_rate": 5e-06, + "logits/chosen": -103544469.33333333, + "logits/rejected": -30345754.666666668, + "logps/chosen": -445.0954182942708, + "logps/rejected": -572.4197591145834, + "loss": 0.169, + "rewards/chosen": 5.362514495849609, + "rewards/margins": 12.308923085530598, + "rewards/rejected": -6.946408589680989, + "step": 220 + }, + { + "epoch": 0.05529838608782685, + "grad_norm": 16.25, + "kl": 8.565845489501953, + "learning_rate": 5e-06, + "logits/chosen": -35269291.428571425, + "logits/rejected": -24489796.8, + "logps/chosen": -421.0266810825893, + "logps/rejected": -347.6060302734375, + "loss": 0.1262, + "rewards/chosen": 5.468855721609933, + "rewards/margins": 9.556808907645088, + "rewards/rejected": -4.087953186035156, + "step": 221 + }, + { + "epoch": 0.055548605029400726, + "grad_norm": 16.0, + "kl": 3.1567230224609375, + "learning_rate": 5e-06, + "logits/chosen": -38834736.0, + "logits/rejected": -53589360.0, + "logps/chosen": -404.2430826822917, + "logps/rejected": -549.4991861979166, + "loss": 0.0491, + "rewards/chosen": 4.901371320088704, + "rewards/margins": 11.388110796610514, + "rewards/rejected": -6.48673947652181, + "step": 222 + }, + { + "epoch": 0.055798823970974604, + "grad_norm": 14.125, + "kl": 2.0810012817382812, + "learning_rate": 5e-06, + "logits/chosen": -69334592.0, + "logits/rejected": -35340559.058823526, + "logps/chosen": -588.7001255580357, + "logps/rejected": -495.96559053308823, + "loss": 0.0385, + "rewards/chosen": 6.903956821986607, + "rewards/margins": 12.074652856137572, + "rewards/rejected": -5.170696034150965, + "step": 223 + }, + { + "epoch": 0.05604904291254848, + "grad_norm": 22.0, + "kl": 1.7932794094085693, + "learning_rate": 5e-06, + "logits/chosen": -33799616.0, + "logits/rejected": -1172655.3333333333, + "logps/chosen": -399.4767252604167, + "logps/rejected": -460.508544921875, + "loss": 0.104, + "rewards/chosen": 4.64273738861084, + "rewards/margins": 9.231263796488445, + "rewards/rejected": -4.5885264078776045, + "step": 224 + }, + { + "epoch": 0.05629926185412236, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51637317.81818182, + "logits/rejected": -58171603.692307696, + "logps/chosen": -483.3936878551136, + "logps/rejected": -606.2865459735577, + "loss": 0.0551, + "rewards/chosen": 5.742768721147017, + "rewards/margins": 13.433718381228147, + "rewards/rejected": -7.69094966008113, + "step": 225 + }, + { + "epoch": 0.05654948079569624, + "grad_norm": 9.875, + "kl": 3.7677154541015625, + "learning_rate": 5e-06, + "logits/chosen": -53113203.2, + "logits/rejected": -49358720.0, + "logps/chosen": -384.63059895833334, + "logps/rejected": -585.7591145833334, + "loss": 0.0748, + "rewards/chosen": 5.463581339518229, + "rewards/margins": 13.2242184109158, + "rewards/rejected": -7.76063707139757, + "step": 226 + }, + { + "epoch": 0.05679969973727011, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76321258.66666667, + "logits/rejected": -60759840.0, + "logps/chosen": -530.7881673177084, + "logps/rejected": -690.87646484375, + "loss": 0.0407, + "rewards/chosen": 5.5117142995198565, + "rewards/margins": 13.567319234212238, + "rewards/rejected": -8.055604934692383, + "step": 227 + }, + { + "epoch": 0.057049918678843986, + "grad_norm": 9.625, + "kl": 2.9436187744140625, + "learning_rate": 5e-06, + "logits/chosen": -57832976.0, + "logits/rejected": -43035576.0, + "logps/chosen": -475.21234130859375, + "logps/rejected": -455.1776123046875, + "loss": 0.0263, + "rewards/chosen": 5.860957145690918, + "rewards/margins": 10.955077171325684, + "rewards/rejected": -5.094120025634766, + "step": 228 + }, + { + "epoch": 0.057300137620417864, + "grad_norm": 18.875, + "kl": 3.507528066635132, + "learning_rate": 5e-06, + "logits/chosen": -50419136.0, + "logits/rejected": -58293361.23076923, + "logps/chosen": -354.875, + "logps/rejected": -387.4791917067308, + "loss": 0.1223, + "rewards/chosen": 4.062959844415838, + "rewards/margins": 8.998005526882785, + "rewards/rejected": -4.9350456824669475, + "step": 229 + }, + { + "epoch": 0.05755035656199174, + "grad_norm": 16.125, + "kl": 0.5412664413452148, + "learning_rate": 5e-06, + "logits/chosen": -38644117.333333336, + "logits/rejected": -56184810.666666664, + "logps/chosen": -420.4574381510417, + "logps/rejected": -551.7562255859375, + "loss": 0.0768, + "rewards/chosen": 3.9948673248291016, + "rewards/margins": 10.442398707071941, + "rewards/rejected": -6.447531382242839, + "step": 230 + }, + { + "epoch": 0.05780057550356562, + "grad_norm": 16.375, + "kl": 0.26791128516197205, + "learning_rate": 5e-06, + "logits/chosen": -41563904.0, + "logits/rejected": -73339136.0, + "logps/chosen": -442.39808872767856, + "logps/rejected": -640.181005859375, + "loss": 0.0442, + "rewards/chosen": 5.2872499738420755, + "rewards/margins": 11.534498814174107, + "rewards/rejected": -6.247248840332031, + "step": 231 + }, + { + "epoch": 0.0580507944451395, + "grad_norm": 21.5, + "kl": 7.603259086608887, + "learning_rate": 5e-06, + "logits/chosen": -85483227.42857143, + "logits/rejected": -63368768.0, + "logps/chosen": -552.6742466517857, + "logps/rejected": -519.0142578125, + "loss": 0.1034, + "rewards/chosen": 5.500741141183036, + "rewards/margins": 11.000902502877372, + "rewards/rejected": -5.500161361694336, + "step": 232 + }, + { + "epoch": 0.058301013386713375, + "grad_norm": 15.125, + "kl": 6.472053527832031, + "learning_rate": 5e-06, + "logits/chosen": -38552170.666666664, + "logits/rejected": -17897214.666666668, + "logps/chosen": -468.6722819010417, + "logps/rejected": -590.6138916015625, + "loss": 0.0491, + "rewards/chosen": 4.910961151123047, + "rewards/margins": 12.375680923461914, + "rewards/rejected": -7.464719772338867, + "step": 233 + }, + { + "epoch": 0.05855123232828725, + "grad_norm": 15.5625, + "kl": 0.8780374526977539, + "learning_rate": 5e-06, + "logits/chosen": -96147411.2, + "logits/rejected": -43601572.571428575, + "logps/chosen": -609.7056640625, + "logps/rejected": -413.4390345982143, + "loss": 0.0475, + "rewards/chosen": 5.946613311767578, + "rewards/margins": 12.216070992606028, + "rewards/rejected": -6.269457680838449, + "step": 234 + }, + { + "epoch": 0.05880145126986113, + "grad_norm": 16.125, + "kl": 5.695685386657715, + "learning_rate": 5e-06, + "logits/chosen": -93081225.84615384, + "logits/rejected": -49701550.54545455, + "logps/chosen": -468.4971454326923, + "logps/rejected": -421.6673029119318, + "loss": 0.0623, + "rewards/chosen": 4.9567741980919475, + "rewards/margins": 11.436547979608282, + "rewards/rejected": -6.479773781516335, + "step": 235 + }, + { + "epoch": 0.05905167021143501, + "grad_norm": 14.6875, + "kl": 1.8992418050765991, + "learning_rate": 5e-06, + "logits/chosen": -54616021.333333336, + "logits/rejected": -32344803.555555556, + "logps/chosen": -413.20299479166664, + "logps/rejected": -326.95220269097223, + "loss": 0.1424, + "rewards/chosen": 3.883965555826823, + "rewards/margins": 8.674331834581164, + "rewards/rejected": -4.79036627875434, + "step": 236 + }, + { + "epoch": 0.05930188915300888, + "grad_norm": 13.0625, + "kl": 4.48274040222168, + "learning_rate": 5e-06, + "logits/chosen": -63192994.461538464, + "logits/rejected": -39326164.36363637, + "logps/chosen": -534.2183368389423, + "logps/rejected": -460.103515625, + "loss": 0.1283, + "rewards/chosen": 5.91471686730018, + "rewards/margins": 11.04461389821726, + "rewards/rejected": -5.129897030917081, + "step": 237 + }, + { + "epoch": 0.05955210809458276, + "grad_norm": 17.375, + "kl": 8.008248329162598, + "learning_rate": 5e-06, + "logits/chosen": -105651729.06666666, + "logits/rejected": -33743971.55555555, + "logps/chosen": -534.3837890625, + "logps/rejected": -353.31640625, + "loss": 0.0603, + "rewards/chosen": 4.986274210611979, + "rewards/margins": 11.95004645453559, + "rewards/rejected": -6.963772243923611, + "step": 238 + }, + { + "epoch": 0.059802327036156636, + "grad_norm": 18.0, + "kl": 1.8964078426361084, + "learning_rate": 5e-06, + "logits/chosen": -52408721.45454545, + "logits/rejected": -48703926.15384615, + "logps/chosen": -353.20503373579544, + "logps/rejected": -686.5046574519231, + "loss": 0.1101, + "rewards/chosen": 4.136805447665128, + "rewards/margins": 12.106794343961703, + "rewards/rejected": -7.969988896296575, + "step": 239 + }, + { + "epoch": 0.06005254597773051, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53857590.15384615, + "logits/rejected": -43563741.09090909, + "logps/chosen": -363.1044921875, + "logps/rejected": -474.27756569602275, + "loss": 0.0699, + "rewards/chosen": 4.238471397986779, + "rewards/margins": 10.517301492757731, + "rewards/rejected": -6.278830094770952, + "step": 240 + }, + { + "epoch": 0.06030276491930439, + "grad_norm": 13.0625, + "kl": 0.6801236867904663, + "learning_rate": 5e-06, + "logits/chosen": -34695546.666666664, + "logits/rejected": -48599274.666666664, + "logps/chosen": -383.9803466796875, + "logps/rejected": -568.0260416666666, + "loss": 0.07, + "rewards/chosen": 4.735932032267253, + "rewards/margins": 11.453741073608398, + "rewards/rejected": -6.7178090413411455, + "step": 241 + }, + { + "epoch": 0.06055298386087827, + "grad_norm": 28.75, + "kl": 7.704229354858398, + "learning_rate": 5e-06, + "logits/chosen": -40987808.0, + "logits/rejected": -33765154.28571428, + "logps/chosen": -432.7125, + "logps/rejected": -600.5432477678571, + "loss": 0.1087, + "rewards/chosen": 4.74278564453125, + "rewards/margins": 10.816473606654576, + "rewards/rejected": -6.0736879621233255, + "step": 242 + }, + { + "epoch": 0.06080320280245215, + "grad_norm": 23.125, + "kl": 2.1749091148376465, + "learning_rate": 5e-06, + "logits/chosen": -91359545.6, + "logits/rejected": -62814555.428571425, + "logps/chosen": -501.44365234375, + "logps/rejected": -450.42354910714283, + "loss": 0.0623, + "rewards/chosen": 5.585881042480469, + "rewards/margins": 11.966207994733537, + "rewards/rejected": -6.3803269522530695, + "step": 243 + }, + { + "epoch": 0.061053421744026025, + "grad_norm": 20.25, + "kl": 3.0634572505950928, + "learning_rate": 5e-06, + "logits/chosen": -32003923.692307692, + "logits/rejected": -44226141.09090909, + "logps/chosen": -304.0939190204327, + "logps/rejected": -494.07901278409093, + "loss": 0.1763, + "rewards/chosen": 2.9934842036320615, + "rewards/margins": 8.667183669297012, + "rewards/rejected": -5.67369946566495, + "step": 244 + }, + { + "epoch": 0.0613036406855999, + "grad_norm": 14.75, + "kl": 1.1172847747802734, + "learning_rate": 5e-06, + "logits/chosen": -71501435.73333333, + "logits/rejected": -49895534.222222224, + "logps/chosen": -448.57200520833334, + "logps/rejected": -532.8487413194445, + "loss": 0.0571, + "rewards/chosen": 4.671438598632813, + "rewards/margins": 10.938484361436632, + "rewards/rejected": -6.26704576280382, + "step": 245 + }, + { + "epoch": 0.061553859627173774, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42116583.384615384, + "logits/rejected": -64825611.63636363, + "logps/chosen": -414.7998046875, + "logps/rejected": -464.74116654829544, + "loss": 0.0804, + "rewards/chosen": 5.251841031588041, + "rewards/margins": 11.372012318431082, + "rewards/rejected": -6.12017128684304, + "step": 246 + }, + { + "epoch": 0.06180407856874765, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27848082.0, + "logits/rejected": -49274160.0, + "logps/chosen": -323.40582275390625, + "logps/rejected": -380.6914367675781, + "loss": 0.088, + "rewards/chosen": 3.8589885234832764, + "rewards/margins": 8.341522932052612, + "rewards/rejected": -4.482534408569336, + "step": 247 + }, + { + "epoch": 0.06205429751032153, + "grad_norm": 17.25, + "kl": 0.6061592102050781, + "learning_rate": 5e-06, + "logits/chosen": -72504634.66666667, + "logits/rejected": -68203098.66666667, + "logps/chosen": -358.8920084635417, + "logps/rejected": -500.3172200520833, + "loss": 0.0906, + "rewards/chosen": 3.5224291483561196, + "rewards/margins": 9.297661463419596, + "rewards/rejected": -5.775232315063477, + "step": 248 + }, + { + "epoch": 0.06230451645189541, + "grad_norm": 8.0625, + "kl": 3.0028254985809326, + "learning_rate": 5e-06, + "logits/chosen": -44604178.666666664, + "logits/rejected": -50529397.333333336, + "logps/chosen": -586.1739095052084, + "logps/rejected": -642.0966389973959, + "loss": 0.0191, + "rewards/chosen": 6.448746999104817, + "rewards/margins": 13.13725471496582, + "rewards/rejected": -6.688507715861003, + "step": 249 + }, + { + "epoch": 0.06255473539346929, + "grad_norm": 16.25, + "kl": 4.768838405609131, + "learning_rate": 5e-06, + "logits/chosen": -54296755.2, + "logits/rejected": -50749120.0, + "logps/chosen": -484.103125, + "logps/rejected": -633.3464704241071, + "loss": 0.0487, + "rewards/chosen": 6.53180160522461, + "rewards/margins": 14.83898173740932, + "rewards/rejected": -8.30718013218471, + "step": 250 + }, + { + "epoch": 0.06280495433504316, + "grad_norm": 16.5, + "kl": 0.9286988973617554, + "learning_rate": 5e-06, + "logits/chosen": -46544112.0, + "logits/rejected": -36229072.0, + "logps/chosen": -341.8699544270833, + "logps/rejected": -429.4635416666667, + "loss": 0.0906, + "rewards/chosen": 4.2282514572143555, + "rewards/margins": 10.53867244720459, + "rewards/rejected": -6.310420989990234, + "step": 251 + }, + { + "epoch": 0.06305517327661704, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62439566.222222224, + "logits/rejected": -41197469.86666667, + "logps/chosen": -521.8043619791666, + "logps/rejected": -565.6182291666667, + "loss": 0.0467, + "rewards/chosen": 6.853285471598308, + "rewards/margins": 15.23663813273112, + "rewards/rejected": -8.383352661132813, + "step": 252 + }, + { + "epoch": 0.06330539221819091, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46289870.222222224, + "logits/rejected": -48755741.86666667, + "logps/chosen": -429.5540364583333, + "logps/rejected": -520.4573567708334, + "loss": 0.0305, + "rewards/chosen": 5.6796459621853295, + "rewards/margins": 11.141227383083766, + "rewards/rejected": -5.461581420898438, + "step": 253 + }, + { + "epoch": 0.0635556111597648, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63038912.0, + "logits/rejected": -41706232.88888889, + "logps/chosen": -318.92742919921875, + "logps/rejected": -603.8517795138889, + "loss": 0.0363, + "rewards/chosen": 3.1966425577799478, + "rewards/margins": 10.984870062934027, + "rewards/rejected": -7.7882275051540795, + "step": 254 + }, + { + "epoch": 0.06380583010133867, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47767109.333333336, + "logits/rejected": -42213802.666666664, + "logps/chosen": -420.8616536458333, + "logps/rejected": -464.4077962239583, + "loss": 0.0585, + "rewards/chosen": 5.024503707885742, + "rewards/margins": 10.489451726277668, + "rewards/rejected": -5.464948018391927, + "step": 255 + }, + { + "epoch": 0.06405604904291255, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69395494.4, + "logits/rejected": -39749270.85714286, + "logps/chosen": -396.38232421875, + "logps/rejected": -442.34340122767856, + "loss": 0.0836, + "rewards/chosen": 4.396539306640625, + "rewards/margins": 9.73754381452288, + "rewards/rejected": -5.341004507882254, + "step": 256 + }, + { + "epoch": 0.06430626798448642, + "grad_norm": 16.5, + "kl": 1.3632354736328125, + "learning_rate": 5e-06, + "logits/chosen": -46325056.0, + "logits/rejected": -39119104.0, + "logps/chosen": -459.4122869318182, + "logps/rejected": -449.71987680288464, + "loss": 0.0746, + "rewards/chosen": 6.272554570978338, + "rewards/margins": 12.569833928888494, + "rewards/rejected": -6.297279357910156, + "step": 257 + }, + { + "epoch": 0.06455648692606031, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45575133.09090909, + "logits/rejected": -57118119.384615384, + "logps/chosen": -478.939453125, + "logps/rejected": -490.60069861778845, + "loss": 0.0495, + "rewards/chosen": 4.3977227644486865, + "rewards/margins": 12.05151786003913, + "rewards/rejected": -7.653795095590445, + "step": 258 + }, + { + "epoch": 0.06480670586763418, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52410422.4, + "logits/rejected": -30478674.285714287, + "logps/chosen": -292.6397705078125, + "logps/rejected": -452.7027064732143, + "loss": 0.0918, + "rewards/chosen": 3.266457366943359, + "rewards/margins": 10.335402025495256, + "rewards/rejected": -7.068944658551898, + "step": 259 + }, + { + "epoch": 0.06505692480920806, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53376630.15384615, + "logits/rejected": -24394141.09090909, + "logps/chosen": -463.49906099759613, + "logps/rejected": -558.4932972301136, + "loss": 0.067, + "rewards/chosen": 5.315664438100962, + "rewards/margins": 13.556491131549116, + "rewards/rejected": -8.240826693448154, + "step": 260 + }, + { + "epoch": 0.06530714375078193, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65685779.2, + "logits/rejected": -59654674.28571428, + "logps/chosen": -493.353076171875, + "logps/rejected": -534.4528459821429, + "loss": 0.0196, + "rewards/chosen": 4.4684593200683596, + "rewards/margins": 12.256799752371652, + "rewards/rejected": -7.7883404323032925, + "step": 261 + }, + { + "epoch": 0.0655573626923558, + "grad_norm": 11.375, + "kl": 0.8784777522087097, + "learning_rate": 5e-06, + "logits/chosen": -87586154.66666667, + "logits/rejected": -65527802.666666664, + "logps/chosen": -483.6160481770833, + "logps/rejected": -566.429931640625, + "loss": 0.059, + "rewards/chosen": 5.209161122639974, + "rewards/margins": 12.132217407226562, + "rewards/rejected": -6.923056284586589, + "step": 262 + }, + { + "epoch": 0.06580758163392969, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70752604.44444445, + "logits/rejected": 51250338.13333333, + "logps/chosen": -425.29399956597223, + "logps/rejected": -520.7492838541667, + "loss": 0.0508, + "rewards/chosen": 3.386476516723633, + "rewards/margins": 11.8880552927653, + "rewards/rejected": -8.501578776041667, + "step": 263 + }, + { + "epoch": 0.06605780057550356, + "grad_norm": 25.75, + "kl": 5.149889945983887, + "learning_rate": 5e-06, + "logits/chosen": -48233130.666666664, + "logits/rejected": -43924462.222222224, + "logps/chosen": -552.0205729166667, + "logps/rejected": -537.4425998263889, + "loss": 0.0788, + "rewards/chosen": 5.195638020833333, + "rewards/margins": 11.78333960639106, + "rewards/rejected": -6.587701585557726, + "step": 264 + }, + { + "epoch": 0.06630801951707745, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61328758.85714286, + "logits/rejected": -56918505.4117647, + "logps/chosen": -479.367431640625, + "logps/rejected": -608.0940372242648, + "loss": 0.0618, + "rewards/chosen": 3.7964319501604353, + "rewards/margins": 12.093412864108046, + "rewards/rejected": -8.29698091394761, + "step": 265 + }, + { + "epoch": 0.06655823845865132, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55310618.666666664, + "logits/rejected": -21998644.0, + "logps/chosen": -439.9580078125, + "logps/rejected": -547.12060546875, + "loss": 0.0748, + "rewards/chosen": 5.038155873616536, + "rewards/margins": 12.72253672281901, + "rewards/rejected": -7.684380849202474, + "step": 266 + }, + { + "epoch": 0.0668084574002252, + "grad_norm": 15.5, + "kl": 5.1014509201049805, + "learning_rate": 5e-06, + "logits/chosen": -75437512.0, + "logits/rejected": -80797840.0, + "logps/chosen": -525.6380004882812, + "logps/rejected": -414.5377197265625, + "loss": 0.029, + "rewards/chosen": 7.350945472717285, + "rewards/margins": 13.114680767059326, + "rewards/rejected": -5.763735294342041, + "step": 267 + }, + { + "epoch": 0.06705867634179907, + "grad_norm": 17.25, + "kl": 1.123401403427124, + "learning_rate": 5e-06, + "logits/chosen": -58302169.6, + "logits/rejected": -46181952.0, + "logps/chosen": -475.768017578125, + "logps/rejected": -429.35302734375, + "loss": 0.0977, + "rewards/chosen": 4.8388420104980465, + "rewards/margins": 11.183382197788784, + "rewards/rejected": -6.344540187290737, + "step": 268 + }, + { + "epoch": 0.06730889528337296, + "grad_norm": 20.75, + "kl": 7.065022945404053, + "learning_rate": 5e-06, + "logits/chosen": -77816576.0, + "logits/rejected": -52881866.666666664, + "logps/chosen": -483.0377604166667, + "logps/rejected": -646.4273681640625, + "loss": 0.0953, + "rewards/chosen": 4.503868103027344, + "rewards/margins": 16.014129638671875, + "rewards/rejected": -11.510261535644531, + "step": 269 + }, + { + "epoch": 0.06755911422494683, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41498552.88888889, + "logits/rejected": -60701781.333333336, + "logps/chosen": -437.0484212239583, + "logps/rejected": -565.5378255208333, + "loss": 0.0493, + "rewards/chosen": 4.081297556559245, + "rewards/margins": 13.958947499593098, + "rewards/rejected": -9.877649943033854, + "step": 270 + }, + { + "epoch": 0.0678093331665207, + "grad_norm": 15.5625, + "kl": 0.29158782958984375, + "learning_rate": 5e-06, + "logits/chosen": -78919831.27272727, + "logits/rejected": -25148347.076923076, + "logps/chosen": -400.4002574573864, + "logps/rejected": -301.63927283653845, + "loss": 0.0679, + "rewards/chosen": 5.59576381336559, + "rewards/margins": 11.211588586127007, + "rewards/rejected": -5.615824772761418, + "step": 271 + }, + { + "epoch": 0.06805955210809458, + "grad_norm": 18.25, + "kl": 7.191731929779053, + "learning_rate": 5e-06, + "logits/chosen": -52007271.384615384, + "logits/rejected": -45008203.63636363, + "logps/chosen": -379.2160456730769, + "logps/rejected": -450.93212890625, + "loss": 0.0883, + "rewards/chosen": 4.103186387282151, + "rewards/margins": 12.224304412628388, + "rewards/rejected": -8.121118025346236, + "step": 272 + }, + { + "epoch": 0.06830977104966846, + "grad_norm": 22.0, + "kl": 5.124803066253662, + "learning_rate": 5e-06, + "logits/chosen": -78945928.53333333, + "logits/rejected": -29661472.0, + "logps/chosen": -470.85325520833334, + "logps/rejected": -476.0070529513889, + "loss": 0.085, + "rewards/chosen": 5.226908365885417, + "rewards/margins": 10.916728973388672, + "rewards/rejected": -5.689820607503255, + "step": 273 + }, + { + "epoch": 0.06855998999124234, + "grad_norm": 22.375, + "kl": 7.02076530456543, + "learning_rate": 5e-06, + "logits/chosen": -58399515.428571425, + "logits/rejected": -61953196.8, + "logps/chosen": -535.1898018973214, + "logps/rejected": -317.746240234375, + "loss": 0.0809, + "rewards/chosen": 6.339141845703125, + "rewards/margins": 9.71686019897461, + "rewards/rejected": -3.3777183532714843, + "step": 274 + }, + { + "epoch": 0.06881020893281621, + "grad_norm": 10.4375, + "kl": 6.946084022521973, + "learning_rate": 5e-06, + "logits/chosen": -53368098.461538464, + "logits/rejected": -55329297.45454545, + "logps/chosen": -428.52201021634613, + "logps/rejected": -597.0437233664773, + "loss": 0.047, + "rewards/chosen": 4.962626530573918, + "rewards/margins": 11.407653968650978, + "rewards/rejected": -6.44502743807706, + "step": 275 + }, + { + "epoch": 0.0690604278743901, + "grad_norm": 21.75, + "kl": 5.97451114654541, + "learning_rate": 5e-06, + "logits/chosen": -81498112.0, + "logits/rejected": 81226272.0, + "logps/chosen": -430.68505859375, + "logps/rejected": -447.3458984375, + "loss": 0.0851, + "rewards/chosen": 4.633864266531808, + "rewards/margins": 9.713998086111886, + "rewards/rejected": -5.0801338195800785, + "step": 276 + }, + { + "epoch": 0.06931064681596397, + "grad_norm": 12.25, + "kl": 0.1952921599149704, + "learning_rate": 5e-06, + "logits/chosen": -78502592.0, + "logits/rejected": -31381964.0, + "logps/chosen": -487.24859619140625, + "logps/rejected": -431.78997802734375, + "loss": 0.0301, + "rewards/chosen": 5.563851833343506, + "rewards/margins": 10.804934978485107, + "rewards/rejected": -5.241083145141602, + "step": 277 + }, + { + "epoch": 0.06956086575753785, + "grad_norm": 10.75, + "kl": 0.5088316798210144, + "learning_rate": 5e-06, + "logits/chosen": -30100498.666666668, + "logits/rejected": -39578920.0, + "logps/chosen": -342.0590006510417, + "logps/rejected": -634.4694417317709, + "loss": 0.0814, + "rewards/chosen": 5.2512868245442705, + "rewards/margins": 13.84987513224284, + "rewards/rejected": -8.598588307698568, + "step": 278 + }, + { + "epoch": 0.06981108469911172, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61760720.0, + "logits/rejected": -42346205.333333336, + "logps/chosen": -427.8580729166667, + "logps/rejected": -348.9947916666667, + "loss": 0.0705, + "rewards/chosen": 4.600465456644694, + "rewards/margins": 9.550864537556967, + "rewards/rejected": -4.950399080912272, + "step": 279 + }, + { + "epoch": 0.0700613036406856, + "grad_norm": 7.6875, + "kl": 4.760970115661621, + "learning_rate": 5e-06, + "logits/chosen": -41277586.28571428, + "logits/rejected": -52429590.4, + "logps/chosen": -442.60525948660717, + "logps/rejected": -449.628955078125, + "loss": 0.0498, + "rewards/chosen": 5.872192927769253, + "rewards/margins": 11.140544673374722, + "rewards/rejected": -5.268351745605469, + "step": 280 + }, + { + "epoch": 0.07031152258225948, + "grad_norm": 8.875, + "kl": 1.9243710041046143, + "learning_rate": 5e-06, + "logits/chosen": -45989888.0, + "logits/rejected": -58634885.333333336, + "logps/chosen": -356.4975992838542, + "logps/rejected": -542.73876953125, + "loss": 0.037, + "rewards/chosen": 4.887360254923503, + "rewards/margins": 11.935904184977215, + "rewards/rejected": -7.048543930053711, + "step": 281 + }, + { + "epoch": 0.07056174152383335, + "grad_norm": 10.9375, + "kl": 11.589720726013184, + "learning_rate": 5e-06, + "logits/chosen": -59778560.0, + "logits/rejected": -66434538.666666664, + "logps/chosen": -458.0664388020833, + "logps/rejected": -428.1740993923611, + "loss": 0.1556, + "rewards/chosen": 5.7575327555338545, + "rewards/margins": 10.448915269639757, + "rewards/rejected": -4.691382514105903, + "step": 282 + }, + { + "epoch": 0.07081196046540723, + "grad_norm": 16.25, + "kl": 10.859646797180176, + "learning_rate": 5e-06, + "logits/chosen": -50777248.0, + "logits/rejected": -43125244.0, + "logps/chosen": -399.5986633300781, + "logps/rejected": -522.7695922851562, + "loss": 0.0645, + "rewards/chosen": 5.321485996246338, + "rewards/margins": 12.317273139953613, + "rewards/rejected": -6.995787143707275, + "step": 283 + }, + { + "epoch": 0.0710621794069811, + "grad_norm": 7.90625, + "kl": 1.132131814956665, + "learning_rate": 5e-06, + "logits/chosen": -55504290.90909091, + "logits/rejected": -39513597.538461536, + "logps/chosen": -495.51247336647725, + "logps/rejected": -362.1698467548077, + "loss": 0.032, + "rewards/chosen": 6.6378936767578125, + "rewards/margins": 10.8924319927509, + "rewards/rejected": -4.254538315993089, + "step": 284 + }, + { + "epoch": 0.07131239834855499, + "grad_norm": 16.875, + "kl": 11.234561920166016, + "learning_rate": 5e-06, + "logits/chosen": -71418130.28571428, + "logits/rejected": -41369632.0, + "logps/chosen": -428.03271484375, + "logps/rejected": -731.75654296875, + "loss": 0.0669, + "rewards/chosen": 6.893448965890067, + "rewards/margins": 15.258488028390065, + "rewards/rejected": -8.3650390625, + "step": 285 + }, + { + "epoch": 0.07156261729012886, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54508409.6, + "logits/rejected": -55452480.0, + "logps/chosen": -340.187939453125, + "logps/rejected": -539.3630022321429, + "loss": 0.079, + "rewards/chosen": 5.844940948486328, + "rewards/margins": 13.450724138532365, + "rewards/rejected": -7.605783190046038, + "step": 286 + }, + { + "epoch": 0.07181283623170275, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57286966.85714286, + "logits/rejected": -38474522.35294118, + "logps/chosen": -254.78475516183036, + "logps/rejected": -456.94514016544116, + "loss": 0.1102, + "rewards/chosen": 3.0357987540108815, + "rewards/margins": 9.18617187628225, + "rewards/rejected": -6.150373122271369, + "step": 287 + }, + { + "epoch": 0.07206305517327662, + "grad_norm": 15.5625, + "kl": 2.9759058952331543, + "learning_rate": 5e-06, + "logits/chosen": -92077286.4, + "logits/rejected": -37994189.71428572, + "logps/chosen": -515.35732421875, + "logps/rejected": -623.5980747767857, + "loss": 0.0347, + "rewards/chosen": 7.753670501708984, + "rewards/margins": 13.778749411446707, + "rewards/rejected": -6.025078909737723, + "step": 288 + }, + { + "epoch": 0.0723132741148505, + "grad_norm": 7.09375, + "kl": 5.745540618896484, + "learning_rate": 5e-06, + "logits/chosen": -73232245.33333333, + "logits/rejected": -49373632.0, + "logps/chosen": -429.4737548828125, + "logps/rejected": -579.3299153645834, + "loss": 0.0339, + "rewards/chosen": 5.881547292073567, + "rewards/margins": 12.978575388590494, + "rewards/rejected": -7.097028096516927, + "step": 289 + }, + { + "epoch": 0.07256349305642437, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24547368.727272727, + "logits/rejected": -39950163.692307696, + "logps/chosen": -547.5836736505681, + "logps/rejected": -678.9582331730769, + "loss": 0.0331, + "rewards/chosen": 5.565066944469105, + "rewards/margins": 13.200354889556245, + "rewards/rejected": -7.635287945087139, + "step": 290 + }, + { + "epoch": 0.07281371199799824, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64983193.6, + "logits/rejected": -55468856.88888889, + "logps/chosen": -355.5180989583333, + "logps/rejected": -449.8848470052083, + "loss": 0.0891, + "rewards/chosen": 4.4547876993815105, + "rewards/margins": 10.654435390896268, + "rewards/rejected": -6.199647691514757, + "step": 291 + }, + { + "epoch": 0.07306393093957213, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69864550.4, + "logits/rejected": -37210121.14285714, + "logps/chosen": -418.81591796875, + "logps/rejected": -421.0170200892857, + "loss": 0.0631, + "rewards/chosen": 5.072563171386719, + "rewards/margins": 12.084261757986887, + "rewards/rejected": -7.0116985866001675, + "step": 292 + }, + { + "epoch": 0.073314149881146, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54616715.63636363, + "logits/rejected": -51849550.76923077, + "logps/chosen": -451.50221946022725, + "logps/rejected": -551.5064978966346, + "loss": 0.0619, + "rewards/chosen": 4.917368108575994, + "rewards/margins": 12.405217417470226, + "rewards/rejected": -7.487849308894231, + "step": 293 + }, + { + "epoch": 0.07356436882271988, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50288458.666666664, + "logits/rejected": -57870400.0, + "logps/chosen": -452.7593994140625, + "logps/rejected": -536.1645100911459, + "loss": 0.0448, + "rewards/chosen": 5.461680094401042, + "rewards/margins": 13.866125106811523, + "rewards/rejected": -8.404445012410482, + "step": 294 + }, + { + "epoch": 0.07381458776429375, + "grad_norm": 14.1875, + "kl": 7.171340465545654, + "learning_rate": 5e-06, + "logits/chosen": -54457760.0, + "logits/rejected": -68416877.71428572, + "logps/chosen": -398.9943115234375, + "logps/rejected": -708.7611607142857, + "loss": 0.1173, + "rewards/chosen": 4.315273666381836, + "rewards/margins": 13.175844301496234, + "rewards/rejected": -8.860570635114398, + "step": 295 + }, + { + "epoch": 0.07406480670586764, + "grad_norm": 20.75, + "kl": 4.5795793533325195, + "learning_rate": 5e-06, + "logits/chosen": -75763680.0, + "logits/rejected": -44389234.28571428, + "logps/chosen": -454.39814453125, + "logps/rejected": -446.1196986607143, + "loss": 0.0631, + "rewards/chosen": 4.975424957275391, + "rewards/margins": 12.284866659981864, + "rewards/rejected": -7.309441702706473, + "step": 296 + }, + { + "epoch": 0.07431502564744151, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38831180.307692304, + "logits/rejected": -55790394.18181818, + "logps/chosen": -338.1838566706731, + "logps/rejected": -588.3064630681819, + "loss": 0.1255, + "rewards/chosen": 4.476335672231821, + "rewards/margins": 12.39231731174709, + "rewards/rejected": -7.91598163951527, + "step": 297 + }, + { + "epoch": 0.0745652445890154, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55339756.8, + "logits/rejected": -62088137.14285714, + "logps/chosen": -392.377001953125, + "logps/rejected": -520.7548130580357, + "loss": 0.0632, + "rewards/chosen": 4.347047805786133, + "rewards/margins": 13.66012328011649, + "rewards/rejected": -9.313075474330358, + "step": 298 + }, + { + "epoch": 0.07481546353058927, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55607074.90909091, + "logits/rejected": -68183310.76923077, + "logps/chosen": -368.6839488636364, + "logps/rejected": -563.94970703125, + "loss": 0.0925, + "rewards/chosen": 3.4984796697443183, + "rewards/margins": 11.65328008478338, + "rewards/rejected": -8.154800415039062, + "step": 299 + }, + { + "epoch": 0.07506568247216314, + "grad_norm": 12.0, + "kl": 7.9532246589660645, + "learning_rate": 5e-06, + "logits/chosen": -48124446.11764706, + "logits/rejected": -48984662.85714286, + "logps/chosen": -462.72409237132354, + "logps/rejected": -419.2373046875, + "loss": 0.072, + "rewards/chosen": 6.3196240593405335, + "rewards/margins": 12.44195976577887, + "rewards/rejected": -6.122335706438337, + "step": 300 + }, + { + "epoch": 0.07531590141373702, + "grad_norm": 14.0, + "kl": 5.24700403213501, + "learning_rate": 5e-06, + "logits/chosen": -68653216.0, + "logits/rejected": -35296780.8, + "logps/chosen": -466.408203125, + "logps/rejected": -407.138232421875, + "loss": 0.1249, + "rewards/chosen": 4.35285895211356, + "rewards/margins": 10.530777958461215, + "rewards/rejected": -6.177919006347656, + "step": 301 + }, + { + "epoch": 0.07556612035531089, + "grad_norm": 16.625, + "kl": 3.634018659591675, + "learning_rate": 5e-06, + "logits/chosen": -50885474.13333333, + "logits/rejected": -56333162.666666664, + "logps/chosen": -377.9595052083333, + "logps/rejected": -554.6233723958334, + "loss": 0.1282, + "rewards/chosen": 4.844576009114584, + "rewards/margins": 12.83094991048177, + "rewards/rejected": -7.9863739013671875, + "step": 302 + }, + { + "epoch": 0.07581633929688478, + "grad_norm": 5.3125, + "kl": 0.5038427114486694, + "learning_rate": 5e-06, + "logits/chosen": -16485792.0, + "logits/rejected": -45833536.0, + "logps/chosen": -608.5945638020834, + "logps/rejected": -612.8363850911459, + "loss": 0.0134, + "rewards/chosen": 5.650701522827148, + "rewards/margins": 13.277856826782227, + "rewards/rejected": -7.627155303955078, + "step": 303 + }, + { + "epoch": 0.07606655823845865, + "grad_norm": 217.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71577656.8888889, + "logits/rejected": 1446080.0, + "logps/chosen": -392.09239366319446, + "logps/rejected": -441.5680338541667, + "loss": 0.0545, + "rewards/chosen": 4.2894774542914496, + "rewards/margins": 10.09934582180447, + "rewards/rejected": -5.809868367513021, + "step": 304 + }, + { + "epoch": 0.07631677718003253, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45708024.88888889, + "logits/rejected": -48254118.4, + "logps/chosen": -469.90489366319446, + "logps/rejected": -553.3817057291667, + "loss": 0.0403, + "rewards/chosen": 5.711358812120226, + "rewards/margins": 15.100974697536893, + "rewards/rejected": -9.389615885416667, + "step": 305 + }, + { + "epoch": 0.0765669961216064, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65677869.71428572, + "logits/rejected": -37207715.2, + "logps/chosen": -538.8613630022321, + "logps/rejected": -673.329736328125, + "loss": 0.0181, + "rewards/chosen": 6.0059612819126675, + "rewards/margins": 17.722608620779855, + "rewards/rejected": -11.716647338867187, + "step": 306 + }, + { + "epoch": 0.07681721506318029, + "grad_norm": 13.5, + "kl": 3.474886894226074, + "learning_rate": 5e-06, + "logits/chosen": -57763948.0, + "logits/rejected": -35655440.0, + "logps/chosen": -380.42388916015625, + "logps/rejected": -565.8240966796875, + "loss": 0.0494, + "rewards/chosen": 5.190339088439941, + "rewards/margins": 12.820594310760498, + "rewards/rejected": -7.630255222320557, + "step": 307 + }, + { + "epoch": 0.07706743400475416, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50644028.8, + "logits/rejected": -64086038.85714286, + "logps/chosen": -335.427197265625, + "logps/rejected": -471.4610072544643, + "loss": 0.1226, + "rewards/chosen": 4.116854476928711, + "rewards/margins": 10.397411291939871, + "rewards/rejected": -6.280556815011161, + "step": 308 + }, + { + "epoch": 0.07731765294632803, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40328700.44444445, + "logits/rejected": -66934630.4, + "logps/chosen": -318.47987196180554, + "logps/rejected": -568.573828125, + "loss": 0.0707, + "rewards/chosen": 3.737506866455078, + "rewards/margins": 11.43120091756185, + "rewards/rejected": -7.693694051106771, + "step": 309 + }, + { + "epoch": 0.07756787188790192, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -78686771.2, + "logits/rejected": -47525301.333333336, + "logps/chosen": -442.12867838541666, + "logps/rejected": -514.9216579861111, + "loss": 0.0935, + "rewards/chosen": 4.877289326985677, + "rewards/margins": 13.47551778157552, + "rewards/rejected": -8.598228454589844, + "step": 310 + }, + { + "epoch": 0.07781809082947579, + "grad_norm": 20.75, + "kl": 0.8121821284294128, + "learning_rate": 5e-06, + "logits/chosen": -44828251.428571425, + "logits/rejected": -53163142.4, + "logps/chosen": -362.40244838169644, + "logps/rejected": -622.3521484375, + "loss": 0.084, + "rewards/chosen": 3.6696810041155135, + "rewards/margins": 11.712674931117467, + "rewards/rejected": -8.042993927001953, + "step": 311 + }, + { + "epoch": 0.07806830977104967, + "grad_norm": 27.5, + "kl": 7.953427791595459, + "learning_rate": 5e-06, + "logits/chosen": -48007264.0, + "logits/rejected": -50813360.0, + "logps/chosen": -312.81829833984375, + "logps/rejected": -782.123779296875, + "loss": 0.194, + "rewards/chosen": 2.631920337677002, + "rewards/margins": 13.814859867095947, + "rewards/rejected": -11.182939529418945, + "step": 312 + }, + { + "epoch": 0.07831852871262354, + "grad_norm": 18.375, + "kl": 4.491418838500977, + "learning_rate": 5e-06, + "logits/chosen": -59036037.333333336, + "logits/rejected": -62983744.0, + "logps/chosen": -405.5749104817708, + "logps/rejected": -400.713134765625, + "loss": 0.0604, + "rewards/chosen": 5.748222351074219, + "rewards/margins": 12.755852381388348, + "rewards/rejected": -7.007630030314128, + "step": 313 + }, + { + "epoch": 0.07856874765419743, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49776497.23076923, + "logits/rejected": -39559080.72727273, + "logps/chosen": -410.9743088942308, + "logps/rejected": -649.0001775568181, + "loss": 0.0764, + "rewards/chosen": 3.891269096961388, + "rewards/margins": 12.504322852288093, + "rewards/rejected": -8.613053755326705, + "step": 314 + }, + { + "epoch": 0.0788189665957713, + "grad_norm": 13.5625, + "kl": 2.1953799724578857, + "learning_rate": 5e-06, + "logits/chosen": -22781229.714285713, + "logits/rejected": -51122547.2, + "logps/chosen": -473.47377232142856, + "logps/rejected": -650.680078125, + "loss": 0.0604, + "rewards/chosen": 3.87014525277274, + "rewards/margins": 11.899960163661412, + "rewards/rejected": -8.029814910888671, + "step": 315 + }, + { + "epoch": 0.07906918553734518, + "grad_norm": 15.8125, + "kl": 9.544827461242676, + "learning_rate": 5e-06, + "logits/chosen": -48614872.0, + "logits/rejected": -43650948.0, + "logps/chosen": -453.4354553222656, + "logps/rejected": -512.2210693359375, + "loss": 0.0898, + "rewards/chosen": 5.919229984283447, + "rewards/margins": 13.66039228439331, + "rewards/rejected": -7.741162300109863, + "step": 316 + }, + { + "epoch": 0.07931940447891905, + "grad_norm": 10.25, + "kl": 4.059052467346191, + "learning_rate": 5e-06, + "logits/chosen": -39676822.15384615, + "logits/rejected": -43106210.90909091, + "logps/chosen": -295.5746882512019, + "logps/rejected": -311.35336026278407, + "loss": 0.0869, + "rewards/chosen": 4.078121478740986, + "rewards/margins": 8.50689033028129, + "rewards/rejected": -4.428768851540306, + "step": 317 + }, + { + "epoch": 0.07956962342049294, + "grad_norm": 9.3125, + "kl": 5.8941545486450195, + "learning_rate": 5e-06, + "logits/chosen": -64744792.615384616, + "logits/rejected": -51566289.45454545, + "logps/chosen": -515.5871018629807, + "logps/rejected": -568.5423473011364, + "loss": 0.0258, + "rewards/chosen": 5.717002281775842, + "rewards/margins": 16.027895253855032, + "rewards/rejected": -10.31089297207919, + "step": 318 + }, + { + "epoch": 0.07981984236206681, + "grad_norm": 16.875, + "kl": 2.7760798931121826, + "learning_rate": 5e-06, + "logits/chosen": -71659475.2, + "logits/rejected": -19278834.285714287, + "logps/chosen": -399.245458984375, + "logps/rejected": -608.02392578125, + "loss": 0.1061, + "rewards/chosen": 4.554701614379883, + "rewards/margins": 10.876112747192384, + "rewards/rejected": -6.3214111328125, + "step": 319 + }, + { + "epoch": 0.08007006130364068, + "grad_norm": 21.5, + "kl": 3.1826140880584717, + "learning_rate": 5e-06, + "logits/chosen": -47259182.54545455, + "logits/rejected": -36462040.615384616, + "logps/chosen": -488.42813387784093, + "logps/rejected": -390.6603440504808, + "loss": 0.0642, + "rewards/chosen": 6.43964316628196, + "rewards/margins": 11.169853984059152, + "rewards/rejected": -4.730210817777193, + "step": 320 + }, + { + "epoch": 0.08032028024521456, + "grad_norm": 4.78125, + "kl": 1.7273375988006592, + "learning_rate": 5e-06, + "logits/chosen": -66869568.0, + "logits/rejected": -85939221.33333333, + "logps/chosen": -534.5769856770834, + "logps/rejected": -714.4563802083334, + "loss": 0.0283, + "rewards/chosen": 6.996565500895183, + "rewards/margins": 15.929007212320965, + "rewards/rejected": -8.932441711425781, + "step": 321 + }, + { + "epoch": 0.08057049918678844, + "grad_norm": 20.0, + "kl": 2.56874418258667, + "learning_rate": 5e-06, + "logits/chosen": -57239475.2, + "logits/rejected": -56795510.85714286, + "logps/chosen": -335.910400390625, + "logps/rejected": -404.29600306919644, + "loss": 0.1252, + "rewards/chosen": 4.564379119873047, + "rewards/margins": 8.60868355887277, + "rewards/rejected": -4.044304438999721, + "step": 322 + }, + { + "epoch": 0.08082071812836232, + "grad_norm": 5.03125, + "kl": 0.07329623401165009, + "learning_rate": 5e-06, + "logits/chosen": -69165893.81818181, + "logits/rejected": -1601237.5384615385, + "logps/chosen": -447.9138849431818, + "logps/rejected": -450.7696063701923, + "loss": 0.0382, + "rewards/chosen": 6.760638150301847, + "rewards/margins": 13.934056635503168, + "rewards/rejected": -7.1734184852013225, + "step": 323 + }, + { + "epoch": 0.08107093706993619, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39580979.2, + "logits/rejected": -68372598.85714285, + "logps/chosen": -313.34326171875, + "logps/rejected": -575.8517020089286, + "loss": 0.0425, + "rewards/chosen": 6.435821533203125, + "rewards/margins": 13.611937386648997, + "rewards/rejected": -7.176115853445871, + "step": 324 + }, + { + "epoch": 0.08132115601151008, + "grad_norm": 20.125, + "kl": 0.8204015493392944, + "learning_rate": 5e-06, + "logits/chosen": -65441307.428571425, + "logits/rejected": -57284736.0, + "logps/chosen": -425.15806361607144, + "logps/rejected": -590.3502987132352, + "loss": 0.0802, + "rewards/chosen": 6.303914751325335, + "rewards/margins": 11.97205852861164, + "rewards/rejected": -5.668143777286305, + "step": 325 + }, + { + "epoch": 0.08157137495308395, + "grad_norm": 19.0, + "kl": 10.519261360168457, + "learning_rate": 5e-06, + "logits/chosen": -80258446.76923077, + "logits/rejected": -51475234.90909091, + "logps/chosen": -530.1903545673077, + "logps/rejected": -660.7665571732955, + "loss": 0.0618, + "rewards/chosen": 7.094483595628005, + "rewards/margins": 13.829765106414582, + "rewards/rejected": -6.735281510786577, + "step": 326 + }, + { + "epoch": 0.08182159389465783, + "grad_norm": 10.9375, + "kl": 2.997100830078125, + "learning_rate": 5e-06, + "logits/chosen": -68122611.2, + "logits/rejected": -37164114.28571428, + "logps/chosen": -521.449755859375, + "logps/rejected": -398.49173409598217, + "loss": 0.0351, + "rewards/chosen": 6.441287231445313, + "rewards/margins": 12.19258804321289, + "rewards/rejected": -5.751300811767578, + "step": 327 + }, + { + "epoch": 0.0820718128362317, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28296394.666666668, + "logits/rejected": -53455300.266666666, + "logps/chosen": -365.97100151909723, + "logps/rejected": -708.0231119791666, + "loss": 0.0643, + "rewards/chosen": 4.3345459832085504, + "rewards/margins": 15.565710533989801, + "rewards/rejected": -11.23116455078125, + "step": 328 + }, + { + "epoch": 0.08232203177780557, + "grad_norm": 25.0, + "kl": 6.122256755828857, + "learning_rate": 5e-06, + "logits/chosen": -59985792.0, + "logits/rejected": -43732233.14285714, + "logps/chosen": -395.7806181066176, + "logps/rejected": -368.55897739955356, + "loss": 0.1264, + "rewards/chosen": 5.007666195140166, + "rewards/margins": 11.517860701104173, + "rewards/rejected": -6.510194505964007, + "step": 329 + }, + { + "epoch": 0.08257225071937946, + "grad_norm": 19.25, + "kl": 8.461600303649902, + "learning_rate": 5e-06, + "logits/chosen": -42237380.92307692, + "logits/rejected": -49793361.45454545, + "logps/chosen": -479.0254657451923, + "logps/rejected": -346.64839311079544, + "loss": 0.0445, + "rewards/chosen": 6.577327434833233, + "rewards/margins": 12.408399461866258, + "rewards/rejected": -5.831072027033025, + "step": 330 + }, + { + "epoch": 0.08282246966095333, + "grad_norm": 18.25, + "kl": 1.3572839498519897, + "learning_rate": 5e-06, + "logits/chosen": -79799195.42857143, + "logits/rejected": -47506073.6, + "logps/chosen": -401.2435825892857, + "logps/rejected": -499.124609375, + "loss": 0.057, + "rewards/chosen": 5.039963858468192, + "rewards/margins": 14.429381125313895, + "rewards/rejected": -9.389417266845703, + "step": 331 + }, + { + "epoch": 0.08307268860252721, + "grad_norm": 15.625, + "kl": 3.3528761863708496, + "learning_rate": 5e-06, + "logits/chosen": -91620602.18181819, + "logits/rejected": -45963298.461538464, + "logps/chosen": -401.23073508522725, + "logps/rejected": -722.0138221153846, + "loss": 0.0619, + "rewards/chosen": 3.542086514559659, + "rewards/margins": 12.321062368112845, + "rewards/rejected": -8.778975853553185, + "step": 332 + }, + { + "epoch": 0.08332290754410109, + "grad_norm": 19.25, + "kl": 2.2864317893981934, + "learning_rate": 5e-06, + "logits/chosen": -78876793.6, + "logits/rejected": -32254505.14285714, + "logps/chosen": -462.73212890625, + "logps/rejected": -509.91015625, + "loss": 0.078, + "rewards/chosen": 4.534415817260742, + "rewards/margins": 13.48143185206822, + "rewards/rejected": -8.947016034807477, + "step": 333 + }, + { + "epoch": 0.08357312648567497, + "grad_norm": 23.5, + "kl": 15.974513053894043, + "learning_rate": 5e-06, + "logits/chosen": -58769088.0, + "logits/rejected": -43553034.666666664, + "logps/chosen": -456.30121527777777, + "logps/rejected": -764.964599609375, + "loss": 0.1248, + "rewards/chosen": 5.548309326171875, + "rewards/margins": 13.340513229370117, + "rewards/rejected": -7.792203903198242, + "step": 334 + }, + { + "epoch": 0.08382334542724884, + "grad_norm": 31.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57135756.8, + "logits/rejected": -54594267.428571425, + "logps/chosen": -427.58017578125, + "logps/rejected": -613.2606026785714, + "loss": 0.1121, + "rewards/chosen": 4.363252639770508, + "rewards/margins": 11.321872438703265, + "rewards/rejected": -6.958619798932757, + "step": 335 + }, + { + "epoch": 0.08407356436882273, + "grad_norm": 18.5, + "kl": 3.6958415508270264, + "learning_rate": 5e-06, + "logits/chosen": -29753862.4, + "logits/rejected": -55726363.428571425, + "logps/chosen": -332.8385009765625, + "logps/rejected": -564.3188127790179, + "loss": 0.0776, + "rewards/chosen": 3.554238128662109, + "rewards/margins": 12.512367466517858, + "rewards/rejected": -8.958129337855748, + "step": 336 + }, + { + "epoch": 0.0843237833103966, + "grad_norm": 14.75, + "kl": 1.9886001348495483, + "learning_rate": 5e-06, + "logits/chosen": -71776645.33333333, + "logits/rejected": -33673770.666666664, + "logps/chosen": -468.2509765625, + "logps/rejected": -374.058349609375, + "loss": 0.0554, + "rewards/chosen": 5.30703608194987, + "rewards/margins": 11.932327906290691, + "rewards/rejected": -6.62529182434082, + "step": 337 + }, + { + "epoch": 0.08457400225197047, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46806941.09090909, + "logits/rejected": -59882958.76923077, + "logps/chosen": -452.71950461647725, + "logps/rejected": -889.8108473557693, + "loss": 0.0187, + "rewards/chosen": 6.304441972212358, + "rewards/margins": 18.56909040971236, + "rewards/rejected": -12.2646484375, + "step": 338 + }, + { + "epoch": 0.08482422119354435, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39775402.666666664, + "logits/rejected": -60844488.53333333, + "logps/chosen": -373.6398111979167, + "logps/rejected": -589.4291666666667, + "loss": 0.0569, + "rewards/chosen": 4.993715074327257, + "rewards/margins": 12.940140448676216, + "rewards/rejected": -7.946425374348959, + "step": 339 + }, + { + "epoch": 0.08507444013511822, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54369786.18181818, + "logits/rejected": -34264625.23076923, + "logps/chosen": -363.16166548295456, + "logps/rejected": -634.0920222355769, + "loss": 0.0525, + "rewards/chosen": 5.306921872225675, + "rewards/margins": 15.834063736708849, + "rewards/rejected": -10.527141864483173, + "step": 340 + }, + { + "epoch": 0.08532465907669211, + "grad_norm": 13.8125, + "kl": 3.7437191009521484, + "learning_rate": 5e-06, + "logits/chosen": -69789661.53846154, + "logits/rejected": -71503592.72727273, + "logps/chosen": -552.8607271634615, + "logps/rejected": -348.5597478693182, + "loss": 0.0393, + "rewards/chosen": 6.496727576622596, + "rewards/margins": 11.194468304827495, + "rewards/rejected": -4.6977407282049, + "step": 341 + }, + { + "epoch": 0.08557487801826598, + "grad_norm": 18.75, + "kl": 2.2659192085266113, + "learning_rate": 5e-06, + "logits/chosen": -55322080.0, + "logits/rejected": -19687675.42857143, + "logps/chosen": -468.987841796875, + "logps/rejected": -484.4410923549107, + "loss": 0.0457, + "rewards/chosen": 5.919325637817383, + "rewards/margins": 12.086318915230887, + "rewards/rejected": -6.166993277413504, + "step": 342 + }, + { + "epoch": 0.08582509695983986, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -82238848.0, + "logits/rejected": -32393040.0, + "logps/chosen": -356.1858825683594, + "logps/rejected": -407.47186279296875, + "loss": 0.0844, + "rewards/chosen": 3.6388773918151855, + "rewards/margins": 9.83605670928955, + "rewards/rejected": -6.197179317474365, + "step": 343 + }, + { + "epoch": 0.08607531590141373, + "grad_norm": 13.1875, + "kl": 1.406911849975586, + "learning_rate": 5e-06, + "logits/chosen": -54211136.0, + "logits/rejected": -49059731.2, + "logps/chosen": -490.14634486607144, + "logps/rejected": -461.68271484375, + "loss": 0.0765, + "rewards/chosen": 5.853201729910714, + "rewards/margins": 13.903588540213448, + "rewards/rejected": -8.050386810302735, + "step": 344 + }, + { + "epoch": 0.08632553484298762, + "grad_norm": 23.75, + "kl": 10.15842056274414, + "learning_rate": 5e-06, + "logits/chosen": -16359782.666666666, + "logits/rejected": -95072714.66666667, + "logps/chosen": -399.1624348958333, + "logps/rejected": -492.3768717447917, + "loss": 0.1384, + "rewards/chosen": 4.814750989278157, + "rewards/margins": 13.772103945414226, + "rewards/rejected": -8.957352956136068, + "step": 345 + }, + { + "epoch": 0.08657575378456149, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39405501.333333336, + "logits/rejected": -35529018.666666664, + "logps/chosen": -424.7720133463542, + "logps/rejected": -584.3602294921875, + "loss": 0.0367, + "rewards/chosen": 5.987045923868815, + "rewards/margins": 15.205389658610027, + "rewards/rejected": -9.218343734741211, + "step": 346 + }, + { + "epoch": 0.08682597272613538, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71689997.71428572, + "logits/rejected": -36729336.47058824, + "logps/chosen": -670.90380859375, + "logps/rejected": -450.77039292279414, + "loss": 0.0563, + "rewards/chosen": 7.845038822719029, + "rewards/margins": 14.11451801332105, + "rewards/rejected": -6.269479190602022, + "step": 347 + }, + { + "epoch": 0.08707619166770925, + "grad_norm": 10.1875, + "kl": 0.8384997248649597, + "learning_rate": 5e-06, + "logits/chosen": -73047540.36363636, + "logits/rejected": -50919163.07692308, + "logps/chosen": -502.95854048295456, + "logps/rejected": -549.7272385817307, + "loss": 0.0266, + "rewards/chosen": 6.3045786077326, + "rewards/margins": 15.477944274048705, + "rewards/rejected": -9.173365666316105, + "step": 348 + }, + { + "epoch": 0.08732641060928312, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8290919.0, + "logits/rejected": 18450308.0, + "logps/chosen": -378.5304870605469, + "logps/rejected": -619.2883911132812, + "loss": 0.0528, + "rewards/chosen": 4.09270715713501, + "rewards/margins": 12.661085605621338, + "rewards/rejected": -8.568378448486328, + "step": 349 + }, + { + "epoch": 0.087576629550857, + "grad_norm": 24.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65208251.07692308, + "logits/rejected": -50788544.0, + "logps/chosen": -550.4088792067307, + "logps/rejected": -394.7405894886364, + "loss": 0.0686, + "rewards/chosen": 4.863852867713342, + "rewards/margins": 11.727734759137348, + "rewards/rejected": -6.863881891424006, + "step": 350 + }, + { + "epoch": 0.08782684849243087, + "grad_norm": 8.5625, + "kl": 3.084390640258789, + "learning_rate": 5e-06, + "logits/chosen": -50031324.44444445, + "logits/rejected": -53625011.2, + "logps/chosen": -503.4275716145833, + "logps/rejected": -781.49453125, + "loss": 0.0382, + "rewards/chosen": 5.849625481499566, + "rewards/margins": 18.398342725965712, + "rewards/rejected": -12.548717244466145, + "step": 351 + }, + { + "epoch": 0.08807706743400476, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50025744.0, + "logits/rejected": -73963898.66666667, + "logps/chosen": -418.3951822916667, + "logps/rejected": -627.3041585286459, + "loss": 0.0376, + "rewards/chosen": 5.052695910135905, + "rewards/margins": 13.235371589660645, + "rewards/rejected": -8.18267567952474, + "step": 352 + }, + { + "epoch": 0.08832728637557863, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26449706.666666668, + "logits/rejected": -36583244.8, + "logps/chosen": -303.1364474826389, + "logps/rejected": -434.3656901041667, + "loss": 0.0688, + "rewards/chosen": 3.8088162740071616, + "rewards/margins": 12.002457427978516, + "rewards/rejected": -8.193641153971354, + "step": 353 + }, + { + "epoch": 0.08857750531715251, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56242976.0, + "logits/rejected": 33482381.714285713, + "logps/chosen": -548.902880859375, + "logps/rejected": -346.14878627232144, + "loss": 0.0369, + "rewards/chosen": 7.538912200927735, + "rewards/margins": 13.006964656284879, + "rewards/rejected": -5.468052455357143, + "step": 354 + }, + { + "epoch": 0.08882772425872638, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53564284.8, + "logits/rejected": -62660818.28571428, + "logps/chosen": -397.53154296875, + "logps/rejected": -714.7157505580357, + "loss": 0.0661, + "rewards/chosen": 3.3319602966308595, + "rewards/margins": 15.111241912841797, + "rewards/rejected": -11.779281616210938, + "step": 355 + }, + { + "epoch": 0.08907794320030027, + "grad_norm": 20.625, + "kl": 3.1753997802734375, + "learning_rate": 5e-06, + "logits/chosen": -39247488.0, + "logits/rejected": -55188656.0, + "logps/chosen": -421.8942565917969, + "logps/rejected": -615.5950927734375, + "loss": 0.0858, + "rewards/chosen": 4.690866470336914, + "rewards/margins": 15.615058898925781, + "rewards/rejected": -10.924192428588867, + "step": 356 + }, + { + "epoch": 0.08932816214187414, + "grad_norm": 17.375, + "kl": 1.4323711395263672, + "learning_rate": 5e-06, + "logits/chosen": -60970124.8, + "logits/rejected": -73626752.0, + "logps/chosen": -333.5826171875, + "logps/rejected": -555.0177525111607, + "loss": 0.0877, + "rewards/chosen": 3.657939910888672, + "rewards/margins": 11.130105699811663, + "rewards/rejected": -7.472165788922991, + "step": 357 + }, + { + "epoch": 0.08957838108344801, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67144472.0, + "logits/rejected": -38479056.0, + "logps/chosen": -423.0672607421875, + "logps/rejected": -615.0408935546875, + "loss": 0.1016, + "rewards/chosen": 4.16230583190918, + "rewards/margins": 13.634326934814453, + "rewards/rejected": -9.472021102905273, + "step": 358 + }, + { + "epoch": 0.0898286000250219, + "grad_norm": 18.625, + "kl": 3.3815784454345703, + "learning_rate": 5e-06, + "logits/chosen": -45535152.0, + "logits/rejected": -51963908.0, + "logps/chosen": -386.4666748046875, + "logps/rejected": -322.1533508300781, + "loss": 0.1285, + "rewards/chosen": 3.6765856742858887, + "rewards/margins": 8.868733882904053, + "rewards/rejected": -5.192148208618164, + "step": 359 + }, + { + "epoch": 0.09007881896659577, + "grad_norm": 19.875, + "kl": 0.9900690913200378, + "learning_rate": 5e-06, + "logits/chosen": -57026585.6, + "logits/rejected": -60664571.428571425, + "logps/chosen": -374.341943359375, + "logps/rejected": -529.1930454799107, + "loss": 0.0431, + "rewards/chosen": 4.683572387695312, + "rewards/margins": 12.21929212297712, + "rewards/rejected": -7.535719735281808, + "step": 360 + }, + { + "epoch": 0.09032903790816965, + "grad_norm": 5.65625, + "kl": 3.256103515625, + "learning_rate": 5e-06, + "logits/chosen": -61525415.384615384, + "logits/rejected": -63263197.09090909, + "logps/chosen": -541.3863431490385, + "logps/rejected": -405.3743341619318, + "loss": 0.0149, + "rewards/chosen": 6.213826693021334, + "rewards/margins": 12.861952775008195, + "rewards/rejected": -6.64812608198686, + "step": 361 + }, + { + "epoch": 0.09057925684974352, + "grad_norm": 13.375, + "kl": 4.158720016479492, + "learning_rate": 5e-06, + "logits/chosen": -56053405.538461536, + "logits/rejected": -46150685.09090909, + "logps/chosen": -394.01810396634613, + "logps/rejected": -508.25053267045456, + "loss": 0.0663, + "rewards/chosen": 4.731486100416917, + "rewards/margins": 11.334641063129986, + "rewards/rejected": -6.603154962713068, + "step": 362 + }, + { + "epoch": 0.09082947579131741, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58984179.2, + "logits/rejected": -44649984.0, + "logps/chosen": -429.757373046875, + "logps/rejected": -497.41650390625, + "loss": 0.0432, + "rewards/chosen": 5.909109497070313, + "rewards/margins": 14.319558933803014, + "rewards/rejected": -8.410449436732701, + "step": 363 + }, + { + "epoch": 0.09107969473289128, + "grad_norm": 15.4375, + "kl": 2.5075480937957764, + "learning_rate": 5e-06, + "logits/chosen": -36391227.733333334, + "logits/rejected": -60230762.666666664, + "logps/chosen": -371.2073567708333, + "logps/rejected": -653.6985677083334, + "loss": 0.107, + "rewards/chosen": 4.0661875406901045, + "rewards/margins": 14.488671196831596, + "rewards/rejected": -10.422483656141493, + "step": 364 + }, + { + "epoch": 0.09132991367446516, + "grad_norm": 19.625, + "kl": 5.928158760070801, + "learning_rate": 5e-06, + "logits/chosen": -66486496.0, + "logits/rejected": 1660486.6666666667, + "logps/chosen": -516.28369140625, + "logps/rejected": -445.3562418619792, + "loss": 0.0979, + "rewards/chosen": 6.100261688232422, + "rewards/margins": 12.259714762369793, + "rewards/rejected": -6.15945307413737, + "step": 365 + }, + { + "epoch": 0.09158013261603903, + "grad_norm": 13.6875, + "kl": 6.553426265716553, + "learning_rate": 5e-06, + "logits/chosen": -55526043.428571425, + "logits/rejected": -49916358.4, + "logps/chosen": -441.85703822544644, + "logps/rejected": -593.2875, + "loss": 0.0348, + "rewards/chosen": 5.074913569859096, + "rewards/margins": 13.679854365757535, + "rewards/rejected": -8.604940795898438, + "step": 366 + }, + { + "epoch": 0.0918303515576129, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41151360.0, + "logits/rejected": -41403968.0, + "logps/chosen": -417.757666015625, + "logps/rejected": -400.60878208705356, + "loss": 0.0609, + "rewards/chosen": 4.611019897460937, + "rewards/margins": 11.120613861083985, + "rewards/rejected": -6.509593963623047, + "step": 367 + }, + { + "epoch": 0.09208057049918679, + "grad_norm": 14.0, + "kl": 8.070347785949707, + "learning_rate": 5e-06, + "logits/chosen": -37015507.692307696, + "logits/rejected": -31232366.545454547, + "logps/chosen": -395.2751652644231, + "logps/rejected": -530.2959872159091, + "loss": 0.0552, + "rewards/chosen": 5.743570767916166, + "rewards/margins": 12.627804416042942, + "rewards/rejected": -6.884233648126775, + "step": 368 + }, + { + "epoch": 0.09233078944076066, + "grad_norm": 27.75, + "kl": 2.749298095703125, + "learning_rate": 5e-06, + "logits/chosen": -65480448.0, + "logits/rejected": -25416629.333333332, + "logps/chosen": -481.6826171875, + "logps/rejected": -474.8369140625, + "loss": 0.1008, + "rewards/chosen": 6.059861501057942, + "rewards/margins": 10.086954752604166, + "rewards/rejected": -4.027093251546224, + "step": 369 + }, + { + "epoch": 0.09258100838233455, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44566824.0, + "logits/rejected": -45385685.333333336, + "logps/chosen": -399.710693359375, + "logps/rejected": -653.25244140625, + "loss": 0.0388, + "rewards/chosen": 4.439666112263997, + "rewards/margins": 15.766674677530926, + "rewards/rejected": -11.327008565266928, + "step": 370 + }, + { + "epoch": 0.09283122732390842, + "grad_norm": 18.75, + "kl": 6.107187271118164, + "learning_rate": 5e-06, + "logits/chosen": -63404730.666666664, + "logits/rejected": -22978010.666666668, + "logps/chosen": -474.6171061197917, + "logps/rejected": -350.288818359375, + "loss": 0.088, + "rewards/chosen": 4.971944491068522, + "rewards/margins": 10.882649421691895, + "rewards/rejected": -5.910704930623372, + "step": 371 + }, + { + "epoch": 0.0930814462654823, + "grad_norm": 14.75, + "kl": 3.7354979515075684, + "learning_rate": 5e-06, + "logits/chosen": -34817354.666666664, + "logits/rejected": -6763334.666666667, + "logps/chosen": -350.4503173828125, + "logps/rejected": -509.0269775390625, + "loss": 0.0934, + "rewards/chosen": 5.426799774169922, + "rewards/margins": 10.002487182617188, + "rewards/rejected": -4.575687408447266, + "step": 372 + }, + { + "epoch": 0.09333166520705617, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50732711.384615384, + "logits/rejected": -43489856.0, + "logps/chosen": -204.86395733173077, + "logps/rejected": -454.76589133522725, + "loss": 0.1408, + "rewards/chosen": 3.477960146390475, + "rewards/margins": 8.843580846186285, + "rewards/rejected": -5.36562069979581, + "step": 373 + }, + { + "epoch": 0.09358188414863006, + "grad_norm": 12.75, + "kl": 0.46170300245285034, + "learning_rate": 5e-06, + "logits/chosen": -53032192.0, + "logits/rejected": -76971019.63636364, + "logps/chosen": -434.9354717548077, + "logps/rejected": -654.2144886363636, + "loss": 0.0481, + "rewards/chosen": 6.249504089355469, + "rewards/margins": 12.367059881036932, + "rewards/rejected": -6.117555791681463, + "step": 374 + }, + { + "epoch": 0.09383210309020393, + "grad_norm": 18.75, + "kl": 17.76038360595703, + "learning_rate": 5e-06, + "logits/chosen": -66216277.333333336, + "logits/rejected": -48391557.333333336, + "logps/chosen": -435.092529296875, + "logps/rejected": -399.892333984375, + "loss": 0.1204, + "rewards/chosen": 6.640109592013889, + "rewards/margins": 12.17822986178928, + "rewards/rejected": -5.538120269775391, + "step": 375 + }, + { + "epoch": 0.09408232203177781, + "grad_norm": 17.5, + "kl": 13.891247749328613, + "learning_rate": 5e-06, + "logits/chosen": -59493808.0, + "logits/rejected": -28179568.0, + "logps/chosen": -455.0027262369792, + "logps/rejected": -370.8207600911458, + "loss": 0.1225, + "rewards/chosen": 6.140077590942383, + "rewards/margins": 10.166110038757324, + "rewards/rejected": -4.026032447814941, + "step": 376 + }, + { + "epoch": 0.09433254097335168, + "grad_norm": 5.6875, + "kl": 7.242225170135498, + "learning_rate": 5e-06, + "logits/chosen": -58986362.666666664, + "logits/rejected": -28042061.333333332, + "logps/chosen": -565.0171712239584, + "logps/rejected": -347.8406168619792, + "loss": 0.0162, + "rewards/chosen": 7.618915557861328, + "rewards/margins": 13.646324157714844, + "rewards/rejected": -6.027408599853516, + "step": 377 + }, + { + "epoch": 0.09458275991492555, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52458016.0, + "logits/rejected": -24554508.8, + "logps/chosen": -299.80918666294644, + "logps/rejected": -501.97216796875, + "loss": 0.0837, + "rewards/chosen": 3.9322482517787387, + "rewards/margins": 12.154504721505301, + "rewards/rejected": -8.222256469726563, + "step": 378 + }, + { + "epoch": 0.09483297885649944, + "grad_norm": 23.75, + "kl": 8.254425048828125, + "learning_rate": 5e-06, + "logits/chosen": -49368411.428571425, + "logits/rejected": -47947078.4, + "logps/chosen": -448.5439453125, + "logps/rejected": -664.28271484375, + "loss": 0.0619, + "rewards/chosen": 6.7694887433733255, + "rewards/margins": 14.484971836635044, + "rewards/rejected": -7.715483093261719, + "step": 379 + }, + { + "epoch": 0.09508319779807331, + "grad_norm": 12.4375, + "kl": 1.073413610458374, + "learning_rate": 5e-06, + "logits/chosen": -28673864.533333335, + "logits/rejected": -38044817.777777776, + "logps/chosen": -329.4783203125, + "logps/rejected": -461.26161024305554, + "loss": 0.1649, + "rewards/chosen": 4.78260498046875, + "rewards/margins": 11.099345228407117, + "rewards/rejected": -6.316740247938368, + "step": 380 + }, + { + "epoch": 0.0953334167396472, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30628922.181818184, + "logits/rejected": -29202161.230769232, + "logps/chosen": -285.32712624289775, + "logps/rejected": -498.37661508413464, + "loss": 0.1054, + "rewards/chosen": 4.811467950994318, + "rewards/margins": 10.034194626174607, + "rewards/rejected": -5.222726675180288, + "step": 381 + }, + { + "epoch": 0.09558363568122107, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39937210.18181818, + "logits/rejected": -34606724.92307692, + "logps/chosen": -437.4091796875, + "logps/rejected": -572.4448617788462, + "loss": 0.0646, + "rewards/chosen": 5.796208815141157, + "rewards/margins": 14.80409995659248, + "rewards/rejected": -9.007891141451323, + "step": 382 + }, + { + "epoch": 0.09583385462279495, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31486598.4, + "logits/rejected": -56153600.0, + "logps/chosen": -402.7879638671875, + "logps/rejected": -559.7917131696429, + "loss": 0.0637, + "rewards/chosen": 5.576757431030273, + "rewards/margins": 14.116545813424247, + "rewards/rejected": -8.539788382393974, + "step": 383 + }, + { + "epoch": 0.09608407356436882, + "grad_norm": 13.125, + "kl": 8.177698135375977, + "learning_rate": 5e-06, + "logits/chosen": -28956149.333333332, + "logits/rejected": -38146065.777777776, + "logps/chosen": -396.94127604166664, + "logps/rejected": -384.12681749131946, + "loss": 0.0715, + "rewards/chosen": 5.464049275716146, + "rewards/margins": 11.317184702555338, + "rewards/rejected": -5.853135426839192, + "step": 384 + }, + { + "epoch": 0.0963342925059427, + "grad_norm": 8.8125, + "kl": 5.718497276306152, + "learning_rate": 5e-06, + "logits/chosen": -40536068.571428575, + "logits/rejected": -93329408.0, + "logps/chosen": -372.83241489955356, + "logps/rejected": -517.881298828125, + "loss": 0.0322, + "rewards/chosen": 4.840836661202567, + "rewards/margins": 11.606588309151785, + "rewards/rejected": -6.765751647949219, + "step": 385 + }, + { + "epoch": 0.09658451144751658, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46708416.0, + "logits/rejected": -38078960.0, + "logps/chosen": -339.2285888671875, + "logps/rejected": -552.4586007254464, + "loss": 0.054, + "rewards/chosen": 5.0961761474609375, + "rewards/margins": 14.008648463657924, + "rewards/rejected": -8.912472316196986, + "step": 386 + }, + { + "epoch": 0.09683473038909045, + "grad_norm": 12.25, + "kl": 1.0777747631072998, + "learning_rate": 5e-06, + "logits/chosen": -65940394.666666664, + "logits/rejected": -61548618.666666664, + "logps/chosen": -545.5083821614584, + "logps/rejected": -683.36474609375, + "loss": 0.0414, + "rewards/chosen": 5.95986811319987, + "rewards/margins": 15.873210906982422, + "rewards/rejected": -9.913342793782553, + "step": 387 + }, + { + "epoch": 0.09708494933066433, + "grad_norm": 24.125, + "kl": 7.892280101776123, + "learning_rate": 5e-06, + "logits/chosen": -45052066.13333333, + "logits/rejected": -22081038.222222224, + "logps/chosen": -353.6652018229167, + "logps/rejected": -462.7194010416667, + "loss": 0.0798, + "rewards/chosen": 5.11092274983724, + "rewards/margins": 13.230683898925781, + "rewards/rejected": -8.119761149088541, + "step": 388 + }, + { + "epoch": 0.0973351682722382, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27990958.769230768, + "logits/rejected": -65293504.0, + "logps/chosen": -320.1760441706731, + "logps/rejected": -499.63383345170456, + "loss": 0.0959, + "rewards/chosen": 4.739841461181641, + "rewards/margins": 10.843053991144354, + "rewards/rejected": -6.103212529962713, + "step": 389 + }, + { + "epoch": 0.09758538721381209, + "grad_norm": 13.75, + "kl": 0.3630460202693939, + "learning_rate": 5e-06, + "logits/chosen": -45893707.63636363, + "logits/rejected": -34649604.92307692, + "logps/chosen": -292.3326970880682, + "logps/rejected": -452.06381460336536, + "loss": 0.0554, + "rewards/chosen": 3.9567704634232954, + "rewards/margins": 9.537572794027263, + "rewards/rejected": -5.580802330603967, + "step": 390 + }, + { + "epoch": 0.09783560615538596, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53935556.571428575, + "logits/rejected": -41159052.8, + "logps/chosen": -393.78916713169644, + "logps/rejected": -526.728857421875, + "loss": 0.0606, + "rewards/chosen": 5.70606449672154, + "rewards/margins": 13.297688184465681, + "rewards/rejected": -7.591623687744141, + "step": 391 + }, + { + "epoch": 0.09808582509695984, + "grad_norm": 22.625, + "kl": 8.097393989562988, + "learning_rate": 5e-06, + "logits/chosen": -44183943.52941176, + "logits/rejected": -50947894.85714286, + "logps/chosen": -410.19680606617646, + "logps/rejected": -846.0643833705357, + "loss": 0.1159, + "rewards/chosen": 5.803953282973346, + "rewards/margins": 20.649368670808165, + "rewards/rejected": -14.845415387834821, + "step": 392 + }, + { + "epoch": 0.09833604403853372, + "grad_norm": 13.125, + "kl": 1.3518741130828857, + "learning_rate": 5e-06, + "logits/chosen": -40081449.84615385, + "logits/rejected": -45273658.18181818, + "logps/chosen": -468.5323016826923, + "logps/rejected": -567.9649325284091, + "loss": 0.0829, + "rewards/chosen": 5.369481013371394, + "rewards/margins": 13.613824164117133, + "rewards/rejected": -8.244343150745738, + "step": 393 + }, + { + "epoch": 0.0985862629801076, + "grad_norm": 18.75, + "kl": 8.161908149719238, + "learning_rate": 5e-06, + "logits/chosen": -60935847.384615384, + "logits/rejected": -40088401.45454545, + "logps/chosen": -450.28797325721155, + "logps/rejected": -503.8245738636364, + "loss": 0.1072, + "rewards/chosen": 6.421485900878906, + "rewards/margins": 12.865921714089133, + "rewards/rejected": -6.4444358132102275, + "step": 394 + }, + { + "epoch": 0.09883648192168147, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49220906.666666664, + "logits/rejected": -35500326.4, + "logps/chosen": -517.005859375, + "logps/rejected": -534.5302083333333, + "loss": 0.0846, + "rewards/chosen": 6.194328308105469, + "rewards/margins": 13.034056599934896, + "rewards/rejected": -6.839728291829427, + "step": 395 + }, + { + "epoch": 0.09908670086325534, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77013294.54545455, + "logits/rejected": -18858615.384615384, + "logps/chosen": -552.6618874289773, + "logps/rejected": -617.21435546875, + "loss": 0.0566, + "rewards/chosen": 7.5582143610174, + "rewards/margins": 15.697539429564577, + "rewards/rejected": -8.139325068547176, + "step": 396 + }, + { + "epoch": 0.09933691980482923, + "grad_norm": 7.125, + "kl": 2.094575881958008, + "learning_rate": 5e-06, + "logits/chosen": -57553609.14285714, + "logits/rejected": -50990073.6, + "logps/chosen": -456.2685546875, + "logps/rejected": -521.18876953125, + "loss": 0.0339, + "rewards/chosen": 6.946342468261719, + "rewards/margins": 14.4228515625, + "rewards/rejected": -7.476509094238281, + "step": 397 + }, + { + "epoch": 0.0995871387464031, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45192153.6, + "logits/rejected": -36347890.28571428, + "logps/chosen": -328.09736328125, + "logps/rejected": -487.84291294642856, + "loss": 0.0884, + "rewards/chosen": 4.698400497436523, + "rewards/margins": 11.430424554007395, + "rewards/rejected": -6.732024056570871, + "step": 398 + }, + { + "epoch": 0.09983735768797698, + "grad_norm": 10.0625, + "kl": 2.2654342651367188, + "learning_rate": 5e-06, + "logits/chosen": 20556571.636363637, + "logits/rejected": -22432477.53846154, + "logps/chosen": -473.34170809659093, + "logps/rejected": -640.076171875, + "loss": 0.0478, + "rewards/chosen": 5.259163943204013, + "rewards/margins": 13.574018491731657, + "rewards/rejected": -8.314854548527645, + "step": 399 + }, + { + "epoch": 0.10008757662955085, + "grad_norm": 11.75, + "kl": 1.1438745260238647, + "learning_rate": 5e-06, + "logits/chosen": -70439456.0, + "logits/rejected": -48127808.0, + "logps/chosen": -391.1977132161458, + "logps/rejected": -468.4460042317708, + "loss": 0.0664, + "rewards/chosen": 4.742527008056641, + "rewards/margins": 11.225823720296223, + "rewards/rejected": -6.483296712239583, + "step": 400 + }, + { + "epoch": 0.10033779557112474, + "grad_norm": 16.375, + "kl": 1.0232124328613281, + "learning_rate": 5e-06, + "logits/chosen": -57168663.27272727, + "logits/rejected": -54569028.92307692, + "logps/chosen": -300.26895419034093, + "logps/rejected": -476.3874323918269, + "loss": 0.0886, + "rewards/chosen": 4.676924618807706, + "rewards/margins": 10.507736339435711, + "rewards/rejected": -5.830811720628005, + "step": 401 + }, + { + "epoch": 0.10058801451269861, + "grad_norm": 11.25, + "kl": 0.8866307139396667, + "learning_rate": 5e-06, + "logits/chosen": -44025165.71428572, + "logits/rejected": -46981712.0, + "logps/chosen": -429.13818359375, + "logps/rejected": -417.379736328125, + "loss": 0.0927, + "rewards/chosen": 5.455929347446987, + "rewards/margins": 12.826463862827847, + "rewards/rejected": -7.370534515380859, + "step": 402 + }, + { + "epoch": 0.1008382334542725, + "grad_norm": 11.8125, + "kl": 1.8641650676727295, + "learning_rate": 5e-06, + "logits/chosen": -63372405.333333336, + "logits/rejected": -44824485.333333336, + "logps/chosen": -416.6009928385417, + "logps/rejected": -511.1730550130208, + "loss": 0.0466, + "rewards/chosen": 5.438326517740886, + "rewards/margins": 12.913864135742188, + "rewards/rejected": -7.475537618001302, + "step": 403 + }, + { + "epoch": 0.10108845239584636, + "grad_norm": 8.9375, + "kl": 0.44264063239097595, + "learning_rate": 5e-06, + "logits/chosen": -81691273.84615384, + "logits/rejected": -30523296.0, + "logps/chosen": -401.87289663461536, + "logps/rejected": -425.74338600852275, + "loss": 0.0489, + "rewards/chosen": 4.802876985990084, + "rewards/margins": 10.41016777412041, + "rewards/rejected": -5.607290788130327, + "step": 404 + }, + { + "epoch": 0.10133867133742024, + "grad_norm": 14.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43013686.4, + "logits/rejected": -31936971.42857143, + "logps/chosen": -422.82099609375, + "logps/rejected": -417.3173828125, + "loss": 0.0853, + "rewards/chosen": 4.338645553588867, + "rewards/margins": 10.886906923566546, + "rewards/rejected": -6.548261369977679, + "step": 405 + }, + { + "epoch": 0.10158889027899412, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51162449.777777776, + "logits/rejected": -54308573.86666667, + "logps/chosen": -520.4177517361111, + "logps/rejected": -607.8944661458333, + "loss": 0.004, + "rewards/chosen": 7.769977145724827, + "rewards/margins": 19.033067152235244, + "rewards/rejected": -11.263090006510417, + "step": 406 + }, + { + "epoch": 0.10183910922056799, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47403029.333333336, + "logits/rejected": -51197397.333333336, + "logps/chosen": -332.00469970703125, + "logps/rejected": -457.2459309895833, + "loss": 0.0488, + "rewards/chosen": 3.972020467122396, + "rewards/margins": 11.739330291748047, + "rewards/rejected": -7.767309824625651, + "step": 407 + }, + { + "epoch": 0.10208932816214188, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -92978826.66666667, + "logits/rejected": -39795232.0, + "logps/chosen": -444.3814290364583, + "logps/rejected": -519.9366319444445, + "loss": 0.0436, + "rewards/chosen": 6.104851404825847, + "rewards/margins": 13.283403820461697, + "rewards/rejected": -7.178552415635851, + "step": 408 + }, + { + "epoch": 0.10233954710371575, + "grad_norm": 25.5, + "kl": 3.115605354309082, + "learning_rate": 5e-06, + "logits/chosen": -30063936.0, + "logits/rejected": -55245927.11111111, + "logps/chosen": -266.97451171875, + "logps/rejected": -378.625, + "loss": 0.1341, + "rewards/chosen": 3.777569580078125, + "rewards/margins": 9.546980455186631, + "rewards/rejected": -5.769410875108507, + "step": 409 + }, + { + "epoch": 0.10258976604528963, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2317781.3333333335, + "logits/rejected": -73145820.44444445, + "logps/chosen": -510.08118489583336, + "logps/rejected": -891.1088324652778, + "loss": 0.0647, + "rewards/chosen": 6.504647827148437, + "rewards/margins": 17.896556260850694, + "rewards/rejected": -11.391908433702257, + "step": 410 + }, + { + "epoch": 0.1028399849868635, + "grad_norm": 24.5, + "kl": 10.833757400512695, + "learning_rate": 5e-06, + "logits/chosen": -35619772.23529412, + "logits/rejected": -44575968.0, + "logps/chosen": -382.6388154871324, + "logps/rejected": -406.9873744419643, + "loss": 0.1755, + "rewards/chosen": 4.639774995691636, + "rewards/margins": 10.885245315167083, + "rewards/rejected": -6.245470319475446, + "step": 411 + }, + { + "epoch": 0.10309020392843739, + "grad_norm": 19.25, + "kl": 7.022817134857178, + "learning_rate": 5e-06, + "logits/chosen": -58954845.538461536, + "logits/rejected": -45061486.54545455, + "logps/chosen": -487.4467022235577, + "logps/rejected": -543.7039683948864, + "loss": 0.0714, + "rewards/chosen": 6.067454998309795, + "rewards/margins": 15.676225195397864, + "rewards/rejected": -9.608770197088068, + "step": 412 + }, + { + "epoch": 0.10334042287001126, + "grad_norm": 12.5, + "kl": 1.0240873098373413, + "learning_rate": 5e-06, + "logits/chosen": -48003172.571428575, + "logits/rejected": -37617283.2, + "logps/chosen": -261.23702566964283, + "logps/rejected": -470.354443359375, + "loss": 0.096, + "rewards/chosen": 3.7107960837227956, + "rewards/margins": 9.509578432355609, + "rewards/rejected": -5.7987823486328125, + "step": 413 + }, + { + "epoch": 0.10359064181158514, + "grad_norm": 12.0625, + "kl": 1.8942980766296387, + "learning_rate": 5e-06, + "logits/chosen": -42673890.13333333, + "logits/rejected": -45731893.333333336, + "logps/chosen": -375.79625651041664, + "logps/rejected": -445.3735622829861, + "loss": 0.079, + "rewards/chosen": 5.186106363932292, + "rewards/margins": 12.294780731201172, + "rewards/rejected": -7.10867436726888, + "step": 414 + }, + { + "epoch": 0.10384086075315901, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7241936.0, + "logits/rejected": -50001260.307692304, + "logps/chosen": -552.8135209517045, + "logps/rejected": -572.2059420072115, + "loss": 0.03, + "rewards/chosen": 5.882486516779119, + "rewards/margins": 15.103575806517702, + "rewards/rejected": -9.221089289738583, + "step": 415 + }, + { + "epoch": 0.10409107969473289, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43977437.86666667, + "logits/rejected": -79319217.77777778, + "logps/chosen": -486.6686197916667, + "logps/rejected": -732.5679796006945, + "loss": 0.0268, + "rewards/chosen": 6.040640258789063, + "rewards/margins": 16.642068820529516, + "rewards/rejected": -10.601428561740452, + "step": 416 + }, + { + "epoch": 0.10434129863630677, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41147776.0, + "logits/rejected": -41775364.266666666, + "logps/chosen": -368.81426323784723, + "logps/rejected": -450.8942057291667, + "loss": 0.0977, + "rewards/chosen": 5.021399603949653, + "rewards/margins": 10.690480719672308, + "rewards/rejected": -5.669081115722657, + "step": 417 + }, + { + "epoch": 0.10459151757788064, + "grad_norm": 13.25, + "kl": 4.804970741271973, + "learning_rate": 5e-06, + "logits/chosen": -47708659.2, + "logits/rejected": -40883541.333333336, + "logps/chosen": -427.544140625, + "logps/rejected": -615.9753146701389, + "loss": 0.0508, + "rewards/chosen": 5.232835896809896, + "rewards/margins": 14.803684997558594, + "rewards/rejected": -9.570849100748697, + "step": 418 + }, + { + "epoch": 0.10484173651945453, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37544085.333333336, + "logits/rejected": -33258616.0, + "logps/chosen": -348.0797932942708, + "logps/rejected": -572.90478515625, + "loss": 0.0556, + "rewards/chosen": 5.4418894449869795, + "rewards/margins": 15.198873519897461, + "rewards/rejected": -9.756984074910482, + "step": 419 + }, + { + "epoch": 0.1050919554610284, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54339653.81818182, + "logits/rejected": -37937309.538461536, + "logps/chosen": -371.27934126420456, + "logps/rejected": -563.3169696514423, + "loss": 0.0581, + "rewards/chosen": 4.655000166459517, + "rewards/margins": 14.421822981400922, + "rewards/rejected": -9.766822814941406, + "step": 420 + }, + { + "epoch": 0.10534217440260228, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63854405.81818182, + "logits/rejected": -57682284.307692304, + "logps/chosen": -497.42746803977275, + "logps/rejected": -663.4538762019231, + "loss": 0.0264, + "rewards/chosen": 6.401584972034801, + "rewards/margins": 15.93082081187855, + "rewards/rejected": -9.52923583984375, + "step": 421 + }, + { + "epoch": 0.10559239334417615, + "grad_norm": 2.859375, + "kl": 1.827080488204956, + "learning_rate": 5e-06, + "logits/chosen": -73727705.6, + "logits/rejected": -37536870.85714286, + "logps/chosen": -520.97421875, + "logps/rejected": -383.84995814732144, + "loss": 0.0072, + "rewards/chosen": 7.687107086181641, + "rewards/margins": 15.212650844029017, + "rewards/rejected": -7.525543757847378, + "step": 422 + }, + { + "epoch": 0.10584261228575004, + "grad_norm": 4.625, + "kl": 5.112859725952148, + "learning_rate": 5e-06, + "logits/chosen": -71892704.0, + "logits/rejected": -18449296.0, + "logps/chosen": -536.8836263020834, + "logps/rejected": -331.226318359375, + "loss": 0.0121, + "rewards/chosen": 8.18552271525065, + "rewards/margins": 14.193390528361002, + "rewards/rejected": -6.007867813110352, + "step": 423 + }, + { + "epoch": 0.10609283122732391, + "grad_norm": 11.4375, + "kl": 4.349067211151123, + "learning_rate": 5e-06, + "logits/chosen": -59904878.93333333, + "logits/rejected": -62533589.333333336, + "logps/chosen": -395.45517578125, + "logps/rejected": -482.7253689236111, + "loss": 0.0882, + "rewards/chosen": 4.3522796630859375, + "rewards/margins": 12.742725796169704, + "rewards/rejected": -8.390446133083767, + "step": 424 + }, + { + "epoch": 0.10634305016889778, + "grad_norm": 14.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42599539.2, + "logits/rejected": -81123666.28571428, + "logps/chosen": -436.782666015625, + "logps/rejected": -501.5443638392857, + "loss": 0.1078, + "rewards/chosen": 4.611692810058594, + "rewards/margins": 11.875519452776228, + "rewards/rejected": -7.263826642717634, + "step": 425 + }, + { + "epoch": 0.10659326911047166, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37022701.333333336, + "logits/rejected": -43165373.333333336, + "logps/chosen": -283.94740804036456, + "logps/rejected": -503.1743570963542, + "loss": 0.0585, + "rewards/chosen": 4.248084704081218, + "rewards/margins": 10.905401547749838, + "rewards/rejected": -6.65731684366862, + "step": 426 + }, + { + "epoch": 0.10684348805204553, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23991613.09090909, + "logits/rejected": -49280157.538461536, + "logps/chosen": -492.92959872159093, + "logps/rejected": -610.6906174879807, + "loss": 0.0308, + "rewards/chosen": 5.263313640247691, + "rewards/margins": 15.086980726335431, + "rewards/rejected": -9.82366708608774, + "step": 427 + }, + { + "epoch": 0.10709370699361942, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53200699.07692308, + "logits/rejected": 9402146.909090908, + "logps/chosen": -270.28173828125, + "logps/rejected": -583.5978338068181, + "loss": 0.0781, + "rewards/chosen": 4.015569833608774, + "rewards/margins": 11.952680040906358, + "rewards/rejected": -7.937110207297585, + "step": 428 + }, + { + "epoch": 0.10734392593519329, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -105375266.9090909, + "logits/rejected": -44693144.615384616, + "logps/chosen": -422.41526100852275, + "logps/rejected": -547.7668269230769, + "loss": 0.0656, + "rewards/chosen": 3.8282331986860796, + "rewards/margins": 14.670419759683677, + "rewards/rejected": -10.842186560997597, + "step": 429 + }, + { + "epoch": 0.10759414487676718, + "grad_norm": 19.375, + "kl": 1.44424569606781, + "learning_rate": 5e-06, + "logits/chosen": -39972753.06666667, + "logits/rejected": -27854250.666666668, + "logps/chosen": -400.51256510416664, + "logps/rejected": -663.8101128472222, + "loss": 0.0632, + "rewards/chosen": 6.041231282552084, + "rewards/margins": 14.498454962836373, + "rewards/rejected": -8.457223680284288, + "step": 430 + }, + { + "epoch": 0.10784436381834105, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -92005045.33333333, + "logits/rejected": -51833440.0, + "logps/chosen": -465.8907063802083, + "logps/rejected": -594.366455078125, + "loss": 0.0596, + "rewards/chosen": 5.629677454630534, + "rewards/margins": 13.738147099812824, + "rewards/rejected": -8.108469645182291, + "step": 431 + }, + { + "epoch": 0.10809458275991493, + "grad_norm": 5.3125, + "kl": 0.06840769946575165, + "learning_rate": 5e-06, + "logits/chosen": -30050475.636363637, + "logits/rejected": -55986180.92307692, + "logps/chosen": -455.12349076704544, + "logps/rejected": -548.5456730769231, + "loss": 0.0332, + "rewards/chosen": 6.364178050648082, + "rewards/margins": 16.604269227781494, + "rewards/rejected": -10.240091177133413, + "step": 432 + }, + { + "epoch": 0.1083448017014888, + "grad_norm": 7.8125, + "kl": 0.8455416560173035, + "learning_rate": 5e-06, + "logits/chosen": -59441879.27272727, + "logits/rejected": -43116731.07692308, + "logps/chosen": -540.3319424715909, + "logps/rejected": -570.4277719350962, + "loss": 0.01, + "rewards/chosen": 7.734629544344815, + "rewards/margins": 15.767583246831293, + "rewards/rejected": -8.032953702486479, + "step": 433 + }, + { + "epoch": 0.10859502064306267, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42783740.8, + "logits/rejected": -59091515.428571425, + "logps/chosen": -336.4683349609375, + "logps/rejected": -619.0197405133929, + "loss": 0.0688, + "rewards/chosen": 4.17241325378418, + "rewards/margins": 14.749433081490654, + "rewards/rejected": -10.577019827706474, + "step": 434 + }, + { + "epoch": 0.10884523958463656, + "grad_norm": 15.0625, + "kl": 1.6571426391601562, + "learning_rate": 5e-06, + "logits/chosen": -52426240.0, + "logits/rejected": -43346029.333333336, + "logps/chosen": -327.007080078125, + "logps/rejected": -359.2379964192708, + "loss": 0.0616, + "rewards/chosen": 3.975179354349772, + "rewards/margins": 10.891711870829264, + "rewards/rejected": -6.916532516479492, + "step": 435 + }, + { + "epoch": 0.10909545852621043, + "grad_norm": 9.25, + "kl": 2.7744154930114746, + "learning_rate": 5e-06, + "logits/chosen": -56368116.36363637, + "logits/rejected": -55251067.07692308, + "logps/chosen": -512.1365855823864, + "logps/rejected": -612.5079627403846, + "loss": 0.0327, + "rewards/chosen": 7.093271428888494, + "rewards/margins": 17.611389480270706, + "rewards/rejected": -10.518118051382212, + "step": 436 + }, + { + "epoch": 0.10934567746778431, + "grad_norm": 16.25, + "kl": 1.4108521938323975, + "learning_rate": 5e-06, + "logits/chosen": -39318958.54545455, + "logits/rejected": -36087556.92307692, + "logps/chosen": -297.08469460227275, + "logps/rejected": -451.4909855769231, + "loss": 0.0989, + "rewards/chosen": 4.057796825062145, + "rewards/margins": 11.61541518631515, + "rewards/rejected": -7.557618361253005, + "step": 437 + }, + { + "epoch": 0.10959589640935818, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64789038.54545455, + "logits/rejected": -46188721.23076923, + "logps/chosen": -406.27787642045456, + "logps/rejected": -488.8867938701923, + "loss": 0.0522, + "rewards/chosen": 4.8656369989568535, + "rewards/margins": 12.836351861486902, + "rewards/rejected": -7.970714862530048, + "step": 438 + }, + { + "epoch": 0.10984611535093207, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47823145.6, + "logits/rejected": -57922779.428571425, + "logps/chosen": -363.1122314453125, + "logps/rejected": -569.68994140625, + "loss": 0.0802, + "rewards/chosen": 4.555009460449218, + "rewards/margins": 13.741898018973213, + "rewards/rejected": -9.186888558523995, + "step": 439 + }, + { + "epoch": 0.11009633429250594, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46450609.23076923, + "logits/rejected": -62006528.0, + "logps/chosen": -415.20688100961536, + "logps/rejected": -607.5606800426136, + "loss": 0.0706, + "rewards/chosen": 5.4302203838641825, + "rewards/margins": 14.26920756093272, + "rewards/rejected": -8.838987177068537, + "step": 440 + }, + { + "epoch": 0.11034655323407982, + "grad_norm": 8.4375, + "kl": 0.7785409688949585, + "learning_rate": 5e-06, + "logits/chosen": -48214857.84615385, + "logits/rejected": -68788165.81818181, + "logps/chosen": -421.80551382211536, + "logps/rejected": -653.4932084517045, + "loss": 0.0484, + "rewards/chosen": 5.005285409780649, + "rewards/margins": 16.069748244919143, + "rewards/rejected": -11.064462835138494, + "step": 441 + }, + { + "epoch": 0.1105967721756537, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57273272.0, + "logits/rejected": -46521432.0, + "logps/chosen": -472.7195129394531, + "logps/rejected": -615.48193359375, + "loss": 0.0665, + "rewards/chosen": 5.434605598449707, + "rewards/margins": 15.126108169555664, + "rewards/rejected": -9.691502571105957, + "step": 442 + }, + { + "epoch": 0.11084699111722758, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59167099.07692308, + "logits/rejected": -56120965.81818182, + "logps/chosen": -430.2843674879808, + "logps/rejected": -611.7582120028409, + "loss": 0.0408, + "rewards/chosen": 5.39390857403095, + "rewards/margins": 15.747195317195011, + "rewards/rejected": -10.353286743164062, + "step": 443 + }, + { + "epoch": 0.11109721005880145, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40118033.45454545, + "logits/rejected": -34688561.23076923, + "logps/chosen": -360.45780806107956, + "logps/rejected": -517.9439227764423, + "loss": 0.0346, + "rewards/chosen": 4.406946702436968, + "rewards/margins": 14.288353046337207, + "rewards/rejected": -9.88140634390024, + "step": 444 + }, + { + "epoch": 0.11134742900037532, + "grad_norm": 12.1875, + "kl": 1.8678210973739624, + "learning_rate": 5e-06, + "logits/chosen": -42519163.07692308, + "logits/rejected": -29502612.363636363, + "logps/chosen": -329.134765625, + "logps/rejected": -450.5138494318182, + "loss": 0.0965, + "rewards/chosen": 4.272696568415715, + "rewards/margins": 14.40737827007587, + "rewards/rejected": -10.134681701660156, + "step": 445 + }, + { + "epoch": 0.11159764794194921, + "grad_norm": 15.625, + "kl": 0.2287565916776657, + "learning_rate": 5e-06, + "logits/chosen": -16067683.555555556, + "logits/rejected": -75281416.53333333, + "logps/chosen": -523.2049696180555, + "logps/rejected": -596.6475260416667, + "loss": 0.0885, + "rewards/chosen": 4.291886647542317, + "rewards/margins": 12.028658294677733, + "rewards/rejected": -7.736771647135416, + "step": 446 + }, + { + "epoch": 0.11184786688352308, + "grad_norm": 3.4375, + "kl": 1.16470468044281, + "learning_rate": 5e-06, + "logits/chosen": -91582615.27272727, + "logits/rejected": -32545806.769230768, + "logps/chosen": -589.0840287642045, + "logps/rejected": -544.3393930288462, + "loss": 0.0063, + "rewards/chosen": 8.398947975852273, + "rewards/margins": 16.89925725976904, + "rewards/rejected": -8.500309283916767, + "step": 447 + }, + { + "epoch": 0.11209808582509696, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66744974.76923077, + "logits/rejected": -36661026.90909091, + "logps/chosen": -386.50450721153845, + "logps/rejected": -571.6574928977273, + "loss": 0.053, + "rewards/chosen": 5.015506450946514, + "rewards/margins": 14.494260667920946, + "rewards/rejected": -9.478754216974432, + "step": 448 + }, + { + "epoch": 0.11234830476667083, + "grad_norm": 10.1875, + "kl": 1.5537364482879639, + "learning_rate": 5e-06, + "logits/chosen": -63999374.76923077, + "logits/rejected": -74214469.81818181, + "logps/chosen": -385.4125225360577, + "logps/rejected": -595.1963778409091, + "loss": 0.0543, + "rewards/chosen": 5.21881338266226, + "rewards/margins": 14.902449441122842, + "rewards/rejected": -9.683636058460582, + "step": 449 + }, + { + "epoch": 0.11259852370824472, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48373746.28571428, + "logits/rejected": -66640519.52941176, + "logps/chosen": -399.30106026785717, + "logps/rejected": -635.9929917279412, + "loss": 0.0336, + "rewards/chosen": 5.6707354954310825, + "rewards/margins": 16.89242595384101, + "rewards/rejected": -11.221690458409926, + "step": 450 + }, + { + "epoch": 0.11284874264981859, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55439051.63636363, + "logits/rejected": -27698461.53846154, + "logps/chosen": -360.3815252130682, + "logps/rejected": -319.3821364182692, + "loss": 0.0666, + "rewards/chosen": 4.7006613991477275, + "rewards/margins": 12.42681895436107, + "rewards/rejected": -7.726157555213342, + "step": 451 + }, + { + "epoch": 0.11309896159139247, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67583872.0, + "logits/rejected": -83846882.46153846, + "logps/chosen": -341.9344371448864, + "logps/rejected": -519.6827674278846, + "loss": 0.0478, + "rewards/chosen": 5.524294072931463, + "rewards/margins": 13.071720736843723, + "rewards/rejected": -7.54742666391226, + "step": 452 + }, + { + "epoch": 0.11334918053296635, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42845492.0, + "logits/rejected": -37726132.0, + "logps/chosen": -332.44293212890625, + "logps/rejected": -580.2208862304688, + "loss": 0.0755, + "rewards/chosen": 4.65379524230957, + "rewards/margins": 12.033156871795654, + "rewards/rejected": -7.379361629486084, + "step": 453 + }, + { + "epoch": 0.11359939947454022, + "grad_norm": 9.5, + "kl": 5.602470397949219, + "learning_rate": 5e-06, + "logits/chosen": -57169546.666666664, + "logits/rejected": -13590354.666666666, + "logps/chosen": -388.2919108072917, + "logps/rejected": -596.8492838541666, + "loss": 0.068, + "rewards/chosen": 5.011869430541992, + "rewards/margins": 13.917655309041342, + "rewards/rejected": -8.90578587849935, + "step": 454 + }, + { + "epoch": 0.1138496184161141, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -84295982.54545455, + "logits/rejected": -54136497.23076923, + "logps/chosen": -457.11319247159093, + "logps/rejected": -520.7634465144231, + "loss": 0.0264, + "rewards/chosen": 6.40874550559304, + "rewards/margins": 14.554382430923567, + "rewards/rejected": -8.145636925330528, + "step": 455 + }, + { + "epoch": 0.11409983735768797, + "grad_norm": 30.75, + "kl": 11.186366081237793, + "learning_rate": 5e-06, + "logits/chosen": -66573275.428571425, + "logits/rejected": -67588889.6, + "logps/chosen": -326.14327566964283, + "logps/rejected": -690.95556640625, + "loss": 0.2057, + "rewards/chosen": 3.4635941641671315, + "rewards/margins": 12.93242656162807, + "rewards/rejected": -9.468832397460938, + "step": 456 + }, + { + "epoch": 0.11435005629926186, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73573179.73333333, + "logits/rejected": -77340636.44444445, + "logps/chosen": -524.6490885416666, + "logps/rejected": -512.8880750868055, + "loss": 0.0285, + "rewards/chosen": 6.236517333984375, + "rewards/margins": 13.422470262315539, + "rewards/rejected": -7.185952928331163, + "step": 457 + }, + { + "epoch": 0.11460027524083573, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79265735.1111111, + "logits/rejected": -36078596.266666666, + "logps/chosen": -396.03301323784723, + "logps/rejected": -469.51373697916665, + "loss": 0.0173, + "rewards/chosen": 4.9003550211588545, + "rewards/margins": 14.78834228515625, + "rewards/rejected": -9.887987263997395, + "step": 458 + }, + { + "epoch": 0.11485049418240961, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -82130585.6, + "logits/rejected": -48419026.28571428, + "logps/chosen": -509.479736328125, + "logps/rejected": -520.1116768973214, + "loss": 0.0365, + "rewards/chosen": 6.426950073242187, + "rewards/margins": 15.140568215506416, + "rewards/rejected": -8.71361814226423, + "step": 459 + }, + { + "epoch": 0.11510071312398348, + "grad_norm": 16.0, + "kl": 9.457859992980957, + "learning_rate": 5e-06, + "logits/chosen": -45067478.85714286, + "logits/rejected": -62056377.6, + "logps/chosen": -474.35306222098217, + "logps/rejected": -622.212841796875, + "loss": 0.0838, + "rewards/chosen": 6.8805084228515625, + "rewards/margins": 15.898917388916015, + "rewards/rejected": -9.018408966064452, + "step": 460 + }, + { + "epoch": 0.11535093206555737, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35475108.571428575, + "logits/rejected": -77169062.4, + "logps/chosen": -363.6903599330357, + "logps/rejected": -648.9046875, + "loss": 0.0722, + "rewards/chosen": 4.659856523786273, + "rewards/margins": 16.018920244489397, + "rewards/rejected": -11.359063720703125, + "step": 461 + }, + { + "epoch": 0.11560115100713124, + "grad_norm": 6.28125, + "kl": 1.090598464012146, + "learning_rate": 5e-06, + "logits/chosen": -74379520.0, + "logits/rejected": -40001571.55555555, + "logps/chosen": -486.63287760416665, + "logps/rejected": -554.8781467013889, + "loss": 0.0109, + "rewards/chosen": 8.59778544108073, + "rewards/margins": 19.568378363715276, + "rewards/rejected": -10.970592922634548, + "step": 462 + }, + { + "epoch": 0.11585136994870511, + "grad_norm": 13.5625, + "kl": 1.3382396697998047, + "learning_rate": 5e-06, + "logits/chosen": -22800081.777777776, + "logits/rejected": -37535044.266666666, + "logps/chosen": -508.11675347222223, + "logps/rejected": -432.5379231770833, + "loss": 0.0531, + "rewards/chosen": 6.193984561496311, + "rewards/margins": 12.401752302381727, + "rewards/rejected": -6.207767740885417, + "step": 463 + }, + { + "epoch": 0.116101588890279, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50146112.0, + "logits/rejected": -39513017.6, + "logps/chosen": -459.5585239955357, + "logps/rejected": -578.97080078125, + "loss": 0.0283, + "rewards/chosen": 6.518726348876953, + "rewards/margins": 17.24543685913086, + "rewards/rejected": -10.726710510253906, + "step": 464 + }, + { + "epoch": 0.11635180783185287, + "grad_norm": 9.6875, + "kl": 0.2913055419921875, + "learning_rate": 5e-06, + "logits/chosen": -33372720.0, + "logits/rejected": -55401685.333333336, + "logps/chosen": -376.7622884114583, + "logps/rejected": -360.7132568359375, + "loss": 0.0812, + "rewards/chosen": 5.447196960449219, + "rewards/margins": 12.01614761352539, + "rewards/rejected": -6.568950653076172, + "step": 465 + }, + { + "epoch": 0.11660202677342675, + "grad_norm": 15.0, + "kl": 0.9293226003646851, + "learning_rate": 5e-06, + "logits/chosen": -54136459.63636363, + "logits/rejected": -25768428.307692308, + "logps/chosen": -369.4251819957386, + "logps/rejected": -394.2653245192308, + "loss": 0.0812, + "rewards/chosen": 5.809360850941051, + "rewards/margins": 11.81350361217152, + "rewards/rejected": -6.004142761230469, + "step": 466 + }, + { + "epoch": 0.11685224571500062, + "grad_norm": 4.90625, + "kl": 0.4398040771484375, + "learning_rate": 5e-06, + "logits/chosen": -21144785.333333332, + "logits/rejected": -77457888.0, + "logps/chosen": -427.287353515625, + "logps/rejected": -368.9241536458333, + "loss": 0.0163, + "rewards/chosen": 5.185189247131348, + "rewards/margins": 11.416746457417805, + "rewards/rejected": -6.231557210286458, + "step": 467 + }, + { + "epoch": 0.1171024646565745, + "grad_norm": 3.4375, + "kl": 1.4409472942352295, + "learning_rate": 5e-06, + "logits/chosen": -57688384.0, + "logits/rejected": -49374240.0, + "logps/chosen": -552.608447265625, + "logps/rejected": -549.7452218191964, + "loss": 0.0062, + "rewards/chosen": 7.94610595703125, + "rewards/margins": 16.24700927734375, + "rewards/rejected": -8.3009033203125, + "step": 468 + }, + { + "epoch": 0.11735268359814838, + "grad_norm": 9.0625, + "kl": 3.589811325073242, + "learning_rate": 5e-06, + "logits/chosen": -64632625.23076923, + "logits/rejected": -41390429.09090909, + "logps/chosen": -343.5602463942308, + "logps/rejected": -445.1888316761364, + "loss": 0.0812, + "rewards/chosen": 4.639742631178636, + "rewards/margins": 9.403381907856549, + "rewards/rejected": -4.763639276677912, + "step": 469 + }, + { + "epoch": 0.11760290253972226, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55867793.45454545, + "logits/rejected": -25705326.769230768, + "logps/chosen": -446.68696732954544, + "logps/rejected": -461.61177884615387, + "loss": 0.0355, + "rewards/chosen": 5.096036737615412, + "rewards/margins": 13.319340899274065, + "rewards/rejected": -8.223304161658653, + "step": 470 + }, + { + "epoch": 0.11785312148129613, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46496901.81818182, + "logits/rejected": -27688430.769230768, + "logps/chosen": -290.997802734375, + "logps/rejected": -648.7838040865385, + "loss": 0.0984, + "rewards/chosen": 3.826521786776456, + "rewards/margins": 10.63640783883475, + "rewards/rejected": -6.809886052058293, + "step": 471 + }, + { + "epoch": 0.11810334042287002, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35014086.85714286, + "logits/rejected": -46966264.47058824, + "logps/chosen": -364.2755650111607, + "logps/rejected": -588.3134765625, + "loss": 0.0637, + "rewards/chosen": 4.479448318481445, + "rewards/margins": 14.524808210485121, + "rewards/rejected": -10.045359892003676, + "step": 472 + }, + { + "epoch": 0.11835355936444389, + "grad_norm": 10.0, + "kl": 5.845806121826172, + "learning_rate": 5e-06, + "logits/chosen": -42262010.666666664, + "logits/rejected": -5774304.0, + "logps/chosen": -383.2425537109375, + "logps/rejected": -824.1959635416666, + "loss": 0.0602, + "rewards/chosen": 6.369055430094401, + "rewards/margins": 18.599793752034504, + "rewards/rejected": -12.230738321940104, + "step": 473 + }, + { + "epoch": 0.11860377830601776, + "grad_norm": 6.65625, + "kl": 2.1483943462371826, + "learning_rate": 5e-06, + "logits/chosen": -61046961.23076923, + "logits/rejected": -63630714.18181818, + "logps/chosen": -368.2033128004808, + "logps/rejected": -500.08198686079544, + "loss": 0.0166, + "rewards/chosen": 6.370841393103967, + "rewards/margins": 14.787518694684223, + "rewards/rejected": -8.416677301580256, + "step": 474 + }, + { + "epoch": 0.11885399724759164, + "grad_norm": 10.6875, + "kl": 15.326486587524414, + "learning_rate": 5e-06, + "logits/chosen": -59713250.13333333, + "logits/rejected": -25291317.333333332, + "logps/chosen": -460.09765625, + "logps/rejected": -703.8848741319445, + "loss": 0.0689, + "rewards/chosen": 6.759861246744792, + "rewards/margins": 16.236472913953993, + "rewards/rejected": -9.476611667209202, + "step": 475 + }, + { + "epoch": 0.11910421618916552, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20665618.90909091, + "logits/rejected": -27693661.53846154, + "logps/chosen": -301.78457919034093, + "logps/rejected": -550.5569786658654, + "loss": 0.0703, + "rewards/chosen": 4.220409046519887, + "rewards/margins": 13.16304186840991, + "rewards/rejected": -8.942632821890024, + "step": 476 + }, + { + "epoch": 0.1193544351307394, + "grad_norm": 11.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68123658.66666667, + "logits/rejected": -52094229.333333336, + "logps/chosen": -459.8805338541667, + "logps/rejected": -468.3011474609375, + "loss": 0.0423, + "rewards/chosen": 5.906017939249675, + "rewards/margins": 16.16494305928548, + "rewards/rejected": -10.258925120035807, + "step": 477 + }, + { + "epoch": 0.11960465407231327, + "grad_norm": 6.15625, + "kl": 2.859895706176758, + "learning_rate": 5e-06, + "logits/chosen": -50117834.666666664, + "logits/rejected": 13962564.0, + "logps/chosen": -417.3835042317708, + "logps/rejected": -478.0733235677083, + "loss": 0.0527, + "rewards/chosen": 6.168768564860026, + "rewards/margins": 12.145323435465496, + "rewards/rejected": -5.976554870605469, + "step": 478 + }, + { + "epoch": 0.11985487301388716, + "grad_norm": 18.5, + "kl": 1.1742995977401733, + "learning_rate": 5e-06, + "logits/chosen": -49570372.92307692, + "logits/rejected": -31609547.636363637, + "logps/chosen": -434.6548602764423, + "logps/rejected": -373.51686789772725, + "loss": 0.087, + "rewards/chosen": 5.331655062161959, + "rewards/margins": 11.418631546980851, + "rewards/rejected": -6.086976484818892, + "step": 479 + }, + { + "epoch": 0.12010509195546103, + "grad_norm": 15.3125, + "kl": 1.315460205078125, + "learning_rate": 5e-06, + "logits/chosen": -40150051.2, + "logits/rejected": -38186505.14285714, + "logps/chosen": -401.6398681640625, + "logps/rejected": -547.6748744419643, + "loss": 0.0514, + "rewards/chosen": 5.968011474609375, + "rewards/margins": 15.018058122907366, + "rewards/rejected": -9.050046648297991, + "step": 480 + }, + { + "epoch": 0.12035531089703491, + "grad_norm": 28.5, + "kl": 12.87321662902832, + "learning_rate": 5e-06, + "logits/chosen": -55077428.36363637, + "logits/rejected": 53048851.692307696, + "logps/chosen": -451.60715553977275, + "logps/rejected": -631.8985877403846, + "loss": 0.0932, + "rewards/chosen": 7.126282431862571, + "rewards/margins": 14.064957358620383, + "rewards/rejected": -6.9386749267578125, + "step": 481 + }, + { + "epoch": 0.12060552983860878, + "grad_norm": 12.75, + "kl": 2.398913860321045, + "learning_rate": 5e-06, + "logits/chosen": -47802420.36363637, + "logits/rejected": -45856064.0, + "logps/chosen": -375.00363991477275, + "logps/rejected": -495.0247145432692, + "loss": 0.0733, + "rewards/chosen": 5.409019817005504, + "rewards/margins": 12.809068879881105, + "rewards/rejected": -7.400049062875601, + "step": 482 + }, + { + "epoch": 0.12085574878018265, + "grad_norm": 5.09375, + "kl": 6.2386674880981445, + "learning_rate": 5e-06, + "logits/chosen": -38800280.0, + "logits/rejected": -33975160.0, + "logps/chosen": -497.49139404296875, + "logps/rejected": -713.5885620117188, + "loss": 0.0938, + "rewards/chosen": 6.65096378326416, + "rewards/margins": 14.327479839324951, + "rewards/rejected": -7.676516056060791, + "step": 483 + }, + { + "epoch": 0.12110596772175654, + "grad_norm": 22.625, + "kl": 11.31030559539795, + "learning_rate": 5e-06, + "logits/chosen": -65001774.93333333, + "logits/rejected": -24923900.444444444, + "logps/chosen": -507.31178385416666, + "logps/rejected": -521.2833116319445, + "loss": 0.061, + "rewards/chosen": 7.19973398844401, + "rewards/margins": 15.36088375515408, + "rewards/rejected": -8.16114976671007, + "step": 484 + }, + { + "epoch": 0.12135618666333041, + "grad_norm": 5.90625, + "kl": 5.078367233276367, + "learning_rate": 5e-06, + "logits/chosen": -74973979.42857143, + "logits/rejected": -48408723.2, + "logps/chosen": -400.0113002232143, + "logps/rejected": -327.6156494140625, + "loss": 0.0571, + "rewards/chosen": 6.5358734130859375, + "rewards/margins": 12.664257049560547, + "rewards/rejected": -6.128383636474609, + "step": 485 + }, + { + "epoch": 0.1216064056049043, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48302883.2, + "logits/rejected": -12341133.714285715, + "logps/chosen": -360.29853515625, + "logps/rejected": -503.53627232142856, + "loss": 0.0337, + "rewards/chosen": 6.144741439819336, + "rewards/margins": 13.616791697910855, + "rewards/rejected": -7.472050258091518, + "step": 486 + }, + { + "epoch": 0.12185662454647816, + "grad_norm": 10.3125, + "kl": 7.17965841293335, + "learning_rate": 5e-06, + "logits/chosen": -72141056.0, + "logits/rejected": -44767606.15384615, + "logps/chosen": -547.6633522727273, + "logps/rejected": -548.7711838942307, + "loss": 0.0105, + "rewards/chosen": 8.66045448996804, + "rewards/margins": 16.591907901363772, + "rewards/rejected": -7.931453411395733, + "step": 487 + }, + { + "epoch": 0.12210684348805205, + "grad_norm": 18.75, + "kl": 2.8869330883026123, + "learning_rate": 5e-06, + "logits/chosen": -39958440.0, + "logits/rejected": -20978050.666666668, + "logps/chosen": -266.3028564453125, + "logps/rejected": -570.3648274739584, + "loss": 0.1361, + "rewards/chosen": 3.7354443868001304, + "rewards/margins": 14.502566019694012, + "rewards/rejected": -10.76712163289388, + "step": 488 + }, + { + "epoch": 0.12235706242962592, + "grad_norm": 8.125, + "kl": 7.888890743255615, + "learning_rate": 5e-06, + "logits/chosen": -39167570.28571428, + "logits/rejected": -76855008.0, + "logps/chosen": -388.73032924107144, + "logps/rejected": -485.32421875, + "loss": 0.0769, + "rewards/chosen": 6.364748273577009, + "rewards/margins": 13.185390363420758, + "rewards/rejected": -6.82064208984375, + "step": 489 + }, + { + "epoch": 0.1226072813711998, + "grad_norm": 27.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33452950.85714286, + "logits/rejected": -65937460.705882356, + "logps/chosen": -416.3704310825893, + "logps/rejected": -577.2244944852941, + "loss": 0.0699, + "rewards/chosen": 5.780368259974888, + "rewards/margins": 11.687593027323235, + "rewards/rejected": -5.907224767348346, + "step": 490 + }, + { + "epoch": 0.12285750031277368, + "grad_norm": 14.9375, + "kl": 8.293676376342773, + "learning_rate": 5e-06, + "logits/chosen": -95655424.0, + "logits/rejected": -58857070.222222224, + "logps/chosen": -504.1125, + "logps/rejected": -525.0114474826389, + "loss": 0.0645, + "rewards/chosen": 5.967329915364584, + "rewards/margins": 15.612222290039062, + "rewards/rejected": -9.644892374674479, + "step": 491 + }, + { + "epoch": 0.12310771925434755, + "grad_norm": 15.6875, + "kl": 0.24074110388755798, + "learning_rate": 5e-06, + "logits/chosen": -69385493.33333333, + "logits/rejected": -34415170.13333333, + "logps/chosen": -425.4816080729167, + "logps/rejected": -573.4834635416667, + "loss": 0.0401, + "rewards/chosen": 6.937757703993055, + "rewards/margins": 15.098754204644097, + "rewards/rejected": -8.160996500651041, + "step": 492 + }, + { + "epoch": 0.12335793819592143, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49814382.54545455, + "logits/rejected": -44867017.84615385, + "logps/chosen": -516.3655894886364, + "logps/rejected": -450.52944711538464, + "loss": 0.0215, + "rewards/chosen": 5.593210740522905, + "rewards/margins": 13.613422767265693, + "rewards/rejected": -8.020212026742788, + "step": 493 + }, + { + "epoch": 0.1236081571374953, + "grad_norm": 22.625, + "kl": 12.503021240234375, + "learning_rate": 5e-06, + "logits/chosen": -7045472.0, + "logits/rejected": -61588666.18181818, + "logps/chosen": -560.8127253605769, + "logps/rejected": -613.7776988636364, + "loss": 0.0884, + "rewards/chosen": 6.884715153620793, + "rewards/margins": 13.994393942239402, + "rewards/rejected": -7.109678788618608, + "step": 494 + }, + { + "epoch": 0.12385837607906919, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67192540.44444445, + "logits/rejected": -44729378.13333333, + "logps/chosen": -484.6663411458333, + "logps/rejected": -537.2735026041667, + "loss": 0.0621, + "rewards/chosen": 6.838818868001302, + "rewards/margins": 14.29560546875, + "rewards/rejected": -7.456786600748698, + "step": 495 + }, + { + "epoch": 0.12410859502064306, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41143004.8, + "logits/rejected": -40233846.85714286, + "logps/chosen": -306.438134765625, + "logps/rejected": -423.08740234375, + "loss": 0.0433, + "rewards/chosen": 5.018299102783203, + "rewards/margins": 12.230145590645925, + "rewards/rejected": -7.211846487862723, + "step": 496 + }, + { + "epoch": 0.12435881396221694, + "grad_norm": 5.125, + "kl": 2.0190443992614746, + "learning_rate": 5e-06, + "logits/chosen": -58310645.333333336, + "logits/rejected": -59065100.8, + "logps/chosen": -442.364501953125, + "logps/rejected": -632.051171875, + "loss": 0.0371, + "rewards/chosen": 6.31556150648329, + "rewards/margins": 16.375715721978082, + "rewards/rejected": -10.060154215494792, + "step": 497 + }, + { + "epoch": 0.12460903290379081, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10794531.692307692, + "logits/rejected": -47191863.27272727, + "logps/chosen": -393.6940730168269, + "logps/rejected": -479.57790305397725, + "loss": 0.0533, + "rewards/chosen": 6.14932133601262, + "rewards/margins": 13.989344029993443, + "rewards/rejected": -7.840022693980824, + "step": 498 + }, + { + "epoch": 0.1248592518453647, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -660263.5, + "logits/rejected": -41052148.0, + "logps/chosen": -345.80963134765625, + "logps/rejected": -500.7593688964844, + "loss": 0.0649, + "rewards/chosen": 4.8870849609375, + "rewards/margins": 13.218653678894043, + "rewards/rejected": -8.331568717956543, + "step": 499 + }, + { + "epoch": 0.12510947078693857, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49384086.85714286, + "logits/rejected": -47163286.5882353, + "logps/chosen": -392.75770786830356, + "logps/rejected": -514.7162798713235, + "loss": 0.0447, + "rewards/chosen": 5.240893227713449, + "rewards/margins": 13.04556098104525, + "rewards/rejected": -7.804667753331802, + "step": 500 + }, + { + "epoch": 0.12535968972851244, + "grad_norm": 24.125, + "kl": 13.870463371276855, + "learning_rate": 5e-06, + "logits/chosen": -34992692.705882356, + "logits/rejected": -49290697.14285714, + "logps/chosen": -449.64694393382354, + "logps/rejected": -478.52779715401783, + "loss": 0.1207, + "rewards/chosen": 5.975617352653952, + "rewards/margins": 15.519418796571363, + "rewards/rejected": -9.543801443917411, + "step": 501 + }, + { + "epoch": 0.1256099086700863, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46354883.55555555, + "logits/rejected": -52046830.93333333, + "logps/chosen": -504.72667100694446, + "logps/rejected": -782.2417317708333, + "loss": 0.0263, + "rewards/chosen": 6.27376471625434, + "rewards/margins": 19.49381883409288, + "rewards/rejected": -13.220054117838542, + "step": 502 + }, + { + "epoch": 0.1258601276116602, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63085602.461538464, + "logits/rejected": -49758004.36363637, + "logps/chosen": -346.48940805288464, + "logps/rejected": -501.6334783380682, + "loss": 0.0873, + "rewards/chosen": 4.194590348463792, + "rewards/margins": 10.549574978701717, + "rewards/rejected": -6.354984630237926, + "step": 503 + }, + { + "epoch": 0.12611034655323408, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37878294.4, + "logits/rejected": -44222112.0, + "logps/chosen": -300.1798828125, + "logps/rejected": -666.5936104910714, + "loss": 0.0332, + "rewards/chosen": 4.9337005615234375, + "rewards/margins": 17.644949776785715, + "rewards/rejected": -12.711249215262276, + "step": 504 + }, + { + "epoch": 0.12636056549480795, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53258168.88888889, + "logits/rejected": -62086340.266666666, + "logps/chosen": -497.5511067708333, + "logps/rejected": -525.2665364583333, + "loss": 0.0172, + "rewards/chosen": 8.90140872531467, + "rewards/margins": 18.32415042453342, + "rewards/rejected": -9.42274169921875, + "step": 505 + }, + { + "epoch": 0.12661078443638182, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58714240.0, + "logits/rejected": -68869421.71428572, + "logps/chosen": -466.70322265625, + "logps/rejected": -527.516357421875, + "loss": 0.0408, + "rewards/chosen": 6.711792755126953, + "rewards/margins": 16.323926326206752, + "rewards/rejected": -9.612133571079799, + "step": 506 + }, + { + "epoch": 0.12686100337795572, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28285740.8, + "logits/rejected": -51621261.71428572, + "logps/chosen": -253.264501953125, + "logps/rejected": -547.0942034040179, + "loss": 0.0528, + "rewards/chosen": 4.269728088378907, + "rewards/margins": 13.727525547572546, + "rewards/rejected": -9.457797459193639, + "step": 507 + }, + { + "epoch": 0.1271112223195296, + "grad_norm": 16.375, + "kl": 1.6754951477050781, + "learning_rate": 5e-06, + "logits/chosen": -60989648.0, + "logits/rejected": -48107856.0, + "logps/chosen": -577.9093831380209, + "logps/rejected": -504.8100992838542, + "loss": 0.035, + "rewards/chosen": 5.744365692138672, + "rewards/margins": 13.332722345987957, + "rewards/rejected": -7.588356653849284, + "step": 508 + }, + { + "epoch": 0.12736144126110346, + "grad_norm": 19.375, + "kl": 5.535085201263428, + "learning_rate": 5e-06, + "logits/chosen": -59012608.0, + "logits/rejected": -43437836.0, + "logps/chosen": -449.87152099609375, + "logps/rejected": -410.8323669433594, + "loss": 0.0751, + "rewards/chosen": 5.659799098968506, + "rewards/margins": 12.933778762817383, + "rewards/rejected": -7.273979663848877, + "step": 509 + }, + { + "epoch": 0.12761166020267734, + "grad_norm": 11.125, + "kl": 2.6640734672546387, + "learning_rate": 5e-06, + "logits/chosen": -38895342.93333333, + "logits/rejected": 20717648.0, + "logps/chosen": -480.3834635416667, + "logps/rejected": -503.7947591145833, + "loss": 0.0827, + "rewards/chosen": 5.3864802042643225, + "rewards/margins": 14.387341478135852, + "rewards/rejected": -9.000861273871529, + "step": 510 + }, + { + "epoch": 0.12786187914425123, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25693600.0, + "logits/rejected": -39176096.0, + "logps/chosen": -292.12186373197113, + "logps/rejected": -862.1125710227273, + "loss": 0.0745, + "rewards/chosen": 4.115102327786959, + "rewards/margins": 15.687349919672613, + "rewards/rejected": -11.572247591885654, + "step": 511 + }, + { + "epoch": 0.1281120980858251, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46853792.0, + "logits/rejected": -53366701.71428572, + "logps/chosen": -295.2908935546875, + "logps/rejected": -556.5693359375, + "loss": 0.0817, + "rewards/chosen": 4.939128494262695, + "rewards/margins": 15.05255252293178, + "rewards/rejected": -10.113424028669085, + "step": 512 + }, + { + "epoch": 0.12836231702739898, + "grad_norm": 11.0625, + "kl": 9.1113862991333, + "learning_rate": 5e-06, + "logits/chosen": -48826320.0, + "logits/rejected": -78610368.0, + "logps/chosen": -402.9245910644531, + "logps/rejected": -859.9910888671875, + "loss": 0.0929, + "rewards/chosen": 5.665109634399414, + "rewards/margins": 21.817462921142578, + "rewards/rejected": -16.152353286743164, + "step": 513 + }, + { + "epoch": 0.12861253596897285, + "grad_norm": 9.125, + "kl": 0.6761309504508972, + "learning_rate": 5e-06, + "logits/chosen": -52341338.666666664, + "logits/rejected": -62310213.333333336, + "logps/chosen": -522.9697265625, + "logps/rejected": -595.3945719401041, + "loss": 0.0305, + "rewards/chosen": 4.998908996582031, + "rewards/margins": 15.923812866210938, + "rewards/rejected": -10.924903869628906, + "step": 514 + }, + { + "epoch": 0.12886275491054672, + "grad_norm": 8.875, + "kl": 0.22754161059856415, + "learning_rate": 5e-06, + "logits/chosen": -71374148.26666667, + "logits/rejected": -48037955.55555555, + "logps/chosen": -380.0354817708333, + "logps/rejected": -605.1385091145834, + "loss": 0.0613, + "rewards/chosen": 4.615861002604166, + "rewards/margins": 13.637049526638455, + "rewards/rejected": -9.021188524034288, + "step": 515 + }, + { + "epoch": 0.12911297385212062, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24142262.85714286, + "logits/rejected": -54316363.294117644, + "logps/chosen": -352.91859654017856, + "logps/rejected": -469.2028377757353, + "loss": 0.063, + "rewards/chosen": 4.072139195033482, + "rewards/margins": 13.999904985187435, + "rewards/rejected": -9.927765790153952, + "step": 516 + }, + { + "epoch": 0.1293631927936945, + "grad_norm": 14.5, + "kl": 9.314910888671875, + "learning_rate": 5e-06, + "logits/chosen": -44396648.0, + "logits/rejected": -68413600.0, + "logps/chosen": -431.5744323730469, + "logps/rejected": -652.3296508789062, + "loss": 0.0826, + "rewards/chosen": 5.625824928283691, + "rewards/margins": 15.972043991088867, + "rewards/rejected": -10.346219062805176, + "step": 517 + }, + { + "epoch": 0.12961341173526836, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33778769.23076923, + "logits/rejected": -55241437.09090909, + "logps/chosen": -296.46570763221155, + "logps/rejected": -525.5427024147727, + "loss": 0.0907, + "rewards/chosen": 3.4785006596491885, + "rewards/margins": 13.163419843553664, + "rewards/rejected": -9.684919183904475, + "step": 518 + }, + { + "epoch": 0.12986363067684223, + "grad_norm": 11.5625, + "kl": 14.970568656921387, + "learning_rate": 5e-06, + "logits/chosen": -56321322.666666664, + "logits/rejected": -28531660.444444444, + "logps/chosen": -448.6962565104167, + "logps/rejected": -724.5801866319445, + "loss": 0.1089, + "rewards/chosen": 6.593615214029948, + "rewards/margins": 18.20522138807509, + "rewards/rejected": -11.61160617404514, + "step": 519 + }, + { + "epoch": 0.13011384961841613, + "grad_norm": 19.125, + "kl": 4.355490684509277, + "learning_rate": 5e-06, + "logits/chosen": -52453877.333333336, + "logits/rejected": -61738533.333333336, + "logps/chosen": -398.1594645182292, + "logps/rejected": -449.4321695963542, + "loss": 0.0581, + "rewards/chosen": 5.456010182698567, + "rewards/margins": 13.074945449829102, + "rewards/rejected": -7.618935267130534, + "step": 520 + }, + { + "epoch": 0.13036406855999, + "grad_norm": 11.4375, + "kl": 5.234340667724609, + "learning_rate": 5e-06, + "logits/chosen": -54637986.13333333, + "logits/rejected": -66116152.88888889, + "logps/chosen": -407.49892578125, + "logps/rejected": -543.0160590277778, + "loss": 0.0588, + "rewards/chosen": 6.215244547526042, + "rewards/margins": 14.532039048936632, + "rewards/rejected": -8.316794501410591, + "step": 521 + }, + { + "epoch": 0.13061428750156387, + "grad_norm": 15.6875, + "kl": 18.19165802001953, + "learning_rate": 5e-06, + "logits/chosen": -25189714.82352941, + "logits/rejected": -77390098.28571428, + "logps/chosen": -517.8507582720588, + "logps/rejected": -708.9725167410714, + "loss": 0.1545, + "rewards/chosen": 7.169649011948529, + "rewards/margins": 20.293441387785585, + "rewards/rejected": -13.123792375837054, + "step": 522 + }, + { + "epoch": 0.13086450644313774, + "grad_norm": 13.75, + "kl": 2.2514073848724365, + "learning_rate": 5e-06, + "logits/chosen": -49493472.0, + "logits/rejected": -34294025.14285714, + "logps/chosen": -484.2455078125, + "logps/rejected": -619.6990094866071, + "loss": 0.0245, + "rewards/chosen": 5.915974426269531, + "rewards/margins": 16.226895141601563, + "rewards/rejected": -10.310920715332031, + "step": 523 + }, + { + "epoch": 0.1311147253847116, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63092003.55555555, + "logits/rejected": -67593390.93333334, + "logps/chosen": -361.02989366319446, + "logps/rejected": -579.9860677083333, + "loss": 0.0337, + "rewards/chosen": 5.46497556898329, + "rewards/margins": 13.815851423475477, + "rewards/rejected": -8.350875854492188, + "step": 524 + }, + { + "epoch": 0.1313649443262855, + "grad_norm": 6.9375, + "kl": 2.5712223052978516, + "learning_rate": 5e-06, + "logits/chosen": -17019883.076923076, + "logits/rejected": -47405239.27272727, + "logps/chosen": -296.4245793269231, + "logps/rejected": -495.5245916193182, + "loss": 0.1468, + "rewards/chosen": 4.746087587796724, + "rewards/margins": 10.624109521612422, + "rewards/rejected": -5.878021933815696, + "step": 525 + }, + { + "epoch": 0.13161516326785938, + "grad_norm": 15.125, + "kl": 17.520803451538086, + "learning_rate": 5e-06, + "logits/chosen": -51876352.0, + "logits/rejected": -65297644.8, + "logps/chosen": -404.55092075892856, + "logps/rejected": -424.95859375, + "loss": 0.0571, + "rewards/chosen": 6.774650573730469, + "rewards/margins": 11.588059616088866, + "rewards/rejected": -4.813409042358399, + "step": 526 + }, + { + "epoch": 0.13186538220943325, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43696916.36363637, + "logits/rejected": -60581454.76923077, + "logps/chosen": -436.27299360795456, + "logps/rejected": -458.74594350961536, + "loss": 0.0766, + "rewards/chosen": 8.198015386408025, + "rewards/margins": 13.268264503745765, + "rewards/rejected": -5.07024911733774, + "step": 527 + }, + { + "epoch": 0.13211560115100712, + "grad_norm": 24.5, + "kl": 5.007396697998047, + "learning_rate": 5e-06, + "logits/chosen": -46646429.09090909, + "logits/rejected": -37675318.15384615, + "logps/chosen": -485.1492365056818, + "logps/rejected": -466.7234450120192, + "loss": 0.0798, + "rewards/chosen": 7.680758389559659, + "rewards/margins": 11.481649385465609, + "rewards/rejected": -3.8008909959059496, + "step": 528 + }, + { + "epoch": 0.13236582009258102, + "grad_norm": 14.1875, + "kl": 5.743526458740234, + "learning_rate": 5e-06, + "logits/chosen": -30546733.333333332, + "logits/rejected": -24016501.333333332, + "logps/chosen": -409.8443603515625, + "logps/rejected": -327.99269612630206, + "loss": 0.1558, + "rewards/chosen": 5.747198104858398, + "rewards/margins": 11.759943008422852, + "rewards/rejected": -6.012744903564453, + "step": 529 + }, + { + "epoch": 0.1326160390341549, + "grad_norm": 15.0, + "kl": 13.818990707397461, + "learning_rate": 5e-06, + "logits/chosen": -47930131.692307696, + "logits/rejected": -41468194.90909091, + "logps/chosen": -468.44125600961536, + "logps/rejected": -552.3140536221591, + "loss": 0.0647, + "rewards/chosen": 7.358089153583233, + "rewards/margins": 13.968134846720663, + "rewards/rejected": -6.610045693137429, + "step": 530 + }, + { + "epoch": 0.13286625797572876, + "grad_norm": 11.8125, + "kl": 5.293578147888184, + "learning_rate": 5e-06, + "logits/chosen": -89926336.0, + "logits/rejected": -47984923.428571425, + "logps/chosen": -488.002587890625, + "logps/rejected": -619.7414899553571, + "loss": 0.0323, + "rewards/chosen": 8.120950317382812, + "rewards/margins": 18.287205723353793, + "rewards/rejected": -10.166255405970983, + "step": 531 + }, + { + "epoch": 0.13311647691730263, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57388392.72727273, + "logits/rejected": -66457895.384615384, + "logps/chosen": -337.86514559659093, + "logps/rejected": -636.5494290865385, + "loss": 0.0944, + "rewards/chosen": 5.181604905561968, + "rewards/margins": 14.031535302008784, + "rewards/rejected": -8.849930396446815, + "step": 532 + }, + { + "epoch": 0.1333666958588765, + "grad_norm": 16.0, + "kl": 3.3148531913757324, + "learning_rate": 5e-06, + "logits/chosen": -56274967.27272727, + "logits/rejected": -37210028.307692304, + "logps/chosen": -419.67276278409093, + "logps/rejected": -511.70474008413464, + "loss": 0.0532, + "rewards/chosen": 7.069847800514915, + "rewards/margins": 12.83716065733583, + "rewards/rejected": -5.767312856820913, + "step": 533 + }, + { + "epoch": 0.1336169148004504, + "grad_norm": 8.875, + "kl": 4.858542442321777, + "learning_rate": 5e-06, + "logits/chosen": -51129984.0, + "logits/rejected": -127513344.0, + "logps/chosen": -489.1588134765625, + "logps/rejected": -339.93943277994794, + "loss": 0.0793, + "rewards/chosen": 5.95106824239095, + "rewards/margins": 11.520591100056965, + "rewards/rejected": -5.569522857666016, + "step": 534 + }, + { + "epoch": 0.13386713374202427, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61036231.11111111, + "logits/rejected": -45355780.266666666, + "logps/chosen": -242.20610894097223, + "logps/rejected": -535.25869140625, + "loss": 0.1034, + "rewards/chosen": 3.5311997731526694, + "rewards/margins": 10.262492497762045, + "rewards/rejected": -6.731292724609375, + "step": 535 + }, + { + "epoch": 0.13411735268359815, + "grad_norm": 26.5, + "kl": 5.973757743835449, + "learning_rate": 5e-06, + "logits/chosen": -71781450.66666667, + "logits/rejected": -70372618.66666667, + "logps/chosen": -412.5847574869792, + "logps/rejected": -782.48046875, + "loss": 0.159, + "rewards/chosen": 4.9632829030354815, + "rewards/margins": 13.216527938842773, + "rewards/rejected": -8.253245035807291, + "step": 536 + }, + { + "epoch": 0.13436757162517202, + "grad_norm": 1.8671875, + "kl": 2.53564715385437, + "learning_rate": 5e-06, + "logits/chosen": -34058551.27272727, + "logits/rejected": -65766291.692307696, + "logps/chosen": -467.0050603693182, + "logps/rejected": -673.2614182692307, + "loss": 0.0042, + "rewards/chosen": 7.107920559969815, + "rewards/margins": 18.238155498371256, + "rewards/rejected": -11.130234938401442, + "step": 537 + }, + { + "epoch": 0.13461779056674592, + "grad_norm": 15.5625, + "kl": 2.483484983444214, + "learning_rate": 5e-06, + "logits/chosen": -37301804.307692304, + "logits/rejected": -49108584.72727273, + "logps/chosen": -403.10366586538464, + "logps/rejected": -386.72878196022725, + "loss": 0.0422, + "rewards/chosen": 6.849193279559795, + "rewards/margins": 12.445289665168815, + "rewards/rejected": -5.59609638560902, + "step": 538 + }, + { + "epoch": 0.13486800950831979, + "grad_norm": 13.875, + "kl": 3.3756346702575684, + "learning_rate": 5e-06, + "logits/chosen": -40917779.2, + "logits/rejected": -38514413.71428572, + "logps/chosen": -252.2524169921875, + "logps/rejected": -540.5044991629464, + "loss": 0.0961, + "rewards/chosen": 3.6661636352539064, + "rewards/margins": 10.864099557059152, + "rewards/rejected": -7.197935921805246, + "step": 539 + }, + { + "epoch": 0.13511822844989366, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62873988.0, + "logits/rejected": -62200472.0, + "logps/chosen": -373.53582763671875, + "logps/rejected": -349.1548767089844, + "loss": 0.0465, + "rewards/chosen": 5.450967788696289, + "rewards/margins": 12.257472515106201, + "rewards/rejected": -6.806504726409912, + "step": 540 + }, + { + "epoch": 0.13536844739146753, + "grad_norm": 15.125, + "kl": 6.65887975692749, + "learning_rate": 5e-06, + "logits/chosen": -75504914.28571428, + "logits/rejected": -44939814.4, + "logps/chosen": -454.80154854910717, + "logps/rejected": -561.496923828125, + "loss": 0.0378, + "rewards/chosen": 6.142020089285714, + "rewards/margins": 15.195914132254465, + "rewards/rejected": -9.05389404296875, + "step": 541 + }, + { + "epoch": 0.1356186663330414, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88219491.55555555, + "logits/rejected": -53119641.6, + "logps/chosen": -325.59288194444446, + "logps/rejected": -455.66611328125, + "loss": 0.0911, + "rewards/chosen": 3.8690032958984375, + "rewards/margins": 11.863264973958334, + "rewards/rejected": -7.994261678059896, + "step": 542 + }, + { + "epoch": 0.1358688852746153, + "grad_norm": 12.3125, + "kl": 5.512020111083984, + "learning_rate": 5e-06, + "logits/chosen": -64126796.8, + "logits/rejected": -40754148.571428575, + "logps/chosen": -554.72607421875, + "logps/rejected": -515.8116629464286, + "loss": 0.0328, + "rewards/chosen": 7.9691215515136715, + "rewards/margins": 16.311395263671876, + "rewards/rejected": -8.342273712158203, + "step": 543 + }, + { + "epoch": 0.13611910421618917, + "grad_norm": 11.5, + "kl": 7.555688381195068, + "learning_rate": 5e-06, + "logits/chosen": -80192728.61538461, + "logits/rejected": -31945445.818181816, + "logps/chosen": -503.6399489182692, + "logps/rejected": -663.5648082386364, + "loss": 0.0438, + "rewards/chosen": 6.85441648043119, + "rewards/margins": 16.89773826332359, + "rewards/rejected": -10.0433217828924, + "step": 544 + }, + { + "epoch": 0.13636932315776304, + "grad_norm": 12.1875, + "kl": 10.995513916015625, + "learning_rate": 5e-06, + "logits/chosen": -79398520.47058824, + "logits/rejected": -112806070.85714285, + "logps/chosen": -579.7518956801471, + "logps/rejected": -900.4176897321429, + "loss": 0.0834, + "rewards/chosen": 6.391205731560202, + "rewards/margins": 23.74232572667739, + "rewards/rejected": -17.351119995117188, + "step": 545 + }, + { + "epoch": 0.1366195420993369, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27158245.818181816, + "logits/rejected": -34891116.307692304, + "logps/chosen": -412.5660955255682, + "logps/rejected": -411.27328725961536, + "loss": 0.0498, + "rewards/chosen": 5.319623773748225, + "rewards/margins": 13.087192508724186, + "rewards/rejected": -7.767568734975962, + "step": 546 + }, + { + "epoch": 0.1368697610409108, + "grad_norm": 12.875, + "kl": 1.4945749044418335, + "learning_rate": 5e-06, + "logits/chosen": -27459042.666666668, + "logits/rejected": -29302160.0, + "logps/chosen": -307.5785725911458, + "logps/rejected": -464.4750569661458, + "loss": 0.1028, + "rewards/chosen": 3.5340277353922525, + "rewards/margins": 12.774560928344727, + "rewards/rejected": -9.240533192952475, + "step": 547 + }, + { + "epoch": 0.13711997998248468, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70222515.2, + "logits/rejected": -24017531.42857143, + "logps/chosen": -489.0595703125, + "logps/rejected": -525.2247488839286, + "loss": 0.0384, + "rewards/chosen": 5.771475219726563, + "rewards/margins": 13.34258989606585, + "rewards/rejected": -7.571114676339286, + "step": 548 + }, + { + "epoch": 0.13737019892405855, + "grad_norm": 14.6875, + "kl": 1.2472445964813232, + "learning_rate": 5e-06, + "logits/chosen": -102107372.8, + "logits/rejected": -43376987.428571425, + "logps/chosen": -456.514794921875, + "logps/rejected": -312.28536551339283, + "loss": 0.0448, + "rewards/chosen": 5.982299423217773, + "rewards/margins": 11.620232336861747, + "rewards/rejected": -5.637932913643973, + "step": 549 + }, + { + "epoch": 0.13762041786563242, + "grad_norm": 16.25, + "kl": 0.9486293792724609, + "learning_rate": 5e-06, + "logits/chosen": -69926752.0, + "logits/rejected": -54773900.8, + "logps/chosen": -391.06689453125, + "logps/rejected": -451.2779296875, + "loss": 0.0811, + "rewards/chosen": 4.650173732212612, + "rewards/margins": 12.90467812674386, + "rewards/rejected": -8.25450439453125, + "step": 550 + }, + { + "epoch": 0.1378706368072063, + "grad_norm": 10.4375, + "kl": 1.4036941528320312, + "learning_rate": 5e-06, + "logits/chosen": -62808960.0, + "logits/rejected": -46912640.0, + "logps/chosen": -405.81338778409093, + "logps/rejected": -471.51160606971155, + "loss": 0.0796, + "rewards/chosen": 4.9530112526633525, + "rewards/margins": 13.210419981629698, + "rewards/rejected": -8.257408728966347, + "step": 551 + }, + { + "epoch": 0.1381208557487802, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50658583.27272727, + "logits/rejected": -49524686.76923077, + "logps/chosen": -403.10813210227275, + "logps/rejected": -632.4972956730769, + "loss": 0.0522, + "rewards/chosen": 5.546658602627841, + "rewards/margins": 15.684891494004042, + "rewards/rejected": -10.138232891376202, + "step": 552 + }, + { + "epoch": 0.13837107469035406, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -82265824.0, + "logits/rejected": -51365882.666666664, + "logps/chosen": -461.3359781901042, + "logps/rejected": -644.1104329427084, + "loss": 0.0771, + "rewards/chosen": 4.5082963307698565, + "rewards/margins": 16.37892468770345, + "rewards/rejected": -11.870628356933594, + "step": 553 + }, + { + "epoch": 0.13862129363192793, + "grad_norm": 21.125, + "kl": 0.04537200927734375, + "learning_rate": 5e-06, + "logits/chosen": -50316848.0, + "logits/rejected": -84522186.66666667, + "logps/chosen": -326.8287353515625, + "logps/rejected": -470.2118733723958, + "loss": 0.0984, + "rewards/chosen": 3.7203763326009116, + "rewards/margins": 12.240569432576498, + "rewards/rejected": -8.520193099975586, + "step": 554 + }, + { + "epoch": 0.1388715125735018, + "grad_norm": 16.375, + "kl": 3.3425607681274414, + "learning_rate": 5e-06, + "logits/chosen": -31000344.615384616, + "logits/rejected": -46964930.90909091, + "logps/chosen": -421.4607496995192, + "logps/rejected": -458.14936967329544, + "loss": 0.073, + "rewards/chosen": 4.11741696871244, + "rewards/margins": 14.60420632529092, + "rewards/rejected": -10.48678935657848, + "step": 555 + }, + { + "epoch": 0.1391217315150757, + "grad_norm": 14.1875, + "kl": 0.7640914916992188, + "learning_rate": 5e-06, + "logits/chosen": -40181469.333333336, + "logits/rejected": -30542840.0, + "logps/chosen": -306.20522054036456, + "logps/rejected": -431.0502115885417, + "loss": 0.039, + "rewards/chosen": 4.890604654947917, + "rewards/margins": 11.853193918863933, + "rewards/rejected": -6.962589263916016, + "step": 556 + }, + { + "epoch": 0.13937195045664957, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27684346.666666668, + "logits/rejected": -44124005.333333336, + "logps/chosen": -427.5699869791667, + "logps/rejected": -468.980224609375, + "loss": 0.0667, + "rewards/chosen": 5.270424524943034, + "rewards/margins": 12.634319305419922, + "rewards/rejected": -7.363894780476888, + "step": 557 + }, + { + "epoch": 0.13962216939822344, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74414016.0, + "logits/rejected": -47394377.14285714, + "logps/chosen": -408.1877685546875, + "logps/rejected": -541.2767508370536, + "loss": 0.0637, + "rewards/chosen": 5.466611862182617, + "rewards/margins": 15.036128725324359, + "rewards/rejected": -9.569516863141741, + "step": 558 + }, + { + "epoch": 0.13987238833979732, + "grad_norm": 5.09375, + "kl": 0.3665122985839844, + "learning_rate": 5e-06, + "logits/chosen": -50733922.461538464, + "logits/rejected": -60117550.54545455, + "logps/chosen": -438.68111478365387, + "logps/rejected": -583.6179421164773, + "loss": 0.0324, + "rewards/chosen": 5.679512023925781, + "rewards/margins": 13.910150007768111, + "rewards/rejected": -8.23063798384233, + "step": 559 + }, + { + "epoch": 0.1401226072813712, + "grad_norm": 13.0, + "kl": 1.3728488683700562, + "learning_rate": 5e-06, + "logits/chosen": -57274709.333333336, + "logits/rejected": -28212930.666666668, + "logps/chosen": -401.2125244140625, + "logps/rejected": -535.2503255208334, + "loss": 0.0603, + "rewards/chosen": 5.87824821472168, + "rewards/margins": 13.921934127807617, + "rewards/rejected": -8.043685913085938, + "step": 560 + }, + { + "epoch": 0.14037282622294509, + "grad_norm": 28.25, + "kl": 2.3028316497802734, + "learning_rate": 5e-06, + "logits/chosen": -33563565.333333336, + "logits/rejected": -44590602.666666664, + "logps/chosen": -306.4778238932292, + "logps/rejected": -550.0835774739584, + "loss": 0.1334, + "rewards/chosen": 5.0559336344401045, + "rewards/margins": 11.945223490397137, + "rewards/rejected": -6.889289855957031, + "step": 561 + }, + { + "epoch": 0.14062304516451896, + "grad_norm": 9.125, + "kl": 1.8545424938201904, + "learning_rate": 5e-06, + "logits/chosen": -48026560.0, + "logits/rejected": -30056333.333333332, + "logps/chosen": -388.0746663411458, + "logps/rejected": -397.7444661458333, + "loss": 0.0811, + "rewards/chosen": 5.234479268391927, + "rewards/margins": 13.937051773071289, + "rewards/rejected": -8.702572504679361, + "step": 562 + }, + { + "epoch": 0.14087326410609283, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56461430.15384615, + "logits/rejected": -104653649.45454545, + "logps/chosen": -402.31497896634613, + "logps/rejected": -790.8291015625, + "loss": 0.0716, + "rewards/chosen": 5.494441105769231, + "rewards/margins": 16.66681687148301, + "rewards/rejected": -11.17237576571378, + "step": 563 + }, + { + "epoch": 0.1411234830476667, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69598738.28571428, + "logits/rejected": -29663859.2, + "logps/chosen": -533.5154854910714, + "logps/rejected": -601.884765625, + "loss": 0.0264, + "rewards/chosen": 7.532101222446987, + "rewards/margins": 19.468496486118863, + "rewards/rejected": -11.936395263671875, + "step": 564 + }, + { + "epoch": 0.1413737019892406, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88706675.2, + "logits/rejected": -55017769.14285714, + "logps/chosen": -437.87451171875, + "logps/rejected": -772.2451869419643, + "loss": 0.05, + "rewards/chosen": 4.9825439453125, + "rewards/margins": 18.630482264927455, + "rewards/rejected": -13.647938319614955, + "step": 565 + }, + { + "epoch": 0.14162392093081447, + "grad_norm": 9.75, + "kl": 3.747547149658203, + "learning_rate": 5e-06, + "logits/chosen": -60948240.0, + "logits/rejected": -84816202.66666667, + "logps/chosen": -360.94189453125, + "logps/rejected": -462.5052083333333, + "loss": 0.0659, + "rewards/chosen": 5.374217987060547, + "rewards/margins": 11.517545064290363, + "rewards/rejected": -6.143327077229817, + "step": 566 + }, + { + "epoch": 0.14187413987238834, + "grad_norm": 11.125, + "kl": 1.3537509441375732, + "learning_rate": 5e-06, + "logits/chosen": -44645988.571428575, + "logits/rejected": -45456985.6, + "logps/chosen": -462.3232421875, + "logps/rejected": -501.261083984375, + "loss": 0.0216, + "rewards/chosen": 6.114850725446429, + "rewards/margins": 17.424613298688616, + "rewards/rejected": -11.309762573242187, + "step": 567 + }, + { + "epoch": 0.1421243588139622, + "grad_norm": 13.5625, + "kl": 5.523778438568115, + "learning_rate": 5e-06, + "logits/chosen": -65928890.666666664, + "logits/rejected": -44987210.666666664, + "logps/chosen": -441.14794921875, + "logps/rejected": -499.1584879557292, + "loss": 0.0334, + "rewards/chosen": 5.457477569580078, + "rewards/margins": 13.345184326171875, + "rewards/rejected": -7.887706756591797, + "step": 568 + }, + { + "epoch": 0.14237457775553608, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65111108.92307692, + "logits/rejected": -10760162.909090908, + "logps/chosen": -466.7189753605769, + "logps/rejected": -671.4787819602273, + "loss": 0.0181, + "rewards/chosen": 6.900662348820613, + "rewards/margins": 16.807293818547176, + "rewards/rejected": -9.906631469726562, + "step": 569 + }, + { + "epoch": 0.14262479669710998, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63227194.666666664, + "logits/rejected": -70243680.0, + "logps/chosen": -451.7148844401042, + "logps/rejected": -532.2007242838541, + "loss": 0.0684, + "rewards/chosen": 5.362323760986328, + "rewards/margins": 14.59932009379069, + "rewards/rejected": -9.236996332804361, + "step": 570 + }, + { + "epoch": 0.14287501563868385, + "grad_norm": 6.875, + "kl": 0.22522418200969696, + "learning_rate": 5e-06, + "logits/chosen": -38687499.63636363, + "logits/rejected": -69070168.61538461, + "logps/chosen": -416.4705699573864, + "logps/rejected": -423.6765700120192, + "loss": 0.0206, + "rewards/chosen": 6.458177046342329, + "rewards/margins": 14.118144322108556, + "rewards/rejected": -7.659967275766226, + "step": 571 + }, + { + "epoch": 0.14312523458025772, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69844089.6, + "logits/rejected": -47900699.428571425, + "logps/chosen": -364.145556640625, + "logps/rejected": -686.8136160714286, + "loss": 0.0559, + "rewards/chosen": 5.256170654296875, + "rewards/margins": 14.609437779017858, + "rewards/rejected": -9.353267124720983, + "step": 572 + }, + { + "epoch": 0.1433754535218316, + "grad_norm": 7.25, + "kl": 6.505663871765137, + "learning_rate": 5e-06, + "logits/chosen": -71723273.84615384, + "logits/rejected": -37394836.36363637, + "logps/chosen": -557.0062725360577, + "logps/rejected": -574.6129705255681, + "loss": 0.0489, + "rewards/chosen": 7.415489783653846, + "rewards/margins": 15.985279323337796, + "rewards/rejected": -8.56978953968395, + "step": 573 + }, + { + "epoch": 0.1436256724634055, + "grad_norm": 13.6875, + "kl": 4.055292129516602, + "learning_rate": 5e-06, + "logits/chosen": -76171057.23076923, + "logits/rejected": -50149015.27272727, + "logps/chosen": -408.18558443509613, + "logps/rejected": -559.4716796875, + "loss": 0.0607, + "rewards/chosen": 5.947671743539663, + "rewards/margins": 16.2534689736533, + "rewards/rejected": -10.305797230113637, + "step": 574 + }, + { + "epoch": 0.14387589140497936, + "grad_norm": 11.1875, + "kl": 2.946934223175049, + "learning_rate": 5e-06, + "logits/chosen": -22387913.846153848, + "logits/rejected": -37404657.45454545, + "logps/chosen": -539.9292743389423, + "logps/rejected": -442.79350142045456, + "loss": 0.0303, + "rewards/chosen": 6.466593228853666, + "rewards/margins": 14.180487466025186, + "rewards/rejected": -7.71389423717152, + "step": 575 + }, + { + "epoch": 0.14412611034655323, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50337417.84615385, + "logits/rejected": -31008706.90909091, + "logps/chosen": -519.6180889423077, + "logps/rejected": -499.71657492897725, + "loss": 0.0135, + "rewards/chosen": 7.569244384765625, + "rewards/margins": 16.236910733309657, + "rewards/rejected": -8.667666348544033, + "step": 576 + }, + { + "epoch": 0.1443763292881271, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72365312.0, + "logits/rejected": -84830858.66666667, + "logps/chosen": -383.6020100911458, + "logps/rejected": -745.9160970052084, + "loss": 0.06, + "rewards/chosen": 6.994204203287761, + "rewards/margins": 19.90826161702474, + "rewards/rejected": -12.914057413736979, + "step": 577 + }, + { + "epoch": 0.144626548229701, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53728320.0, + "logits/rejected": -62778950.4, + "logps/chosen": -298.8453892299107, + "logps/rejected": -609.87529296875, + "loss": 0.0502, + "rewards/chosen": 4.344665254865374, + "rewards/margins": 12.591643251691545, + "rewards/rejected": -8.246977996826171, + "step": 578 + }, + { + "epoch": 0.14487676717127487, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70246381.71428572, + "logits/rejected": -30079724.8, + "logps/chosen": -445.68729073660717, + "logps/rejected": -556.24599609375, + "loss": 0.0208, + "rewards/chosen": 6.037149156842913, + "rewards/margins": 17.830748094831193, + "rewards/rejected": -11.793598937988282, + "step": 579 + }, + { + "epoch": 0.14512698611284874, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22964480.0, + "logits/rejected": -46092390.4, + "logps/chosen": -174.85899135044642, + "logps/rejected": -524.25390625, + "loss": 0.135, + "rewards/chosen": 2.951103482927595, + "rewards/margins": 12.166678128923689, + "rewards/rejected": -9.215574645996094, + "step": 580 + }, + { + "epoch": 0.14537720505442261, + "grad_norm": 23.0, + "kl": 6.223883628845215, + "learning_rate": 5e-06, + "logits/chosen": -57095116.8, + "logits/rejected": -56019975.11111111, + "logps/chosen": -516.7806640625, + "logps/rejected": -630.8013237847222, + "loss": 0.0569, + "rewards/chosen": 6.289462280273438, + "rewards/margins": 16.177319505479602, + "rewards/rejected": -9.887857225206163, + "step": 581 + }, + { + "epoch": 0.14562742399599649, + "grad_norm": 21.0, + "kl": 3.0273406505584717, + "learning_rate": 5e-06, + "logits/chosen": -57891221.333333336, + "logits/rejected": -70378224.0, + "logps/chosen": -536.1588948567709, + "logps/rejected": -587.8660481770834, + "loss": 0.047, + "rewards/chosen": 7.852675120035808, + "rewards/margins": 15.25135103861491, + "rewards/rejected": -7.398675918579102, + "step": 582 + }, + { + "epoch": 0.14587764293757038, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 53221752.88888889, + "logits/rejected": -39718408.53333333, + "logps/chosen": -571.5025499131945, + "logps/rejected": -556.9738932291667, + "loss": 0.0487, + "rewards/chosen": 5.060741424560547, + "rewards/margins": 16.374883270263673, + "rewards/rejected": -11.314141845703125, + "step": 583 + }, + { + "epoch": 0.14612786187914426, + "grad_norm": 6.21875, + "kl": 3.0376155376434326, + "learning_rate": 5e-06, + "logits/chosen": -62987258.666666664, + "logits/rejected": -61405440.0, + "logps/chosen": -421.086181640625, + "logps/rejected": -643.8584798177084, + "loss": 0.0463, + "rewards/chosen": 5.113549868265788, + "rewards/margins": 13.810180346171062, + "rewards/rejected": -8.696630477905273, + "step": 584 + }, + { + "epoch": 0.14637808082071813, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33535074.666666668, + "logits/rejected": -22897408.0, + "logps/chosen": -206.67183430989584, + "logps/rejected": -308.8408610026042, + "loss": 0.1215, + "rewards/chosen": 2.9940287272135415, + "rewards/margins": 9.589642842610678, + "rewards/rejected": -6.595614115397136, + "step": 585 + }, + { + "epoch": 0.146628299762292, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71851984.0, + "logits/rejected": -37640056.0, + "logps/chosen": -449.3997802734375, + "logps/rejected": -503.60772705078125, + "loss": 0.0564, + "rewards/chosen": 5.691864967346191, + "rewards/margins": 13.455193519592285, + "rewards/rejected": -7.763328552246094, + "step": 586 + }, + { + "epoch": 0.1468785187038659, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47702720.0, + "logits/rejected": -62317696.0, + "logps/chosen": -430.5486949573864, + "logps/rejected": -817.4065504807693, + "loss": 0.0333, + "rewards/chosen": 5.671214363791726, + "rewards/margins": 16.31571744371961, + "rewards/rejected": -10.644503079927885, + "step": 587 + }, + { + "epoch": 0.14712873764543977, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60553320.72727273, + "logits/rejected": -70311276.3076923, + "logps/chosen": -354.89208984375, + "logps/rejected": -566.9461388221154, + "loss": 0.0739, + "rewards/chosen": 4.223767367276278, + "rewards/margins": 13.48609705571528, + "rewards/rejected": -9.262329688439003, + "step": 588 + }, + { + "epoch": 0.14737895658701364, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16443514.666666666, + "logits/rejected": -41768354.13333333, + "logps/chosen": -362.9908854166667, + "logps/rejected": -483.8588541666667, + "loss": 0.0802, + "rewards/chosen": 5.540819803873698, + "rewards/margins": 13.324569193522136, + "rewards/rejected": -7.783749389648437, + "step": 589 + }, + { + "epoch": 0.1476291755285875, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45180163.2, + "logits/rejected": -63016658.28571428, + "logps/chosen": -355.2489501953125, + "logps/rejected": -715.7284458705357, + "loss": 0.0695, + "rewards/chosen": 4.424626159667969, + "rewards/margins": 15.527487836565289, + "rewards/rejected": -11.102861676897321, + "step": 590 + }, + { + "epoch": 0.14787939447016138, + "grad_norm": 11.0625, + "kl": 3.0407485961914062, + "learning_rate": 5e-06, + "logits/chosen": -41149545.14285714, + "logits/rejected": -48740230.4, + "logps/chosen": -484.2509068080357, + "logps/rejected": -575.022314453125, + "loss": 0.0294, + "rewards/chosen": 6.6061521257672995, + "rewards/margins": 16.47402779715402, + "rewards/rejected": -9.867875671386718, + "step": 591 + }, + { + "epoch": 0.14812961341173528, + "grad_norm": 13.0, + "kl": 1.0395148992538452, + "learning_rate": 5e-06, + "logits/chosen": -27352150.4, + "logits/rejected": -44808640.0, + "logps/chosen": -326.3874267578125, + "logps/rejected": -615.2264229910714, + "loss": 0.0912, + "rewards/chosen": 4.599735260009766, + "rewards/margins": 15.051055363246373, + "rewards/rejected": -10.451320103236608, + "step": 592 + }, + { + "epoch": 0.14837983235330915, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64602268.44444445, + "logits/rejected": -37757606.4, + "logps/chosen": -399.9679904513889, + "logps/rejected": -622.66484375, + "loss": 0.0381, + "rewards/chosen": 5.446172926161024, + "rewards/margins": 16.01845279269748, + "rewards/rejected": -10.572279866536459, + "step": 593 + }, + { + "epoch": 0.14863005129488302, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48084785.23076923, + "logits/rejected": -31335467.636363637, + "logps/chosen": -305.96542593149036, + "logps/rejected": -653.6206942471591, + "loss": 0.0687, + "rewards/chosen": 4.481809762807993, + "rewards/margins": 13.614516144865876, + "rewards/rejected": -9.132706382057883, + "step": 594 + }, + { + "epoch": 0.1488802702364569, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49751385.6, + "logits/rejected": -59806578.28571428, + "logps/chosen": -427.02548828125, + "logps/rejected": -788.4076450892857, + "loss": 0.037, + "rewards/chosen": 7.2791259765625, + "rewards/margins": 22.111955043247768, + "rewards/rejected": -14.832829066685267, + "step": 595 + }, + { + "epoch": 0.1491304891780308, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44918483.2, + "logits/rejected": -41798006.85714286, + "logps/chosen": -463.986181640625, + "logps/rejected": -465.1258021763393, + "loss": 0.0722, + "rewards/chosen": 6.066531372070313, + "rewards/margins": 15.181361934116907, + "rewards/rejected": -9.114830562046595, + "step": 596 + }, + { + "epoch": 0.14938070811960466, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49981060.571428575, + "logits/rejected": -30313939.2, + "logps/chosen": -436.2766810825893, + "logps/rejected": -507.420361328125, + "loss": 0.0286, + "rewards/chosen": 6.159907749720982, + "rewards/margins": 16.996255711146762, + "rewards/rejected": -10.836347961425782, + "step": 597 + }, + { + "epoch": 0.14963092706117853, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53047340.8, + "logits/rejected": -42007465.14285714, + "logps/chosen": -321.1655517578125, + "logps/rejected": -500.06089564732144, + "loss": 0.0677, + "rewards/chosen": 4.893730545043946, + "rewards/margins": 13.315142331804548, + "rewards/rejected": -8.421411786760602, + "step": 598 + }, + { + "epoch": 0.1498811460027524, + "grad_norm": 6.59375, + "kl": 0.5697571635246277, + "learning_rate": 5e-06, + "logits/chosen": -49678749.538461536, + "logits/rejected": -28427377.454545453, + "logps/chosen": -437.12631460336536, + "logps/rejected": -677.50927734375, + "loss": 0.0496, + "rewards/chosen": 5.437606224646935, + "rewards/margins": 15.197456466568099, + "rewards/rejected": -9.759850241921164, + "step": 599 + }, + { + "epoch": 0.15013136494432627, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44754730.666666664, + "logits/rejected": -31375938.666666668, + "logps/chosen": -263.7760009765625, + "logps/rejected": -404.2611897786458, + "loss": 0.0664, + "rewards/chosen": 4.977120717366536, + "rewards/margins": 15.294183095296223, + "rewards/rejected": -10.317062377929688, + "step": 600 + }, + { + "epoch": 0.15038158388590017, + "grad_norm": 14.0625, + "kl": 1.9754600524902344, + "learning_rate": 5e-06, + "logits/chosen": -38264466.666666664, + "logits/rejected": -13454984.0, + "logps/chosen": -379.0237223307292, + "logps/rejected": -513.689208984375, + "loss": 0.1188, + "rewards/chosen": 4.791454950968425, + "rewards/margins": 13.80802281697591, + "rewards/rejected": -9.016567866007486, + "step": 601 + }, + { + "epoch": 0.15063180282747404, + "grad_norm": 1.921875, + "kl": 1.9281539916992188, + "learning_rate": 5e-06, + "logits/chosen": -73308996.26666667, + "logits/rejected": -42650944.0, + "logps/chosen": -518.6475260416667, + "logps/rejected": -616.7601453993055, + "loss": 0.017, + "rewards/chosen": 6.334693908691406, + "rewards/margins": 15.746002197265625, + "rewards/rejected": -9.411308288574219, + "step": 602 + }, + { + "epoch": 0.1508820217690479, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11186326.153846154, + "logits/rejected": -59520768.0, + "logps/chosen": -312.8362379807692, + "logps/rejected": -550.9716796875, + "loss": 0.057, + "rewards/chosen": 4.333173018235427, + "rewards/margins": 15.060367370818877, + "rewards/rejected": -10.727194352583451, + "step": 603 + }, + { + "epoch": 0.15113224071062178, + "grad_norm": 13.1875, + "kl": 1.423807144165039, + "learning_rate": 5e-06, + "logits/chosen": -57009925.81818182, + "logits/rejected": -62002540.307692304, + "logps/chosen": -354.83389559659093, + "logps/rejected": -456.54995492788464, + "loss": 0.0408, + "rewards/chosen": 5.314810319380327, + "rewards/margins": 14.247145806159175, + "rewards/rejected": -8.932335486778847, + "step": 604 + }, + { + "epoch": 0.15138245965219568, + "grad_norm": 11.8125, + "kl": 3.8470964431762695, + "learning_rate": 5e-06, + "logits/chosen": -81040907.63636364, + "logits/rejected": -47877080.615384616, + "logps/chosen": -441.78884055397725, + "logps/rejected": -457.900390625, + "loss": 0.05, + "rewards/chosen": 6.137951937588778, + "rewards/margins": 15.409236774577963, + "rewards/rejected": -9.271284836989183, + "step": 605 + }, + { + "epoch": 0.15163267859376955, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33345590.4, + "logits/rejected": -47074130.28571428, + "logps/chosen": -307.0808349609375, + "logps/rejected": -507.64501953125, + "loss": 0.0887, + "rewards/chosen": 4.056896209716797, + "rewards/margins": 12.964509691510882, + "rewards/rejected": -8.907613481794085, + "step": 606 + }, + { + "epoch": 0.15188289753534343, + "grad_norm": 12.5625, + "kl": 1.1803348064422607, + "learning_rate": 5e-06, + "logits/chosen": -63267341.71428572, + "logits/rejected": -57571315.2, + "logps/chosen": -444.02797154017856, + "logps/rejected": -610.296875, + "loss": 0.0584, + "rewards/chosen": 6.373085021972656, + "rewards/margins": 18.516206359863283, + "rewards/rejected": -12.143121337890625, + "step": 607 + }, + { + "epoch": 0.1521331164769173, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54215261.09090909, + "logits/rejected": -61871547.07692308, + "logps/chosen": -387.6716974431818, + "logps/rejected": -354.5124323918269, + "loss": 0.076, + "rewards/chosen": 5.259364734996449, + "rewards/margins": 11.558089569732026, + "rewards/rejected": -6.298724834735577, + "step": 608 + }, + { + "epoch": 0.15238333541849117, + "grad_norm": 21.875, + "kl": 11.305414199829102, + "learning_rate": 5e-06, + "logits/chosen": -56930860.307692304, + "logits/rejected": -31004282.181818184, + "logps/chosen": -433.7024113581731, + "logps/rejected": -658.8701171875, + "loss": 0.0973, + "rewards/chosen": 5.5040740966796875, + "rewards/margins": 16.440994262695312, + "rewards/rejected": -10.936920166015625, + "step": 609 + }, + { + "epoch": 0.15263355436006507, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -97271246.22222222, + "logits/rejected": -46111820.8, + "logps/chosen": -458.6940104166667, + "logps/rejected": -552.3071614583333, + "loss": 0.0254, + "rewards/chosen": 6.719607883029514, + "rewards/margins": 16.007958306206596, + "rewards/rejected": -9.288350423177084, + "step": 610 + }, + { + "epoch": 0.15288377330163894, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21712582.4, + "logits/rejected": -55264342.85714286, + "logps/chosen": -265.624169921875, + "logps/rejected": -575.4641462053571, + "loss": 0.0957, + "rewards/chosen": 3.5226036071777345, + "rewards/margins": 11.884290313720703, + "rewards/rejected": -8.361686706542969, + "step": 611 + }, + { + "epoch": 0.1531339922432128, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29308192.0, + "logits/rejected": -53309746.28571428, + "logps/chosen": -312.162841796875, + "logps/rejected": -548.2455705915179, + "loss": 0.0854, + "rewards/chosen": 2.817942237854004, + "rewards/margins": 12.169474220275879, + "rewards/rejected": -9.351531982421875, + "step": 612 + }, + { + "epoch": 0.15338421118478668, + "grad_norm": 10.125, + "kl": 7.217921257019043, + "learning_rate": 5e-06, + "logits/chosen": -43526417.45454545, + "logits/rejected": -58481019.07692308, + "logps/chosen": -427.17489346590907, + "logps/rejected": -537.5062349759615, + "loss": 0.0465, + "rewards/chosen": 5.5406410910866475, + "rewards/margins": 16.36690686632703, + "rewards/rejected": -10.826265775240385, + "step": 613 + }, + { + "epoch": 0.15363443012636058, + "grad_norm": 9.3125, + "kl": 2.2072792053222656, + "learning_rate": 5e-06, + "logits/chosen": -56456826.18181818, + "logits/rejected": -37112531.692307696, + "logps/chosen": -512.4351029829545, + "logps/rejected": -406.7492487980769, + "loss": 0.047, + "rewards/chosen": 7.514734441583807, + "rewards/margins": 13.133255324997268, + "rewards/rejected": -5.618520883413462, + "step": 614 + }, + { + "epoch": 0.15388464906793445, + "grad_norm": 12.875, + "kl": 0.842522144317627, + "learning_rate": 5e-06, + "logits/chosen": -19764147.2, + "logits/rejected": -36850731.428571425, + "logps/chosen": -357.34423828125, + "logps/rejected": -360.9752720424107, + "loss": 0.0883, + "rewards/chosen": 6.33453483581543, + "rewards/margins": 12.28083588736398, + "rewards/rejected": -5.9463010515485495, + "step": 615 + }, + { + "epoch": 0.15413486800950832, + "grad_norm": 12.6875, + "kl": 1.6806972026824951, + "learning_rate": 5e-06, + "logits/chosen": -57333124.92307692, + "logits/rejected": -22525808.0, + "logps/chosen": -454.38611778846155, + "logps/rejected": -517.8662997159091, + "loss": 0.0464, + "rewards/chosen": 6.989862295297476, + "rewards/margins": 17.49558988984648, + "rewards/rejected": -10.505727594549006, + "step": 616 + }, + { + "epoch": 0.1543850869510822, + "grad_norm": 14.625, + "kl": 2.119199752807617, + "learning_rate": 5e-06, + "logits/chosen": -53081521.23076923, + "logits/rejected": -68264372.36363636, + "logps/chosen": -368.2942457932692, + "logps/rejected": -647.6526544744319, + "loss": 0.068, + "rewards/chosen": 5.668980525090144, + "rewards/margins": 14.989180664916137, + "rewards/rejected": -9.320200139825994, + "step": 617 + }, + { + "epoch": 0.15463530589265606, + "grad_norm": 12.1875, + "kl": 3.436605453491211, + "learning_rate": 5e-06, + "logits/chosen": -51985715.2, + "logits/rejected": -71383497.14285715, + "logps/chosen": -349.806884765625, + "logps/rejected": -594.2551618303571, + "loss": 0.0599, + "rewards/chosen": 5.950720977783203, + "rewards/margins": 13.747690800258091, + "rewards/rejected": -7.796969822474888, + "step": 618 + }, + { + "epoch": 0.15488552483422996, + "grad_norm": 13.375, + "kl": 9.635443687438965, + "learning_rate": 5e-06, + "logits/chosen": -45207136.0, + "logits/rejected": -52954444.8, + "logps/chosen": -364.88204520089283, + "logps/rejected": -584.69951171875, + "loss": 0.1153, + "rewards/chosen": 4.932337624686105, + "rewards/margins": 12.84488797869001, + "rewards/rejected": -7.912550354003907, + "step": 619 + }, + { + "epoch": 0.15513574377580383, + "grad_norm": 14.6875, + "kl": 1.0316712856292725, + "learning_rate": 5e-06, + "logits/chosen": -82956437.33333333, + "logits/rejected": -59409184.0, + "logps/chosen": -407.2008870442708, + "logps/rejected": -469.4278157552083, + "loss": 0.077, + "rewards/chosen": 6.241847991943359, + "rewards/margins": 14.080009460449219, + "rewards/rejected": -7.838161468505859, + "step": 620 + }, + { + "epoch": 0.1553859627173777, + "grad_norm": 3.484375, + "kl": 5.205187797546387, + "learning_rate": 5e-06, + "logits/chosen": 104229688.8888889, + "logits/rejected": -53546018.13333333, + "logps/chosen": -437.0675998263889, + "logps/rejected": -628.8258463541666, + "loss": 0.0076, + "rewards/chosen": 7.225312974717882, + "rewards/margins": 16.454711574978298, + "rewards/rejected": -9.229398600260417, + "step": 621 + }, + { + "epoch": 0.15563618165895157, + "grad_norm": 10.5, + "kl": 7.199731826782227, + "learning_rate": 5e-06, + "logits/chosen": -69697683.6923077, + "logits/rejected": -51939397.81818182, + "logps/chosen": -494.4455378605769, + "logps/rejected": -478.54190340909093, + "loss": 0.0652, + "rewards/chosen": 7.387502230130709, + "rewards/margins": 17.04444442428909, + "rewards/rejected": -9.65694219415838, + "step": 622 + }, + { + "epoch": 0.15588640060052547, + "grad_norm": 11.375, + "kl": 11.347785949707031, + "learning_rate": 5e-06, + "logits/chosen": -83514090.66666667, + "logits/rejected": -40512921.6, + "logps/chosen": -520.8565538194445, + "logps/rejected": -495.40930989583336, + "loss": 0.0688, + "rewards/chosen": 9.910608927408854, + "rewards/margins": 16.37061462402344, + "rewards/rejected": -6.460005696614584, + "step": 623 + }, + { + "epoch": 0.15613661954209934, + "grad_norm": 20.125, + "kl": 6.3800764083862305, + "learning_rate": 5e-06, + "logits/chosen": -66245792.0, + "logits/rejected": -30638410.666666668, + "logps/chosen": -405.9878743489583, + "logps/rejected": -304.5220133463542, + "loss": 0.1018, + "rewards/chosen": 6.917832056681315, + "rewards/margins": 11.399139722188314, + "rewards/rejected": -4.481307665506999, + "step": 624 + }, + { + "epoch": 0.1563868384836732, + "grad_norm": 8.0, + "kl": 8.922649383544922, + "learning_rate": 5e-06, + "logits/chosen": -53217347.2, + "logits/rejected": -43760681.14285714, + "logps/chosen": -364.7166748046875, + "logps/rejected": -381.60023716517856, + "loss": 0.0746, + "rewards/chosen": 7.687879180908203, + "rewards/margins": 14.588166264125277, + "rewards/rejected": -6.9002870832170755, + "step": 625 + }, + { + "epoch": 0.15663705742524708, + "grad_norm": 0.8515625, + "kl": 1.8622817993164062, + "learning_rate": 5e-06, + "logits/chosen": -78571328.0, + "logits/rejected": -48910037.333333336, + "logps/chosen": -487.4912923177083, + "logps/rejected": -679.3701985677084, + "loss": 0.0015, + "rewards/chosen": 8.298712412516275, + "rewards/margins": 19.89679718017578, + "rewards/rejected": -11.598084767659506, + "step": 626 + }, + { + "epoch": 0.15688727636682095, + "grad_norm": 5.71875, + "kl": 6.2126946449279785, + "learning_rate": 5e-06, + "logits/chosen": -48173108.36363637, + "logits/rejected": -60336866.461538464, + "logps/chosen": -402.76979758522725, + "logps/rejected": -534.7078575721154, + "loss": 0.0341, + "rewards/chosen": 7.176218206232244, + "rewards/margins": 14.030700416831703, + "rewards/rejected": -6.854482210599459, + "step": 627 + }, + { + "epoch": 0.15713749530839485, + "grad_norm": 14.0625, + "kl": 9.500288963317871, + "learning_rate": 5e-06, + "logits/chosen": -74956672.0, + "logits/rejected": -42692579.55555555, + "logps/chosen": -543.0876627604167, + "logps/rejected": -716.7958984375, + "loss": 0.0436, + "rewards/chosen": 7.639811706542969, + "rewards/margins": 16.986564297146266, + "rewards/rejected": -9.346752590603298, + "step": 628 + }, + { + "epoch": 0.15738771424996872, + "grad_norm": 1.8828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83722230.85714285, + "logits/rejected": -56166113.88235294, + "logps/chosen": -504.71351841517856, + "logps/rejected": -530.2498276654412, + "loss": 0.0046, + "rewards/chosen": 7.7960935320172995, + "rewards/margins": 16.280020641679524, + "rewards/rejected": -8.483927109662224, + "step": 629 + }, + { + "epoch": 0.1576379331915426, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27695993.6, + "logits/rejected": -15187328.0, + "logps/chosen": -326.235009765625, + "logps/rejected": -658.9176897321429, + "loss": 0.0436, + "rewards/chosen": 5.420652770996094, + "rewards/margins": 14.86701158796038, + "rewards/rejected": -9.446358816964286, + "step": 630 + }, + { + "epoch": 0.15788815213311647, + "grad_norm": 22.0, + "kl": 1.486368179321289, + "learning_rate": 5e-06, + "logits/chosen": -66076716.8, + "logits/rejected": -29961417.14285714, + "logps/chosen": -429.9384765625, + "logps/rejected": -505.41622488839283, + "loss": 0.0499, + "rewards/chosen": 7.387535095214844, + "rewards/margins": 15.074607304164342, + "rewards/rejected": -7.687072208949497, + "step": 631 + }, + { + "epoch": 0.15813837107469036, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75176978.28571428, + "logits/rejected": -37523501.176470585, + "logps/chosen": -436.0302734375, + "logps/rejected": -549.9060202205883, + "loss": 0.0469, + "rewards/chosen": 6.377547127859933, + "rewards/margins": 14.81867153905019, + "rewards/rejected": -8.441124411190257, + "step": 632 + }, + { + "epoch": 0.15838859001626424, + "grad_norm": 13.8125, + "kl": 3.222527265548706, + "learning_rate": 5e-06, + "logits/chosen": -29587706.181818184, + "logits/rejected": -45230843.07692308, + "logps/chosen": -299.69731001420456, + "logps/rejected": -654.0891676682693, + "loss": 0.0876, + "rewards/chosen": 4.940381136807528, + "rewards/margins": 15.726164170912096, + "rewards/rejected": -10.785783034104567, + "step": 633 + }, + { + "epoch": 0.1586388089578381, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79841837.71428572, + "logits/rejected": -43428491.294117644, + "logps/chosen": -381.05824497767856, + "logps/rejected": -388.09558823529414, + "loss": 0.0625, + "rewards/chosen": 5.998417445591518, + "rewards/margins": 12.576788830156087, + "rewards/rejected": -6.578371384564568, + "step": 634 + }, + { + "epoch": 0.15888902789941198, + "grad_norm": 3.546875, + "kl": 2.7077102661132812, + "learning_rate": 5e-06, + "logits/chosen": -50009619.2, + "logits/rejected": -50984548.571428575, + "logps/chosen": -351.46279296875, + "logps/rejected": -432.29830496651783, + "loss": 0.0195, + "rewards/chosen": 6.222322463989258, + "rewards/margins": 15.356186730521065, + "rewards/rejected": -9.133864266531807, + "step": 635 + }, + { + "epoch": 0.15913924684098588, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59448686.93333333, + "logits/rejected": -31012142.222222224, + "logps/chosen": -341.6755859375, + "logps/rejected": -556.4164496527778, + "loss": 0.0569, + "rewards/chosen": 5.288178507486979, + "rewards/margins": 15.303597513834635, + "rewards/rejected": -10.015419006347656, + "step": 636 + }, + { + "epoch": 0.15938946578255975, + "grad_norm": 5.375, + "kl": 6.035285472869873, + "learning_rate": 5e-06, + "logits/chosen": -59883669.333333336, + "logits/rejected": -37449261.333333336, + "logps/chosen": -456.2738444010417, + "logps/rejected": -547.1472981770834, + "loss": 0.0383, + "rewards/chosen": 6.626650492350261, + "rewards/margins": 15.629468282063801, + "rewards/rejected": -9.002817789713541, + "step": 637 + }, + { + "epoch": 0.15963968472413362, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32597276.444444444, + "logits/rejected": -26964051.2, + "logps/chosen": -459.62065972222223, + "logps/rejected": -666.8653645833333, + "loss": 0.0536, + "rewards/chosen": 7.42759026421441, + "rewards/margins": 16.06582777235243, + "rewards/rejected": -8.63823750813802, + "step": 638 + }, + { + "epoch": 0.1598899036657075, + "grad_norm": 18.375, + "kl": 4.807146072387695, + "learning_rate": 5e-06, + "logits/chosen": -65658290.28571428, + "logits/rejected": -62354956.8, + "logps/chosen": -429.42843191964283, + "logps/rejected": -757.53955078125, + "loss": 0.0804, + "rewards/chosen": 6.848824092320034, + "rewards/margins": 18.877921077183316, + "rewards/rejected": -12.029096984863282, + "step": 639 + }, + { + "epoch": 0.16014012260728136, + "grad_norm": 1.4453125, + "kl": 0.20021185278892517, + "learning_rate": 5e-06, + "logits/chosen": -66669568.0, + "logits/rejected": -30092720.0, + "logps/chosen": -480.6462890625, + "logps/rejected": -553.7813197544643, + "loss": 0.044, + "rewards/chosen": 7.846075439453125, + "rewards/margins": 18.060491943359374, + "rewards/rejected": -10.21441650390625, + "step": 640 + }, + { + "epoch": 0.16039034154885526, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49854862.76923077, + "logits/rejected": -40396616.72727273, + "logps/chosen": -495.71788611778845, + "logps/rejected": -692.5755948153409, + "loss": 0.0362, + "rewards/chosen": 6.8903632530799275, + "rewards/margins": 19.248831048712027, + "rewards/rejected": -12.358467795632102, + "step": 641 + }, + { + "epoch": 0.16064056049042913, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57201440.0, + "logits/rejected": -62493845.333333336, + "logps/chosen": -386.4222819010417, + "logps/rejected": -622.29833984375, + "loss": 0.0286, + "rewards/chosen": 5.635476430257161, + "rewards/margins": 16.534656524658203, + "rewards/rejected": -10.899180094401041, + "step": 642 + }, + { + "epoch": 0.160890779432003, + "grad_norm": 11.875, + "kl": 5.814295768737793, + "learning_rate": 5e-06, + "logits/chosen": -82580775.38461539, + "logits/rejected": -7982277.818181818, + "logps/chosen": -421.7467698317308, + "logps/rejected": -440.0792347301136, + "loss": 0.1019, + "rewards/chosen": 5.027743412898137, + "rewards/margins": 13.461065279020296, + "rewards/rejected": -8.433321866122158, + "step": 643 + }, + { + "epoch": 0.16114099837357687, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56197717.333333336, + "logits/rejected": -68045883.73333333, + "logps/chosen": -331.20252821180554, + "logps/rejected": -543.6722330729167, + "loss": 0.0434, + "rewards/chosen": 5.1691436767578125, + "rewards/margins": 16.076640828450522, + "rewards/rejected": -10.907497151692708, + "step": 644 + }, + { + "epoch": 0.16139121731515077, + "grad_norm": 8.6875, + "kl": 1.6532491445541382, + "learning_rate": 5e-06, + "logits/chosen": -66253376.0, + "logits/rejected": -18327956.8, + "logps/chosen": -441.36575753348217, + "logps/rejected": -411.46396484375, + "loss": 0.0324, + "rewards/chosen": 6.985917227608817, + "rewards/margins": 14.692185538155691, + "rewards/rejected": -7.706268310546875, + "step": 645 + }, + { + "epoch": 0.16164143625672464, + "grad_norm": 13.125, + "kl": 1.1214256286621094, + "learning_rate": 5e-06, + "logits/chosen": -44702904.88888889, + "logits/rejected": -53356019.2, + "logps/chosen": -384.03917100694446, + "logps/rejected": -466.74095052083334, + "loss": 0.0506, + "rewards/chosen": 5.757381863064236, + "rewards/margins": 13.231995815700955, + "rewards/rejected": -7.474613952636719, + "step": 646 + }, + { + "epoch": 0.1618916551982985, + "grad_norm": 9.5, + "kl": 0.4956817626953125, + "learning_rate": 5e-06, + "logits/chosen": -82611601.45454545, + "logits/rejected": -42202840.615384616, + "logps/chosen": -476.5628551136364, + "logps/rejected": -474.54184194711536, + "loss": 0.0255, + "rewards/chosen": 5.4842071533203125, + "rewards/margins": 15.649455143855167, + "rewards/rejected": -10.165247990534855, + "step": 647 + }, + { + "epoch": 0.16214187413987238, + "grad_norm": 27.25, + "kl": 0.2697928845882416, + "learning_rate": 5e-06, + "logits/chosen": -51315141.81818182, + "logits/rejected": -34651052.307692304, + "logps/chosen": -411.80122514204544, + "logps/rejected": -355.74793419471155, + "loss": 0.1193, + "rewards/chosen": 6.381439902565696, + "rewards/margins": 12.210419394753195, + "rewards/rejected": -5.8289794921875, + "step": 648 + }, + { + "epoch": 0.16239209308144625, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37384800.0, + "logits/rejected": -47076154.18181818, + "logps/chosen": -306.23563326322113, + "logps/rejected": -484.53488991477275, + "loss": 0.0711, + "rewards/chosen": 4.964331993689904, + "rewards/margins": 12.463073143592247, + "rewards/rejected": -7.498741149902344, + "step": 649 + }, + { + "epoch": 0.16264231202302015, + "grad_norm": 14.5625, + "kl": 11.090937614440918, + "learning_rate": 5e-06, + "logits/chosen": -93352152.61538461, + "logits/rejected": -48217291.63636363, + "logps/chosen": -507.54800180288464, + "logps/rejected": -510.0047496448864, + "loss": 0.0721, + "rewards/chosen": 7.322487464317908, + "rewards/margins": 17.584539133352, + "rewards/rejected": -10.262051669034092, + "step": 650 + }, + { + "epoch": 0.16289253096459402, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45809664.0, + "logits/rejected": -52096361.14285714, + "logps/chosen": -338.61005859375, + "logps/rejected": -508.8189174107143, + "loss": 0.0752, + "rewards/chosen": 3.650387191772461, + "rewards/margins": 12.195720509120397, + "rewards/rejected": -8.545333317347936, + "step": 651 + }, + { + "epoch": 0.1631427499061679, + "grad_norm": 10.375, + "kl": 0.15212313830852509, + "learning_rate": 5e-06, + "logits/chosen": -45016186.18181818, + "logits/rejected": -47008290.461538464, + "logps/chosen": -345.62868430397725, + "logps/rejected": -493.28162560096155, + "loss": 0.0557, + "rewards/chosen": 5.799393393776634, + "rewards/margins": 15.5548777546916, + "rewards/rejected": -9.755484360914965, + "step": 652 + }, + { + "epoch": 0.16339296884774177, + "grad_norm": 15.1875, + "kl": 8.093406677246094, + "learning_rate": 5e-06, + "logits/chosen": -47084368.0, + "logits/rejected": -33976602.666666664, + "logps/chosen": -571.6569010416666, + "logps/rejected": -402.2288004557292, + "loss": 0.0805, + "rewards/chosen": 6.332256317138672, + "rewards/margins": 13.123188018798828, + "rewards/rejected": -6.790931701660156, + "step": 653 + }, + { + "epoch": 0.16364318778931566, + "grad_norm": 10.75, + "kl": 1.6146190166473389, + "learning_rate": 5e-06, + "logits/chosen": -49042076.44444445, + "logits/rejected": -48191219.2, + "logps/chosen": -499.9162326388889, + "logps/rejected": -544.7228515625, + "loss": 0.0293, + "rewards/chosen": 6.670145670572917, + "rewards/margins": 17.234286499023437, + "rewards/rejected": -10.56414082845052, + "step": 654 + }, + { + "epoch": 0.16389340673088953, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33668744.72727273, + "logits/rejected": -54299268.92307692, + "logps/chosen": -392.8492542613636, + "logps/rejected": -707.9043719951923, + "loss": 0.0508, + "rewards/chosen": 5.799758564342152, + "rewards/margins": 17.658680495682297, + "rewards/rejected": -11.858921931340145, + "step": 655 + }, + { + "epoch": 0.1641436256724634, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59663526.4, + "logits/rejected": -74696000.0, + "logps/chosen": -438.689990234375, + "logps/rejected": -462.24654715401783, + "loss": 0.041, + "rewards/chosen": 4.823660278320313, + "rewards/margins": 12.745981597900391, + "rewards/rejected": -7.922321319580078, + "step": 656 + }, + { + "epoch": 0.16439384461403728, + "grad_norm": 12.3125, + "kl": 4.263171195983887, + "learning_rate": 5e-06, + "logits/chosen": -72709725.0909091, + "logits/rejected": -45490875.07692308, + "logps/chosen": -425.22727272727275, + "logps/rejected": -546.3515625, + "loss": 0.0142, + "rewards/chosen": 6.606288563121449, + "rewards/margins": 16.010276741081185, + "rewards/rejected": -9.403988177959736, + "step": 657 + }, + { + "epoch": 0.16464406355561115, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39353725.333333336, + "logits/rejected": -10432152.666666666, + "logps/chosen": -515.0740559895834, + "logps/rejected": -531.9388834635416, + "loss": 0.0861, + "rewards/chosen": 5.730404535929362, + "rewards/margins": 14.719844182332356, + "rewards/rejected": -8.989439646402994, + "step": 658 + }, + { + "epoch": 0.16489428249718505, + "grad_norm": 16.25, + "kl": 5.5067877769470215, + "learning_rate": 5e-06, + "logits/chosen": -83794325.33333333, + "logits/rejected": -52759696.0, + "logps/chosen": -523.13818359375, + "logps/rejected": -510.0330403645833, + "loss": 0.0727, + "rewards/chosen": 9.032029469807943, + "rewards/margins": 17.574310302734375, + "rewards/rejected": -8.542280832926432, + "step": 659 + }, + { + "epoch": 0.16514450143875892, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67866263.27272727, + "logits/rejected": -29991428.923076924, + "logps/chosen": -511.46004971590907, + "logps/rejected": -441.0225360576923, + "loss": 0.0185, + "rewards/chosen": 7.2500083229758525, + "rewards/margins": 16.020787272419962, + "rewards/rejected": -8.77077894944411, + "step": 660 + }, + { + "epoch": 0.1653947203803328, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41601848.0, + "logits/rejected": -41465676.0, + "logps/chosen": -405.6173400878906, + "logps/rejected": -618.76025390625, + "loss": 0.0353, + "rewards/chosen": 6.1663007736206055, + "rewards/margins": 15.854592323303223, + "rewards/rejected": -9.688291549682617, + "step": 661 + }, + { + "epoch": 0.16564493932190666, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45884917.333333336, + "logits/rejected": -56918109.86666667, + "logps/chosen": -409.5426974826389, + "logps/rejected": -568.8666015625, + "loss": 0.0365, + "rewards/chosen": 5.419274648030599, + "rewards/margins": 16.57511672973633, + "rewards/rejected": -11.15584208170573, + "step": 662 + }, + { + "epoch": 0.16589515826348056, + "grad_norm": 19.125, + "kl": 2.281391143798828, + "learning_rate": 5e-06, + "logits/chosen": -48919569.06666667, + "logits/rejected": -74399658.66666667, + "logps/chosen": -374.050390625, + "logps/rejected": -561.4972330729166, + "loss": 0.1097, + "rewards/chosen": 5.25815684000651, + "rewards/margins": 15.972480095757378, + "rewards/rejected": -10.714323255750868, + "step": 663 + }, + { + "epoch": 0.16614537720505443, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66907374.54545455, + "logits/rejected": -56381902.76923077, + "logps/chosen": -425.4708806818182, + "logps/rejected": -617.5872145432693, + "loss": 0.0229, + "rewards/chosen": 5.99124492298473, + "rewards/margins": 17.961372909012375, + "rewards/rejected": -11.970127986027645, + "step": 664 + }, + { + "epoch": 0.1663955961466283, + "grad_norm": 5.28125, + "kl": 2.266335964202881, + "learning_rate": 5e-06, + "logits/chosen": -58393786.18181818, + "logits/rejected": -53815990.15384615, + "logps/chosen": -579.6943803267045, + "logps/rejected": -849.4885817307693, + "loss": 0.0189, + "rewards/chosen": 7.249734358354048, + "rewards/margins": 20.1824865274496, + "rewards/rejected": -12.932752169095552, + "step": 665 + }, + { + "epoch": 0.16664581508820217, + "grad_norm": 6.9375, + "kl": 9.461564064025879, + "learning_rate": 5e-06, + "logits/chosen": -57897604.571428575, + "logits/rejected": -68688595.2, + "logps/chosen": -535.4178641183036, + "logps/rejected": -563.0220703125, + "loss": 0.0671, + "rewards/chosen": 6.964071001325335, + "rewards/margins": 16.061434282575334, + "rewards/rejected": -9.09736328125, + "step": 666 + }, + { + "epoch": 0.16689603402977604, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47103685.81818182, + "logits/rejected": -60106491.07692308, + "logps/chosen": -465.9483753551136, + "logps/rejected": -720.1829176682693, + "loss": 0.0385, + "rewards/chosen": 7.5133056640625, + "rewards/margins": 17.82909451998197, + "rewards/rejected": -10.315788855919472, + "step": 667 + }, + { + "epoch": 0.16714625297134994, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42105920.0, + "logits/rejected": -50973195.63636363, + "logps/chosen": -368.02779447115387, + "logps/rejected": -590.5321377840909, + "loss": 0.0932, + "rewards/chosen": 5.910755450908955, + "rewards/margins": 15.882768057443046, + "rewards/rejected": -9.972012606534092, + "step": 668 + }, + { + "epoch": 0.1673964719129238, + "grad_norm": 7.09375, + "kl": 4.374902248382568, + "learning_rate": 5e-06, + "logits/chosen": -71032672.0, + "logits/rejected": -51098856.0, + "logps/chosen": -572.2081909179688, + "logps/rejected": -486.6452941894531, + "loss": 0.0216, + "rewards/chosen": 9.053348541259766, + "rewards/margins": 17.3983154296875, + "rewards/rejected": -8.344966888427734, + "step": 669 + }, + { + "epoch": 0.16764669085449768, + "grad_norm": 6.125, + "kl": 0.622650146484375, + "learning_rate": 5e-06, + "logits/chosen": -79201744.0, + "logits/rejected": -58268224.0, + "logps/chosen": -499.8522135416667, + "logps/rejected": -579.6601969401041, + "loss": 0.0105, + "rewards/chosen": 6.753852208455403, + "rewards/margins": 16.505355834960938, + "rewards/rejected": -9.751503626505533, + "step": 670 + }, + { + "epoch": 0.16789690979607155, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87231334.4, + "logits/rejected": -25305997.714285713, + "logps/chosen": -454.819140625, + "logps/rejected": -497.30782645089283, + "loss": 0.0247, + "rewards/chosen": 5.814561080932617, + "rewards/margins": 13.032736696515766, + "rewards/rejected": -7.218175615583148, + "step": 671 + }, + { + "epoch": 0.16814712873764545, + "grad_norm": 10.5625, + "kl": 3.3707733154296875, + "learning_rate": 5e-06, + "logits/chosen": -59407125.333333336, + "logits/rejected": -67145016.8888889, + "logps/chosen": -362.93359375, + "logps/rejected": -701.4104275173611, + "loss": 0.1211, + "rewards/chosen": 5.802168273925782, + "rewards/margins": 15.662977600097657, + "rewards/rejected": -9.860809326171875, + "step": 672 + }, + { + "epoch": 0.16839734767921932, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77413526.58823529, + "logits/rejected": -35157140.571428575, + "logps/chosen": -406.3794519761029, + "logps/rejected": -374.62744140625, + "loss": 0.0617, + "rewards/chosen": 5.624132941750919, + "rewards/margins": 13.331869365788307, + "rewards/rejected": -7.707736424037388, + "step": 673 + }, + { + "epoch": 0.1686475666207932, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74230120.0, + "logits/rejected": -45749808.0, + "logps/chosen": -275.2440185546875, + "logps/rejected": -655.884033203125, + "loss": 0.0775, + "rewards/chosen": 3.579000473022461, + "rewards/margins": 14.1707124710083, + "rewards/rejected": -10.59171199798584, + "step": 674 + }, + { + "epoch": 0.16889778556236706, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52094637.71428572, + "logits/rejected": -36853193.6, + "logps/chosen": -221.60508510044642, + "logps/rejected": -426.48662109375, + "loss": 0.1137, + "rewards/chosen": 3.56838253566197, + "rewards/margins": 11.586692319597516, + "rewards/rejected": -8.018309783935546, + "step": 675 + }, + { + "epoch": 0.16914800450394094, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70561402.66666667, + "logits/rejected": -42819237.333333336, + "logps/chosen": -472.2602132161458, + "logps/rejected": -518.2571614583334, + "loss": 0.0156, + "rewards/chosen": 5.527693430582683, + "rewards/margins": 14.173243204752605, + "rewards/rejected": -8.645549774169922, + "step": 676 + }, + { + "epoch": 0.16939822344551483, + "grad_norm": 8.625, + "kl": 2.589200973510742, + "learning_rate": 5e-06, + "logits/chosen": -57153557.333333336, + "logits/rejected": -19165306.666666668, + "logps/chosen": -459.7823079427083, + "logps/rejected": -440.3107096354167, + "loss": 0.0543, + "rewards/chosen": 5.891520818074544, + "rewards/margins": 12.614496231079102, + "rewards/rejected": -6.722975413004558, + "step": 677 + }, + { + "epoch": 0.1696484423870887, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58684000.0, + "logits/rejected": -25339477.333333332, + "logps/chosen": -278.4515380859375, + "logps/rejected": -715.0196940104166, + "loss": 0.1064, + "rewards/chosen": 3.71870485941569, + "rewards/margins": 16.13400713602702, + "rewards/rejected": -12.415302276611328, + "step": 678 + }, + { + "epoch": 0.16989866132866258, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50479221.333333336, + "logits/rejected": -52558997.333333336, + "logps/chosen": -429.3028564453125, + "logps/rejected": -698.0872395833334, + "loss": 0.0482, + "rewards/chosen": 5.871030171712239, + "rewards/margins": 17.883778889973957, + "rewards/rejected": -12.012748718261719, + "step": 679 + }, + { + "epoch": 0.17014888027023645, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70929932.8, + "logits/rejected": -26153078.85714286, + "logps/chosen": -363.9793212890625, + "logps/rejected": -454.523681640625, + "loss": 0.0352, + "rewards/chosen": 4.510313415527344, + "rewards/margins": 13.084225899832589, + "rewards/rejected": -8.573912484305245, + "step": 680 + }, + { + "epoch": 0.17039909921181035, + "grad_norm": 9.0625, + "kl": 0.3682422637939453, + "learning_rate": 5e-06, + "logits/chosen": -69601413.33333333, + "logits/rejected": -49324549.333333336, + "logps/chosen": -375.6105143229167, + "logps/rejected": -379.2108968098958, + "loss": 0.083, + "rewards/chosen": 3.9637940724690757, + "rewards/margins": 11.69102923075358, + "rewards/rejected": -7.727235158284505, + "step": 681 + }, + { + "epoch": 0.17064931815338422, + "grad_norm": 5.375, + "kl": 1.6558949947357178, + "learning_rate": 5e-06, + "logits/chosen": -42467827.2, + "logits/rejected": -53473435.428571425, + "logps/chosen": -373.6046875, + "logps/rejected": -554.8095354352679, + "loss": 0.0169, + "rewards/chosen": 5.687347412109375, + "rewards/margins": 15.629019601004464, + "rewards/rejected": -9.941672188895089, + "step": 682 + }, + { + "epoch": 0.1708995370949581, + "grad_norm": 10.125, + "kl": 1.0112838745117188, + "learning_rate": 5e-06, + "logits/chosen": -75722122.66666667, + "logits/rejected": -52346037.333333336, + "logps/chosen": -422.0463460286458, + "logps/rejected": -495.5492757161458, + "loss": 0.0405, + "rewards/chosen": 6.150388717651367, + "rewards/margins": 14.395827611287435, + "rewards/rejected": -8.245438893636068, + "step": 683 + }, + { + "epoch": 0.17114975603653196, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -106837376.0, + "logits/rejected": -93661883.73333333, + "logps/chosen": -507.72840711805554, + "logps/rejected": -703.4063802083333, + "loss": 0.0145, + "rewards/chosen": 7.626553005642361, + "rewards/margins": 20.393387518988717, + "rewards/rejected": -12.766834513346355, + "step": 684 + }, + { + "epoch": 0.17139997497810583, + "grad_norm": 8.0, + "kl": 5.420981407165527, + "learning_rate": 5e-06, + "logits/chosen": -81585810.28571428, + "logits/rejected": -18807579.2, + "logps/chosen": -500.3882533482143, + "logps/rejected": -639.95791015625, + "loss": 0.0358, + "rewards/chosen": 8.625656127929688, + "rewards/margins": 20.069236755371094, + "rewards/rejected": -11.443580627441406, + "step": 685 + }, + { + "epoch": 0.17165019391967973, + "grad_norm": 13.875, + "kl": 0.13960489630699158, + "learning_rate": 5e-06, + "logits/chosen": -26490086.85714286, + "logits/rejected": -36049920.0, + "logps/chosen": -351.08523995535717, + "logps/rejected": -465.912890625, + "loss": 0.0909, + "rewards/chosen": 4.871974400111607, + "rewards/margins": 11.220363071986608, + "rewards/rejected": -6.348388671875, + "step": 686 + }, + { + "epoch": 0.1719004128612536, + "grad_norm": 9.75, + "kl": 2.86164927482605, + "learning_rate": 5e-06, + "logits/chosen": -66866652.44444445, + "logits/rejected": -20687991.466666665, + "logps/chosen": -465.76860894097223, + "logps/rejected": -624.56953125, + "loss": 0.0416, + "rewards/chosen": 6.558584425184462, + "rewards/margins": 15.554604932996961, + "rewards/rejected": -8.9960205078125, + "step": 687 + }, + { + "epoch": 0.17215063180282747, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64529397.333333336, + "logits/rejected": -34528013.333333336, + "logps/chosen": -375.1025390625, + "logps/rejected": -494.53125, + "loss": 0.0311, + "rewards/chosen": 6.433191299438477, + "rewards/margins": 13.5333251953125, + "rewards/rejected": -7.100133895874023, + "step": 688 + }, + { + "epoch": 0.17240085074440134, + "grad_norm": 12.375, + "kl": 0.8503507375717163, + "learning_rate": 5e-06, + "logits/chosen": -81551445.33333333, + "logits/rejected": -32157968.0, + "logps/chosen": -291.41636149088544, + "logps/rejected": -418.5695393880208, + "loss": 0.0583, + "rewards/chosen": 6.001850128173828, + "rewards/margins": 12.537049611409504, + "rewards/rejected": -6.535199483235677, + "step": 689 + }, + { + "epoch": 0.17265106968597524, + "grad_norm": 16.875, + "kl": 0.7009134292602539, + "learning_rate": 5e-06, + "logits/chosen": -58566660.266666666, + "logits/rejected": -48897628.44444445, + "logps/chosen": -366.5291341145833, + "logps/rejected": -325.05322265625, + "loss": 0.1278, + "rewards/chosen": 6.048930358886719, + "rewards/margins": 11.004567125108508, + "rewards/rejected": -4.955636766221788, + "step": 690 + }, + { + "epoch": 0.1729012886275491, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27105212.444444444, + "logits/rejected": -56637614.93333333, + "logps/chosen": -380.39198133680554, + "logps/rejected": -702.0263020833333, + "loss": 0.0742, + "rewards/chosen": 5.317849900987413, + "rewards/margins": 16.23008134629991, + "rewards/rejected": -10.9122314453125, + "step": 691 + }, + { + "epoch": 0.17315150756912298, + "grad_norm": 12.25, + "kl": 10.227622032165527, + "learning_rate": 5e-06, + "logits/chosen": -50098432.0, + "logits/rejected": -54675366.4, + "logps/chosen": -349.3036411830357, + "logps/rejected": -542.9171875, + "loss": 0.1075, + "rewards/chosen": 6.4098325456891745, + "rewards/margins": 14.273331996372768, + "rewards/rejected": -7.863499450683594, + "step": 692 + }, + { + "epoch": 0.17340172651069685, + "grad_norm": 16.125, + "kl": 6.429734230041504, + "learning_rate": 5e-06, + "logits/chosen": -45190384.941176474, + "logits/rejected": -61296333.71428572, + "logps/chosen": -354.80701401654414, + "logps/rejected": -578.6396833147321, + "loss": 0.106, + "rewards/chosen": 5.8299547083237595, + "rewards/margins": 14.019735897288603, + "rewards/rejected": -8.189781188964844, + "step": 693 + }, + { + "epoch": 0.17365194545227075, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66619697.777777776, + "logits/rejected": -19792582.4, + "logps/chosen": -455.56206597222223, + "logps/rejected": -498.78196614583334, + "loss": 0.0392, + "rewards/chosen": 5.388075510660808, + "rewards/margins": 14.306572214762369, + "rewards/rejected": -8.918496704101562, + "step": 694 + }, + { + "epoch": 0.17390216439384462, + "grad_norm": 17.125, + "kl": 8.696220397949219, + "learning_rate": 5e-06, + "logits/chosen": -38154048.0, + "logits/rejected": -46748677.333333336, + "logps/chosen": -422.6399739583333, + "logps/rejected": -586.0579427083334, + "loss": 0.0855, + "rewards/chosen": 5.315046946207683, + "rewards/margins": 14.517519632975262, + "rewards/rejected": -9.202472686767578, + "step": 695 + }, + { + "epoch": 0.1741523833354185, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57893158.4, + "logits/rejected": -44106176.0, + "logps/chosen": -307.336083984375, + "logps/rejected": -518.4172014508929, + "loss": 0.0778, + "rewards/chosen": 3.7311058044433594, + "rewards/margins": 13.723721640450615, + "rewards/rejected": -9.992615836007255, + "step": 696 + }, + { + "epoch": 0.17440260227699236, + "grad_norm": 7.90625, + "kl": 5.789601802825928, + "learning_rate": 5e-06, + "logits/chosen": -26980958.0, + "logits/rejected": 4796307.5, + "logps/chosen": -460.54522705078125, + "logps/rejected": -577.863037109375, + "loss": 0.1196, + "rewards/chosen": 5.657016277313232, + "rewards/margins": 14.294976711273193, + "rewards/rejected": -8.637960433959961, + "step": 697 + }, + { + "epoch": 0.17465282121856623, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38760436.36363637, + "logits/rejected": -36518616.615384616, + "logps/chosen": -463.99320845170456, + "logps/rejected": -431.96228966346155, + "loss": 0.0509, + "rewards/chosen": 5.864227294921875, + "rewards/margins": 13.251133845402645, + "rewards/rejected": -7.386906550480769, + "step": 698 + }, + { + "epoch": 0.17490304016014013, + "grad_norm": 15.4375, + "kl": 0.9401601552963257, + "learning_rate": 5e-06, + "logits/chosen": -93742813.0909091, + "logits/rejected": -46093065.84615385, + "logps/chosen": -428.4674627130682, + "logps/rejected": -708.6716496394231, + "loss": 0.0956, + "rewards/chosen": 4.923358223655007, + "rewards/margins": 14.918353340842508, + "rewards/rejected": -9.9949951171875, + "step": 699 + }, + { + "epoch": 0.175153259101714, + "grad_norm": 8.875, + "kl": 2.8063995838165283, + "learning_rate": 5e-06, + "logits/chosen": -57266080.0, + "logits/rejected": -71095024.0, + "logps/chosen": -380.40972900390625, + "logps/rejected": -743.252685546875, + "loss": 0.0885, + "rewards/chosen": 5.708252906799316, + "rewards/margins": 19.88963508605957, + "rewards/rejected": -14.181382179260254, + "step": 700 + }, + { + "epoch": 0.17540347804328787, + "grad_norm": 9.375, + "kl": 2.2932868003845215, + "learning_rate": 5e-06, + "logits/chosen": -58853480.72727273, + "logits/rejected": -55137329.23076923, + "logps/chosen": -390.6741388494318, + "logps/rejected": -615.8079552283654, + "loss": 0.0535, + "rewards/chosen": 7.638673262162642, + "rewards/margins": 18.623382781769013, + "rewards/rejected": -10.98470951960637, + "step": 701 + }, + { + "epoch": 0.17565369698486175, + "grad_norm": 17.75, + "kl": 3.0111489295959473, + "learning_rate": 5e-06, + "logits/chosen": -57234824.53333333, + "logits/rejected": -18644693.333333332, + "logps/chosen": -356.08671875, + "logps/rejected": -425.9432779947917, + "loss": 0.0751, + "rewards/chosen": 4.617676798502604, + "rewards/margins": 15.072693040635851, + "rewards/rejected": -10.455016242133247, + "step": 702 + }, + { + "epoch": 0.17590391592643564, + "grad_norm": 5.0, + "kl": 9.278034210205078, + "learning_rate": 5e-06, + "logits/chosen": -61575414.15384615, + "logits/rejected": -38665376.0, + "logps/chosen": -521.6234224759615, + "logps/rejected": -612.9747869318181, + "loss": 0.0611, + "rewards/chosen": 7.4358684833233175, + "rewards/margins": 16.680585474401088, + "rewards/rejected": -9.24471699107777, + "step": 703 + }, + { + "epoch": 0.17615413486800952, + "grad_norm": 11.5, + "kl": 0.6015090942382812, + "learning_rate": 5e-06, + "logits/chosen": -53363524.92307692, + "logits/rejected": -80383976.72727273, + "logps/chosen": -452.48328575721155, + "logps/rejected": -645.0536665482955, + "loss": 0.0467, + "rewards/chosen": 6.177543053260217, + "rewards/margins": 13.943426065511638, + "rewards/rejected": -7.765883012251421, + "step": 704 + }, + { + "epoch": 0.1764043538095834, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47379992.615384616, + "logits/rejected": -81091473.45454545, + "logps/chosen": -258.82470703125, + "logps/rejected": -763.2136896306819, + "loss": 0.0999, + "rewards/chosen": 4.53782712496244, + "rewards/margins": 16.743336657544116, + "rewards/rejected": -12.205509532581676, + "step": 705 + }, + { + "epoch": 0.17665457275115726, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73657624.61538461, + "logits/rejected": -1966561.4545454546, + "logps/chosen": -528.8232797475962, + "logps/rejected": -392.4462890625, + "loss": 0.0231, + "rewards/chosen": 6.44094731257512, + "rewards/margins": 14.868203810044935, + "rewards/rejected": -8.427256497469815, + "step": 706 + }, + { + "epoch": 0.17690479169273113, + "grad_norm": 21.25, + "kl": 33.04115295410156, + "learning_rate": 5e-06, + "logits/chosen": -55989918.315789476, + "logits/rejected": -55486969.6, + "logps/chosen": -499.5463610197368, + "logps/rejected": -1032.14833984375, + "loss": 0.1098, + "rewards/chosen": 7.99026810495477, + "rewards/margins": 23.005423134251643, + "rewards/rejected": -15.015155029296874, + "step": 707 + }, + { + "epoch": 0.17715501063430503, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50918538.666666664, + "logits/rejected": -27359146.666666668, + "logps/chosen": -454.8282063802083, + "logps/rejected": -698.5673828125, + "loss": 0.0246, + "rewards/chosen": 6.743810653686523, + "rewards/margins": 16.029017130533852, + "rewards/rejected": -9.28520647684733, + "step": 708 + }, + { + "epoch": 0.1774052295758789, + "grad_norm": 15.625, + "kl": 3.79986572265625, + "learning_rate": 5e-06, + "logits/chosen": -66966252.307692304, + "logits/rejected": -58647947.63636363, + "logps/chosen": -470.3007061298077, + "logps/rejected": -517.6930930397727, + "loss": 0.0913, + "rewards/chosen": 7.029987041766827, + "rewards/margins": 14.387965168986288, + "rewards/rejected": -7.35797812721946, + "step": 709 + }, + { + "epoch": 0.17765544851745277, + "grad_norm": 4.78125, + "kl": 7.186982154846191, + "learning_rate": 5e-06, + "logits/chosen": -73866131.6923077, + "logits/rejected": -36368529.45454545, + "logps/chosen": -483.39066256009613, + "logps/rejected": -474.87668678977275, + "loss": 0.0766, + "rewards/chosen": 6.209300114558293, + "rewards/margins": 13.51670437259274, + "rewards/rejected": -7.307404258034446, + "step": 710 + }, + { + "epoch": 0.17790566745902664, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31546496.0, + "logits/rejected": -30095182.933333334, + "logps/chosen": -575.1024305555555, + "logps/rejected": -466.9169921875, + "loss": 0.0292, + "rewards/chosen": 6.828341166178386, + "rewards/margins": 14.729346211751302, + "rewards/rejected": -7.901005045572917, + "step": 711 + }, + { + "epoch": 0.17815588640060054, + "grad_norm": 10.25, + "kl": 5.903179168701172, + "learning_rate": 5e-06, + "logits/chosen": -61828789.333333336, + "logits/rejected": -29850149.333333332, + "logps/chosen": -393.2134195963542, + "logps/rejected": -494.1728108723958, + "loss": 0.027, + "rewards/chosen": 7.188343048095703, + "rewards/margins": 13.707970937093098, + "rewards/rejected": -6.5196278889973955, + "step": 712 + }, + { + "epoch": 0.1784061053421744, + "grad_norm": 16.125, + "kl": 11.499292373657227, + "learning_rate": 5e-06, + "logits/chosen": -74213997.71428572, + "logits/rejected": 124858931.2, + "logps/chosen": -477.9955357142857, + "logps/rejected": -592.252587890625, + "loss": 0.1009, + "rewards/chosen": 7.06403568812779, + "rewards/margins": 14.65481480189732, + "rewards/rejected": -7.590779113769531, + "step": 713 + }, + { + "epoch": 0.17865632428374828, + "grad_norm": 7.875, + "kl": 1.8934530019760132, + "learning_rate": 5e-06, + "logits/chosen": -62661139.692307696, + "logits/rejected": -68402885.81818181, + "logps/chosen": -419.9270207331731, + "logps/rejected": -668.5748845880681, + "loss": 0.053, + "rewards/chosen": 7.242148766150842, + "rewards/margins": 16.546153782130954, + "rewards/rejected": -9.304005015980113, + "step": 714 + }, + { + "epoch": 0.17890654322532215, + "grad_norm": 15.1875, + "kl": 3.329894781112671, + "learning_rate": 5e-06, + "logits/chosen": -43639113.6, + "logits/rejected": -49763670.85714286, + "logps/chosen": -352.9053955078125, + "logps/rejected": -552.7942940848214, + "loss": 0.1092, + "rewards/chosen": 5.797957229614258, + "rewards/margins": 16.703293882097515, + "rewards/rejected": -10.905336652483259, + "step": 715 + }, + { + "epoch": 0.17915676216689602, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74053873.77777778, + "logits/rejected": -34832123.733333334, + "logps/chosen": -470.9084201388889, + "logps/rejected": -633.1209635416667, + "loss": 0.0582, + "rewards/chosen": 5.858901129828559, + "rewards/margins": 14.033810085720486, + "rewards/rejected": -8.174908955891928, + "step": 716 + }, + { + "epoch": 0.17940698110846992, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44742971.07692308, + "logits/rejected": -30457152.0, + "logps/chosen": -260.2517653245192, + "logps/rejected": -557.5134943181819, + "loss": 0.082, + "rewards/chosen": 4.013749636136568, + "rewards/margins": 13.470396642084722, + "rewards/rejected": -9.456647005948154, + "step": 717 + }, + { + "epoch": 0.1796572000500438, + "grad_norm": 12.8125, + "kl": 2.5944085121154785, + "learning_rate": 5e-06, + "logits/chosen": -33767108.571428575, + "logits/rejected": -52787110.4, + "logps/chosen": -341.03853934151783, + "logps/rejected": -668.7759765625, + "loss": 0.0641, + "rewards/chosen": 5.297382354736328, + "rewards/margins": 14.50130386352539, + "rewards/rejected": -9.203921508789062, + "step": 718 + }, + { + "epoch": 0.17990741899161766, + "grad_norm": 8.125, + "kl": 5.018739700317383, + "learning_rate": 5e-06, + "logits/chosen": -54394554.666666664, + "logits/rejected": -74321024.0, + "logps/chosen": -325.34425862630206, + "logps/rejected": -537.8746744791666, + "loss": 0.0702, + "rewards/chosen": 5.580752054850261, + "rewards/margins": 16.269168853759766, + "rewards/rejected": -10.688416798909506, + "step": 719 + }, + { + "epoch": 0.18015763793319153, + "grad_norm": 6.125, + "kl": 7.915319442749023, + "learning_rate": 5e-06, + "logits/chosen": -83251652.92307693, + "logits/rejected": -70421120.0, + "logps/chosen": -388.1950871394231, + "logps/rejected": -481.87397904829544, + "loss": 0.0517, + "rewards/chosen": 5.9322028526893025, + "rewards/margins": 12.56598914086402, + "rewards/rejected": -6.633786288174716, + "step": 720 + }, + { + "epoch": 0.18040785687476543, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69674013.0909091, + "logits/rejected": -47242003.692307696, + "logps/chosen": -371.03178267045456, + "logps/rejected": -691.5302734375, + "loss": 0.0752, + "rewards/chosen": 5.602556055242365, + "rewards/margins": 17.488119832285633, + "rewards/rejected": -11.88556377704327, + "step": 721 + }, + { + "epoch": 0.1806580758163393, + "grad_norm": 10.4375, + "kl": 3.593658447265625, + "learning_rate": 5e-06, + "logits/chosen": -76687530.66666667, + "logits/rejected": -54437056.0, + "logps/chosen": -519.9554036458334, + "logps/rejected": -457.9519856770833, + "loss": 0.0333, + "rewards/chosen": 8.330449422200521, + "rewards/margins": 15.347049967447917, + "rewards/rejected": -7.016600545247396, + "step": 722 + }, + { + "epoch": 0.18090829475791317, + "grad_norm": 7.1875, + "kl": 1.7823947668075562, + "learning_rate": 5e-06, + "logits/chosen": -93296168.72727273, + "logits/rejected": -35667318.15384615, + "logps/chosen": -426.74587180397725, + "logps/rejected": -546.3592247596154, + "loss": 0.0522, + "rewards/chosen": 5.792817549272017, + "rewards/margins": 14.118543398130189, + "rewards/rejected": -8.325725848858173, + "step": 723 + }, + { + "epoch": 0.18115851369948704, + "grad_norm": 8.0, + "kl": 0.570186972618103, + "learning_rate": 5e-06, + "logits/chosen": -45147566.54545455, + "logits/rejected": -29549462.153846152, + "logps/chosen": -387.9017223011364, + "logps/rejected": -420.05716646634613, + "loss": 0.0424, + "rewards/chosen": 5.388437444513494, + "rewards/margins": 12.461523522863855, + "rewards/rejected": -7.073086078350361, + "step": 724 + }, + { + "epoch": 0.18140873264106092, + "grad_norm": 12.9375, + "kl": 1.8238639831542969, + "learning_rate": 5e-06, + "logits/chosen": -49162180.92307692, + "logits/rejected": -26878609.454545453, + "logps/chosen": -414.88927283653845, + "logps/rejected": -296.8208673650568, + "loss": 0.0584, + "rewards/chosen": 6.385125967172476, + "rewards/margins": 12.43112913545195, + "rewards/rejected": -6.046003168279475, + "step": 725 + }, + { + "epoch": 0.18165895158263481, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37551239.11111111, + "logits/rejected": -79150412.8, + "logps/chosen": -375.75146484375, + "logps/rejected": -591.9458333333333, + "loss": 0.0481, + "rewards/chosen": 6.276812235514323, + "rewards/margins": 16.622684224446616, + "rewards/rejected": -10.345871988932291, + "step": 726 + }, + { + "epoch": 0.18190917052420869, + "grad_norm": 6.75, + "kl": 2.515467405319214, + "learning_rate": 5e-06, + "logits/chosen": -76964346.66666667, + "logits/rejected": -94197440.0, + "logps/chosen": -456.6259358723958, + "logps/rejected": -456.8588460286458, + "loss": 0.0236, + "rewards/chosen": 6.5506032307942705, + "rewards/margins": 13.756879170735676, + "rewards/rejected": -7.206275939941406, + "step": 727 + }, + { + "epoch": 0.18215938946578256, + "grad_norm": 2.234375, + "kl": 2.5210318565368652, + "learning_rate": 5e-06, + "logits/chosen": -53339451.07692308, + "logits/rejected": -48501076.36363637, + "logps/chosen": -493.7118389423077, + "logps/rejected": -848.0596590909091, + "loss": 0.0054, + "rewards/chosen": 8.017585167518028, + "rewards/margins": 25.27701232483337, + "rewards/rejected": -17.25942715731534, + "step": 728 + }, + { + "epoch": 0.18240960840735643, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48911010.90909091, + "logits/rejected": -49852224.0, + "logps/chosen": -361.5631214488636, + "logps/rejected": -530.8541165865385, + "loss": 0.0232, + "rewards/chosen": 5.89506738836115, + "rewards/margins": 15.495888076462112, + "rewards/rejected": -9.600820688100962, + "step": 729 + }, + { + "epoch": 0.18265982734893033, + "grad_norm": 19.375, + "kl": 11.989924430847168, + "learning_rate": 5e-06, + "logits/chosen": -8390604.666666666, + "logits/rejected": -47819898.666666664, + "logps/chosen": -492.7035319010417, + "logps/rejected": -470.9751383463542, + "loss": 0.1241, + "rewards/chosen": 6.317582448323567, + "rewards/margins": 12.549383799235025, + "rewards/rejected": -6.231801350911458, + "step": 730 + }, + { + "epoch": 0.1829100462905042, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67372736.0, + "logits/rejected": -53511112.53333333, + "logps/chosen": -542.2922634548611, + "logps/rejected": -534.9907877604167, + "loss": 0.041, + "rewards/chosen": 7.221045600043403, + "rewards/margins": 17.336568874782987, + "rewards/rejected": -10.115523274739584, + "step": 731 + }, + { + "epoch": 0.18316026523207807, + "grad_norm": 12.125, + "kl": 4.440423965454102, + "learning_rate": 5e-06, + "logits/chosen": -35697900.307692304, + "logits/rejected": -52358365.09090909, + "logps/chosen": -384.01986929086536, + "logps/rejected": -458.3294122869318, + "loss": 0.0756, + "rewards/chosen": 4.612891270564153, + "rewards/margins": 13.66867532263269, + "rewards/rejected": -9.055784052068537, + "step": 732 + }, + { + "epoch": 0.18341048417365194, + "grad_norm": 6.59375, + "kl": 3.787334442138672, + "learning_rate": 5e-06, + "logits/chosen": -105404205.1764706, + "logits/rejected": -42533677.71428572, + "logps/chosen": -420.24488740808823, + "logps/rejected": -518.8389020647321, + "loss": 0.0744, + "rewards/chosen": 5.8443163703469665, + "rewards/margins": 16.0253793411896, + "rewards/rejected": -10.181062970842634, + "step": 733 + }, + { + "epoch": 0.1836607031152258, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22021725.714285713, + "logits/rejected": -40388875.294117644, + "logps/chosen": -348.66353934151783, + "logps/rejected": -431.63683363970586, + "loss": 0.0368, + "rewards/chosen": 5.842098236083984, + "rewards/margins": 13.277313905603744, + "rewards/rejected": -7.435215669519761, + "step": 734 + }, + { + "epoch": 0.1839109220567997, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52535842.90909091, + "logits/rejected": -26753092.923076924, + "logps/chosen": -414.73291015625, + "logps/rejected": -521.9594350961538, + "loss": 0.0319, + "rewards/chosen": 6.992141030051491, + "rewards/margins": 16.080937792371202, + "rewards/rejected": -9.088796762319712, + "step": 735 + }, + { + "epoch": 0.18416114099837358, + "grad_norm": 24.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48058642.28571428, + "logits/rejected": -23420648.0, + "logps/chosen": -362.770263671875, + "logps/rejected": -518.7287109375, + "loss": 0.0708, + "rewards/chosen": 3.8764964512416293, + "rewards/margins": 10.613148062569755, + "rewards/rejected": -6.736651611328125, + "step": 736 + }, + { + "epoch": 0.18441135993994745, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51263054.222222224, + "logits/rejected": -35914214.4, + "logps/chosen": -342.0602756076389, + "logps/rejected": -459.39755859375, + "loss": 0.0564, + "rewards/chosen": 5.4303783840603295, + "rewards/margins": 15.345145840115016, + "rewards/rejected": -9.914767456054687, + "step": 737 + }, + { + "epoch": 0.18466157888152132, + "grad_norm": 13.625, + "kl": 10.84058952331543, + "learning_rate": 5e-06, + "logits/chosen": -71714112.0, + "logits/rejected": -45106220.307692304, + "logps/chosen": -532.7406338778409, + "logps/rejected": -384.07474459134613, + "loss": 0.0204, + "rewards/chosen": 7.893820329145952, + "rewards/margins": 16.849273468230987, + "rewards/rejected": -8.955453139085035, + "step": 738 + }, + { + "epoch": 0.18491179782309522, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63248288.0, + "logits/rejected": -52859536.0, + "logps/chosen": -549.8051147460938, + "logps/rejected": -500.8243103027344, + "loss": 0.0035, + "rewards/chosen": 7.644482612609863, + "rewards/margins": 16.183485984802246, + "rewards/rejected": -8.539003372192383, + "step": 739 + }, + { + "epoch": 0.1851620167646691, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83834936.8888889, + "logits/rejected": -79412394.66666667, + "logps/chosen": -474.4694010416667, + "logps/rejected": -596.1455729166667, + "loss": 0.031, + "rewards/chosen": 7.703482733832465, + "rewards/margins": 19.916092597113714, + "rewards/rejected": -12.21260986328125, + "step": 740 + }, + { + "epoch": 0.18541223570624296, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19660963.692307692, + "logits/rejected": -40386658.90909091, + "logps/chosen": -297.5119816706731, + "logps/rejected": -511.09525923295456, + "loss": 0.0906, + "rewards/chosen": 4.164818396935096, + "rewards/margins": 13.506016017673733, + "rewards/rejected": -9.341197620738637, + "step": 741 + }, + { + "epoch": 0.18566245464781683, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -104695656.72727273, + "logits/rejected": -79931441.23076923, + "logps/chosen": -542.4687056107955, + "logps/rejected": -587.4605994591346, + "loss": 0.0096, + "rewards/chosen": 7.262121027166193, + "rewards/margins": 18.767978054660183, + "rewards/rejected": -11.50585702749399, + "step": 742 + }, + { + "epoch": 0.1859126735893907, + "grad_norm": 7.40625, + "kl": 0.5892280340194702, + "learning_rate": 5e-06, + "logits/chosen": -25681698.666666668, + "logits/rejected": -36335941.333333336, + "logps/chosen": -331.6855061848958, + "logps/rejected": -488.4554443359375, + "loss": 0.0403, + "rewards/chosen": 5.152231852213542, + "rewards/margins": 14.552221934000652, + "rewards/rejected": -9.39999008178711, + "step": 743 + }, + { + "epoch": 0.1861628925309646, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77571288.0, + "logits/rejected": -64650784.0, + "logps/chosen": -470.01446533203125, + "logps/rejected": -569.0632934570312, + "loss": 0.0608, + "rewards/chosen": 7.393571853637695, + "rewards/margins": 17.142139434814453, + "rewards/rejected": -9.748567581176758, + "step": 744 + }, + { + "epoch": 0.18641311147253847, + "grad_norm": 16.625, + "kl": 0.5460826754570007, + "learning_rate": 5e-06, + "logits/chosen": -7227328.0, + "logits/rejected": -26068051.2, + "logps/chosen": -490.69580078125, + "logps/rejected": -464.62421875, + "loss": 0.0381, + "rewards/chosen": 6.061325920952691, + "rewards/margins": 13.666175672743055, + "rewards/rejected": -7.604849751790365, + "step": 745 + }, + { + "epoch": 0.18666333041411234, + "grad_norm": 22.125, + "kl": 1.6078147888183594, + "learning_rate": 5e-06, + "logits/chosen": -43218771.692307696, + "logits/rejected": -50225640.72727273, + "logps/chosen": -323.54830228365387, + "logps/rejected": -555.5528231534091, + "loss": 0.0997, + "rewards/chosen": 4.106776310847356, + "rewards/margins": 12.702093804632867, + "rewards/rejected": -8.595317493785512, + "step": 746 + }, + { + "epoch": 0.18691354935568621, + "grad_norm": 5.59375, + "kl": 5.847883701324463, + "learning_rate": 5e-06, + "logits/chosen": -61582642.28571428, + "logits/rejected": -46114960.0, + "logps/chosen": -444.17173549107144, + "logps/rejected": -591.34609375, + "loss": 0.0394, + "rewards/chosen": 6.704927716936384, + "rewards/margins": 17.70850285121373, + "rewards/rejected": -11.003575134277344, + "step": 747 + }, + { + "epoch": 0.1871637682972601, + "grad_norm": 4.1875, + "kl": 5.278030872344971, + "learning_rate": 5e-06, + "logits/chosen": -36747737.6, + "logits/rejected": -70000704.0, + "logps/chosen": -368.3920572916667, + "logps/rejected": -311.98133680555554, + "loss": 0.0477, + "rewards/chosen": 6.172967529296875, + "rewards/margins": 13.405596245659723, + "rewards/rejected": -7.232628716362847, + "step": 748 + }, + { + "epoch": 0.18741398723883398, + "grad_norm": 10.875, + "kl": 1.9263179302215576, + "learning_rate": 5e-06, + "logits/chosen": -70863404.3076923, + "logits/rejected": -22107310.545454547, + "logps/chosen": -462.0105168269231, + "logps/rejected": -458.2373046875, + "loss": 0.0246, + "rewards/chosen": 7.0742962176983175, + "rewards/margins": 14.719641732169197, + "rewards/rejected": -7.645345514470881, + "step": 749 + }, + { + "epoch": 0.18766420618040786, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -91789513.14285715, + "logits/rejected": -72141683.2, + "logps/chosen": -390.64554268973217, + "logps/rejected": -771.376513671875, + "loss": 0.0271, + "rewards/chosen": 6.177424839564732, + "rewards/margins": 19.402516392299106, + "rewards/rejected": -13.225091552734375, + "step": 750 + }, + { + "epoch": 0.18791442512198173, + "grad_norm": 22.875, + "kl": 19.72315788269043, + "learning_rate": 5e-06, + "logits/chosen": -55871130.35294118, + "logits/rejected": -70852123.42857143, + "logps/chosen": -436.32117417279414, + "logps/rejected": -483.63089425223217, + "loss": 0.0678, + "rewards/chosen": 6.746851303998162, + "rewards/margins": 15.705984452191522, + "rewards/rejected": -8.95913314819336, + "step": 751 + }, + { + "epoch": 0.18816464406355562, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47239680.0, + "logits/rejected": -54592625.23076923, + "logps/chosen": -603.2428977272727, + "logps/rejected": -478.13003305288464, + "loss": 0.0499, + "rewards/chosen": 8.076557506214488, + "rewards/margins": 16.09364734996449, + "rewards/rejected": -8.01708984375, + "step": 752 + }, + { + "epoch": 0.1884148630051295, + "grad_norm": 15.9375, + "kl": 1.6756629943847656, + "learning_rate": 5e-06, + "logits/chosen": -60310000.0, + "logits/rejected": -29489061.333333332, + "logps/chosen": -258.2334798177083, + "logps/rejected": -634.0535074869791, + "loss": 0.0662, + "rewards/chosen": 4.710054079691569, + "rewards/margins": 15.058815320332844, + "rewards/rejected": -10.348761240641275, + "step": 753 + }, + { + "epoch": 0.18866508194670337, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49030297.6, + "logits/rejected": -40142065.777777776, + "logps/chosen": -340.61272786458335, + "logps/rejected": -400.89344618055554, + "loss": 0.0465, + "rewards/chosen": 6.660277811686198, + "rewards/margins": 14.644435797797309, + "rewards/rejected": -7.984157986111111, + "step": 754 + }, + { + "epoch": 0.18891530088827724, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23835105.6, + "logits/rejected": -54121389.71428572, + "logps/chosen": -292.4234375, + "logps/rejected": -622.8983677455357, + "loss": 0.0217, + "rewards/chosen": 4.780410385131836, + "rewards/margins": 17.66634875706264, + "rewards/rejected": -12.885938371930804, + "step": 755 + }, + { + "epoch": 0.1891655198298511, + "grad_norm": 19.75, + "kl": 6.965249061584473, + "learning_rate": 5e-06, + "logits/chosen": -52097258.666666664, + "logits/rejected": -41704900.266666666, + "logps/chosen": -564.0428602430555, + "logps/rejected": -433.09619140625, + "loss": 0.0203, + "rewards/chosen": 10.003687540690104, + "rewards/margins": 17.34937744140625, + "rewards/rejected": -7.345689900716146, + "step": 756 + }, + { + "epoch": 0.189415738771425, + "grad_norm": 4.28125, + "kl": 1.6121814250946045, + "learning_rate": 5e-06, + "logits/chosen": -56811072.0, + "logits/rejected": -62204773.333333336, + "logps/chosen": -426.9440104166667, + "logps/rejected": -559.7593587239584, + "loss": 0.0094, + "rewards/chosen": 8.24566396077474, + "rewards/margins": 18.42844835917155, + "rewards/rejected": -10.18278439839681, + "step": 757 + }, + { + "epoch": 0.18966595771299888, + "grad_norm": 13.4375, + "kl": 7.0572829246521, + "learning_rate": 5e-06, + "logits/chosen": -84275536.0, + "logits/rejected": -38835888.0, + "logps/chosen": -473.0864664713542, + "logps/rejected": -457.4165852864583, + "loss": 0.0525, + "rewards/chosen": 6.5260874430338545, + "rewards/margins": 14.497683207194012, + "rewards/rejected": -7.971595764160156, + "step": 758 + }, + { + "epoch": 0.18991617665457275, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 24208880.0, + "logits/rejected": -44486724.0, + "logps/chosen": -504.8377990722656, + "logps/rejected": -723.7698974609375, + "loss": 0.0133, + "rewards/chosen": 7.936724662780762, + "rewards/margins": 19.90409564971924, + "rewards/rejected": -11.967370986938477, + "step": 759 + }, + { + "epoch": 0.19016639559614662, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52100419.2, + "logits/rejected": -17151417.14285714, + "logps/chosen": -317.670703125, + "logps/rejected": -504.99581473214283, + "loss": 0.1065, + "rewards/chosen": 4.338721466064453, + "rewards/margins": 12.845295824323383, + "rewards/rejected": -8.506574358258929, + "step": 760 + }, + { + "epoch": 0.19041661453772052, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32805285.818181816, + "logits/rejected": -3965060.923076923, + "logps/chosen": -326.91410688920456, + "logps/rejected": -358.51089242788464, + "loss": 0.0618, + "rewards/chosen": 5.743856950239702, + "rewards/margins": 12.598762885673896, + "rewards/rejected": -6.854905935434195, + "step": 761 + }, + { + "epoch": 0.1906668334792944, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51308214.85714286, + "logits/rejected": -12822941.6, + "logps/chosen": -343.5065220424107, + "logps/rejected": -273.5468994140625, + "loss": 0.0461, + "rewards/chosen": 6.219408307756696, + "rewards/margins": 12.495134626116071, + "rewards/rejected": -6.275726318359375, + "step": 762 + }, + { + "epoch": 0.19091705242086826, + "grad_norm": 10.4375, + "kl": 1.5390746593475342, + "learning_rate": 5e-06, + "logits/chosen": -52308371.692307696, + "logits/rejected": -32224605.09090909, + "logps/chosen": -446.91458834134613, + "logps/rejected": -300.5582386363636, + "loss": 0.0579, + "rewards/chosen": 7.066094031700721, + "rewards/margins": 13.239156442922312, + "rewards/rejected": -6.173062411221591, + "step": 763 + }, + { + "epoch": 0.19116727136244213, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54476643.55555555, + "logits/rejected": -43014997.333333336, + "logps/chosen": -340.44146050347223, + "logps/rejected": -560.6063802083333, + "loss": 0.0272, + "rewards/chosen": 6.979727003309462, + "rewards/margins": 18.29089135064019, + "rewards/rejected": -11.31116434733073, + "step": 764 + }, + { + "epoch": 0.191417490304016, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55464677.333333336, + "logits/rejected": -60452624.0, + "logps/chosen": -387.4718831380208, + "logps/rejected": -610.9570719401041, + "loss": 0.0212, + "rewards/chosen": 7.022189458211263, + "rewards/margins": 15.729825337727863, + "rewards/rejected": -8.707635879516602, + "step": 765 + }, + { + "epoch": 0.1916677092455899, + "grad_norm": 11.75, + "kl": 2.5066115856170654, + "learning_rate": 5e-06, + "logits/chosen": -31838448.0, + "logits/rejected": -55355976.0, + "logps/chosen": -367.44366455078125, + "logps/rejected": -665.2879028320312, + "loss": 0.0561, + "rewards/chosen": 5.7261786460876465, + "rewards/margins": 16.928192615509033, + "rewards/rejected": -11.202013969421387, + "step": 766 + }, + { + "epoch": 0.19191792818716377, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48132042.666666664, + "logits/rejected": -52281546.666666664, + "logps/chosen": -415.7325846354167, + "logps/rejected": -640.1173095703125, + "loss": 0.0248, + "rewards/chosen": 6.91850217183431, + "rewards/margins": 17.803024927775066, + "rewards/rejected": -10.884522755940756, + "step": 767 + }, + { + "epoch": 0.19216814712873764, + "grad_norm": 11.4375, + "kl": 4.991845607757568, + "learning_rate": 5e-06, + "logits/chosen": -41127569.23076923, + "logits/rejected": -26718731.636363637, + "logps/chosen": -391.3740985576923, + "logps/rejected": -466.5599254261364, + "loss": 0.0352, + "rewards/chosen": 6.182608384352464, + "rewards/margins": 13.746976198849978, + "rewards/rejected": -7.564367814497515, + "step": 768 + }, + { + "epoch": 0.19241836607031151, + "grad_norm": 14.0625, + "kl": 0.30423229932785034, + "learning_rate": 5e-06, + "logits/chosen": -61034953.84615385, + "logits/rejected": -47198824.72727273, + "logps/chosen": -389.52892127403845, + "logps/rejected": -490.61860795454544, + "loss": 0.0388, + "rewards/chosen": 6.54525639460637, + "rewards/margins": 16.552333698406088, + "rewards/rejected": -10.007077303799717, + "step": 769 + }, + { + "epoch": 0.1926685850118854, + "grad_norm": 2.828125, + "kl": 2.7039363384246826, + "learning_rate": 5e-06, + "logits/chosen": -69796817.45454545, + "logits/rejected": -36781767.384615384, + "logps/chosen": -494.58522727272725, + "logps/rejected": -555.9032451923077, + "loss": 0.0122, + "rewards/chosen": 7.6953277587890625, + "rewards/margins": 17.446314298189606, + "rewards/rejected": -9.750986539400541, + "step": 770 + }, + { + "epoch": 0.19291880395345928, + "grad_norm": 15.75, + "kl": 4.258843898773193, + "learning_rate": 5e-06, + "logits/chosen": -81830108.44444445, + "logits/rejected": -62735112.53333333, + "logps/chosen": -397.0026584201389, + "logps/rejected": -426.97307942708335, + "loss": 0.0708, + "rewards/chosen": 7.245611402723524, + "rewards/margins": 13.92344750298394, + "rewards/rejected": -6.677836100260417, + "step": 771 + }, + { + "epoch": 0.19316902289503315, + "grad_norm": 8.75, + "kl": 3.016787528991699, + "learning_rate": 5e-06, + "logits/chosen": -52523565.71428572, + "logits/rejected": -56918009.6, + "logps/chosen": -449.27828543526783, + "logps/rejected": -550.47978515625, + "loss": 0.0346, + "rewards/chosen": 6.104246956961496, + "rewards/margins": 16.66878934587751, + "rewards/rejected": -10.564542388916015, + "step": 772 + }, + { + "epoch": 0.19341924183660703, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62468249.6, + "logits/rejected": -61920635.428571425, + "logps/chosen": -336.78134765625, + "logps/rejected": -499.6768275669643, + "loss": 0.0486, + "rewards/chosen": 6.820289611816406, + "rewards/margins": 14.544644492013113, + "rewards/rejected": -7.7243548801967075, + "step": 773 + }, + { + "epoch": 0.1936694607781809, + "grad_norm": 3.078125, + "kl": 4.653754711151123, + "learning_rate": 5e-06, + "logits/chosen": -79820381.86666666, + "logits/rejected": -68412416.0, + "logps/chosen": -436.62353515625, + "logps/rejected": -549.1727430555555, + "loss": 0.018, + "rewards/chosen": 7.988427734375, + "rewards/margins": 16.950892808702257, + "rewards/rejected": -8.962465074327257, + "step": 774 + }, + { + "epoch": 0.1939196797197548, + "grad_norm": 15.625, + "kl": 9.926856994628906, + "learning_rate": 5e-06, + "logits/chosen": -43630656.0, + "logits/rejected": -62075379.2, + "logps/chosen": -351.16245814732144, + "logps/rejected": -721.505517578125, + "loss": 0.0918, + "rewards/chosen": 6.118412562779018, + "rewards/margins": 20.750901576450893, + "rewards/rejected": -14.632489013671876, + "step": 775 + }, + { + "epoch": 0.19416989866132867, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32805280.0, + "logits/rejected": -48117952.0, + "logps/chosen": -276.3290771484375, + "logps/rejected": -588.8889508928571, + "loss": 0.0706, + "rewards/chosen": 4.932155609130859, + "rewards/margins": 13.072474343436104, + "rewards/rejected": -8.140318734305245, + "step": 776 + }, + { + "epoch": 0.19442011760290254, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56095260.44444445, + "logits/rejected": -48964078.93333333, + "logps/chosen": -285.20005967881946, + "logps/rejected": -579.3704427083334, + "loss": 0.0854, + "rewards/chosen": 4.338704427083333, + "rewards/margins": 11.526825968424479, + "rewards/rejected": -7.188121541341146, + "step": 777 + }, + { + "epoch": 0.1946703365444764, + "grad_norm": 4.96875, + "kl": 2.4994025230407715, + "learning_rate": 5e-06, + "logits/chosen": -58249338.18181818, + "logits/rejected": -21601095.384615384, + "logps/chosen": -464.8508966619318, + "logps/rejected": -497.72475961538464, + "loss": 0.0369, + "rewards/chosen": 6.807033192027699, + "rewards/margins": 16.16709953254753, + "rewards/rejected": -9.360066340519833, + "step": 778 + }, + { + "epoch": 0.1949205554860503, + "grad_norm": 16.625, + "kl": 5.22830867767334, + "learning_rate": 5e-06, + "logits/chosen": -48293481.14285714, + "logits/rejected": -50497737.6, + "logps/chosen": -400.02713448660717, + "logps/rejected": -577.370654296875, + "loss": 0.0892, + "rewards/chosen": 6.610093252999442, + "rewards/margins": 17.666199820382253, + "rewards/rejected": -11.056106567382812, + "step": 779 + }, + { + "epoch": 0.19517077442762418, + "grad_norm": 2.28125, + "kl": 5.2460150718688965, + "learning_rate": 5e-06, + "logits/chosen": -48169334.85714286, + "logits/rejected": -44441955.2, + "logps/chosen": -369.29286411830356, + "logps/rejected": -656.659619140625, + "loss": 0.0308, + "rewards/chosen": 7.134678431919643, + "rewards/margins": 16.292231532505582, + "rewards/rejected": -9.157553100585938, + "step": 780 + }, + { + "epoch": 0.19542099336919805, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 213684309.33333334, + "logits/rejected": -59194485.333333336, + "logps/chosen": -576.1879475911459, + "logps/rejected": -682.8180338541666, + "loss": 0.0548, + "rewards/chosen": 7.890483856201172, + "rewards/margins": 17.955281999376083, + "rewards/rejected": -10.064798143174913, + "step": 781 + }, + { + "epoch": 0.19567121231077192, + "grad_norm": 12.6875, + "kl": 5.794898509979248, + "learning_rate": 5e-06, + "logits/chosen": -51934084.0, + "logits/rejected": -59407036.0, + "logps/chosen": -375.10528564453125, + "logps/rejected": -476.72027587890625, + "loss": 0.0459, + "rewards/chosen": 5.85671329498291, + "rewards/margins": 14.188584327697754, + "rewards/rejected": -8.331871032714844, + "step": 782 + }, + { + "epoch": 0.1959214312523458, + "grad_norm": 10.4375, + "kl": 7.434493541717529, + "learning_rate": 5e-06, + "logits/chosen": -58841052.44444445, + "logits/rejected": -36314103.46666667, + "logps/chosen": -306.030517578125, + "logps/rejected": -443.04791666666665, + "loss": 0.0421, + "rewards/chosen": 6.728176964653863, + "rewards/margins": 14.62759713066949, + "rewards/rejected": -7.899420166015625, + "step": 783 + }, + { + "epoch": 0.1961716501939197, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66068544.0, + "logits/rejected": -36125330.823529415, + "logps/chosen": -417.34116908482144, + "logps/rejected": -447.0884650735294, + "loss": 0.0157, + "rewards/chosen": 6.3509014674595425, + "rewards/margins": 14.345273667022962, + "rewards/rejected": -7.994372199563419, + "step": 784 + }, + { + "epoch": 0.19642186913549356, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33594185.14285714, + "logits/rejected": -37133347.2, + "logps/chosen": -350.42518833705356, + "logps/rejected": -527.660107421875, + "loss": 0.0516, + "rewards/chosen": 5.214516230991909, + "rewards/margins": 14.731714412144253, + "rewards/rejected": -9.517198181152343, + "step": 785 + }, + { + "epoch": 0.19667208807706743, + "grad_norm": 15.5, + "kl": 11.855327606201172, + "learning_rate": 5e-06, + "logits/chosen": -42866858.666666664, + "logits/rejected": -66386104.88888889, + "logps/chosen": -423.8452473958333, + "logps/rejected": -528.5264756944445, + "loss": 0.0424, + "rewards/chosen": 7.475948079427083, + "rewards/margins": 14.528540886773005, + "rewards/rejected": -7.0525928073459205, + "step": 786 + }, + { + "epoch": 0.1969223070186413, + "grad_norm": 13.3125, + "kl": 8.092233657836914, + "learning_rate": 5e-06, + "logits/chosen": -50351599.15789474, + "logits/rejected": -49429683.2, + "logps/chosen": -443.9545641447368, + "logps/rejected": -830.33525390625, + "loss": 0.0466, + "rewards/chosen": 6.621235094572368, + "rewards/margins": 21.335682116056745, + "rewards/rejected": -14.714447021484375, + "step": 787 + }, + { + "epoch": 0.1971725259602152, + "grad_norm": 16.25, + "kl": 20.436603546142578, + "learning_rate": 5e-06, + "logits/chosen": -64435602.28571428, + "logits/rejected": -72926502.4, + "logps/chosen": -473.6840122767857, + "logps/rejected": -505.0876953125, + "loss": 0.1178, + "rewards/chosen": 7.37617438180106, + "rewards/margins": 14.716451481410434, + "rewards/rejected": -7.340277099609375, + "step": 788 + }, + { + "epoch": 0.19742274490178907, + "grad_norm": 17.375, + "kl": 1.1567704677581787, + "learning_rate": 5e-06, + "logits/chosen": -54946469.333333336, + "logits/rejected": -37511346.666666664, + "logps/chosen": -413.1483968098958, + "logps/rejected": -540.9465738932291, + "loss": 0.1048, + "rewards/chosen": 5.627518971761067, + "rewards/margins": 13.126118977864582, + "rewards/rejected": -7.498600006103516, + "step": 789 + }, + { + "epoch": 0.19767296384336294, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62774502.4, + "logits/rejected": -39964036.571428575, + "logps/chosen": -463.8818359375, + "logps/rejected": -584.0109514508929, + "loss": 0.0218, + "rewards/chosen": 7.480061340332031, + "rewards/margins": 18.057569449288504, + "rewards/rejected": -10.577508108956474, + "step": 790 + }, + { + "epoch": 0.1979231827849368, + "grad_norm": 9.25, + "kl": 8.550105094909668, + "learning_rate": 5e-06, + "logits/chosen": -55717208.615384616, + "logits/rejected": -47519586.90909091, + "logps/chosen": -513.7130033052885, + "logps/rejected": -545.9212979403409, + "loss": 0.0318, + "rewards/chosen": 7.185725872333233, + "rewards/margins": 15.249381378814057, + "rewards/rejected": -8.063655506480824, + "step": 791 + }, + { + "epoch": 0.19817340172651068, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36282480.0, + "logits/rejected": -39602952.0, + "logps/chosen": -421.6878662109375, + "logps/rejected": -653.6077880859375, + "loss": 0.0519, + "rewards/chosen": 8.16075325012207, + "rewards/margins": 17.460349082946777, + "rewards/rejected": -9.299595832824707, + "step": 792 + }, + { + "epoch": 0.19842362066808458, + "grad_norm": 19.375, + "kl": 3.8469817638397217, + "learning_rate": 5e-06, + "logits/chosen": -63231360.0, + "logits/rejected": -30232147.692307692, + "logps/chosen": -485.91104403409093, + "logps/rejected": -524.6609074519231, + "loss": 0.0929, + "rewards/chosen": 8.293641523881393, + "rewards/margins": 15.931041637500684, + "rewards/rejected": -7.637400113619291, + "step": 793 + }, + { + "epoch": 0.19867383960965845, + "grad_norm": 1.4375, + "kl": 1.10211181640625, + "learning_rate": 5e-06, + "logits/chosen": -81161390.54545455, + "logits/rejected": -41907657.84615385, + "logps/chosen": -481.1915838068182, + "logps/rejected": -525.1224834735577, + "loss": 0.0039, + "rewards/chosen": 6.877793051979759, + "rewards/margins": 14.500239552317801, + "rewards/rejected": -7.622446500338041, + "step": 794 + }, + { + "epoch": 0.19892405855123232, + "grad_norm": 11.9375, + "kl": 1.7197463512420654, + "learning_rate": 5e-06, + "logits/chosen": -57853036.307692304, + "logits/rejected": -49966731.63636363, + "logps/chosen": -400.5968674879808, + "logps/rejected": -671.5992542613636, + "loss": 0.0389, + "rewards/chosen": 7.59848139836238, + "rewards/margins": 17.708185155908545, + "rewards/rejected": -10.109703757546164, + "step": 795 + }, + { + "epoch": 0.1991742774928062, + "grad_norm": 1.9921875, + "kl": 6.740237236022949, + "learning_rate": 5e-06, + "logits/chosen": -70098694.4, + "logits/rejected": -53354130.28571428, + "logps/chosen": -486.557275390625, + "logps/rejected": -710.7572544642857, + "loss": 0.0144, + "rewards/chosen": 8.284804534912109, + "rewards/margins": 21.42544915335519, + "rewards/rejected": -13.14064461844308, + "step": 796 + }, + { + "epoch": 0.1994244964343801, + "grad_norm": 9.0, + "kl": 1.36517333984375, + "learning_rate": 5e-06, + "logits/chosen": -51640822.15384615, + "logits/rejected": -39425210.18181818, + "logps/chosen": -366.5441706730769, + "logps/rejected": -533.6463068181819, + "loss": 0.0431, + "rewards/chosen": 6.539581298828125, + "rewards/margins": 17.53961181640625, + "rewards/rejected": -11.000030517578125, + "step": 797 + }, + { + "epoch": 0.19967471537595397, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48864339.692307696, + "logits/rejected": -44865262.54545455, + "logps/chosen": -343.4710036057692, + "logps/rejected": -601.4108664772727, + "loss": 0.0712, + "rewards/chosen": 4.839988708496094, + "rewards/margins": 13.800580804998225, + "rewards/rejected": -8.96059209650213, + "step": 798 + }, + { + "epoch": 0.19992493431752784, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47303571.2, + "logits/rejected": -47950340.571428575, + "logps/chosen": -351.1532470703125, + "logps/rejected": -664.36328125, + "loss": 0.0467, + "rewards/chosen": 6.578648376464844, + "rewards/margins": 16.07303641183036, + "rewards/rejected": -9.494388035365514, + "step": 799 + }, + { + "epoch": 0.2001751532591017, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69996309.33333333, + "logits/rejected": -47352160.0, + "logps/chosen": -408.6333414713542, + "logps/rejected": -636.7174886067709, + "loss": 0.0456, + "rewards/chosen": 5.707289377848308, + "rewards/margins": 13.65249252319336, + "rewards/rejected": -7.945203145345052, + "step": 800 + }, + { + "epoch": 0.20042537220067558, + "grad_norm": 13.6875, + "kl": 0.8703645467758179, + "learning_rate": 5e-06, + "logits/chosen": -42387300.0, + "logits/rejected": -18186752.0, + "logps/chosen": -338.995361328125, + "logps/rejected": -379.30133056640625, + "loss": 0.0741, + "rewards/chosen": 6.07996129989624, + "rewards/margins": 13.174275398254395, + "rewards/rejected": -7.094314098358154, + "step": 801 + }, + { + "epoch": 0.20067559114224948, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49454641.23076923, + "logits/rejected": -45453466.18181818, + "logps/chosen": -344.4498948317308, + "logps/rejected": -545.4881924715909, + "loss": 0.0363, + "rewards/chosen": 5.379285959097055, + "rewards/margins": 16.756572696712468, + "rewards/rejected": -11.377286737615412, + "step": 802 + }, + { + "epoch": 0.20092581008382335, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59304434.28571428, + "logits/rejected": -29780595.2, + "logps/chosen": -328.17801339285717, + "logps/rejected": -540.84912109375, + "loss": 0.0697, + "rewards/chosen": 5.914222717285156, + "rewards/margins": 18.02959747314453, + "rewards/rejected": -12.115374755859374, + "step": 803 + }, + { + "epoch": 0.20117602902539722, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66439744.0, + "logits/rejected": -28480691.692307692, + "logps/chosen": -414.01957563920456, + "logps/rejected": -547.6895282451923, + "loss": 0.0246, + "rewards/chosen": 7.101613825017756, + "rewards/margins": 18.80237723397208, + "rewards/rejected": -11.700763408954327, + "step": 804 + }, + { + "epoch": 0.2014262479669711, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26977117.53846154, + "logits/rejected": -35117533.09090909, + "logps/chosen": -387.91650390625, + "logps/rejected": -524.9378107244319, + "loss": 0.081, + "rewards/chosen": 5.092916048490084, + "rewards/margins": 15.22620642602027, + "rewards/rejected": -10.133290377530185, + "step": 805 + }, + { + "epoch": 0.201676466908545, + "grad_norm": 15.1875, + "kl": 0.31385931372642517, + "learning_rate": 5e-06, + "logits/chosen": -46441126.4, + "logits/rejected": -43305436.44444445, + "logps/chosen": -359.1406575520833, + "logps/rejected": -717.1804470486111, + "loss": 0.0661, + "rewards/chosen": 6.0799204508463545, + "rewards/margins": 16.787812127007378, + "rewards/rejected": -10.707891676161024, + "step": 806 + }, + { + "epoch": 0.20192668585011886, + "grad_norm": 6.375, + "kl": 5.913142204284668, + "learning_rate": 5e-06, + "logits/chosen": -40712609.88235294, + "logits/rejected": -85484982.85714285, + "logps/chosen": -331.26220703125, + "logps/rejected": -697.1421595982143, + "loss": 0.0534, + "rewards/chosen": 5.862836052389706, + "rewards/margins": 15.491141760048745, + "rewards/rejected": -9.62830570765904, + "step": 807 + }, + { + "epoch": 0.20217690479169273, + "grad_norm": 13.125, + "kl": 2.1432228088378906, + "learning_rate": 5e-06, + "logits/chosen": -35160153.14285714, + "logits/rejected": -28466432.0, + "logps/chosen": -454.322021484375, + "logps/rejected": -456.8068359375, + "loss": 0.0554, + "rewards/chosen": 6.256374904087612, + "rewards/margins": 15.419541713169643, + "rewards/rejected": -9.163166809082032, + "step": 808 + }, + { + "epoch": 0.2024271237332666, + "grad_norm": 7.3125, + "kl": 2.2905538082122803, + "learning_rate": 5e-06, + "logits/chosen": -50495085.71428572, + "logits/rejected": -33732294.4, + "logps/chosen": -382.4754115513393, + "logps/rejected": -485.15234375, + "loss": 0.0485, + "rewards/chosen": 6.194065638950893, + "rewards/margins": 15.424795314243863, + "rewards/rejected": -9.230729675292968, + "step": 809 + }, + { + "epoch": 0.20267734267484047, + "grad_norm": 10.0625, + "kl": 12.888921737670898, + "learning_rate": 5e-06, + "logits/chosen": -54909376.0, + "logits/rejected": -93817309.0909091, + "logps/chosen": -510.6948993389423, + "logps/rejected": -764.3355823863636, + "loss": 0.0188, + "rewards/chosen": 7.7440032958984375, + "rewards/margins": 19.01056601784446, + "rewards/rejected": -11.266562721946023, + "step": 810 + }, + { + "epoch": 0.20292756161641437, + "grad_norm": 11.875, + "kl": 0.23932330310344696, + "learning_rate": 5e-06, + "logits/chosen": -44965632.0, + "logits/rejected": -41088272.0, + "logps/chosen": -286.18516322544644, + "logps/rejected": -505.327978515625, + "loss": 0.0831, + "rewards/chosen": 4.751211983816964, + "rewards/margins": 13.533659798758372, + "rewards/rejected": -8.782447814941406, + "step": 811 + }, + { + "epoch": 0.20317778055798824, + "grad_norm": 13.625, + "kl": 5.660660743713379, + "learning_rate": 5e-06, + "logits/chosen": -43158245.05263158, + "logits/rejected": -28269974.4, + "logps/chosen": -368.97239925986844, + "logps/rejected": -602.176611328125, + "loss": 0.107, + "rewards/chosen": 5.408772117213199, + "rewards/margins": 14.234994346217107, + "rewards/rejected": -8.826222229003907, + "step": 812 + }, + { + "epoch": 0.2034279994995621, + "grad_norm": 19.125, + "kl": 1.3985570669174194, + "learning_rate": 5e-06, + "logits/chosen": -13799566.666666666, + "logits/rejected": -50878677.333333336, + "logps/chosen": -307.6500244140625, + "logps/rejected": -757.4918619791666, + "loss": 0.0697, + "rewards/chosen": 5.658947626749675, + "rewards/margins": 14.656039555867512, + "rewards/rejected": -8.997091929117838, + "step": 813 + }, + { + "epoch": 0.20367821844113598, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49396320.0, + "logits/rejected": -29117410.666666668, + "logps/chosen": -405.6623942057292, + "logps/rejected": -581.714111328125, + "loss": 0.0388, + "rewards/chosen": 7.830986022949219, + "rewards/margins": 16.280290603637695, + "rewards/rejected": -8.449304580688477, + "step": 814 + }, + { + "epoch": 0.20392843738270988, + "grad_norm": 21.25, + "kl": 6.440698623657227, + "learning_rate": 5e-06, + "logits/chosen": -61339304.0, + "logits/rejected": -34033212.0, + "logps/chosen": -381.3726501464844, + "logps/rejected": -487.78515625, + "loss": 0.0597, + "rewards/chosen": 6.295356750488281, + "rewards/margins": 13.154322624206543, + "rewards/rejected": -6.858965873718262, + "step": 815 + }, + { + "epoch": 0.20417865632428375, + "grad_norm": 16.75, + "kl": 0.2847709655761719, + "learning_rate": 5e-06, + "logits/chosen": -49366925.71428572, + "logits/rejected": -53796761.6, + "logps/chosen": -377.70595005580356, + "logps/rejected": -507.280810546875, + "loss": 0.058, + "rewards/chosen": 5.664988926478794, + "rewards/margins": 14.688199833461216, + "rewards/rejected": -9.023210906982422, + "step": 816 + }, + { + "epoch": 0.20442887526585762, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51120056.88888889, + "logits/rejected": -30452398.933333334, + "logps/chosen": -351.32188585069446, + "logps/rejected": -631.512890625, + "loss": 0.0578, + "rewards/chosen": 5.218953026665582, + "rewards/margins": 12.556123436821832, + "rewards/rejected": -7.33717041015625, + "step": 817 + }, + { + "epoch": 0.2046790942074315, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40052777.6, + "logits/rejected": -55822555.428571425, + "logps/chosen": -408.3108642578125, + "logps/rejected": -545.50537109375, + "loss": 0.0356, + "rewards/chosen": 6.1724708557128904, + "rewards/margins": 15.47464828491211, + "rewards/rejected": -9.302177429199219, + "step": 818 + }, + { + "epoch": 0.2049293131490054, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59217819.428571425, + "logits/rejected": -35902732.8, + "logps/chosen": -402.3795689174107, + "logps/rejected": -592.60673828125, + "loss": 0.0565, + "rewards/chosen": 6.2483640398297995, + "rewards/margins": 16.120682307652064, + "rewards/rejected": -9.872318267822266, + "step": 819 + }, + { + "epoch": 0.20517953209057926, + "grad_norm": 9.1875, + "kl": 3.18426775932312, + "learning_rate": 5e-06, + "logits/chosen": -48457554.28571428, + "logits/rejected": -43792019.2, + "logps/chosen": -375.62681361607144, + "logps/rejected": -590.33076171875, + "loss": 0.0856, + "rewards/chosen": 6.395714351109096, + "rewards/margins": 15.888672419956752, + "rewards/rejected": -9.492958068847656, + "step": 820 + }, + { + "epoch": 0.20542975103215314, + "grad_norm": 14.3125, + "kl": 6.878249168395996, + "learning_rate": 5e-06, + "logits/chosen": -63146906.666666664, + "logits/rejected": -19526541.333333332, + "logps/chosen": -317.61997477213544, + "logps/rejected": -612.055908203125, + "loss": 0.0913, + "rewards/chosen": 4.943337440490723, + "rewards/margins": 15.065688769022623, + "rewards/rejected": -10.1223513285319, + "step": 821 + }, + { + "epoch": 0.205679969973727, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54319974.4, + "logits/rejected": -57623954.28571428, + "logps/chosen": -406.001220703125, + "logps/rejected": -621.3896484375, + "loss": 0.0278, + "rewards/chosen": 5.587949752807617, + "rewards/margins": 13.978311538696289, + "rewards/rejected": -8.390361785888672, + "step": 822 + }, + { + "epoch": 0.20593018891530088, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45855674.666666664, + "logits/rejected": -53418816.0, + "logps/chosen": -474.3279622395833, + "logps/rejected": -620.4185384114584, + "loss": 0.0533, + "rewards/chosen": 5.96588134765625, + "rewards/margins": 16.95417912801107, + "rewards/rejected": -10.988297780354818, + "step": 823 + }, + { + "epoch": 0.20618040785687478, + "grad_norm": 7.71875, + "kl": 0.8577841520309448, + "learning_rate": 5e-06, + "logits/chosen": -41662215.11111111, + "logits/rejected": -35238327.46666667, + "logps/chosen": -383.3494466145833, + "logps/rejected": -397.70712890625, + "loss": 0.0346, + "rewards/chosen": 7.946407741970486, + "rewards/margins": 15.131324428982204, + "rewards/rejected": -7.184916687011719, + "step": 824 + }, + { + "epoch": 0.20643062679844865, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65579238.4, + "logits/rejected": -44685193.14285714, + "logps/chosen": -377.4366943359375, + "logps/rejected": -494.9213169642857, + "loss": 0.0418, + "rewards/chosen": 5.857852935791016, + "rewards/margins": 13.956713540213448, + "rewards/rejected": -8.098860604422432, + "step": 825 + }, + { + "epoch": 0.20668084574002252, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37624906.666666664, + "logits/rejected": 91468125.86666666, + "logps/chosen": -349.5646701388889, + "logps/rejected": -633.4867838541667, + "loss": 0.0468, + "rewards/chosen": 5.625090705023871, + "rewards/margins": 16.80655025906033, + "rewards/rejected": -11.181459554036458, + "step": 826 + }, + { + "epoch": 0.2069310646815964, + "grad_norm": 7.75, + "kl": 0.3838348388671875, + "learning_rate": 5e-06, + "logits/chosen": -11999882.181818182, + "logits/rejected": -29316740.923076924, + "logps/chosen": -403.7110706676136, + "logps/rejected": -361.85802283653845, + "loss": 0.0359, + "rewards/chosen": 5.75521503795277, + "rewards/margins": 11.892577004599405, + "rewards/rejected": -6.137361966646635, + "step": 827 + }, + { + "epoch": 0.2071812836231703, + "grad_norm": 16.625, + "kl": 7.2646613121032715, + "learning_rate": 5e-06, + "logits/chosen": -79085479.38461539, + "logits/rejected": -53273629.09090909, + "logps/chosen": -479.7726862980769, + "logps/rejected": -563.8832563920455, + "loss": 0.0301, + "rewards/chosen": 7.126521770770733, + "rewards/margins": 16.66407503781619, + "rewards/rejected": -9.537553267045455, + "step": 828 + }, + { + "epoch": 0.20743150256474416, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74224710.4, + "logits/rejected": -35041188.571428575, + "logps/chosen": -335.867041015625, + "logps/rejected": -397.70703125, + "loss": 0.0439, + "rewards/chosen": 5.500564193725586, + "rewards/margins": 14.455432510375976, + "rewards/rejected": -8.95486831665039, + "step": 829 + }, + { + "epoch": 0.20768172150631803, + "grad_norm": 6.3125, + "kl": 4.6465630531311035, + "learning_rate": 5e-06, + "logits/chosen": -42577053.538461536, + "logits/rejected": -60188904.72727273, + "logps/chosen": -517.484375, + "logps/rejected": -514.5040838068181, + "loss": 0.0474, + "rewards/chosen": 5.921867370605469, + "rewards/margins": 15.534251126376065, + "rewards/rejected": -9.612383755770596, + "step": 830 + }, + { + "epoch": 0.2079319404478919, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63994416.0, + "logits/rejected": -41208864.0, + "logps/chosen": -432.72100830078125, + "logps/rejected": -644.6954956054688, + "loss": 0.0331, + "rewards/chosen": 6.187349796295166, + "rewards/margins": 21.06824827194214, + "rewards/rejected": -14.880898475646973, + "step": 831 + }, + { + "epoch": 0.20818215938946577, + "grad_norm": 4.59375, + "kl": 3.961434841156006, + "learning_rate": 5e-06, + "logits/chosen": -33809644.307692304, + "logits/rejected": -54601547.63636363, + "logps/chosen": -448.92540564903845, + "logps/rejected": -667.9743874289773, + "loss": 0.0104, + "rewards/chosen": 6.512371356670673, + "rewards/margins": 21.6511167512907, + "rewards/rejected": -15.13874539462003, + "step": 832 + }, + { + "epoch": 0.20843237833103967, + "grad_norm": 10.375, + "kl": 5.6155829429626465, + "learning_rate": 5e-06, + "logits/chosen": -70941508.26666667, + "logits/rejected": -49239608.88888889, + "logps/chosen": -432.57607421875, + "logps/rejected": -613.7744140625, + "loss": 0.0512, + "rewards/chosen": 6.954598999023437, + "rewards/margins": 16.32216033935547, + "rewards/rejected": -9.367561340332031, + "step": 833 + }, + { + "epoch": 0.20868259727261354, + "grad_norm": 8.4375, + "kl": 1.0676867961883545, + "learning_rate": 5e-06, + "logits/chosen": -54695699.2, + "logits/rejected": -44111465.14285714, + "logps/chosen": -505.381494140625, + "logps/rejected": -486.27535574776783, + "loss": 0.0167, + "rewards/chosen": 7.41577377319336, + "rewards/margins": 16.328859710693358, + "rewards/rejected": -8.9130859375, + "step": 834 + }, + { + "epoch": 0.2089328162141874, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36743656.0, + "logits/rejected": -37833946.666666664, + "logps/chosen": -445.3172200520833, + "logps/rejected": -498.4071044921875, + "loss": 0.0403, + "rewards/chosen": 5.983867645263672, + "rewards/margins": 16.58716901143392, + "rewards/rejected": -10.603301366170248, + "step": 835 + }, + { + "epoch": 0.20918303515576128, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38590470.4, + "logits/rejected": -47026797.71428572, + "logps/chosen": -367.522900390625, + "logps/rejected": -475.99379185267856, + "loss": 0.0558, + "rewards/chosen": 4.908218765258789, + "rewards/margins": 16.21550897870745, + "rewards/rejected": -11.307290213448661, + "step": 836 + }, + { + "epoch": 0.20943325409733518, + "grad_norm": 16.25, + "kl": 1.3447717428207397, + "learning_rate": 5e-06, + "logits/chosen": -33159101.09090909, + "logits/rejected": -37479372.307692304, + "logps/chosen": -363.07852450284093, + "logps/rejected": -421.51615084134613, + "loss": 0.0755, + "rewards/chosen": 4.89566386829723, + "rewards/margins": 13.38884729772181, + "rewards/rejected": -8.49318342942458, + "step": 837 + }, + { + "epoch": 0.20968347303890905, + "grad_norm": 7.875, + "kl": 2.8279104232788086, + "learning_rate": 5e-06, + "logits/chosen": -63577109.333333336, + "logits/rejected": -23843453.866666667, + "logps/chosen": -535.0646701388889, + "logps/rejected": -451.5923828125, + "loss": 0.0314, + "rewards/chosen": 9.165021260579428, + "rewards/margins": 18.087744649251302, + "rewards/rejected": -8.922723388671875, + "step": 838 + }, + { + "epoch": 0.20993369198048292, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72973677.71428572, + "logits/rejected": -20600211.2, + "logps/chosen": -393.15189034598217, + "logps/rejected": -617.3955078125, + "loss": 0.0299, + "rewards/chosen": 6.266732352120536, + "rewards/margins": 15.502330344063896, + "rewards/rejected": -9.23559799194336, + "step": 839 + }, + { + "epoch": 0.2101839109220568, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64426212.571428575, + "logits/rejected": -54891276.8, + "logps/chosen": -400.4794921875, + "logps/rejected": -513.963623046875, + "loss": 0.0446, + "rewards/chosen": 6.538800920758929, + "rewards/margins": 15.272718157087054, + "rewards/rejected": -8.733917236328125, + "step": 840 + }, + { + "epoch": 0.21043412986363066, + "grad_norm": 13.5625, + "kl": 8.454451560974121, + "learning_rate": 5e-06, + "logits/chosen": -91480469.33333333, + "logits/rejected": -54089498.666666664, + "logps/chosen": -552.0216471354166, + "logps/rejected": -524.0720621744791, + "loss": 0.0815, + "rewards/chosen": 6.681446075439453, + "rewards/margins": 15.04843266805013, + "rewards/rejected": -8.366986592610678, + "step": 841 + }, + { + "epoch": 0.21068434880520456, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43597897.84615385, + "logits/rejected": -50333736.72727273, + "logps/chosen": -471.59119591346155, + "logps/rejected": -776.5814098011364, + "loss": 0.0171, + "rewards/chosen": 7.979854290301983, + "rewards/margins": 19.372585003192608, + "rewards/rejected": -11.392730712890625, + "step": 842 + }, + { + "epoch": 0.21093456774677843, + "grad_norm": 20.5, + "kl": 1.9040069580078125, + "learning_rate": 5e-06, + "logits/chosen": -67600106.66666667, + "logits/rejected": -49108629.333333336, + "logps/chosen": -405.8778483072917, + "logps/rejected": -359.3288981119792, + "loss": 0.0445, + "rewards/chosen": 6.412080764770508, + "rewards/margins": 13.216399510701496, + "rewards/rejected": -6.804318745930989, + "step": 843 + }, + { + "epoch": 0.2111847866883523, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53075444.36363637, + "logits/rejected": -53297467.07692308, + "logps/chosen": -466.5056818181818, + "logps/rejected": -615.8974233774038, + "loss": 0.0369, + "rewards/chosen": 7.342480746182528, + "rewards/margins": 17.85110430950885, + "rewards/rejected": -10.508623563326323, + "step": 844 + }, + { + "epoch": 0.21143500562992618, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46289424.0, + "logits/rejected": 122063264.0, + "logps/chosen": -417.7291259765625, + "logps/rejected": -580.0477294921875, + "loss": 0.0336, + "rewards/chosen": 4.884383678436279, + "rewards/margins": 15.343745708465576, + "rewards/rejected": -10.459362030029297, + "step": 845 + }, + { + "epoch": 0.21168522457150007, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76412326.4, + "logits/rejected": -68620544.0, + "logps/chosen": -512.96103515625, + "logps/rejected": -427.666015625, + "loss": 0.0469, + "rewards/chosen": 5.886021423339844, + "rewards/margins": 14.041544451032365, + "rewards/rejected": -8.155523027692523, + "step": 846 + }, + { + "epoch": 0.21193544351307395, + "grad_norm": 17.25, + "kl": 16.615293502807617, + "learning_rate": 5e-06, + "logits/chosen": -68891504.94117647, + "logits/rejected": -69892210.28571428, + "logps/chosen": -387.6879308363971, + "logps/rejected": -434.17006138392856, + "loss": 0.1837, + "rewards/chosen": 6.362054263844209, + "rewards/margins": 15.139738066857602, + "rewards/rejected": -8.777683803013392, + "step": 847 + }, + { + "epoch": 0.21218566245464782, + "grad_norm": 11.125, + "kl": 3.3448550701141357, + "learning_rate": 5e-06, + "logits/chosen": -45499528.53333333, + "logits/rejected": -25999232.0, + "logps/chosen": -401.82985026041666, + "logps/rejected": -768.7527669270834, + "loss": 0.0203, + "rewards/chosen": 6.305695597330729, + "rewards/margins": 20.066478474934897, + "rewards/rejected": -13.760782877604166, + "step": 848 + }, + { + "epoch": 0.2124358813962217, + "grad_norm": 4.78125, + "kl": 0.7335942983627319, + "learning_rate": 5e-06, + "logits/chosen": -63022805.333333336, + "logits/rejected": -44460765.333333336, + "logps/chosen": -422.1632486979167, + "logps/rejected": -604.6455891927084, + "loss": 0.0242, + "rewards/chosen": 7.583875020345052, + "rewards/margins": 16.90160306294759, + "rewards/rejected": -9.317728042602539, + "step": 849 + }, + { + "epoch": 0.21268610033779556, + "grad_norm": 12.9375, + "kl": 0.8178736567497253, + "learning_rate": 5e-06, + "logits/chosen": -32316640.0, + "logits/rejected": -47145015.27272727, + "logps/chosen": -303.7578312800481, + "logps/rejected": -715.5393732244319, + "loss": 0.0536, + "rewards/chosen": 5.149880629319411, + "rewards/margins": 14.817243189244838, + "rewards/rejected": -9.667362559925426, + "step": 850 + }, + { + "epoch": 0.21293631927936946, + "grad_norm": 11.4375, + "kl": 2.490816831588745, + "learning_rate": 5e-06, + "logits/chosen": -50717082.666666664, + "logits/rejected": -25829136.0, + "logps/chosen": -440.0017496744792, + "logps/rejected": -454.2618001302083, + "loss": 0.0728, + "rewards/chosen": 6.279614766438802, + "rewards/margins": 14.91478157043457, + "rewards/rejected": -8.635166803995768, + "step": 851 + }, + { + "epoch": 0.21318653822094333, + "grad_norm": 9.5625, + "kl": 1.5940736532211304, + "learning_rate": 5e-06, + "logits/chosen": -41784200.0, + "logits/rejected": -48387576.0, + "logps/chosen": -398.4791259765625, + "logps/rejected": -440.3336181640625, + "loss": 0.0336, + "rewards/chosen": 6.844158172607422, + "rewards/margins": 14.78076982498169, + "rewards/rejected": -7.936611652374268, + "step": 852 + }, + { + "epoch": 0.2134367571625172, + "grad_norm": 27.25, + "kl": 15.438124656677246, + "learning_rate": 5e-06, + "logits/chosen": -38035357.86666667, + "logits/rejected": 93770844.44444445, + "logps/chosen": -443.3076497395833, + "logps/rejected": -593.2223307291666, + "loss": 0.1567, + "rewards/chosen": 7.630831909179688, + "rewards/margins": 14.684690772162543, + "rewards/rejected": -7.053858862982856, + "step": 853 + }, + { + "epoch": 0.21368697610409107, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36225346.28571428, + "logits/rejected": -45393803.294117644, + "logps/chosen": -408.18858119419644, + "logps/rejected": -557.3362821691177, + "loss": 0.0229, + "rewards/chosen": 8.157574244907924, + "rewards/margins": 16.173671369793034, + "rewards/rejected": -8.01609712488511, + "step": 854 + }, + { + "epoch": 0.21393719504566497, + "grad_norm": 16.75, + "kl": 1.0470657348632812, + "learning_rate": 5e-06, + "logits/chosen": -58416372.36363637, + "logits/rejected": -51174183.384615384, + "logps/chosen": -496.31503018465907, + "logps/rejected": -636.4634915865385, + "loss": 0.0516, + "rewards/chosen": 7.375865589488637, + "rewards/margins": 16.24125724739128, + "rewards/rejected": -8.865391657902645, + "step": 855 + }, + { + "epoch": 0.21418741398723884, + "grad_norm": 9.875, + "kl": 6.740177154541016, + "learning_rate": 5e-06, + "logits/chosen": -61907012.92307692, + "logits/rejected": -56968302.54545455, + "logps/chosen": -440.6944110576923, + "logps/rejected": -592.2461381392045, + "loss": 0.035, + "rewards/chosen": 7.263149554912861, + "rewards/margins": 18.913061608801357, + "rewards/rejected": -11.649912053888494, + "step": 856 + }, + { + "epoch": 0.2144376329288127, + "grad_norm": 13.3125, + "kl": 2.8779256343841553, + "learning_rate": 5e-06, + "logits/chosen": -29195834.181818184, + "logits/rejected": -43359773.538461536, + "logps/chosen": -240.44322620738637, + "logps/rejected": -551.4405048076923, + "loss": 0.0979, + "rewards/chosen": 4.928837862881747, + "rewards/margins": 12.499620450960172, + "rewards/rejected": -7.570782588078425, + "step": 857 + }, + { + "epoch": 0.21468785187038658, + "grad_norm": 21.375, + "kl": 3.101168394088745, + "learning_rate": 5e-06, + "logits/chosen": -40155383.46666667, + "logits/rejected": -41494887.11111111, + "logps/chosen": -339.31598307291665, + "logps/rejected": -500.6658528645833, + "loss": 0.1539, + "rewards/chosen": 4.952131144205729, + "rewards/margins": 12.43179711235894, + "rewards/rejected": -7.479665968153212, + "step": 858 + }, + { + "epoch": 0.21493807081196045, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37480608.0, + "logits/rejected": -24109422.933333334, + "logps/chosen": -476.09657118055554, + "logps/rejected": -429.8720703125, + "loss": 0.039, + "rewards/chosen": 7.48978508843316, + "rewards/margins": 15.800205824110243, + "rewards/rejected": -8.310420735677083, + "step": 859 + }, + { + "epoch": 0.21518828975353435, + "grad_norm": 6.09375, + "kl": 1.8882615566253662, + "learning_rate": 5e-06, + "logits/chosen": -34047748.92307692, + "logits/rejected": -49096768.0, + "logps/chosen": -265.89954552283655, + "logps/rejected": -702.7535955255681, + "loss": 0.0673, + "rewards/chosen": 4.728611872746394, + "rewards/margins": 17.00692941092111, + "rewards/rejected": -12.278317538174717, + "step": 860 + }, + { + "epoch": 0.21543850869510822, + "grad_norm": 8.3125, + "kl": 3.954415798187256, + "learning_rate": 5e-06, + "logits/chosen": -44509050.666666664, + "logits/rejected": -62676245.333333336, + "logps/chosen": -489.1415201822917, + "logps/rejected": -654.4629720052084, + "loss": 0.0385, + "rewards/chosen": 8.263253529866537, + "rewards/margins": 18.790645599365234, + "rewards/rejected": -10.527392069498697, + "step": 861 + }, + { + "epoch": 0.2156887276366821, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35458925.333333336, + "logits/rejected": -49140213.333333336, + "logps/chosen": -287.7821451822917, + "logps/rejected": -480.912841796875, + "loss": 0.0332, + "rewards/chosen": 5.380205154418945, + "rewards/margins": 13.390286763509115, + "rewards/rejected": -8.01008160909017, + "step": 862 + }, + { + "epoch": 0.21593894657825596, + "grad_norm": 20.625, + "kl": 11.090689659118652, + "learning_rate": 5e-06, + "logits/chosen": -69584402.28571428, + "logits/rejected": -55375321.6, + "logps/chosen": -552.8187081473214, + "logps/rejected": -499.70009765625, + "loss": 0.0499, + "rewards/chosen": 8.731765747070312, + "rewards/margins": 16.728807067871095, + "rewards/rejected": -7.997041320800781, + "step": 863 + }, + { + "epoch": 0.21618916551982986, + "grad_norm": 17.375, + "kl": 10.584943771362305, + "learning_rate": 5e-06, + "logits/chosen": -66684266.666666664, + "logits/rejected": -27548413.333333332, + "logps/chosen": -357.3514811197917, + "logps/rejected": -520.56103515625, + "loss": 0.144, + "rewards/chosen": 6.678855895996094, + "rewards/margins": 15.041339874267578, + "rewards/rejected": -8.362483978271484, + "step": 864 + }, + { + "epoch": 0.21643938446140373, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48434244.571428575, + "logits/rejected": -45303168.0, + "logps/chosen": -338.2197963169643, + "logps/rejected": -618.5232421875, + "loss": 0.0694, + "rewards/chosen": 5.867035457066128, + "rewards/margins": 15.076931544712611, + "rewards/rejected": -9.209896087646484, + "step": 865 + }, + { + "epoch": 0.2166896034029776, + "grad_norm": 20.125, + "kl": 6.285125255584717, + "learning_rate": 5e-06, + "logits/chosen": -42835514.666666664, + "logits/rejected": -60667349.333333336, + "logps/chosen": -536.45751953125, + "logps/rejected": -662.5782877604166, + "loss": 0.0821, + "rewards/chosen": 8.507424672444662, + "rewards/margins": 17.059466679890953, + "rewards/rejected": -8.552042007446289, + "step": 866 + }, + { + "epoch": 0.21693982234455148, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51094688.0, + "logits/rejected": -14992970.666666666, + "logps/chosen": -414.3006998697917, + "logps/rejected": -399.5585123697917, + "loss": 0.0872, + "rewards/chosen": 5.1931107838948565, + "rewards/margins": 11.766401290893555, + "rewards/rejected": -6.573290506998698, + "step": 867 + }, + { + "epoch": 0.21719004128612535, + "grad_norm": 6.0, + "kl": 2.2430219650268555, + "learning_rate": 5e-06, + "logits/chosen": -55216610.90909091, + "logits/rejected": -36635756.307692304, + "logps/chosen": -386.10751065340907, + "logps/rejected": -446.90685096153845, + "loss": 0.0384, + "rewards/chosen": 6.514386263760653, + "rewards/margins": 12.784233253318947, + "rewards/rejected": -6.269846989558293, + "step": 868 + }, + { + "epoch": 0.21744026022769924, + "grad_norm": 8.5625, + "kl": 7.486400127410889, + "learning_rate": 5e-06, + "logits/chosen": -52908590.54545455, + "logits/rejected": -9853538.461538462, + "logps/chosen": -357.14228959517044, + "logps/rejected": -739.2422626201923, + "loss": 0.0766, + "rewards/chosen": 6.715452714399858, + "rewards/margins": 18.655198103898055, + "rewards/rejected": -11.939745389498198, + "step": 869 + }, + { + "epoch": 0.21769047916927312, + "grad_norm": 2.296875, + "kl": 7.726406097412109, + "learning_rate": 5e-06, + "logits/chosen": -68999123.6923077, + "logits/rejected": -39331319.27272727, + "logps/chosen": -524.8665114182693, + "logps/rejected": -500.24636008522725, + "loss": 0.005, + "rewards/chosen": 8.946279672475962, + "rewards/margins": 18.330212999890733, + "rewards/rejected": -9.383933327414773, + "step": 870 + }, + { + "epoch": 0.217940698110847, + "grad_norm": 8.3125, + "kl": 2.492487668991089, + "learning_rate": 5e-06, + "logits/chosen": -39259072.0, + "logits/rejected": -44220557.71428572, + "logps/chosen": -421.684375, + "logps/rejected": -674.5276227678571, + "loss": 0.0204, + "rewards/chosen": 7.777132415771485, + "rewards/margins": 18.12573689052037, + "rewards/rejected": -10.348604474748884, + "step": 871 + }, + { + "epoch": 0.21819091705242086, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70753427.2, + "logits/rejected": -62200672.0, + "logps/chosen": -350.109130859375, + "logps/rejected": -551.241943359375, + "loss": 0.039, + "rewards/chosen": 5.932463073730469, + "rewards/margins": 15.67813720703125, + "rewards/rejected": -9.745674133300781, + "step": 872 + }, + { + "epoch": 0.21844113599399476, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57096768.0, + "logits/rejected": -26418562.285714287, + "logps/chosen": -392.160498046875, + "logps/rejected": -668.8669084821429, + "loss": 0.0061, + "rewards/chosen": 6.697810363769531, + "rewards/margins": 19.36263253348214, + "rewards/rejected": -12.664822169712611, + "step": 873 + }, + { + "epoch": 0.21869135493556863, + "grad_norm": 29.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32409620.363636363, + "logits/rejected": -46671035.07692308, + "logps/chosen": -299.3151189630682, + "logps/rejected": -512.1787109375, + "loss": 0.0735, + "rewards/chosen": 5.699134826660156, + "rewards/margins": 12.853279700646034, + "rewards/rejected": -7.154144873985877, + "step": 874 + }, + { + "epoch": 0.2189415738771425, + "grad_norm": 13.0625, + "kl": 2.6314876079559326, + "learning_rate": 5e-06, + "logits/chosen": -23004838.85714286, + "logits/rejected": -37200979.2, + "logps/chosen": -337.72593470982144, + "logps/rejected": -511.9251953125, + "loss": 0.0794, + "rewards/chosen": 5.1929811750139505, + "rewards/margins": 14.745045689174106, + "rewards/rejected": -9.552064514160156, + "step": 875 + }, + { + "epoch": 0.21919179281871637, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37902272.0, + "logits/rejected": -31997494.153846152, + "logps/chosen": -285.9669744318182, + "logps/rejected": -628.6466346153846, + "loss": 0.0439, + "rewards/chosen": 4.861722772771662, + "rewards/margins": 16.22666840453248, + "rewards/rejected": -11.364945631760817, + "step": 876 + }, + { + "epoch": 0.21944201176029027, + "grad_norm": 8.6875, + "kl": 7.84165096282959, + "learning_rate": 5e-06, + "logits/chosen": -28804657.230769232, + "logits/rejected": -28111429.818181816, + "logps/chosen": -428.36099008413464, + "logps/rejected": -415.67578125, + "loss": 0.078, + "rewards/chosen": 6.948799720177283, + "rewards/margins": 15.052854257863718, + "rewards/rejected": -8.104054537686435, + "step": 877 + }, + { + "epoch": 0.21969223070186414, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46892076.307692304, + "logits/rejected": -50200855.27272727, + "logps/chosen": -375.0295973557692, + "logps/rejected": -546.7670010653409, + "loss": 0.053, + "rewards/chosen": 7.37348879300631, + "rewards/margins": 14.722526656997788, + "rewards/rejected": -7.3490378639914775, + "step": 878 + }, + { + "epoch": 0.219942449643438, + "grad_norm": 3.015625, + "kl": 0.28276318311691284, + "learning_rate": 5e-06, + "logits/chosen": -35701813.333333336, + "logits/rejected": -33751249.06666667, + "logps/chosen": -544.9024522569445, + "logps/rejected": -461.6625, + "loss": 0.0193, + "rewards/chosen": 7.179185655381945, + "rewards/margins": 16.952802191840277, + "rewards/rejected": -9.773616536458333, + "step": 879 + }, + { + "epoch": 0.22019266858501188, + "grad_norm": 10.5625, + "kl": 5.377200126647949, + "learning_rate": 5e-06, + "logits/chosen": -51452160.0, + "logits/rejected": -48970146.90909091, + "logps/chosen": -417.5757587139423, + "logps/rejected": -558.3232421875, + "loss": 0.0537, + "rewards/chosen": 7.008513817420373, + "rewards/margins": 16.327812301529036, + "rewards/rejected": -9.319298484108664, + "step": 880 + }, + { + "epoch": 0.22044288752658575, + "grad_norm": 12.9375, + "kl": 0.08029492944478989, + "learning_rate": 5e-06, + "logits/chosen": -69940356.92307693, + "logits/rejected": -55195752.72727273, + "logps/chosen": -351.60486778846155, + "logps/rejected": -509.8441051136364, + "loss": 0.0734, + "rewards/chosen": 6.0399627685546875, + "rewards/margins": 14.784964821555398, + "rewards/rejected": -8.74500205300071, + "step": 881 + }, + { + "epoch": 0.22069310646815965, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61429606.4, + "logits/rejected": -29648416.0, + "logps/chosen": -308.7400390625, + "logps/rejected": -384.663818359375, + "loss": 0.0919, + "rewards/chosen": 4.954191080729166, + "rewards/margins": 13.243031480577258, + "rewards/rejected": -8.288840399848091, + "step": 882 + }, + { + "epoch": 0.22094332540973352, + "grad_norm": 10.25, + "kl": 0.22380320727825165, + "learning_rate": 5e-06, + "logits/chosen": -46709013.333333336, + "logits/rejected": -51381120.0, + "logps/chosen": -436.3108723958333, + "logps/rejected": -573.4994710286459, + "loss": 0.0192, + "rewards/chosen": 7.917387008666992, + "rewards/margins": 16.842842737833656, + "rewards/rejected": -8.925455729166666, + "step": 883 + }, + { + "epoch": 0.2211935443513074, + "grad_norm": 15.6875, + "kl": 7.116348743438721, + "learning_rate": 5e-06, + "logits/chosen": -100355675.42857143, + "logits/rejected": -63054739.2, + "logps/chosen": -393.7709263392857, + "logps/rejected": -600.83974609375, + "loss": 0.0716, + "rewards/chosen": 6.167784009660993, + "rewards/margins": 17.562579236711777, + "rewards/rejected": -11.394795227050782, + "step": 884 + }, + { + "epoch": 0.22144376329288126, + "grad_norm": 8.9375, + "kl": 0.2906290888786316, + "learning_rate": 5e-06, + "logits/chosen": -49279701.333333336, + "logits/rejected": -61520213.333333336, + "logps/chosen": -433.4287109375, + "logps/rejected": -671.3612630208333, + "loss": 0.0261, + "rewards/chosen": 8.179108513726128, + "rewards/margins": 20.204544915093315, + "rewards/rejected": -12.025436401367188, + "step": 885 + }, + { + "epoch": 0.22169398223445516, + "grad_norm": 3.71875, + "kl": 6.281719207763672, + "learning_rate": 5e-06, + "logits/chosen": -45588474.666666664, + "logits/rejected": -14956497.333333334, + "logps/chosen": -486.5383707682292, + "logps/rejected": -812.0821126302084, + "loss": 0.0052, + "rewards/chosen": 8.805744171142578, + "rewards/margins": 21.97202173868815, + "rewards/rejected": -13.166277567545572, + "step": 886 + }, + { + "epoch": 0.22194420117602903, + "grad_norm": 5.0625, + "kl": 0.26970165967941284, + "learning_rate": 5e-06, + "logits/chosen": -52520635.07692308, + "logits/rejected": -49551418.18181818, + "logps/chosen": -440.88882211538464, + "logps/rejected": -610.7002840909091, + "loss": 0.0227, + "rewards/chosen": 6.061122600848858, + "rewards/margins": 18.8501601319213, + "rewards/rejected": -12.789037531072443, + "step": 887 + }, + { + "epoch": 0.2221944201176029, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73012264.72727273, + "logits/rejected": -51247315.692307696, + "logps/chosen": -331.8717595880682, + "logps/rejected": -552.4230769230769, + "loss": 0.1024, + "rewards/chosen": 4.700930508700284, + "rewards/margins": 16.03623183457168, + "rewards/rejected": -11.335301325871395, + "step": 888 + }, + { + "epoch": 0.22244463905917677, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39887114.666666664, + "logits/rejected": -41164781.333333336, + "logps/chosen": -298.58392333984375, + "logps/rejected": -374.5581868489583, + "loss": 0.1278, + "rewards/chosen": 3.9807020823160806, + "rewards/margins": 11.933468500773111, + "rewards/rejected": -7.952766418457031, + "step": 889 + }, + { + "epoch": 0.22269485800075065, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65502884.571428575, + "logits/rejected": -40199883.294117644, + "logps/chosen": -483.1144321986607, + "logps/rejected": -476.2304113051471, + "loss": 0.014, + "rewards/chosen": 8.052347455705915, + "rewards/margins": 17.155099243676965, + "rewards/rejected": -9.102751787971048, + "step": 890 + }, + { + "epoch": 0.22294507694232454, + "grad_norm": 26.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54301969.06666667, + "logits/rejected": -79465927.1111111, + "logps/chosen": -337.78675130208336, + "logps/rejected": -579.8811848958334, + "loss": 0.0692, + "rewards/chosen": 5.206717936197917, + "rewards/margins": 13.366334194607205, + "rewards/rejected": -8.159616258409288, + "step": 891 + }, + { + "epoch": 0.22319529588389841, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74470363.42857143, + "logits/rejected": -10162648.470588235, + "logps/chosen": -439.63553292410717, + "logps/rejected": -639.7953814338235, + "loss": 0.0697, + "rewards/chosen": 5.8232863289969305, + "rewards/margins": 17.13429904585125, + "rewards/rejected": -11.31101271685432, + "step": 892 + }, + { + "epoch": 0.22344551482547229, + "grad_norm": 9.8125, + "kl": 4.602041721343994, + "learning_rate": 5e-06, + "logits/chosen": -68313344.0, + "logits/rejected": -31684557.714285713, + "logps/chosen": -404.32275390625, + "logps/rejected": -411.10682896205356, + "loss": 0.04, + "rewards/chosen": 5.589457702636719, + "rewards/margins": 14.57886701311384, + "rewards/rejected": -8.98940931047712, + "step": 893 + }, + { + "epoch": 0.22369573376704616, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61931227.428571425, + "logits/rejected": -43959251.2, + "logps/chosen": -382.4054478236607, + "logps/rejected": -673.0078125, + "loss": 0.0844, + "rewards/chosen": 4.8950685773577005, + "rewards/margins": 16.836561802455357, + "rewards/rejected": -11.941493225097656, + "step": 894 + }, + { + "epoch": 0.22394595270862006, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61059333.81818182, + "logits/rejected": -53136452.92307692, + "logps/chosen": -326.50106534090907, + "logps/rejected": -629.8517127403846, + "loss": 0.0338, + "rewards/chosen": 5.8683554909446025, + "rewards/margins": 18.607122674688593, + "rewards/rejected": -12.73876718374399, + "step": 895 + }, + { + "epoch": 0.22419617165019393, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30159670.85714286, + "logits/rejected": -59443648.0, + "logps/chosen": -354.20894949776783, + "logps/rejected": -581.8898782169117, + "loss": 0.0175, + "rewards/chosen": 5.0240968976702005, + "rewards/margins": 19.113194762157793, + "rewards/rejected": -14.089097864487591, + "step": 896 + }, + { + "epoch": 0.2244463905917678, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32034229.333333332, + "logits/rejected": -59916186.666666664, + "logps/chosen": -406.53369140625, + "logps/rejected": -576.3888346354166, + "loss": 0.0936, + "rewards/chosen": 5.919286727905273, + "rewards/margins": 16.343662897745766, + "rewards/rejected": -10.424376169840494, + "step": 897 + }, + { + "epoch": 0.22469660953334167, + "grad_norm": 3.109375, + "kl": 1.7369754314422607, + "learning_rate": 5e-06, + "logits/chosen": -51159249.06666667, + "logits/rejected": -34830954.666666664, + "logps/chosen": -435.738671875, + "logps/rejected": -599.8976236979166, + "loss": 0.0213, + "rewards/chosen": 6.416150919596354, + "rewards/margins": 19.040545654296874, + "rewards/rejected": -12.624394734700521, + "step": 898 + }, + { + "epoch": 0.22494682847491554, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41043462.4, + "logits/rejected": -45398747.428571425, + "logps/chosen": -479.7412109375, + "logps/rejected": -527.4547293526786, + "loss": 0.0568, + "rewards/chosen": 4.900117492675781, + "rewards/margins": 17.14068647112165, + "rewards/rejected": -12.24056897844587, + "step": 899 + }, + { + "epoch": 0.22519704741648944, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39985076.0, + "logits/rejected": -35510740.0, + "logps/chosen": -324.8722229003906, + "logps/rejected": -546.1305541992188, + "loss": 0.078, + "rewards/chosen": 4.132997989654541, + "rewards/margins": 14.69203519821167, + "rewards/rejected": -10.559037208557129, + "step": 900 + }, + { + "epoch": 0.2254472663580633, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48580533.333333336, + "logits/rejected": -67845529.6, + "logps/chosen": -359.9342447916667, + "logps/rejected": -636.8192708333333, + "loss": 0.0472, + "rewards/chosen": 6.685011545817058, + "rewards/margins": 21.08140640258789, + "rewards/rejected": -14.396394856770833, + "step": 901 + }, + { + "epoch": 0.22569748529963718, + "grad_norm": 19.125, + "kl": 8.682260513305664, + "learning_rate": 5e-06, + "logits/chosen": -79313547.63636364, + "logits/rejected": -39524388.92307692, + "logps/chosen": -517.0273881392045, + "logps/rejected": -398.0355694110577, + "loss": 0.0731, + "rewards/chosen": 6.073057001287287, + "rewards/margins": 14.0310923969829, + "rewards/rejected": -7.958035395695613, + "step": 902 + }, + { + "epoch": 0.22594770424121105, + "grad_norm": 10.1875, + "kl": 6.758369445800781, + "learning_rate": 5e-06, + "logits/chosen": 107983008.0, + "logits/rejected": -21591917.333333332, + "logps/chosen": -588.9263916015625, + "logps/rejected": -442.9207763671875, + "loss": 0.027, + "rewards/chosen": 7.587039311726888, + "rewards/margins": 14.34028434753418, + "rewards/rejected": -6.753245035807292, + "step": 903 + }, + { + "epoch": 0.22619792318278495, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49310124.307692304, + "logits/rejected": -48749812.36363637, + "logps/chosen": -461.45004507211536, + "logps/rejected": -566.53515625, + "loss": 0.0274, + "rewards/chosen": 7.519031231219952, + "rewards/margins": 19.996614682924495, + "rewards/rejected": -12.477583451704545, + "step": 904 + }, + { + "epoch": 0.22644814212435882, + "grad_norm": 8.5, + "kl": 4.311858177185059, + "learning_rate": 5e-06, + "logits/chosen": -44651034.666666664, + "logits/rejected": 555985.3333333334, + "logps/chosen": -516.0038655598959, + "logps/rejected": -644.2854817708334, + "loss": 0.0378, + "rewards/chosen": 7.955541610717773, + "rewards/margins": 18.99603716532389, + "rewards/rejected": -11.04049555460612, + "step": 905 + }, + { + "epoch": 0.2266983610659327, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39005595.428571425, + "logits/rejected": -50134841.6, + "logps/chosen": -339.9086216517857, + "logps/rejected": -593.230859375, + "loss": 0.0857, + "rewards/chosen": 4.672640664236886, + "rewards/margins": 17.40224805559431, + "rewards/rejected": -12.729607391357423, + "step": 906 + }, + { + "epoch": 0.22694858000750656, + "grad_norm": 12.75, + "kl": 13.265981674194336, + "learning_rate": 5e-06, + "logits/chosen": -76839330.46153846, + "logits/rejected": -21663102.545454547, + "logps/chosen": -380.22348257211536, + "logps/rejected": -431.7615855823864, + "loss": 0.0862, + "rewards/chosen": 5.711649968073918, + "rewards/margins": 13.704359734808648, + "rewards/rejected": -7.99270976673473, + "step": 907 + }, + { + "epoch": 0.22719879894908043, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54960480.0, + "logits/rejected": -42284340.0, + "logps/chosen": -424.51910400390625, + "logps/rejected": -657.2796630859375, + "loss": 0.0289, + "rewards/chosen": 7.0018310546875, + "rewards/margins": 19.016210556030273, + "rewards/rejected": -12.014379501342773, + "step": 908 + }, + { + "epoch": 0.22744901789065433, + "grad_norm": 5.5625, + "kl": 1.0971851348876953, + "learning_rate": 5e-06, + "logits/chosen": -51318132.705882356, + "logits/rejected": -80645988.57142857, + "logps/chosen": -408.4037511488971, + "logps/rejected": -456.4017857142857, + "loss": 0.0471, + "rewards/chosen": 6.688805972828584, + "rewards/margins": 13.419990796001017, + "rewards/rejected": -6.731184823172433, + "step": 909 + }, + { + "epoch": 0.2276992368322282, + "grad_norm": 10.3125, + "kl": 6.374741554260254, + "learning_rate": 5e-06, + "logits/chosen": -70748445.53846154, + "logits/rejected": -33107482.181818184, + "logps/chosen": -465.82068810096155, + "logps/rejected": -380.7496448863636, + "loss": 0.054, + "rewards/chosen": 6.5722527137169475, + "rewards/margins": 14.515082192587686, + "rewards/rejected": -7.942829478870738, + "step": 910 + }, + { + "epoch": 0.22794945577380207, + "grad_norm": 12.6875, + "kl": 15.299120903015137, + "learning_rate": 5e-06, + "logits/chosen": -80027810.46153846, + "logits/rejected": -40516040.72727273, + "logps/chosen": -503.6709735576923, + "logps/rejected": -594.6040482954545, + "loss": 0.0937, + "rewards/chosen": 6.277316753680889, + "rewards/margins": 17.76777723619154, + "rewards/rejected": -11.490460482510654, + "step": 911 + }, + { + "epoch": 0.22819967471537594, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37288764.44444445, + "logits/rejected": -15200901.333333334, + "logps/chosen": -422.58867730034723, + "logps/rejected": -550.1824869791667, + "loss": 0.0301, + "rewards/chosen": 7.073209126790364, + "rewards/margins": 19.02741444905599, + "rewards/rejected": -11.954205322265626, + "step": 912 + }, + { + "epoch": 0.22844989365694984, + "grad_norm": 0.7578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63073976.88888889, + "logits/rejected": -72308906.66666667, + "logps/chosen": -467.6124674479167, + "logps/rejected": -547.29296875, + "loss": 0.002, + "rewards/chosen": 6.477109273274739, + "rewards/margins": 17.309584045410155, + "rewards/rejected": -10.832474772135416, + "step": 913 + }, + { + "epoch": 0.2287001125985237, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47196632.615384616, + "logits/rejected": -52435991.27272727, + "logps/chosen": -355.9587214543269, + "logps/rejected": -663.9367897727273, + "loss": 0.0601, + "rewards/chosen": 5.280397268442007, + "rewards/margins": 19.18201137089229, + "rewards/rejected": -13.901614102450283, + "step": 914 + }, + { + "epoch": 0.22895033154009758, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37041440.0, + "logits/rejected": -47587936.0, + "logps/chosen": -357.572021484375, + "logps/rejected": -557.8770751953125, + "loss": 0.0828, + "rewards/chosen": 6.3396453857421875, + "rewards/margins": 15.498096466064453, + "rewards/rejected": -9.158451080322266, + "step": 915 + }, + { + "epoch": 0.22920055048167146, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41357060.0, + "logits/rejected": -70418416.0, + "logps/chosen": -546.7437133789062, + "logps/rejected": -653.62451171875, + "loss": 0.018, + "rewards/chosen": 7.156656742095947, + "rewards/margins": 19.993201732635498, + "rewards/rejected": -12.83654499053955, + "step": 916 + }, + { + "epoch": 0.22945076942324533, + "grad_norm": 5.21875, + "kl": 6.046242713928223, + "learning_rate": 5e-06, + "logits/chosen": -49670478.222222224, + "logits/rejected": -48864304.0, + "logps/chosen": -411.0027126736111, + "logps/rejected": -613.636962890625, + "loss": 0.0537, + "rewards/chosen": 6.228641086154514, + "rewards/margins": 21.116777631971573, + "rewards/rejected": -14.888136545817057, + "step": 917 + }, + { + "epoch": 0.22970098836481923, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47282772.0, + "logits/rejected": -62508904.0, + "logps/chosen": -373.3050842285156, + "logps/rejected": -704.8584594726562, + "loss": 0.024, + "rewards/chosen": 6.87139892578125, + "rewards/margins": 19.98058795928955, + "rewards/rejected": -13.1091890335083, + "step": 918 + }, + { + "epoch": 0.2299512073063931, + "grad_norm": 16.125, + "kl": 3.070117950439453, + "learning_rate": 5e-06, + "logits/chosen": -43018977.88235294, + "logits/rejected": -46294806.85714286, + "logps/chosen": -356.7622931985294, + "logps/rejected": -515.0734165736607, + "loss": 0.0473, + "rewards/chosen": 7.065644208122702, + "rewards/margins": 15.731119236024488, + "rewards/rejected": -8.665475027901786, + "step": 919 + }, + { + "epoch": 0.23020142624796697, + "grad_norm": 8.0, + "kl": 6.292427062988281, + "learning_rate": 5e-06, + "logits/chosen": -62030030.76923077, + "logits/rejected": -56004939.63636363, + "logps/chosen": -503.7373046875, + "logps/rejected": -794.2305575284091, + "loss": 0.0433, + "rewards/chosen": 7.658596332256611, + "rewards/margins": 21.78488500635107, + "rewards/rejected": -14.12628867409446, + "step": 920 + }, + { + "epoch": 0.23045164518954084, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77693888.0, + "logits/rejected": -34540572.0, + "logps/chosen": -418.0924377441406, + "logps/rejected": -568.9324951171875, + "loss": 0.0102, + "rewards/chosen": 5.300932884216309, + "rewards/margins": 16.795610427856445, + "rewards/rejected": -11.494677543640137, + "step": 921 + }, + { + "epoch": 0.23070186413111474, + "grad_norm": 13.3125, + "kl": 1.3069236278533936, + "learning_rate": 5e-06, + "logits/chosen": -50080742.4, + "logits/rejected": -42146915.55555555, + "logps/chosen": -327.55035807291665, + "logps/rejected": -795.9128146701389, + "loss": 0.0349, + "rewards/chosen": 5.24858652750651, + "rewards/margins": 22.489956834581164, + "rewards/rejected": -17.241370307074654, + "step": 922 + }, + { + "epoch": 0.2309520830726886, + "grad_norm": 11.375, + "kl": 3.664954662322998, + "learning_rate": 5e-06, + "logits/chosen": -59110253.71428572, + "logits/rejected": -43279382.4, + "logps/chosen": -482.2735072544643, + "logps/rejected": -578.491845703125, + "loss": 0.0541, + "rewards/chosen": 7.756150381905692, + "rewards/margins": 17.201266043526786, + "rewards/rejected": -9.445115661621093, + "step": 923 + }, + { + "epoch": 0.23120230201426248, + "grad_norm": 21.75, + "kl": 5.4658613204956055, + "learning_rate": 5e-06, + "logits/chosen": -37039584.0, + "logits/rejected": -32755747.2, + "logps/chosen": -325.296875, + "logps/rejected": -422.929296875, + "loss": 0.0793, + "rewards/chosen": 6.870804922921317, + "rewards/margins": 13.902114432198662, + "rewards/rejected": -7.031309509277344, + "step": 924 + }, + { + "epoch": 0.23145252095583635, + "grad_norm": 12.3125, + "kl": 0.8870315551757812, + "learning_rate": 5e-06, + "logits/chosen": -68504384.0, + "logits/rejected": -48437780.36363637, + "logps/chosen": -473.87905649038464, + "logps/rejected": -553.9419833096591, + "loss": 0.0419, + "rewards/chosen": 7.379144521859976, + "rewards/margins": 14.978341909555288, + "rewards/rejected": -7.5991973876953125, + "step": 925 + }, + { + "epoch": 0.23170273989741022, + "grad_norm": 15.4375, + "kl": 8.738922119140625, + "learning_rate": 5e-06, + "logits/chosen": -38038200.0, + "logits/rejected": -61981349.333333336, + "logps/chosen": -536.9969889322916, + "logps/rejected": -495.327880859375, + "loss": 0.0265, + "rewards/chosen": 8.034940083821615, + "rewards/margins": 18.238653818766277, + "rewards/rejected": -10.203713734944662, + "step": 926 + }, + { + "epoch": 0.23195295883898412, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38799692.8, + "logits/rejected": -62651832.88888889, + "logps/chosen": -359.4570638020833, + "logps/rejected": -483.93896484375, + "loss": 0.0473, + "rewards/chosen": 7.463754272460937, + "rewards/margins": 17.33887464735243, + "rewards/rejected": -9.875120374891493, + "step": 927 + }, + { + "epoch": 0.232203177780558, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36857338.18181818, + "logits/rejected": -50456064.0, + "logps/chosen": -367.7675115411932, + "logps/rejected": -576.2898137019231, + "loss": 0.038, + "rewards/chosen": 6.119823109019887, + "rewards/margins": 16.769954334605824, + "rewards/rejected": -10.650131225585938, + "step": 928 + }, + { + "epoch": 0.23245339672213186, + "grad_norm": 12.0625, + "kl": 9.128301620483398, + "learning_rate": 5e-06, + "logits/chosen": -25711554.285714287, + "logits/rejected": -51162745.6, + "logps/chosen": -547.6895228794643, + "logps/rejected": -526.092236328125, + "loss": 0.0739, + "rewards/chosen": 8.874160766601562, + "rewards/margins": 20.233805847167968, + "rewards/rejected": -11.359645080566406, + "step": 929 + }, + { + "epoch": 0.23270361566370573, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31915549.09090909, + "logits/rejected": -56411136.0, + "logps/chosen": -489.5325816761364, + "logps/rejected": -482.1345402644231, + "loss": 0.0405, + "rewards/chosen": 7.253825794566762, + "rewards/margins": 14.993141120963998, + "rewards/rejected": -7.739315326397236, + "step": 930 + }, + { + "epoch": 0.23295383460527963, + "grad_norm": 14.125, + "kl": 0.4628610610961914, + "learning_rate": 5e-06, + "logits/chosen": -41247953.777777776, + "logits/rejected": -62558429.86666667, + "logps/chosen": -307.1012369791667, + "logps/rejected": -483.59290364583336, + "loss": 0.0932, + "rewards/chosen": 5.360449896918403, + "rewards/margins": 11.433313666449653, + "rewards/rejected": -6.07286376953125, + "step": 931 + }, + { + "epoch": 0.2332040535468535, + "grad_norm": 4.625, + "kl": 3.119382619857788, + "learning_rate": 5e-06, + "logits/chosen": -37342084.0, + "logits/rejected": -26470720.0, + "logps/chosen": -395.2557678222656, + "logps/rejected": -451.6656188964844, + "loss": 0.056, + "rewards/chosen": 6.482509136199951, + "rewards/margins": 13.495808124542236, + "rewards/rejected": -7.013298988342285, + "step": 932 + }, + { + "epoch": 0.23345427248842737, + "grad_norm": 25.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56840908.8, + "logits/rejected": 5934252.571428572, + "logps/chosen": -299.0068115234375, + "logps/rejected": -649.7470703125, + "loss": 0.0555, + "rewards/chosen": 6.617679595947266, + "rewards/margins": 14.541585104806082, + "rewards/rejected": -7.923905508858817, + "step": 933 + }, + { + "epoch": 0.23370449143000124, + "grad_norm": 16.125, + "kl": 2.8662619590759277, + "learning_rate": 5e-06, + "logits/chosen": -52624208.0, + "logits/rejected": -57799962.666666664, + "logps/chosen": -411.8450520833333, + "logps/rejected": -528.4455973307291, + "loss": 0.0467, + "rewards/chosen": 7.910139719645183, + "rewards/margins": 17.80528513590495, + "rewards/rejected": -9.895145416259766, + "step": 934 + }, + { + "epoch": 0.23395471037157514, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41030153.6, + "logits/rejected": -66448649.14285714, + "logps/chosen": -330.2591064453125, + "logps/rejected": -728.6985212053571, + "loss": 0.0179, + "rewards/chosen": 6.134889221191406, + "rewards/margins": 19.477854483468192, + "rewards/rejected": -13.342965262276786, + "step": 935 + }, + { + "epoch": 0.234204929313149, + "grad_norm": 8.875, + "kl": 0.379302978515625, + "learning_rate": 5e-06, + "logits/chosen": -54399656.72727273, + "logits/rejected": -26842806.153846152, + "logps/chosen": -366.34565873579544, + "logps/rejected": -421.78463040865387, + "loss": 0.0533, + "rewards/chosen": 6.0342885797674, + "rewards/margins": 13.81286535729895, + "rewards/rejected": -7.77857677753155, + "step": 936 + }, + { + "epoch": 0.23445514825472288, + "grad_norm": 7.6875, + "kl": 6.206681728363037, + "learning_rate": 5e-06, + "logits/chosen": -44843712.0, + "logits/rejected": -45088960.0, + "logps/chosen": -400.8680889423077, + "logps/rejected": -681.0548206676136, + "loss": 0.0353, + "rewards/chosen": 6.523436913123498, + "rewards/margins": 15.86996123840759, + "rewards/rejected": -9.346524325284092, + "step": 937 + }, + { + "epoch": 0.23470536719629675, + "grad_norm": 9.5, + "kl": 2.585693836212158, + "learning_rate": 5e-06, + "logits/chosen": -29395781.333333332, + "logits/rejected": -54385114.666666664, + "logps/chosen": -352.1145833333333, + "logps/rejected": -508.311767578125, + "loss": 0.0772, + "rewards/chosen": 5.958161036173503, + "rewards/margins": 14.334941228230793, + "rewards/rejected": -8.376780192057291, + "step": 938 + }, + { + "epoch": 0.23495558613787063, + "grad_norm": 13.625, + "kl": 9.4812650680542, + "learning_rate": 5e-06, + "logits/chosen": -58789984.0, + "logits/rejected": -37406240.0, + "logps/chosen": -565.1275227864584, + "logps/rejected": -404.2013753255208, + "loss": 0.0299, + "rewards/chosen": 8.59155527750651, + "rewards/margins": 16.36270968119303, + "rewards/rejected": -7.771154403686523, + "step": 939 + }, + { + "epoch": 0.23520580507944452, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46891365.333333336, + "logits/rejected": -48070080.0, + "logps/chosen": -480.721435546875, + "logps/rejected": -499.0227864583333, + "loss": 0.045, + "rewards/chosen": 6.095334370930989, + "rewards/margins": 17.481115976969402, + "rewards/rejected": -11.385781606038412, + "step": 940 + }, + { + "epoch": 0.2354560240210184, + "grad_norm": 14.875, + "kl": 11.093153953552246, + "learning_rate": 5e-06, + "logits/chosen": -42544972.8, + "logits/rejected": -45334599.11111111, + "logps/chosen": -417.0918294270833, + "logps/rejected": -573.8923611111111, + "loss": 0.053, + "rewards/chosen": 6.57138671875, + "rewards/margins": 15.85594702826606, + "rewards/rejected": -9.28456030951606, + "step": 941 + }, + { + "epoch": 0.23570624296259227, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43262704.0, + "logits/rejected": -63748053.333333336, + "logps/chosen": -323.8709716796875, + "logps/rejected": -680.2295328776041, + "loss": 0.0387, + "rewards/chosen": 5.177082061767578, + "rewards/margins": 17.591632843017578, + "rewards/rejected": -12.41455078125, + "step": 942 + }, + { + "epoch": 0.23595646190416614, + "grad_norm": 15.375, + "kl": 1.3339195251464844, + "learning_rate": 5e-06, + "logits/chosen": -66854378.666666664, + "logits/rejected": -38074235.733333334, + "logps/chosen": -495.10205078125, + "logps/rejected": -492.8537109375, + "loss": 0.0346, + "rewards/chosen": 8.158946567111546, + "rewards/margins": 16.75756310356988, + "rewards/rejected": -8.598616536458334, + "step": 943 + }, + { + "epoch": 0.23620668084574004, + "grad_norm": 8.8125, + "kl": 2.094184637069702, + "learning_rate": 5e-06, + "logits/chosen": -25042998.85714286, + "logits/rejected": -41569836.8, + "logps/chosen": -299.8716517857143, + "logps/rejected": -558.7623046875, + "loss": 0.0933, + "rewards/chosen": 6.422407967703683, + "rewards/margins": 16.13129163469587, + "rewards/rejected": -9.708883666992188, + "step": 944 + }, + { + "epoch": 0.2364568997873139, + "grad_norm": 13.625, + "kl": 5.683592796325684, + "learning_rate": 5e-06, + "logits/chosen": -53061714.28571428, + "logits/rejected": -51149849.6, + "logps/chosen": -311.28086635044644, + "logps/rejected": -545.0552734375, + "loss": 0.1163, + "rewards/chosen": 4.430931636265346, + "rewards/margins": 14.698024531773157, + "rewards/rejected": -10.267092895507812, + "step": 945 + }, + { + "epoch": 0.23670711872888778, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43230976.0, + "logits/rejected": -70082609.77777778, + "logps/chosen": -323.52900390625, + "logps/rejected": -666.9175889756945, + "loss": 0.0676, + "rewards/chosen": 5.542985534667968, + "rewards/margins": 15.466888258192274, + "rewards/rejected": -9.923902723524305, + "step": 946 + }, + { + "epoch": 0.23695733767046165, + "grad_norm": 3.65625, + "kl": 5.355119228363037, + "learning_rate": 5e-06, + "logits/chosen": -53822139.07692308, + "logits/rejected": -59451601.45454545, + "logps/chosen": -516.5736177884615, + "logps/rejected": -628.6519886363636, + "loss": 0.0175, + "rewards/chosen": 7.771853520320012, + "rewards/margins": 18.58540915109061, + "rewards/rejected": -10.813555630770596, + "step": 947 + }, + { + "epoch": 0.23720755661203552, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40924457.84615385, + "logits/rejected": -57772125.09090909, + "logps/chosen": -313.2546198918269, + "logps/rejected": -609.9730557528409, + "loss": 0.0595, + "rewards/chosen": 6.1494627732497, + "rewards/margins": 14.070502007757867, + "rewards/rejected": -7.921039234508168, + "step": 948 + }, + { + "epoch": 0.23745777555360942, + "grad_norm": 8.0, + "kl": 7.103673458099365, + "learning_rate": 5e-06, + "logits/chosen": -69940736.0, + "logits/rejected": -33472898.285714287, + "logps/chosen": -363.793603515625, + "logps/rejected": -411.0101841517857, + "loss": 0.0329, + "rewards/chosen": 5.684453964233398, + "rewards/margins": 14.026391983032227, + "rewards/rejected": -8.341938018798828, + "step": 949 + }, + { + "epoch": 0.2377079944951833, + "grad_norm": 13.8125, + "kl": 10.972240447998047, + "learning_rate": 5e-06, + "logits/chosen": -55315656.0, + "logits/rejected": -37758380.0, + "logps/chosen": -462.2991943359375, + "logps/rejected": -372.3765869140625, + "loss": 0.073, + "rewards/chosen": 6.486313819885254, + "rewards/margins": 12.432631492614746, + "rewards/rejected": -5.946317672729492, + "step": 950 + }, + { + "epoch": 0.23795821343675716, + "grad_norm": 11.1875, + "kl": 4.412203788757324, + "learning_rate": 5e-06, + "logits/chosen": -75238116.57142857, + "logits/rejected": -58091456.0, + "logps/chosen": -400.944580078125, + "logps/rejected": -584.079833984375, + "loss": 0.0444, + "rewards/chosen": 5.991706848144531, + "rewards/margins": 15.89676513671875, + "rewards/rejected": -9.905058288574219, + "step": 951 + }, + { + "epoch": 0.23820843237833103, + "grad_norm": 6.5625, + "kl": 9.460249900817871, + "learning_rate": 5e-06, + "logits/chosen": -40347648.0, + "logits/rejected": -39134568.0, + "logps/chosen": -421.4286295572917, + "logps/rejected": -471.579833984375, + "loss": 0.0255, + "rewards/chosen": 7.644588470458984, + "rewards/margins": 12.908722241719563, + "rewards/rejected": -5.264133771260579, + "step": 952 + }, + { + "epoch": 0.23845865131990493, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77202478.54545455, + "logits/rejected": -52970422.15384615, + "logps/chosen": -390.42964311079544, + "logps/rejected": -525.9840369591346, + "loss": 0.0403, + "rewards/chosen": 7.254411177201704, + "rewards/margins": 16.002279641744973, + "rewards/rejected": -8.74786846454327, + "step": 953 + }, + { + "epoch": 0.2387088702614788, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11673018.666666666, + "logits/rejected": -45780957.86666667, + "logps/chosen": -494.92192925347223, + "logps/rejected": -638.621875, + "loss": 0.0926, + "rewards/chosen": 5.703413645426433, + "rewards/margins": 13.848574574788412, + "rewards/rejected": -8.14516092936198, + "step": 954 + }, + { + "epoch": 0.23895908920305267, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41096916.0, + "logits/rejected": -38687588.0, + "logps/chosen": -372.59002685546875, + "logps/rejected": -688.5537109375, + "loss": 0.0332, + "rewards/chosen": 7.318997859954834, + "rewards/margins": 19.08086061477661, + "rewards/rejected": -11.761862754821777, + "step": 955 + }, + { + "epoch": 0.23920930814462654, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44092472.88888889, + "logits/rejected": -30713898.666666668, + "logps/chosen": -338.7389322916667, + "logps/rejected": -479.07395833333334, + "loss": 0.0326, + "rewards/chosen": 7.433725145128038, + "rewards/margins": 18.19242875840929, + "rewards/rejected": -10.75870361328125, + "step": 956 + }, + { + "epoch": 0.2394595270862004, + "grad_norm": 11.25, + "kl": 1.5393956899642944, + "learning_rate": 5e-06, + "logits/chosen": -83178843.42857143, + "logits/rejected": -85053644.8, + "logps/chosen": -432.54561941964283, + "logps/rejected": -618.793505859375, + "loss": 0.0472, + "rewards/chosen": 6.379090445382254, + "rewards/margins": 14.241289847237724, + "rewards/rejected": -7.862199401855468, + "step": 957 + }, + { + "epoch": 0.2397097460277743, + "grad_norm": 14.625, + "kl": 6.892382621765137, + "learning_rate": 5e-06, + "logits/chosen": -22198784.0, + "logits/rejected": -37976570.18181818, + "logps/chosen": -394.47964242788464, + "logps/rejected": -592.0108753551136, + "loss": 0.0754, + "rewards/chosen": 6.815983698918269, + "rewards/margins": 16.970387385441708, + "rewards/rejected": -10.154403686523438, + "step": 958 + }, + { + "epoch": 0.23995996496934818, + "grad_norm": 7.875, + "kl": 1.3100789785385132, + "learning_rate": 5e-06, + "logits/chosen": -62880742.4, + "logits/rejected": -81479222.85714285, + "logps/chosen": -326.6181884765625, + "logps/rejected": -519.4295828683036, + "loss": 0.0941, + "rewards/chosen": 3.6879344940185548, + "rewards/margins": 11.727349581037249, + "rewards/rejected": -8.039415087018694, + "step": 959 + }, + { + "epoch": 0.24021018391092205, + "grad_norm": 12.8125, + "kl": 1.117627501487732, + "learning_rate": 5e-06, + "logits/chosen": -39224738.461538464, + "logits/rejected": -42938368.0, + "logps/chosen": -300.0834209735577, + "logps/rejected": -358.07601651278407, + "loss": 0.0677, + "rewards/chosen": 5.270392197829026, + "rewards/margins": 10.915350000341455, + "rewards/rejected": -5.644957802512429, + "step": 960 + }, + { + "epoch": 0.24046040285249592, + "grad_norm": 7.09375, + "kl": 0.7437850832939148, + "learning_rate": 5e-06, + "logits/chosen": -11991135.0, + "logits/rejected": -60121304.0, + "logps/chosen": -354.5306701660156, + "logps/rejected": -524.0028076171875, + "loss": 0.049, + "rewards/chosen": 5.199479103088379, + "rewards/margins": 12.830081939697266, + "rewards/rejected": -7.630602836608887, + "step": 961 + }, + { + "epoch": 0.24071062179406982, + "grad_norm": 6.34375, + "kl": 5.288510799407959, + "learning_rate": 5e-06, + "logits/chosen": -52445209.6, + "logits/rejected": -49654798.222222224, + "logps/chosen": -358.82236328125, + "logps/rejected": -596.2080620659722, + "loss": 0.0272, + "rewards/chosen": 6.363755798339843, + "rewards/margins": 17.211896091037325, + "rewards/rejected": -10.848140292697483, + "step": 962 + }, + { + "epoch": 0.2409608407356437, + "grad_norm": 12.75, + "kl": 2.5923068523406982, + "learning_rate": 5e-06, + "logits/chosen": -63866322.823529415, + "logits/rejected": -71825179.42857143, + "logps/chosen": -425.9490751378676, + "logps/rejected": -766.2367466517857, + "loss": 0.0681, + "rewards/chosen": 6.591006559484145, + "rewards/margins": 17.28947980864709, + "rewards/rejected": -10.698473249162946, + "step": 963 + }, + { + "epoch": 0.24121105967721757, + "grad_norm": 13.5, + "kl": 7.148531436920166, + "learning_rate": 5e-06, + "logits/chosen": -67719542.85714285, + "logits/rejected": -49603027.2, + "logps/chosen": -408.22719029017856, + "logps/rejected": -545.6841796875, + "loss": 0.0519, + "rewards/chosen": 6.195568084716797, + "rewards/margins": 15.375952911376952, + "rewards/rejected": -9.180384826660156, + "step": 964 + }, + { + "epoch": 0.24146127861879144, + "grad_norm": 6.96875, + "kl": 3.784701108932495, + "learning_rate": 5e-06, + "logits/chosen": -51663904.0, + "logits/rejected": -51272873.14285714, + "logps/chosen": -346.400732421875, + "logps/rejected": -528.1700265066964, + "loss": 0.0297, + "rewards/chosen": 5.401966857910156, + "rewards/margins": 15.662945556640626, + "rewards/rejected": -10.260978698730469, + "step": 965 + }, + { + "epoch": 0.2417114975603653, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41541956.92307692, + "logits/rejected": -50786996.36363637, + "logps/chosen": -376.15271935096155, + "logps/rejected": -710.5560635653409, + "loss": 0.0984, + "rewards/chosen": 5.761100182166467, + "rewards/margins": 17.646116883604677, + "rewards/rejected": -11.88501670143821, + "step": 966 + }, + { + "epoch": 0.2419617165019392, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67082682.18181818, + "logits/rejected": -61389745.23076923, + "logps/chosen": -396.5559747869318, + "logps/rejected": -586.0338792067307, + "loss": 0.0248, + "rewards/chosen": 6.245370344682173, + "rewards/margins": 16.267867668525323, + "rewards/rejected": -10.02249732384315, + "step": 967 + }, + { + "epoch": 0.24221193544351308, + "grad_norm": 6.21875, + "kl": 2.1117992401123047, + "learning_rate": 5e-06, + "logits/chosen": -57475852.8, + "logits/rejected": -41609577.14285714, + "logps/chosen": -526.372119140625, + "logps/rejected": -543.5379115513393, + "loss": 0.0093, + "rewards/chosen": 6.452223205566407, + "rewards/margins": 16.720904323032926, + "rewards/rejected": -10.268681117466517, + "step": 968 + }, + { + "epoch": 0.24246215438508695, + "grad_norm": 5.15625, + "kl": 5.931025505065918, + "learning_rate": 5e-06, + "logits/chosen": -25839696.0, + "logits/rejected": -42132539.733333334, + "logps/chosen": -365.1384548611111, + "logps/rejected": -499.8753255208333, + "loss": 0.0491, + "rewards/chosen": 7.373534308539496, + "rewards/margins": 16.471940273708768, + "rewards/rejected": -9.09840596516927, + "step": 969 + }, + { + "epoch": 0.24271237332666082, + "grad_norm": 7.78125, + "kl": 5.186364650726318, + "learning_rate": 5e-06, + "logits/chosen": -46551428.0, + "logits/rejected": -59868468.0, + "logps/chosen": -436.8776550292969, + "logps/rejected": -419.1877136230469, + "loss": 0.0204, + "rewards/chosen": 8.265854835510254, + "rewards/margins": 16.00886869430542, + "rewards/rejected": -7.743013858795166, + "step": 970 + }, + { + "epoch": 0.24296259226823472, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48635178.666666664, + "logits/rejected": -22060261.333333332, + "logps/chosen": -424.09661458333335, + "logps/rejected": -465.74370659722223, + "loss": 0.0268, + "rewards/chosen": 7.468690999348959, + "rewards/margins": 18.117595587836373, + "rewards/rejected": -10.648904588487413, + "step": 971 + }, + { + "epoch": 0.2432128112098086, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26651632.0, + "logits/rejected": -46666843.428571425, + "logps/chosen": -379.4952880859375, + "logps/rejected": -596.7696707589286, + "loss": 0.0263, + "rewards/chosen": 6.369148254394531, + "rewards/margins": 15.411238534109932, + "rewards/rejected": -9.042090279715401, + "step": 972 + }, + { + "epoch": 0.24346303015138246, + "grad_norm": 7.5625, + "kl": 0.9020862579345703, + "learning_rate": 5e-06, + "logits/chosen": -44098397.538461536, + "logits/rejected": -56110161.45454545, + "logps/chosen": -481.2684795673077, + "logps/rejected": -599.7449396306819, + "loss": 0.0115, + "rewards/chosen": 8.506537804236778, + "rewards/margins": 22.31245433033763, + "rewards/rejected": -13.805916526100852, + "step": 973 + }, + { + "epoch": 0.24371324909295633, + "grad_norm": 20.5, + "kl": 11.748554229736328, + "learning_rate": 5e-06, + "logits/chosen": -48147471.058823526, + "logits/rejected": -40631003.428571425, + "logps/chosen": -420.59449678308823, + "logps/rejected": -627.2489536830357, + "loss": 0.0955, + "rewards/chosen": 6.807598787195542, + "rewards/margins": 15.587558329606257, + "rewards/rejected": -8.779959542410714, + "step": 974 + }, + { + "epoch": 0.2439634680345302, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45298222.54545455, + "logits/rejected": -45828617.84615385, + "logps/chosen": -499.63423295454544, + "logps/rejected": -673.6026141826923, + "loss": 0.0526, + "rewards/chosen": 7.542264764959162, + "rewards/margins": 19.699298831966374, + "rewards/rejected": -12.157034067007212, + "step": 975 + }, + { + "epoch": 0.2442136869761041, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22724881.6, + "logits/rejected": -56636242.28571428, + "logps/chosen": -303.7207275390625, + "logps/rejected": -520.9016810825893, + "loss": 0.0762, + "rewards/chosen": 5.799948883056641, + "rewards/margins": 14.588018471854074, + "rewards/rejected": -8.788069588797432, + "step": 976 + }, + { + "epoch": 0.24446390591767797, + "grad_norm": 19.5, + "kl": 7.317101955413818, + "learning_rate": 5e-06, + "logits/chosen": -28845570.666666668, + "logits/rejected": -38781898.666666664, + "logps/chosen": -409.3150634765625, + "logps/rejected": -512.3441569010416, + "loss": 0.0447, + "rewards/chosen": 6.162761688232422, + "rewards/margins": 13.854595184326172, + "rewards/rejected": -7.69183349609375, + "step": 977 + }, + { + "epoch": 0.24471412485925184, + "grad_norm": 7.40625, + "kl": 7.131377696990967, + "learning_rate": 5e-06, + "logits/chosen": -27759594.0, + "logits/rejected": -38790868.0, + "logps/chosen": -366.73529052734375, + "logps/rejected": -505.4588928222656, + "loss": 0.0912, + "rewards/chosen": 7.375744819641113, + "rewards/margins": 16.4173526763916, + "rewards/rejected": -9.041607856750488, + "step": 978 + }, + { + "epoch": 0.2449643438008257, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33602082.461538464, + "logits/rejected": 41118132.36363637, + "logps/chosen": -392.90884164663464, + "logps/rejected": -679.8631036931819, + "loss": 0.0344, + "rewards/chosen": 6.0534491905799275, + "rewards/margins": 17.51038472635763, + "rewards/rejected": -11.4569355357777, + "step": 979 + }, + { + "epoch": 0.2452145627423996, + "grad_norm": 8.5625, + "kl": 9.956182479858398, + "learning_rate": 5e-06, + "logits/chosen": -74135819.63636364, + "logits/rejected": -37772140.307692304, + "logps/chosen": -509.16313032670456, + "logps/rejected": -541.5817307692307, + "loss": 0.0229, + "rewards/chosen": 8.883264021439986, + "rewards/margins": 17.90471787886186, + "rewards/rejected": -9.021453857421875, + "step": 980 + }, + { + "epoch": 0.24546478168397348, + "grad_norm": 17.25, + "kl": 9.704726219177246, + "learning_rate": 5e-06, + "logits/chosen": -33535570.285714287, + "logits/rejected": -33441254.4, + "logps/chosen": -355.86083984375, + "logps/rejected": -568.32529296875, + "loss": 0.062, + "rewards/chosen": 7.664146423339844, + "rewards/margins": 17.043216705322266, + "rewards/rejected": -9.379070281982422, + "step": 981 + }, + { + "epoch": 0.24571500062554735, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41295132.8, + "logits/rejected": -62507634.28571428, + "logps/chosen": -471.283935546875, + "logps/rejected": -480.21177455357144, + "loss": 0.043, + "rewards/chosen": 5.92628288269043, + "rewards/margins": 12.615982873099192, + "rewards/rejected": -6.689699990408761, + "step": 982 + }, + { + "epoch": 0.24596521956712122, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44087664.0, + "logits/rejected": -52632604.0, + "logps/chosen": -363.9759521484375, + "logps/rejected": -392.74951171875, + "loss": 0.0511, + "rewards/chosen": 5.343209266662598, + "rewards/margins": 12.991962909698486, + "rewards/rejected": -7.648753643035889, + "step": 983 + }, + { + "epoch": 0.2462154385086951, + "grad_norm": 4.28125, + "kl": 5.336452007293701, + "learning_rate": 5e-06, + "logits/chosen": -45089450.666666664, + "logits/rejected": -21235751.111111112, + "logps/chosen": -386.187890625, + "logps/rejected": -438.6044921875, + "loss": 0.0102, + "rewards/chosen": 8.136567179361979, + "rewards/margins": 18.851971266004774, + "rewards/rejected": -10.715404086642796, + "step": 984 + }, + { + "epoch": 0.246465657450269, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36906614.4, + "logits/rejected": -16278251.42857143, + "logps/chosen": -248.096484375, + "logps/rejected": -478.1185825892857, + "loss": 0.0484, + "rewards/chosen": 4.976877212524414, + "rewards/margins": 13.192564555576869, + "rewards/rejected": -8.215687343052455, + "step": 985 + }, + { + "epoch": 0.24671587639184286, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34849632.0, + "logits/rejected": -55342107.428571425, + "logps/chosen": -430.87333984375, + "logps/rejected": -626.6460658482143, + "loss": 0.0197, + "rewards/chosen": 8.463484191894532, + "rewards/margins": 18.211029706682478, + "rewards/rejected": -9.747545514787946, + "step": 986 + }, + { + "epoch": 0.24696609533341674, + "grad_norm": 9.625, + "kl": 7.418025970458984, + "learning_rate": 5e-06, + "logits/chosen": -72797735.38461539, + "logits/rejected": -41053122.90909091, + "logps/chosen": -391.8903245192308, + "logps/rejected": -508.41463955965907, + "loss": 0.0396, + "rewards/chosen": 6.177619934082031, + "rewards/margins": 15.243307633833451, + "rewards/rejected": -9.06568769975142, + "step": 987 + }, + { + "epoch": 0.2472163142749906, + "grad_norm": 25.875, + "kl": 1.2753047943115234, + "learning_rate": 5e-06, + "logits/chosen": -44311702.85714286, + "logits/rejected": -52756188.8, + "logps/chosen": -274.71533203125, + "logps/rejected": -559.870751953125, + "loss": 0.0852, + "rewards/chosen": 5.22882080078125, + "rewards/margins": 11.39544219970703, + "rewards/rejected": -6.166621398925781, + "step": 988 + }, + { + "epoch": 0.2474665332165645, + "grad_norm": 8.6875, + "kl": 10.789250373840332, + "learning_rate": 5e-06, + "logits/chosen": -26739306.0, + "logits/rejected": 121936768.0, + "logps/chosen": -412.46630859375, + "logps/rejected": -691.3695068359375, + "loss": 0.0366, + "rewards/chosen": 7.12168025970459, + "rewards/margins": 16.61695098876953, + "rewards/rejected": -9.495270729064941, + "step": 989 + }, + { + "epoch": 0.24771675215813838, + "grad_norm": 16.5, + "kl": 5.047600746154785, + "learning_rate": 5e-06, + "logits/chosen": -55969934.222222224, + "logits/rejected": -35237674.666666664, + "logps/chosen": -477.73149956597223, + "logps/rejected": -559.80078125, + "loss": 0.0423, + "rewards/chosen": 8.644964430067274, + "rewards/margins": 15.533296542697482, + "rewards/rejected": -6.888332112630208, + "step": 990 + }, + { + "epoch": 0.24796697109971225, + "grad_norm": 13.125, + "kl": 0.7679736018180847, + "learning_rate": 5e-06, + "logits/chosen": -31620014.545454547, + "logits/rejected": -73518897.23076923, + "logps/chosen": -361.9465997869318, + "logps/rejected": -631.2200270432693, + "loss": 0.0418, + "rewards/chosen": 6.584184820001775, + "rewards/margins": 16.191359619994266, + "rewards/rejected": -9.607174799992489, + "step": 991 + }, + { + "epoch": 0.24821719004128612, + "grad_norm": 9.9375, + "kl": 8.253324508666992, + "learning_rate": 5e-06, + "logits/chosen": -38875401.14285714, + "logits/rejected": 4182356.0, + "logps/chosen": -480.73580496651783, + "logps/rejected": -971.26484375, + "loss": 0.028, + "rewards/chosen": 7.2487335205078125, + "rewards/margins": 24.113922119140625, + "rewards/rejected": -16.865188598632812, + "step": 992 + }, + { + "epoch": 0.24846740898286002, + "grad_norm": 8.5, + "kl": 4.701239585876465, + "learning_rate": 5e-06, + "logits/chosen": -20693806.545454547, + "logits/rejected": -50711187.692307696, + "logps/chosen": -369.4903453480114, + "logps/rejected": -411.2606670673077, + "loss": 0.033, + "rewards/chosen": 7.151483709161932, + "rewards/margins": 14.639182190795044, + "rewards/rejected": -7.487698481633113, + "step": 993 + }, + { + "epoch": 0.2487176279244339, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64155991.27272727, + "logits/rejected": -56464275.692307696, + "logps/chosen": -436.88645241477275, + "logps/rejected": -492.15054086538464, + "loss": 0.042, + "rewards/chosen": 7.837338534268466, + "rewards/margins": 18.271864244154283, + "rewards/rejected": -10.434525709885817, + "step": 994 + }, + { + "epoch": 0.24896784686600776, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -93347315.2, + "logits/rejected": -51931538.28571428, + "logps/chosen": -581.70400390625, + "logps/rejected": -634.4309430803571, + "loss": 0.0142, + "rewards/chosen": 9.399922180175782, + "rewards/margins": 21.41703818184989, + "rewards/rejected": -12.017116001674108, + "step": 995 + }, + { + "epoch": 0.24921806580758163, + "grad_norm": 3.703125, + "kl": 10.504900932312012, + "learning_rate": 5e-06, + "logits/chosen": -82139697.23076923, + "logits/rejected": -36180968.72727273, + "logps/chosen": -443.50210336538464, + "logps/rejected": -578.3034002130681, + "loss": 0.0079, + "rewards/chosen": 7.701954768254207, + "rewards/margins": 16.321860413451294, + "rewards/rejected": -8.619905645197088, + "step": 996 + }, + { + "epoch": 0.2494682847491555, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63988465.777777776, + "logits/rejected": 119205981.86666666, + "logps/chosen": -383.4265950520833, + "logps/rejected": -538.39052734375, + "loss": 0.0528, + "rewards/chosen": 6.038827260335286, + "rewards/margins": 14.730064646402994, + "rewards/rejected": -8.691237386067709, + "step": 997 + }, + { + "epoch": 0.2497185036907294, + "grad_norm": 6.3125, + "kl": 2.5585403442382812, + "learning_rate": 5e-06, + "logits/chosen": -45557655.27272727, + "logits/rejected": -51408177.23076923, + "logps/chosen": -368.10531338778407, + "logps/rejected": -601.8615159254807, + "loss": 0.0463, + "rewards/chosen": 5.519921042702415, + "rewards/margins": 16.688814710070204, + "rewards/rejected": -11.168893667367788, + "step": 998 + }, + { + "epoch": 0.24996872263230327, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34937315.55555555, + "logits/rejected": -75829555.2, + "logps/chosen": -162.37904188368054, + "logps/rejected": -554.6164713541667, + "loss": 0.1019, + "rewards/chosen": 1.883637958102756, + "rewards/margins": 13.290573586357965, + "rewards/rejected": -11.406935628255209, + "step": 999 + }, + { + "epoch": 0.25021894157387714, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35295445.333333336, + "logits/rejected": -54430352.0, + "logps/chosen": -434.4065755208333, + "logps/rejected": -617.1285400390625, + "loss": 0.0077, + "rewards/chosen": 6.1912797292073565, + "rewards/margins": 17.268178939819336, + "rewards/rejected": -11.076899210611979, + "step": 1000 + }, + { + "epoch": 0.25046916051545104, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53168928.0, + "logits/rejected": 80021893.33333333, + "logps/chosen": -332.048095703125, + "logps/rejected": -623.7146809895834, + "loss": 0.058, + "rewards/chosen": 6.199230194091797, + "rewards/margins": 18.334878285725914, + "rewards/rejected": -12.135648091634115, + "step": 1001 + }, + { + "epoch": 0.2507193794570249, + "grad_norm": 3.015625, + "kl": 8.039688110351562, + "learning_rate": 5e-06, + "logits/chosen": -46497472.0, + "logits/rejected": -23026619.076923076, + "logps/chosen": -455.58744673295456, + "logps/rejected": -561.5847731370193, + "loss": 0.0091, + "rewards/chosen": 7.264249628240412, + "rewards/margins": 19.195509957266854, + "rewards/rejected": -11.931260329026442, + "step": 1002 + }, + { + "epoch": 0.2509695983985988, + "grad_norm": 8.125, + "kl": 1.5779623985290527, + "learning_rate": 5e-06, + "logits/chosen": -60927122.28571428, + "logits/rejected": -17497033.6, + "logps/chosen": -525.3565848214286, + "logps/rejected": -385.39814453125, + "loss": 0.0382, + "rewards/chosen": 7.462730952671596, + "rewards/margins": 13.477145167759488, + "rewards/rejected": -6.014414215087891, + "step": 1003 + }, + { + "epoch": 0.2512198173401726, + "grad_norm": 8.25, + "kl": 1.5625652074813843, + "learning_rate": 5e-06, + "logits/chosen": -25108976.0, + "logits/rejected": -20830339.2, + "logps/chosen": -317.3733607700893, + "logps/rejected": -578.748974609375, + "loss": 0.0713, + "rewards/chosen": 5.1822646004813055, + "rewards/margins": 16.48181141444615, + "rewards/rejected": -11.299546813964843, + "step": 1004 + }, + { + "epoch": 0.2514700362817465, + "grad_norm": 8.8125, + "kl": 6.138740539550781, + "learning_rate": 5e-06, + "logits/chosen": -43888369.23076923, + "logits/rejected": -66328436.36363637, + "logps/chosen": -357.99605618990387, + "logps/rejected": -508.69522372159093, + "loss": 0.0489, + "rewards/chosen": 6.330251840444712, + "rewards/margins": 17.637029794546272, + "rewards/rejected": -11.306777954101562, + "step": 1005 + }, + { + "epoch": 0.2517202552233204, + "grad_norm": 14.875, + "kl": 2.96492338180542, + "learning_rate": 5e-06, + "logits/chosen": -67805671.38461539, + "logits/rejected": -52245154.90909091, + "logps/chosen": -408.27456430288464, + "logps/rejected": -613.4556107954545, + "loss": 0.0275, + "rewards/chosen": 6.22849860558143, + "rewards/margins": 17.653217982578944, + "rewards/rejected": -11.424719376997514, + "step": 1006 + }, + { + "epoch": 0.25197047416489426, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46761316.0, + "logits/rejected": -17286142.0, + "logps/chosen": -488.9906311035156, + "logps/rejected": -519.4513549804688, + "loss": 0.0351, + "rewards/chosen": 7.57394552230835, + "rewards/margins": 16.839669704437256, + "rewards/rejected": -9.265724182128906, + "step": 1007 + }, + { + "epoch": 0.25222069310646816, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47920457.14285714, + "logits/rejected": -47777536.0, + "logps/chosen": -347.48329380580356, + "logps/rejected": -508.841015625, + "loss": 0.0162, + "rewards/chosen": 6.903561183384487, + "rewards/margins": 17.68131844656808, + "rewards/rejected": -10.777757263183593, + "step": 1008 + }, + { + "epoch": 0.25247091204804206, + "grad_norm": 11.5, + "kl": 1.9794502258300781, + "learning_rate": 5e-06, + "logits/chosen": -43209225.14285714, + "logits/rejected": -35479430.4, + "logps/chosen": -427.35062081473217, + "logps/rejected": -489.30712890625, + "loss": 0.0385, + "rewards/chosen": 7.117788587297712, + "rewards/margins": 20.615838514055525, + "rewards/rejected": -13.498049926757812, + "step": 1009 + }, + { + "epoch": 0.2527211309896159, + "grad_norm": 1.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60640464.0, + "logits/rejected": -47734826.666666664, + "logps/chosen": -486.995361328125, + "logps/rejected": -481.5196940104167, + "loss": 0.018, + "rewards/chosen": 8.655935923258463, + "rewards/margins": 19.78811772664388, + "rewards/rejected": -11.132181803385416, + "step": 1010 + }, + { + "epoch": 0.2529713499311898, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54788080.0, + "logits/rejected": -48225640.0, + "logps/chosen": -419.5021057128906, + "logps/rejected": -506.8221435546875, + "loss": 0.0837, + "rewards/chosen": 5.512420177459717, + "rewards/margins": 16.09086561203003, + "rewards/rejected": -10.578445434570312, + "step": 1011 + }, + { + "epoch": 0.25322156887276365, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36486714.18181818, + "logits/rejected": -31619288.615384616, + "logps/chosen": -299.0135609019886, + "logps/rejected": -444.0518329326923, + "loss": 0.0769, + "rewards/chosen": 6.2726287841796875, + "rewards/margins": 18.588010347806488, + "rewards/rejected": -12.315381563626802, + "step": 1012 + }, + { + "epoch": 0.25347178781433755, + "grad_norm": 8.4375, + "kl": 6.553333282470703, + "learning_rate": 5e-06, + "logits/chosen": -25746574.769230768, + "logits/rejected": -59588596.36363637, + "logps/chosen": -334.5786884014423, + "logps/rejected": -781.0011541193181, + "loss": 0.0613, + "rewards/chosen": 5.6049969012920675, + "rewards/margins": 20.91585727504917, + "rewards/rejected": -15.310860373757102, + "step": 1013 + }, + { + "epoch": 0.25372200675591144, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23167282.666666668, + "logits/rejected": -52002032.0, + "logps/chosen": -444.2410888671875, + "logps/rejected": -652.1468098958334, + "loss": 0.0225, + "rewards/chosen": 6.999585469563802, + "rewards/margins": 21.186602274576824, + "rewards/rejected": -14.187016805013021, + "step": 1014 + }, + { + "epoch": 0.2539722256974853, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68860960.0, + "logits/rejected": -53725508.0, + "logps/chosen": -424.82183837890625, + "logps/rejected": -647.5213623046875, + "loss": 0.0549, + "rewards/chosen": 7.0070271492004395, + "rewards/margins": 16.34157419204712, + "rewards/rejected": -9.33454704284668, + "step": 1015 + }, + { + "epoch": 0.2542224446390592, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52634100.0, + "logits/rejected": -35175144.0, + "logps/chosen": -412.65069580078125, + "logps/rejected": -394.19256591796875, + "loss": 0.0104, + "rewards/chosen": 5.859254837036133, + "rewards/margins": 14.669235229492188, + "rewards/rejected": -8.809980392456055, + "step": 1016 + }, + { + "epoch": 0.25447266358063303, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42593801.14285714, + "logits/rejected": -81095033.6, + "logps/chosen": -365.44203404017856, + "logps/rejected": -580.0462890625, + "loss": 0.0514, + "rewards/chosen": 5.486855643136161, + "rewards/margins": 18.86259111676897, + "rewards/rejected": -13.375735473632812, + "step": 1017 + }, + { + "epoch": 0.25472288252220693, + "grad_norm": 1.6171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48885939.2, + "logits/rejected": -58672932.571428575, + "logps/chosen": -401.49375, + "logps/rejected": -572.3987862723214, + "loss": 0.0158, + "rewards/chosen": 7.813041687011719, + "rewards/margins": 19.08989061628069, + "rewards/rejected": -11.276848929268974, + "step": 1018 + }, + { + "epoch": 0.2549731014637808, + "grad_norm": 23.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42999747.2, + "logits/rejected": -64734381.71428572, + "logps/chosen": -353.2420654296875, + "logps/rejected": -599.8684430803571, + "loss": 0.0788, + "rewards/chosen": 4.258368301391601, + "rewards/margins": 14.02631481715611, + "rewards/rejected": -9.767946515764509, + "step": 1019 + }, + { + "epoch": 0.25522332040535467, + "grad_norm": 16.625, + "kl": 4.18208122253418, + "learning_rate": 5e-06, + "logits/chosen": -39622581.333333336, + "logits/rejected": -30896533.333333332, + "logps/chosen": -456.6374918619792, + "logps/rejected": -491.9557291666667, + "loss": 0.0654, + "rewards/chosen": 5.739707946777344, + "rewards/margins": 15.960919698079428, + "rewards/rejected": -10.221211751302084, + "step": 1020 + }, + { + "epoch": 0.25547353934692857, + "grad_norm": 7.40625, + "kl": 3.0132863521575928, + "learning_rate": 5e-06, + "logits/chosen": -62003580.0, + "logits/rejected": -46525796.0, + "logps/chosen": -523.8833618164062, + "logps/rejected": -671.1158447265625, + "loss": 0.0389, + "rewards/chosen": 6.62756872177124, + "rewards/margins": 20.664516925811768, + "rewards/rejected": -14.036948204040527, + "step": 1021 + }, + { + "epoch": 0.25572375828850247, + "grad_norm": 12.125, + "kl": 4.427289962768555, + "learning_rate": 5e-06, + "logits/chosen": -66106632.53333333, + "logits/rejected": -26632158.222222224, + "logps/chosen": -408.7776692708333, + "logps/rejected": -483.0852864583333, + "loss": 0.031, + "rewards/chosen": 7.477281188964843, + "rewards/margins": 15.351595391167535, + "rewards/rejected": -7.874314202202691, + "step": 1022 + }, + { + "epoch": 0.2559739772300763, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39644416.0, + "logits/rejected": -23809483.2, + "logps/chosen": -303.5419921875, + "logps/rejected": -339.08544921875, + "loss": 0.0668, + "rewards/chosen": 5.307792118617466, + "rewards/margins": 15.151117924281529, + "rewards/rejected": -9.843325805664062, + "step": 1023 + }, + { + "epoch": 0.2562241961716502, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74494576.0, + "logits/rejected": 7892221.5, + "logps/chosen": -463.3726501464844, + "logps/rejected": -538.4326171875, + "loss": 0.0133, + "rewards/chosen": 6.614655017852783, + "rewards/margins": 16.026517391204834, + "rewards/rejected": -9.41186237335205, + "step": 1024 + }, + { + "epoch": 0.25647441511322405, + "grad_norm": 6.125, + "kl": 6.1739325523376465, + "learning_rate": 5e-06, + "logits/chosen": -58289456.0, + "logits/rejected": -74404805.33333333, + "logps/chosen": -379.2535400390625, + "logps/rejected": -536.9324544270834, + "loss": 0.0592, + "rewards/chosen": 6.860469818115234, + "rewards/margins": 17.552379608154297, + "rewards/rejected": -10.691909790039062, + "step": 1025 + }, + { + "epoch": 0.25672463405479795, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61612708.0, + "logits/rejected": -46607440.0, + "logps/chosen": -281.4748840332031, + "logps/rejected": -813.1284790039062, + "loss": 0.0743, + "rewards/chosen": 2.569995164871216, + "rewards/margins": 16.324273347854614, + "rewards/rejected": -13.754278182983398, + "step": 1026 + }, + { + "epoch": 0.25697485299637185, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43085370.18181818, + "logits/rejected": -6887155.692307692, + "logps/chosen": -499.87748579545456, + "logps/rejected": -691.3734975961538, + "loss": 0.0338, + "rewards/chosen": 7.815723072398793, + "rewards/margins": 22.084843508847108, + "rewards/rejected": -14.269120436448317, + "step": 1027 + }, + { + "epoch": 0.2572250719379457, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39086007.27272727, + "logits/rejected": -39490820.92307692, + "logps/chosen": -322.6455078125, + "logps/rejected": -474.70132211538464, + "loss": 0.0516, + "rewards/chosen": 5.844175165349787, + "rewards/margins": 15.344976251775568, + "rewards/rejected": -9.500801086425781, + "step": 1028 + }, + { + "epoch": 0.2574752908795196, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42935284.36363637, + "logits/rejected": -32859200.0, + "logps/chosen": -307.62808504971593, + "logps/rejected": -616.8149038461538, + "loss": 0.0531, + "rewards/chosen": 6.581899469549006, + "rewards/margins": 18.02195707734648, + "rewards/rejected": -11.440057607797476, + "step": 1029 + }, + { + "epoch": 0.25772550982109343, + "grad_norm": 14.375, + "kl": 0.05431016534566879, + "learning_rate": 5e-06, + "logits/chosen": -66931792.0, + "logits/rejected": -46914684.0, + "logps/chosen": -437.85186767578125, + "logps/rejected": -441.6575927734375, + "loss": 0.0729, + "rewards/chosen": 7.840295791625977, + "rewards/margins": 13.40229320526123, + "rewards/rejected": -5.561997413635254, + "step": 1030 + }, + { + "epoch": 0.25797572876266733, + "grad_norm": 11.6875, + "kl": 4.4825873374938965, + "learning_rate": 5e-06, + "logits/chosen": -99679173.81818181, + "logits/rejected": -65518897.23076923, + "logps/chosen": -471.60933061079544, + "logps/rejected": -570.8533653846154, + "loss": 0.0261, + "rewards/chosen": 8.747844349254262, + "rewards/margins": 18.80046134895378, + "rewards/rejected": -10.05261699969952, + "step": 1031 + }, + { + "epoch": 0.25822594770424123, + "grad_norm": 11.625, + "kl": 12.704346656799316, + "learning_rate": 5e-06, + "logits/chosen": -47932327.384615384, + "logits/rejected": -22647476.363636363, + "logps/chosen": -416.32189002403845, + "logps/rejected": -610.4292436079545, + "loss": 0.065, + "rewards/chosen": 7.321493882399339, + "rewards/margins": 19.031692024711127, + "rewards/rejected": -11.71019814231179, + "step": 1032 + }, + { + "epoch": 0.2584761666458151, + "grad_norm": 32.25, + "kl": 10.949246406555176, + "learning_rate": 5e-06, + "logits/chosen": -16912742.4, + "logits/rejected": -85911637.33333333, + "logps/chosen": -460.91637369791664, + "logps/rejected": -540.9468315972222, + "loss": 0.0607, + "rewards/chosen": 7.205246988932291, + "rewards/margins": 16.048444281684027, + "rewards/rejected": -8.843197292751736, + "step": 1033 + }, + { + "epoch": 0.258726385587389, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40004640.0, + "logits/rejected": -54037641.84615385, + "logps/chosen": -383.189453125, + "logps/rejected": -512.6988055889423, + "loss": 0.0548, + "rewards/chosen": 5.053617304021662, + "rewards/margins": 14.073775338126229, + "rewards/rejected": -9.020158034104567, + "step": 1034 + }, + { + "epoch": 0.2589766045289628, + "grad_norm": 2.9375, + "kl": 3.688621997833252, + "learning_rate": 5e-06, + "logits/chosen": -47938101.333333336, + "logits/rejected": -74513685.33333333, + "logps/chosen": -408.0246175130208, + "logps/rejected": -539.6314290364584, + "loss": 0.0145, + "rewards/chosen": 7.235884348551433, + "rewards/margins": 17.9769287109375, + "rewards/rejected": -10.741044362386068, + "step": 1035 + }, + { + "epoch": 0.2592268234705367, + "grad_norm": 4.21875, + "kl": 5.445687770843506, + "learning_rate": 5e-06, + "logits/chosen": -49607470.93333333, + "logits/rejected": -53894122.666666664, + "logps/chosen": -400.16998697916665, + "logps/rejected": -577.62890625, + "loss": 0.0184, + "rewards/chosen": 7.655976867675781, + "rewards/margins": 20.3657962375217, + "rewards/rejected": -12.70981936984592, + "step": 1036 + }, + { + "epoch": 0.2594770424121106, + "grad_norm": 13.8125, + "kl": 15.88320255279541, + "learning_rate": 5e-06, + "logits/chosen": -60380708.0, + "logits/rejected": -53676040.0, + "logps/chosen": -373.0316162109375, + "logps/rejected": -722.8099975585938, + "loss": 0.0407, + "rewards/chosen": 7.708272933959961, + "rewards/margins": 22.90108299255371, + "rewards/rejected": -15.19281005859375, + "step": 1037 + }, + { + "epoch": 0.25972726135368446, + "grad_norm": 4.1875, + "kl": 3.2029032707214355, + "learning_rate": 5e-06, + "logits/chosen": -43271523.2, + "logits/rejected": -56064704.0, + "logps/chosen": -435.09599609375, + "logps/rejected": -612.1997767857143, + "loss": 0.0274, + "rewards/chosen": 6.463885498046875, + "rewards/margins": 16.702281188964843, + "rewards/rejected": -10.238395690917969, + "step": 1038 + }, + { + "epoch": 0.25997748029525836, + "grad_norm": 3.546875, + "kl": 0.4309489130973816, + "learning_rate": 5e-06, + "logits/chosen": -70473397.33333333, + "logits/rejected": -17723012.0, + "logps/chosen": -443.3926595052083, + "logps/rejected": -425.0060221354167, + "loss": 0.0089, + "rewards/chosen": 8.649250666300455, + "rewards/margins": 18.368024826049805, + "rewards/rejected": -9.71877415974935, + "step": 1039 + }, + { + "epoch": 0.26022769923683226, + "grad_norm": 14.6875, + "kl": 4.056169033050537, + "learning_rate": 5e-06, + "logits/chosen": -20390875.733333334, + "logits/rejected": -46449955.55555555, + "logps/chosen": -313.95384114583334, + "logps/rejected": -445.68866644965277, + "loss": 0.1157, + "rewards/chosen": 5.378410847981771, + "rewards/margins": 11.495064714219835, + "rewards/rejected": -6.116653866238064, + "step": 1040 + }, + { + "epoch": 0.2604779181784061, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37050550.15384615, + "logits/rejected": -30389207.272727273, + "logps/chosen": -322.15726412259613, + "logps/rejected": -507.79567649147725, + "loss": 0.0689, + "rewards/chosen": 3.795181861290565, + "rewards/margins": 15.502620937107327, + "rewards/rejected": -11.707439075816762, + "step": 1041 + }, + { + "epoch": 0.26072813711998, + "grad_norm": 10.5625, + "kl": 6.8166961669921875, + "learning_rate": 5e-06, + "logits/chosen": -48090980.571428575, + "logits/rejected": -9097024.0, + "logps/chosen": -340.4025181361607, + "logps/rejected": -489.7923828125, + "loss": 0.0716, + "rewards/chosen": 6.434259142194476, + "rewards/margins": 15.65297862461635, + "rewards/rejected": -9.218719482421875, + "step": 1042 + }, + { + "epoch": 0.26097835606155384, + "grad_norm": 4.6875, + "kl": 3.012301206588745, + "learning_rate": 5e-06, + "logits/chosen": -56313262.54545455, + "logits/rejected": -38509769.84615385, + "logps/chosen": -492.56840376420456, + "logps/rejected": -687.3157301682693, + "loss": 0.0123, + "rewards/chosen": 8.033505526455967, + "rewards/margins": 16.60095054786522, + "rewards/rejected": -8.567445021409254, + "step": 1043 + }, + { + "epoch": 0.26122857500312774, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40478364.0, + "logits/rejected": -50540128.0, + "logps/chosen": -303.23431396484375, + "logps/rejected": -593.4578857421875, + "loss": 0.0307, + "rewards/chosen": 5.1532745361328125, + "rewards/margins": 15.232929229736328, + "rewards/rejected": -10.079654693603516, + "step": 1044 + }, + { + "epoch": 0.26147879394470164, + "grad_norm": 9.625, + "kl": 3.0611419677734375, + "learning_rate": 5e-06, + "logits/chosen": -48648808.0, + "logits/rejected": -36076716.0, + "logps/chosen": -429.60162353515625, + "logps/rejected": -429.864501953125, + "loss": 0.0244, + "rewards/chosen": 9.218293190002441, + "rewards/margins": 16.823873043060303, + "rewards/rejected": -7.605579853057861, + "step": 1045 + }, + { + "epoch": 0.2617290128862755, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41144442.666666664, + "logits/rejected": -35357786.666666664, + "logps/chosen": -361.2045084635417, + "logps/rejected": -550.9252522786459, + "loss": 0.0701, + "rewards/chosen": 6.7840728759765625, + "rewards/margins": 15.464081446329752, + "rewards/rejected": -8.68000857035319, + "step": 1046 + }, + { + "epoch": 0.2619792318278494, + "grad_norm": 10.625, + "kl": 4.34995174407959, + "learning_rate": 5e-06, + "logits/chosen": -50915858.28571428, + "logits/rejected": -40179104.0, + "logps/chosen": -424.2108677455357, + "logps/rejected": -601.500390625, + "loss": 0.0304, + "rewards/chosen": 7.509768894740513, + "rewards/margins": 19.86227057320731, + "rewards/rejected": -12.352501678466798, + "step": 1047 + }, + { + "epoch": 0.2622294507694232, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58716384.0, + "logits/rejected": -53422901.333333336, + "logps/chosen": -410.5199381510417, + "logps/rejected": -452.4109700520833, + "loss": 0.0276, + "rewards/chosen": 7.995380401611328, + "rewards/margins": 17.340922673543297, + "rewards/rejected": -9.345542271931967, + "step": 1048 + }, + { + "epoch": 0.2624796697109971, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36150487.27272727, + "logits/rejected": -85648285.53846154, + "logps/chosen": -342.70869584517044, + "logps/rejected": -437.68765024038464, + "loss": 0.055, + "rewards/chosen": 6.717924638227983, + "rewards/margins": 14.550274562168788, + "rewards/rejected": -7.832349923940805, + "step": 1049 + }, + { + "epoch": 0.262729888652571, + "grad_norm": 4.8125, + "kl": 6.073253631591797, + "learning_rate": 5e-06, + "logits/chosen": -54816570.18181818, + "logits/rejected": -29863310.769230768, + "logps/chosen": -425.4305308948864, + "logps/rejected": -440.45015775240387, + "loss": 0.0246, + "rewards/chosen": 7.383992975408381, + "rewards/margins": 15.853181372155676, + "rewards/rejected": -8.469188396747295, + "step": 1050 + }, + { + "epoch": 0.26298010759414486, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24965732.0, + "logits/rejected": -22566512.0, + "logps/chosen": -395.2441101074219, + "logps/rejected": -617.1180419921875, + "loss": 0.0482, + "rewards/chosen": 6.644718170166016, + "rewards/margins": 19.166110038757324, + "rewards/rejected": -12.521391868591309, + "step": 1051 + }, + { + "epoch": 0.26323032653571876, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66122944.0, + "logits/rejected": -46419668.0, + "logps/chosen": -561.966552734375, + "logps/rejected": -351.14605712890625, + "loss": 0.0287, + "rewards/chosen": 8.718419075012207, + "rewards/margins": 15.356658935546875, + "rewards/rejected": -6.638239860534668, + "step": 1052 + }, + { + "epoch": 0.2634805454772926, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53186496.0, + "logits/rejected": -39044332.307692304, + "logps/chosen": -390.18678977272725, + "logps/rejected": -504.33980618990387, + "loss": 0.0645, + "rewards/chosen": 6.3116982199928975, + "rewards/margins": 16.40342701731862, + "rewards/rejected": -10.091728797325722, + "step": 1053 + }, + { + "epoch": 0.2637307644188665, + "grad_norm": 12.0, + "kl": 1.1019856929779053, + "learning_rate": 5e-06, + "logits/chosen": -39770276.571428575, + "logits/rejected": -51058931.2, + "logps/chosen": -320.23158482142856, + "logps/rejected": -565.858984375, + "loss": 0.0543, + "rewards/chosen": 5.458232334681919, + "rewards/margins": 15.545704868861606, + "rewards/rejected": -10.087472534179687, + "step": 1054 + }, + { + "epoch": 0.2639809833604404, + "grad_norm": 21.875, + "kl": 13.707293510437012, + "learning_rate": 5e-06, + "logits/chosen": -65360053.333333336, + "logits/rejected": -24593626.666666668, + "logps/chosen": -396.2681477864583, + "logps/rejected": -519.4673258463541, + "loss": 0.1107, + "rewards/chosen": 6.441303253173828, + "rewards/margins": 15.619555155436197, + "rewards/rejected": -9.17825190226237, + "step": 1055 + }, + { + "epoch": 0.26423120230201425, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37330126.54545455, + "logits/rejected": -35687133.538461536, + "logps/chosen": -412.78493430397725, + "logps/rejected": -454.18637319711536, + "loss": 0.0199, + "rewards/chosen": 6.795394203879616, + "rewards/margins": 15.800334530276853, + "rewards/rejected": -9.004940326397236, + "step": 1056 + }, + { + "epoch": 0.26448142124358814, + "grad_norm": 3.703125, + "kl": 1.5879911184310913, + "learning_rate": 5e-06, + "logits/chosen": -55692164.571428575, + "logits/rejected": -42299350.4, + "logps/chosen": -464.7675083705357, + "logps/rejected": -541.408203125, + "loss": 0.0171, + "rewards/chosen": 8.523343222481865, + "rewards/margins": 20.877652304513113, + "rewards/rejected": -12.35430908203125, + "step": 1057 + }, + { + "epoch": 0.26473164018516204, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12121508.0, + "logits/rejected": -40629152.0, + "logps/chosen": -302.60715738932294, + "logps/rejected": -526.6312934027778, + "loss": 0.03, + "rewards/chosen": 3.363840103149414, + "rewards/margins": 15.145866605970594, + "rewards/rejected": -11.78202650282118, + "step": 1058 + }, + { + "epoch": 0.2649818591267359, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33326417.066666666, + "logits/rejected": -39347768.88888889, + "logps/chosen": -460.6977213541667, + "logps/rejected": -626.7797309027778, + "loss": 0.051, + "rewards/chosen": 6.554227701822916, + "rewards/margins": 18.729056972927516, + "rewards/rejected": -12.174829271104601, + "step": 1059 + }, + { + "epoch": 0.2652320780683098, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58585731.55555555, + "logits/rejected": -53787080.53333333, + "logps/chosen": -470.43901909722223, + "logps/rejected": -597.9861979166667, + "loss": 0.0431, + "rewards/chosen": 8.312078687879774, + "rewards/margins": 21.554023064507376, + "rewards/rejected": -13.241944376627604, + "step": 1060 + }, + { + "epoch": 0.2654822970098836, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60873498.35294118, + "logits/rejected": -43994779.428571425, + "logps/chosen": -537.8562729779412, + "logps/rejected": -504.58963448660717, + "loss": 0.0407, + "rewards/chosen": 6.962846194996553, + "rewards/margins": 21.59422218899767, + "rewards/rejected": -14.631375994001116, + "step": 1061 + }, + { + "epoch": 0.2657325159514575, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48586210.461538464, + "logits/rejected": -52390033.45454545, + "logps/chosen": -482.6952373798077, + "logps/rejected": -470.1321910511364, + "loss": 0.008, + "rewards/chosen": 7.405115567720854, + "rewards/margins": 18.144474643093723, + "rewards/rejected": -10.73935907537287, + "step": 1062 + }, + { + "epoch": 0.2659827348930314, + "grad_norm": 12.4375, + "kl": 12.692652702331543, + "learning_rate": 5e-06, + "logits/chosen": -49571694.93333333, + "logits/rejected": -33375488.0, + "logps/chosen": -416.4013671875, + "logps/rejected": -689.6477864583334, + "loss": 0.0377, + "rewards/chosen": 7.510750325520833, + "rewards/margins": 24.81729770236545, + "rewards/rejected": -17.30654737684462, + "step": 1063 + }, + { + "epoch": 0.26623295383460527, + "grad_norm": 4.4375, + "kl": 2.9872589111328125, + "learning_rate": 5e-06, + "logits/chosen": -30440634.181818184, + "logits/rejected": -40265137.23076923, + "logps/chosen": -372.70676491477275, + "logps/rejected": -601.6565880408654, + "loss": 0.0238, + "rewards/chosen": 6.4228057861328125, + "rewards/margins": 19.749731210561897, + "rewards/rejected": -13.326925424429087, + "step": 1064 + }, + { + "epoch": 0.26648317277617917, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32604874.666666668, + "logits/rejected": -22527709.333333332, + "logps/chosen": -380.1713460286458, + "logps/rejected": -573.5654296875, + "loss": 0.0531, + "rewards/chosen": 4.891154607137044, + "rewards/margins": 16.450868606567383, + "rewards/rejected": -11.559713999430338, + "step": 1065 + }, + { + "epoch": 0.266733391717753, + "grad_norm": 5.4375, + "kl": 0.3885812759399414, + "learning_rate": 5e-06, + "logits/chosen": -61228234.666666664, + "logits/rejected": -74673984.0, + "logps/chosen": -355.7219645182292, + "logps/rejected": -584.5227864583334, + "loss": 0.0605, + "rewards/chosen": 4.82704480489095, + "rewards/margins": 17.867760340372723, + "rewards/rejected": -13.040715535481771, + "step": 1066 + }, + { + "epoch": 0.2669836106593269, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55856928.0, + "logits/rejected": 46298482.28571428, + "logps/chosen": -384.7314453125, + "logps/rejected": -582.9377790178571, + "loss": 0.0975, + "rewards/chosen": 5.264641571044922, + "rewards/margins": 16.17779835292271, + "rewards/rejected": -10.91315678187779, + "step": 1067 + }, + { + "epoch": 0.2672338296009008, + "grad_norm": 6.96875, + "kl": 1.9263758659362793, + "learning_rate": 5e-06, + "logits/chosen": -49573779.692307696, + "logits/rejected": -26875904.0, + "logps/chosen": -344.23106971153845, + "logps/rejected": -522.1237571022727, + "loss": 0.1146, + "rewards/chosen": 4.320041363055889, + "rewards/margins": 15.965288388979184, + "rewards/rejected": -11.645247025923295, + "step": 1068 + }, + { + "epoch": 0.26748404854247465, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41251515.428571425, + "logits/rejected": -73711545.6, + "logps/chosen": -323.14505440848217, + "logps/rejected": -698.8765625, + "loss": 0.0499, + "rewards/chosen": 5.300013405936105, + "rewards/margins": 17.88540235246931, + "rewards/rejected": -12.585388946533204, + "step": 1069 + }, + { + "epoch": 0.26773426748404855, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76435219.2, + "logits/rejected": -59747597.473684214, + "logps/chosen": -412.98466796875, + "logps/rejected": -595.5314555921053, + "loss": 0.073, + "rewards/chosen": 5.491269302368164, + "rewards/margins": 16.93192626551578, + "rewards/rejected": -11.440656963147616, + "step": 1070 + }, + { + "epoch": 0.2679844864256224, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70582061.71428572, + "logits/rejected": -28251091.2, + "logps/chosen": -448.77926199776783, + "logps/rejected": -716.748291015625, + "loss": 0.0508, + "rewards/chosen": 6.677077157156808, + "rewards/margins": 16.48286328996931, + "rewards/rejected": -9.8057861328125, + "step": 1071 + }, + { + "epoch": 0.2682347053671963, + "grad_norm": 13.4375, + "kl": 6.2189483642578125, + "learning_rate": 5e-06, + "logits/chosen": -51928312.47058824, + "logits/rejected": -746270.8571428572, + "logps/chosen": -422.0572150735294, + "logps/rejected": -480.60501534598217, + "loss": 0.0432, + "rewards/chosen": 7.531087538775275, + "rewards/margins": 20.113306253898042, + "rewards/rejected": -12.582218715122767, + "step": 1072 + }, + { + "epoch": 0.2684849243087702, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42603328.0, + "logits/rejected": -48993929.14285714, + "logps/chosen": -493.55888671875, + "logps/rejected": -410.38058035714283, + "loss": 0.0392, + "rewards/chosen": 7.736805725097656, + "rewards/margins": 16.214313180106025, + "rewards/rejected": -8.47750745500837, + "step": 1073 + }, + { + "epoch": 0.26873514325034403, + "grad_norm": 4.21875, + "kl": 5.50734281539917, + "learning_rate": 5e-06, + "logits/chosen": -64664249.6, + "logits/rejected": -36441474.28571428, + "logps/chosen": -498.4115234375, + "logps/rejected": -419.52852957589283, + "loss": 0.0219, + "rewards/chosen": 7.686170959472657, + "rewards/margins": 15.24597396850586, + "rewards/rejected": -7.559803009033203, + "step": 1074 + }, + { + "epoch": 0.26898536219191793, + "grad_norm": 18.75, + "kl": 1.8065681457519531, + "learning_rate": 5e-06, + "logits/chosen": -80071461.33333333, + "logits/rejected": -58035962.666666664, + "logps/chosen": -518.0250244140625, + "logps/rejected": -723.7351888020834, + "loss": 0.0535, + "rewards/chosen": 6.1612599690755205, + "rewards/margins": 19.24044672648112, + "rewards/rejected": -13.0791867574056, + "step": 1075 + }, + { + "epoch": 0.26923558113349183, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37960340.36363637, + "logits/rejected": -52759522.461538464, + "logps/chosen": -390.21067116477275, + "logps/rejected": -472.46142578125, + "loss": 0.0286, + "rewards/chosen": 6.200856295498935, + "rewards/margins": 16.18916102055903, + "rewards/rejected": -9.988304725060097, + "step": 1076 + }, + { + "epoch": 0.2694858000750657, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75481536.0, + "logits/rejected": -38179432.0, + "logps/chosen": -362.4228210449219, + "logps/rejected": -484.3741149902344, + "loss": 0.0905, + "rewards/chosen": 5.8872971534729, + "rewards/margins": 16.126540660858154, + "rewards/rejected": -10.239243507385254, + "step": 1077 + }, + { + "epoch": 0.26973601901663957, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6303317.333333333, + "logits/rejected": -40102544.0, + "logps/chosen": -426.8374837239583, + "logps/rejected": -507.0885009765625, + "loss": 0.0307, + "rewards/chosen": 6.030133565266927, + "rewards/margins": 16.44853146870931, + "rewards/rejected": -10.418397903442383, + "step": 1078 + }, + { + "epoch": 0.2699862379582134, + "grad_norm": 12.5625, + "kl": 1.014570951461792, + "learning_rate": 5e-06, + "logits/chosen": -30062451.2, + "logits/rejected": -5822384.444444444, + "logps/chosen": -414.942578125, + "logps/rejected": -456.8699001736111, + "loss": 0.0672, + "rewards/chosen": 6.2477060953776045, + "rewards/margins": 13.918084038628471, + "rewards/rejected": -7.670377943250868, + "step": 1079 + }, + { + "epoch": 0.2702364568997873, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41193900.8, + "logits/rejected": -54204164.571428575, + "logps/chosen": -289.3846435546875, + "logps/rejected": -428.8517368861607, + "loss": 0.0635, + "rewards/chosen": 6.360298156738281, + "rewards/margins": 13.527463095528738, + "rewards/rejected": -7.1671649387904575, + "step": 1080 + }, + { + "epoch": 0.2704866758413612, + "grad_norm": 8.625, + "kl": 8.937373161315918, + "learning_rate": 5e-06, + "logits/chosen": -68271445.33333333, + "logits/rejected": -57289600.0, + "logps/chosen": -467.50169270833334, + "logps/rejected": -483.86094835069446, + "loss": 0.0598, + "rewards/chosen": 8.355076599121094, + "rewards/margins": 16.197863599989148, + "rewards/rejected": -7.842787000868055, + "step": 1081 + }, + { + "epoch": 0.27073689478293506, + "grad_norm": 6.28125, + "kl": 6.583486080169678, + "learning_rate": 5e-06, + "logits/chosen": -48324998.4, + "logits/rejected": -64252214.85714286, + "logps/chosen": -400.69033203125, + "logps/rejected": -538.2389787946429, + "loss": 0.08, + "rewards/chosen": 8.198338317871094, + "rewards/margins": 16.923764038085938, + "rewards/rejected": -8.725425720214844, + "step": 1082 + }, + { + "epoch": 0.27098711372450895, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33818705.45454545, + "logits/rejected": -37354188.307692304, + "logps/chosen": -414.52778764204544, + "logps/rejected": -622.1455453725962, + "loss": 0.0267, + "rewards/chosen": 6.887436606667259, + "rewards/margins": 17.916585588788653, + "rewards/rejected": -11.029148982121395, + "step": 1083 + }, + { + "epoch": 0.2712373326660828, + "grad_norm": 11.1875, + "kl": 2.986737012863159, + "learning_rate": 5e-06, + "logits/chosen": -28127161.14285714, + "logits/rejected": -32415081.6, + "logps/chosen": -407.00258091517856, + "logps/rejected": -570.648828125, + "loss": 0.0454, + "rewards/chosen": 7.337047576904297, + "rewards/margins": 18.68454818725586, + "rewards/rejected": -11.347500610351563, + "step": 1084 + }, + { + "epoch": 0.2714875516076567, + "grad_norm": 3.09375, + "kl": 1.8498420715332031, + "learning_rate": 5e-06, + "logits/chosen": -51871883.63636363, + "logits/rejected": -58128162.461538464, + "logps/chosen": -399.17338423295456, + "logps/rejected": -598.6005108173077, + "loss": 0.0145, + "rewards/chosen": 6.610595703125, + "rewards/margins": 18.59569138746995, + "rewards/rejected": -11.985095684344952, + "step": 1085 + }, + { + "epoch": 0.2717377705492306, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65536170.666666664, + "logits/rejected": -60796364.8, + "logps/chosen": -428.63525390625, + "logps/rejected": -625.8037109375, + "loss": 0.0286, + "rewards/chosen": 7.138812594943577, + "rewards/margins": 17.146891615125867, + "rewards/rejected": -10.008079020182292, + "step": 1086 + }, + { + "epoch": 0.27198798949080444, + "grad_norm": 1.9453125, + "kl": 0.1335500180721283, + "learning_rate": 5e-06, + "logits/chosen": -67314986.66666667, + "logits/rejected": -68208704.0, + "logps/chosen": -448.6257731119792, + "logps/rejected": -595.0028076171875, + "loss": 0.0236, + "rewards/chosen": 8.418085734049479, + "rewards/margins": 19.364306131998696, + "rewards/rejected": -10.946220397949219, + "step": 1087 + }, + { + "epoch": 0.27223820843237834, + "grad_norm": 11.9375, + "kl": 2.4781596660614014, + "learning_rate": 5e-06, + "logits/chosen": -32897909.333333332, + "logits/rejected": -50726010.666666664, + "logps/chosen": -341.4776611328125, + "logps/rejected": -386.9042154947917, + "loss": 0.0575, + "rewards/chosen": 5.447121302286784, + "rewards/margins": 12.173222223917644, + "rewards/rejected": -6.726100921630859, + "step": 1088 + }, + { + "epoch": 0.27248842737395224, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65734038.85714286, + "logits/rejected": -37282688.0, + "logps/chosen": -460.634765625, + "logps/rejected": -557.724462890625, + "loss": 0.0652, + "rewards/chosen": 6.275933946881976, + "rewards/margins": 17.17930613926479, + "rewards/rejected": -10.903372192382813, + "step": 1089 + }, + { + "epoch": 0.2727386463155261, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53089733.81818182, + "logits/rejected": -52116263.384615384, + "logps/chosen": -341.99471768465907, + "logps/rejected": -592.1624474158654, + "loss": 0.0477, + "rewards/chosen": 5.92734805020419, + "rewards/margins": 17.439280423251066, + "rewards/rejected": -11.511932373046875, + "step": 1090 + }, + { + "epoch": 0.2729888652571, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25469912.0, + "logits/rejected": -56716392.0, + "logps/chosen": -281.33465576171875, + "logps/rejected": -462.2007141113281, + "loss": 0.0713, + "rewards/chosen": 5.469954490661621, + "rewards/margins": 14.639933586120605, + "rewards/rejected": -9.169979095458984, + "step": 1091 + }, + { + "epoch": 0.2732390841986738, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42062903.46666667, + "logits/rejected": -56085603.55555555, + "logps/chosen": -268.9640299479167, + "logps/rejected": -600.7552083333334, + "loss": 0.0727, + "rewards/chosen": 4.227912902832031, + "rewards/margins": 18.390953063964844, + "rewards/rejected": -14.163040161132812, + "step": 1092 + }, + { + "epoch": 0.2734893031402477, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69614899.2, + "logits/rejected": -55605156.571428575, + "logps/chosen": -459.2021484375, + "logps/rejected": -469.44960239955356, + "loss": 0.0215, + "rewards/chosen": 6.567465209960938, + "rewards/margins": 16.48907645089286, + "rewards/rejected": -9.92161124093192, + "step": 1093 + }, + { + "epoch": 0.2737395220818216, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3467280.0, + "logits/rejected": -56018474.666666664, + "logps/chosen": -385.5160807291667, + "logps/rejected": -762.1418185763889, + "loss": 0.0375, + "rewards/chosen": 7.350739542643229, + "rewards/margins": 24.25840861002604, + "rewards/rejected": -16.907669067382812, + "step": 1094 + }, + { + "epoch": 0.27398974102339546, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41002224.0, + "logits/rejected": -46760886.85714286, + "logps/chosen": -317.456640625, + "logps/rejected": -442.14481026785717, + "loss": 0.0619, + "rewards/chosen": 5.158559036254883, + "rewards/margins": 15.3337582724435, + "rewards/rejected": -10.175199236188616, + "step": 1095 + }, + { + "epoch": 0.27423995996496936, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66423336.0, + "logits/rejected": 15578939.0, + "logps/chosen": -598.3119506835938, + "logps/rejected": -538.156005859375, + "loss": 0.0326, + "rewards/chosen": 9.851755142211914, + "rewards/margins": 20.736827850341797, + "rewards/rejected": -10.885072708129883, + "step": 1096 + }, + { + "epoch": 0.2744901789065432, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56380012.0, + "logits/rejected": -45252272.0, + "logps/chosen": -360.52008056640625, + "logps/rejected": -775.1851196289062, + "loss": 0.0491, + "rewards/chosen": 4.268970966339111, + "rewards/margins": 20.097017765045166, + "rewards/rejected": -15.828046798706055, + "step": 1097 + }, + { + "epoch": 0.2747403978481171, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42099539.2, + "logits/rejected": -70175158.85714285, + "logps/chosen": -432.8919921875, + "logps/rejected": -546.2735072544643, + "loss": 0.0328, + "rewards/chosen": 6.726247406005859, + "rewards/margins": 15.323533085414342, + "rewards/rejected": -8.597285679408483, + "step": 1098 + }, + { + "epoch": 0.274990616789691, + "grad_norm": 16.0, + "kl": 1.2806282043457031, + "learning_rate": 5e-06, + "logits/chosen": -55991517.86666667, + "logits/rejected": -13366439.111111112, + "logps/chosen": -308.357421875, + "logps/rejected": -559.4840494791666, + "loss": 0.0504, + "rewards/chosen": 5.396641540527344, + "rewards/margins": 18.67631564670139, + "rewards/rejected": -13.279674106174046, + "step": 1099 + }, + { + "epoch": 0.27524083573126484, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57582818.13333333, + "logits/rejected": -673201.7777777778, + "logps/chosen": -400.09137369791665, + "logps/rejected": -696.7108289930555, + "loss": 0.0349, + "rewards/chosen": 6.75584716796875, + "rewards/margins": 24.74800075954861, + "rewards/rejected": -17.99215359157986, + "step": 1100 + }, + { + "epoch": 0.27549105467283874, + "grad_norm": 30.625, + "kl": 0.6924750208854675, + "learning_rate": 5e-06, + "logits/chosen": -27497136.0, + "logits/rejected": -29163402.0, + "logps/chosen": -348.93359375, + "logps/rejected": -368.3362121582031, + "loss": 0.1269, + "rewards/chosen": 3.9706897735595703, + "rewards/margins": 14.764211654663086, + "rewards/rejected": -10.793521881103516, + "step": 1101 + }, + { + "epoch": 0.2757412736144126, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50458805.333333336, + "logits/rejected": -67763530.66666667, + "logps/chosen": -436.042724609375, + "logps/rejected": -732.9457194010416, + "loss": 0.0332, + "rewards/chosen": 7.310827891031901, + "rewards/margins": 22.272210439046223, + "rewards/rejected": -14.961382548014322, + "step": 1102 + }, + { + "epoch": 0.2759914925559865, + "grad_norm": 11.375, + "kl": 9.376516342163086, + "learning_rate": 5e-06, + "logits/chosen": -47450645.333333336, + "logits/rejected": -84347980.8, + "logps/chosen": -406.51036241319446, + "logps/rejected": -679.2341796875, + "loss": 0.095, + "rewards/chosen": 8.338723076714409, + "rewards/margins": 21.0160159640842, + "rewards/rejected": -12.677292887369791, + "step": 1103 + }, + { + "epoch": 0.2762417114975604, + "grad_norm": 17.625, + "kl": 0.4985329508781433, + "learning_rate": 5e-06, + "logits/chosen": -58128019.692307696, + "logits/rejected": -50909597.09090909, + "logps/chosen": -375.994140625, + "logps/rejected": -634.8025568181819, + "loss": 0.0619, + "rewards/chosen": 4.677800692044771, + "rewards/margins": 16.36170516647659, + "rewards/rejected": -11.683904474431818, + "step": 1104 + }, + { + "epoch": 0.2764919304391342, + "grad_norm": 20.875, + "kl": 2.386707305908203, + "learning_rate": 5e-06, + "logits/chosen": -38971032.0, + "logits/rejected": -35238500.0, + "logps/chosen": -517.1376342773438, + "logps/rejected": -600.728759765625, + "loss": 0.048, + "rewards/chosen": 5.9778876304626465, + "rewards/margins": 14.879157543182373, + "rewards/rejected": -8.901269912719727, + "step": 1105 + }, + { + "epoch": 0.2767421493807081, + "grad_norm": 18.5, + "kl": 1.0896530151367188, + "learning_rate": 5e-06, + "logits/chosen": -69047163.07692307, + "logits/rejected": -33687758.54545455, + "logps/chosen": -485.11527193509613, + "logps/rejected": -600.8192471590909, + "loss": 0.0497, + "rewards/chosen": 7.161865234375, + "rewards/margins": 18.86987165971236, + "rewards/rejected": -11.708006425337357, + "step": 1106 + }, + { + "epoch": 0.276992368322282, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57469159.384615384, + "logits/rejected": -63527133.09090909, + "logps/chosen": -482.9043719951923, + "logps/rejected": -649.0275213068181, + "loss": 0.0247, + "rewards/chosen": 7.538060701810396, + "rewards/margins": 19.453976451100168, + "rewards/rejected": -11.915915749289773, + "step": 1107 + }, + { + "epoch": 0.27724258726385587, + "grad_norm": 25.5, + "kl": 0.8959074020385742, + "learning_rate": 5e-06, + "logits/chosen": -63813358.93333333, + "logits/rejected": -71857265.77777778, + "logps/chosen": -440.74720052083336, + "logps/rejected": -692.888671875, + "loss": 0.0772, + "rewards/chosen": 5.711973571777344, + "rewards/margins": 17.683696492513022, + "rewards/rejected": -11.971722920735678, + "step": 1108 + }, + { + "epoch": 0.27749280620542977, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41586064.0, + "logits/rejected": -29834989.714285713, + "logps/chosen": -456.718115234375, + "logps/rejected": -563.10986328125, + "loss": 0.0284, + "rewards/chosen": 6.393958663940429, + "rewards/margins": 16.333298437935966, + "rewards/rejected": -9.939339773995536, + "step": 1109 + }, + { + "epoch": 0.2777430251470036, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83759024.0, + "logits/rejected": -74702080.0, + "logps/chosen": -425.6722005208333, + "logps/rejected": -599.1381022135416, + "loss": 0.0148, + "rewards/chosen": 7.529047012329102, + "rewards/margins": 18.031349182128906, + "rewards/rejected": -10.502302169799805, + "step": 1110 + }, + { + "epoch": 0.2779932440885775, + "grad_norm": 3.359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49951896.88888889, + "logits/rejected": -30285772.8, + "logps/chosen": -428.3562282986111, + "logps/rejected": -510.3357421875, + "loss": 0.0186, + "rewards/chosen": 6.390851762559679, + "rewards/margins": 17.234177568223743, + "rewards/rejected": -10.843325805664062, + "step": 1111 + }, + { + "epoch": 0.2782434630301514, + "grad_norm": 4.5, + "kl": 3.721956253051758, + "learning_rate": 5e-06, + "logits/chosen": -71663616.0, + "logits/rejected": -63565701.81818182, + "logps/chosen": -577.8170823317307, + "logps/rejected": -530.6829723011364, + "loss": 0.0284, + "rewards/chosen": 8.188385009765625, + "rewards/margins": 16.593276283957742, + "rewards/rejected": -8.404891274192117, + "step": 1112 + }, + { + "epoch": 0.27849368197172525, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57338761.14285714, + "logits/rejected": -11805608.8, + "logps/chosen": -384.8462611607143, + "logps/rejected": -459.12490234375, + "loss": 0.0088, + "rewards/chosen": 6.566524505615234, + "rewards/margins": 14.983558654785156, + "rewards/rejected": -8.417034149169922, + "step": 1113 + }, + { + "epoch": 0.27874390091329915, + "grad_norm": 11.4375, + "kl": 10.931575775146484, + "learning_rate": 5e-06, + "logits/chosen": -64109453.71428572, + "logits/rejected": -73537651.2, + "logps/chosen": -525.2949916294643, + "logps/rejected": -424.23046875, + "loss": 0.0329, + "rewards/chosen": 7.816885811941964, + "rewards/margins": 15.793410164969309, + "rewards/rejected": -7.976524353027344, + "step": 1114 + }, + { + "epoch": 0.278994119854873, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35641701.333333336, + "logits/rejected": -41233922.666666664, + "logps/chosen": -297.7326253255208, + "logps/rejected": -481.4709065755208, + "loss": 0.0485, + "rewards/chosen": 5.158966064453125, + "rewards/margins": 16.710957845052086, + "rewards/rejected": -11.551991780598959, + "step": 1115 + }, + { + "epoch": 0.2792443387964469, + "grad_norm": 1.0234375, + "kl": 5.5768256187438965, + "learning_rate": 5e-06, + "logits/chosen": -55892608.0, + "logits/rejected": -60655173.81818182, + "logps/chosen": -539.2142052283654, + "logps/rejected": -671.0494939630681, + "loss": 0.0022, + "rewards/chosen": 8.648090069110577, + "rewards/margins": 21.435173088020377, + "rewards/rejected": -12.7870830189098, + "step": 1116 + }, + { + "epoch": 0.2794945577380208, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60317222.4, + "logits/rejected": -35313513.14285714, + "logps/chosen": -341.6595703125, + "logps/rejected": -375.9506138392857, + "loss": 0.0929, + "rewards/chosen": 5.1135601043701175, + "rewards/margins": 12.600681686401368, + "rewards/rejected": -7.48712158203125, + "step": 1117 + }, + { + "epoch": 0.27974477667959463, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64883498.666666664, + "logits/rejected": -78907754.66666667, + "logps/chosen": -443.2113037109375, + "logps/rejected": -552.0526123046875, + "loss": 0.018, + "rewards/chosen": 7.160554885864258, + "rewards/margins": 19.559404373168945, + "rewards/rejected": -12.398849487304688, + "step": 1118 + }, + { + "epoch": 0.27999499562116853, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44881655.27272727, + "logits/rejected": -8537550.76923077, + "logps/chosen": -447.947265625, + "logps/rejected": -672.7516526442307, + "loss": 0.0229, + "rewards/chosen": 6.872282548384233, + "rewards/margins": 21.806531252560916, + "rewards/rejected": -14.934248704176683, + "step": 1119 + }, + { + "epoch": 0.2802452145627424, + "grad_norm": 8.875, + "kl": 0.3929786682128906, + "learning_rate": 5e-06, + "logits/chosen": -45486376.72727273, + "logits/rejected": -21242176.0, + "logps/chosen": -390.81356534090907, + "logps/rejected": -469.76900540865387, + "loss": 0.0495, + "rewards/chosen": 5.598920995538885, + "rewards/margins": 15.924116308038885, + "rewards/rejected": -10.3251953125, + "step": 1120 + }, + { + "epoch": 0.28049543350431627, + "grad_norm": 23.75, + "kl": 22.835674285888672, + "learning_rate": 5e-06, + "logits/chosen": -49082532.0, + "logits/rejected": -42672644.0, + "logps/chosen": -392.3042907714844, + "logps/rejected": -348.0960998535156, + "loss": 0.1745, + "rewards/chosen": 6.786016464233398, + "rewards/margins": 14.288293838500977, + "rewards/rejected": -7.502277374267578, + "step": 1121 + }, + { + "epoch": 0.28074565244589017, + "grad_norm": 13.5, + "kl": 10.35714340209961, + "learning_rate": 5e-06, + "logits/chosen": -45470136.0, + "logits/rejected": -23907364.0, + "logps/chosen": -535.1232299804688, + "logps/rejected": -396.4907531738281, + "loss": 0.0315, + "rewards/chosen": 6.495186805725098, + "rewards/margins": 14.931267738342285, + "rewards/rejected": -8.436080932617188, + "step": 1122 + }, + { + "epoch": 0.280995871387464, + "grad_norm": 7.40625, + "kl": 5.8620924949646, + "learning_rate": 5e-06, + "logits/chosen": -20827218.82352941, + "logits/rejected": -55563227.428571425, + "logps/chosen": -456.75178079044116, + "logps/rejected": -754.9725167410714, + "loss": 0.0192, + "rewards/chosen": 7.6631308162913605, + "rewards/margins": 21.92915908428801, + "rewards/rejected": -14.266028267996651, + "step": 1123 + }, + { + "epoch": 0.2812460903290379, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28628236.307692308, + "logits/rejected": -44597704.72727273, + "logps/chosen": -425.4797175480769, + "logps/rejected": -522.7269176136364, + "loss": 0.0373, + "rewards/chosen": 5.29712148813101, + "rewards/margins": 14.073091947115385, + "rewards/rejected": -8.775970458984375, + "step": 1124 + }, + { + "epoch": 0.2814963092706118, + "grad_norm": 7.0, + "kl": 1.6981037855148315, + "learning_rate": 5e-06, + "logits/chosen": -43891998.11764706, + "logits/rejected": -55080987.428571425, + "logps/chosen": -351.44617417279414, + "logps/rejected": -472.8842075892857, + "loss": 0.0655, + "rewards/chosen": 5.352763905244715, + "rewards/margins": 15.160261202259225, + "rewards/rejected": -9.807497297014509, + "step": 1125 + }, + { + "epoch": 0.28174652821218565, + "grad_norm": 5.78125, + "kl": 4.999650478363037, + "learning_rate": 5e-06, + "logits/chosen": -52355825.23076923, + "logits/rejected": -29107421.09090909, + "logps/chosen": -442.4424579326923, + "logps/rejected": -578.0192649147727, + "loss": 0.0116, + "rewards/chosen": 6.517998915452224, + "rewards/margins": 17.675476554390436, + "rewards/rejected": -11.15747763893821, + "step": 1126 + }, + { + "epoch": 0.28199674715375955, + "grad_norm": 2.828125, + "kl": 2.381016254425049, + "learning_rate": 5e-06, + "logits/chosen": -39125462.85714286, + "logits/rejected": -65015564.8, + "logps/chosen": -394.45511300223217, + "logps/rejected": -309.7927978515625, + "loss": 0.0367, + "rewards/chosen": 7.08624267578125, + "rewards/margins": 15.39000244140625, + "rewards/rejected": -8.303759765625, + "step": 1127 + }, + { + "epoch": 0.2822469660953334, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42608813.333333336, + "logits/rejected": -39809245.333333336, + "logps/chosen": -308.09075927734375, + "logps/rejected": -623.582763671875, + "loss": 0.0235, + "rewards/chosen": 6.037115097045898, + "rewards/margins": 19.856931686401367, + "rewards/rejected": -13.819816589355469, + "step": 1128 + }, + { + "epoch": 0.2824971850369073, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55244179.692307696, + "logits/rejected": -33944610.90909091, + "logps/chosen": -439.1571514423077, + "logps/rejected": -503.52370383522725, + "loss": 0.03, + "rewards/chosen": 7.263636662409856, + "rewards/margins": 15.412269752342386, + "rewards/rejected": -8.14863308993253, + "step": 1129 + }, + { + "epoch": 0.2827474039784812, + "grad_norm": 14.1875, + "kl": 3.562600612640381, + "learning_rate": 5e-06, + "logits/chosen": -37833376.0, + "logits/rejected": -37308688.0, + "logps/chosen": -436.9351501464844, + "logps/rejected": -576.4046630859375, + "loss": 0.0425, + "rewards/chosen": 6.774343013763428, + "rewards/margins": 19.837724208831787, + "rewards/rejected": -13.06338119506836, + "step": 1130 + }, + { + "epoch": 0.28299762292005504, + "grad_norm": 6.9375, + "kl": 0.6583760976791382, + "learning_rate": 5e-06, + "logits/chosen": -17543952.0, + "logits/rejected": -43305002.666666664, + "logps/chosen": -411.9682210286458, + "logps/rejected": -449.27587890625, + "loss": 0.0403, + "rewards/chosen": 7.3061097462972, + "rewards/margins": 15.605206807454426, + "rewards/rejected": -8.299097061157227, + "step": 1131 + }, + { + "epoch": 0.28324784186162894, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11877948.0, + "logits/rejected": -52788173.71428572, + "logps/chosen": -280.8844482421875, + "logps/rejected": -757.8016880580357, + "loss": 0.0684, + "rewards/chosen": 6.709333801269532, + "rewards/margins": 18.47840292794364, + "rewards/rejected": -11.769069126674108, + "step": 1132 + }, + { + "epoch": 0.2834980608032028, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24925646.222222224, + "logits/rejected": -51881169.06666667, + "logps/chosen": -198.14195421006946, + "logps/rejected": -663.5908854166667, + "loss": 0.0815, + "rewards/chosen": 4.016237894694011, + "rewards/margins": 16.9124506632487, + "rewards/rejected": -12.896212768554687, + "step": 1133 + }, + { + "epoch": 0.2837482797447767, + "grad_norm": 11.3125, + "kl": 1.8512611389160156, + "learning_rate": 5e-06, + "logits/chosen": -41117060.266666666, + "logits/rejected": -31486176.0, + "logps/chosen": -387.18040364583334, + "logps/rejected": -724.8056640625, + "loss": 0.0513, + "rewards/chosen": 7.161700439453125, + "rewards/margins": 18.074291314019096, + "rewards/rejected": -10.912590874565971, + "step": 1134 + }, + { + "epoch": 0.2839984986863506, + "grad_norm": 19.125, + "kl": 2.806396484375, + "learning_rate": 5e-06, + "logits/chosen": -72035584.0, + "logits/rejected": -39650119.11111111, + "logps/chosen": -373.69049479166665, + "logps/rejected": -530.0896267361111, + "loss": 0.095, + "rewards/chosen": 6.401496887207031, + "rewards/margins": 16.91127438015408, + "rewards/rejected": -10.509777492947048, + "step": 1135 + }, + { + "epoch": 0.2842487176279244, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52554972.44444445, + "logits/rejected": -63235443.2, + "logps/chosen": -467.20838758680554, + "logps/rejected": -588.9477864583333, + "loss": 0.0425, + "rewards/chosen": 8.019686381022135, + "rewards/margins": 17.684647115071613, + "rewards/rejected": -9.66496073404948, + "step": 1136 + }, + { + "epoch": 0.2844989365694983, + "grad_norm": 4.0625, + "kl": 14.149798393249512, + "learning_rate": 5e-06, + "logits/chosen": -75866186.66666667, + "logits/rejected": -21808450.666666668, + "logps/chosen": -492.2015380859375, + "logps/rejected": -441.0258382161458, + "loss": 0.0502, + "rewards/chosen": 7.840506235758464, + "rewards/margins": 17.053593317667644, + "rewards/rejected": -9.21308708190918, + "step": 1137 + }, + { + "epoch": 0.28474915551107216, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16061280.0, + "logits/rejected": -57771382.85714286, + "logps/chosen": -292.27177734375, + "logps/rejected": -742.0191127232143, + "loss": 0.0383, + "rewards/chosen": 5.383618927001953, + "rewards/margins": 16.250819505964007, + "rewards/rejected": -10.867200578962054, + "step": 1138 + }, + { + "epoch": 0.28499937445264606, + "grad_norm": 10.0, + "kl": 4.595436096191406, + "learning_rate": 5e-06, + "logits/chosen": -44831984.0, + "logits/rejected": -38189504.0, + "logps/chosen": -460.630908203125, + "logps/rejected": -546.166015625, + "loss": 0.043, + "rewards/chosen": 7.146396636962891, + "rewards/margins": 18.307462964739116, + "rewards/rejected": -11.161066327776227, + "step": 1139 + }, + { + "epoch": 0.28524959339421996, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52034154.666666664, + "logits/rejected": -58314090.666666664, + "logps/chosen": -493.29443359375, + "logps/rejected": -709.8943684895834, + "loss": 0.0318, + "rewards/chosen": 7.900133768717448, + "rewards/margins": 19.16943868001302, + "rewards/rejected": -11.269304911295572, + "step": 1140 + }, + { + "epoch": 0.2854998123357938, + "grad_norm": 22.375, + "kl": 1.6676957607269287, + "learning_rate": 5e-06, + "logits/chosen": 14674387.2, + "logits/rejected": -42041792.0, + "logps/chosen": -507.386865234375, + "logps/rejected": -522.7941545758929, + "loss": 0.047, + "rewards/chosen": 8.240525817871093, + "rewards/margins": 18.35200980050223, + "rewards/rejected": -10.111483982631139, + "step": 1141 + }, + { + "epoch": 0.2857500312773677, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58925924.571428575, + "logits/rejected": -47231235.76470588, + "logps/chosen": -377.67654854910717, + "logps/rejected": -561.3901654411765, + "loss": 0.0308, + "rewards/chosen": 6.969936915806362, + "rewards/margins": 17.053445575617943, + "rewards/rejected": -10.083508659811582, + "step": 1142 + }, + { + "epoch": 0.2860002502189416, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67890048.0, + "logits/rejected": -37485654.15384615, + "logps/chosen": -460.69588955965907, + "logps/rejected": -652.5498798076923, + "loss": 0.0125, + "rewards/chosen": 8.55354863947088, + "rewards/margins": 19.06226375553158, + "rewards/rejected": -10.508715116060698, + "step": 1143 + }, + { + "epoch": 0.28625046916051544, + "grad_norm": 8.1875, + "kl": 0.6028093099594116, + "learning_rate": 5e-06, + "logits/chosen": -51619936.0, + "logits/rejected": -47086843.428571425, + "logps/chosen": -472.382421875, + "logps/rejected": -444.4257114955357, + "loss": 0.0216, + "rewards/chosen": 7.286181640625, + "rewards/margins": 16.01197466169085, + "rewards/rejected": -8.725793021065849, + "step": 1144 + }, + { + "epoch": 0.28650068810208934, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77244224.0, + "logits/rejected": -71108745.14285715, + "logps/chosen": -501.0978515625, + "logps/rejected": -574.0147879464286, + "loss": 0.0769, + "rewards/chosen": 7.306327819824219, + "rewards/margins": 18.51054164341518, + "rewards/rejected": -11.20421382359096, + "step": 1145 + }, + { + "epoch": 0.2867509070436632, + "grad_norm": 23.375, + "kl": 3.8275184631347656, + "learning_rate": 5e-06, + "logits/chosen": -48584009.14285714, + "logits/rejected": -41102422.4, + "logps/chosen": -307.35843331473217, + "logps/rejected": -454.167822265625, + "loss": 0.1518, + "rewards/chosen": 4.919449397495815, + "rewards/margins": 14.319527980259487, + "rewards/rejected": -9.400078582763673, + "step": 1146 + }, + { + "epoch": 0.2870011259852371, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33572341.333333336, + "logits/rejected": -56828341.333333336, + "logps/chosen": -252.9339599609375, + "logps/rejected": -638.4321695963541, + "loss": 0.0473, + "rewards/chosen": 5.4500579833984375, + "rewards/margins": 17.980410257975258, + "rewards/rejected": -12.530352274576822, + "step": 1147 + }, + { + "epoch": 0.287251344926811, + "grad_norm": 4.34375, + "kl": 6.115841865539551, + "learning_rate": 5e-06, + "logits/chosen": -46525792.0, + "logits/rejected": -55102822.4, + "logps/chosen": -290.5564662388393, + "logps/rejected": -481.272119140625, + "loss": 0.0699, + "rewards/chosen": 5.91736820765904, + "rewards/margins": 16.89659641810826, + "rewards/rejected": -10.979228210449218, + "step": 1148 + }, + { + "epoch": 0.2875015638683848, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -86422912.0, + "logits/rejected": -29018448.0, + "logps/chosen": -457.31138392857144, + "logps/rejected": -504.507421875, + "loss": 0.0263, + "rewards/chosen": 7.788644518171038, + "rewards/margins": 18.48494197300502, + "rewards/rejected": -10.696297454833985, + "step": 1149 + }, + { + "epoch": 0.2877517828099587, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35256182.15384615, + "logits/rejected": -47389853.09090909, + "logps/chosen": -410.7327223557692, + "logps/rejected": -410.20725319602275, + "loss": 0.0304, + "rewards/chosen": 6.891871525691106, + "rewards/margins": 14.435614312445367, + "rewards/rejected": -7.543742786754262, + "step": 1150 + }, + { + "epoch": 0.28800200175153257, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42627355.428571425, + "logits/rejected": -40488342.4, + "logps/chosen": -534.7102399553571, + "logps/rejected": -459.3259765625, + "loss": 0.0259, + "rewards/chosen": 8.4393310546875, + "rewards/margins": 18.26767578125, + "rewards/rejected": -9.8283447265625, + "step": 1151 + }, + { + "epoch": 0.28825222069310646, + "grad_norm": 11.125, + "kl": 1.3041725158691406, + "learning_rate": 5e-06, + "logits/chosen": -60510795.63636363, + "logits/rejected": 453984.0, + "logps/chosen": -397.06010298295456, + "logps/rejected": -455.00439453125, + "loss": 0.0311, + "rewards/chosen": 6.105116410688921, + "rewards/margins": 15.377516313032672, + "rewards/rejected": -9.27239990234375, + "step": 1152 + }, + { + "epoch": 0.28850243963468036, + "grad_norm": 14.5625, + "kl": 5.840234279632568, + "learning_rate": 5e-06, + "logits/chosen": -65389233.23076923, + "logits/rejected": -71592244.36363636, + "logps/chosen": -345.34337439903845, + "logps/rejected": -582.2925248579545, + "loss": 0.0411, + "rewards/chosen": 7.0215301513671875, + "rewards/margins": 20.155201305042613, + "rewards/rejected": -13.133671153675426, + "step": 1153 + }, + { + "epoch": 0.2887526585762542, + "grad_norm": 25.0, + "kl": 11.057903289794922, + "learning_rate": 5e-06, + "logits/chosen": -55171406.222222224, + "logits/rejected": -102561578.66666667, + "logps/chosen": -419.52362738715277, + "logps/rejected": -601.3760579427084, + "loss": 0.0741, + "rewards/chosen": 6.599257998996311, + "rewards/margins": 19.258938683403862, + "rewards/rejected": -12.659680684407553, + "step": 1154 + }, + { + "epoch": 0.2890028775178281, + "grad_norm": 11.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43943498.666666664, + "logits/rejected": -38566464.0, + "logps/chosen": -313.95220947265625, + "logps/rejected": -437.9047037760417, + "loss": 0.0373, + "rewards/chosen": 5.543373107910156, + "rewards/margins": 14.59536043802897, + "rewards/rejected": -9.051987330118815, + "step": 1155 + }, + { + "epoch": 0.289253096459402, + "grad_norm": 8.1875, + "kl": 1.2169806957244873, + "learning_rate": 5e-06, + "logits/chosen": -32880795.076923076, + "logits/rejected": -27842656.0, + "logps/chosen": -305.5837965745192, + "logps/rejected": -474.1767578125, + "loss": 0.0405, + "rewards/chosen": 6.181807884803185, + "rewards/margins": 13.862743831181026, + "rewards/rejected": -7.680935946377841, + "step": 1156 + }, + { + "epoch": 0.28950331540097585, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76837461.33333333, + "logits/rejected": -55483170.13333333, + "logps/chosen": -304.2406412760417, + "logps/rejected": -763.5622395833333, + "loss": 0.0402, + "rewards/chosen": 5.299874623616536, + "rewards/margins": 19.165586090087892, + "rewards/rejected": -13.865711466471355, + "step": 1157 + }, + { + "epoch": 0.28975353434254975, + "grad_norm": 7.71875, + "kl": 11.241491317749023, + "learning_rate": 5e-06, + "logits/chosen": -53666258.28571428, + "logits/rejected": -55073804.8, + "logps/chosen": -441.96578543526783, + "logps/rejected": -634.8125, + "loss": 0.0181, + "rewards/chosen": 7.33161871773856, + "rewards/margins": 18.380584171840123, + "rewards/rejected": -11.048965454101562, + "step": 1158 + }, + { + "epoch": 0.2900037532841236, + "grad_norm": 8.625, + "kl": 8.927696228027344, + "learning_rate": 5e-06, + "logits/chosen": 3694013.3333333335, + "logits/rejected": -76322346.66666667, + "logps/chosen": -448.2857666015625, + "logps/rejected": -624.241943359375, + "loss": 0.0476, + "rewards/chosen": 7.467383702596028, + "rewards/margins": 17.04286066691081, + "rewards/rejected": -9.57547696431478, + "step": 1159 + }, + { + "epoch": 0.2902539722256975, + "grad_norm": 20.25, + "kl": 5.393974304199219, + "learning_rate": 5e-06, + "logits/chosen": -43851430.4, + "logits/rejected": -55026546.28571428, + "logps/chosen": -394.58173828125, + "logps/rejected": -543.3825334821429, + "loss": 0.0729, + "rewards/chosen": 8.295943450927734, + "rewards/margins": 16.72632250104632, + "rewards/rejected": -8.430379050118583, + "step": 1160 + }, + { + "epoch": 0.2905041911672714, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27270582.85714286, + "logits/rejected": -74103878.4, + "logps/chosen": -331.34287806919644, + "logps/rejected": -499.31845703125, + "loss": 0.0486, + "rewards/chosen": 6.761083330426898, + "rewards/margins": 16.667295183454243, + "rewards/rejected": -9.906211853027344, + "step": 1161 + }, + { + "epoch": 0.29075441010884523, + "grad_norm": 7.0625, + "kl": 11.803191184997559, + "learning_rate": 5e-06, + "logits/chosen": -48919748.266666666, + "logits/rejected": -11062577.777777778, + "logps/chosen": -406.24427083333336, + "logps/rejected": -430.6764322916667, + "loss": 0.0281, + "rewards/chosen": 7.299162801106771, + "rewards/margins": 15.235323418511285, + "rewards/rejected": -7.936160617404514, + "step": 1162 + }, + { + "epoch": 0.29100462905041913, + "grad_norm": 2.359375, + "kl": 5.95670747756958, + "learning_rate": 5e-06, + "logits/chosen": -47983514.18181818, + "logits/rejected": -58994697.84615385, + "logps/chosen": -440.89888139204544, + "logps/rejected": -572.2101111778846, + "loss": 0.0056, + "rewards/chosen": 8.965993707830256, + "rewards/margins": 18.987236289711266, + "rewards/rejected": -10.02124258188101, + "step": 1163 + }, + { + "epoch": 0.29125484799199297, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51669549.71428572, + "logits/rejected": -54192704.0, + "logps/chosen": -332.2840053013393, + "logps/rejected": -748.29404296875, + "loss": 0.047, + "rewards/chosen": 7.1476576668875555, + "rewards/margins": 23.29574268886021, + "rewards/rejected": -16.148085021972655, + "step": 1164 + }, + { + "epoch": 0.29150506693356687, + "grad_norm": 13.875, + "kl": 7.269364833831787, + "learning_rate": 5e-06, + "logits/chosen": -39548300.8, + "logits/rejected": -59502286.222222224, + "logps/chosen": -378.7070638020833, + "logps/rejected": -472.37651909722223, + "loss": 0.1074, + "rewards/chosen": 6.256101481119791, + "rewards/margins": 14.99893069797092, + "rewards/rejected": -8.742829216851128, + "step": 1165 + }, + { + "epoch": 0.29175528587514077, + "grad_norm": 11.875, + "kl": 10.952193260192871, + "learning_rate": 5e-06, + "logits/chosen": -42194747.07692308, + "logits/rejected": -52604928.0, + "logps/chosen": -386.5329777644231, + "logps/rejected": -546.6471058238636, + "loss": 0.0558, + "rewards/chosen": 8.422494741586538, + "rewards/margins": 16.069486818113525, + "rewards/rejected": -7.646992076526988, + "step": 1166 + }, + { + "epoch": 0.2920055048167146, + "grad_norm": 9.125, + "kl": 10.713637351989746, + "learning_rate": 5e-06, + "logits/chosen": -35048576.0, + "logits/rejected": -59720981.333333336, + "logps/chosen": -449.3466796875, + "logps/rejected": -522.2677951388889, + "loss": 0.0232, + "rewards/chosen": 8.380671183268229, + "rewards/margins": 17.041798909505207, + "rewards/rejected": -8.661127726236979, + "step": 1167 + }, + { + "epoch": 0.2922557237582885, + "grad_norm": 8.0, + "kl": 0.8730294704437256, + "learning_rate": 5e-06, + "logits/chosen": -32253632.0, + "logits/rejected": -43578926.54545455, + "logps/chosen": -340.8405198317308, + "logps/rejected": -659.3392666903409, + "loss": 0.074, + "rewards/chosen": 5.842277526855469, + "rewards/margins": 15.458738153631037, + "rewards/rejected": -9.616460626775568, + "step": 1168 + }, + { + "epoch": 0.29250594269986235, + "grad_norm": 11.1875, + "kl": 6.464358329772949, + "learning_rate": 5e-06, + "logits/chosen": -18985476.0, + "logits/rejected": -61411680.0, + "logps/chosen": -442.2316080729167, + "logps/rejected": -709.6593424479166, + "loss": 0.046, + "rewards/chosen": 6.438519795735677, + "rewards/margins": 17.92563756306966, + "rewards/rejected": -11.487117767333984, + "step": 1169 + }, + { + "epoch": 0.29275616164143625, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69223529.14285715, + "logits/rejected": -55853152.0, + "logps/chosen": -409.9129115513393, + "logps/rejected": -782.859033203125, + "loss": 0.086, + "rewards/chosen": 7.345761980329241, + "rewards/margins": 21.801278032575333, + "rewards/rejected": -14.455516052246093, + "step": 1170 + }, + { + "epoch": 0.29300638058301015, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40741317.81818182, + "logits/rejected": -50016777.84615385, + "logps/chosen": -326.20478959517044, + "logps/rejected": -678.9094050480769, + "loss": 0.049, + "rewards/chosen": 6.4317543723366475, + "rewards/margins": 14.607239836579435, + "rewards/rejected": -8.175485464242788, + "step": 1171 + }, + { + "epoch": 0.293256599524584, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43050307.2, + "logits/rejected": -50048731.428571425, + "logps/chosen": -315.14072265625, + "logps/rejected": -391.5376674107143, + "loss": 0.0469, + "rewards/chosen": 5.647267913818359, + "rewards/margins": 13.675783320835658, + "rewards/rejected": -8.028515407017299, + "step": 1172 + }, + { + "epoch": 0.2935068184661579, + "grad_norm": 25.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73290880.0, + "logits/rejected": -41485750.85714286, + "logps/chosen": -482.339599609375, + "logps/rejected": -607.3962053571429, + "loss": 0.0504, + "rewards/chosen": 8.916798400878907, + "rewards/margins": 17.531564113071987, + "rewards/rejected": -8.61476571219308, + "step": 1173 + }, + { + "epoch": 0.2937570374077318, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33187360.0, + "logits/rejected": -47261147.428571425, + "logps/chosen": -487.595263671875, + "logps/rejected": -420.5537109375, + "loss": 0.0362, + "rewards/chosen": 8.267884826660156, + "rewards/margins": 16.589737919398715, + "rewards/rejected": -8.32185309273856, + "step": 1174 + }, + { + "epoch": 0.29400725634930563, + "grad_norm": 8.0625, + "kl": 4.510001182556152, + "learning_rate": 5e-06, + "logits/chosen": -51537077.333333336, + "logits/rejected": -5749093.333333333, + "logps/chosen": -492.6954345703125, + "logps/rejected": -454.897705078125, + "loss": 0.0489, + "rewards/chosen": 7.210700988769531, + "rewards/margins": 15.760632832845053, + "rewards/rejected": -8.549931844075521, + "step": 1175 + }, + { + "epoch": 0.29425747529087953, + "grad_norm": 11.3125, + "kl": 9.470272064208984, + "learning_rate": 5e-06, + "logits/chosen": -43001432.615384616, + "logits/rejected": -45236808.72727273, + "logps/chosen": -346.83559945913464, + "logps/rejected": -406.42356178977275, + "loss": 0.0954, + "rewards/chosen": 5.944509652944712, + "rewards/margins": 12.52782818987653, + "rewards/rejected": -6.583318536931818, + "step": 1176 + }, + { + "epoch": 0.2945076942324534, + "grad_norm": 14.1875, + "kl": 2.1260085105895996, + "learning_rate": 5e-06, + "logits/chosen": -56938752.0, + "logits/rejected": -35727936.0, + "logps/chosen": -394.81507161458336, + "logps/rejected": -559.5191514756945, + "loss": 0.0552, + "rewards/chosen": 6.616814676920573, + "rewards/margins": 16.59097985161675, + "rewards/rejected": -9.97416517469618, + "step": 1177 + }, + { + "epoch": 0.2947579131740273, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35963780.571428575, + "logits/rejected": -40654284.8, + "logps/chosen": -367.4828404017857, + "logps/rejected": -555.4779296875, + "loss": 0.0538, + "rewards/chosen": 6.843018123081753, + "rewards/margins": 18.400052424839565, + "rewards/rejected": -11.557034301757813, + "step": 1178 + }, + { + "epoch": 0.2950081321156012, + "grad_norm": 4.65625, + "kl": 0.869391143321991, + "learning_rate": 5e-06, + "logits/chosen": -64680185.6, + "logits/rejected": -46269709.71428572, + "logps/chosen": -296.861865234375, + "logps/rejected": -482.86293247767856, + "loss": 0.0642, + "rewards/chosen": 6.163364028930664, + "rewards/margins": 16.856327765328544, + "rewards/rejected": -10.69296373639788, + "step": 1179 + }, + { + "epoch": 0.295258351057175, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52071910.4, + "logits/rejected": -62973454.222222224, + "logps/chosen": -400.16155598958335, + "logps/rejected": -759.7228732638889, + "loss": 0.0406, + "rewards/chosen": 6.897824605305989, + "rewards/margins": 20.841357760959202, + "rewards/rejected": -13.943533155653212, + "step": 1180 + }, + { + "epoch": 0.2955085699987489, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35593959.384615384, + "logits/rejected": -50709364.36363637, + "logps/chosen": -345.9950420673077, + "logps/rejected": -656.3503639914773, + "loss": 0.0783, + "rewards/chosen": 5.690041175255408, + "rewards/margins": 18.259755808156687, + "rewards/rejected": -12.56971463290128, + "step": 1181 + }, + { + "epoch": 0.29575878894032276, + "grad_norm": 10.375, + "kl": 9.542585372924805, + "learning_rate": 5e-06, + "logits/chosen": -49999276.8, + "logits/rejected": -56117622.85714286, + "logps/chosen": -392.2707763671875, + "logps/rejected": -645.2907366071429, + "loss": 0.0812, + "rewards/chosen": 7.23094482421875, + "rewards/margins": 17.091972133091517, + "rewards/rejected": -9.861027308872767, + "step": 1182 + }, + { + "epoch": 0.29600900788189666, + "grad_norm": 7.53125, + "kl": 1.3462190628051758, + "learning_rate": 5e-06, + "logits/chosen": -73396597.33333333, + "logits/rejected": -58261290.666666664, + "logps/chosen": -489.7920328776042, + "logps/rejected": -587.763427734375, + "loss": 0.0643, + "rewards/chosen": 7.031253814697266, + "rewards/margins": 16.520645141601562, + "rewards/rejected": -9.489391326904297, + "step": 1183 + }, + { + "epoch": 0.29625922682347056, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44937866.666666664, + "logits/rejected": -57909968.0, + "logps/chosen": -295.5955810546875, + "logps/rejected": -518.7527262369791, + "loss": 0.0848, + "rewards/chosen": 4.8132584889729815, + "rewards/margins": 15.341567993164062, + "rewards/rejected": -10.52830950419108, + "step": 1184 + }, + { + "epoch": 0.2965094457650444, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66403982.222222224, + "logits/rejected": -68882841.6, + "logps/chosen": -338.1589084201389, + "logps/rejected": -577.3327473958333, + "loss": 0.0652, + "rewards/chosen": 4.731848822699653, + "rewards/margins": 17.06589830186632, + "rewards/rejected": -12.334049479166667, + "step": 1185 + }, + { + "epoch": 0.2967596647066183, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59508433.45454545, + "logits/rejected": -60182759.384615384, + "logps/chosen": -320.5403497869318, + "logps/rejected": -695.4613131009615, + "loss": 0.0464, + "rewards/chosen": 5.647186972878196, + "rewards/margins": 18.00622627951882, + "rewards/rejected": -12.359039306640625, + "step": 1186 + }, + { + "epoch": 0.29700988364819214, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63789216.0, + "logits/rejected": -67286698.66666667, + "logps/chosen": -243.3569539388021, + "logps/rejected": -369.5504150390625, + "loss": 0.11, + "rewards/chosen": 5.065320650736491, + "rewards/margins": 12.966883023579916, + "rewards/rejected": -7.901562372843425, + "step": 1187 + }, + { + "epoch": 0.29726010258976604, + "grad_norm": 19.375, + "kl": 3.0311450958251953, + "learning_rate": 5e-06, + "logits/chosen": -52727392.0, + "logits/rejected": -45968368.0, + "logps/chosen": -317.70501708984375, + "logps/rejected": -490.6730651855469, + "loss": 0.1226, + "rewards/chosen": 4.367021560668945, + "rewards/margins": 14.081737518310547, + "rewards/rejected": -9.714715957641602, + "step": 1188 + }, + { + "epoch": 0.29751032153133994, + "grad_norm": 9.0625, + "kl": 0.4523735046386719, + "learning_rate": 5e-06, + "logits/chosen": -80183897.6, + "logits/rejected": -49963616.0, + "logps/chosen": -404.010986328125, + "logps/rejected": -511.8562709263393, + "loss": 0.0162, + "rewards/chosen": 7.27861328125, + "rewards/margins": 18.591876220703124, + "rewards/rejected": -11.313262939453125, + "step": 1189 + }, + { + "epoch": 0.2977605404729138, + "grad_norm": 2.5625, + "kl": 11.088561058044434, + "learning_rate": 5e-06, + "logits/chosen": -60941792.0, + "logits/rejected": -66404777.14285714, + "logps/chosen": -494.26298828125, + "logps/rejected": -569.3899274553571, + "loss": 0.0119, + "rewards/chosen": 8.699906921386718, + "rewards/margins": 21.065955461774553, + "rewards/rejected": -12.366048540387835, + "step": 1190 + }, + { + "epoch": 0.2980107594144877, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30597309.53846154, + "logits/rejected": -50105530.18181818, + "logps/chosen": -375.7626953125, + "logps/rejected": -610.1363192471591, + "loss": 0.027, + "rewards/chosen": 6.560080895057092, + "rewards/margins": 18.856131920447716, + "rewards/rejected": -12.296051025390625, + "step": 1191 + }, + { + "epoch": 0.2982609783560616, + "grad_norm": 1.9609375, + "kl": 0.9644635915756226, + "learning_rate": 5e-06, + "logits/chosen": -65544477.538461536, + "logits/rejected": -58500642.90909091, + "logps/chosen": -427.7375676081731, + "logps/rejected": -650.7820046164773, + "loss": 0.0305, + "rewards/chosen": 6.857108482947717, + "rewards/margins": 18.837696528934934, + "rewards/rejected": -11.980588045987217, + "step": 1192 + }, + { + "epoch": 0.2985111972976354, + "grad_norm": 29.75, + "kl": 10.394487380981445, + "learning_rate": 5e-06, + "logits/chosen": -26408361.14285714, + "logits/rejected": -64922188.8, + "logps/chosen": -380.1547154017857, + "logps/rejected": -435.6400390625, + "loss": 0.1729, + "rewards/chosen": 4.728291102818081, + "rewards/margins": 12.919478389195035, + "rewards/rejected": -8.191187286376953, + "step": 1193 + }, + { + "epoch": 0.2987614162392093, + "grad_norm": 6.75, + "kl": 2.676608085632324, + "learning_rate": 5e-06, + "logits/chosen": -51292818.28571428, + "logits/rejected": -45942102.4, + "logps/chosen": -453.580810546875, + "logps/rejected": -500.6263671875, + "loss": 0.0254, + "rewards/chosen": 7.770008632114956, + "rewards/margins": 19.930053492954798, + "rewards/rejected": -12.160044860839843, + "step": 1194 + }, + { + "epoch": 0.29901163518078316, + "grad_norm": 6.34375, + "kl": 10.00429916381836, + "learning_rate": 5e-06, + "logits/chosen": -49094005.333333336, + "logits/rejected": -52719712.0, + "logps/chosen": -404.1981201171875, + "logps/rejected": -633.2005615234375, + "loss": 0.0266, + "rewards/chosen": 7.785235087076823, + "rewards/margins": 20.702952067057293, + "rewards/rejected": -12.917716979980469, + "step": 1195 + }, + { + "epoch": 0.29926185412235706, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32333948.444444444, + "logits/rejected": -54345403.733333334, + "logps/chosen": -315.71869574652777, + "logps/rejected": -579.2310546875, + "loss": 0.0464, + "rewards/chosen": 5.535135904947917, + "rewards/margins": 15.939619954427084, + "rewards/rejected": -10.404484049479167, + "step": 1196 + }, + { + "epoch": 0.29951207306393096, + "grad_norm": 6.59375, + "kl": 2.8395156860351562, + "learning_rate": 5e-06, + "logits/chosen": -43056997.333333336, + "logits/rejected": -29393584.0, + "logps/chosen": -365.9730631510417, + "logps/rejected": -471.6285400390625, + "loss": 0.0421, + "rewards/chosen": 6.637304306030273, + "rewards/margins": 15.41088612874349, + "rewards/rejected": -8.773581822713217, + "step": 1197 + }, + { + "epoch": 0.2997622920055048, + "grad_norm": 4.8125, + "kl": 4.63289213180542, + "learning_rate": 5e-06, + "logits/chosen": -38853734.4, + "logits/rejected": -30732472.888888888, + "logps/chosen": -456.22470703125, + "logps/rejected": -479.3478732638889, + "loss": 0.0166, + "rewards/chosen": 7.875729370117187, + "rewards/margins": 15.737787373860677, + "rewards/rejected": -7.862058003743489, + "step": 1198 + }, + { + "epoch": 0.3000125109470787, + "grad_norm": 10.0625, + "kl": 11.848552703857422, + "learning_rate": 5e-06, + "logits/chosen": -64484292.571428575, + "logits/rejected": -58322188.8, + "logps/chosen": -385.57470703125, + "logps/rejected": -657.2455078125, + "loss": 0.0863, + "rewards/chosen": 6.681431361607143, + "rewards/margins": 17.403420802525112, + "rewards/rejected": -10.72198944091797, + "step": 1199 + }, + { + "epoch": 0.30026272988865255, + "grad_norm": 2.734375, + "kl": 4.273982048034668, + "learning_rate": 5e-06, + "logits/chosen": -84237008.0, + "logits/rejected": -52003640.0, + "logps/chosen": -547.9490356445312, + "logps/rejected": -717.3812866210938, + "loss": 0.0038, + "rewards/chosen": 10.082688331604004, + "rewards/margins": 22.521946907043457, + "rewards/rejected": -12.439258575439453, + "step": 1200 + }, + { + "epoch": 0.30051294883022645, + "grad_norm": 5.8125, + "kl": 2.1367499828338623, + "learning_rate": 5e-06, + "logits/chosen": -47991885.71428572, + "logits/rejected": -37787616.0, + "logps/chosen": -421.96212332589283, + "logps/rejected": -646.25380859375, + "loss": 0.062, + "rewards/chosen": 7.607967921665737, + "rewards/margins": 18.67392098563058, + "rewards/rejected": -11.065953063964844, + "step": 1201 + }, + { + "epoch": 0.30076316777180034, + "grad_norm": 19.875, + "kl": 20.412845611572266, + "learning_rate": 5e-06, + "logits/chosen": -57176690.28571428, + "logits/rejected": -41773651.2, + "logps/chosen": -496.66183035714283, + "logps/rejected": -485.713720703125, + "loss": 0.0505, + "rewards/chosen": 8.625023978097099, + "rewards/margins": 16.44388918195452, + "rewards/rejected": -7.818865203857422, + "step": 1202 + }, + { + "epoch": 0.3010133867133742, + "grad_norm": 6.09375, + "kl": 7.945102691650391, + "learning_rate": 5e-06, + "logits/chosen": -41518966.15384615, + "logits/rejected": -54993536.0, + "logps/chosen": -436.09761868990387, + "logps/rejected": -662.8452592329545, + "loss": 0.0415, + "rewards/chosen": 7.948289724496695, + "rewards/margins": 18.823940303775814, + "rewards/rejected": -10.87565057927912, + "step": 1203 + }, + { + "epoch": 0.3012636056549481, + "grad_norm": 13.1875, + "kl": 12.051488876342773, + "learning_rate": 5e-06, + "logits/chosen": -47792244.0, + "logits/rejected": -47254648.0, + "logps/chosen": -327.1128234863281, + "logps/rejected": -590.921142578125, + "loss": 0.0755, + "rewards/chosen": 6.375910758972168, + "rewards/margins": 17.78371524810791, + "rewards/rejected": -11.407804489135742, + "step": 1204 + }, + { + "epoch": 0.301513824596522, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43345376.0, + "logits/rejected": -26328621.714285713, + "logps/chosen": -332.8568115234375, + "logps/rejected": -507.40869140625, + "loss": 0.0445, + "rewards/chosen": 5.435160827636719, + "rewards/margins": 13.513526698521204, + "rewards/rejected": -8.078365870884486, + "step": 1205 + }, + { + "epoch": 0.3017640435380958, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44591864.88888889, + "logits/rejected": -82771959.46666667, + "logps/chosen": -434.6588541666667, + "logps/rejected": -862.0003255208334, + "loss": 0.0452, + "rewards/chosen": 8.864386664496529, + "rewards/margins": 22.953476630316842, + "rewards/rejected": -14.089089965820312, + "step": 1206 + }, + { + "epoch": 0.3020142624796697, + "grad_norm": 7.90625, + "kl": 5.905923366546631, + "learning_rate": 5e-06, + "logits/chosen": -46774262.15384615, + "logits/rejected": -14435822.545454545, + "logps/chosen": -339.00255408653845, + "logps/rejected": -408.1629083806818, + "loss": 0.0729, + "rewards/chosen": 6.62992448073167, + "rewards/margins": 13.8753461504316, + "rewards/rejected": -7.245421669699929, + "step": 1207 + }, + { + "epoch": 0.30226448142124357, + "grad_norm": 4.125, + "kl": 5.0011420249938965, + "learning_rate": 5e-06, + "logits/chosen": -35347517.71428572, + "logits/rejected": -54376166.4, + "logps/chosen": -444.19088309151783, + "logps/rejected": -581.9115234375, + "loss": 0.0375, + "rewards/chosen": 7.872530800955636, + "rewards/margins": 17.909774453299388, + "rewards/rejected": -10.03724365234375, + "step": 1208 + }, + { + "epoch": 0.30251470036281747, + "grad_norm": 15.0, + "kl": 11.214672088623047, + "learning_rate": 5e-06, + "logits/chosen": -54350061.176470585, + "logits/rejected": -40607730.28571428, + "logps/chosen": -414.0768037683824, + "logps/rejected": -440.0709751674107, + "loss": 0.0779, + "rewards/chosen": 7.746686150045956, + "rewards/margins": 15.269918201350364, + "rewards/rejected": -7.523232051304409, + "step": 1209 + }, + { + "epoch": 0.30276491930439137, + "grad_norm": 8.5, + "kl": 14.319580078125, + "learning_rate": 5e-06, + "logits/chosen": -76032904.53333333, + "logits/rejected": -63509973.333333336, + "logps/chosen": -416.6420572916667, + "logps/rejected": -630.6883680555555, + "loss": 0.0843, + "rewards/chosen": 8.988008626302083, + "rewards/margins": 19.222793070475262, + "rewards/rejected": -10.234784444173178, + "step": 1210 + }, + { + "epoch": 0.3030151382459652, + "grad_norm": 13.8125, + "kl": 4.323009490966797, + "learning_rate": 5e-06, + "logits/chosen": -40642581.333333336, + "logits/rejected": -53383825.06666667, + "logps/chosen": -470.5436197916667, + "logps/rejected": -466.6597005208333, + "loss": 0.1265, + "rewards/chosen": 7.391743977864583, + "rewards/margins": 16.222974650065105, + "rewards/rejected": -8.83123067220052, + "step": 1211 + }, + { + "epoch": 0.3032653571875391, + "grad_norm": 10.0, + "kl": 15.987177848815918, + "learning_rate": 5e-06, + "logits/chosen": -86152557.71428572, + "logits/rejected": -55593881.6, + "logps/chosen": -579.7297712053571, + "logps/rejected": -559.9326171875, + "loss": 0.0304, + "rewards/chosen": 9.981085641043526, + "rewards/margins": 18.77410627092634, + "rewards/rejected": -8.793020629882813, + "step": 1212 + }, + { + "epoch": 0.30351557612911295, + "grad_norm": 8.5625, + "kl": 0.21150970458984375, + "learning_rate": 5e-06, + "logits/chosen": -19375637.333333332, + "logits/rejected": -34573491.2, + "logps/chosen": -270.0732150607639, + "logps/rejected": -572.109765625, + "loss": 0.0528, + "rewards/chosen": 6.671888139512804, + "rewards/margins": 16.843167029486764, + "rewards/rejected": -10.171278889973959, + "step": 1213 + }, + { + "epoch": 0.30376579507068685, + "grad_norm": 26.25, + "kl": 16.300756454467773, + "learning_rate": 5e-06, + "logits/chosen": -23889493.333333332, + "logits/rejected": -58384629.333333336, + "logps/chosen": -419.4922688802083, + "logps/rejected": -507.2896321614583, + "loss": 0.1428, + "rewards/chosen": 7.208911895751953, + "rewards/margins": 16.83298428853353, + "rewards/rejected": -9.624072392781576, + "step": 1214 + }, + { + "epoch": 0.30401601401226075, + "grad_norm": 18.5, + "kl": 8.834426879882812, + "learning_rate": 5e-06, + "logits/chosen": -13152477.538461538, + "logits/rejected": -12759269.818181818, + "logps/chosen": -252.78532527043268, + "logps/rejected": -529.7227450284091, + "loss": 0.0963, + "rewards/chosen": 5.433926508976863, + "rewards/margins": 17.288959689907262, + "rewards/rejected": -11.855033180930398, + "step": 1215 + }, + { + "epoch": 0.3042662329538346, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10205563.636363637, + "logits/rejected": -54210569.84615385, + "logps/chosen": -451.13188032670456, + "logps/rejected": -535.5322641225962, + "loss": 0.0081, + "rewards/chosen": 7.300703568892046, + "rewards/margins": 17.306140392810317, + "rewards/rejected": -10.00543682391827, + "step": 1216 + }, + { + "epoch": 0.3045164518954085, + "grad_norm": 16.875, + "kl": 1.085489273071289, + "learning_rate": 5e-06, + "logits/chosen": -49543040.0, + "logits/rejected": -44048296.72727273, + "logps/chosen": -464.95361328125, + "logps/rejected": -475.2381036931818, + "loss": 0.018, + "rewards/chosen": 8.724955045259916, + "rewards/margins": 17.333094776927176, + "rewards/rejected": -8.608139731667258, + "step": 1217 + }, + { + "epoch": 0.30476667083698233, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38865818.18181818, + "logits/rejected": -57863758.76923077, + "logps/chosen": -417.4181463068182, + "logps/rejected": -690.7618689903846, + "loss": 0.1122, + "rewards/chosen": 5.963247819380327, + "rewards/margins": 16.167141627598475, + "rewards/rejected": -10.20389380821815, + "step": 1218 + }, + { + "epoch": 0.30501688977855623, + "grad_norm": 6.53125, + "kl": 0.8689308166503906, + "learning_rate": 5e-06, + "logits/chosen": -45907918.222222224, + "logits/rejected": -29998685.866666667, + "logps/chosen": -443.6657986111111, + "logps/rejected": -493.95384114583334, + "loss": 0.0222, + "rewards/chosen": 8.326126098632812, + "rewards/margins": 16.025186157226564, + "rewards/rejected": -7.69906005859375, + "step": 1219 + }, + { + "epoch": 0.30526710872013013, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76317580.8, + "logits/rejected": -40659858.28571428, + "logps/chosen": -298.8114013671875, + "logps/rejected": -468.56082589285717, + "loss": 0.0571, + "rewards/chosen": 5.860202407836914, + "rewards/margins": 12.43331162588937, + "rewards/rejected": -6.573109218052456, + "step": 1220 + }, + { + "epoch": 0.305517327661704, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36596786.28571428, + "logits/rejected": -52418819.76470588, + "logps/chosen": -482.5286342075893, + "logps/rejected": -558.5536534926471, + "loss": 0.0037, + "rewards/chosen": 8.219329289027623, + "rewards/margins": 19.81026987668847, + "rewards/rejected": -11.590940587660846, + "step": 1221 + }, + { + "epoch": 0.3057675466032779, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57436570.666666664, + "logits/rejected": -30109176.0, + "logps/chosen": -336.4075113932292, + "logps/rejected": -499.1826985677083, + "loss": 0.0335, + "rewards/chosen": 5.587828318277995, + "rewards/margins": 13.55677096048991, + "rewards/rejected": -7.968942642211914, + "step": 1222 + }, + { + "epoch": 0.30601776554485177, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31468336.0, + "logits/rejected": -34517024.0, + "logps/chosen": -333.1884521484375, + "logps/rejected": -665.4081333705357, + "loss": 0.0488, + "rewards/chosen": 5.672737884521484, + "rewards/margins": 16.932380349295478, + "rewards/rejected": -11.259642464773995, + "step": 1223 + }, + { + "epoch": 0.3062679844864256, + "grad_norm": 16.375, + "kl": 14.873919486999512, + "learning_rate": 5e-06, + "logits/chosen": -65133782.85714286, + "logits/rejected": -83509286.4, + "logps/chosen": -487.99550083705356, + "logps/rejected": -648.62421875, + "loss": 0.0507, + "rewards/chosen": 6.1974896022251675, + "rewards/margins": 17.42685056413923, + "rewards/rejected": -11.229360961914063, + "step": 1224 + }, + { + "epoch": 0.3065182034279995, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61670101.333333336, + "logits/rejected": -48595835.733333334, + "logps/chosen": -364.9957682291667, + "logps/rejected": -590.0022135416667, + "loss": 0.004, + "rewards/chosen": 7.189697265625, + "rewards/margins": 17.719127400716147, + "rewards/rejected": -10.529430135091145, + "step": 1225 + }, + { + "epoch": 0.30676842236957336, + "grad_norm": 7.5625, + "kl": 11.055181503295898, + "learning_rate": 5e-06, + "logits/chosen": -16172059.0, + "logits/rejected": -64466960.0, + "logps/chosen": -669.9346313476562, + "logps/rejected": -700.0983276367188, + "loss": 0.0616, + "rewards/chosen": 9.303380012512207, + "rewards/margins": 23.678828239440918, + "rewards/rejected": -14.375448226928711, + "step": 1226 + }, + { + "epoch": 0.30701864131114726, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14761928.727272727, + "logits/rejected": -9460416.0, + "logps/chosen": -414.7590997869318, + "logps/rejected": -724.708984375, + "loss": 0.0719, + "rewards/chosen": 4.879152471368963, + "rewards/margins": 22.25484029062978, + "rewards/rejected": -17.37568781926082, + "step": 1227 + }, + { + "epoch": 0.30726886025272115, + "grad_norm": 20.25, + "kl": 6.07587194442749, + "learning_rate": 5e-06, + "logits/chosen": -34716292.571428575, + "logits/rejected": -34900057.6, + "logps/chosen": -449.8076171875, + "logps/rejected": -492.19150390625, + "loss": 0.0694, + "rewards/chosen": 7.262049538748605, + "rewards/margins": 14.568482644217355, + "rewards/rejected": -7.30643310546875, + "step": 1228 + }, + { + "epoch": 0.307519079194295, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46084236.8, + "logits/rejected": -53814720.0, + "logps/chosen": -341.7791259765625, + "logps/rejected": -642.7267020089286, + "loss": 0.0381, + "rewards/chosen": 4.39723014831543, + "rewards/margins": 16.99350438799177, + "rewards/rejected": -12.596274239676339, + "step": 1229 + }, + { + "epoch": 0.3077692981358689, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61462215.11111111, + "logits/rejected": -17072117.333333332, + "logps/chosen": -520.7877604166666, + "logps/rejected": -565.2350260416666, + "loss": 0.0088, + "rewards/chosen": 6.520641326904297, + "rewards/margins": 20.729000091552734, + "rewards/rejected": -14.208358764648438, + "step": 1230 + }, + { + "epoch": 0.30801951707744274, + "grad_norm": 11.8125, + "kl": 3.8257193565368652, + "learning_rate": 5e-06, + "logits/chosen": -25244777.14285714, + "logits/rejected": -35328236.8, + "logps/chosen": -392.2129603794643, + "logps/rejected": -542.0587890625, + "loss": 0.0741, + "rewards/chosen": 5.690225873674665, + "rewards/margins": 19.81968754359654, + "rewards/rejected": -14.129461669921875, + "step": 1231 + }, + { + "epoch": 0.30826973601901664, + "grad_norm": 12.5625, + "kl": 0.28105735778808594, + "learning_rate": 5e-06, + "logits/chosen": -53168320.0, + "logits/rejected": -48682928.0, + "logps/chosen": -321.15576171875, + "logps/rejected": -432.2138264973958, + "loss": 0.0512, + "rewards/chosen": 4.481417338053386, + "rewards/margins": 14.419905980428059, + "rewards/rejected": -9.938488642374674, + "step": 1232 + }, + { + "epoch": 0.30851995496059054, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63650205.538461536, + "logits/rejected": -12155309.090909092, + "logps/chosen": -292.4852764423077, + "logps/rejected": -369.77587890625, + "loss": 0.0811, + "rewards/chosen": 4.245010962853065, + "rewards/margins": 13.588428737400296, + "rewards/rejected": -9.34341777454723, + "step": 1233 + }, + { + "epoch": 0.3087701739021644, + "grad_norm": 14.25, + "kl": 10.198458671569824, + "learning_rate": 5e-06, + "logits/chosen": -80799793.23076923, + "logits/rejected": -71043642.18181819, + "logps/chosen": -403.5478515625, + "logps/rejected": -562.3085049715909, + "loss": 0.1501, + "rewards/chosen": 5.712518545297476, + "rewards/margins": 19.58325440733583, + "rewards/rejected": -13.870735862038352, + "step": 1234 + }, + { + "epoch": 0.3090203928437383, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30094779.42857143, + "logits/rejected": -30140227.2, + "logps/chosen": -376.94144112723217, + "logps/rejected": -618.9744140625, + "loss": 0.0606, + "rewards/chosen": 6.012803213936942, + "rewards/margins": 18.650406973702566, + "rewards/rejected": -12.637603759765625, + "step": 1235 + }, + { + "epoch": 0.3092706117853121, + "grad_norm": 14.75, + "kl": 1.3741252422332764, + "learning_rate": 5e-06, + "logits/chosen": -22974729.14285714, + "logits/rejected": -47919587.2, + "logps/chosen": -455.51576450892856, + "logps/rejected": -509.818896484375, + "loss": 0.0753, + "rewards/chosen": 5.769430433000837, + "rewards/margins": 17.118478284563338, + "rewards/rejected": -11.3490478515625, + "step": 1236 + }, + { + "epoch": 0.309520830726886, + "grad_norm": 2.5, + "kl": 1.9214465618133545, + "learning_rate": 5e-06, + "logits/chosen": -50238396.44444445, + "logits/rejected": -22097812.0, + "logps/chosen": -313.0553385416667, + "logps/rejected": -452.2313639322917, + "loss": 0.0106, + "rewards/chosen": 6.583102332221137, + "rewards/margins": 16.858679241604275, + "rewards/rejected": -10.275576909383139, + "step": 1237 + }, + { + "epoch": 0.3097710496684599, + "grad_norm": 7.375, + "kl": 8.763050079345703, + "learning_rate": 5e-06, + "logits/chosen": -67746020.57142857, + "logits/rejected": -79124198.4, + "logps/chosen": -539.0365862165179, + "logps/rejected": -750.8107421875, + "loss": 0.0644, + "rewards/chosen": 8.34955324445452, + "rewards/margins": 25.343193381173272, + "rewards/rejected": -16.99364013671875, + "step": 1238 + }, + { + "epoch": 0.31002126861003376, + "grad_norm": 7.28125, + "kl": 7.915022850036621, + "learning_rate": 5e-06, + "logits/chosen": -56121408.0, + "logits/rejected": -33696928.0, + "logps/chosen": -342.4960611979167, + "logps/rejected": -433.22862413194446, + "loss": 0.089, + "rewards/chosen": 7.548491414388021, + "rewards/margins": 18.4423826429579, + "rewards/rejected": -10.893891228569878, + "step": 1239 + }, + { + "epoch": 0.31027148755160766, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47854330.666666664, + "logits/rejected": -79582746.66666667, + "logps/chosen": -293.01316324869794, + "logps/rejected": -660.5765787760416, + "loss": 0.0559, + "rewards/chosen": 4.899873733520508, + "rewards/margins": 17.41712506612142, + "rewards/rejected": -12.517251332600912, + "step": 1240 + }, + { + "epoch": 0.31052170649318156, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48317344.0, + "logits/rejected": -19875664.0, + "logps/chosen": -366.90098353794644, + "logps/rejected": -399.4595703125, + "loss": 0.0534, + "rewards/chosen": 5.985648018973214, + "rewards/margins": 15.232921273367747, + "rewards/rejected": -9.247273254394532, + "step": 1241 + }, + { + "epoch": 0.3107719254347554, + "grad_norm": 12.6875, + "kl": 6.448036193847656, + "learning_rate": 5e-06, + "logits/chosen": -50070218.666666664, + "logits/rejected": -14155168.0, + "logps/chosen": -435.9420166015625, + "logps/rejected": -522.2616373697916, + "loss": 0.014, + "rewards/chosen": 7.722878774007161, + "rewards/margins": 17.908846537272137, + "rewards/rejected": -10.185967763264975, + "step": 1242 + }, + { + "epoch": 0.3110221443763293, + "grad_norm": 23.25, + "kl": 10.553285598754883, + "learning_rate": 5e-06, + "logits/chosen": -58239488.0, + "logits/rejected": -49623240.0, + "logps/chosen": -314.5216979980469, + "logps/rejected": -742.7978515625, + "loss": 0.1889, + "rewards/chosen": 5.35376501083374, + "rewards/margins": 18.42606782913208, + "rewards/rejected": -13.07230281829834, + "step": 1243 + }, + { + "epoch": 0.31127236331790314, + "grad_norm": 13.5625, + "kl": 18.329917907714844, + "learning_rate": 5e-06, + "logits/chosen": -69109981.86666666, + "logits/rejected": -7965475.555555556, + "logps/chosen": -481.5512369791667, + "logps/rejected": -644.6374240451389, + "loss": 0.1081, + "rewards/chosen": 8.319614156087239, + "rewards/margins": 22.451639133029513, + "rewards/rejected": -14.132024976942274, + "step": 1244 + }, + { + "epoch": 0.31152258225947704, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41265280.0, + "logits/rejected": -51238592.0, + "logps/chosen": -421.37571022727275, + "logps/rejected": -518.7028245192307, + "loss": 0.0363, + "rewards/chosen": 6.618546919389204, + "rewards/margins": 17.33440244447935, + "rewards/rejected": -10.715855525090145, + "step": 1245 + }, + { + "epoch": 0.31177280120105094, + "grad_norm": 6.5625, + "kl": 2.755945920944214, + "learning_rate": 5e-06, + "logits/chosen": -28064611.2, + "logits/rejected": -56540539.428571425, + "logps/chosen": -322.7212646484375, + "logps/rejected": -618.4041573660714, + "loss": 0.0591, + "rewards/chosen": 6.259741973876953, + "rewards/margins": 17.569922637939452, + "rewards/rejected": -11.3101806640625, + "step": 1246 + }, + { + "epoch": 0.3120230201426248, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48707445.333333336, + "logits/rejected": -15703682.666666666, + "logps/chosen": -368.1058756510417, + "logps/rejected": -552.9111735026041, + "loss": 0.0501, + "rewards/chosen": 5.385286966959636, + "rewards/margins": 15.32118542989095, + "rewards/rejected": -9.935898462931315, + "step": 1247 + }, + { + "epoch": 0.3122732390841987, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69974586.18181819, + "logits/rejected": -38543259.07692308, + "logps/chosen": -531.0395063920455, + "logps/rejected": -658.4560546875, + "loss": 0.0169, + "rewards/chosen": 7.813090931285512, + "rewards/margins": 17.791294337986233, + "rewards/rejected": -9.978203406700722, + "step": 1248 + }, + { + "epoch": 0.3125234580257725, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57584999.11111111, + "logits/rejected": -26344529.066666666, + "logps/chosen": -350.635986328125, + "logps/rejected": -667.6969401041666, + "loss": 0.0268, + "rewards/chosen": 5.984920077853733, + "rewards/margins": 14.629159376356338, + "rewards/rejected": -8.644239298502605, + "step": 1249 + }, + { + "epoch": 0.3127736769673464, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29123074.0, + "logits/rejected": -55702496.0, + "logps/chosen": -311.8891906738281, + "logps/rejected": -502.1377258300781, + "loss": 0.0634, + "rewards/chosen": 6.006384372711182, + "rewards/margins": 14.619926929473877, + "rewards/rejected": -8.613542556762695, + "step": 1250 + }, + { + "epoch": 0.3130238959089203, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73254158.22222222, + "logits/rejected": -55719364.266666666, + "logps/chosen": -310.37744140625, + "logps/rejected": -609.6881510416666, + "loss": 0.0123, + "rewards/chosen": 8.115142822265625, + "rewards/margins": 21.34777018229167, + "rewards/rejected": -13.232627360026042, + "step": 1251 + }, + { + "epoch": 0.31327411485049417, + "grad_norm": 4.25, + "kl": 0.9545091390609741, + "learning_rate": 5e-06, + "logits/chosen": -55141664.0, + "logits/rejected": -63513642.666666664, + "logps/chosen": -418.5477701822917, + "logps/rejected": -597.7539876302084, + "loss": 0.0221, + "rewards/chosen": 8.29672114054362, + "rewards/margins": 18.434499104817707, + "rewards/rejected": -10.137777964274088, + "step": 1252 + }, + { + "epoch": 0.31352433379206807, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75923128.0, + "logits/rejected": -44531580.0, + "logps/chosen": -478.70660400390625, + "logps/rejected": -534.2015380859375, + "loss": 0.05, + "rewards/chosen": 7.728018760681152, + "rewards/margins": 15.276987552642822, + "rewards/rejected": -7.54896879196167, + "step": 1253 + }, + { + "epoch": 0.3137745527336419, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83031749.33333333, + "logits/rejected": -39417584.0, + "logps/chosen": -331.9674886067708, + "logps/rejected": -506.0525716145833, + "loss": 0.0087, + "rewards/chosen": 7.54632568359375, + "rewards/margins": 17.604111353556313, + "rewards/rejected": -10.057785669962565, + "step": 1254 + }, + { + "epoch": 0.3140247716752158, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74658396.44444445, + "logits/rejected": -70124074.66666667, + "logps/chosen": -329.97816297743054, + "logps/rejected": -542.4194661458333, + "loss": 0.0386, + "rewards/chosen": 6.777015262179905, + "rewards/margins": 16.719556766086157, + "rewards/rejected": -9.94254150390625, + "step": 1255 + }, + { + "epoch": 0.3142749906167897, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -127000969.84615384, + "logits/rejected": -55268328.72727273, + "logps/chosen": -355.56088491586536, + "logps/rejected": -578.9248046875, + "loss": 0.0751, + "rewards/chosen": 5.809100811298077, + "rewards/margins": 13.339487302553405, + "rewards/rejected": -7.530386491255327, + "step": 1256 + }, + { + "epoch": 0.31452520955836355, + "grad_norm": 6.53125, + "kl": 1.0577256679534912, + "learning_rate": 5e-06, + "logits/chosen": -43596028.44444445, + "logits/rejected": -39182749.86666667, + "logps/chosen": -442.22157118055554, + "logps/rejected": -665.2127604166667, + "loss": 0.05, + "rewards/chosen": 7.25166490342882, + "rewards/margins": 18.627539740668404, + "rewards/rejected": -11.375874837239584, + "step": 1257 + }, + { + "epoch": 0.31477542849993745, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37669309.538461536, + "logits/rejected": -59644497.45454545, + "logps/chosen": -438.7571364182692, + "logps/rejected": -543.0248135653409, + "loss": 0.0566, + "rewards/chosen": 7.409668555626502, + "rewards/margins": 20.050626474660593, + "rewards/rejected": -12.640957919034092, + "step": 1258 + }, + { + "epoch": 0.31502564744151135, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56054498.90909091, + "logits/rejected": -54907465.84615385, + "logps/chosen": -473.20210404829544, + "logps/rejected": -503.11144080528845, + "loss": 0.0287, + "rewards/chosen": 6.974796641956676, + "rewards/margins": 14.99104020979021, + "rewards/rejected": -8.016243567833534, + "step": 1259 + }, + { + "epoch": 0.3152758663830852, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31206835.2, + "logits/rejected": -49201581.71428572, + "logps/chosen": -395.8020263671875, + "logps/rejected": -494.59329659598217, + "loss": 0.0588, + "rewards/chosen": 5.384991836547852, + "rewards/margins": 15.526380865914483, + "rewards/rejected": -10.14138902936663, + "step": 1260 + }, + { + "epoch": 0.3155260853246591, + "grad_norm": 8.375, + "kl": 8.176740646362305, + "learning_rate": 5e-06, + "logits/chosen": -43392391.11111111, + "logits/rejected": -82738480.0, + "logps/chosen": -405.25946723090277, + "logps/rejected": -850.5380859375, + "loss": 0.0262, + "rewards/chosen": 7.222204420301649, + "rewards/margins": 25.75845294528537, + "rewards/rejected": -18.536248524983723, + "step": 1261 + }, + { + "epoch": 0.31577630426623293, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32081225.846153848, + "logits/rejected": -53937361.45454545, + "logps/chosen": -298.4744215745192, + "logps/rejected": -641.1859463778409, + "loss": 0.0665, + "rewards/chosen": 6.669564467210036, + "rewards/margins": 18.06667583972424, + "rewards/rejected": -11.397111372514205, + "step": 1262 + }, + { + "epoch": 0.31602652320780683, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46724680.0, + "logits/rejected": -64328520.0, + "logps/chosen": -299.5916748046875, + "logps/rejected": -706.776123046875, + "loss": 0.0399, + "rewards/chosen": 6.258452892303467, + "rewards/margins": 18.273942470550537, + "rewards/rejected": -12.01548957824707, + "step": 1263 + }, + { + "epoch": 0.31627674214938073, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 28210421.333333332, + "logits/rejected": -42209237.333333336, + "logps/chosen": -392.4527180989583, + "logps/rejected": -541.4264322916666, + "loss": 0.1138, + "rewards/chosen": 5.391928354899089, + "rewards/margins": 15.481610616048176, + "rewards/rejected": -10.089682261149088, + "step": 1264 + }, + { + "epoch": 0.3165269610909546, + "grad_norm": 7.0, + "kl": 10.140898704528809, + "learning_rate": 5e-06, + "logits/chosen": -18645608.727272727, + "logits/rejected": -52932824.615384616, + "logps/chosen": -387.9818004261364, + "logps/rejected": -547.0643780048077, + "loss": 0.0348, + "rewards/chosen": 7.51687275279652, + "rewards/margins": 17.709093587381858, + "rewards/rejected": -10.192220834585337, + "step": 1265 + }, + { + "epoch": 0.31677718003252847, + "grad_norm": 16.375, + "kl": 4.632102966308594, + "learning_rate": 5e-06, + "logits/chosen": -12108572.307692308, + "logits/rejected": -49853498.18181818, + "logps/chosen": -359.57898888221155, + "logps/rejected": -776.3927556818181, + "loss": 0.0298, + "rewards/chosen": 7.409845205453726, + "rewards/margins": 22.61639094852901, + "rewards/rejected": -15.206545743075283, + "step": 1266 + }, + { + "epoch": 0.3170273989741023, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40591907.2, + "logits/rejected": -33658761.14285714, + "logps/chosen": -386.29423828125, + "logps/rejected": -718.2762974330357, + "loss": 0.0177, + "rewards/chosen": 7.0229850769042965, + "rewards/margins": 21.472626713344027, + "rewards/rejected": -14.449641636439733, + "step": 1267 + }, + { + "epoch": 0.3172776179156762, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45506995.2, + "logits/rejected": -66656490.666666664, + "logps/chosen": -377.16149088541664, + "logps/rejected": -528.9047309027778, + "loss": 0.0247, + "rewards/chosen": 7.000936889648438, + "rewards/margins": 20.10698716905382, + "rewards/rejected": -13.106050279405382, + "step": 1268 + }, + { + "epoch": 0.3175278368572501, + "grad_norm": 5.3125, + "kl": 0.21482086181640625, + "learning_rate": 5e-06, + "logits/chosen": -43414668.8, + "logits/rejected": -57985998.222222224, + "logps/chosen": -408.27057291666665, + "logps/rejected": -523.1936848958334, + "loss": 0.0395, + "rewards/chosen": 6.874470011393229, + "rewards/margins": 16.336249287923177, + "rewards/rejected": -9.461779276529947, + "step": 1269 + }, + { + "epoch": 0.31777805579882396, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45678677.333333336, + "logits/rejected": -27435040.0, + "logps/chosen": -421.4379475911458, + "logps/rejected": -634.3431803385416, + "loss": 0.0826, + "rewards/chosen": 5.739796956380208, + "rewards/margins": 16.778418223063152, + "rewards/rejected": -11.038621266682943, + "step": 1270 + }, + { + "epoch": 0.31802827474039785, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45876445.09090909, + "logits/rejected": -57674259.692307696, + "logps/chosen": -382.40793678977275, + "logps/rejected": -755.0516826923077, + "loss": 0.0246, + "rewards/chosen": 7.015948208895597, + "rewards/margins": 22.097244689514586, + "rewards/rejected": -15.08129648061899, + "step": 1271 + }, + { + "epoch": 0.31827849368197175, + "grad_norm": 10.25, + "kl": 9.881219863891602, + "learning_rate": 5e-06, + "logits/chosen": -63959099.733333334, + "logits/rejected": -39135047.11111111, + "logps/chosen": -504.5817057291667, + "logps/rejected": -818.7438151041666, + "loss": 0.0528, + "rewards/chosen": 7.784451293945312, + "rewards/margins": 21.66170111762153, + "rewards/rejected": -13.877249823676216, + "step": 1272 + }, + { + "epoch": 0.3185287126235456, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48841749.333333336, + "logits/rejected": -39677824.0, + "logps/chosen": -304.23032633463544, + "logps/rejected": -530.8593207465278, + "loss": 0.0321, + "rewards/chosen": 7.149608612060547, + "rewards/margins": 19.449505700005425, + "rewards/rejected": -12.299897087944878, + "step": 1273 + }, + { + "epoch": 0.3187789315651195, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52670457.6, + "logits/rejected": -44969750.85714286, + "logps/chosen": -371.2403564453125, + "logps/rejected": -566.4298967633929, + "loss": 0.0482, + "rewards/chosen": 5.227872085571289, + "rewards/margins": 17.715610558646066, + "rewards/rejected": -12.487738473074776, + "step": 1274 + }, + { + "epoch": 0.31902915050669334, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29936220.444444444, + "logits/rejected": -52257467.733333334, + "logps/chosen": -255.77734375, + "logps/rejected": -552.2119791666667, + "loss": 0.05, + "rewards/chosen": 4.643010033501519, + "rewards/margins": 15.452336205376518, + "rewards/rejected": -10.809326171875, + "step": 1275 + }, + { + "epoch": 0.31927936944826724, + "grad_norm": 8.375, + "kl": 1.7003517150878906, + "learning_rate": 5e-06, + "logits/chosen": -40648089.6, + "logits/rejected": -78199409.77777778, + "logps/chosen": -368.59485677083336, + "logps/rejected": -622.3368598090278, + "loss": 0.0234, + "rewards/chosen": 6.860987854003906, + "rewards/margins": 15.721263122558593, + "rewards/rejected": -8.860275268554688, + "step": 1276 + }, + { + "epoch": 0.31952958838984113, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63605989.333333336, + "logits/rejected": -36670560.0, + "logps/chosen": -379.5790201822917, + "logps/rejected": -568.8715006510416, + "loss": 0.043, + "rewards/chosen": 5.063301722208659, + "rewards/margins": 18.589928309122723, + "rewards/rejected": -13.526626586914062, + "step": 1277 + }, + { + "epoch": 0.319779807331415, + "grad_norm": 6.90625, + "kl": 0.7730096578598022, + "learning_rate": 5e-06, + "logits/chosen": -43640310.85714286, + "logits/rejected": -55026489.6, + "logps/chosen": -311.25924246651783, + "logps/rejected": -493.325732421875, + "loss": 0.0365, + "rewards/chosen": 5.7841322762625555, + "rewards/margins": 17.032314954485212, + "rewards/rejected": -11.248182678222657, + "step": 1278 + }, + { + "epoch": 0.3200300262729889, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39818297.6, + "logits/rejected": -48446802.28571428, + "logps/chosen": -313.08740234375, + "logps/rejected": -579.4892578125, + "loss": 0.026, + "rewards/chosen": 5.144179916381836, + "rewards/margins": 17.348122351510185, + "rewards/rejected": -12.203942435128349, + "step": 1279 + }, + { + "epoch": 0.3202802452145627, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48784283.428571425, + "logits/rejected": -44368300.8, + "logps/chosen": -504.41245814732144, + "logps/rejected": -582.00107421875, + "loss": 0.034, + "rewards/chosen": 8.076739719935826, + "rewards/margins": 22.640622166224887, + "rewards/rejected": -14.563882446289062, + "step": 1280 + }, + { + "epoch": 0.3205304641561366, + "grad_norm": 9.1875, + "kl": 2.7007224559783936, + "learning_rate": 5e-06, + "logits/chosen": -105592797.0909091, + "logits/rejected": -35596194.461538464, + "logps/chosen": -453.2277166193182, + "logps/rejected": -424.7365910456731, + "loss": 0.0295, + "rewards/chosen": 5.819885947487571, + "rewards/margins": 17.609108077896224, + "rewards/rejected": -11.789222130408653, + "step": 1281 + }, + { + "epoch": 0.3207806830977105, + "grad_norm": 12.5625, + "kl": 7.347240447998047, + "learning_rate": 5e-06, + "logits/chosen": -43336922.35294118, + "logits/rejected": -40496996.571428575, + "logps/chosen": -371.56399356617646, + "logps/rejected": -655.4492885044643, + "loss": 0.0357, + "rewards/chosen": 7.157273685230928, + "rewards/margins": 22.04531885996586, + "rewards/rejected": -14.888045174734932, + "step": 1282 + }, + { + "epoch": 0.32103090203928436, + "grad_norm": 24.0, + "kl": 0.1550954282283783, + "learning_rate": 5e-06, + "logits/chosen": -31235501.333333332, + "logits/rejected": -48669770.666666664, + "logps/chosen": -353.2637939453125, + "logps/rejected": -601.1693522135416, + "loss": 0.065, + "rewards/chosen": 5.363969802856445, + "rewards/margins": 20.086745580037437, + "rewards/rejected": -14.72277577718099, + "step": 1283 + }, + { + "epoch": 0.32128112098085826, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63968576.0, + "logits/rejected": -36907298.461538464, + "logps/chosen": -352.1075550426136, + "logps/rejected": -633.4033203125, + "loss": 0.0275, + "rewards/chosen": 6.64397569136186, + "rewards/margins": 21.92105065192376, + "rewards/rejected": -15.2770749605619, + "step": 1284 + }, + { + "epoch": 0.3215313399224321, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49737393.777777776, + "logits/rejected": -63062912.0, + "logps/chosen": -274.05799696180554, + "logps/rejected": -688.4330729166667, + "loss": 0.0871, + "rewards/chosen": 3.257261488172743, + "rewards/margins": 16.115530734592014, + "rewards/rejected": -12.85826924641927, + "step": 1285 + }, + { + "epoch": 0.321781558864006, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26646731.42857143, + "logits/rejected": -47209379.2, + "logps/chosen": -382.17529296875, + "logps/rejected": -605.72880859375, + "loss": 0.0555, + "rewards/chosen": 4.946352277483259, + "rewards/margins": 20.262341962541853, + "rewards/rejected": -15.315989685058593, + "step": 1286 + }, + { + "epoch": 0.3220317778055799, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60115338.666666664, + "logits/rejected": -55130805.333333336, + "logps/chosen": -303.1997884114583, + "logps/rejected": -608.5017496744791, + "loss": 0.0459, + "rewards/chosen": 5.368688583374023, + "rewards/margins": 18.260388056437172, + "rewards/rejected": -12.89169947306315, + "step": 1287 + }, + { + "epoch": 0.32228199674715374, + "grad_norm": 18.625, + "kl": 9.332144737243652, + "learning_rate": 5e-06, + "logits/chosen": -64099438.54545455, + "logits/rejected": -63784659.692307696, + "logps/chosen": -419.41787997159093, + "logps/rejected": -504.9330303485577, + "loss": 0.0656, + "rewards/chosen": 7.781129316850142, + "rewards/margins": 19.046220259232953, + "rewards/rejected": -11.265090942382812, + "step": 1288 + }, + { + "epoch": 0.32253221568872764, + "grad_norm": 7.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59888535.27272727, + "logits/rejected": -46535222.15384615, + "logps/chosen": -487.5755504261364, + "logps/rejected": -631.5006760817307, + "loss": 0.0171, + "rewards/chosen": 7.721756675026634, + "rewards/margins": 19.121367928031443, + "rewards/rejected": -11.399611253004808, + "step": 1289 + }, + { + "epoch": 0.32278243463030154, + "grad_norm": 11.375, + "kl": 4.022454738616943, + "learning_rate": 5e-06, + "logits/chosen": -48174052.571428575, + "logits/rejected": -36629676.8, + "logps/chosen": -458.96323939732144, + "logps/rejected": -695.745556640625, + "loss": 0.0626, + "rewards/chosen": 6.650062561035156, + "rewards/margins": 23.885919189453126, + "rewards/rejected": -17.23585662841797, + "step": 1290 + }, + { + "epoch": 0.3230326535718754, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33614868.36363637, + "logits/rejected": -31564366.769230768, + "logps/chosen": -394.28444602272725, + "logps/rejected": -528.0996844951923, + "loss": 0.0376, + "rewards/chosen": 5.247021068226207, + "rewards/margins": 15.810591530966592, + "rewards/rejected": -10.563570462740385, + "step": 1291 + }, + { + "epoch": 0.3232828725134493, + "grad_norm": 1.7734375, + "kl": 6.965981483459473, + "learning_rate": 5e-06, + "logits/chosen": -74536950.85714285, + "logits/rejected": -57493292.8, + "logps/chosen": -439.77640206473217, + "logps/rejected": -529.878515625, + "loss": 0.0823, + "rewards/chosen": 8.393458775111608, + "rewards/margins": 20.22898428780692, + "rewards/rejected": -11.835525512695312, + "step": 1292 + }, + { + "epoch": 0.3235330914550231, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56255319.27272727, + "logits/rejected": -63900489.84615385, + "logps/chosen": -322.28677645596593, + "logps/rejected": -634.2214543269231, + "loss": 0.0572, + "rewards/chosen": 7.038822520862926, + "rewards/margins": 17.64155994762074, + "rewards/rejected": -10.602737426757812, + "step": 1293 + }, + { + "epoch": 0.323783310396597, + "grad_norm": 14.625, + "kl": 2.604123830795288, + "learning_rate": 5e-06, + "logits/chosen": -26121428.0, + "logits/rejected": -64316448.0, + "logps/chosen": -312.4049377441406, + "logps/rejected": -732.2247314453125, + "loss": 0.0732, + "rewards/chosen": 5.839616775512695, + "rewards/margins": 21.819026947021484, + "rewards/rejected": -15.979410171508789, + "step": 1294 + }, + { + "epoch": 0.3240335293381709, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49314279.384615384, + "logits/rejected": -30681736.727272727, + "logps/chosen": -398.8065655048077, + "logps/rejected": -743.0536665482955, + "loss": 0.0368, + "rewards/chosen": 6.979983990009014, + "rewards/margins": 21.606319827633303, + "rewards/rejected": -14.62633583762429, + "step": 1295 + }, + { + "epoch": 0.32428374827974477, + "grad_norm": 5.90625, + "kl": 10.46474838256836, + "learning_rate": 5e-06, + "logits/chosen": -33184354.285714287, + "logits/rejected": 21539731.2, + "logps/chosen": -512.68115234375, + "logps/rejected": -608.69970703125, + "loss": 0.0104, + "rewards/chosen": 7.437153407505581, + "rewards/margins": 18.65278069632394, + "rewards/rejected": -11.215627288818359, + "step": 1296 + }, + { + "epoch": 0.32453396722131866, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55096283.428571425, + "logits/rejected": -38106669.176470585, + "logps/chosen": -387.54652622767856, + "logps/rejected": -511.9735753676471, + "loss": 0.0311, + "rewards/chosen": 7.0351137433733255, + "rewards/margins": 18.50823673280347, + "rewards/rejected": -11.473122989430147, + "step": 1297 + }, + { + "epoch": 0.3247841861628925, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69361152.0, + "logits/rejected": -27781085.53846154, + "logps/chosen": -484.75284090909093, + "logps/rejected": -587.9753605769231, + "loss": 0.023, + "rewards/chosen": 7.069455233487216, + "rewards/margins": 19.2225278841032, + "rewards/rejected": -12.153072650615986, + "step": 1298 + }, + { + "epoch": 0.3250344051044664, + "grad_norm": 13.0, + "kl": 4.947166442871094, + "learning_rate": 5e-06, + "logits/chosen": -48727114.666666664, + "logits/rejected": -29651333.333333332, + "logps/chosen": -423.2584635416667, + "logps/rejected": -330.3903401692708, + "loss": 0.0626, + "rewards/chosen": 8.027955373128256, + "rewards/margins": 15.964884440104168, + "rewards/rejected": -7.936929066975911, + "step": 1299 + }, + { + "epoch": 0.3252846240460403, + "grad_norm": 12.125, + "kl": 2.5877013206481934, + "learning_rate": 5e-06, + "logits/chosen": -54991104.0, + "logits/rejected": 3685610.6666666665, + "logps/chosen": -431.54120551215277, + "logps/rejected": -522.689453125, + "loss": 0.0348, + "rewards/chosen": 8.492072211371529, + "rewards/margins": 20.74616156684028, + "rewards/rejected": -12.25408935546875, + "step": 1300 + }, + { + "epoch": 0.32553484298761415, + "grad_norm": 3.390625, + "kl": 4.459981918334961, + "learning_rate": 5e-06, + "logits/chosen": -58122426.18181818, + "logits/rejected": -51050353.23076923, + "logps/chosen": -410.40016867897725, + "logps/rejected": -652.0196063701923, + "loss": 0.021, + "rewards/chosen": 8.1951904296875, + "rewards/margins": 22.66502615121695, + "rewards/rejected": -14.469835721529448, + "step": 1301 + }, + { + "epoch": 0.32578506192918805, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57415301.333333336, + "logits/rejected": -27322098.666666668, + "logps/chosen": -507.4646809895833, + "logps/rejected": -532.3552652994791, + "loss": 0.0354, + "rewards/chosen": 8.16363271077474, + "rewards/margins": 18.853150685628258, + "rewards/rejected": -10.689517974853516, + "step": 1302 + }, + { + "epoch": 0.3260352808707619, + "grad_norm": 7.5, + "kl": 2.8420791625976562, + "learning_rate": 5e-06, + "logits/chosen": -45678602.666666664, + "logits/rejected": 47868314.666666664, + "logps/chosen": -484.0874837239583, + "logps/rejected": -394.8536376953125, + "loss": 0.04, + "rewards/chosen": 7.418270746866862, + "rewards/margins": 16.138280232747395, + "rewards/rejected": -8.720009485880533, + "step": 1303 + }, + { + "epoch": 0.3262854998123358, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37260436.571428575, + "logits/rejected": -23889561.6, + "logps/chosen": -387.35707310267856, + "logps/rejected": -335.847265625, + "loss": 0.0206, + "rewards/chosen": 6.58929933820452, + "rewards/margins": 15.41866182599749, + "rewards/rejected": -8.829362487792968, + "step": 1304 + }, + { + "epoch": 0.3265357187539097, + "grad_norm": 14.8125, + "kl": 0.38400477170944214, + "learning_rate": 5e-06, + "logits/chosen": -54965115.07692308, + "logits/rejected": -62456593.45454545, + "logps/chosen": -400.7051532451923, + "logps/rejected": -552.0832741477273, + "loss": 0.0908, + "rewards/chosen": 5.856508108285757, + "rewards/margins": 16.283283953900103, + "rewards/rejected": -10.426775845614346, + "step": 1305 + }, + { + "epoch": 0.32678593769548353, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49025269.333333336, + "logits/rejected": -27415837.866666667, + "logps/chosen": -413.7991536458333, + "logps/rejected": -386.639453125, + "loss": 0.0926, + "rewards/chosen": 4.815151214599609, + "rewards/margins": 12.762701161702473, + "rewards/rejected": -7.947549947102865, + "step": 1306 + }, + { + "epoch": 0.32703615663705743, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46450272.0, + "logits/rejected": -16916649.333333332, + "logps/chosen": -543.2366129557291, + "logps/rejected": -612.9325764973959, + "loss": 0.0147, + "rewards/chosen": 8.421676635742188, + "rewards/margins": 22.21736399332682, + "rewards/rejected": -13.795687357584635, + "step": 1307 + }, + { + "epoch": 0.32728637557863133, + "grad_norm": 9.125, + "kl": 2.1481730937957764, + "learning_rate": 5e-06, + "logits/chosen": -51507024.0, + "logits/rejected": -69488650.66666667, + "logps/chosen": -470.5382893880208, + "logps/rejected": -731.5758463541666, + "loss": 0.0317, + "rewards/chosen": 8.839162190755209, + "rewards/margins": 21.126419067382812, + "rewards/rejected": -12.287256876627604, + "step": 1308 + }, + { + "epoch": 0.32753659452020517, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75509195.63636364, + "logits/rejected": -28084519.384615384, + "logps/chosen": -289.5700017755682, + "logps/rejected": -491.72716346153845, + "loss": 0.0505, + "rewards/chosen": 4.795547832142223, + "rewards/margins": 16.448959697376598, + "rewards/rejected": -11.653411865234375, + "step": 1309 + }, + { + "epoch": 0.32778681346177907, + "grad_norm": 11.25, + "kl": 6.043050289154053, + "learning_rate": 5e-06, + "logits/chosen": -54099234.90909091, + "logits/rejected": -108971204.92307693, + "logps/chosen": -475.1676136363636, + "logps/rejected": -531.2613807091346, + "loss": 0.0373, + "rewards/chosen": 8.551368019797586, + "rewards/margins": 22.20765323238773, + "rewards/rejected": -13.656285212590145, + "step": 1310 + }, + { + "epoch": 0.3280370324033529, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56028224.0, + "logits/rejected": -43560963.2, + "logps/chosen": -456.30496651785717, + "logps/rejected": -610.762255859375, + "loss": 0.0292, + "rewards/chosen": 5.967198508126395, + "rewards/margins": 19.150186484200614, + "rewards/rejected": -13.18298797607422, + "step": 1311 + }, + { + "epoch": 0.3282872513449268, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -102815469.71428572, + "logits/rejected": -61270885.64705882, + "logps/chosen": -479.75205775669644, + "logps/rejected": -652.5213694852941, + "loss": 0.0417, + "rewards/chosen": 8.19467544555664, + "rewards/margins": 21.028763939352597, + "rewards/rejected": -12.834088493795957, + "step": 1312 + }, + { + "epoch": 0.3285374702865007, + "grad_norm": 5.59375, + "kl": 2.733454942703247, + "learning_rate": 5e-06, + "logits/chosen": -37395322.18181818, + "logits/rejected": -912665.8461538461, + "logps/chosen": -374.05055930397725, + "logps/rejected": -523.19482421875, + "loss": 0.0427, + "rewards/chosen": 8.183658253062855, + "rewards/margins": 16.726570076042123, + "rewards/rejected": -8.542911822979267, + "step": 1313 + }, + { + "epoch": 0.32878768922807455, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31023730.666666668, + "logits/rejected": -52924389.333333336, + "logps/chosen": -355.6796061197917, + "logps/rejected": -574.7836507161459, + "loss": 0.0706, + "rewards/chosen": 6.032105763753255, + "rewards/margins": 16.904056549072266, + "rewards/rejected": -10.87195078531901, + "step": 1314 + }, + { + "epoch": 0.32903790816964845, + "grad_norm": 12.9375, + "kl": 14.697174072265625, + "learning_rate": 5e-06, + "logits/chosen": -48039571.692307696, + "logits/rejected": -69240017.45454545, + "logps/chosen": -455.8532151442308, + "logps/rejected": -730.8930220170455, + "loss": 0.0803, + "rewards/chosen": 8.880028357872597, + "rewards/margins": 23.880536059399585, + "rewards/rejected": -15.000507701526988, + "step": 1315 + }, + { + "epoch": 0.3292881271112223, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57237326.76923077, + "logits/rejected": -43537780.36363637, + "logps/chosen": -445.71814903846155, + "logps/rejected": -462.0006214488636, + "loss": 0.0184, + "rewards/chosen": 7.54583505483774, + "rewards/margins": 16.45185201151388, + "rewards/rejected": -8.906016956676137, + "step": 1316 + }, + { + "epoch": 0.3295383460527962, + "grad_norm": 7.875, + "kl": 3.6453094482421875, + "learning_rate": 5e-06, + "logits/chosen": -84937830.4, + "logits/rejected": -63595181.71428572, + "logps/chosen": -481.329296875, + "logps/rejected": -652.0350167410714, + "loss": 0.043, + "rewards/chosen": 6.581423950195313, + "rewards/margins": 18.190345546177454, + "rewards/rejected": -11.608921595982142, + "step": 1317 + }, + { + "epoch": 0.3297885649943701, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71455808.0, + "logits/rejected": -60674279.384615384, + "logps/chosen": -270.35311612215907, + "logps/rejected": -752.7180739182693, + "loss": 0.0303, + "rewards/chosen": 3.9281075217507104, + "rewards/margins": 18.5446823226822, + "rewards/rejected": -14.61657480093149, + "step": 1318 + }, + { + "epoch": 0.33003878393594394, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46047648.0, + "logits/rejected": -70868557.71428572, + "logps/chosen": -357.9293701171875, + "logps/rejected": -740.3204520089286, + "loss": 0.0259, + "rewards/chosen": 5.3519329071044925, + "rewards/margins": 22.76464173453195, + "rewards/rejected": -17.412708827427455, + "step": 1319 + }, + { + "epoch": 0.33028900287751783, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66989794.461538464, + "logits/rejected": -56064698.18181818, + "logps/chosen": -458.98985877403845, + "logps/rejected": -545.8565784801136, + "loss": 0.0072, + "rewards/chosen": 8.09774428147536, + "rewards/margins": 17.084722985754482, + "rewards/rejected": -8.98697870427912, + "step": 1320 + }, + { + "epoch": 0.3305392218190917, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69612960.0, + "logits/rejected": -51901398.4, + "logps/chosen": -402.77901785714283, + "logps/rejected": -564.616845703125, + "loss": 0.0586, + "rewards/chosen": 5.751282828194754, + "rewards/margins": 19.252323477608815, + "rewards/rejected": -13.501040649414062, + "step": 1321 + }, + { + "epoch": 0.3307894407606656, + "grad_norm": 14.5625, + "kl": 12.04847240447998, + "learning_rate": 5e-06, + "logits/chosen": -34522457.6, + "logits/rejected": -39687120.0, + "logps/chosen": -476.9611328125, + "logps/rejected": -897.1007080078125, + "loss": 0.0823, + "rewards/chosen": 8.806517791748046, + "rewards/margins": 26.31516761779785, + "rewards/rejected": -17.508649826049805, + "step": 1322 + }, + { + "epoch": 0.3310396597022395, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70662486.85714285, + "logits/rejected": -63623667.2, + "logps/chosen": -357.3809291294643, + "logps/rejected": -658.059326171875, + "loss": 0.0353, + "rewards/chosen": 5.4811875479561945, + "rewards/margins": 20.022956957135882, + "rewards/rejected": -14.541769409179688, + "step": 1323 + }, + { + "epoch": 0.3312898786438133, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48801028.571428575, + "logits/rejected": -69028928.0, + "logps/chosen": -430.1524135044643, + "logps/rejected": -755.635546875, + "loss": 0.0404, + "rewards/chosen": 8.647618430001396, + "rewards/margins": 21.954083578927175, + "rewards/rejected": -13.306465148925781, + "step": 1324 + }, + { + "epoch": 0.3315400975853872, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37805664.0, + "logits/rejected": -49762391.27272727, + "logps/chosen": -327.4065504807692, + "logps/rejected": -559.6705433238636, + "loss": 0.0252, + "rewards/chosen": 6.329107431265024, + "rewards/margins": 18.372063696801245, + "rewards/rejected": -12.04295626553622, + "step": 1325 + }, + { + "epoch": 0.3317903165269611, + "grad_norm": 10.625, + "kl": 2.1260504722595215, + "learning_rate": 5e-06, + "logits/chosen": -58163997.538461536, + "logits/rejected": -58095197.09090909, + "logps/chosen": -354.61658653846155, + "logps/rejected": -867.4247159090909, + "loss": 0.0375, + "rewards/chosen": 6.199777456430288, + "rewards/margins": 21.506789841018357, + "rewards/rejected": -15.307012384588068, + "step": 1326 + }, + { + "epoch": 0.33204053546853496, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27763982.0, + "logits/rejected": -47246644.0, + "logps/chosen": -394.0245666503906, + "logps/rejected": -583.006103515625, + "loss": 0.0318, + "rewards/chosen": 6.902261734008789, + "rewards/margins": 19.452731132507324, + "rewards/rejected": -12.550469398498535, + "step": 1327 + }, + { + "epoch": 0.33229075441010886, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61280471.27272727, + "logits/rejected": -29800585.846153848, + "logps/chosen": -466.18918678977275, + "logps/rejected": -373.14002403846155, + "loss": 0.0337, + "rewards/chosen": 7.508759932084517, + "rewards/margins": 16.588247072446595, + "rewards/rejected": -9.07948714036208, + "step": 1328 + }, + { + "epoch": 0.3325409733516827, + "grad_norm": 5.5, + "kl": 0.4970232844352722, + "learning_rate": 5e-06, + "logits/chosen": -33947829.333333336, + "logits/rejected": -36240066.666666664, + "logps/chosen": -382.8893229166667, + "logps/rejected": -538.6937662760416, + "loss": 0.0333, + "rewards/chosen": 6.703197479248047, + "rewards/margins": 18.959982554117836, + "rewards/rejected": -12.256785074869791, + "step": 1329 + }, + { + "epoch": 0.3327911922932566, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37734368.0, + "logits/rejected": 33503694.933333334, + "logps/chosen": -349.6689181857639, + "logps/rejected": -447.7561848958333, + "loss": 0.0202, + "rewards/chosen": 6.241354200575087, + "rewards/margins": 16.77003156873915, + "rewards/rejected": -10.528677368164063, + "step": 1330 + }, + { + "epoch": 0.3330414112348305, + "grad_norm": 5.71875, + "kl": 1.872991919517517, + "learning_rate": 5e-06, + "logits/chosen": -43453418.666666664, + "logits/rejected": -6143753.333333333, + "logps/chosen": -343.99696180555554, + "logps/rejected": -605.0619710286459, + "loss": 0.0732, + "rewards/chosen": 5.6351267496744795, + "rewards/margins": 16.732693990071613, + "rewards/rejected": -11.097567240397135, + "step": 1331 + }, + { + "epoch": 0.33329163017640434, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54579840.0, + "logits/rejected": -63119669.333333336, + "logps/chosen": -438.6621500651042, + "logps/rejected": -531.3933512369791, + "loss": 0.0517, + "rewards/chosen": 5.8907470703125, + "rewards/margins": 15.888117472330729, + "rewards/rejected": -9.997370402018229, + "step": 1332 + }, + { + "epoch": 0.33354184911797824, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39835549.09090909, + "logits/rejected": -28802279.384615384, + "logps/chosen": -275.9366344105114, + "logps/rejected": -451.02249849759613, + "loss": 0.1043, + "rewards/chosen": 4.384413979270241, + "rewards/margins": 14.761321327903055, + "rewards/rejected": -10.376907348632812, + "step": 1333 + }, + { + "epoch": 0.3337920680595521, + "grad_norm": 6.375, + "kl": 0.39078569412231445, + "learning_rate": 5e-06, + "logits/chosen": -41747347.2, + "logits/rejected": -48206258.28571428, + "logps/chosen": -188.80257568359374, + "logps/rejected": -517.7456752232143, + "loss": 0.0396, + "rewards/chosen": 4.530624771118164, + "rewards/margins": 16.05058435712542, + "rewards/rejected": -11.519959586007255, + "step": 1334 + }, + { + "epoch": 0.334042287001126, + "grad_norm": 11.375, + "kl": 1.87445068359375, + "learning_rate": 5e-06, + "logits/chosen": -51963562.666666664, + "logits/rejected": -34073514.666666664, + "logps/chosen": -411.3940755208333, + "logps/rejected": -372.69349500868054, + "loss": 0.0319, + "rewards/chosen": 7.220335896809896, + "rewards/margins": 18.33167928059896, + "rewards/rejected": -11.111343383789062, + "step": 1335 + }, + { + "epoch": 0.3342925059426999, + "grad_norm": 18.25, + "kl": 1.205657958984375, + "learning_rate": 5e-06, + "logits/chosen": -7935102.4, + "logits/rejected": -35353090.28571428, + "logps/chosen": -305.5932373046875, + "logps/rejected": -439.08461216517856, + "loss": 0.0682, + "rewards/chosen": 5.213975524902343, + "rewards/margins": 17.195648629324776, + "rewards/rejected": -11.981673104422432, + "step": 1336 + }, + { + "epoch": 0.3345427248842737, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50167117.71428572, + "logits/rejected": -64593702.4, + "logps/chosen": -365.00830078125, + "logps/rejected": -658.148388671875, + "loss": 0.0617, + "rewards/chosen": 5.423768724714007, + "rewards/margins": 18.849464525495257, + "rewards/rejected": -13.42569580078125, + "step": 1337 + }, + { + "epoch": 0.3347929438258476, + "grad_norm": 10.5, + "kl": 2.0387485027313232, + "learning_rate": 5e-06, + "logits/chosen": -52057192.72727273, + "logits/rejected": -22635057.230769232, + "logps/chosen": -426.41792436079544, + "logps/rejected": -537.6845327524038, + "loss": 0.037, + "rewards/chosen": 6.5159149169921875, + "rewards/margins": 18.38060349684495, + "rewards/rejected": -11.864688579852764, + "step": 1338 + }, + { + "epoch": 0.3350431627674215, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28478294.4, + "logits/rejected": -22022157.714285713, + "logps/chosen": -207.52890625, + "logps/rejected": -577.8662109375, + "loss": 0.0594, + "rewards/chosen": 5.3212932586669925, + "rewards/margins": 15.112489809308734, + "rewards/rejected": -9.791196550641741, + "step": 1339 + }, + { + "epoch": 0.33529338170899536, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54632616.0, + "logits/rejected": -46140208.0, + "logps/chosen": -337.9712219238281, + "logps/rejected": -574.8458251953125, + "loss": 0.0476, + "rewards/chosen": 5.999845504760742, + "rewards/margins": 19.704029083251953, + "rewards/rejected": -13.704183578491211, + "step": 1340 + }, + { + "epoch": 0.33554360065056926, + "grad_norm": 13.0, + "kl": 1.5396665334701538, + "learning_rate": 5e-06, + "logits/chosen": -67867834.18181819, + "logits/rejected": -52158749.538461536, + "logps/chosen": -307.18257279829544, + "logps/rejected": -528.5794771634615, + "loss": 0.0609, + "rewards/chosen": 5.633566076105291, + "rewards/margins": 17.671494730702648, + "rewards/rejected": -12.037928654597355, + "step": 1341 + }, + { + "epoch": 0.3357938195921431, + "grad_norm": 6.8125, + "kl": 5.219085693359375, + "learning_rate": 5e-06, + "logits/chosen": -52744969.14285714, + "logits/rejected": -25142401.6, + "logps/chosen": -412.9884556361607, + "logps/rejected": -497.02978515625, + "loss": 0.046, + "rewards/chosen": 7.841343470982143, + "rewards/margins": 21.259319850376674, + "rewards/rejected": -13.417976379394531, + "step": 1342 + }, + { + "epoch": 0.336044038533717, + "grad_norm": 4.4375, + "kl": 0.0262451171875, + "learning_rate": 5e-06, + "logits/chosen": -43297984.0, + "logits/rejected": -48623338.666666664, + "logps/chosen": -377.7265218098958, + "logps/rejected": -585.1934000651041, + "loss": 0.0504, + "rewards/chosen": 6.501618067423503, + "rewards/margins": 18.779150009155273, + "rewards/rejected": -12.277531941731771, + "step": 1343 + }, + { + "epoch": 0.3362942574752909, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30450723.555555556, + "logits/rejected": -31818880.0, + "logps/chosen": -414.99799262152777, + "logps/rejected": -767.8289713541667, + "loss": 0.0398, + "rewards/chosen": 5.126502143012153, + "rewards/margins": 21.735421413845486, + "rewards/rejected": -16.608919270833333, + "step": 1344 + }, + { + "epoch": 0.33654447641686475, + "grad_norm": 11.0625, + "kl": 4.947748184204102, + "learning_rate": 5e-06, + "logits/chosen": -51574592.0, + "logits/rejected": -78118165.33333333, + "logps/chosen": -442.65576171875, + "logps/rejected": -623.1094156901041, + "loss": 0.0512, + "rewards/chosen": 7.760725657145183, + "rewards/margins": 18.367804845174152, + "rewards/rejected": -10.60707918802897, + "step": 1345 + }, + { + "epoch": 0.33679469535843864, + "grad_norm": 12.8125, + "kl": 7.564021110534668, + "learning_rate": 5e-06, + "logits/chosen": -67430976.0, + "logits/rejected": -62796666.666666664, + "logps/chosen": -436.0860188802083, + "logps/rejected": -500.9079996744792, + "loss": 0.0495, + "rewards/chosen": 6.593201955159505, + "rewards/margins": 15.753700256347656, + "rewards/rejected": -9.16049830118815, + "step": 1346 + }, + { + "epoch": 0.3370449143000125, + "grad_norm": 10.9375, + "kl": 7.317770004272461, + "learning_rate": 5e-06, + "logits/chosen": -22696803.76470588, + "logits/rejected": -70915177.14285715, + "logps/chosen": -308.36764705882354, + "logps/rejected": -582.8038504464286, + "loss": 0.0669, + "rewards/chosen": 6.396182789522059, + "rewards/margins": 18.096769380970162, + "rewards/rejected": -11.700586591448102, + "step": 1347 + }, + { + "epoch": 0.3372951332415864, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42279850.666666664, + "logits/rejected": -68172637.86666666, + "logps/chosen": -335.36371527777777, + "logps/rejected": -596.8044270833333, + "loss": 0.025, + "rewards/chosen": 6.992486741807726, + "rewards/margins": 18.569833543565537, + "rewards/rejected": -11.577346801757812, + "step": 1348 + }, + { + "epoch": 0.3375453521831603, + "grad_norm": 8.6875, + "kl": 1.6140928268432617, + "learning_rate": 5e-06, + "logits/chosen": -44424864.0, + "logits/rejected": -70485562.66666667, + "logps/chosen": -360.2839762369792, + "logps/rejected": -661.4430338541666, + "loss": 0.082, + "rewards/chosen": 6.851779937744141, + "rewards/margins": 19.409959157307945, + "rewards/rejected": -12.558179219563803, + "step": 1349 + }, + { + "epoch": 0.33779557112473413, + "grad_norm": 20.125, + "kl": 13.532404899597168, + "learning_rate": 5e-06, + "logits/chosen": -54110276.92307692, + "logits/rejected": -44575662.54545455, + "logps/chosen": -385.5871394230769, + "logps/rejected": -538.53173828125, + "loss": 0.0824, + "rewards/chosen": 7.348847022423377, + "rewards/margins": 15.594080731585308, + "rewards/rejected": -8.245233709161932, + "step": 1350 + }, + { + "epoch": 0.338045790066308, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50688764.44444445, + "logits/rejected": -46143872.0, + "logps/chosen": -504.49370659722223, + "logps/rejected": -485.072265625, + "loss": 0.0401, + "rewards/chosen": 8.675616794162327, + "rewards/margins": 14.77322726779514, + "rewards/rejected": -6.0976104736328125, + "step": 1351 + }, + { + "epoch": 0.33829600900788187, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36727572.571428575, + "logits/rejected": -27164060.8, + "logps/chosen": -303.04356166294644, + "logps/rejected": -322.324267578125, + "loss": 0.0498, + "rewards/chosen": 6.981774466378348, + "rewards/margins": 14.689091055733817, + "rewards/rejected": -7.707316589355469, + "step": 1352 + }, + { + "epoch": 0.33854622794945577, + "grad_norm": 11.0, + "kl": 8.41501235961914, + "learning_rate": 5e-06, + "logits/chosen": -45846715.733333334, + "logits/rejected": 32867413.333333332, + "logps/chosen": -419.6502278645833, + "logps/rejected": -599.8024088541666, + "loss": 0.0923, + "rewards/chosen": 6.2668706258138025, + "rewards/margins": 16.30726996527778, + "rewards/rejected": -10.040399339463976, + "step": 1353 + }, + { + "epoch": 0.33879644689102967, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51326234.666666664, + "logits/rejected": -78415200.0, + "logps/chosen": -353.623291015625, + "logps/rejected": -734.7068684895834, + "loss": 0.0288, + "rewards/chosen": 7.545745849609375, + "rewards/margins": 22.716283162434898, + "rewards/rejected": -15.170537312825521, + "step": 1354 + }, + { + "epoch": 0.3390466658326035, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36534173.09090909, + "logits/rejected": -51090215.384615384, + "logps/chosen": -383.0226384943182, + "logps/rejected": -695.0777493990385, + "loss": 0.0118, + "rewards/chosen": 6.529823303222656, + "rewards/margins": 18.708175072303185, + "rewards/rejected": -12.178351769080528, + "step": 1355 + }, + { + "epoch": 0.3392968847741774, + "grad_norm": 8.125, + "kl": 2.185840606689453, + "learning_rate": 5e-06, + "logits/chosen": -44884644.0, + "logits/rejected": -39766516.0, + "logps/chosen": -494.3297424316406, + "logps/rejected": -685.2808837890625, + "loss": 0.0089, + "rewards/chosen": 9.92443561553955, + "rewards/margins": 23.356390953063965, + "rewards/rejected": -13.431955337524414, + "step": 1356 + }, + { + "epoch": 0.3395471037157513, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74740584.72727273, + "logits/rejected": -66027347.692307696, + "logps/chosen": -446.85293856534093, + "logps/rejected": -647.4830979567307, + "loss": 0.0253, + "rewards/chosen": 7.935643282803622, + "rewards/margins": 20.64057703618403, + "rewards/rejected": -12.70493375338041, + "step": 1357 + }, + { + "epoch": 0.33979732265732515, + "grad_norm": 8.625, + "kl": 2.9340224266052246, + "learning_rate": 5e-06, + "logits/chosen": -27940229.333333332, + "logits/rejected": -51206391.46666667, + "logps/chosen": -440.60986328125, + "logps/rejected": -557.525, + "loss": 0.0655, + "rewards/chosen": 6.054548051622179, + "rewards/margins": 15.8319701300727, + "rewards/rejected": -9.77742207845052, + "step": 1358 + }, + { + "epoch": 0.34004754159889905, + "grad_norm": 12.3125, + "kl": 0.7810115814208984, + "learning_rate": 5e-06, + "logits/chosen": -15250318.0, + "logits/rejected": 2441615.0, + "logps/chosen": -291.82830810546875, + "logps/rejected": -410.85205078125, + "loss": 0.0746, + "rewards/chosen": 5.514744281768799, + "rewards/margins": 12.880590915679932, + "rewards/rejected": -7.365846633911133, + "step": 1359 + }, + { + "epoch": 0.3402977605404729, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31370933.333333332, + "logits/rejected": -75846480.0, + "logps/chosen": -366.5683186848958, + "logps/rejected": -579.5812174479166, + "loss": 0.0363, + "rewards/chosen": 7.31094233194987, + "rewards/margins": 19.48259989420573, + "rewards/rejected": -12.17165756225586, + "step": 1360 + }, + { + "epoch": 0.3405479794820468, + "grad_norm": 27.125, + "kl": 0.7354120016098022, + "learning_rate": 5e-06, + "logits/chosen": -24336891.42857143, + "logits/rejected": -88193536.0, + "logps/chosen": -346.33517020089283, + "logps/rejected": -600.51484375, + "loss": 0.0647, + "rewards/chosen": 4.871701921735491, + "rewards/margins": 15.511420549665178, + "rewards/rejected": -10.639718627929687, + "step": 1361 + }, + { + "epoch": 0.3407981984236207, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72564198.4, + "logits/rejected": -81451520.0, + "logps/chosen": -441.51897786458335, + "logps/rejected": -681.1800130208334, + "loss": 0.0256, + "rewards/chosen": 6.898324584960937, + "rewards/margins": 18.322556220160592, + "rewards/rejected": -11.424231635199654, + "step": 1362 + }, + { + "epoch": 0.34104841736519453, + "grad_norm": 17.125, + "kl": 3.1553988456726074, + "learning_rate": 5e-06, + "logits/chosen": -64922200.615384616, + "logits/rejected": -64769378.90909091, + "logps/chosen": -389.02640474759613, + "logps/rejected": -428.58065518465907, + "loss": 0.079, + "rewards/chosen": 5.469981266902043, + "rewards/margins": 13.756486532571433, + "rewards/rejected": -8.286505265669389, + "step": 1363 + }, + { + "epoch": 0.34129863630676843, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71494608.0, + "logits/rejected": -45395173.333333336, + "logps/chosen": -488.9459635416667, + "logps/rejected": -692.489013671875, + "loss": 0.0271, + "rewards/chosen": 7.334481557210286, + "rewards/margins": 19.616621653238933, + "rewards/rejected": -12.282140096028646, + "step": 1364 + }, + { + "epoch": 0.3415488552483423, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46601742.54545455, + "logits/rejected": -68418092.3076923, + "logps/chosen": -440.64936967329544, + "logps/rejected": -582.9760366586538, + "loss": 0.0271, + "rewards/chosen": 6.367367137562145, + "rewards/margins": 18.072649495584983, + "rewards/rejected": -11.705282358022837, + "step": 1365 + }, + { + "epoch": 0.3417990741899162, + "grad_norm": 18.5, + "kl": 18.15050506591797, + "learning_rate": 5e-06, + "logits/chosen": -50474990.93333333, + "logits/rejected": -61366727.11111111, + "logps/chosen": -523.4237630208333, + "logps/rejected": -563.9654405381945, + "loss": 0.1084, + "rewards/chosen": 7.516913859049479, + "rewards/margins": 21.16167229546441, + "rewards/rejected": -13.64475843641493, + "step": 1366 + }, + { + "epoch": 0.3420492931314901, + "grad_norm": 7.03125, + "kl": 3.886951446533203, + "learning_rate": 5e-06, + "logits/chosen": -55835328.0, + "logits/rejected": -13413665.454545455, + "logps/chosen": -467.9079777644231, + "logps/rejected": -525.4630681818181, + "loss": 0.0581, + "rewards/chosen": 7.6134209266075725, + "rewards/margins": 16.18602501929223, + "rewards/rejected": -8.572604092684658, + "step": 1367 + }, + { + "epoch": 0.3422995120730639, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12680450.461538462, + "logits/rejected": -24374149.818181816, + "logps/chosen": -323.2471454326923, + "logps/rejected": -440.1715198863636, + "loss": 0.0581, + "rewards/chosen": 5.362631577711839, + "rewards/margins": 15.90045950296042, + "rewards/rejected": -10.53782792524858, + "step": 1368 + }, + { + "epoch": 0.3425497310146378, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21023002.666666668, + "logits/rejected": -40075374.222222224, + "logps/chosen": -363.3076171875, + "logps/rejected": -516.2307942708334, + "loss": 0.0028, + "rewards/chosen": 6.823829650878906, + "rewards/margins": 18.073543124728733, + "rewards/rejected": -11.249713473849827, + "step": 1369 + }, + { + "epoch": 0.34279994995621166, + "grad_norm": 10.875, + "kl": 0.19117769598960876, + "learning_rate": 5e-06, + "logits/chosen": -38705467.07692308, + "logits/rejected": -37777914.18181818, + "logps/chosen": -411.40538611778845, + "logps/rejected": -368.20805220170456, + "loss": 0.0516, + "rewards/chosen": 6.531492379995493, + "rewards/margins": 13.993226111352026, + "rewards/rejected": -7.461733731356534, + "step": 1370 + }, + { + "epoch": 0.34305016889778556, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55614997.333333336, + "logits/rejected": -15105030.666666666, + "logps/chosen": -421.9579264322917, + "logps/rejected": -529.8504231770834, + "loss": 0.0707, + "rewards/chosen": 7.2065887451171875, + "rewards/margins": 21.210697174072266, + "rewards/rejected": -14.004108428955078, + "step": 1371 + }, + { + "epoch": 0.34330038783935946, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30513008.0, + "logits/rejected": -28525005.333333332, + "logps/chosen": -357.9259847005208, + "logps/rejected": -402.765380859375, + "loss": 0.0687, + "rewards/chosen": 5.72294553120931, + "rewards/margins": 14.270186742146809, + "rewards/rejected": -8.5472412109375, + "step": 1372 + }, + { + "epoch": 0.3435506067809333, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59987328.0, + "logits/rejected": -25641462.0, + "logps/chosen": -263.6124267578125, + "logps/rejected": -471.99639892578125, + "loss": 0.0283, + "rewards/chosen": 5.485619068145752, + "rewards/margins": 15.866224765777588, + "rewards/rejected": -10.380605697631836, + "step": 1373 + }, + { + "epoch": 0.3438008257225072, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10521260.8, + "logits/rejected": -46716654.222222224, + "logps/chosen": -480.3423177083333, + "logps/rejected": -782.1529405381945, + "loss": 0.0099, + "rewards/chosen": 8.070053100585938, + "rewards/margins": 22.473271687825523, + "rewards/rejected": -14.403218587239584, + "step": 1374 + }, + { + "epoch": 0.3440510446640811, + "grad_norm": 12.25, + "kl": 8.94655704498291, + "learning_rate": 5e-06, + "logits/chosen": -63110028.8, + "logits/rejected": -40571721.14285714, + "logps/chosen": -489.54150390625, + "logps/rejected": -442.15321568080356, + "loss": 0.0692, + "rewards/chosen": 8.175430297851562, + "rewards/margins": 16.03187506539481, + "rewards/rejected": -7.856444767543247, + "step": 1375 + }, + { + "epoch": 0.34430126360565494, + "grad_norm": 7.59375, + "kl": 3.9555816650390625, + "learning_rate": 5e-06, + "logits/chosen": -44090709.333333336, + "logits/rejected": -53187781.333333336, + "logps/chosen": -546.0808919270834, + "logps/rejected": -703.1783040364584, + "loss": 0.0249, + "rewards/chosen": 7.2213389078776045, + "rewards/margins": 20.84004847208659, + "rewards/rejected": -13.618709564208984, + "step": 1376 + }, + { + "epoch": 0.34455148254722884, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40956295.11111111, + "logits/rejected": -25625280.0, + "logps/chosen": -354.81523980034723, + "logps/rejected": -493.5981119791667, + "loss": 0.0462, + "rewards/chosen": 6.573668162027995, + "rewards/margins": 14.77570826212565, + "rewards/rejected": -8.202040100097657, + "step": 1377 + }, + { + "epoch": 0.3448017014888027, + "grad_norm": 9.0625, + "kl": 3.7113418579101562, + "learning_rate": 5e-06, + "logits/chosen": -50864117.333333336, + "logits/rejected": -46956000.0, + "logps/chosen": -318.4588623046875, + "logps/rejected": -713.8990071614584, + "loss": 0.0822, + "rewards/chosen": 7.12844721476237, + "rewards/margins": 20.45091374715169, + "rewards/rejected": -13.322466532389322, + "step": 1378 + }, + { + "epoch": 0.3450519204303766, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87894656.0, + "logits/rejected": -55881356.8, + "logps/chosen": -324.99379185267856, + "logps/rejected": -587.48818359375, + "loss": 0.1286, + "rewards/chosen": 5.334012712751116, + "rewards/margins": 14.57075892857143, + "rewards/rejected": -9.236746215820313, + "step": 1379 + }, + { + "epoch": 0.3453021393719505, + "grad_norm": 6.40625, + "kl": 2.066612958908081, + "learning_rate": 5e-06, + "logits/chosen": -46889063.384615384, + "logits/rejected": -73043066.18181819, + "logps/chosen": -361.35787259615387, + "logps/rejected": -503.84645774147725, + "loss": 0.0291, + "rewards/chosen": 7.517757709209736, + "rewards/margins": 19.83745430899667, + "rewards/rejected": -12.319696599786932, + "step": 1380 + }, + { + "epoch": 0.3455523583135243, + "grad_norm": 7.34375, + "kl": 1.531408667564392, + "learning_rate": 5e-06, + "logits/chosen": -43154321.06666667, + "logits/rejected": -27408366.222222224, + "logps/chosen": -372.10631510416664, + "logps/rejected": -567.4663628472222, + "loss": 0.0163, + "rewards/chosen": 6.417936197916666, + "rewards/margins": 19.293846638997394, + "rewards/rejected": -12.875910441080729, + "step": 1381 + }, + { + "epoch": 0.3458025772550982, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 19201792.0, + "logits/rejected": -47947975.52941176, + "logps/chosen": -455.1500767299107, + "logps/rejected": -605.4234834558823, + "loss": 0.0202, + "rewards/chosen": 8.886889866420201, + "rewards/margins": 19.985506522555312, + "rewards/rejected": -11.09861665613511, + "step": 1382 + }, + { + "epoch": 0.34605279619667206, + "grad_norm": 11.875, + "kl": 8.238302230834961, + "learning_rate": 5e-06, + "logits/chosen": -57637897.84615385, + "logits/rejected": -43387450.18181818, + "logps/chosen": -364.1184269831731, + "logps/rejected": -546.6664151278409, + "loss": 0.094, + "rewards/chosen": 6.448720491849459, + "rewards/margins": 17.34112868942581, + "rewards/rejected": -10.89240819757635, + "step": 1383 + }, + { + "epoch": 0.34630301513824596, + "grad_norm": 2.28125, + "kl": 2.4979376792907715, + "learning_rate": 5e-06, + "logits/chosen": -45941051.07692308, + "logits/rejected": -45544192.0, + "logps/chosen": -438.09510216346155, + "logps/rejected": -649.1272194602273, + "loss": 0.026, + "rewards/chosen": 7.812551058255709, + "rewards/margins": 19.802229181036246, + "rewards/rejected": -11.98967812278054, + "step": 1384 + }, + { + "epoch": 0.34655323407981986, + "grad_norm": 2.375, + "kl": 4.510049343109131, + "learning_rate": 5e-06, + "logits/chosen": -40340452.571428575, + "logits/rejected": -43033008.0, + "logps/chosen": -414.4686802455357, + "logps/rejected": -605.7154296875, + "loss": 0.0272, + "rewards/chosen": 8.496366228376116, + "rewards/margins": 22.162793840680806, + "rewards/rejected": -13.666427612304688, + "step": 1385 + }, + { + "epoch": 0.3468034530213937, + "grad_norm": 0.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38219467.63636363, + "logits/rejected": -43419072.0, + "logps/chosen": -480.02787642045456, + "logps/rejected": -548.1022761418269, + "loss": 0.0032, + "rewards/chosen": 7.689335909756747, + "rewards/margins": 20.41747123878319, + "rewards/rejected": -12.728135329026442, + "step": 1386 + }, + { + "epoch": 0.3470536719629676, + "grad_norm": 8.0, + "kl": 5.687331199645996, + "learning_rate": 5e-06, + "logits/chosen": -12470166.153846154, + "logits/rejected": -52592384.0, + "logps/chosen": -512.0307241586538, + "logps/rejected": -661.3045099431819, + "loss": 0.0638, + "rewards/chosen": 8.254149216871996, + "rewards/margins": 20.152106758597846, + "rewards/rejected": -11.897957541725852, + "step": 1387 + }, + { + "epoch": 0.3473038909045415, + "grad_norm": 4.71875, + "kl": 4.2610015869140625, + "learning_rate": 5e-06, + "logits/chosen": -36584472.615384616, + "logits/rejected": -68275642.18181819, + "logps/chosen": -317.00931490384613, + "logps/rejected": -678.169921875, + "loss": 0.0463, + "rewards/chosen": 7.358241741473858, + "rewards/margins": 19.540428908554823, + "rewards/rejected": -12.182187167080967, + "step": 1388 + }, + { + "epoch": 0.34755410984611534, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35182205.333333336, + "logits/rejected": -48418435.55555555, + "logps/chosen": -436.0703938802083, + "logps/rejected": -698.2550998263889, + "loss": 0.0854, + "rewards/chosen": 8.249051411946615, + "rewards/margins": 19.68235524495443, + "rewards/rejected": -11.433303833007812, + "step": 1389 + }, + { + "epoch": 0.34780432878768924, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30450473.6, + "logits/rejected": -30351821.714285713, + "logps/chosen": -394.827099609375, + "logps/rejected": -550.4891183035714, + "loss": 0.033, + "rewards/chosen": 8.100794219970703, + "rewards/margins": 19.846066175188337, + "rewards/rejected": -11.745271955217634, + "step": 1390 + }, + { + "epoch": 0.3480545477292631, + "grad_norm": 6.34375, + "kl": 3.1048903465270996, + "learning_rate": 5e-06, + "logits/chosen": -29222763.42857143, + "logits/rejected": -61965824.0, + "logps/chosen": -297.25547572544644, + "logps/rejected": -538.98232421875, + "loss": 0.0576, + "rewards/chosen": 6.200111389160156, + "rewards/margins": 16.02859573364258, + "rewards/rejected": -9.828484344482423, + "step": 1391 + }, + { + "epoch": 0.348304766670837, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76160296.0, + "logits/rejected": -70088784.0, + "logps/chosen": -336.0784606933594, + "logps/rejected": -760.227294921875, + "loss": 0.04, + "rewards/chosen": 6.1641387939453125, + "rewards/margins": 19.47642707824707, + "rewards/rejected": -13.312288284301758, + "step": 1392 + }, + { + "epoch": 0.3485549856124109, + "grad_norm": 10.5, + "kl": 1.8843917846679688, + "learning_rate": 5e-06, + "logits/chosen": -59755688.72727273, + "logits/rejected": -46480329.84615385, + "logps/chosen": -398.46462180397725, + "logps/rejected": -546.4738581730769, + "loss": 0.0191, + "rewards/chosen": 7.625688726251775, + "rewards/margins": 15.847116056855741, + "rewards/rejected": -8.221427330603966, + "step": 1393 + }, + { + "epoch": 0.3488052045539847, + "grad_norm": 9.125, + "kl": 1.4063594341278076, + "learning_rate": 5e-06, + "logits/chosen": -43650157.71428572, + "logits/rejected": -58462035.2, + "logps/chosen": -339.45235770089283, + "logps/rejected": -705.771240234375, + "loss": 0.027, + "rewards/chosen": 7.272457667759487, + "rewards/margins": 22.102779933384486, + "rewards/rejected": -14.830322265625, + "step": 1394 + }, + { + "epoch": 0.3490554234955586, + "grad_norm": 19.0, + "kl": 15.25861644744873, + "learning_rate": 5e-06, + "logits/chosen": -27900723.2, + "logits/rejected": -37788544.0, + "logps/chosen": -435.4731770833333, + "logps/rejected": -534.9817708333334, + "loss": 0.0844, + "rewards/chosen": 8.663606770833333, + "rewards/margins": 19.592518615722653, + "rewards/rejected": -10.928911844889322, + "step": 1395 + }, + { + "epoch": 0.34930564243713247, + "grad_norm": 4.3125, + "kl": 6.739832878112793, + "learning_rate": 5e-06, + "logits/chosen": -48544906.666666664, + "logits/rejected": -54690320.0, + "logps/chosen": -399.119140625, + "logps/rejected": -762.9268391927084, + "loss": 0.0133, + "rewards/chosen": 9.885173797607422, + "rewards/margins": 25.321478525797524, + "rewards/rejected": -15.436304728190104, + "step": 1396 + }, + { + "epoch": 0.34955586137870637, + "grad_norm": 11.5, + "kl": 2.9718384742736816, + "learning_rate": 5e-06, + "logits/chosen": -52491236.571428575, + "logits/rejected": -53341286.4, + "logps/chosen": -297.95186941964283, + "logps/rejected": -592.1646484375, + "loss": 0.0566, + "rewards/chosen": 4.977242061070034, + "rewards/margins": 18.4941289629255, + "rewards/rejected": -13.516886901855468, + "step": 1397 + }, + { + "epoch": 0.34980608032028027, + "grad_norm": 5.90625, + "kl": 0.271176815032959, + "learning_rate": 5e-06, + "logits/chosen": -34071288.0, + "logits/rejected": -64014901.333333336, + "logps/chosen": -353.0765787760417, + "logps/rejected": -691.5896809895834, + "loss": 0.0642, + "rewards/chosen": 5.96182378133138, + "rewards/margins": 19.90054702758789, + "rewards/rejected": -13.93872324625651, + "step": 1398 + }, + { + "epoch": 0.3500562992618541, + "grad_norm": 1.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41278669.333333336, + "logits/rejected": -2138555.5555555555, + "logps/chosen": -366.9925130208333, + "logps/rejected": -676.63037109375, + "loss": 0.0137, + "rewards/chosen": 6.664829254150391, + "rewards/margins": 16.928256140814888, + "rewards/rejected": -10.263426886664497, + "step": 1399 + }, + { + "epoch": 0.350306518203428, + "grad_norm": 1.4140625, + "kl": 12.511804580688477, + "learning_rate": 5e-06, + "logits/chosen": -36576179.2, + "logits/rejected": -54901404.44444445, + "logps/chosen": -551.7591796875, + "logps/rejected": -816.2090928819445, + "loss": 0.0023, + "rewards/chosen": 10.2717041015625, + "rewards/margins": 27.145997111002607, + "rewards/rejected": -16.874293009440105, + "step": 1400 + }, + { + "epoch": 0.35055673714500185, + "grad_norm": 0.74609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64965109.333333336, + "logits/rejected": -46812032.0, + "logps/chosen": -357.7642415364583, + "logps/rejected": -564.5442708333334, + "loss": 0.0051, + "rewards/chosen": 8.211051940917969, + "rewards/margins": 19.42884063720703, + "rewards/rejected": -11.217788696289062, + "step": 1401 + }, + { + "epoch": 0.35080695608657575, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53030712.0, + "logits/rejected": -48485420.0, + "logps/chosen": -282.1611328125, + "logps/rejected": -662.4784545898438, + "loss": 0.0379, + "rewards/chosen": 5.630801200866699, + "rewards/margins": 17.286304473876953, + "rewards/rejected": -11.655503273010254, + "step": 1402 + }, + { + "epoch": 0.35105717502814965, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -78118144.0, + "logits/rejected": -61208064.0, + "logps/chosen": -650.1380504261364, + "logps/rejected": -775.9940655048077, + "loss": 0.0062, + "rewards/chosen": 10.618053783069957, + "rewards/margins": 25.78856722958438, + "rewards/rejected": -15.170513446514423, + "step": 1403 + }, + { + "epoch": 0.3513073939697235, + "grad_norm": 12.6875, + "kl": 2.596083402633667, + "learning_rate": 5e-06, + "logits/chosen": -68005970.28571428, + "logits/rejected": 59958547.2, + "logps/chosen": -431.27633231026783, + "logps/rejected": -686.0212890625, + "loss": 0.046, + "rewards/chosen": 6.783527919224331, + "rewards/margins": 16.459911128452845, + "rewards/rejected": -9.676383209228515, + "step": 1404 + }, + { + "epoch": 0.3515576129112974, + "grad_norm": 14.0625, + "kl": 6.426671028137207, + "learning_rate": 5e-06, + "logits/chosen": -57330112.0, + "logits/rejected": -31658176.0, + "logps/chosen": -442.88025841346155, + "logps/rejected": -523.8718039772727, + "loss": 0.0662, + "rewards/chosen": 7.974584726186899, + "rewards/margins": 19.058880519200038, + "rewards/rejected": -11.084295793013139, + "step": 1405 + }, + { + "epoch": 0.3518078318528713, + "grad_norm": 1.6875, + "kl": 7.128184795379639, + "learning_rate": 5e-06, + "logits/chosen": -39027270.4, + "logits/rejected": -55688004.571428575, + "logps/chosen": -426.928125, + "logps/rejected": -450.0786830357143, + "loss": 0.017, + "rewards/chosen": 8.389739227294921, + "rewards/margins": 18.13197468348912, + "rewards/rejected": -9.742235456194196, + "step": 1406 + }, + { + "epoch": 0.35205805079444513, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58844288.0, + "logits/rejected": -45988250.666666664, + "logps/chosen": -382.8331705729167, + "logps/rejected": -456.5674235026042, + "loss": 0.0291, + "rewards/chosen": 6.269336700439453, + "rewards/margins": 15.390447616577148, + "rewards/rejected": -9.121110916137695, + "step": 1407 + }, + { + "epoch": 0.35230826973601903, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34870579.2, + "logits/rejected": -45762802.28571428, + "logps/chosen": -375.8123291015625, + "logps/rejected": -474.4597865513393, + "loss": 0.0097, + "rewards/chosen": 6.738951873779297, + "rewards/margins": 18.88167495727539, + "rewards/rejected": -12.142723083496094, + "step": 1408 + }, + { + "epoch": 0.3525584886775929, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56879675.07692308, + "logits/rejected": -47576791.27272727, + "logps/chosen": -470.84908353365387, + "logps/rejected": -502.419921875, + "loss": 0.0586, + "rewards/chosen": 8.04593012883113, + "rewards/margins": 18.539741996284967, + "rewards/rejected": -10.493811867453836, + "step": 1409 + }, + { + "epoch": 0.3528087076191668, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36270729.84615385, + "logits/rejected": -74471185.45454545, + "logps/chosen": -474.7137920673077, + "logps/rejected": -558.6563387784091, + "loss": 0.0368, + "rewards/chosen": 6.993660560021033, + "rewards/margins": 19.025967577954273, + "rewards/rejected": -12.032307017933238, + "step": 1410 + }, + { + "epoch": 0.35305892656074067, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25235098.181818184, + "logits/rejected": 17158104.615384616, + "logps/chosen": -281.726806640625, + "logps/rejected": -545.2224308894231, + "loss": 0.0833, + "rewards/chosen": 4.301422812721946, + "rewards/margins": 15.044850969648028, + "rewards/rejected": -10.743428156926083, + "step": 1411 + }, + { + "epoch": 0.3533091455023145, + "grad_norm": 14.9375, + "kl": 5.812307357788086, + "learning_rate": 5e-06, + "logits/chosen": -65225700.571428575, + "logits/rejected": -52220195.2, + "logps/chosen": -439.56515066964283, + "logps/rejected": -726.58310546875, + "loss": 0.0469, + "rewards/chosen": 8.212796892438616, + "rewards/margins": 22.355350603376117, + "rewards/rejected": -14.1425537109375, + "step": 1412 + }, + { + "epoch": 0.3535593644438884, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62327764.0, + "logits/rejected": -24458312.0, + "logps/chosen": -460.3680725097656, + "logps/rejected": -596.10400390625, + "loss": 0.0556, + "rewards/chosen": 7.300900459289551, + "rewards/margins": 19.922649383544922, + "rewards/rejected": -12.621748924255371, + "step": 1413 + }, + { + "epoch": 0.35380958338546226, + "grad_norm": 10.6875, + "kl": 11.8612060546875, + "learning_rate": 5e-06, + "logits/chosen": -57664365.71428572, + "logits/rejected": -29341267.2, + "logps/chosen": -398.34326171875, + "logps/rejected": -522.267919921875, + "loss": 0.0721, + "rewards/chosen": 5.677768162318638, + "rewards/margins": 15.454412296840122, + "rewards/rejected": -9.776644134521485, + "step": 1414 + }, + { + "epoch": 0.35405980232703615, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60141370.18181818, + "logits/rejected": -57800438.15384615, + "logps/chosen": -470.8170276988636, + "logps/rejected": -560.9586463341346, + "loss": 0.026, + "rewards/chosen": 8.210657986727627, + "rewards/margins": 21.072558903193972, + "rewards/rejected": -12.861900916466347, + "step": 1415 + }, + { + "epoch": 0.35431002126861005, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32914932.0, + "logits/rejected": -38338192.0, + "logps/chosen": -173.03822326660156, + "logps/rejected": -584.142333984375, + "loss": 0.0764, + "rewards/chosen": 3.7667694091796875, + "rewards/margins": 15.68187141418457, + "rewards/rejected": -11.915102005004883, + "step": 1416 + }, + { + "epoch": 0.3545602402101839, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77776715.63636364, + "logits/rejected": -36113853.538461536, + "logps/chosen": -396.49729225852275, + "logps/rejected": -629.8743990384615, + "loss": 0.0346, + "rewards/chosen": 7.152030944824219, + "rewards/margins": 21.61517862173227, + "rewards/rejected": -14.463147676908052, + "step": 1417 + }, + { + "epoch": 0.3548104591517578, + "grad_norm": 16.5, + "kl": 7.717679500579834, + "learning_rate": 5e-06, + "logits/chosen": 17478704.0, + "logits/rejected": -52612176.0, + "logps/chosen": -429.768798828125, + "logps/rejected": -281.666259765625, + "loss": 0.0918, + "rewards/chosen": 5.887630462646484, + "rewards/margins": 13.455334186553955, + "rewards/rejected": -7.567703723907471, + "step": 1418 + }, + { + "epoch": 0.35506067809333164, + "grad_norm": 17.625, + "kl": 11.13028335571289, + "learning_rate": 5e-06, + "logits/chosen": -28847306.666666668, + "logits/rejected": -61549909.333333336, + "logps/chosen": -463.72486979166666, + "logps/rejected": -495.6037326388889, + "loss": 0.13, + "rewards/chosen": 7.3768056233723955, + "rewards/margins": 19.00676981608073, + "rewards/rejected": -11.629964192708334, + "step": 1419 + }, + { + "epoch": 0.35531089703490554, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74581996.3076923, + "logits/rejected": -60997742.54545455, + "logps/chosen": -365.0421799879808, + "logps/rejected": -507.80229048295456, + "loss": 0.0689, + "rewards/chosen": 6.126974252554087, + "rewards/margins": 14.834668566296985, + "rewards/rejected": -8.707694313742898, + "step": 1420 + }, + { + "epoch": 0.35556111597647944, + "grad_norm": 1.1953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70038394.18181819, + "logits/rejected": -59319748.92307692, + "logps/chosen": -489.3062855113636, + "logps/rejected": -592.6949368990385, + "loss": 0.0094, + "rewards/chosen": 8.56694308194247, + "rewards/margins": 24.725538240446078, + "rewards/rejected": -16.158595158503605, + "step": 1421 + }, + { + "epoch": 0.3558113349180533, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55887858.28571428, + "logits/rejected": 24521422.4, + "logps/chosen": -500.5084751674107, + "logps/rejected": -544.201953125, + "loss": 0.034, + "rewards/chosen": 8.7468626839774, + "rewards/margins": 20.526266370500835, + "rewards/rejected": -11.779403686523438, + "step": 1422 + }, + { + "epoch": 0.3560615538596272, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76280480.0, + "logits/rejected": -67293340.44444445, + "logps/chosen": -345.414306640625, + "logps/rejected": -477.0716145833333, + "loss": 0.0553, + "rewards/chosen": 5.34048589070638, + "rewards/margins": 14.012145572238499, + "rewards/rejected": -8.671659681532118, + "step": 1423 + }, + { + "epoch": 0.3563117728012011, + "grad_norm": 2.65625, + "kl": 0.27488836646080017, + "learning_rate": 5e-06, + "logits/chosen": -34870131.692307696, + "logits/rejected": -51258350.54545455, + "logps/chosen": -399.97787710336536, + "logps/rejected": -603.3857865767045, + "loss": 0.0065, + "rewards/chosen": 7.985346280611479, + "rewards/margins": 20.77538118162355, + "rewards/rejected": -12.790034901012074, + "step": 1424 + }, + { + "epoch": 0.3565619917427749, + "grad_norm": 15.375, + "kl": 3.0965240001678467, + "learning_rate": 5e-06, + "logits/chosen": -73984182.85714285, + "logits/rejected": -50863641.6, + "logps/chosen": -428.75948660714283, + "logps/rejected": -346.171728515625, + "loss": 0.0602, + "rewards/chosen": 6.2626816885811945, + "rewards/margins": 15.120948900495257, + "rewards/rejected": -8.858267211914063, + "step": 1425 + }, + { + "epoch": 0.3568122106843488, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48847749.333333336, + "logits/rejected": -41748872.0, + "logps/chosen": -377.884521484375, + "logps/rejected": -508.500244140625, + "loss": 0.0284, + "rewards/chosen": 6.642574310302734, + "rewards/margins": 17.614665985107422, + "rewards/rejected": -10.972091674804688, + "step": 1426 + }, + { + "epoch": 0.35706242962592266, + "grad_norm": 12.6875, + "kl": 1.6349167823791504, + "learning_rate": 5e-06, + "logits/chosen": -33113094.85714286, + "logits/rejected": -52767555.2, + "logps/chosen": -291.21182686941967, + "logps/rejected": -551.41259765625, + "loss": 0.072, + "rewards/chosen": 5.402621677943638, + "rewards/margins": 17.80347693307059, + "rewards/rejected": -12.400855255126952, + "step": 1427 + }, + { + "epoch": 0.35731264856749656, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60668824.0, + "logits/rejected": -72480448.0, + "logps/chosen": -392.55889892578125, + "logps/rejected": -691.7050170898438, + "loss": 0.0189, + "rewards/chosen": 6.731208324432373, + "rewards/margins": 21.27409315109253, + "rewards/rejected": -14.542884826660156, + "step": 1428 + }, + { + "epoch": 0.35756286750907046, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56924032.0, + "logits/rejected": -57238551.27272727, + "logps/chosen": -338.5987079326923, + "logps/rejected": -551.2757013494319, + "loss": 0.0453, + "rewards/chosen": 6.334498478816106, + "rewards/margins": 19.08776385967548, + "rewards/rejected": -12.753265380859375, + "step": 1429 + }, + { + "epoch": 0.3578130864506443, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38227300.571428575, + "logits/rejected": -13758476.8, + "logps/chosen": -395.7186802455357, + "logps/rejected": -662.495654296875, + "loss": 0.0354, + "rewards/chosen": 6.083441598074777, + "rewards/margins": 20.663643319266182, + "rewards/rejected": -14.580201721191406, + "step": 1430 + }, + { + "epoch": 0.3580633053922182, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50863402.666666664, + "logits/rejected": -90437248.0, + "logps/chosen": -437.3853352864583, + "logps/rejected": -570.7180582682291, + "loss": 0.0116, + "rewards/chosen": 7.654365539550781, + "rewards/margins": 18.859172821044922, + "rewards/rejected": -11.20480728149414, + "step": 1431 + }, + { + "epoch": 0.35831352433379204, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44515515.428571425, + "logits/rejected": -40173155.2, + "logps/chosen": -348.9318150111607, + "logps/rejected": -728.392578125, + "loss": 0.0564, + "rewards/chosen": 6.237430027553013, + "rewards/margins": 19.374383762904575, + "rewards/rejected": -13.136953735351563, + "step": 1432 + }, + { + "epoch": 0.35856374327536594, + "grad_norm": 14.3125, + "kl": 0.3510233759880066, + "learning_rate": 5e-06, + "logits/chosen": -78531214.22222222, + "logits/rejected": -35573386.666666664, + "logps/chosen": -477.5673828125, + "logps/rejected": -664.309765625, + "loss": 0.0403, + "rewards/chosen": 8.29533216688368, + "rewards/margins": 19.821913994683158, + "rewards/rejected": -11.526581827799479, + "step": 1433 + }, + { + "epoch": 0.35881396221693984, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23483219.692307692, + "logits/rejected": -37687714.90909091, + "logps/chosen": -397.96206430288464, + "logps/rejected": -571.9079367897727, + "loss": 0.0246, + "rewards/chosen": 5.6260516826923075, + "rewards/margins": 22.129813667777533, + "rewards/rejected": -16.503761985085227, + "step": 1434 + }, + { + "epoch": 0.3590641811585137, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42031988.0, + "logits/rejected": -47345768.0, + "logps/chosen": -401.73577880859375, + "logps/rejected": -539.3719482421875, + "loss": 0.0057, + "rewards/chosen": 6.956500053405762, + "rewards/margins": 18.414944648742676, + "rewards/rejected": -11.458444595336914, + "step": 1435 + }, + { + "epoch": 0.3593144001000876, + "grad_norm": 11.4375, + "kl": 1.1771705150604248, + "learning_rate": 5e-06, + "logits/chosen": -75799488.0, + "logits/rejected": -34032836.571428575, + "logps/chosen": -351.0594970703125, + "logps/rejected": -509.5528041294643, + "loss": 0.0459, + "rewards/chosen": 6.812472534179688, + "rewards/margins": 17.59641353062221, + "rewards/rejected": -10.783940996442523, + "step": 1436 + }, + { + "epoch": 0.3595646190416614, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30782788.57142857, + "logits/rejected": -48426796.8, + "logps/chosen": -337.0228794642857, + "logps/rejected": -588.4873046875, + "loss": 0.0353, + "rewards/chosen": 5.464809962681362, + "rewards/margins": 17.96720711844308, + "rewards/rejected": -12.502397155761718, + "step": 1437 + }, + { + "epoch": 0.3598148379832353, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25957444.8, + "logits/rejected": -49068164.571428575, + "logps/chosen": -323.091943359375, + "logps/rejected": -711.6405552455357, + "loss": 0.0451, + "rewards/chosen": 5.587612915039062, + "rewards/margins": 16.85543954031808, + "rewards/rejected": -11.267826625279017, + "step": 1438 + }, + { + "epoch": 0.3600650569248092, + "grad_norm": 7.5, + "kl": 11.932319641113281, + "learning_rate": 5e-06, + "logits/chosen": -67978737.23076923, + "logits/rejected": -40119258.18181818, + "logps/chosen": -507.05284705528845, + "logps/rejected": -577.4150390625, + "loss": 0.0532, + "rewards/chosen": 8.382584205040565, + "rewards/margins": 18.729393592247597, + "rewards/rejected": -10.346809387207031, + "step": 1439 + }, + { + "epoch": 0.36031527586638307, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57670336.0, + "logits/rejected": 85713389.71428572, + "logps/chosen": -432.618798828125, + "logps/rejected": -548.0721958705357, + "loss": 0.0049, + "rewards/chosen": 7.446672058105468, + "rewards/margins": 18.45464368547712, + "rewards/rejected": -11.007971627371651, + "step": 1440 + }, + { + "epoch": 0.36056549480795697, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55968552.72727273, + "logits/rejected": -41064172.307692304, + "logps/chosen": -332.743896484375, + "logps/rejected": -533.4876802884615, + "loss": 0.026, + "rewards/chosen": 5.945619756525213, + "rewards/margins": 16.623060693274013, + "rewards/rejected": -10.677440936748798, + "step": 1441 + }, + { + "epoch": 0.36081571374953086, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49842715.428571425, + "logits/rejected": -59495563.294117644, + "logps/chosen": -563.5268903459821, + "logps/rejected": -543.5722081801471, + "loss": 0.0546, + "rewards/chosen": 11.332850864955358, + "rewards/margins": 21.585354203937435, + "rewards/rejected": -10.252503338982077, + "step": 1442 + }, + { + "epoch": 0.3610659326911047, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40110734.222222224, + "logits/rejected": -48277162.666666664, + "logps/chosen": -431.4103732638889, + "logps/rejected": -660.915625, + "loss": 0.0195, + "rewards/chosen": 9.167383829752604, + "rewards/margins": 20.99742736816406, + "rewards/rejected": -11.830043538411458, + "step": 1443 + }, + { + "epoch": 0.3613161516326786, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33238566.0, + "logits/rejected": -67309864.0, + "logps/chosen": -349.23663330078125, + "logps/rejected": -633.099853515625, + "loss": 0.0522, + "rewards/chosen": 4.590353965759277, + "rewards/margins": 20.492164611816406, + "rewards/rejected": -15.901810646057129, + "step": 1444 + }, + { + "epoch": 0.36156637057425245, + "grad_norm": 2.546875, + "kl": 1.2477658987045288, + "learning_rate": 5e-06, + "logits/chosen": -38277891.2, + "logits/rejected": -57382464.0, + "logps/chosen": -445.97236328125, + "logps/rejected": -766.3161969866071, + "loss": 0.0105, + "rewards/chosen": 9.916059875488282, + "rewards/margins": 26.178626578194752, + "rewards/rejected": -16.262566702706472, + "step": 1445 + }, + { + "epoch": 0.36181658951582635, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76586595.55555555, + "logits/rejected": -60284834.13333333, + "logps/chosen": -410.7468532986111, + "logps/rejected": -590.7415364583334, + "loss": 0.0632, + "rewards/chosen": 6.82777574327257, + "rewards/margins": 18.97667202419705, + "rewards/rejected": -12.148896280924479, + "step": 1446 + }, + { + "epoch": 0.36206680845740025, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23338980.57142857, + "logits/rejected": -47900617.6, + "logps/chosen": -459.7919224330357, + "logps/rejected": -697.594482421875, + "loss": 0.048, + "rewards/chosen": 5.634098052978516, + "rewards/margins": 18.001346588134766, + "rewards/rejected": -12.36724853515625, + "step": 1447 + }, + { + "epoch": 0.3623170273989741, + "grad_norm": 27.875, + "kl": 0.20477867126464844, + "learning_rate": 5e-06, + "logits/chosen": -45810074.666666664, + "logits/rejected": -61941322.666666664, + "logps/chosen": -356.6944580078125, + "logps/rejected": -673.794677734375, + "loss": 0.0901, + "rewards/chosen": 5.975573221842448, + "rewards/margins": 14.205772399902344, + "rewards/rejected": -8.230199178059896, + "step": 1448 + }, + { + "epoch": 0.362567246340548, + "grad_norm": 15.375, + "kl": 5.429044246673584, + "learning_rate": 5e-06, + "logits/chosen": -27342520.615384616, + "logits/rejected": -45290042.18181818, + "logps/chosen": -355.38461538461536, + "logps/rejected": -537.3761985085227, + "loss": 0.1103, + "rewards/chosen": 4.918288891132061, + "rewards/margins": 17.5353820640724, + "rewards/rejected": -12.617093172940342, + "step": 1449 + }, + { + "epoch": 0.36281746528212183, + "grad_norm": 6.1875, + "kl": 3.1310575008392334, + "learning_rate": 5e-06, + "logits/chosen": -50957627.07692308, + "logits/rejected": -47080209.45454545, + "logps/chosen": -381.6692457932692, + "logps/rejected": -636.3359375, + "loss": 0.0151, + "rewards/chosen": 7.457385723407452, + "rewards/margins": 20.24586561509779, + "rewards/rejected": -12.788479891690342, + "step": 1450 + }, + { + "epoch": 0.36306768422369573, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63243426.90909091, + "logits/rejected": -54973538.461538464, + "logps/chosen": -327.92092063210225, + "logps/rejected": -607.486328125, + "loss": 0.0414, + "rewards/chosen": 4.701632412997159, + "rewards/margins": 18.160959523874563, + "rewards/rejected": -13.459327110877403, + "step": 1451 + }, + { + "epoch": 0.36331790316526963, + "grad_norm": 1.3984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40940501.333333336, + "logits/rejected": -89388492.8, + "logps/chosen": -549.4225260416666, + "logps/rejected": -708.7557942708333, + "loss": 0.003, + "rewards/chosen": 10.054502699110243, + "rewards/margins": 24.36369459364149, + "rewards/rejected": -14.30919189453125, + "step": 1452 + }, + { + "epoch": 0.36356812210684347, + "grad_norm": 8.875, + "kl": 4.9395599365234375, + "learning_rate": 5e-06, + "logits/chosen": -59268502.85714286, + "logits/rejected": -50867353.6, + "logps/chosen": -370.87587193080356, + "logps/rejected": -742.21689453125, + "loss": 0.0269, + "rewards/chosen": 6.457456861223493, + "rewards/margins": 22.824256787981305, + "rewards/rejected": -16.366799926757814, + "step": 1453 + }, + { + "epoch": 0.36381834104841737, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40006464.0, + "logits/rejected": -43237564.0, + "logps/chosen": -278.547607421875, + "logps/rejected": -419.1848449707031, + "loss": 0.0591, + "rewards/chosen": 6.350396633148193, + "rewards/margins": 15.127200603485107, + "rewards/rejected": -8.776803970336914, + "step": 1454 + }, + { + "epoch": 0.36406855998999127, + "grad_norm": 13.6875, + "kl": 4.628375053405762, + "learning_rate": 5e-06, + "logits/chosen": -72887250.28571428, + "logits/rejected": 51300352.0, + "logps/chosen": -317.3553989955357, + "logps/rejected": -668.96689453125, + "loss": 0.0472, + "rewards/chosen": 6.140809195382254, + "rewards/margins": 17.672460501534598, + "rewards/rejected": -11.531651306152344, + "step": 1455 + }, + { + "epoch": 0.3643187789315651, + "grad_norm": 5.21875, + "kl": 2.7030959129333496, + "learning_rate": 5e-06, + "logits/chosen": -70124012.8, + "logits/rejected": -41078427.428571425, + "logps/chosen": -454.91982421875, + "logps/rejected": -442.45472935267856, + "loss": 0.0413, + "rewards/chosen": 8.184555053710938, + "rewards/margins": 20.056630815778462, + "rewards/rejected": -11.872075762067523, + "step": 1456 + }, + { + "epoch": 0.364568997873139, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62508608.0, + "logits/rejected": -60690880.0, + "logps/chosen": -412.9384358723958, + "logps/rejected": -696.38525390625, + "loss": 0.0444, + "rewards/chosen": 6.140658696492513, + "rewards/margins": 22.935712814331055, + "rewards/rejected": -16.795054117838543, + "step": 1457 + }, + { + "epoch": 0.36481921681471285, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44248132.92307692, + "logits/rejected": -107704354.9090909, + "logps/chosen": -353.9615009014423, + "logps/rejected": -618.1991743607955, + "loss": 0.031, + "rewards/chosen": 6.919779850886418, + "rewards/margins": 21.778407170222355, + "rewards/rejected": -14.858627319335938, + "step": 1458 + }, + { + "epoch": 0.36506943575628675, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79556812.8, + "logits/rejected": -57513934.222222224, + "logps/chosen": -408.33372395833334, + "logps/rejected": -589.1618923611111, + "loss": 0.0171, + "rewards/chosen": 7.305982462565104, + "rewards/margins": 21.13246290418837, + "rewards/rejected": -13.826480441623264, + "step": 1459 + }, + { + "epoch": 0.36531965469786065, + "grad_norm": 7.9375, + "kl": 0.5344009399414062, + "learning_rate": 5e-06, + "logits/chosen": -21463386.666666668, + "logits/rejected": -42706066.666666664, + "logps/chosen": -252.90580240885416, + "logps/rejected": -473.3464762369792, + "loss": 0.0685, + "rewards/chosen": 5.302000363667806, + "rewards/margins": 16.691630999247234, + "rewards/rejected": -11.389630635579428, + "step": 1460 + }, + { + "epoch": 0.3655698736394345, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63345425.45454545, + "logits/rejected": -51707377.23076923, + "logps/chosen": -364.1495472301136, + "logps/rejected": -549.7163461538462, + "loss": 0.0263, + "rewards/chosen": 7.571725325150923, + "rewards/margins": 20.361564849640107, + "rewards/rejected": -12.789839524489183, + "step": 1461 + }, + { + "epoch": 0.3658200925810084, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44151470.222222224, + "logits/rejected": -47670485.333333336, + "logps/chosen": -297.3771158854167, + "logps/rejected": -483.42666015625, + "loss": 0.0522, + "rewards/chosen": 5.755478752983941, + "rewards/margins": 16.315081617567273, + "rewards/rejected": -10.559602864583333, + "step": 1462 + }, + { + "epoch": 0.36607031152258224, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44405037.71428572, + "logits/rejected": -45631990.4, + "logps/chosen": -325.0625, + "logps/rejected": -476.936181640625, + "loss": 0.0603, + "rewards/chosen": 4.838845934186663, + "rewards/margins": 14.867067064557757, + "rewards/rejected": -10.028221130371094, + "step": 1463 + }, + { + "epoch": 0.36632053046415614, + "grad_norm": 16.5, + "kl": 2.6466217041015625, + "learning_rate": 5e-06, + "logits/chosen": -79042285.71428572, + "logits/rejected": -62999872.0, + "logps/chosen": -406.38242885044644, + "logps/rejected": -540.474072265625, + "loss": 0.0348, + "rewards/chosen": 6.694066728864398, + "rewards/margins": 20.671233476911272, + "rewards/rejected": -13.977166748046875, + "step": 1464 + }, + { + "epoch": 0.36657074940573003, + "grad_norm": 7.59375, + "kl": 3.3304905891418457, + "learning_rate": 5e-06, + "logits/chosen": -84168308.36363636, + "logits/rejected": -38034441.84615385, + "logps/chosen": -448.4808238636364, + "logps/rejected": -532.1355543870193, + "loss": 0.0385, + "rewards/chosen": 8.67235079678622, + "rewards/margins": 20.51585430865521, + "rewards/rejected": -11.84350351186899, + "step": 1465 + }, + { + "epoch": 0.3668209683473039, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -90441744.0, + "logits/rejected": -75981264.0, + "logps/chosen": -386.78564453125, + "logps/rejected": -691.1327514648438, + "loss": 0.0395, + "rewards/chosen": 6.249045372009277, + "rewards/margins": 20.348111152648926, + "rewards/rejected": -14.099065780639648, + "step": 1466 + }, + { + "epoch": 0.3670711872888778, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63987768.88888889, + "logits/rejected": -36629307.733333334, + "logps/chosen": -388.2990451388889, + "logps/rejected": -582.2656901041667, + "loss": 0.0059, + "rewards/chosen": 8.126688639322916, + "rewards/margins": 19.559955851236978, + "rewards/rejected": -11.433267211914062, + "step": 1467 + }, + { + "epoch": 0.3673214062304516, + "grad_norm": 2.46875, + "kl": 3.54791522026062, + "learning_rate": 5e-06, + "logits/chosen": -58671863.46666667, + "logits/rejected": -12699426.666666666, + "logps/chosen": -363.3497721354167, + "logps/rejected": -341.2548014322917, + "loss": 0.0275, + "rewards/chosen": 8.100952657063802, + "rewards/margins": 16.662552388509113, + "rewards/rejected": -8.561599731445312, + "step": 1468 + }, + { + "epoch": 0.3675716251720255, + "grad_norm": 13.5625, + "kl": 10.497701644897461, + "learning_rate": 5e-06, + "logits/chosen": -77485331.6923077, + "logits/rejected": -38310112.0, + "logps/chosen": -389.5580303485577, + "logps/rejected": -478.53981711647725, + "loss": 0.0393, + "rewards/chosen": 7.283131526066707, + "rewards/margins": 18.54624298735932, + "rewards/rejected": -11.263111461292613, + "step": 1469 + }, + { + "epoch": 0.3678218441135994, + "grad_norm": 9.9375, + "kl": 8.398536682128906, + "learning_rate": 5e-06, + "logits/chosen": -45394133.333333336, + "logits/rejected": -81482332.44444445, + "logps/chosen": -295.29856770833334, + "logps/rejected": -604.4596896701389, + "loss": 0.0971, + "rewards/chosen": 5.591945393880208, + "rewards/margins": 18.995632765028212, + "rewards/rejected": -13.403687371148003, + "step": 1470 + }, + { + "epoch": 0.36807206305517326, + "grad_norm": 3.828125, + "kl": 5.776141166687012, + "learning_rate": 5e-06, + "logits/chosen": -66453548.8, + "logits/rejected": -35028992.0, + "logps/chosen": -407.5990478515625, + "logps/rejected": -544.4265834263393, + "loss": 0.0081, + "rewards/chosen": 8.373365783691407, + "rewards/margins": 22.0897702898298, + "rewards/rejected": -13.716404506138392, + "step": 1471 + }, + { + "epoch": 0.36832228199674716, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50171630.54545455, + "logits/rejected": 51244342.15384615, + "logps/chosen": -387.94797585227275, + "logps/rejected": -515.2635591947115, + "loss": 0.0606, + "rewards/chosen": 5.185014204545454, + "rewards/margins": 15.870903815422857, + "rewards/rejected": -10.685889610877403, + "step": 1472 + }, + { + "epoch": 0.36857250093832106, + "grad_norm": 11.25, + "kl": 1.8079898357391357, + "learning_rate": 5e-06, + "logits/chosen": -50533488.0, + "logits/rejected": -48172309.333333336, + "logps/chosen": -366.69482421875, + "logps/rejected": -727.1637369791666, + "loss": 0.0531, + "rewards/chosen": 5.371676762898763, + "rewards/margins": 18.358312606811523, + "rewards/rejected": -12.98663584391276, + "step": 1473 + }, + { + "epoch": 0.3688227198798949, + "grad_norm": 2.25, + "kl": 10.794523239135742, + "learning_rate": 5e-06, + "logits/chosen": -62364614.4, + "logits/rejected": -61305526.85714286, + "logps/chosen": -375.316259765625, + "logps/rejected": -517.3069196428571, + "loss": 0.005, + "rewards/chosen": 9.397773742675781, + "rewards/margins": 19.65023258754185, + "rewards/rejected": -10.252458844866071, + "step": 1474 + }, + { + "epoch": 0.3690729388214688, + "grad_norm": 12.125, + "kl": 10.09415340423584, + "learning_rate": 5e-06, + "logits/chosen": -68816680.0, + "logits/rejected": -38184508.0, + "logps/chosen": -480.1839904785156, + "logps/rejected": -519.7053833007812, + "loss": 0.1193, + "rewards/chosen": 6.878000259399414, + "rewards/margins": 17.881831169128418, + "rewards/rejected": -11.003830909729004, + "step": 1475 + }, + { + "epoch": 0.36932315776304264, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43772096.0, + "logits/rejected": -38886904.615384616, + "logps/chosen": -394.36874112215907, + "logps/rejected": -581.1694711538462, + "loss": 0.039, + "rewards/chosen": 6.982310208407315, + "rewards/margins": 19.27888120637907, + "rewards/rejected": -12.296570997971754, + "step": 1476 + }, + { + "epoch": 0.36957337670461654, + "grad_norm": 16.125, + "kl": 1.2153505086898804, + "learning_rate": 5e-06, + "logits/chosen": -43910435.2, + "logits/rejected": -61819936.0, + "logps/chosen": -352.085205078125, + "logps/rejected": -566.4048200334821, + "loss": 0.0466, + "rewards/chosen": 6.919402313232422, + "rewards/margins": 17.703001076834543, + "rewards/rejected": -10.78359876360212, + "step": 1477 + }, + { + "epoch": 0.36982359564619044, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47157385.84615385, + "logits/rejected": -53794781.09090909, + "logps/chosen": -350.62943209134613, + "logps/rejected": -551.9283114346591, + "loss": 0.0348, + "rewards/chosen": 7.055459829477163, + "rewards/margins": 20.27703409261637, + "rewards/rejected": -13.221574263139205, + "step": 1478 + }, + { + "epoch": 0.3700738145877643, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41338093.333333336, + "logits/rejected": -32626581.333333332, + "logps/chosen": -298.90293375651044, + "logps/rejected": -618.6534016927084, + "loss": 0.0094, + "rewards/chosen": 7.279170989990234, + "rewards/margins": 21.39152399698893, + "rewards/rejected": -14.112353006998697, + "step": 1479 + }, + { + "epoch": 0.3703240335293382, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59876096.0, + "logits/rejected": -25147684.57142857, + "logps/chosen": -453.362841796875, + "logps/rejected": -436.4036342075893, + "loss": 0.0418, + "rewards/chosen": 9.705673980712891, + "rewards/margins": 21.370231301443916, + "rewards/rejected": -11.664557320731026, + "step": 1480 + }, + { + "epoch": 0.370574252470912, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37758555.428571425, + "logits/rejected": 10056352.0, + "logps/chosen": -354.9829799107143, + "logps/rejected": -490.49040670955884, + "loss": 0.0135, + "rewards/chosen": 7.0408172607421875, + "rewards/margins": 18.278410967658548, + "rewards/rejected": -11.23759370691636, + "step": 1481 + }, + { + "epoch": 0.3708244714124859, + "grad_norm": 18.0, + "kl": 9.759722709655762, + "learning_rate": 5e-06, + "logits/chosen": 4218770.823529412, + "logits/rejected": -11962620.57142857, + "logps/chosen": -420.60285500919116, + "logps/rejected": -449.01290457589283, + "loss": 0.077, + "rewards/chosen": 8.556619083180147, + "rewards/margins": 20.04801203623539, + "rewards/rejected": -11.491392953055245, + "step": 1482 + }, + { + "epoch": 0.3710746903540598, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43813546.666666664, + "logits/rejected": -42713714.666666664, + "logps/chosen": -413.4487711588542, + "logps/rejected": -486.2548828125, + "loss": 0.0343, + "rewards/chosen": 8.300863265991211, + "rewards/margins": 17.352370580037437, + "rewards/rejected": -9.051507314046225, + "step": 1483 + }, + { + "epoch": 0.37132490929563366, + "grad_norm": 12.8125, + "kl": 2.8726329803466797, + "learning_rate": 5e-06, + "logits/chosen": -18753536.0, + "logits/rejected": -83926448.0, + "logps/chosen": -398.6054280598958, + "logps/rejected": -629.783447265625, + "loss": 0.0237, + "rewards/chosen": 8.39282480875651, + "rewards/margins": 18.916951497395832, + "rewards/rejected": -10.524126688639322, + "step": 1484 + }, + { + "epoch": 0.37157512823720756, + "grad_norm": 7.9375, + "kl": 0.5384852290153503, + "learning_rate": 5e-06, + "logits/chosen": -47932476.0, + "logits/rejected": -42761188.0, + "logps/chosen": -448.7462158203125, + "logps/rejected": -492.65087890625, + "loss": 0.0382, + "rewards/chosen": 7.764603137969971, + "rewards/margins": 16.71304178237915, + "rewards/rejected": -8.94843864440918, + "step": 1485 + }, + { + "epoch": 0.3718253471787814, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79898827.63636364, + "logits/rejected": -28066569.846153848, + "logps/chosen": -379.1720525568182, + "logps/rejected": -700.5993088942307, + "loss": 0.0246, + "rewards/chosen": 6.93018271706321, + "rewards/margins": 23.255096328842058, + "rewards/rejected": -16.324913611778847, + "step": 1486 + }, + { + "epoch": 0.3720755661203553, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68727541.33333333, + "logits/rejected": -39419306.666666664, + "logps/chosen": -243.83072916666666, + "logps/rejected": -586.9990641276041, + "loss": 0.0425, + "rewards/chosen": 5.1235246658325195, + "rewards/margins": 20.250266710917153, + "rewards/rejected": -15.126742045084635, + "step": 1487 + }, + { + "epoch": 0.3723257850619292, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73166112.0, + "logits/rejected": -30268201.14285714, + "logps/chosen": -385.7388916015625, + "logps/rejected": -737.8534458705357, + "loss": 0.0166, + "rewards/chosen": 6.703889465332031, + "rewards/margins": 19.759499032156807, + "rewards/rejected": -13.055609566824776, + "step": 1488 + }, + { + "epoch": 0.37257600400350305, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76090630.4, + "logits/rejected": -51500681.14285714, + "logps/chosen": -552.44677734375, + "logps/rejected": -731.6400669642857, + "loss": 0.005, + "rewards/chosen": 8.726946258544922, + "rewards/margins": 25.98178983415876, + "rewards/rejected": -17.25484357561384, + "step": 1489 + }, + { + "epoch": 0.37282622294507695, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55076461.71428572, + "logits/rejected": -42433923.76470588, + "logps/chosen": -453.6431361607143, + "logps/rejected": -488.16457950367646, + "loss": 0.0252, + "rewards/chosen": 6.651004791259766, + "rewards/margins": 16.44989215626436, + "rewards/rejected": -9.798887365004596, + "step": 1490 + }, + { + "epoch": 0.37307644188665084, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29556128.0, + "logits/rejected": -27918313.14285714, + "logps/chosen": -291.852490234375, + "logps/rejected": -406.06033761160717, + "loss": 0.0543, + "rewards/chosen": 7.14937744140625, + "rewards/margins": 17.696335274832588, + "rewards/rejected": -10.546957833426339, + "step": 1491 + }, + { + "epoch": 0.3733266608282247, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72913254.4, + "logits/rejected": -20996069.333333332, + "logps/chosen": -406.1290690104167, + "logps/rejected": -469.84624565972223, + "loss": 0.0519, + "rewards/chosen": 7.254035949707031, + "rewards/margins": 20.007662455240883, + "rewards/rejected": -12.753626505533854, + "step": 1492 + }, + { + "epoch": 0.3735768797697986, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53002656.0, + "logits/rejected": -42120474.666666664, + "logps/chosen": -423.9876708984375, + "logps/rejected": -655.2089436848959, + "loss": 0.0703, + "rewards/chosen": 6.128908793131511, + "rewards/margins": 20.21557871500651, + "rewards/rejected": -14.086669921875, + "step": 1493 + }, + { + "epoch": 0.37382709871137243, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30981002.666666668, + "logits/rejected": -29381042.666666668, + "logps/chosen": -450.0165201822917, + "logps/rejected": -379.5830078125, + "loss": 0.0389, + "rewards/chosen": 7.783847808837891, + "rewards/margins": 17.051981608072914, + "rewards/rejected": -9.268133799235025, + "step": 1494 + }, + { + "epoch": 0.37407731765294633, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47246299.428571425, + "logits/rejected": -55030256.941176474, + "logps/chosen": -469.45664760044644, + "logps/rejected": -728.8492647058823, + "loss": 0.0078, + "rewards/chosen": 9.222942897251674, + "rewards/margins": 22.16458879999754, + "rewards/rejected": -12.941645902745863, + "step": 1495 + }, + { + "epoch": 0.3743275365945202, + "grad_norm": 14.375, + "kl": 8.163302421569824, + "learning_rate": 5e-06, + "logits/chosen": -60342880.0, + "logits/rejected": 60988160.0, + "logps/chosen": -341.5933837890625, + "logps/rejected": -642.232421875, + "loss": 0.1069, + "rewards/chosen": 5.002639452616374, + "rewards/margins": 18.22743574778239, + "rewards/rejected": -13.224796295166016, + "step": 1496 + }, + { + "epoch": 0.37457775553609407, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39287498.666666664, + "logits/rejected": -53840477.86666667, + "logps/chosen": -421.43299696180554, + "logps/rejected": -626.4707682291667, + "loss": 0.0259, + "rewards/chosen": 7.583435906304254, + "rewards/margins": 22.185001458062064, + "rewards/rejected": -14.601565551757812, + "step": 1497 + }, + { + "epoch": 0.37482797447766797, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48876600.0, + "logits/rejected": -52909204.0, + "logps/chosen": -333.1134338378906, + "logps/rejected": -550.0637817382812, + "loss": 0.0451, + "rewards/chosen": 6.283137321472168, + "rewards/margins": 18.175339698791504, + "rewards/rejected": -11.892202377319336, + "step": 1498 + }, + { + "epoch": 0.3750781934192418, + "grad_norm": 9.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51574290.28571428, + "logits/rejected": -60402867.2, + "logps/chosen": -290.1018763950893, + "logps/rejected": -598.34931640625, + "loss": 0.0612, + "rewards/chosen": 5.244940076555524, + "rewards/margins": 17.886013684953962, + "rewards/rejected": -12.641073608398438, + "step": 1499 + }, + { + "epoch": 0.3753284123608157, + "grad_norm": 10.0, + "kl": 9.629730224609375, + "learning_rate": 5e-06, + "logits/chosen": -86127396.57142857, + "logits/rejected": -47186486.4, + "logps/chosen": -359.7578822544643, + "logps/rejected": -604.67939453125, + "loss": 0.0445, + "rewards/chosen": 7.190484183175223, + "rewards/margins": 21.387906973702567, + "rewards/rejected": -14.197422790527344, + "step": 1500 + }, + { + "epoch": 0.3755786313023896, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66806793.84615385, + "logits/rejected": -43458059.63636363, + "logps/chosen": -437.1399489182692, + "logps/rejected": -411.3058416193182, + "loss": 0.061, + "rewards/chosen": 6.5617546668419475, + "rewards/margins": 15.403611883416877, + "rewards/rejected": -8.84185721657493, + "step": 1501 + }, + { + "epoch": 0.37582885024396345, + "grad_norm": 11.5, + "kl": 1.826680064201355, + "learning_rate": 5e-06, + "logits/chosen": -61711133.86666667, + "logits/rejected": -82257905.77777778, + "logps/chosen": -304.7396484375, + "logps/rejected": -540.8516710069445, + "loss": 0.0394, + "rewards/chosen": 5.438275655110677, + "rewards/margins": 16.29000006781684, + "rewards/rejected": -10.851724412706163, + "step": 1502 + }, + { + "epoch": 0.37607906918553735, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34954262.4, + "logits/rejected": -50900754.28571428, + "logps/chosen": -395.3766357421875, + "logps/rejected": -674.3683733258929, + "loss": 0.0212, + "rewards/chosen": 6.769110870361328, + "rewards/margins": 21.44669919695173, + "rewards/rejected": -14.677588326590401, + "step": 1503 + }, + { + "epoch": 0.37632928812711125, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58222069.333333336, + "logits/rejected": -51208453.333333336, + "logps/chosen": -371.5040283203125, + "logps/rejected": -579.203125, + "loss": 0.0455, + "rewards/chosen": 4.879982630411784, + "rewards/margins": 16.281888961791992, + "rewards/rejected": -11.401906331380209, + "step": 1504 + }, + { + "epoch": 0.3765795070686851, + "grad_norm": 1.3046875, + "kl": 2.357339382171631, + "learning_rate": 5e-06, + "logits/chosen": -45106060.8, + "logits/rejected": -55622848.0, + "logps/chosen": -502.6572265625, + "logps/rejected": -823.5059678819445, + "loss": 0.0364, + "rewards/chosen": 9.417509969075521, + "rewards/margins": 26.391280110677084, + "rewards/rejected": -16.973770141601562, + "step": 1505 + }, + { + "epoch": 0.376829726010259, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55083072.0, + "logits/rejected": -45244051.2, + "logps/chosen": -315.50167410714283, + "logps/rejected": -541.287109375, + "loss": 0.079, + "rewards/chosen": 4.188118525913784, + "rewards/margins": 16.441922542027065, + "rewards/rejected": -12.253804016113282, + "step": 1506 + }, + { + "epoch": 0.37707994495183284, + "grad_norm": 3.46875, + "kl": 0.1512959897518158, + "learning_rate": 5e-06, + "logits/chosen": 2154250.285714286, + "logits/rejected": -82293952.0, + "logps/chosen": -369.66556222098217, + "logps/rejected": -530.7470703125, + "loss": 0.0154, + "rewards/chosen": 6.587891714913504, + "rewards/margins": 20.365275137765067, + "rewards/rejected": -13.777383422851562, + "step": 1507 + }, + { + "epoch": 0.37733016389340673, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44232152.0, + "logits/rejected": -50541624.0, + "logps/chosen": -380.9112854003906, + "logps/rejected": -488.965576171875, + "loss": 0.0534, + "rewards/chosen": 5.986564636230469, + "rewards/margins": 15.534777641296387, + "rewards/rejected": -9.548213005065918, + "step": 1508 + }, + { + "epoch": 0.37758038283498063, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40509820.0, + "logits/rejected": -68375168.0, + "logps/chosen": -392.9261474609375, + "logps/rejected": -575.1669921875, + "loss": 0.0321, + "rewards/chosen": 6.878152847290039, + "rewards/margins": 18.946308135986328, + "rewards/rejected": -12.068155288696289, + "step": 1509 + }, + { + "epoch": 0.3778306017765545, + "grad_norm": 9.375, + "kl": 4.054653167724609, + "learning_rate": 5e-06, + "logits/chosen": -30676578.285714287, + "logits/rejected": -48333776.0, + "logps/chosen": -550.9013323102679, + "logps/rejected": -822.26640625, + "loss": 0.0075, + "rewards/chosen": 8.581047058105469, + "rewards/margins": 27.540748596191406, + "rewards/rejected": -18.959701538085938, + "step": 1510 + }, + { + "epoch": 0.3780808207181284, + "grad_norm": 3.21875, + "kl": 2.9975523948669434, + "learning_rate": 5e-06, + "logits/chosen": -69100750.76923077, + "logits/rejected": -41653134.54545455, + "logps/chosen": -490.8140399639423, + "logps/rejected": -612.9503284801136, + "loss": 0.0054, + "rewards/chosen": 8.579756516676683, + "rewards/margins": 20.793590412273275, + "rewards/rejected": -12.213833895596592, + "step": 1511 + }, + { + "epoch": 0.3783310396597022, + "grad_norm": 5.4375, + "kl": 16.88369369506836, + "learning_rate": 5e-06, + "logits/chosen": -54836566.85714286, + "logits/rejected": -31125961.6, + "logps/chosen": -430.0634068080357, + "logps/rejected": -454.46572265625, + "loss": 0.0706, + "rewards/chosen": 10.103321620396205, + "rewards/margins": 19.27356545584542, + "rewards/rejected": -9.170243835449218, + "step": 1512 + }, + { + "epoch": 0.3785812586012761, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72190966.15384616, + "logits/rejected": -66811345.45454545, + "logps/chosen": -348.3506610576923, + "logps/rejected": -591.2220348011364, + "loss": 0.0289, + "rewards/chosen": 6.97769282414363, + "rewards/margins": 18.67318288096181, + "rewards/rejected": -11.695490056818182, + "step": 1513 + }, + { + "epoch": 0.37883147754285, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79889938.28571428, + "logits/rejected": -55504806.4, + "logps/chosen": -397.7470005580357, + "logps/rejected": -652.032080078125, + "loss": 0.0574, + "rewards/chosen": 7.618073599679129, + "rewards/margins": 21.785721915108816, + "rewards/rejected": -14.167648315429688, + "step": 1514 + }, + { + "epoch": 0.37908169648442386, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13003349.0, + "logits/rejected": -42715532.0, + "logps/chosen": -354.6552734375, + "logps/rejected": -571.0947875976562, + "loss": 0.0313, + "rewards/chosen": 6.287143707275391, + "rewards/margins": 18.31677532196045, + "rewards/rejected": -12.029631614685059, + "step": 1515 + }, + { + "epoch": 0.37933191542599776, + "grad_norm": 10.6875, + "kl": 23.4040584564209, + "learning_rate": 5e-06, + "logits/chosen": -43846515.2, + "logits/rejected": -33908547.55555555, + "logps/chosen": -423.00283203125, + "logps/rejected": -403.5305989583333, + "loss": 0.1387, + "rewards/chosen": 8.704026285807291, + "rewards/margins": 16.69274664984809, + "rewards/rejected": -7.988720364040798, + "step": 1516 + }, + { + "epoch": 0.3795821343675716, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -81527261.0909091, + "logits/rejected": -26310112.0, + "logps/chosen": -384.15988991477275, + "logps/rejected": -470.4381760817308, + "loss": 0.0291, + "rewards/chosen": 7.699965043501421, + "rewards/margins": 19.194301685253222, + "rewards/rejected": -11.494336641751802, + "step": 1517 + }, + { + "epoch": 0.3798323533091455, + "grad_norm": 15.8125, + "kl": 4.488152980804443, + "learning_rate": 5e-06, + "logits/chosen": -59436473.6, + "logits/rejected": 46565211.428571425, + "logps/chosen": -411.4630859375, + "logps/rejected": -614.6962890625, + "loss": 0.0479, + "rewards/chosen": 6.233036804199219, + "rewards/margins": 17.670736258370535, + "rewards/rejected": -11.437699454171318, + "step": 1518 + }, + { + "epoch": 0.3800825722507194, + "grad_norm": 2.015625, + "kl": 5.165832042694092, + "learning_rate": 5e-06, + "logits/chosen": -31114702.545454547, + "logits/rejected": -73696659.6923077, + "logps/chosen": -357.61172762784093, + "logps/rejected": -678.9456129807693, + "loss": 0.0079, + "rewards/chosen": 8.33049149946733, + "rewards/margins": 21.25602860884233, + "rewards/rejected": -12.925537109375, + "step": 1519 + }, + { + "epoch": 0.38033279119229324, + "grad_norm": 17.125, + "kl": 11.158552169799805, + "learning_rate": 5e-06, + "logits/chosen": -57488808.72727273, + "logits/rejected": -39432851.692307696, + "logps/chosen": -456.2736150568182, + "logps/rejected": -261.6331129807692, + "loss": 0.1006, + "rewards/chosen": 8.497854059392756, + "rewards/margins": 14.644600181312828, + "rewards/rejected": -6.1467461219200725, + "step": 1520 + }, + { + "epoch": 0.38058301013386714, + "grad_norm": 21.375, + "kl": 4.887245178222656, + "learning_rate": 5e-06, + "logits/chosen": -56543266.90909091, + "logits/rejected": -90804214.15384616, + "logps/chosen": -431.1031605113636, + "logps/rejected": -582.3155423677885, + "loss": 0.0892, + "rewards/chosen": 6.0468361594460225, + "rewards/margins": 14.944600512097765, + "rewards/rejected": -8.897764352651743, + "step": 1521 + }, + { + "epoch": 0.38083322907544104, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19850392.727272727, + "logits/rejected": -31027657.846153848, + "logps/chosen": -475.45210404829544, + "logps/rejected": -614.6077599158654, + "loss": 0.0296, + "rewards/chosen": 7.16021728515625, + "rewards/margins": 20.232137826772835, + "rewards/rejected": -13.071920541616587, + "step": 1522 + }, + { + "epoch": 0.3810834480170149, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72314741.33333333, + "logits/rejected": -67724346.66666667, + "logps/chosen": -454.7184651692708, + "logps/rejected": -529.76220703125, + "loss": 0.0629, + "rewards/chosen": 7.364760716756185, + "rewards/margins": 16.69577980041504, + "rewards/rejected": -9.331019083658854, + "step": 1523 + }, + { + "epoch": 0.3813336669585888, + "grad_norm": 14.5625, + "kl": 0.7795896530151367, + "learning_rate": 5e-06, + "logits/chosen": -73304938.66666667, + "logits/rejected": -56953344.0, + "logps/chosen": -288.66514078776044, + "logps/rejected": -507.5502522786458, + "loss": 0.0685, + "rewards/chosen": 5.534720102945964, + "rewards/margins": 16.35481135050456, + "rewards/rejected": -10.820091247558594, + "step": 1524 + }, + { + "epoch": 0.3815838859001626, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55282208.0, + "logits/rejected": -75112214.85714285, + "logps/chosen": -404.2345947265625, + "logps/rejected": -568.1969866071429, + "loss": 0.055, + "rewards/chosen": 6.519377136230469, + "rewards/margins": 14.721871076311384, + "rewards/rejected": -8.202493940080915, + "step": 1525 + }, + { + "epoch": 0.3818341048417365, + "grad_norm": 5.875, + "kl": 2.9955215454101562, + "learning_rate": 5e-06, + "logits/chosen": -55246453.333333336, + "logits/rejected": -42434261.333333336, + "logps/chosen": -370.6536051432292, + "logps/rejected": -461.8415934244792, + "loss": 0.0289, + "rewards/chosen": 7.182188669840495, + "rewards/margins": 15.839283625284832, + "rewards/rejected": -8.657094955444336, + "step": 1526 + }, + { + "epoch": 0.3820843237833104, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6866636.0, + "logits/rejected": -72539656.0, + "logps/chosen": -575.23779296875, + "logps/rejected": -502.9908447265625, + "loss": 0.0621, + "rewards/chosen": 8.676311492919922, + "rewards/margins": 18.104681968688965, + "rewards/rejected": -9.428370475769043, + "step": 1527 + }, + { + "epoch": 0.38233454272488426, + "grad_norm": 6.90625, + "kl": 5.5987701416015625, + "learning_rate": 5e-06, + "logits/chosen": -80637710.22222222, + "logits/rejected": -39810432.0, + "logps/chosen": -490.1339518229167, + "logps/rejected": -569.58125, + "loss": 0.0175, + "rewards/chosen": 9.917039659288195, + "rewards/margins": 20.660138617621527, + "rewards/rejected": -10.743098958333333, + "step": 1528 + }, + { + "epoch": 0.38258476166645816, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28269437.09090909, + "logits/rejected": -36584093.538461536, + "logps/chosen": -363.4459117542614, + "logps/rejected": -454.58657602163464, + "loss": 0.0532, + "rewards/chosen": 6.719588539817116, + "rewards/margins": 18.085076445466154, + "rewards/rejected": -11.365487905649038, + "step": 1529 + }, + { + "epoch": 0.382834980608032, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53403863.27272727, + "logits/rejected": -52322432.0, + "logps/chosen": -292.2071644176136, + "logps/rejected": -574.6057692307693, + "loss": 0.0665, + "rewards/chosen": 5.259834289550781, + "rewards/margins": 16.10341057410607, + "rewards/rejected": -10.843576284555288, + "step": 1530 + }, + { + "epoch": 0.3830851995496059, + "grad_norm": 17.5, + "kl": 7.816334247589111, + "learning_rate": 5e-06, + "logits/chosen": -48588228.0, + "logits/rejected": -40557196.0, + "logps/chosen": -403.4239807128906, + "logps/rejected": -639.576904296875, + "loss": 0.055, + "rewards/chosen": 7.863036155700684, + "rewards/margins": 16.942940711975098, + "rewards/rejected": -9.079904556274414, + "step": 1531 + }, + { + "epoch": 0.3833354184911798, + "grad_norm": 10.0625, + "kl": 3.45674467086792, + "learning_rate": 5e-06, + "logits/chosen": -35228987.428571425, + "logits/rejected": -67342969.6, + "logps/chosen": -289.88779994419644, + "logps/rejected": -502.24150390625, + "loss": 0.0697, + "rewards/chosen": 5.424467904227121, + "rewards/margins": 17.334997994559153, + "rewards/rejected": -11.910530090332031, + "step": 1532 + }, + { + "epoch": 0.38358563743275365, + "grad_norm": 3.171875, + "kl": 1.3790347576141357, + "learning_rate": 5e-06, + "logits/chosen": -50948342.85714286, + "logits/rejected": -68044672.0, + "logps/chosen": -345.66012137276783, + "logps/rejected": -576.31328125, + "loss": 0.0271, + "rewards/chosen": 8.226844787597656, + "rewards/margins": 18.942554473876953, + "rewards/rejected": -10.715709686279297, + "step": 1533 + }, + { + "epoch": 0.38383585637432754, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4193448.0, + "logits/rejected": -49205840.0, + "logps/chosen": -679.2981567382812, + "logps/rejected": -543.2861938476562, + "loss": 0.0363, + "rewards/chosen": 10.279638290405273, + "rewards/margins": 19.407093048095703, + "rewards/rejected": -9.12745475769043, + "step": 1534 + }, + { + "epoch": 0.3840860753159014, + "grad_norm": 2.640625, + "kl": 1.4770686626434326, + "learning_rate": 5e-06, + "logits/chosen": -61227688.72727273, + "logits/rejected": -39253051.07692308, + "logps/chosen": -537.5231711647727, + "logps/rejected": -724.4465895432693, + "loss": 0.0179, + "rewards/chosen": 10.051686373623935, + "rewards/margins": 21.17281533621408, + "rewards/rejected": -11.121128962590145, + "step": 1535 + }, + { + "epoch": 0.3843362942574753, + "grad_norm": 19.25, + "kl": 5.299968719482422, + "learning_rate": 5e-06, + "logits/chosen": -27923964.8, + "logits/rejected": -30241273.14285714, + "logps/chosen": -473.3396484375, + "logps/rejected": -485.83272879464283, + "loss": 0.0215, + "rewards/chosen": 9.987073516845703, + "rewards/margins": 18.1262941632952, + "rewards/rejected": -8.139220646449498, + "step": 1536 + }, + { + "epoch": 0.3845865131990492, + "grad_norm": 4.15625, + "kl": 3.274555206298828, + "learning_rate": 5e-06, + "logits/chosen": -49640285.09090909, + "logits/rejected": -43750931.692307696, + "logps/chosen": -379.07590553977275, + "logps/rejected": -545.8899489182693, + "loss": 0.0391, + "rewards/chosen": 6.578675703568892, + "rewards/margins": 17.362469226330308, + "rewards/rejected": -10.783793522761417, + "step": 1537 + }, + { + "epoch": 0.38483673214062303, + "grad_norm": 9.0625, + "kl": 3.5699737071990967, + "learning_rate": 5e-06, + "logits/chosen": -35539602.28571428, + "logits/rejected": -44411769.6, + "logps/chosen": -301.36854771205356, + "logps/rejected": -553.735205078125, + "loss": 0.0625, + "rewards/chosen": 6.414234706333706, + "rewards/margins": 17.522111293247768, + "rewards/rejected": -11.107876586914063, + "step": 1538 + }, + { + "epoch": 0.3850869510821969, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35209483.63636363, + "logits/rejected": -42758852.92307692, + "logps/chosen": -351.9339488636364, + "logps/rejected": -648.3533653846154, + "loss": 0.0815, + "rewards/chosen": 5.24550385908647, + "rewards/margins": 16.894082209447046, + "rewards/rejected": -11.648578350360577, + "step": 1539 + }, + { + "epoch": 0.3853371700237708, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62190650.18181818, + "logits/rejected": -78514953.84615384, + "logps/chosen": -480.68599076704544, + "logps/rejected": -568.4444861778846, + "loss": 0.0074, + "rewards/chosen": 7.456821788441051, + "rewards/margins": 19.565300041145377, + "rewards/rejected": -12.108478252704327, + "step": 1540 + }, + { + "epoch": 0.38558738896534467, + "grad_norm": 15.0, + "kl": 11.122676849365234, + "learning_rate": 5e-06, + "logits/chosen": -61903338.666666664, + "logits/rejected": -62698112.0, + "logps/chosen": -254.6910196940104, + "logps/rejected": -789.6026204427084, + "loss": 0.0958, + "rewards/chosen": 4.186389287312825, + "rewards/margins": 17.29101626078288, + "rewards/rejected": -13.104626973470053, + "step": 1541 + }, + { + "epoch": 0.38583760790691857, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30149254.0, + "logits/rejected": -19735254.0, + "logps/chosen": -276.3680419921875, + "logps/rejected": -648.9039916992188, + "loss": 0.064, + "rewards/chosen": 5.651928424835205, + "rewards/margins": 20.514277935028076, + "rewards/rejected": -14.862349510192871, + "step": 1542 + }, + { + "epoch": 0.3860878268484924, + "grad_norm": 14.8125, + "kl": 3.869978666305542, + "learning_rate": 5e-06, + "logits/chosen": -18704593.14285714, + "logits/rejected": -53599376.0, + "logps/chosen": -492.6671665736607, + "logps/rejected": -642.15009765625, + "loss": 0.0241, + "rewards/chosen": 8.568756648472377, + "rewards/margins": 22.675739070347376, + "rewards/rejected": -14.106982421875, + "step": 1543 + }, + { + "epoch": 0.3863380457900663, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14169156.923076924, + "logits/rejected": -39728826.18181818, + "logps/chosen": -246.31049053485577, + "logps/rejected": -568.0261008522727, + "loss": 0.0615, + "rewards/chosen": 6.015750591571514, + "rewards/margins": 17.324505946019315, + "rewards/rejected": -11.308755354447799, + "step": 1544 + }, + { + "epoch": 0.3865882647316402, + "grad_norm": 11.9375, + "kl": 1.039900541305542, + "learning_rate": 5e-06, + "logits/chosen": -69726037.33333333, + "logits/rejected": -16867088.0, + "logps/chosen": -360.1797200520833, + "logps/rejected": -545.5159505208334, + "loss": 0.0805, + "rewards/chosen": 5.97146962483724, + "rewards/margins": 14.771454874674479, + "rewards/rejected": -8.79998524983724, + "step": 1545 + }, + { + "epoch": 0.38683848367321405, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49347886.54545455, + "logits/rejected": -81052977.23076923, + "logps/chosen": -301.373046875, + "logps/rejected": -740.3809344951923, + "loss": 0.0526, + "rewards/chosen": 5.9335105202414775, + "rewards/margins": 20.306566785265517, + "rewards/rejected": -14.373056265024038, + "step": 1546 + }, + { + "epoch": 0.38708870261478795, + "grad_norm": 1.4140625, + "kl": 1.5461070537567139, + "learning_rate": 5e-06, + "logits/chosen": -48313552.0, + "logits/rejected": -48347562.666666664, + "logps/chosen": -379.9781901041667, + "logps/rejected": -606.9390462239584, + "loss": 0.0425, + "rewards/chosen": 7.176903406778972, + "rewards/margins": 18.952517827351887, + "rewards/rejected": -11.775614420572916, + "step": 1547 + }, + { + "epoch": 0.3873389215563618, + "grad_norm": 6.40625, + "kl": 3.7011592388153076, + "learning_rate": 5e-06, + "logits/chosen": -87210321.45454545, + "logits/rejected": -21078112.0, + "logps/chosen": -483.9068714488636, + "logps/rejected": -326.86609825721155, + "loss": 0.0311, + "rewards/chosen": 8.05739385431463, + "rewards/margins": 15.626142915312226, + "rewards/rejected": -7.568749060997596, + "step": 1548 + }, + { + "epoch": 0.3875891404979357, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -95598359.27272727, + "logits/rejected": -24670084.923076924, + "logps/chosen": -453.8734019886364, + "logps/rejected": -477.9554912860577, + "loss": 0.0272, + "rewards/chosen": 7.100312666459517, + "rewards/margins": 19.62544506579846, + "rewards/rejected": -12.525132399338942, + "step": 1549 + }, + { + "epoch": 0.3878393594395096, + "grad_norm": 10.8125, + "kl": 7.050755977630615, + "learning_rate": 5e-06, + "logits/chosen": -38994102.85714286, + "logits/rejected": -39168192.0, + "logps/chosen": -521.2699497767857, + "logps/rejected": -576.305419921875, + "loss": 0.0396, + "rewards/chosen": 9.492527553013392, + "rewards/margins": 20.6130857195173, + "rewards/rejected": -11.120558166503907, + "step": 1550 + }, + { + "epoch": 0.38808957838108343, + "grad_norm": 17.75, + "kl": 1.4493141174316406, + "learning_rate": 5e-06, + "logits/chosen": -92590517.33333333, + "logits/rejected": -56608421.333333336, + "logps/chosen": -466.5970052083333, + "logps/rejected": -635.5377604166666, + "loss": 0.0654, + "rewards/chosen": 8.066460291544596, + "rewards/margins": 20.011279424031574, + "rewards/rejected": -11.944819132486979, + "step": 1551 + }, + { + "epoch": 0.38833979732265733, + "grad_norm": 6.40625, + "kl": 4.593206405639648, + "learning_rate": 5e-06, + "logits/chosen": -64595889.23076923, + "logits/rejected": -34343517.09090909, + "logps/chosen": -419.7858323317308, + "logps/rejected": -520.7568359375, + "loss": 0.0287, + "rewards/chosen": 7.569098252516526, + "rewards/margins": 17.65582504805985, + "rewards/rejected": -10.086726795543324, + "step": 1552 + }, + { + "epoch": 0.3885900162642312, + "grad_norm": 20.0, + "kl": 6.921544075012207, + "learning_rate": 5e-06, + "logits/chosen": -37210870.15384615, + "logits/rejected": -40107886.54545455, + "logps/chosen": -337.2096980168269, + "logps/rejected": -595.0050603693181, + "loss": 0.0531, + "rewards/chosen": 4.817118131197416, + "rewards/margins": 18.157794178782645, + "rewards/rejected": -13.340676047585227, + "step": 1553 + }, + { + "epoch": 0.3888402352058051, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32519335.111111112, + "logits/rejected": -52503944.53333333, + "logps/chosen": -461.6263020833333, + "logps/rejected": -454.01689453125, + "loss": 0.0394, + "rewards/chosen": 7.8503528171115455, + "rewards/margins": 17.58455488416884, + "rewards/rejected": -9.734202067057291, + "step": 1554 + }, + { + "epoch": 0.389090454147379, + "grad_norm": 10.625, + "kl": 11.004191398620605, + "learning_rate": 5e-06, + "logits/chosen": -45119763.692307696, + "logits/rejected": -34119418.18181818, + "logps/chosen": -416.51307091346155, + "logps/rejected": -470.24360795454544, + "loss": 0.0213, + "rewards/chosen": 8.634840745192308, + "rewards/margins": 20.445210063374127, + "rewards/rejected": -11.810369318181818, + "step": 1555 + }, + { + "epoch": 0.3893406730889528, + "grad_norm": 8.5, + "kl": 5.074847221374512, + "learning_rate": 5e-06, + "logits/chosen": -52541892.266666666, + "logits/rejected": -58095320.88888889, + "logps/chosen": -383.05052083333334, + "logps/rejected": -496.83018663194446, + "loss": 0.0573, + "rewards/chosen": 7.852602640787761, + "rewards/margins": 17.40517849392361, + "rewards/rejected": -9.552575853135851, + "step": 1556 + }, + { + "epoch": 0.3895908920305267, + "grad_norm": 13.625, + "kl": 4.986838340759277, + "learning_rate": 5e-06, + "logits/chosen": -50742144.0, + "logits/rejected": -45844102.4, + "logps/chosen": -509.5894252232143, + "logps/rejected": -486.414697265625, + "loss": 0.018, + "rewards/chosen": 8.469165257045201, + "rewards/margins": 18.375714329310824, + "rewards/rejected": -9.906549072265625, + "step": 1557 + }, + { + "epoch": 0.3898411109721006, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33524884.0, + "logits/rejected": 15619832.0, + "logps/chosen": -394.5790100097656, + "logps/rejected": -772.2391357421875, + "loss": 0.0224, + "rewards/chosen": 6.39989709854126, + "rewards/margins": 20.26790189743042, + "rewards/rejected": -13.86800479888916, + "step": 1558 + }, + { + "epoch": 0.39009132991367446, + "grad_norm": 20.25, + "kl": 18.480480194091797, + "learning_rate": 5e-06, + "logits/chosen": -70890555.42857143, + "logits/rejected": -61248192.0, + "logps/chosen": -429.66688755580356, + "logps/rejected": -614.1064453125, + "loss": 0.047, + "rewards/chosen": 6.514197758265904, + "rewards/margins": 16.985174015590122, + "rewards/rejected": -10.470976257324219, + "step": 1559 + }, + { + "epoch": 0.39034154885524835, + "grad_norm": 15.75, + "kl": 0.6127548217773438, + "learning_rate": 5e-06, + "logits/chosen": -59696005.333333336, + "logits/rejected": -78712608.0, + "logps/chosen": -377.5688883463542, + "logps/rejected": -707.6808268229166, + "loss": 0.0327, + "rewards/chosen": 7.171212514241536, + "rewards/margins": 22.457936604817707, + "rewards/rejected": -15.286724090576172, + "step": 1560 + }, + { + "epoch": 0.3905917677968222, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44644579.2, + "logits/rejected": -60546816.0, + "logps/chosen": -398.78916015625, + "logps/rejected": -696.2825055803571, + "loss": 0.0285, + "rewards/chosen": 7.975958251953125, + "rewards/margins": 20.657434300013954, + "rewards/rejected": -12.681476048060826, + "step": 1561 + }, + { + "epoch": 0.3908419867383961, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17875178.666666668, + "logits/rejected": -44063910.4, + "logps/chosen": -253.60259331597223, + "logps/rejected": -494.1056640625, + "loss": 0.022, + "rewards/chosen": 6.223546769883898, + "rewards/margins": 17.610119035508898, + "rewards/rejected": -11.386572265625, + "step": 1562 + }, + { + "epoch": 0.39109220567997, + "grad_norm": 11.5, + "kl": 18.026988983154297, + "learning_rate": 5e-06, + "logits/chosen": -49660525.176470585, + "logits/rejected": -91150619.42857143, + "logps/chosen": -386.2071174172794, + "logps/rejected": -676.1298130580357, + "loss": 0.0718, + "rewards/chosen": 8.575309304630055, + "rewards/margins": 22.164142704811418, + "rewards/rejected": -13.588833400181361, + "step": 1563 + }, + { + "epoch": 0.39134242462154384, + "grad_norm": 14.1875, + "kl": 0.5734914541244507, + "learning_rate": 5e-06, + "logits/chosen": -80333809.77777778, + "logits/rejected": -34712507.733333334, + "logps/chosen": -375.451171875, + "logps/rejected": -593.5008463541667, + "loss": 0.0577, + "rewards/chosen": 7.233046637641059, + "rewards/margins": 17.59545610215929, + "rewards/rejected": -10.36240946451823, + "step": 1564 + }, + { + "epoch": 0.39159264356311774, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49640634.666666664, + "logits/rejected": -33605261.333333336, + "logps/chosen": -367.917236328125, + "logps/rejected": -377.2064615885417, + "loss": 0.0241, + "rewards/chosen": 7.1159407297770185, + "rewards/margins": 16.494548797607422, + "rewards/rejected": -9.378608067830404, + "step": 1565 + }, + { + "epoch": 0.3918428625046916, + "grad_norm": 4.4375, + "kl": 6.361255645751953, + "learning_rate": 5e-06, + "logits/chosen": -37976981.333333336, + "logits/rejected": -53967562.666666664, + "logps/chosen": -368.8640950520833, + "logps/rejected": -526.2738444010416, + "loss": 0.0455, + "rewards/chosen": 6.881423314412435, + "rewards/margins": 18.78554089864095, + "rewards/rejected": -11.904117584228516, + "step": 1566 + }, + { + "epoch": 0.3920930814462655, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51512531.692307696, + "logits/rejected": -66421771.63636363, + "logps/chosen": -433.203125, + "logps/rejected": -584.4297762784091, + "loss": 0.0361, + "rewards/chosen": 7.266213637131911, + "rewards/margins": 16.878148645787807, + "rewards/rejected": -9.611935008655895, + "step": 1567 + }, + { + "epoch": 0.3923433003878394, + "grad_norm": 3.234375, + "kl": 8.539081573486328, + "learning_rate": 5e-06, + "logits/chosen": -50990528.0, + "logits/rejected": -24223909.333333332, + "logps/chosen": -413.7838948567708, + "logps/rejected": -267.9441731770833, + "loss": 0.0482, + "rewards/chosen": 8.737911224365234, + "rewards/margins": 14.086903254191082, + "rewards/rejected": -5.348992029825847, + "step": 1568 + }, + { + "epoch": 0.3925935193294132, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63922569.84615385, + "logits/rejected": -49597300.36363637, + "logps/chosen": -439.88146033653845, + "logps/rejected": -708.5866477272727, + "loss": 0.0171, + "rewards/chosen": 8.851475642277645, + "rewards/margins": 23.01541745912779, + "rewards/rejected": -14.163941816850143, + "step": 1569 + }, + { + "epoch": 0.3928437382709871, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56432226.461538464, + "logits/rejected": -42487621.81818182, + "logps/chosen": -294.70742563100964, + "logps/rejected": -500.23002485795456, + "loss": 0.0889, + "rewards/chosen": 6.72278066781851, + "rewards/margins": 17.206085471840172, + "rewards/rejected": -10.483304804021662, + "step": 1570 + }, + { + "epoch": 0.393093957212561, + "grad_norm": 16.0, + "kl": 0.8531255722045898, + "learning_rate": 5e-06, + "logits/chosen": -43765696.0, + "logits/rejected": -39412829.09090909, + "logps/chosen": -377.3454777644231, + "logps/rejected": -468.8181818181818, + "loss": 0.124, + "rewards/chosen": 5.567727309006911, + "rewards/margins": 16.901978712815506, + "rewards/rejected": -11.334251403808594, + "step": 1571 + }, + { + "epoch": 0.39334417615413486, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50214562.461538464, + "logits/rejected": -47814362.18181818, + "logps/chosen": -335.5585186298077, + "logps/rejected": -620.6560724431819, + "loss": 0.025, + "rewards/chosen": 6.158630957970252, + "rewards/margins": 18.306207029969542, + "rewards/rejected": -12.14757607199929, + "step": 1572 + }, + { + "epoch": 0.39359439509570876, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35875925.333333336, + "logits/rejected": -49550202.666666664, + "logps/chosen": -386.1427001953125, + "logps/rejected": -579.5741780598959, + "loss": 0.0416, + "rewards/chosen": 6.091965993245442, + "rewards/margins": 16.14248212178548, + "rewards/rejected": -10.050516128540039, + "step": 1573 + }, + { + "epoch": 0.3938446140372826, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22923065.6, + "logits/rejected": -44994925.71428572, + "logps/chosen": -257.373095703125, + "logps/rejected": -603.9144810267857, + "loss": 0.0741, + "rewards/chosen": 6.081145858764648, + "rewards/margins": 15.729956109183174, + "rewards/rejected": -9.648810250418526, + "step": 1574 + }, + { + "epoch": 0.3940948329788565, + "grad_norm": 6.65625, + "kl": 2.7457356452941895, + "learning_rate": 5e-06, + "logits/chosen": -49475496.421052635, + "logits/rejected": -43429244.8, + "logps/chosen": -377.7696083470395, + "logps/rejected": -1062.3294921875, + "loss": 0.0844, + "rewards/chosen": 6.2845410798725325, + "rewards/margins": 22.17435736405222, + "rewards/rejected": -15.889816284179688, + "step": 1575 + }, + { + "epoch": 0.3943450519204304, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43314865.45454545, + "logits/rejected": -53678547.692307696, + "logps/chosen": -419.62491122159093, + "logps/rejected": -602.8793194110577, + "loss": 0.0147, + "rewards/chosen": 6.8964316628196025, + "rewards/margins": 16.72953454931299, + "rewards/rejected": -9.83310288649339, + "step": 1576 + }, + { + "epoch": 0.39459527086200424, + "grad_norm": 17.375, + "kl": 6.0239362716674805, + "learning_rate": 5e-06, + "logits/chosen": -39985053.86666667, + "logits/rejected": -29856583.111111112, + "logps/chosen": -423.2427083333333, + "logps/rejected": -605.8662109375, + "loss": 0.0905, + "rewards/chosen": 6.987606811523437, + "rewards/margins": 19.554048665364583, + "rewards/rejected": -12.566441853841146, + "step": 1577 + }, + { + "epoch": 0.39484548980357814, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60815378.28571428, + "logits/rejected": -55106153.4117647, + "logps/chosen": -338.4697265625, + "logps/rejected": -619.9574333639706, + "loss": 0.0123, + "rewards/chosen": 6.868181501116071, + "rewards/margins": 19.0307504349396, + "rewards/rejected": -12.162568933823529, + "step": 1578 + }, + { + "epoch": 0.395095708745152, + "grad_norm": 5.625, + "kl": 5.623414993286133, + "learning_rate": 5e-06, + "logits/chosen": -65370400.0, + "logits/rejected": -44937066.666666664, + "logps/chosen": -432.1836751302083, + "logps/rejected": -439.462890625, + "loss": 0.0311, + "rewards/chosen": 8.802015940348307, + "rewards/margins": 20.291122436523438, + "rewards/rejected": -11.48910649617513, + "step": 1579 + }, + { + "epoch": 0.3953459276867259, + "grad_norm": 7.625, + "kl": 6.58470344543457, + "learning_rate": 5e-06, + "logits/chosen": -71539460.57142857, + "logits/rejected": -62250611.2, + "logps/chosen": -363.54649135044644, + "logps/rejected": -646.99189453125, + "loss": 0.043, + "rewards/chosen": 6.553089686802456, + "rewards/margins": 17.648787471226285, + "rewards/rejected": -11.095697784423828, + "step": 1580 + }, + { + "epoch": 0.3955961466282998, + "grad_norm": 7.78125, + "kl": 7.161094665527344, + "learning_rate": 5e-06, + "logits/chosen": -36932496.0, + "logits/rejected": -37177340.8, + "logps/chosen": -284.66395786830356, + "logps/rejected": -448.852099609375, + "loss": 0.0578, + "rewards/chosen": 6.408753531319754, + "rewards/margins": 16.192396872384208, + "rewards/rejected": -9.783643341064453, + "step": 1581 + }, + { + "epoch": 0.3958463655698736, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54379044.571428575, + "logits/rejected": -68398223.05882353, + "logps/chosen": -410.6493443080357, + "logps/rejected": -682.7164522058823, + "loss": 0.0215, + "rewards/chosen": 8.095538548060826, + "rewards/margins": 20.673114231654576, + "rewards/rejected": -12.57757568359375, + "step": 1582 + }, + { + "epoch": 0.3960965845114475, + "grad_norm": 10.9375, + "kl": 3.7315454483032227, + "learning_rate": 5e-06, + "logits/chosen": -52928290.13333333, + "logits/rejected": -44914595.55555555, + "logps/chosen": -353.40113932291666, + "logps/rejected": -450.56220160590277, + "loss": 0.0447, + "rewards/chosen": 8.047874450683594, + "rewards/margins": 19.42325761583116, + "rewards/rejected": -11.37538316514757, + "step": 1583 + }, + { + "epoch": 0.39634680345302137, + "grad_norm": 5.46875, + "kl": 2.293125867843628, + "learning_rate": 5e-06, + "logits/chosen": -47573681.777777776, + "logits/rejected": -40983957.333333336, + "logps/chosen": -372.53325737847223, + "logps/rejected": -464.9585367838542, + "loss": 0.0259, + "rewards/chosen": 6.833401997884114, + "rewards/margins": 14.92130406697591, + "rewards/rejected": -8.087902069091797, + "step": 1584 + }, + { + "epoch": 0.39659702239459527, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74240585.84615384, + "logits/rejected": -53382562.90909091, + "logps/chosen": -361.5519831730769, + "logps/rejected": -506.85751065340907, + "loss": 0.0438, + "rewards/chosen": 7.130145733173077, + "rewards/margins": 19.17484566215035, + "rewards/rejected": -12.044699928977273, + "step": 1585 + }, + { + "epoch": 0.39684724133616917, + "grad_norm": 13.375, + "kl": 7.100118637084961, + "learning_rate": 5e-06, + "logits/chosen": -66369619.692307696, + "logits/rejected": 33732331.63636363, + "logps/chosen": -336.21375450721155, + "logps/rejected": -445.1492365056818, + "loss": 0.037, + "rewards/chosen": 7.199116633488582, + "rewards/margins": 16.743858177345114, + "rewards/rejected": -9.544741543856533, + "step": 1586 + }, + { + "epoch": 0.397097460277743, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52292499.692307696, + "logits/rejected": -75604596.36363636, + "logps/chosen": -377.1007737379808, + "logps/rejected": -673.7362393465909, + "loss": 0.0264, + "rewards/chosen": 7.1330425555889425, + "rewards/margins": 21.752958604505846, + "rewards/rejected": -14.619916048916904, + "step": 1587 + }, + { + "epoch": 0.3973476792193169, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54069912.615384616, + "logits/rejected": 23352887.272727273, + "logps/chosen": -383.5125075120192, + "logps/rejected": -674.1711647727273, + "loss": 0.0264, + "rewards/chosen": 7.9783806434044475, + "rewards/margins": 19.841187483780867, + "rewards/rejected": -11.86280684037642, + "step": 1588 + }, + { + "epoch": 0.3975978981608908, + "grad_norm": 8.25, + "kl": 4.784675121307373, + "learning_rate": 5e-06, + "logits/chosen": -46123640.0, + "logits/rejected": -19507712.0, + "logps/chosen": -290.67840576171875, + "logps/rejected": -291.2193603515625, + "loss": 0.0548, + "rewards/chosen": 5.777564525604248, + "rewards/margins": 12.735287189483643, + "rewards/rejected": -6.9577226638793945, + "step": 1589 + }, + { + "epoch": 0.39784811710246465, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56030048.0, + "logits/rejected": -58162738.28571428, + "logps/chosen": -381.098388671875, + "logps/rejected": -583.4241420200893, + "loss": 0.0429, + "rewards/chosen": 6.6435089111328125, + "rewards/margins": 18.917938232421875, + "rewards/rejected": -12.274429321289062, + "step": 1590 + }, + { + "epoch": 0.39809833604403855, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71870132.36363636, + "logits/rejected": -76893745.23076923, + "logps/chosen": -471.12038352272725, + "logps/rejected": -576.6395733173077, + "loss": 0.0485, + "rewards/chosen": 7.715235623446378, + "rewards/margins": 17.91817282296561, + "rewards/rejected": -10.20293719951923, + "step": 1591 + }, + { + "epoch": 0.3983485549856124, + "grad_norm": 14.9375, + "kl": 1.3240079879760742, + "learning_rate": 5e-06, + "logits/chosen": -28912585.14285714, + "logits/rejected": -46249523.2, + "logps/chosen": -253.8704833984375, + "logps/rejected": -468.809619140625, + "loss": 0.0928, + "rewards/chosen": 6.2960357666015625, + "rewards/margins": 13.71209259033203, + "rewards/rejected": -7.416056823730469, + "step": 1592 + }, + { + "epoch": 0.3985987739271863, + "grad_norm": 12.0, + "kl": 7.383831024169922, + "learning_rate": 5e-06, + "logits/chosen": -58332784.0, + "logits/rejected": -51747120.0, + "logps/chosen": -397.6625569661458, + "logps/rejected": -534.5457356770834, + "loss": 0.0551, + "rewards/chosen": 8.02014414469401, + "rewards/margins": 19.762802124023438, + "rewards/rejected": -11.742657979329428, + "step": 1593 + }, + { + "epoch": 0.3988489928687602, + "grad_norm": 12.1875, + "kl": 3.77521014213562, + "learning_rate": 5e-06, + "logits/chosen": -49732534.85714286, + "logits/rejected": -43392867.2, + "logps/chosen": -327.02267020089283, + "logps/rejected": -474.59541015625, + "loss": 0.0922, + "rewards/chosen": 7.127044677734375, + "rewards/margins": 17.27719039916992, + "rewards/rejected": -10.150145721435546, + "step": 1594 + }, + { + "epoch": 0.39909921181033403, + "grad_norm": 11.75, + "kl": 2.157099485397339, + "learning_rate": 5e-06, + "logits/chosen": -64320992.0, + "logits/rejected": -66887384.0, + "logps/chosen": -499.60137939453125, + "logps/rejected": -731.0667114257812, + "loss": 0.0301, + "rewards/chosen": 8.41608715057373, + "rewards/margins": 23.448493003845215, + "rewards/rejected": -15.032405853271484, + "step": 1595 + }, + { + "epoch": 0.39934943075190793, + "grad_norm": 11.875, + "kl": 1.0198545455932617, + "learning_rate": 5e-06, + "logits/chosen": -22762653.866666667, + "logits/rejected": -16213962.666666666, + "logps/chosen": -408.17578125, + "logps/rejected": -432.27001953125, + "loss": 0.0705, + "rewards/chosen": 7.218156941731771, + "rewards/margins": 15.025240919325087, + "rewards/rejected": -7.807083977593316, + "step": 1596 + }, + { + "epoch": 0.3995996496934818, + "grad_norm": 2.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29325284.57142857, + "logits/rejected": -14047376.0, + "logps/chosen": -362.2606724330357, + "logps/rejected": -356.903125, + "loss": 0.054, + "rewards/chosen": 7.910264151436942, + "rewards/margins": 17.62156481061663, + "rewards/rejected": -9.711300659179688, + "step": 1597 + }, + { + "epoch": 0.39984986863505567, + "grad_norm": 3.734375, + "kl": 1.9071954488754272, + "learning_rate": 5e-06, + "logits/chosen": -41418423.46666667, + "logits/rejected": -52286851.55555555, + "logps/chosen": -407.4845703125, + "logps/rejected": -751.3637152777778, + "loss": 0.0529, + "rewards/chosen": 7.005551656087239, + "rewards/margins": 20.37004682752821, + "rewards/rejected": -13.364495171440971, + "step": 1598 + }, + { + "epoch": 0.40010008757662957, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61755477.333333336, + "logits/rejected": -71222661.33333333, + "logps/chosen": -470.6901041666667, + "logps/rejected": -531.2591959635416, + "loss": 0.1537, + "rewards/chosen": 7.345523198445638, + "rewards/margins": 16.743183135986328, + "rewards/rejected": -9.39765993754069, + "step": 1599 + }, + { + "epoch": 0.4003503065182034, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43855260.0, + "logits/rejected": -39946852.0, + "logps/chosen": -390.2655029296875, + "logps/rejected": -591.4718017578125, + "loss": 0.1058, + "rewards/chosen": 5.37814998626709, + "rewards/margins": 16.34162425994873, + "rewards/rejected": -10.96347427368164, + "step": 1600 + }, + { + "epoch": 0.4006005254597773, + "grad_norm": 7.125, + "kl": 2.5327582359313965, + "learning_rate": 5e-06, + "logits/chosen": -22454957.333333332, + "logits/rejected": -42332994.666666664, + "logps/chosen": -375.364501953125, + "logps/rejected": -682.6041666666666, + "loss": 0.03, + "rewards/chosen": 6.137227376302083, + "rewards/margins": 22.949923197428383, + "rewards/rejected": -16.8126958211263, + "step": 1601 + }, + { + "epoch": 0.40085074440135116, + "grad_norm": 5.65625, + "kl": 5.81404447555542, + "learning_rate": 5e-06, + "logits/chosen": -72725269.33333333, + "logits/rejected": -67427829.33333333, + "logps/chosen": -362.2223714192708, + "logps/rejected": -767.32275390625, + "loss": 0.0651, + "rewards/chosen": 7.319016774495442, + "rewards/margins": 24.870738983154297, + "rewards/rejected": -17.551722208658855, + "step": 1602 + }, + { + "epoch": 0.40110096334292505, + "grad_norm": 1.9453125, + "kl": 0.6826578974723816, + "learning_rate": 5e-06, + "logits/chosen": -62999207.384615384, + "logits/rejected": -83008180.36363636, + "logps/chosen": -566.6778094951923, + "logps/rejected": -835.6917613636364, + "loss": 0.002, + "rewards/chosen": 8.83831552358774, + "rewards/margins": 25.206550331382488, + "rewards/rejected": -16.368234807794746, + "step": 1603 + }, + { + "epoch": 0.40135118228449895, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58847604.36363637, + "logits/rejected": -91652381.53846154, + "logps/chosen": -439.6197620738636, + "logps/rejected": -647.1583533653846, + "loss": 0.0168, + "rewards/chosen": 8.827486905184658, + "rewards/margins": 19.535945438838503, + "rewards/rejected": -10.708458533653847, + "step": 1604 + }, + { + "epoch": 0.4016014012260728, + "grad_norm": 16.75, + "kl": 11.9111328125, + "learning_rate": 5e-06, + "logits/chosen": -46197800.72727273, + "logits/rejected": -60817403.07692308, + "logps/chosen": -457.00883345170456, + "logps/rejected": -662.6548978365385, + "loss": 0.0449, + "rewards/chosen": 8.777660023082387, + "rewards/margins": 19.03984357927229, + "rewards/rejected": -10.262183556189903, + "step": 1605 + }, + { + "epoch": 0.4018516201676467, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47125570.90909091, + "logits/rejected": 26224886.153846152, + "logps/chosen": -336.74074485085225, + "logps/rejected": -609.8313551682693, + "loss": 0.0566, + "rewards/chosen": 6.045225663618608, + "rewards/margins": 18.470347264429908, + "rewards/rejected": -12.425121600811298, + "step": 1606 + }, + { + "epoch": 0.4021018391092206, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34907016.0, + "logits/rejected": -48890640.0, + "logps/chosen": -337.1698303222656, + "logps/rejected": -714.4163818359375, + "loss": 0.0222, + "rewards/chosen": 5.899375915527344, + "rewards/margins": 20.677775382995605, + "rewards/rejected": -14.778399467468262, + "step": 1607 + }, + { + "epoch": 0.40235205805079444, + "grad_norm": 1.5703125, + "kl": 5.206974983215332, + "learning_rate": 5e-06, + "logits/chosen": -92848256.0, + "logits/rejected": -34442408.0, + "logps/chosen": -633.7376302083334, + "logps/rejected": -457.6949055989583, + "loss": 0.0038, + "rewards/chosen": 9.627501169840494, + "rewards/margins": 20.351619720458984, + "rewards/rejected": -10.72411855061849, + "step": 1608 + }, + { + "epoch": 0.40260227699236834, + "grad_norm": 16.375, + "kl": 0.4834175109863281, + "learning_rate": 5e-06, + "logits/chosen": -27103404.0, + "logits/rejected": -56363736.0, + "logps/chosen": -313.13079833984375, + "logps/rejected": -731.333251953125, + "loss": 0.0417, + "rewards/chosen": 5.4009504318237305, + "rewards/margins": 22.58290386199951, + "rewards/rejected": -17.18195343017578, + "step": 1609 + }, + { + "epoch": 0.4028524959339422, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50550870.4, + "logits/rejected": -43663108.571428575, + "logps/chosen": -511.809228515625, + "logps/rejected": -722.9559849330357, + "loss": 0.0157, + "rewards/chosen": 8.118496704101563, + "rewards/margins": 23.238019888741633, + "rewards/rejected": -15.119523184640068, + "step": 1610 + }, + { + "epoch": 0.4031027148755161, + "grad_norm": 1.375, + "kl": 2.832933187484741, + "learning_rate": 5e-06, + "logits/chosen": -67690344.72727273, + "logits/rejected": -51849491.692307696, + "logps/chosen": -452.74027876420456, + "logps/rejected": -715.4265324519231, + "loss": 0.0262, + "rewards/chosen": 6.586632468483665, + "rewards/margins": 24.123131491921164, + "rewards/rejected": -17.5364990234375, + "step": 1611 + }, + { + "epoch": 0.40335293381709, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48614493.538461536, + "logits/rejected": -75255394.9090909, + "logps/chosen": -316.8092698317308, + "logps/rejected": -580.0197531960227, + "loss": 0.0488, + "rewards/chosen": 5.223993741548979, + "rewards/margins": 18.292078458345856, + "rewards/rejected": -13.068084716796875, + "step": 1612 + }, + { + "epoch": 0.4036031527586638, + "grad_norm": 4.8125, + "kl": 7.0762939453125, + "learning_rate": 5e-06, + "logits/chosen": -49319916.0, + "logits/rejected": -64383792.0, + "logps/chosen": -453.5142822265625, + "logps/rejected": -615.13037109375, + "loss": 0.0563, + "rewards/chosen": 7.7760701179504395, + "rewards/margins": 22.334112644195557, + "rewards/rejected": -14.558042526245117, + "step": 1613 + }, + { + "epoch": 0.4038533717002377, + "grad_norm": 12.8125, + "kl": 0.6714655756950378, + "learning_rate": 5e-06, + "logits/chosen": -54289314.461538464, + "logits/rejected": -32443156.363636363, + "logps/chosen": -347.29867788461536, + "logps/rejected": -524.3008700284091, + "loss": 0.0431, + "rewards/chosen": 6.065758925217849, + "rewards/margins": 18.662302670778928, + "rewards/rejected": -12.59654374556108, + "step": 1614 + }, + { + "epoch": 0.40410359064181156, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16611586.461538462, + "logits/rejected": -58961117.09090909, + "logps/chosen": -291.66426908052887, + "logps/rejected": -606.5534002130681, + "loss": 0.032, + "rewards/chosen": 6.913573631873498, + "rewards/margins": 19.429420577896224, + "rewards/rejected": -12.515846946022727, + "step": 1615 + }, + { + "epoch": 0.40435380958338546, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42278272.0, + "logits/rejected": -67463104.0, + "logps/chosen": -469.428271484375, + "logps/rejected": -714.1205357142857, + "loss": 0.0212, + "rewards/chosen": 9.806190490722656, + "rewards/margins": 22.932485307965962, + "rewards/rejected": -13.126294817243304, + "step": 1616 + }, + { + "epoch": 0.40460402852495936, + "grad_norm": 10.625, + "kl": 10.973780632019043, + "learning_rate": 5e-06, + "logits/chosen": -49494793.14285714, + "logits/rejected": -11222953.6, + "logps/chosen": -471.3872767857143, + "logps/rejected": -633.00107421875, + "loss": 0.1014, + "rewards/chosen": 7.461336408342634, + "rewards/margins": 20.24663303920201, + "rewards/rejected": -12.785296630859374, + "step": 1617 + }, + { + "epoch": 0.4048542474665332, + "grad_norm": 12.5625, + "kl": 4.824309825897217, + "learning_rate": 5e-06, + "logits/chosen": -20458724.923076924, + "logits/rejected": -20059194.181818184, + "logps/chosen": -284.76647010216345, + "logps/rejected": -415.1149236505682, + "loss": 0.1211, + "rewards/chosen": 4.953625018780048, + "rewards/margins": 14.63243860631556, + "rewards/rejected": -9.678813587535512, + "step": 1618 + }, + { + "epoch": 0.4051044664081071, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64775936.0, + "logits/rejected": -49273388.307692304, + "logps/chosen": -456.61550071022725, + "logps/rejected": -896.0818810096154, + "loss": 0.0161, + "rewards/chosen": 8.672219016335227, + "rewards/margins": 23.0474807632553, + "rewards/rejected": -14.375261746920073, + "step": 1619 + }, + { + "epoch": 0.40535468534968094, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41676608.0, + "logits/rejected": -71040580.92307693, + "logps/chosen": -391.15438565340907, + "logps/rejected": -545.0922475961538, + "loss": 0.0149, + "rewards/chosen": 8.871658325195312, + "rewards/margins": 19.90182847243089, + "rewards/rejected": -11.030170147235577, + "step": 1620 + }, + { + "epoch": 0.40560490429125484, + "grad_norm": 6.21875, + "kl": 0.8442357778549194, + "learning_rate": 5e-06, + "logits/chosen": -71265072.0, + "logits/rejected": -44288000.0, + "logps/chosen": -338.06337483723956, + "logps/rejected": -408.3837890625, + "loss": 0.0662, + "rewards/chosen": 6.6371409098307295, + "rewards/margins": 16.58682696024577, + "rewards/rejected": -9.949686050415039, + "step": 1621 + }, + { + "epoch": 0.40585512323282874, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35222400.0, + "logits/rejected": -28654796.8, + "logps/chosen": -484.07769097222223, + "logps/rejected": -592.2565755208333, + "loss": 0.0137, + "rewards/chosen": 7.5685475667317705, + "rewards/margins": 19.422107950846353, + "rewards/rejected": -11.853560384114584, + "step": 1622 + }, + { + "epoch": 0.4061053421744026, + "grad_norm": 12.75, + "kl": 14.491351127624512, + "learning_rate": 5e-06, + "logits/chosen": -39417958.4, + "logits/rejected": -70630243.55555555, + "logps/chosen": -397.4002278645833, + "logps/rejected": -593.7132703993055, + "loss": 0.0673, + "rewards/chosen": 7.868872578938802, + "rewards/margins": 20.150497266981336, + "rewards/rejected": -12.281624688042534, + "step": 1623 + }, + { + "epoch": 0.4063555611159765, + "grad_norm": 16.5, + "kl": 21.086952209472656, + "learning_rate": 5e-06, + "logits/chosen": -76505792.0, + "logits/rejected": -49889544.0, + "logps/chosen": -423.59661865234375, + "logps/rejected": -575.0799560546875, + "loss": 0.0381, + "rewards/chosen": 9.007286071777344, + "rewards/margins": 22.115436553955078, + "rewards/rejected": -13.108150482177734, + "step": 1624 + }, + { + "epoch": 0.4066057800575504, + "grad_norm": 23.875, + "kl": 0.9055683016777039, + "learning_rate": 5e-06, + "logits/chosen": -70076640.0, + "logits/rejected": -24935338.666666668, + "logps/chosen": -345.0940348307292, + "logps/rejected": -418.3577067057292, + "loss": 0.0782, + "rewards/chosen": 6.653207778930664, + "rewards/margins": 12.748928705851238, + "rewards/rejected": -6.095720926920573, + "step": 1625 + }, + { + "epoch": 0.4068559989991242, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11061960.0, + "logits/rejected": -48311524.571428575, + "logps/chosen": -407.00556640625, + "logps/rejected": -803.0528041294643, + "loss": 0.0331, + "rewards/chosen": 7.969582366943359, + "rewards/margins": 27.628164781842912, + "rewards/rejected": -19.658582414899552, + "step": 1626 + }, + { + "epoch": 0.4071062179406981, + "grad_norm": 6.28125, + "kl": 1.2132682800292969, + "learning_rate": 5e-06, + "logits/chosen": -58869605.333333336, + "logits/rejected": -61672352.0, + "logps/chosen": -458.9873046875, + "logps/rejected": -614.5347493489584, + "loss": 0.0175, + "rewards/chosen": 10.299263000488281, + "rewards/margins": 23.059705098470054, + "rewards/rejected": -12.760442097981771, + "step": 1627 + }, + { + "epoch": 0.40735643688227197, + "grad_norm": 20.25, + "kl": 6.165335655212402, + "learning_rate": 5e-06, + "logits/chosen": -50929296.0, + "logits/rejected": -60310784.0, + "logps/chosen": -383.41181640625, + "logps/rejected": -456.39090401785717, + "loss": 0.0709, + "rewards/chosen": 6.504924774169922, + "rewards/margins": 14.75384793962751, + "rewards/rejected": -8.248923165457589, + "step": 1628 + }, + { + "epoch": 0.40760665582384586, + "grad_norm": 8.25, + "kl": 11.097431182861328, + "learning_rate": 5e-06, + "logits/chosen": -74142600.0, + "logits/rejected": -26756688.0, + "logps/chosen": -420.35455322265625, + "logps/rejected": -513.0372314453125, + "loss": 0.0941, + "rewards/chosen": 8.567237854003906, + "rewards/margins": 17.679146766662598, + "rewards/rejected": -9.111908912658691, + "step": 1629 + }, + { + "epoch": 0.40785687476541976, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75845862.4, + "logits/rejected": -54046678.85714286, + "logps/chosen": -351.93271484375, + "logps/rejected": -742.3795340401786, + "loss": 0.0315, + "rewards/chosen": 6.866731262207031, + "rewards/margins": 20.888557870047432, + "rewards/rejected": -14.021826607840401, + "step": 1630 + }, + { + "epoch": 0.4081070937069936, + "grad_norm": 8.4375, + "kl": 9.062006950378418, + "learning_rate": 5e-06, + "logits/chosen": -38657929.14285714, + "logits/rejected": -57676588.8, + "logps/chosen": -376.6824428013393, + "logps/rejected": -612.24580078125, + "loss": 0.0255, + "rewards/chosen": 9.395347595214844, + "rewards/margins": 20.158099365234374, + "rewards/rejected": -10.76275177001953, + "step": 1631 + }, + { + "epoch": 0.4083573126485675, + "grad_norm": 3.796875, + "kl": 3.86767840385437, + "learning_rate": 5e-06, + "logits/chosen": -48052172.8, + "logits/rejected": -49687328.0, + "logps/chosen": -414.1460286458333, + "logps/rejected": -582.6489800347222, + "loss": 0.0389, + "rewards/chosen": 8.310963948567709, + "rewards/margins": 24.010028415256077, + "rewards/rejected": -15.699064466688368, + "step": 1632 + }, + { + "epoch": 0.40860753159014135, + "grad_norm": 13.75, + "kl": 23.204315185546875, + "learning_rate": 5e-06, + "logits/chosen": -63010349.71428572, + "logits/rejected": -94101990.4, + "logps/chosen": -445.02322823660717, + "logps/rejected": -709.43779296875, + "loss": 0.0838, + "rewards/chosen": 8.850147247314453, + "rewards/margins": 25.801343536376955, + "rewards/rejected": -16.9511962890625, + "step": 1633 + }, + { + "epoch": 0.40885775053171525, + "grad_norm": 13.625, + "kl": 13.236379623413086, + "learning_rate": 5e-06, + "logits/chosen": -78423835.42857143, + "logits/rejected": -59609932.8, + "logps/chosen": -480.70584542410717, + "logps/rejected": -621.876708984375, + "loss": 0.1102, + "rewards/chosen": 7.837894984654018, + "rewards/margins": 19.3686765398298, + "rewards/rejected": -11.530781555175782, + "step": 1634 + }, + { + "epoch": 0.40910796947328915, + "grad_norm": 6.84375, + "kl": 10.435267448425293, + "learning_rate": 5e-06, + "logits/chosen": -51758596.571428575, + "logits/rejected": -10071542.4, + "logps/chosen": -363.88692801339283, + "logps/rejected": -366.076904296875, + "loss": 0.0477, + "rewards/chosen": 6.55420902797154, + "rewards/margins": 13.6871458871024, + "rewards/rejected": -7.13293685913086, + "step": 1635 + }, + { + "epoch": 0.409358188414863, + "grad_norm": 16.5, + "kl": 33.143619537353516, + "learning_rate": 5e-06, + "logits/chosen": -42388710.4, + "logits/rejected": -46028391.11111111, + "logps/chosen": -451.69182942708335, + "logps/rejected": -294.4460720486111, + "loss": 0.1177, + "rewards/chosen": 8.79393819173177, + "rewards/margins": 11.47926762898763, + "rewards/rejected": -2.6853294372558594, + "step": 1636 + }, + { + "epoch": 0.4096084073564369, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -78172153.6, + "logits/rejected": -57518752.0, + "logps/chosen": -461.49228515625, + "logps/rejected": -512.8747907366071, + "loss": 0.058, + "rewards/chosen": 10.112064361572266, + "rewards/margins": 21.680502210344585, + "rewards/rejected": -11.568437848772321, + "step": 1637 + }, + { + "epoch": 0.4098586262980108, + "grad_norm": 8.0625, + "kl": 0.9550074338912964, + "learning_rate": 5e-06, + "logits/chosen": -37777479.384615384, + "logits/rejected": -8331448.7272727275, + "logps/chosen": -371.5591571514423, + "logps/rejected": -548.6255326704545, + "loss": 0.0555, + "rewards/chosen": 6.262809166541467, + "rewards/margins": 16.604760016594735, + "rewards/rejected": -10.341950850053268, + "step": 1638 + }, + { + "epoch": 0.41010884523958463, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -96284252.44444445, + "logits/rejected": -36917922.13333333, + "logps/chosen": -450.6339518229167, + "logps/rejected": -432.86627604166665, + "loss": 0.0291, + "rewards/chosen": 10.473131815592447, + "rewards/margins": 18.828085327148436, + "rewards/rejected": -8.354953511555989, + "step": 1639 + }, + { + "epoch": 0.41035906418115853, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58175668.36363637, + "logits/rejected": -41718350.76923077, + "logps/chosen": -377.9964488636364, + "logps/rejected": -553.6769831730769, + "loss": 0.0313, + "rewards/chosen": 7.573543201793324, + "rewards/margins": 16.668775651838395, + "rewards/rejected": -9.095232450045073, + "step": 1640 + }, + { + "epoch": 0.41060928312273237, + "grad_norm": 19.5, + "kl": 4.1324872970581055, + "learning_rate": 5e-06, + "logits/chosen": -67599581.53846154, + "logits/rejected": -45677355.63636363, + "logps/chosen": -262.44119966947113, + "logps/rejected": -637.9925426136364, + "loss": 0.1267, + "rewards/chosen": 6.753439683180589, + "rewards/margins": 16.507864732008713, + "rewards/rejected": -9.754425048828125, + "step": 1641 + }, + { + "epoch": 0.41085950206430627, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52601821.86666667, + "logits/rejected": -96512782.22222222, + "logps/chosen": -368.4419270833333, + "logps/rejected": -725.8990885416666, + "loss": 0.0775, + "rewards/chosen": 6.6067352294921875, + "rewards/margins": 19.85471513536241, + "rewards/rejected": -13.247979905870226, + "step": 1642 + }, + { + "epoch": 0.41110972100588017, + "grad_norm": 4.96875, + "kl": 2.2518508434295654, + "learning_rate": 5e-06, + "logits/chosen": -39417658.18181818, + "logits/rejected": -38469410.461538464, + "logps/chosen": -358.3555353338068, + "logps/rejected": -532.1684945913462, + "loss": 0.0695, + "rewards/chosen": 7.873641274192116, + "rewards/margins": 19.963116465748605, + "rewards/rejected": -12.08947519155649, + "step": 1643 + }, + { + "epoch": 0.411359939947454, + "grad_norm": 4.53125, + "kl": 2.1452293395996094, + "learning_rate": 5e-06, + "logits/chosen": -53285396.0, + "logits/rejected": -36789344.0, + "logps/chosen": -388.9821472167969, + "logps/rejected": -507.4839782714844, + "loss": 0.0338, + "rewards/chosen": 7.502474308013916, + "rewards/margins": 16.050246715545654, + "rewards/rejected": -8.547772407531738, + "step": 1644 + }, + { + "epoch": 0.4116101588890279, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59529109.333333336, + "logits/rejected": -43106700.8, + "logps/chosen": -441.4747721354167, + "logps/rejected": -614.9171875, + "loss": 0.0019, + "rewards/chosen": 7.371741400824653, + "rewards/margins": 19.044385443793402, + "rewards/rejected": -11.67264404296875, + "step": 1645 + }, + { + "epoch": 0.41186037783060175, + "grad_norm": 5.59375, + "kl": 6.3410139083862305, + "learning_rate": 5e-06, + "logits/chosen": -72831241.84615384, + "logits/rejected": -40293742.54545455, + "logps/chosen": -422.9115459735577, + "logps/rejected": -579.6659712357955, + "loss": 0.0452, + "rewards/chosen": 8.884330749511719, + "rewards/margins": 19.988499728116125, + "rewards/rejected": -11.104168978604404, + "step": 1646 + }, + { + "epoch": 0.41211059677217565, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -106576910.22222222, + "logits/rejected": -51364804.266666666, + "logps/chosen": -362.53436957465277, + "logps/rejected": -607.9970052083333, + "loss": 0.0313, + "rewards/chosen": 7.3232532077365455, + "rewards/margins": 22.442031690809465, + "rewards/rejected": -15.118778483072917, + "step": 1647 + }, + { + "epoch": 0.41236081571374955, + "grad_norm": 1.1171875, + "kl": 0.24424616992473602, + "learning_rate": 5e-06, + "logits/chosen": -33650903.27272727, + "logits/rejected": -78140923.07692307, + "logps/chosen": -324.45192649147725, + "logps/rejected": -701.568359375, + "loss": 0.0159, + "rewards/chosen": 6.9267661354758525, + "rewards/margins": 22.86611597021143, + "rewards/rejected": -15.939349834735577, + "step": 1648 + }, + { + "epoch": 0.4126110346553234, + "grad_norm": 5.46875, + "kl": 10.869810104370117, + "learning_rate": 5e-06, + "logits/chosen": -78130243.76470588, + "logits/rejected": -51810761.14285714, + "logps/chosen": -393.2431640625, + "logps/rejected": -599.7381417410714, + "loss": 0.0693, + "rewards/chosen": 7.30051466997932, + "rewards/margins": 23.438502079298516, + "rewards/rejected": -16.137987409319198, + "step": 1649 + }, + { + "epoch": 0.4128612535968973, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18088593.14285714, + "logits/rejected": -36917987.2, + "logps/chosen": -295.1450892857143, + "logps/rejected": -672.609765625, + "loss": 0.0635, + "rewards/chosen": 5.77373286655971, + "rewards/margins": 21.42665339878627, + "rewards/rejected": -15.652920532226563, + "step": 1650 + }, + { + "epoch": 0.41311147253847114, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48716243.2, + "logits/rejected": 64390491.428571425, + "logps/chosen": -354.59130859375, + "logps/rejected": -505.39439174107144, + "loss": 0.0315, + "rewards/chosen": 5.775858306884766, + "rewards/margins": 15.95925805228097, + "rewards/rejected": -10.183399745396205, + "step": 1651 + }, + { + "epoch": 0.41336169148004503, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59955066.18181818, + "logits/rejected": -48774449.23076923, + "logps/chosen": -418.4396306818182, + "logps/rejected": -464.0920973557692, + "loss": 0.0546, + "rewards/chosen": 6.665956670587713, + "rewards/margins": 15.543499019596126, + "rewards/rejected": -8.877542349008413, + "step": 1652 + }, + { + "epoch": 0.41361191042161893, + "grad_norm": 7.78125, + "kl": 1.8725414276123047, + "learning_rate": 5e-06, + "logits/chosen": -90364060.44444445, + "logits/rejected": -52314875.733333334, + "logps/chosen": -370.7450900607639, + "logps/rejected": -511.53444010416666, + "loss": 0.0213, + "rewards/chosen": 6.4102355109320746, + "rewards/margins": 17.299754757351344, + "rewards/rejected": -10.88951924641927, + "step": 1653 + }, + { + "epoch": 0.4138621293631928, + "grad_norm": 10.8125, + "kl": 15.272687911987305, + "learning_rate": 5e-06, + "logits/chosen": -67907895.46666667, + "logits/rejected": -81382094.22222222, + "logps/chosen": -556.381640625, + "logps/rejected": -735.2584635416666, + "loss": 0.0178, + "rewards/chosen": 9.716265869140624, + "rewards/margins": 28.006650797526042, + "rewards/rejected": -18.290384928385418, + "step": 1654 + }, + { + "epoch": 0.4141123483047667, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59433717.333333336, + "logits/rejected": -44394624.0, + "logps/chosen": -374.85582139756946, + "logps/rejected": -501.3428059895833, + "loss": 0.0203, + "rewards/chosen": 7.185723198784722, + "rewards/margins": 20.551968722873266, + "rewards/rejected": -13.366245524088542, + "step": 1655 + }, + { + "epoch": 0.4143625672463406, + "grad_norm": 12.0625, + "kl": 3.02698016166687, + "learning_rate": 5e-06, + "logits/chosen": -47742965.333333336, + "logits/rejected": -45709952.0, + "logps/chosen": -408.3413492838542, + "logps/rejected": -440.6381022135417, + "loss": 0.037, + "rewards/chosen": 6.543295542399089, + "rewards/margins": 18.316186269124348, + "rewards/rejected": -11.77289072672526, + "step": 1656 + }, + { + "epoch": 0.4146127861879144, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39354192.0, + "logits/rejected": -52151040.0, + "logps/chosen": -374.6142985026042, + "logps/rejected": -519.3289794921875, + "loss": 0.0237, + "rewards/chosen": 9.195215861002604, + "rewards/margins": 20.375022888183594, + "rewards/rejected": -11.17980702718099, + "step": 1657 + }, + { + "epoch": 0.4148630051294883, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34607261.09090909, + "logits/rejected": -39773235.692307696, + "logps/chosen": -256.5924183238636, + "logps/rejected": -613.1406625600962, + "loss": 0.0417, + "rewards/chosen": 6.530414234508168, + "rewards/margins": 22.39778249247091, + "rewards/rejected": -15.86736825796274, + "step": 1658 + }, + { + "epoch": 0.41511322407106216, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33748395.428571425, + "logits/rejected": -49825824.0, + "logps/chosen": -406.86886160714283, + "logps/rejected": -685.704931640625, + "loss": 0.0535, + "rewards/chosen": 6.1253525870186945, + "rewards/margins": 21.165071214948384, + "rewards/rejected": -15.039718627929688, + "step": 1659 + }, + { + "epoch": 0.41536344301263606, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44813175.46666667, + "logits/rejected": -58289070.222222224, + "logps/chosen": -314.68037109375, + "logps/rejected": -754.6195746527778, + "loss": 0.0667, + "rewards/chosen": 4.75397694905599, + "rewards/margins": 18.113082377115887, + "rewards/rejected": -13.359105428059896, + "step": 1660 + }, + { + "epoch": 0.41561366195420996, + "grad_norm": 3.734375, + "kl": 8.254678726196289, + "learning_rate": 5e-06, + "logits/chosen": -45960979.692307696, + "logits/rejected": -9046398.545454545, + "logps/chosen": -351.25345552884613, + "logps/rejected": -513.6558061079545, + "loss": 0.0932, + "rewards/chosen": 7.35093982403095, + "rewards/margins": 19.912302350664472, + "rewards/rejected": -12.561362526633523, + "step": 1661 + }, + { + "epoch": 0.4158638808957838, + "grad_norm": 7.96875, + "kl": 15.150541305541992, + "learning_rate": 5e-06, + "logits/chosen": -41836072.0, + "logits/rejected": -40394258.666666664, + "logps/chosen": -461.226318359375, + "logps/rejected": -516.92333984375, + "loss": 0.0686, + "rewards/chosen": 8.58858871459961, + "rewards/margins": 20.047770182291664, + "rewards/rejected": -11.459181467692057, + "step": 1662 + }, + { + "epoch": 0.4161140998373577, + "grad_norm": 4.15625, + "kl": 2.2227554321289062, + "learning_rate": 5e-06, + "logits/chosen": -55155347.2, + "logits/rejected": -43960896.0, + "logps/chosen": -517.47265625, + "logps/rejected": -523.0019182477679, + "loss": 0.007, + "rewards/chosen": 9.063607788085937, + "rewards/margins": 20.95015651157924, + "rewards/rejected": -11.886548723493304, + "step": 1663 + }, + { + "epoch": 0.41636431877893154, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27399669.333333332, + "logits/rejected": -40272230.4, + "logps/chosen": -259.3344455295139, + "logps/rejected": -530.4388997395833, + "loss": 0.0684, + "rewards/chosen": 7.452676561143663, + "rewards/margins": 19.821204800075954, + "rewards/rejected": -12.368528238932292, + "step": 1664 + }, + { + "epoch": 0.41661453772050544, + "grad_norm": 14.5, + "kl": 2.8173513412475586, + "learning_rate": 5e-06, + "logits/chosen": -29017962.666666668, + "logits/rejected": -96421717.33333333, + "logps/chosen": -465.97352430555554, + "logps/rejected": -787.6481770833333, + "loss": 0.0728, + "rewards/chosen": 6.756870693630642, + "rewards/margins": 24.125079515245226, + "rewards/rejected": -17.368208821614584, + "step": 1665 + }, + { + "epoch": 0.41686475666207934, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57357125.81818182, + "logits/rejected": -57023867.07692308, + "logps/chosen": -482.08540482954544, + "logps/rejected": -676.1416015625, + "loss": 0.0493, + "rewards/chosen": 8.817036021839488, + "rewards/margins": 25.959117649318454, + "rewards/rejected": -17.142081627478966, + "step": 1666 + }, + { + "epoch": 0.4171149756036532, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32513943.272727273, + "logits/rejected": -35667318.15384615, + "logps/chosen": -353.6112171519886, + "logps/rejected": -454.02403846153845, + "loss": 0.0095, + "rewards/chosen": 7.150106950239702, + "rewards/margins": 17.545078357616504, + "rewards/rejected": -10.394971407376802, + "step": 1667 + }, + { + "epoch": 0.4173651945452271, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14909945.6, + "logits/rejected": -34631609.14285714, + "logps/chosen": -326.430078125, + "logps/rejected": -513.4900948660714, + "loss": 0.053, + "rewards/chosen": 7.146656036376953, + "rewards/margins": 18.982521602085658, + "rewards/rejected": -11.835865565708705, + "step": 1668 + }, + { + "epoch": 0.4176154134868009, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38462481.45454545, + "logits/rejected": -21171271.384615384, + "logps/chosen": -309.63822798295456, + "logps/rejected": -474.3869441105769, + "loss": 0.0396, + "rewards/chosen": 6.054537686434659, + "rewards/margins": 19.035671954388384, + "rewards/rejected": -12.981134267953726, + "step": 1669 + }, + { + "epoch": 0.4178656324283748, + "grad_norm": 14.625, + "kl": 1.3895353078842163, + "learning_rate": 5e-06, + "logits/chosen": -38947202.90909091, + "logits/rejected": -26649139.692307692, + "logps/chosen": -319.7601873224432, + "logps/rejected": -688.4923377403846, + "loss": 0.0854, + "rewards/chosen": 6.717070146040483, + "rewards/margins": 19.209234170980388, + "rewards/rejected": -12.492164024939903, + "step": 1670 + }, + { + "epoch": 0.4181158513699487, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52775790.222222224, + "logits/rejected": -43850641.06666667, + "logps/chosen": -549.8658854166666, + "logps/rejected": -596.2606770833333, + "loss": 0.011, + "rewards/chosen": 8.97830539279514, + "rewards/margins": 24.045494927300346, + "rewards/rejected": -15.067189534505209, + "step": 1671 + }, + { + "epoch": 0.41836607031152256, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50244681.14285714, + "logits/rejected": -51207126.5882353, + "logps/chosen": -367.73060825892856, + "logps/rejected": -696.1962316176471, + "loss": 0.0021, + "rewards/chosen": 8.864627293178014, + "rewards/margins": 23.465502250094374, + "rewards/rejected": -14.60087495691636, + "step": 1672 + }, + { + "epoch": 0.41861628925309646, + "grad_norm": 8.4375, + "kl": 2.4454410076141357, + "learning_rate": 5e-06, + "logits/chosen": -73214248.72727273, + "logits/rejected": -24650116.923076924, + "logps/chosen": -452.2507990056818, + "logps/rejected": -524.0574669471154, + "loss": 0.0495, + "rewards/chosen": 8.795964327725498, + "rewards/margins": 22.801457491788, + "rewards/rejected": -14.0054931640625, + "step": 1673 + }, + { + "epoch": 0.41886650819467036, + "grad_norm": 10.5625, + "kl": 8.477943420410156, + "learning_rate": 5e-06, + "logits/chosen": -76193516.3076923, + "logits/rejected": -40240570.18181818, + "logps/chosen": -400.3032977764423, + "logps/rejected": -570.1125710227273, + "loss": 0.121, + "rewards/chosen": 6.9015667255108175, + "rewards/margins": 16.85194583706089, + "rewards/rejected": -9.95037911155007, + "step": 1674 + }, + { + "epoch": 0.4191167271362442, + "grad_norm": 5.96875, + "kl": 0.4574432373046875, + "learning_rate": 5e-06, + "logits/chosen": -59518041.6, + "logits/rejected": -59382976.0, + "logps/chosen": -531.534423828125, + "logps/rejected": -778.0603376116071, + "loss": 0.0061, + "rewards/chosen": 6.635273742675781, + "rewards/margins": 21.527843148367744, + "rewards/rejected": -14.892569405691964, + "step": 1675 + }, + { + "epoch": 0.4193669460778181, + "grad_norm": 3.890625, + "kl": 6.50621223449707, + "learning_rate": 5e-06, + "logits/chosen": -60430308.571428575, + "logits/rejected": 66448550.4, + "logps/chosen": -397.740234375, + "logps/rejected": -513.68740234375, + "loss": 0.0082, + "rewards/chosen": 7.4650998796735495, + "rewards/margins": 22.64015938895089, + "rewards/rejected": -15.175059509277343, + "step": 1676 + }, + { + "epoch": 0.41961716501939195, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52323658.666666664, + "logits/rejected": -35400945.06666667, + "logps/chosen": -359.4195963541667, + "logps/rejected": -426.90074869791664, + "loss": 0.0252, + "rewards/chosen": 9.082895067003038, + "rewards/margins": 18.942035081651476, + "rewards/rejected": -9.859140014648437, + "step": 1677 + }, + { + "epoch": 0.41986738396096585, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44130116.92307692, + "logits/rejected": -55947496.72727273, + "logps/chosen": -370.26558743990387, + "logps/rejected": -570.3140980113636, + "loss": 0.0344, + "rewards/chosen": 6.565777118389423, + "rewards/margins": 19.24940832178076, + "rewards/rejected": -12.683631203391336, + "step": 1678 + }, + { + "epoch": 0.42011760290253974, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42849962.666666664, + "logits/rejected": -47988442.666666664, + "logps/chosen": -351.1490071614583, + "logps/rejected": -748.9602864583334, + "loss": 0.0417, + "rewards/chosen": 5.431649525960286, + "rewards/margins": 20.14191436767578, + "rewards/rejected": -14.710264841715494, + "step": 1679 + }, + { + "epoch": 0.4203678218441136, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55463368.0, + "logits/rejected": -58449936.0, + "logps/chosen": -352.3590087890625, + "logps/rejected": -500.0791015625, + "loss": 0.0149, + "rewards/chosen": 6.365176200866699, + "rewards/margins": 17.255043029785156, + "rewards/rejected": -10.889866828918457, + "step": 1680 + }, + { + "epoch": 0.4206180407856875, + "grad_norm": 14.125, + "kl": 2.4271063804626465, + "learning_rate": 5e-06, + "logits/chosen": -42911021.71428572, + "logits/rejected": -66143411.2, + "logps/chosen": -361.56539481026783, + "logps/rejected": -820.8953125, + "loss": 0.0301, + "rewards/chosen": 6.682152884347098, + "rewards/margins": 24.351752798897877, + "rewards/rejected": -17.66959991455078, + "step": 1681 + }, + { + "epoch": 0.42086825972726133, + "grad_norm": 10.0, + "kl": 14.30746078491211, + "learning_rate": 5e-06, + "logits/chosen": -47561053.86666667, + "logits/rejected": -16069086.222222222, + "logps/chosen": -474.1516927083333, + "logps/rejected": -803.1561957465278, + "loss": 0.0429, + "rewards/chosen": 8.564068603515626, + "rewards/margins": 22.24999525282118, + "rewards/rejected": -13.685926649305555, + "step": 1682 + }, + { + "epoch": 0.4211184786688352, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -90078016.0, + "logits/rejected": -42238554.35294118, + "logps/chosen": -424.76217215401783, + "logps/rejected": -540.1424632352941, + "loss": 0.0389, + "rewards/chosen": 6.276822771344866, + "rewards/margins": 18.797487659614628, + "rewards/rejected": -12.520664888269762, + "step": 1683 + }, + { + "epoch": 0.4213686976104091, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45041888.0, + "logits/rejected": -24466570.0, + "logps/chosen": -393.0665283203125, + "logps/rejected": -418.3067321777344, + "loss": 0.043, + "rewards/chosen": 5.994039535522461, + "rewards/margins": 13.953697204589844, + "rewards/rejected": -7.959657669067383, + "step": 1684 + }, + { + "epoch": 0.42161891655198297, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60438813.09090909, + "logits/rejected": -59543236.92307692, + "logps/chosen": -368.94462446732956, + "logps/rejected": -607.21142578125, + "loss": 0.0419, + "rewards/chosen": 6.691439541903409, + "rewards/margins": 20.946162723994757, + "rewards/rejected": -14.254723182091347, + "step": 1685 + }, + { + "epoch": 0.42186913549355687, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46280585.14285714, + "logits/rejected": -25187708.8, + "logps/chosen": -412.6154087611607, + "logps/rejected": -552.739453125, + "loss": 0.0372, + "rewards/chosen": 5.245063236781529, + "rewards/margins": 13.877762821742465, + "rewards/rejected": -8.632699584960937, + "step": 1686 + }, + { + "epoch": 0.42211935443513077, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38903095.27272727, + "logits/rejected": -59422163.692307696, + "logps/chosen": -370.1875, + "logps/rejected": -626.2998046875, + "loss": 0.0222, + "rewards/chosen": 8.709320761940695, + "rewards/margins": 23.908850556486968, + "rewards/rejected": -15.199529794546274, + "step": 1687 + }, + { + "epoch": 0.4223695733767046, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44666617.6, + "logits/rejected": -49186546.28571428, + "logps/chosen": -456.327294921875, + "logps/rejected": -634.3900669642857, + "loss": 0.0149, + "rewards/chosen": 8.881990051269531, + "rewards/margins": 21.954424612862724, + "rewards/rejected": -13.072434561593193, + "step": 1688 + }, + { + "epoch": 0.4226197923182785, + "grad_norm": 2.375, + "kl": 14.093932151794434, + "learning_rate": 5e-06, + "logits/chosen": -26737600.0, + "logits/rejected": -51014341.333333336, + "logps/chosen": -439.2331136067708, + "logps/rejected": -560.5958658854166, + "loss": 0.0632, + "rewards/chosen": 9.209739685058594, + "rewards/margins": 22.60663859049479, + "rewards/rejected": -13.396898905436197, + "step": 1689 + }, + { + "epoch": 0.42287001125985235, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49018087.384615384, + "logits/rejected": -55935738.18181818, + "logps/chosen": -382.02974759615387, + "logps/rejected": -668.0794566761364, + "loss": 0.0379, + "rewards/chosen": 6.03826669546274, + "rewards/margins": 20.63727265471345, + "rewards/rejected": -14.59900595925071, + "step": 1690 + }, + { + "epoch": 0.42312023020142625, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35056898.666666664, + "logits/rejected": -40455616.0, + "logps/chosen": -463.4138590494792, + "logps/rejected": -415.2669270833333, + "loss": 0.0331, + "rewards/chosen": 7.657519022623698, + "rewards/margins": 18.174386978149414, + "rewards/rejected": -10.516867955525717, + "step": 1691 + }, + { + "epoch": 0.42337044914300015, + "grad_norm": 7.8125, + "kl": 2.8107411861419678, + "learning_rate": 5e-06, + "logits/chosen": -56132125.538461536, + "logits/rejected": -44724215.27272727, + "logps/chosen": -313.55093149038464, + "logps/rejected": -675.9422940340909, + "loss": 0.0471, + "rewards/chosen": 7.035715543306791, + "rewards/margins": 19.523497414755653, + "rewards/rejected": -12.487781871448863, + "step": 1692 + }, + { + "epoch": 0.423620668084574, + "grad_norm": 10.9375, + "kl": 14.037200927734375, + "learning_rate": 5e-06, + "logits/chosen": -47686311.11111111, + "logits/rejected": -44932746.666666664, + "logps/chosen": -362.4853515625, + "logps/rejected": -262.8715006510417, + "loss": 0.0882, + "rewards/chosen": 7.0972574022081165, + "rewards/margins": 14.097117529975044, + "rewards/rejected": -6.999860127766927, + "step": 1693 + }, + { + "epoch": 0.4238708870261479, + "grad_norm": 7.71875, + "kl": 0.16559919714927673, + "learning_rate": 5e-06, + "logits/chosen": -65217472.0, + "logits/rejected": -51975142.4, + "logps/chosen": -369.78770228794644, + "logps/rejected": -568.06142578125, + "loss": 0.0679, + "rewards/chosen": 7.244997297014509, + "rewards/margins": 19.16612003871373, + "rewards/rejected": -11.921122741699218, + "step": 1694 + }, + { + "epoch": 0.42412110596772173, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42784176.0, + "logits/rejected": 75429792.0, + "logps/chosen": -325.5848388671875, + "logps/rejected": -533.5114135742188, + "loss": 0.0583, + "rewards/chosen": 5.6966447830200195, + "rewards/margins": 17.175299644470215, + "rewards/rejected": -11.478654861450195, + "step": 1695 + }, + { + "epoch": 0.42437132490929563, + "grad_norm": 1.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16292379.2, + "logits/rejected": -46763830.85714286, + "logps/chosen": -578.477978515625, + "logps/rejected": -557.65380859375, + "loss": 0.002, + "rewards/chosen": 8.897708129882812, + "rewards/margins": 22.26006840297154, + "rewards/rejected": -13.362360273088727, + "step": 1696 + }, + { + "epoch": 0.42462154385086953, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56680969.14285714, + "logits/rejected": -45608990.11764706, + "logps/chosen": -307.37137276785717, + "logps/rejected": -507.7455193014706, + "loss": 0.0488, + "rewards/chosen": 5.891669137137277, + "rewards/margins": 15.873952785459888, + "rewards/rejected": -9.98228364832261, + "step": 1697 + }, + { + "epoch": 0.4248717627924434, + "grad_norm": 1.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51752352.0, + "logits/rejected": -84680849.06666666, + "logps/chosen": -445.4646809895833, + "logps/rejected": -563.323046875, + "loss": 0.0105, + "rewards/chosen": 7.810881720648871, + "rewards/margins": 19.430758836534288, + "rewards/rejected": -11.619877115885417, + "step": 1698 + }, + { + "epoch": 0.4251219817340173, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58546265.6, + "logits/rejected": -35943516.44444445, + "logps/chosen": -314.38645833333334, + "logps/rejected": -482.13525390625, + "loss": 0.0876, + "rewards/chosen": 4.787557983398438, + "rewards/margins": 16.967413330078124, + "rewards/rejected": -12.179855346679688, + "step": 1699 + }, + { + "epoch": 0.4253722006755911, + "grad_norm": 3.28125, + "kl": 2.7685952186584473, + "learning_rate": 5e-06, + "logits/chosen": -48780144.0, + "logits/rejected": -62419082.666666664, + "logps/chosen": -395.3228352864583, + "logps/rejected": -498.8789469401042, + "loss": 0.018, + "rewards/chosen": 7.900559743245442, + "rewards/margins": 19.62813949584961, + "rewards/rejected": -11.727579752604166, + "step": 1700 + }, + { + "epoch": 0.425622419617165, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62224362.666666664, + "logits/rejected": -64693660.44444445, + "logps/chosen": -382.1701253255208, + "logps/rejected": -582.4524197048611, + "loss": 0.0214, + "rewards/chosen": 6.262440999348958, + "rewards/margins": 22.214392768012154, + "rewards/rejected": -15.951951768663195, + "step": 1701 + }, + { + "epoch": 0.4258726385587389, + "grad_norm": 18.375, + "kl": 12.94474983215332, + "learning_rate": 5e-06, + "logits/chosen": -39644448.0, + "logits/rejected": -38159281.45454545, + "logps/chosen": -507.21739783653845, + "logps/rejected": -441.9461558948864, + "loss": 0.1084, + "rewards/chosen": 7.649361243614783, + "rewards/margins": 18.70295496587153, + "rewards/rejected": -11.053593722256748, + "step": 1702 + }, + { + "epoch": 0.42612285750031276, + "grad_norm": 2.953125, + "kl": 7.729244232177734, + "learning_rate": 5e-06, + "logits/chosen": -56215893.333333336, + "logits/rejected": 7993751.111111111, + "logps/chosen": -485.69983723958336, + "logps/rejected": -447.6650390625, + "loss": 0.0038, + "rewards/chosen": 7.767117309570312, + "rewards/margins": 17.454815673828126, + "rewards/rejected": -9.687698364257812, + "step": 1703 + }, + { + "epoch": 0.42637307644188666, + "grad_norm": 13.9375, + "kl": 11.646564483642578, + "learning_rate": 5e-06, + "logits/chosen": -45079652.571428575, + "logits/rejected": -47878854.4, + "logps/chosen": -329.24288504464283, + "logps/rejected": -497.2576171875, + "loss": 0.0542, + "rewards/chosen": 6.367137908935547, + "rewards/margins": 13.181292724609374, + "rewards/rejected": -6.814154815673828, + "step": 1704 + }, + { + "epoch": 0.42662329538346055, + "grad_norm": 12.3125, + "kl": 15.663991928100586, + "learning_rate": 5e-06, + "logits/chosen": -81114393.6, + "logits/rejected": -26551541.333333332, + "logps/chosen": -433.26328125, + "logps/rejected": -544.2171766493055, + "loss": 0.0342, + "rewards/chosen": 8.47106679280599, + "rewards/margins": 16.079865349663628, + "rewards/rejected": -7.608798556857639, + "step": 1705 + }, + { + "epoch": 0.4268735143250344, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54404027.428571425, + "logits/rejected": -54776655.058823526, + "logps/chosen": -346.2366420200893, + "logps/rejected": -583.1027113970588, + "loss": 0.0387, + "rewards/chosen": 6.454103197370257, + "rewards/margins": 17.682503035088548, + "rewards/rejected": -11.22839983771829, + "step": 1706 + }, + { + "epoch": 0.4271237332666083, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47848426.666666664, + "logits/rejected": -47116288.0, + "logps/chosen": -406.0160725911458, + "logps/rejected": -583.9917399088541, + "loss": 0.0357, + "rewards/chosen": 6.846551259358724, + "rewards/margins": 20.443429311116535, + "rewards/rejected": -13.596878051757812, + "step": 1707 + }, + { + "epoch": 0.42737395220818214, + "grad_norm": 7.0625, + "kl": 0.8099867701530457, + "learning_rate": 5e-06, + "logits/chosen": -52418409.14285714, + "logits/rejected": -22942369.6, + "logps/chosen": -359.94252232142856, + "logps/rejected": -464.1509765625, + "loss": 0.0537, + "rewards/chosen": 6.338814871651786, + "rewards/margins": 15.62763170514788, + "rewards/rejected": -9.288816833496094, + "step": 1708 + }, + { + "epoch": 0.42762417114975604, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43965810.666666664, + "logits/rejected": -29156752.0, + "logps/chosen": -405.357666015625, + "logps/rejected": -441.9420572916667, + "loss": 0.0604, + "rewards/chosen": 8.050973892211914, + "rewards/margins": 18.299269994099937, + "rewards/rejected": -10.248296101888021, + "step": 1709 + }, + { + "epoch": 0.42787439009132994, + "grad_norm": 3.390625, + "kl": 0.9258435964584351, + "learning_rate": 5e-06, + "logits/chosen": 9278760.727272727, + "logits/rejected": -35297260.307692304, + "logps/chosen": -430.04545454545456, + "logps/rejected": -614.0436823918269, + "loss": 0.0096, + "rewards/chosen": 9.246497414328836, + "rewards/margins": 22.055412772652154, + "rewards/rejected": -12.808915358323317, + "step": 1710 + }, + { + "epoch": 0.4281246090329038, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83876472.0, + "logits/rejected": -49734416.0, + "logps/chosen": -481.97442626953125, + "logps/rejected": -533.2142944335938, + "loss": 0.0487, + "rewards/chosen": 10.997098922729492, + "rewards/margins": 24.313775062561035, + "rewards/rejected": -13.316676139831543, + "step": 1711 + }, + { + "epoch": 0.4283748279744777, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35620320.0, + "logits/rejected": -46986229.333333336, + "logps/chosen": -318.46262613932294, + "logps/rejected": -546.9024251302084, + "loss": 0.0464, + "rewards/chosen": 6.131862640380859, + "rewards/margins": 18.097195943196617, + "rewards/rejected": -11.965333302815756, + "step": 1712 + }, + { + "epoch": 0.4286250469160515, + "grad_norm": 3.328125, + "kl": 7.502655506134033, + "learning_rate": 5e-06, + "logits/chosen": -52188790.4, + "logits/rejected": -48088932.571428575, + "logps/chosen": -428.955322265625, + "logps/rejected": -560.6171177455357, + "loss": 0.0462, + "rewards/chosen": 9.466432189941406, + "rewards/margins": 20.565653555733817, + "rewards/rejected": -11.099221365792411, + "step": 1713 + }, + { + "epoch": 0.4288752658576254, + "grad_norm": 12.4375, + "kl": 6.839764595031738, + "learning_rate": 5e-06, + "logits/chosen": -76820169.84615384, + "logits/rejected": -33810708.36363637, + "logps/chosen": -389.4198467548077, + "logps/rejected": -687.6620649857955, + "loss": 0.054, + "rewards/chosen": 7.906393197866587, + "rewards/margins": 22.690686632703233, + "rewards/rejected": -14.784293434836648, + "step": 1714 + }, + { + "epoch": 0.4291254847991993, + "grad_norm": 14.1875, + "kl": 0.8903192281723022, + "learning_rate": 5e-06, + "logits/chosen": -67645902.76923077, + "logits/rejected": -47539266.90909091, + "logps/chosen": -364.6746168870192, + "logps/rejected": -636.3119229403409, + "loss": 0.0725, + "rewards/chosen": 6.2124187762920675, + "rewards/margins": 17.916070898096045, + "rewards/rejected": -11.703652121803977, + "step": 1715 + }, + { + "epoch": 0.42937570374077316, + "grad_norm": 13.875, + "kl": 6.6375555992126465, + "learning_rate": 5e-06, + "logits/chosen": -47529417.84615385, + "logits/rejected": -27196328.727272727, + "logps/chosen": -345.6926457331731, + "logps/rejected": -647.9904563210227, + "loss": 0.1059, + "rewards/chosen": 7.3469725388747, + "rewards/margins": 19.345108192283792, + "rewards/rejected": -11.998135653409092, + "step": 1716 + }, + { + "epoch": 0.42962592268234706, + "grad_norm": 2.578125, + "kl": 3.3616461753845215, + "learning_rate": 5e-06, + "logits/chosen": -51825301.333333336, + "logits/rejected": -59047194.666666664, + "logps/chosen": -380.3544921875, + "logps/rejected": -611.5171712239584, + "loss": 0.0645, + "rewards/chosen": 7.639267603556315, + "rewards/margins": 20.722444534301758, + "rewards/rejected": -13.083176930745443, + "step": 1717 + }, + { + "epoch": 0.4298761416239209, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62654353.45454545, + "logits/rejected": -59912256.0, + "logps/chosen": -451.9142400568182, + "logps/rejected": -615.9150015024038, + "loss": 0.0032, + "rewards/chosen": 8.970243280584162, + "rewards/margins": 20.215569396119015, + "rewards/rejected": -11.245326115534855, + "step": 1718 + }, + { + "epoch": 0.4301263605654948, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68114275.55555555, + "logits/rejected": -59588885.333333336, + "logps/chosen": -301.17290581597223, + "logps/rejected": -670.4255859375, + "loss": 0.0567, + "rewards/chosen": 5.892020331488715, + "rewards/margins": 17.536803860134548, + "rewards/rejected": -11.644783528645833, + "step": 1719 + }, + { + "epoch": 0.4303765795070687, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24551925.333333332, + "logits/rejected": -40537634.13333333, + "logps/chosen": -388.76280381944446, + "logps/rejected": -572.955859375, + "loss": 0.0365, + "rewards/chosen": 7.7253163655598955, + "rewards/margins": 19.833175659179688, + "rewards/rejected": -12.107859293619791, + "step": 1720 + }, + { + "epoch": 0.43062679844864254, + "grad_norm": 14.1875, + "kl": 6.8203125, + "learning_rate": 5e-06, + "logits/chosen": -42791654.4, + "logits/rejected": -43397769.14285714, + "logps/chosen": -394.9171630859375, + "logps/rejected": -457.73325892857144, + "loss": 0.0326, + "rewards/chosen": 6.740348815917969, + "rewards/margins": 15.547513689313616, + "rewards/rejected": -8.807164873395648, + "step": 1721 + }, + { + "epoch": 0.43087701739021644, + "grad_norm": 5.21875, + "kl": 1.7184561491012573, + "learning_rate": 5e-06, + "logits/chosen": -44436499.692307696, + "logits/rejected": -66164450.90909091, + "logps/chosen": -395.1288311298077, + "logps/rejected": -783.4544566761364, + "loss": 0.0328, + "rewards/chosen": 7.838365408090445, + "rewards/margins": 20.7616872187261, + "rewards/rejected": -12.923321810635654, + "step": 1722 + }, + { + "epoch": 0.43112723633179034, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40842295.46666667, + "logits/rejected": -29641185.777777776, + "logps/chosen": -359.4515625, + "logps/rejected": -563.5245768229166, + "loss": 0.0496, + "rewards/chosen": 7.085377502441406, + "rewards/margins": 19.073401896158853, + "rewards/rejected": -11.988024393717447, + "step": 1723 + }, + { + "epoch": 0.4313774552733642, + "grad_norm": 13.4375, + "kl": 3.0812313556671143, + "learning_rate": 5e-06, + "logits/chosen": -62663808.0, + "logits/rejected": -30666104.888888888, + "logps/chosen": -366.15833333333336, + "logps/rejected": -353.538818359375, + "loss": 0.0747, + "rewards/chosen": 8.160554504394531, + "rewards/margins": 17.364532301161024, + "rewards/rejected": -9.203977796766493, + "step": 1724 + }, + { + "epoch": 0.4316276742149381, + "grad_norm": 2.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58860288.0, + "logits/rejected": -46976465.06666667, + "logps/chosen": -302.7123209635417, + "logps/rejected": -783.8604166666667, + "loss": 0.029, + "rewards/chosen": 6.328499688042535, + "rewards/margins": 22.725789727105035, + "rewards/rejected": -16.3972900390625, + "step": 1725 + }, + { + "epoch": 0.4318778931565119, + "grad_norm": 2.640625, + "kl": 2.352138042449951, + "learning_rate": 5e-06, + "logits/chosen": -59568710.4, + "logits/rejected": -44088978.28571428, + "logps/chosen": -542.09404296875, + "logps/rejected": -545.5778459821429, + "loss": 0.0051, + "rewards/chosen": 11.1889892578125, + "rewards/margins": 21.552378409249442, + "rewards/rejected": -10.363389151436943, + "step": 1726 + }, + { + "epoch": 0.4321281120980858, + "grad_norm": 7.03125, + "kl": 0.5862541198730469, + "learning_rate": 5e-06, + "logits/chosen": -70189672.72727273, + "logits/rejected": -38422168.615384616, + "logps/chosen": -467.0792347301136, + "logps/rejected": -574.7234074519231, + "loss": 0.0517, + "rewards/chosen": 9.150310169566762, + "rewards/margins": 17.625838593169526, + "rewards/rejected": -8.475528423602764, + "step": 1727 + }, + { + "epoch": 0.4323783310396597, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22833303.272727273, + "logits/rejected": -83639286.15384616, + "logps/chosen": -270.73439719460225, + "logps/rejected": -575.6094501201923, + "loss": 0.0341, + "rewards/chosen": 6.655555031516335, + "rewards/margins": 20.536275903661767, + "rewards/rejected": -13.880720872145433, + "step": 1728 + }, + { + "epoch": 0.43262854998123357, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66529584.0, + "logits/rejected": -73453941.33333333, + "logps/chosen": -408.705810546875, + "logps/rejected": -602.7852376302084, + "loss": 0.0243, + "rewards/chosen": 7.105537414550781, + "rewards/margins": 20.146995544433594, + "rewards/rejected": -13.041458129882812, + "step": 1729 + }, + { + "epoch": 0.43287876892280747, + "grad_norm": 9.75, + "kl": 2.8327815532684326, + "learning_rate": 5e-06, + "logits/chosen": -64335988.36363637, + "logits/rejected": -67166508.3076923, + "logps/chosen": -368.0704456676136, + "logps/rejected": -554.1681941105769, + "loss": 0.019, + "rewards/chosen": 6.490728204900568, + "rewards/margins": 17.96502301409528, + "rewards/rejected": -11.474294809194712, + "step": 1730 + }, + { + "epoch": 0.4331289878643813, + "grad_norm": 9.6875, + "kl": 3.4409854412078857, + "learning_rate": 5e-06, + "logits/chosen": -38359035.428571425, + "logits/rejected": -75107936.0, + "logps/chosen": -317.81326729910717, + "logps/rejected": -622.18583984375, + "loss": 0.0337, + "rewards/chosen": 6.8846620832170755, + "rewards/margins": 20.430414036342075, + "rewards/rejected": -13.545751953125, + "step": 1731 + }, + { + "epoch": 0.4333792068059552, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66212064.0, + "logits/rejected": -43982601.14285714, + "logps/chosen": -486.63984375, + "logps/rejected": -633.8462611607143, + "loss": 0.0627, + "rewards/chosen": 7.7811279296875, + "rewards/margins": 19.61305454799107, + "rewards/rejected": -11.831926618303571, + "step": 1732 + }, + { + "epoch": 0.4336294257475291, + "grad_norm": 10.9375, + "kl": 1.4858449697494507, + "learning_rate": 5e-06, + "logits/chosen": -49538408.72727273, + "logits/rejected": -79389134.76923077, + "logps/chosen": -253.55579723011363, + "logps/rejected": -658.3462289663462, + "loss": 0.0973, + "rewards/chosen": 4.4752068953080615, + "rewards/margins": 19.178653503631377, + "rewards/rejected": -14.703446608323317, + "step": 1733 + }, + { + "epoch": 0.43387964468910295, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40054329.6, + "logits/rejected": -23462521.14285714, + "logps/chosen": -336.8068359375, + "logps/rejected": -654.4439871651786, + "loss": 0.0099, + "rewards/chosen": 6.676496887207032, + "rewards/margins": 21.427894156319756, + "rewards/rejected": -14.751397269112724, + "step": 1734 + }, + { + "epoch": 0.43412986363067685, + "grad_norm": 17.125, + "kl": 12.384054183959961, + "learning_rate": 5e-06, + "logits/chosen": -32385212.0, + "logits/rejected": -43103208.0, + "logps/chosen": -339.69903564453125, + "logps/rejected": -261.8519287109375, + "loss": 0.1108, + "rewards/chosen": 7.17822265625, + "rewards/margins": 14.521745204925537, + "rewards/rejected": -7.343522548675537, + "step": 1735 + }, + { + "epoch": 0.4343800825722507, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46461949.09090909, + "logits/rejected": -27294097.230769232, + "logps/chosen": -533.9335049715909, + "logps/rejected": -426.25721153846155, + "loss": 0.0221, + "rewards/chosen": 8.784595142711293, + "rewards/margins": 19.425809366719704, + "rewards/rejected": -10.641214224008413, + "step": 1736 + }, + { + "epoch": 0.4346303015138246, + "grad_norm": 3.609375, + "kl": 0.41214117407798767, + "learning_rate": 5e-06, + "logits/chosen": -55834619.07692308, + "logits/rejected": -68854365.0909091, + "logps/chosen": -415.0004131610577, + "logps/rejected": -511.69881924715907, + "loss": 0.0244, + "rewards/chosen": 7.425439100999099, + "rewards/margins": 18.9243668242768, + "rewards/rejected": -11.4989277232777, + "step": 1737 + }, + { + "epoch": 0.4348805204553985, + "grad_norm": 7.53125, + "kl": 5.037423610687256, + "learning_rate": 5e-06, + "logits/chosen": -18690935.466666665, + "logits/rejected": -54017230.222222224, + "logps/chosen": -460.46868489583335, + "logps/rejected": -654.6276584201389, + "loss": 0.0267, + "rewards/chosen": 7.764060974121094, + "rewards/margins": 24.822614034016926, + "rewards/rejected": -17.058553059895832, + "step": 1738 + }, + { + "epoch": 0.43513073939697233, + "grad_norm": 7.125, + "kl": 3.7176432609558105, + "learning_rate": 5e-06, + "logits/chosen": -58166099.2, + "logits/rejected": -66448493.71428572, + "logps/chosen": -373.3819580078125, + "logps/rejected": -531.9524972098214, + "loss": 0.028, + "rewards/chosen": 7.904003143310547, + "rewards/margins": 19.751651763916016, + "rewards/rejected": -11.847648620605469, + "step": 1739 + }, + { + "epoch": 0.43538095833854623, + "grad_norm": 12.375, + "kl": 10.342616081237793, + "learning_rate": 5e-06, + "logits/chosen": -62692829.538461536, + "logits/rejected": 18649166.545454547, + "logps/chosen": -457.5563777043269, + "logps/rejected": -724.3130326704545, + "loss": 0.0309, + "rewards/chosen": 8.37613267164964, + "rewards/margins": 21.733340923602764, + "rewards/rejected": -13.357208251953125, + "step": 1740 + }, + { + "epoch": 0.43563117728012013, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83998246.4, + "logits/rejected": -30332072.42105263, + "logps/chosen": -425.038134765625, + "logps/rejected": -657.7446032072369, + "loss": 0.0778, + "rewards/chosen": 7.995486450195313, + "rewards/margins": 20.139522994192024, + "rewards/rejected": -12.14403654399671, + "step": 1741 + }, + { + "epoch": 0.435881396221694, + "grad_norm": 9.375, + "kl": 0.16205660998821259, + "learning_rate": 5e-06, + "logits/chosen": -27679463.384615384, + "logits/rejected": -61266594.90909091, + "logps/chosen": -424.5254657451923, + "logps/rejected": -452.01438210227275, + "loss": 0.0425, + "rewards/chosen": 8.55937018761268, + "rewards/margins": 18.677565461272124, + "rewards/rejected": -10.118195273659445, + "step": 1742 + }, + { + "epoch": 0.43613161516326787, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51511264.0, + "logits/rejected": -43514203.428571425, + "logps/chosen": -268.41259765625, + "logps/rejected": -472.47994559151783, + "loss": 0.0689, + "rewards/chosen": 6.514250183105469, + "rewards/margins": 16.722412763323103, + "rewards/rejected": -10.208162580217634, + "step": 1743 + }, + { + "epoch": 0.4363818341048417, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65692685.71428572, + "logits/rejected": -31745753.6, + "logps/chosen": -535.0950055803571, + "logps/rejected": -564.382568359375, + "loss": 0.0377, + "rewards/chosen": 8.480243137904576, + "rewards/margins": 21.311725071498326, + "rewards/rejected": -12.83148193359375, + "step": 1744 + }, + { + "epoch": 0.4366320530464156, + "grad_norm": 17.25, + "kl": 3.5959362983703613, + "learning_rate": 5e-06, + "logits/chosen": -79856679.38461539, + "logits/rejected": -29814906.181818184, + "logps/chosen": -371.0808293269231, + "logps/rejected": -406.611328125, + "loss": 0.0492, + "rewards/chosen": 8.293344350961538, + "rewards/margins": 17.26341770412205, + "rewards/rejected": -8.970073353160512, + "step": 1745 + }, + { + "epoch": 0.4368822719879895, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55040028.44444445, + "logits/rejected": -56867285.333333336, + "logps/chosen": -387.8827853732639, + "logps/rejected": -671.6983072916667, + "loss": 0.0459, + "rewards/chosen": 6.181253221299913, + "rewards/margins": 18.776427374945747, + "rewards/rejected": -12.595174153645834, + "step": 1746 + }, + { + "epoch": 0.43713249092956336, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75343680.0, + "logits/rejected": -47932388.571428575, + "logps/chosen": -468.927197265625, + "logps/rejected": -587.1833147321429, + "loss": 0.025, + "rewards/chosen": 8.697145080566406, + "rewards/margins": 20.991458783830915, + "rewards/rejected": -12.294313703264509, + "step": 1747 + }, + { + "epoch": 0.43738270987113725, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68261081.6, + "logits/rejected": -71557147.42857143, + "logps/chosen": -519.54384765625, + "logps/rejected": -674.0514090401786, + "loss": 0.0202, + "rewards/chosen": 10.38585205078125, + "rewards/margins": 22.809675816127232, + "rewards/rejected": -12.423823765345983, + "step": 1748 + }, + { + "epoch": 0.4376329288127111, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38987557.81818182, + "logits/rejected": -54131160.615384616, + "logps/chosen": -403.91703657670456, + "logps/rejected": -666.29296875, + "loss": 0.0205, + "rewards/chosen": 6.777026089754972, + "rewards/margins": 19.322722641738146, + "rewards/rejected": -12.545696551983173, + "step": 1749 + }, + { + "epoch": 0.437883147754285, + "grad_norm": 7.0625, + "kl": 1.476178526878357, + "learning_rate": 5e-06, + "logits/chosen": -55180957.09090909, + "logits/rejected": -51466151.384615384, + "logps/chosen": -324.15292080965907, + "logps/rejected": -597.16162109375, + "loss": 0.0463, + "rewards/chosen": 5.6106719970703125, + "rewards/margins": 18.544174194335938, + "rewards/rejected": -12.933502197265625, + "step": 1750 + }, + { + "epoch": 0.4381333666958589, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56126813.538461536, + "logits/rejected": -13130280.727272727, + "logps/chosen": -385.82361778846155, + "logps/rejected": -577.9657315340909, + "loss": 0.0182, + "rewards/chosen": 7.892554063063401, + "rewards/margins": 20.405593391898627, + "rewards/rejected": -12.513039328835227, + "step": 1751 + }, + { + "epoch": 0.43838358563743274, + "grad_norm": 13.375, + "kl": 6.674837589263916, + "learning_rate": 5e-06, + "logits/chosen": -45477993.4117647, + "logits/rejected": -43217435.428571425, + "logps/chosen": -318.71852022058823, + "logps/rejected": -625.76416015625, + "loss": 0.0928, + "rewards/chosen": 6.334524266860065, + "rewards/margins": 18.87705269180426, + "rewards/rejected": -12.542528424944196, + "step": 1752 + }, + { + "epoch": 0.43863380457900664, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57025522.28571428, + "logits/rejected": -48281494.5882353, + "logps/chosen": -410.1220703125, + "logps/rejected": -442.19694967830884, + "loss": 0.0111, + "rewards/chosen": 7.877737317766462, + "rewards/margins": 17.02038365852933, + "rewards/rejected": -9.142646340762868, + "step": 1753 + }, + { + "epoch": 0.43888402352058054, + "grad_norm": 5.03125, + "kl": 4.741658687591553, + "learning_rate": 5e-06, + "logits/chosen": -49010226.28571428, + "logits/rejected": -71180588.8, + "logps/chosen": -407.03414481026783, + "logps/rejected": -599.71064453125, + "loss": 0.0188, + "rewards/chosen": 8.332615443638392, + "rewards/margins": 17.753481837681363, + "rewards/rejected": -9.420866394042969, + "step": 1754 + }, + { + "epoch": 0.4391342424621544, + "grad_norm": 4.90625, + "kl": 3.335146903991699, + "learning_rate": 5e-06, + "logits/chosen": -58319461.64705882, + "logits/rejected": -46217056.0, + "logps/chosen": -473.1337028952206, + "logps/rejected": -482.73521205357144, + "loss": 0.0667, + "rewards/chosen": 7.7722625732421875, + "rewards/margins": 19.922548566545757, + "rewards/rejected": -12.150285993303571, + "step": 1755 + }, + { + "epoch": 0.4393844614037283, + "grad_norm": 14.5, + "kl": 16.141101837158203, + "learning_rate": 5e-06, + "logits/chosen": -39241558.85714286, + "logits/rejected": -51333401.6, + "logps/chosen": -471.42567661830356, + "logps/rejected": -525.24345703125, + "loss": 0.0672, + "rewards/chosen": 8.030807495117188, + "rewards/margins": 18.558697509765626, + "rewards/rejected": -10.527890014648438, + "step": 1756 + }, + { + "epoch": 0.4396346803453021, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46667596.8, + "logits/rejected": -36808502.85714286, + "logps/chosen": -321.97822265625, + "logps/rejected": -578.2374790736607, + "loss": 0.0262, + "rewards/chosen": 6.563163757324219, + "rewards/margins": 20.536683218819753, + "rewards/rejected": -13.973519461495536, + "step": 1757 + }, + { + "epoch": 0.439884899286876, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69243565.71428572, + "logits/rejected": -40564086.4, + "logps/chosen": -357.99930245535717, + "logps/rejected": -574.6998046875, + "loss": 0.0534, + "rewards/chosen": 5.898641313825335, + "rewards/margins": 17.70332271030971, + "rewards/rejected": -11.804681396484375, + "step": 1758 + }, + { + "epoch": 0.4401351182284499, + "grad_norm": 13.875, + "kl": 9.868629455566406, + "learning_rate": 5e-06, + "logits/chosen": -45267879.384615384, + "logits/rejected": -36426493.09090909, + "logps/chosen": -461.3607647235577, + "logps/rejected": -499.91530539772725, + "loss": 0.0538, + "rewards/chosen": 7.333030700683594, + "rewards/margins": 19.2925893610174, + "rewards/rejected": -11.959558660333807, + "step": 1759 + }, + { + "epoch": 0.44038533717002376, + "grad_norm": 8.6875, + "kl": 0.6621112823486328, + "learning_rate": 5e-06, + "logits/chosen": -34314087.11111111, + "logits/rejected": -28137774.933333334, + "logps/chosen": -300.6897243923611, + "logps/rejected": -517.94169921875, + "loss": 0.0386, + "rewards/chosen": 7.055417378743489, + "rewards/margins": 18.887442525227865, + "rewards/rejected": -11.832025146484375, + "step": 1760 + }, + { + "epoch": 0.44063555611159766, + "grad_norm": 4.1875, + "kl": 0.675749659538269, + "learning_rate": 5e-06, + "logits/chosen": -36809636.92307692, + "logits/rejected": -52435642.18181818, + "logps/chosen": -443.8221905048077, + "logps/rejected": -527.2193714488636, + "loss": 0.0455, + "rewards/chosen": 7.598936814528245, + "rewards/margins": 18.248710845733857, + "rewards/rejected": -10.649774031205611, + "step": 1761 + }, + { + "epoch": 0.4408857750531715, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39313902.76923077, + "logits/rejected": -68325777.45454545, + "logps/chosen": -298.28048001802887, + "logps/rejected": -781.1985973011364, + "loss": 0.0225, + "rewards/chosen": 6.934457632211538, + "rewards/margins": 22.028290861970063, + "rewards/rejected": -15.093833229758523, + "step": 1762 + }, + { + "epoch": 0.4411359939947454, + "grad_norm": 7.0, + "kl": 1.7169456481933594, + "learning_rate": 5e-06, + "logits/chosen": -43266948.92307692, + "logits/rejected": -37215383.27272727, + "logps/chosen": -406.02647986778845, + "logps/rejected": -463.85329367897725, + "loss": 0.038, + "rewards/chosen": 7.118249746469351, + "rewards/margins": 18.285575546584763, + "rewards/rejected": -11.167325800115412, + "step": 1763 + }, + { + "epoch": 0.4413862129363193, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6265684.0, + "logits/rejected": -48570961.777777776, + "logps/chosen": -318.15797932942706, + "logps/rejected": -699.9150390625, + "loss": 0.051, + "rewards/chosen": 5.018633206685384, + "rewards/margins": 17.75141281551785, + "rewards/rejected": -12.732779608832466, + "step": 1764 + }, + { + "epoch": 0.44163643187789314, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60129910.85714286, + "logits/rejected": -50563392.0, + "logps/chosen": -600.7783203125, + "logps/rejected": -711.9457720588235, + "loss": 0.0118, + "rewards/chosen": 9.88724844796317, + "rewards/margins": 19.984983684635964, + "rewards/rejected": -10.097735236672793, + "step": 1765 + }, + { + "epoch": 0.44188665081946704, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49175216.0, + "logits/rejected": -58110432.0, + "logps/chosen": -486.8152669270833, + "logps/rejected": -722.115478515625, + "loss": 0.0041, + "rewards/chosen": 7.803700764973958, + "rewards/margins": 26.00478744506836, + "rewards/rejected": -18.201086680094402, + "step": 1766 + }, + { + "epoch": 0.4421368697610409, + "grad_norm": 11.1875, + "kl": 10.043951034545898, + "learning_rate": 5e-06, + "logits/chosen": -40265856.0, + "logits/rejected": -39411280.0, + "logps/chosen": -367.2900085449219, + "logps/rejected": -505.8384094238281, + "loss": 0.0362, + "rewards/chosen": 7.767824172973633, + "rewards/margins": 19.260985374450684, + "rewards/rejected": -11.49316120147705, + "step": 1767 + }, + { + "epoch": 0.4423870887026148, + "grad_norm": 12.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49708817.45454545, + "logits/rejected": -40033376.0, + "logps/chosen": -396.56143465909093, + "logps/rejected": -536.3824744591346, + "loss": 0.0709, + "rewards/chosen": 7.377537120472301, + "rewards/margins": 17.013385532619235, + "rewards/rejected": -9.635848412146935, + "step": 1768 + }, + { + "epoch": 0.4426373076441887, + "grad_norm": 20.625, + "kl": 5.363088130950928, + "learning_rate": 5e-06, + "logits/chosen": -37162451.2, + "logits/rejected": -38869120.0, + "logps/chosen": -326.704345703125, + "logps/rejected": -619.7370954241071, + "loss": 0.056, + "rewards/chosen": 7.3760833740234375, + "rewards/margins": 18.15894971575056, + "rewards/rejected": -10.78286634172712, + "step": 1769 + }, + { + "epoch": 0.4428875265857625, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20699465.6, + "logits/rejected": -68717814.85714285, + "logps/chosen": -282.4941650390625, + "logps/rejected": -557.0023716517857, + "loss": 0.0551, + "rewards/chosen": 6.090250778198242, + "rewards/margins": 15.743239756992885, + "rewards/rejected": -9.652988978794642, + "step": 1770 + }, + { + "epoch": 0.4431377455273364, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30522121.6, + "logits/rejected": -35764368.0, + "logps/chosen": -464.6640625, + "logps/rejected": -369.3518763950893, + "loss": 0.0424, + "rewards/chosen": 8.982410430908203, + "rewards/margins": 17.81627219063895, + "rewards/rejected": -8.833861759730748, + "step": 1771 + }, + { + "epoch": 0.4433879644689103, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58716864.0, + "logits/rejected": -49143922.28571428, + "logps/chosen": -370.426318359375, + "logps/rejected": -640.2614397321429, + "loss": 0.0345, + "rewards/chosen": 6.778099822998047, + "rewards/margins": 21.858746882847377, + "rewards/rejected": -15.08064705984933, + "step": 1772 + }, + { + "epoch": 0.44363818341048417, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32047373.333333332, + "logits/rejected": -50361162.666666664, + "logps/chosen": -454.6949462890625, + "logps/rejected": -595.2628038194445, + "loss": 0.0134, + "rewards/chosen": 8.387965520222982, + "rewards/margins": 23.626933415730797, + "rewards/rejected": -15.238967895507812, + "step": 1773 + }, + { + "epoch": 0.44388840235205806, + "grad_norm": 12.25, + "kl": 3.587136745452881, + "learning_rate": 5e-06, + "logits/chosen": -45023188.36363637, + "logits/rejected": -36717735.384615384, + "logps/chosen": -365.07870205965907, + "logps/rejected": -587.0178786057693, + "loss": 0.0695, + "rewards/chosen": 7.220082369717685, + "rewards/margins": 24.02255969280963, + "rewards/rejected": -16.802477323091946, + "step": 1774 + }, + { + "epoch": 0.4441386212936319, + "grad_norm": 13.375, + "kl": 1.2241935729980469, + "learning_rate": 5e-06, + "logits/chosen": -55480352.0, + "logits/rejected": -48889317.333333336, + "logps/chosen": -340.57798258463544, + "logps/rejected": -552.3624674479166, + "loss": 0.1119, + "rewards/chosen": 4.243961334228516, + "rewards/margins": 15.498896280924479, + "rewards/rejected": -11.254934946695963, + "step": 1775 + }, + { + "epoch": 0.4443888402352058, + "grad_norm": 10.0625, + "kl": 2.493149518966675, + "learning_rate": 5e-06, + "logits/chosen": -56879364.92307692, + "logits/rejected": -41980506.18181818, + "logps/chosen": -408.6760066105769, + "logps/rejected": -454.79940518465907, + "loss": 0.0347, + "rewards/chosen": 7.321468646709736, + "rewards/margins": 17.16368791273424, + "rewards/rejected": -9.842219266024502, + "step": 1776 + }, + { + "epoch": 0.4446390591767797, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15926152.888888888, + "logits/rejected": -50662801.06666667, + "logps/chosen": -366.1369900173611, + "logps/rejected": -563.3386067708333, + "loss": 0.0188, + "rewards/chosen": 7.333340115017361, + "rewards/margins": 21.80038825141059, + "rewards/rejected": -14.46704813639323, + "step": 1777 + }, + { + "epoch": 0.44488927811835355, + "grad_norm": 8.6875, + "kl": 14.138043403625488, + "learning_rate": 5e-06, + "logits/chosen": -45838133.333333336, + "logits/rejected": -31965178.666666668, + "logps/chosen": -433.3512369791667, + "logps/rejected": -413.8290201822917, + "loss": 0.1089, + "rewards/chosen": 8.491605970594618, + "rewards/margins": 21.06764687432183, + "rewards/rejected": -12.576040903727213, + "step": 1778 + }, + { + "epoch": 0.44513949705992745, + "grad_norm": 9.875, + "kl": 13.0823392868042, + "learning_rate": 5e-06, + "logits/chosen": -95805723.42857143, + "logits/rejected": -85909708.8, + "logps/chosen": -522.8612583705357, + "logps/rejected": -555.069873046875, + "loss": 0.0318, + "rewards/chosen": 9.090159824916295, + "rewards/margins": 21.459762028285436, + "rewards/rejected": -12.36960220336914, + "step": 1779 + }, + { + "epoch": 0.4453897160015013, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53376621.71428572, + "logits/rejected": -52830809.6, + "logps/chosen": -453.19224330357144, + "logps/rejected": -573.626513671875, + "loss": 0.0083, + "rewards/chosen": 8.938412257603236, + "rewards/margins": 22.412557765415734, + "rewards/rejected": -13.4741455078125, + "step": 1780 + }, + { + "epoch": 0.4456399349430752, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75237403.42857143, + "logits/rejected": -57445455.058823526, + "logps/chosen": -391.70424107142856, + "logps/rejected": -581.9053883272059, + "loss": 0.0643, + "rewards/chosen": 7.588176182338169, + "rewards/margins": 21.664555397354253, + "rewards/rejected": -14.076379215016084, + "step": 1781 + }, + { + "epoch": 0.4458901538846491, + "grad_norm": 10.875, + "kl": 9.098955154418945, + "learning_rate": 5e-06, + "logits/chosen": -67559760.0, + "logits/rejected": -68833472.0, + "logps/chosen": -430.5329284667969, + "logps/rejected": -575.4144897460938, + "loss": 0.0708, + "rewards/chosen": 7.655349254608154, + "rewards/margins": 22.26174306869507, + "rewards/rejected": -14.606393814086914, + "step": 1782 + }, + { + "epoch": 0.44614037282622293, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47084064.0, + "logits/rejected": -61674714.666666664, + "logps/chosen": -322.6530354817708, + "logps/rejected": -705.037109375, + "loss": 0.0311, + "rewards/chosen": 5.979930241902669, + "rewards/margins": 22.179810206095375, + "rewards/rejected": -16.199879964192707, + "step": 1783 + }, + { + "epoch": 0.44639059176779683, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75594232.0, + "logits/rejected": -48111724.0, + "logps/chosen": -513.6490478515625, + "logps/rejected": -690.1429443359375, + "loss": 0.0126, + "rewards/chosen": 7.206121921539307, + "rewards/margins": 23.410858631134033, + "rewards/rejected": -16.204736709594727, + "step": 1784 + }, + { + "epoch": 0.4466408107093707, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21882183.272727273, + "logits/rejected": -36222857.84615385, + "logps/chosen": -404.6328125, + "logps/rejected": -652.1051682692307, + "loss": 0.0149, + "rewards/chosen": 8.209370006214488, + "rewards/margins": 21.10542244010872, + "rewards/rejected": -12.89605243389423, + "step": 1785 + }, + { + "epoch": 0.44689102965094457, + "grad_norm": 1.2578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36897752.0, + "logits/rejected": -64548512.0, + "logps/chosen": -373.8805338541667, + "logps/rejected": -730.1560872395834, + "loss": 0.0193, + "rewards/chosen": 8.859169006347656, + "rewards/margins": 26.301392873128254, + "rewards/rejected": -17.442223866780598, + "step": 1786 + }, + { + "epoch": 0.44714124859251847, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44956263.11111111, + "logits/rejected": -33566075.733333334, + "logps/chosen": -519.7521701388889, + "logps/rejected": -534.9561197916667, + "loss": 0.0373, + "rewards/chosen": 9.603086683485243, + "rewards/margins": 24.427266777886285, + "rewards/rejected": -14.824180094401042, + "step": 1787 + }, + { + "epoch": 0.4473914675340923, + "grad_norm": 5.59375, + "kl": 2.685976028442383, + "learning_rate": 5e-06, + "logits/chosen": -51929436.0, + "logits/rejected": -47917304.0, + "logps/chosen": -360.30389404296875, + "logps/rejected": -464.84906005859375, + "loss": 0.0398, + "rewards/chosen": 7.607850551605225, + "rewards/margins": 18.70877981185913, + "rewards/rejected": -11.100929260253906, + "step": 1788 + }, + { + "epoch": 0.4476416864756662, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36835847.11111111, + "logits/rejected": -47320776.53333333, + "logps/chosen": -329.20030381944446, + "logps/rejected": -704.5416666666666, + "loss": 0.0214, + "rewards/chosen": 6.065415700276692, + "rewards/margins": 21.08590316772461, + "rewards/rejected": -15.020487467447916, + "step": 1789 + }, + { + "epoch": 0.4478919054172401, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64112128.0, + "logits/rejected": -35477700.0, + "logps/chosen": -364.1282958984375, + "logps/rejected": -641.5343627929688, + "loss": 0.0527, + "rewards/chosen": 5.406822204589844, + "rewards/margins": 18.312036514282227, + "rewards/rejected": -12.905214309692383, + "step": 1790 + }, + { + "epoch": 0.44814212435881395, + "grad_norm": 8.5625, + "kl": 5.4232497215271, + "learning_rate": 5e-06, + "logits/chosen": -72321787.07692307, + "logits/rejected": -51623325.09090909, + "logps/chosen": -506.74132361778845, + "logps/rejected": -666.6501686789773, + "loss": 0.0111, + "rewards/chosen": 8.528812115009014, + "rewards/margins": 26.629510412683018, + "rewards/rejected": -18.100698297674004, + "step": 1791 + }, + { + "epoch": 0.44839234330038785, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -78324648.0, + "logits/rejected": -60532900.0, + "logps/chosen": -416.6427307128906, + "logps/rejected": -571.3350830078125, + "loss": 0.0517, + "rewards/chosen": 6.130279541015625, + "rewards/margins": 15.392738342285156, + "rewards/rejected": -9.262458801269531, + "step": 1792 + }, + { + "epoch": 0.4486425622419617, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66610035.2, + "logits/rejected": -42949188.571428575, + "logps/chosen": -285.47431640625, + "logps/rejected": -665.2876674107143, + "loss": 0.0286, + "rewards/chosen": 5.702185440063476, + "rewards/margins": 20.401476124354772, + "rewards/rejected": -14.699290684291295, + "step": 1793 + }, + { + "epoch": 0.4488927811835356, + "grad_norm": 10.9375, + "kl": 2.5778894424438477, + "learning_rate": 5e-06, + "logits/chosen": -61384870.4, + "logits/rejected": -14482617.0, + "logps/chosen": -316.8839111328125, + "logps/rejected": -399.7562255859375, + "loss": 0.0695, + "rewards/chosen": 6.8824920654296875, + "rewards/margins": 15.578229904174805, + "rewards/rejected": -8.695737838745117, + "step": 1794 + }, + { + "epoch": 0.4491430001251095, + "grad_norm": 2.75, + "kl": 9.455293655395508, + "learning_rate": 5e-06, + "logits/chosen": -89607488.0, + "logits/rejected": -49099850.666666664, + "logps/chosen": -495.9684244791667, + "logps/rejected": -651.7939046223959, + "loss": 0.0635, + "rewards/chosen": 8.225971857706705, + "rewards/margins": 22.130024592081703, + "rewards/rejected": -13.904052734375, + "step": 1795 + }, + { + "epoch": 0.44939321906668334, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60593043.2, + "logits/rejected": -46339040.0, + "logps/chosen": -514.023095703125, + "logps/rejected": -592.2505580357143, + "loss": 0.0023, + "rewards/chosen": 9.517355346679688, + "rewards/margins": 23.239915684291297, + "rewards/rejected": -13.722560337611608, + "step": 1796 + }, + { + "epoch": 0.44964343800825723, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45215930.666666664, + "logits/rejected": -53579840.0, + "logps/chosen": -301.9062093098958, + "logps/rejected": -616.9141031901041, + "loss": 0.0495, + "rewards/chosen": 5.078006744384766, + "rewards/margins": 17.52708943684896, + "rewards/rejected": -12.449082692464193, + "step": 1797 + }, + { + "epoch": 0.4498936569498311, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31808290.0, + "logits/rejected": -54355768.0, + "logps/chosen": -321.48297119140625, + "logps/rejected": -664.0029296875, + "loss": 0.0436, + "rewards/chosen": 7.363469123840332, + "rewards/margins": 20.268009185791016, + "rewards/rejected": -12.904540061950684, + "step": 1798 + }, + { + "epoch": 0.450143875891405, + "grad_norm": 7.34375, + "kl": 10.426305770874023, + "learning_rate": 5e-06, + "logits/chosen": -33242500.266666666, + "logits/rejected": -63433386.666666664, + "logps/chosen": -433.0022786458333, + "logps/rejected": -594.8330078125, + "loss": 0.0549, + "rewards/chosen": 8.395646158854166, + "rewards/margins": 19.698702663845488, + "rewards/rejected": -11.30305650499132, + "step": 1799 + }, + { + "epoch": 0.4503940948329789, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40816760.0, + "logits/rejected": -56599258.666666664, + "logps/chosen": -426.3206380208333, + "logps/rejected": -724.4078776041666, + "loss": 0.0301, + "rewards/chosen": 7.383562723795573, + "rewards/margins": 24.089753468831383, + "rewards/rejected": -16.70619074503581, + "step": 1800 + }, + { + "epoch": 0.4506443137745527, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20709545.6, + "logits/rejected": -50458459.428571425, + "logps/chosen": -476.54697265625, + "logps/rejected": -513.1675502232143, + "loss": 0.0118, + "rewards/chosen": 7.179499816894531, + "rewards/margins": 21.05487474714007, + "rewards/rejected": -13.875374930245536, + "step": 1801 + }, + { + "epoch": 0.4508945327161266, + "grad_norm": 1.2421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64026577.45454545, + "logits/rejected": -60840531.692307696, + "logps/chosen": -403.55153586647725, + "logps/rejected": -685.3254957932693, + "loss": 0.0033, + "rewards/chosen": 7.443070151589134, + "rewards/margins": 22.113689235873988, + "rewards/rejected": -14.670619084284855, + "step": 1802 + }, + { + "epoch": 0.4511447516577005, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27333273.6, + "logits/rejected": -44268041.14285714, + "logps/chosen": -275.53232421875, + "logps/rejected": -603.9342912946429, + "loss": 0.043, + "rewards/chosen": 6.4478302001953125, + "rewards/margins": 19.355772835867747, + "rewards/rejected": -12.907942635672432, + "step": 1803 + }, + { + "epoch": 0.45139497059927436, + "grad_norm": 9.875, + "kl": 1.1471812725067139, + "learning_rate": 5e-06, + "logits/chosen": -48002949.333333336, + "logits/rejected": -25026784.0, + "logps/chosen": -305.0901692708333, + "logps/rejected": -675.329345703125, + "loss": 0.0739, + "rewards/chosen": 4.8794816335042315, + "rewards/margins": 16.949525833129883, + "rewards/rejected": -12.07004419962565, + "step": 1804 + }, + { + "epoch": 0.45164518954084826, + "grad_norm": 12.3125, + "kl": 4.332741737365723, + "learning_rate": 5e-06, + "logits/chosen": -46563840.0, + "logits/rejected": -37039238.4, + "logps/chosen": -333.8687220982143, + "logps/rejected": -557.331640625, + "loss": 0.0833, + "rewards/chosen": 7.277859279087612, + "rewards/margins": 19.7707273210798, + "rewards/rejected": -12.492868041992187, + "step": 1805 + }, + { + "epoch": 0.4518954084824221, + "grad_norm": 11.8125, + "kl": 1.997132658958435, + "learning_rate": 5e-06, + "logits/chosen": -69630794.66666667, + "logits/rejected": -27983048.0, + "logps/chosen": -350.4193522135417, + "logps/rejected": -779.975341796875, + "loss": 0.0386, + "rewards/chosen": 6.767127354939778, + "rewards/margins": 21.988503138224285, + "rewards/rejected": -15.221375783284506, + "step": 1806 + }, + { + "epoch": 0.452145627423996, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44652521.14285714, + "logits/rejected": -246348.0, + "logps/chosen": -434.12656947544644, + "logps/rejected": -463.12412109375, + "loss": 0.0401, + "rewards/chosen": 6.641373770577567, + "rewards/margins": 19.031978934151784, + "rewards/rejected": -12.390605163574218, + "step": 1807 + }, + { + "epoch": 0.4523958463655699, + "grad_norm": 8.3125, + "kl": 4.250385284423828, + "learning_rate": 5e-06, + "logits/chosen": -85655376.0, + "logits/rejected": -54388688.0, + "logps/chosen": -565.3590087890625, + "logps/rejected": -555.5823364257812, + "loss": 0.0244, + "rewards/chosen": 8.35127067565918, + "rewards/margins": 21.970748901367188, + "rewards/rejected": -13.619478225708008, + "step": 1808 + }, + { + "epoch": 0.45264606530714374, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50558666.666666664, + "logits/rejected": -35236309.333333336, + "logps/chosen": -439.0404459635417, + "logps/rejected": -722.1485188802084, + "loss": 0.0162, + "rewards/chosen": 10.310356140136719, + "rewards/margins": 24.486577351888023, + "rewards/rejected": -14.176221211751303, + "step": 1809 + }, + { + "epoch": 0.45289628424871764, + "grad_norm": 9.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38164470.85714286, + "logits/rejected": -41198288.0, + "logps/chosen": -296.66531808035717, + "logps/rejected": -588.27236328125, + "loss": 0.0451, + "rewards/chosen": 5.797191074916294, + "rewards/margins": 17.81886465890067, + "rewards/rejected": -12.021673583984375, + "step": 1810 + }, + { + "epoch": 0.4531465031902915, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83609144.0, + "logits/rejected": -77866840.0, + "logps/chosen": -398.7728576660156, + "logps/rejected": -602.8576049804688, + "loss": 0.0382, + "rewards/chosen": 5.854246139526367, + "rewards/margins": 18.571319580078125, + "rewards/rejected": -12.717073440551758, + "step": 1811 + }, + { + "epoch": 0.4533967221318654, + "grad_norm": 8.4375, + "kl": 1.8036067485809326, + "learning_rate": 5e-06, + "logits/chosen": -38730414.54545455, + "logits/rejected": -24260081.230769232, + "logps/chosen": -348.451904296875, + "logps/rejected": -568.3101712740385, + "loss": 0.0289, + "rewards/chosen": 6.234905589710582, + "rewards/margins": 18.411818337607215, + "rewards/rejected": -12.176912747896635, + "step": 1812 + }, + { + "epoch": 0.4536469410734393, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56824786.823529415, + "logits/rejected": -35912292.571428575, + "logps/chosen": -316.43212890625, + "logps/rejected": -647.2610909598214, + "loss": 0.0356, + "rewards/chosen": 6.748531117158778, + "rewards/margins": 21.882483666684447, + "rewards/rejected": -15.13395254952567, + "step": 1813 + }, + { + "epoch": 0.4538971600150131, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51591952.0, + "logits/rejected": -52059477.333333336, + "logps/chosen": -439.0169270833333, + "logps/rejected": -651.3352864583334, + "loss": 0.0485, + "rewards/chosen": 7.812395731608073, + "rewards/margins": 20.4191411336263, + "rewards/rejected": -12.606745402018229, + "step": 1814 + }, + { + "epoch": 0.454147378956587, + "grad_norm": 1.5078125, + "kl": 2.6219139099121094, + "learning_rate": 5e-06, + "logits/chosen": -44669671.384615384, + "logits/rejected": -47210091.63636363, + "logps/chosen": -430.2340745192308, + "logps/rejected": -666.4497514204545, + "loss": 0.0048, + "rewards/chosen": 8.787156325120192, + "rewards/margins": 25.280309290319053, + "rewards/rejected": -16.493152965198863, + "step": 1815 + }, + { + "epoch": 0.45439759789816087, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51352977.45454545, + "logits/rejected": -50888871.384615384, + "logps/chosen": -328.2664683948864, + "logps/rejected": -746.3853665865385, + "loss": 0.0305, + "rewards/chosen": 5.955451965332031, + "rewards/margins": 18.627301142765926, + "rewards/rejected": -12.671849177433895, + "step": 1816 + }, + { + "epoch": 0.45464781683973476, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53398856.53333333, + "logits/rejected": -6795163.555555556, + "logps/chosen": -349.4905598958333, + "logps/rejected": -419.307861328125, + "loss": 0.0706, + "rewards/chosen": 6.24637451171875, + "rewards/margins": 16.863036431206595, + "rewards/rejected": -10.616661919487846, + "step": 1817 + }, + { + "epoch": 0.45489803578130866, + "grad_norm": 20.375, + "kl": 22.622695922851562, + "learning_rate": 5e-06, + "logits/chosen": -50894040.0, + "logits/rejected": -48293024.0, + "logps/chosen": -471.05316162109375, + "logps/rejected": -431.5513916015625, + "loss": 0.1078, + "rewards/chosen": 9.585517883300781, + "rewards/margins": 17.194531440734863, + "rewards/rejected": -7.609013557434082, + "step": 1818 + }, + { + "epoch": 0.4551482547228825, + "grad_norm": 11.875, + "kl": 4.875029563903809, + "learning_rate": 5e-06, + "logits/chosen": -94279488.0, + "logits/rejected": -17058228.0, + "logps/chosen": -494.9310709635417, + "logps/rejected": -496.8284912109375, + "loss": 0.0217, + "rewards/chosen": 8.994813919067383, + "rewards/margins": 19.084096908569336, + "rewards/rejected": -10.089282989501953, + "step": 1819 + }, + { + "epoch": 0.4553984736644564, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30042866.666666668, + "logits/rejected": -35827592.0, + "logps/chosen": -368.60546875, + "logps/rejected": -585.975830078125, + "loss": 0.0432, + "rewards/chosen": 7.8612721761067705, + "rewards/margins": 20.163007100423176, + "rewards/rejected": -12.301734924316406, + "step": 1820 + }, + { + "epoch": 0.4556486926060303, + "grad_norm": 16.375, + "kl": 2.000948667526245, + "learning_rate": 5e-06, + "logits/chosen": -60658585.6, + "logits/rejected": -47144320.0, + "logps/chosen": -441.2140625, + "logps/rejected": -569.6512276785714, + "loss": 0.0269, + "rewards/chosen": 11.016093444824218, + "rewards/margins": 19.586375972202845, + "rewards/rejected": -8.570282527378627, + "step": 1821 + }, + { + "epoch": 0.45589891154760415, + "grad_norm": 7.71875, + "kl": 10.435153007507324, + "learning_rate": 5e-06, + "logits/chosen": -51845412.571428575, + "logits/rejected": -73719699.2, + "logps/chosen": -439.9320591517857, + "logps/rejected": -717.6755859375, + "loss": 0.0421, + "rewards/chosen": 8.85178702218192, + "rewards/margins": 23.841270664760046, + "rewards/rejected": -14.989483642578126, + "step": 1822 + }, + { + "epoch": 0.45614913048917805, + "grad_norm": 10.75, + "kl": 8.552907943725586, + "learning_rate": 5e-06, + "logits/chosen": -52741897.14285714, + "logits/rejected": -20062328.470588237, + "logps/chosen": -328.42299107142856, + "logps/rejected": -548.6960592830883, + "loss": 0.0499, + "rewards/chosen": 6.935404096330915, + "rewards/margins": 18.203003763150768, + "rewards/rejected": -11.267599666819853, + "step": 1823 + }, + { + "epoch": 0.4563993494307519, + "grad_norm": 2.8125, + "kl": 0.040269218385219574, + "learning_rate": 5e-06, + "logits/chosen": -67590842.18181819, + "logits/rejected": -56723367.384615384, + "logps/chosen": -393.86177201704544, + "logps/rejected": -754.8354116586538, + "loss": 0.0232, + "rewards/chosen": 8.580128756436435, + "rewards/margins": 21.49782989075134, + "rewards/rejected": -12.917701134314903, + "step": 1824 + }, + { + "epoch": 0.4566495683723258, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33807068.8, + "logits/rejected": -45807533.71428572, + "logps/chosen": -485.830078125, + "logps/rejected": -507.83042689732144, + "loss": 0.0486, + "rewards/chosen": 7.4284523010253904, + "rewards/margins": 18.39190968104771, + "rewards/rejected": -10.963457380022321, + "step": 1825 + }, + { + "epoch": 0.4568997873138997, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57021216.0, + "logits/rejected": -56780730.666666664, + "logps/chosen": -446.1334635416667, + "logps/rejected": -643.5774739583334, + "loss": 0.0423, + "rewards/chosen": 8.803810119628906, + "rewards/margins": 22.35092798868815, + "rewards/rejected": -13.547117869059244, + "step": 1826 + }, + { + "epoch": 0.45715000625547353, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74425924.57142857, + "logits/rejected": -42103687.52941176, + "logps/chosen": -190.69562639508928, + "logps/rejected": -580.9008501838235, + "loss": 0.0695, + "rewards/chosen": 4.26662472316197, + "rewards/margins": 17.304901844313164, + "rewards/rejected": -13.038277121151195, + "step": 1827 + }, + { + "epoch": 0.4574002251970474, + "grad_norm": 34.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22310152.0, + "logits/rejected": -50640636.0, + "logps/chosen": -317.6463623046875, + "logps/rejected": -506.959228515625, + "loss": 0.0788, + "rewards/chosen": 7.503783226013184, + "rewards/margins": 16.289525985717773, + "rewards/rejected": -8.78574275970459, + "step": 1828 + }, + { + "epoch": 0.45765044413862127, + "grad_norm": 8.6875, + "kl": 3.1323599815368652, + "learning_rate": 5e-06, + "logits/chosen": -55205690.666666664, + "logits/rejected": -47330010.666666664, + "logps/chosen": -364.3751627604167, + "logps/rejected": -527.6316731770834, + "loss": 0.041, + "rewards/chosen": 6.439053217569987, + "rewards/margins": 19.272939682006836, + "rewards/rejected": -12.83388646443685, + "step": 1829 + }, + { + "epoch": 0.45790066308019517, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38996614.4, + "logits/rejected": -44130907.428571425, + "logps/chosen": -374.4339111328125, + "logps/rejected": -483.86328125, + "loss": 0.0613, + "rewards/chosen": 8.183837890625, + "rewards/margins": 19.407094682965962, + "rewards/rejected": -11.22325679234096, + "step": 1830 + }, + { + "epoch": 0.45815088202176907, + "grad_norm": 19.625, + "kl": 0.17576441168785095, + "learning_rate": 5e-06, + "logits/chosen": -43133277.86666667, + "logits/rejected": -29485104.0, + "logps/chosen": -349.8498046875, + "logps/rejected": -437.47422960069446, + "loss": 0.0462, + "rewards/chosen": 7.840579732259115, + "rewards/margins": 18.363396708170573, + "rewards/rejected": -10.522816975911459, + "step": 1831 + }, + { + "epoch": 0.4584011009633429, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29273773.714285713, + "logits/rejected": -40448441.6, + "logps/chosen": -322.5550013950893, + "logps/rejected": -662.237890625, + "loss": 0.0233, + "rewards/chosen": 6.986337389264788, + "rewards/margins": 19.343897356305803, + "rewards/rejected": -12.357559967041016, + "step": 1832 + }, + { + "epoch": 0.4586513199049168, + "grad_norm": 0.8359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40992857.6, + "logits/rejected": -43540553.14285714, + "logps/chosen": -511.662841796875, + "logps/rejected": -554.0796595982143, + "loss": 0.0025, + "rewards/chosen": 9.118090057373047, + "rewards/margins": 22.390919385637556, + "rewards/rejected": -13.272829328264509, + "step": 1833 + }, + { + "epoch": 0.45890153884649065, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58321179.428571425, + "logits/rejected": -48664041.4117647, + "logps/chosen": -367.0762416294643, + "logps/rejected": -664.0877757352941, + "loss": 0.0071, + "rewards/chosen": 5.043979099818638, + "rewards/margins": 21.837468443798418, + "rewards/rejected": -16.79348934397978, + "step": 1834 + }, + { + "epoch": 0.45915175778806455, + "grad_norm": 9.3125, + "kl": 6.099122524261475, + "learning_rate": 5e-06, + "logits/chosen": -101802730.66666667, + "logits/rejected": -64098026.666666664, + "logps/chosen": -469.34619140625, + "logps/rejected": -743.6424153645834, + "loss": 0.0242, + "rewards/chosen": 9.84646733601888, + "rewards/margins": 24.594205220540367, + "rewards/rejected": -14.747737884521484, + "step": 1835 + }, + { + "epoch": 0.45940197672963845, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57251728.0, + "logits/rejected": -73646560.0, + "logps/chosen": -357.1741129557292, + "logps/rejected": -686.609130859375, + "loss": 0.0526, + "rewards/chosen": 5.172396977742513, + "rewards/margins": 18.528693517049152, + "rewards/rejected": -13.35629653930664, + "step": 1836 + }, + { + "epoch": 0.4596521956712123, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50931328.0, + "logits/rejected": -39900739.2, + "logps/chosen": -399.63978794642856, + "logps/rejected": -420.067626953125, + "loss": 0.0513, + "rewards/chosen": 7.734361921037946, + "rewards/margins": 16.77532719203404, + "rewards/rejected": -9.040965270996093, + "step": 1837 + }, + { + "epoch": 0.4599024146127862, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32564619.636363637, + "logits/rejected": -31510833.230769232, + "logps/chosen": -398.4993341619318, + "logps/rejected": -636.8825871394231, + "loss": 0.0172, + "rewards/chosen": 7.572857943448153, + "rewards/margins": 20.044916753168707, + "rewards/rejected": -12.472058809720552, + "step": 1838 + }, + { + "epoch": 0.4601526335543601, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -71099704.8888889, + "logits/rejected": -92642628.26666667, + "logps/chosen": -349.7503255208333, + "logps/rejected": -630.3300130208333, + "loss": 0.0282, + "rewards/chosen": 5.275037553575304, + "rewards/margins": 20.080352698432073, + "rewards/rejected": -14.80531514485677, + "step": 1839 + }, + { + "epoch": 0.46040285249593393, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73448704.0, + "logits/rejected": -54727868.44444445, + "logps/chosen": -553.7563802083333, + "logps/rejected": -602.5112847222222, + "loss": 0.0033, + "rewards/chosen": 9.24544677734375, + "rewards/margins": 20.629786851671007, + "rewards/rejected": -11.384340074327257, + "step": 1840 + }, + { + "epoch": 0.46065307143750783, + "grad_norm": 2.21875, + "kl": 1.5725047588348389, + "learning_rate": 5e-06, + "logits/chosen": -70169097.84615384, + "logits/rejected": -32701984.0, + "logps/chosen": -378.1858473557692, + "logps/rejected": -668.8263050426136, + "loss": 0.0063, + "rewards/chosen": 7.302620520958533, + "rewards/margins": 22.445614794751148, + "rewards/rejected": -15.142994273792613, + "step": 1841 + }, + { + "epoch": 0.4609032903790817, + "grad_norm": 1.4765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60732713.14285714, + "logits/rejected": -54094234.35294118, + "logps/chosen": -664.0159040178571, + "logps/rejected": -549.5121208639706, + "loss": 0.0022, + "rewards/chosen": 8.691974094935826, + "rewards/margins": 22.68019341220375, + "rewards/rejected": -13.988219317267923, + "step": 1842 + }, + { + "epoch": 0.4611535093206556, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24814605.333333332, + "logits/rejected": -26440317.333333332, + "logps/chosen": -340.81390380859375, + "logps/rejected": -412.0260009765625, + "loss": 0.026, + "rewards/chosen": 5.618303934733073, + "rewards/margins": 19.399237314860027, + "rewards/rejected": -13.780933380126953, + "step": 1843 + }, + { + "epoch": 0.4614037282622295, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35719153.777777776, + "logits/rejected": -43465365.333333336, + "logps/chosen": -383.1078287760417, + "logps/rejected": -853.1214192708334, + "loss": 0.0474, + "rewards/chosen": 7.898061964246962, + "rewards/margins": 25.20655271742079, + "rewards/rejected": -17.308490753173828, + "step": 1844 + }, + { + "epoch": 0.4616539472038033, + "grad_norm": 12.0625, + "kl": 0.6608712077140808, + "learning_rate": 5e-06, + "logits/chosen": -68035444.36363636, + "logits/rejected": -53438946.461538464, + "logps/chosen": -543.0992542613636, + "logps/rejected": -709.5197566105769, + "loss": 0.0233, + "rewards/chosen": 8.443614612926137, + "rewards/margins": 26.665301342944165, + "rewards/rejected": -18.22168673001803, + "step": 1845 + }, + { + "epoch": 0.4619041661453772, + "grad_norm": 2.59375, + "kl": 6.9179487228393555, + "learning_rate": 5e-06, + "logits/chosen": -66434648.615384616, + "logits/rejected": -59711185.45454545, + "logps/chosen": -411.5178786057692, + "logps/rejected": -587.4454456676136, + "loss": 0.0269, + "rewards/chosen": 9.518671475923979, + "rewards/margins": 25.435133767294715, + "rewards/rejected": -15.916462291370738, + "step": 1846 + }, + { + "epoch": 0.46215438508695106, + "grad_norm": 12.1875, + "kl": 2.4767978191375732, + "learning_rate": 5e-06, + "logits/chosen": -63028083.2, + "logits/rejected": -107057557.33333333, + "logps/chosen": -471.79798177083336, + "logps/rejected": -783.8649631076389, + "loss": 0.0242, + "rewards/chosen": 8.118845113118489, + "rewards/margins": 22.72263709174262, + "rewards/rejected": -14.603791978624132, + "step": 1847 + }, + { + "epoch": 0.46240460402852496, + "grad_norm": 20.5, + "kl": 3.8112666606903076, + "learning_rate": 5e-06, + "logits/chosen": -23998481.454545453, + "logits/rejected": -36624743.384615384, + "logps/chosen": -471.82936789772725, + "logps/rejected": -597.1053936298077, + "loss": 0.0631, + "rewards/chosen": 6.276128595525568, + "rewards/margins": 21.98370916193182, + "rewards/rejected": -15.70758056640625, + "step": 1848 + }, + { + "epoch": 0.46265482297009886, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 143868.0, + "logits/rejected": -42434264.615384616, + "logps/chosen": -356.240966796875, + "logps/rejected": -528.7065054086538, + "loss": 0.0753, + "rewards/chosen": 6.244314713911577, + "rewards/margins": 23.212322902012538, + "rewards/rejected": -16.96800818810096, + "step": 1849 + }, + { + "epoch": 0.4629050419116727, + "grad_norm": 11.375, + "kl": 4.972392559051514, + "learning_rate": 5e-06, + "logits/chosen": -56734173.86666667, + "logits/rejected": -76248184.8888889, + "logps/chosen": -349.71363932291666, + "logps/rejected": -667.8041449652778, + "loss": 0.0627, + "rewards/chosen": 7.361461385091146, + "rewards/margins": 24.032373385959204, + "rewards/rejected": -16.670912000868057, + "step": 1850 + }, + { + "epoch": 0.4631552608532466, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29821978.666666668, + "logits/rejected": -29113394.666666668, + "logps/chosen": -413.2227376302083, + "logps/rejected": -732.7853190104166, + "loss": 0.0349, + "rewards/chosen": 6.3348337809244795, + "rewards/margins": 22.94544219970703, + "rewards/rejected": -16.61060841878255, + "step": 1851 + }, + { + "epoch": 0.46340547979482044, + "grad_norm": 14.9375, + "kl": 8.013051986694336, + "learning_rate": 5e-06, + "logits/chosen": -53378953.14285714, + "logits/rejected": -40966544.0, + "logps/chosen": -435.09486607142856, + "logps/rejected": -620.365869140625, + "loss": 0.0757, + "rewards/chosen": 7.492950439453125, + "rewards/margins": 19.725806427001952, + "rewards/rejected": -12.232855987548827, + "step": 1852 + }, + { + "epoch": 0.46365569873639434, + "grad_norm": 10.0625, + "kl": 2.2254798412323, + "learning_rate": 5e-06, + "logits/chosen": -63753856.0, + "logits/rejected": -41146998.4, + "logps/chosen": -391.5137416294643, + "logps/rejected": -473.37041015625, + "loss": 0.0417, + "rewards/chosen": 7.20820563180106, + "rewards/margins": 20.717333439418248, + "rewards/rejected": -13.509127807617187, + "step": 1853 + }, + { + "epoch": 0.46390591767796824, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56538990.54545455, + "logits/rejected": -34862993.23076923, + "logps/chosen": -447.75874467329544, + "logps/rejected": -457.05799278846155, + "loss": 0.0332, + "rewards/chosen": 8.099980441006748, + "rewards/margins": 22.02635561002718, + "rewards/rejected": -13.926375169020433, + "step": 1854 + }, + { + "epoch": 0.4641561366195421, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38590696.0, + "logits/rejected": -57440704.0, + "logps/chosen": -256.9342447916667, + "logps/rejected": -734.7691243489584, + "loss": 0.042, + "rewards/chosen": 5.344825744628906, + "rewards/margins": 21.600176493326824, + "rewards/rejected": -16.255350748697918, + "step": 1855 + }, + { + "epoch": 0.464406355561116, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72811689.14285715, + "logits/rejected": -45411296.0, + "logps/chosen": -425.64310128348217, + "logps/rejected": -619.601611328125, + "loss": 0.0276, + "rewards/chosen": 7.353938511439732, + "rewards/margins": 18.278093937465123, + "rewards/rejected": -10.92415542602539, + "step": 1856 + }, + { + "epoch": 0.4646565745026899, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -94020234.66666667, + "logits/rejected": -44557224.0, + "logps/chosen": -435.763916015625, + "logps/rejected": -684.8262532552084, + "loss": 0.0587, + "rewards/chosen": 8.143967946370443, + "rewards/margins": 21.18489201863607, + "rewards/rejected": -13.040924072265625, + "step": 1857 + }, + { + "epoch": 0.4649067934442637, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27914455.272727273, + "logits/rejected": -42928768.0, + "logps/chosen": -230.5587713068182, + "logps/rejected": -575.9972956730769, + "loss": 0.0404, + "rewards/chosen": 6.096482016823509, + "rewards/margins": 22.01834293178745, + "rewards/rejected": -15.921860914963942, + "step": 1858 + }, + { + "epoch": 0.4651570123858376, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57102912.0, + "logits/rejected": -64864760.0, + "logps/chosen": -341.9047546386719, + "logps/rejected": -598.0948486328125, + "loss": 0.0262, + "rewards/chosen": 8.634907722473145, + "rewards/margins": 23.6636381149292, + "rewards/rejected": -15.028730392456055, + "step": 1859 + }, + { + "epoch": 0.46540723132741146, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48335844.571428575, + "logits/rejected": -44650640.0, + "logps/chosen": -367.60630580357144, + "logps/rejected": -638.570654296875, + "loss": 0.0325, + "rewards/chosen": 7.441713605608259, + "rewards/margins": 23.476595197405132, + "rewards/rejected": -16.034881591796875, + "step": 1860 + }, + { + "epoch": 0.46565745026898536, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34463650.90909091, + "logits/rejected": -65186673.23076923, + "logps/chosen": -380.15354225852275, + "logps/rejected": -790.3853665865385, + "loss": 0.0351, + "rewards/chosen": 6.1895058371803975, + "rewards/margins": 18.395468971946023, + "rewards/rejected": -12.205963134765625, + "step": 1861 + }, + { + "epoch": 0.46590766921055926, + "grad_norm": 3.03125, + "kl": 4.996633052825928, + "learning_rate": 5e-06, + "logits/chosen": -52812416.0, + "logits/rejected": -33194109.333333332, + "logps/chosen": -372.7600911458333, + "logps/rejected": -539.4217122395834, + "loss": 0.0733, + "rewards/chosen": 6.45749028523763, + "rewards/margins": 16.92386245727539, + "rewards/rejected": -10.46637217203776, + "step": 1862 + }, + { + "epoch": 0.4661578881521331, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53601541.81818182, + "logits/rejected": -53796081.23076923, + "logps/chosen": -394.84810014204544, + "logps/rejected": -628.0455228365385, + "loss": 0.005, + "rewards/chosen": 8.558661027388139, + "rewards/margins": 20.497366471724078, + "rewards/rejected": -11.938705444335938, + "step": 1863 + }, + { + "epoch": 0.466408107093707, + "grad_norm": 13.625, + "kl": 3.878955841064453, + "learning_rate": 5e-06, + "logits/chosen": -57396326.4, + "logits/rejected": -59677105.777777776, + "logps/chosen": -344.63603515625, + "logps/rejected": -523.7227647569445, + "loss": 0.068, + "rewards/chosen": 7.1005126953125, + "rewards/margins": 18.26181165907118, + "rewards/rejected": -11.16129896375868, + "step": 1864 + }, + { + "epoch": 0.46665832603528085, + "grad_norm": 10.0, + "kl": 6.232662677764893, + "learning_rate": 5e-06, + "logits/chosen": -59377280.0, + "logits/rejected": -14085264.0, + "logps/chosen": -408.01468599759613, + "logps/rejected": -692.0333806818181, + "loss": 0.0541, + "rewards/chosen": 6.857640193058894, + "rewards/margins": 18.85916191047722, + "rewards/rejected": -12.001521717418324, + "step": 1865 + }, + { + "epoch": 0.46690854497685474, + "grad_norm": 14.9375, + "kl": 2.0003116130828857, + "learning_rate": 5e-06, + "logits/chosen": -41547254.85714286, + "logits/rejected": -31813203.2, + "logps/chosen": -468.09915597098217, + "logps/rejected": -420.550048828125, + "loss": 0.0377, + "rewards/chosen": 6.655948093959263, + "rewards/margins": 13.600762394496371, + "rewards/rejected": -6.944814300537109, + "step": 1866 + }, + { + "epoch": 0.46715876391842864, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37455090.666666664, + "logits/rejected": -29891192.888888888, + "logps/chosen": -406.0683186848958, + "logps/rejected": -695.1059027777778, + "loss": 0.0578, + "rewards/chosen": 5.127570470174153, + "rewards/margins": 17.73722775777181, + "rewards/rejected": -12.609657287597656, + "step": 1867 + }, + { + "epoch": 0.4674089828600025, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -84758952.0, + "logits/rejected": -48354772.0, + "logps/chosen": -250.43667602539062, + "logps/rejected": -575.507568359375, + "loss": 0.0373, + "rewards/chosen": 6.526072025299072, + "rewards/margins": 19.323596477508545, + "rewards/rejected": -12.797524452209473, + "step": 1868 + }, + { + "epoch": 0.4676592018015764, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44745929.14285714, + "logits/rejected": -40820201.4117647, + "logps/chosen": -338.78086635044644, + "logps/rejected": -567.5758846507352, + "loss": 0.0272, + "rewards/chosen": 4.962835039411273, + "rewards/margins": 17.51401930095769, + "rewards/rejected": -12.551184261546416, + "step": 1869 + }, + { + "epoch": 0.4679094207431503, + "grad_norm": 5.84375, + "kl": 1.981398344039917, + "learning_rate": 5e-06, + "logits/chosen": -25068525.714285713, + "logits/rejected": -62433164.8, + "logps/chosen": -508.60477120535717, + "logps/rejected": -596.7794921875, + "loss": 0.049, + "rewards/chosen": 7.745635986328125, + "rewards/margins": 21.255525207519533, + "rewards/rejected": -13.509889221191406, + "step": 1870 + }, + { + "epoch": 0.4681596396847241, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47142089.84615385, + "logits/rejected": -44194301.09090909, + "logps/chosen": -412.22543569711536, + "logps/rejected": -575.4862393465909, + "loss": 0.0244, + "rewards/chosen": 8.527489295372597, + "rewards/margins": 20.028929170195042, + "rewards/rejected": -11.501439874822443, + "step": 1871 + }, + { + "epoch": 0.468409858626298, + "grad_norm": 5.53125, + "kl": 2.188936948776245, + "learning_rate": 5e-06, + "logits/chosen": -27342520.0, + "logits/rejected": -44271114.666666664, + "logps/chosen": -328.7531331380208, + "logps/rejected": -629.1684977213541, + "loss": 0.0848, + "rewards/chosen": 6.353212992350261, + "rewards/margins": 18.740184783935547, + "rewards/rejected": -12.386971791585287, + "step": 1872 + }, + { + "epoch": 0.46866007756787187, + "grad_norm": 4.125, + "kl": 2.983301877975464, + "learning_rate": 5e-06, + "logits/chosen": -54742592.0, + "logits/rejected": -32201208.888888888, + "logps/chosen": -280.91328125, + "logps/rejected": -617.9126519097222, + "loss": 0.0137, + "rewards/chosen": 7.596306355794271, + "rewards/margins": 21.487479824490016, + "rewards/rejected": -13.891173468695747, + "step": 1873 + }, + { + "epoch": 0.46891029650944577, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31042284.8, + "logits/rejected": -43856484.571428575, + "logps/chosen": -247.82734375, + "logps/rejected": -454.7202845982143, + "loss": 0.0427, + "rewards/chosen": 5.980234527587891, + "rewards/margins": 16.585199737548827, + "rewards/rejected": -10.604965209960938, + "step": 1874 + }, + { + "epoch": 0.46916051545101967, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38059214.222222224, + "logits/rejected": 5302540.8, + "logps/chosen": -288.4533962673611, + "logps/rejected": -601.8875651041667, + "loss": 0.0418, + "rewards/chosen": 6.521678924560547, + "rewards/margins": 17.7122927347819, + "rewards/rejected": -11.190613810221354, + "step": 1875 + }, + { + "epoch": 0.4694107343925935, + "grad_norm": 14.5, + "kl": 4.526261806488037, + "learning_rate": 5e-06, + "logits/chosen": -60578013.09090909, + "logits/rejected": -12723532.307692308, + "logps/chosen": -361.40431906960225, + "logps/rejected": -453.7809495192308, + "loss": 0.0452, + "rewards/chosen": 7.557790582830256, + "rewards/margins": 15.759252961698946, + "rewards/rejected": -8.20146237886869, + "step": 1876 + }, + { + "epoch": 0.4696609533341674, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40500568.0, + "logits/rejected": -2112438.5, + "logps/chosen": -408.1853332519531, + "logps/rejected": -595.2637939453125, + "loss": 0.0512, + "rewards/chosen": 7.1687774658203125, + "rewards/margins": 19.724483489990234, + "rewards/rejected": -12.555706024169922, + "step": 1877 + }, + { + "epoch": 0.46991117227574125, + "grad_norm": 26.375, + "kl": 4.4974212646484375, + "learning_rate": 5e-06, + "logits/chosen": -54849408.0, + "logits/rejected": -43774193.23076923, + "logps/chosen": -391.17116477272725, + "logps/rejected": -587.0826697716346, + "loss": 0.0498, + "rewards/chosen": 8.184157631613992, + "rewards/margins": 21.01902936388563, + "rewards/rejected": -12.834871732271635, + "step": 1878 + }, + { + "epoch": 0.47016139121731515, + "grad_norm": 7.96875, + "kl": 4.693212032318115, + "learning_rate": 5e-06, + "logits/chosen": -56772992.0, + "logits/rejected": -60292502.85714286, + "logps/chosen": -472.859521484375, + "logps/rejected": -686.2666015625, + "loss": 0.0282, + "rewards/chosen": 10.363054656982422, + "rewards/margins": 23.50813740321568, + "rewards/rejected": -13.145082746233259, + "step": 1879 + }, + { + "epoch": 0.47041161015888905, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49364411.733333334, + "logits/rejected": -77578567.1111111, + "logps/chosen": -355.53063151041664, + "logps/rejected": -746.5803493923611, + "loss": 0.0287, + "rewards/chosen": 8.154911295572917, + "rewards/margins": 26.00656534830729, + "rewards/rejected": -17.851654052734375, + "step": 1880 + }, + { + "epoch": 0.4706618291004629, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73884384.0, + "logits/rejected": -22105738.0, + "logps/chosen": -463.17236328125, + "logps/rejected": -565.255859375, + "loss": 0.0494, + "rewards/chosen": 9.413679122924805, + "rewards/margins": 21.580289840698242, + "rewards/rejected": -12.166610717773438, + "step": 1881 + }, + { + "epoch": 0.4709120480420368, + "grad_norm": 16.75, + "kl": 1.2115046977996826, + "learning_rate": 5e-06, + "logits/chosen": -55446464.0, + "logits/rejected": -32496893.714285713, + "logps/chosen": -374.7081787109375, + "logps/rejected": -509.57986886160717, + "loss": 0.0249, + "rewards/chosen": 6.915530395507813, + "rewards/margins": 16.933541216169086, + "rewards/rejected": -10.018010820661273, + "step": 1882 + }, + { + "epoch": 0.47116226698361063, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36450931.692307696, + "logits/rejected": -37335895.27272727, + "logps/chosen": -320.76370943509613, + "logps/rejected": -441.98561789772725, + "loss": 0.0274, + "rewards/chosen": 6.821241525503305, + "rewards/margins": 16.461928254240874, + "rewards/rejected": -9.64068672873757, + "step": 1883 + }, + { + "epoch": 0.47141248592518453, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45219225.6, + "logits/rejected": -49079730.28571428, + "logps/chosen": -255.88115234375, + "logps/rejected": -709.9956752232143, + "loss": 0.0333, + "rewards/chosen": 5.742989730834961, + "rewards/margins": 21.973569652012415, + "rewards/rejected": -16.230579921177455, + "step": 1884 + }, + { + "epoch": 0.47166270486675843, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28300208.0, + "logits/rejected": -48176540.44444445, + "logps/chosen": -422.6548665364583, + "logps/rejected": -536.7003038194445, + "loss": 0.0066, + "rewards/chosen": 9.325508117675781, + "rewards/margins": 23.113004048665367, + "rewards/rejected": -13.787495930989584, + "step": 1885 + }, + { + "epoch": 0.4719129238083323, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46124534.15384615, + "logits/rejected": -43310941.09090909, + "logps/chosen": -380.56456580528845, + "logps/rejected": -496.02956321022725, + "loss": 0.0626, + "rewards/chosen": 6.922862126277043, + "rewards/margins": 18.361029351507867, + "rewards/rejected": -11.438167225230824, + "step": 1886 + }, + { + "epoch": 0.4721631427499062, + "grad_norm": 6.09375, + "kl": 2.2335257530212402, + "learning_rate": 5e-06, + "logits/chosen": -32178237.714285713, + "logits/rejected": -47386048.0, + "logps/chosen": -407.6123046875, + "logps/rejected": -578.853466796875, + "loss": 0.0341, + "rewards/chosen": 7.5116473606654575, + "rewards/margins": 18.918174307686943, + "rewards/rejected": -11.406526947021485, + "step": 1887 + }, + { + "epoch": 0.47241336169148007, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59809984.0, + "logits/rejected": -48518595.2, + "logps/chosen": -290.95765904017856, + "logps/rejected": -556.815576171875, + "loss": 0.0624, + "rewards/chosen": 5.166106632777622, + "rewards/margins": 18.504383305140905, + "rewards/rejected": -13.338276672363282, + "step": 1888 + }, + { + "epoch": 0.4726635806330539, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7047771.636363637, + "logits/rejected": -35451377.23076923, + "logps/chosen": -515.8789950284091, + "logps/rejected": -563.4643179086538, + "loss": 0.0227, + "rewards/chosen": 8.004874489524148, + "rewards/margins": 20.31007844084626, + "rewards/rejected": -12.305203951322115, + "step": 1889 + }, + { + "epoch": 0.4729137995746278, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51887072.0, + "logits/rejected": -28549458.285714287, + "logps/chosen": -453.22509765625, + "logps/rejected": -713.9408482142857, + "loss": 0.0455, + "rewards/chosen": 9.762258148193359, + "rewards/margins": 22.559568023681642, + "rewards/rejected": -12.797309875488281, + "step": 1890 + }, + { + "epoch": 0.47316401851620166, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12954548.363636363, + "logits/rejected": -30745540.923076924, + "logps/chosen": -402.88725142045456, + "logps/rejected": -431.87289663461536, + "loss": 0.037, + "rewards/chosen": 6.04262958873402, + "rewards/margins": 16.06061265852068, + "rewards/rejected": -10.01798306978666, + "step": 1891 + }, + { + "epoch": 0.47341423745777556, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36608760.88888889, + "logits/rejected": -33135014.4, + "logps/chosen": -548.8203125, + "logps/rejected": -496.35914713541666, + "loss": 0.0943, + "rewards/chosen": 6.881097581651476, + "rewards/margins": 18.23773210313585, + "rewards/rejected": -11.356634521484375, + "step": 1892 + }, + { + "epoch": 0.47366445639934945, + "grad_norm": 3.96875, + "kl": 2.8082737922668457, + "learning_rate": 5e-06, + "logits/chosen": -57141832.53333333, + "logits/rejected": -31109738.666666668, + "logps/chosen": -393.179296875, + "logps/rejected": -535.7318250868055, + "loss": 0.0214, + "rewards/chosen": 7.1835683186848955, + "rewards/margins": 17.892664591471355, + "rewards/rejected": -10.709096272786459, + "step": 1893 + }, + { + "epoch": 0.4739146753409233, + "grad_norm": 11.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52637640.0, + "logits/rejected": -40420796.0, + "logps/chosen": -436.98614501953125, + "logps/rejected": -508.92376708984375, + "loss": 0.0351, + "rewards/chosen": 9.695694923400879, + "rewards/margins": 20.36480140686035, + "rewards/rejected": -10.669106483459473, + "step": 1894 + }, + { + "epoch": 0.4741648942824972, + "grad_norm": 11.875, + "kl": 6.959571361541748, + "learning_rate": 5e-06, + "logits/chosen": -43281910.85714286, + "logits/rejected": -77116787.2, + "logps/chosen": -422.927490234375, + "logps/rejected": -582.909228515625, + "loss": 0.0391, + "rewards/chosen": 8.081659589494977, + "rewards/margins": 23.404788861955915, + "rewards/rejected": -15.323129272460937, + "step": 1895 + }, + { + "epoch": 0.47441511322407104, + "grad_norm": 10.75, + "kl": 2.177305221557617, + "learning_rate": 5e-06, + "logits/chosen": -40252228.266666666, + "logits/rejected": -53974318.222222224, + "logps/chosen": -380.80462239583335, + "logps/rejected": -474.67540147569446, + "loss": 0.0515, + "rewards/chosen": 5.971632893880209, + "rewards/margins": 16.391041056315103, + "rewards/rejected": -10.419408162434896, + "step": 1896 + }, + { + "epoch": 0.47466533216564494, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72755219.6923077, + "logits/rejected": -53077538.90909091, + "logps/chosen": -424.85873647836536, + "logps/rejected": -617.0567294034091, + "loss": 0.0568, + "rewards/chosen": 6.177848229041467, + "rewards/margins": 19.813805426750985, + "rewards/rejected": -13.635957197709518, + "step": 1897 + }, + { + "epoch": 0.47491555110721884, + "grad_norm": 17.375, + "kl": 18.653339385986328, + "learning_rate": 5e-06, + "logits/chosen": -42761600.0, + "logits/rejected": -22444636.444444444, + "logps/chosen": -318.1982747395833, + "logps/rejected": -753.7469618055555, + "loss": 0.1651, + "rewards/chosen": 6.748421732584635, + "rewards/margins": 20.402789815266928, + "rewards/rejected": -13.654368082682291, + "step": 1898 + }, + { + "epoch": 0.4751657700487927, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73855634.28571428, + "logits/rejected": -37419958.4, + "logps/chosen": -394.91148158482144, + "logps/rejected": -360.6666748046875, + "loss": 0.0334, + "rewards/chosen": 6.17314202444894, + "rewards/margins": 17.077735682896204, + "rewards/rejected": -10.904593658447265, + "step": 1899 + }, + { + "epoch": 0.4754159889903666, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -91424755.2, + "logits/rejected": -48867734.85714286, + "logps/chosen": -462.37705078125, + "logps/rejected": -773.2659040178571, + "loss": 0.0199, + "rewards/chosen": 8.088202667236327, + "rewards/margins": 25.200553131103515, + "rewards/rejected": -17.112350463867188, + "step": 1900 + }, + { + "epoch": 0.4756662079319404, + "grad_norm": 7.4375, + "kl": 1.6007335186004639, + "learning_rate": 5e-06, + "logits/chosen": -47928109.71428572, + "logits/rejected": -50502979.2, + "logps/chosen": -445.2078334263393, + "logps/rejected": -491.758447265625, + "loss": 0.0278, + "rewards/chosen": 8.313377380371094, + "rewards/margins": 18.96755142211914, + "rewards/rejected": -10.654174041748046, + "step": 1901 + }, + { + "epoch": 0.4759164268735143, + "grad_norm": 13.5625, + "kl": 11.83216667175293, + "learning_rate": 5e-06, + "logits/chosen": -43428271.15789474, + "logits/rejected": -113903936.0, + "logps/chosen": -448.0248252467105, + "logps/rejected": -848.2865234375, + "loss": 0.0647, + "rewards/chosen": 8.114122892680921, + "rewards/margins": 28.32719357139186, + "rewards/rejected": -20.213070678710938, + "step": 1902 + }, + { + "epoch": 0.4761666458150882, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88298419.2, + "logits/rejected": -45715977.14285714, + "logps/chosen": -356.9390625, + "logps/rejected": -617.0769391741071, + "loss": 0.0544, + "rewards/chosen": 7.975172424316407, + "rewards/margins": 21.806593322753905, + "rewards/rejected": -13.8314208984375, + "step": 1903 + }, + { + "epoch": 0.47641686475666206, + "grad_norm": 12.5, + "kl": 5.7578444480896, + "learning_rate": 5e-06, + "logits/chosen": -45758160.84210526, + "logits/rejected": -69828940.8, + "logps/chosen": -365.9189967105263, + "logps/rejected": -615.685595703125, + "loss": 0.1247, + "rewards/chosen": 6.058263678299753, + "rewards/margins": 21.061179632889598, + "rewards/rejected": -15.002915954589843, + "step": 1904 + }, + { + "epoch": 0.47666708369823596, + "grad_norm": 10.5625, + "kl": 0.9122282862663269, + "learning_rate": 5e-06, + "logits/chosen": -55956032.0, + "logits/rejected": -26714816.0, + "logps/chosen": -449.4697265625, + "logps/rejected": -427.7788837139423, + "loss": 0.0222, + "rewards/chosen": 7.670865145596591, + "rewards/margins": 17.5321463204764, + "rewards/rejected": -9.861281174879808, + "step": 1905 + }, + { + "epoch": 0.47691730263980986, + "grad_norm": 7.4375, + "kl": 2.0044784545898438, + "learning_rate": 5e-06, + "logits/chosen": -58131189.333333336, + "logits/rejected": -44498549.333333336, + "logps/chosen": -353.2483723958333, + "logps/rejected": -664.9111735026041, + "loss": 0.0781, + "rewards/chosen": 6.507879892985026, + "rewards/margins": 25.566814422607422, + "rewards/rejected": -19.058934529622395, + "step": 1906 + }, + { + "epoch": 0.4771675215813837, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62429032.0, + "logits/rejected": -59947692.0, + "logps/chosen": -297.3627624511719, + "logps/rejected": -789.9147338867188, + "loss": 0.0745, + "rewards/chosen": 6.011932849884033, + "rewards/margins": 22.72270441055298, + "rewards/rejected": -16.710771560668945, + "step": 1907 + }, + { + "epoch": 0.4774177405229576, + "grad_norm": 9.375, + "kl": 6.982539176940918, + "learning_rate": 5e-06, + "logits/chosen": -48386732.307692304, + "logits/rejected": -75093789.0909091, + "logps/chosen": -390.5595703125, + "logps/rejected": -592.1262428977273, + "loss": 0.0503, + "rewards/chosen": 7.237253042367788, + "rewards/margins": 19.164231413727876, + "rewards/rejected": -11.926978371360086, + "step": 1908 + }, + { + "epoch": 0.47766795946453144, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -130208654.22222222, + "logits/rejected": -48411485.86666667, + "logps/chosen": -421.73076714409723, + "logps/rejected": -586.3192057291667, + "loss": 0.0604, + "rewards/chosen": 6.790916866726345, + "rewards/margins": 15.566306474473741, + "rewards/rejected": -8.775389607747396, + "step": 1909 + }, + { + "epoch": 0.47791817840610534, + "grad_norm": 20.625, + "kl": 21.40979766845703, + "learning_rate": 5e-06, + "logits/chosen": -45132957.86666667, + "logits/rejected": -54183658.666666664, + "logps/chosen": -405.50390625, + "logps/rejected": -939.6733940972222, + "loss": 0.0741, + "rewards/chosen": 9.526466878255208, + "rewards/margins": 26.72743191189236, + "rewards/rejected": -17.200965033637154, + "step": 1910 + }, + { + "epoch": 0.47816839734767924, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46105760.0, + "logits/rejected": -64403349.333333336, + "logps/chosen": -359.3991292317708, + "logps/rejected": -711.50830078125, + "loss": 0.0376, + "rewards/chosen": 6.146434783935547, + "rewards/margins": 22.985132853190105, + "rewards/rejected": -16.83869806925456, + "step": 1911 + }, + { + "epoch": 0.4784186162892531, + "grad_norm": 2.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10282146.0, + "logits/rejected": -39385432.0, + "logps/chosen": -229.341552734375, + "logps/rejected": -561.3908081054688, + "loss": 0.0374, + "rewards/chosen": 6.122142791748047, + "rewards/margins": 19.519729614257812, + "rewards/rejected": -13.397586822509766, + "step": 1912 + }, + { + "epoch": 0.478668835230827, + "grad_norm": 10.8125, + "kl": 0.15233168005943298, + "learning_rate": 5e-06, + "logits/chosen": -38517292.8, + "logits/rejected": -32517408.0, + "logps/chosen": -593.337158203125, + "logps/rejected": -496.22506277901783, + "loss": 0.1167, + "rewards/chosen": 6.516319274902344, + "rewards/margins": 17.675260271344868, + "rewards/rejected": -11.158940996442523, + "step": 1913 + }, + { + "epoch": 0.4789190541724008, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49710976.0, + "logits/rejected": -45232374.15384615, + "logps/chosen": -313.15525124289775, + "logps/rejected": -847.2007962740385, + "loss": 0.0566, + "rewards/chosen": 5.561199881813743, + "rewards/margins": 25.460400321266867, + "rewards/rejected": -19.899200439453125, + "step": 1914 + }, + { + "epoch": 0.4791692731139747, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27284509.09090909, + "logits/rejected": -36924512.0, + "logps/chosen": -373.5554865056818, + "logps/rejected": -441.73568960336536, + "loss": 0.0395, + "rewards/chosen": 7.741558421741832, + "rewards/margins": 19.427527847823562, + "rewards/rejected": -11.68596942608173, + "step": 1915 + }, + { + "epoch": 0.4794194920555486, + "grad_norm": 11.75, + "kl": 14.188946723937988, + "learning_rate": 5e-06, + "logits/chosen": -61077848.0, + "logits/rejected": -46737404.0, + "logps/chosen": -467.231689453125, + "logps/rejected": -599.38818359375, + "loss": 0.0728, + "rewards/chosen": 7.95274019241333, + "rewards/margins": 22.40977907180786, + "rewards/rejected": -14.457038879394531, + "step": 1916 + }, + { + "epoch": 0.47966971099712247, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60779982.222222224, + "logits/rejected": -35502632.53333333, + "logps/chosen": -519.5782877604166, + "logps/rejected": -544.8016927083333, + "loss": 0.0119, + "rewards/chosen": 9.962235344780815, + "rewards/margins": 22.010817294650607, + "rewards/rejected": -12.048581949869792, + "step": 1917 + }, + { + "epoch": 0.47991992993869637, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57390165.333333336, + "logits/rejected": -41360977.06666667, + "logps/chosen": -362.00208875868054, + "logps/rejected": -663.4918619791666, + "loss": 0.0588, + "rewards/chosen": 6.329730987548828, + "rewards/margins": 23.30272699991862, + "rewards/rejected": -16.97299601236979, + "step": 1918 + }, + { + "epoch": 0.4801701488802702, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58262745.6, + "logits/rejected": -53890171.428571425, + "logps/chosen": -216.6924560546875, + "logps/rejected": -697.7385602678571, + "loss": 0.0723, + "rewards/chosen": 4.11395263671875, + "rewards/margins": 17.292550223214285, + "rewards/rejected": -13.178597586495536, + "step": 1919 + }, + { + "epoch": 0.4804203678218441, + "grad_norm": 13.375, + "kl": 1.8547430038452148, + "learning_rate": 5e-06, + "logits/chosen": -26991511.272727273, + "logits/rejected": -39955524.92307692, + "logps/chosen": -337.51376065340907, + "logps/rejected": -720.0249399038462, + "loss": 0.053, + "rewards/chosen": 5.6490395285866475, + "rewards/margins": 21.79117397495083, + "rewards/rejected": -16.14213444636418, + "step": 1920 + }, + { + "epoch": 0.480670586763418, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25023563.636363637, + "logits/rejected": -48466638.76923077, + "logps/chosen": -238.816650390625, + "logps/rejected": -501.7196514423077, + "loss": 0.052, + "rewards/chosen": 5.699712579900568, + "rewards/margins": 17.415908493362107, + "rewards/rejected": -11.716195913461538, + "step": 1921 + }, + { + "epoch": 0.48092080570499185, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42263622.4, + "logits/rejected": -74208214.85714285, + "logps/chosen": -415.3212890625, + "logps/rejected": -669.0244140625, + "loss": 0.0232, + "rewards/chosen": 7.46143798828125, + "rewards/margins": 23.398624093191962, + "rewards/rejected": -15.937186104910714, + "step": 1922 + }, + { + "epoch": 0.48117102464656575, + "grad_norm": 13.5, + "kl": 6.482516288757324, + "learning_rate": 5e-06, + "logits/chosen": -69249092.26666667, + "logits/rejected": -44832960.0, + "logps/chosen": -429.90647786458334, + "logps/rejected": -549.9962565104166, + "loss": 0.0255, + "rewards/chosen": 8.572227986653646, + "rewards/margins": 17.960567559136287, + "rewards/rejected": -9.38833957248264, + "step": 1923 + }, + { + "epoch": 0.48142124358813965, + "grad_norm": 12.8125, + "kl": 23.512584686279297, + "learning_rate": 5e-06, + "logits/chosen": -56614925.71428572, + "logits/rejected": -76465100.8, + "logps/chosen": -505.8662806919643, + "logps/rejected": -481.674072265625, + "loss": 0.1131, + "rewards/chosen": 8.634522574288505, + "rewards/margins": 19.620542471749445, + "rewards/rejected": -10.986019897460938, + "step": 1924 + }, + { + "epoch": 0.4816714625297135, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43761178.666666664, + "logits/rejected": -35434016.0, + "logps/chosen": -325.2525634765625, + "logps/rejected": -597.0359700520834, + "loss": 0.0201, + "rewards/chosen": 6.605962753295898, + "rewards/margins": 19.074846267700195, + "rewards/rejected": -12.468883514404297, + "step": 1925 + }, + { + "epoch": 0.4819216814712874, + "grad_norm": 5.4375, + "kl": 11.590279579162598, + "learning_rate": 5e-06, + "logits/chosen": -81882971.42857143, + "logits/rejected": -62991494.4, + "logps/chosen": -440.77469308035717, + "logps/rejected": -564.36279296875, + "loss": 0.041, + "rewards/chosen": 8.702153887067523, + "rewards/margins": 19.08459025791713, + "rewards/rejected": -10.382436370849609, + "step": 1926 + }, + { + "epoch": 0.48217190041286123, + "grad_norm": 11.0625, + "kl": 3.052659749984741, + "learning_rate": 5e-06, + "logits/chosen": -65402677.333333336, + "logits/rejected": -70461210.66666667, + "logps/chosen": -457.2631022135417, + "logps/rejected": -650.5652669270834, + "loss": 0.0342, + "rewards/chosen": 7.065879185994466, + "rewards/margins": 20.82480812072754, + "rewards/rejected": -13.758928934733072, + "step": 1927 + }, + { + "epoch": 0.48242211935443513, + "grad_norm": 13.9375, + "kl": 4.076589107513428, + "learning_rate": 5e-06, + "logits/chosen": -70541120.0, + "logits/rejected": -81504716.8, + "logps/chosen": -409.54227120535717, + "logps/rejected": -619.988818359375, + "loss": 0.0663, + "rewards/chosen": 6.9412705557686945, + "rewards/margins": 18.212137712751115, + "rewards/rejected": -11.270867156982423, + "step": 1928 + }, + { + "epoch": 0.48267233829600903, + "grad_norm": 10.25, + "kl": 0.19109058380126953, + "learning_rate": 5e-06, + "logits/chosen": -74265878.85714285, + "logits/rejected": -42046764.8, + "logps/chosen": -437.51778738839283, + "logps/rejected": -462.47861328125, + "loss": 0.0436, + "rewards/chosen": 7.423160552978516, + "rewards/margins": 16.733378601074218, + "rewards/rejected": -9.310218048095702, + "step": 1929 + }, + { + "epoch": 0.4829225572375829, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61009093.81818182, + "logits/rejected": -52788814.76923077, + "logps/chosen": -591.8905806107955, + "logps/rejected": -755.0114182692307, + "loss": 0.0513, + "rewards/chosen": 8.715604608709162, + "rewards/margins": 21.440873632897862, + "rewards/rejected": -12.725269024188702, + "step": 1930 + }, + { + "epoch": 0.48317277617915677, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33286280.727272727, + "logits/rejected": -64459854.76923077, + "logps/chosen": -426.6640625, + "logps/rejected": -761.5040564903846, + "loss": 0.0265, + "rewards/chosen": 7.638193303888494, + "rewards/margins": 24.445725981172146, + "rewards/rejected": -16.807532677283653, + "step": 1931 + }, + { + "epoch": 0.4834229951207306, + "grad_norm": 13.375, + "kl": 11.200294494628906, + "learning_rate": 5e-06, + "logits/chosen": -48218752.0, + "logits/rejected": -51606330.18181818, + "logps/chosen": -480.3818359375, + "logps/rejected": -669.4682173295455, + "loss": 0.0557, + "rewards/chosen": 8.839434697077824, + "rewards/margins": 22.166814790739046, + "rewards/rejected": -13.32738009366122, + "step": 1932 + }, + { + "epoch": 0.4836732140623045, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25823662.4, + "logits/rejected": -57090230.85714286, + "logps/chosen": -273.687939453125, + "logps/rejected": -592.97265625, + "loss": 0.0456, + "rewards/chosen": 6.709458923339843, + "rewards/margins": 20.446639360700335, + "rewards/rejected": -13.737180437360491, + "step": 1933 + }, + { + "epoch": 0.4839234330038784, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56556868.92307692, + "logits/rejected": -39687138.90909091, + "logps/chosen": -465.2841045673077, + "logps/rejected": -508.5901988636364, + "loss": 0.0144, + "rewards/chosen": 9.207007774939903, + "rewards/margins": 21.131935919915044, + "rewards/rejected": -11.924928144975143, + "step": 1934 + }, + { + "epoch": 0.48417365194545225, + "grad_norm": 5.9375, + "kl": 8.011520385742188, + "learning_rate": 5e-06, + "logits/chosen": -48447872.0, + "logits/rejected": -43923984.0, + "logps/chosen": -401.1868489583333, + "logps/rejected": -565.0577799479166, + "loss": 0.0684, + "rewards/chosen": 9.283824920654297, + "rewards/margins": 22.88791275024414, + "rewards/rejected": -13.604087829589844, + "step": 1935 + }, + { + "epoch": 0.48442387088702615, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48362850.90909091, + "logits/rejected": -81754210.46153846, + "logps/chosen": -409.84033203125, + "logps/rejected": -850.5164513221154, + "loss": 0.03, + "rewards/chosen": 8.232659773393111, + "rewards/margins": 26.631907162966428, + "rewards/rejected": -18.39924738957332, + "step": 1936 + }, + { + "epoch": 0.48467408982860005, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34768073.84615385, + "logits/rejected": -44530725.81818182, + "logps/chosen": -314.6162672776442, + "logps/rejected": -560.1938920454545, + "loss": 0.0829, + "rewards/chosen": 5.6413726806640625, + "rewards/margins": 16.828032753684305, + "rewards/rejected": -11.186660073020242, + "step": 1937 + }, + { + "epoch": 0.4849243087701739, + "grad_norm": 13.5, + "kl": 3.664531707763672, + "learning_rate": 5e-06, + "logits/chosen": -29624058.181818184, + "logits/rejected": -32789031.384615384, + "logps/chosen": -472.16938920454544, + "logps/rejected": -403.0988957331731, + "loss": 0.0436, + "rewards/chosen": 9.345515858043324, + "rewards/margins": 17.692828198412915, + "rewards/rejected": -8.34731234036959, + "step": 1938 + }, + { + "epoch": 0.4851745277117478, + "grad_norm": 8.6875, + "kl": 10.540792465209961, + "learning_rate": 5e-06, + "logits/chosen": -61500834.461538464, + "logits/rejected": -38500794.18181818, + "logps/chosen": -353.49752103365387, + "logps/rejected": -520.1374289772727, + "loss": 0.1117, + "rewards/chosen": 6.553978553185096, + "rewards/margins": 19.465388798213503, + "rewards/rejected": -12.911410245028408, + "step": 1939 + }, + { + "epoch": 0.48542474665332164, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39404688.0, + "logits/rejected": -45555992.0, + "logps/chosen": -258.1829528808594, + "logps/rejected": -603.4166870117188, + "loss": 0.0594, + "rewards/chosen": 5.2762451171875, + "rewards/margins": 18.531664848327637, + "rewards/rejected": -13.255419731140137, + "step": 1940 + }, + { + "epoch": 0.48567496559489554, + "grad_norm": 2.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44179373.333333336, + "logits/rejected": -59915802.666666664, + "logps/chosen": -283.96409098307294, + "logps/rejected": -467.8128255208333, + "loss": 0.023, + "rewards/chosen": 6.528441747029622, + "rewards/margins": 17.747181574503582, + "rewards/rejected": -11.218739827473959, + "step": 1941 + }, + { + "epoch": 0.48592518453646943, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63616739.55555555, + "logits/rejected": -43949721.6, + "logps/chosen": -373.0949978298611, + "logps/rejected": -636.1695963541666, + "loss": 0.0255, + "rewards/chosen": 7.699903700086805, + "rewards/margins": 21.558041720920137, + "rewards/rejected": -13.858138020833334, + "step": 1942 + }, + { + "epoch": 0.4861754034780433, + "grad_norm": 1.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56281728.0, + "logits/rejected": -75405026.46153846, + "logps/chosen": -366.9915660511364, + "logps/rejected": -641.7111628605769, + "loss": 0.0184, + "rewards/chosen": 6.836582530628551, + "rewards/margins": 23.438864541220497, + "rewards/rejected": -16.602282010591946, + "step": 1943 + }, + { + "epoch": 0.4864256224196172, + "grad_norm": 5.84375, + "kl": 6.260614395141602, + "learning_rate": 5e-06, + "logits/chosen": -87920914.28571428, + "logits/rejected": -46432240.0, + "logps/chosen": -464.88804408482144, + "logps/rejected": -541.467724609375, + "loss": 0.0225, + "rewards/chosen": 8.071257999965123, + "rewards/margins": 19.90644018990653, + "rewards/rejected": -11.835182189941406, + "step": 1944 + }, + { + "epoch": 0.486675841361191, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56832442.666666664, + "logits/rejected": -58913509.333333336, + "logps/chosen": -320.0589192708333, + "logps/rejected": -650.1945393880209, + "loss": 0.0383, + "rewards/chosen": 5.34004275004069, + "rewards/margins": 17.321073532104492, + "rewards/rejected": -11.981030782063803, + "step": 1945 + }, + { + "epoch": 0.4869260603027649, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66299306.666666664, + "logits/rejected": -54337945.6, + "logps/chosen": -361.333984375, + "logps/rejected": -777.0619140625, + "loss": 0.047, + "rewards/chosen": 6.708567725287543, + "rewards/margins": 22.973034074571398, + "rewards/rejected": -16.264466349283854, + "step": 1946 + }, + { + "epoch": 0.4871762792443388, + "grad_norm": 18.75, + "kl": 4.923226833343506, + "learning_rate": 5e-06, + "logits/chosen": -37663815.11111111, + "logits/rejected": -41341252.266666666, + "logps/chosen": -500.55805121527777, + "logps/rejected": -843.296875, + "loss": 0.0495, + "rewards/chosen": 9.026662190755209, + "rewards/margins": 28.73782755533854, + "rewards/rejected": -19.711165364583334, + "step": 1947 + }, + { + "epoch": 0.48742649818591266, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34642035.2, + "logits/rejected": 14962974.857142856, + "logps/chosen": -329.342529296875, + "logps/rejected": -708.0181361607143, + "loss": 0.056, + "rewards/chosen": 7.9686744689941404, + "rewards/margins": 20.091521889822822, + "rewards/rejected": -12.122847420828682, + "step": 1948 + }, + { + "epoch": 0.48767671712748656, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59433674.666666664, + "logits/rejected": -60323498.666666664, + "logps/chosen": -375.8107096354167, + "logps/rejected": -669.0751953125, + "loss": 0.0325, + "rewards/chosen": 5.9443003336588545, + "rewards/margins": 25.546641031901043, + "rewards/rejected": -19.602340698242188, + "step": 1949 + }, + { + "epoch": 0.4879269360690604, + "grad_norm": 7.03125, + "kl": 7.669111251831055, + "learning_rate": 5e-06, + "logits/chosen": -63687348.0, + "logits/rejected": -67990032.0, + "logps/chosen": -385.15252685546875, + "logps/rejected": -499.18048095703125, + "loss": 0.0626, + "rewards/chosen": 7.924552917480469, + "rewards/margins": 18.80202293395996, + "rewards/rejected": -10.877470016479492, + "step": 1950 + }, + { + "epoch": 0.4881771550106343, + "grad_norm": 14.6875, + "kl": 8.706321716308594, + "learning_rate": 5e-06, + "logits/chosen": -24244727.466666665, + "logits/rejected": -81248618.66666667, + "logps/chosen": -469.5697265625, + "logps/rejected": -524.0747612847222, + "loss": 0.0694, + "rewards/chosen": 8.054677836100261, + "rewards/margins": 19.121971638997394, + "rewards/rejected": -11.067293802897135, + "step": 1951 + }, + { + "epoch": 0.4884273739522082, + "grad_norm": 0.96484375, + "kl": 3.2150962352752686, + "learning_rate": 5e-06, + "logits/chosen": -54106986.666666664, + "logits/rejected": -39224746.666666664, + "logps/chosen": -362.5927734375, + "logps/rejected": -606.013427734375, + "loss": 0.0175, + "rewards/chosen": 8.120898564656576, + "rewards/margins": 23.28376579284668, + "rewards/rejected": -15.162867228190104, + "step": 1952 + }, + { + "epoch": 0.48867759289378204, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66858542.54545455, + "logits/rejected": -44173307.07692308, + "logps/chosen": -385.7620738636364, + "logps/rejected": -519.1761568509615, + "loss": 0.0137, + "rewards/chosen": 7.75482871315696, + "rewards/margins": 20.86042422848148, + "rewards/rejected": -13.10559551532452, + "step": 1953 + }, + { + "epoch": 0.48892781183535594, + "grad_norm": 1.625, + "kl": 0.40084776282310486, + "learning_rate": 5e-06, + "logits/chosen": -53397001.14285714, + "logits/rejected": -53505478.4, + "logps/chosen": -381.13065011160717, + "logps/rejected": -505.727734375, + "loss": 0.019, + "rewards/chosen": 7.013322012765067, + "rewards/margins": 20.214681570870535, + "rewards/rejected": -13.20135955810547, + "step": 1954 + }, + { + "epoch": 0.48917803077692984, + "grad_norm": 14.75, + "kl": 18.17488670349121, + "learning_rate": 5e-06, + "logits/chosen": -67558816.0, + "logits/rejected": -40184600.0, + "logps/chosen": -519.1560872395834, + "logps/rejected": -438.814453125, + "loss": 0.036, + "rewards/chosen": 8.646723429361979, + "rewards/margins": 18.27422841389974, + "rewards/rejected": -9.62750498453776, + "step": 1955 + }, + { + "epoch": 0.4894282497185037, + "grad_norm": 6.5625, + "kl": 3.30061674118042, + "learning_rate": 5e-06, + "logits/chosen": -57174459.07692308, + "logits/rejected": -76117876.36363636, + "logps/chosen": -348.33289513221155, + "logps/rejected": -892.8329190340909, + "loss": 0.0264, + "rewards/chosen": 9.003382756159855, + "rewards/margins": 29.31856275438429, + "rewards/rejected": -20.315179998224433, + "step": 1956 + }, + { + "epoch": 0.4896784686600776, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66881460.0, + "logits/rejected": -45749436.0, + "logps/chosen": -453.5660400390625, + "logps/rejected": -703.0601196289062, + "loss": 0.0557, + "rewards/chosen": 7.74298620223999, + "rewards/margins": 25.60486364364624, + "rewards/rejected": -17.86187744140625, + "step": 1957 + }, + { + "epoch": 0.4899286876016514, + "grad_norm": 6.40625, + "kl": 0.15939585864543915, + "learning_rate": 5e-06, + "logits/chosen": -11980749.714285715, + "logits/rejected": -40948902.4, + "logps/chosen": -394.297119140625, + "logps/rejected": -625.00654296875, + "loss": 0.0275, + "rewards/chosen": 7.305687495640346, + "rewards/margins": 18.470462581089564, + "rewards/rejected": -11.164775085449218, + "step": 1958 + }, + { + "epoch": 0.4901789065432253, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47199291.07692308, + "logits/rejected": -78185326.54545455, + "logps/chosen": -354.5652043269231, + "logps/rejected": -527.8115234375, + "loss": 0.0135, + "rewards/chosen": 6.791297912597656, + "rewards/margins": 18.08171844482422, + "rewards/rejected": -11.290420532226562, + "step": 1959 + }, + { + "epoch": 0.4904291254847992, + "grad_norm": 2.421875, + "kl": 3.4636759757995605, + "learning_rate": 5e-06, + "logits/chosen": -80187410.28571428, + "logits/rejected": -47529510.4, + "logps/chosen": -383.0457240513393, + "logps/rejected": -501.22373046875, + "loss": 0.0223, + "rewards/chosen": 7.857319423130581, + "rewards/margins": 18.69271981375558, + "rewards/rejected": -10.835400390625, + "step": 1960 + }, + { + "epoch": 0.49067934442637307, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25855681.6, + "logits/rejected": -37850109.71428572, + "logps/chosen": -209.073876953125, + "logps/rejected": -435.82167271205356, + "loss": 0.0441, + "rewards/chosen": 5.124552154541016, + "rewards/margins": 14.8501097542899, + "rewards/rejected": -9.725557599748884, + "step": 1961 + }, + { + "epoch": 0.49092956336794696, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63210777.6, + "logits/rejected": -67468355.36842105, + "logps/chosen": -318.7895263671875, + "logps/rejected": -604.8444181743421, + "loss": 0.0188, + "rewards/chosen": 7.164218902587891, + "rewards/margins": 21.044686367637233, + "rewards/rejected": -13.880467465049342, + "step": 1962 + }, + { + "epoch": 0.4911797823095208, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88177973.33333333, + "logits/rejected": -39010042.666666664, + "logps/chosen": -372.9269205729167, + "logps/rejected": -445.6918538411458, + "loss": 0.0318, + "rewards/chosen": 7.533425649007161, + "rewards/margins": 16.910661061604817, + "rewards/rejected": -9.377235412597656, + "step": 1963 + }, + { + "epoch": 0.4914300012510947, + "grad_norm": 6.46875, + "kl": 5.331838130950928, + "learning_rate": 5e-06, + "logits/chosen": -44472232.0, + "logits/rejected": -54512184.0, + "logps/chosen": -345.8945007324219, + "logps/rejected": -556.1705932617188, + "loss": 0.029, + "rewards/chosen": 7.838643550872803, + "rewards/margins": 19.6385817527771, + "rewards/rejected": -11.799938201904297, + "step": 1964 + }, + { + "epoch": 0.4916802201926686, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61190560.0, + "logits/rejected": -77380697.6, + "logps/chosen": -363.66971261160717, + "logps/rejected": -760.0359375, + "loss": 0.0576, + "rewards/chosen": 6.642707824707031, + "rewards/margins": 22.61170196533203, + "rewards/rejected": -15.968994140625, + "step": 1965 + }, + { + "epoch": 0.49193043913424245, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -93126902.85714285, + "logits/rejected": -34372385.88235294, + "logps/chosen": -489.9926060267857, + "logps/rejected": -647.1927849264706, + "loss": 0.0144, + "rewards/chosen": 7.906374250139509, + "rewards/margins": 24.71354264972591, + "rewards/rejected": -16.8071683995864, + "step": 1966 + }, + { + "epoch": 0.49218065807581635, + "grad_norm": 2.234375, + "kl": 11.527244567871094, + "learning_rate": 5e-06, + "logits/chosen": -49295921.23076923, + "logits/rejected": -51179287.27272727, + "logps/chosen": -473.7185246394231, + "logps/rejected": -741.1678355823864, + "loss": 0.0572, + "rewards/chosen": 8.49020268366887, + "rewards/margins": 25.79953440419444, + "rewards/rejected": -17.309331720525567, + "step": 1967 + }, + { + "epoch": 0.4924308770173902, + "grad_norm": 6.4375, + "kl": 7.9568562507629395, + "learning_rate": 5e-06, + "logits/chosen": -44239261.538461536, + "logits/rejected": -90056104.72727273, + "logps/chosen": -401.8523512620192, + "logps/rejected": -591.1574041193181, + "loss": 0.0349, + "rewards/chosen": 9.188364469088041, + "rewards/margins": 19.93035360483023, + "rewards/rejected": -10.741989135742188, + "step": 1968 + }, + { + "epoch": 0.4926810959589641, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40718944.0, + "logits/rejected": -55282658.13333333, + "logps/chosen": -358.61957465277777, + "logps/rejected": -620.6891927083333, + "loss": 0.0287, + "rewards/chosen": 6.3852255079481335, + "rewards/margins": 19.769775475396052, + "rewards/rejected": -13.384549967447917, + "step": 1969 + }, + { + "epoch": 0.492931314900538, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50345029.81818182, + "logits/rejected": -38168100.92307692, + "logps/chosen": -392.61714311079544, + "logps/rejected": -634.2292668269231, + "loss": 0.0216, + "rewards/chosen": 7.754757274280895, + "rewards/margins": 21.897193375167312, + "rewards/rejected": -14.142436100886417, + "step": 1970 + }, + { + "epoch": 0.49318153384211183, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68432320.0, + "logits/rejected": -63216464.0, + "logps/chosen": -475.3986002604167, + "logps/rejected": -652.4186197916666, + "loss": 0.0765, + "rewards/chosen": 8.245744705200195, + "rewards/margins": 20.585823694864906, + "rewards/rejected": -12.340078989664713, + "step": 1971 + }, + { + "epoch": 0.49343175278368573, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42423616.0, + "logits/rejected": -74559064.61538461, + "logps/chosen": -392.2444957386364, + "logps/rejected": -526.3907752403846, + "loss": 0.0742, + "rewards/chosen": 6.848864468661222, + "rewards/margins": 17.008814138132376, + "rewards/rejected": -10.159949669471153, + "step": 1972 + }, + { + "epoch": 0.4936819717252596, + "grad_norm": 14.3125, + "kl": 0.4036343991756439, + "learning_rate": 5e-06, + "logits/chosen": -60969107.692307696, + "logits/rejected": -863456.0, + "logps/chosen": -399.7034254807692, + "logps/rejected": -692.6775568181819, + "loss": 0.0413, + "rewards/chosen": 7.455442575307993, + "rewards/margins": 21.191575110375464, + "rewards/rejected": -13.73613253506747, + "step": 1973 + }, + { + "epoch": 0.49393219066683347, + "grad_norm": 7.375, + "kl": 0.2901446223258972, + "learning_rate": 5e-06, + "logits/chosen": -35197733.333333336, + "logits/rejected": 598925.3333333334, + "logps/chosen": -291.9503173828125, + "logps/rejected": -550.6531982421875, + "loss": 0.0565, + "rewards/chosen": 5.831199010213216, + "rewards/margins": 19.764504114786785, + "rewards/rejected": -13.933305104573568, + "step": 1974 + }, + { + "epoch": 0.49418240960840737, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42601444.571428575, + "logits/rejected": -20320771.2, + "logps/chosen": -435.77406529017856, + "logps/rejected": -835.53564453125, + "loss": 0.0453, + "rewards/chosen": 7.779127938406808, + "rewards/margins": 24.131999642508372, + "rewards/rejected": -16.352871704101563, + "step": 1975 + }, + { + "epoch": 0.4944326285499812, + "grad_norm": 2.078125, + "kl": 13.637643814086914, + "learning_rate": 5e-06, + "logits/chosen": -29151579.42857143, + "logits/rejected": -50622313.6, + "logps/chosen": -624.2164481026786, + "logps/rejected": -493.77509765625, + "loss": 0.0042, + "rewards/chosen": 8.846729278564453, + "rewards/margins": 23.465697479248046, + "rewards/rejected": -14.618968200683593, + "step": 1976 + }, + { + "epoch": 0.4946828474915551, + "grad_norm": 6.34375, + "kl": 0.8876008987426758, + "learning_rate": 5e-06, + "logits/chosen": -67830213.81818181, + "logits/rejected": -42516332.307692304, + "logps/chosen": -292.06906960227275, + "logps/rejected": -532.1829552283654, + "loss": 0.051, + "rewards/chosen": 6.310279846191406, + "rewards/margins": 18.150384169358475, + "rewards/rejected": -11.840104323167067, + "step": 1977 + }, + { + "epoch": 0.494933066433129, + "grad_norm": 5.375, + "kl": 3.130004405975342, + "learning_rate": 5e-06, + "logits/chosen": -40128618.666666664, + "logits/rejected": -32163418.666666668, + "logps/chosen": -316.1538899739583, + "logps/rejected": -486.29296875, + "loss": 0.0323, + "rewards/chosen": 6.9225114186604815, + "rewards/margins": 17.675873438517254, + "rewards/rejected": -10.753362019856771, + "step": 1978 + }, + { + "epoch": 0.49518328537470285, + "grad_norm": 14.6875, + "kl": 5.886826992034912, + "learning_rate": 5e-06, + "logits/chosen": -20914996.57142857, + "logits/rejected": -45919126.4, + "logps/chosen": -492.88302176339283, + "logps/rejected": -595.14013671875, + "loss": 0.0303, + "rewards/chosen": 8.33594730922154, + "rewards/margins": 21.781900678362163, + "rewards/rejected": -13.445953369140625, + "step": 1979 + }, + { + "epoch": 0.49543350431627675, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19213648.0, + "logits/rejected": -53516053.333333336, + "logps/chosen": -290.16554768880206, + "logps/rejected": -396.4081759982639, + "loss": 0.0548, + "rewards/chosen": 6.331467310587565, + "rewards/margins": 16.151762008666992, + "rewards/rejected": -9.820294698079428, + "step": 1980 + }, + { + "epoch": 0.4956837232578506, + "grad_norm": 2.125, + "kl": 0.9412371516227722, + "learning_rate": 5e-06, + "logits/chosen": -33133102.545454547, + "logits/rejected": -42555091.692307696, + "logps/chosen": -383.1585582386364, + "logps/rejected": -778.8347355769231, + "loss": 0.0211, + "rewards/chosen": 7.866940585049716, + "rewards/margins": 28.230656123661493, + "rewards/rejected": -20.36371553861178, + "step": 1981 + }, + { + "epoch": 0.4959339421994245, + "grad_norm": 21.125, + "kl": 5.886059761047363, + "learning_rate": 5e-06, + "logits/chosen": -44564475.733333334, + "logits/rejected": -56324736.0, + "logps/chosen": -362.37526041666666, + "logps/rejected": -527.3142903645834, + "loss": 0.0896, + "rewards/chosen": 7.147821044921875, + "rewards/margins": 16.558714803059896, + "rewards/rejected": -9.410893758138021, + "step": 1982 + }, + { + "epoch": 0.4961841611409984, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63586801.777777776, + "logits/rejected": -47936507.733333334, + "logps/chosen": -450.5864529079861, + "logps/rejected": -594.8279947916667, + "loss": 0.0085, + "rewards/chosen": 8.634791056315104, + "rewards/margins": 20.617007446289062, + "rewards/rejected": -11.982216389973958, + "step": 1983 + }, + { + "epoch": 0.49643438008257224, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17682774.0, + "logits/rejected": -31746654.0, + "logps/chosen": -301.0936279296875, + "logps/rejected": -722.8466796875, + "loss": 0.0425, + "rewards/chosen": 5.871728897094727, + "rewards/margins": 22.531436920166016, + "rewards/rejected": -16.65970802307129, + "step": 1984 + }, + { + "epoch": 0.49668459902414613, + "grad_norm": 9.625, + "kl": 1.0102704763412476, + "learning_rate": 5e-06, + "logits/chosen": -61187015.11111111, + "logits/rejected": -22196563.2, + "logps/chosen": -425.5768229166667, + "logps/rejected": -422.61656901041664, + "loss": 0.0347, + "rewards/chosen": 6.393941667344835, + "rewards/margins": 16.379167090521918, + "rewards/rejected": -9.985225423177083, + "step": 1985 + }, + { + "epoch": 0.49693481796572003, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38365064.0, + "logits/rejected": -44917064.0, + "logps/chosen": -369.53790283203125, + "logps/rejected": -490.88641357421875, + "loss": 0.0978, + "rewards/chosen": 6.231435775756836, + "rewards/margins": 18.247509956359863, + "rewards/rejected": -12.016074180603027, + "step": 1986 + }, + { + "epoch": 0.4971850369072939, + "grad_norm": 9.1875, + "kl": 14.78561019897461, + "learning_rate": 5e-06, + "logits/chosen": -59467207.52941176, + "logits/rejected": -30995396.57142857, + "logps/chosen": -438.0150505514706, + "logps/rejected": -405.97715541294644, + "loss": 0.0343, + "rewards/chosen": 8.787280811982995, + "rewards/margins": 20.649233457421055, + "rewards/rejected": -11.861952645438057, + "step": 1987 + }, + { + "epoch": 0.4974352558488678, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 35594452.571428575, + "logits/rejected": -53441200.0, + "logps/chosen": -405.2622767857143, + "logps/rejected": -757.37607421875, + "loss": 0.067, + "rewards/chosen": 6.157471793038504, + "rewards/margins": 25.20892137799944, + "rewards/rejected": -19.051449584960938, + "step": 1988 + }, + { + "epoch": 0.4976854747904416, + "grad_norm": 4.65625, + "kl": 7.945725440979004, + "learning_rate": 5e-06, + "logits/chosen": -59762077.538461536, + "logits/rejected": -71713675.63636364, + "logps/chosen": -399.8107346754808, + "logps/rejected": -514.9894797585227, + "loss": 0.0225, + "rewards/chosen": 9.071091871995192, + "rewards/margins": 22.92500838699874, + "rewards/rejected": -13.85391651500355, + "step": 1989 + }, + { + "epoch": 0.4979356937320155, + "grad_norm": 18.875, + "kl": 17.173019409179688, + "learning_rate": 5e-06, + "logits/chosen": -52575830.5882353, + "logits/rejected": -241204.2857142857, + "logps/chosen": -356.5748506433824, + "logps/rejected": -499.3059779575893, + "loss": 0.1246, + "rewards/chosen": 8.117691040039062, + "rewards/margins": 21.96437726702009, + "rewards/rejected": -13.846686226981026, + "step": 1990 + }, + { + "epoch": 0.4981859126735894, + "grad_norm": 0.58203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64148534.15384615, + "logits/rejected": -47892320.0, + "logps/chosen": -441.45718149038464, + "logps/rejected": -647.1665482954545, + "loss": 0.0017, + "rewards/chosen": 8.416517404409555, + "rewards/margins": 21.908062648106288, + "rewards/rejected": -13.491545243696732, + "step": 1991 + }, + { + "epoch": 0.49843613161516326, + "grad_norm": 4.46875, + "kl": 5.377076148986816, + "learning_rate": 5e-06, + "logits/chosen": -42565051.07692308, + "logits/rejected": -17316741.818181816, + "logps/chosen": -358.4328425480769, + "logps/rejected": -471.4782049005682, + "loss": 0.0728, + "rewards/chosen": 8.14966055063101, + "rewards/margins": 17.532327398553598, + "rewards/rejected": -9.382666847922586, + "step": 1992 + }, + { + "epoch": 0.49868635055673716, + "grad_norm": 5.375, + "kl": 5.9898295402526855, + "learning_rate": 5e-06, + "logits/chosen": -35864762.18181818, + "logits/rejected": -42256571.07692308, + "logps/chosen": -440.42063210227275, + "logps/rejected": -526.6383338341346, + "loss": 0.0277, + "rewards/chosen": 9.14466580477628, + "rewards/margins": 23.007597276380846, + "rewards/rejected": -13.862931471604567, + "step": 1993 + }, + { + "epoch": 0.498936569498311, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46151726.54545455, + "logits/rejected": -47048753.23076923, + "logps/chosen": -355.19797585227275, + "logps/rejected": -576.1336388221154, + "loss": 0.04, + "rewards/chosen": 7.001333063299006, + "rewards/margins": 21.493592162232297, + "rewards/rejected": -14.492259098933292, + "step": 1994 + }, + { + "epoch": 0.4991867884398849, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51479978.666666664, + "logits/rejected": -61658325.333333336, + "logps/chosen": -606.735595703125, + "logps/rejected": -779.0166829427084, + "loss": 0.0287, + "rewards/chosen": 11.5701904296875, + "rewards/margins": 31.145165761311848, + "rewards/rejected": -19.574975331624348, + "step": 1995 + }, + { + "epoch": 0.4994370073814588, + "grad_norm": 7.4375, + "kl": 8.853483200073242, + "learning_rate": 5e-06, + "logits/chosen": -51958464.0, + "logits/rejected": -60710789.333333336, + "logps/chosen": -515.9022216796875, + "logps/rejected": -586.2477213541666, + "loss": 0.0234, + "rewards/chosen": 9.4551633199056, + "rewards/margins": 20.219174702962242, + "rewards/rejected": -10.76401138305664, + "step": 1996 + }, + { + "epoch": 0.49968722632303264, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32874407.384615384, + "logits/rejected": -75908282.18181819, + "logps/chosen": -456.2880108173077, + "logps/rejected": -907.9139737215909, + "loss": 0.019, + "rewards/chosen": 7.912331214317908, + "rewards/margins": 30.723405211121886, + "rewards/rejected": -22.811073996803977, + "step": 1997 + }, + { + "epoch": 0.49993744526460654, + "grad_norm": 6.34375, + "kl": 2.842404842376709, + "learning_rate": 5e-06, + "logits/chosen": -47341802.666666664, + "logits/rejected": -58670986.666666664, + "logps/chosen": -395.780517578125, + "logps/rejected": -681.2534586588541, + "loss": 0.0434, + "rewards/chosen": 6.299989700317383, + "rewards/margins": 21.99212328592936, + "rewards/rejected": -15.692133585611979, + "step": 1998 + }, + { + "epoch": 0.5001876642061804, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35002752.0, + "logits/rejected": -44018730.666666664, + "logps/chosen": -334.6203884548611, + "logps/rejected": -600.12734375, + "loss": 0.0395, + "rewards/chosen": 7.109553866916233, + "rewards/margins": 18.406961907280817, + "rewards/rejected": -11.297408040364584, + "step": 1999 + }, + { + "epoch": 0.5004378831477543, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28822954.666666668, + "logits/rejected": -49734112.0, + "logps/chosen": -418.2074381510417, + "logps/rejected": -654.93212890625, + "loss": 0.0391, + "rewards/chosen": 6.591269810994466, + "rewards/margins": 22.38782564798991, + "rewards/rejected": -15.796555836995443, + "step": 2000 + }, + { + "epoch": 0.5006881020893281, + "grad_norm": 10.875, + "kl": 8.29981803894043, + "learning_rate": 5e-06, + "logits/chosen": -62043316.0, + "logits/rejected": -36093300.0, + "logps/chosen": -356.12384033203125, + "logps/rejected": -431.7716979980469, + "loss": 0.1011, + "rewards/chosen": 6.4548139572143555, + "rewards/margins": 15.90286922454834, + "rewards/rejected": -9.448055267333984, + "step": 2001 + }, + { + "epoch": 0.5009383210309021, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88491146.66666667, + "logits/rejected": -59119781.333333336, + "logps/chosen": -416.1300862630208, + "logps/rejected": -584.5294189453125, + "loss": 0.0329, + "rewards/chosen": 5.221451123555501, + "rewards/margins": 15.168587048848469, + "rewards/rejected": -9.947135925292969, + "step": 2002 + }, + { + "epoch": 0.5011885399724759, + "grad_norm": 13.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27908538.666666668, + "logits/rejected": -31083162.666666668, + "logps/chosen": -452.5336507161458, + "logps/rejected": -594.95458984375, + "loss": 0.0497, + "rewards/chosen": 8.167304992675781, + "rewards/margins": 20.021286010742188, + "rewards/rejected": -11.853981018066406, + "step": 2003 + }, + { + "epoch": 0.5014387589140498, + "grad_norm": 10.5, + "kl": 6.473819732666016, + "learning_rate": 5e-06, + "logits/chosen": -16932506.666666668, + "logits/rejected": 21848204.0, + "logps/chosen": -271.5944010416667, + "logps/rejected": -641.4624837239584, + "loss": 0.0809, + "rewards/chosen": 5.018838564554851, + "rewards/margins": 18.212578137715656, + "rewards/rejected": -13.193739573160807, + "step": 2004 + }, + { + "epoch": 0.5016889778556237, + "grad_norm": 15.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32873629.714285713, + "logits/rejected": -43406836.705882356, + "logps/chosen": -282.13436453683033, + "logps/rejected": -462.07327090992646, + "loss": 0.0798, + "rewards/chosen": 6.133634294782366, + "rewards/margins": 17.492008497735032, + "rewards/rejected": -11.358374202952666, + "step": 2005 + }, + { + "epoch": 0.5019391967971976, + "grad_norm": 1.1640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34588810.666666664, + "logits/rejected": -50462997.333333336, + "logps/chosen": -363.9585367838542, + "logps/rejected": -590.8803168402778, + "loss": 0.0121, + "rewards/chosen": 6.077334721883138, + "rewards/margins": 20.481845219930012, + "rewards/rejected": -14.404510498046875, + "step": 2006 + }, + { + "epoch": 0.5021894157387714, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63939432.72727273, + "logits/rejected": -69679330.46153846, + "logps/chosen": -363.16397372159093, + "logps/rejected": -631.08203125, + "loss": 0.0172, + "rewards/chosen": 9.501785278320312, + "rewards/margins": 21.453001755934494, + "rewards/rejected": -11.951216477614183, + "step": 2007 + }, + { + "epoch": 0.5024396346803452, + "grad_norm": 16.125, + "kl": 17.85320472717285, + "learning_rate": 5e-06, + "logits/chosen": -38214148.266666666, + "logits/rejected": -57787456.0, + "logps/chosen": -459.0707682291667, + "logps/rejected": -340.9117838541667, + "loss": 0.0921, + "rewards/chosen": 9.660264078776041, + "rewards/margins": 17.345203993055556, + "rewards/rejected": -7.684939914279514, + "step": 2008 + }, + { + "epoch": 0.5026898536219192, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68243192.8888889, + "logits/rejected": -56420445.86666667, + "logps/chosen": -362.98084852430554, + "logps/rejected": -614.2248697916667, + "loss": 0.0592, + "rewards/chosen": 7.397639804416233, + "rewards/margins": 19.512221103244357, + "rewards/rejected": -12.114581298828124, + "step": 2009 + }, + { + "epoch": 0.502940072563493, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48743476.0, + "logits/rejected": -53746816.0, + "logps/chosen": -371.12890625, + "logps/rejected": -726.1782836914062, + "loss": 0.0079, + "rewards/chosen": 7.997684478759766, + "rewards/margins": 22.762825965881348, + "rewards/rejected": -14.765141487121582, + "step": 2010 + }, + { + "epoch": 0.5031902915050669, + "grad_norm": 6.90625, + "kl": 2.2826151847839355, + "learning_rate": 5e-06, + "logits/chosen": -40138432.0, + "logits/rejected": -69950112.0, + "logps/chosen": -360.39334542410717, + "logps/rejected": -735.17119140625, + "loss": 0.0229, + "rewards/chosen": 7.797163827078683, + "rewards/margins": 21.45153089250837, + "rewards/rejected": -13.654367065429687, + "step": 2011 + }, + { + "epoch": 0.5034405104466408, + "grad_norm": 2.0625, + "kl": 1.4323346614837646, + "learning_rate": 5e-06, + "logits/chosen": -34315081.14285714, + "logits/rejected": -32883216.0, + "logps/chosen": -356.52650669642856, + "logps/rejected": -497.866796875, + "loss": 0.0327, + "rewards/chosen": 6.861260550362723, + "rewards/margins": 18.47960695539202, + "rewards/rejected": -11.618346405029296, + "step": 2012 + }, + { + "epoch": 0.5036907293882147, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60717030.4, + "logits/rejected": -24762134.85714286, + "logps/chosen": -274.72998046875, + "logps/rejected": -653.2473493303571, + "loss": 0.0301, + "rewards/chosen": 6.775099182128907, + "rewards/margins": 18.330494035993304, + "rewards/rejected": -11.555394853864398, + "step": 2013 + }, + { + "epoch": 0.5039409483297885, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35934493.09090909, + "logits/rejected": -35348482.461538464, + "logps/chosen": -417.06889204545456, + "logps/rejected": -524.5434945913462, + "loss": 0.0204, + "rewards/chosen": 8.025335138494318, + "rewards/margins": 19.958689442881337, + "rewards/rejected": -11.93335430438702, + "step": 2014 + }, + { + "epoch": 0.5041911672713625, + "grad_norm": 1.4453125, + "kl": 3.996346950531006, + "learning_rate": 5e-06, + "logits/chosen": -48889696.0, + "logits/rejected": -64489328.0, + "logps/chosen": -447.9976399739583, + "logps/rejected": -877.7978515625, + "loss": 0.0152, + "rewards/chosen": 10.134397506713867, + "rewards/margins": 28.927006403605144, + "rewards/rejected": -18.792608896891277, + "step": 2015 + }, + { + "epoch": 0.5044413862129363, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61865528.88888889, + "logits/rejected": -29935398.4, + "logps/chosen": -311.564697265625, + "logps/rejected": -534.73193359375, + "loss": 0.0648, + "rewards/chosen": 5.945961422390408, + "rewards/margins": 17.955091264512802, + "rewards/rejected": -12.009129842122396, + "step": 2016 + }, + { + "epoch": 0.5046916051545102, + "grad_norm": 28.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33599274.666666664, + "logits/rejected": 91955317.33333333, + "logps/chosen": -422.0048828125, + "logps/rejected": -523.9694010416666, + "loss": 0.0798, + "rewards/chosen": 6.966581344604492, + "rewards/margins": 19.22781308492025, + "rewards/rejected": -12.261231740315756, + "step": 2017 + }, + { + "epoch": 0.5049418240960841, + "grad_norm": 25.25, + "kl": 1.4314804077148438, + "learning_rate": 5e-06, + "logits/chosen": -63897024.0, + "logits/rejected": -61581915.428571425, + "logps/chosen": -302.193115234375, + "logps/rejected": -598.8289620535714, + "loss": 0.0957, + "rewards/chosen": 4.997477722167969, + "rewards/margins": 17.78181871686663, + "rewards/rejected": -12.784340994698661, + "step": 2018 + }, + { + "epoch": 0.505192043037658, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49644432.0, + "logits/rejected": -66965136.0, + "logps/chosen": -383.0497131347656, + "logps/rejected": -719.7156982421875, + "loss": 0.0449, + "rewards/chosen": 7.8711838722229, + "rewards/margins": 24.239969730377197, + "rewards/rejected": -16.368785858154297, + "step": 2019 + }, + { + "epoch": 0.5054422619792318, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74944413.0909091, + "logits/rejected": -52082097.23076923, + "logps/chosen": -351.8477672230114, + "logps/rejected": -565.2352764423077, + "loss": 0.0367, + "rewards/chosen": 6.84285458651456, + "rewards/margins": 18.091562284456266, + "rewards/rejected": -11.248707697941708, + "step": 2020 + }, + { + "epoch": 0.5056924809208057, + "grad_norm": 10.125, + "kl": 4.104727745056152, + "learning_rate": 5e-06, + "logits/chosen": -42825546.666666664, + "logits/rejected": -36853813.333333336, + "logps/chosen": -332.12904866536456, + "logps/rejected": -645.9530029296875, + "loss": 0.042, + "rewards/chosen": 7.933237075805664, + "rewards/margins": 23.382646560668945, + "rewards/rejected": -15.449409484863281, + "step": 2021 + }, + { + "epoch": 0.5059426998623796, + "grad_norm": 6.375, + "kl": 10.881041526794434, + "learning_rate": 5e-06, + "logits/chosen": -80109138.28571428, + "logits/rejected": -19778137.6, + "logps/chosen": -488.80772181919644, + "logps/rejected": -511.5826171875, + "loss": 0.03, + "rewards/chosen": 9.987566266741071, + "rewards/margins": 20.486856733049663, + "rewards/rejected": -10.499290466308594, + "step": 2022 + }, + { + "epoch": 0.5061929188039535, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53890227.2, + "logits/rejected": -51967369.14285714, + "logps/chosen": -367.3782958984375, + "logps/rejected": -646.3998325892857, + "loss": 0.0366, + "rewards/chosen": 7.980458068847656, + "rewards/margins": 21.58571537562779, + "rewards/rejected": -13.605257306780134, + "step": 2023 + }, + { + "epoch": 0.5064431377455273, + "grad_norm": 5.5, + "kl": 3.007319211959839, + "learning_rate": 5e-06, + "logits/chosen": -55539352.615384616, + "logits/rejected": -48158848.0, + "logps/chosen": -310.32376802884613, + "logps/rejected": -535.9944957386364, + "loss": 0.0474, + "rewards/chosen": 7.036426250751202, + "rewards/margins": 18.452961368160647, + "rewards/rejected": -11.416535117409445, + "step": 2024 + }, + { + "epoch": 0.5066933566871012, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28783595.636363637, + "logits/rejected": -41406532.92307692, + "logps/chosen": -171.83851207386363, + "logps/rejected": -435.20350060096155, + "loss": 0.1057, + "rewards/chosen": 4.301420038396662, + "rewards/margins": 13.661863153631037, + "rewards/rejected": -9.360443115234375, + "step": 2025 + }, + { + "epoch": 0.5069435756286751, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39268667.428571425, + "logits/rejected": -67537318.4, + "logps/chosen": -432.4606236049107, + "logps/rejected": -710.16943359375, + "loss": 0.0307, + "rewards/chosen": 9.236327035086495, + "rewards/margins": 24.188023812430245, + "rewards/rejected": -14.95169677734375, + "step": 2026 + }, + { + "epoch": 0.5071937945702489, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53462573.71428572, + "logits/rejected": -63127747.76470588, + "logps/chosen": -372.2268763950893, + "logps/rejected": -638.5193014705883, + "loss": 0.0411, + "rewards/chosen": 8.204526628766741, + "rewards/margins": 23.67129837164358, + "rewards/rejected": -15.466771742876839, + "step": 2027 + }, + { + "epoch": 0.5074440135118229, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35760066.28571428, + "logits/rejected": -26253728.0, + "logps/chosen": -440.37890625, + "logps/rejected": -667.30751953125, + "loss": 0.0023, + "rewards/chosen": 8.752478463309151, + "rewards/margins": 27.178850228445867, + "rewards/rejected": -18.426371765136718, + "step": 2028 + }, + { + "epoch": 0.5076942324533967, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 821124.8, + "logits/rejected": -56179086.222222224, + "logps/chosen": -506.10341796875, + "logps/rejected": -800.9060329861111, + "loss": 0.0724, + "rewards/chosen": 8.281163533528646, + "rewards/margins": 29.901548597547745, + "rewards/rejected": -21.620385064019096, + "step": 2029 + }, + { + "epoch": 0.5079444513949706, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66115121.23076923, + "logits/rejected": -50619438.54545455, + "logps/chosen": -347.91068209134613, + "logps/rejected": -673.9528142755681, + "loss": 0.0366, + "rewards/chosen": 6.4918694129356975, + "rewards/margins": 18.84308650943783, + "rewards/rejected": -12.35121709650213, + "step": 2030 + }, + { + "epoch": 0.5081946703365445, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55228066.461538464, + "logits/rejected": -52335022.54545455, + "logps/chosen": -339.15076622596155, + "logps/rejected": -522.8496537642045, + "loss": 0.0248, + "rewards/chosen": 6.22974865253155, + "rewards/margins": 17.15734639201131, + "rewards/rejected": -10.927597739479758, + "step": 2031 + }, + { + "epoch": 0.5084448892781184, + "grad_norm": 3.765625, + "kl": 12.808765411376953, + "learning_rate": 5e-06, + "logits/chosen": -40207765.333333336, + "logits/rejected": 86596992.0, + "logps/chosen": -354.2416015625, + "logps/rejected": -518.2223307291666, + "loss": 0.0224, + "rewards/chosen": 7.719915771484375, + "rewards/margins": 17.23291965060764, + "rewards/rejected": -9.513003879123264, + "step": 2032 + }, + { + "epoch": 0.5086951082196922, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -420356.92307692306, + "logits/rejected": -34414941.09090909, + "logps/chosen": -429.4865910456731, + "logps/rejected": -723.2416548295455, + "loss": 0.0427, + "rewards/chosen": 8.53240732046274, + "rewards/margins": 23.635215492515297, + "rewards/rejected": -15.102808172052557, + "step": 2033 + }, + { + "epoch": 0.5089453271612661, + "grad_norm": 7.8125, + "kl": 2.299046516418457, + "learning_rate": 5e-06, + "logits/chosen": -51760022.85714286, + "logits/rejected": -32765296.0, + "logps/chosen": -376.28501674107144, + "logps/rejected": -745.83251953125, + "loss": 0.0844, + "rewards/chosen": 6.571990966796875, + "rewards/margins": 18.112261962890624, + "rewards/rejected": -11.54027099609375, + "step": 2034 + }, + { + "epoch": 0.50919554610284, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67609898.66666667, + "logits/rejected": -70024391.1111111, + "logps/chosen": -399.21741536458336, + "logps/rejected": -579.0771484375, + "loss": 0.0729, + "rewards/chosen": 5.416941833496094, + "rewards/margins": 17.47944607204861, + "rewards/rejected": -12.062504238552517, + "step": 2035 + }, + { + "epoch": 0.5094457650444139, + "grad_norm": 17.125, + "kl": 0.2859668731689453, + "learning_rate": 5e-06, + "logits/chosen": -40583288.615384616, + "logits/rejected": -39466955.63636363, + "logps/chosen": -307.37235201322113, + "logps/rejected": -400.85591264204544, + "loss": 0.0692, + "rewards/chosen": 7.486266502967248, + "rewards/margins": 17.037972296868173, + "rewards/rejected": -9.551705793900924, + "step": 2036 + }, + { + "epoch": 0.5096959839859877, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43549829.81818182, + "logits/rejected": -62685607.384615384, + "logps/chosen": -398.8631036931818, + "logps/rejected": -847.6011117788462, + "loss": 0.0233, + "rewards/chosen": 8.451771129261363, + "rewards/margins": 31.233131462043815, + "rewards/rejected": -22.78136033278245, + "step": 2037 + }, + { + "epoch": 0.5099462029275617, + "grad_norm": 8.25, + "kl": 19.794681549072266, + "learning_rate": 5e-06, + "logits/chosen": -35493848.0, + "logits/rejected": -40526560.0, + "logps/chosen": -458.5340881347656, + "logps/rejected": -448.6490478515625, + "loss": 0.0208, + "rewards/chosen": 9.440528869628906, + "rewards/margins": 20.14570426940918, + "rewards/rejected": -10.705175399780273, + "step": 2038 + }, + { + "epoch": 0.5101964218691355, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46224173.71428572, + "logits/rejected": -58699584.0, + "logps/chosen": -246.23092215401786, + "logps/rejected": -530.59921875, + "loss": 0.0311, + "rewards/chosen": 5.910852704729352, + "rewards/margins": 18.63238285609654, + "rewards/rejected": -12.721530151367187, + "step": 2039 + }, + { + "epoch": 0.5104466408107093, + "grad_norm": 14.875, + "kl": 5.487802028656006, + "learning_rate": 5e-06, + "logits/chosen": -25693824.0, + "logits/rejected": -40154648.0, + "logps/chosen": -240.2078653971354, + "logps/rejected": -496.9247639973958, + "loss": 0.0797, + "rewards/chosen": 5.557026545206706, + "rewards/margins": 18.78697395324707, + "rewards/rejected": -13.229947408040365, + "step": 2040 + }, + { + "epoch": 0.5106968597522833, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32680176.0, + "logits/rejected": -56139400.0, + "logps/chosen": -373.472412109375, + "logps/rejected": -556.17333984375, + "loss": 0.0157, + "rewards/chosen": 8.535799026489258, + "rewards/margins": 19.874250411987305, + "rewards/rejected": -11.338451385498047, + "step": 2041 + }, + { + "epoch": 0.5109470786938571, + "grad_norm": 7.03125, + "kl": 7.283808708190918, + "learning_rate": 5e-06, + "logits/chosen": -64955293.538461536, + "logits/rejected": -45175784.72727273, + "logps/chosen": -460.93716195913464, + "logps/rejected": -663.1183416193181, + "loss": 0.0513, + "rewards/chosen": 8.791025015024038, + "rewards/margins": 24.588980614722193, + "rewards/rejected": -15.797955599698154, + "step": 2042 + }, + { + "epoch": 0.511197297635431, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51452258.90909091, + "logits/rejected": -54850702.76923077, + "logps/chosen": -410.3224431818182, + "logps/rejected": -798.9809194711538, + "loss": 0.0201, + "rewards/chosen": 7.662064292214134, + "rewards/margins": 26.145699827821105, + "rewards/rejected": -18.48363553560697, + "step": 2043 + }, + { + "epoch": 0.5114475165770049, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54354225.23076923, + "logits/rejected": -38532581.81818182, + "logps/chosen": -351.4650691105769, + "logps/rejected": -525.7906161221591, + "loss": 0.0583, + "rewards/chosen": 7.728520320012019, + "rewards/margins": 19.008498985450583, + "rewards/rejected": -11.279978665438565, + "step": 2044 + }, + { + "epoch": 0.5116977355185788, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74954264.61538461, + "logits/rejected": -51075781.81818182, + "logps/chosen": -371.7453425480769, + "logps/rejected": -639.94384765625, + "loss": 0.0676, + "rewards/chosen": 6.025587228628305, + "rewards/margins": 21.92118659386268, + "rewards/rejected": -15.895599365234375, + "step": 2045 + }, + { + "epoch": 0.5119479544601526, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54220776.72727273, + "logits/rejected": -43516169.84615385, + "logps/chosen": -443.47305575284093, + "logps/rejected": -526.1463341346154, + "loss": 0.0471, + "rewards/chosen": 8.625660289417613, + "rewards/margins": 20.94533661528901, + "rewards/rejected": -12.319676325871395, + "step": 2046 + }, + { + "epoch": 0.5121981734017265, + "grad_norm": 8.5625, + "kl": 4.593562126159668, + "learning_rate": 5e-06, + "logits/chosen": -69933260.8, + "logits/rejected": -38521984.0, + "logps/chosen": -486.873388671875, + "logps/rejected": -424.69332449776783, + "loss": 0.0224, + "rewards/chosen": 8.199452972412109, + "rewards/margins": 18.57932379586356, + "rewards/rejected": -10.379870823451451, + "step": 2047 + }, + { + "epoch": 0.5124483923433004, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42196930.90909091, + "logits/rejected": -44479606.15384615, + "logps/chosen": -394.0731090198864, + "logps/rejected": -615.8425105168269, + "loss": 0.0194, + "rewards/chosen": 7.7432098388671875, + "rewards/margins": 20.490415132962738, + "rewards/rejected": -12.747205294095552, + "step": 2048 + }, + { + "epoch": 0.5126986112848743, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48264870.4, + "logits/rejected": -24333981.714285713, + "logps/chosen": -422.91513671875, + "logps/rejected": -516.4100516183036, + "loss": 0.0331, + "rewards/chosen": 6.503352355957031, + "rewards/margins": 19.403905378069197, + "rewards/rejected": -12.900553022112165, + "step": 2049 + }, + { + "epoch": 0.5129488302264481, + "grad_norm": 12.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36412550.4, + "logits/rejected": -49880891.428571425, + "logps/chosen": -313.1418701171875, + "logps/rejected": -665.1412527901786, + "loss": 0.0547, + "rewards/chosen": 5.760654449462891, + "rewards/margins": 20.082562582833425, + "rewards/rejected": -14.321908133370536, + "step": 2050 + }, + { + "epoch": 0.5131990491680221, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64515987.2, + "logits/rejected": -50961792.0, + "logps/chosen": -337.335400390625, + "logps/rejected": -521.5563616071429, + "loss": 0.044, + "rewards/chosen": 6.767932891845703, + "rewards/margins": 21.28472409929548, + "rewards/rejected": -14.516791207449776, + "step": 2051 + }, + { + "epoch": 0.5134492681095959, + "grad_norm": 1.3359375, + "kl": 3.0731735229492188, + "learning_rate": 5e-06, + "logits/chosen": -38545834.666666664, + "logits/rejected": -61840931.55555555, + "logps/chosen": -508.8123046875, + "logps/rejected": -941.9448784722222, + "loss": 0.0076, + "rewards/chosen": 8.936802164713542, + "rewards/margins": 27.5049072265625, + "rewards/rejected": -18.568105061848957, + "step": 2052 + }, + { + "epoch": 0.5136994870511697, + "grad_norm": 12.8125, + "kl": 11.261308670043945, + "learning_rate": 5e-06, + "logits/chosen": -43652386.461538464, + "logits/rejected": -56759604.36363637, + "logps/chosen": -282.27768179086536, + "logps/rejected": -532.0268110795455, + "loss": 0.0636, + "rewards/chosen": 6.440862215482271, + "rewards/margins": 18.121812033486535, + "rewards/rejected": -11.680949818004262, + "step": 2053 + }, + { + "epoch": 0.5139497059927437, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41187464.0, + "logits/rejected": -71249920.0, + "logps/chosen": -351.71087646484375, + "logps/rejected": -545.562255859375, + "loss": 0.0532, + "rewards/chosen": 6.041684627532959, + "rewards/margins": 18.375075817108154, + "rewards/rejected": -12.333391189575195, + "step": 2054 + }, + { + "epoch": 0.5141999249343175, + "grad_norm": 6.625, + "kl": 3.3547236919403076, + "learning_rate": 5e-06, + "logits/chosen": -41794619.07692308, + "logits/rejected": -45772352.0, + "logps/chosen": -351.91793118990387, + "logps/rejected": -344.22509765625, + "loss": 0.0198, + "rewards/chosen": 7.891488882211538, + "rewards/margins": 18.213418813852165, + "rewards/rejected": -10.321929931640625, + "step": 2055 + }, + { + "epoch": 0.5144501438758914, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67769017.6, + "logits/rejected": -76123876.57142857, + "logps/chosen": -490.85673828125, + "logps/rejected": -585.3030831473214, + "loss": 0.0453, + "rewards/chosen": 9.409085083007813, + "rewards/margins": 20.580262538364956, + "rewards/rejected": -11.171177455357142, + "step": 2056 + }, + { + "epoch": 0.5147003628174652, + "grad_norm": 7.25, + "kl": 9.187297821044922, + "learning_rate": 5e-06, + "logits/chosen": -45276402.28571428, + "logits/rejected": -56935846.4, + "logps/chosen": -340.0748814174107, + "logps/rejected": -610.36640625, + "loss": 0.0604, + "rewards/chosen": 6.934781210763114, + "rewards/margins": 23.833253805977957, + "rewards/rejected": -16.898472595214844, + "step": 2057 + }, + { + "epoch": 0.5149505817590392, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23492884.363636363, + "logits/rejected": -53470119.384615384, + "logps/chosen": -313.67891068892044, + "logps/rejected": -657.5831580528846, + "loss": 0.0174, + "rewards/chosen": 8.40437039462003, + "rewards/margins": 25.54604942481835, + "rewards/rejected": -17.14167903019832, + "step": 2058 + }, + { + "epoch": 0.515200800700613, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47798482.28571428, + "logits/rejected": -100592416.0, + "logps/chosen": -387.53651646205356, + "logps/rejected": -547.970703125, + "loss": 0.0578, + "rewards/chosen": 5.625618525913784, + "rewards/margins": 19.87854058401925, + "rewards/rejected": -14.252922058105469, + "step": 2059 + }, + { + "epoch": 0.5154510196421869, + "grad_norm": 1.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49639776.0, + "logits/rejected": -62037486.93333333, + "logps/chosen": -511.66807725694446, + "logps/rejected": -579.0899739583333, + "loss": 0.003, + "rewards/chosen": 9.693017747667101, + "rewards/margins": 23.81736060248481, + "rewards/rejected": -14.124342854817709, + "step": 2060 + }, + { + "epoch": 0.5157012385837608, + "grad_norm": 8.5, + "kl": 6.546737194061279, + "learning_rate": 5e-06, + "logits/chosen": -51830784.0, + "logits/rejected": -30175987.2, + "logps/chosen": -367.98458426339283, + "logps/rejected": -424.18037109375, + "loss": 0.0702, + "rewards/chosen": 6.4643434797014505, + "rewards/margins": 15.797923496791295, + "rewards/rejected": -9.333580017089844, + "step": 2061 + }, + { + "epoch": 0.5159514575253347, + "grad_norm": 8.8125, + "kl": 5.988561153411865, + "learning_rate": 5e-06, + "logits/chosen": -79898926.54545455, + "logits/rejected": -58028150.15384615, + "logps/chosen": -494.3445490056818, + "logps/rejected": -751.4121844951923, + "loss": 0.0427, + "rewards/chosen": 9.317793412642045, + "rewards/margins": 24.091482095785075, + "rewards/rejected": -14.773688683143028, + "step": 2062 + }, + { + "epoch": 0.5162016764669085, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76869869.71428572, + "logits/rejected": -19193912.0, + "logps/chosen": -540.5588727678571, + "logps/rejected": -628.03740234375, + "loss": 0.0497, + "rewards/chosen": 8.155299595424108, + "rewards/margins": 20.52025854928153, + "rewards/rejected": -12.364958953857421, + "step": 2063 + }, + { + "epoch": 0.5164518954084825, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33451402.666666668, + "logits/rejected": 2564219.111111111, + "logps/chosen": -269.3422037760417, + "logps/rejected": -494.13335503472223, + "loss": 0.0402, + "rewards/chosen": 6.611663818359375, + "rewards/margins": 20.561937967936196, + "rewards/rejected": -13.950274149576822, + "step": 2064 + }, + { + "epoch": 0.5167021143500563, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62068270.54545455, + "logits/rejected": -36287640.615384616, + "logps/chosen": -492.9459339488636, + "logps/rejected": -502.58484825721155, + "loss": 0.0232, + "rewards/chosen": 9.643128828568893, + "rewards/margins": 21.2342930506993, + "rewards/rejected": -11.59116422213041, + "step": 2065 + }, + { + "epoch": 0.5169523332916302, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33438892.8, + "logits/rejected": -54511712.0, + "logps/chosen": -324.513671875, + "logps/rejected": -589.3586077008929, + "loss": 0.0342, + "rewards/chosen": 6.082374191284179, + "rewards/margins": 20.959342575073244, + "rewards/rejected": -14.876968383789062, + "step": 2066 + }, + { + "epoch": 0.5172025522332041, + "grad_norm": 1.421875, + "kl": 1.1260770559310913, + "learning_rate": 5e-06, + "logits/chosen": -45328661.333333336, + "logits/rejected": -50588261.333333336, + "logps/chosen": -423.48388671875, + "logps/rejected": -680.9405110677084, + "loss": 0.0136, + "rewards/chosen": 9.108111911349827, + "rewards/margins": 22.621267530653213, + "rewards/rejected": -13.513155619303385, + "step": 2067 + }, + { + "epoch": 0.517452771174778, + "grad_norm": 2.28125, + "kl": 10.033530235290527, + "learning_rate": 5e-06, + "logits/chosen": -33700582.4, + "logits/rejected": -26727598.222222224, + "logps/chosen": -370.86516927083335, + "logps/rejected": -451.0963541666667, + "loss": 0.0587, + "rewards/chosen": 7.3603159586588545, + "rewards/margins": 20.19067552354601, + "rewards/rejected": -12.830359564887154, + "step": 2068 + }, + { + "epoch": 0.5177029901163518, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46369206.85714286, + "logits/rejected": -30676254.11764706, + "logps/chosen": -451.41678292410717, + "logps/rejected": -511.03044577205884, + "loss": 0.0365, + "rewards/chosen": 7.5332810538155695, + "rewards/margins": 17.88381493191759, + "rewards/rejected": -10.350533878102022, + "step": 2069 + }, + { + "epoch": 0.5179532090579256, + "grad_norm": 16.25, + "kl": 14.29393196105957, + "learning_rate": 5e-06, + "logits/chosen": -81056467.2, + "logits/rejected": 44746034.28571428, + "logps/chosen": -462.827685546875, + "logps/rejected": -653.1022600446429, + "loss": 0.0368, + "rewards/chosen": 8.668092346191406, + "rewards/margins": 22.96399187360491, + "rewards/rejected": -14.295899527413505, + "step": 2070 + }, + { + "epoch": 0.5182034279994996, + "grad_norm": 26.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38648620.8, + "logits/rejected": -28611117.714285713, + "logps/chosen": -303.7293212890625, + "logps/rejected": -453.70078822544644, + "loss": 0.084, + "rewards/chosen": 5.249457550048828, + "rewards/margins": 13.974338858468194, + "rewards/rejected": -8.724881308419365, + "step": 2071 + }, + { + "epoch": 0.5184536469410734, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40474459.428571425, + "logits/rejected": -40719462.4, + "logps/chosen": -307.95113699776783, + "logps/rejected": -543.447802734375, + "loss": 0.0574, + "rewards/chosen": 7.5041689191545755, + "rewards/margins": 18.16812918526786, + "rewards/rejected": -10.663960266113282, + "step": 2072 + }, + { + "epoch": 0.5187038658826473, + "grad_norm": 19.25, + "kl": 8.714259147644043, + "learning_rate": 5e-06, + "logits/chosen": -54971684.571428575, + "logits/rejected": -59505945.6, + "logps/chosen": -436.32693917410717, + "logps/rejected": -552.689794921875, + "loss": 0.0456, + "rewards/chosen": 8.786645071847099, + "rewards/margins": 20.13851732526507, + "rewards/rejected": -11.35187225341797, + "step": 2073 + }, + { + "epoch": 0.5189540848242212, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54045696.0, + "logits/rejected": -58517474.461538464, + "logps/chosen": -356.51045365767044, + "logps/rejected": -623.7421123798077, + "loss": 0.0339, + "rewards/chosen": 8.108573219992898, + "rewards/margins": 21.41727260776333, + "rewards/rejected": -13.308699387770433, + "step": 2074 + }, + { + "epoch": 0.5192043037657951, + "grad_norm": 4.59375, + "kl": 6.460572719573975, + "learning_rate": 5e-06, + "logits/chosen": -45779033.6, + "logits/rejected": -28082402.285714287, + "logps/chosen": -470.540869140625, + "logps/rejected": -618.8392857142857, + "loss": 0.0274, + "rewards/chosen": 8.77569808959961, + "rewards/margins": 19.769512830461775, + "rewards/rejected": -10.993814740862165, + "step": 2075 + }, + { + "epoch": 0.5194545227073689, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72792339.6923077, + "logits/rejected": -69264535.27272727, + "logps/chosen": -430.19884314903845, + "logps/rejected": -590.5677379261364, + "loss": 0.0235, + "rewards/chosen": 8.495459336500902, + "rewards/margins": 20.33566236162519, + "rewards/rejected": -11.84020302512429, + "step": 2076 + }, + { + "epoch": 0.5197047416489429, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24482441.6, + "logits/rejected": -43688827.428571425, + "logps/chosen": -282.1775390625, + "logps/rejected": -944.1690848214286, + "loss": 0.0398, + "rewards/chosen": 7.059893798828125, + "rewards/margins": 26.580968366350447, + "rewards/rejected": -19.521074567522323, + "step": 2077 + }, + { + "epoch": 0.5199549605905167, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40506048.0, + "logits/rejected": -49761493.333333336, + "logps/chosen": -414.41216362847223, + "logps/rejected": -496.12151692708335, + "loss": 0.0422, + "rewards/chosen": 7.368762546115452, + "rewards/margins": 17.844663831922745, + "rewards/rejected": -10.475901285807291, + "step": 2078 + }, + { + "epoch": 0.5202051795320906, + "grad_norm": 6.84375, + "kl": 0.05200608819723129, + "learning_rate": 5e-06, + "logits/chosen": -90009856.0, + "logits/rejected": -40311389.538461536, + "logps/chosen": -425.7457386363636, + "logps/rejected": -548.8005183293269, + "loss": 0.0169, + "rewards/chosen": 7.362829728560015, + "rewards/margins": 21.22670255007444, + "rewards/rejected": -13.863872821514423, + "step": 2079 + }, + { + "epoch": 0.5204553984736645, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32624028.444444444, + "logits/rejected": -37109038.93333333, + "logps/chosen": -282.2387424045139, + "logps/rejected": -639.5348958333333, + "loss": 0.0205, + "rewards/chosen": 7.308046976725261, + "rewards/margins": 22.494525655110678, + "rewards/rejected": -15.186478678385416, + "step": 2080 + }, + { + "epoch": 0.5207056174152384, + "grad_norm": 9.125, + "kl": 3.9720003604888916, + "learning_rate": 5e-06, + "logits/chosen": -55842612.0, + "logits/rejected": -56257104.0, + "logps/chosen": -394.1968994140625, + "logps/rejected": -588.953857421875, + "loss": 0.0374, + "rewards/chosen": 7.986486434936523, + "rewards/margins": 19.265053749084473, + "rewards/rejected": -11.27856731414795, + "step": 2081 + }, + { + "epoch": 0.5209558363568122, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34274884.0, + "logits/rejected": -49217208.0, + "logps/chosen": -346.99847412109375, + "logps/rejected": -617.9710083007812, + "loss": 0.029, + "rewards/chosen": 6.280170440673828, + "rewards/margins": 23.859987258911133, + "rewards/rejected": -17.579816818237305, + "step": 2082 + }, + { + "epoch": 0.521206055298386, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42358688.0, + "logits/rejected": -23891298.666666668, + "logps/chosen": -344.8580729166667, + "logps/rejected": -586.412841796875, + "loss": 0.0234, + "rewards/chosen": 7.743685404459636, + "rewards/margins": 22.881924947102863, + "rewards/rejected": -15.138239542643229, + "step": 2083 + }, + { + "epoch": 0.52145627423996, + "grad_norm": 5.46875, + "kl": 2.381242275238037, + "learning_rate": 5e-06, + "logits/chosen": -53220164.92307692, + "logits/rejected": -64389312.0, + "logps/chosen": -442.71225210336536, + "logps/rejected": -647.2072088068181, + "loss": 0.0319, + "rewards/chosen": 8.017402062049278, + "rewards/margins": 21.902012324833372, + "rewards/rejected": -13.884610262784092, + "step": 2084 + }, + { + "epoch": 0.5217064931815338, + "grad_norm": 1.828125, + "kl": 1.9587072134017944, + "learning_rate": 5e-06, + "logits/chosen": -68653096.72727273, + "logits/rejected": -55428913.23076923, + "logps/chosen": -413.34419389204544, + "logps/rejected": -612.1852463942307, + "loss": 0.0213, + "rewards/chosen": 8.338054310191762, + "rewards/margins": 22.31072229772181, + "rewards/rejected": -13.972667987530048, + "step": 2085 + }, + { + "epoch": 0.5219567121231077, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28742156.0, + "logits/rejected": -40883200.0, + "logps/chosen": -403.5440673828125, + "logps/rejected": -475.2159729003906, + "loss": 0.0597, + "rewards/chosen": 6.060738563537598, + "rewards/margins": 17.26920223236084, + "rewards/rejected": -11.208463668823242, + "step": 2086 + }, + { + "epoch": 0.5222069310646816, + "grad_norm": 10.8125, + "kl": 13.573837280273438, + "learning_rate": 5e-06, + "logits/chosen": -80758272.0, + "logits/rejected": -26096030.4, + "logps/chosen": -418.25362723214283, + "logps/rejected": -697.20859375, + "loss": 0.0468, + "rewards/chosen": 7.9937270028250555, + "rewards/margins": 23.341607557024275, + "rewards/rejected": -15.347880554199218, + "step": 2087 + }, + { + "epoch": 0.5224571500062555, + "grad_norm": 8.4375, + "kl": 4.225133419036865, + "learning_rate": 5e-06, + "logits/chosen": -54701484.307692304, + "logits/rejected": -28954170.181818184, + "logps/chosen": -413.9116962139423, + "logps/rejected": -425.50905539772725, + "loss": 0.025, + "rewards/chosen": 8.783909724308895, + "rewards/margins": 19.46287035108446, + "rewards/rejected": -10.678960626775568, + "step": 2088 + }, + { + "epoch": 0.5227073689478293, + "grad_norm": 16.125, + "kl": 6.540717601776123, + "learning_rate": 5e-06, + "logits/chosen": -29450290.0, + "logits/rejected": -33228566.0, + "logps/chosen": -310.2848205566406, + "logps/rejected": -503.5518493652344, + "loss": 0.1057, + "rewards/chosen": 5.003633499145508, + "rewards/margins": 18.72876739501953, + "rewards/rejected": -13.725133895874023, + "step": 2089 + }, + { + "epoch": 0.5229575878894033, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47189081.6, + "logits/rejected": -51009380.571428575, + "logps/chosen": -434.4208984375, + "logps/rejected": -621.5802176339286, + "loss": 0.0627, + "rewards/chosen": 8.424483489990234, + "rewards/margins": 23.781891196114678, + "rewards/rejected": -15.357407706124443, + "step": 2090 + }, + { + "epoch": 0.5232078068309771, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29001192.727272727, + "logits/rejected": -37672637.538461536, + "logps/chosen": -318.8658558238636, + "logps/rejected": -454.56201171875, + "loss": 0.0326, + "rewards/chosen": 5.821513782848012, + "rewards/margins": 17.469102659425538, + "rewards/rejected": -11.647588876577524, + "step": 2091 + }, + { + "epoch": 0.523458025772551, + "grad_norm": 4.59375, + "kl": 2.3009250164031982, + "learning_rate": 5e-06, + "logits/chosen": -61417113.6, + "logits/rejected": -58015021.71428572, + "logps/chosen": -589.58125, + "logps/rejected": -683.4371512276786, + "loss": 0.0264, + "rewards/chosen": 9.70254898071289, + "rewards/margins": 26.999474116734095, + "rewards/rejected": -17.296925136021205, + "step": 2092 + }, + { + "epoch": 0.5237082447141248, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 36963522.666666664, + "logits/rejected": -38203008.0, + "logps/chosen": -427.2663981119792, + "logps/rejected": -546.7637261284722, + "loss": 0.0367, + "rewards/chosen": 7.949426015218099, + "rewards/margins": 22.361718495686848, + "rewards/rejected": -14.41229248046875, + "step": 2093 + }, + { + "epoch": 0.5239584636556988, + "grad_norm": 11.6875, + "kl": 8.629111289978027, + "learning_rate": 5e-06, + "logits/chosen": -34492838.85714286, + "logits/rejected": -5663905.6, + "logps/chosen": -411.48789760044644, + "logps/rejected": -557.04755859375, + "loss": 0.0369, + "rewards/chosen": 8.349563598632812, + "rewards/margins": 22.080616760253907, + "rewards/rejected": -13.731053161621094, + "step": 2094 + }, + { + "epoch": 0.5242086825972726, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51616777.84615385, + "logits/rejected": -7341293.818181818, + "logps/chosen": -420.189453125, + "logps/rejected": -432.1419122869318, + "loss": 0.0299, + "rewards/chosen": 8.60259775015024, + "rewards/margins": 21.33293055821132, + "rewards/rejected": -12.73033280806108, + "step": 2095 + }, + { + "epoch": 0.5244589015388464, + "grad_norm": 16.625, + "kl": 4.806304931640625, + "learning_rate": 5e-06, + "logits/chosen": -62126126.54545455, + "logits/rejected": -53115431.384615384, + "logps/chosen": -375.15371981534093, + "logps/rejected": -758.2982271634615, + "loss": 0.0542, + "rewards/chosen": 7.729522011496804, + "rewards/margins": 22.659683707710744, + "rewards/rejected": -14.930161696213942, + "step": 2096 + }, + { + "epoch": 0.5247091204804204, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32007594.666666668, + "logits/rejected": -26637034.666666668, + "logps/chosen": -370.8730061848958, + "logps/rejected": -709.4248860677084, + "loss": 0.0556, + "rewards/chosen": 6.582632700602214, + "rewards/margins": 18.42068862915039, + "rewards/rejected": -11.838055928548178, + "step": 2097 + }, + { + "epoch": 0.5249593394219942, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53731357.09090909, + "logits/rejected": -60406360.615384616, + "logps/chosen": -444.51100852272725, + "logps/rejected": -551.7612680288462, + "loss": 0.0258, + "rewards/chosen": 6.653153159401634, + "rewards/margins": 18.796559954023028, + "rewards/rejected": -12.143406794621395, + "step": 2098 + }, + { + "epoch": 0.5252095583635681, + "grad_norm": 6.125, + "kl": 14.444297790527344, + "learning_rate": 5e-06, + "logits/chosen": -58547131.733333334, + "logits/rejected": -2612326.222222222, + "logps/chosen": -520.5162109375, + "logps/rejected": -502.16948784722223, + "loss": 0.0158, + "rewards/chosen": 9.365028889973958, + "rewards/margins": 25.255394490559894, + "rewards/rejected": -15.890365600585938, + "step": 2099 + }, + { + "epoch": 0.525459777305142, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49314140.44444445, + "logits/rejected": -52897715.2, + "logps/chosen": -287.5767415364583, + "logps/rejected": -596.5087890625, + "loss": 0.024, + "rewards/chosen": 7.003359476725261, + "rewards/margins": 21.92371368408203, + "rewards/rejected": -14.920354207356771, + "step": 2100 + }, + { + "epoch": 0.5257099962467159, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39520059.07692308, + "logits/rejected": -45963450.18181818, + "logps/chosen": -376.18130258413464, + "logps/rejected": -704.6801313920455, + "loss": 0.0243, + "rewards/chosen": 7.614232576810396, + "rewards/margins": 24.37515232112858, + "rewards/rejected": -16.760919744318183, + "step": 2101 + }, + { + "epoch": 0.5259602151882897, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32568985.6, + "logits/rejected": -50713101.71428572, + "logps/chosen": -357.2099609375, + "logps/rejected": -492.99239676339283, + "loss": 0.0231, + "rewards/chosen": 7.878208923339844, + "rewards/margins": 20.923835972377233, + "rewards/rejected": -13.045627049037389, + "step": 2102 + }, + { + "epoch": 0.5262104341298637, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88828380.44444445, + "logits/rejected": -65719266.13333333, + "logps/chosen": -535.8365885416666, + "logps/rejected": -583.1625, + "loss": 0.0066, + "rewards/chosen": 9.50605689154731, + "rewards/margins": 24.21681942409939, + "rewards/rejected": -14.710762532552083, + "step": 2103 + }, + { + "epoch": 0.5264606530714375, + "grad_norm": 8.4375, + "kl": 3.560891628265381, + "learning_rate": 5e-06, + "logits/chosen": -68230980.92307693, + "logits/rejected": -55327778.90909091, + "logps/chosen": -428.39321664663464, + "logps/rejected": -760.7004616477273, + "loss": 0.012, + "rewards/chosen": 7.89062734750601, + "rewards/margins": 26.79240374798541, + "rewards/rejected": -18.901776400479402, + "step": 2104 + }, + { + "epoch": 0.5267108720130114, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53851953.23076923, + "logits/rejected": -63369972.36363637, + "logps/chosen": -437.61245492788464, + "logps/rejected": -507.2228338068182, + "loss": 0.0748, + "rewards/chosen": 7.3579277625450725, + "rewards/margins": 16.9631828894982, + "rewards/rejected": -9.605255126953125, + "step": 2105 + }, + { + "epoch": 0.5269610909545852, + "grad_norm": 4.78125, + "kl": 2.536426544189453, + "learning_rate": 5e-06, + "logits/chosen": -46098952.72727273, + "logits/rejected": -53464743.384615384, + "logps/chosen": -401.6966441761364, + "logps/rejected": -516.7514272836538, + "loss": 0.0437, + "rewards/chosen": 9.960128090598367, + "rewards/margins": 21.041033711466756, + "rewards/rejected": -11.08090562086839, + "step": 2106 + }, + { + "epoch": 0.5272113098961592, + "grad_norm": 9.75, + "kl": 8.824483871459961, + "learning_rate": 5e-06, + "logits/chosen": -36492864.0, + "logits/rejected": -19210400.0, + "logps/chosen": -263.2241962139423, + "logps/rejected": -480.88671875, + "loss": 0.0876, + "rewards/chosen": 5.928091195913462, + "rewards/margins": 19.284822263917725, + "rewards/rejected": -13.356731068004262, + "step": 2107 + }, + { + "epoch": 0.527461528837733, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50913319.384615384, + "logits/rejected": -48499389.09090909, + "logps/chosen": -446.46304086538464, + "logps/rejected": -612.26416015625, + "loss": 0.0142, + "rewards/chosen": 8.980013333834135, + "rewards/margins": 24.598332758550043, + "rewards/rejected": -15.618319424715908, + "step": 2108 + }, + { + "epoch": 0.5277117477793069, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35225565.538461536, + "logits/rejected": -44093003.63636363, + "logps/chosen": -348.47329477163464, + "logps/rejected": -610.2667347301136, + "loss": 0.0254, + "rewards/chosen": 6.221956693209135, + "rewards/margins": 20.824971552495356, + "rewards/rejected": -14.60301485928622, + "step": 2109 + }, + { + "epoch": 0.5279619667208808, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37615616.0, + "logits/rejected": -54655044.92307692, + "logps/chosen": -352.99740323153407, + "logps/rejected": -589.56201171875, + "loss": 0.0119, + "rewards/chosen": 7.750507701526988, + "rewards/margins": 24.41873115592903, + "rewards/rejected": -16.668223454402042, + "step": 2110 + }, + { + "epoch": 0.5282121856624546, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65892876.8, + "logits/rejected": -14716937.142857144, + "logps/chosen": -400.0557373046875, + "logps/rejected": -519.3572126116071, + "loss": 0.0102, + "rewards/chosen": 7.787289428710937, + "rewards/margins": 22.163644191196987, + "rewards/rejected": -14.376354762486049, + "step": 2111 + }, + { + "epoch": 0.5284624046040285, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47895084.8, + "logits/rejected": -67631305.14285715, + "logps/chosen": -436.317236328125, + "logps/rejected": -631.9432198660714, + "loss": 0.0339, + "rewards/chosen": 8.643710327148437, + "rewards/margins": 20.720758492606024, + "rewards/rejected": -12.077048165457589, + "step": 2112 + }, + { + "epoch": 0.5287126235456024, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87472378.66666667, + "logits/rejected": -67988101.33333333, + "logps/chosen": -490.9977213541667, + "logps/rejected": -532.859375, + "loss": 0.0173, + "rewards/chosen": 7.319589614868164, + "rewards/margins": 19.909045537312828, + "rewards/rejected": -12.589455922444662, + "step": 2113 + }, + { + "epoch": 0.5289628424871763, + "grad_norm": 4.8125, + "kl": 2.0825538635253906, + "learning_rate": 5e-06, + "logits/chosen": -90405730.46153846, + "logits/rejected": -44715182.54545455, + "logps/chosen": -370.0554762620192, + "logps/rejected": -631.4421164772727, + "loss": 0.0501, + "rewards/chosen": 7.491507897010217, + "rewards/margins": 21.72311134605141, + "rewards/rejected": -14.231603449041193, + "step": 2114 + }, + { + "epoch": 0.5292130614287501, + "grad_norm": 5.84375, + "kl": 10.327430725097656, + "learning_rate": 5e-06, + "logits/chosen": -29686839.272727273, + "logits/rejected": -44614148.92307692, + "logps/chosen": -419.46883877840907, + "logps/rejected": -643.4575570913462, + "loss": 0.0388, + "rewards/chosen": 7.985938332297585, + "rewards/margins": 25.45036305247487, + "rewards/rejected": -17.464424720177284, + "step": 2115 + }, + { + "epoch": 0.5294632803703241, + "grad_norm": 15.875, + "kl": 3.125221014022827, + "learning_rate": 5e-06, + "logits/chosen": -46369974.85714286, + "logits/rejected": -46014256.0, + "logps/chosen": -378.63487025669644, + "logps/rejected": -567.995361328125, + "loss": 0.0922, + "rewards/chosen": 6.825752803257534, + "rewards/margins": 17.87595727103097, + "rewards/rejected": -11.050204467773437, + "step": 2116 + }, + { + "epoch": 0.5297134993118979, + "grad_norm": 3.734375, + "kl": 4.1598219871521, + "learning_rate": 5e-06, + "logits/chosen": -58447701.333333336, + "logits/rejected": -57366624.0, + "logps/chosen": -499.395751953125, + "logps/rejected": -631.832763671875, + "loss": 0.0102, + "rewards/chosen": 10.0084228515625, + "rewards/margins": 22.379816691080727, + "rewards/rejected": -12.371393839518229, + "step": 2117 + }, + { + "epoch": 0.5299637182534718, + "grad_norm": 12.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60711925.333333336, + "logits/rejected": -35127509.333333336, + "logps/chosen": -308.4695231119792, + "logps/rejected": -558.070068359375, + "loss": 0.0547, + "rewards/chosen": 5.445240656534831, + "rewards/margins": 17.939631779988606, + "rewards/rejected": -12.494391123453775, + "step": 2118 + }, + { + "epoch": 0.5302139371950456, + "grad_norm": 6.59375, + "kl": 12.382843017578125, + "learning_rate": 5e-06, + "logits/chosen": -63010341.64705882, + "logits/rejected": -38518948.571428575, + "logps/chosen": -473.96814682904414, + "logps/rejected": -618.8705357142857, + "loss": 0.0613, + "rewards/chosen": 9.446267520680147, + "rewards/margins": 17.345983489220885, + "rewards/rejected": -7.899715968540737, + "step": 2119 + }, + { + "epoch": 0.5304641561366196, + "grad_norm": 3.234375, + "kl": 3.022669553756714, + "learning_rate": 5e-06, + "logits/chosen": -54565216.0, + "logits/rejected": -42595736.0, + "logps/chosen": -461.761962890625, + "logps/rejected": -585.8907877604166, + "loss": 0.0418, + "rewards/chosen": 8.013600667317709, + "rewards/margins": 21.019312540690105, + "rewards/rejected": -13.005711873372396, + "step": 2120 + }, + { + "epoch": 0.5307143750781934, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42197494.15384615, + "logits/rejected": -61500887.27272727, + "logps/chosen": -307.20389498197113, + "logps/rejected": -498.00204190340907, + "loss": 0.0233, + "rewards/chosen": 7.171900822566106, + "rewards/margins": 18.587839460039472, + "rewards/rejected": -11.415938637473367, + "step": 2121 + }, + { + "epoch": 0.5309645940197673, + "grad_norm": 17.0, + "kl": 0.9585012197494507, + "learning_rate": 5e-06, + "logits/chosen": -54077088.0, + "logits/rejected": -87668117.33333333, + "logps/chosen": -399.3264973958333, + "logps/rejected": -712.0403645833334, + "loss": 0.0416, + "rewards/chosen": 8.252445220947266, + "rewards/margins": 21.95688501993815, + "rewards/rejected": -13.704439798990885, + "step": 2122 + }, + { + "epoch": 0.5312148129613412, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33616712.53333333, + "logits/rejected": -57114951.11111111, + "logps/chosen": -349.10813802083334, + "logps/rejected": -464.7936740451389, + "loss": 0.0367, + "rewards/chosen": 7.003118896484375, + "rewards/margins": 17.87576904296875, + "rewards/rejected": -10.872650146484375, + "step": 2123 + }, + { + "epoch": 0.531465031902915, + "grad_norm": 9.0625, + "kl": 11.809114456176758, + "learning_rate": 5e-06, + "logits/chosen": -58904052.0, + "logits/rejected": -52542116.0, + "logps/chosen": -504.20068359375, + "logps/rejected": -678.1909790039062, + "loss": 0.0602, + "rewards/chosen": 8.911550521850586, + "rewards/margins": 26.381433486938477, + "rewards/rejected": -17.46988296508789, + "step": 2124 + }, + { + "epoch": 0.5317152508444889, + "grad_norm": 7.0, + "kl": 2.268538236618042, + "learning_rate": 5e-06, + "logits/chosen": -75820424.53333333, + "logits/rejected": -39155392.0, + "logps/chosen": -520.5255208333333, + "logps/rejected": -482.20909288194446, + "loss": 0.0396, + "rewards/chosen": 9.869484456380208, + "rewards/margins": 21.559503851996528, + "rewards/rejected": -11.69001939561632, + "step": 2125 + }, + { + "epoch": 0.5319654697860629, + "grad_norm": 8.9375, + "kl": 1.367227554321289, + "learning_rate": 5e-06, + "logits/chosen": -34305932.8, + "logits/rejected": -86356096.0, + "logps/chosen": -327.6140625, + "logps/rejected": -626.7463727678571, + "loss": 0.0558, + "rewards/chosen": 6.918932342529297, + "rewards/margins": 19.63777389526367, + "rewards/rejected": -12.718841552734375, + "step": 2126 + }, + { + "epoch": 0.5322156887276367, + "grad_norm": 14.5625, + "kl": 6.1609206199646, + "learning_rate": 5e-06, + "logits/chosen": -55220216.47058824, + "logits/rejected": -35235504.0, + "logps/chosen": -418.03406479779414, + "logps/rejected": -498.31717354910717, + "loss": 0.074, + "rewards/chosen": 8.814028571633731, + "rewards/margins": 20.84622025690159, + "rewards/rejected": -12.032191685267858, + "step": 2127 + }, + { + "epoch": 0.5324659076692105, + "grad_norm": 15.8125, + "kl": 10.758535385131836, + "learning_rate": 5e-06, + "logits/chosen": -45146057.14285714, + "logits/rejected": -54262873.6, + "logps/chosen": -473.0800083705357, + "logps/rejected": -707.765234375, + "loss": 0.1254, + "rewards/chosen": 7.696138109479632, + "rewards/margins": 23.422972215924943, + "rewards/rejected": -15.726834106445313, + "step": 2128 + }, + { + "epoch": 0.5327161266107845, + "grad_norm": 9.0625, + "kl": 5.758484840393066, + "learning_rate": 5e-06, + "logits/chosen": -56538137.6, + "logits/rejected": -34462044.44444445, + "logps/chosen": -363.94755859375, + "logps/rejected": -356.225341796875, + "loss": 0.0426, + "rewards/chosen": 7.233360290527344, + "rewards/margins": 18.888608296712242, + "rewards/rejected": -11.655248006184896, + "step": 2129 + }, + { + "epoch": 0.5329663455523583, + "grad_norm": 6.78125, + "kl": 9.110494613647461, + "learning_rate": 5e-06, + "logits/chosen": -66562820.92307692, + "logits/rejected": -30190085.818181816, + "logps/chosen": -462.06186147836536, + "logps/rejected": -368.6480158025568, + "loss": 0.0159, + "rewards/chosen": 8.03597435584435, + "rewards/margins": 16.912475265823044, + "rewards/rejected": -8.876500909978693, + "step": 2130 + }, + { + "epoch": 0.5332165644939322, + "grad_norm": 12.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25514594.46153846, + "logits/rejected": -76707072.0, + "logps/chosen": -285.0158128004808, + "logps/rejected": -612.3947975852273, + "loss": 0.0366, + "rewards/chosen": 5.488927987905649, + "rewards/margins": 21.16722555093832, + "rewards/rejected": -15.67829756303267, + "step": 2131 + }, + { + "epoch": 0.533466783435506, + "grad_norm": 15.75, + "kl": 23.703855514526367, + "learning_rate": 5e-06, + "logits/chosen": -59613158.4, + "logits/rejected": -39719881.14285714, + "logps/chosen": -522.08369140625, + "logps/rejected": -673.2114955357143, + "loss": 0.166, + "rewards/chosen": 7.790958404541016, + "rewards/margins": 20.600690024239675, + "rewards/rejected": -12.809731619698661, + "step": 2132 + }, + { + "epoch": 0.53371700237708, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51006016.0, + "logits/rejected": -33163864.615384616, + "logps/chosen": -287.73530717329544, + "logps/rejected": -489.1514423076923, + "loss": 0.033, + "rewards/chosen": 6.132544777610085, + "rewards/margins": 18.318372179578233, + "rewards/rejected": -12.18582740196815, + "step": 2133 + }, + { + "epoch": 0.5339672213186538, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43452890.666666664, + "logits/rejected": -73550229.33333333, + "logps/chosen": -432.9000651041667, + "logps/rejected": -658.461669921875, + "loss": 0.0247, + "rewards/chosen": 8.371060053507486, + "rewards/margins": 21.972912470499672, + "rewards/rejected": -13.601852416992188, + "step": 2134 + }, + { + "epoch": 0.5342174402602277, + "grad_norm": 8.875, + "kl": 3.3601222038269043, + "learning_rate": 5e-06, + "logits/chosen": -53444373.333333336, + "logits/rejected": -24643194.666666668, + "logps/chosen": -351.6663818359375, + "logps/rejected": -373.1070556640625, + "loss": 0.0538, + "rewards/chosen": 7.32926877339681, + "rewards/margins": 18.887587865193684, + "rewards/rejected": -11.558319091796875, + "step": 2135 + }, + { + "epoch": 0.5344676592018016, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46256667.428571425, + "logits/rejected": -48986227.2, + "logps/chosen": -430.87642996651783, + "logps/rejected": -626.4486328125, + "loss": 0.026, + "rewards/chosen": 9.192671639578682, + "rewards/margins": 22.69137769426618, + "rewards/rejected": -13.4987060546875, + "step": 2136 + }, + { + "epoch": 0.5347178781433755, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41488512.0, + "logits/rejected": -37580633.6, + "logps/chosen": -430.07457139756946, + "logps/rejected": -513.7670572916667, + "loss": 0.0027, + "rewards/chosen": 8.642894321017796, + "rewards/margins": 20.497755771213107, + "rewards/rejected": -11.854861450195312, + "step": 2137 + }, + { + "epoch": 0.5349680970849493, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45453704.72727273, + "logits/rejected": -51846611.692307696, + "logps/chosen": -357.63032670454544, + "logps/rejected": -463.6939227764423, + "loss": 0.0593, + "rewards/chosen": 6.423076282848012, + "rewards/margins": 18.061006345948975, + "rewards/rejected": -11.637930063100962, + "step": 2138 + }, + { + "epoch": 0.5352183160265233, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63992453.81818182, + "logits/rejected": -55921826.461538464, + "logps/chosen": -327.09965376420456, + "logps/rejected": -543.8126878004807, + "loss": 0.0435, + "rewards/chosen": 5.947715759277344, + "rewards/margins": 20.829772362342247, + "rewards/rejected": -14.882056603064903, + "step": 2139 + }, + { + "epoch": 0.5354685349680971, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53123708.0, + "logits/rejected": -54756760.0, + "logps/chosen": -531.7557373046875, + "logps/rejected": -624.1727905273438, + "loss": 0.0235, + "rewards/chosen": 11.134231567382812, + "rewards/margins": 25.82841968536377, + "rewards/rejected": -14.694188117980957, + "step": 2140 + }, + { + "epoch": 0.5357187539096709, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67625314.13333334, + "logits/rejected": -20012430.222222224, + "logps/chosen": -463.2603515625, + "logps/rejected": -495.0603841145833, + "loss": 0.0073, + "rewards/chosen": 8.080677795410157, + "rewards/margins": 20.32905019124349, + "rewards/rejected": -12.248372395833334, + "step": 2141 + }, + { + "epoch": 0.5359689728512448, + "grad_norm": 11.4375, + "kl": 9.354574203491211, + "learning_rate": 5e-06, + "logits/chosen": -49029770.666666664, + "logits/rejected": -78046650.66666667, + "logps/chosen": -325.4270833333333, + "logps/rejected": -441.2404378255208, + "loss": 0.1817, + "rewards/chosen": 6.097594579060872, + "rewards/margins": 16.70362917582194, + "rewards/rejected": -10.606034596761068, + "step": 2142 + }, + { + "epoch": 0.5362191917928187, + "grad_norm": 7.90625, + "kl": 8.570608139038086, + "learning_rate": 5e-06, + "logits/chosen": -80895488.0, + "logits/rejected": -75579944.72727273, + "logps/chosen": -548.6659029447115, + "logps/rejected": -657.7452503551136, + "loss": 0.021, + "rewards/chosen": 8.808912423940805, + "rewards/margins": 21.949294670478448, + "rewards/rejected": -13.140382246537643, + "step": 2143 + }, + { + "epoch": 0.5364694107343926, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47609930.666666664, + "logits/rejected": -39024976.0, + "logps/chosen": -342.5165201822917, + "logps/rejected": -695.5323893229166, + "loss": 0.0323, + "rewards/chosen": 6.115297953287761, + "rewards/margins": 24.136516571044922, + "rewards/rejected": -18.02121861775716, + "step": 2144 + }, + { + "epoch": 0.5367196296759664, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -106418378.66666667, + "logits/rejected": -28702352.0, + "logps/chosen": -473.4376220703125, + "logps/rejected": -562.3690592447916, + "loss": 0.0202, + "rewards/chosen": 9.191539764404297, + "rewards/margins": 23.011844635009766, + "rewards/rejected": -13.820304870605469, + "step": 2145 + }, + { + "epoch": 0.5369698486175404, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53861416.0, + "logits/rejected": -50869288.0, + "logps/chosen": -405.7999267578125, + "logps/rejected": -489.3377685546875, + "loss": 0.0031, + "rewards/chosen": 7.776082515716553, + "rewards/margins": 20.25640630722046, + "rewards/rejected": -12.480323791503906, + "step": 2146 + }, + { + "epoch": 0.5372200675591142, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57994464.0, + "logits/rejected": -70563122.28571428, + "logps/chosen": -364.7872314453125, + "logps/rejected": -768.9342912946429, + "loss": 0.019, + "rewards/chosen": 8.0123046875, + "rewards/margins": 27.013536289760047, + "rewards/rejected": -19.001231602260045, + "step": 2147 + }, + { + "epoch": 0.5374702865006881, + "grad_norm": 10.1875, + "kl": 8.6414213180542, + "learning_rate": 5e-06, + "logits/chosen": -50472853.333333336, + "logits/rejected": -39408810.666666664, + "logps/chosen": -373.7132568359375, + "logps/rejected": -545.390625, + "loss": 0.0892, + "rewards/chosen": 8.135077158610025, + "rewards/margins": 21.166544596354164, + "rewards/rejected": -13.03146743774414, + "step": 2148 + }, + { + "epoch": 0.537720505442262, + "grad_norm": 1.7578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52872763.428571425, + "logits/rejected": -68879194.35294117, + "logps/chosen": -373.35208565848217, + "logps/rejected": -737.7532169117648, + "loss": 0.0134, + "rewards/chosen": 8.416925157819476, + "rewards/margins": 25.080181634726642, + "rewards/rejected": -16.66325647690717, + "step": 2149 + }, + { + "epoch": 0.5379707243838359, + "grad_norm": 10.375, + "kl": 0.1011962890625, + "learning_rate": 5e-06, + "logits/chosen": -60058504.53333333, + "logits/rejected": -42145770.666666664, + "logps/chosen": -348.3949869791667, + "logps/rejected": -438.9693196614583, + "loss": 0.0585, + "rewards/chosen": 6.946583557128906, + "rewards/margins": 16.400017971462674, + "rewards/rejected": -9.453434414333767, + "step": 2150 + }, + { + "epoch": 0.5382209433254097, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52368337.45454545, + "logits/rejected": -45927241.84615385, + "logps/chosen": -413.10080788352275, + "logps/rejected": -624.5305739182693, + "loss": 0.0027, + "rewards/chosen": 9.568672180175781, + "rewards/margins": 24.63751396766076, + "rewards/rejected": -15.068841787484976, + "step": 2151 + }, + { + "epoch": 0.5384711622669837, + "grad_norm": 1.1875, + "kl": 0.8620737791061401, + "learning_rate": 5e-06, + "logits/chosen": -42608977.45454545, + "logits/rejected": -67789380.92307693, + "logps/chosen": -507.92258522727275, + "logps/rejected": -734.6612830528846, + "loss": 0.0147, + "rewards/chosen": 9.632912375710227, + "rewards/margins": 28.646574860686187, + "rewards/rejected": -19.01366248497596, + "step": 2152 + }, + { + "epoch": 0.5387213812085575, + "grad_norm": 15.8125, + "kl": 1.8593229055404663, + "learning_rate": 5e-06, + "logits/chosen": -57384992.0, + "logits/rejected": -61207456.0, + "logps/chosen": -381.95528738839283, + "logps/rejected": -688.5935546875, + "loss": 0.0667, + "rewards/chosen": 6.4124025617327005, + "rewards/margins": 25.52994864327567, + "rewards/rejected": -19.11754608154297, + "step": 2153 + }, + { + "epoch": 0.5389716001501313, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43190568.0, + "logits/rejected": -56134984.0, + "logps/chosen": -485.2317810058594, + "logps/rejected": -690.8965454101562, + "loss": 0.031, + "rewards/chosen": 8.927902221679688, + "rewards/margins": 23.43411636352539, + "rewards/rejected": -14.506214141845703, + "step": 2154 + }, + { + "epoch": 0.5392218190917052, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53580931.2, + "logits/rejected": -59117750.85714286, + "logps/chosen": -444.608740234375, + "logps/rejected": -624.9909319196429, + "loss": 0.0224, + "rewards/chosen": 8.351038360595703, + "rewards/margins": 23.65227519444057, + "rewards/rejected": -15.301236833844866, + "step": 2155 + }, + { + "epoch": 0.5394720380332791, + "grad_norm": 9.8125, + "kl": 1.611695647239685, + "learning_rate": 5e-06, + "logits/chosen": -22153156.0, + "logits/rejected": -39491300.0, + "logps/chosen": -451.125244140625, + "logps/rejected": -558.5101318359375, + "loss": 0.0338, + "rewards/chosen": 6.9938836097717285, + "rewards/margins": 19.404642581939697, + "rewards/rejected": -12.410758972167969, + "step": 2156 + }, + { + "epoch": 0.539722256974853, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45607395.2, + "logits/rejected": -49520960.0, + "logps/chosen": -386.5181396484375, + "logps/rejected": -675.9732142857143, + "loss": 0.0414, + "rewards/chosen": 6.091012573242187, + "rewards/margins": 21.779198128836494, + "rewards/rejected": -15.688185555594307, + "step": 2157 + }, + { + "epoch": 0.5399724759164268, + "grad_norm": 6.59375, + "kl": 0.5099624395370483, + "learning_rate": 5e-06, + "logits/chosen": -61208426.666666664, + "logits/rejected": -60301669.333333336, + "logps/chosen": -385.8745930989583, + "logps/rejected": -540.2327067057291, + "loss": 0.0136, + "rewards/chosen": 8.855892817179361, + "rewards/margins": 20.134509404500324, + "rewards/rejected": -11.278616587320963, + "step": 2158 + }, + { + "epoch": 0.5402226948580008, + "grad_norm": 10.8125, + "kl": 10.076713562011719, + "learning_rate": 5e-06, + "logits/chosen": -45392088.615384616, + "logits/rejected": -63322426.18181818, + "logps/chosen": -582.8646709735577, + "logps/rejected": -591.1146573153409, + "loss": 0.0426, + "rewards/chosen": 11.54007075383113, + "rewards/margins": 25.564244837194053, + "rewards/rejected": -14.024174083362926, + "step": 2159 + }, + { + "epoch": 0.5404729137995746, + "grad_norm": 11.625, + "kl": 1.472161054611206, + "learning_rate": 5e-06, + "logits/chosen": -42339500.307692304, + "logits/rejected": -22776603.636363637, + "logps/chosen": -437.17086087740387, + "logps/rejected": -827.0696022727273, + "loss": 0.0557, + "rewards/chosen": 8.695955716646635, + "rewards/margins": 23.84162860150104, + "rewards/rejected": -15.145672884854404, + "step": 2160 + }, + { + "epoch": 0.5407231327411485, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43561584.0, + "logits/rejected": -60191984.0, + "logps/chosen": -316.0899251302083, + "logps/rejected": -607.5961100260416, + "loss": 0.0707, + "rewards/chosen": 7.730888366699219, + "rewards/margins": 21.68212000528971, + "rewards/rejected": -13.951231638590494, + "step": 2161 + }, + { + "epoch": 0.5409733516827224, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66656418.90909091, + "logits/rejected": -39606528.0, + "logps/chosen": -345.62397904829544, + "logps/rejected": -516.9219876802885, + "loss": 0.0408, + "rewards/chosen": 6.436214793812145, + "rewards/margins": 18.246508138163108, + "rewards/rejected": -11.810293344350962, + "step": 2162 + }, + { + "epoch": 0.5412235706242963, + "grad_norm": 4.125, + "kl": 3.618314266204834, + "learning_rate": 5e-06, + "logits/chosen": -59207461.333333336, + "logits/rejected": -43887906.666666664, + "logps/chosen": -455.6234944661458, + "logps/rejected": -595.201171875, + "loss": 0.0574, + "rewards/chosen": 8.622791290283203, + "rewards/margins": 21.570638020833336, + "rewards/rejected": -12.94784673055013, + "step": 2163 + }, + { + "epoch": 0.5414737895658701, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44189516.0, + "logits/rejected": -47208928.0, + "logps/chosen": -331.9921875, + "logps/rejected": -601.8104248046875, + "loss": 0.0447, + "rewards/chosen": 6.838160514831543, + "rewards/margins": 19.5409574508667, + "rewards/rejected": -12.702796936035156, + "step": 2164 + }, + { + "epoch": 0.5417240085074441, + "grad_norm": 1.90625, + "kl": 0.47515934705734253, + "learning_rate": 5e-06, + "logits/chosen": -39371702.85714286, + "logits/rejected": -62783500.8, + "logps/chosen": -283.30032784598217, + "logps/rejected": -601.6263671875, + "loss": 0.0152, + "rewards/chosen": 6.797151838030134, + "rewards/margins": 22.176578412737165, + "rewards/rejected": -15.379426574707031, + "step": 2165 + }, + { + "epoch": 0.5419742274490179, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41829029.333333336, + "logits/rejected": -68100096.0, + "logps/chosen": -319.9404296875, + "logps/rejected": -767.939453125, + "loss": 0.0216, + "rewards/chosen": 7.023010889689128, + "rewards/margins": 27.498478571573894, + "rewards/rejected": -20.475467681884766, + "step": 2166 + }, + { + "epoch": 0.5422244463905918, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15970482.666666666, + "logits/rejected": -46390517.333333336, + "logps/chosen": -251.555419921875, + "logps/rejected": -647.8133951822916, + "loss": 0.0699, + "rewards/chosen": 4.927940050760905, + "rewards/margins": 18.13168430328369, + "rewards/rejected": -13.203744252522787, + "step": 2167 + }, + { + "epoch": 0.5424746653321656, + "grad_norm": 14.0625, + "kl": 8.705866813659668, + "learning_rate": 5e-06, + "logits/chosen": -12845481.142857144, + "logits/rejected": -63345945.6, + "logps/chosen": -418.20389229910717, + "logps/rejected": -671.7306640625, + "loss": 0.0845, + "rewards/chosen": 7.635802132742746, + "rewards/margins": 19.62263913835798, + "rewards/rejected": -11.986837005615234, + "step": 2168 + }, + { + "epoch": 0.5427248842737395, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47088011.63636363, + "logits/rejected": -37711163.07692308, + "logps/chosen": -392.6071111505682, + "logps/rejected": -616.1868239182693, + "loss": 0.0147, + "rewards/chosen": 8.458906693892045, + "rewards/margins": 26.749160553191928, + "rewards/rejected": -18.29025385929988, + "step": 2169 + }, + { + "epoch": 0.5429751032153134, + "grad_norm": 4.875, + "kl": 10.860645294189453, + "learning_rate": 5e-06, + "logits/chosen": -30952322.90909091, + "logits/rejected": -67502080.0, + "logps/chosen": -431.0446111505682, + "logps/rejected": -577.9402794471154, + "loss": 0.0405, + "rewards/chosen": 8.814153497869318, + "rewards/margins": 22.05920965021307, + "rewards/rejected": -13.24505615234375, + "step": 2170 + }, + { + "epoch": 0.5432253221568872, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22608361.6, + "logits/rejected": -51221572.571428575, + "logps/chosen": -380.4486328125, + "logps/rejected": -528.7574637276786, + "loss": 0.0164, + "rewards/chosen": 7.079256439208985, + "rewards/margins": 19.047522517613004, + "rewards/rejected": -11.968266078404017, + "step": 2171 + }, + { + "epoch": 0.5434755410984612, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28102499.2, + "logits/rejected": -52547044.571428575, + "logps/chosen": -486.08076171875, + "logps/rejected": -541.9791782924107, + "loss": 0.0484, + "rewards/chosen": 9.59075469970703, + "rewards/margins": 21.111326817103794, + "rewards/rejected": -11.520572117396764, + "step": 2172 + }, + { + "epoch": 0.543725760040035, + "grad_norm": 8.5625, + "kl": 10.987865447998047, + "learning_rate": 5e-06, + "logits/chosen": -67112864.0, + "logits/rejected": -65872614.4, + "logps/chosen": -431.044921875, + "logps/rejected": -687.9337890625, + "loss": 0.0151, + "rewards/chosen": 8.814708164760045, + "rewards/margins": 21.962219456263952, + "rewards/rejected": -13.147511291503907, + "step": 2173 + }, + { + "epoch": 0.5439759789816089, + "grad_norm": 11.4375, + "kl": 21.365909576416016, + "learning_rate": 5e-06, + "logits/chosen": -47334392.0, + "logits/rejected": -18644942.0, + "logps/chosen": -484.3040771484375, + "logps/rejected": -564.1136474609375, + "loss": 0.0232, + "rewards/chosen": 9.497515678405762, + "rewards/margins": 23.27571964263916, + "rewards/rejected": -13.778203964233398, + "step": 2174 + }, + { + "epoch": 0.5442261979231828, + "grad_norm": 0.66796875, + "kl": 1.4529647827148438, + "learning_rate": 5e-06, + "logits/chosen": -55848036.571428575, + "logits/rejected": -36699685.64705882, + "logps/chosen": -414.24155970982144, + "logps/rejected": -457.40900735294116, + "loss": 0.0015, + "rewards/chosen": 7.498930794852121, + "rewards/margins": 17.28485536976021, + "rewards/rejected": -9.785924574908089, + "step": 2175 + }, + { + "epoch": 0.5444764168647567, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40653533.09090909, + "logits/rejected": -10402633.846153846, + "logps/chosen": -294.27321555397725, + "logps/rejected": -510.5788010817308, + "loss": 0.0579, + "rewards/chosen": 6.40006533536044, + "rewards/margins": 17.556719106394095, + "rewards/rejected": -11.156653771033653, + "step": 2176 + }, + { + "epoch": 0.5447266358063305, + "grad_norm": 14.6875, + "kl": 4.0885114669799805, + "learning_rate": 5e-06, + "logits/chosen": -22234309.333333332, + "logits/rejected": -37483502.93333333, + "logps/chosen": -311.2684733072917, + "logps/rejected": -506.48675130208335, + "loss": 0.072, + "rewards/chosen": 6.507117801242405, + "rewards/margins": 18.170707278781467, + "rewards/rejected": -11.663589477539062, + "step": 2177 + }, + { + "epoch": 0.5449768547479045, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55770001.45454545, + "logits/rejected": -66267047.384615384, + "logps/chosen": -436.98100142045456, + "logps/rejected": -713.3965594951923, + "loss": 0.0081, + "rewards/chosen": 9.160180525346236, + "rewards/margins": 26.000577113011502, + "rewards/rejected": -16.840396587665264, + "step": 2178 + }, + { + "epoch": 0.5452270736894783, + "grad_norm": 15.625, + "kl": 8.537248611450195, + "learning_rate": 5e-06, + "logits/chosen": -70146816.0, + "logits/rejected": -54325971.2, + "logps/chosen": -413.77378627232144, + "logps/rejected": -544.406640625, + "loss": 0.0542, + "rewards/chosen": 7.999961308070591, + "rewards/margins": 25.58484900338309, + "rewards/rejected": -17.5848876953125, + "step": 2179 + }, + { + "epoch": 0.5454772926310522, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 31091122.285714287, + "logits/rejected": -42545980.23529412, + "logps/chosen": -529.1649693080357, + "logps/rejected": -518.1779067095588, + "loss": 0.0438, + "rewards/chosen": 6.9560121808733255, + "rewards/margins": 19.215088467638033, + "rewards/rejected": -12.259076286764707, + "step": 2180 + }, + { + "epoch": 0.545727511572626, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37047468.307692304, + "logits/rejected": -47433314.90909091, + "logps/chosen": -430.4957932692308, + "logps/rejected": -835.9010120738636, + "loss": 0.0506, + "rewards/chosen": 7.492636460524339, + "rewards/margins": 25.811034676078314, + "rewards/rejected": -18.318398215553977, + "step": 2181 + }, + { + "epoch": 0.5459777305142, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66010784.0, + "logits/rejected": -30941896.0, + "logps/chosen": -403.0221761067708, + "logps/rejected": -526.24560546875, + "loss": 0.0329, + "rewards/chosen": 8.192264556884766, + "rewards/margins": 19.355918884277344, + "rewards/rejected": -11.163654327392578, + "step": 2182 + }, + { + "epoch": 0.5462279494557738, + "grad_norm": 18.0, + "kl": 5.3979034423828125, + "learning_rate": 5e-06, + "logits/chosen": -70337253.33333333, + "logits/rejected": -27239736.0, + "logps/chosen": -393.8075358072917, + "logps/rejected": -520.8450520833334, + "loss": 0.0544, + "rewards/chosen": 7.347586313883464, + "rewards/margins": 17.980595270792644, + "rewards/rejected": -10.63300895690918, + "step": 2183 + }, + { + "epoch": 0.5464781683973476, + "grad_norm": 1.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62514629.333333336, + "logits/rejected": 52181093.333333336, + "logps/chosen": -504.5513509114583, + "logps/rejected": -918.9811197916666, + "loss": 0.0023, + "rewards/chosen": 8.76574452718099, + "rewards/margins": 26.832117716471352, + "rewards/rejected": -18.066373189290363, + "step": 2184 + }, + { + "epoch": 0.5467283873389216, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54562797.71428572, + "logits/rejected": -53581171.2, + "logps/chosen": -405.04453822544644, + "logps/rejected": -700.098095703125, + "loss": 0.0442, + "rewards/chosen": 8.844518389020648, + "rewards/margins": 28.08423854282924, + "rewards/rejected": -19.239720153808594, + "step": 2185 + }, + { + "epoch": 0.5469786062804954, + "grad_norm": 4.46875, + "kl": 0.8447456359863281, + "learning_rate": 5e-06, + "logits/chosen": -82862120.72727273, + "logits/rejected": -47114525.538461536, + "logps/chosen": -393.25727982954544, + "logps/rejected": -686.8629807692307, + "loss": 0.0107, + "rewards/chosen": 8.094938104802912, + "rewards/margins": 23.540986547936924, + "rewards/rejected": -15.446048443134014, + "step": 2186 + }, + { + "epoch": 0.5472288252220693, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10448516.0, + "logits/rejected": -59344832.0, + "logps/chosen": -282.8724365234375, + "logps/rejected": -626.5808919270834, + "loss": 0.0531, + "rewards/chosen": 6.767660776774089, + "rewards/margins": 23.443347930908203, + "rewards/rejected": -16.675687154134113, + "step": 2187 + }, + { + "epoch": 0.5474790441636432, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68673338.18181819, + "logits/rejected": -53837105.23076923, + "logps/chosen": -409.21932705965907, + "logps/rejected": -652.3308293269231, + "loss": 0.0239, + "rewards/chosen": 9.701058127663352, + "rewards/margins": 27.989755590478858, + "rewards/rejected": -18.288697462815506, + "step": 2188 + }, + { + "epoch": 0.5477292631052171, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40026880.0, + "logits/rejected": -35643818.666666664, + "logps/chosen": -308.7258029513889, + "logps/rejected": -565.1641927083333, + "loss": 0.048, + "rewards/chosen": 6.5353198581271705, + "rewards/margins": 17.64164310031467, + "rewards/rejected": -11.1063232421875, + "step": 2189 + }, + { + "epoch": 0.5479794820467909, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69521701.33333333, + "logits/rejected": -47572458.666666664, + "logps/chosen": -541.7156575520834, + "logps/rejected": -681.5446370442709, + "loss": 0.0226, + "rewards/chosen": 10.571956634521484, + "rewards/margins": 24.806456247965492, + "rewards/rejected": -14.23449961344401, + "step": 2190 + }, + { + "epoch": 0.5482297009883648, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3081955.2, + "logits/rejected": -42496416.0, + "logps/chosen": -416.4873046875, + "logps/rejected": -522.1173270089286, + "loss": 0.0048, + "rewards/chosen": 6.674021148681641, + "rewards/margins": 21.582796478271483, + "rewards/rejected": -14.908775329589844, + "step": 2191 + }, + { + "epoch": 0.5484799199299387, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58454080.0, + "logits/rejected": -64791889.06666667, + "logps/chosen": -320.16875542534723, + "logps/rejected": -707.8028645833333, + "loss": 0.0334, + "rewards/chosen": 5.107824113633898, + "rewards/margins": 20.67294455634223, + "rewards/rejected": -15.565120442708333, + "step": 2192 + }, + { + "epoch": 0.5487301388715126, + "grad_norm": 7.625, + "kl": 2.7421507835388184, + "learning_rate": 5e-06, + "logits/chosen": -21347787.636363637, + "logits/rejected": -36012430.76923077, + "logps/chosen": -315.5296519886364, + "logps/rejected": -480.9157902644231, + "loss": 0.064, + "rewards/chosen": 6.043200406161222, + "rewards/margins": 18.098869163673243, + "rewards/rejected": -12.05566875751202, + "step": 2193 + }, + { + "epoch": 0.5489803578130864, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65170368.0, + "logits/rejected": 40674552.47058824, + "logps/chosen": -574.1031668526786, + "logps/rejected": -624.6243106617648, + "loss": 0.0041, + "rewards/chosen": 9.291656494140625, + "rewards/margins": 25.288093118106616, + "rewards/rejected": -15.996436623965993, + "step": 2194 + }, + { + "epoch": 0.5492305767546604, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36908963.2, + "logits/rejected": -49748050.28571428, + "logps/chosen": -312.22421875, + "logps/rejected": -614.9411969866071, + "loss": 0.0063, + "rewards/chosen": 6.91546630859375, + "rewards/margins": 20.638973781040736, + "rewards/rejected": -13.723507472446986, + "step": 2195 + }, + { + "epoch": 0.5494807956962342, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15953094.666666666, + "logits/rejected": -41879858.666666664, + "logps/chosen": -337.8475341796875, + "logps/rejected": -522.1038411458334, + "loss": 0.0664, + "rewards/chosen": 6.207684834798177, + "rewards/margins": 19.16211191813151, + "rewards/rejected": -12.954427083333334, + "step": 2196 + }, + { + "epoch": 0.549731014637808, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56167214.54545455, + "logits/rejected": -66680300.307692304, + "logps/chosen": -452.09565873579544, + "logps/rejected": -561.7049654447115, + "loss": 0.0305, + "rewards/chosen": 8.521127874200994, + "rewards/margins": 25.107314209838016, + "rewards/rejected": -16.58618633563702, + "step": 2197 + }, + { + "epoch": 0.549981233579382, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27972622.222222224, + "logits/rejected": -43543189.333333336, + "logps/chosen": -382.10218641493054, + "logps/rejected": -666.6999348958333, + "loss": 0.024, + "rewards/chosen": 9.928428649902344, + "rewards/margins": 22.646010843912762, + "rewards/rejected": -12.717582194010417, + "step": 2198 + }, + { + "epoch": 0.5502314525209558, + "grad_norm": 2.5625, + "kl": 0.5684560537338257, + "learning_rate": 5e-06, + "logits/chosen": -45048960.0, + "logits/rejected": -64665073.23076923, + "logps/chosen": -371.20450106534093, + "logps/rejected": -554.3855168269231, + "loss": 0.0175, + "rewards/chosen": 6.731409939852628, + "rewards/margins": 19.395872689627268, + "rewards/rejected": -12.66446274977464, + "step": 2199 + }, + { + "epoch": 0.5504816714625297, + "grad_norm": 1.6328125, + "kl": 1.7726819515228271, + "learning_rate": 5e-06, + "logits/chosen": -62568110.54545455, + "logits/rejected": -38419904.0, + "logps/chosen": -365.2747913707386, + "logps/rejected": -581.5771108774038, + "loss": 0.0044, + "rewards/chosen": 7.271180586381392, + "rewards/margins": 20.96724780956348, + "rewards/rejected": -13.69606722318209, + "step": 2200 + }, + { + "epoch": 0.5507318904041036, + "grad_norm": 15.25, + "kl": 0.7482325434684753, + "learning_rate": 5e-06, + "logits/chosen": -46166922.666666664, + "logits/rejected": -48369476.266666666, + "logps/chosen": -315.99565972222223, + "logps/rejected": -600.784375, + "loss": 0.0668, + "rewards/chosen": 6.669189453125, + "rewards/margins": 21.257918294270834, + "rewards/rejected": -14.588728841145834, + "step": 2201 + }, + { + "epoch": 0.5509821093456775, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63079524.571428575, + "logits/rejected": -57004198.4, + "logps/chosen": -358.98824637276783, + "logps/rejected": -801.42880859375, + "loss": 0.0316, + "rewards/chosen": 8.077665056501116, + "rewards/margins": 26.286652483258926, + "rewards/rejected": -18.208987426757812, + "step": 2202 + }, + { + "epoch": 0.5512323282872513, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60866688.0, + "logits/rejected": -29806309.333333332, + "logps/chosen": -435.8663330078125, + "logps/rejected": -610.6060791015625, + "loss": 0.0358, + "rewards/chosen": 7.366757074991862, + "rewards/margins": 23.363701502482098, + "rewards/rejected": -15.996944427490234, + "step": 2203 + }, + { + "epoch": 0.5514825472288252, + "grad_norm": 14.0, + "kl": 10.557289123535156, + "learning_rate": 5e-06, + "logits/chosen": -29518647.111111112, + "logits/rejected": -39547400.53333333, + "logps/chosen": -324.73822699652777, + "logps/rejected": -566.5731119791667, + "loss": 0.0644, + "rewards/chosen": 7.264499240451389, + "rewards/margins": 19.48249240451389, + "rewards/rejected": -12.2179931640625, + "step": 2204 + }, + { + "epoch": 0.5517327661703991, + "grad_norm": 6.59375, + "kl": 6.861140251159668, + "learning_rate": 5e-06, + "logits/chosen": -50567394.90909091, + "logits/rejected": -69264000.0, + "logps/chosen": -330.64599609375, + "logps/rejected": -432.14599609375, + "loss": 0.0193, + "rewards/chosen": 7.343974720348012, + "rewards/margins": 15.957654459493144, + "rewards/rejected": -8.613679739145132, + "step": 2205 + }, + { + "epoch": 0.551982985111973, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44249888.0, + "logits/rejected": -53415596.307692304, + "logps/chosen": -366.6449085582386, + "logps/rejected": -648.1543719951923, + "loss": 0.0292, + "rewards/chosen": 7.450180747292259, + "rewards/margins": 20.49393121679346, + "rewards/rejected": -13.043750469501202, + "step": 2206 + }, + { + "epoch": 0.5522332040535468, + "grad_norm": 20.625, + "kl": 15.272483825683594, + "learning_rate": 5e-06, + "logits/chosen": -50977494.85714286, + "logits/rejected": -48314016.0, + "logps/chosen": -431.32212611607144, + "logps/rejected": -694.90078125, + "loss": 0.086, + "rewards/chosen": 9.32796151297433, + "rewards/margins": 24.871801539829796, + "rewards/rejected": -15.543840026855468, + "step": 2207 + }, + { + "epoch": 0.5524834229951208, + "grad_norm": 4.40625, + "kl": 1.089684247970581, + "learning_rate": 5e-06, + "logits/chosen": -31994034.666666668, + "logits/rejected": -61946890.666666664, + "logps/chosen": -368.3271484375, + "logps/rejected": -583.7980143229166, + "loss": 0.0198, + "rewards/chosen": 8.164920171101889, + "rewards/margins": 19.484787623087566, + "rewards/rejected": -11.319867451985678, + "step": 2208 + }, + { + "epoch": 0.5527336419366946, + "grad_norm": 1.0859375, + "kl": 1.9735896587371826, + "learning_rate": 5e-06, + "logits/chosen": -45969210.18181818, + "logits/rejected": -48982660.92307692, + "logps/chosen": -535.576171875, + "logps/rejected": -486.1553485576923, + "loss": 0.009, + "rewards/chosen": 9.057313398881393, + "rewards/margins": 20.035731602381993, + "rewards/rejected": -10.9784182035006, + "step": 2209 + }, + { + "epoch": 0.5529838608782685, + "grad_norm": 4.3125, + "kl": 6.8734588623046875, + "learning_rate": 5e-06, + "logits/chosen": -69413984.0, + "logits/rejected": -37256765.71428572, + "logps/chosen": -995.30712890625, + "logps/rejected": -512.4062848772321, + "loss": 0.0591, + "rewards/chosen": 14.148481750488282, + "rewards/margins": 25.39093736921038, + "rewards/rejected": -11.242455618722099, + "step": 2210 + }, + { + "epoch": 0.5532340798198424, + "grad_norm": 1.25, + "kl": 8.407530784606934, + "learning_rate": 5e-06, + "logits/chosen": -44320438.85714286, + "logits/rejected": -64208281.6, + "logps/chosen": -360.787109375, + "logps/rejected": -606.973681640625, + "loss": 0.0141, + "rewards/chosen": 7.3471205575125555, + "rewards/margins": 22.719944654192243, + "rewards/rejected": -15.372824096679688, + "step": 2211 + }, + { + "epoch": 0.5534842987614162, + "grad_norm": 1.53125, + "kl": 9.036863327026367, + "learning_rate": 5e-06, + "logits/chosen": -32719901.866666667, + "logits/rejected": -46798400.0, + "logps/chosen": -408.28974609375, + "logps/rejected": -757.9505750868055, + "loss": 0.0834, + "rewards/chosen": 8.70982157389323, + "rewards/margins": 21.787941487630206, + "rewards/rejected": -13.078119913736979, + "step": 2212 + }, + { + "epoch": 0.5537345177029901, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46420381.86666667, + "logits/rejected": -12633033.777777778, + "logps/chosen": -281.65673828125, + "logps/rejected": -463.3147243923611, + "loss": 0.0664, + "rewards/chosen": 4.480144246419271, + "rewards/margins": 14.702235073513457, + "rewards/rejected": -10.222090827094185, + "step": 2213 + }, + { + "epoch": 0.553984736644564, + "grad_norm": 10.3125, + "kl": 2.701261043548584, + "learning_rate": 5e-06, + "logits/chosen": -90977545.14285715, + "logits/rejected": -36695337.6, + "logps/chosen": -460.1714564732143, + "logps/rejected": -545.32685546875, + "loss": 0.0514, + "rewards/chosen": 8.387596675327845, + "rewards/margins": 22.643401118687223, + "rewards/rejected": -14.255804443359375, + "step": 2214 + }, + { + "epoch": 0.5542349555861379, + "grad_norm": 8.8125, + "kl": 2.6284308433532715, + "learning_rate": 5e-06, + "logits/chosen": -102740549.81818181, + "logits/rejected": -48862444.307692304, + "logps/chosen": -478.66317471590907, + "logps/rejected": -716.7600661057693, + "loss": 0.0174, + "rewards/chosen": 10.435284701260654, + "rewards/margins": 22.260294560785894, + "rewards/rejected": -11.82500985952524, + "step": 2215 + }, + { + "epoch": 0.5544851745277117, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66416279.27272727, + "logits/rejected": -67693838.76923077, + "logps/chosen": -378.44651100852275, + "logps/rejected": -532.3795072115385, + "loss": 0.0159, + "rewards/chosen": 7.745054765181108, + "rewards/margins": 24.40701976856152, + "rewards/rejected": -16.66196500338041, + "step": 2216 + }, + { + "epoch": 0.5547353934692856, + "grad_norm": 9.0, + "kl": 8.4419584274292, + "learning_rate": 5e-06, + "logits/chosen": -63761844.705882356, + "logits/rejected": -31694966.85714286, + "logps/chosen": -453.46452780330884, + "logps/rejected": -529.6546107700893, + "loss": 0.0169, + "rewards/chosen": 9.233721564797793, + "rewards/margins": 21.631866967978596, + "rewards/rejected": -12.398145403180804, + "step": 2217 + }, + { + "epoch": 0.5549856124108595, + "grad_norm": 3.609375, + "kl": 4.949073791503906, + "learning_rate": 5e-06, + "logits/chosen": -52177148.44444445, + "logits/rejected": -26743616.0, + "logps/chosen": -405.1030544704861, + "logps/rejected": -449.1039713541667, + "loss": 0.0219, + "rewards/chosen": 9.607464260525173, + "rewards/margins": 19.095936754014758, + "rewards/rejected": -9.488472493489583, + "step": 2218 + }, + { + "epoch": 0.5552358313524334, + "grad_norm": 15.0625, + "kl": 7.9950079917907715, + "learning_rate": 5e-06, + "logits/chosen": -27427912.533333335, + "logits/rejected": -27206906.666666668, + "logps/chosen": -481.2649739583333, + "logps/rejected": -358.31982421875, + "loss": 0.0733, + "rewards/chosen": 7.714934285481771, + "rewards/margins": 18.35647396511502, + "rewards/rejected": -10.641539679633247, + "step": 2219 + }, + { + "epoch": 0.5554860502940072, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -80972597.33333333, + "logits/rejected": -65675939.55555555, + "logps/chosen": -559.4909261067709, + "logps/rejected": -752.3308376736111, + "loss": 0.0103, + "rewards/chosen": 11.664863586425781, + "rewards/margins": 28.364136589898003, + "rewards/rejected": -16.69927300347222, + "step": 2220 + }, + { + "epoch": 0.5557362692355812, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40739845.333333336, + "logits/rejected": -55055258.666666664, + "logps/chosen": -434.4303792317708, + "logps/rejected": -586.0794270833334, + "loss": 0.0229, + "rewards/chosen": 9.84492047627767, + "rewards/margins": 22.0826104482015, + "rewards/rejected": -12.237689971923828, + "step": 2221 + }, + { + "epoch": 0.555986488177155, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16062915.0, + "logits/rejected": -36711800.0, + "logps/chosen": -584.0648193359375, + "logps/rejected": -610.6727294921875, + "loss": 0.0186, + "rewards/chosen": 9.46000862121582, + "rewards/margins": 21.026348114013672, + "rewards/rejected": -11.566339492797852, + "step": 2222 + }, + { + "epoch": 0.5562367071187289, + "grad_norm": 14.8125, + "kl": 12.567930221557617, + "learning_rate": 5e-06, + "logits/chosen": -43576085.333333336, + "logits/rejected": -46109168.0, + "logps/chosen": -422.2206624348958, + "logps/rejected": -593.015869140625, + "loss": 0.0956, + "rewards/chosen": 7.731258392333984, + "rewards/margins": 21.115056355794273, + "rewards/rejected": -13.383797963460287, + "step": 2223 + }, + { + "epoch": 0.5564869260603028, + "grad_norm": 16.125, + "kl": 24.70016098022461, + "learning_rate": 5e-06, + "logits/chosen": -17751926.588235293, + "logits/rejected": -24687488.0, + "logps/chosen": -412.2907284007353, + "logps/rejected": -771.4623325892857, + "loss": 0.1531, + "rewards/chosen": 8.532025505514707, + "rewards/margins": 25.56789019929261, + "rewards/rejected": -17.035864693777903, + "step": 2224 + }, + { + "epoch": 0.5567371450018767, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45025179.428571425, + "logits/rejected": -76541312.0, + "logps/chosen": -479.84622628348217, + "logps/rejected": -755.481201171875, + "loss": 0.014, + "rewards/chosen": 9.731000627790179, + "rewards/margins": 29.588976396833147, + "rewards/rejected": -19.85797576904297, + "step": 2225 + }, + { + "epoch": 0.5569873639434505, + "grad_norm": 9.375, + "kl": 8.877699851989746, + "learning_rate": 5e-06, + "logits/chosen": -75386581.33333333, + "logits/rejected": -55426234.666666664, + "logps/chosen": -414.4938151041667, + "logps/rejected": -497.8795572916667, + "loss": 0.043, + "rewards/chosen": 7.745774586995442, + "rewards/margins": 20.341101328531902, + "rewards/rejected": -12.595326741536459, + "step": 2226 + }, + { + "epoch": 0.5572375828850245, + "grad_norm": 18.25, + "kl": 4.1519880294799805, + "learning_rate": 5e-06, + "logits/chosen": -47674968.88888889, + "logits/rejected": -40126660.266666666, + "logps/chosen": -383.1032443576389, + "logps/rejected": -431.6559244791667, + "loss": 0.0426, + "rewards/chosen": 5.5805859035915795, + "rewards/margins": 17.098009406195747, + "rewards/rejected": -11.517423502604167, + "step": 2227 + }, + { + "epoch": 0.5574878018265983, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67344796.44444445, + "logits/rejected": -15390999.466666667, + "logps/chosen": -315.55967881944446, + "logps/rejected": -518.8408854166667, + "loss": 0.037, + "rewards/chosen": 5.751074473063151, + "rewards/margins": 19.282000986735028, + "rewards/rejected": -13.530926513671876, + "step": 2228 + }, + { + "epoch": 0.5577380207681721, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -88550865.45454545, + "logits/rejected": -66570003.692307696, + "logps/chosen": -426.7345525568182, + "logps/rejected": -759.4130108173077, + "loss": 0.0142, + "rewards/chosen": 8.13846241344105, + "rewards/margins": 28.333091789192252, + "rewards/rejected": -20.1946293757512, + "step": 2229 + }, + { + "epoch": 0.557988239709746, + "grad_norm": 12.9375, + "kl": 10.263299942016602, + "learning_rate": 5e-06, + "logits/chosen": -57979204.571428575, + "logits/rejected": -60437286.4, + "logps/chosen": -343.8016880580357, + "logps/rejected": -545.359912109375, + "loss": 0.1117, + "rewards/chosen": 6.547127314976284, + "rewards/margins": 17.194928523472377, + "rewards/rejected": -10.647801208496094, + "step": 2230 + }, + { + "epoch": 0.5582384586513199, + "grad_norm": 8.0625, + "kl": 23.013729095458984, + "learning_rate": 5e-06, + "logits/chosen": -50072849.06666667, + "logits/rejected": -31788206.222222224, + "logps/chosen": -498.35, + "logps/rejected": -485.59971788194446, + "loss": 0.0256, + "rewards/chosen": 10.879056803385417, + "rewards/margins": 20.03604261610243, + "rewards/rejected": -9.156985812717014, + "step": 2231 + }, + { + "epoch": 0.5584886775928938, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38280023.27272727, + "logits/rejected": -61903876.92307692, + "logps/chosen": -269.60666725852275, + "logps/rejected": -451.9533128004808, + "loss": 0.0647, + "rewards/chosen": 7.179168007590554, + "rewards/margins": 21.69106399429428, + "rewards/rejected": -14.511895986703726, + "step": 2232 + }, + { + "epoch": 0.5587388965344676, + "grad_norm": 4.71875, + "kl": 4.89161491394043, + "learning_rate": 5e-06, + "logits/chosen": -57280913.45454545, + "logits/rejected": -800019.6923076923, + "logps/chosen": -372.40576171875, + "logps/rejected": -595.7172475961538, + "loss": 0.0349, + "rewards/chosen": 8.683320478959518, + "rewards/margins": 21.70396327305507, + "rewards/rejected": -13.020642794095552, + "step": 2233 + }, + { + "epoch": 0.5589891154760416, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9705457.142857144, + "logits/rejected": -27140576.0, + "logps/chosen": -554.3320661272321, + "logps/rejected": -518.534375, + "loss": 0.0182, + "rewards/chosen": 9.368435450962611, + "rewards/margins": 19.627776881626673, + "rewards/rejected": -10.259341430664062, + "step": 2234 + }, + { + "epoch": 0.5592393344176154, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49784336.0, + "logits/rejected": -48793508.571428575, + "logps/chosen": -345.030859375, + "logps/rejected": -768.3096400669643, + "loss": 0.0059, + "rewards/chosen": 7.549298095703125, + "rewards/margins": 24.989733232770647, + "rewards/rejected": -17.44043513706752, + "step": 2235 + }, + { + "epoch": 0.5594895533591893, + "grad_norm": 1.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39050102.85714286, + "logits/rejected": -66494712.47058824, + "logps/chosen": -387.23538643973217, + "logps/rejected": -627.134765625, + "loss": 0.0108, + "rewards/chosen": 6.802137102399554, + "rewards/margins": 21.69599786325663, + "rewards/rejected": -14.893860760857077, + "step": 2236 + }, + { + "epoch": 0.5597397723007632, + "grad_norm": 3.78125, + "kl": 12.960908889770508, + "learning_rate": 5e-06, + "logits/chosen": -51196296.53333333, + "logits/rejected": -38265194.666666664, + "logps/chosen": -466.35797526041665, + "logps/rejected": -513.1847330729166, + "loss": 0.0904, + "rewards/chosen": 10.151260375976562, + "rewards/margins": 26.23682149251302, + "rewards/rejected": -16.085561116536457, + "step": 2237 + }, + { + "epoch": 0.5599899912423371, + "grad_norm": 11.9375, + "kl": 3.411080837249756, + "learning_rate": 5e-06, + "logits/chosen": -47128536.615384616, + "logits/rejected": -82026944.0, + "logps/chosen": -386.1357421875, + "logps/rejected": -653.4439808238636, + "loss": 0.049, + "rewards/chosen": 7.54829582801232, + "rewards/margins": 18.915140698839735, + "rewards/rejected": -11.366844870827414, + "step": 2238 + }, + { + "epoch": 0.5602402101839109, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16029644.8, + "logits/rejected": 23448230.85714286, + "logps/chosen": -326.1718505859375, + "logps/rejected": -642.9763532366071, + "loss": 0.059, + "rewards/chosen": 6.172935867309571, + "rewards/margins": 19.733556856427874, + "rewards/rejected": -13.560620989118304, + "step": 2239 + }, + { + "epoch": 0.5604904291254847, + "grad_norm": 3.578125, + "kl": 15.72818660736084, + "learning_rate": 5e-06, + "logits/chosen": -84745885.53846154, + "logits/rejected": -49870301.09090909, + "logps/chosen": -469.5951397235577, + "logps/rejected": -788.8330078125, + "loss": 0.0514, + "rewards/chosen": 10.115003145658052, + "rewards/margins": 31.00092150281359, + "rewards/rejected": -20.88591835715554, + "step": 2240 + }, + { + "epoch": 0.5607406480670587, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37805011.2, + "logits/rejected": -52818386.28571428, + "logps/chosen": -430.92646484375, + "logps/rejected": -647.7152622767857, + "loss": 0.007, + "rewards/chosen": 8.791746520996094, + "rewards/margins": 21.727995300292967, + "rewards/rejected": -12.936248779296875, + "step": 2241 + }, + { + "epoch": 0.5609908670086325, + "grad_norm": 6.21875, + "kl": 5.519371032714844, + "learning_rate": 5e-06, + "logits/chosen": -37944688.0, + "logits/rejected": -67067061.333333336, + "logps/chosen": -328.9994710286458, + "logps/rejected": -977.7801106770834, + "loss": 0.0422, + "rewards/chosen": 6.542582194010417, + "rewards/margins": 28.493682861328125, + "rewards/rejected": -21.951100667317707, + "step": 2242 + }, + { + "epoch": 0.5612410859502064, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43252977.777777776, + "logits/rejected": -28378161.066666666, + "logps/chosen": -303.33661566840277, + "logps/rejected": -625.6641927083333, + "loss": 0.0777, + "rewards/chosen": 6.276815626356337, + "rewards/margins": 18.42606489393446, + "rewards/rejected": -12.149249267578124, + "step": 2243 + }, + { + "epoch": 0.5614913048917803, + "grad_norm": 0.76953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60453504.0, + "logits/rejected": -45960704.0, + "logps/chosen": -487.5872395833333, + "logps/rejected": -815.9900716145834, + "loss": 0.0089, + "rewards/chosen": 10.017840067545572, + "rewards/margins": 31.05964914957682, + "rewards/rejected": -21.04180908203125, + "step": 2244 + }, + { + "epoch": 0.5617415238333542, + "grad_norm": 5.78125, + "kl": 2.5044188499450684, + "learning_rate": 5e-06, + "logits/chosen": -102826496.0, + "logits/rejected": -51530368.0, + "logps/chosen": -454.8592998798077, + "logps/rejected": -688.3220880681819, + "loss": 0.0303, + "rewards/chosen": 7.2289252647986775, + "rewards/margins": 22.817442887312883, + "rewards/rejected": -15.588517622514205, + "step": 2245 + }, + { + "epoch": 0.561991742774928, + "grad_norm": 19.375, + "kl": 17.250429153442383, + "learning_rate": 5e-06, + "logits/chosen": -52552736.0, + "logits/rejected": -47524304.0, + "logps/chosen": -346.87109375, + "logps/rejected": -444.7727355957031, + "loss": 0.1162, + "rewards/chosen": 8.528627395629883, + "rewards/margins": 18.407371520996094, + "rewards/rejected": -9.878744125366211, + "step": 2246 + }, + { + "epoch": 0.562241961716502, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29423392.0, + "logits/rejected": -50943181.71428572, + "logps/chosen": -287.190234375, + "logps/rejected": -511.52828543526783, + "loss": 0.0405, + "rewards/chosen": 7.7221923828125, + "rewards/margins": 21.86156507219587, + "rewards/rejected": -14.13937268938337, + "step": 2247 + }, + { + "epoch": 0.5624921806580758, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6685648.0, + "logits/rejected": -41198904.0, + "logps/chosen": -436.6796875, + "logps/rejected": -508.891357421875, + "loss": 0.0346, + "rewards/chosen": 7.142804463704427, + "rewards/margins": 18.437917073567707, + "rewards/rejected": -11.295112609863281, + "step": 2248 + }, + { + "epoch": 0.5627423995996497, + "grad_norm": 20.375, + "kl": 9.449443817138672, + "learning_rate": 5e-06, + "logits/chosen": -41812965.333333336, + "logits/rejected": -63191936.0, + "logps/chosen": -447.0713704427083, + "logps/rejected": -592.46240234375, + "loss": 0.0395, + "rewards/chosen": 10.017625172932943, + "rewards/margins": 24.833875020345054, + "rewards/rejected": -14.81624984741211, + "step": 2249 + }, + { + "epoch": 0.5629926185412236, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22586656.0, + "logits/rejected": -42727008.0, + "logps/chosen": -285.0314534505208, + "logps/rejected": -650.9852701822916, + "loss": 0.0235, + "rewards/chosen": 7.072900136311849, + "rewards/margins": 23.525263468424477, + "rewards/rejected": -16.45236333211263, + "step": 2250 + }, + { + "epoch": 0.5632428374827975, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23110421.333333332, + "logits/rejected": -57703594.666666664, + "logps/chosen": -288.81858317057294, + "logps/rejected": -640.3797607421875, + "loss": 0.0522, + "rewards/chosen": 7.162334442138672, + "rewards/margins": 21.836200714111328, + "rewards/rejected": -14.673866271972656, + "step": 2251 + }, + { + "epoch": 0.5634930564243713, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64644294.4, + "logits/rejected": -3635798.8571428573, + "logps/chosen": -406.491259765625, + "logps/rejected": -549.751220703125, + "loss": 0.0518, + "rewards/chosen": 6.840129852294922, + "rewards/margins": 20.174532754080637, + "rewards/rejected": -13.334402901785714, + "step": 2252 + }, + { + "epoch": 0.5637432753659452, + "grad_norm": 6.53125, + "kl": 2.65033221244812, + "learning_rate": 5e-06, + "logits/chosen": -57903829.333333336, + "logits/rejected": -49826912.0, + "logps/chosen": -427.9501139322917, + "logps/rejected": -532.7457682291666, + "loss": 0.0408, + "rewards/chosen": 7.336427052815755, + "rewards/margins": 23.527210235595703, + "rewards/rejected": -16.19078318277995, + "step": 2253 + }, + { + "epoch": 0.5639934943075191, + "grad_norm": 129.0, + "kl": 1.3301570415496826, + "learning_rate": 5e-06, + "logits/chosen": -47594416.0, + "logits/rejected": 6942874.666666667, + "logps/chosen": -466.6420084635417, + "logps/rejected": -727.72021484375, + "loss": 0.0315, + "rewards/chosen": 8.58200454711914, + "rewards/margins": 25.605364481608074, + "rewards/rejected": -17.023359934488933, + "step": 2254 + }, + { + "epoch": 0.564243713249093, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53714150.4, + "logits/rejected": -26692571.42857143, + "logps/chosen": -344.93427734375, + "logps/rejected": -452.2932826450893, + "loss": 0.0123, + "rewards/chosen": 7.925592041015625, + "rewards/margins": 17.71711883544922, + "rewards/rejected": -9.791526794433594, + "step": 2255 + }, + { + "epoch": 0.5644939321906668, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44354074.666666664, + "logits/rejected": -52597472.0, + "logps/chosen": -406.4167887369792, + "logps/rejected": -682.864990234375, + "loss": 0.0173, + "rewards/chosen": 6.738356272379558, + "rewards/margins": 22.1699956258138, + "rewards/rejected": -15.431639353434244, + "step": 2256 + }, + { + "epoch": 0.5647441511322407, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37157949.538461536, + "logits/rejected": -53536436.36363637, + "logps/chosen": -465.2844801682692, + "logps/rejected": -761.7194602272727, + "loss": 0.0345, + "rewards/chosen": 7.041176429161658, + "rewards/margins": 27.200528124829273, + "rewards/rejected": -20.159351695667613, + "step": 2257 + }, + { + "epoch": 0.5649943700738146, + "grad_norm": 2.765625, + "kl": 0.17107200622558594, + "learning_rate": 5e-06, + "logits/chosen": -25873620.0, + "logits/rejected": -35479256.0, + "logps/chosen": -325.31622314453125, + "logps/rejected": -640.5003051757812, + "loss": 0.034, + "rewards/chosen": 6.871766090393066, + "rewards/margins": 24.039748191833496, + "rewards/rejected": -17.16798210144043, + "step": 2258 + }, + { + "epoch": 0.5652445890153884, + "grad_norm": 7.1875, + "kl": 4.9744062423706055, + "learning_rate": 5e-06, + "logits/chosen": -51242870.15384615, + "logits/rejected": -67932130.9090909, + "logps/chosen": -460.6453200120192, + "logps/rejected": -574.5331587357955, + "loss": 0.0174, + "rewards/chosen": 8.406485924353966, + "rewards/margins": 22.717206194684223, + "rewards/rejected": -14.310720270330256, + "step": 2259 + }, + { + "epoch": 0.5654948079569624, + "grad_norm": 11.0625, + "kl": 6.214234828948975, + "learning_rate": 5e-06, + "logits/chosen": -45167150.222222224, + "logits/rejected": -43874649.6, + "logps/chosen": -401.38330078125, + "logps/rejected": -592.7317057291667, + "loss": 0.0725, + "rewards/chosen": 7.067115359836155, + "rewards/margins": 20.638645511203343, + "rewards/rejected": -13.571530151367188, + "step": 2260 + }, + { + "epoch": 0.5657450268985362, + "grad_norm": 21.375, + "kl": 13.548751831054688, + "learning_rate": 5e-06, + "logits/chosen": -63644416.0, + "logits/rejected": -60329924.92307692, + "logps/chosen": -466.36177201704544, + "logps/rejected": -749.6899038461538, + "loss": 0.0852, + "rewards/chosen": 8.91021728515625, + "rewards/margins": 29.64050762469952, + "rewards/rejected": -20.73029033954327, + "step": 2261 + }, + { + "epoch": 0.5659952458401101, + "grad_norm": 13.25, + "kl": 0.7518247365951538, + "learning_rate": 5e-06, + "logits/chosen": -50342486.85714286, + "logits/rejected": 3173688.8, + "logps/chosen": -325.92124720982144, + "logps/rejected": -668.8755859375, + "loss": 0.0497, + "rewards/chosen": 6.620638166155134, + "rewards/margins": 27.647838483537946, + "rewards/rejected": -21.027200317382814, + "step": 2262 + }, + { + "epoch": 0.566245464781684, + "grad_norm": 12.75, + "kl": 5.172573089599609, + "learning_rate": 5e-06, + "logits/chosen": -36157840.0, + "logits/rejected": -28679082.0, + "logps/chosen": -407.583740234375, + "logps/rejected": -578.8125, + "loss": 0.0667, + "rewards/chosen": 8.070967674255371, + "rewards/margins": 19.281394004821777, + "rewards/rejected": -11.210426330566406, + "step": 2263 + }, + { + "epoch": 0.5664956837232579, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41992298.666666664, + "logits/rejected": -61205274.666666664, + "logps/chosen": -416.782958984375, + "logps/rejected": -635.7954508463541, + "loss": 0.0243, + "rewards/chosen": 7.226793924967448, + "rewards/margins": 23.551854451497398, + "rewards/rejected": -16.32506052652995, + "step": 2264 + }, + { + "epoch": 0.5667459026648317, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57836979.2, + "logits/rejected": -18393582.222222224, + "logps/chosen": -392.0521484375, + "logps/rejected": -589.2864583333334, + "loss": 0.0579, + "rewards/chosen": 6.7623138427734375, + "rewards/margins": 22.924292670355904, + "rewards/rejected": -16.161978827582466, + "step": 2265 + }, + { + "epoch": 0.5669961216064056, + "grad_norm": 3.0625, + "kl": 0.1782754361629486, + "learning_rate": 5e-06, + "logits/chosen": -91771520.0, + "logits/rejected": -49431142.4, + "logps/chosen": -460.9553920200893, + "logps/rejected": -653.052294921875, + "loss": 0.0075, + "rewards/chosen": 7.638459341866629, + "rewards/margins": 23.203597586495537, + "rewards/rejected": -15.565138244628907, + "step": 2266 + }, + { + "epoch": 0.5672463405479795, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63439189.333333336, + "logits/rejected": -64442760.53333333, + "logps/chosen": -421.9326171875, + "logps/rejected": -575.6893229166667, + "loss": 0.0815, + "rewards/chosen": 7.174935234917535, + "rewards/margins": 21.671965874565974, + "rewards/rejected": -14.497030639648438, + "step": 2267 + }, + { + "epoch": 0.5674965594895534, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75037466.66666667, + "logits/rejected": -46718848.0, + "logps/chosen": -411.4923095703125, + "logps/rejected": -543.7148030598959, + "loss": 0.0219, + "rewards/chosen": 8.324209849039713, + "rewards/margins": 22.459842681884766, + "rewards/rejected": -14.135632832845053, + "step": 2268 + }, + { + "epoch": 0.5677467784311272, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47134817.88235294, + "logits/rejected": -71357952.0, + "logps/chosen": -385.07482192095586, + "logps/rejected": -720.5118582589286, + "loss": 0.0249, + "rewards/chosen": 7.466412263758042, + "rewards/margins": 23.845067833652013, + "rewards/rejected": -16.378655569893972, + "step": 2269 + }, + { + "epoch": 0.5679969973727012, + "grad_norm": 21.625, + "kl": 20.086185455322266, + "learning_rate": 5e-06, + "logits/chosen": -44130812.23529412, + "logits/rejected": -54799168.0, + "logps/chosen": -421.8890739889706, + "logps/rejected": -365.50840541294644, + "loss": 0.0685, + "rewards/chosen": 9.60176176183364, + "rewards/margins": 21.46457870868074, + "rewards/rejected": -11.862816946847099, + "step": 2270 + }, + { + "epoch": 0.568247216314275, + "grad_norm": 2.546875, + "kl": 3.88920259475708, + "learning_rate": 5e-06, + "logits/chosen": -50329141.333333336, + "logits/rejected": -29379850.666666668, + "logps/chosen": -508.6871744791667, + "logps/rejected": -567.8767496744791, + "loss": 0.0045, + "rewards/chosen": 9.266108194986979, + "rewards/margins": 20.555709838867188, + "rewards/rejected": -11.289601643880209, + "step": 2271 + }, + { + "epoch": 0.5684974352558488, + "grad_norm": 12.9375, + "kl": 10.461141586303711, + "learning_rate": 5e-06, + "logits/chosen": -43328233.4117647, + "logits/rejected": -50061458.28571428, + "logps/chosen": -471.9319278492647, + "logps/rejected": -512.3748604910714, + "loss": 0.0838, + "rewards/chosen": 9.13576103659237, + "rewards/margins": 22.897634554310002, + "rewards/rejected": -13.761873517717634, + "step": 2272 + }, + { + "epoch": 0.5687476541974228, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28839686.4, + "logits/rejected": -51925398.85714286, + "logps/chosen": -383.85751953125, + "logps/rejected": -589.2259347098214, + "loss": 0.0453, + "rewards/chosen": 6.904610443115234, + "rewards/margins": 22.751034000941686, + "rewards/rejected": -15.846423557826451, + "step": 2273 + }, + { + "epoch": 0.5689978731389966, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42989643.63636363, + "logits/rejected": -69651588.92307693, + "logps/chosen": -341.5509144176136, + "logps/rejected": -743.3466045673077, + "loss": 0.0185, + "rewards/chosen": 6.822572187943892, + "rewards/margins": 26.617094986922258, + "rewards/rejected": -19.794522798978367, + "step": 2274 + }, + { + "epoch": 0.5692480920805705, + "grad_norm": 15.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63942844.0, + "logits/rejected": -47546712.0, + "logps/chosen": -581.1844482421875, + "logps/rejected": -597.9588623046875, + "loss": 0.0158, + "rewards/chosen": 8.496853828430176, + "rewards/margins": 23.93095302581787, + "rewards/rejected": -15.434099197387695, + "step": 2275 + }, + { + "epoch": 0.5694983110221443, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30842835.2, + "logits/rejected": -49663547.428571425, + "logps/chosen": -314.755029296875, + "logps/rejected": -655.4777483258929, + "loss": 0.0277, + "rewards/chosen": 7.239714050292969, + "rewards/margins": 24.36540069580078, + "rewards/rejected": -17.125686645507812, + "step": 2276 + }, + { + "epoch": 0.5697485299637183, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42923501.333333336, + "logits/rejected": -69472762.66666667, + "logps/chosen": -351.4805501302083, + "logps/rejected": -741.1748046875, + "loss": 0.0219, + "rewards/chosen": 8.200290044148764, + "rewards/margins": 28.485682805379234, + "rewards/rejected": -20.28539276123047, + "step": 2277 + }, + { + "epoch": 0.5699987489052921, + "grad_norm": 5.8125, + "kl": 8.19485092163086, + "learning_rate": 5e-06, + "logits/chosen": -75436749.71428572, + "logits/rejected": -46003561.6, + "logps/chosen": -489.2398158482143, + "logps/rejected": -551.798046875, + "loss": 0.0475, + "rewards/chosen": 8.35165296282087, + "rewards/margins": 20.130722481863838, + "rewards/rejected": -11.779069519042968, + "step": 2278 + }, + { + "epoch": 0.570248967846866, + "grad_norm": 4.78125, + "kl": 1.142919898033142, + "learning_rate": 5e-06, + "logits/chosen": -31582481.454545453, + "logits/rejected": 16276025.846153846, + "logps/chosen": -417.77241654829544, + "logps/rejected": -572.8281625600962, + "loss": 0.0164, + "rewards/chosen": 7.47433818470348, + "rewards/margins": 22.97373957067103, + "rewards/rejected": -15.499401385967548, + "step": 2279 + }, + { + "epoch": 0.5704991867884399, + "grad_norm": 17.0, + "kl": 0.2522227168083191, + "learning_rate": 5e-06, + "logits/chosen": -16189352.0, + "logits/rejected": -52041101.71428572, + "logps/chosen": -393.3282470703125, + "logps/rejected": -647.5641741071429, + "loss": 0.0367, + "rewards/chosen": 7.314389038085937, + "rewards/margins": 20.963177490234376, + "rewards/rejected": -13.648788452148438, + "step": 2280 + }, + { + "epoch": 0.5707494057300138, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41014466.90909091, + "logits/rejected": -60996224.0, + "logps/chosen": -366.21147017045456, + "logps/rejected": -788.369140625, + "loss": 0.0412, + "rewards/chosen": 7.448481473055753, + "rewards/margins": 29.98891491656537, + "rewards/rejected": -22.540433443509617, + "step": 2281 + }, + { + "epoch": 0.5709996246715876, + "grad_norm": 19.875, + "kl": 2.0576376914978027, + "learning_rate": 5e-06, + "logits/chosen": -53515623.384615384, + "logits/rejected": -36472029.09090909, + "logps/chosen": -364.4727313701923, + "logps/rejected": -479.9509943181818, + "loss": 0.031, + "rewards/chosen": 7.516398503230168, + "rewards/margins": 19.44329225766909, + "rewards/rejected": -11.92689375443892, + "step": 2282 + }, + { + "epoch": 0.5712498436131616, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52971182.54545455, + "logits/rejected": -45969875.692307696, + "logps/chosen": -511.8017578125, + "logps/rejected": -724.3255709134615, + "loss": 0.021, + "rewards/chosen": 8.459647438742898, + "rewards/margins": 26.759215028135927, + "rewards/rejected": -18.29956758939303, + "step": 2283 + }, + { + "epoch": 0.5715000625547354, + "grad_norm": 0.8046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42200253.09090909, + "logits/rejected": -71571209.84615384, + "logps/chosen": -429.8780628551136, + "logps/rejected": -558.73974609375, + "loss": 0.0086, + "rewards/chosen": 9.033871737393467, + "rewards/margins": 21.062743713805727, + "rewards/rejected": -12.02887197641226, + "step": 2284 + }, + { + "epoch": 0.5717502814963092, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51366452.36363637, + "logits/rejected": -53055330.461538464, + "logps/chosen": -336.97853781960225, + "logps/rejected": -628.9108323317307, + "loss": 0.0652, + "rewards/chosen": 7.31946494362571, + "rewards/margins": 25.069990784971864, + "rewards/rejected": -17.750525841346153, + "step": 2285 + }, + { + "epoch": 0.5720005004378832, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33740840.72727273, + "logits/rejected": 35637774.76923077, + "logps/chosen": -286.12655362215907, + "logps/rejected": -729.4314903846154, + "loss": 0.0265, + "rewards/chosen": 6.3400490500710225, + "rewards/margins": 26.374036242078233, + "rewards/rejected": -20.03398719200721, + "step": 2286 + }, + { + "epoch": 0.572250719379457, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45299958.4, + "logits/rejected": -72508598.85714285, + "logps/chosen": -401.4803466796875, + "logps/rejected": -816.1028180803571, + "loss": 0.0221, + "rewards/chosen": 7.455699157714844, + "rewards/margins": 24.361087036132812, + "rewards/rejected": -16.90538787841797, + "step": 2287 + }, + { + "epoch": 0.5725009383210309, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47703718.4, + "logits/rejected": -26681656.888888888, + "logps/chosen": -282.70921223958334, + "logps/rejected": -482.5910915798611, + "loss": 0.0558, + "rewards/chosen": 7.9832616170247395, + "rewards/margins": 20.777765570746528, + "rewards/rejected": -12.794503953721788, + "step": 2288 + }, + { + "epoch": 0.5727511572626047, + "grad_norm": 2.234375, + "kl": 6.874081611633301, + "learning_rate": 5e-06, + "logits/chosen": -65971829.333333336, + "logits/rejected": -25103488.0, + "logps/chosen": -421.4246419270833, + "logps/rejected": -391.2778727213542, + "loss": 0.0174, + "rewards/chosen": 7.569124221801758, + "rewards/margins": 19.661526362101235, + "rewards/rejected": -12.092402140299479, + "step": 2289 + }, + { + "epoch": 0.5730013762041787, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39596844.8, + "logits/rejected": -47378532.571428575, + "logps/chosen": -339.8724365234375, + "logps/rejected": -373.8902064732143, + "loss": 0.0323, + "rewards/chosen": 6.276803207397461, + "rewards/margins": 16.567570332118443, + "rewards/rejected": -10.290767124720983, + "step": 2290 + }, + { + "epoch": 0.5732515951457525, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1105492.111111111, + "logits/rejected": -67873117.86666666, + "logps/chosen": -359.83184136284723, + "logps/rejected": -799.3625651041667, + "loss": 0.0104, + "rewards/chosen": 7.2234242757161455, + "rewards/margins": 26.39148457845052, + "rewards/rejected": -19.168060302734375, + "step": 2291 + }, + { + "epoch": 0.5735018140873264, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19529256.727272727, + "logits/rejected": -38796155.07692308, + "logps/chosen": -303.0553533380682, + "logps/rejected": -492.66639122596155, + "loss": 0.0382, + "rewards/chosen": 6.486169988458807, + "rewards/margins": 18.597635442560367, + "rewards/rejected": -12.111465454101562, + "step": 2292 + }, + { + "epoch": 0.5737520330289003, + "grad_norm": 7.125, + "kl": 3.7763266563415527, + "learning_rate": 5e-06, + "logits/chosen": -40520439.46666667, + "logits/rejected": -61952881.777777776, + "logps/chosen": -465.9578125, + "logps/rejected": -743.494140625, + "loss": 0.0479, + "rewards/chosen": 9.458258056640625, + "rewards/margins": 22.14397447374132, + "rewards/rejected": -12.685716417100695, + "step": 2293 + }, + { + "epoch": 0.5740022519704742, + "grad_norm": 17.375, + "kl": 16.234222412109375, + "learning_rate": 5e-06, + "logits/chosen": -47367781.333333336, + "logits/rejected": -46033674.666666664, + "logps/chosen": -459.7914225260417, + "logps/rejected": -744.4580078125, + "loss": 0.0346, + "rewards/chosen": 8.892217636108398, + "rewards/margins": 26.569321314493816, + "rewards/rejected": -17.677103678385418, + "step": 2294 + }, + { + "epoch": 0.574252470912048, + "grad_norm": 3.78125, + "kl": 0.07759666442871094, + "learning_rate": 5e-06, + "logits/chosen": -39857252.92307692, + "logits/rejected": -51110074.18181818, + "logps/chosen": -453.6886643629808, + "logps/rejected": -593.8200461647727, + "loss": 0.0091, + "rewards/chosen": 11.505894587590145, + "rewards/margins": 27.143361018254208, + "rewards/rejected": -15.637466430664062, + "step": 2295 + }, + { + "epoch": 0.574502689853622, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35104108.8, + "logits/rejected": -26908363.42857143, + "logps/chosen": -318.3389404296875, + "logps/rejected": -506.01932198660717, + "loss": 0.0386, + "rewards/chosen": 6.543833923339844, + "rewards/margins": 20.49883793422154, + "rewards/rejected": -13.955004010881696, + "step": 2296 + }, + { + "epoch": 0.5747529087951958, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62856859.428571425, + "logits/rejected": -27532281.6, + "logps/chosen": -404.59165736607144, + "logps/rejected": -509.885595703125, + "loss": 0.0498, + "rewards/chosen": 7.6663616725376675, + "rewards/margins": 20.925429970877513, + "rewards/rejected": -13.259068298339844, + "step": 2297 + }, + { + "epoch": 0.5750031277367696, + "grad_norm": 9.5, + "kl": 8.954851150512695, + "learning_rate": 5e-06, + "logits/chosen": -43359940.266666666, + "logits/rejected": -47932316.44444445, + "logps/chosen": -369.38837890625, + "logps/rejected": -556.41455078125, + "loss": 0.0362, + "rewards/chosen": 8.315555826822917, + "rewards/margins": 22.065664333767362, + "rewards/rejected": -13.750108506944445, + "step": 2298 + }, + { + "epoch": 0.5752533466783436, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41919448.0, + "logits/rejected": -70777680.0, + "logps/chosen": -322.4796142578125, + "logps/rejected": -495.68450927734375, + "loss": 0.0605, + "rewards/chosen": 6.857255458831787, + "rewards/margins": 17.698379039764404, + "rewards/rejected": -10.841123580932617, + "step": 2299 + }, + { + "epoch": 0.5755035656199174, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50534951.384615384, + "logits/rejected": -25931054.545454547, + "logps/chosen": -476.46495643028845, + "logps/rejected": -493.37917258522725, + "loss": 0.0119, + "rewards/chosen": 8.330653850848858, + "rewards/margins": 23.061831440958944, + "rewards/rejected": -14.731177590110086, + "step": 2300 + }, + { + "epoch": 0.5757537845614913, + "grad_norm": 10.125, + "kl": 2.183321714401245, + "learning_rate": 5e-06, + "logits/chosen": -71195081.84615384, + "logits/rejected": -51092573.09090909, + "logps/chosen": -361.46694711538464, + "logps/rejected": -881.6136363636364, + "loss": 0.0395, + "rewards/chosen": 7.034294715294471, + "rewards/margins": 26.224754920372597, + "rewards/rejected": -19.190460205078125, + "step": 2301 + }, + { + "epoch": 0.5760040035030651, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46676027.733333334, + "logits/rejected": -62618652.44444445, + "logps/chosen": -360.320703125, + "logps/rejected": -529.2957899305555, + "loss": 0.0475, + "rewards/chosen": 7.676609802246094, + "rewards/margins": 21.768238830566407, + "rewards/rejected": -14.091629028320312, + "step": 2302 + }, + { + "epoch": 0.5762542224446391, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58230199.46666667, + "logits/rejected": -50741600.0, + "logps/chosen": -337.98089192708335, + "logps/rejected": -681.3541666666666, + "loss": 0.044, + "rewards/chosen": 6.27715098063151, + "rewards/margins": 21.290702480740016, + "rewards/rejected": -15.013551500108507, + "step": 2303 + }, + { + "epoch": 0.5765044413862129, + "grad_norm": 5.90625, + "kl": 2.960909605026245, + "learning_rate": 5e-06, + "logits/chosen": -36897605.333333336, + "logits/rejected": -62755520.0, + "logps/chosen": -510.5487467447917, + "logps/rejected": -478.6233723958333, + "loss": 0.0182, + "rewards/chosen": 10.688741048177084, + "rewards/margins": 23.81709416707357, + "rewards/rejected": -13.128353118896484, + "step": 2304 + }, + { + "epoch": 0.5767546603277868, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56548474.18181818, + "logits/rejected": 8557757.538461538, + "logps/chosen": -420.0267223011364, + "logps/rejected": -586.5718149038462, + "loss": 0.024, + "rewards/chosen": 9.094587846235795, + "rewards/margins": 23.38417853508796, + "rewards/rejected": -14.289590688852163, + "step": 2305 + }, + { + "epoch": 0.5770048792693607, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25933668.57142857, + "logits/rejected": -48031028.705882356, + "logps/chosen": -438.46146065848217, + "logps/rejected": -676.0977711397059, + "loss": 0.0093, + "rewards/chosen": 8.443802969796318, + "rewards/margins": 24.630044568486575, + "rewards/rejected": -16.18624159869026, + "step": 2306 + }, + { + "epoch": 0.5772550982109346, + "grad_norm": 7.28125, + "kl": 20.78204917907715, + "learning_rate": 5e-06, + "logits/chosen": -36962432.0, + "logits/rejected": -68374198.85714285, + "logps/chosen": -385.8329216452206, + "logps/rejected": -482.29541015625, + "loss": 0.1205, + "rewards/chosen": 8.741358139935661, + "rewards/margins": 23.503920482988114, + "rewards/rejected": -14.762562343052455, + "step": 2307 + }, + { + "epoch": 0.5775053171525084, + "grad_norm": 7.625, + "kl": 4.505006313323975, + "learning_rate": 5e-06, + "logits/chosen": -80094016.0, + "logits/rejected": -58483433.14285714, + "logps/chosen": -571.523193359375, + "logps/rejected": -697.9043666294643, + "loss": 0.0124, + "rewards/chosen": 11.276296997070313, + "rewards/margins": 27.76383492606027, + "rewards/rejected": -16.487537928989955, + "step": 2308 + }, + { + "epoch": 0.5777555360940824, + "grad_norm": 18.0, + "kl": 30.700368881225586, + "learning_rate": 5e-06, + "logits/chosen": -59101469.86666667, + "logits/rejected": -41340650.666666664, + "logps/chosen": -406.6466796875, + "logps/rejected": -592.9188910590278, + "loss": 0.0957, + "rewards/chosen": 8.333720397949218, + "rewards/margins": 23.376862080891925, + "rewards/rejected": -15.043141682942709, + "step": 2309 + }, + { + "epoch": 0.5780057550356562, + "grad_norm": 11.6875, + "kl": 1.2578620910644531, + "learning_rate": 5e-06, + "logits/chosen": -37653874.28571428, + "logits/rejected": -37874464.0, + "logps/chosen": -328.96561104910717, + "logps/rejected": -542.91455078125, + "loss": 0.0367, + "rewards/chosen": 7.210897718157087, + "rewards/margins": 18.00687473842076, + "rewards/rejected": -10.795977020263672, + "step": 2310 + }, + { + "epoch": 0.57825597397723, + "grad_norm": 18.125, + "kl": 6.460975646972656, + "learning_rate": 5e-06, + "logits/chosen": -29729111.466666665, + "logits/rejected": -42319658.666666664, + "logps/chosen": -358.3953125, + "logps/rejected": -772.3819444444445, + "loss": 0.0671, + "rewards/chosen": 6.726534525553386, + "rewards/margins": 20.716919793023003, + "rewards/rejected": -13.990385267469618, + "step": 2311 + }, + { + "epoch": 0.578506192918804, + "grad_norm": 3.390625, + "kl": 3.4758002758026123, + "learning_rate": 5e-06, + "logits/chosen": -65347541.333333336, + "logits/rejected": -110578150.4, + "logps/chosen": -407.6927083333333, + "logps/rejected": -624.1333333333333, + "loss": 0.0105, + "rewards/chosen": 9.210013495551216, + "rewards/margins": 24.197727118598092, + "rewards/rejected": -14.987713623046876, + "step": 2312 + }, + { + "epoch": 0.5787564118603779, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54779601.06666667, + "logits/rejected": -20419100.444444444, + "logps/chosen": -371.87805989583336, + "logps/rejected": -575.3224826388889, + "loss": 0.0499, + "rewards/chosen": 7.25235850016276, + "rewards/margins": 21.78782670762804, + "rewards/rejected": -14.535468207465279, + "step": 2313 + }, + { + "epoch": 0.5790066308019517, + "grad_norm": 4.15625, + "kl": 1.7258212566375732, + "learning_rate": 5e-06, + "logits/chosen": -40238919.384615384, + "logits/rejected": -17427005.09090909, + "logps/chosen": -315.3366135817308, + "logps/rejected": -572.0110085227273, + "loss": 0.0453, + "rewards/chosen": 6.10852285531851, + "rewards/margins": 19.383652693741805, + "rewards/rejected": -13.275129838423295, + "step": 2314 + }, + { + "epoch": 0.5792568497435255, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52799317.333333336, + "logits/rejected": -55749427.2, + "logps/chosen": -340.44053819444446, + "logps/rejected": -631.1590494791667, + "loss": 0.0223, + "rewards/chosen": 7.548498365614149, + "rewards/margins": 22.93874189588759, + "rewards/rejected": -15.390243530273438, + "step": 2315 + }, + { + "epoch": 0.5795070686850995, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59717447.11111111, + "logits/rejected": 80916309.33333333, + "logps/chosen": -406.0830078125, + "logps/rejected": -557.1138671875, + "loss": 0.0606, + "rewards/chosen": 7.304797702365452, + "rewards/margins": 19.755439588758684, + "rewards/rejected": -12.45064188639323, + "step": 2316 + }, + { + "epoch": 0.5797572876266733, + "grad_norm": 2.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66924160.0, + "logits/rejected": -55256029.09090909, + "logps/chosen": -370.81531700721155, + "logps/rejected": -694.0400834517045, + "loss": 0.0179, + "rewards/chosen": 7.445796086237981, + "rewards/margins": 24.705102560403464, + "rewards/rejected": -17.259306474165484, + "step": 2317 + }, + { + "epoch": 0.5800075065682472, + "grad_norm": 3.921875, + "kl": 7.178936004638672, + "learning_rate": 5e-06, + "logits/chosen": -35259230.11764706, + "logits/rejected": -75841558.85714285, + "logps/chosen": -348.5132697610294, + "logps/rejected": -588.1996372767857, + "loss": 0.054, + "rewards/chosen": 8.185930139878216, + "rewards/margins": 23.521547205307904, + "rewards/rejected": -15.335617065429688, + "step": 2318 + }, + { + "epoch": 0.5802577255098211, + "grad_norm": 4.59375, + "kl": 0.5528386831283569, + "learning_rate": 5e-06, + "logits/chosen": -44329285.81818182, + "logits/rejected": -41249750.15384615, + "logps/chosen": -385.97745028409093, + "logps/rejected": -608.0238131009615, + "loss": 0.0389, + "rewards/chosen": 7.100744767622515, + "rewards/margins": 20.40601210160689, + "rewards/rejected": -13.305267333984375, + "step": 2319 + }, + { + "epoch": 0.580507944451395, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39376568.88888889, + "logits/rejected": -47785655.46666667, + "logps/chosen": -308.1414388020833, + "logps/rejected": -603.6977213541667, + "loss": 0.0487, + "rewards/chosen": 7.80446031358507, + "rewards/margins": 22.919541083441842, + "rewards/rejected": -15.115080769856771, + "step": 2320 + }, + { + "epoch": 0.5807581633929688, + "grad_norm": 6.59375, + "kl": 2.173941135406494, + "learning_rate": 5e-06, + "logits/chosen": -35229341.09090909, + "logits/rejected": -52741656.615384616, + "logps/chosen": -323.38973721590907, + "logps/rejected": -391.2898137019231, + "loss": 0.0612, + "rewards/chosen": 6.656245144930753, + "rewards/margins": 15.936848407025103, + "rewards/rejected": -9.28060326209435, + "step": 2321 + }, + { + "epoch": 0.5810083823345428, + "grad_norm": 17.75, + "kl": 2.079366683959961, + "learning_rate": 5e-06, + "logits/chosen": -43393614.76923077, + "logits/rejected": -29509021.09090909, + "logps/chosen": -425.3112980769231, + "logps/rejected": -515.6060014204545, + "loss": 0.0665, + "rewards/chosen": 7.763340289776142, + "rewards/margins": 19.565709841001286, + "rewards/rejected": -11.802369551225143, + "step": 2322 + }, + { + "epoch": 0.5812586012761166, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45164361.84615385, + "logits/rejected": -53843572.36363637, + "logps/chosen": -385.7033503605769, + "logps/rejected": -464.95383522727275, + "loss": 0.0289, + "rewards/chosen": 7.802693293644832, + "rewards/margins": 20.309301736471536, + "rewards/rejected": -12.506608442826705, + "step": 2323 + }, + { + "epoch": 0.5815088202176905, + "grad_norm": 6.5625, + "kl": 2.5723178386688232, + "learning_rate": 5e-06, + "logits/chosen": -80881424.0, + "logits/rejected": -38137626.666666664, + "logps/chosen": -407.3980305989583, + "logps/rejected": -510.5245768229167, + "loss": 0.0256, + "rewards/chosen": 8.806017557779947, + "rewards/margins": 21.405681610107422, + "rewards/rejected": -12.599664052327475, + "step": 2324 + }, + { + "epoch": 0.5817590391592643, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51974768.0, + "logits/rejected": -51535410.28571428, + "logps/chosen": -494.88447265625, + "logps/rejected": -586.1663643973214, + "loss": 0.0165, + "rewards/chosen": 7.506608581542968, + "rewards/margins": 22.468240356445314, + "rewards/rejected": -14.961631774902344, + "step": 2325 + }, + { + "epoch": 0.5820092581008383, + "grad_norm": 4.09375, + "kl": 3.0930585861206055, + "learning_rate": 5e-06, + "logits/chosen": -80370515.6923077, + "logits/rejected": -49963973.81818182, + "logps/chosen": -444.57192758413464, + "logps/rejected": -448.8917791193182, + "loss": 0.0144, + "rewards/chosen": 9.81182626577524, + "rewards/margins": 20.518366460199957, + "rewards/rejected": -10.706540194424717, + "step": 2326 + }, + { + "epoch": 0.5822594770424121, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34812373.333333336, + "logits/rejected": -40465093.333333336, + "logps/chosen": -284.8806559244792, + "logps/rejected": -546.6531575520834, + "loss": 0.0501, + "rewards/chosen": 6.006937662760417, + "rewards/margins": 16.237061818440754, + "rewards/rejected": -10.230124155680338, + "step": 2327 + }, + { + "epoch": 0.5825096959839859, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28590181.333333332, + "logits/rejected": -31854685.333333332, + "logps/chosen": -383.0777180989583, + "logps/rejected": -689.6525065104166, + "loss": 0.0474, + "rewards/chosen": 9.496591567993164, + "rewards/margins": 22.173229853312172, + "rewards/rejected": -12.67663828531901, + "step": 2328 + }, + { + "epoch": 0.5827599149255599, + "grad_norm": 2.578125, + "kl": 2.7513957023620605, + "learning_rate": 5e-06, + "logits/chosen": -40206442.666666664, + "logits/rejected": -31748517.333333332, + "logps/chosen": -459.46099175347223, + "logps/rejected": -444.5061442057292, + "loss": 0.0025, + "rewards/chosen": 8.752086215549046, + "rewards/margins": 21.083858066134983, + "rewards/rejected": -12.331771850585938, + "step": 2329 + }, + { + "epoch": 0.5830101338671337, + "grad_norm": 14.75, + "kl": 1.5212924480438232, + "learning_rate": 5e-06, + "logits/chosen": -31102740.363636363, + "logits/rejected": -26216091.076923076, + "logps/chosen": -429.8864080255682, + "logps/rejected": -406.771484375, + "loss": 0.0801, + "rewards/chosen": 8.48611103404652, + "rewards/margins": 17.807268636209983, + "rewards/rejected": -9.321157602163462, + "step": 2330 + }, + { + "epoch": 0.5832603528087076, + "grad_norm": 10.8125, + "kl": 8.701894760131836, + "learning_rate": 5e-06, + "logits/chosen": -61887707.428571425, + "logits/rejected": -63169836.8, + "logps/chosen": -467.74857003348217, + "logps/rejected": -619.932275390625, + "loss": 0.0271, + "rewards/chosen": 10.399968828473773, + "rewards/margins": 24.550969805036274, + "rewards/rejected": -14.1510009765625, + "step": 2331 + }, + { + "epoch": 0.5835105717502815, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50183545.6, + "logits/rejected": -47917339.428571425, + "logps/chosen": -295.367578125, + "logps/rejected": -619.44189453125, + "loss": 0.0254, + "rewards/chosen": 6.958013916015625, + "rewards/margins": 21.994104221888954, + "rewards/rejected": -15.036090305873326, + "step": 2332 + }, + { + "epoch": 0.5837607906918554, + "grad_norm": 6.15625, + "kl": 1.5171712636947632, + "learning_rate": 5e-06, + "logits/chosen": -29808851.692307692, + "logits/rejected": -29682260.363636363, + "logps/chosen": -252.19031700721155, + "logps/rejected": -423.02534623579544, + "loss": 0.055, + "rewards/chosen": 5.100712702824519, + "rewards/margins": 13.521801875187801, + "rewards/rejected": -8.421089172363281, + "step": 2333 + }, + { + "epoch": 0.5840110096334292, + "grad_norm": 5.84375, + "kl": 0.5418338775634766, + "learning_rate": 5e-06, + "logits/chosen": -28300811.636363637, + "logits/rejected": -21666141.53846154, + "logps/chosen": -236.75379527698863, + "logps/rejected": -626.2431640625, + "loss": 0.0249, + "rewards/chosen": 7.111954428932884, + "rewards/margins": 18.212664784251395, + "rewards/rejected": -11.10071035531851, + "step": 2334 + }, + { + "epoch": 0.5842612285750032, + "grad_norm": 8.75, + "kl": 4.8423614501953125, + "learning_rate": 5e-06, + "logits/chosen": -32823321.14285714, + "logits/rejected": -67425433.6, + "logps/chosen": -401.18324497767856, + "logps/rejected": -653.69970703125, + "loss": 0.0724, + "rewards/chosen": 8.084582192557198, + "rewards/margins": 19.8562504359654, + "rewards/rejected": -11.771668243408204, + "step": 2335 + }, + { + "epoch": 0.584511447516577, + "grad_norm": 6.15625, + "kl": 1.9281806945800781, + "learning_rate": 5e-06, + "logits/chosen": -50715002.666666664, + "logits/rejected": -47137184.0, + "logps/chosen": -372.6903889973958, + "logps/rejected": -650.6675618489584, + "loss": 0.0067, + "rewards/chosen": 8.786565144856771, + "rewards/margins": 19.775737762451172, + "rewards/rejected": -10.9891726175944, + "step": 2336 + }, + { + "epoch": 0.5847616664581509, + "grad_norm": 6.0, + "kl": 0.9779180288314819, + "learning_rate": 5e-06, + "logits/chosen": -39393252.571428575, + "logits/rejected": -16356969.6, + "logps/chosen": -270.75538853236606, + "logps/rejected": -515.21845703125, + "loss": 0.0353, + "rewards/chosen": 6.694460187639509, + "rewards/margins": 16.414755902971542, + "rewards/rejected": -9.720295715332032, + "step": 2337 + }, + { + "epoch": 0.5850118853997247, + "grad_norm": 7.1875, + "kl": 15.422025680541992, + "learning_rate": 5e-06, + "logits/chosen": -52898892.8, + "logits/rejected": -54560483.55555555, + "logps/chosen": -437.89283854166666, + "logps/rejected": -632.8269856770834, + "loss": 0.0295, + "rewards/chosen": 10.498996988932292, + "rewards/margins": 24.237966579861112, + "rewards/rejected": -13.73896959092882, + "step": 2338 + }, + { + "epoch": 0.5852621043412987, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58453056.0, + "logits/rejected": -53630117.333333336, + "logps/chosen": -236.92940266927084, + "logps/rejected": -625.7169189453125, + "loss": 0.0986, + "rewards/chosen": 4.669447898864746, + "rewards/margins": 18.24466609954834, + "rewards/rejected": -13.575218200683594, + "step": 2339 + }, + { + "epoch": 0.5855123232828725, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49086752.0, + "logits/rejected": -32979685.333333332, + "logps/chosen": -353.5267740885417, + "logps/rejected": -514.0420735677084, + "loss": 0.0327, + "rewards/chosen": 7.4302927652994795, + "rewards/margins": 21.044390360514324, + "rewards/rejected": -13.614097595214844, + "step": 2340 + }, + { + "epoch": 0.5857625422244463, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73090597.33333333, + "logits/rejected": -22302168.0, + "logps/chosen": -459.6234537760417, + "logps/rejected": -480.1915690104167, + "loss": 0.0038, + "rewards/chosen": 11.436658223470053, + "rewards/margins": 24.05647913614909, + "rewards/rejected": -12.619820912679037, + "step": 2341 + }, + { + "epoch": 0.5860127611660203, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74611878.4, + "logits/rejected": 14204768.0, + "logps/chosen": -425.77529296875, + "logps/rejected": -620.8494698660714, + "loss": 0.0419, + "rewards/chosen": 8.55035858154297, + "rewards/margins": 19.32414267403739, + "rewards/rejected": -10.77378409249442, + "step": 2342 + }, + { + "epoch": 0.5862629801075941, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31113291.636363637, + "logits/rejected": -49016083.692307696, + "logps/chosen": -299.64186789772725, + "logps/rejected": -457.4909855769231, + "loss": 0.0478, + "rewards/chosen": 6.267315951260653, + "rewards/margins": 17.4171079622282, + "rewards/rejected": -11.149792010967548, + "step": 2343 + }, + { + "epoch": 0.586513199049168, + "grad_norm": 2.59375, + "kl": 5.915771007537842, + "learning_rate": 5e-06, + "logits/chosen": -67885033.14285715, + "logits/rejected": 34766118.4, + "logps/chosen": -475.1763392857143, + "logps/rejected": -405.063427734375, + "loss": 0.0066, + "rewards/chosen": 10.952953883579799, + "rewards/margins": 22.23340508597238, + "rewards/rejected": -11.280451202392578, + "step": 2344 + }, + { + "epoch": 0.5867634179907419, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 23058416.0, + "logits/rejected": -53011131.07692308, + "logps/chosen": -438.82572798295456, + "logps/rejected": -582.2438025841346, + "loss": 0.0151, + "rewards/chosen": 7.608842329545454, + "rewards/margins": 22.095529622964925, + "rewards/rejected": -14.486687293419472, + "step": 2345 + }, + { + "epoch": 0.5870136369323158, + "grad_norm": 1.5, + "kl": 2.4681575298309326, + "learning_rate": 5e-06, + "logits/chosen": -49283852.8, + "logits/rejected": -51654646.85714286, + "logps/chosen": -380.2482421875, + "logps/rejected": -551.385009765625, + "loss": 0.0168, + "rewards/chosen": 8.080470275878906, + "rewards/margins": 24.979332842145645, + "rewards/rejected": -16.89886256626674, + "step": 2346 + }, + { + "epoch": 0.5872638558738896, + "grad_norm": 6.84375, + "kl": 8.959607124328613, + "learning_rate": 5e-06, + "logits/chosen": -38703020.307692304, + "logits/rejected": -45666970.18181818, + "logps/chosen": -330.8844651442308, + "logps/rejected": -463.72567471590907, + "loss": 0.0403, + "rewards/chosen": 7.360662020169771, + "rewards/margins": 20.19087331278341, + "rewards/rejected": -12.830211292613637, + "step": 2347 + }, + { + "epoch": 0.5875140748154636, + "grad_norm": 6.75, + "kl": 7.949047088623047, + "learning_rate": 5e-06, + "logits/chosen": -55599172.92307692, + "logits/rejected": -25030033.454545453, + "logps/chosen": -332.3239933894231, + "logps/rejected": -586.4134854403409, + "loss": 0.0662, + "rewards/chosen": 8.06453646146334, + "rewards/margins": 21.62874683300098, + "rewards/rejected": -13.564210371537643, + "step": 2348 + }, + { + "epoch": 0.5877642937570374, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49726147.2, + "logits/rejected": -54683328.0, + "logps/chosen": -477.727099609375, + "logps/rejected": -651.9635881696429, + "loss": 0.0488, + "rewards/chosen": 8.894602203369141, + "rewards/margins": 23.578703853062223, + "rewards/rejected": -14.68410164969308, + "step": 2349 + }, + { + "epoch": 0.5880145126986113, + "grad_norm": 3.828125, + "kl": 6.803206443786621, + "learning_rate": 5e-06, + "logits/chosen": -79640055.46666667, + "logits/rejected": -17586549.333333332, + "logps/chosen": -457.3422526041667, + "logps/rejected": -530.5589192708334, + "loss": 0.043, + "rewards/chosen": 8.495550028483073, + "rewards/margins": 21.1801262749566, + "rewards/rejected": -12.684576246473524, + "step": 2350 + }, + { + "epoch": 0.5882647316401851, + "grad_norm": 3.03125, + "kl": 2.3641486167907715, + "learning_rate": 5e-06, + "logits/chosen": -54434888.53333333, + "logits/rejected": -30501553.777777776, + "logps/chosen": -345.55244140625, + "logps/rejected": -662.8038736979166, + "loss": 0.0385, + "rewards/chosen": 6.620048014322917, + "rewards/margins": 22.77589619954427, + "rewards/rejected": -16.155848185221355, + "step": 2351 + }, + { + "epoch": 0.5885149505817591, + "grad_norm": 3.53125, + "kl": 8.91375732421875, + "learning_rate": 5e-06, + "logits/chosen": -86271360.0, + "logits/rejected": -17100435.2, + "logps/chosen": -415.5478515625, + "logps/rejected": -400.473583984375, + "loss": 0.0471, + "rewards/chosen": 8.089793613978795, + "rewards/margins": 15.29258520943778, + "rewards/rejected": -7.202791595458985, + "step": 2352 + }, + { + "epoch": 0.5887651695233329, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40121428.0, + "logits/rejected": -38563424.0, + "logps/chosen": -412.6603698730469, + "logps/rejected": -589.955322265625, + "loss": 0.0065, + "rewards/chosen": 6.609074115753174, + "rewards/margins": 20.483824253082275, + "rewards/rejected": -13.874750137329102, + "step": 2353 + }, + { + "epoch": 0.5890153884649068, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24293006.222222224, + "logits/rejected": -49893504.0, + "logps/chosen": -466.72081163194446, + "logps/rejected": -831.2415364583334, + "loss": 0.0148, + "rewards/chosen": 7.70664299858941, + "rewards/margins": 28.887809583875868, + "rewards/rejected": -21.18116658528646, + "step": 2354 + }, + { + "epoch": 0.5892656074064807, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51724388.571428575, + "logits/rejected": -28558041.6, + "logps/chosen": -245.27549525669642, + "logps/rejected": -560.195556640625, + "loss": 0.0746, + "rewards/chosen": 5.0652618408203125, + "rewards/margins": 21.977728271484374, + "rewards/rejected": -16.91246643066406, + "step": 2355 + }, + { + "epoch": 0.5895158263480546, + "grad_norm": 14.8125, + "kl": 5.259764671325684, + "learning_rate": 5e-06, + "logits/chosen": -40742892.8, + "logits/rejected": -36664939.428571425, + "logps/chosen": -388.2680419921875, + "logps/rejected": -471.5330287388393, + "loss": 0.0514, + "rewards/chosen": 7.806633758544922, + "rewards/margins": 19.28615537370954, + "rewards/rejected": -11.47952161516462, + "step": 2356 + }, + { + "epoch": 0.5897660452896284, + "grad_norm": 19.75, + "kl": 5.268315315246582, + "learning_rate": 5e-06, + "logits/chosen": -51075328.0, + "logits/rejected": -32064652.8, + "logps/chosen": -437.79136439732144, + "logps/rejected": -598.233984375, + "loss": 0.0429, + "rewards/chosen": 8.469888959612165, + "rewards/margins": 19.467304120744977, + "rewards/rejected": -10.997415161132812, + "step": 2357 + }, + { + "epoch": 0.5900162642312023, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61702896.0, + "logits/rejected": -20033510.0, + "logps/chosen": -521.8365478515625, + "logps/rejected": -794.1048583984375, + "loss": 0.0163, + "rewards/chosen": 8.349980354309082, + "rewards/margins": 27.56750202178955, + "rewards/rejected": -19.21752166748047, + "step": 2358 + }, + { + "epoch": 0.5902664831727762, + "grad_norm": 3.71875, + "kl": 2.400516986846924, + "learning_rate": 5e-06, + "logits/chosen": -47857424.0, + "logits/rejected": -90854368.0, + "logps/chosen": -490.83148193359375, + "logps/rejected": -583.340087890625, + "loss": 0.0251, + "rewards/chosen": 9.487022399902344, + "rewards/margins": 23.997264862060547, + "rewards/rejected": -14.510242462158203, + "step": 2359 + }, + { + "epoch": 0.59051670211435, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21943089.777777776, + "logits/rejected": -47499844.266666666, + "logps/chosen": -199.89720323350696, + "logps/rejected": -712.91171875, + "loss": 0.0437, + "rewards/chosen": 4.608177608913845, + "rewards/margins": 24.00356742011176, + "rewards/rejected": -19.395389811197916, + "step": 2360 + }, + { + "epoch": 0.590766921055924, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18764491.636363637, + "logits/rejected": -74306215.38461539, + "logps/chosen": -348.9466441761364, + "logps/rejected": -771.5981069711538, + "loss": 0.0018, + "rewards/chosen": 7.720674688165838, + "rewards/margins": 28.64992085703603, + "rewards/rejected": -20.929246168870193, + "step": 2361 + }, + { + "epoch": 0.5910171399974978, + "grad_norm": 14.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22710183.111111112, + "logits/rejected": -46495360.0, + "logps/chosen": -358.9510091145833, + "logps/rejected": -542.7887369791666, + "loss": 0.0223, + "rewards/chosen": 9.019293891059029, + "rewards/margins": 22.741844346788195, + "rewards/rejected": -13.722550455729166, + "step": 2362 + }, + { + "epoch": 0.5912673589390717, + "grad_norm": 3.890625, + "kl": 1.390639066696167, + "learning_rate": 5e-06, + "logits/chosen": -35027630.93333333, + "logits/rejected": -34755271.11111111, + "logps/chosen": -402.98932291666665, + "logps/rejected": -619.4079861111111, + "loss": 0.0184, + "rewards/chosen": 7.972222900390625, + "rewards/margins": 21.53596649169922, + "rewards/rejected": -13.563743591308594, + "step": 2363 + }, + { + "epoch": 0.5915175778806455, + "grad_norm": 11.4375, + "kl": 0.1906595230102539, + "learning_rate": 5e-06, + "logits/chosen": -22757603.2, + "logits/rejected": -10655867.42857143, + "logps/chosen": -391.236669921875, + "logps/rejected": -559.3989955357143, + "loss": 0.0304, + "rewards/chosen": 6.823162841796875, + "rewards/margins": 21.849928937639508, + "rewards/rejected": -15.026766095842634, + "step": 2364 + }, + { + "epoch": 0.5917677968222195, + "grad_norm": 14.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36494300.8, + "logits/rejected": -76514889.14285715, + "logps/chosen": -411.500634765625, + "logps/rejected": -566.7976771763393, + "loss": 0.049, + "rewards/chosen": 8.659796142578125, + "rewards/margins": 21.45975559779576, + "rewards/rejected": -12.799959455217634, + "step": 2365 + }, + { + "epoch": 0.5920180157637933, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37681920.0, + "logits/rejected": -44096776.53333333, + "logps/chosen": -393.09478081597223, + "logps/rejected": -580.926953125, + "loss": 0.0116, + "rewards/chosen": 9.123224046495226, + "rewards/margins": 24.13021664089627, + "rewards/rejected": -15.006992594401042, + "step": 2366 + }, + { + "epoch": 0.5922682347053672, + "grad_norm": 20.625, + "kl": 13.989477157592773, + "learning_rate": 5e-06, + "logits/chosen": -43417972.0, + "logits/rejected": -39127656.0, + "logps/chosen": -448.8272705078125, + "logps/rejected": -435.0310363769531, + "loss": 0.0635, + "rewards/chosen": 9.904816627502441, + "rewards/margins": 17.954041481018066, + "rewards/rejected": -8.049224853515625, + "step": 2367 + }, + { + "epoch": 0.5925184536469411, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39725589.333333336, + "logits/rejected": -29751944.533333335, + "logps/chosen": -309.08558485243054, + "logps/rejected": -485.8118489583333, + "loss": 0.0553, + "rewards/chosen": 8.301888359917534, + "rewards/margins": 20.836744520399307, + "rewards/rejected": -12.534856160481771, + "step": 2368 + }, + { + "epoch": 0.592768672588515, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67594128.0, + "logits/rejected": -26614237.333333332, + "logps/chosen": -439.4596354166667, + "logps/rejected": -446.2477213541667, + "loss": 0.0141, + "rewards/chosen": 8.851855595906576, + "rewards/margins": 19.0233097076416, + "rewards/rejected": -10.171454111735025, + "step": 2369 + }, + { + "epoch": 0.5930188915300888, + "grad_norm": 3.640625, + "kl": 13.0173978805542, + "learning_rate": 5e-06, + "logits/chosen": -70688616.72727273, + "logits/rejected": -30222365.53846154, + "logps/chosen": -463.54243607954544, + "logps/rejected": -449.85802283653845, + "loss": 0.0409, + "rewards/chosen": 10.731794877485795, + "rewards/margins": 21.18584714235959, + "rewards/rejected": -10.454052264873798, + "step": 2370 + }, + { + "epoch": 0.5932691104716628, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47644742.4, + "logits/rejected": -49471588.571428575, + "logps/chosen": -453.912744140625, + "logps/rejected": -553.0492815290179, + "loss": 0.0513, + "rewards/chosen": 9.097515869140626, + "rewards/margins": 20.913455636160712, + "rewards/rejected": -11.815939767020089, + "step": 2371 + }, + { + "epoch": 0.5935193294132366, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87971066.66666667, + "logits/rejected": -54626704.0, + "logps/chosen": -413.75537109375, + "logps/rejected": -801.7137044270834, + "loss": 0.0108, + "rewards/chosen": 9.622482299804688, + "rewards/margins": 27.10281244913737, + "rewards/rejected": -17.480330149332683, + "step": 2372 + }, + { + "epoch": 0.5937695483548104, + "grad_norm": 12.625, + "kl": 8.559420585632324, + "learning_rate": 5e-06, + "logits/chosen": -70359904.0, + "logits/rejected": -11195393.333333334, + "logps/chosen": -462.3204345703125, + "logps/rejected": -881.8788248697916, + "loss": 0.0107, + "rewards/chosen": 9.035934448242188, + "rewards/margins": 28.72942606608073, + "rewards/rejected": -19.693491617838543, + "step": 2373 + }, + { + "epoch": 0.5940197672963843, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53238970.666666664, + "logits/rejected": -64142517.333333336, + "logps/chosen": -408.2602132161458, + "logps/rejected": -722.5432942708334, + "loss": 0.0467, + "rewards/chosen": 6.665255228678386, + "rewards/margins": 24.710252126057945, + "rewards/rejected": -18.04499689737956, + "step": 2374 + }, + { + "epoch": 0.5942699862379582, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46869397.333333336, + "logits/rejected": -48368053.333333336, + "logps/chosen": -386.7445882161458, + "logps/rejected": -465.9615885416667, + "loss": 0.017, + "rewards/chosen": 7.617603302001953, + "rewards/margins": 22.618732452392578, + "rewards/rejected": -15.001129150390625, + "step": 2375 + }, + { + "epoch": 0.5945202051795321, + "grad_norm": 12.0625, + "kl": 6.916736125946045, + "learning_rate": 5e-06, + "logits/chosen": -60444125.86666667, + "logits/rejected": -41100003.55555555, + "logps/chosen": -356.05865885416665, + "logps/rejected": -420.3400607638889, + "loss": 0.0793, + "rewards/chosen": 8.587648518880208, + "rewards/margins": 19.842449951171872, + "rewards/rejected": -11.254801432291666, + "step": 2376 + }, + { + "epoch": 0.5947704241211059, + "grad_norm": 3.78125, + "kl": 9.687932014465332, + "learning_rate": 5e-06, + "logits/chosen": -42852541.09090909, + "logits/rejected": -48107421.538461536, + "logps/chosen": -390.78657670454544, + "logps/rejected": -530.0025916466346, + "loss": 0.0365, + "rewards/chosen": 10.144217057661576, + "rewards/margins": 24.942312760786578, + "rewards/rejected": -14.798095703125, + "step": 2377 + }, + { + "epoch": 0.5950206430626799, + "grad_norm": 1.7421875, + "kl": 9.12623405456543, + "learning_rate": 5e-06, + "logits/chosen": -41042037.333333336, + "logits/rejected": -53942768.0, + "logps/chosen": -491.999755859375, + "logps/rejected": -519.3555908203125, + "loss": 0.046, + "rewards/chosen": 9.226099650065104, + "rewards/margins": 23.725377400716145, + "rewards/rejected": -14.499277750651041, + "step": 2378 + }, + { + "epoch": 0.5952708620042537, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34590809.6, + "logits/rejected": -27609572.57142857, + "logps/chosen": -366.021240234375, + "logps/rejected": -540.4691685267857, + "loss": 0.0432, + "rewards/chosen": 8.072207641601562, + "rewards/margins": 20.365573120117187, + "rewards/rejected": -12.293365478515625, + "step": 2379 + }, + { + "epoch": 0.5955210809458276, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34217766.4, + "logits/rejected": -34494971.428571425, + "logps/chosen": -384.1599609375, + "logps/rejected": -610.5228097098214, + "loss": 0.0159, + "rewards/chosen": 8.138725280761719, + "rewards/margins": 21.475396292550222, + "rewards/rejected": -13.336671011788505, + "step": 2380 + }, + { + "epoch": 0.5957712998874015, + "grad_norm": 9.875, + "kl": 3.3857040405273438, + "learning_rate": 5e-06, + "logits/chosen": -47664741.333333336, + "logits/rejected": -78805706.66666667, + "logps/chosen": -278.1958821614583, + "logps/rejected": -652.6418863932291, + "loss": 0.0346, + "rewards/chosen": 7.988106409708659, + "rewards/margins": 21.763859430948894, + "rewards/rejected": -13.775753021240234, + "step": 2381 + }, + { + "epoch": 0.5960215188289754, + "grad_norm": 7.15625, + "kl": 3.2921533584594727, + "learning_rate": 5e-06, + "logits/chosen": -62678441.14285714, + "logits/rejected": -67503417.6, + "logps/chosen": -427.75927734375, + "logps/rejected": -502.67421875, + "loss": 0.0266, + "rewards/chosen": 8.397334507533483, + "rewards/margins": 24.30509970528739, + "rewards/rejected": -15.907765197753907, + "step": 2382 + }, + { + "epoch": 0.5962717377705492, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31261051.076923076, + "logits/rejected": 53720802.90909091, + "logps/chosen": -316.45274939903845, + "logps/rejected": -548.5925958806819, + "loss": 0.0436, + "rewards/chosen": 6.960424569936899, + "rewards/margins": 20.04407186441488, + "rewards/rejected": -13.083647294477982, + "step": 2383 + }, + { + "epoch": 0.5965219567121232, + "grad_norm": 8.6875, + "kl": 5.020663261413574, + "learning_rate": 5e-06, + "logits/chosen": -44647310.54545455, + "logits/rejected": -43354958.76923077, + "logps/chosen": -408.08935546875, + "logps/rejected": -521.2127028245193, + "loss": 0.0428, + "rewards/chosen": 9.437430641867898, + "rewards/margins": 22.75570550665155, + "rewards/rejected": -13.318274864783653, + "step": 2384 + }, + { + "epoch": 0.596772175653697, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37436140.8, + "logits/rejected": -44108365.71428572, + "logps/chosen": -484.88544921875, + "logps/rejected": -676.9610072544643, + "loss": 0.0227, + "rewards/chosen": 7.37677001953125, + "rewards/margins": 21.942423139299663, + "rewards/rejected": -14.565653119768415, + "step": 2385 + }, + { + "epoch": 0.5970223945952708, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25179603.2, + "logits/rejected": -53577717.89473684, + "logps/chosen": -253.9442626953125, + "logps/rejected": -580.9646381578947, + "loss": 0.0242, + "rewards/chosen": 4.778381729125977, + "rewards/margins": 18.633599913747688, + "rewards/rejected": -13.85521818462171, + "step": 2386 + }, + { + "epoch": 0.5972726135368447, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39877098.666666664, + "logits/rejected": -747470.6666666666, + "logps/chosen": -348.7401529947917, + "logps/rejected": -670.2941487630209, + "loss": 0.0532, + "rewards/chosen": 7.556076685587565, + "rewards/margins": 21.446739832560223, + "rewards/rejected": -13.890663146972656, + "step": 2387 + }, + { + "epoch": 0.5975228324784186, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45286792.72727273, + "logits/rejected": -34509961.84615385, + "logps/chosen": -624.4852627840909, + "logps/rejected": -654.6381460336538, + "loss": 0.0073, + "rewards/chosen": 9.596846147017045, + "rewards/margins": 25.538557572798297, + "rewards/rejected": -15.94171142578125, + "step": 2388 + }, + { + "epoch": 0.5977730514199925, + "grad_norm": 5.46875, + "kl": 3.6199450492858887, + "learning_rate": 5e-06, + "logits/chosen": -27591977.14285714, + "logits/rejected": -43189808.941176474, + "logps/chosen": -314.359375, + "logps/rejected": -548.7756204044117, + "loss": 0.0468, + "rewards/chosen": 6.42853764125279, + "rewards/margins": 17.902344583463268, + "rewards/rejected": -11.473806942210478, + "step": 2389 + }, + { + "epoch": 0.5980232703615663, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54329068.307692304, + "logits/rejected": -54851037.09090909, + "logps/chosen": -318.0179912860577, + "logps/rejected": -440.83447265625, + "loss": 0.0246, + "rewards/chosen": 6.879795367901142, + "rewards/margins": 16.53691373171506, + "rewards/rejected": -9.65711836381392, + "step": 2390 + }, + { + "epoch": 0.5982734893031403, + "grad_norm": 3.546875, + "kl": 5.364129543304443, + "learning_rate": 5e-06, + "logits/chosen": -59773737.14285714, + "logits/rejected": -85714598.4, + "logps/chosen": -403.46732003348217, + "logps/rejected": -550.05908203125, + "loss": 0.0119, + "rewards/chosen": 8.814078194754464, + "rewards/margins": 23.40273938860212, + "rewards/rejected": -14.588661193847656, + "step": 2391 + }, + { + "epoch": 0.5985237082447141, + "grad_norm": 1.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52597987.55555555, + "logits/rejected": -42430549.333333336, + "logps/chosen": -376.4436306423611, + "logps/rejected": -476.2775065104167, + "loss": 0.0149, + "rewards/chosen": 7.1332346598307295, + "rewards/margins": 20.005042521158856, + "rewards/rejected": -12.871807861328126, + "step": 2392 + }, + { + "epoch": 0.598773927186288, + "grad_norm": 6.65625, + "kl": 11.070616722106934, + "learning_rate": 5e-06, + "logits/chosen": -42674517.333333336, + "logits/rejected": -38052237.333333336, + "logps/chosen": -274.5467122395833, + "logps/rejected": -583.68212890625, + "loss": 0.0519, + "rewards/chosen": 7.428246180216472, + "rewards/margins": 20.249821345011394, + "rewards/rejected": -12.821575164794922, + "step": 2393 + }, + { + "epoch": 0.5990241461278619, + "grad_norm": 2.515625, + "kl": 1.195264220237732, + "learning_rate": 5e-06, + "logits/chosen": -62449211.733333334, + "logits/rejected": -65416832.0, + "logps/chosen": -374.1488932291667, + "logps/rejected": -691.6035698784722, + "loss": 0.0362, + "rewards/chosen": 8.672393798828125, + "rewards/margins": 26.20056660970052, + "rewards/rejected": -17.528172810872395, + "step": 2394 + }, + { + "epoch": 0.5992743650694358, + "grad_norm": 2.609375, + "kl": 0.2982266843318939, + "learning_rate": 5e-06, + "logits/chosen": -34465389.333333336, + "logits/rejected": -37882362.666666664, + "logps/chosen": -298.3574625651042, + "logps/rejected": -399.8765869140625, + "loss": 0.086, + "rewards/chosen": 6.713270823160808, + "rewards/margins": 15.534657796223957, + "rewards/rejected": -8.82138697306315, + "step": 2395 + }, + { + "epoch": 0.5995245840110096, + "grad_norm": 9.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66737117.86666667, + "logits/rejected": -82213269.33333333, + "logps/chosen": -421.7874348958333, + "logps/rejected": -645.4722222222222, + "loss": 0.0278, + "rewards/chosen": 9.091615804036458, + "rewards/margins": 25.339442274305554, + "rewards/rejected": -16.247826470269096, + "step": 2396 + }, + { + "epoch": 0.5997748029525836, + "grad_norm": 12.0625, + "kl": 1.8417282104492188, + "learning_rate": 5e-06, + "logits/chosen": -50820676.92307692, + "logits/rejected": -87421410.9090909, + "logps/chosen": -421.36147836538464, + "logps/rejected": -442.84969815340907, + "loss": 0.062, + "rewards/chosen": 7.634193420410156, + "rewards/margins": 19.366353121670812, + "rewards/rejected": -11.732159701260654, + "step": 2397 + }, + { + "epoch": 0.6000250218941574, + "grad_norm": 2.3125, + "kl": 11.575826644897461, + "learning_rate": 5e-06, + "logits/chosen": -38024016.0, + "logits/rejected": -21999800.0, + "logps/chosen": -330.56685965401783, + "logps/rejected": -492.84580078125, + "loss": 0.0302, + "rewards/chosen": 7.060295104980469, + "rewards/margins": 21.7000732421875, + "rewards/rejected": -14.639778137207031, + "step": 2398 + }, + { + "epoch": 0.6002752408357312, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55031305.84615385, + "logits/rejected": -47352904.72727273, + "logps/chosen": -364.91323617788464, + "logps/rejected": -586.2329545454545, + "loss": 0.0151, + "rewards/chosen": 7.9844231238731975, + "rewards/margins": 23.317619590492516, + "rewards/rejected": -15.333196466619318, + "step": 2399 + }, + { + "epoch": 0.6005254597773051, + "grad_norm": 7.375, + "kl": 4.64101505279541, + "learning_rate": 5e-06, + "logits/chosen": -35348868.266666666, + "logits/rejected": -19450042.666666668, + "logps/chosen": -428.7259765625, + "logps/rejected": -307.71869574652777, + "loss": 0.0202, + "rewards/chosen": 8.424148559570312, + "rewards/margins": 16.440743340386284, + "rewards/rejected": -8.016594780815971, + "step": 2400 + }, + { + "epoch": 0.600775678718879, + "grad_norm": 21.0, + "kl": 23.017948150634766, + "learning_rate": 5e-06, + "logits/chosen": -55597296.0, + "logits/rejected": -64285008.0, + "logps/chosen": -445.5265808105469, + "logps/rejected": -633.8148193359375, + "loss": 0.1226, + "rewards/chosen": 7.5456647872924805, + "rewards/margins": 21.292704582214355, + "rewards/rejected": -13.747039794921875, + "step": 2401 + }, + { + "epoch": 0.6010258976604529, + "grad_norm": 1.71875, + "kl": 0.34704622626304626, + "learning_rate": 5e-06, + "logits/chosen": -50560704.0, + "logits/rejected": -22315698.285714287, + "logps/chosen": -459.680126953125, + "logps/rejected": -848.0197405133929, + "loss": 0.029, + "rewards/chosen": 8.405497741699218, + "rewards/margins": 27.843612452915735, + "rewards/rejected": -19.438114711216517, + "step": 2402 + }, + { + "epoch": 0.6012761166020267, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52865641.14285714, + "logits/rejected": -29246608.94117647, + "logps/chosen": -368.95424107142856, + "logps/rejected": -536.3024471507352, + "loss": 0.0285, + "rewards/chosen": 6.893147059849331, + "rewards/margins": 21.018906408999147, + "rewards/rejected": -14.125759349149817, + "step": 2403 + }, + { + "epoch": 0.6015263355436007, + "grad_norm": 4.25, + "kl": 2.937960386276245, + "learning_rate": 5e-06, + "logits/chosen": -56956458.666666664, + "logits/rejected": -33671740.44444445, + "logps/chosen": -348.36982421875, + "logps/rejected": -599.9561631944445, + "loss": 0.0322, + "rewards/chosen": 7.268742370605469, + "rewards/margins": 23.20854983859592, + "rewards/rejected": -15.939807467990452, + "step": 2404 + }, + { + "epoch": 0.6017765544851745, + "grad_norm": 8.5, + "kl": 4.8682684898376465, + "learning_rate": 5e-06, + "logits/chosen": -48482880.0, + "logits/rejected": -49103861.333333336, + "logps/chosen": -370.2366536458333, + "logps/rejected": -486.220703125, + "loss": 0.027, + "rewards/chosen": 8.454511006673178, + "rewards/margins": 18.8785883585612, + "rewards/rejected": -10.424077351888021, + "step": 2405 + }, + { + "epoch": 0.6020267734267484, + "grad_norm": 0.8046875, + "kl": 1.1547292470932007, + "learning_rate": 5e-06, + "logits/chosen": -26156336.0, + "logits/rejected": -36712634.666666664, + "logps/chosen": -338.4447021484375, + "logps/rejected": -650.2509765625, + "loss": 0.0021, + "rewards/chosen": 8.808040618896484, + "rewards/margins": 24.82712173461914, + "rewards/rejected": -16.019081115722656, + "step": 2406 + }, + { + "epoch": 0.6022769923683223, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59162297.6, + "logits/rejected": -62267382.85714286, + "logps/chosen": -361.38203125, + "logps/rejected": -621.1392299107143, + "loss": 0.0142, + "rewards/chosen": 7.983201599121093, + "rewards/margins": 23.500381905691963, + "rewards/rejected": -15.51718030657087, + "step": 2407 + }, + { + "epoch": 0.6025272113098962, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12246744.0, + "logits/rejected": -9252053.714285715, + "logps/chosen": -232.0011962890625, + "logps/rejected": -590.6630859375, + "loss": 0.068, + "rewards/chosen": 4.520191955566406, + "rewards/margins": 17.950900704520087, + "rewards/rejected": -13.430708748953682, + "step": 2408 + }, + { + "epoch": 0.60277743025147, + "grad_norm": 4.625, + "kl": 0.40271252393722534, + "learning_rate": 5e-06, + "logits/chosen": -20809976.615384616, + "logits/rejected": -32241288.727272727, + "logps/chosen": -344.3565204326923, + "logps/rejected": -693.5885120738636, + "loss": 0.037, + "rewards/chosen": 7.826763446514423, + "rewards/margins": 20.470651053048513, + "rewards/rejected": -12.643887606534092, + "step": 2409 + }, + { + "epoch": 0.603027649193044, + "grad_norm": 7.09375, + "kl": 0.712755560874939, + "learning_rate": 5e-06, + "logits/chosen": -88567308.8, + "logits/rejected": -46087204.571428575, + "logps/chosen": -496.2703125, + "logps/rejected": -594.9705636160714, + "loss": 0.0096, + "rewards/chosen": 10.420065307617188, + "rewards/margins": 24.948672267368863, + "rewards/rejected": -14.528606959751674, + "step": 2410 + }, + { + "epoch": 0.6032778681346178, + "grad_norm": 8.25, + "kl": 3.6644248962402344, + "learning_rate": 5e-06, + "logits/chosen": -36913136.0, + "logits/rejected": -29025656.0, + "logps/chosen": -340.74871826171875, + "logps/rejected": -600.736572265625, + "loss": 0.036, + "rewards/chosen": 6.268530527750651, + "rewards/margins": 19.442816416422527, + "rewards/rejected": -13.174285888671875, + "step": 2411 + }, + { + "epoch": 0.6035280870761917, + "grad_norm": 5.53125, + "kl": 3.2745590209960938, + "learning_rate": 5e-06, + "logits/chosen": -105621208.61538461, + "logits/rejected": -33022385.454545453, + "logps/chosen": -373.1397235576923, + "logps/rejected": -554.189453125, + "loss": 0.0375, + "rewards/chosen": 8.13821293757512, + "rewards/margins": 20.124294147624838, + "rewards/rejected": -11.986081210049717, + "step": 2412 + }, + { + "epoch": 0.6037783060177655, + "grad_norm": 4.59375, + "kl": 4.226690292358398, + "learning_rate": 5e-06, + "logits/chosen": -7347224.533333333, + "logits/rejected": -42911473.777777776, + "logps/chosen": -443.68919270833334, + "logps/rejected": -458.14702690972223, + "loss": 0.0674, + "rewards/chosen": 8.987929280598959, + "rewards/margins": 17.01618347167969, + "rewards/rejected": -8.028254191080729, + "step": 2413 + }, + { + "epoch": 0.6040285249593395, + "grad_norm": 16.875, + "kl": 3.228240489959717, + "learning_rate": 5e-06, + "logits/chosen": -48205045.333333336, + "logits/rejected": -29698229.333333332, + "logps/chosen": -508.65087890625, + "logps/rejected": -514.1309407552084, + "loss": 0.0352, + "rewards/chosen": 9.594586690266928, + "rewards/margins": 20.49249521891276, + "rewards/rejected": -10.897908528645834, + "step": 2414 + }, + { + "epoch": 0.6042787439009133, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50839985.23076923, + "logits/rejected": -70307921.45454545, + "logps/chosen": -398.29041466346155, + "logps/rejected": -813.1967329545455, + "loss": 0.1023, + "rewards/chosen": 9.176375169020433, + "rewards/margins": 24.501223690859923, + "rewards/rejected": -15.324848521839488, + "step": 2415 + }, + { + "epoch": 0.6045289628424871, + "grad_norm": 6.25, + "kl": 8.880748748779297, + "learning_rate": 5e-06, + "logits/chosen": -36583276.307692304, + "logits/rejected": -41477521.45454545, + "logps/chosen": -364.60238882211536, + "logps/rejected": -479.5604137073864, + "loss": 0.0264, + "rewards/chosen": 8.5194091796875, + "rewards/margins": 21.022756403142758, + "rewards/rejected": -12.503347223455256, + "step": 2416 + }, + { + "epoch": 0.6047791817840611, + "grad_norm": 6.9375, + "kl": 28.100996017456055, + "learning_rate": 5e-06, + "logits/chosen": -57067653.81818182, + "logits/rejected": -57354525.538461536, + "logps/chosen": -464.3948863636364, + "logps/rejected": -540.2642352764423, + "loss": 0.0195, + "rewards/chosen": 10.887361699884588, + "rewards/margins": 20.227166716035427, + "rewards/rejected": -9.33980501615084, + "step": 2417 + }, + { + "epoch": 0.6050294007256349, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31781384.727272727, + "logits/rejected": -42848029.538461536, + "logps/chosen": -273.0755504261364, + "logps/rejected": -551.7503756009615, + "loss": 0.031, + "rewards/chosen": 7.55059120871804, + "rewards/margins": 18.972071827708426, + "rewards/rejected": -11.421480618990385, + "step": 2418 + }, + { + "epoch": 0.6052796196672088, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46424944.0, + "logits/rejected": -40721178.666666664, + "logps/chosen": -462.5451253255208, + "logps/rejected": -552.1821695963541, + "loss": 0.0095, + "rewards/chosen": 8.919522603352865, + "rewards/margins": 19.730603535970054, + "rewards/rejected": -10.811080932617188, + "step": 2419 + }, + { + "epoch": 0.6055298386087827, + "grad_norm": 10.0, + "kl": 2.8398702144622803, + "learning_rate": 5e-06, + "logits/chosen": -40781049.6, + "logits/rejected": -36981837.71428572, + "logps/chosen": -434.67626953125, + "logps/rejected": -445.69095284598217, + "loss": 0.0863, + "rewards/chosen": 10.223684692382813, + "rewards/margins": 20.374068777901787, + "rewards/rejected": -10.150384085518974, + "step": 2420 + }, + { + "epoch": 0.6057800575503566, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32788406.85714286, + "logits/rejected": 15457953.6, + "logps/chosen": -369.71212332589283, + "logps/rejected": -601.98037109375, + "loss": 0.0277, + "rewards/chosen": 8.378007071358818, + "rewards/margins": 21.968011038643972, + "rewards/rejected": -13.590003967285156, + "step": 2421 + }, + { + "epoch": 0.6060302764919304, + "grad_norm": 3.4375, + "kl": 7.394895076751709, + "learning_rate": 5e-06, + "logits/chosen": -46620725.333333336, + "logits/rejected": -59502272.0, + "logps/chosen": -375.7605794270833, + "logps/rejected": -538.0983479817709, + "loss": 0.0076, + "rewards/chosen": 8.641480763753256, + "rewards/margins": 21.585071563720703, + "rewards/rejected": -12.943590799967447, + "step": 2422 + }, + { + "epoch": 0.6062804954335043, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37054887.11111111, + "logits/rejected": -32116309.333333332, + "logps/chosen": -278.7237141927083, + "logps/rejected": -511.403515625, + "loss": 0.06, + "rewards/chosen": 5.3554361131456165, + "rewards/margins": 19.576768917507597, + "rewards/rejected": -14.22133280436198, + "step": 2423 + }, + { + "epoch": 0.6065307143750782, + "grad_norm": 4.5, + "kl": 11.455562591552734, + "learning_rate": 5e-06, + "logits/chosen": -85890417.77777778, + "logits/rejected": -44022749.86666667, + "logps/chosen": -613.96826171875, + "logps/rejected": -574.2421875, + "loss": 0.0041, + "rewards/chosen": 13.403506808810764, + "rewards/margins": 27.422169325086806, + "rewards/rejected": -14.018662516276041, + "step": 2424 + }, + { + "epoch": 0.6067809333166521, + "grad_norm": 3.734375, + "kl": 12.522576332092285, + "learning_rate": 5e-06, + "logits/chosen": -53866658.13333333, + "logits/rejected": -62558535.11111111, + "logps/chosen": -455.5142578125, + "logps/rejected": -703.6638454861111, + "loss": 0.048, + "rewards/chosen": 7.8383433024088545, + "rewards/margins": 23.481965806749134, + "rewards/rejected": -15.643622504340279, + "step": 2425 + }, + { + "epoch": 0.6070311522582259, + "grad_norm": 12.875, + "kl": 21.92177963256836, + "learning_rate": 5e-06, + "logits/chosen": -44981472.0, + "logits/rejected": -51809229.71428572, + "logps/chosen": -527.33193359375, + "logps/rejected": -510.62025669642856, + "loss": 0.1277, + "rewards/chosen": 11.091026306152344, + "rewards/margins": 25.567866734095983, + "rewards/rejected": -14.476840427943639, + "step": 2426 + }, + { + "epoch": 0.6072813711997999, + "grad_norm": 4.9375, + "kl": 10.216106414794922, + "learning_rate": 5e-06, + "logits/chosen": -41974728.53333333, + "logits/rejected": -46294620.44444445, + "logps/chosen": -483.81959635416666, + "logps/rejected": -805.7077907986111, + "loss": 0.0035, + "rewards/chosen": 10.639893595377604, + "rewards/margins": 24.071036614312064, + "rewards/rejected": -13.431143018934462, + "step": 2427 + }, + { + "epoch": 0.6075315901413737, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43851381.333333336, + "logits/rejected": -33717002.666666664, + "logps/chosen": -310.73622639973956, + "logps/rejected": -472.8647054036458, + "loss": 0.0197, + "rewards/chosen": 6.667568842569987, + "rewards/margins": 18.671311060587566, + "rewards/rejected": -12.003742218017578, + "step": 2428 + }, + { + "epoch": 0.6077818090829475, + "grad_norm": 15.3125, + "kl": 4.264209747314453, + "learning_rate": 5e-06, + "logits/chosen": -50845204.0, + "logits/rejected": -42699856.0, + "logps/chosen": -406.4794616699219, + "logps/rejected": -742.0474853515625, + "loss": 0.0252, + "rewards/chosen": 8.561019897460938, + "rewards/margins": 26.82114028930664, + "rewards/rejected": -18.260120391845703, + "step": 2429 + }, + { + "epoch": 0.6080320280245215, + "grad_norm": 7.34375, + "kl": 5.061368465423584, + "learning_rate": 5e-06, + "logits/chosen": -60793048.615384616, + "logits/rejected": -46542839.27272727, + "logps/chosen": -390.46957632211536, + "logps/rejected": -542.5413707386364, + "loss": 0.0246, + "rewards/chosen": 9.265636737530048, + "rewards/margins": 22.809200580303486, + "rewards/rejected": -13.543563842773438, + "step": 2430 + }, + { + "epoch": 0.6082822469660953, + "grad_norm": 1.5546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41714481.23076923, + "logits/rejected": -58365870.54545455, + "logps/chosen": -272.9983097956731, + "logps/rejected": -764.9367009943181, + "loss": 0.0219, + "rewards/chosen": 7.229895958533654, + "rewards/margins": 21.655509361853966, + "rewards/rejected": -14.425613403320312, + "step": 2431 + }, + { + "epoch": 0.6085324659076692, + "grad_norm": 10.75, + "kl": 13.515531539916992, + "learning_rate": 5e-06, + "logits/chosen": -18470566.0, + "logits/rejected": -60277912.0, + "logps/chosen": -303.9547119140625, + "logps/rejected": -810.8048095703125, + "loss": 0.0936, + "rewards/chosen": 7.259609222412109, + "rewards/margins": 21.473526000976562, + "rewards/rejected": -14.213916778564453, + "step": 2432 + }, + { + "epoch": 0.6087826848492431, + "grad_norm": 8.1875, + "kl": 4.744281768798828, + "learning_rate": 5e-06, + "logits/chosen": -45561118.11764706, + "logits/rejected": -58099625.14285714, + "logps/chosen": -348.9482421875, + "logps/rejected": -702.8111746651786, + "loss": 0.0401, + "rewards/chosen": 8.127595789292279, + "rewards/margins": 25.08680250664719, + "rewards/rejected": -16.95920671735491, + "step": 2433 + }, + { + "epoch": 0.609032903790817, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32406775.272727273, + "logits/rejected": -20146226.46153846, + "logps/chosen": -304.24538352272725, + "logps/rejected": -521.6920072115385, + "loss": 0.0179, + "rewards/chosen": 5.514634219082919, + "rewards/margins": 15.77713468858412, + "rewards/rejected": -10.262500469501202, + "step": 2434 + }, + { + "epoch": 0.6092831227323908, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52801644.8, + "logits/rejected": -38254160.0, + "logps/chosen": -314.5013671875, + "logps/rejected": -645.61669921875, + "loss": 0.0351, + "rewards/chosen": 6.669560241699219, + "rewards/margins": 21.186737278529577, + "rewards/rejected": -14.517177036830358, + "step": 2435 + }, + { + "epoch": 0.6095333416739647, + "grad_norm": 5.0625, + "kl": 0.36060842871665955, + "learning_rate": 5e-06, + "logits/chosen": -54979804.0, + "logits/rejected": -60203936.0, + "logps/chosen": -372.57769775390625, + "logps/rejected": -679.0423583984375, + "loss": 0.0322, + "rewards/chosen": 7.684247970581055, + "rewards/margins": 26.054086685180664, + "rewards/rejected": -18.36983871459961, + "step": 2436 + }, + { + "epoch": 0.6097835606155386, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -99905104.0, + "logits/rejected": -48196732.0, + "logps/chosen": -586.1228637695312, + "logps/rejected": -653.0353393554688, + "loss": 0.0099, + "rewards/chosen": 11.60792350769043, + "rewards/margins": 28.319639205932617, + "rewards/rejected": -16.711715698242188, + "step": 2437 + }, + { + "epoch": 0.6100337795571125, + "grad_norm": 3.859375, + "kl": 2.946535110473633, + "learning_rate": 5e-06, + "logits/chosen": -51221321.14285714, + "logits/rejected": -54871884.8, + "logps/chosen": -268.7633579799107, + "logps/rejected": -512.336376953125, + "loss": 0.0084, + "rewards/chosen": 6.748202732631138, + "rewards/margins": 20.13454153878348, + "rewards/rejected": -13.386338806152343, + "step": 2438 + }, + { + "epoch": 0.6102839984986863, + "grad_norm": 14.5, + "kl": 28.076358795166016, + "learning_rate": 5e-06, + "logits/chosen": -72316978.28571428, + "logits/rejected": -35913097.6, + "logps/chosen": -413.25620814732144, + "logps/rejected": -340.6446533203125, + "loss": 0.0684, + "rewards/chosen": 9.59689440046038, + "rewards/margins": 19.18056139264788, + "rewards/rejected": -9.5836669921875, + "step": 2439 + }, + { + "epoch": 0.6105342174402603, + "grad_norm": 8.75, + "kl": 1.1614367961883545, + "learning_rate": 5e-06, + "logits/chosen": -65940112.0, + "logits/rejected": -46362378.666666664, + "logps/chosen": -416.9586588541667, + "logps/rejected": -478.8248697916667, + "loss": 0.0596, + "rewards/chosen": 7.538974761962891, + "rewards/margins": 21.200618743896484, + "rewards/rejected": -13.661643981933594, + "step": 2440 + }, + { + "epoch": 0.6107844363818341, + "grad_norm": 4.46875, + "kl": 4.290702819824219, + "learning_rate": 5e-06, + "logits/chosen": -73597882.66666667, + "logits/rejected": -52487381.333333336, + "logps/chosen": -409.5906168619792, + "logps/rejected": -652.3874104817709, + "loss": 0.0439, + "rewards/chosen": 8.501913706461588, + "rewards/margins": 22.486363728841145, + "rewards/rejected": -13.984450022379557, + "step": 2441 + }, + { + "epoch": 0.611034655323408, + "grad_norm": 16.25, + "kl": 1.9676513671875, + "learning_rate": 5e-06, + "logits/chosen": -52199968.0, + "logits/rejected": -76611436.8, + "logps/chosen": -332.74979073660717, + "logps/rejected": -559.1935546875, + "loss": 0.0425, + "rewards/chosen": 7.0919189453125, + "rewards/margins": 20.077818298339842, + "rewards/rejected": -12.985899353027344, + "step": 2442 + }, + { + "epoch": 0.6112848742649819, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47147748.571428575, + "logits/rejected": -62141782.5882353, + "logps/chosen": -414.872802734375, + "logps/rejected": -639.9566865808823, + "loss": 0.0522, + "rewards/chosen": 8.221305302211217, + "rewards/margins": 23.338525114940996, + "rewards/rejected": -15.117219812729779, + "step": 2443 + }, + { + "epoch": 0.6115350932065557, + "grad_norm": 6.0, + "kl": 0.7596168518066406, + "learning_rate": 5e-06, + "logits/chosen": -51223074.90909091, + "logits/rejected": -49544546.461538464, + "logps/chosen": -384.83957741477275, + "logps/rejected": -490.2123272235577, + "loss": 0.0166, + "rewards/chosen": 7.436017816716975, + "rewards/margins": 18.96030863515147, + "rewards/rejected": -11.524290818434496, + "step": 2444 + }, + { + "epoch": 0.6117853121481296, + "grad_norm": 12.3125, + "kl": 2.671675682067871, + "learning_rate": 5e-06, + "logits/chosen": -45810327.27272727, + "logits/rejected": -53568546.461538464, + "logps/chosen": -436.48428622159093, + "logps/rejected": -408.6998948317308, + "loss": 0.0492, + "rewards/chosen": 9.302181590687145, + "rewards/margins": 19.53797331056395, + "rewards/rejected": -10.235791719876802, + "step": 2445 + }, + { + "epoch": 0.6120355310897035, + "grad_norm": 4.34375, + "kl": 2.4231221675872803, + "learning_rate": 5e-06, + "logits/chosen": -38487882.666666664, + "logits/rejected": -35711266.666666664, + "logps/chosen": -376.9884440104167, + "logps/rejected": -500.9215901692708, + "loss": 0.03, + "rewards/chosen": 8.16119130452474, + "rewards/margins": 21.751017252604168, + "rewards/rejected": -13.589825948079428, + "step": 2446 + }, + { + "epoch": 0.6122857500312774, + "grad_norm": 9.1875, + "kl": 17.13314437866211, + "learning_rate": 5e-06, + "logits/chosen": -63262121.14285714, + "logits/rejected": -70119187.2, + "logps/chosen": -448.94482421875, + "logps/rejected": -606.637109375, + "loss": 0.0927, + "rewards/chosen": 9.646490914481026, + "rewards/margins": 24.02421003069196, + "rewards/rejected": -14.377719116210937, + "step": 2447 + }, + { + "epoch": 0.6125359689728512, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65378331.428571425, + "logits/rejected": -49179664.0, + "logps/chosen": -348.97984095982144, + "logps/rejected": -680.881982421875, + "loss": 0.059, + "rewards/chosen": 6.142981392996652, + "rewards/margins": 27.057210213797433, + "rewards/rejected": -20.91422882080078, + "step": 2448 + }, + { + "epoch": 0.6127861879144251, + "grad_norm": 6.25, + "kl": 10.972585678100586, + "learning_rate": 5e-06, + "logits/chosen": -51959687.11111111, + "logits/rejected": -24469619.2, + "logps/chosen": -536.43212890625, + "logps/rejected": -582.3038411458333, + "loss": 0.0621, + "rewards/chosen": 8.81045193142361, + "rewards/margins": 25.85059577094184, + "rewards/rejected": -17.04014383951823, + "step": 2449 + }, + { + "epoch": 0.613036406855999, + "grad_norm": 4.3125, + "kl": 7.541066646575928, + "learning_rate": 5e-06, + "logits/chosen": -39188133.64705882, + "logits/rejected": -24990640.0, + "logps/chosen": -368.9016544117647, + "logps/rejected": -484.90304129464283, + "loss": 0.052, + "rewards/chosen": 7.752206241383272, + "rewards/margins": 19.715713757426798, + "rewards/rejected": -11.963507516043526, + "step": 2450 + }, + { + "epoch": 0.6132866257975729, + "grad_norm": 1.671875, + "kl": 4.85919713973999, + "learning_rate": 5e-06, + "logits/chosen": -63278698.666666664, + "logits/rejected": -44978922.666666664, + "logps/chosen": -490.8492431640625, + "logps/rejected": -544.3570963541666, + "loss": 0.0045, + "rewards/chosen": 9.208826065063477, + "rewards/margins": 21.66894849141439, + "rewards/rejected": -12.460122426350912, + "step": 2451 + }, + { + "epoch": 0.6135368447391467, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34878801.06666667, + "logits/rejected": -37617475.55555555, + "logps/chosen": -344.06484375, + "logps/rejected": -510.19330512152777, + "loss": 0.0574, + "rewards/chosen": 8.14933573404948, + "rewards/margins": 21.36514163547092, + "rewards/rejected": -13.21580590142144, + "step": 2452 + }, + { + "epoch": 0.6137870636807207, + "grad_norm": 1.71875, + "kl": 9.40426254272461, + "learning_rate": 5e-06, + "logits/chosen": -60463862.85714286, + "logits/rejected": -72060851.2, + "logps/chosen": -464.28348214285717, + "logps/rejected": -580.93271484375, + "loss": 0.0163, + "rewards/chosen": 9.715233939034599, + "rewards/margins": 21.630648367745536, + "rewards/rejected": -11.915414428710937, + "step": 2453 + }, + { + "epoch": 0.6140372826222945, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31084241.454545453, + "logits/rejected": -57091475.692307696, + "logps/chosen": -422.03324751420456, + "logps/rejected": -523.0791766826923, + "loss": 0.0186, + "rewards/chosen": 8.77844931862571, + "rewards/margins": 22.452993432958642, + "rewards/rejected": -13.674544114332933, + "step": 2454 + }, + { + "epoch": 0.6142875015638684, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79238136.0, + "logits/rejected": -32143772.0, + "logps/chosen": -448.37542724609375, + "logps/rejected": -695.6185302734375, + "loss": 0.0477, + "rewards/chosen": 8.31430721282959, + "rewards/margins": 21.36212730407715, + "rewards/rejected": -13.047820091247559, + "step": 2455 + }, + { + "epoch": 0.6145377205054423, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52256960.0, + "logits/rejected": -42362570.666666664, + "logps/chosen": -231.5125935872396, + "logps/rejected": -511.3548177083333, + "loss": 0.0773, + "rewards/chosen": 5.250890413920085, + "rewards/margins": 18.313175519307453, + "rewards/rejected": -13.06228510538737, + "step": 2456 + }, + { + "epoch": 0.6147879394470162, + "grad_norm": 10.8125, + "kl": 9.245050430297852, + "learning_rate": 5e-06, + "logits/chosen": -62399394.461538464, + "logits/rejected": -65150312.72727273, + "logps/chosen": -451.30814302884613, + "logps/rejected": -457.6490589488636, + "loss": 0.0846, + "rewards/chosen": 10.647026648888222, + "rewards/margins": 22.951458910962085, + "rewards/rejected": -12.304432262073863, + "step": 2457 + }, + { + "epoch": 0.61503815838859, + "grad_norm": 3.84375, + "kl": 3.278815984725952, + "learning_rate": 5e-06, + "logits/chosen": -33754066.28571428, + "logits/rejected": -45747792.0, + "logps/chosen": -333.0765904017857, + "logps/rejected": -418.51083984375, + "loss": 0.0372, + "rewards/chosen": 7.9637603759765625, + "rewards/margins": 20.561287689208985, + "rewards/rejected": -12.597527313232423, + "step": 2458 + }, + { + "epoch": 0.6152883773301638, + "grad_norm": 9.9375, + "kl": 6.308222770690918, + "learning_rate": 5e-06, + "logits/chosen": -33344029.09090909, + "logits/rejected": -83905063.38461539, + "logps/chosen": -333.86496803977275, + "logps/rejected": -616.4159780649038, + "loss": 0.0781, + "rewards/chosen": 6.912535233931108, + "rewards/margins": 23.219422347062117, + "rewards/rejected": -16.30688711313101, + "step": 2459 + }, + { + "epoch": 0.6155385962717378, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45487392.0, + "logits/rejected": -53263286.85714286, + "logps/chosen": -296.945703125, + "logps/rejected": -664.4881417410714, + "loss": 0.0391, + "rewards/chosen": 6.355614471435547, + "rewards/margins": 22.244737570626395, + "rewards/rejected": -15.889123099190849, + "step": 2460 + }, + { + "epoch": 0.6157888152133116, + "grad_norm": 5.8125, + "kl": 1.4031486511230469, + "learning_rate": 5e-06, + "logits/chosen": -41949592.0, + "logits/rejected": -43703981.333333336, + "logps/chosen": -360.7398274739583, + "logps/rejected": -548.024169921875, + "loss": 0.0111, + "rewards/chosen": 8.542339324951172, + "rewards/margins": 19.77811050415039, + "rewards/rejected": -11.235771179199219, + "step": 2461 + }, + { + "epoch": 0.6160390341548855, + "grad_norm": 7.28125, + "kl": 18.769794464111328, + "learning_rate": 5e-06, + "logits/chosen": -53391062.5882353, + "logits/rejected": -92306688.0, + "logps/chosen": -495.6667049632353, + "logps/rejected": -788.7859933035714, + "loss": 0.019, + "rewards/chosen": 10.059327069450827, + "rewards/margins": 29.820295958959754, + "rewards/rejected": -19.760968889508927, + "step": 2462 + }, + { + "epoch": 0.6162892530964594, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59000476.0, + "logits/rejected": -100337216.0, + "logps/chosen": -388.73785400390625, + "logps/rejected": -687.3478393554688, + "loss": 0.0868, + "rewards/chosen": 6.633993625640869, + "rewards/margins": 22.363086223602295, + "rewards/rejected": -15.729092597961426, + "step": 2463 + }, + { + "epoch": 0.6165394720380333, + "grad_norm": 5.46875, + "kl": 8.836140632629395, + "learning_rate": 5e-06, + "logits/chosen": -70604873.84615384, + "logits/rejected": -50963042.90909091, + "logps/chosen": -364.3679762620192, + "logps/rejected": -679.8227982954545, + "loss": 0.0646, + "rewards/chosen": 8.853098355806791, + "rewards/margins": 26.630627985600825, + "rewards/rejected": -17.777529629794035, + "step": 2464 + }, + { + "epoch": 0.6167896909796071, + "grad_norm": 15.375, + "kl": 11.596526145935059, + "learning_rate": 5e-06, + "logits/chosen": -23251889.454545453, + "logits/rejected": -40303864.615384616, + "logps/chosen": -306.76717862215907, + "logps/rejected": -687.3375901442307, + "loss": 0.0789, + "rewards/chosen": 6.736112421209162, + "rewards/margins": 21.345324269541493, + "rewards/rejected": -14.609211848332333, + "step": 2465 + }, + { + "epoch": 0.6170399099211811, + "grad_norm": 9.375, + "kl": 0.3309618830680847, + "learning_rate": 5e-06, + "logits/chosen": -12473240.0, + "logits/rejected": -35457562.666666664, + "logps/chosen": -304.7659098307292, + "logps/rejected": -481.67529296875, + "loss": 0.019, + "rewards/chosen": 8.140413920084635, + "rewards/margins": 19.698087056477863, + "rewards/rejected": -11.557673136393229, + "step": 2466 + }, + { + "epoch": 0.6172901288627549, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -112141448.0, + "logits/rejected": -61328432.0, + "logps/chosen": -484.5457763671875, + "logps/rejected": -661.2355346679688, + "loss": 0.0017, + "rewards/chosen": 8.396095275878906, + "rewards/margins": 24.701457977294922, + "rewards/rejected": -16.305362701416016, + "step": 2467 + }, + { + "epoch": 0.6175403478043288, + "grad_norm": 15.0, + "kl": 8.004132270812988, + "learning_rate": 5e-06, + "logits/chosen": -21644186.666666668, + "logits/rejected": -56096176.0, + "logps/chosen": -439.7254231770833, + "logps/rejected": -733.9973958333334, + "loss": 0.0253, + "rewards/chosen": 10.076651255289713, + "rewards/margins": 27.724920908610024, + "rewards/rejected": -17.648269653320312, + "step": 2468 + }, + { + "epoch": 0.6177905667459027, + "grad_norm": 7.71875, + "kl": 17.530385971069336, + "learning_rate": 5e-06, + "logits/chosen": -29314292.70588235, + "logits/rejected": -55182496.0, + "logps/chosen": -364.0469324448529, + "logps/rejected": -604.5481305803571, + "loss": 0.1596, + "rewards/chosen": 8.174032772288603, + "rewards/margins": 23.394158243131237, + "rewards/rejected": -15.220125470842634, + "step": 2469 + }, + { + "epoch": 0.6180407856874766, + "grad_norm": 12.75, + "kl": 4.381776332855225, + "learning_rate": 5e-06, + "logits/chosen": -64036770.461538464, + "logits/rejected": -37433041.45454545, + "logps/chosen": -462.76844200721155, + "logps/rejected": -608.7482688210227, + "loss": 0.0526, + "rewards/chosen": 9.074207012469952, + "rewards/margins": 21.898144168453616, + "rewards/rejected": -12.823937155983664, + "step": 2470 + }, + { + "epoch": 0.6182910046290504, + "grad_norm": 10.75, + "kl": 14.730803489685059, + "learning_rate": 5e-06, + "logits/chosen": -25790142.0, + "logits/rejected": -43818560.0, + "logps/chosen": -324.1304931640625, + "logps/rejected": -773.86669921875, + "loss": 0.1078, + "rewards/chosen": 6.997044563293457, + "rewards/margins": 22.20711040496826, + "rewards/rejected": -15.210065841674805, + "step": 2471 + }, + { + "epoch": 0.6185412235706242, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46968938.666666664, + "logits/rejected": -17848408.0, + "logps/chosen": -368.6352132161458, + "logps/rejected": -356.0834147135417, + "loss": 0.0119, + "rewards/chosen": 8.279898325602213, + "rewards/margins": 17.227439244588215, + "rewards/rejected": -8.947540918986002, + "step": 2472 + }, + { + "epoch": 0.6187914425121982, + "grad_norm": 5.03125, + "kl": 6.900914669036865, + "learning_rate": 5e-06, + "logits/chosen": -38755913.14285714, + "logits/rejected": -58604390.4, + "logps/chosen": -386.2473842075893, + "logps/rejected": -770.933447265625, + "loss": 0.0531, + "rewards/chosen": 9.07250486101423, + "rewards/margins": 23.436220986502512, + "rewards/rejected": -14.363716125488281, + "step": 2473 + }, + { + "epoch": 0.619041661453772, + "grad_norm": 1.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30218818.90909091, + "logits/rejected": -48422843.07692308, + "logps/chosen": -411.69770951704544, + "logps/rejected": -610.9829852764423, + "loss": 0.0277, + "rewards/chosen": 7.69169131192294, + "rewards/margins": 20.412414390724024, + "rewards/rejected": -12.720723078801083, + "step": 2474 + }, + { + "epoch": 0.6192918803953459, + "grad_norm": 23.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21350089.846153848, + "logits/rejected": -36298280.72727273, + "logps/chosen": -344.0696364182692, + "logps/rejected": -569.771484375, + "loss": 0.0838, + "rewards/chosen": 7.918998718261719, + "rewards/margins": 18.992122303355824, + "rewards/rejected": -11.073123585094105, + "step": 2475 + }, + { + "epoch": 0.6195420993369198, + "grad_norm": 2.5, + "kl": 16.14555549621582, + "learning_rate": 5e-06, + "logits/chosen": -62235485.09090909, + "logits/rejected": -44912443.07692308, + "logps/chosen": -502.25874467329544, + "logps/rejected": -682.666015625, + "loss": 0.0401, + "rewards/chosen": 10.093490600585938, + "rewards/margins": 24.65542485163762, + "rewards/rejected": -14.561934251051683, + "step": 2476 + }, + { + "epoch": 0.6197923182784937, + "grad_norm": 1.875, + "kl": 5.006343364715576, + "learning_rate": 5e-06, + "logits/chosen": -18527432.888888888, + "logits/rejected": -2498426.933333333, + "logps/chosen": -368.03358289930554, + "logps/rejected": -451.92535807291665, + "loss": 0.0354, + "rewards/chosen": 9.618890550401476, + "rewards/margins": 20.628965420193143, + "rewards/rejected": -11.010074869791667, + "step": 2477 + }, + { + "epoch": 0.6200425372200675, + "grad_norm": 2.78125, + "kl": 1.7069120407104492, + "learning_rate": 5e-06, + "logits/chosen": -21469846.4, + "logits/rejected": -60583264.0, + "logps/chosen": -365.257470703125, + "logps/rejected": -533.4583565848214, + "loss": 0.0225, + "rewards/chosen": 8.044914245605469, + "rewards/margins": 20.574822562081472, + "rewards/rejected": -12.529908316476005, + "step": 2478 + }, + { + "epoch": 0.6202927561616415, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57225426.28571428, + "logits/rejected": -51871066.35294118, + "logps/chosen": -339.60365513392856, + "logps/rejected": -571.7243795955883, + "loss": 0.052, + "rewards/chosen": 8.281544276646205, + "rewards/margins": 21.38254309902672, + "rewards/rejected": -13.100998822380514, + "step": 2479 + }, + { + "epoch": 0.6205429751032153, + "grad_norm": 4.875, + "kl": 17.004467010498047, + "learning_rate": 5e-06, + "logits/chosen": -59037905.777777776, + "logits/rejected": -61594197.333333336, + "logps/chosen": -465.43787977430554, + "logps/rejected": -530.793701171875, + "loss": 0.0462, + "rewards/chosen": 8.951393975151909, + "rewards/margins": 23.374537997775604, + "rewards/rejected": -14.423144022623697, + "step": 2480 + }, + { + "epoch": 0.6207931940447892, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26451060.363636363, + "logits/rejected": -39794791.384615384, + "logps/chosen": -311.2567027698864, + "logps/rejected": -461.65024038461536, + "loss": 0.0245, + "rewards/chosen": 6.418903004039418, + "rewards/margins": 20.62016525802079, + "rewards/rejected": -14.20126225398137, + "step": 2481 + }, + { + "epoch": 0.6210434129863631, + "grad_norm": 6.34375, + "kl": 3.008413314819336, + "learning_rate": 5e-06, + "logits/chosen": -84831718.4, + "logits/rejected": -45997353.14285714, + "logps/chosen": -361.48603515625, + "logps/rejected": -619.49755859375, + "loss": 0.0516, + "rewards/chosen": 7.368170166015625, + "rewards/margins": 24.998214285714283, + "rewards/rejected": -17.63004411969866, + "step": 2482 + }, + { + "epoch": 0.621293631927937, + "grad_norm": 1.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58083524.266666666, + "logits/rejected": -31263235.555555556, + "logps/chosen": -343.8017578125, + "logps/rejected": -748.0422634548611, + "loss": 0.0118, + "rewards/chosen": 8.820060221354167, + "rewards/margins": 29.721329074435765, + "rewards/rejected": -20.901268853081596, + "step": 2483 + }, + { + "epoch": 0.6215438508695108, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63418248.0, + "logits/rejected": -53321612.0, + "logps/chosen": -376.13330078125, + "logps/rejected": -692.8074951171875, + "loss": 0.0051, + "rewards/chosen": 7.652372360229492, + "rewards/margins": 25.042146682739258, + "rewards/rejected": -17.389774322509766, + "step": 2484 + }, + { + "epoch": 0.6217940698110846, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38241448.0, + "logits/rejected": -28476076.0, + "logps/chosen": -260.7626953125, + "logps/rejected": -537.1641845703125, + "loss": 0.0444, + "rewards/chosen": 6.6749114990234375, + "rewards/margins": 19.1830472946167, + "rewards/rejected": -12.508135795593262, + "step": 2485 + }, + { + "epoch": 0.6220442887526586, + "grad_norm": 1.328125, + "kl": 2.4345602989196777, + "learning_rate": 5e-06, + "logits/chosen": -60949216.0, + "logits/rejected": -59230170.666666664, + "logps/chosen": -423.033935546875, + "logps/rejected": -798.23193359375, + "loss": 0.0181, + "rewards/chosen": 9.869501113891602, + "rewards/margins": 32.317672093709305, + "rewards/rejected": -22.448170979817707, + "step": 2486 + }, + { + "epoch": 0.6222945076942324, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46324434.28571428, + "logits/rejected": -28065388.8, + "logps/chosen": -367.89571707589283, + "logps/rejected": -351.865234375, + "loss": 0.0054, + "rewards/chosen": 8.259424482073102, + "rewards/margins": 18.873622022356308, + "rewards/rejected": -10.614197540283204, + "step": 2487 + }, + { + "epoch": 0.6225447266358063, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34725280.0, + "logits/rejected": -40811972.571428575, + "logps/chosen": -453.8267578125, + "logps/rejected": -401.27357700892856, + "loss": 0.0188, + "rewards/chosen": 6.439046478271484, + "rewards/margins": 17.217401668003625, + "rewards/rejected": -10.778355189732142, + "step": 2488 + }, + { + "epoch": 0.6227949455773802, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66070272.0, + "logits/rejected": -42478464.0, + "logps/chosen": -421.36971768465907, + "logps/rejected": -511.40707632211536, + "loss": 0.0148, + "rewards/chosen": 8.068936434659092, + "rewards/margins": 23.190863549292505, + "rewards/rejected": -15.121927114633413, + "step": 2489 + }, + { + "epoch": 0.6230451645189541, + "grad_norm": 0.2216796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40115976.72727273, + "logits/rejected": -49449723.07692308, + "logps/chosen": -345.27397017045456, + "logps/rejected": -666.0147235576923, + "loss": 0.0005, + "rewards/chosen": 9.405317826704545, + "rewards/margins": 27.67402510209517, + "rewards/rejected": -18.268707275390625, + "step": 2490 + }, + { + "epoch": 0.6232953834605279, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39178251.63636363, + "logits/rejected": -45704359.384615384, + "logps/chosen": -385.79048295454544, + "logps/rejected": -517.8949819711538, + "loss": 0.0346, + "rewards/chosen": 7.300200028852983, + "rewards/margins": 24.56292585893111, + "rewards/rejected": -17.262725830078125, + "step": 2491 + }, + { + "epoch": 0.6235456024021019, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79184029.53846154, + "logits/rejected": -79509975.27272727, + "logps/chosen": -393.5075871394231, + "logps/rejected": -714.5269442471591, + "loss": 0.0437, + "rewards/chosen": 8.704150860126202, + "rewards/margins": 28.042769745513276, + "rewards/rejected": -19.338618885387074, + "step": 2492 + }, + { + "epoch": 0.6237958213436757, + "grad_norm": 18.125, + "kl": 0.7218529582023621, + "learning_rate": 5e-06, + "logits/chosen": -50356185.6, + "logits/rejected": -34987830.85714286, + "logps/chosen": -432.276904296875, + "logps/rejected": -427.4755859375, + "loss": 0.0588, + "rewards/chosen": 8.515191650390625, + "rewards/margins": 20.437356567382814, + "rewards/rejected": -11.922164916992188, + "step": 2493 + }, + { + "epoch": 0.6240460402852496, + "grad_norm": 11.0625, + "kl": 0.5556501150131226, + "learning_rate": 5e-06, + "logits/chosen": -30307352.615384616, + "logits/rejected": -61299514.18181818, + "logps/chosen": -480.69027944711536, + "logps/rejected": -737.0223721590909, + "loss": 0.0174, + "rewards/chosen": 9.201239365797777, + "rewards/margins": 29.963564305872353, + "rewards/rejected": -20.762324940074574, + "step": 2494 + }, + { + "epoch": 0.6242962592268235, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60971520.0, + "logits/rejected": -68579484.44444445, + "logps/chosen": -315.2081705729167, + "logps/rejected": -794.966796875, + "loss": 0.0907, + "rewards/chosen": 6.537539672851563, + "rewards/margins": 27.292520819769965, + "rewards/rejected": -20.754981146918404, + "step": 2495 + }, + { + "epoch": 0.6245464781683974, + "grad_norm": 16.125, + "kl": 13.783313751220703, + "learning_rate": 5e-06, + "logits/chosen": -32887274.666666668, + "logits/rejected": -54998522.666666664, + "logps/chosen": -318.7431640625, + "logps/rejected": -424.4005940755208, + "loss": 0.0441, + "rewards/chosen": 6.299693425496419, + "rewards/margins": 17.912112553914387, + "rewards/rejected": -11.612419128417969, + "step": 2496 + }, + { + "epoch": 0.6247966971099712, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40515432.72727273, + "logits/rejected": -48312324.92307692, + "logps/chosen": -408.87650923295456, + "logps/rejected": -655.9814453125, + "loss": 0.033, + "rewards/chosen": 6.664486971768466, + "rewards/margins": 20.35750595839707, + "rewards/rejected": -13.693018986628605, + "step": 2497 + }, + { + "epoch": 0.625046916051545, + "grad_norm": 13.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34850458.666666664, + "logits/rejected": -33612352.0, + "logps/chosen": -375.4722086588542, + "logps/rejected": -523.0933024088541, + "loss": 0.0571, + "rewards/chosen": 8.046129862467447, + "rewards/margins": 19.1494140625, + "rewards/rejected": -11.103284200032553, + "step": 2498 + }, + { + "epoch": 0.625297134993119, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28030903.111111112, + "logits/rejected": -83402333.86666666, + "logps/chosen": -300.59087456597223, + "logps/rejected": -915.5887369791667, + "loss": 0.0236, + "rewards/chosen": 8.18606906467014, + "rewards/margins": 31.845948621961806, + "rewards/rejected": -23.659879557291667, + "step": 2499 + }, + { + "epoch": 0.6255473539346929, + "grad_norm": 17.375, + "kl": 8.784231185913086, + "learning_rate": 5e-06, + "logits/chosen": -43492371.2, + "logits/rejected": -30408621.714285713, + "logps/chosen": -364.315185546875, + "logps/rejected": -654.0004185267857, + "loss": 0.0395, + "rewards/chosen": 7.9260498046875, + "rewards/margins": 19.746459306989397, + "rewards/rejected": -11.820409502301898, + "step": 2500 + }, + { + "epoch": 0.6257975728762667, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74444794.18181819, + "logits/rejected": -42855985.23076923, + "logps/chosen": -345.7245427911932, + "logps/rejected": -527.3418344350962, + "loss": 0.0611, + "rewards/chosen": 6.343276283957741, + "rewards/margins": 21.61644029950762, + "rewards/rejected": -15.27316401554988, + "step": 2501 + }, + { + "epoch": 0.6260477918178406, + "grad_norm": 15.5, + "kl": 0.7056414484977722, + "learning_rate": 5e-06, + "logits/chosen": -35859671.27272727, + "logits/rejected": -39802035.692307696, + "logps/chosen": -433.1237127130682, + "logps/rejected": -500.4910231370192, + "loss": 0.0382, + "rewards/chosen": 8.305159135298295, + "rewards/margins": 21.35317384946596, + "rewards/rejected": -13.048014714167667, + "step": 2502 + }, + { + "epoch": 0.6262980107594145, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42946251.63636363, + "logits/rejected": -58232157.538461536, + "logps/chosen": -397.76411576704544, + "logps/rejected": -716.6336388221154, + "loss": 0.0272, + "rewards/chosen": 8.896286010742188, + "rewards/margins": 25.1898686335637, + "rewards/rejected": -16.293582622821514, + "step": 2503 + }, + { + "epoch": 0.6265482297009883, + "grad_norm": 7.59375, + "kl": 4.407509803771973, + "learning_rate": 5e-06, + "logits/chosen": -45189897.14285714, + "logits/rejected": -52229900.8, + "logps/chosen": -298.3546665736607, + "logps/rejected": -561.384033203125, + "loss": 0.0589, + "rewards/chosen": 7.529937744140625, + "rewards/margins": 23.666929626464842, + "rewards/rejected": -16.136991882324217, + "step": 2504 + }, + { + "epoch": 0.6267984486425623, + "grad_norm": 3.84375, + "kl": 2.7851524353027344, + "learning_rate": 5e-06, + "logits/chosen": -62565482.666666664, + "logits/rejected": -46043829.333333336, + "logps/chosen": -386.6598307291667, + "logps/rejected": -665.7399495442709, + "loss": 0.0282, + "rewards/chosen": 8.263753255208334, + "rewards/margins": 24.091209411621094, + "rewards/rejected": -15.82745615641276, + "step": 2505 + }, + { + "epoch": 0.6270486675841361, + "grad_norm": 10.1875, + "kl": 5.4167609214782715, + "learning_rate": 5e-06, + "logits/chosen": -63304192.0, + "logits/rejected": -24828538.666666668, + "logps/chosen": -532.7721354166666, + "logps/rejected": -567.024658203125, + "loss": 0.0221, + "rewards/chosen": 11.037769317626953, + "rewards/margins": 23.651137034098305, + "rewards/rejected": -12.613367716471354, + "step": 2506 + }, + { + "epoch": 0.62729888652571, + "grad_norm": 1.1953125, + "kl": 4.852348327636719, + "learning_rate": 5e-06, + "logits/chosen": -55810683.428571425, + "logits/rejected": -32385465.6, + "logps/chosen": -412.30001395089283, + "logps/rejected": -574.458154296875, + "loss": 0.0033, + "rewards/chosen": 8.256307329450335, + "rewards/margins": 20.34444362095424, + "rewards/rejected": -12.088136291503906, + "step": 2507 + }, + { + "epoch": 0.6275491054672838, + "grad_norm": 2.203125, + "kl": 3.1178557872772217, + "learning_rate": 5e-06, + "logits/chosen": -50093008.0, + "logits/rejected": -43463205.333333336, + "logps/chosen": -401.2214762369792, + "logps/rejected": -603.3190511067709, + "loss": 0.0232, + "rewards/chosen": 7.737422943115234, + "rewards/margins": 19.691775004069008, + "rewards/rejected": -11.954352060953775, + "step": 2508 + }, + { + "epoch": 0.6277993244088578, + "grad_norm": 1.7734375, + "kl": 0.3774452209472656, + "learning_rate": 5e-06, + "logits/chosen": -26624032.0, + "logits/rejected": -48069425.23076923, + "logps/chosen": -339.2908824573864, + "logps/rejected": -887.7693058894231, + "loss": 0.0207, + "rewards/chosen": 7.492101495916193, + "rewards/margins": 27.953081251024365, + "rewards/rejected": -20.460979755108173, + "step": 2509 + }, + { + "epoch": 0.6280495433504316, + "grad_norm": 9.9375, + "kl": 5.894001007080078, + "learning_rate": 5e-06, + "logits/chosen": -9593362.0, + "logits/rejected": -71870272.0, + "logps/chosen": -458.2365417480469, + "logps/rejected": -589.8665161132812, + "loss": 0.084, + "rewards/chosen": 7.839481830596924, + "rewards/margins": 20.48694658279419, + "rewards/rejected": -12.647464752197266, + "step": 2510 + }, + { + "epoch": 0.6282997622920055, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62652595.2, + "logits/rejected": -48598084.571428575, + "logps/chosen": -431.997509765625, + "logps/rejected": -589.3092912946429, + "loss": 0.045, + "rewards/chosen": 10.407089996337891, + "rewards/margins": 24.767578887939454, + "rewards/rejected": -14.360488891601562, + "step": 2511 + }, + { + "epoch": 0.6285499812335794, + "grad_norm": 8.625, + "kl": 6.148695945739746, + "learning_rate": 5e-06, + "logits/chosen": -43121116.8, + "logits/rejected": -1173808.0, + "logps/chosen": -362.309619140625, + "logps/rejected": -693.05029296875, + "loss": 0.0231, + "rewards/chosen": 7.629971313476562, + "rewards/margins": 17.48228803362165, + "rewards/rejected": -9.852316720145089, + "step": 2512 + }, + { + "epoch": 0.6288002001751533, + "grad_norm": 2.390625, + "kl": 5.250231742858887, + "learning_rate": 5e-06, + "logits/chosen": -19799986.285714287, + "logits/rejected": -29860492.8, + "logps/chosen": -300.18265206473217, + "logps/rejected": -535.036669921875, + "loss": 0.0479, + "rewards/chosen": 7.145606449672154, + "rewards/margins": 21.082914188929966, + "rewards/rejected": -13.937307739257813, + "step": 2513 + }, + { + "epoch": 0.6290504191167271, + "grad_norm": 11.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46877904.0, + "logits/rejected": -47118160.0, + "logps/chosen": -327.307861328125, + "logps/rejected": -722.0896606445312, + "loss": 0.0315, + "rewards/chosen": 7.658298015594482, + "rewards/margins": 24.424087047576904, + "rewards/rejected": -16.765789031982422, + "step": 2514 + }, + { + "epoch": 0.629300638058301, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43921142.15384615, + "logits/rejected": -37598231.27272727, + "logps/chosen": -418.68997896634613, + "logps/rejected": -645.4795365767045, + "loss": 0.0382, + "rewards/chosen": 7.209188608022837, + "rewards/margins": 22.47498561619045, + "rewards/rejected": -15.265797008167613, + "step": 2515 + }, + { + "epoch": 0.6295508569998749, + "grad_norm": 12.3125, + "kl": 5.2679290771484375, + "learning_rate": 5e-06, + "logits/chosen": -41798291.2, + "logits/rejected": -56350637.71428572, + "logps/chosen": -331.806494140625, + "logps/rejected": -551.3922642299107, + "loss": 0.0502, + "rewards/chosen": 7.527032470703125, + "rewards/margins": 20.825257873535158, + "rewards/rejected": -13.298225402832031, + "step": 2516 + }, + { + "epoch": 0.6298010759414487, + "grad_norm": 2.6875, + "kl": 0.6283696889877319, + "learning_rate": 5e-06, + "logits/chosen": 5085506.285714285, + "logits/rejected": -74583667.2, + "logps/chosen": -434.1474609375, + "logps/rejected": -533.02958984375, + "loss": 0.0474, + "rewards/chosen": 7.849430629185268, + "rewards/margins": 20.395107051304407, + "rewards/rejected": -12.54567642211914, + "step": 2517 + }, + { + "epoch": 0.6300512948830227, + "grad_norm": 1.609375, + "kl": 0.6448568105697632, + "learning_rate": 5e-06, + "logits/chosen": -43250286.54545455, + "logits/rejected": -49213267.692307696, + "logps/chosen": -457.32563920454544, + "logps/rejected": -600.2562725360577, + "loss": 0.0016, + "rewards/chosen": 10.613924893465908, + "rewards/margins": 22.10376958246831, + "rewards/rejected": -11.489844689002403, + "step": 2518 + }, + { + "epoch": 0.6303015138245965, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -92552778.66666667, + "logits/rejected": -56945728.0, + "logps/chosen": -439.2107747395833, + "logps/rejected": -492.5219319661458, + "loss": 0.0218, + "rewards/chosen": 9.691757202148438, + "rewards/margins": 23.174907684326172, + "rewards/rejected": -13.483150482177734, + "step": 2519 + }, + { + "epoch": 0.6305517327661704, + "grad_norm": 20.5, + "kl": 12.489814758300781, + "learning_rate": 5e-06, + "logits/chosen": -37469034.666666664, + "logits/rejected": -17186919.111111112, + "logps/chosen": -427.62705078125, + "logps/rejected": -379.3385959201389, + "loss": 0.0545, + "rewards/chosen": 7.884296671549479, + "rewards/margins": 16.83777770996094, + "rewards/rejected": -8.953481038411459, + "step": 2520 + }, + { + "epoch": 0.6308019517077442, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42540660.36363637, + "logits/rejected": -26577824.0, + "logps/chosen": -376.55446555397725, + "logps/rejected": -493.6002854567308, + "loss": 0.0155, + "rewards/chosen": 7.891424005681818, + "rewards/margins": 20.473610191078453, + "rewards/rejected": -12.582186185396635, + "step": 2521 + }, + { + "epoch": 0.6310521706493182, + "grad_norm": 12.6875, + "kl": 0.2251453399658203, + "learning_rate": 5e-06, + "logits/chosen": -55376464.0, + "logits/rejected": -47450712.0, + "logps/chosen": -388.3652648925781, + "logps/rejected": -641.0706176757812, + "loss": 0.0316, + "rewards/chosen": 9.333907127380371, + "rewards/margins": 24.025349617004395, + "rewards/rejected": -14.691442489624023, + "step": 2522 + }, + { + "epoch": 0.631302389590892, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35947622.4, + "logits/rejected": -40286435.55555555, + "logps/chosen": -455.69114583333334, + "logps/rejected": -501.96088324652777, + "loss": 0.0415, + "rewards/chosen": 8.207101440429687, + "rewards/margins": 23.05621575249566, + "rewards/rejected": -14.849114312065971, + "step": 2523 + }, + { + "epoch": 0.6315526085324659, + "grad_norm": 1.03125, + "kl": 1.0947158336639404, + "learning_rate": 5e-06, + "logits/chosen": -53513393.23076923, + "logits/rejected": -65191709.09090909, + "logps/chosen": -472.59848257211536, + "logps/rejected": -646.4961825284091, + "loss": 0.0115, + "rewards/chosen": 10.214528010441708, + "rewards/margins": 27.20546983838915, + "rewards/rejected": -16.99094182794744, + "step": 2524 + }, + { + "epoch": 0.6318028274740398, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59435739.428571425, + "logits/rejected": -73352371.2, + "logps/chosen": -448.9432896205357, + "logps/rejected": -827.290234375, + "loss": 0.0169, + "rewards/chosen": 8.524293082101005, + "rewards/margins": 27.832834952218192, + "rewards/rejected": -19.30854187011719, + "step": 2525 + }, + { + "epoch": 0.6320530464156137, + "grad_norm": 1.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32835399.529411763, + "logits/rejected": -51940672.0, + "logps/chosen": -375.8563878676471, + "logps/rejected": -784.8980887276786, + "loss": 0.0244, + "rewards/chosen": 7.951719396254596, + "rewards/margins": 28.77206023400571, + "rewards/rejected": -20.820340837751115, + "step": 2526 + }, + { + "epoch": 0.6323032653571875, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40449703.384615384, + "logits/rejected": -51994827.63636363, + "logps/chosen": -373.72205528846155, + "logps/rejected": -765.6644176136364, + "loss": 0.058, + "rewards/chosen": 4.469235053429236, + "rewards/margins": 26.93773675131631, + "rewards/rejected": -22.468501697887074, + "step": 2527 + }, + { + "epoch": 0.6325534842987615, + "grad_norm": 1.25, + "kl": 0.026159923523664474, + "learning_rate": 5e-06, + "logits/chosen": -39475293.333333336, + "logits/rejected": -61318085.333333336, + "logps/chosen": -438.5337320963542, + "logps/rejected": -610.8920084635416, + "loss": 0.0053, + "rewards/chosen": 9.895198186238607, + "rewards/margins": 27.74160067240397, + "rewards/rejected": -17.846402486165363, + "step": 2528 + }, + { + "epoch": 0.6328037032403353, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -90446624.0, + "logits/rejected": -25616828.444444444, + "logps/chosen": -465.4545084635417, + "logps/rejected": -445.6344401041667, + "loss": 0.0468, + "rewards/chosen": 6.079484939575195, + "rewards/margins": 19.90537537468804, + "rewards/rejected": -13.825890435112846, + "step": 2529 + }, + { + "epoch": 0.6330539221819091, + "grad_norm": 3.28125, + "kl": 5.753346920013428, + "learning_rate": 5e-06, + "logits/chosen": -34063037.333333336, + "logits/rejected": 33536949.333333332, + "logps/chosen": -304.8963623046875, + "logps/rejected": -720.7193196614584, + "loss": 0.0281, + "rewards/chosen": 7.834623336791992, + "rewards/margins": 25.427427291870117, + "rewards/rejected": -17.592803955078125, + "step": 2530 + }, + { + "epoch": 0.6333041411234831, + "grad_norm": 12.5625, + "kl": 9.607837677001953, + "learning_rate": 5e-06, + "logits/chosen": -84769389.71428572, + "logits/rejected": -63669670.4, + "logps/chosen": -522.7791573660714, + "logps/rejected": -661.33623046875, + "loss": 0.0554, + "rewards/chosen": 8.110807691301618, + "rewards/margins": 25.702882276262557, + "rewards/rejected": -17.592074584960937, + "step": 2531 + }, + { + "epoch": 0.6335543600650569, + "grad_norm": 9.875, + "kl": 11.240137100219727, + "learning_rate": 5e-06, + "logits/chosen": -43457474.666666664, + "logits/rejected": -40157626.666666664, + "logps/chosen": -417.1097005208333, + "logps/rejected": -453.2268880208333, + "loss": 0.0649, + "rewards/chosen": 7.763933817545573, + "rewards/margins": 21.987224578857422, + "rewards/rejected": -14.22329076131185, + "step": 2532 + }, + { + "epoch": 0.6338045790066308, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11426691.555555556, + "logits/rejected": -54092083.2, + "logps/chosen": -383.9098849826389, + "logps/rejected": -463.32643229166666, + "loss": 0.0147, + "rewards/chosen": 9.147364298502604, + "rewards/margins": 21.69699198404948, + "rewards/rejected": -12.549627685546875, + "step": 2533 + }, + { + "epoch": 0.6340547979482046, + "grad_norm": 17.125, + "kl": 5.874564170837402, + "learning_rate": 5e-06, + "logits/chosen": -66065712.0, + "logits/rejected": -48915084.0, + "logps/chosen": -411.32861328125, + "logps/rejected": -392.4386901855469, + "loss": 0.0732, + "rewards/chosen": 7.4850945472717285, + "rewards/margins": 19.656890392303467, + "rewards/rejected": -12.171795845031738, + "step": 2534 + }, + { + "epoch": 0.6343050168897786, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73429000.0, + "logits/rejected": -31296168.0, + "logps/chosen": -354.84466552734375, + "logps/rejected": -570.5792846679688, + "loss": 0.0206, + "rewards/chosen": 7.279775619506836, + "rewards/margins": 23.8420467376709, + "rewards/rejected": -16.562271118164062, + "step": 2535 + }, + { + "epoch": 0.6345552358313524, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38264850.666666664, + "logits/rejected": -67447808.0, + "logps/chosen": -371.488037109375, + "logps/rejected": -774.5887044270834, + "loss": 0.0236, + "rewards/chosen": 7.023232777913411, + "rewards/margins": 24.775208791097004, + "rewards/rejected": -17.751976013183594, + "step": 2536 + }, + { + "epoch": 0.6348054547729263, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17698210.46153846, + "logits/rejected": -55249338.18181818, + "logps/chosen": -248.60697115384616, + "logps/rejected": -647.9096235795455, + "loss": 0.054, + "rewards/chosen": 6.410416236290565, + "rewards/margins": 26.86635162780335, + "rewards/rejected": -20.455935391512785, + "step": 2537 + }, + { + "epoch": 0.6350556737145002, + "grad_norm": 7.875, + "kl": 4.359576225280762, + "learning_rate": 5e-06, + "logits/chosen": -82343429.81818181, + "logits/rejected": -27947869.53846154, + "logps/chosen": -425.4182794744318, + "logps/rejected": -543.4133112980769, + "loss": 0.0244, + "rewards/chosen": 7.895267833362926, + "rewards/margins": 22.897347723687446, + "rewards/rejected": -15.00207989032452, + "step": 2538 + }, + { + "epoch": 0.6353058926560741, + "grad_norm": 3.4375, + "kl": 1.2204082012176514, + "learning_rate": 5e-06, + "logits/chosen": -72080561.23076923, + "logits/rejected": -59333690.18181818, + "logps/chosen": -473.35509314903845, + "logps/rejected": -680.5055930397727, + "loss": 0.0076, + "rewards/chosen": 9.993110069861778, + "rewards/margins": 29.56628567355496, + "rewards/rejected": -19.573175603693183, + "step": 2539 + }, + { + "epoch": 0.6355561115976479, + "grad_norm": 0.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64160458.666666664, + "logits/rejected": -83578677.33333333, + "logps/chosen": -426.4730224609375, + "logps/rejected": -705.80517578125, + "loss": 0.0092, + "rewards/chosen": 9.952310562133789, + "rewards/margins": 28.4623228708903, + "rewards/rejected": -18.51001230875651, + "step": 2540 + }, + { + "epoch": 0.6358063305392219, + "grad_norm": 9.625, + "kl": 3.2951273918151855, + "learning_rate": 5e-06, + "logits/chosen": -25205796.57142857, + "logits/rejected": -73031590.4, + "logps/chosen": -410.46718052455356, + "logps/rejected": -429.57119140625, + "loss": 0.0345, + "rewards/chosen": 7.556738172258649, + "rewards/margins": 21.21426685878209, + "rewards/rejected": -13.657528686523438, + "step": 2541 + }, + { + "epoch": 0.6360565494807957, + "grad_norm": 13.8125, + "kl": 6.436875820159912, + "learning_rate": 5e-06, + "logits/chosen": -44706156.307692304, + "logits/rejected": -45259485.09090909, + "logps/chosen": -469.2804987980769, + "logps/rejected": -649.1965553977273, + "loss": 0.0252, + "rewards/chosen": 10.109105036808895, + "rewards/margins": 30.280660989401223, + "rewards/rejected": -20.17155595259233, + "step": 2542 + }, + { + "epoch": 0.6363067684223696, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43531886.54545455, + "logits/rejected": -6025697.230769231, + "logps/chosen": -367.37300248579544, + "logps/rejected": -682.2079326923077, + "loss": 0.0077, + "rewards/chosen": 8.827782370827414, + "rewards/margins": 26.245462377588233, + "rewards/rejected": -17.41768000676082, + "step": 2543 + }, + { + "epoch": 0.6365569873639435, + "grad_norm": 13.125, + "kl": 1.0708554983139038, + "learning_rate": 5e-06, + "logits/chosen": -91921429.33333333, + "logits/rejected": -16459538.666666666, + "logps/chosen": -534.0583089192709, + "logps/rejected": -550.5040283203125, + "loss": 0.0805, + "rewards/chosen": 10.274063110351562, + "rewards/margins": 21.978111267089844, + "rewards/rejected": -11.704048156738281, + "step": 2544 + }, + { + "epoch": 0.6368072063055173, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46336632.0, + "logits/rejected": -31558392.0, + "logps/chosen": -293.7705078125, + "logps/rejected": -542.282958984375, + "loss": 0.0161, + "rewards/chosen": 7.196949481964111, + "rewards/margins": 19.90035581588745, + "rewards/rejected": -12.70340633392334, + "step": 2545 + }, + { + "epoch": 0.6370574252470912, + "grad_norm": 1.515625, + "kl": 2.2637782096862793, + "learning_rate": 5e-06, + "logits/chosen": -39512566.4, + "logits/rejected": -68732182.85714285, + "logps/chosen": -424.83955078125, + "logps/rejected": -586.6111886160714, + "loss": 0.0101, + "rewards/chosen": 8.650949096679687, + "rewards/margins": 24.075657871791293, + "rewards/rejected": -15.424708775111608, + "step": 2546 + }, + { + "epoch": 0.637307644188665, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 40297088.0, + "logits/rejected": -62848903.11111111, + "logps/chosen": -545.55546875, + "logps/rejected": -671.9635416666666, + "loss": 0.0227, + "rewards/chosen": 8.619359334309896, + "rewards/margins": 26.290486653645836, + "rewards/rejected": -17.671127319335938, + "step": 2547 + }, + { + "epoch": 0.637557863130239, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37049280.0, + "logits/rejected": -32221056.0, + "logps/chosen": -372.17744584517044, + "logps/rejected": -431.46424278846155, + "loss": 0.0268, + "rewards/chosen": 7.39745400168679, + "rewards/margins": 18.247184273246287, + "rewards/rejected": -10.849730271559496, + "step": 2548 + }, + { + "epoch": 0.6378080820718128, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46406762.666666664, + "logits/rejected": -32089821.866666667, + "logps/chosen": -321.1032443576389, + "logps/rejected": -628.3783203125, + "loss": 0.0422, + "rewards/chosen": 5.7843475341796875, + "rewards/margins": 23.312832641601563, + "rewards/rejected": -17.528485107421876, + "step": 2549 + }, + { + "epoch": 0.6380583010133867, + "grad_norm": 16.875, + "kl": 1.0286548137664795, + "learning_rate": 5e-06, + "logits/chosen": -42210924.307692304, + "logits/rejected": -15148734.545454545, + "logps/chosen": -412.9252178485577, + "logps/rejected": -406.46835049715907, + "loss": 0.0766, + "rewards/chosen": 8.12040064885066, + "rewards/margins": 17.835243011688018, + "rewards/rejected": -9.714842362837357, + "step": 2550 + }, + { + "epoch": 0.6383085199549606, + "grad_norm": 14.75, + "kl": 1.0867408514022827, + "learning_rate": 5e-06, + "logits/chosen": -60261650.28571428, + "logits/rejected": -39730502.4, + "logps/chosen": -365.9883510044643, + "logps/rejected": -697.97705078125, + "loss": 0.057, + "rewards/chosen": 6.718558175223214, + "rewards/margins": 25.85977063860212, + "rewards/rejected": -19.141212463378906, + "step": 2551 + }, + { + "epoch": 0.6385587388965345, + "grad_norm": 0.93359375, + "kl": 2.429473876953125, + "learning_rate": 5e-06, + "logits/chosen": -53061090.13333333, + "logits/rejected": -48269105.777777776, + "logps/chosen": -460.12275390625, + "logps/rejected": -900.0715603298611, + "loss": 0.0027, + "rewards/chosen": 8.950199381510417, + "rewards/margins": 29.09427286783854, + "rewards/rejected": -20.144073486328125, + "step": 2552 + }, + { + "epoch": 0.6388089578381083, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36669571.2, + "logits/rejected": -64002157.71428572, + "logps/chosen": -335.831396484375, + "logps/rejected": -828.0129743303571, + "loss": 0.0498, + "rewards/chosen": 7.682994842529297, + "rewards/margins": 25.14322934831892, + "rewards/rejected": -17.46023450578962, + "step": 2553 + }, + { + "epoch": 0.6390591767796823, + "grad_norm": 3.390625, + "kl": 6.784902095794678, + "learning_rate": 5e-06, + "logits/chosen": -43633937.45454545, + "logits/rejected": -72084475.07692307, + "logps/chosen": -364.6028497869318, + "logps/rejected": -729.9070763221154, + "loss": 0.0309, + "rewards/chosen": 8.639443137428977, + "rewards/margins": 19.99520297817417, + "rewards/rejected": -11.355759840745192, + "step": 2554 + }, + { + "epoch": 0.6393093957212561, + "grad_norm": 5.71875, + "kl": 1.9058170318603516, + "learning_rate": 5e-06, + "logits/chosen": -33871562.666666664, + "logits/rejected": -59478357.333333336, + "logps/chosen": -397.8621419270833, + "logps/rejected": -467.5590006510417, + "loss": 0.0272, + "rewards/chosen": 7.556326548258464, + "rewards/margins": 20.83548863728841, + "rewards/rejected": -13.279162089029947, + "step": 2555 + }, + { + "epoch": 0.63955961466283, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69160774.4, + "logits/rejected": -57966518.85714286, + "logps/chosen": -419.158984375, + "logps/rejected": -613.0360630580357, + "loss": 0.0149, + "rewards/chosen": 7.633789825439453, + "rewards/margins": 22.48558120727539, + "rewards/rejected": -14.851791381835938, + "step": 2556 + }, + { + "epoch": 0.6398098336044038, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34730304.0, + "logits/rejected": -50187769.6, + "logps/chosen": -348.24246651785717, + "logps/rejected": -544.58701171875, + "loss": 0.0341, + "rewards/chosen": 7.814400809151786, + "rewards/margins": 20.10708956037249, + "rewards/rejected": -12.292688751220703, + "step": 2557 + }, + { + "epoch": 0.6400600525459778, + "grad_norm": 1.953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29714656.0, + "logits/rejected": -48258982.4, + "logps/chosen": -420.725830078125, + "logps/rejected": -581.8076171875, + "loss": 0.0136, + "rewards/chosen": 8.815721299913195, + "rewards/margins": 22.09341803656684, + "rewards/rejected": -13.277696736653645, + "step": 2558 + }, + { + "epoch": 0.6403102714875516, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30447932.444444444, + "logits/rejected": -49111176.53333333, + "logps/chosen": -383.51207139756946, + "logps/rejected": -623.5916666666667, + "loss": 0.0194, + "rewards/chosen": 9.108394198947483, + "rewards/margins": 21.439952426486546, + "rewards/rejected": -12.331558227539062, + "step": 2559 + }, + { + "epoch": 0.6405604904291254, + "grad_norm": 15.375, + "kl": 13.128257751464844, + "learning_rate": 5e-06, + "logits/chosen": -56417424.0, + "logits/rejected": -38172380.0, + "logps/chosen": -374.05157470703125, + "logps/rejected": -694.3133544921875, + "loss": 0.0496, + "rewards/chosen": 7.839700698852539, + "rewards/margins": 23.434576988220215, + "rewards/rejected": -15.594876289367676, + "step": 2560 + }, + { + "epoch": 0.6408107093706994, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53447286.4, + "logits/rejected": -39319300.571428575, + "logps/chosen": -350.4548095703125, + "logps/rejected": -612.0949358258929, + "loss": 0.0107, + "rewards/chosen": 6.983589172363281, + "rewards/margins": 20.900030953543528, + "rewards/rejected": -13.916441781180245, + "step": 2561 + }, + { + "epoch": 0.6410609283122732, + "grad_norm": 6.0, + "kl": 7.286403656005859, + "learning_rate": 5e-06, + "logits/chosen": -56404580.571428575, + "logits/rejected": -7504136.0, + "logps/chosen": -516.9685407366071, + "logps/rejected": -596.980810546875, + "loss": 0.0477, + "rewards/chosen": 8.52756336757115, + "rewards/margins": 22.6569212777274, + "rewards/rejected": -14.12935791015625, + "step": 2562 + }, + { + "epoch": 0.6413111472538471, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55910715.07692308, + "logits/rejected": -37080794.18181818, + "logps/chosen": -336.80130709134613, + "logps/rejected": -506.1842151988636, + "loss": 0.0286, + "rewards/chosen": 7.195357689490685, + "rewards/margins": 19.788637441355032, + "rewards/rejected": -12.593279751864346, + "step": 2563 + }, + { + "epoch": 0.641561366195421, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45254087.11111111, + "logits/rejected": -55797064.53333333, + "logps/chosen": -465.9475368923611, + "logps/rejected": -476.24462890625, + "loss": 0.0127, + "rewards/chosen": 9.473346286349827, + "rewards/margins": 22.86783481174045, + "rewards/rejected": -13.394488525390624, + "step": 2564 + }, + { + "epoch": 0.6418115851369949, + "grad_norm": 10.5625, + "kl": 7.268516540527344, + "learning_rate": 5e-06, + "logits/chosen": -48703084.0, + "logits/rejected": -51075600.0, + "logps/chosen": -421.0494384765625, + "logps/rejected": -463.1352233886719, + "loss": 0.0502, + "rewards/chosen": 8.408331871032715, + "rewards/margins": 21.744495391845703, + "rewards/rejected": -13.336163520812988, + "step": 2565 + }, + { + "epoch": 0.6420618040785687, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30143416.0, + "logits/rejected": -16358773.0, + "logps/chosen": -408.65142822265625, + "logps/rejected": -688.597412109375, + "loss": 0.0465, + "rewards/chosen": 7.362784385681152, + "rewards/margins": 27.470458030700684, + "rewards/rejected": -20.10767364501953, + "step": 2566 + }, + { + "epoch": 0.6423120230201427, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54584772.0, + "logits/rejected": -59398960.0, + "logps/chosen": -325.2305603027344, + "logps/rejected": -780.05419921875, + "loss": 0.0364, + "rewards/chosen": 6.76908540725708, + "rewards/margins": 22.965229511260986, + "rewards/rejected": -16.196144104003906, + "step": 2567 + }, + { + "epoch": 0.6425622419617165, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18299721.846153848, + "logits/rejected": -44874088.72727273, + "logps/chosen": -230.95872145432693, + "logps/rejected": -744.8631036931819, + "loss": 0.0441, + "rewards/chosen": 5.834239666278545, + "rewards/margins": 24.437835746711784, + "rewards/rejected": -18.60359608043324, + "step": 2568 + }, + { + "epoch": 0.6428124609032904, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -78592526.22222222, + "logits/rejected": -35172352.0, + "logps/chosen": -445.94135199652777, + "logps/rejected": -443.7384765625, + "loss": 0.0207, + "rewards/chosen": 9.715159098307291, + "rewards/margins": 20.093860880533853, + "rewards/rejected": -10.378701782226562, + "step": 2569 + }, + { + "epoch": 0.6430626798448642, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69234278.4, + "logits/rejected": -37025926.85714286, + "logps/chosen": -354.488525390625, + "logps/rejected": -576.2324916294643, + "loss": 0.0696, + "rewards/chosen": 7.260047912597656, + "rewards/margins": 21.37125723702567, + "rewards/rejected": -14.111209324428014, + "step": 2570 + }, + { + "epoch": 0.6433128987864382, + "grad_norm": 9.0625, + "kl": 8.20822525024414, + "learning_rate": 5e-06, + "logits/chosen": -61277644.8, + "logits/rejected": -47898055.11111111, + "logps/chosen": -368.7809244791667, + "logps/rejected": -461.22645399305554, + "loss": 0.0355, + "rewards/chosen": 7.210947672526042, + "rewards/margins": 16.96502685546875, + "rewards/rejected": -9.754079182942709, + "step": 2571 + }, + { + "epoch": 0.643563117728012, + "grad_norm": 1.9140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 15208137.142857144, + "logits/rejected": -30714652.23529412, + "logps/chosen": -434.591552734375, + "logps/rejected": -570.2740119485294, + "loss": 0.0091, + "rewards/chosen": 7.537310464041574, + "rewards/margins": 21.761845340247916, + "rewards/rejected": -14.224534876206341, + "step": 2572 + }, + { + "epoch": 0.6438133366695858, + "grad_norm": 7.8125, + "kl": 8.893774032592773, + "learning_rate": 5e-06, + "logits/chosen": -62331833.6, + "logits/rejected": -63623762.28571428, + "logps/chosen": -494.33310546875, + "logps/rejected": -761.7556501116071, + "loss": 0.0109, + "rewards/chosen": 8.482386779785156, + "rewards/margins": 27.89463566371373, + "rewards/rejected": -19.412248883928573, + "step": 2573 + }, + { + "epoch": 0.6440635556111598, + "grad_norm": 4.3125, + "kl": 0.5777009725570679, + "learning_rate": 5e-06, + "logits/chosen": -60537403.428571425, + "logits/rejected": -36284515.2, + "logps/chosen": -414.9247349330357, + "logps/rejected": -562.340380859375, + "loss": 0.0374, + "rewards/chosen": 9.768253871372767, + "rewards/margins": 27.31368691580636, + "rewards/rejected": -17.545433044433594, + "step": 2574 + }, + { + "epoch": 0.6443137745527336, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35045336.0, + "logits/rejected": -54628672.0, + "logps/chosen": -523.97021484375, + "logps/rejected": -754.6220703125, + "loss": 0.0091, + "rewards/chosen": 8.328710556030273, + "rewards/margins": 27.313872655232746, + "rewards/rejected": -18.985162099202473, + "step": 2575 + }, + { + "epoch": 0.6445639934943075, + "grad_norm": 29.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73135059.2, + "logits/rejected": -73045385.14285715, + "logps/chosen": -470.2392578125, + "logps/rejected": -774.4449637276786, + "loss": 0.0203, + "rewards/chosen": 8.414547729492188, + "rewards/margins": 28.71714041573661, + "rewards/rejected": -20.30259268624442, + "step": 2576 + }, + { + "epoch": 0.6448142124358814, + "grad_norm": 0.8203125, + "kl": 6.586106777191162, + "learning_rate": 5e-06, + "logits/chosen": -53333492.36363637, + "logits/rejected": -83585860.92307693, + "logps/chosen": -338.6180308948864, + "logps/rejected": -767.6663912259615, + "loss": 0.0091, + "rewards/chosen": 8.525799837979404, + "rewards/margins": 30.99727529245657, + "rewards/rejected": -22.471475454477165, + "step": 2577 + }, + { + "epoch": 0.6450644313774553, + "grad_norm": 21.25, + "kl": 11.7147855758667, + "learning_rate": 5e-06, + "logits/chosen": -37827764.705882356, + "logits/rejected": -49800978.28571428, + "logps/chosen": -289.60305606617646, + "logps/rejected": -701.91845703125, + "loss": 0.1187, + "rewards/chosen": 6.395715152516084, + "rewards/margins": 22.958223871824117, + "rewards/rejected": -16.562508719308035, + "step": 2578 + }, + { + "epoch": 0.6453146503190291, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41999506.28571428, + "logits/rejected": -69891960.47058824, + "logps/chosen": -296.8842075892857, + "logps/rejected": -640.5083869485294, + "loss": 0.0226, + "rewards/chosen": 7.108986445835659, + "rewards/margins": 26.319243583358634, + "rewards/rejected": -19.210257137522976, + "step": 2579 + }, + { + "epoch": 0.6455648692606031, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 18499828.0, + "logits/rejected": -41649420.0, + "logps/chosen": -309.2397155761719, + "logps/rejected": -451.593017578125, + "loss": 0.0302, + "rewards/chosen": 6.208970069885254, + "rewards/margins": 19.026702880859375, + "rewards/rejected": -12.817732810974121, + "step": 2580 + }, + { + "epoch": 0.6458150882021769, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18156586.666666668, + "logits/rejected": -21711742.666666668, + "logps/chosen": -380.1309407552083, + "logps/rejected": -575.7657063802084, + "loss": 0.0244, + "rewards/chosen": 9.304210662841797, + "rewards/margins": 25.16513442993164, + "rewards/rejected": -15.860923767089844, + "step": 2581 + }, + { + "epoch": 0.6460653071437508, + "grad_norm": 0.412109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57841184.0, + "logits/rejected": -71727394.13333334, + "logps/chosen": -327.78602430555554, + "logps/rejected": -674.1600911458333, + "loss": 0.0011, + "rewards/chosen": 8.342170715332031, + "rewards/margins": 26.129833475748697, + "rewards/rejected": -17.787662760416666, + "step": 2582 + }, + { + "epoch": 0.6463155260853246, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65526245.333333336, + "logits/rejected": -41805144.0, + "logps/chosen": -394.0609130859375, + "logps/rejected": -636.5171305338541, + "loss": 0.0529, + "rewards/chosen": 7.537298202514648, + "rewards/margins": 24.29124387105306, + "rewards/rejected": -16.75394566853841, + "step": 2583 + }, + { + "epoch": 0.6465657450268986, + "grad_norm": 27.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41662989.71428572, + "logits/rejected": -55637952.0, + "logps/chosen": -524.7522670200893, + "logps/rejected": -603.49169921875, + "loss": 0.0438, + "rewards/chosen": 8.390419006347656, + "rewards/margins": 22.677143859863282, + "rewards/rejected": -14.286724853515626, + "step": 2584 + }, + { + "epoch": 0.6468159639684724, + "grad_norm": 0.9453125, + "kl": 1.057313323020935, + "learning_rate": 5e-06, + "logits/chosen": -49415648.0, + "logits/rejected": -75937717.33333333, + "logps/chosen": -539.6743977864584, + "logps/rejected": -685.6486002604166, + "loss": 0.0026, + "rewards/chosen": 9.297260284423828, + "rewards/margins": 25.318923950195312, + "rewards/rejected": -16.021663665771484, + "step": 2585 + }, + { + "epoch": 0.6470661829100463, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51800115.2, + "logits/rejected": -64162779.428571425, + "logps/chosen": -408.997265625, + "logps/rejected": -641.6016322544643, + "loss": 0.0075, + "rewards/chosen": 8.410508728027343, + "rewards/margins": 27.354600524902345, + "rewards/rejected": -18.944091796875, + "step": 2586 + }, + { + "epoch": 0.6473164018516202, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41831580.44444445, + "logits/rejected": -66965619.2, + "logps/chosen": -400.2749837239583, + "logps/rejected": -724.1975260416667, + "loss": 0.021, + "rewards/chosen": 7.043175591362847, + "rewards/margins": 26.89600355360243, + "rewards/rejected": -19.852827962239584, + "step": 2587 + }, + { + "epoch": 0.647566620793194, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74825216.0, + "logits/rejected": -52825144.0, + "logps/chosen": -315.0334167480469, + "logps/rejected": -582.433349609375, + "loss": 0.0316, + "rewards/chosen": 8.182498931884766, + "rewards/margins": 24.657867431640625, + "rewards/rejected": -16.47536849975586, + "step": 2588 + }, + { + "epoch": 0.6478168397347679, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37116824.615384616, + "logits/rejected": -41007755.63636363, + "logps/chosen": -270.18075796274036, + "logps/rejected": -716.0687144886364, + "loss": 0.0857, + "rewards/chosen": 4.2695943392240086, + "rewards/margins": 22.22346563272543, + "rewards/rejected": -17.95387129350142, + "step": 2589 + }, + { + "epoch": 0.6480670586763418, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38460898.666666664, + "logits/rejected": -43403832.88888889, + "logps/chosen": -541.3479817708334, + "logps/rejected": -611.3713650173611, + "loss": 0.0083, + "rewards/chosen": 8.231866836547852, + "rewards/margins": 23.593986723158096, + "rewards/rejected": -15.362119886610243, + "step": 2590 + }, + { + "epoch": 0.6483172776179157, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38085765.333333336, + "logits/rejected": -42131280.0, + "logps/chosen": -418.1512858072917, + "logps/rejected": -584.90966796875, + "loss": 0.0307, + "rewards/chosen": 8.975790659586588, + "rewards/margins": 23.90011723836263, + "rewards/rejected": -14.924326578776041, + "step": 2591 + }, + { + "epoch": 0.6485674965594895, + "grad_norm": 20.25, + "kl": 2.9637413024902344, + "learning_rate": 5e-06, + "logits/chosen": -58572352.0, + "logits/rejected": 85506803.2, + "logps/chosen": -247.87142508370536, + "logps/rejected": -681.45712890625, + "loss": 0.0784, + "rewards/chosen": 5.94172123500279, + "rewards/margins": 19.759313092912947, + "rewards/rejected": -13.817591857910156, + "step": 2592 + }, + { + "epoch": 0.6488177155010635, + "grad_norm": 4.53125, + "kl": 0.24083614349365234, + "learning_rate": 5e-06, + "logits/chosen": -66703488.0, + "logits/rejected": -36864576.0, + "logps/chosen": -408.09033203125, + "logps/rejected": -499.81069711538464, + "loss": 0.0575, + "rewards/chosen": 7.743243824351918, + "rewards/margins": 22.046770482630166, + "rewards/rejected": -14.303526658278246, + "step": 2593 + }, + { + "epoch": 0.6490679344426373, + "grad_norm": 11.875, + "kl": 0.5005773305892944, + "learning_rate": 5e-06, + "logits/chosen": -68663125.33333333, + "logits/rejected": -60389319.11111111, + "logps/chosen": -393.26490885416666, + "logps/rejected": -795.6804470486111, + "loss": 0.0527, + "rewards/chosen": 7.1790308634440105, + "rewards/margins": 27.201057773166234, + "rewards/rejected": -20.02202690972222, + "step": 2594 + }, + { + "epoch": 0.6493181533842112, + "grad_norm": 22.75, + "kl": 1.237609624862671, + "learning_rate": 5e-06, + "logits/chosen": -38587463.11111111, + "logits/rejected": -44111364.266666666, + "logps/chosen": -355.4597981770833, + "logps/rejected": -542.9458984375, + "loss": 0.0505, + "rewards/chosen": 8.114183213975695, + "rewards/margins": 21.827253892686635, + "rewards/rejected": -13.713070678710938, + "step": 2595 + }, + { + "epoch": 0.649568372325785, + "grad_norm": 6.875, + "kl": 4.719232082366943, + "learning_rate": 5e-06, + "logits/chosen": -63919808.0, + "logits/rejected": -58837194.666666664, + "logps/chosen": -337.81231689453125, + "logps/rejected": -636.07421875, + "loss": 0.0752, + "rewards/chosen": 7.54776128133138, + "rewards/margins": 19.845111846923828, + "rewards/rejected": -12.297350565592447, + "step": 2596 + }, + { + "epoch": 0.649818591267359, + "grad_norm": 18.125, + "kl": 11.642326354980469, + "learning_rate": 5e-06, + "logits/chosen": -39975397.64705882, + "logits/rejected": -49494884.571428575, + "logps/chosen": -366.72435087316177, + "logps/rejected": -570.8751395089286, + "loss": 0.0718, + "rewards/chosen": 8.154209810144762, + "rewards/margins": 21.989785458861277, + "rewards/rejected": -13.835575648716517, + "step": 2597 + }, + { + "epoch": 0.6500688102089328, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41449570.90909091, + "logits/rejected": -60956957.538461536, + "logps/chosen": -380.1803533380682, + "logps/rejected": -663.2719350961538, + "loss": 0.039, + "rewards/chosen": 8.132714705033736, + "rewards/margins": 27.300188131265706, + "rewards/rejected": -19.16747342623197, + "step": 2598 + }, + { + "epoch": 0.6503190291505067, + "grad_norm": 11.6875, + "kl": 3.1423709392547607, + "learning_rate": 5e-06, + "logits/chosen": -36936561.23076923, + "logits/rejected": -81594624.0, + "logps/chosen": -331.15767728365387, + "logps/rejected": -920.0326704545455, + "loss": 0.0523, + "rewards/chosen": 7.3363811786358175, + "rewards/margins": 34.22248125409747, + "rewards/rejected": -26.88610007546165, + "step": 2599 + }, + { + "epoch": 0.6505692480920806, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53727578.666666664, + "logits/rejected": -41584160.0, + "logps/chosen": -281.21006266276044, + "logps/rejected": -617.6715901692709, + "loss": 0.0219, + "rewards/chosen": 7.237630208333333, + "rewards/margins": 23.007659912109375, + "rewards/rejected": -15.770029703776041, + "step": 2600 + }, + { + "epoch": 0.6508194670336545, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22713586.90909091, + "logits/rejected": -54916745.84615385, + "logps/chosen": -363.50972123579544, + "logps/rejected": -601.8341346153846, + "loss": 0.0154, + "rewards/chosen": 7.86307109485973, + "rewards/margins": 24.438617359508168, + "rewards/rejected": -16.575546264648438, + "step": 2601 + }, + { + "epoch": 0.6510696859752283, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39878627.2, + "logits/rejected": -44275725.71428572, + "logps/chosen": -452.378662109375, + "logps/rejected": -631.1036551339286, + "loss": 0.007, + "rewards/chosen": 9.500386810302734, + "rewards/margins": 25.22063762119838, + "rewards/rejected": -15.720250810895648, + "step": 2602 + }, + { + "epoch": 0.6513199049168023, + "grad_norm": 2.578125, + "kl": 6.551934719085693, + "learning_rate": 5e-06, + "logits/chosen": -37560320.0, + "logits/rejected": -32953948.8, + "logps/chosen": -380.8405064174107, + "logps/rejected": -537.02451171875, + "loss": 0.0408, + "rewards/chosen": 9.16228267124721, + "rewards/margins": 22.43976069859096, + "rewards/rejected": -13.27747802734375, + "step": 2603 + }, + { + "epoch": 0.6515701238583761, + "grad_norm": 10.75, + "kl": 2.7489497661590576, + "learning_rate": 5e-06, + "logits/chosen": -48473190.4, + "logits/rejected": -48778688.0, + "logps/chosen": -272.7330729166667, + "logps/rejected": -877.6409505208334, + "loss": 0.0483, + "rewards/chosen": 7.909400431315104, + "rewards/margins": 34.88739047580295, + "rewards/rejected": -26.977990044487846, + "step": 2604 + }, + { + "epoch": 0.6518203427999499, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33295990.4, + "logits/rejected": -77323437.71428572, + "logps/chosen": -345.87158203125, + "logps/rejected": -832.4060407366071, + "loss": 0.0222, + "rewards/chosen": 7.453889465332031, + "rewards/margins": 29.814868818010602, + "rewards/rejected": -22.360979352678573, + "step": 2605 + }, + { + "epoch": 0.6520705617415238, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34417036.307692304, + "logits/rejected": -48681489.45454545, + "logps/chosen": -199.28910006009616, + "logps/rejected": -614.1506125710227, + "loss": 0.0605, + "rewards/chosen": 5.568749060997596, + "rewards/margins": 22.44240351323481, + "rewards/rejected": -16.873654452237215, + "step": 2606 + }, + { + "epoch": 0.6523207806830977, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62808541.09090909, + "logits/rejected": -29936561.230769232, + "logps/chosen": -391.92649147727275, + "logps/rejected": -619.8834134615385, + "loss": 0.0287, + "rewards/chosen": 9.098356767134232, + "rewards/margins": 23.90413820493471, + "rewards/rejected": -14.80578143780048, + "step": 2607 + }, + { + "epoch": 0.6525709996246716, + "grad_norm": 1.0234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45054949.333333336, + "logits/rejected": -37143936.0, + "logps/chosen": -422.9926350911458, + "logps/rejected": -481.450927734375, + "loss": 0.0235, + "rewards/chosen": 9.71894391377767, + "rewards/margins": 23.11368497212728, + "rewards/rejected": -13.39474105834961, + "step": 2608 + }, + { + "epoch": 0.6528212185662454, + "grad_norm": 4.09375, + "kl": 4.311176776885986, + "learning_rate": 5e-06, + "logits/chosen": -68310714.66666667, + "logits/rejected": -42685448.0, + "logps/chosen": -415.047607421875, + "logps/rejected": -511.8459065755208, + "loss": 0.0219, + "rewards/chosen": 9.17621103922526, + "rewards/margins": 22.7979736328125, + "rewards/rejected": -13.62176259358724, + "step": 2609 + }, + { + "epoch": 0.6530714375078194, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66322656.0, + "logits/rejected": -69696725.33333333, + "logps/chosen": -421.3140869140625, + "logps/rejected": -757.9734700520834, + "loss": 0.044, + "rewards/chosen": 8.70077641805013, + "rewards/margins": 23.71949640909831, + "rewards/rejected": -15.018719991048178, + "step": 2610 + }, + { + "epoch": 0.6533216564493932, + "grad_norm": 2.59375, + "kl": 5.4555487632751465, + "learning_rate": 5e-06, + "logits/chosen": -44096768.0, + "logits/rejected": -18919473.6, + "logps/chosen": -390.1876743861607, + "logps/rejected": -573.32109375, + "loss": 0.006, + "rewards/chosen": 9.067035130092076, + "rewards/margins": 20.599814060756138, + "rewards/rejected": -11.532778930664062, + "step": 2611 + }, + { + "epoch": 0.6535718753909671, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19577036.0, + "logits/rejected": -49247404.0, + "logps/chosen": -321.8989562988281, + "logps/rejected": -562.6668701171875, + "loss": 0.0041, + "rewards/chosen": 7.552963733673096, + "rewards/margins": 23.31929063796997, + "rewards/rejected": -15.766326904296875, + "step": 2612 + }, + { + "epoch": 0.653822094332541, + "grad_norm": 3.921875, + "kl": 6.442388534545898, + "learning_rate": 5e-06, + "logits/chosen": -74628342.15384616, + "logits/rejected": -41449364.36363637, + "logps/chosen": -438.7626953125, + "logps/rejected": -608.5917524857955, + "loss": 0.0099, + "rewards/chosen": 9.71207486666166, + "rewards/margins": 22.128789621633253, + "rewards/rejected": -12.416714754971592, + "step": 2613 + }, + { + "epoch": 0.6540723132741149, + "grad_norm": 2.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51834160.0, + "logits/rejected": -33132392.0, + "logps/chosen": -382.906005859375, + "logps/rejected": -702.3878580729166, + "loss": 0.0201, + "rewards/chosen": 8.743253707885742, + "rewards/margins": 25.969911575317383, + "rewards/rejected": -17.22665786743164, + "step": 2614 + }, + { + "epoch": 0.6543225322156887, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41921804.8, + "logits/rejected": -35741430.85714286, + "logps/chosen": -424.86083984375, + "logps/rejected": -463.12461635044644, + "loss": 0.0334, + "rewards/chosen": 8.369960021972656, + "rewards/margins": 19.491855948311944, + "rewards/rejected": -11.121895926339286, + "step": 2615 + }, + { + "epoch": 0.6545727511572627, + "grad_norm": 3.8125, + "kl": 4.10398006439209, + "learning_rate": 5e-06, + "logits/chosen": -41749198.54545455, + "logits/rejected": -45669518.76923077, + "logps/chosen": -484.33615944602275, + "logps/rejected": -586.2886117788462, + "loss": 0.0316, + "rewards/chosen": 10.513512351296164, + "rewards/margins": 24.996315696022727, + "rewards/rejected": -14.482803344726562, + "step": 2616 + }, + { + "epoch": 0.6548229700988365, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40621316.92307692, + "logits/rejected": -50951941.81818182, + "logps/chosen": -308.72742638221155, + "logps/rejected": -660.2762340198864, + "loss": 0.0441, + "rewards/chosen": 7.26693608210637, + "rewards/margins": 22.049863161740603, + "rewards/rejected": -14.782927079634232, + "step": 2617 + }, + { + "epoch": 0.6550731890404103, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39337940.0, + "logits/rejected": -63302864.0, + "logps/chosen": -480.95306396484375, + "logps/rejected": -740.02587890625, + "loss": 0.0204, + "rewards/chosen": 10.122875213623047, + "rewards/margins": 25.432799339294434, + "rewards/rejected": -15.309924125671387, + "step": 2618 + }, + { + "epoch": 0.6553234079819842, + "grad_norm": 11.1875, + "kl": 8.538969993591309, + "learning_rate": 5e-06, + "logits/chosen": -16812114.666666668, + "logits/rejected": -33431994.666666668, + "logps/chosen": -322.13547770182294, + "logps/rejected": -572.052978515625, + "loss": 0.0858, + "rewards/chosen": 7.46705436706543, + "rewards/margins": 22.789865493774414, + "rewards/rejected": -15.322811126708984, + "step": 2619 + }, + { + "epoch": 0.6555736269235581, + "grad_norm": 5.46875, + "kl": 16.768688201904297, + "learning_rate": 5e-06, + "logits/chosen": -32138568.0, + "logits/rejected": -38580504.0, + "logps/chosen": -355.4034423828125, + "logps/rejected": -440.1414794921875, + "loss": 0.0931, + "rewards/chosen": 8.695669174194336, + "rewards/margins": 19.525391578674316, + "rewards/rejected": -10.82972240447998, + "step": 2620 + }, + { + "epoch": 0.655823845865132, + "grad_norm": 7.5625, + "kl": 3.1007308959960938, + "learning_rate": 5e-06, + "logits/chosen": -77448777.14285715, + "logits/rejected": -38874566.4, + "logps/chosen": -450.631591796875, + "logps/rejected": -582.0001953125, + "loss": 0.0213, + "rewards/chosen": 8.787045070103236, + "rewards/margins": 21.606362697056362, + "rewards/rejected": -12.819317626953126, + "step": 2621 + }, + { + "epoch": 0.6560740648067058, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44684787.2, + "logits/rejected": -42661673.14285714, + "logps/chosen": -328.97255859375, + "logps/rejected": -541.1925223214286, + "loss": 0.0477, + "rewards/chosen": 7.321540069580078, + "rewards/margins": 18.90313208443778, + "rewards/rejected": -11.581592014857701, + "step": 2622 + }, + { + "epoch": 0.6563242837482798, + "grad_norm": 7.21875, + "kl": 8.728328704833984, + "learning_rate": 5e-06, + "logits/chosen": -56993664.0, + "logits/rejected": -47757222.4, + "logps/chosen": -450.7101353236607, + "logps/rejected": -482.53330078125, + "loss": 0.0386, + "rewards/chosen": 9.207982744489398, + "rewards/margins": 20.876314653669084, + "rewards/rejected": -11.668331909179688, + "step": 2623 + }, + { + "epoch": 0.6565745026898536, + "grad_norm": 9.75, + "kl": 3.767920970916748, + "learning_rate": 5e-06, + "logits/chosen": -64791844.571428575, + "logits/rejected": -11844357.6, + "logps/chosen": -320.60986328125, + "logps/rejected": -582.237744140625, + "loss": 0.075, + "rewards/chosen": 6.097769056047712, + "rewards/margins": 23.326014600481308, + "rewards/rejected": -17.228245544433594, + "step": 2624 + }, + { + "epoch": 0.6568247216314275, + "grad_norm": 4.8125, + "kl": 7.098320007324219, + "learning_rate": 5e-06, + "logits/chosen": -74443781.81818181, + "logits/rejected": -34297223.384615384, + "logps/chosen": -432.7678888494318, + "logps/rejected": -487.2122145432692, + "loss": 0.0092, + "rewards/chosen": 10.14233467795632, + "rewards/margins": 25.001911563473147, + "rewards/rejected": -14.859576885516827, + "step": 2625 + }, + { + "epoch": 0.6570749405730014, + "grad_norm": 8.0625, + "kl": 2.5036659240722656, + "learning_rate": 5e-06, + "logits/chosen": -28657366.85714286, + "logits/rejected": -61180857.6, + "logps/chosen": -384.55381556919644, + "logps/rejected": -636.5068359375, + "loss": 0.0547, + "rewards/chosen": 8.068157741001674, + "rewards/margins": 22.446116420200894, + "rewards/rejected": -14.377958679199219, + "step": 2626 + }, + { + "epoch": 0.6573251595145753, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -912398.4, + "logits/rejected": -47735149.71428572, + "logps/chosen": -331.27373046875, + "logps/rejected": -594.779296875, + "loss": 0.0435, + "rewards/chosen": 7.6825714111328125, + "rewards/margins": 22.75413077218192, + "rewards/rejected": -15.071559361049108, + "step": 2627 + }, + { + "epoch": 0.6575753784561491, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33419757.714285713, + "logits/rejected": -50945536.0, + "logps/chosen": -279.91859654017856, + "logps/rejected": -494.073681640625, + "loss": 0.0531, + "rewards/chosen": 6.421715872628348, + "rewards/margins": 18.4920649937221, + "rewards/rejected": -12.07034912109375, + "step": 2628 + }, + { + "epoch": 0.6578255973977231, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21487148.8, + "logits/rejected": 105332103.1111111, + "logps/chosen": -407.3789388020833, + "logps/rejected": -415.9631076388889, + "loss": 0.026, + "rewards/chosen": 7.703874715169271, + "rewards/margins": 22.485836791992188, + "rewards/rejected": -14.781962076822916, + "step": 2629 + }, + { + "epoch": 0.6580758163392969, + "grad_norm": 8.0, + "kl": 5.744670867919922, + "learning_rate": 5e-06, + "logits/chosen": -48434564.92307692, + "logits/rejected": -43658842.18181818, + "logps/chosen": -294.0241511418269, + "logps/rejected": -720.3905362215909, + "loss": 0.0465, + "rewards/chosen": 7.145712045522837, + "rewards/margins": 27.20128732961375, + "rewards/rejected": -20.05557528409091, + "step": 2630 + }, + { + "epoch": 0.6583260352808707, + "grad_norm": 2.859375, + "kl": 10.360231399536133, + "learning_rate": 5e-06, + "logits/chosen": -31647706.181818184, + "logits/rejected": -59683968.0, + "logps/chosen": -467.26265092329544, + "logps/rejected": -525.9380258413462, + "loss": 0.0044, + "rewards/chosen": 10.719039223410867, + "rewards/margins": 23.136323675409066, + "rewards/rejected": -12.417284451998198, + "step": 2631 + }, + { + "epoch": 0.6585762542224446, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53001244.8, + "logits/rejected": -24031844.57142857, + "logps/chosen": -525.626904296875, + "logps/rejected": -836.4875837053571, + "loss": 0.0043, + "rewards/chosen": 10.104798126220704, + "rewards/margins": 30.257202911376954, + "rewards/rejected": -20.15240478515625, + "step": 2632 + }, + { + "epoch": 0.6588264731640185, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44530180.0, + "logits/rejected": -46058580.0, + "logps/chosen": -293.4967346191406, + "logps/rejected": -844.9837646484375, + "loss": 0.0231, + "rewards/chosen": 7.778436183929443, + "rewards/margins": 28.70551824569702, + "rewards/rejected": -20.927082061767578, + "step": 2633 + }, + { + "epoch": 0.6590766921055924, + "grad_norm": 22.75, + "kl": 6.236053466796875, + "learning_rate": 5e-06, + "logits/chosen": -63010933.333333336, + "logits/rejected": -37121733.333333336, + "logps/chosen": -423.2620035807292, + "logps/rejected": -611.0253092447916, + "loss": 0.0658, + "rewards/chosen": 7.813900629679362, + "rewards/margins": 25.391097386678062, + "rewards/rejected": -17.5771967569987, + "step": 2634 + }, + { + "epoch": 0.6593269110471662, + "grad_norm": 5.625, + "kl": 17.54743194580078, + "learning_rate": 5e-06, + "logits/chosen": -51618589.86666667, + "logits/rejected": -53489447.11111111, + "logps/chosen": -367.08470052083334, + "logps/rejected": -655.8415798611111, + "loss": 0.0585, + "rewards/chosen": 8.570930989583333, + "rewards/margins": 22.192786831325954, + "rewards/rejected": -13.621855841742622, + "step": 2635 + }, + { + "epoch": 0.6595771299887402, + "grad_norm": 10.6875, + "kl": 10.473540306091309, + "learning_rate": 5e-06, + "logits/chosen": -34977575.384615384, + "logits/rejected": -54290164.36363637, + "logps/chosen": -367.66372445913464, + "logps/rejected": -696.6633522727273, + "loss": 0.0769, + "rewards/chosen": 8.727881798377403, + "rewards/margins": 25.78831215171547, + "rewards/rejected": -17.060430353338067, + "step": 2636 + }, + { + "epoch": 0.659827348930314, + "grad_norm": 2.65625, + "kl": 18.969594955444336, + "learning_rate": 5e-06, + "logits/chosen": -34904277.333333336, + "logits/rejected": -50602343.11111111, + "logps/chosen": -443.44622395833335, + "logps/rejected": -727.5763346354166, + "loss": 0.1004, + "rewards/chosen": 9.874061075846354, + "rewards/margins": 30.702250501844617, + "rewards/rejected": -20.828189425998264, + "step": 2637 + }, + { + "epoch": 0.6600775678718879, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56556136.72727273, + "logits/rejected": -51331032.615384616, + "logps/chosen": -404.1983753551136, + "logps/rejected": -582.3425480769231, + "loss": 0.031, + "rewards/chosen": 8.684429515491832, + "rewards/margins": 25.402517518797122, + "rewards/rejected": -16.71808800330529, + "step": 2638 + }, + { + "epoch": 0.6603277868134618, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -95580561.45454545, + "logits/rejected": -25735182.769230768, + "logps/chosen": -395.36337002840907, + "logps/rejected": -523.5632136418269, + "loss": 0.021, + "rewards/chosen": 10.181863264604049, + "rewards/margins": 24.509445990715825, + "rewards/rejected": -14.327582726111778, + "step": 2639 + }, + { + "epoch": 0.6605780057550357, + "grad_norm": 10.375, + "kl": 1.0356299877166748, + "learning_rate": 5e-06, + "logits/chosen": -60958498.461538464, + "logits/rejected": -71077952.0, + "logps/chosen": -494.8004807692308, + "logps/rejected": -475.3634144176136, + "loss": 0.0552, + "rewards/chosen": 9.590360788198618, + "rewards/margins": 24.334648452438675, + "rewards/rejected": -14.744287664240057, + "step": 2640 + }, + { + "epoch": 0.6608282246966095, + "grad_norm": 6.875, + "kl": 1.8426475524902344, + "learning_rate": 5e-06, + "logits/chosen": -53475032.615384616, + "logits/rejected": -38617565.09090909, + "logps/chosen": -306.72472205528845, + "logps/rejected": -636.8140092329545, + "loss": 0.047, + "rewards/chosen": 7.489544795109675, + "rewards/margins": 24.836225869772317, + "rewards/rejected": -17.34668107466264, + "step": 2641 + }, + { + "epoch": 0.6610784436381834, + "grad_norm": 0.70703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67584972.8, + "logits/rejected": -34447282.28571428, + "logps/chosen": -416.89189453125, + "logps/rejected": -725.1089564732143, + "loss": 0.0012, + "rewards/chosen": 9.577384948730469, + "rewards/margins": 27.50143323625837, + "rewards/rejected": -17.924048287527903, + "step": 2642 + }, + { + "epoch": 0.6613286625797573, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65972309.333333336, + "logits/rejected": -31151413.333333332, + "logps/chosen": -431.37787543402777, + "logps/rejected": -526.2116536458333, + "loss": 0.0088, + "rewards/chosen": 7.559441460503472, + "rewards/margins": 23.66524692111545, + "rewards/rejected": -16.10580546061198, + "step": 2643 + }, + { + "epoch": 0.6615788815213312, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35267704.0, + "logits/rejected": -77879048.0, + "logps/chosen": -368.94427490234375, + "logps/rejected": -641.0972290039062, + "loss": 0.0259, + "rewards/chosen": 7.364168167114258, + "rewards/margins": 24.97359848022461, + "rewards/rejected": -17.60943031311035, + "step": 2644 + }, + { + "epoch": 0.661829100462905, + "grad_norm": 18.75, + "kl": 12.506287574768066, + "learning_rate": 5e-06, + "logits/chosen": -79812040.0, + "logits/rejected": -57834392.0, + "logps/chosen": -451.6444396972656, + "logps/rejected": -684.90283203125, + "loss": 0.0774, + "rewards/chosen": 8.937474250793457, + "rewards/margins": 30.96648120880127, + "rewards/rejected": -22.029006958007812, + "step": 2645 + }, + { + "epoch": 0.662079319404479, + "grad_norm": 6.46875, + "kl": 5.260871887207031, + "learning_rate": 5e-06, + "logits/chosen": -62451155.692307696, + "logits/rejected": -79673402.18181819, + "logps/chosen": -331.97506009615387, + "logps/rejected": -604.9599609375, + "loss": 0.0453, + "rewards/chosen": 7.36358642578125, + "rewards/margins": 23.182022094726562, + "rewards/rejected": -15.818435668945312, + "step": 2646 + }, + { + "epoch": 0.6623295383460528, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75212317.53846154, + "logits/rejected": -59983901.09090909, + "logps/chosen": -367.33067908653845, + "logps/rejected": -610.1803533380681, + "loss": 0.0153, + "rewards/chosen": 7.38426501934345, + "rewards/margins": 25.03890313635339, + "rewards/rejected": -17.65463811700994, + "step": 2647 + }, + { + "epoch": 0.6625797572876266, + "grad_norm": 7.25, + "kl": 5.380631923675537, + "learning_rate": 5e-06, + "logits/chosen": -59594596.0, + "logits/rejected": -58787132.0, + "logps/chosen": -439.33392333984375, + "logps/rejected": -561.7689208984375, + "loss": 0.0424, + "rewards/chosen": 8.779102325439453, + "rewards/margins": 26.367700576782227, + "rewards/rejected": -17.588598251342773, + "step": 2648 + }, + { + "epoch": 0.6628299762292006, + "grad_norm": 8.625, + "kl": 3.855585813522339, + "learning_rate": 5e-06, + "logits/chosen": -43827140.266666666, + "logits/rejected": -73567402.66666667, + "logps/chosen": -382.0679036458333, + "logps/rejected": -598.9692925347222, + "loss": 0.0373, + "rewards/chosen": 8.954344685872396, + "rewards/margins": 22.907244194878473, + "rewards/rejected": -13.952899509006077, + "step": 2649 + }, + { + "epoch": 0.6630801951707744, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77913609.14285715, + "logits/rejected": -39956329.4117647, + "logps/chosen": -359.77748325892856, + "logps/rejected": -775.2999195772059, + "loss": 0.0164, + "rewards/chosen": 7.372807094029018, + "rewards/margins": 26.760089521648503, + "rewards/rejected": -19.387282427619486, + "step": 2650 + }, + { + "epoch": 0.6633304141123483, + "grad_norm": 0.8515625, + "kl": 2.476182460784912, + "learning_rate": 5e-06, + "logits/chosen": -50669813.333333336, + "logits/rejected": -49240709.333333336, + "logps/chosen": -430.9484049479167, + "logps/rejected": -511.4052327473958, + "loss": 0.0013, + "rewards/chosen": 10.269240697224935, + "rewards/margins": 25.08985201517741, + "rewards/rejected": -14.820611317952475, + "step": 2651 + }, + { + "epoch": 0.6635806330539222, + "grad_norm": 6.15625, + "kl": 5.9361891746521, + "learning_rate": 5e-06, + "logits/chosen": -19159848.727272727, + "logits/rejected": 128537875.6923077, + "logps/chosen": -547.6073774857955, + "logps/rejected": -489.01254507211536, + "loss": 0.0666, + "rewards/chosen": 9.78666132146662, + "rewards/margins": 23.22036775175508, + "rewards/rejected": -13.433706430288462, + "step": 2652 + }, + { + "epoch": 0.6638308519954961, + "grad_norm": 8.8125, + "kl": 0.20814132690429688, + "learning_rate": 5e-06, + "logits/chosen": -62108322.13333333, + "logits/rejected": -56376504.88888889, + "logps/chosen": -351.02565104166666, + "logps/rejected": -807.0352647569445, + "loss": 0.0903, + "rewards/chosen": 6.760453287760416, + "rewards/margins": 26.850464884440104, + "rewards/rejected": -20.090011596679688, + "step": 2653 + }, + { + "epoch": 0.6640810709370699, + "grad_norm": 1.0, + "kl": 4.038382530212402, + "learning_rate": 5e-06, + "logits/chosen": -28677050.181818184, + "logits/rejected": -70025127.38461539, + "logps/chosen": -431.50483842329544, + "logps/rejected": -928.5794771634615, + "loss": 0.0165, + "rewards/chosen": 7.8884499289772725, + "rewards/margins": 29.425836309686407, + "rewards/rejected": -21.537386380709133, + "step": 2654 + }, + { + "epoch": 0.6643312898786438, + "grad_norm": 15.0625, + "kl": 7.838709831237793, + "learning_rate": 5e-06, + "logits/chosen": -51181216.0, + "logits/rejected": -20514586.666666668, + "logps/chosen": -425.201171875, + "logps/rejected": -587.861328125, + "loss": 0.0611, + "rewards/chosen": 10.227081298828125, + "rewards/margins": 24.616216023763023, + "rewards/rejected": -14.389134724934896, + "step": 2655 + }, + { + "epoch": 0.6645815088202177, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44999498.666666664, + "logits/rejected": -32765482.666666668, + "logps/chosen": -475.5964626736111, + "logps/rejected": -602.8986979166667, + "loss": 0.0044, + "rewards/chosen": 10.809618631998697, + "rewards/margins": 25.595509338378903, + "rewards/rejected": -14.785890706380208, + "step": 2656 + }, + { + "epoch": 0.6648317277617916, + "grad_norm": 15.625, + "kl": 6.830526351928711, + "learning_rate": 5e-06, + "logits/chosen": -60633049.6, + "logits/rejected": -46874701.71428572, + "logps/chosen": -407.192236328125, + "logps/rejected": -644.5550362723214, + "loss": 0.0633, + "rewards/chosen": 9.8758544921875, + "rewards/margins": 23.820674351283483, + "rewards/rejected": -13.944819859095983, + "step": 2657 + }, + { + "epoch": 0.6650819467033654, + "grad_norm": 9.1875, + "kl": 4.8259053230285645, + "learning_rate": 5e-06, + "logits/chosen": -45385262.93333333, + "logits/rejected": -40165621.333333336, + "logps/chosen": -409.7986328125, + "logps/rejected": -661.9758029513889, + "loss": 0.0248, + "rewards/chosen": 9.176368204752604, + "rewards/margins": 22.446957397460938, + "rewards/rejected": -13.270589192708334, + "step": 2658 + }, + { + "epoch": 0.6653321656449394, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30742952.727272727, + "logits/rejected": -51226121.84615385, + "logps/chosen": -293.54150390625, + "logps/rejected": -695.4519230769231, + "loss": 0.0261, + "rewards/chosen": 8.150332364169033, + "rewards/margins": 27.299421297086703, + "rewards/rejected": -19.149088932917667, + "step": 2659 + }, + { + "epoch": 0.6655823845865132, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60711330.90909091, + "logits/rejected": -63298180.92307692, + "logps/chosen": -467.28462357954544, + "logps/rejected": -653.1494891826923, + "loss": 0.0241, + "rewards/chosen": 8.606345436789773, + "rewards/margins": 25.339166387811407, + "rewards/rejected": -16.732820951021633, + "step": 2660 + }, + { + "epoch": 0.665832603528087, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41454778.18181818, + "logits/rejected": -42790395.07692308, + "logps/chosen": -254.54814009232953, + "logps/rejected": -736.71875, + "loss": 0.0461, + "rewards/chosen": 4.777910405939275, + "rewards/margins": 21.677552176522205, + "rewards/rejected": -16.89964177058293, + "step": 2661 + }, + { + "epoch": 0.666082822469661, + "grad_norm": 0.78515625, + "kl": 8.616865158081055, + "learning_rate": 5e-06, + "logits/chosen": -59803061.333333336, + "logits/rejected": -49838549.333333336, + "logps/chosen": -396.5406901041667, + "logps/rejected": -802.1786295572916, + "loss": 0.0015, + "rewards/chosen": 10.592288970947266, + "rewards/margins": 28.317846934000652, + "rewards/rejected": -17.725557963053387, + "step": 2662 + }, + { + "epoch": 0.6663330414112348, + "grad_norm": 6.75, + "kl": 7.64814567565918, + "learning_rate": 5e-06, + "logits/chosen": -19633602.46153846, + "logits/rejected": -25652712.727272727, + "logps/chosen": -439.44193209134613, + "logps/rejected": -697.4636008522727, + "loss": 0.0293, + "rewards/chosen": 8.857303325946514, + "rewards/margins": 22.599200908954327, + "rewards/rejected": -13.741897583007812, + "step": 2663 + }, + { + "epoch": 0.6665832603528087, + "grad_norm": 1.515625, + "kl": 15.040311813354492, + "learning_rate": 5e-06, + "logits/chosen": -62735976.0, + "logits/rejected": -42569292.0, + "logps/chosen": -442.345458984375, + "logps/rejected": -526.3292846679688, + "loss": 0.0387, + "rewards/chosen": 10.63322639465332, + "rewards/margins": 24.691940307617188, + "rewards/rejected": -14.058713912963867, + "step": 2664 + }, + { + "epoch": 0.6668334792943826, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49731203.2, + "logits/rejected": -43272740.571428575, + "logps/chosen": -365.801904296875, + "logps/rejected": -525.6661202566964, + "loss": 0.0205, + "rewards/chosen": 9.561897277832031, + "rewards/margins": 21.923970903669087, + "rewards/rejected": -12.362073625837054, + "step": 2665 + }, + { + "epoch": 0.6670836982359565, + "grad_norm": 13.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60050741.333333336, + "logits/rejected": -69448346.66666667, + "logps/chosen": -385.7237955729167, + "logps/rejected": -648.8827311197916, + "loss": 0.0205, + "rewards/chosen": 8.75260861714681, + "rewards/margins": 24.330956141153973, + "rewards/rejected": -15.578347524007162, + "step": 2666 + }, + { + "epoch": 0.6673339171775303, + "grad_norm": 13.125, + "kl": 23.260469436645508, + "learning_rate": 5e-06, + "logits/chosen": -49006668.8, + "logits/rejected": -84739384.8888889, + "logps/chosen": -329.62659505208336, + "logps/rejected": -616.0418294270834, + "loss": 0.1564, + "rewards/chosen": 7.906956481933594, + "rewards/margins": 19.115918816460503, + "rewards/rejected": -11.208962334526909, + "step": 2667 + }, + { + "epoch": 0.6675841361191042, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74539680.0, + "logits/rejected": -66564555.294117644, + "logps/chosen": -542.4593680245536, + "logps/rejected": -537.1803768382352, + "loss": 0.0355, + "rewards/chosen": 12.349825177873884, + "rewards/margins": 27.5173523205669, + "rewards/rejected": -15.167527142693014, + "step": 2668 + }, + { + "epoch": 0.6678343550606781, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32113155.555555556, + "logits/rejected": -53042282.666666664, + "logps/chosen": -316.02392578125, + "logps/rejected": -573.2503255208334, + "loss": 0.0305, + "rewards/chosen": 9.00972154405382, + "rewards/margins": 21.600206163194443, + "rewards/rejected": -12.590484619140625, + "step": 2669 + }, + { + "epoch": 0.668084574002252, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36011229.538461536, + "logits/rejected": -40687918.54545455, + "logps/chosen": -357.0746319110577, + "logps/rejected": -556.7743252840909, + "loss": 0.0204, + "rewards/chosen": 9.447377718411959, + "rewards/margins": 20.97209140804264, + "rewards/rejected": -11.524713689630682, + "step": 2670 + }, + { + "epoch": 0.6683347929438258, + "grad_norm": 16.5, + "kl": 5.842411041259766, + "learning_rate": 5e-06, + "logits/chosen": -50390926.222222224, + "logits/rejected": -50245691.733333334, + "logps/chosen": -372.9076877170139, + "logps/rejected": -652.6373697916666, + "loss": 0.0202, + "rewards/chosen": 8.231065538194445, + "rewards/margins": 23.907410346137155, + "rewards/rejected": -15.676344807942709, + "step": 2671 + }, + { + "epoch": 0.6685850118853998, + "grad_norm": 3.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46119812.571428575, + "logits/rejected": -52400243.2, + "logps/chosen": -438.19559151785717, + "logps/rejected": -553.90595703125, + "loss": 0.021, + "rewards/chosen": 10.678777422223773, + "rewards/margins": 24.0945794241769, + "rewards/rejected": -13.415802001953125, + "step": 2672 + }, + { + "epoch": 0.6688352308269736, + "grad_norm": 5.0, + "kl": 9.487467765808105, + "learning_rate": 5e-06, + "logits/chosen": -37758852.571428575, + "logits/rejected": -32018579.2, + "logps/chosen": -344.4291294642857, + "logps/rejected": -624.848828125, + "loss": 0.0246, + "rewards/chosen": 8.148284912109375, + "rewards/margins": 22.366650390625, + "rewards/rejected": -14.218365478515626, + "step": 2673 + }, + { + "epoch": 0.6690854497685474, + "grad_norm": 7.71875, + "kl": 6.337097644805908, + "learning_rate": 5e-06, + "logits/chosen": -70173829.33333333, + "logits/rejected": -38445720.0, + "logps/chosen": -487.927978515625, + "logps/rejected": -467.0347900390625, + "loss": 0.0239, + "rewards/chosen": 12.398569742838541, + "rewards/margins": 26.636576334635414, + "rewards/rejected": -14.238006591796875, + "step": 2674 + }, + { + "epoch": 0.6693356687101214, + "grad_norm": 12.3125, + "kl": 15.159149169921875, + "learning_rate": 5e-06, + "logits/chosen": -46320230.4, + "logits/rejected": -44069749.333333336, + "logps/chosen": -480.1328125, + "logps/rejected": -767.9574652777778, + "loss": 0.0643, + "rewards/chosen": 10.066046142578125, + "rewards/margins": 25.87298346625434, + "rewards/rejected": -15.806937323676216, + "step": 2675 + }, + { + "epoch": 0.6695858876516952, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17475469.333333332, + "logits/rejected": -31419997.333333332, + "logps/chosen": -403.9899088541667, + "logps/rejected": -749.6282552083334, + "loss": 0.0067, + "rewards/chosen": 8.17645009358724, + "rewards/margins": 21.987686157226562, + "rewards/rejected": -13.811236063639322, + "step": 2676 + }, + { + "epoch": 0.6698361065932691, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27400892.444444444, + "logits/rejected": -20485478.4, + "logps/chosen": -394.02362738715277, + "logps/rejected": -595.7180989583334, + "loss": 0.0108, + "rewards/chosen": 8.875926547580296, + "rewards/margins": 21.959499952528212, + "rewards/rejected": -13.083573404947916, + "step": 2677 + }, + { + "epoch": 0.670086325534843, + "grad_norm": 7.65625, + "kl": 7.0587158203125, + "learning_rate": 5e-06, + "logits/chosen": -39545305.6, + "logits/rejected": -50497549.71428572, + "logps/chosen": -387.5333984375, + "logps/rejected": -535.0796595982143, + "loss": 0.048, + "rewards/chosen": 7.473659515380859, + "rewards/margins": 19.049288613455637, + "rewards/rejected": -11.575629098074776, + "step": 2678 + }, + { + "epoch": 0.6703365444764169, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30359620.0, + "logits/rejected": -41586768.0, + "logps/chosen": -425.899169921875, + "logps/rejected": -637.9644775390625, + "loss": 0.0064, + "rewards/chosen": 8.680935859680176, + "rewards/margins": 25.36396884918213, + "rewards/rejected": -16.683032989501953, + "step": 2679 + }, + { + "epoch": 0.6705867634179907, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57830132.36363637, + "logits/rejected": -40725115.07692308, + "logps/chosen": -362.08180930397725, + "logps/rejected": -680.9657451923077, + "loss": 0.0201, + "rewards/chosen": 9.598480918190695, + "rewards/margins": 26.013969528091536, + "rewards/rejected": -16.41548860990084, + "step": 2680 + }, + { + "epoch": 0.6708369823595646, + "grad_norm": 13.1875, + "kl": 15.36419677734375, + "learning_rate": 5e-06, + "logits/chosen": -46010268.0, + "logits/rejected": -53946824.0, + "logps/chosen": -314.49957275390625, + "logps/rejected": -643.9691772460938, + "loss": 0.0992, + "rewards/chosen": 7.324263572692871, + "rewards/margins": 28.42088031768799, + "rewards/rejected": -21.096616744995117, + "step": 2681 + }, + { + "epoch": 0.6710872013011385, + "grad_norm": 0.8125, + "kl": 1.7444674968719482, + "learning_rate": 5e-06, + "logits/chosen": -39192219.428571425, + "logits/rejected": -33029305.6, + "logps/chosen": -286.675048828125, + "logps/rejected": -538.48330078125, + "loss": 0.0502, + "rewards/chosen": 7.213796888078962, + "rewards/margins": 21.582511792864118, + "rewards/rejected": -14.368714904785156, + "step": 2682 + }, + { + "epoch": 0.6713374202427124, + "grad_norm": 0.73046875, + "kl": 7.757453918457031, + "learning_rate": 5e-06, + "logits/chosen": -61587110.4, + "logits/rejected": -28352715.42857143, + "logps/chosen": -519.221923828125, + "logps/rejected": -551.8696637834821, + "loss": 0.04, + "rewards/chosen": 12.053076934814452, + "rewards/margins": 24.486699022565567, + "rewards/rejected": -12.433622087751116, + "step": 2683 + }, + { + "epoch": 0.6715876391842862, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53596712.72727273, + "logits/rejected": -60716844.307692304, + "logps/chosen": -378.34969815340907, + "logps/rejected": -598.3013070913462, + "loss": 0.0238, + "rewards/chosen": 10.16517500443892, + "rewards/margins": 23.35914942601344, + "rewards/rejected": -13.19397442157452, + "step": 2684 + }, + { + "epoch": 0.6718378581258602, + "grad_norm": 2.75, + "kl": 4.350074768066406, + "learning_rate": 5e-06, + "logits/chosen": -49889644.307692304, + "logits/rejected": -62265774.54545455, + "logps/chosen": -350.44437349759613, + "logps/rejected": -568.1552290482955, + "loss": 0.0496, + "rewards/chosen": 7.237172640286959, + "rewards/margins": 24.26373957920741, + "rewards/rejected": -17.026566938920453, + "step": 2685 + }, + { + "epoch": 0.672088077067434, + "grad_norm": 15.125, + "kl": 20.56463623046875, + "learning_rate": 5e-06, + "logits/chosen": -61212128.0, + "logits/rejected": -34170332.0, + "logps/chosen": -344.35498046875, + "logps/rejected": -810.1317749023438, + "loss": 0.1279, + "rewards/chosen": 7.638521575927735, + "rewards/margins": 33.524467849731444, + "rewards/rejected": -25.88594627380371, + "step": 2686 + }, + { + "epoch": 0.6723382960090079, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43737909.333333336, + "logits/rejected": -55531195.733333334, + "logps/chosen": -399.9912380642361, + "logps/rejected": -653.2701822916666, + "loss": 0.0617, + "rewards/chosen": 6.207358890109592, + "rewards/margins": 23.761277347140844, + "rewards/rejected": -17.55391845703125, + "step": 2687 + }, + { + "epoch": 0.6725885149505818, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70385504.0, + "logits/rejected": -54044954.666666664, + "logps/chosen": -392.708984375, + "logps/rejected": -590.4505208333334, + "loss": 0.0182, + "rewards/chosen": 8.729240417480469, + "rewards/margins": 25.801105499267578, + "rewards/rejected": -17.07186508178711, + "step": 2688 + }, + { + "epoch": 0.6728387338921556, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23533370.181818184, + "logits/rejected": -48382611.692307696, + "logps/chosen": -245.6910067471591, + "logps/rejected": -484.8810847355769, + "loss": 0.0267, + "rewards/chosen": 6.251744357022372, + "rewards/margins": 19.462623169372133, + "rewards/rejected": -13.21087881234976, + "step": 2689 + }, + { + "epoch": 0.6730889528337295, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59945721.6, + "logits/rejected": -25135565.714285713, + "logps/chosen": -553.769580078125, + "logps/rejected": -587.7325613839286, + "loss": 0.0177, + "rewards/chosen": 13.16700439453125, + "rewards/margins": 29.27313145228795, + "rewards/rejected": -16.106127057756698, + "step": 2690 + }, + { + "epoch": 0.6733391717753033, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26749800.0, + "logits/rejected": -60415077.333333336, + "logps/chosen": -319.173583984375, + "logps/rejected": -525.3067626953125, + "loss": 0.045, + "rewards/chosen": 6.673659642537435, + "rewards/margins": 23.84557278951009, + "rewards/rejected": -17.171913146972656, + "step": 2691 + }, + { + "epoch": 0.6735893907168773, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37493735.384615384, + "logits/rejected": -53041326.54545455, + "logps/chosen": -393.5027418870192, + "logps/rejected": -657.62255859375, + "loss": 0.0135, + "rewards/chosen": 8.530552790715145, + "rewards/margins": 29.029251632156907, + "rewards/rejected": -20.49869884144176, + "step": 2692 + }, + { + "epoch": 0.6738396096584511, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55177660.44444445, + "logits/rejected": -66466641.06666667, + "logps/chosen": -421.4803873697917, + "logps/rejected": -626.990234375, + "loss": 0.0324, + "rewards/chosen": 8.147815280490452, + "rewards/margins": 23.37712944878472, + "rewards/rejected": -15.229314168294271, + "step": 2693 + }, + { + "epoch": 0.674089828600025, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34918650.666666664, + "logits/rejected": -54391125.333333336, + "logps/chosen": -304.37522379557294, + "logps/rejected": -627.1848551432291, + "loss": 0.0376, + "rewards/chosen": 7.261203765869141, + "rewards/margins": 22.953077952067055, + "rewards/rejected": -15.691874186197916, + "step": 2694 + }, + { + "epoch": 0.6743400475415989, + "grad_norm": 8.8125, + "kl": 7.613304138183594, + "learning_rate": 5e-06, + "logits/chosen": -32470618.181818184, + "logits/rejected": -54132199.384615384, + "logps/chosen": -296.56977982954544, + "logps/rejected": -462.1472355769231, + "loss": 0.0441, + "rewards/chosen": 6.696493668989702, + "rewards/margins": 18.86148108635749, + "rewards/rejected": -12.164987417367788, + "step": 2695 + }, + { + "epoch": 0.6745902664831728, + "grad_norm": 0.94140625, + "kl": 0.8937060236930847, + "learning_rate": 5e-06, + "logits/chosen": -62771224.0, + "logits/rejected": -45940344.0, + "logps/chosen": -489.0162658691406, + "logps/rejected": -734.91943359375, + "loss": 0.0121, + "rewards/chosen": 8.029244422912598, + "rewards/margins": 26.41169834136963, + "rewards/rejected": -18.38245391845703, + "step": 2696 + }, + { + "epoch": 0.6748404854247466, + "grad_norm": 8.8125, + "kl": 5.343951225280762, + "learning_rate": 5e-06, + "logits/chosen": -78941514.66666667, + "logits/rejected": -38618293.333333336, + "logps/chosen": -383.6469319661458, + "logps/rejected": -751.5849609375, + "loss": 0.1127, + "rewards/chosen": 7.094513575236003, + "rewards/margins": 26.123745600382488, + "rewards/rejected": -19.029232025146484, + "step": 2697 + }, + { + "epoch": 0.6750907043663206, + "grad_norm": 5.4375, + "kl": 9.451787948608398, + "learning_rate": 5e-06, + "logits/chosen": -48769685.333333336, + "logits/rejected": -48950410.666666664, + "logps/chosen": -365.4896240234375, + "logps/rejected": -604.207275390625, + "loss": 0.0841, + "rewards/chosen": 7.637644449869792, + "rewards/margins": 22.695269266764324, + "rewards/rejected": -15.057624816894531, + "step": 2698 + }, + { + "epoch": 0.6753409233078944, + "grad_norm": 6.875, + "kl": 2.1126933097839355, + "learning_rate": 5e-06, + "logits/chosen": -24267421.333333332, + "logits/rejected": -53028144.0, + "logps/chosen": -394.5754801432292, + "logps/rejected": -569.8325602213541, + "loss": 0.0161, + "rewards/chosen": 8.710105895996094, + "rewards/margins": 25.147602081298828, + "rewards/rejected": -16.437496185302734, + "step": 2699 + }, + { + "epoch": 0.6755911422494683, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52480612.571428575, + "logits/rejected": -57639262.11764706, + "logps/chosen": -374.4794921875, + "logps/rejected": -658.4264705882352, + "loss": 0.0313, + "rewards/chosen": 7.2998777117047995, + "rewards/margins": 23.097203294770058, + "rewards/rejected": -15.797325583065257, + "step": 2700 + }, + { + "epoch": 0.6758413611910422, + "grad_norm": 6.71875, + "kl": 10.380943298339844, + "learning_rate": 5e-06, + "logits/chosen": -24314040.0, + "logits/rejected": -51167120.0, + "logps/chosen": -360.02691650390625, + "logps/rejected": -629.2841796875, + "loss": 0.0884, + "rewards/chosen": 6.238309383392334, + "rewards/margins": 23.19775152206421, + "rewards/rejected": -16.959442138671875, + "step": 2701 + }, + { + "epoch": 0.676091580132616, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40188800.0, + "logits/rejected": -40446240.0, + "logps/chosen": -330.7169189453125, + "logps/rejected": -554.795654296875, + "loss": 0.0451, + "rewards/chosen": 7.702105712890625, + "rewards/margins": 21.11817910330636, + "rewards/rejected": -13.416073390415736, + "step": 2702 + }, + { + "epoch": 0.6763417990741899, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37575772.44444445, + "logits/rejected": -13623056.0, + "logps/chosen": -431.8217502170139, + "logps/rejected": -703.9953776041667, + "loss": 0.0131, + "rewards/chosen": 8.958772447374132, + "rewards/margins": 23.674991522894963, + "rewards/rejected": -14.716219075520833, + "step": 2703 + }, + { + "epoch": 0.6765920180157637, + "grad_norm": 11.0, + "kl": 5.6611785888671875, + "learning_rate": 5e-06, + "logits/chosen": -38218434.28571428, + "logits/rejected": -56203916.8, + "logps/chosen": -345.87050083705356, + "logps/rejected": -693.977734375, + "loss": 0.0319, + "rewards/chosen": 7.908814566476004, + "rewards/margins": 24.418912833077567, + "rewards/rejected": -16.510098266601563, + "step": 2704 + }, + { + "epoch": 0.6768422369573377, + "grad_norm": 5.34375, + "kl": 8.256711959838867, + "learning_rate": 5e-06, + "logits/chosen": -43704461.71428572, + "logits/rejected": -38097846.4, + "logps/chosen": -411.94796316964283, + "logps/rejected": -432.823828125, + "loss": 0.0488, + "rewards/chosen": 7.941775730678013, + "rewards/margins": 20.05404990059989, + "rewards/rejected": -12.112274169921875, + "step": 2705 + }, + { + "epoch": 0.6770924558989115, + "grad_norm": 0.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50813346.90909091, + "logits/rejected": -72436007.38461539, + "logps/chosen": -415.6368963068182, + "logps/rejected": -768.1403245192307, + "loss": 0.0129, + "rewards/chosen": 9.202072143554688, + "rewards/margins": 28.58419682429387, + "rewards/rejected": -19.38212468073918, + "step": 2706 + }, + { + "epoch": 0.6773426748404854, + "grad_norm": 8.4375, + "kl": 5.019961833953857, + "learning_rate": 5e-06, + "logits/chosen": -22201600.0, + "logits/rejected": -44908142.222222224, + "logps/chosen": -321.37428385416666, + "logps/rejected": -540.7162543402778, + "loss": 0.0379, + "rewards/chosen": 8.673769124348958, + "rewards/margins": 25.391561211480035, + "rewards/rejected": -16.717792087131077, + "step": 2707 + }, + { + "epoch": 0.6775928937820593, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46433158.4, + "logits/rejected": -47900009.14285714, + "logps/chosen": -394.57080078125, + "logps/rejected": -799.8161272321429, + "loss": 0.0036, + "rewards/chosen": 8.446973419189453, + "rewards/margins": 29.12840042114258, + "rewards/rejected": -20.681427001953125, + "step": 2708 + }, + { + "epoch": 0.6778431127236332, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47518240.0, + "logits/rejected": -36227538.666666664, + "logps/chosen": -294.8662923177083, + "logps/rejected": -536.8291422526041, + "loss": 0.0224, + "rewards/chosen": 7.184755961100261, + "rewards/margins": 23.859935760498047, + "rewards/rejected": -16.675179799397785, + "step": 2709 + }, + { + "epoch": 0.678093331665207, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32002519.272727273, + "logits/rejected": -30734468.923076924, + "logps/chosen": -271.3680974786932, + "logps/rejected": -520.1676307091346, + "loss": 0.0437, + "rewards/chosen": 5.882674477317116, + "rewards/margins": 15.740388616815313, + "rewards/rejected": -9.857714139498198, + "step": 2710 + }, + { + "epoch": 0.678343550606781, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56590912.0, + "logits/rejected": 52505133.71428572, + "logps/chosen": -414.9375, + "logps/rejected": -540.3111049107143, + "loss": 0.0132, + "rewards/chosen": 9.511048126220704, + "rewards/margins": 23.792784227643693, + "rewards/rejected": -14.281736101422991, + "step": 2711 + }, + { + "epoch": 0.6785937695483548, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50736374.15384615, + "logits/rejected": -44572462.54545455, + "logps/chosen": -350.21326622596155, + "logps/rejected": -713.8496981534091, + "loss": 0.0535, + "rewards/chosen": 8.04671360896184, + "rewards/margins": 25.778265732985275, + "rewards/rejected": -17.731552124023438, + "step": 2712 + }, + { + "epoch": 0.6788439884899287, + "grad_norm": 10.75, + "kl": 7.07819128036499, + "learning_rate": 5e-06, + "logits/chosen": -37680578.666666664, + "logits/rejected": -34412413.333333336, + "logps/chosen": -412.0034993489583, + "logps/rejected": -493.3621012369792, + "loss": 0.0659, + "rewards/chosen": 8.202827453613281, + "rewards/margins": 20.922884623209633, + "rewards/rejected": -12.720057169596354, + "step": 2713 + }, + { + "epoch": 0.6790942074315026, + "grad_norm": 6.5625, + "kl": 5.205258846282959, + "learning_rate": 5e-06, + "logits/chosen": -18819800.0, + "logits/rejected": -54122649.6, + "logps/chosen": -335.9092494419643, + "logps/rejected": -764.4919921875, + "loss": 0.0284, + "rewards/chosen": 8.238954271589007, + "rewards/margins": 27.04671031406948, + "rewards/rejected": -18.80775604248047, + "step": 2714 + }, + { + "epoch": 0.6793444263730765, + "grad_norm": 7.25, + "kl": 0.8505653142929077, + "learning_rate": 5e-06, + "logits/chosen": -55554368.0, + "logits/rejected": 20625708.307692308, + "logps/chosen": -369.5399280894886, + "logps/rejected": -424.6378831129808, + "loss": 0.027, + "rewards/chosen": 7.830690557306463, + "rewards/margins": 20.60636053552161, + "rewards/rejected": -12.775669978215145, + "step": 2715 + }, + { + "epoch": 0.6795946453146503, + "grad_norm": 8.0, + "kl": 10.472179412841797, + "learning_rate": 5e-06, + "logits/chosen": -44820272.0, + "logits/rejected": -48400645.333333336, + "logps/chosen": -350.8748372395833, + "logps/rejected": -573.3435872395834, + "loss": 0.0648, + "rewards/chosen": 8.194288889567057, + "rewards/margins": 21.978532155354817, + "rewards/rejected": -13.78424326578776, + "step": 2716 + }, + { + "epoch": 0.6798448642562241, + "grad_norm": 5.875, + "kl": 6.006850242614746, + "learning_rate": 5e-06, + "logits/chosen": -49800218.666666664, + "logits/rejected": 151990048.0, + "logps/chosen": -381.375732421875, + "logps/rejected": -508.2392171223958, + "loss": 0.0178, + "rewards/chosen": 8.715984980265299, + "rewards/margins": 21.662726720174152, + "rewards/rejected": -12.946741739908854, + "step": 2717 + }, + { + "epoch": 0.6800950831977981, + "grad_norm": 13.375, + "kl": 19.644508361816406, + "learning_rate": 5e-06, + "logits/chosen": -61825723.733333334, + "logits/rejected": -81608156.44444445, + "logps/chosen": -349.43792317708335, + "logps/rejected": -827.3313802083334, + "loss": 0.0903, + "rewards/chosen": 7.528719075520834, + "rewards/margins": 28.689302571614583, + "rewards/rejected": -21.16058349609375, + "step": 2718 + }, + { + "epoch": 0.6803453021393719, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10236976.8, + "logits/rejected": 50484251.428571425, + "logps/chosen": -436.89658203125, + "logps/rejected": -622.4107840401786, + "loss": 0.0081, + "rewards/chosen": 12.109484100341797, + "rewards/margins": 21.3252564566476, + "rewards/rejected": -9.215772356305804, + "step": 2719 + }, + { + "epoch": 0.6805955210809458, + "grad_norm": 10.5625, + "kl": 12.579024314880371, + "learning_rate": 5e-06, + "logits/chosen": -100175302.4, + "logits/rejected": -44429449.14285714, + "logps/chosen": -420.9041015625, + "logps/rejected": -625.2189592633929, + "loss": 0.0305, + "rewards/chosen": 9.574626159667968, + "rewards/margins": 22.764858572823663, + "rewards/rejected": -13.190232413155693, + "step": 2720 + }, + { + "epoch": 0.6808457400225197, + "grad_norm": 6.0, + "kl": 4.617887020111084, + "learning_rate": 5e-06, + "logits/chosen": -43769736.53333333, + "logits/rejected": 9159583.111111112, + "logps/chosen": -281.06640625, + "logps/rejected": -411.7307942708333, + "loss": 0.0231, + "rewards/chosen": 7.520711263020833, + "rewards/margins": 17.84979502360026, + "rewards/rejected": -10.329083760579428, + "step": 2721 + }, + { + "epoch": 0.6810959589640936, + "grad_norm": 2.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40860058.18181818, + "logits/rejected": -37956662.15384615, + "logps/chosen": -395.5813654119318, + "logps/rejected": -549.4824969951923, + "loss": 0.0403, + "rewards/chosen": 8.768547058105469, + "rewards/margins": 19.37740443303035, + "rewards/rejected": -10.60885737492488, + "step": 2722 + }, + { + "epoch": 0.6813461779056674, + "grad_norm": 14.25, + "kl": 1.7867101430892944, + "learning_rate": 5e-06, + "logits/chosen": -33256780.8, + "logits/rejected": -51795227.428571425, + "logps/chosen": -356.86904296875, + "logps/rejected": -715.3819056919643, + "loss": 0.0278, + "rewards/chosen": 7.858525085449219, + "rewards/margins": 24.646076965332032, + "rewards/rejected": -16.787551879882812, + "step": 2723 + }, + { + "epoch": 0.6815963968472414, + "grad_norm": 13.4375, + "kl": 14.948885917663574, + "learning_rate": 5e-06, + "logits/chosen": -46877282.461538464, + "logits/rejected": -44689230.54545455, + "logps/chosen": -405.7613055889423, + "logps/rejected": -635.93115234375, + "loss": 0.033, + "rewards/chosen": 9.542803250826323, + "rewards/margins": 21.895449124849762, + "rewards/rejected": -12.352645874023438, + "step": 2724 + }, + { + "epoch": 0.6818466157888152, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48140885.333333336, + "logits/rejected": -27085709.333333332, + "logps/chosen": -359.6389973958333, + "logps/rejected": -782.0182291666666, + "loss": 0.0044, + "rewards/chosen": 8.134845097859701, + "rewards/margins": 21.349106470743816, + "rewards/rejected": -13.214261372884115, + "step": 2725 + }, + { + "epoch": 0.6820968347303891, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20440115.692307692, + "logits/rejected": -30883706.181818184, + "logps/chosen": -304.34560922475964, + "logps/rejected": -613.7752130681819, + "loss": 0.0406, + "rewards/chosen": 7.754799476036658, + "rewards/margins": 21.875210742016773, + "rewards/rejected": -14.120411265980113, + "step": 2726 + }, + { + "epoch": 0.682347053671963, + "grad_norm": 5.625, + "kl": 8.461029052734375, + "learning_rate": 5e-06, + "logits/chosen": -2989726.8571428573, + "logits/rejected": -75557305.6, + "logps/chosen": -403.99658203125, + "logps/rejected": -622.13828125, + "loss": 0.0698, + "rewards/chosen": 9.867722647530693, + "rewards/margins": 22.36346849714007, + "rewards/rejected": -12.495745849609374, + "step": 2727 + }, + { + "epoch": 0.6825972726135369, + "grad_norm": 8.3125, + "kl": 4.878762722015381, + "learning_rate": 5e-06, + "logits/chosen": -47422841.6, + "logits/rejected": -36688630.85714286, + "logps/chosen": -421.531103515625, + "logps/rejected": -552.42138671875, + "loss": 0.0097, + "rewards/chosen": 10.10202407836914, + "rewards/margins": 22.43823405674526, + "rewards/rejected": -12.336209978376116, + "step": 2728 + }, + { + "epoch": 0.6828474915551107, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31755552.0, + "logits/rejected": -30465378.666666668, + "logps/chosen": -410.1354573567708, + "logps/rejected": -638.93603515625, + "loss": 0.0654, + "rewards/chosen": 8.828057607014975, + "rewards/margins": 22.31747817993164, + "rewards/rejected": -13.489420572916666, + "step": 2729 + }, + { + "epoch": 0.6830977104966846, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49744180.0, + "logits/rejected": -33026046.0, + "logps/chosen": -342.1230163574219, + "logps/rejected": -556.806884765625, + "loss": 0.031, + "rewards/chosen": 8.25494384765625, + "rewards/margins": 22.288183212280273, + "rewards/rejected": -14.033239364624023, + "step": 2730 + }, + { + "epoch": 0.6833479294382585, + "grad_norm": 7.625, + "kl": 7.060115814208984, + "learning_rate": 5e-06, + "logits/chosen": -60570116.92307692, + "logits/rejected": -40945297.45454545, + "logps/chosen": -464.50826322115387, + "logps/rejected": -661.791015625, + "loss": 0.0528, + "rewards/chosen": 8.06749021089994, + "rewards/margins": 20.925928876116558, + "rewards/rejected": -12.85843866521662, + "step": 2731 + }, + { + "epoch": 0.6835981483798323, + "grad_norm": 2.84375, + "kl": 14.406478881835938, + "learning_rate": 5e-06, + "logits/chosen": -75973021.0909091, + "logits/rejected": -51239133.538461536, + "logps/chosen": -477.0870472301136, + "logps/rejected": -853.2333233173077, + "loss": 0.0037, + "rewards/chosen": 11.890788685191762, + "rewards/margins": 29.41362052864128, + "rewards/rejected": -17.52283184344952, + "step": 2732 + }, + { + "epoch": 0.6838483673214062, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36164214.15384615, + "logits/rejected": -47110929.45454545, + "logps/chosen": -330.17664513221155, + "logps/rejected": -703.9414950284091, + "loss": 0.0364, + "rewards/chosen": 7.195610633263221, + "rewards/margins": 22.52943964604731, + "rewards/rejected": -15.333829012784092, + "step": 2733 + }, + { + "epoch": 0.6840985862629801, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47301571.2, + "logits/rejected": -47998121.14285714, + "logps/chosen": -359.32333984375, + "logps/rejected": -562.8724888392857, + "loss": 0.0311, + "rewards/chosen": 8.07234878540039, + "rewards/margins": 18.828605978829522, + "rewards/rejected": -10.75625719342913, + "step": 2734 + }, + { + "epoch": 0.684348805204554, + "grad_norm": 2.46875, + "kl": 17.213876724243164, + "learning_rate": 5e-06, + "logits/chosen": -51389038.54545455, + "logits/rejected": -25589176.615384616, + "logps/chosen": -485.08860085227275, + "logps/rejected": -564.5911959134615, + "loss": 0.0057, + "rewards/chosen": 13.2022441517223, + "rewards/margins": 24.74373045167723, + "rewards/rejected": -11.541486299954927, + "step": 2735 + }, + { + "epoch": 0.6845990241461278, + "grad_norm": 5.25, + "kl": 18.86608123779297, + "learning_rate": 5e-06, + "logits/chosen": -60093224.72727273, + "logits/rejected": -33090503.384615384, + "logps/chosen": -399.2390802556818, + "logps/rejected": -602.8638070913462, + "loss": 0.048, + "rewards/chosen": 9.964379744096236, + "rewards/margins": 25.582895932497678, + "rewards/rejected": -15.618516188401442, + "step": 2736 + }, + { + "epoch": 0.6848492430877018, + "grad_norm": 13.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40937245.09090909, + "logits/rejected": -54273068.307692304, + "logps/chosen": -410.59889914772725, + "logps/rejected": -668.7396334134615, + "loss": 0.0362, + "rewards/chosen": 8.760176225142045, + "rewards/margins": 23.91406111283736, + "rewards/rejected": -15.153884887695312, + "step": 2737 + }, + { + "epoch": 0.6850994620292756, + "grad_norm": 7.09375, + "kl": 5.138020992279053, + "learning_rate": 5e-06, + "logits/chosen": -57330530.461538464, + "logits/rejected": -44201844.36363637, + "logps/chosen": -371.63393930288464, + "logps/rejected": -476.93581321022725, + "loss": 0.059, + "rewards/chosen": 8.582878699669472, + "rewards/margins": 22.593139541732683, + "rewards/rejected": -14.01026084206321, + "step": 2738 + }, + { + "epoch": 0.6853496809708495, + "grad_norm": 14.5, + "kl": 4.2702860832214355, + "learning_rate": 5e-06, + "logits/chosen": -55323509.333333336, + "logits/rejected": -42431802.666666664, + "logps/chosen": -425.566162109375, + "logps/rejected": -580.7478841145834, + "loss": 0.0264, + "rewards/chosen": 9.561581293741861, + "rewards/margins": 21.445405960083008, + "rewards/rejected": -11.883824666341146, + "step": 2739 + }, + { + "epoch": 0.6855998999124233, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18671196.307692308, + "logits/rejected": -81193216.0, + "logps/chosen": -249.00026292067307, + "logps/rejected": -791.5316051136364, + "loss": 0.0473, + "rewards/chosen": 6.072866586538462, + "rewards/margins": 24.81621098018193, + "rewards/rejected": -18.743344393643465, + "step": 2740 + }, + { + "epoch": 0.6858501188539973, + "grad_norm": 13.3125, + "kl": 0.9031311869621277, + "learning_rate": 5e-06, + "logits/chosen": -48370715.428571425, + "logits/rejected": -32564150.4, + "logps/chosen": -427.9457310267857, + "logps/rejected": -520.09013671875, + "loss": 0.0439, + "rewards/chosen": 9.08301762172154, + "rewards/margins": 22.744878278459822, + "rewards/rejected": -13.66186065673828, + "step": 2741 + }, + { + "epoch": 0.6861003377955711, + "grad_norm": 2.90625, + "kl": 7.487807273864746, + "learning_rate": 5e-06, + "logits/chosen": -29795060.363636363, + "logits/rejected": -38814144.0, + "logps/chosen": -363.45174893465907, + "logps/rejected": -528.5535606971154, + "loss": 0.0531, + "rewards/chosen": 9.169233842329545, + "rewards/margins": 21.515407642284472, + "rewards/rejected": -12.346173799954927, + "step": 2742 + }, + { + "epoch": 0.686350556737145, + "grad_norm": 8.8125, + "kl": 17.490646362304688, + "learning_rate": 5e-06, + "logits/chosen": -43463378.28571428, + "logits/rejected": -46273891.2, + "logps/chosen": -373.85215541294644, + "logps/rejected": -520.4099609375, + "loss": 0.0546, + "rewards/chosen": 9.47555650983538, + "rewards/margins": 21.29911989484515, + "rewards/rejected": -11.823563385009766, + "step": 2743 + }, + { + "epoch": 0.6866007756787189, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46575138.461538464, + "logits/rejected": -95614830.54545455, + "logps/chosen": -410.03267728365387, + "logps/rejected": -570.6364524147727, + "loss": 0.0393, + "rewards/chosen": 8.750277005709135, + "rewards/margins": 21.594545804537262, + "rewards/rejected": -12.844268798828125, + "step": 2744 + }, + { + "epoch": 0.6868509946202928, + "grad_norm": 2.9375, + "kl": 3.0914268493652344, + "learning_rate": 5e-06, + "logits/chosen": -65610368.0, + "logits/rejected": -44630304.0, + "logps/chosen": -309.8809407552083, + "logps/rejected": -646.1754557291666, + "loss": 0.014, + "rewards/chosen": 8.729279200236002, + "rewards/margins": 25.35164451599121, + "rewards/rejected": -16.622365315755207, + "step": 2745 + }, + { + "epoch": 0.6871012135618666, + "grad_norm": 6.75, + "kl": 9.335000991821289, + "learning_rate": 5e-06, + "logits/chosen": -51554248.0, + "logits/rejected": -72196352.0, + "logps/chosen": -386.39996337890625, + "logps/rejected": -755.353271484375, + "loss": 0.0896, + "rewards/chosen": 8.532389640808105, + "rewards/margins": 27.59816265106201, + "rewards/rejected": -19.065773010253906, + "step": 2746 + }, + { + "epoch": 0.6873514325034406, + "grad_norm": 7.8125, + "kl": 9.820812225341797, + "learning_rate": 5e-06, + "logits/chosen": -55165445.81818182, + "logits/rejected": -36148957.538461536, + "logps/chosen": -395.64035866477275, + "logps/rejected": -653.5318509615385, + "loss": 0.0145, + "rewards/chosen": 7.758229342373935, + "rewards/margins": 25.054785081556627, + "rewards/rejected": -17.296555739182693, + "step": 2747 + }, + { + "epoch": 0.6876016514450144, + "grad_norm": 1.2265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41252877.333333336, + "logits/rejected": -57992848.0, + "logps/chosen": -497.4013671875, + "logps/rejected": -600.0843912760416, + "loss": 0.0034, + "rewards/chosen": 9.65780258178711, + "rewards/margins": 27.494061787923176, + "rewards/rejected": -17.836259206136067, + "step": 2748 + }, + { + "epoch": 0.6878518703865882, + "grad_norm": 0.55859375, + "kl": 10.487970352172852, + "learning_rate": 5e-06, + "logits/chosen": -58947251.2, + "logits/rejected": -46053888.0, + "logps/chosen": -419.1844075520833, + "logps/rejected": -577.2306857638889, + "loss": 0.001, + "rewards/chosen": 11.291046142578125, + "rewards/margins": 27.883182101779514, + "rewards/rejected": -16.59213595920139, + "step": 2749 + }, + { + "epoch": 0.6881020893281622, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28783881.14285714, + "logits/rejected": -25884744.0, + "logps/chosen": -337.6700962611607, + "logps/rejected": -520.97412109375, + "loss": 0.0434, + "rewards/chosen": 5.522030966622489, + "rewards/margins": 23.388629477364674, + "rewards/rejected": -17.866598510742186, + "step": 2750 + }, + { + "epoch": 0.688352308269736, + "grad_norm": 16.5, + "kl": 2.799732208251953, + "learning_rate": 5e-06, + "logits/chosen": -54002960.0, + "logits/rejected": -52608565.333333336, + "logps/chosen": -388.6888020833333, + "logps/rejected": -634.1927083333334, + "loss": 0.0706, + "rewards/chosen": 8.436592737833658, + "rewards/margins": 18.89079984029134, + "rewards/rejected": -10.454207102457682, + "step": 2751 + }, + { + "epoch": 0.6886025272113099, + "grad_norm": 8.75, + "kl": 10.986352920532227, + "learning_rate": 5e-06, + "logits/chosen": -46295213.71428572, + "logits/rejected": -34509945.6, + "logps/chosen": -336.02127511160717, + "logps/rejected": -552.241748046875, + "loss": 0.0562, + "rewards/chosen": 8.029896327427455, + "rewards/margins": 22.419825526646207, + "rewards/rejected": -14.38992919921875, + "step": 2752 + }, + { + "epoch": 0.6888527461528837, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37182549.333333336, + "logits/rejected": -52441898.666666664, + "logps/chosen": -291.1960720486111, + "logps/rejected": -711.1141927083333, + "loss": 0.0306, + "rewards/chosen": 7.662573072645399, + "rewards/margins": 22.63199039035373, + "rewards/rejected": -14.969417317708333, + "step": 2753 + }, + { + "epoch": 0.6891029650944577, + "grad_norm": 2.125, + "kl": 1.9093310832977295, + "learning_rate": 5e-06, + "logits/chosen": -71006865.45454545, + "logits/rejected": -30945504.0, + "logps/chosen": -373.26242897727275, + "logps/rejected": -785.4478665865385, + "loss": 0.0225, + "rewards/chosen": 9.412776600230824, + "rewards/margins": 29.724180314924332, + "rewards/rejected": -20.31140371469351, + "step": 2754 + }, + { + "epoch": 0.6893531840360315, + "grad_norm": 1.5078125, + "kl": 0.45543450117111206, + "learning_rate": 5e-06, + "logits/chosen": -34418460.8, + "logits/rejected": -37040880.0, + "logps/chosen": -415.85166015625, + "logps/rejected": -450.4140625, + "loss": 0.0044, + "rewards/chosen": 10.450902557373047, + "rewards/margins": 20.677615465436663, + "rewards/rejected": -10.226712908063616, + "step": 2755 + }, + { + "epoch": 0.6896034029776054, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32289712.0, + "logits/rejected": -49966096.0, + "logps/chosen": -334.87583414713544, + "logps/rejected": -663.6419270833334, + "loss": 0.0388, + "rewards/chosen": 8.549997329711914, + "rewards/margins": 23.470539728800453, + "rewards/rejected": -14.920542399088541, + "step": 2756 + }, + { + "epoch": 0.6898536219191793, + "grad_norm": 13.4375, + "kl": 2.6487064361572266, + "learning_rate": 5e-06, + "logits/chosen": -39437602.13333333, + "logits/rejected": -52504408.88888889, + "logps/chosen": -296.668359375, + "logps/rejected": -807.7300347222222, + "loss": 0.0736, + "rewards/chosen": 6.305873107910156, + "rewards/margins": 26.28632320827908, + "rewards/rejected": -19.980450100368923, + "step": 2757 + }, + { + "epoch": 0.6901038408607532, + "grad_norm": 8.1875, + "kl": 4.755486488342285, + "learning_rate": 5e-06, + "logits/chosen": -45840162.90909091, + "logits/rejected": -78345708.3076923, + "logps/chosen": -315.24995561079544, + "logps/rejected": -722.7267127403846, + "loss": 0.1037, + "rewards/chosen": 7.522139115767046, + "rewards/margins": 26.40235762162642, + "rewards/rejected": -18.880218505859375, + "step": 2758 + }, + { + "epoch": 0.690354059802327, + "grad_norm": 3.75, + "kl": 3.564873695373535, + "learning_rate": 5e-06, + "logits/chosen": -35138190.76923077, + "logits/rejected": -24876442.181818184, + "logps/chosen": -311.42001577524036, + "logps/rejected": -491.4080699573864, + "loss": 0.0484, + "rewards/chosen": 7.846774761493389, + "rewards/margins": 21.933150604888276, + "rewards/rejected": -14.086375843394887, + "step": 2759 + }, + { + "epoch": 0.690604278743901, + "grad_norm": 3.546875, + "kl": 10.656320571899414, + "learning_rate": 5e-06, + "logits/chosen": -30395949.714285713, + "logits/rejected": -28982515.2, + "logps/chosen": -337.8095005580357, + "logps/rejected": -498.26650390625, + "loss": 0.0615, + "rewards/chosen": 8.413687569754464, + "rewards/margins": 20.957042367117744, + "rewards/rejected": -12.543354797363282, + "step": 2760 + }, + { + "epoch": 0.6908544976854748, + "grad_norm": 5.0625, + "kl": 5.239138603210449, + "learning_rate": 5e-06, + "logits/chosen": -54257216.0, + "logits/rejected": -1798210.4, + "logps/chosen": -394.43491908482144, + "logps/rejected": -637.791259765625, + "loss": 0.0366, + "rewards/chosen": 9.841561453683036, + "rewards/margins": 23.799882071358816, + "rewards/rejected": -13.958320617675781, + "step": 2761 + }, + { + "epoch": 0.6911047166270486, + "grad_norm": 25.5, + "kl": 9.283768653869629, + "learning_rate": 5e-06, + "logits/chosen": -16548397.866666667, + "logits/rejected": -46309724.44444445, + "logps/chosen": -343.87194010416664, + "logps/rejected": -521.0581597222222, + "loss": 0.0976, + "rewards/chosen": 6.753886922200521, + "rewards/margins": 21.572693888346354, + "rewards/rejected": -14.818806966145834, + "step": 2762 + }, + { + "epoch": 0.6913549355686226, + "grad_norm": 9.5625, + "kl": 1.26590096950531, + "learning_rate": 5e-06, + "logits/chosen": -45286950.4, + "logits/rejected": 39504205.71428572, + "logps/chosen": -368.120263671875, + "logps/rejected": -511.7056361607143, + "loss": 0.0513, + "rewards/chosen": 8.549004364013673, + "rewards/margins": 19.419579315185548, + "rewards/rejected": -10.870574951171875, + "step": 2763 + }, + { + "epoch": 0.6916051545101964, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45966784.0, + "logits/rejected": -32754284.0, + "logps/chosen": -372.1645812988281, + "logps/rejected": -514.0689697265625, + "loss": 0.005, + "rewards/chosen": 10.397870063781738, + "rewards/margins": 24.037297248840332, + "rewards/rejected": -13.639427185058594, + "step": 2764 + }, + { + "epoch": 0.6918553734517703, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32034685.09090909, + "logits/rejected": -59331165.538461536, + "logps/chosen": -334.6779119318182, + "logps/rejected": -704.5286959134615, + "loss": 0.0375, + "rewards/chosen": 8.87028642134233, + "rewards/margins": 26.241751077291852, + "rewards/rejected": -17.37146465594952, + "step": 2765 + }, + { + "epoch": 0.6921055923933441, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32987436.8, + "logits/rejected": -44203186.28571428, + "logps/chosen": -313.8713134765625, + "logps/rejected": -605.2614397321429, + "loss": 0.0175, + "rewards/chosen": 7.5282035827636715, + "rewards/margins": 22.637847791399274, + "rewards/rejected": -15.109644208635602, + "step": 2766 + }, + { + "epoch": 0.6923558113349181, + "grad_norm": 5.875, + "kl": 0.09024810791015625, + "learning_rate": 5e-06, + "logits/chosen": -70322333.0909091, + "logits/rejected": -2981575.3846153845, + "logps/chosen": -403.05708451704544, + "logps/rejected": -724.8869441105769, + "loss": 0.0094, + "rewards/chosen": 8.918341203169389, + "rewards/margins": 27.123020252147754, + "rewards/rejected": -18.204679048978367, + "step": 2767 + }, + { + "epoch": 0.6926060302764919, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76205718.85714285, + "logits/rejected": -45852544.0, + "logps/chosen": -386.57223074776783, + "logps/rejected": -641.0960477941177, + "loss": 0.0222, + "rewards/chosen": 7.447065080915179, + "rewards/margins": 24.392646853663344, + "rewards/rejected": -16.945581772748163, + "step": 2768 + }, + { + "epoch": 0.6928562492180658, + "grad_norm": 1.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38298542.54545455, + "logits/rejected": -68176374.15384616, + "logps/chosen": -384.20143821022725, + "logps/rejected": -841.4265324519231, + "loss": 0.0018, + "rewards/chosen": 8.810086337002842, + "rewards/margins": 33.44242229328289, + "rewards/rejected": -24.63233595628005, + "step": 2769 + }, + { + "epoch": 0.6931064681596397, + "grad_norm": 6.125, + "kl": 4.731883525848389, + "learning_rate": 5e-06, + "logits/chosen": -73563834.18181819, + "logits/rejected": -50649875.692307696, + "logps/chosen": -431.53715376420456, + "logps/rejected": -483.5956280048077, + "loss": 0.0359, + "rewards/chosen": 9.157230723987926, + "rewards/margins": 20.216028707010764, + "rewards/rejected": -11.058797983022837, + "step": 2770 + }, + { + "epoch": 0.6933566871012136, + "grad_norm": 2.828125, + "kl": 1.4986610412597656, + "learning_rate": 5e-06, + "logits/chosen": -39752548.92307692, + "logits/rejected": -58983778.90909091, + "logps/chosen": -363.0218975360577, + "logps/rejected": -634.2046786221591, + "loss": 0.0629, + "rewards/chosen": 7.548504169170673, + "rewards/margins": 22.409459147419962, + "rewards/rejected": -14.86095497824929, + "step": 2771 + }, + { + "epoch": 0.6936069060427874, + "grad_norm": 12.375, + "kl": 12.104829788208008, + "learning_rate": 5e-06, + "logits/chosen": -16362048.0, + "logits/rejected": -51085898.666666664, + "logps/chosen": -395.4920247395833, + "logps/rejected": -705.8592122395834, + "loss": 0.0344, + "rewards/chosen": 8.538567225138346, + "rewards/margins": 27.92913373311361, + "rewards/rejected": -19.39056650797526, + "step": 2772 + }, + { + "epoch": 0.6938571249843614, + "grad_norm": 0.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57311668.36363637, + "logits/rejected": -34101346.461538464, + "logps/chosen": -480.12167080965907, + "logps/rejected": -674.0183293269231, + "loss": 0.0016, + "rewards/chosen": 11.783626209605824, + "rewards/margins": 27.319266045843804, + "rewards/rejected": -15.53563983623798, + "step": 2773 + }, + { + "epoch": 0.6941073439259352, + "grad_norm": 20.375, + "kl": 4.801861763000488, + "learning_rate": 5e-06, + "logits/chosen": -18392203.29411765, + "logits/rejected": -26590281.14285714, + "logps/chosen": -395.09349149816177, + "logps/rejected": -518.1199776785714, + "loss": 0.0869, + "rewards/chosen": 7.881542430204504, + "rewards/margins": 22.602851547112987, + "rewards/rejected": -14.721309116908483, + "step": 2774 + }, + { + "epoch": 0.694357562867509, + "grad_norm": 1.71875, + "kl": 7.473282814025879, + "learning_rate": 5e-06, + "logits/chosen": -30776266.666666668, + "logits/rejected": -11937741.333333334, + "logps/chosen": -416.1825764973958, + "logps/rejected": -637.2902425130209, + "loss": 0.0343, + "rewards/chosen": 8.383967081705729, + "rewards/margins": 21.363797505696613, + "rewards/rejected": -12.979830423990885, + "step": 2775 + }, + { + "epoch": 0.694607781809083, + "grad_norm": 11.625, + "kl": 7.142127990722656, + "learning_rate": 5e-06, + "logits/chosen": -35835084.8, + "logits/rejected": -50254346.666666664, + "logps/chosen": -370.1675130208333, + "logps/rejected": -735.6273328993055, + "loss": 0.0695, + "rewards/chosen": 7.596914672851563, + "rewards/margins": 26.12161187065972, + "rewards/rejected": -18.52469719780816, + "step": 2776 + }, + { + "epoch": 0.6948580007506568, + "grad_norm": 4.0, + "kl": 3.1983802318573, + "learning_rate": 5e-06, + "logits/chosen": -23777403.076923076, + "logits/rejected": -33873576.72727273, + "logps/chosen": -355.99643179086536, + "logps/rejected": -508.69753196022725, + "loss": 0.0225, + "rewards/chosen": 8.205616290752705, + "rewards/margins": 24.125349511633388, + "rewards/rejected": -15.919733220880682, + "step": 2777 + }, + { + "epoch": 0.6951082196922307, + "grad_norm": 14.875, + "kl": 12.318202018737793, + "learning_rate": 5e-06, + "logits/chosen": -55467181.71428572, + "logits/rejected": -53353241.6, + "logps/chosen": -401.0322963169643, + "logps/rejected": -762.9701171875, + "loss": 0.123, + "rewards/chosen": 7.3664736066545755, + "rewards/margins": 22.87324698311942, + "rewards/rejected": -15.506773376464844, + "step": 2778 + }, + { + "epoch": 0.6953584386338045, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -104850915.55555555, + "logits/rejected": -60264029.86666667, + "logps/chosen": -479.99370659722223, + "logps/rejected": -569.1280598958333, + "loss": 0.0073, + "rewards/chosen": 8.949535793728298, + "rewards/margins": 25.47959967719184, + "rewards/rejected": -16.53006388346354, + "step": 2779 + }, + { + "epoch": 0.6956086575753785, + "grad_norm": 13.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27184192.0, + "logits/rejected": -38193448.0, + "logps/chosen": -237.35748291015625, + "logps/rejected": -626.9056396484375, + "loss": 0.0513, + "rewards/chosen": 4.784920692443848, + "rewards/margins": 22.4048490524292, + "rewards/rejected": -17.61992835998535, + "step": 2780 + }, + { + "epoch": 0.6958588765169523, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35130940.44444445, + "logits/rejected": -24819801.6, + "logps/chosen": -376.9291720920139, + "logps/rejected": -480.51100260416666, + "loss": 0.0206, + "rewards/chosen": 8.273162841796875, + "rewards/margins": 19.74463907877604, + "rewards/rejected": -11.471476236979166, + "step": 2781 + }, + { + "epoch": 0.6961090954585262, + "grad_norm": 3.078125, + "kl": 3.083484649658203, + "learning_rate": 5e-06, + "logits/chosen": -56633291.294117644, + "logits/rejected": 67303872.0, + "logps/chosen": -352.5002010569853, + "logps/rejected": -534.7555454799107, + "loss": 0.0573, + "rewards/chosen": 7.823635325712316, + "rewards/margins": 19.71793551404937, + "rewards/rejected": -11.894300188337054, + "step": 2782 + }, + { + "epoch": 0.6963593144001001, + "grad_norm": 4.40625, + "kl": 17.497352600097656, + "learning_rate": 5e-06, + "logits/chosen": -61135563.294117644, + "logits/rejected": -67860630.85714285, + "logps/chosen": -427.1989315257353, + "logps/rejected": -648.0020926339286, + "loss": 0.0742, + "rewards/chosen": 8.893754846909466, + "rewards/margins": 25.07725486434808, + "rewards/rejected": -16.183500017438615, + "step": 2783 + }, + { + "epoch": 0.696609533341674, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30141705.846153848, + "logits/rejected": 1587419.6363636365, + "logps/chosen": -351.3662860576923, + "logps/rejected": -696.5416370738636, + "loss": 0.0336, + "rewards/chosen": 7.115438608022837, + "rewards/margins": 24.61346744990849, + "rewards/rejected": -17.498028841885652, + "step": 2784 + }, + { + "epoch": 0.6968597522832478, + "grad_norm": 10.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31959434.666666668, + "logits/rejected": -57100437.333333336, + "logps/chosen": -362.876220703125, + "logps/rejected": -546.749267578125, + "loss": 0.0633, + "rewards/chosen": 9.253252029418945, + "rewards/margins": 21.924666722615562, + "rewards/rejected": -12.671414693196615, + "step": 2785 + }, + { + "epoch": 0.6971099712248218, + "grad_norm": 10.375, + "kl": 3.1979804039001465, + "learning_rate": 5e-06, + "logits/chosen": -44869642.666666664, + "logits/rejected": -17174326.666666668, + "logps/chosen": -325.86204020182294, + "logps/rejected": -797.40087890625, + "loss": 0.0255, + "rewards/chosen": 8.714005788167318, + "rewards/margins": 26.19175593058268, + "rewards/rejected": -17.477750142415363, + "step": 2786 + }, + { + "epoch": 0.6973601901663956, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28893401.6, + "logits/rejected": -68220626.28571428, + "logps/chosen": -277.379443359375, + "logps/rejected": -609.3088727678571, + "loss": 0.0576, + "rewards/chosen": 7.005789184570313, + "rewards/margins": 20.82567116873605, + "rewards/rejected": -13.819881984165736, + "step": 2787 + }, + { + "epoch": 0.6976104091079695, + "grad_norm": 5.75, + "kl": 14.049592971801758, + "learning_rate": 5e-06, + "logits/chosen": -63613696.0, + "logits/rejected": -32537450.666666668, + "logps/chosen": -485.440673828125, + "logps/rejected": -557.6419270833334, + "loss": 0.0418, + "rewards/chosen": 9.490630467732748, + "rewards/margins": 22.89941469828288, + "rewards/rejected": -13.40878423055013, + "step": 2788 + }, + { + "epoch": 0.6978606280495433, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21913449.14285714, + "logits/rejected": -50411008.0, + "logps/chosen": -279.99154227120533, + "logps/rejected": -496.7777458639706, + "loss": 0.0432, + "rewards/chosen": 7.580116271972656, + "rewards/margins": 19.15657896154067, + "rewards/rejected": -11.576462689568014, + "step": 2789 + }, + { + "epoch": 0.6981108469911173, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39148781.71428572, + "logits/rejected": -35425564.8, + "logps/chosen": -305.31260463169644, + "logps/rejected": -505.523681640625, + "loss": 0.0261, + "rewards/chosen": 7.639545440673828, + "rewards/margins": 21.49573745727539, + "rewards/rejected": -13.856192016601563, + "step": 2790 + }, + { + "epoch": 0.6983610659326911, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58891893.333333336, + "logits/rejected": -44858269.86666667, + "logps/chosen": -420.4375, + "logps/rejected": -623.8641276041667, + "loss": 0.0251, + "rewards/chosen": 8.816352844238281, + "rewards/margins": 24.295028177897137, + "rewards/rejected": -15.478675333658854, + "step": 2791 + }, + { + "epoch": 0.6986112848742649, + "grad_norm": 2.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30401292.307692308, + "logits/rejected": -48266513.45454545, + "logps/chosen": -341.9787034254808, + "logps/rejected": -786.1349431818181, + "loss": 0.0451, + "rewards/chosen": 7.697232759915865, + "rewards/margins": 25.9851813683143, + "rewards/rejected": -18.287948608398438, + "step": 2792 + }, + { + "epoch": 0.6988615038158389, + "grad_norm": 2.515625, + "kl": 3.184396266937256, + "learning_rate": 5e-06, + "logits/chosen": -46040352.0, + "logits/rejected": -38574549.333333336, + "logps/chosen": -359.7793375651042, + "logps/rejected": -671.8191324869791, + "loss": 0.0309, + "rewards/chosen": 7.303638458251953, + "rewards/margins": 22.310812632242836, + "rewards/rejected": -15.007174173990885, + "step": 2793 + }, + { + "epoch": 0.6991117227574127, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48646491.428571425, + "logits/rejected": -54450356.705882356, + "logps/chosen": -448.410400390625, + "logps/rejected": -623.2508042279412, + "loss": 0.0022, + "rewards/chosen": 10.737269810267858, + "rewards/margins": 26.486329150800948, + "rewards/rejected": -15.749059340533089, + "step": 2794 + }, + { + "epoch": 0.6993619416989866, + "grad_norm": 2.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37559544.615384616, + "logits/rejected": -52700590.54545455, + "logps/chosen": -390.9215745192308, + "logps/rejected": -542.5998757102273, + "loss": 0.0203, + "rewards/chosen": 8.133526141826923, + "rewards/margins": 19.37596749259042, + "rewards/rejected": -11.242441350763494, + "step": 2795 + }, + { + "epoch": 0.6996121606405605, + "grad_norm": 0.98828125, + "kl": 2.039057493209839, + "learning_rate": 5e-06, + "logits/chosen": -65946586.666666664, + "logits/rejected": -57084096.0, + "logps/chosen": -507.5934651692708, + "logps/rejected": -535.1720377604166, + "loss": 0.0072, + "rewards/chosen": 10.154741923014322, + "rewards/margins": 25.142923990885414, + "rewards/rejected": -14.988182067871094, + "step": 2796 + }, + { + "epoch": 0.6998623795821344, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38848039.11111111, + "logits/rejected": -32067827.2, + "logps/chosen": -368.95551215277777, + "logps/rejected": -610.6512369791667, + "loss": 0.0177, + "rewards/chosen": 7.760469224717882, + "rewards/margins": 22.18584459092882, + "rewards/rejected": -14.425375366210938, + "step": 2797 + }, + { + "epoch": 0.7001125985237082, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77517914.66666667, + "logits/rejected": -37977052.44444445, + "logps/chosen": -541.1085205078125, + "logps/rejected": -499.93408203125, + "loss": 0.0254, + "rewards/chosen": 10.468310674031576, + "rewards/margins": 25.19971699184842, + "rewards/rejected": -14.731406317816841, + "step": 2798 + }, + { + "epoch": 0.7003628174652822, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40252268.0, + "logits/rejected": -20511608.0, + "logps/chosen": -361.3359680175781, + "logps/rejected": -609.9810791015625, + "loss": 0.037, + "rewards/chosen": 8.580954551696777, + "rewards/margins": 24.40458393096924, + "rewards/rejected": -15.823629379272461, + "step": 2799 + }, + { + "epoch": 0.700613036406856, + "grad_norm": 6.34375, + "kl": 5.255013942718506, + "learning_rate": 5e-06, + "logits/chosen": -54295344.0, + "logits/rejected": -62893952.0, + "logps/chosen": -399.2161458333333, + "logps/rejected": -408.4585774739583, + "loss": 0.05, + "rewards/chosen": 8.990501403808594, + "rewards/margins": 18.69763946533203, + "rewards/rejected": -9.707138061523438, + "step": 2800 + }, + { + "epoch": 0.7008632553484299, + "grad_norm": 2.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45838513.777777776, + "logits/rejected": -53645141.333333336, + "logps/chosen": -316.9411349826389, + "logps/rejected": -561.6852864583333, + "loss": 0.0262, + "rewards/chosen": 7.044909159342448, + "rewards/margins": 22.372471618652344, + "rewards/rejected": -15.327562459309895, + "step": 2801 + }, + { + "epoch": 0.7011134742900037, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54299002.18181818, + "logits/rejected": -70709356.3076923, + "logps/chosen": -415.04350142045456, + "logps/rejected": -630.3205378605769, + "loss": 0.0201, + "rewards/chosen": 7.9148476340553975, + "rewards/margins": 29.25633933327415, + "rewards/rejected": -21.34149169921875, + "step": 2802 + }, + { + "epoch": 0.7013636932315777, + "grad_norm": 3.671875, + "kl": 3.8550758361816406, + "learning_rate": 5e-06, + "logits/chosen": -23494822.4, + "logits/rejected": -7775202.285714285, + "logps/chosen": -441.475927734375, + "logps/rejected": -652.6600864955357, + "loss": 0.0186, + "rewards/chosen": 9.215643310546875, + "rewards/margins": 27.017345537458148, + "rewards/rejected": -17.80170222691127, + "step": 2803 + }, + { + "epoch": 0.7016139121731515, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36134299.428571425, + "logits/rejected": -32621203.2, + "logps/chosen": -406.44419642857144, + "logps/rejected": -666.3396484375, + "loss": 0.0123, + "rewards/chosen": 8.483455113002233, + "rewards/margins": 28.799202183314733, + "rewards/rejected": -20.3157470703125, + "step": 2804 + }, + { + "epoch": 0.7018641311147253, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63067738.666666664, + "logits/rejected": -34078133.333333336, + "logps/chosen": -322.76373291015625, + "logps/rejected": -553.343505859375, + "loss": 0.0741, + "rewards/chosen": 7.6857039133707685, + "rewards/margins": 22.321938196818035, + "rewards/rejected": -14.636234283447266, + "step": 2805 + }, + { + "epoch": 0.7021143500562993, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44109376.0, + "logits/rejected": -29337509.333333332, + "logps/chosen": -421.6363118489583, + "logps/rejected": -515.2259521484375, + "loss": 0.0546, + "rewards/chosen": 9.611385345458984, + "rewards/margins": 25.6060422261556, + "rewards/rejected": -15.994656880696615, + "step": 2806 + }, + { + "epoch": 0.7023645689978731, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42214477.71428572, + "logits/rejected": -48400134.4, + "logps/chosen": -466.38783482142856, + "logps/rejected": -835.85546875, + "loss": 0.0163, + "rewards/chosen": 9.405502319335938, + "rewards/margins": 37.369345092773436, + "rewards/rejected": -27.9638427734375, + "step": 2807 + }, + { + "epoch": 0.702614787939447, + "grad_norm": 34.5, + "kl": 8.427021026611328, + "learning_rate": 5e-06, + "logits/chosen": -47083320.0, + "logits/rejected": -68001616.0, + "logps/chosen": -334.8960266113281, + "logps/rejected": -616.2701416015625, + "loss": 0.1235, + "rewards/chosen": 6.149234771728516, + "rewards/margins": 29.302621841430664, + "rewards/rejected": -23.15338706970215, + "step": 2808 + }, + { + "epoch": 0.7028650068810209, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -90080265.84615384, + "logits/rejected": -29269917.09090909, + "logps/chosen": -428.4971454326923, + "logps/rejected": -566.1219371448864, + "loss": 0.0147, + "rewards/chosen": 10.236552311823917, + "rewards/margins": 26.204106577626476, + "rewards/rejected": -15.967554265802557, + "step": 2809 + }, + { + "epoch": 0.7031152258225948, + "grad_norm": 18.375, + "kl": 4.1148200035095215, + "learning_rate": 5e-06, + "logits/chosen": -126290662.4, + "logits/rejected": -68087488.0, + "logps/chosen": -294.8810791015625, + "logps/rejected": -619.4635881696429, + "loss": 0.1103, + "rewards/chosen": 5.11029167175293, + "rewards/margins": 21.478977802821568, + "rewards/rejected": -16.36868613106864, + "step": 2810 + }, + { + "epoch": 0.7033654447641686, + "grad_norm": 5.875, + "kl": 13.6532564163208, + "learning_rate": 5e-06, + "logits/chosen": -48643486.315789476, + "logits/rejected": -68821568.0, + "logps/chosen": -405.7858244243421, + "logps/rejected": -1019.23828125, + "loss": 0.1497, + "rewards/chosen": 7.513634932668586, + "rewards/margins": 40.27934843364515, + "rewards/rejected": -32.76571350097656, + "step": 2811 + }, + { + "epoch": 0.7036156637057426, + "grad_norm": 7.75, + "kl": 2.8656413555145264, + "learning_rate": 5e-06, + "logits/chosen": -49761417.14285714, + "logits/rejected": -92548108.8, + "logps/chosen": -401.373046875, + "logps/rejected": -702.535498046875, + "loss": 0.0992, + "rewards/chosen": 7.878729684012277, + "rewards/margins": 28.049011666434154, + "rewards/rejected": -20.170281982421876, + "step": 2812 + }, + { + "epoch": 0.7038658826473164, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -79458713.6, + "logits/rejected": -57691712.0, + "logps/chosen": -494.002099609375, + "logps/rejected": -784.9517299107143, + "loss": 0.0348, + "rewards/chosen": 10.21259002685547, + "rewards/margins": 27.619886343819754, + "rewards/rejected": -17.407296316964285, + "step": 2813 + }, + { + "epoch": 0.7041161015888903, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32719906.46153846, + "logits/rejected": -54949422.54545455, + "logps/chosen": -481.7707331730769, + "logps/rejected": -680.7263849431819, + "loss": 0.013, + "rewards/chosen": 9.475159865159254, + "rewards/margins": 28.795902385578287, + "rewards/rejected": -19.320742520419035, + "step": 2814 + }, + { + "epoch": 0.7043663205304641, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46511658.666666664, + "logits/rejected": -21299357.333333332, + "logps/chosen": -339.7562662760417, + "logps/rejected": -710.3235677083334, + "loss": 0.0151, + "rewards/chosen": 8.8230349222819, + "rewards/margins": 30.170014699300133, + "rewards/rejected": -21.34697977701823, + "step": 2815 + }, + { + "epoch": 0.7046165394720381, + "grad_norm": 3.4375, + "kl": 12.012417793273926, + "learning_rate": 5e-06, + "logits/chosen": -54868428.0, + "logits/rejected": -31207144.0, + "logps/chosen": -501.846435546875, + "logps/rejected": -500.8480224609375, + "loss": 0.0049, + "rewards/chosen": 9.695928573608398, + "rewards/margins": 20.061039924621582, + "rewards/rejected": -10.365111351013184, + "step": 2816 + }, + { + "epoch": 0.7048667584136119, + "grad_norm": 23.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19373475.2, + "logits/rejected": -29106800.0, + "logps/chosen": -404.86142578125, + "logps/rejected": -522.0775669642857, + "loss": 0.0468, + "rewards/chosen": 6.328115844726563, + "rewards/margins": 17.23578600202288, + "rewards/rejected": -10.907670157296318, + "step": 2817 + }, + { + "epoch": 0.7051169773551857, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73160306.28571428, + "logits/rejected": -55067571.2, + "logps/chosen": -356.1717006138393, + "logps/rejected": -728.42509765625, + "loss": 0.0379, + "rewards/chosen": 8.683749607631139, + "rewards/margins": 30.957196262904578, + "rewards/rejected": -22.27344665527344, + "step": 2818 + }, + { + "epoch": 0.7053671962967597, + "grad_norm": 13.5, + "kl": 13.465093612670898, + "learning_rate": 5e-06, + "logits/chosen": -64261696.0, + "logits/rejected": -94827925.33333333, + "logps/chosen": -481.3118082682292, + "logps/rejected": -738.933837890625, + "loss": 0.0297, + "rewards/chosen": 10.634162267049154, + "rewards/margins": 35.111388524373375, + "rewards/rejected": -24.47722625732422, + "step": 2819 + }, + { + "epoch": 0.7056174152383335, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16878382.85714286, + "logits/rejected": -30856640.0, + "logps/chosen": -216.86164202008928, + "logps/rejected": -618.92216796875, + "loss": 0.0971, + "rewards/chosen": 5.142184666224888, + "rewards/margins": 22.473379734584263, + "rewards/rejected": -17.331195068359374, + "step": 2820 + }, + { + "epoch": 0.7058676341799074, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44171987.692307696, + "logits/rejected": -50515688.72727273, + "logps/chosen": -376.4453125, + "logps/rejected": -679.4142844460227, + "loss": 0.0516, + "rewards/chosen": 6.357829167292668, + "rewards/margins": 23.69655486420318, + "rewards/rejected": -17.33872569691051, + "step": 2821 + }, + { + "epoch": 0.7061178531214813, + "grad_norm": 12.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35233440.0, + "logits/rejected": -50717508.92307692, + "logps/chosen": -368.59738991477275, + "logps/rejected": -476.78309044471155, + "loss": 0.019, + "rewards/chosen": 7.991248390891335, + "rewards/margins": 19.655959976302995, + "rewards/rejected": -11.66471158541166, + "step": 2822 + }, + { + "epoch": 0.7063680720630552, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64454069.333333336, + "logits/rejected": -58710229.333333336, + "logps/chosen": -388.794921875, + "logps/rejected": -781.9348958333334, + "loss": 0.0823, + "rewards/chosen": 7.467843373616536, + "rewards/margins": 27.562594095865883, + "rewards/rejected": -20.094750722249348, + "step": 2823 + }, + { + "epoch": 0.706618291004629, + "grad_norm": 12.0, + "kl": 11.920164108276367, + "learning_rate": 5e-06, + "logits/chosen": -46198409.14285714, + "logits/rejected": -27320323.2, + "logps/chosen": -344.45455496651783, + "logps/rejected": -607.85703125, + "loss": 0.0706, + "rewards/chosen": 7.8573777335030695, + "rewards/margins": 25.617419542585104, + "rewards/rejected": -17.760041809082033, + "step": 2824 + }, + { + "epoch": 0.706868509946203, + "grad_norm": 2.34375, + "kl": 0.28626760840415955, + "learning_rate": 5e-06, + "logits/chosen": -34663232.0, + "logits/rejected": -12738889.142857144, + "logps/chosen": -433.88935546875, + "logps/rejected": -719.2066127232143, + "loss": 0.0054, + "rewards/chosen": 9.68294677734375, + "rewards/margins": 26.907541983468192, + "rewards/rejected": -17.22459520612444, + "step": 2825 + }, + { + "epoch": 0.7071187288877768, + "grad_norm": 0.94140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33976740.571428575, + "logits/rejected": -11835712.0, + "logps/chosen": -401.70186941964283, + "logps/rejected": -674.78681640625, + "loss": 0.0107, + "rewards/chosen": 9.433323451450892, + "rewards/margins": 27.1864988054548, + "rewards/rejected": -17.753175354003908, + "step": 2826 + }, + { + "epoch": 0.7073689478293507, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58853845.333333336, + "logits/rejected": -46815978.666666664, + "logps/chosen": -407.94407552083334, + "logps/rejected": -877.8915473090278, + "loss": 0.0094, + "rewards/chosen": 10.130316162109375, + "rewards/margins": 35.291428629557295, + "rewards/rejected": -25.161112467447918, + "step": 2827 + }, + { + "epoch": 0.7076191667709245, + "grad_norm": 11.1875, + "kl": 38.14599609375, + "learning_rate": 5e-06, + "logits/chosen": -73945910.85714285, + "logits/rejected": -30161379.2, + "logps/chosen": -551.8123604910714, + "logps/rejected": -843.4265625, + "loss": 0.0845, + "rewards/chosen": 12.412297930036273, + "rewards/margins": 35.16861506870815, + "rewards/rejected": -22.756317138671875, + "step": 2828 + }, + { + "epoch": 0.7078693857124985, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51770572.8, + "logits/rejected": -41191488.0, + "logps/chosen": -376.494580078125, + "logps/rejected": -555.8494001116071, + "loss": 0.0282, + "rewards/chosen": 9.79822006225586, + "rewards/margins": 25.67746854509626, + "rewards/rejected": -15.879248482840401, + "step": 2829 + }, + { + "epoch": 0.7081196046540723, + "grad_norm": 3.96875, + "kl": 0.36625418066978455, + "learning_rate": 5e-06, + "logits/chosen": -75758091.63636364, + "logits/rejected": -29314171.076923076, + "logps/chosen": -423.9033203125, + "logps/rejected": -551.8607271634615, + "loss": 0.0126, + "rewards/chosen": 9.765291387384588, + "rewards/margins": 21.93703257954204, + "rewards/rejected": -12.171741192157452, + "step": 2830 + }, + { + "epoch": 0.7083698235956462, + "grad_norm": 5.0625, + "kl": 0.8606275320053101, + "learning_rate": 5e-06, + "logits/chosen": -71126853.81818181, + "logits/rejected": -40138993.23076923, + "logps/chosen": -414.7760120738636, + "logps/rejected": -591.3296649639423, + "loss": 0.0283, + "rewards/chosen": 7.694145202636719, + "rewards/margins": 21.260935269869293, + "rewards/rejected": -13.566790067232573, + "step": 2831 + }, + { + "epoch": 0.7086200425372201, + "grad_norm": 6.84375, + "kl": 13.946002960205078, + "learning_rate": 5e-06, + "logits/chosen": -42008210.28571428, + "logits/rejected": -41951542.4, + "logps/chosen": -389.9298618861607, + "logps/rejected": -679.4955078125, + "loss": 0.038, + "rewards/chosen": 9.412314278738839, + "rewards/margins": 27.424315316336497, + "rewards/rejected": -18.012001037597656, + "step": 2832 + }, + { + "epoch": 0.708870261478794, + "grad_norm": 14.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45796509.09090909, + "logits/rejected": -16990257.230769232, + "logps/chosen": -465.1394708806818, + "logps/rejected": -522.7392953725962, + "loss": 0.0364, + "rewards/chosen": 6.38507773659446, + "rewards/margins": 16.770491273253114, + "rewards/rejected": -10.385413536658653, + "step": 2833 + }, + { + "epoch": 0.7091204804203678, + "grad_norm": 1.59375, + "kl": 3.3121135234832764, + "learning_rate": 5e-06, + "logits/chosen": -57254997.333333336, + "logits/rejected": -28750023.111111112, + "logps/chosen": -503.9241536458333, + "logps/rejected": -719.1770833333334, + "loss": 0.0017, + "rewards/chosen": 10.296244303385416, + "rewards/margins": 28.75234815809462, + "rewards/rejected": -18.456103854709202, + "step": 2834 + }, + { + "epoch": 0.7093706993619417, + "grad_norm": 9.375, + "kl": 6.719563007354736, + "learning_rate": 5e-06, + "logits/chosen": -33529088.0, + "logits/rejected": -86171313.77777778, + "logps/chosen": -342.90686848958336, + "logps/rejected": -591.7131618923611, + "loss": 0.0694, + "rewards/chosen": 6.963863627115885, + "rewards/margins": 20.23766564263238, + "rewards/rejected": -13.273802015516493, + "step": 2835 + }, + { + "epoch": 0.7096209183035156, + "grad_norm": 21.25, + "kl": 26.440372467041016, + "learning_rate": 5e-06, + "logits/chosen": -29052413.866666667, + "logits/rejected": -35388757.333333336, + "logps/chosen": -502.05970052083336, + "logps/rejected": -560.2633463541666, + "loss": 0.0565, + "rewards/chosen": 9.2646240234375, + "rewards/margins": 22.778560384114584, + "rewards/rejected": -13.513936360677084, + "step": 2836 + }, + { + "epoch": 0.7098711372450894, + "grad_norm": 9.375, + "kl": 6.822956085205078, + "learning_rate": 5e-06, + "logits/chosen": -24766629.818181816, + "logits/rejected": -44858008.615384616, + "logps/chosen": -417.67631392045456, + "logps/rejected": -623.0213716947115, + "loss": 0.0405, + "rewards/chosen": 9.356787941672586, + "rewards/margins": 25.505620996435205, + "rewards/rejected": -16.14883305476262, + "step": 2837 + }, + { + "epoch": 0.7101213561866633, + "grad_norm": 2.703125, + "kl": 15.25387954711914, + "learning_rate": 5e-06, + "logits/chosen": -41985713.23076923, + "logits/rejected": -67380706.9090909, + "logps/chosen": -490.36470853365387, + "logps/rejected": -548.91748046875, + "loss": 0.0669, + "rewards/chosen": 10.400360107421875, + "rewards/margins": 21.486043063077062, + "rewards/rejected": -11.085682955655185, + "step": 2838 + }, + { + "epoch": 0.7103715751282372, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43332888.0, + "logits/rejected": -51703712.0, + "logps/chosen": -389.4402669270833, + "logps/rejected": -580.8695068359375, + "loss": 0.0587, + "rewards/chosen": 8.225232442220053, + "rewards/margins": 18.301397959391277, + "rewards/rejected": -10.076165517171225, + "step": 2839 + }, + { + "epoch": 0.7106217940698111, + "grad_norm": 18.875, + "kl": 23.652053833007812, + "learning_rate": 5e-06, + "logits/chosen": -57816952.0, + "logits/rejected": 8809010.0, + "logps/chosen": -501.9508056640625, + "logps/rejected": -412.52691650390625, + "loss": 0.0464, + "rewards/chosen": 11.180964469909668, + "rewards/margins": 19.88382911682129, + "rewards/rejected": -8.702864646911621, + "step": 2840 + }, + { + "epoch": 0.7108720130113849, + "grad_norm": 4.8125, + "kl": 3.7574310302734375, + "learning_rate": 5e-06, + "logits/chosen": -41451766.15384615, + "logits/rejected": -39796308.36363637, + "logps/chosen": -331.85216346153845, + "logps/rejected": -427.05149147727275, + "loss": 0.0224, + "rewards/chosen": 8.715910104604868, + "rewards/margins": 21.05279087520146, + "rewards/rejected": -12.336880770596592, + "step": 2841 + }, + { + "epoch": 0.7111222319529589, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45107368.72727273, + "logits/rejected": -47414321.23076923, + "logps/chosen": -467.9940074573864, + "logps/rejected": -653.5563401442307, + "loss": 0.0197, + "rewards/chosen": 10.463801297274502, + "rewards/margins": 21.725147967571978, + "rewards/rejected": -11.261346670297476, + "step": 2842 + }, + { + "epoch": 0.7113724508945327, + "grad_norm": 0.59375, + "kl": 3.7489001750946045, + "learning_rate": 5e-06, + "logits/chosen": -23924918.85714286, + "logits/rejected": -47244656.0, + "logps/chosen": -380.5603724888393, + "logps/rejected": -654.917333984375, + "loss": 0.0016, + "rewards/chosen": 10.75871821812221, + "rewards/margins": 25.424906267438615, + "rewards/rejected": -14.666188049316407, + "step": 2843 + }, + { + "epoch": 0.7116226698361066, + "grad_norm": 14.3125, + "kl": 9.8784818649292, + "learning_rate": 5e-06, + "logits/chosen": -24410053.333333332, + "logits/rejected": -23943712.0, + "logps/chosen": -399.3997802734375, + "logps/rejected": -380.7164713541667, + "loss": 0.0837, + "rewards/chosen": 9.614611307779947, + "rewards/margins": 18.097569783528645, + "rewards/rejected": -8.482958475748697, + "step": 2844 + }, + { + "epoch": 0.7118728887776805, + "grad_norm": 2.984375, + "kl": 2.053798198699951, + "learning_rate": 5e-06, + "logits/chosen": -26502024.0, + "logits/rejected": -53121080.0, + "logps/chosen": -351.9802551269531, + "logps/rejected": -660.3380126953125, + "loss": 0.037, + "rewards/chosen": 9.009044647216797, + "rewards/margins": 21.41208267211914, + "rewards/rejected": -12.403038024902344, + "step": 2845 + }, + { + "epoch": 0.7121231077192544, + "grad_norm": 10.25, + "kl": 0.039526622742414474, + "learning_rate": 5e-06, + "logits/chosen": -60147957.333333336, + "logits/rejected": -41260280.0, + "logps/chosen": -286.2404378255208, + "logps/rejected": -620.2919921875, + "loss": 0.0645, + "rewards/chosen": 7.060587565104167, + "rewards/margins": 23.003092447916668, + "rewards/rejected": -15.9425048828125, + "step": 2846 + }, + { + "epoch": 0.7123733266608282, + "grad_norm": 6.4375, + "kl": 15.201155662536621, + "learning_rate": 5e-06, + "logits/chosen": -33218082.133333333, + "logits/rejected": -20480615.111111112, + "logps/chosen": -369.7563151041667, + "logps/rejected": -514.876953125, + "loss": 0.0641, + "rewards/chosen": 9.411383056640625, + "rewards/margins": 18.668719991048178, + "rewards/rejected": -9.257336934407553, + "step": 2847 + }, + { + "epoch": 0.7126235456024022, + "grad_norm": 3.6875, + "kl": 5.470212459564209, + "learning_rate": 5e-06, + "logits/chosen": -22100365.333333332, + "logits/rejected": -39568728.0, + "logps/chosen": -362.3927408854167, + "logps/rejected": -728.1795247395834, + "loss": 0.0657, + "rewards/chosen": 8.579461415608725, + "rewards/margins": 23.26226806640625, + "rewards/rejected": -14.682806650797525, + "step": 2848 + }, + { + "epoch": 0.712873764543976, + "grad_norm": 12.875, + "kl": 10.43774127960205, + "learning_rate": 5e-06, + "logits/chosen": -58420804.92307692, + "logits/rejected": -60946234.18181818, + "logps/chosen": -414.2282902644231, + "logps/rejected": -629.3176491477273, + "loss": 0.0313, + "rewards/chosen": 9.766668466421274, + "rewards/margins": 24.70909705528846, + "rewards/rejected": -14.942428588867188, + "step": 2849 + }, + { + "epoch": 0.7131239834855498, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18501408.0, + "logits/rejected": -33000492.0, + "logps/chosen": -354.88360595703125, + "logps/rejected": -531.7135009765625, + "loss": 0.0327, + "rewards/chosen": 8.246360778808594, + "rewards/margins": 19.713088989257812, + "rewards/rejected": -11.466728210449219, + "step": 2850 + }, + { + "epoch": 0.7133742024271237, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34843936.0, + "logits/rejected": -51467786.666666664, + "logps/chosen": -333.20371500651044, + "logps/rejected": -581.0220947265625, + "loss": 0.0127, + "rewards/chosen": 8.187694549560547, + "rewards/margins": 20.489177703857422, + "rewards/rejected": -12.301483154296875, + "step": 2851 + }, + { + "epoch": 0.7136244213686976, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53458572.0, + "logits/rejected": -36250980.0, + "logps/chosen": -236.44578552246094, + "logps/rejected": -642.029296875, + "loss": 0.0315, + "rewards/chosen": 5.8505144119262695, + "rewards/margins": 20.897113800048828, + "rewards/rejected": -15.046599388122559, + "step": 2852 + }, + { + "epoch": 0.7138746403102715, + "grad_norm": 29.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45227052.307692304, + "logits/rejected": -24763938.90909091, + "logps/chosen": -372.0153620793269, + "logps/rejected": -594.9982688210227, + "loss": 0.0829, + "rewards/chosen": 8.906301645132212, + "rewards/margins": 18.668860322111968, + "rewards/rejected": -9.762558676979758, + "step": 2853 + }, + { + "epoch": 0.7141248592518453, + "grad_norm": 8.5625, + "kl": 9.637690544128418, + "learning_rate": 5e-06, + "logits/chosen": -63842805.333333336, + "logits/rejected": -53839456.0, + "logps/chosen": -393.1923828125, + "logps/rejected": -597.5767415364584, + "loss": 0.025, + "rewards/chosen": 9.196970621744791, + "rewards/margins": 23.25041325887044, + "rewards/rejected": -14.05344263712565, + "step": 2854 + }, + { + "epoch": 0.7143750781934193, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37782163.692307696, + "logits/rejected": -47577643.63636363, + "logps/chosen": -422.95748197115387, + "logps/rejected": -727.5739524147727, + "loss": 0.0164, + "rewards/chosen": 8.972540635329027, + "rewards/margins": 25.31112622881269, + "rewards/rejected": -16.338585593483664, + "step": 2855 + }, + { + "epoch": 0.7146252971349931, + "grad_norm": 5.3125, + "kl": 1.1548964977264404, + "learning_rate": 5e-06, + "logits/chosen": -15816248.888888888, + "logits/rejected": -36990732.8, + "logps/chosen": -374.362548828125, + "logps/rejected": -602.8686197916667, + "loss": 0.0122, + "rewards/chosen": 9.278047349717882, + "rewards/margins": 23.323512437608507, + "rewards/rejected": -14.045465087890625, + "step": 2856 + }, + { + "epoch": 0.714875516076567, + "grad_norm": 12.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57050950.4, + "logits/rejected": -42847936.0, + "logps/chosen": -395.9795654296875, + "logps/rejected": -666.8650948660714, + "loss": 0.0225, + "rewards/chosen": 8.862783813476563, + "rewards/margins": 23.772857230050224, + "rewards/rejected": -14.910073416573661, + "step": 2857 + }, + { + "epoch": 0.7151257350181409, + "grad_norm": 1.1171875, + "kl": 0.5791168212890625, + "learning_rate": 5e-06, + "logits/chosen": -43719726.93333333, + "logits/rejected": -50477834.666666664, + "logps/chosen": -446.62115885416665, + "logps/rejected": -709.5394965277778, + "loss": 0.0024, + "rewards/chosen": 9.364790852864584, + "rewards/margins": 27.542259046766493, + "rewards/rejected": -18.17746819390191, + "step": 2858 + }, + { + "epoch": 0.7153759539597148, + "grad_norm": 8.4375, + "kl": 3.7966461181640625, + "learning_rate": 5e-06, + "logits/chosen": -41097989.81818182, + "logits/rejected": -82372278.15384616, + "logps/chosen": -337.95634321732956, + "logps/rejected": -551.2882737379807, + "loss": 0.0309, + "rewards/chosen": 7.855382052334872, + "rewards/margins": 24.443642409531385, + "rewards/rejected": -16.588260357196514, + "step": 2859 + }, + { + "epoch": 0.7156261729012886, + "grad_norm": 3.921875, + "kl": 10.750984191894531, + "learning_rate": 5e-06, + "logits/chosen": -40742112.0, + "logits/rejected": -17511212.0, + "logps/chosen": -378.3724670410156, + "logps/rejected": -400.796630859375, + "loss": 0.0849, + "rewards/chosen": 8.816256523132324, + "rewards/margins": 18.769189834594727, + "rewards/rejected": -9.952933311462402, + "step": 2860 + }, + { + "epoch": 0.7158763918428626, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41644750.222222224, + "logits/rejected": -32165075.2, + "logps/chosen": -354.6389973958333, + "logps/rejected": -449.7046875, + "loss": 0.0435, + "rewards/chosen": 8.696370442708334, + "rewards/margins": 23.544317626953124, + "rewards/rejected": -14.847947184244791, + "step": 2861 + }, + { + "epoch": 0.7161266107844364, + "grad_norm": 13.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36305749.333333336, + "logits/rejected": -55204616.53333333, + "logps/chosen": -324.2687174479167, + "logps/rejected": -687.24140625, + "loss": 0.0506, + "rewards/chosen": 7.352183024088542, + "rewards/margins": 25.028904215494794, + "rewards/rejected": -17.67672119140625, + "step": 2862 + }, + { + "epoch": 0.7163768297260102, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57934048.0, + "logits/rejected": -43297910.85714286, + "logps/chosen": -397.777490234375, + "logps/rejected": -461.37806919642856, + "loss": 0.0408, + "rewards/chosen": 6.858859252929688, + "rewards/margins": 22.43976571219308, + "rewards/rejected": -15.580906459263392, + "step": 2863 + }, + { + "epoch": 0.7166270486675841, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62099130.666666664, + "logits/rejected": -48201440.0, + "logps/chosen": -506.9086507161458, + "logps/rejected": -579.0641276041666, + "loss": 0.0068, + "rewards/chosen": 10.800952911376953, + "rewards/margins": 26.549596150716148, + "rewards/rejected": -15.748643239339193, + "step": 2864 + }, + { + "epoch": 0.716877267609158, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59406997.333333336, + "logits/rejected": -70401912.8888889, + "logps/chosen": -347.2501220703125, + "logps/rejected": -516.6752387152778, + "loss": 0.0152, + "rewards/chosen": 7.224907557169597, + "rewards/margins": 25.40914895799425, + "rewards/rejected": -18.184241400824654, + "step": 2865 + }, + { + "epoch": 0.7171274865507319, + "grad_norm": 1.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76357426.28571428, + "logits/rejected": -37111518.11764706, + "logps/chosen": -386.46707589285717, + "logps/rejected": -570.9237706801471, + "loss": 0.0041, + "rewards/chosen": 7.317531040736607, + "rewards/margins": 23.490377217781646, + "rewards/rejected": -16.172846177045038, + "step": 2866 + }, + { + "epoch": 0.7173777054923057, + "grad_norm": 14.375, + "kl": 11.494989395141602, + "learning_rate": 5e-06, + "logits/chosen": -33947692.307692304, + "logits/rejected": -12712884.363636363, + "logps/chosen": -457.49767127403845, + "logps/rejected": -557.5231267755681, + "loss": 0.0966, + "rewards/chosen": 7.947304358849158, + "rewards/margins": 25.671394935021034, + "rewards/rejected": -17.724090576171875, + "step": 2867 + }, + { + "epoch": 0.7176279244338797, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37782504.0, + "logits/rejected": -68783272.0, + "logps/chosen": -343.82275390625, + "logps/rejected": -653.927001953125, + "loss": 0.0515, + "rewards/chosen": 7.150485515594482, + "rewards/margins": 28.440362453460693, + "rewards/rejected": -21.28987693786621, + "step": 2868 + }, + { + "epoch": 0.7178781433754535, + "grad_norm": 4.78125, + "kl": 2.913024425506592, + "learning_rate": 5e-06, + "logits/chosen": -72402656.0, + "logits/rejected": -58573580.8, + "logps/chosen": -392.57742745535717, + "logps/rejected": -824.3365234375, + "loss": 0.0426, + "rewards/chosen": 7.126686096191406, + "rewards/margins": 29.11225280761719, + "rewards/rejected": -21.985566711425783, + "step": 2869 + }, + { + "epoch": 0.7181283623170274, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37613065.14285714, + "logits/rejected": -84125056.0, + "logps/chosen": -255.48566545758928, + "logps/rejected": -897.02607421875, + "loss": 0.0637, + "rewards/chosen": 5.523496900285993, + "rewards/margins": 38.19929951259068, + "rewards/rejected": -32.675802612304686, + "step": 2870 + }, + { + "epoch": 0.7183785812586013, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51668209.23076923, + "logits/rejected": -41462021.81818182, + "logps/chosen": -398.46634615384613, + "logps/rejected": -623.2795632102273, + "loss": 0.0195, + "rewards/chosen": 7.699086115910457, + "rewards/margins": 26.738180520651223, + "rewards/rejected": -19.039094404740766, + "step": 2871 + }, + { + "epoch": 0.7186288002001752, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -85770595.55555555, + "logits/rejected": -63380795.733333334, + "logps/chosen": -346.42529296875, + "logps/rejected": -563.9303385416666, + "loss": 0.0722, + "rewards/chosen": 4.952493455674913, + "rewards/margins": 25.018030971950957, + "rewards/rejected": -20.06553751627604, + "step": 2872 + }, + { + "epoch": 0.718879019141749, + "grad_norm": 20.375, + "kl": 9.96059513092041, + "learning_rate": 5e-06, + "logits/chosen": -58355080.53333333, + "logits/rejected": -54580568.88888889, + "logps/chosen": -421.45078125, + "logps/rejected": -629.4325629340278, + "loss": 0.0907, + "rewards/chosen": 6.894812520345052, + "rewards/margins": 29.980572001139322, + "rewards/rejected": -23.08575948079427, + "step": 2873 + }, + { + "epoch": 0.7191292380833229, + "grad_norm": 3.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47940499.692307696, + "logits/rejected": -41416011.63636363, + "logps/chosen": -294.9191706730769, + "logps/rejected": -583.0656960227273, + "loss": 0.0615, + "rewards/chosen": 6.7944159874549275, + "rewards/margins": 22.63686482889669, + "rewards/rejected": -15.842448841441762, + "step": 2874 + }, + { + "epoch": 0.7193794570248968, + "grad_norm": 31.875, + "kl": 1.58340585231781, + "learning_rate": 5e-06, + "logits/chosen": -50031296.0, + "logits/rejected": -64277553.777777776, + "logps/chosen": -407.28229166666665, + "logps/rejected": -543.6356336805555, + "loss": 0.0293, + "rewards/chosen": 7.493740844726562, + "rewards/margins": 24.173035007052952, + "rewards/rejected": -16.67929416232639, + "step": 2875 + }, + { + "epoch": 0.7196296759664706, + "grad_norm": 2.21875, + "kl": 8.623088836669922, + "learning_rate": 5e-06, + "logits/chosen": -49215817.14285714, + "logits/rejected": 5751075.2, + "logps/chosen": -445.29659598214283, + "logps/rejected": -832.73798828125, + "loss": 0.0068, + "rewards/chosen": 9.496289934430804, + "rewards/margins": 30.461179460797993, + "rewards/rejected": -20.964889526367188, + "step": 2876 + }, + { + "epoch": 0.7198798949080445, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36594481.777777776, + "logits/rejected": -62171072.0, + "logps/chosen": -498.19281684027777, + "logps/rejected": -788.6013671875, + "loss": 0.0148, + "rewards/chosen": 8.378968980577257, + "rewards/margins": 32.96138678656684, + "rewards/rejected": -24.582417805989582, + "step": 2877 + }, + { + "epoch": 0.7201301138496184, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45305348.266666666, + "logits/rejected": -64915150.222222224, + "logps/chosen": -260.76728515625, + "logps/rejected": -700.75048828125, + "loss": 0.0495, + "rewards/chosen": 7.225716145833333, + "rewards/margins": 25.928771633572048, + "rewards/rejected": -18.703055487738716, + "step": 2878 + }, + { + "epoch": 0.7203803327911923, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16714944.0, + "logits/rejected": -52116539.428571425, + "logps/chosen": -416.816259765625, + "logps/rejected": -678.0104631696429, + "loss": 0.0021, + "rewards/chosen": 8.776738739013672, + "rewards/margins": 25.370780835832868, + "rewards/rejected": -16.594042096819198, + "step": 2879 + }, + { + "epoch": 0.7206305517327661, + "grad_norm": 7.71875, + "kl": 4.956021308898926, + "learning_rate": 5e-06, + "logits/chosen": -50827929.6, + "logits/rejected": 19882609.777777776, + "logps/chosen": -325.528125, + "logps/rejected": -596.7471245659722, + "loss": 0.0479, + "rewards/chosen": 7.604216003417969, + "rewards/margins": 23.96589677598741, + "rewards/rejected": -16.361680772569443, + "step": 2880 + }, + { + "epoch": 0.7208807706743401, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64226803.2, + "logits/rejected": -46863401.14285714, + "logps/chosen": -466.429150390625, + "logps/rejected": -833.7985491071429, + "loss": 0.0249, + "rewards/chosen": 6.062162017822265, + "rewards/margins": 27.55497076851981, + "rewards/rejected": -21.492808750697545, + "step": 2881 + }, + { + "epoch": 0.7211309896159139, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39330888.0, + "logits/rejected": -11901704.0, + "logps/chosen": -245.7134246826172, + "logps/rejected": -690.5980224609375, + "loss": 0.0276, + "rewards/chosen": 6.096774101257324, + "rewards/margins": 22.59409809112549, + "rewards/rejected": -16.497323989868164, + "step": 2882 + }, + { + "epoch": 0.7213812085574878, + "grad_norm": 4.65625, + "kl": 11.910772323608398, + "learning_rate": 5e-06, + "logits/chosen": -79897280.0, + "logits/rejected": -32291056.0, + "logps/chosen": -469.0347377232143, + "logps/rejected": -709.92685546875, + "loss": 0.0226, + "rewards/chosen": 10.987845284598214, + "rewards/margins": 27.889459664481024, + "rewards/rejected": -16.901614379882812, + "step": 2883 + }, + { + "epoch": 0.7216314274990617, + "grad_norm": 7.9375, + "kl": 0.6433385610580444, + "learning_rate": 5e-06, + "logits/chosen": -68300771.55555555, + "logits/rejected": -36462280.53333333, + "logps/chosen": -559.6072048611111, + "logps/rejected": -500.1732421875, + "loss": 0.032, + "rewards/chosen": 11.133505079481337, + "rewards/margins": 22.753039381239148, + "rewards/rejected": -11.619534301757813, + "step": 2884 + }, + { + "epoch": 0.7218816464406356, + "grad_norm": 1.3828125, + "kl": 1.8706579208374023, + "learning_rate": 5e-06, + "logits/chosen": -46206504.72727273, + "logits/rejected": -69710498.46153846, + "logps/chosen": -437.296875, + "logps/rejected": -772.2077824519231, + "loss": 0.0209, + "rewards/chosen": 8.938297618519176, + "rewards/margins": 27.36773276162314, + "rewards/rejected": -18.429435143103966, + "step": 2885 + }, + { + "epoch": 0.7221318653822094, + "grad_norm": 9.3125, + "kl": 9.468201637268066, + "learning_rate": 5e-06, + "logits/chosen": -48625846.15384615, + "logits/rejected": -26920116.363636363, + "logps/chosen": -390.6511418269231, + "logps/rejected": -548.4027432528409, + "loss": 0.0554, + "rewards/chosen": 9.134733346792368, + "rewards/margins": 22.46151834768015, + "rewards/rejected": -13.326785000887783, + "step": 2886 + }, + { + "epoch": 0.7223820843237833, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36453335.27272727, + "logits/rejected": -41300775.384615384, + "logps/chosen": -377.9205877130682, + "logps/rejected": -880.5459735576923, + "loss": 0.0152, + "rewards/chosen": 9.597073641690342, + "rewards/margins": 27.431426575133848, + "rewards/rejected": -17.83435293344351, + "step": 2887 + }, + { + "epoch": 0.7226323032653572, + "grad_norm": 4.875, + "kl": 10.460177421569824, + "learning_rate": 5e-06, + "logits/chosen": -39939202.461538464, + "logits/rejected": -40217064.72727273, + "logps/chosen": -356.84581580528845, + "logps/rejected": -521.3598188920455, + "loss": 0.0429, + "rewards/chosen": 9.2763671875, + "rewards/margins": 22.18054337935014, + "rewards/rejected": -12.904176191850143, + "step": 2888 + }, + { + "epoch": 0.722882522206931, + "grad_norm": 12.625, + "kl": 3.3482789993286133, + "learning_rate": 5e-06, + "logits/chosen": -26261496.615384616, + "logits/rejected": -49188683.63636363, + "logps/chosen": -312.43795072115387, + "logps/rejected": -714.7829367897727, + "loss": 0.0747, + "rewards/chosen": 6.7061033982497, + "rewards/margins": 21.515174332198562, + "rewards/rejected": -14.809070933948863, + "step": 2889 + }, + { + "epoch": 0.7231327411485049, + "grad_norm": 1.8828125, + "kl": 4.55334997177124, + "learning_rate": 5e-06, + "logits/chosen": -39667737.6, + "logits/rejected": -44198741.333333336, + "logps/chosen": -354.7807942708333, + "logps/rejected": -396.05121527777777, + "loss": 0.0395, + "rewards/chosen": 8.337481689453124, + "rewards/margins": 20.03710649278429, + "rewards/rejected": -11.699624803331163, + "step": 2890 + }, + { + "epoch": 0.7233829600900789, + "grad_norm": 9.625, + "kl": 8.35892391204834, + "learning_rate": 5e-06, + "logits/chosen": -61173485.71428572, + "logits/rejected": -32798848.0, + "logps/chosen": -321.08349609375, + "logps/rejected": -460.376220703125, + "loss": 0.0435, + "rewards/chosen": 7.999434334891183, + "rewards/margins": 20.414312417166574, + "rewards/rejected": -12.41487808227539, + "step": 2891 + }, + { + "epoch": 0.7236331790316527, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63597937.23076923, + "logits/rejected": -39373425.45454545, + "logps/chosen": -381.63656850961536, + "logps/rejected": -496.32080078125, + "loss": 0.015, + "rewards/chosen": 8.831263028658354, + "rewards/margins": 21.526453458345856, + "rewards/rejected": -12.6951904296875, + "step": 2892 + }, + { + "epoch": 0.7238833979732265, + "grad_norm": 7.65625, + "kl": 3.565218687057495, + "learning_rate": 5e-06, + "logits/chosen": -51561120.0, + "logits/rejected": -36241408.0, + "logps/chosen": -394.6404622395833, + "logps/rejected": -693.6927897135416, + "loss": 0.012, + "rewards/chosen": 10.486139933268229, + "rewards/margins": 28.127011617024742, + "rewards/rejected": -17.64087168375651, + "step": 2893 + }, + { + "epoch": 0.7241336169148005, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87192391.1111111, + "logits/rejected": -55685811.2, + "logps/chosen": -362.45068359375, + "logps/rejected": -680.239453125, + "loss": 0.073, + "rewards/chosen": 9.25250244140625, + "rewards/margins": 25.208761596679686, + "rewards/rejected": -15.956259155273438, + "step": 2894 + }, + { + "epoch": 0.7243838358563743, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23695948.8, + "logits/rejected": -59597645.71428572, + "logps/chosen": -290.9193359375, + "logps/rejected": -537.29052734375, + "loss": 0.0409, + "rewards/chosen": 6.676039123535157, + "rewards/margins": 18.117690604073662, + "rewards/rejected": -11.441651480538505, + "step": 2895 + }, + { + "epoch": 0.7246340547979482, + "grad_norm": 14.4375, + "kl": 11.795345306396484, + "learning_rate": 5e-06, + "logits/chosen": -29616284.0, + "logits/rejected": -37607488.0, + "logps/chosen": -346.43328857421875, + "logps/rejected": -1024.3037109375, + "loss": 0.0634, + "rewards/chosen": 7.913320541381836, + "rewards/margins": 28.349504470825195, + "rewards/rejected": -20.43618392944336, + "step": 2896 + }, + { + "epoch": 0.7248842737395221, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46718300.8, + "logits/rejected": -36814902.85714286, + "logps/chosen": -446.039697265625, + "logps/rejected": -496.92550223214283, + "loss": 0.0155, + "rewards/chosen": 9.608113098144532, + "rewards/margins": 23.49878605433873, + "rewards/rejected": -13.890672956194196, + "step": 2897 + }, + { + "epoch": 0.725134492681096, + "grad_norm": 10.0625, + "kl": 1.258618712425232, + "learning_rate": 5e-06, + "logits/chosen": -40302336.0, + "logits/rejected": -56714074.666666664, + "logps/chosen": -303.92909749348956, + "logps/rejected": -598.5936279296875, + "loss": 0.0448, + "rewards/chosen": 6.980060577392578, + "rewards/margins": 23.430469512939453, + "rewards/rejected": -16.450408935546875, + "step": 2898 + }, + { + "epoch": 0.7253847116226698, + "grad_norm": 9.375, + "kl": 6.403668403625488, + "learning_rate": 5e-06, + "logits/chosen": -36249843.2, + "logits/rejected": -26060048.0, + "logps/chosen": -318.43525390625, + "logps/rejected": -415.39976671006946, + "loss": 0.0527, + "rewards/chosen": 8.134361775716146, + "rewards/margins": 18.383056301540798, + "rewards/rejected": -10.248694525824654, + "step": 2899 + }, + { + "epoch": 0.7256349305642437, + "grad_norm": 5.0, + "kl": 7.78309965133667, + "learning_rate": 5e-06, + "logits/chosen": -31293245.866666667, + "logits/rejected": -56980593.777777776, + "logps/chosen": -383.75579427083335, + "logps/rejected": -852.9083116319445, + "loss": 0.0147, + "rewards/chosen": 9.44459737141927, + "rewards/margins": 33.332663981119794, + "rewards/rejected": -23.88806660970052, + "step": 2900 + }, + { + "epoch": 0.7258851495058176, + "grad_norm": 37.5, + "kl": 6.944684982299805, + "learning_rate": 5e-06, + "logits/chosen": -47888352.0, + "logits/rejected": -52359280.0, + "logps/chosen": -378.3653971354167, + "logps/rejected": -714.9058430989584, + "loss": 0.0296, + "rewards/chosen": 8.003308614095053, + "rewards/margins": 26.430039723714195, + "rewards/rejected": -18.42673110961914, + "step": 2901 + }, + { + "epoch": 0.7261353684473915, + "grad_norm": 0.6171875, + "kl": 3.27380633354187, + "learning_rate": 5e-06, + "logits/chosen": -52830562.461538464, + "logits/rejected": -31542510.545454547, + "logps/chosen": -425.10336538461536, + "logps/rejected": -505.30397727272725, + "loss": 0.0163, + "rewards/chosen": 10.095457810621996, + "rewards/margins": 23.388698444499838, + "rewards/rejected": -13.293240633877842, + "step": 2902 + }, + { + "epoch": 0.7263855873889653, + "grad_norm": 4.53125, + "kl": 1.1286303997039795, + "learning_rate": 5e-06, + "logits/chosen": -54536413.09090909, + "logits/rejected": -25687020.307692308, + "logps/chosen": -338.84275124289775, + "logps/rejected": -559.2696439302885, + "loss": 0.07, + "rewards/chosen": 8.217640269886363, + "rewards/margins": 22.982396692662803, + "rewards/rejected": -14.764756422776442, + "step": 2903 + }, + { + "epoch": 0.7266358063305393, + "grad_norm": 1.9609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 11740802.0, + "logits/rejected": -39963104.0, + "logps/chosen": -300.1917724609375, + "logps/rejected": -460.0611877441406, + "loss": 0.0292, + "rewards/chosen": 6.3447675704956055, + "rewards/margins": 21.09045124053955, + "rewards/rejected": -14.745683670043945, + "step": 2904 + }, + { + "epoch": 0.7268860252721131, + "grad_norm": 5.75, + "kl": 3.5030932426452637, + "learning_rate": 5e-06, + "logits/chosen": -42804072.0, + "logits/rejected": -78302928.0, + "logps/chosen": -347.8034973144531, + "logps/rejected": -612.3585815429688, + "loss": 0.0311, + "rewards/chosen": 7.102024555206299, + "rewards/margins": 24.9579758644104, + "rewards/rejected": -17.8559513092041, + "step": 2905 + }, + { + "epoch": 0.7271362442136869, + "grad_norm": 6.84375, + "kl": 17.318065643310547, + "learning_rate": 5e-06, + "logits/chosen": -37570128.0, + "logits/rejected": 63001360.0, + "logps/chosen": -460.0954182942708, + "logps/rejected": -580.6607259114584, + "loss": 0.025, + "rewards/chosen": 10.975963592529297, + "rewards/margins": 26.392869313557945, + "rewards/rejected": -15.416905721028646, + "step": 2906 + }, + { + "epoch": 0.7273864631552609, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30255782.4, + "logits/rejected": -4995613.714285715, + "logps/chosen": -315.675830078125, + "logps/rejected": -617.6026785714286, + "loss": 0.0235, + "rewards/chosen": 6.311699295043946, + "rewards/margins": 21.539633996146065, + "rewards/rejected": -15.22793470110212, + "step": 2907 + }, + { + "epoch": 0.7276366820968347, + "grad_norm": 0.64453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63655027.2, + "logits/rejected": -60351698.28571428, + "logps/chosen": -339.6093994140625, + "logps/rejected": -635.181640625, + "loss": 0.0087, + "rewards/chosen": 9.819187927246094, + "rewards/margins": 25.303001621791296, + "rewards/rejected": -15.483813694545201, + "step": 2908 + }, + { + "epoch": 0.7278869010384086, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22752053.333333332, + "logits/rejected": -29683916.8, + "logps/chosen": -304.55479600694446, + "logps/rejected": -501.7289713541667, + "loss": 0.0464, + "rewards/chosen": 5.375161912706163, + "rewards/margins": 16.369777594672307, + "rewards/rejected": -10.994615681966145, + "step": 2909 + }, + { + "epoch": 0.7281371199799825, + "grad_norm": 13.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31534196.57142857, + "logits/rejected": -26489886.11764706, + "logps/chosen": -209.28681291852678, + "logps/rejected": -603.5287224264706, + "loss": 0.0391, + "rewards/chosen": 5.497153690883091, + "rewards/margins": 17.27068812907243, + "rewards/rejected": -11.773534438189339, + "step": 2910 + }, + { + "epoch": 0.7283873389215564, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50321296.0, + "logits/rejected": -43530176.0, + "logps/chosen": -400.8604736328125, + "logps/rejected": -787.0475463867188, + "loss": 0.0023, + "rewards/chosen": 11.169492721557617, + "rewards/margins": 29.428300857543945, + "rewards/rejected": -18.258808135986328, + "step": 2911 + }, + { + "epoch": 0.7286375578631302, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25860368.0, + "logits/rejected": -32112621.333333332, + "logps/chosen": -372.0572916666667, + "logps/rejected": -504.5743815104167, + "loss": 0.0306, + "rewards/chosen": 10.26059341430664, + "rewards/margins": 23.600494384765625, + "rewards/rejected": -13.339900970458984, + "step": 2912 + }, + { + "epoch": 0.7288877768047041, + "grad_norm": 5.0, + "kl": 8.947591781616211, + "learning_rate": 5e-06, + "logits/chosen": -48572749.71428572, + "logits/rejected": -78966572.8, + "logps/chosen": -355.59873744419644, + "logps/rejected": -349.98388671875, + "loss": 0.0393, + "rewards/chosen": 7.420417240687779, + "rewards/margins": 17.522190311976843, + "rewards/rejected": -10.101773071289063, + "step": 2913 + }, + { + "epoch": 0.729137995746278, + "grad_norm": 8.0, + "kl": 8.74381160736084, + "learning_rate": 5e-06, + "logits/chosen": -27487990.85714286, + "logits/rejected": -56077926.4, + "logps/chosen": -466.0755092075893, + "logps/rejected": -608.82744140625, + "loss": 0.022, + "rewards/chosen": 9.35600825718471, + "rewards/margins": 25.218787493024553, + "rewards/rejected": -15.862779235839843, + "step": 2914 + }, + { + "epoch": 0.7293882146878519, + "grad_norm": 7.8125, + "kl": 1.9821374416351318, + "learning_rate": 5e-06, + "logits/chosen": -42463597.333333336, + "logits/rejected": -55375717.333333336, + "logps/chosen": -326.7588297526042, + "logps/rejected": -620.8380533854166, + "loss": 0.0586, + "rewards/chosen": 9.717333475748697, + "rewards/margins": 22.147659301757812, + "rewards/rejected": -12.430325826009115, + "step": 2915 + }, + { + "epoch": 0.7296384336294257, + "grad_norm": 1.984375, + "kl": 8.246866226196289, + "learning_rate": 5e-06, + "logits/chosen": -61584128.0, + "logits/rejected": -11450625.333333334, + "logps/chosen": -390.3129611545139, + "logps/rejected": -757.2203776041666, + "loss": 0.0272, + "rewards/chosen": 10.31535169813368, + "rewards/margins": 25.350894504123264, + "rewards/rejected": -15.035542805989584, + "step": 2916 + }, + { + "epoch": 0.7298886525709997, + "grad_norm": 14.0, + "kl": 0.8370288610458374, + "learning_rate": 5e-06, + "logits/chosen": -60769780.36363637, + "logits/rejected": -53257511.384615384, + "logps/chosen": -349.55111416903407, + "logps/rejected": -661.6254507211538, + "loss": 0.0263, + "rewards/chosen": 8.101445978338068, + "rewards/margins": 25.340078180486508, + "rewards/rejected": -17.238632202148438, + "step": 2917 + }, + { + "epoch": 0.7301388715125735, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36238523.07692308, + "logits/rejected": -37300701.09090909, + "logps/chosen": -283.8169508713942, + "logps/rejected": -484.40926846590907, + "loss": 0.0436, + "rewards/chosen": 7.3023552527794475, + "rewards/margins": 19.44380977603939, + "rewards/rejected": -12.141454523259943, + "step": 2918 + }, + { + "epoch": 0.7303890904541473, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46183744.0, + "logits/rejected": -51360104.0, + "logps/chosen": -398.9495544433594, + "logps/rejected": -430.7995300292969, + "loss": 0.0303, + "rewards/chosen": 9.59018325805664, + "rewards/margins": 21.647807121276855, + "rewards/rejected": -12.057623863220215, + "step": 2919 + }, + { + "epoch": 0.7306393093957213, + "grad_norm": 1.84375, + "kl": 2.0275282859802246, + "learning_rate": 5e-06, + "logits/chosen": -17465004.8, + "logits/rejected": -69597417.14285715, + "logps/chosen": -374.5014404296875, + "logps/rejected": -804.4541713169643, + "loss": 0.038, + "rewards/chosen": 6.34039077758789, + "rewards/margins": 29.302790941510878, + "rewards/rejected": -22.96240016392299, + "step": 2920 + }, + { + "epoch": 0.7308895283372951, + "grad_norm": 5.71875, + "kl": 3.5363919734954834, + "learning_rate": 5e-06, + "logits/chosen": -58141568.0, + "logits/rejected": -27343834.0, + "logps/chosen": -347.19744873046875, + "logps/rejected": -661.2326049804688, + "loss": 0.0625, + "rewards/chosen": 8.154752731323242, + "rewards/margins": 28.26039695739746, + "rewards/rejected": -20.10564422607422, + "step": 2921 + }, + { + "epoch": 0.731139747278869, + "grad_norm": 4.625, + "kl": 6.7801408767700195, + "learning_rate": 5e-06, + "logits/chosen": 29274835.555555556, + "logits/rejected": -33269518.933333334, + "logps/chosen": -418.17095269097223, + "logps/rejected": -412.1693359375, + "loss": 0.0887, + "rewards/chosen": 6.2674755520290795, + "rewards/margins": 19.734837002224392, + "rewards/rejected": -13.467361450195312, + "step": 2922 + }, + { + "epoch": 0.7313899662204428, + "grad_norm": 6.5625, + "kl": 4.34232234954834, + "learning_rate": 5e-06, + "logits/chosen": -43058336.0, + "logits/rejected": -60684091.07692308, + "logps/chosen": -432.8409534801136, + "logps/rejected": -534.7397085336538, + "loss": 0.0248, + "rewards/chosen": 8.46261943470348, + "rewards/margins": 22.34682165826117, + "rewards/rejected": -13.884202223557692, + "step": 2923 + }, + { + "epoch": 0.7316401851620168, + "grad_norm": 9.8125, + "kl": 3.0046985149383545, + "learning_rate": 5e-06, + "logits/chosen": -54814560.0, + "logits/rejected": -31381462.85714286, + "logps/chosen": -354.921044921875, + "logps/rejected": -564.3184988839286, + "loss": 0.013, + "rewards/chosen": 7.616036987304687, + "rewards/margins": 23.745526341029574, + "rewards/rejected": -16.12948935372489, + "step": 2924 + }, + { + "epoch": 0.7318904041035906, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47689638.4, + "logits/rejected": -55047698.28571428, + "logps/chosen": -428.25849609375, + "logps/rejected": -640.8875558035714, + "loss": 0.0356, + "rewards/chosen": 10.024781799316406, + "rewards/margins": 25.258214024135043, + "rewards/rejected": -15.233432224818639, + "step": 2925 + }, + { + "epoch": 0.7321406230451645, + "grad_norm": 2.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35587772.44444445, + "logits/rejected": -27798566.4, + "logps/chosen": -388.9150119357639, + "logps/rejected": -520.5255533854166, + "loss": 0.0121, + "rewards/chosen": 9.299229092068142, + "rewards/margins": 27.011639573838977, + "rewards/rejected": -17.712410481770835, + "step": 2926 + }, + { + "epoch": 0.7323908419867384, + "grad_norm": 3.09375, + "kl": 2.576671600341797, + "learning_rate": 5e-06, + "logits/chosen": -78720736.0, + "logits/rejected": -73070960.0, + "logps/chosen": -496.5307210286458, + "logps/rejected": -702.7225748697916, + "loss": 0.0132, + "rewards/chosen": 7.878293355305989, + "rewards/margins": 26.099018096923828, + "rewards/rejected": -18.22072474161784, + "step": 2927 + }, + { + "epoch": 0.7326410609283123, + "grad_norm": 16.0, + "kl": 6.576563835144043, + "learning_rate": 5e-06, + "logits/chosen": -13577972.0, + "logits/rejected": -47587978.666666664, + "logps/chosen": -381.9519856770833, + "logps/rejected": -708.8515625, + "loss": 0.0432, + "rewards/chosen": 8.774944305419922, + "rewards/margins": 28.3846918741862, + "rewards/rejected": -19.609747568766277, + "step": 2928 + }, + { + "epoch": 0.7328912798698861, + "grad_norm": 2.75, + "kl": 0.8835741877555847, + "learning_rate": 5e-06, + "logits/chosen": -29939008.0, + "logits/rejected": -59251974.4, + "logps/chosen": -390.05130440848217, + "logps/rejected": -686.340869140625, + "loss": 0.0437, + "rewards/chosen": 9.324417114257812, + "rewards/margins": 31.848109436035156, + "rewards/rejected": -22.523692321777343, + "step": 2929 + }, + { + "epoch": 0.7331414988114601, + "grad_norm": 1.7109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -89944221.53846154, + "logits/rejected": -72079773.0909091, + "logps/chosen": -435.64310396634613, + "logps/rejected": -723.6170099431819, + "loss": 0.015, + "rewards/chosen": 8.803032508263222, + "rewards/margins": 31.64544528347629, + "rewards/rejected": -22.842412775213067, + "step": 2930 + }, + { + "epoch": 0.7333917177530339, + "grad_norm": 1.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58497408.0, + "logits/rejected": -35028264.0, + "logps/chosen": -510.8992106119792, + "logps/rejected": -672.0108235677084, + "loss": 0.0016, + "rewards/chosen": 9.429258346557617, + "rewards/margins": 30.553325017293293, + "rewards/rejected": -21.124066670735676, + "step": 2931 + }, + { + "epoch": 0.7336419366946078, + "grad_norm": 1.5625, + "kl": 0.9634284973144531, + "learning_rate": 5e-06, + "logits/chosen": -42695528.72727273, + "logits/rejected": -45854798.76923077, + "logps/chosen": -410.88059303977275, + "logps/rejected": -517.5489783653846, + "loss": 0.0022, + "rewards/chosen": 9.711400812322443, + "rewards/margins": 23.893248017851292, + "rewards/rejected": -14.181847205528847, + "step": 2932 + }, + { + "epoch": 0.7338921556361817, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69670574.54545455, + "logits/rejected": -44802038.15384615, + "logps/chosen": -401.51669034090907, + "logps/rejected": -515.5558894230769, + "loss": 0.0206, + "rewards/chosen": 9.228928305886008, + "rewards/margins": 23.7736283415681, + "rewards/rejected": -14.54470003568209, + "step": 2933 + }, + { + "epoch": 0.7341423745777556, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45046835.2, + "logits/rejected": -50042244.571428575, + "logps/chosen": -329.166845703125, + "logps/rejected": -646.5944475446429, + "loss": 0.0198, + "rewards/chosen": 7.034855651855469, + "rewards/margins": 23.91043439592634, + "rewards/rejected": -16.87557874407087, + "step": 2934 + }, + { + "epoch": 0.7343925935193294, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34650413.71428572, + "logits/rejected": -50884422.4, + "logps/chosen": -338.978759765625, + "logps/rejected": -596.565283203125, + "loss": 0.0268, + "rewards/chosen": 8.019074031284877, + "rewards/margins": 29.139529963902064, + "rewards/rejected": -21.120455932617187, + "step": 2935 + }, + { + "epoch": 0.7346428124609032, + "grad_norm": 1.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54149321.84615385, + "logits/rejected": -50369792.0, + "logps/chosen": -406.34867037259613, + "logps/rejected": -593.7598544034091, + "loss": 0.0291, + "rewards/chosen": 9.055301372821514, + "rewards/margins": 27.88132065993089, + "rewards/rejected": -18.826019287109375, + "step": 2936 + }, + { + "epoch": 0.7348930314024772, + "grad_norm": 6.4375, + "kl": 0.33437079191207886, + "learning_rate": 5e-06, + "logits/chosen": -72834122.66666667, + "logits/rejected": -76415898.66666667, + "logps/chosen": -329.2901611328125, + "logps/rejected": -690.5845540364584, + "loss": 0.1164, + "rewards/chosen": 6.727304458618164, + "rewards/margins": 28.634594599405926, + "rewards/rejected": -21.90729014078776, + "step": 2937 + }, + { + "epoch": 0.735143250344051, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63678432.0, + "logits/rejected": -70501248.0, + "logps/chosen": -435.1626383463542, + "logps/rejected": -518.6283365885416, + "loss": 0.0021, + "rewards/chosen": 9.546963373819986, + "rewards/margins": 23.129985173543293, + "rewards/rejected": -13.583021799723307, + "step": 2938 + }, + { + "epoch": 0.7353934692856249, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67074065.45454545, + "logits/rejected": -48072832.0, + "logps/chosen": -296.30171342329544, + "logps/rejected": -751.7020733173077, + "loss": 0.0588, + "rewards/chosen": 4.480669888583097, + "rewards/margins": 23.95682979130245, + "rewards/rejected": -19.476159902719353, + "step": 2939 + }, + { + "epoch": 0.7356436882271988, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -87865914.18181819, + "logits/rejected": -88435101.53846154, + "logps/chosen": -387.6380060369318, + "logps/rejected": -876.8295522836538, + "loss": 0.0209, + "rewards/chosen": 7.457321860573509, + "rewards/margins": 30.319434052580718, + "rewards/rejected": -22.86211219200721, + "step": 2940 + }, + { + "epoch": 0.7358939071687727, + "grad_norm": 2.640625, + "kl": 3.472078323364258, + "learning_rate": 5e-06, + "logits/chosen": -1899634.6666666667, + "logits/rejected": -76914805.33333333, + "logps/chosen": -258.3375651041667, + "logps/rejected": -655.2002766927084, + "loss": 0.0496, + "rewards/chosen": 7.267150243123372, + "rewards/margins": 24.603605270385742, + "rewards/rejected": -17.33645502726237, + "step": 2941 + }, + { + "epoch": 0.7361441261103465, + "grad_norm": 0.62109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49618426.18181818, + "logits/rejected": -47196440.615384616, + "logps/chosen": -395.06063565340907, + "logps/rejected": -590.3279371995193, + "loss": 0.0014, + "rewards/chosen": 9.711784362792969, + "rewards/margins": 29.8503905076247, + "rewards/rejected": -20.13860614483173, + "step": 2942 + }, + { + "epoch": 0.7363943450519205, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51202466.90909091, + "logits/rejected": -60918606.76923077, + "logps/chosen": -473.97567471590907, + "logps/rejected": -622.8345853365385, + "loss": 0.0151, + "rewards/chosen": 8.358546170321377, + "rewards/margins": 25.67778116506296, + "rewards/rejected": -17.319234994741585, + "step": 2943 + }, + { + "epoch": 0.7366445639934943, + "grad_norm": 6.71875, + "kl": 1.7836418151855469, + "learning_rate": 5e-06, + "logits/chosen": -27899979.636363637, + "logits/rejected": -35651584.0, + "logps/chosen": -388.77689985795456, + "logps/rejected": -888.5279447115385, + "loss": 0.0358, + "rewards/chosen": 8.221864180131393, + "rewards/margins": 37.125407505702306, + "rewards/rejected": -28.903543325570915, + "step": 2944 + }, + { + "epoch": 0.7368947829350682, + "grad_norm": 10.125, + "kl": 4.3756208419799805, + "learning_rate": 5e-06, + "logits/chosen": -49095974.4, + "logits/rejected": -55985464.88888889, + "logps/chosen": -358.934375, + "logps/rejected": -759.603515625, + "loss": 0.032, + "rewards/chosen": 6.598445129394531, + "rewards/margins": 25.893936665852863, + "rewards/rejected": -19.295491536458332, + "step": 2945 + }, + { + "epoch": 0.7371450018766421, + "grad_norm": 10.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -77458002.28571428, + "logits/rejected": -36475494.4, + "logps/chosen": -438.35982840401783, + "logps/rejected": -576.227734375, + "loss": 0.0342, + "rewards/chosen": 7.649029867989676, + "rewards/margins": 26.945779745919364, + "rewards/rejected": -19.296749877929688, + "step": 2946 + }, + { + "epoch": 0.737395220818216, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25656273.454545453, + "logits/rejected": -26174281.846153848, + "logps/chosen": -385.1710759943182, + "logps/rejected": -515.7978140024038, + "loss": 0.0184, + "rewards/chosen": 6.387058604847301, + "rewards/margins": 22.33079123330283, + "rewards/rejected": -15.943732628455528, + "step": 2947 + }, + { + "epoch": 0.7376454397597898, + "grad_norm": 11.0625, + "kl": 2.1957292556762695, + "learning_rate": 5e-06, + "logits/chosen": -24279021.714285713, + "logits/rejected": -72876326.4, + "logps/chosen": -350.51820591517856, + "logps/rejected": -663.02265625, + "loss": 0.0872, + "rewards/chosen": 7.170877729143415, + "rewards/margins": 23.55446079799107, + "rewards/rejected": -16.383583068847656, + "step": 2948 + }, + { + "epoch": 0.7378956587013636, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83296090.66666667, + "logits/rejected": -43526138.666666664, + "logps/chosen": -461.2036539713542, + "logps/rejected": -671.3147786458334, + "loss": 0.014, + "rewards/chosen": 9.120169321695963, + "rewards/margins": 27.264695485432945, + "rewards/rejected": -18.14452616373698, + "step": 2949 + }, + { + "epoch": 0.7381458776429376, + "grad_norm": 2.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55780235.63636363, + "logits/rejected": -22258387.692307692, + "logps/chosen": -493.10014204545456, + "logps/rejected": -507.0874774639423, + "loss": 0.0201, + "rewards/chosen": 8.55664756081321, + "rewards/margins": 21.278862479683404, + "rewards/rejected": -12.722214918870192, + "step": 2950 + }, + { + "epoch": 0.7383960965845114, + "grad_norm": 5.6875, + "kl": 0.29381608963012695, + "learning_rate": 5e-06, + "logits/chosen": -50227168.0, + "logits/rejected": -74008890.66666667, + "logps/chosen": -332.99159071180554, + "logps/rejected": -817.2894694010416, + "loss": 0.0255, + "rewards/chosen": 7.9797253078884545, + "rewards/margins": 28.087435828314888, + "rewards/rejected": -20.107710520426433, + "step": 2951 + }, + { + "epoch": 0.7386463155260853, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40485405.538461536, + "logits/rejected": -62266839.27272727, + "logps/chosen": -300.7621319110577, + "logps/rejected": -791.8264382102273, + "loss": 0.0629, + "rewards/chosen": 7.577552208533654, + "rewards/margins": 28.469418719098282, + "rewards/rejected": -20.89186651056463, + "step": 2952 + }, + { + "epoch": 0.7388965344676592, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50676877.71428572, + "logits/rejected": -62268723.2, + "logps/chosen": -325.12894112723217, + "logps/rejected": -737.095068359375, + "loss": 0.037, + "rewards/chosen": 7.825897216796875, + "rewards/margins": 29.293589782714843, + "rewards/rejected": -21.467692565917968, + "step": 2953 + }, + { + "epoch": 0.7391467534092331, + "grad_norm": 12.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55917870.54545455, + "logits/rejected": -43009624.615384616, + "logps/chosen": -307.96388938210225, + "logps/rejected": -596.3366135817307, + "loss": 0.0296, + "rewards/chosen": 6.905091025612571, + "rewards/margins": 19.052472147908244, + "rewards/rejected": -12.147381122295673, + "step": 2954 + }, + { + "epoch": 0.7393969723508069, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35241242.666666664, + "logits/rejected": -49745301.333333336, + "logps/chosen": -412.0762125651042, + "logps/rejected": -644.2266845703125, + "loss": 0.017, + "rewards/chosen": 8.869913101196289, + "rewards/margins": 27.08336067199707, + "rewards/rejected": -18.21344757080078, + "step": 2955 + }, + { + "epoch": 0.7396471912923809, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24585198.545454547, + "logits/rejected": -56450372.92307692, + "logps/chosen": -466.87056107954544, + "logps/rejected": -762.9330679086538, + "loss": 0.0306, + "rewards/chosen": 9.220947265625, + "rewards/margins": 28.915724534254807, + "rewards/rejected": -19.694777268629807, + "step": 2956 + }, + { + "epoch": 0.7398974102339547, + "grad_norm": 6.25, + "kl": 2.567213773727417, + "learning_rate": 5e-06, + "logits/chosen": -32204025.6, + "logits/rejected": -36917560.88888889, + "logps/chosen": -370.80201822916666, + "logps/rejected": -656.787109375, + "loss": 0.0251, + "rewards/chosen": 8.222299702962239, + "rewards/margins": 22.904616800944012, + "rewards/rejected": -14.682317097981771, + "step": 2957 + }, + { + "epoch": 0.7401476291755286, + "grad_norm": 3.875, + "kl": 9.299044609069824, + "learning_rate": 5e-06, + "logits/chosen": -52585413.81818182, + "logits/rejected": -38020558.76923077, + "logps/chosen": -390.34428267045456, + "logps/rejected": -632.4707782451923, + "loss": 0.0744, + "rewards/chosen": 9.027475530450994, + "rewards/margins": 28.22293944458861, + "rewards/rejected": -19.19546391413762, + "step": 2958 + }, + { + "epoch": 0.7403978481171025, + "grad_norm": 0.9765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45409446.4, + "logits/rejected": -16195092.57142857, + "logps/chosen": -291.617626953125, + "logps/rejected": -841.8005022321429, + "loss": 0.0163, + "rewards/chosen": 7.878179931640625, + "rewards/margins": 28.662653895786832, + "rewards/rejected": -20.784473964146205, + "step": 2959 + }, + { + "epoch": 0.7406480670586764, + "grad_norm": 4.90625, + "kl": 10.160751342773438, + "learning_rate": 5e-06, + "logits/chosen": -61351449.6, + "logits/rejected": -42780306.28571428, + "logps/chosen": -563.00634765625, + "logps/rejected": -519.4230608258929, + "loss": 0.0064, + "rewards/chosen": 11.598554229736328, + "rewards/margins": 23.98524682181222, + "rewards/rejected": -12.386692592075892, + "step": 2960 + }, + { + "epoch": 0.7408982860002502, + "grad_norm": 19.5, + "kl": 9.81986141204834, + "learning_rate": 5e-06, + "logits/chosen": -12199661.47368421, + "logits/rejected": -25652560.0, + "logps/chosen": -345.3108552631579, + "logps/rejected": -449.408447265625, + "loss": 0.0786, + "rewards/chosen": 7.412800035978618, + "rewards/margins": 22.00080357601768, + "rewards/rejected": -14.588003540039063, + "step": 2961 + }, + { + "epoch": 0.741148504941824, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60431061.333333336, + "logits/rejected": -25901303.466666665, + "logps/chosen": -406.03138563368054, + "logps/rejected": -545.0197265625, + "loss": 0.0625, + "rewards/chosen": 9.985044691297743, + "rewards/margins": 22.85797356499566, + "rewards/rejected": -12.872928873697917, + "step": 2962 + }, + { + "epoch": 0.741398723883398, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37519982.93333333, + "logits/rejected": -32425891.555555556, + "logps/chosen": -352.4009114583333, + "logps/rejected": -728.9539388020834, + "loss": 0.0217, + "rewards/chosen": 9.596675618489583, + "rewards/margins": 25.51347452799479, + "rewards/rejected": -15.916798909505209, + "step": 2963 + }, + { + "epoch": 0.7416489428249718, + "grad_norm": 0.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43942478.76923077, + "logits/rejected": -24516846.545454547, + "logps/chosen": -449.21146334134613, + "logps/rejected": -455.12619850852275, + "loss": 0.0021, + "rewards/chosen": 9.571009709284855, + "rewards/margins": 22.927006968251476, + "rewards/rejected": -13.35599725896662, + "step": 2964 + }, + { + "epoch": 0.7418991617665457, + "grad_norm": 9.4375, + "kl": 0.03052012249827385, + "learning_rate": 5e-06, + "logits/chosen": -41680084.0, + "logits/rejected": -21839496.0, + "logps/chosen": -286.52325439453125, + "logps/rejected": -364.37060546875, + "loss": 0.0692, + "rewards/chosen": 6.339506149291992, + "rewards/margins": 20.323403358459473, + "rewards/rejected": -13.98389720916748, + "step": 2965 + }, + { + "epoch": 0.7421493807081196, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -81365592.0, + "logits/rejected": -44765304.0, + "logps/chosen": -330.15960693359375, + "logps/rejected": -563.4014892578125, + "loss": 0.0408, + "rewards/chosen": 6.771461009979248, + "rewards/margins": 20.151944637298584, + "rewards/rejected": -13.380483627319336, + "step": 2966 + }, + { + "epoch": 0.7423995996496935, + "grad_norm": 11.625, + "kl": 9.78278923034668, + "learning_rate": 5e-06, + "logits/chosen": -43358208.0, + "logits/rejected": -27435324.444444444, + "logps/chosen": -406.5504557291667, + "logps/rejected": -589.2847222222222, + "loss": 0.0783, + "rewards/chosen": 8.017154947916667, + "rewards/margins": 22.23306342230903, + "rewards/rejected": -14.21590847439236, + "step": 2967 + }, + { + "epoch": 0.7426498185912673, + "grad_norm": 21.5, + "kl": 8.140419006347656, + "learning_rate": 5e-06, + "logits/chosen": -40114585.6, + "logits/rejected": -13866734.222222222, + "logps/chosen": -368.9765299479167, + "logps/rejected": -728.7565104166666, + "loss": 0.0326, + "rewards/chosen": 8.143738810221354, + "rewards/margins": 25.86013895670573, + "rewards/rejected": -17.716400146484375, + "step": 2968 + }, + { + "epoch": 0.7429000375328413, + "grad_norm": 6.15625, + "kl": 1.3197168111801147, + "learning_rate": 5e-06, + "logits/chosen": -58030759.384615384, + "logits/rejected": -55027019.63636363, + "logps/chosen": -352.02095853365387, + "logps/rejected": -688.3469460227273, + "loss": 0.0379, + "rewards/chosen": 6.4862518310546875, + "rewards/margins": 21.586529818448156, + "rewards/rejected": -15.100277987393467, + "step": 2969 + }, + { + "epoch": 0.7431502564744151, + "grad_norm": 1.609375, + "kl": 17.18589973449707, + "learning_rate": 5e-06, + "logits/chosen": -49485732.0, + "logits/rejected": -104854720.0, + "logps/chosen": -339.5189514160156, + "logps/rejected": -717.4119873046875, + "loss": 0.095, + "rewards/chosen": 8.353102684020996, + "rewards/margins": 24.882023811340332, + "rewards/rejected": -16.528921127319336, + "step": 2970 + }, + { + "epoch": 0.743400475415989, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42508022.15384615, + "logits/rejected": -113613067.63636364, + "logps/chosen": -347.6548602764423, + "logps/rejected": -687.1920276988636, + "loss": 0.0211, + "rewards/chosen": 8.665538494403545, + "rewards/margins": 27.968358873487354, + "rewards/rejected": -19.30282037908381, + "step": 2971 + }, + { + "epoch": 0.7436506943575628, + "grad_norm": 5.4375, + "kl": 11.856978416442871, + "learning_rate": 5e-06, + "logits/chosen": -67924996.57142857, + "logits/rejected": -22442825.6, + "logps/chosen": -357.82589285714283, + "logps/rejected": -481.322216796875, + "loss": 0.0256, + "rewards/chosen": 8.960338592529297, + "rewards/margins": 21.452692413330077, + "rewards/rejected": -12.492353820800782, + "step": 2972 + }, + { + "epoch": 0.7439009132991368, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38896640.0, + "logits/rejected": -72241326.54545455, + "logps/chosen": -437.0188551682692, + "logps/rejected": -694.5090553977273, + "loss": 0.0023, + "rewards/chosen": 10.424429086538462, + "rewards/margins": 26.08788539646389, + "rewards/rejected": -15.663456309925426, + "step": 2973 + }, + { + "epoch": 0.7441511322407106, + "grad_norm": 6.71875, + "kl": 0.38140615820884705, + "learning_rate": 5e-06, + "logits/chosen": -48950171.428571425, + "logits/rejected": -72763993.6, + "logps/chosen": -342.34273856026783, + "logps/rejected": -750.2884765625, + "loss": 0.0324, + "rewards/chosen": 7.345164707728794, + "rewards/margins": 23.47940237862723, + "rewards/rejected": -16.134237670898436, + "step": 2974 + }, + { + "epoch": 0.7444013511822845, + "grad_norm": 12.4375, + "kl": 3.899538278579712, + "learning_rate": 5e-06, + "logits/chosen": -37282176.0, + "logits/rejected": 1396354.4, + "logps/chosen": -363.71212332589283, + "logps/rejected": -597.833251953125, + "loss": 0.0477, + "rewards/chosen": 9.398117065429688, + "rewards/margins": 22.14468002319336, + "rewards/rejected": -12.746562957763672, + "step": 2975 + }, + { + "epoch": 0.7446515701238584, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34198372.0, + "logits/rejected": -40790200.0, + "logps/chosen": -274.5306701660156, + "logps/rejected": -589.3925170898438, + "loss": 0.0369, + "rewards/chosen": 7.223034858703613, + "rewards/margins": 22.51541233062744, + "rewards/rejected": -15.292377471923828, + "step": 2976 + }, + { + "epoch": 0.7449017890654323, + "grad_norm": 6.1875, + "kl": 1.7482306957244873, + "learning_rate": 5e-06, + "logits/chosen": -48254997.333333336, + "logits/rejected": -82607473.77777778, + "logps/chosen": -434.34983723958334, + "logps/rejected": -483.54969618055554, + "loss": 0.0391, + "rewards/chosen": 8.5630615234375, + "rewards/margins": 19.870081922743054, + "rewards/rejected": -11.307020399305555, + "step": 2977 + }, + { + "epoch": 0.7451520080070061, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33927266.666666664, + "logits/rejected": -37798794.666666664, + "logps/chosen": -283.9439697265625, + "logps/rejected": -504.4635416666667, + "loss": 0.0343, + "rewards/chosen": 7.0659230550130205, + "rewards/margins": 22.59581247965495, + "rewards/rejected": -15.529889424641928, + "step": 2978 + }, + { + "epoch": 0.74540222694858, + "grad_norm": 11.6875, + "kl": 8.553287506103516, + "learning_rate": 5e-06, + "logits/chosen": -40960466.28571428, + "logits/rejected": -65892723.2, + "logps/chosen": -427.33443777901783, + "logps/rejected": -676.719482421875, + "loss": 0.0517, + "rewards/chosen": 8.275485447474889, + "rewards/margins": 23.598262241908483, + "rewards/rejected": -15.322776794433594, + "step": 2979 + }, + { + "epoch": 0.7456524458901539, + "grad_norm": 2.34375, + "kl": 0.613861083984375, + "learning_rate": 5e-06, + "logits/chosen": -14989211.076923076, + "logits/rejected": -51753105.45454545, + "logps/chosen": -253.10486778846155, + "logps/rejected": -691.1922940340909, + "loss": 0.035, + "rewards/chosen": 6.399891779972957, + "rewards/margins": 25.441764671485743, + "rewards/rejected": -19.041872891512785, + "step": 2980 + }, + { + "epoch": 0.7459026648317277, + "grad_norm": 8.25, + "kl": 2.1561267375946045, + "learning_rate": 5e-06, + "logits/chosen": -37089109.333333336, + "logits/rejected": -54329349.333333336, + "logps/chosen": -347.764892578125, + "logps/rejected": -759.8035481770834, + "loss": 0.0291, + "rewards/chosen": 8.871681849161783, + "rewards/margins": 23.838351567586262, + "rewards/rejected": -14.966669718424479, + "step": 2981 + }, + { + "epoch": 0.7461528837733017, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72515507.2, + "logits/rejected": -60621284.571428575, + "logps/chosen": -533.13037109375, + "logps/rejected": -499.085693359375, + "loss": 0.0196, + "rewards/chosen": 12.613286590576172, + "rewards/margins": 27.533389391217913, + "rewards/rejected": -14.920102800641741, + "step": 2982 + }, + { + "epoch": 0.7464031027148755, + "grad_norm": 1.2109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29078661.333333332, + "logits/rejected": -48593381.333333336, + "logps/chosen": -346.7162679036458, + "logps/rejected": -597.641357421875, + "loss": 0.0132, + "rewards/chosen": 9.14849853515625, + "rewards/margins": 24.08013153076172, + "rewards/rejected": -14.931632995605469, + "step": 2983 + }, + { + "epoch": 0.7466533216564494, + "grad_norm": 3.015625, + "kl": 11.07688045501709, + "learning_rate": 5e-06, + "logits/chosen": -49433834.666666664, + "logits/rejected": -71199367.1111111, + "logps/chosen": -412.8169921875, + "logps/rejected": -430.26915147569446, + "loss": 0.0049, + "rewards/chosen": 10.532106526692708, + "rewards/margins": 21.990336439344617, + "rewards/rejected": -11.458229912651909, + "step": 2984 + }, + { + "epoch": 0.7469035405980232, + "grad_norm": 10.9375, + "kl": 9.502801895141602, + "learning_rate": 5e-06, + "logits/chosen": -34099876.0, + "logits/rejected": -48263896.0, + "logps/chosen": -320.6567687988281, + "logps/rejected": -480.9551086425781, + "loss": 0.0763, + "rewards/chosen": 8.094695091247559, + "rewards/margins": 23.50810146331787, + "rewards/rejected": -15.413406372070312, + "step": 2985 + }, + { + "epoch": 0.7471537595395972, + "grad_norm": 14.9375, + "kl": 22.94472885131836, + "learning_rate": 5e-06, + "logits/chosen": -59953979.07692308, + "logits/rejected": -35467269.81818182, + "logps/chosen": -392.5456730769231, + "logps/rejected": -715.5534002130681, + "loss": 0.08, + "rewards/chosen": 10.32413541353666, + "rewards/margins": 28.636009803185097, + "rewards/rejected": -18.311874389648438, + "step": 2986 + }, + { + "epoch": 0.747403978481171, + "grad_norm": 11.625, + "kl": 1.1233184337615967, + "learning_rate": 5e-06, + "logits/chosen": -66068166.4, + "logits/rejected": -43138884.571428575, + "logps/chosen": -396.4162109375, + "logps/rejected": -550.4259556361607, + "loss": 0.0541, + "rewards/chosen": 9.839991760253906, + "rewards/margins": 20.280921718052454, + "rewards/rejected": -10.440929957798549, + "step": 2987 + }, + { + "epoch": 0.7476541974227449, + "grad_norm": 2.734375, + "kl": 3.221097469329834, + "learning_rate": 5e-06, + "logits/chosen": -18861942.153846152, + "logits/rejected": -24949486.545454547, + "logps/chosen": -313.9387958233173, + "logps/rejected": -503.43110795454544, + "loss": 0.0625, + "rewards/chosen": 6.365314190204327, + "rewards/margins": 21.060625303041682, + "rewards/rejected": -14.695311112837357, + "step": 2988 + }, + { + "epoch": 0.7479044163643188, + "grad_norm": 8.125, + "kl": 11.11878776550293, + "learning_rate": 5e-06, + "logits/chosen": -50350862.76923077, + "logits/rejected": -34689320.72727273, + "logps/chosen": -479.5060847355769, + "logps/rejected": -444.9208984375, + "loss": 0.0126, + "rewards/chosen": 10.012110783503605, + "rewards/margins": 21.642028968650976, + "rewards/rejected": -11.629918185147373, + "step": 2989 + }, + { + "epoch": 0.7481546353058927, + "grad_norm": 6.0, + "kl": 11.338481903076172, + "learning_rate": 5e-06, + "logits/chosen": -43079992.0, + "logits/rejected": -66883792.0, + "logps/chosen": -469.964111328125, + "logps/rejected": -660.2730712890625, + "loss": 0.0147, + "rewards/chosen": 10.641860008239746, + "rewards/margins": 25.54690170288086, + "rewards/rejected": -14.905041694641113, + "step": 2990 + }, + { + "epoch": 0.7484048542474665, + "grad_norm": 22.875, + "kl": 7.835423946380615, + "learning_rate": 5e-06, + "logits/chosen": -32104174.933333334, + "logits/rejected": -10760193.777777778, + "logps/chosen": -394.1509114583333, + "logps/rejected": -656.6400824652778, + "loss": 0.0681, + "rewards/chosen": 8.584361775716145, + "rewards/margins": 23.711491224500868, + "rewards/rejected": -15.127129448784721, + "step": 2991 + }, + { + "epoch": 0.7486550731890405, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32586333.333333332, + "logits/rejected": -11311970.666666666, + "logps/chosen": -402.3231608072917, + "logps/rejected": -730.009765625, + "loss": 0.0494, + "rewards/chosen": 9.422327677408854, + "rewards/margins": 29.532058715820312, + "rewards/rejected": -20.109731038411457, + "step": 2992 + }, + { + "epoch": 0.7489052921306143, + "grad_norm": 5.90625, + "kl": 1.0760133266448975, + "learning_rate": 5e-06, + "logits/chosen": -52827292.8, + "logits/rejected": -16611166.857142856, + "logps/chosen": -443.4484375, + "logps/rejected": -631.5638253348214, + "loss": 0.0192, + "rewards/chosen": 9.146554565429687, + "rewards/margins": 26.21226545061384, + "rewards/rejected": -17.065710885184153, + "step": 2993 + }, + { + "epoch": 0.7491555110721881, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62099514.18181818, + "logits/rejected": -64894946.461538464, + "logps/chosen": -415.0498046875, + "logps/rejected": -728.9068509615385, + "loss": 0.0612, + "rewards/chosen": 9.971081126819957, + "rewards/margins": 25.58747634354171, + "rewards/rejected": -15.616395216721754, + "step": 2994 + }, + { + "epoch": 0.7494057300137621, + "grad_norm": 10.25, + "kl": 3.3944811820983887, + "learning_rate": 5e-06, + "logits/chosen": -82164760.61538461, + "logits/rejected": -49706496.0, + "logps/chosen": -397.4314152644231, + "logps/rejected": -761.3055752840909, + "loss": 0.0506, + "rewards/chosen": 7.565390953650842, + "rewards/margins": 31.46845880254999, + "rewards/rejected": -23.90306784889915, + "step": 2995 + }, + { + "epoch": 0.7496559489553359, + "grad_norm": 8.875, + "kl": 1.0490138530731201, + "learning_rate": 5e-06, + "logits/chosen": -38914734.54545455, + "logits/rejected": -27044667.076923076, + "logps/chosen": -413.35373757102275, + "logps/rejected": -360.2961989182692, + "loss": 0.0566, + "rewards/chosen": 8.48189267245206, + "rewards/margins": 18.102880784681627, + "rewards/rejected": -9.620988112229567, + "step": 2996 + }, + { + "epoch": 0.7499061678969098, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62856738.90909091, + "logits/rejected": -13626338.461538462, + "logps/chosen": -485.50834517045456, + "logps/rejected": -637.4113581730769, + "loss": 0.0045, + "rewards/chosen": 8.025883067737926, + "rewards/margins": 26.146242048356918, + "rewards/rejected": -18.12035898061899, + "step": 2997 + }, + { + "epoch": 0.7501563868384836, + "grad_norm": 1.8203125, + "kl": 18.09365463256836, + "learning_rate": 5e-06, + "logits/chosen": -50235293.86666667, + "logits/rejected": -52473194.666666664, + "logps/chosen": -394.6966145833333, + "logps/rejected": -527.9289279513889, + "loss": 0.0283, + "rewards/chosen": 9.308648681640625, + "rewards/margins": 24.848753865559896, + "rewards/rejected": -15.540105183919271, + "step": 2998 + }, + { + "epoch": 0.7504066057800576, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22388499.692307692, + "logits/rejected": -69936779.63636364, + "logps/chosen": -402.82962740384613, + "logps/rejected": -725.9158380681819, + "loss": 0.028, + "rewards/chosen": 7.967340909517729, + "rewards/margins": 25.986092574112902, + "rewards/rejected": -18.01875166459517, + "step": 2999 + }, + { + "epoch": 0.7506568247216314, + "grad_norm": 1.7734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -76999744.0, + "logits/rejected": -63018724.571428575, + "logps/chosen": -417.80234375, + "logps/rejected": -796.9739118303571, + "loss": 0.0131, + "rewards/chosen": 9.845954895019531, + "rewards/margins": 33.757940019880024, + "rewards/rejected": -23.91198512486049, + "step": 3000 + }, + { + "epoch": 0.7509070436632053, + "grad_norm": 10.125, + "kl": 1.8281219005584717, + "learning_rate": 5e-06, + "logits/chosen": -54846262.85714286, + "logits/rejected": -44065548.8, + "logps/chosen": -475.9663783482143, + "logps/rejected": -626.612744140625, + "loss": 0.0161, + "rewards/chosen": 10.405157906668526, + "rewards/margins": 29.78905988420759, + "rewards/rejected": -19.38390197753906, + "step": 3001 + }, + { + "epoch": 0.7511572626047792, + "grad_norm": 1.3515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54012590.54545455, + "logits/rejected": -38756187.07692308, + "logps/chosen": -453.06582919034093, + "logps/rejected": -579.7786207932693, + "loss": 0.0016, + "rewards/chosen": 10.06178144975142, + "rewards/margins": 24.92873889416248, + "rewards/rejected": -14.866957444411058, + "step": 3002 + }, + { + "epoch": 0.7514074815463531, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58223310.222222224, + "logits/rejected": -50867575.46666667, + "logps/chosen": -405.23155381944446, + "logps/rejected": -686.0877604166667, + "loss": 0.0196, + "rewards/chosen": 8.472842746310764, + "rewards/margins": 27.707767062717014, + "rewards/rejected": -19.23492431640625, + "step": 3003 + }, + { + "epoch": 0.7516577004879269, + "grad_norm": 12.0, + "kl": 0.04863230511546135, + "learning_rate": 5e-06, + "logits/chosen": -50753301.333333336, + "logits/rejected": -28677317.333333332, + "logps/chosen": -337.3025716145833, + "logps/rejected": -470.729248046875, + "loss": 0.0479, + "rewards/chosen": 8.101856231689453, + "rewards/margins": 23.561314900716148, + "rewards/rejected": -15.459458669026693, + "step": 3004 + }, + { + "epoch": 0.7519079194295009, + "grad_norm": 0.126953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61957233.777777776, + "logits/rejected": -46115251.2, + "logps/chosen": -405.4639485677083, + "logps/rejected": -768.2690104166667, + "loss": 0.0003, + "rewards/chosen": 8.119927300347221, + "rewards/margins": 33.370000542534726, + "rewards/rejected": -25.2500732421875, + "step": 3005 + }, + { + "epoch": 0.7521581383710747, + "grad_norm": 5.71875, + "kl": 1.2892125844955444, + "learning_rate": 5e-06, + "logits/chosen": -67372501.33333333, + "logits/rejected": -68092364.8, + "logps/chosen": -448.887451171875, + "logps/rejected": -645.5022786458334, + "loss": 0.0134, + "rewards/chosen": 8.635713365342882, + "rewards/margins": 27.38247850206163, + "rewards/rejected": -18.74676513671875, + "step": 3006 + }, + { + "epoch": 0.7524083573126485, + "grad_norm": 0.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30905120.0, + "logits/rejected": -46393743.058823526, + "logps/chosen": -329.446533203125, + "logps/rejected": -614.9485294117648, + "loss": 0.0013, + "rewards/chosen": 8.632052285330635, + "rewards/margins": 27.0705539799538, + "rewards/rejected": -18.438501694623163, + "step": 3007 + }, + { + "epoch": 0.7526585762542225, + "grad_norm": 3.125, + "kl": 2.888256072998047, + "learning_rate": 5e-06, + "logits/chosen": -32075693.333333332, + "logits/rejected": -20852284.0, + "logps/chosen": -382.2194010416667, + "logps/rejected": -442.8057047526042, + "loss": 0.0606, + "rewards/chosen": 10.252699534098307, + "rewards/margins": 19.899840037027992, + "rewards/rejected": -9.647140502929688, + "step": 3008 + }, + { + "epoch": 0.7529087951957963, + "grad_norm": 9.9375, + "kl": 7.660856246948242, + "learning_rate": 5e-06, + "logits/chosen": -35853926.4, + "logits/rejected": -41917385.14285714, + "logps/chosen": -342.877197265625, + "logps/rejected": -409.59898158482144, + "loss": 0.0914, + "rewards/chosen": 6.096869659423828, + "rewards/margins": 18.62328153337751, + "rewards/rejected": -12.526411873953682, + "step": 3009 + }, + { + "epoch": 0.7531590141373702, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69457749.33333333, + "logits/rejected": -50875912.53333333, + "logps/chosen": -339.67206488715277, + "logps/rejected": -554.9678385416667, + "loss": 0.0301, + "rewards/chosen": 6.507584889729817, + "rewards/margins": 23.42615585327148, + "rewards/rejected": -16.918570963541665, + "step": 3010 + }, + { + "epoch": 0.753409233078944, + "grad_norm": 0.65234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41114216.0, + "logits/rejected": -31974850.666666668, + "logps/chosen": -439.2692057291667, + "logps/rejected": -519.2866617838541, + "loss": 0.0012, + "rewards/chosen": 10.649532953898111, + "rewards/margins": 24.9608097076416, + "rewards/rejected": -14.31127675374349, + "step": 3011 + }, + { + "epoch": 0.753659452020518, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43821417.6, + "logits/rejected": -43697280.0, + "logps/chosen": -471.659619140625, + "logps/rejected": -650.6641322544643, + "loss": 0.0233, + "rewards/chosen": 7.995932769775391, + "rewards/margins": 25.316506849016463, + "rewards/rejected": -17.320574079241073, + "step": 3012 + }, + { + "epoch": 0.7539096709620918, + "grad_norm": 10.3125, + "kl": 8.64933967590332, + "learning_rate": 5e-06, + "logits/chosen": -49860694.85714286, + "logits/rejected": -62711168.0, + "logps/chosen": -389.60777064732144, + "logps/rejected": -680.565234375, + "loss": 0.0629, + "rewards/chosen": 8.606111798967634, + "rewards/margins": 23.759310041155132, + "rewards/rejected": -15.1531982421875, + "step": 3013 + }, + { + "epoch": 0.7541598899036657, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49506026.666666664, + "logits/rejected": -61228134.4, + "logps/chosen": -308.4492458767361, + "logps/rejected": -723.5415364583333, + "loss": 0.0386, + "rewards/chosen": 6.334142896864149, + "rewards/margins": 24.827406650119357, + "rewards/rejected": -18.49326375325521, + "step": 3014 + }, + { + "epoch": 0.7544101088452396, + "grad_norm": 3.5625, + "kl": 0.3426574170589447, + "learning_rate": 5e-06, + "logits/chosen": -31510262.153846152, + "logits/rejected": -39597812.36363637, + "logps/chosen": -381.72787710336536, + "logps/rejected": -633.2786754261364, + "loss": 0.0375, + "rewards/chosen": 8.678029573880709, + "rewards/margins": 26.689569379899886, + "rewards/rejected": -18.011539806019176, + "step": 3015 + }, + { + "epoch": 0.7546603277868135, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43553898.666666664, + "logits/rejected": -40713237.333333336, + "logps/chosen": -401.7740071614583, + "logps/rejected": -637.66845703125, + "loss": 0.0173, + "rewards/chosen": 8.446009318033854, + "rewards/margins": 28.855069478352867, + "rewards/rejected": -20.40906016031901, + "step": 3016 + }, + { + "epoch": 0.7549105467283873, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46425720.0, + "logits/rejected": -42713692.0, + "logps/chosen": -377.6973571777344, + "logps/rejected": -679.115966796875, + "loss": 0.0034, + "rewards/chosen": 6.997097492218018, + "rewards/margins": 27.958326816558838, + "rewards/rejected": -20.96122932434082, + "step": 3017 + }, + { + "epoch": 0.7551607656699613, + "grad_norm": 12.125, + "kl": 10.50784683227539, + "learning_rate": 5e-06, + "logits/chosen": -42278912.0, + "logits/rejected": -57328137.14285714, + "logps/chosen": -383.8205997242647, + "logps/rejected": -636.2880161830357, + "loss": 0.0577, + "rewards/chosen": 8.499478508444394, + "rewards/margins": 28.35625515464975, + "rewards/rejected": -19.856776646205358, + "step": 3018 + }, + { + "epoch": 0.7554109846115351, + "grad_norm": 2.953125, + "kl": 7.741909027099609, + "learning_rate": 5e-06, + "logits/chosen": -43221589.333333336, + "logits/rejected": -54980288.0, + "logps/chosen": -424.30237630208336, + "logps/rejected": -761.2146809895834, + "loss": 0.0102, + "rewards/chosen": 10.029179890950521, + "rewards/margins": 28.80080329047309, + "rewards/rejected": -18.771623399522568, + "step": 3019 + }, + { + "epoch": 0.755661203553109, + "grad_norm": 2.4375, + "kl": 7.2004523277282715, + "learning_rate": 5e-06, + "logits/chosen": -55501452.8, + "logits/rejected": -50906510.222222224, + "logps/chosen": -310.1562825520833, + "logps/rejected": -520.9491644965278, + "loss": 0.0664, + "rewards/chosen": 7.485165913899739, + "rewards/margins": 22.12284257676866, + "rewards/rejected": -14.637676662868923, + "step": 3020 + }, + { + "epoch": 0.7559114224946828, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41625974.85714286, + "logits/rejected": -34001360.0, + "logps/chosen": -381.34116908482144, + "logps/rejected": -541.88271484375, + "loss": 0.0079, + "rewards/chosen": 7.873631068638393, + "rewards/margins": 25.508109828404017, + "rewards/rejected": -17.634478759765624, + "step": 3021 + }, + { + "epoch": 0.7561616414362567, + "grad_norm": 2.640625, + "kl": 8.345202445983887, + "learning_rate": 5e-06, + "logits/chosen": -49595253.333333336, + "logits/rejected": -25759706.666666668, + "logps/chosen": -371.2119954427083, + "logps/rejected": -448.7355143229167, + "loss": 0.0892, + "rewards/chosen": 7.154866536458333, + "rewards/margins": 18.299564361572266, + "rewards/rejected": -11.144697825113932, + "step": 3022 + }, + { + "epoch": 0.7564118603778306, + "grad_norm": 2.09375, + "kl": 2.554166316986084, + "learning_rate": 5e-06, + "logits/chosen": -47801490.28571428, + "logits/rejected": -37197817.6, + "logps/chosen": -378.62465122767856, + "logps/rejected": -571.64599609375, + "loss": 0.0061, + "rewards/chosen": 9.706957135881696, + "rewards/margins": 30.476961408342632, + "rewards/rejected": -20.770004272460938, + "step": 3023 + }, + { + "epoch": 0.7566620793194044, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1643535.3846153845, + "logits/rejected": -43807025.45454545, + "logps/chosen": -206.16231595552884, + "logps/rejected": -744.5750177556819, + "loss": 0.1165, + "rewards/chosen": 4.0923746549166164, + "rewards/margins": 23.066243285065767, + "rewards/rejected": -18.97386863014915, + "step": 3024 + }, + { + "epoch": 0.7569122982609784, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36291168.0, + "logits/rejected": -64379228.0, + "logps/chosen": -256.9881286621094, + "logps/rejected": -678.6770629882812, + "loss": 0.0831, + "rewards/chosen": 5.054719924926758, + "rewards/margins": 21.543819427490234, + "rewards/rejected": -16.489099502563477, + "step": 3025 + }, + { + "epoch": 0.7571625172025522, + "grad_norm": 1.4375, + "kl": 6.154253959655762, + "learning_rate": 5e-06, + "logits/chosen": -46648011.63636363, + "logits/rejected": -66373287.384615384, + "logps/chosen": -405.66432883522725, + "logps/rejected": -530.7264873798077, + "loss": 0.0212, + "rewards/chosen": 11.731084650213068, + "rewards/margins": 22.837119149161385, + "rewards/rejected": -11.106034498948317, + "step": 3026 + }, + { + "epoch": 0.7574127361441261, + "grad_norm": 26.5, + "kl": 11.78526496887207, + "learning_rate": 5e-06, + "logits/chosen": -18465640.615384616, + "logits/rejected": -54347217.45454545, + "logps/chosen": -448.1644756610577, + "logps/rejected": -620.7837357954545, + "loss": 0.0364, + "rewards/chosen": 10.522921048677885, + "rewards/margins": 20.262144182111832, + "rewards/rejected": -9.73922313343395, + "step": 3027 + }, + { + "epoch": 0.7576629550857, + "grad_norm": 3.171875, + "kl": 8.21436882019043, + "learning_rate": 5e-06, + "logits/chosen": -40990001.23076923, + "logits/rejected": -40971485.09090909, + "logps/chosen": -370.52798227163464, + "logps/rejected": -569.7867098721591, + "loss": 0.0316, + "rewards/chosen": 8.740723830003004, + "rewards/margins": 24.976121168870193, + "rewards/rejected": -16.235397338867188, + "step": 3028 + }, + { + "epoch": 0.7579131740272739, + "grad_norm": 1.28125, + "kl": 8.874314308166504, + "learning_rate": 5e-06, + "logits/chosen": -57312118.15384615, + "logits/rejected": -66919883.63636363, + "logps/chosen": -432.7990159254808, + "logps/rejected": -703.6788441051136, + "loss": 0.002, + "rewards/chosen": 10.246924767127403, + "rewards/margins": 27.889031256829107, + "rewards/rejected": -17.642106489701703, + "step": 3029 + }, + { + "epoch": 0.7581633929688477, + "grad_norm": 4.65625, + "kl": 12.677990913391113, + "learning_rate": 5e-06, + "logits/chosen": -45911792.0, + "logits/rejected": -55012660.0, + "logps/chosen": -386.392333984375, + "logps/rejected": -766.695556640625, + "loss": 0.0434, + "rewards/chosen": 9.33446216583252, + "rewards/margins": 26.673619270324707, + "rewards/rejected": -17.339157104492188, + "step": 3030 + }, + { + "epoch": 0.7584136119104217, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13912113.333333334, + "logits/rejected": -61974108.44444445, + "logps/chosen": -446.6190592447917, + "logps/rejected": -581.9754774305555, + "loss": 0.0162, + "rewards/chosen": 7.01948356628418, + "rewards/margins": 22.12471495734321, + "rewards/rejected": -15.105231391059029, + "step": 3031 + }, + { + "epoch": 0.7586638308519955, + "grad_norm": 3.953125, + "kl": 10.596351623535156, + "learning_rate": 5e-06, + "logits/chosen": -58056000.0, + "logits/rejected": -78050905.6, + "logps/chosen": -398.57847377232144, + "logps/rejected": -758.4318359375, + "loss": 0.0444, + "rewards/chosen": 8.558523450578962, + "rewards/margins": 28.943778882707868, + "rewards/rejected": -20.385255432128908, + "step": 3032 + }, + { + "epoch": 0.7589140497935694, + "grad_norm": 5.4375, + "kl": 16.013896942138672, + "learning_rate": 5e-06, + "logits/chosen": -29352867.76470588, + "logits/rejected": -40619245.71428572, + "logps/chosen": -407.60431985294116, + "logps/rejected": -683.6116768973214, + "loss": 0.0886, + "rewards/chosen": 10.113013772403493, + "rewards/margins": 28.51347197204077, + "rewards/rejected": -18.400458199637278, + "step": 3033 + }, + { + "epoch": 0.7591642687351432, + "grad_norm": 11.6875, + "kl": 5.124140739440918, + "learning_rate": 5e-06, + "logits/chosen": -42533597.538461536, + "logits/rejected": -18246498.90909091, + "logps/chosen": -338.30235877403845, + "logps/rejected": -369.61669921875, + "loss": 0.0875, + "rewards/chosen": 7.537200927734375, + "rewards/margins": 16.929589011452414, + "rewards/rejected": -9.39238808371804, + "step": 3034 + }, + { + "epoch": 0.7594144876767172, + "grad_norm": 3.03125, + "kl": 5.172099590301514, + "learning_rate": 5e-06, + "logits/chosen": -44596512.0, + "logits/rejected": -52321866.666666664, + "logps/chosen": -328.06646728515625, + "logps/rejected": -729.0531412760416, + "loss": 0.0416, + "rewards/chosen": 9.115330378214518, + "rewards/margins": 26.910765965779625, + "rewards/rejected": -17.795435587565105, + "step": 3035 + }, + { + "epoch": 0.759664706618291, + "grad_norm": 2.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47723562.666666664, + "logits/rejected": -42728584.53333333, + "logps/chosen": -382.64173719618054, + "logps/rejected": -655.56875, + "loss": 0.0042, + "rewards/chosen": 9.192427741156685, + "rewards/margins": 25.117186652289497, + "rewards/rejected": -15.924758911132812, + "step": 3036 + }, + { + "epoch": 0.7599149255598648, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 138294001.7777778, + "logits/rejected": -66285602.13333333, + "logps/chosen": -305.75792100694446, + "logps/rejected": -527.13994140625, + "loss": 0.0561, + "rewards/chosen": 6.09983656141493, + "rewards/margins": 17.994315931532118, + "rewards/rejected": -11.894479370117187, + "step": 3037 + }, + { + "epoch": 0.7601651445014388, + "grad_norm": 4.40625, + "kl": 5.025340557098389, + "learning_rate": 5e-06, + "logits/chosen": -37846953.14285714, + "logits/rejected": -33823040.0, + "logps/chosen": -394.14697265625, + "logps/rejected": -618.312646484375, + "loss": 0.0107, + "rewards/chosen": 10.457151140485491, + "rewards/margins": 24.93178492954799, + "rewards/rejected": -14.4746337890625, + "step": 3038 + }, + { + "epoch": 0.7604153634430126, + "grad_norm": 0.58984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30467680.0, + "logits/rejected": -32711118.769230768, + "logps/chosen": -445.6549183238636, + "logps/rejected": -558.3936298076923, + "loss": 0.0031, + "rewards/chosen": 11.27105712890625, + "rewards/margins": 24.782386192908653, + "rewards/rejected": -13.511329064002403, + "step": 3039 + }, + { + "epoch": 0.7606655823845865, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42281235.2, + "logits/rejected": -43926523.428571425, + "logps/chosen": -447.6833984375, + "logps/rejected": -599.5888671875, + "loss": 0.0021, + "rewards/chosen": 9.945236206054688, + "rewards/margins": 25.887191772460938, + "rewards/rejected": -15.94195556640625, + "step": 3040 + }, + { + "epoch": 0.7609158013261604, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24406550.153846152, + "logits/rejected": -13871012.363636363, + "logps/chosen": -311.2875225360577, + "logps/rejected": -659.4736328125, + "loss": 0.037, + "rewards/chosen": 7.30583249605619, + "rewards/margins": 21.446686377892128, + "rewards/rejected": -14.140853881835938, + "step": 3041 + }, + { + "epoch": 0.7611660202677343, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49932899.55555555, + "logits/rejected": -56336179.2, + "logps/chosen": -355.6164279513889, + "logps/rejected": -730.251171875, + "loss": 0.0297, + "rewards/chosen": 9.774457295735678, + "rewards/margins": 26.44930674235026, + "rewards/rejected": -16.674849446614584, + "step": 3042 + }, + { + "epoch": 0.7614162392093081, + "grad_norm": 31.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56807040.0, + "logits/rejected": 25440816.0, + "logps/chosen": -326.1539306640625, + "logps/rejected": -484.55615234375, + "loss": 0.064, + "rewards/chosen": 9.176647186279297, + "rewards/margins": 20.138504573277064, + "rewards/rejected": -10.961857386997767, + "step": 3043 + }, + { + "epoch": 0.7616664581508821, + "grad_norm": 16.75, + "kl": 9.875935554504395, + "learning_rate": 5e-06, + "logits/chosen": -19702426.0, + "logits/rejected": -42361112.0, + "logps/chosen": -368.391845703125, + "logps/rejected": -434.53009033203125, + "loss": 0.0393, + "rewards/chosen": 9.276948928833008, + "rewards/margins": 21.175695419311523, + "rewards/rejected": -11.898746490478516, + "step": 3044 + }, + { + "epoch": 0.7619166770924559, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26994219.42857143, + "logits/rejected": -37305974.4, + "logps/chosen": -236.5732421875, + "logps/rejected": -551.793017578125, + "loss": 0.1203, + "rewards/chosen": 5.238282884870257, + "rewards/margins": 24.706700243268692, + "rewards/rejected": -19.468417358398437, + "step": 3045 + }, + { + "epoch": 0.7621668960340298, + "grad_norm": 2.453125, + "kl": 8.878522872924805, + "learning_rate": 5e-06, + "logits/chosen": -19218248.888888888, + "logits/rejected": -52900645.333333336, + "logps/chosen": -335.7138943142361, + "logps/rejected": -598.9007568359375, + "loss": 0.0771, + "rewards/chosen": 8.101626925998264, + "rewards/margins": 20.776271396213108, + "rewards/rejected": -12.674644470214844, + "step": 3046 + }, + { + "epoch": 0.7624171149756036, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36713486.54545455, + "logits/rejected": -33648337.23076923, + "logps/chosen": -337.3917125355114, + "logps/rejected": -675.029296875, + "loss": 0.0113, + "rewards/chosen": 7.587115201083097, + "rewards/margins": 25.822898704688868, + "rewards/rejected": -18.23578350360577, + "step": 3047 + }, + { + "epoch": 0.7626673339171776, + "grad_norm": 1.0078125, + "kl": 8.387125968933105, + "learning_rate": 5e-06, + "logits/chosen": -64260174.76923077, + "logits/rejected": -55776744.72727273, + "logps/chosen": -521.5092022235577, + "logps/rejected": -526.8179598721591, + "loss": 0.013, + "rewards/chosen": 9.756273709810698, + "rewards/margins": 24.27290301556354, + "rewards/rejected": -14.516629305752842, + "step": 3048 + }, + { + "epoch": 0.7629175528587514, + "grad_norm": 2.859375, + "kl": 0.2631978988647461, + "learning_rate": 5e-06, + "logits/chosen": -61607914.666666664, + "logits/rejected": -41820837.333333336, + "logps/chosen": -395.9764811197917, + "logps/rejected": -643.2744140625, + "loss": 0.0137, + "rewards/chosen": 8.586085637410482, + "rewards/margins": 25.19300397237142, + "rewards/rejected": -16.606918334960938, + "step": 3049 + }, + { + "epoch": 0.7631677718003252, + "grad_norm": 8.3125, + "kl": 7.226790428161621, + "learning_rate": 5e-06, + "logits/chosen": -32808349.714285713, + "logits/rejected": -23309464.0, + "logps/chosen": -471.83192661830356, + "logps/rejected": -536.179931640625, + "loss": 0.0541, + "rewards/chosen": 9.304013933454241, + "rewards/margins": 24.11996852329799, + "rewards/rejected": -14.81595458984375, + "step": 3050 + }, + { + "epoch": 0.7634179907418992, + "grad_norm": 9.125, + "kl": 2.3386073112487793, + "learning_rate": 5e-06, + "logits/chosen": -48919982.222222224, + "logits/rejected": -55310980.266666666, + "logps/chosen": -358.27210828993054, + "logps/rejected": -517.2578776041667, + "loss": 0.0248, + "rewards/chosen": 7.991654290093316, + "rewards/margins": 25.360184563530815, + "rewards/rejected": -17.3685302734375, + "step": 3051 + }, + { + "epoch": 0.763668209683473, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62282101.333333336, + "logits/rejected": -61297914.666666664, + "logps/chosen": -302.46498616536456, + "logps/rejected": -788.9519856770834, + "loss": 0.0314, + "rewards/chosen": 7.059399922688802, + "rewards/margins": 31.05707295735677, + "rewards/rejected": -23.99767303466797, + "step": 3052 + }, + { + "epoch": 0.7639184286250469, + "grad_norm": 0.83203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26611738.181818184, + "logits/rejected": -82299731.6923077, + "logps/chosen": -367.4422496448864, + "logps/rejected": -626.7731745793269, + "loss": 0.0172, + "rewards/chosen": 9.286956787109375, + "rewards/margins": 26.46184363731971, + "rewards/rejected": -17.174886850210335, + "step": 3053 + }, + { + "epoch": 0.7641686475666208, + "grad_norm": 7.28125, + "kl": 5.8607611656188965, + "learning_rate": 5e-06, + "logits/chosen": -54126361.6, + "logits/rejected": -40858716.44444445, + "logps/chosen": -369.61435546875, + "logps/rejected": -678.5112847222222, + "loss": 0.0375, + "rewards/chosen": 9.48133544921875, + "rewards/margins": 32.53721483018663, + "rewards/rejected": -23.05587938096788, + "step": 3054 + }, + { + "epoch": 0.7644188665081947, + "grad_norm": 5.5625, + "kl": 9.623043060302734, + "learning_rate": 5e-06, + "logits/chosen": -21914794.666666668, + "logits/rejected": -39560439.46666667, + "logps/chosen": -467.1070963541667, + "logps/rejected": -564.4861328125, + "loss": 0.0068, + "rewards/chosen": 9.64678700764974, + "rewards/margins": 25.613659159342447, + "rewards/rejected": -15.966872151692709, + "step": 3055 + }, + { + "epoch": 0.7646690854497685, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49372183.27272727, + "logits/rejected": -38091492.92307692, + "logps/chosen": -408.87721946022725, + "logps/rejected": -671.3844651442307, + "loss": 0.0656, + "rewards/chosen": 7.8071511008522725, + "rewards/margins": 25.322459187540975, + "rewards/rejected": -17.5153080866887, + "step": 3056 + }, + { + "epoch": 0.7649193043913424, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66731180.307692304, + "logits/rejected": -75456011.63636364, + "logps/chosen": -495.62222055288464, + "logps/rejected": -721.4599609375, + "loss": 0.0055, + "rewards/chosen": 11.044830322265625, + "rewards/margins": 32.5546181418679, + "rewards/rejected": -21.509787819602273, + "step": 3057 + }, + { + "epoch": 0.7651695233329163, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32681523.2, + "logits/rejected": -29462532.57142857, + "logps/chosen": -362.75908203125, + "logps/rejected": -625.5394810267857, + "loss": 0.0482, + "rewards/chosen": 7.629109191894531, + "rewards/margins": 24.026051330566407, + "rewards/rejected": -16.396942138671875, + "step": 3058 + }, + { + "epoch": 0.7654197422744902, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50180228.571428575, + "logits/rejected": 1073312.8, + "logps/chosen": -286.12744140625, + "logps/rejected": -738.143603515625, + "loss": 0.0341, + "rewards/chosen": 7.529172624860491, + "rewards/margins": 27.828955950055807, + "rewards/rejected": -20.299783325195314, + "step": 3059 + }, + { + "epoch": 0.765669961216064, + "grad_norm": 6.65625, + "kl": 23.394771575927734, + "learning_rate": 5e-06, + "logits/chosen": -10551692.57142857, + "logits/rejected": 11470419.2, + "logps/chosen": -500.5634765625, + "logps/rejected": -804.8654296875, + "loss": 0.0498, + "rewards/chosen": 11.067224775041852, + "rewards/margins": 35.635501752580915, + "rewards/rejected": -24.568276977539064, + "step": 3060 + }, + { + "epoch": 0.765920180157638, + "grad_norm": 6.375, + "kl": 0.1891765594482422, + "learning_rate": 5e-06, + "logits/chosen": -28096485.333333332, + "logits/rejected": -61432170.666666664, + "logps/chosen": -229.6199747721354, + "logps/rejected": -589.7267252604166, + "loss": 0.0444, + "rewards/chosen": 5.316739400227864, + "rewards/margins": 19.791669209798176, + "rewards/rejected": -14.474929809570312, + "step": 3061 + }, + { + "epoch": 0.7661703990992118, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34077216.0, + "logits/rejected": -21878425.14285714, + "logps/chosen": -262.027392578125, + "logps/rejected": -810.7154715401786, + "loss": 0.0842, + "rewards/chosen": 4.591363906860352, + "rewards/margins": 30.45322390965053, + "rewards/rejected": -25.861860002790177, + "step": 3062 + }, + { + "epoch": 0.7664206180407857, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28481168.0, + "logits/rejected": -64276522.666666664, + "logps/chosen": -275.35487874348956, + "logps/rejected": -526.0913899739584, + "loss": 0.0444, + "rewards/chosen": 5.518931706746419, + "rewards/margins": 21.4695618947347, + "rewards/rejected": -15.950630187988281, + "step": 3063 + }, + { + "epoch": 0.7666708369823596, + "grad_norm": 10.125, + "kl": 17.648319244384766, + "learning_rate": 5e-06, + "logits/chosen": -65821134.76923077, + "logits/rejected": -45717364.36363637, + "logps/chosen": -479.2180739182692, + "logps/rejected": -574.78955078125, + "loss": 0.052, + "rewards/chosen": 10.21160419170673, + "rewards/margins": 23.36019011810943, + "rewards/rejected": -13.1485859264027, + "step": 3064 + }, + { + "epoch": 0.7669210559239334, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52856357.333333336, + "logits/rejected": -21393248.0, + "logps/chosen": -266.5519612630208, + "logps/rejected": -430.0556233723958, + "loss": 0.036, + "rewards/chosen": 8.488605499267578, + "rewards/margins": 21.123452504475914, + "rewards/rejected": -12.634847005208334, + "step": 3065 + }, + { + "epoch": 0.7671712748655073, + "grad_norm": 3.765625, + "kl": 9.52427864074707, + "learning_rate": 5e-06, + "logits/chosen": -3910475.4285714286, + "logits/rejected": -94303776.0, + "logps/chosen": -313.5557338169643, + "logps/rejected": -828.91220703125, + "loss": 0.1083, + "rewards/chosen": 6.9978822980608255, + "rewards/margins": 27.82281210763114, + "rewards/rejected": -20.824929809570314, + "step": 3066 + }, + { + "epoch": 0.7674214938070812, + "grad_norm": 4.4375, + "kl": 1.3731441497802734, + "learning_rate": 5e-06, + "logits/chosen": -53589184.0, + "logits/rejected": -40116995.2, + "logps/chosen": -427.18941824776783, + "logps/rejected": -529.79384765625, + "loss": 0.0218, + "rewards/chosen": 9.828845432826451, + "rewards/margins": 28.84438956124442, + "rewards/rejected": -19.015544128417968, + "step": 3067 + }, + { + "epoch": 0.7676717127486551, + "grad_norm": 2.890625, + "kl": 6.76815128326416, + "learning_rate": 5e-06, + "logits/chosen": -44723237.64705882, + "logits/rejected": -39660292.571428575, + "logps/chosen": -354.8489774816176, + "logps/rejected": -585.0604073660714, + "loss": 0.0156, + "rewards/chosen": 8.474680283490349, + "rewards/margins": 25.959602420069587, + "rewards/rejected": -17.48492213657924, + "step": 3068 + }, + { + "epoch": 0.7679219316902289, + "grad_norm": 14.5, + "kl": 24.758806228637695, + "learning_rate": 5e-06, + "logits/chosen": -71399323.42857143, + "logits/rejected": -38435932.8, + "logps/chosen": -440.54080636160717, + "logps/rejected": -705.109130859375, + "loss": 0.0961, + "rewards/chosen": 11.009863717215401, + "rewards/margins": 29.299367196219308, + "rewards/rejected": -18.289503479003905, + "step": 3069 + }, + { + "epoch": 0.7681721506318028, + "grad_norm": 15.375, + "kl": 4.885580062866211, + "learning_rate": 5e-06, + "logits/chosen": -64806386.28571428, + "logits/rejected": -69796761.6, + "logps/chosen": -375.8914271763393, + "logps/rejected": -688.26025390625, + "loss": 0.0533, + "rewards/chosen": 7.988880702427456, + "rewards/margins": 23.265810939243863, + "rewards/rejected": -15.276930236816407, + "step": 3070 + }, + { + "epoch": 0.7684223695733767, + "grad_norm": 8.4375, + "kl": 9.01988697052002, + "learning_rate": 5e-06, + "logits/chosen": -68952805.33333333, + "logits/rejected": -42101514.666666664, + "logps/chosen": -387.4458414713542, + "logps/rejected": -446.4722493489583, + "loss": 0.049, + "rewards/chosen": 8.355372746785482, + "rewards/margins": 23.46181297302246, + "rewards/rejected": -15.106440226236979, + "step": 3071 + }, + { + "epoch": 0.7686725885149506, + "grad_norm": 5.09375, + "kl": 0.5832252502441406, + "learning_rate": 5e-06, + "logits/chosen": -32751240.0, + "logits/rejected": -54761664.0, + "logps/chosen": -371.4450276692708, + "logps/rejected": -670.0087076822916, + "loss": 0.0259, + "rewards/chosen": 9.327786127726236, + "rewards/margins": 24.54293886820475, + "rewards/rejected": -15.215152740478516, + "step": 3072 + }, + { + "epoch": 0.7689228074565244, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56793488.0, + "logits/rejected": -22600864.0, + "logps/chosen": -376.9311930338542, + "logps/rejected": -867.9537760416666, + "loss": 0.0214, + "rewards/chosen": 9.333326975504557, + "rewards/margins": 31.87760798136393, + "rewards/rejected": -22.544281005859375, + "step": 3073 + }, + { + "epoch": 0.7691730263980984, + "grad_norm": 3.078125, + "kl": 5.481792449951172, + "learning_rate": 5e-06, + "logits/chosen": -36319121.06666667, + "logits/rejected": -46409752.88888889, + "logps/chosen": -295.51474609375, + "logps/rejected": -734.0636393229166, + "loss": 0.0634, + "rewards/chosen": 6.762885538736979, + "rewards/margins": 26.27461107042101, + "rewards/rejected": -19.51172553168403, + "step": 3074 + }, + { + "epoch": 0.7694232453396722, + "grad_norm": 8.3125, + "kl": 5.888150691986084, + "learning_rate": 5e-06, + "logits/chosen": -27564006.4, + "logits/rejected": -78485902.22222222, + "logps/chosen": -401.3704427083333, + "logps/rejected": -651.6440972222222, + "loss": 0.0392, + "rewards/chosen": 8.369480387369792, + "rewards/margins": 24.624107191297746, + "rewards/rejected": -16.254626803927952, + "step": 3075 + }, + { + "epoch": 0.7696734642812461, + "grad_norm": 8.75, + "kl": 8.82068920135498, + "learning_rate": 5e-06, + "logits/chosen": -33014621.714285713, + "logits/rejected": -69246464.0, + "logps/chosen": -337.24393136160717, + "logps/rejected": -803.382373046875, + "loss": 0.0436, + "rewards/chosen": 7.301273345947266, + "rewards/margins": 27.211426544189454, + "rewards/rejected": -19.910153198242188, + "step": 3076 + }, + { + "epoch": 0.76992368322282, + "grad_norm": 1.6796875, + "kl": 1.0160974264144897, + "learning_rate": 5e-06, + "logits/chosen": -15504145.23076923, + "logits/rejected": -43200189.09090909, + "logps/chosen": -188.97115384615384, + "logps/rejected": -655.0973011363636, + "loss": 0.0711, + "rewards/chosen": 6.333577669583834, + "rewards/margins": 20.95682429600429, + "rewards/rejected": -14.623246626420455, + "step": 3077 + }, + { + "epoch": 0.7701739021643939, + "grad_norm": 6.875, + "kl": 9.873197555541992, + "learning_rate": 5e-06, + "logits/chosen": -86418609.23076923, + "logits/rejected": -37327906.90909091, + "logps/chosen": -383.6145582932692, + "logps/rejected": -461.5814098011364, + "loss": 0.0889, + "rewards/chosen": 8.161186805138222, + "rewards/margins": 20.167687049278847, + "rewards/rejected": -12.006500244140625, + "step": 3078 + }, + { + "epoch": 0.7704241211059677, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46142144.0, + "logits/rejected": -50391689.84615385, + "logps/chosen": -335.21182528409093, + "logps/rejected": -615.4658578725962, + "loss": 0.0272, + "rewards/chosen": 8.337737343528055, + "rewards/margins": 24.37740667383154, + "rewards/rejected": -16.039669330303486, + "step": 3079 + }, + { + "epoch": 0.7706743400475417, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57657137.23076923, + "logits/rejected": -25353588.363636363, + "logps/chosen": -360.4960186298077, + "logps/rejected": -401.8840997869318, + "loss": 0.0623, + "rewards/chosen": 8.738087580754208, + "rewards/margins": 20.63450675911003, + "rewards/rejected": -11.896419178355824, + "step": 3080 + }, + { + "epoch": 0.7709245589891155, + "grad_norm": 1.4140625, + "kl": 0.7843869924545288, + "learning_rate": 5e-06, + "logits/chosen": -45583477.333333336, + "logits/rejected": -48008938.666666664, + "logps/chosen": -278.1892903645833, + "logps/rejected": -621.3359781901041, + "loss": 0.0134, + "rewards/chosen": 8.486738840738932, + "rewards/margins": 23.02573013305664, + "rewards/rejected": -14.538991292317709, + "step": 3081 + }, + { + "epoch": 0.7711747779306893, + "grad_norm": 5.875, + "kl": 8.27616024017334, + "learning_rate": 5e-06, + "logits/chosen": -59423910.4, + "logits/rejected": -33561108.571428575, + "logps/chosen": -470.000537109375, + "logps/rejected": -549.7661481584821, + "loss": 0.0059, + "rewards/chosen": 11.532557678222656, + "rewards/margins": 23.156912449428013, + "rewards/rejected": -11.624354771205358, + "step": 3082 + }, + { + "epoch": 0.7714249968722632, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34163730.666666664, + "logits/rejected": -29745805.333333332, + "logps/chosen": -241.61580403645834, + "logps/rejected": -576.1265462239584, + "loss": 0.0202, + "rewards/chosen": 7.229747772216797, + "rewards/margins": 20.808292388916016, + "rewards/rejected": -13.578544616699219, + "step": 3083 + }, + { + "epoch": 0.7716752158138371, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34087445.333333336, + "logits/rejected": -38345971.2, + "logps/chosen": -309.2471516927083, + "logps/rejected": -577.5503255208333, + "loss": 0.0487, + "rewards/chosen": 6.995212978786892, + "rewards/margins": 20.664888678656684, + "rewards/rejected": -13.669675699869792, + "step": 3084 + }, + { + "epoch": 0.771925434755411, + "grad_norm": 16.875, + "kl": 14.930333137512207, + "learning_rate": 5e-06, + "logits/chosen": -49902144.0, + "logits/rejected": -78610773.33333333, + "logps/chosen": -446.7172526041667, + "logps/rejected": -538.17919921875, + "loss": 0.0841, + "rewards/chosen": 9.231494140625, + "rewards/margins": 21.77554711235894, + "rewards/rejected": -12.54405297173394, + "step": 3085 + }, + { + "epoch": 0.7721756536969848, + "grad_norm": 3.5625, + "kl": 6.327107906341553, + "learning_rate": 5e-06, + "logits/chosen": -63635909.81818182, + "logits/rejected": -16798806.153846152, + "logps/chosen": -415.92569247159093, + "logps/rejected": -452.12939453125, + "loss": 0.0051, + "rewards/chosen": 9.078102111816406, + "rewards/margins": 20.25029050386869, + "rewards/rejected": -11.172188392052284, + "step": 3086 + }, + { + "epoch": 0.7724258726385588, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23788008.0, + "logits/rejected": -46431353.2631579, + "logps/chosen": -276.1698974609375, + "logps/rejected": -551.7350945723684, + "loss": 0.0104, + "rewards/chosen": 8.102255249023438, + "rewards/margins": 21.810096017937912, + "rewards/rejected": -13.707840768914474, + "step": 3087 + }, + { + "epoch": 0.7726760915801326, + "grad_norm": 10.125, + "kl": 9.914198875427246, + "learning_rate": 5e-06, + "logits/chosen": -9470930.0, + "logits/rejected": -34718650.666666664, + "logps/chosen": -283.0377604166667, + "logps/rejected": -587.2843831380209, + "loss": 0.0341, + "rewards/chosen": 7.970082600911458, + "rewards/margins": 18.455219904581707, + "rewards/rejected": -10.485137303670248, + "step": 3088 + }, + { + "epoch": 0.7729263105217065, + "grad_norm": 13.625, + "kl": 13.809676170349121, + "learning_rate": 5e-06, + "logits/chosen": -41840140.0, + "logits/rejected": -41442824.0, + "logps/chosen": -381.11895751953125, + "logps/rejected": -629.78125, + "loss": 0.0475, + "rewards/chosen": 9.175348281860352, + "rewards/margins": 26.86263084411621, + "rewards/rejected": -17.68728256225586, + "step": 3089 + }, + { + "epoch": 0.7731765294632804, + "grad_norm": 7.5, + "kl": 13.262168884277344, + "learning_rate": 5e-06, + "logits/chosen": -50675222.85714286, + "logits/rejected": -39873110.4, + "logps/chosen": -385.2784946986607, + "logps/rejected": -613.212451171875, + "loss": 0.0333, + "rewards/chosen": 8.78684561593192, + "rewards/margins": 20.94713156563895, + "rewards/rejected": -12.160285949707031, + "step": 3090 + }, + { + "epoch": 0.7734267484048543, + "grad_norm": 12.125, + "kl": 5.643848419189453, + "learning_rate": 5e-06, + "logits/chosen": -60358961.777777776, + "logits/rejected": -7662541.866666666, + "logps/chosen": -468.90614149305554, + "logps/rejected": -510.54713541666666, + "loss": 0.0288, + "rewards/chosen": 12.079994201660156, + "rewards/margins": 23.5205073038737, + "rewards/rejected": -11.440513102213542, + "step": 3091 + }, + { + "epoch": 0.7736769673464281, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9278665.6, + "logits/rejected": -39063861.333333336, + "logps/chosen": -370.7568033854167, + "logps/rejected": -783.9172092013889, + "loss": 0.0234, + "rewards/chosen": 8.525145975748698, + "rewards/margins": 26.53505401611328, + "rewards/rejected": -18.009908040364582, + "step": 3092 + }, + { + "epoch": 0.7739271862880021, + "grad_norm": 3.75, + "kl": 11.267451286315918, + "learning_rate": 5e-06, + "logits/chosen": -44673128.0, + "logits/rejected": -48458848.0, + "logps/chosen": -424.0133463541667, + "logps/rejected": -521.7499593098959, + "loss": 0.0489, + "rewards/chosen": 9.116305033365885, + "rewards/margins": 26.136512756347656, + "rewards/rejected": -17.02020772298177, + "step": 3093 + }, + { + "epoch": 0.7741774052295759, + "grad_norm": 11.625, + "kl": 17.3372859954834, + "learning_rate": 5e-06, + "logits/chosen": -39754872.0, + "logits/rejected": -42002376.0, + "logps/chosen": -395.74884033203125, + "logps/rejected": -574.26220703125, + "loss": 0.1015, + "rewards/chosen": 9.162534713745117, + "rewards/margins": 23.3634090423584, + "rewards/rejected": -14.200874328613281, + "step": 3094 + }, + { + "epoch": 0.7744276241711497, + "grad_norm": 12.9375, + "kl": 11.118032455444336, + "learning_rate": 5e-06, + "logits/chosen": -35086284.8, + "logits/rejected": -54647250.28571428, + "logps/chosen": -300.6271728515625, + "logps/rejected": -690.5827287946429, + "loss": 0.0768, + "rewards/chosen": 9.241445159912109, + "rewards/margins": 25.625485992431642, + "rewards/rejected": -16.38404083251953, + "step": 3095 + }, + { + "epoch": 0.7746778431127236, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52246890.666666664, + "logits/rejected": -64625978.666666664, + "logps/chosen": -482.2469482421875, + "logps/rejected": -643.9549153645834, + "loss": 0.0216, + "rewards/chosen": 10.69256846110026, + "rewards/margins": 32.527732849121094, + "rewards/rejected": -21.835164388020832, + "step": 3096 + }, + { + "epoch": 0.7749280620542975, + "grad_norm": 25.125, + "kl": 7.89694881439209, + "learning_rate": 5e-06, + "logits/chosen": -45727389.09090909, + "logits/rejected": -63346141.538461536, + "logps/chosen": -383.74636008522725, + "logps/rejected": -608.9039963942307, + "loss": 0.0616, + "rewards/chosen": 8.890484896573154, + "rewards/margins": 22.43183856243854, + "rewards/rejected": -13.541353665865385, + "step": 3097 + }, + { + "epoch": 0.7751782809958714, + "grad_norm": 10.9375, + "kl": 11.805471420288086, + "learning_rate": 5e-06, + "logits/chosen": -27859584.0, + "logits/rejected": -22996124.8, + "logps/chosen": -384.5111607142857, + "logps/rejected": -631.20576171875, + "loss": 0.049, + "rewards/chosen": 9.24264417375837, + "rewards/margins": 25.303004891531806, + "rewards/rejected": -16.060360717773438, + "step": 3098 + }, + { + "epoch": 0.7754284999374452, + "grad_norm": 4.40625, + "kl": 8.93884563446045, + "learning_rate": 5e-06, + "logits/chosen": -27946980.266666666, + "logits/rejected": -75413781.33333333, + "logps/chosen": -396.44397786458336, + "logps/rejected": -336.07823350694446, + "loss": 0.0687, + "rewards/chosen": 9.180886840820312, + "rewards/margins": 21.545633782280817, + "rewards/rejected": -12.364746941460503, + "step": 3099 + }, + { + "epoch": 0.7756787188790192, + "grad_norm": 7.1875, + "kl": 4.656963348388672, + "learning_rate": 5e-06, + "logits/chosen": -75081130.66666667, + "logits/rejected": 52358668.8, + "logps/chosen": -484.15386284722223, + "logps/rejected": -548.3673177083333, + "loss": 0.0147, + "rewards/chosen": 11.95937008327908, + "rewards/margins": 28.105831061469182, + "rewards/rejected": -16.146460978190103, + "step": 3100 + }, + { + "epoch": 0.775928937820593, + "grad_norm": 7.53125, + "kl": 3.4970359802246094, + "learning_rate": 5e-06, + "logits/chosen": -32969216.0, + "logits/rejected": -35889627.07692308, + "logps/chosen": -270.04201438210225, + "logps/rejected": -518.2101862980769, + "loss": 0.0497, + "rewards/chosen": 7.812269731001421, + "rewards/margins": 26.015748030655867, + "rewards/rejected": -18.203478299654446, + "step": 3101 + }, + { + "epoch": 0.7761791567621669, + "grad_norm": 4.4375, + "kl": 2.7799766063690186, + "learning_rate": 5e-06, + "logits/chosen": -90191795.2, + "logits/rejected": -51659190.85714286, + "logps/chosen": -384.98662109375, + "logps/rejected": -820.3825334821429, + "loss": 0.0065, + "rewards/chosen": 9.093458557128907, + "rewards/margins": 31.868055725097655, + "rewards/rejected": -22.77459716796875, + "step": 3102 + }, + { + "epoch": 0.7764293757037408, + "grad_norm": 11.0625, + "kl": 0.007950624451041222, + "learning_rate": 5e-06, + "logits/chosen": -77214375.38461539, + "logits/rejected": -19624429.09090909, + "logps/chosen": -492.4860276442308, + "logps/rejected": -658.0683149857955, + "loss": 0.0283, + "rewards/chosen": 12.044015737680288, + "rewards/margins": 32.13958708222929, + "rewards/rejected": -20.095571344549004, + "step": 3103 + }, + { + "epoch": 0.7766795946453147, + "grad_norm": 13.8125, + "kl": 3.122584819793701, + "learning_rate": 5e-06, + "logits/chosen": -42618348.307692304, + "logits/rejected": -90994926.54545455, + "logps/chosen": -366.75732421875, + "logps/rejected": -955.8269708806819, + "loss": 0.0521, + "rewards/chosen": 8.889064495380108, + "rewards/margins": 30.71318336966988, + "rewards/rejected": -21.824118874289773, + "step": 3104 + }, + { + "epoch": 0.7769298135868885, + "grad_norm": 11.5, + "kl": 0.9228464961051941, + "learning_rate": 5e-06, + "logits/chosen": -65750243.55555555, + "logits/rejected": -35108989.86666667, + "logps/chosen": -456.89198133680554, + "logps/rejected": -598.6826171875, + "loss": 0.0494, + "rewards/chosen": 9.414031982421875, + "rewards/margins": 25.554949951171874, + "rewards/rejected": -16.14091796875, + "step": 3105 + }, + { + "epoch": 0.7771800325284624, + "grad_norm": 7.9375, + "kl": 8.075639724731445, + "learning_rate": 5e-06, + "logits/chosen": -30144912.0, + "logits/rejected": -3574462.6666666665, + "logps/chosen": -381.9097900390625, + "logps/rejected": -754.1100260416666, + "loss": 0.0982, + "rewards/chosen": 9.778003692626953, + "rewards/margins": 29.296606699625652, + "rewards/rejected": -19.5186030069987, + "step": 3106 + }, + { + "epoch": 0.7774302514700363, + "grad_norm": 1.546875, + "kl": 4.305239677429199, + "learning_rate": 5e-06, + "logits/chosen": -37979470.54545455, + "logits/rejected": -44418043.07692308, + "logps/chosen": -338.29097123579544, + "logps/rejected": -758.5072115384615, + "loss": 0.0276, + "rewards/chosen": 8.82777266068892, + "rewards/margins": 32.933959747527865, + "rewards/rejected": -24.106187086838943, + "step": 3107 + }, + { + "epoch": 0.7776804704116101, + "grad_norm": 3.078125, + "kl": 3.9497880935668945, + "learning_rate": 5e-06, + "logits/chosen": -31301408.0, + "logits/rejected": -78377792.0, + "logps/chosen": -315.6142252604167, + "logps/rejected": -464.1765407986111, + "loss": 0.0525, + "rewards/chosen": 7.751452128092448, + "rewards/margins": 23.058428446451824, + "rewards/rejected": -15.306976318359375, + "step": 3108 + }, + { + "epoch": 0.777930689353184, + "grad_norm": 5.4375, + "kl": 0.13927333056926727, + "learning_rate": 5e-06, + "logits/chosen": -54686027.63636363, + "logits/rejected": -39946604.307692304, + "logps/chosen": -442.53901811079544, + "logps/rejected": -539.1669546274038, + "loss": 0.0116, + "rewards/chosen": 9.94445107199929, + "rewards/margins": 29.21253273703835, + "rewards/rejected": -19.268081665039062, + "step": 3109 + }, + { + "epoch": 0.778180908294758, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75545294.76923077, + "logits/rejected": -55890967.27272727, + "logps/chosen": -446.8544170673077, + "logps/rejected": -774.443359375, + "loss": 0.0488, + "rewards/chosen": 7.956207275390625, + "rewards/margins": 29.564389315518465, + "rewards/rejected": -21.60818204012784, + "step": 3110 + }, + { + "epoch": 0.7784311272363318, + "grad_norm": 11.0, + "kl": 1.6312549114227295, + "learning_rate": 5e-06, + "logits/chosen": -44586120.53333333, + "logits/rejected": -88642609.77777778, + "logps/chosen": -297.5338541666667, + "logps/rejected": -436.4309895833333, + "loss": 0.0514, + "rewards/chosen": 5.938922627766927, + "rewards/margins": 18.871905178493925, + "rewards/rejected": -12.932982550726997, + "step": 3111 + }, + { + "epoch": 0.7786813461779056, + "grad_norm": 2.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50263400.0, + "logits/rejected": -59647528.0, + "logps/chosen": -308.05780029296875, + "logps/rejected": -699.5264282226562, + "loss": 0.0574, + "rewards/chosen": 6.598553657531738, + "rewards/margins": 29.065373420715332, + "rewards/rejected": -22.466819763183594, + "step": 3112 + }, + { + "epoch": 0.7789315651194796, + "grad_norm": 12.75, + "kl": 10.12739372253418, + "learning_rate": 5e-06, + "logits/chosen": -37394070.85714286, + "logits/rejected": -58530144.0, + "logps/chosen": -422.79317801339283, + "logps/rejected": -633.23759765625, + "loss": 0.073, + "rewards/chosen": 9.65329088483538, + "rewards/margins": 31.724039786202567, + "rewards/rejected": -22.07074890136719, + "step": 3113 + }, + { + "epoch": 0.7791817840610534, + "grad_norm": 8.1875, + "kl": 2.6220130920410156, + "learning_rate": 5e-06, + "logits/chosen": -83382976.0, + "logits/rejected": -55216288.0, + "logps/chosen": -518.0105794270834, + "logps/rejected": -643.44091796875, + "loss": 0.0144, + "rewards/chosen": 9.889227549235025, + "rewards/margins": 31.907403310139976, + "rewards/rejected": -22.01817576090495, + "step": 3114 + }, + { + "epoch": 0.7794320030026273, + "grad_norm": 2.6875, + "kl": 0.8821039199829102, + "learning_rate": 5e-06, + "logits/chosen": -59582747.428571425, + "logits/rejected": -57116378.35294118, + "logps/chosen": -532.8210797991071, + "logps/rejected": -786.5436006433823, + "loss": 0.0029, + "rewards/chosen": 10.168629237583705, + "rewards/margins": 37.42045131651293, + "rewards/rejected": -27.251822078929226, + "step": 3115 + }, + { + "epoch": 0.7796822219442012, + "grad_norm": 1.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50331685.333333336, + "logits/rejected": -48506197.333333336, + "logps/chosen": -348.7652994791667, + "logps/rejected": -460.1316731770833, + "loss": 0.017, + "rewards/chosen": 8.298028310139975, + "rewards/margins": 22.982758839925133, + "rewards/rejected": -14.684730529785156, + "step": 3116 + }, + { + "epoch": 0.7799324408857751, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22882520.888888888, + "logits/rejected": -52187848.53333333, + "logps/chosen": -249.39287651909723, + "logps/rejected": -643.231640625, + "loss": 0.0311, + "rewards/chosen": 6.587571038140191, + "rewards/margins": 26.55279320610894, + "rewards/rejected": -19.96522216796875, + "step": 3117 + }, + { + "epoch": 0.7801826598273489, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40380794.666666664, + "logits/rejected": -24098288.0, + "logps/chosen": -371.811767578125, + "logps/rejected": -525.6431477864584, + "loss": 0.0314, + "rewards/chosen": 6.868104934692383, + "rewards/margins": 21.64151827494303, + "rewards/rejected": -14.77341334025065, + "step": 3118 + }, + { + "epoch": 0.7804328787689228, + "grad_norm": 2.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66292410.18181818, + "logits/rejected": -69150700.3076923, + "logps/chosen": -462.85697798295456, + "logps/rejected": -893.6654146634615, + "loss": 0.0178, + "rewards/chosen": 7.685485146262429, + "rewards/margins": 37.58231225713983, + "rewards/rejected": -29.896827110877403, + "step": 3119 + }, + { + "epoch": 0.7806830977104967, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6013406.4, + "logits/rejected": -34066429.71428572, + "logps/chosen": -487.156982421875, + "logps/rejected": -592.1492047991071, + "loss": 0.0256, + "rewards/chosen": 7.14148178100586, + "rewards/margins": 26.187391117640907, + "rewards/rejected": -19.045909336635045, + "step": 3120 + }, + { + "epoch": 0.7809333166520706, + "grad_norm": 0.4296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83720152.61538461, + "logits/rejected": -63632488.72727273, + "logps/chosen": -469.3756760817308, + "logps/rejected": -589.5751509232955, + "loss": 0.0008, + "rewards/chosen": 8.813344515286959, + "rewards/margins": 28.600082130698894, + "rewards/rejected": -19.786737615411933, + "step": 3121 + }, + { + "epoch": 0.7811835355936444, + "grad_norm": 15.625, + "kl": 0.1891886442899704, + "learning_rate": 5e-06, + "logits/chosen": -61324842.666666664, + "logits/rejected": -50074901.333333336, + "logps/chosen": -426.0803629557292, + "logps/rejected": -516.2776692708334, + "loss": 0.034, + "rewards/chosen": 10.313753763834635, + "rewards/margins": 25.528283437093098, + "rewards/rejected": -15.214529673258463, + "step": 3122 + }, + { + "epoch": 0.7814337545352184, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46754535.11111111, + "logits/rejected": -47186380.8, + "logps/chosen": -542.6650933159722, + "logps/rejected": -666.4768229166667, + "loss": 0.0049, + "rewards/chosen": 10.556115044487846, + "rewards/margins": 28.656094699435762, + "rewards/rejected": -18.099979654947916, + "step": 3123 + }, + { + "epoch": 0.7816839734767922, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54529442.461538464, + "logits/rejected": -58733416.72727273, + "logps/chosen": -423.11899038461536, + "logps/rejected": -744.0970348011364, + "loss": 0.0239, + "rewards/chosen": 10.640434852013222, + "rewards/margins": 36.29103632573481, + "rewards/rejected": -25.65060147372159, + "step": 3124 + }, + { + "epoch": 0.781934192418366, + "grad_norm": 3.859375, + "kl": 3.689861297607422, + "learning_rate": 5e-06, + "logits/chosen": -42082944.0, + "logits/rejected": -57787525.81818182, + "logps/chosen": -320.47269381009613, + "logps/rejected": -523.4486860795455, + "loss": 0.0577, + "rewards/chosen": 7.987174400916467, + "rewards/margins": 22.344153771033653, + "rewards/rejected": -14.356979370117188, + "step": 3125 + }, + { + "epoch": 0.78218441135994, + "grad_norm": 1.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46319450.666666664, + "logits/rejected": -58136485.333333336, + "logps/chosen": -396.3977457682292, + "logps/rejected": -629.2072347005209, + "loss": 0.0116, + "rewards/chosen": 9.46846071879069, + "rewards/margins": 31.06225903828939, + "rewards/rejected": -21.5937983194987, + "step": 3126 + }, + { + "epoch": 0.7824346303015138, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28529194.666666668, + "logits/rejected": -42638664.0, + "logps/chosen": -515.0146484375, + "logps/rejected": -600.2367350260416, + "loss": 0.0716, + "rewards/chosen": 8.847700754801432, + "rewards/margins": 28.76372400919596, + "rewards/rejected": -19.91602325439453, + "step": 3127 + }, + { + "epoch": 0.7826848492430877, + "grad_norm": 6.59375, + "kl": 0.9355100393295288, + "learning_rate": 5e-06, + "logits/chosen": -81510170.66666667, + "logits/rejected": -87762389.33333333, + "logps/chosen": -420.345703125, + "logps/rejected": -632.9634602864584, + "loss": 0.0103, + "rewards/chosen": 9.198383331298828, + "rewards/margins": 28.13521957397461, + "rewards/rejected": -18.93683624267578, + "step": 3128 + }, + { + "epoch": 0.7829350681846616, + "grad_norm": 4.3125, + "kl": 0.5725581049919128, + "learning_rate": 5e-06, + "logits/chosen": -58936743.384615384, + "logits/rejected": -89005998.54545455, + "logps/chosen": -412.79405799278845, + "logps/rejected": -746.6052911931819, + "loss": 0.0128, + "rewards/chosen": 8.69496859036959, + "rewards/margins": 33.89529984480851, + "rewards/rejected": -25.20033125443892, + "step": 3129 + }, + { + "epoch": 0.7831852871262355, + "grad_norm": 9.875, + "kl": 8.888015747070312, + "learning_rate": 5e-06, + "logits/chosen": -67455899.42857143, + "logits/rejected": -39160233.6, + "logps/chosen": -408.77737862723217, + "logps/rejected": -444.98759765625, + "loss": 0.0499, + "rewards/chosen": 7.465677533830915, + "rewards/margins": 20.73968418666295, + "rewards/rejected": -13.274006652832032, + "step": 3130 + }, + { + "epoch": 0.7834355060678093, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42953516.307692304, + "logits/rejected": -55263092.36363637, + "logps/chosen": -403.3117487980769, + "logps/rejected": -689.1675248579545, + "loss": 0.0572, + "rewards/chosen": 7.689930842472957, + "rewards/margins": 34.877375355967274, + "rewards/rejected": -27.187444513494317, + "step": 3131 + }, + { + "epoch": 0.7836857250093832, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38362858.666666664, + "logits/rejected": -47737888.0, + "logps/chosen": -362.32727864583336, + "logps/rejected": -577.8590494791666, + "loss": 0.0224, + "rewards/chosen": 8.77501729329427, + "rewards/margins": 23.562255520290798, + "rewards/rejected": -14.787238226996529, + "step": 3132 + }, + { + "epoch": 0.7839359439509571, + "grad_norm": 19.5, + "kl": 16.044431686401367, + "learning_rate": 5e-06, + "logits/chosen": -38804069.64705882, + "logits/rejected": -50074605.71428572, + "logps/chosen": -418.4707605698529, + "logps/rejected": -370.35672433035717, + "loss": 0.0825, + "rewards/chosen": 10.449409933651195, + "rewards/margins": 20.519178566812467, + "rewards/rejected": -10.069768633161273, + "step": 3133 + }, + { + "epoch": 0.784186162892531, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40449085.333333336, + "logits/rejected": -35129957.333333336, + "logps/chosen": -450.840087890625, + "logps/rejected": -584.0757242838541, + "loss": 0.03, + "rewards/chosen": 10.264256159464518, + "rewards/margins": 28.018547693888344, + "rewards/rejected": -17.754291534423828, + "step": 3134 + }, + { + "epoch": 0.7844363818341048, + "grad_norm": 2.453125, + "kl": 3.6843173503875732, + "learning_rate": 5e-06, + "logits/chosen": -31152984.615384616, + "logits/rejected": -62613899.63636363, + "logps/chosen": -386.9408428485577, + "logps/rejected": -664.0089222301136, + "loss": 0.0232, + "rewards/chosen": 8.866382305438702, + "rewards/margins": 26.396075908954327, + "rewards/rejected": -17.529693603515625, + "step": 3135 + }, + { + "epoch": 0.7846866007756788, + "grad_norm": 1.3828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32743926.4, + "logits/rejected": -60384649.14285714, + "logps/chosen": -249.809716796875, + "logps/rejected": -660.89794921875, + "loss": 0.0214, + "rewards/chosen": 6.709999084472656, + "rewards/margins": 30.02502986363002, + "rewards/rejected": -23.315030779157365, + "step": 3136 + }, + { + "epoch": 0.7849368197172526, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -80641664.0, + "logits/rejected": -41814842.18181818, + "logps/chosen": -346.5217848557692, + "logps/rejected": -689.0806107954545, + "loss": 0.0184, + "rewards/chosen": 8.534469017615685, + "rewards/margins": 27.653948110300345, + "rewards/rejected": -19.11947909268466, + "step": 3137 + }, + { + "epoch": 0.7851870386588264, + "grad_norm": 9.0625, + "kl": 9.07091999053955, + "learning_rate": 5e-06, + "logits/chosen": -52289996.8, + "logits/rejected": -54227637.333333336, + "logps/chosen": -397.72376302083336, + "logps/rejected": -604.8625217013889, + "loss": 0.0441, + "rewards/chosen": 8.233914693196615, + "rewards/margins": 28.718403286404083, + "rewards/rejected": -20.484488593207466, + "step": 3138 + }, + { + "epoch": 0.7854372576004004, + "grad_norm": 3.859375, + "kl": 4.453624248504639, + "learning_rate": 5e-06, + "logits/chosen": -54622132.36363637, + "logits/rejected": -49797159.384615384, + "logps/chosen": -443.37362393465907, + "logps/rejected": -521.3897986778846, + "loss": 0.0153, + "rewards/chosen": 9.03527762673118, + "rewards/margins": 23.135544303413873, + "rewards/rejected": -14.100266676682692, + "step": 3139 + }, + { + "epoch": 0.7856874765419742, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30233636.923076924, + "logits/rejected": -27408424.727272727, + "logps/chosen": -433.4579326923077, + "logps/rejected": -614.9102450284091, + "loss": 0.032, + "rewards/chosen": 9.076451228215145, + "rewards/margins": 26.53073867050918, + "rewards/rejected": -17.454287442294035, + "step": 3140 + }, + { + "epoch": 0.7859376954835481, + "grad_norm": 7.84375, + "kl": 3.48591685295105, + "learning_rate": 5e-06, + "logits/chosen": -56627895.46666667, + "logits/rejected": -21182545.777777776, + "logps/chosen": -275.36438802083336, + "logps/rejected": -787.6069878472222, + "loss": 0.0459, + "rewards/chosen": 7.977870686848958, + "rewards/margins": 27.95308363172743, + "rewards/rejected": -19.97521294487847, + "step": 3141 + }, + { + "epoch": 0.786187914425122, + "grad_norm": 6.4375, + "kl": 12.190521240234375, + "learning_rate": 5e-06, + "logits/chosen": -61239764.0, + "logits/rejected": -63715416.0, + "logps/chosen": -425.7151794433594, + "logps/rejected": -549.8751831054688, + "loss": 0.0131, + "rewards/chosen": 9.285855293273926, + "rewards/margins": 25.927979469299316, + "rewards/rejected": -16.64212417602539, + "step": 3142 + }, + { + "epoch": 0.7864381333666959, + "grad_norm": 6.34375, + "kl": 2.870297908782959, + "learning_rate": 5e-06, + "logits/chosen": -57561028.571428575, + "logits/rejected": -58605926.4, + "logps/chosen": -488.75223214285717, + "logps/rejected": -777.88671875, + "loss": 0.0088, + "rewards/chosen": 10.573829650878906, + "rewards/margins": 31.817106628417967, + "rewards/rejected": -21.24327697753906, + "step": 3143 + }, + { + "epoch": 0.7866883523082697, + "grad_norm": 5.15625, + "kl": 6.198540687561035, + "learning_rate": 5e-06, + "logits/chosen": -62383247.058823526, + "logits/rejected": -26992896.0, + "logps/chosen": -415.0096220128676, + "logps/rejected": -659.2712053571429, + "loss": 0.0511, + "rewards/chosen": 9.577201394473805, + "rewards/margins": 25.989549460531286, + "rewards/rejected": -16.41234806605748, + "step": 3144 + }, + { + "epoch": 0.7869385712498436, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24877188.923076924, + "logits/rejected": -45688320.0, + "logps/chosen": -270.0003004807692, + "logps/rejected": -729.8998579545455, + "loss": 0.0207, + "rewards/chosen": 8.143423227163462, + "rewards/margins": 30.31115615951431, + "rewards/rejected": -22.16773293235085, + "step": 3145 + }, + { + "epoch": 0.7871887901914175, + "grad_norm": 15.0, + "kl": 11.193626403808594, + "learning_rate": 5e-06, + "logits/chosen": -46192749.71428572, + "logits/rejected": -69040646.4, + "logps/chosen": -382.97042410714283, + "logps/rejected": -654.82109375, + "loss": 0.0907, + "rewards/chosen": 7.881475176130023, + "rewards/margins": 22.018438066755024, + "rewards/rejected": -14.136962890625, + "step": 3146 + }, + { + "epoch": 0.7874390091329914, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51012667.428571425, + "logits/rejected": -60799750.4, + "logps/chosen": -254.10777064732142, + "logps/rejected": -646.07841796875, + "loss": 0.0547, + "rewards/chosen": 6.962978907993862, + "rewards/margins": 27.776849147251674, + "rewards/rejected": -20.813870239257813, + "step": 3147 + }, + { + "epoch": 0.7876892280745652, + "grad_norm": 1.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33126227.2, + "logits/rejected": -86313709.71428572, + "logps/chosen": -304.51005859375, + "logps/rejected": -799.0270647321429, + "loss": 0.0252, + "rewards/chosen": 7.89716796875, + "rewards/margins": 33.42469482421875, + "rewards/rejected": -25.52752685546875, + "step": 3148 + }, + { + "epoch": 0.7879394470161392, + "grad_norm": 7.15625, + "kl": 0.5937080383300781, + "learning_rate": 5e-06, + "logits/chosen": -44233964.307692304, + "logits/rejected": -24101265.454545453, + "logps/chosen": -334.3811598557692, + "logps/rejected": -729.2784090909091, + "loss": 0.012, + "rewards/chosen": 7.730343158428486, + "rewards/margins": 26.94199504718914, + "rewards/rejected": -19.211651888760652, + "step": 3149 + }, + { + "epoch": 0.788189665957713, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68449989.81818181, + "logits/rejected": -55347456.0, + "logps/chosen": -467.30752840909093, + "logps/rejected": -669.2489483173077, + "loss": 0.0059, + "rewards/chosen": 7.826829390092329, + "rewards/margins": 27.49265486710555, + "rewards/rejected": -19.66582547701322, + "step": 3150 + }, + { + "epoch": 0.7884398848992868, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58427510.15384615, + "logits/rejected": -50430807.27272727, + "logps/chosen": -346.02005709134613, + "logps/rejected": -730.6151012073864, + "loss": 0.0414, + "rewards/chosen": 7.546366764948918, + "rewards/margins": 28.723646150602328, + "rewards/rejected": -21.17727938565341, + "step": 3151 + }, + { + "epoch": 0.7886901038408608, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47160512.0, + "logits/rejected": -42045126.4, + "logps/chosen": -417.5836704799107, + "logps/rejected": -576.8109375, + "loss": 0.0056, + "rewards/chosen": 8.983966282435826, + "rewards/margins": 30.452170017787388, + "rewards/rejected": -21.468203735351562, + "step": 3152 + }, + { + "epoch": 0.7889403227824346, + "grad_norm": 5.96875, + "kl": 18.41874122619629, + "learning_rate": 5e-06, + "logits/chosen": -59820896.0, + "logits/rejected": -42821110.85714286, + "logps/chosen": -321.6694091796875, + "logps/rejected": -618.1948939732143, + "loss": 0.094, + "rewards/chosen": 7.6344970703125, + "rewards/margins": 25.744823564801898, + "rewards/rejected": -18.110326494489396, + "step": 3153 + }, + { + "epoch": 0.7891905417240085, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51972928.0, + "logits/rejected": -63745559.27272727, + "logps/chosen": -431.43370643028845, + "logps/rejected": -476.5553089488636, + "loss": 0.016, + "rewards/chosen": 8.034474886380709, + "rewards/margins": 21.89949286400855, + "rewards/rejected": -13.865017977627842, + "step": 3154 + }, + { + "epoch": 0.7894407606655823, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11057011.692307692, + "logits/rejected": -86633629.0909091, + "logps/chosen": -333.7978515625, + "logps/rejected": -775.5379083806819, + "loss": 0.0289, + "rewards/chosen": 7.483803969163161, + "rewards/margins": 29.959417649916002, + "rewards/rejected": -22.47561368075284, + "step": 3155 + }, + { + "epoch": 0.7896909796071563, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52886726.4, + "logits/rejected": -57080114.28571428, + "logps/chosen": -320.0166748046875, + "logps/rejected": -604.2458147321429, + "loss": 0.0293, + "rewards/chosen": 6.805620574951172, + "rewards/margins": 25.07688914707729, + "rewards/rejected": -18.271268572126115, + "step": 3156 + }, + { + "epoch": 0.7899411985487301, + "grad_norm": 6.09375, + "kl": 15.784521102905273, + "learning_rate": 5e-06, + "logits/chosen": -77469696.0, + "logits/rejected": -37947859.2, + "logps/chosen": -393.7251674107143, + "logps/rejected": -607.499267578125, + "loss": 0.0314, + "rewards/chosen": 9.636716570172991, + "rewards/margins": 27.675237383161274, + "rewards/rejected": -18.03852081298828, + "step": 3157 + }, + { + "epoch": 0.790191417490304, + "grad_norm": 1.1796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62118282.666666664, + "logits/rejected": -49192293.333333336, + "logps/chosen": -334.06882731119794, + "logps/rejected": -749.6505533854166, + "loss": 0.0189, + "rewards/chosen": 8.312831242879232, + "rewards/margins": 30.779495875040688, + "rewards/rejected": -22.466664632161457, + "step": 3158 + }, + { + "epoch": 0.7904416364318779, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9216285.6, + "logits/rejected": -62547483.428571425, + "logps/chosen": -248.832421875, + "logps/rejected": -673.5, + "loss": 0.0298, + "rewards/chosen": 6.885089874267578, + "rewards/margins": 26.526974814278738, + "rewards/rejected": -19.64188494001116, + "step": 3159 + }, + { + "epoch": 0.7906918553734518, + "grad_norm": 2.453125, + "kl": 12.48333740234375, + "learning_rate": 5e-06, + "logits/chosen": -63101454.76923077, + "logits/rejected": -22890221.09090909, + "logps/chosen": -463.44057992788464, + "logps/rejected": -677.2645596590909, + "loss": 0.0137, + "rewards/chosen": 11.52263934795673, + "rewards/margins": 36.74104810594679, + "rewards/rejected": -25.21840875799006, + "step": 3160 + }, + { + "epoch": 0.7909420743150256, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28679890.666666668, + "logits/rejected": -9501543.333333334, + "logps/chosen": -436.7552897135417, + "logps/rejected": -517.740478515625, + "loss": 0.0047, + "rewards/chosen": 9.497055053710938, + "rewards/margins": 27.30694071451823, + "rewards/rejected": -17.809885660807293, + "step": 3161 + }, + { + "epoch": 0.7911922932565996, + "grad_norm": 11.0, + "kl": 1.8298888206481934, + "learning_rate": 5e-06, + "logits/chosen": -11951538.285714285, + "logits/rejected": -46614064.0, + "logps/chosen": -318.5646275111607, + "logps/rejected": -527.307373046875, + "loss": 0.08, + "rewards/chosen": 6.056464603969029, + "rewards/margins": 18.988440922328405, + "rewards/rejected": -12.931976318359375, + "step": 3162 + }, + { + "epoch": 0.7914425121981734, + "grad_norm": 7.9375, + "kl": 6.409518241882324, + "learning_rate": 5e-06, + "logits/chosen": -43496846.76923077, + "logits/rejected": -54040238.54545455, + "logps/chosen": -332.86624849759613, + "logps/rejected": -560.8447265625, + "loss": 0.0341, + "rewards/chosen": 8.752815833458534, + "rewards/margins": 19.82694372430548, + "rewards/rejected": -11.074127890846945, + "step": 3163 + }, + { + "epoch": 0.7916927311397473, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69198865.45454545, + "logits/rejected": -45574537.84615385, + "logps/chosen": -270.3463023792614, + "logps/rejected": -603.3871694711538, + "loss": 0.0611, + "rewards/chosen": 7.215987465598366, + "rewards/margins": 26.000185933146444, + "rewards/rejected": -18.784198467548077, + "step": 3164 + }, + { + "epoch": 0.7919429500813212, + "grad_norm": 6.28125, + "kl": 3.796109676361084, + "learning_rate": 5e-06, + "logits/chosen": -31222180.57142857, + "logits/rejected": -34780358.4, + "logps/chosen": -415.83189174107144, + "logps/rejected": -544.44375, + "loss": 0.0285, + "rewards/chosen": 10.159109933035714, + "rewards/margins": 26.318927437918525, + "rewards/rejected": -16.159817504882813, + "step": 3165 + }, + { + "epoch": 0.792193169022895, + "grad_norm": 3.9375, + "kl": 14.186848640441895, + "learning_rate": 5e-06, + "logits/chosen": -45562816.0, + "logits/rejected": -42035481.6, + "logps/chosen": -358.0990513392857, + "logps/rejected": -576.576123046875, + "loss": 0.0866, + "rewards/chosen": 9.054125104631696, + "rewards/margins": 24.360303388323103, + "rewards/rejected": -15.306178283691406, + "step": 3166 + }, + { + "epoch": 0.7924433879644689, + "grad_norm": 3.203125, + "kl": 1.0441970825195312, + "learning_rate": 5e-06, + "logits/chosen": -47982981.81818182, + "logits/rejected": -54493080.615384616, + "logps/chosen": -328.297607421875, + "logps/rejected": -744.0870643028846, + "loss": 0.0172, + "rewards/chosen": 8.062478498979049, + "rewards/margins": 28.275263486208615, + "rewards/rejected": -20.21278498722957, + "step": 3167 + }, + { + "epoch": 0.7926936069060427, + "grad_norm": 11.25, + "kl": 0.26417669653892517, + "learning_rate": 5e-06, + "logits/chosen": -44968453.81818182, + "logits/rejected": -54137462.15384615, + "logps/chosen": -365.7479137073864, + "logps/rejected": -635.6334134615385, + "loss": 0.0186, + "rewards/chosen": 8.448943398215555, + "rewards/margins": 26.123384222284063, + "rewards/rejected": -17.67444082406851, + "step": 3168 + }, + { + "epoch": 0.7929438258476167, + "grad_norm": 7.6875, + "kl": 14.424945831298828, + "learning_rate": 5e-06, + "logits/chosen": -89681005.71428572, + "logits/rejected": -31658153.6, + "logps/chosen": -498.68896484375, + "logps/rejected": -617.385107421875, + "loss": 0.042, + "rewards/chosen": 12.469615391322545, + "rewards/margins": 27.250058201381137, + "rewards/rejected": -14.780442810058593, + "step": 3169 + }, + { + "epoch": 0.7931940447891905, + "grad_norm": 4.75, + "kl": 9.40216064453125, + "learning_rate": 5e-06, + "logits/chosen": -41881044.0, + "logits/rejected": -32860892.0, + "logps/chosen": -440.55438232421875, + "logps/rejected": -714.7567138671875, + "loss": 0.0176, + "rewards/chosen": 9.913310050964355, + "rewards/margins": 27.049975395202637, + "rewards/rejected": -17.13666534423828, + "step": 3170 + }, + { + "epoch": 0.7934442637307644, + "grad_norm": 34.5, + "kl": 0.2805735468864441, + "learning_rate": 5e-06, + "logits/chosen": -35806854.4, + "logits/rejected": -24823442.285714287, + "logps/chosen": -358.1268798828125, + "logps/rejected": -717.6179547991071, + "loss": 0.054, + "rewards/chosen": 9.4863037109375, + "rewards/margins": 21.878467668805804, + "rewards/rejected": -12.392163957868304, + "step": 3171 + }, + { + "epoch": 0.7936944826723383, + "grad_norm": 24.75, + "kl": 5.164003372192383, + "learning_rate": 5e-06, + "logits/chosen": -55440917.333333336, + "logits/rejected": -45847984.0, + "logps/chosen": -348.71240234375, + "logps/rejected": -492.47998046875, + "loss": 0.0474, + "rewards/chosen": 8.108353932698568, + "rewards/margins": 20.167166392008465, + "rewards/rejected": -12.058812459309896, + "step": 3172 + }, + { + "epoch": 0.7939447016139122, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39877715.692307696, + "logits/rejected": -42870679.27272727, + "logps/chosen": -432.9982722355769, + "logps/rejected": -536.3340287642045, + "loss": 0.0435, + "rewards/chosen": 7.3421501746544475, + "rewards/margins": 23.575875982537973, + "rewards/rejected": -16.233725807883523, + "step": 3173 + }, + { + "epoch": 0.794194920555486, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40508986.18181818, + "logits/rejected": -49383940.92307692, + "logps/chosen": -287.96604225852275, + "logps/rejected": -744.3415715144231, + "loss": 0.0429, + "rewards/chosen": 7.526300603693182, + "rewards/margins": 26.27119947313429, + "rewards/rejected": -18.744898869441105, + "step": 3174 + }, + { + "epoch": 0.79444513949706, + "grad_norm": 3.609375, + "kl": 4.230663299560547, + "learning_rate": 5e-06, + "logits/chosen": -53834279.384615384, + "logits/rejected": -37132552.72727273, + "logps/chosen": -315.91597806490387, + "logps/rejected": -515.4821999289773, + "loss": 0.0441, + "rewards/chosen": 7.125238858736479, + "rewards/margins": 21.325274647532645, + "rewards/rejected": -14.200035788796164, + "step": 3175 + }, + { + "epoch": 0.7946953584386338, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57633701.333333336, + "logits/rejected": -44814215.11111111, + "logps/chosen": -387.0567220052083, + "logps/rejected": -545.6187065972222, + "loss": 0.057, + "rewards/chosen": 8.452921549479166, + "rewards/margins": 19.92859395345052, + "rewards/rejected": -11.475672403971354, + "step": 3176 + }, + { + "epoch": 0.7949455773802077, + "grad_norm": 0.83203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64562823.11111111, + "logits/rejected": -66326694.4, + "logps/chosen": -392.9104817708333, + "logps/rejected": -787.056640625, + "loss": 0.0008, + "rewards/chosen": 9.614667256673178, + "rewards/margins": 28.884310404459633, + "rewards/rejected": -19.269643147786457, + "step": 3177 + }, + { + "epoch": 0.7951957963217816, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23798093.333333332, + "logits/rejected": -50254560.0, + "logps/chosen": -361.1700846354167, + "logps/rejected": -621.4038899739584, + "loss": 0.0674, + "rewards/chosen": 7.391970952351888, + "rewards/margins": 22.86704953511556, + "rewards/rejected": -15.475078582763672, + "step": 3178 + }, + { + "epoch": 0.7954460152633555, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49946013.538461536, + "logits/rejected": -38336584.72727273, + "logps/chosen": -270.4592472956731, + "logps/rejected": -349.99922318892044, + "loss": 0.0369, + "rewards/chosen": 7.067205575796274, + "rewards/margins": 19.71114312018548, + "rewards/rejected": -12.643937544389205, + "step": 3179 + }, + { + "epoch": 0.7956962342049293, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63100236.8, + "logits/rejected": -61981284.571428575, + "logps/chosen": -485.782275390625, + "logps/rejected": -611.1044921875, + "loss": 0.0221, + "rewards/chosen": 10.09509506225586, + "rewards/margins": 28.49079033987863, + "rewards/rejected": -18.395695277622767, + "step": 3180 + }, + { + "epoch": 0.7959464531465031, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52217526.15384615, + "logits/rejected": -50430789.81818182, + "logps/chosen": -373.61087740384613, + "logps/rejected": -551.7339754971591, + "loss": 0.0199, + "rewards/chosen": 9.113879864032452, + "rewards/margins": 25.32080152818373, + "rewards/rejected": -16.206921664151277, + "step": 3181 + }, + { + "epoch": 0.7961966720880771, + "grad_norm": 2.796875, + "kl": 12.237716674804688, + "learning_rate": 5e-06, + "logits/chosen": -44968992.0, + "logits/rejected": -53136330.666666664, + "logps/chosen": -419.9730224609375, + "logps/rejected": -650.16650390625, + "loss": 0.0136, + "rewards/chosen": 10.710688273111979, + "rewards/margins": 28.072784423828125, + "rewards/rejected": -17.362096150716145, + "step": 3182 + }, + { + "epoch": 0.7964468910296509, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45695716.571428575, + "logits/rejected": -62012280.47058824, + "logps/chosen": -569.3485630580357, + "logps/rejected": -672.0313074448529, + "loss": 0.0411, + "rewards/chosen": 12.5328369140625, + "rewards/margins": 31.355357450597428, + "rewards/rejected": -18.822520536534928, + "step": 3183 + }, + { + "epoch": 0.7966971099712248, + "grad_norm": 12.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40817491.692307696, + "logits/rejected": -46814949.81818182, + "logps/chosen": -421.5954402043269, + "logps/rejected": -630.8254616477273, + "loss": 0.0247, + "rewards/chosen": 8.455179654634916, + "rewards/margins": 24.46166602714912, + "rewards/rejected": -16.006486372514203, + "step": 3184 + }, + { + "epoch": 0.7969473289127987, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -74645229.71428572, + "logits/rejected": -50018996.705882356, + "logps/chosen": -275.17051478794644, + "logps/rejected": -653.041015625, + "loss": 0.0153, + "rewards/chosen": 6.632861001150949, + "rewards/margins": 27.125556529069147, + "rewards/rejected": -20.492695527918197, + "step": 3185 + }, + { + "epoch": 0.7971975478543726, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34934528.0, + "logits/rejected": -48389464.0, + "logps/chosen": -409.9518737792969, + "logps/rejected": -725.6411743164062, + "loss": 0.022, + "rewards/chosen": 8.54737377166748, + "rewards/margins": 27.53396701812744, + "rewards/rejected": -18.98659324645996, + "step": 3186 + }, + { + "epoch": 0.7974477667959464, + "grad_norm": 7.625, + "kl": 11.701041221618652, + "learning_rate": 5e-06, + "logits/chosen": -38962309.333333336, + "logits/rejected": -34656282.666666664, + "logps/chosen": -365.2467447916667, + "logps/rejected": -584.3806559244791, + "loss": 0.0704, + "rewards/chosen": 7.8193613688151045, + "rewards/margins": 25.920926411946617, + "rewards/rejected": -18.10156504313151, + "step": 3187 + }, + { + "epoch": 0.7976979857375204, + "grad_norm": 14.6875, + "kl": 9.727387428283691, + "learning_rate": 5e-06, + "logits/chosen": -83727034.66666667, + "logits/rejected": -36145797.333333336, + "logps/chosen": -452.2635091145833, + "logps/rejected": -394.3990071614583, + "loss": 0.1001, + "rewards/chosen": 9.262088775634766, + "rewards/margins": 21.685614267985024, + "rewards/rejected": -12.42352549235026, + "step": 3188 + }, + { + "epoch": 0.7979482046790942, + "grad_norm": 0.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34321337.6, + "logits/rejected": -30197581.714285713, + "logps/chosen": -428.90302734375, + "logps/rejected": -725.2661830357143, + "loss": 0.0009, + "rewards/chosen": 9.975694274902343, + "rewards/margins": 31.05088086809431, + "rewards/rejected": -21.075186593191965, + "step": 3189 + }, + { + "epoch": 0.7981984236206681, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55778993.777777776, + "logits/rejected": -39186107.733333334, + "logps/chosen": -451.8445638020833, + "logps/rejected": -636.5614583333333, + "loss": 0.0047, + "rewards/chosen": 10.112894694010416, + "rewards/margins": 28.589029947916664, + "rewards/rejected": -18.47613525390625, + "step": 3190 + }, + { + "epoch": 0.798448642562242, + "grad_norm": 4.5625, + "kl": 3.388685941696167, + "learning_rate": 5e-06, + "logits/chosen": -31397070.222222224, + "logits/rejected": -78006732.8, + "logps/chosen": -399.66314019097223, + "logps/rejected": -739.7107421875, + "loss": 0.0359, + "rewards/chosen": 9.233257717556423, + "rewards/margins": 27.453185696072048, + "rewards/rejected": -18.219927978515624, + "step": 3191 + }, + { + "epoch": 0.7986988615038159, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60730909.538461536, + "logits/rejected": -48511467.63636363, + "logps/chosen": -462.8356370192308, + "logps/rejected": -628.9865056818181, + "loss": 0.0076, + "rewards/chosen": 9.333608774038462, + "rewards/margins": 28.806861503974538, + "rewards/rejected": -19.47325272993608, + "step": 3192 + }, + { + "epoch": 0.7989490804453897, + "grad_norm": 1.9140625, + "kl": 7.800597190856934, + "learning_rate": 5e-06, + "logits/chosen": -79365836.8, + "logits/rejected": -42586829.71428572, + "logps/chosen": -415.9365234375, + "logps/rejected": -691.2517438616071, + "loss": 0.016, + "rewards/chosen": 9.905123138427735, + "rewards/margins": 30.883564649309434, + "rewards/rejected": -20.978441510881698, + "step": 3193 + }, + { + "epoch": 0.7991992993869635, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20476076.307692308, + "logits/rejected": -61493434.18181818, + "logps/chosen": -286.3638258713942, + "logps/rejected": -686.7190163352273, + "loss": 0.0603, + "rewards/chosen": 6.751426696777344, + "rewards/margins": 26.39008192582564, + "rewards/rejected": -19.638655229048297, + "step": 3194 + }, + { + "epoch": 0.7994495183285375, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20343374.222222224, + "logits/rejected": -59409493.333333336, + "logps/chosen": -451.0512966579861, + "logps/rejected": -641.4867838541667, + "loss": 0.0386, + "rewards/chosen": 8.523438347710503, + "rewards/margins": 28.037704298231336, + "rewards/rejected": -19.514265950520834, + "step": 3195 + }, + { + "epoch": 0.7996997372701113, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40899112.0, + "logits/rejected": -66037236.0, + "logps/chosen": -407.66290283203125, + "logps/rejected": -555.2177124023438, + "loss": 0.0152, + "rewards/chosen": 11.925619125366211, + "rewards/margins": 27.827353477478027, + "rewards/rejected": -15.901734352111816, + "step": 3196 + }, + { + "epoch": 0.7999499562116852, + "grad_norm": 9.125, + "kl": 5.743766784667969, + "learning_rate": 5e-06, + "logits/chosen": -58228992.0, + "logits/rejected": -54248864.0, + "logps/chosen": -528.6583658854166, + "logps/rejected": -888.0361328125, + "loss": 0.024, + "rewards/chosen": 11.198453267415365, + "rewards/margins": 34.903709411621094, + "rewards/rejected": -23.70525614420573, + "step": 3197 + }, + { + "epoch": 0.8002001751532591, + "grad_norm": 11.0625, + "kl": 13.101539611816406, + "learning_rate": 5e-06, + "logits/chosen": -95874520.0, + "logits/rejected": -51234328.0, + "logps/chosen": -322.5673828125, + "logps/rejected": -633.8896484375, + "loss": 0.0934, + "rewards/chosen": 7.500769138336182, + "rewards/margins": 24.063400745391846, + "rewards/rejected": -16.562631607055664, + "step": 3198 + }, + { + "epoch": 0.800450394094833, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62050077.538461536, + "logits/rejected": -47988221.09090909, + "logps/chosen": -348.57237830528845, + "logps/rejected": -814.7113813920455, + "loss": 0.0615, + "rewards/chosen": 6.292000990647536, + "rewards/margins": 30.196763952295264, + "rewards/rejected": -23.904762961647727, + "step": 3199 + }, + { + "epoch": 0.8007006130364068, + "grad_norm": 5.96875, + "kl": 0.19045767188072205, + "learning_rate": 5e-06, + "logits/chosen": -18914081.14285714, + "logits/rejected": -84356044.8, + "logps/chosen": -396.59852818080356, + "logps/rejected": -802.9658203125, + "loss": 0.0303, + "rewards/chosen": 7.647272382463727, + "rewards/margins": 27.783457074846538, + "rewards/rejected": -20.136184692382812, + "step": 3200 + }, + { + "epoch": 0.8009508319779808, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28635168.0, + "logits/rejected": -40962779.428571425, + "logps/chosen": -395.7276611328125, + "logps/rejected": -565.0685686383929, + "loss": 0.055, + "rewards/chosen": 7.526345825195312, + "rewards/margins": 25.824136352539064, + "rewards/rejected": -18.29779052734375, + "step": 3201 + }, + { + "epoch": 0.8012010509195546, + "grad_norm": 2.046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39366330.666666664, + "logits/rejected": 10287048.0, + "logps/chosen": -329.6221516927083, + "logps/rejected": -630.472412109375, + "loss": 0.0138, + "rewards/chosen": 7.517525990804036, + "rewards/margins": 20.54009755452474, + "rewards/rejected": -13.022571563720703, + "step": 3202 + }, + { + "epoch": 0.8014512698611285, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39989789.09090909, + "logits/rejected": -62233816.615384616, + "logps/chosen": -483.79563210227275, + "logps/rejected": -765.8640324519231, + "loss": 0.0032, + "rewards/chosen": 10.0062255859375, + "rewards/margins": 30.44920935997596, + "rewards/rejected": -20.44298377403846, + "step": 3203 + }, + { + "epoch": 0.8017014888027023, + "grad_norm": 1.1171875, + "kl": 3.607430934906006, + "learning_rate": 5e-06, + "logits/chosen": -54108136.72727273, + "logits/rejected": -65516130.461538464, + "logps/chosen": -388.0989879261364, + "logps/rejected": -770.2540564903846, + "loss": 0.0133, + "rewards/chosen": 8.890317049893467, + "rewards/margins": 30.706448641690343, + "rewards/rejected": -21.816131591796875, + "step": 3204 + }, + { + "epoch": 0.8019517077442763, + "grad_norm": 1.2421875, + "kl": 4.821498870849609, + "learning_rate": 5e-06, + "logits/chosen": -33921461.333333336, + "logits/rejected": -53273552.0, + "logps/chosen": -361.0171712239583, + "logps/rejected": -718.2425944010416, + "loss": 0.0259, + "rewards/chosen": 8.330461502075195, + "rewards/margins": 28.48835055033366, + "rewards/rejected": -20.157889048258465, + "step": 3205 + }, + { + "epoch": 0.8022019266858501, + "grad_norm": 7.65625, + "kl": 6.568589210510254, + "learning_rate": 5e-06, + "logits/chosen": -36829276.0, + "logits/rejected": -22510008.0, + "logps/chosen": -309.2214660644531, + "logps/rejected": -341.2691650390625, + "loss": 0.1108, + "rewards/chosen": 7.454017639160156, + "rewards/margins": 22.788188934326172, + "rewards/rejected": -15.334171295166016, + "step": 3206 + }, + { + "epoch": 0.802452145627424, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40064986.18181818, + "logits/rejected": -58998306.461538464, + "logps/chosen": -413.9158380681818, + "logps/rejected": -929.0920973557693, + "loss": 0.0637, + "rewards/chosen": 9.797701748934658, + "rewards/margins": 36.92608685260053, + "rewards/rejected": -27.128385103665867, + "step": 3207 + }, + { + "epoch": 0.8027023645689979, + "grad_norm": 9.375, + "kl": 21.51288414001465, + "learning_rate": 5e-06, + "logits/chosen": -39970710.85714286, + "logits/rejected": -52221651.2, + "logps/chosen": -407.622314453125, + "logps/rejected": -560.9509765625, + "loss": 0.0342, + "rewards/chosen": 10.670211791992188, + "rewards/margins": 27.144992065429687, + "rewards/rejected": -16.4747802734375, + "step": 3208 + }, + { + "epoch": 0.8029525835105717, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44512540.0, + "logits/rejected": -32697496.0, + "logps/chosen": -550.233642578125, + "logps/rejected": -752.481201171875, + "loss": 0.0225, + "rewards/chosen": 10.652582168579102, + "rewards/margins": 32.21308708190918, + "rewards/rejected": -21.560504913330078, + "step": 3209 + }, + { + "epoch": 0.8032028024521456, + "grad_norm": 13.75, + "kl": 10.17340087890625, + "learning_rate": 5e-06, + "logits/chosen": -80191890.28571428, + "logits/rejected": -28133888.0, + "logps/chosen": -376.01827566964283, + "logps/rejected": -538.31103515625, + "loss": 0.0652, + "rewards/chosen": 8.905079432896205, + "rewards/margins": 24.338126918247767, + "rewards/rejected": -15.433047485351562, + "step": 3210 + }, + { + "epoch": 0.8034530213937195, + "grad_norm": 10.625, + "kl": 5.313276767730713, + "learning_rate": 5e-06, + "logits/chosen": -67854232.0, + "logits/rejected": -19706378.0, + "logps/chosen": -303.0367431640625, + "logps/rejected": -380.15625, + "loss": 0.0371, + "rewards/chosen": 7.967641830444336, + "rewards/margins": 16.66538143157959, + "rewards/rejected": -8.697739601135254, + "step": 3211 + }, + { + "epoch": 0.8037032403352934, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4038902.153846154, + "logits/rejected": -29617297.454545453, + "logps/chosen": -453.6779221754808, + "logps/rejected": -737.0744406960227, + "loss": 0.0175, + "rewards/chosen": 9.394469627967247, + "rewards/margins": 32.474164896078044, + "rewards/rejected": -23.079695268110797, + "step": 3212 + }, + { + "epoch": 0.8039534592768672, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49809746.28571428, + "logits/rejected": -57381990.4, + "logps/chosen": -385.96358816964283, + "logps/rejected": -738.7212890625, + "loss": 0.0169, + "rewards/chosen": 8.417591094970703, + "rewards/margins": 30.885486602783203, + "rewards/rejected": -22.4678955078125, + "step": 3213 + }, + { + "epoch": 0.8042036782184412, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -96051424.0, + "logits/rejected": -38469760.0, + "logps/chosen": -542.183837890625, + "logps/rejected": -511.01953125, + "loss": 0.0226, + "rewards/chosen": 12.348125457763672, + "rewards/margins": 25.98112678527832, + "rewards/rejected": -13.633001327514648, + "step": 3214 + }, + { + "epoch": 0.804453897160015, + "grad_norm": 16.125, + "kl": 16.870433807373047, + "learning_rate": 5e-06, + "logits/chosen": -62498221.71428572, + "logits/rejected": -62844435.2, + "logps/chosen": -412.41563197544644, + "logps/rejected": -716.18056640625, + "loss": 0.0659, + "rewards/chosen": 9.586285182407924, + "rewards/margins": 30.03198983328683, + "rewards/rejected": -20.445704650878906, + "step": 3215 + }, + { + "epoch": 0.8047041161015889, + "grad_norm": 1.34375, + "kl": 8.126016616821289, + "learning_rate": 5e-06, + "logits/chosen": -31591793.230769232, + "logits/rejected": -43586085.81818182, + "logps/chosen": -394.19858022836536, + "logps/rejected": -690.0055930397727, + "loss": 0.0294, + "rewards/chosen": 9.639727665827824, + "rewards/margins": 27.84536065588464, + "rewards/rejected": -18.205632990056817, + "step": 3216 + }, + { + "epoch": 0.8049543350431627, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2161596.2, + "logits/rejected": -39407762.28571428, + "logps/chosen": -185.5008544921875, + "logps/rejected": -675.2261439732143, + "loss": 0.0715, + "rewards/chosen": 5.34935302734375, + "rewards/margins": 25.01038556780134, + "rewards/rejected": -19.66103254045759, + "step": 3217 + }, + { + "epoch": 0.8052045539847367, + "grad_norm": 4.21875, + "kl": 0.5277456045150757, + "learning_rate": 5e-06, + "logits/chosen": -72121187.55555555, + "logits/rejected": -48000268.8, + "logps/chosen": -341.22479926215277, + "logps/rejected": -510.85325520833334, + "loss": 0.0434, + "rewards/chosen": 8.134419759114584, + "rewards/margins": 21.283841959635417, + "rewards/rejected": -13.149422200520833, + "step": 3218 + }, + { + "epoch": 0.8054547729263105, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43047302.4, + "logits/rejected": -36442925.71428572, + "logps/chosen": -259.7948486328125, + "logps/rejected": -585.0173688616071, + "loss": 0.0494, + "rewards/chosen": 8.889262390136718, + "rewards/margins": 21.944705200195312, + "rewards/rejected": -13.055442810058594, + "step": 3219 + }, + { + "epoch": 0.8057049918678844, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37111076.92307692, + "logits/rejected": -58050914.90909091, + "logps/chosen": -197.52497746394232, + "logps/rejected": -549.6422230113636, + "loss": 0.0939, + "rewards/chosen": 5.346764784592849, + "rewards/margins": 14.974215900981342, + "rewards/rejected": -9.627451116388494, + "step": 3220 + }, + { + "epoch": 0.8059552108094583, + "grad_norm": 1.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21275764.363636363, + "logits/rejected": -61063670.15384615, + "logps/chosen": -293.0260120738636, + "logps/rejected": -485.2038762019231, + "loss": 0.0363, + "rewards/chosen": 7.96350791237571, + "rewards/margins": 21.04931395203917, + "rewards/rejected": -13.085806039663462, + "step": 3221 + }, + { + "epoch": 0.8062054297510322, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65545780.0, + "logits/rejected": -28291148.0, + "logps/chosen": -224.6514129638672, + "logps/rejected": -554.32568359375, + "loss": 0.0317, + "rewards/chosen": 7.293304443359375, + "rewards/margins": 24.57693862915039, + "rewards/rejected": -17.283634185791016, + "step": 3222 + }, + { + "epoch": 0.806455648692606, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33249570.90909091, + "logits/rejected": -3843030.153846154, + "logps/chosen": -325.9200550426136, + "logps/rejected": -759.0063100961538, + "loss": 0.0213, + "rewards/chosen": 7.941747492009943, + "rewards/margins": 31.17902870444985, + "rewards/rejected": -23.237281212439903, + "step": 3223 + }, + { + "epoch": 0.80670586763418, + "grad_norm": 6.375, + "kl": 14.324531555175781, + "learning_rate": 5e-06, + "logits/chosen": -40541888.0, + "logits/rejected": -40822154.666666664, + "logps/chosen": -454.834765625, + "logps/rejected": -706.8444552951389, + "loss": 0.0206, + "rewards/chosen": 9.189556884765626, + "rewards/margins": 24.148826090494794, + "rewards/rejected": -14.959269205729166, + "step": 3224 + }, + { + "epoch": 0.8069560865757538, + "grad_norm": 1.96875, + "kl": 10.10914421081543, + "learning_rate": 5e-06, + "logits/chosen": -21419680.0, + "logits/rejected": -103418123.63636364, + "logps/chosen": -340.9619140625, + "logps/rejected": -835.7579012784091, + "loss": 0.0594, + "rewards/chosen": 9.025763878455528, + "rewards/margins": 34.824569808853255, + "rewards/rejected": -25.798805930397727, + "step": 3225 + }, + { + "epoch": 0.8072063055173276, + "grad_norm": 1.765625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35815560.0, + "logits/rejected": -59639690.666666664, + "logps/chosen": -393.3280843098958, + "logps/rejected": -713.4117838541666, + "loss": 0.0452, + "rewards/chosen": 8.315086364746094, + "rewards/margins": 31.43829091389974, + "rewards/rejected": -23.123204549153645, + "step": 3226 + }, + { + "epoch": 0.8074565244589016, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44187520.0, + "logits/rejected": -34528192.0, + "logps/chosen": -325.03604403409093, + "logps/rejected": -541.2082707331731, + "loss": 0.0506, + "rewards/chosen": 7.812973716042259, + "rewards/margins": 25.548450870113772, + "rewards/rejected": -17.735477154071514, + "step": 3227 + }, + { + "epoch": 0.8077067434004754, + "grad_norm": 25.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37858638.54545455, + "logits/rejected": -39233314.461538464, + "logps/chosen": -451.9734552556818, + "logps/rejected": -544.1219200721154, + "loss": 0.0473, + "rewards/chosen": 8.544602134011008, + "rewards/margins": 22.521569578797667, + "rewards/rejected": -13.97696744478666, + "step": 3228 + }, + { + "epoch": 0.8079569623420493, + "grad_norm": 21.625, + "kl": 6.388509750366211, + "learning_rate": 5e-06, + "logits/chosen": -32603664.0, + "logits/rejected": -58856056.0, + "logps/chosen": -378.42742919921875, + "logps/rejected": -782.7501220703125, + "loss": 0.0972, + "rewards/chosen": 6.66088342666626, + "rewards/margins": 26.88251543045044, + "rewards/rejected": -20.22163200378418, + "step": 3229 + }, + { + "epoch": 0.8082071812836231, + "grad_norm": 2.28125, + "kl": 0.2792040705680847, + "learning_rate": 5e-06, + "logits/chosen": -82100403.2, + "logits/rejected": -54392981.333333336, + "logps/chosen": -467.4521484375, + "logps/rejected": -704.4173177083334, + "loss": 0.0034, + "rewards/chosen": 9.284032185872396, + "rewards/margins": 30.229310777452255, + "rewards/rejected": -20.94527859157986, + "step": 3230 + }, + { + "epoch": 0.8084574002251971, + "grad_norm": 3.59375, + "kl": 10.97339153289795, + "learning_rate": 5e-06, + "logits/chosen": -56906090.666666664, + "logits/rejected": -84931813.33333333, + "logps/chosen": -405.3321126302083, + "logps/rejected": -620.7721354166666, + "loss": 0.0065, + "rewards/chosen": 10.468449910481771, + "rewards/margins": 29.229356129964195, + "rewards/rejected": -18.760906219482422, + "step": 3231 + }, + { + "epoch": 0.8087076191667709, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36412997.81818182, + "logits/rejected": -49609107.692307696, + "logps/chosen": -453.7958984375, + "logps/rejected": -657.4012169471154, + "loss": 0.0081, + "rewards/chosen": 10.475078235973012, + "rewards/margins": 33.40958047079873, + "rewards/rejected": -22.93450223482572, + "step": 3232 + }, + { + "epoch": 0.8089578381083448, + "grad_norm": 3.6875, + "kl": 5.184110164642334, + "learning_rate": 5e-06, + "logits/chosen": -51557554.28571428, + "logits/rejected": -25176941.17647059, + "logps/chosen": -412.2357700892857, + "logps/rejected": -462.0173770680147, + "loss": 0.0087, + "rewards/chosen": 9.480167933872767, + "rewards/margins": 23.669512452197676, + "rewards/rejected": -14.189344518324909, + "step": 3233 + }, + { + "epoch": 0.8092080570499187, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68717392.0, + "logits/rejected": -60180245.333333336, + "logps/chosen": -293.19236246744794, + "logps/rejected": -691.4656575520834, + "loss": 0.0318, + "rewards/chosen": 7.548010508219401, + "rewards/margins": 28.72453816731771, + "rewards/rejected": -21.17652765909831, + "step": 3234 + }, + { + "epoch": 0.8094582759914926, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47510276.571428575, + "logits/rejected": -36143558.4, + "logps/chosen": -468.8828822544643, + "logps/rejected": -526.086767578125, + "loss": 0.0164, + "rewards/chosen": 9.206066676548549, + "rewards/margins": 29.325936671665737, + "rewards/rejected": -20.119869995117188, + "step": 3235 + }, + { + "epoch": 0.8097084949330664, + "grad_norm": 6.5, + "kl": 1.6692924499511719, + "learning_rate": 5e-06, + "logits/chosen": -66954352.0, + "logits/rejected": -66944880.0, + "logps/chosen": -544.189453125, + "logps/rejected": -838.9501953125, + "loss": 0.0107, + "rewards/chosen": 12.539039611816406, + "rewards/margins": 32.93532943725586, + "rewards/rejected": -20.396289825439453, + "step": 3236 + }, + { + "epoch": 0.8099587138746404, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47749657.6, + "logits/rejected": -31216786.285714287, + "logps/chosen": -496.055712890625, + "logps/rejected": -565.6720842633929, + "loss": 0.0733, + "rewards/chosen": 9.69854736328125, + "rewards/margins": 25.811283656529017, + "rewards/rejected": -16.112736293247767, + "step": 3237 + }, + { + "epoch": 0.8102089328162142, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28576819.2, + "logits/rejected": -48359725.71428572, + "logps/chosen": -522.9619140625, + "logps/rejected": -560.15869140625, + "loss": 0.0255, + "rewards/chosen": 10.574693298339843, + "rewards/margins": 25.84295697893415, + "rewards/rejected": -15.268263680594307, + "step": 3238 + }, + { + "epoch": 0.810459151757788, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39782400.0, + "logits/rejected": -38877149.86666667, + "logps/chosen": -459.24110243055554, + "logps/rejected": -706.6247395833333, + "loss": 0.0672, + "rewards/chosen": 7.131050957573785, + "rewards/margins": 29.361438327365452, + "rewards/rejected": -22.230387369791668, + "step": 3239 + }, + { + "epoch": 0.8107093706993619, + "grad_norm": 22.375, + "kl": 4.354381561279297, + "learning_rate": 5e-06, + "logits/chosen": -39876340.0, + "logits/rejected": -42674240.0, + "logps/chosen": -346.433837890625, + "logps/rejected": -809.4393310546875, + "loss": 0.1068, + "rewards/chosen": 6.876967906951904, + "rewards/margins": 28.745389461517334, + "rewards/rejected": -21.86842155456543, + "step": 3240 + }, + { + "epoch": 0.8109595896409358, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50067578.666666664, + "logits/rejected": -29203608.0, + "logps/chosen": -438.6898600260417, + "logps/rejected": -556.4170735677084, + "loss": 0.0178, + "rewards/chosen": 10.146671295166016, + "rewards/margins": 25.847323099772133, + "rewards/rejected": -15.70065180460612, + "step": 3241 + }, + { + "epoch": 0.8112098085825097, + "grad_norm": 3.828125, + "kl": 1.0155551433563232, + "learning_rate": 5e-06, + "logits/chosen": -26810105.6, + "logits/rejected": -69103405.71428572, + "logps/chosen": -350.107958984375, + "logps/rejected": -680.0505719866071, + "loss": 0.0112, + "rewards/chosen": 6.955030822753907, + "rewards/margins": 29.35159367152623, + "rewards/rejected": -22.396562848772323, + "step": 3242 + }, + { + "epoch": 0.8114600275240835, + "grad_norm": 18.125, + "kl": 3.195341110229492, + "learning_rate": 5e-06, + "logits/chosen": -36207146.666666664, + "logits/rejected": -8706114.666666666, + "logps/chosen": -350.7351888020833, + "logps/rejected": -613.4280598958334, + "loss": 0.102, + "rewards/chosen": 7.70967165629069, + "rewards/margins": 24.492033004760742, + "rewards/rejected": -16.78236134847005, + "step": 3243 + }, + { + "epoch": 0.8117102464656575, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68851750.4, + "logits/rejected": -48645083.428571425, + "logps/chosen": -385.51201171875, + "logps/rejected": -648.0101143973214, + "loss": 0.0138, + "rewards/chosen": 8.814155578613281, + "rewards/margins": 26.67057364327567, + "rewards/rejected": -17.85641806466239, + "step": 3244 + }, + { + "epoch": 0.8119604654072313, + "grad_norm": 10.6875, + "kl": 2.8299102783203125, + "learning_rate": 5e-06, + "logits/chosen": 2463671.272727273, + "logits/rejected": -59553073.23076923, + "logps/chosen": -449.62819602272725, + "logps/rejected": -575.9292367788462, + "loss": 0.0135, + "rewards/chosen": 8.949037725275213, + "rewards/margins": 25.196340440870166, + "rewards/rejected": -16.24730271559495, + "step": 3245 + }, + { + "epoch": 0.8122106843488052, + "grad_norm": 2.15625, + "kl": 2.45910906791687, + "learning_rate": 5e-06, + "logits/chosen": -55036659.2, + "logits/rejected": -44434855.11111111, + "logps/chosen": -408.84723307291665, + "logps/rejected": -646.9089626736111, + "loss": 0.0121, + "rewards/chosen": 9.530467732747395, + "rewards/margins": 30.797013346354163, + "rewards/rejected": -21.26654561360677, + "step": 3246 + }, + { + "epoch": 0.8124609032903791, + "grad_norm": 4.46875, + "kl": 20.378314971923828, + "learning_rate": 5e-06, + "logits/chosen": -44641394.28571428, + "logits/rejected": -48323488.0, + "logps/chosen": -497.87489536830356, + "logps/rejected": -546.97685546875, + "loss": 0.0496, + "rewards/chosen": 10.162787301199776, + "rewards/margins": 26.279068429129467, + "rewards/rejected": -16.11628112792969, + "step": 3247 + }, + { + "epoch": 0.812711122231953, + "grad_norm": 6.0, + "kl": 4.379368782043457, + "learning_rate": 5e-06, + "logits/chosen": -47339720.72727273, + "logits/rejected": -48503212.307692304, + "logps/chosen": -505.10635653409093, + "logps/rejected": -551.5313251201923, + "loss": 0.036, + "rewards/chosen": 10.930896412242543, + "rewards/margins": 23.613938498330285, + "rewards/rejected": -12.68304208608774, + "step": 3248 + }, + { + "epoch": 0.8129613411735268, + "grad_norm": 7.21875, + "kl": 10.234516143798828, + "learning_rate": 5e-06, + "logits/chosen": -39130976.0, + "logits/rejected": -47825973.333333336, + "logps/chosen": -356.4173990885417, + "logps/rejected": -462.5713704427083, + "loss": 0.0138, + "rewards/chosen": 9.284433364868164, + "rewards/margins": 23.537143071492515, + "rewards/rejected": -14.25270970662435, + "step": 3249 + }, + { + "epoch": 0.8132115601151008, + "grad_norm": 1.09375, + "kl": 1.483258605003357, + "learning_rate": 5e-06, + "logits/chosen": -30850737.777777776, + "logits/rejected": -45101401.6, + "logps/chosen": -389.65654839409723, + "logps/rejected": -492.66009114583335, + "loss": 0.0119, + "rewards/chosen": 9.28047349717882, + "rewards/margins": 27.335340033637152, + "rewards/rejected": -18.054866536458334, + "step": 3250 + }, + { + "epoch": 0.8134617790566746, + "grad_norm": 11.0625, + "kl": 4.855903148651123, + "learning_rate": 5e-06, + "logits/chosen": -47246967.46666667, + "logits/rejected": -46602801.777777776, + "logps/chosen": -375.49485677083334, + "logps/rejected": -601.0052083333334, + "loss": 0.0163, + "rewards/chosen": 8.45121815999349, + "rewards/margins": 25.867638990614147, + "rewards/rejected": -17.41642083062066, + "step": 3251 + }, + { + "epoch": 0.8137119979982484, + "grad_norm": 1.1484375, + "kl": 1.8343722820281982, + "learning_rate": 5e-06, + "logits/chosen": -35914066.666666664, + "logits/rejected": -45876213.333333336, + "logps/chosen": -307.6388753255208, + "logps/rejected": -483.3712158203125, + "loss": 0.0328, + "rewards/chosen": 7.840925216674805, + "rewards/margins": 22.493195215861, + "rewards/rejected": -14.652269999186197, + "step": 3252 + }, + { + "epoch": 0.8139622169398223, + "grad_norm": 2.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61106872.88888889, + "logits/rejected": -53231906.13333333, + "logps/chosen": -486.6623806423611, + "logps/rejected": -607.91875, + "loss": 0.0422, + "rewards/chosen": 11.241613599989149, + "rewards/margins": 32.34177432590061, + "rewards/rejected": -21.100160725911458, + "step": 3253 + }, + { + "epoch": 0.8142124358813962, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29107756.8, + "logits/rejected": -27536690.285714287, + "logps/chosen": -404.698046875, + "logps/rejected": -659.7548828125, + "loss": 0.0172, + "rewards/chosen": 9.935633087158203, + "rewards/margins": 30.259422411237445, + "rewards/rejected": -20.32378932407924, + "step": 3254 + }, + { + "epoch": 0.8144626548229701, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32092966.0, + "logits/rejected": -60488304.0, + "logps/chosen": -271.8189697265625, + "logps/rejected": -721.5950927734375, + "loss": 0.036, + "rewards/chosen": 7.4692535400390625, + "rewards/margins": 24.52656364440918, + "rewards/rejected": -17.057310104370117, + "step": 3255 + }, + { + "epoch": 0.8147128737645439, + "grad_norm": 1.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39123392.0, + "logits/rejected": -72306713.6, + "logps/chosen": -341.19130161830356, + "logps/rejected": -825.44111328125, + "loss": 0.0024, + "rewards/chosen": 9.079664502825056, + "rewards/margins": 33.07671345302037, + "rewards/rejected": -23.997048950195314, + "step": 3256 + }, + { + "epoch": 0.8149630927061179, + "grad_norm": 5.625, + "kl": 0.46414631605148315, + "learning_rate": 5e-06, + "logits/chosen": -32562400.0, + "logits/rejected": -11155862.545454545, + "logps/chosen": -305.12710336538464, + "logps/rejected": -573.8989701704545, + "loss": 0.0916, + "rewards/chosen": 7.564332815317007, + "rewards/margins": 22.24373039832482, + "rewards/rejected": -14.679397583007812, + "step": 3257 + }, + { + "epoch": 0.8152133116476917, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70715847.1111111, + "logits/rejected": -49782912.0, + "logps/chosen": -277.22422960069446, + "logps/rejected": -589.951171875, + "loss": 0.0343, + "rewards/chosen": 7.242077297634548, + "rewards/margins": 25.8834469265408, + "rewards/rejected": -18.64136962890625, + "step": 3258 + }, + { + "epoch": 0.8154635305892656, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32437645.333333332, + "logits/rejected": -38410805.333333336, + "logps/chosen": -294.24135335286456, + "logps/rejected": -404.7406819661458, + "loss": 0.0316, + "rewards/chosen": 8.5135129292806, + "rewards/margins": 19.373694101969402, + "rewards/rejected": -10.860181172688803, + "step": 3259 + }, + { + "epoch": 0.8157137495308395, + "grad_norm": 2.296875, + "kl": 7.911231517791748, + "learning_rate": 5e-06, + "logits/chosen": -29033680.0, + "logits/rejected": -51246264.0, + "logps/chosen": -346.0654296875, + "logps/rejected": -705.3738403320312, + "loss": 0.0116, + "rewards/chosen": 8.201961517333984, + "rewards/margins": 26.028879165649414, + "rewards/rejected": -17.82691764831543, + "step": 3260 + }, + { + "epoch": 0.8159639684724134, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37950232.88888889, + "logits/rejected": -31082359.466666665, + "logps/chosen": -302.5338541666667, + "logps/rejected": -454.92652994791666, + "loss": 0.0225, + "rewards/chosen": 8.03598361545139, + "rewards/margins": 22.091924370659722, + "rewards/rejected": -14.055940755208333, + "step": 3261 + }, + { + "epoch": 0.8162141874139872, + "grad_norm": 8.1875, + "kl": 5.003114223480225, + "learning_rate": 5e-06, + "logits/chosen": -43110766.54545455, + "logits/rejected": -38709777.23076923, + "logps/chosen": -385.88778409090907, + "logps/rejected": -631.2416616586538, + "loss": 0.0514, + "rewards/chosen": 9.170000943270596, + "rewards/margins": 26.297740482783816, + "rewards/rejected": -17.12773953951322, + "step": 3262 + }, + { + "epoch": 0.8164644063555612, + "grad_norm": 0.91796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30791382.85714286, + "logits/rejected": -36504538.35294118, + "logps/chosen": -368.93446568080356, + "logps/rejected": -720.671875, + "loss": 0.0024, + "rewards/chosen": 9.268190656389509, + "rewards/margins": 32.190150028517266, + "rewards/rejected": -22.92195937212776, + "step": 3263 + }, + { + "epoch": 0.816714625297135, + "grad_norm": 5.65625, + "kl": 1.8144557476043701, + "learning_rate": 5e-06, + "logits/chosen": -52122326.85714286, + "logits/rejected": -49306038.4, + "logps/chosen": -364.64334542410717, + "logps/rejected": -656.7091796875, + "loss": 0.0436, + "rewards/chosen": 8.470391954694476, + "rewards/margins": 24.853457750592916, + "rewards/rejected": -16.38306579589844, + "step": 3264 + }, + { + "epoch": 0.8169648442387089, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40355571.692307696, + "logits/rejected": -43003421.09090909, + "logps/chosen": -393.0487530048077, + "logps/rejected": -616.2797407670455, + "loss": 0.0182, + "rewards/chosen": 9.44766822228065, + "rewards/margins": 26.41180313217056, + "rewards/rejected": -16.964134909889914, + "step": 3265 + }, + { + "epoch": 0.8172150631802827, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57840475.428571425, + "logits/rejected": -46496044.8, + "logps/chosen": -360.45438058035717, + "logps/rejected": -669.81591796875, + "loss": 0.0499, + "rewards/chosen": 7.632750374930246, + "rewards/margins": 25.15670667375837, + "rewards/rejected": -17.523956298828125, + "step": 3266 + }, + { + "epoch": 0.8174652821218567, + "grad_norm": 3.125, + "kl": 10.741095542907715, + "learning_rate": 5e-06, + "logits/chosen": -46431884.8, + "logits/rejected": -40464809.14285714, + "logps/chosen": -338.8407958984375, + "logps/rejected": -607.5833565848214, + "loss": 0.0721, + "rewards/chosen": 6.633848571777344, + "rewards/margins": 22.60595877511161, + "rewards/rejected": -15.972110203334264, + "step": 3267 + }, + { + "epoch": 0.8177155010634305, + "grad_norm": 1.9453125, + "kl": 2.784249782562256, + "learning_rate": 5e-06, + "logits/chosen": -24806392.0, + "logits/rejected": -47479482.666666664, + "logps/chosen": -222.7137451171875, + "logps/rejected": -727.4908854166666, + "loss": 0.0389, + "rewards/chosen": 7.228570938110352, + "rewards/margins": 24.578962326049805, + "rewards/rejected": -17.350391387939453, + "step": 3268 + }, + { + "epoch": 0.8179657200050043, + "grad_norm": 15.4375, + "kl": 29.407833099365234, + "learning_rate": 5e-06, + "logits/chosen": -63059456.0, + "logits/rejected": -56472234.666666664, + "logps/chosen": -436.94867621527777, + "logps/rejected": -604.643798828125, + "loss": 0.1904, + "rewards/chosen": 9.55215115017361, + "rewards/margins": 27.006140814887154, + "rewards/rejected": -17.453989664713543, + "step": 3269 + }, + { + "epoch": 0.8182159389465783, + "grad_norm": 6.90625, + "kl": 8.866788864135742, + "learning_rate": 5e-06, + "logits/chosen": -34011643.733333334, + "logits/rejected": -38723395.55555555, + "logps/chosen": -360.43776041666666, + "logps/rejected": -523.2466362847222, + "loss": 0.0365, + "rewards/chosen": 8.127863566080729, + "rewards/margins": 21.33568149142795, + "rewards/rejected": -13.207817925347221, + "step": 3270 + }, + { + "epoch": 0.8184661578881521, + "grad_norm": 8.4375, + "kl": 10.106219291687012, + "learning_rate": 5e-06, + "logits/chosen": -27812226.285714287, + "logits/rejected": -75996083.2, + "logps/chosen": -356.5025111607143, + "logps/rejected": -681.22666015625, + "loss": 0.0337, + "rewards/chosen": 8.201512472970146, + "rewards/margins": 29.149597494942803, + "rewards/rejected": -20.948085021972656, + "step": 3271 + }, + { + "epoch": 0.818716376829726, + "grad_norm": 7.375, + "kl": 9.702817916870117, + "learning_rate": 5e-06, + "logits/chosen": -10994260.57142857, + "logits/rejected": -51708723.2, + "logps/chosen": -410.0978306361607, + "logps/rejected": -899.335546875, + "loss": 0.054, + "rewards/chosen": 9.463233947753906, + "rewards/margins": 26.436311340332033, + "rewards/rejected": -16.973077392578126, + "step": 3272 + }, + { + "epoch": 0.8189665957712999, + "grad_norm": 10.5625, + "kl": 5.2907633781433105, + "learning_rate": 5e-06, + "logits/chosen": -16715988.363636363, + "logits/rejected": -60775227.07692308, + "logps/chosen": -357.24092240767044, + "logps/rejected": -624.3419846754807, + "loss": 0.0155, + "rewards/chosen": 8.487605701793324, + "rewards/margins": 27.185264534049935, + "rewards/rejected": -18.69765883225661, + "step": 3273 + }, + { + "epoch": 0.8192168147128738, + "grad_norm": 3.03125, + "kl": 6.142941951751709, + "learning_rate": 5e-06, + "logits/chosen": -36122069.333333336, + "logits/rejected": -35315765.333333336, + "logps/chosen": -356.3988037109375, + "logps/rejected": -631.3761393229166, + "loss": 0.0473, + "rewards/chosen": 9.049164454142252, + "rewards/margins": 24.87175178527832, + "rewards/rejected": -15.822587331136068, + "step": 3274 + }, + { + "epoch": 0.8194670336544476, + "grad_norm": 7.59375, + "kl": 9.817859649658203, + "learning_rate": 5e-06, + "logits/chosen": -32533915.733333334, + "logits/rejected": -47578282.666666664, + "logps/chosen": -504.8352864583333, + "logps/rejected": -815.2511393229166, + "loss": 0.0196, + "rewards/chosen": 9.454236857096355, + "rewards/margins": 27.86946072048611, + "rewards/rejected": -18.415223863389755, + "step": 3275 + }, + { + "epoch": 0.8197172525960216, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24456814.0, + "logits/rejected": -62693396.0, + "logps/chosen": -240.20492553710938, + "logps/rejected": -638.2871704101562, + "loss": 0.0152, + "rewards/chosen": 6.531996726989746, + "rewards/margins": 21.37136173248291, + "rewards/rejected": -14.839365005493164, + "step": 3276 + }, + { + "epoch": 0.8199674715375954, + "grad_norm": 6.34375, + "kl": 15.615396499633789, + "learning_rate": 5e-06, + "logits/chosen": -49494661.333333336, + "logits/rejected": -47888373.333333336, + "logps/chosen": -528.053955078125, + "logps/rejected": -580.2891438802084, + "loss": 0.0236, + "rewards/chosen": 10.40066655476888, + "rewards/margins": 24.22950871785482, + "rewards/rejected": -13.828842163085938, + "step": 3277 + }, + { + "epoch": 0.8202176904791693, + "grad_norm": 2.90625, + "kl": 7.928256511688232, + "learning_rate": 5e-06, + "logits/chosen": -38740041.84615385, + "logits/rejected": -52866065.45454545, + "logps/chosen": -381.10501802884613, + "logps/rejected": -514.5059925426136, + "loss": 0.0588, + "rewards/chosen": 10.038317166841948, + "rewards/margins": 23.389040433443512, + "rewards/rejected": -13.350723266601562, + "step": 3278 + }, + { + "epoch": 0.8204679094207431, + "grad_norm": 3.0, + "kl": 17.93801498413086, + "learning_rate": 5e-06, + "logits/chosen": -42899246.93333333, + "logits/rejected": -60386901.333333336, + "logps/chosen": -420.3256510416667, + "logps/rejected": -681.9513888888889, + "loss": 0.0409, + "rewards/chosen": 11.525728352864583, + "rewards/margins": 28.91897447374132, + "rewards/rejected": -17.393246120876736, + "step": 3279 + }, + { + "epoch": 0.8207181283623171, + "grad_norm": 5.0, + "kl": 15.330894470214844, + "learning_rate": 5e-06, + "logits/chosen": -28183748.923076924, + "logits/rejected": -62560034.90909091, + "logps/chosen": -404.55464993990387, + "logps/rejected": -541.0329367897727, + "loss": 0.0603, + "rewards/chosen": 10.17655005821815, + "rewards/margins": 20.944676112461757, + "rewards/rejected": -10.768126054243607, + "step": 3280 + }, + { + "epoch": 0.8209683473038909, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33816252.8, + "logits/rejected": -28825092.57142857, + "logps/chosen": -371.99404296875, + "logps/rejected": -441.69032505580356, + "loss": 0.0787, + "rewards/chosen": 8.090084838867188, + "rewards/margins": 18.765856279645647, + "rewards/rejected": -10.67577144077846, + "step": 3281 + }, + { + "epoch": 0.8212185662454647, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39737920.0, + "logits/rejected": -31501730.133333333, + "logps/chosen": -429.51475694444446, + "logps/rejected": -684.5255859375, + "loss": 0.0285, + "rewards/chosen": 11.348898145887587, + "rewards/margins": 28.945937940809465, + "rewards/rejected": -17.597039794921876, + "step": 3282 + }, + { + "epoch": 0.8214687851870387, + "grad_norm": 2.46875, + "kl": 8.799793243408203, + "learning_rate": 5e-06, + "logits/chosen": -38584037.81818182, + "logits/rejected": -32298087.384615384, + "logps/chosen": -397.57177734375, + "logps/rejected": -482.0186298076923, + "loss": 0.0452, + "rewards/chosen": 10.044185985218395, + "rewards/margins": 20.596177374566352, + "rewards/rejected": -10.551991389347958, + "step": 3283 + }, + { + "epoch": 0.8217190041286125, + "grad_norm": 17.0, + "kl": 1.331412672996521, + "learning_rate": 5e-06, + "logits/chosen": -70933166.54545455, + "logits/rejected": -29697784.615384616, + "logps/chosen": -362.0716441761364, + "logps/rejected": -517.3513371394231, + "loss": 0.0795, + "rewards/chosen": 7.550413651899858, + "rewards/margins": 23.08391491016308, + "rewards/rejected": -15.533501258263222, + "step": 3284 + }, + { + "epoch": 0.8219692230701864, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29337613.714285713, + "logits/rejected": 10111699.2, + "logps/chosen": -303.90938895089283, + "logps/rejected": -407.0588134765625, + "loss": 0.0326, + "rewards/chosen": 7.239796229771206, + "rewards/margins": 21.7516355242048, + "rewards/rejected": -14.511839294433594, + "step": 3285 + }, + { + "epoch": 0.8222194420117603, + "grad_norm": 0.1455078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43280188.0, + "logits/rejected": -75093040.0, + "logps/chosen": -348.45892333984375, + "logps/rejected": -879.1758422851562, + "loss": 0.0004, + "rewards/chosen": 8.547391891479492, + "rewards/margins": 35.47361946105957, + "rewards/rejected": -26.926227569580078, + "step": 3286 + }, + { + "epoch": 0.8224696609533342, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34922350.76923077, + "logits/rejected": -64050624.0, + "logps/chosen": -325.1035907451923, + "logps/rejected": -515.2407670454545, + "loss": 0.0304, + "rewards/chosen": 9.146081190842848, + "rewards/margins": 25.368271354195123, + "rewards/rejected": -16.222190163352273, + "step": 3287 + }, + { + "epoch": 0.822719879894908, + "grad_norm": 13.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53542378.666666664, + "logits/rejected": -35869512.53333333, + "logps/chosen": -330.1077473958333, + "logps/rejected": -633.656640625, + "loss": 0.0291, + "rewards/chosen": 9.096694946289062, + "rewards/margins": 25.020217895507812, + "rewards/rejected": -15.92352294921875, + "step": 3288 + }, + { + "epoch": 0.8229700988364819, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64330201.6, + "logits/rejected": -55948329.14285714, + "logps/chosen": -296.5574951171875, + "logps/rejected": -714.0093470982143, + "loss": 0.0255, + "rewards/chosen": 7.901792907714844, + "rewards/margins": 29.353411647251676, + "rewards/rejected": -21.45161873953683, + "step": 3289 + }, + { + "epoch": 0.8232203177780558, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37806274.90909091, + "logits/rejected": -14778756.923076924, + "logps/chosen": -241.27183948863637, + "logps/rejected": -735.8997896634615, + "loss": 0.0723, + "rewards/chosen": 7.361351013183594, + "rewards/margins": 26.64551778940054, + "rewards/rejected": -19.284166776216946, + "step": 3290 + }, + { + "epoch": 0.8234705367196297, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31221064.533333335, + "logits/rejected": -41358069.333333336, + "logps/chosen": -357.7890950520833, + "logps/rejected": -595.1922200520834, + "loss": 0.0351, + "rewards/chosen": 8.875649007161458, + "rewards/margins": 27.893011813693576, + "rewards/rejected": -19.01736280653212, + "step": 3291 + }, + { + "epoch": 0.8237207556612035, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43049591.46666667, + "logits/rejected": -34886833.777777776, + "logps/chosen": -340.0924479166667, + "logps/rejected": -665.4007161458334, + "loss": 0.0346, + "rewards/chosen": 8.422756958007813, + "rewards/margins": 25.453506808810765, + "rewards/rejected": -17.030749850802952, + "step": 3292 + }, + { + "epoch": 0.8239709746027775, + "grad_norm": 8.25, + "kl": 2.592266082763672, + "learning_rate": 5e-06, + "logits/chosen": -44962308.266666666, + "logits/rejected": 31386215.111111112, + "logps/chosen": -428.7900390625, + "logps/rejected": -777.6287977430555, + "loss": 0.0242, + "rewards/chosen": 9.709370930989584, + "rewards/margins": 31.827592637803818, + "rewards/rejected": -22.118221706814236, + "step": 3293 + }, + { + "epoch": 0.8242211935443513, + "grad_norm": 12.75, + "kl": 2.3202362060546875, + "learning_rate": 5e-06, + "logits/chosen": 11429590.857142856, + "logits/rejected": -56721280.0, + "logps/chosen": -496.2706821986607, + "logps/rejected": -655.50537109375, + "loss": 0.0321, + "rewards/chosen": 9.576468331473214, + "rewards/margins": 31.49947248186384, + "rewards/rejected": -21.923004150390625, + "step": 3294 + }, + { + "epoch": 0.8244714124859251, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28976835.2, + "logits/rejected": -50273275.428571425, + "logps/chosen": -441.313427734375, + "logps/rejected": -646.2372349330357, + "loss": 0.0226, + "rewards/chosen": 8.246953582763672, + "rewards/margins": 28.33501205444336, + "rewards/rejected": -20.088058471679688, + "step": 3295 + }, + { + "epoch": 0.8247216314274991, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50227406.76923077, + "logits/rejected": -44265649.45454545, + "logps/chosen": -351.11910306490387, + "logps/rejected": -732.3539595170455, + "loss": 0.1026, + "rewards/chosen": 6.3480072021484375, + "rewards/margins": 28.694734053178266, + "rewards/rejected": -22.34672685102983, + "step": 3296 + }, + { + "epoch": 0.824971850369073, + "grad_norm": 10.625, + "kl": 1.0299034118652344, + "learning_rate": 5e-06, + "logits/chosen": -57792118.85714286, + "logits/rejected": -45347328.0, + "logps/chosen": -379.41531808035717, + "logps/rejected": -751.85859375, + "loss": 0.0143, + "rewards/chosen": 9.028009687151227, + "rewards/margins": 27.01486576625279, + "rewards/rejected": -17.986856079101564, + "step": 3297 + }, + { + "epoch": 0.8252220693106468, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59544060.0, + "logits/rejected": -50084268.8, + "logps/chosen": -387.0478515625, + "logps/rejected": -575.128271484375, + "loss": 0.0021, + "rewards/chosen": 8.25259780883789, + "rewards/margins": 26.847887420654295, + "rewards/rejected": -18.595289611816405, + "step": 3298 + }, + { + "epoch": 0.8254722882522207, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26314806.4, + "logits/rejected": -52175844.571428575, + "logps/chosen": -310.2552490234375, + "logps/rejected": -691.3191266741071, + "loss": 0.0562, + "rewards/chosen": 7.238521575927734, + "rewards/margins": 25.701719556535995, + "rewards/rejected": -18.46319798060826, + "step": 3299 + }, + { + "epoch": 0.8257225071937946, + "grad_norm": 0.63671875, + "kl": 3.3166141510009766, + "learning_rate": 5e-06, + "logits/chosen": -41219080.0, + "logits/rejected": -53791388.0, + "logps/chosen": -512.0276489257812, + "logps/rejected": -684.2368774414062, + "loss": 0.0149, + "rewards/chosen": 11.076394081115723, + "rewards/margins": 32.61976146697998, + "rewards/rejected": -21.543367385864258, + "step": 3300 + }, + { + "epoch": 0.8259727261353684, + "grad_norm": 1.46875, + "kl": 4.814750671386719, + "learning_rate": 5e-06, + "logits/chosen": -52724790.15384615, + "logits/rejected": -34189489.45454545, + "logps/chosen": -457.7012469951923, + "logps/rejected": -797.9524147727273, + "loss": 0.0025, + "rewards/chosen": 10.689822857196514, + "rewards/margins": 38.36709946852464, + "rewards/rejected": -27.677276611328125, + "step": 3301 + }, + { + "epoch": 0.8262229450769423, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53446202.18181818, + "logits/rejected": -63393516.307692304, + "logps/chosen": -308.81716086647725, + "logps/rejected": -732.3592998798077, + "loss": 0.0196, + "rewards/chosen": 8.789416920055043, + "rewards/margins": 35.041717475944466, + "rewards/rejected": -26.252300555889423, + "step": 3302 + }, + { + "epoch": 0.8264731640185162, + "grad_norm": 0.65234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34691204.92307692, + "logits/rejected": -34995328.0, + "logps/chosen": -400.47408353365387, + "logps/rejected": -773.9894353693181, + "loss": 0.0076, + "rewards/chosen": 9.53081805889423, + "rewards/margins": 32.0947357391144, + "rewards/rejected": -22.56391768022017, + "step": 3303 + }, + { + "epoch": 0.8267233829600901, + "grad_norm": 7.46875, + "kl": 1.9938793182373047, + "learning_rate": 5e-06, + "logits/chosen": -46698871.46666667, + "logits/rejected": -35383630.222222224, + "logps/chosen": -388.05442708333334, + "logps/rejected": -416.1911349826389, + "loss": 0.0413, + "rewards/chosen": 9.964243570963541, + "rewards/margins": 23.497785780164932, + "rewards/rejected": -13.53354220920139, + "step": 3304 + }, + { + "epoch": 0.8269736019016639, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42376290.461538464, + "logits/rejected": -54592482.90909091, + "logps/chosen": -307.40576171875, + "logps/rejected": -571.3518732244319, + "loss": 0.0621, + "rewards/chosen": 7.484063955453726, + "rewards/margins": 26.982453459626313, + "rewards/rejected": -19.498389504172586, + "step": 3305 + }, + { + "epoch": 0.8272238208432379, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27703528.727272727, + "logits/rejected": -51822247.384615384, + "logps/chosen": -380.71830610795456, + "logps/rejected": -585.3508864182693, + "loss": 0.005, + "rewards/chosen": 7.5115273215553975, + "rewards/margins": 25.16945696210528, + "rewards/rejected": -17.65792964054988, + "step": 3306 + }, + { + "epoch": 0.8274740397848117, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22414901.333333332, + "logits/rejected": -39540861.333333336, + "logps/chosen": -434.964599609375, + "logps/rejected": -518.8721923828125, + "loss": 0.0132, + "rewards/chosen": 9.05865224202474, + "rewards/margins": 22.37799835205078, + "rewards/rejected": -13.319346110026041, + "step": 3307 + }, + { + "epoch": 0.8277242587263856, + "grad_norm": 23.0, + "kl": 36.345985412597656, + "learning_rate": 5e-06, + "logits/chosen": -79350621.86666666, + "logits/rejected": -21845425.777777776, + "logps/chosen": -441.016796875, + "logps/rejected": -597.7423502604166, + "loss": 0.14, + "rewards/chosen": 9.299156697591146, + "rewards/margins": 24.28133273654514, + "rewards/rejected": -14.982176038953993, + "step": 3308 + }, + { + "epoch": 0.8279744776679595, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34273088.0, + "logits/rejected": -41227766.85714286, + "logps/chosen": -478.27998046875, + "logps/rejected": -744.6792689732143, + "loss": 0.0096, + "rewards/chosen": 9.446966552734375, + "rewards/margins": 31.937083217075894, + "rewards/rejected": -22.490116664341517, + "step": 3309 + }, + { + "epoch": 0.8282246966095334, + "grad_norm": 1.4375, + "kl": 1.5874879360198975, + "learning_rate": 5e-06, + "logits/chosen": -46285351.384615384, + "logits/rejected": -42127645.09090909, + "logps/chosen": -352.20714393028845, + "logps/rejected": -598.8568892045455, + "loss": 0.0289, + "rewards/chosen": 7.323147113506611, + "rewards/margins": 25.105370901681326, + "rewards/rejected": -17.782223788174715, + "step": 3310 + }, + { + "epoch": 0.8284749155511072, + "grad_norm": 5.59375, + "kl": 3.4230384826660156, + "learning_rate": 5e-06, + "logits/chosen": -36454779.07692308, + "logits/rejected": -29546030.545454547, + "logps/chosen": -310.10146859975964, + "logps/rejected": -498.4625799005682, + "loss": 0.0701, + "rewards/chosen": 8.706908005934496, + "rewards/margins": 23.89776632669089, + "rewards/rejected": -15.190858320756393, + "step": 3311 + }, + { + "epoch": 0.8287251344926811, + "grad_norm": 5.875, + "kl": 10.114866256713867, + "learning_rate": 5e-06, + "logits/chosen": -43295266.13333333, + "logits/rejected": -69831509.33333333, + "logps/chosen": -410.04254557291665, + "logps/rejected": -695.9557291666666, + "loss": 0.0157, + "rewards/chosen": 9.840065511067708, + "rewards/margins": 24.528421698676215, + "rewards/rejected": -14.688356187608507, + "step": 3312 + }, + { + "epoch": 0.828975353434255, + "grad_norm": 8.3125, + "kl": 2.105194091796875, + "learning_rate": 5e-06, + "logits/chosen": -23932745.14285714, + "logits/rejected": -52005756.8, + "logps/chosen": -324.17822265625, + "logps/rejected": -902.579296875, + "loss": 0.0331, + "rewards/chosen": 8.256120954241071, + "rewards/margins": 35.128264508928574, + "rewards/rejected": -26.8721435546875, + "step": 3313 + }, + { + "epoch": 0.8292255723758288, + "grad_norm": 2.265625, + "kl": 15.011377334594727, + "learning_rate": 5e-06, + "logits/chosen": -69736925.86666666, + "logits/rejected": -41390293.333333336, + "logps/chosen": -452.0376302083333, + "logps/rejected": -798.68408203125, + "loss": 0.0918, + "rewards/chosen": 10.94212137858073, + "rewards/margins": 34.27952033148871, + "rewards/rejected": -23.337398952907986, + "step": 3314 + }, + { + "epoch": 0.8294757913174027, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24229200.0, + "logits/rejected": -39475280.0, + "logps/chosen": -267.7308756510417, + "logps/rejected": -497.00390625, + "loss": 0.0477, + "rewards/chosen": 5.999849955240886, + "rewards/margins": 22.282530466715496, + "rewards/rejected": -16.28268051147461, + "step": 3315 + }, + { + "epoch": 0.8297260102589766, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33809752.0, + "logits/rejected": 30859330.0, + "logps/chosen": -240.4337921142578, + "logps/rejected": -716.928466796875, + "loss": 0.0553, + "rewards/chosen": 6.703101634979248, + "rewards/margins": 23.37804365158081, + "rewards/rejected": -16.674942016601562, + "step": 3316 + }, + { + "epoch": 0.8299762292005505, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43669309.333333336, + "logits/rejected": -34923746.666666664, + "logps/chosen": -374.477294921875, + "logps/rejected": -609.7667643229166, + "loss": 0.0178, + "rewards/chosen": 10.217585881551107, + "rewards/margins": 29.231819788614906, + "rewards/rejected": -19.0142339070638, + "step": 3317 + }, + { + "epoch": 0.8302264481421243, + "grad_norm": 24.375, + "kl": 31.539608001708984, + "learning_rate": 5e-06, + "logits/chosen": -59401009.777777776, + "logits/rejected": -37880693.333333336, + "logps/chosen": -515.4853515625, + "logps/rejected": -567.8069661458334, + "loss": 0.1098, + "rewards/chosen": 11.087861802842882, + "rewards/margins": 32.558865017361114, + "rewards/rejected": -21.47100321451823, + "step": 3318 + }, + { + "epoch": 0.8304766670836983, + "grad_norm": 7.75, + "kl": 1.0870425701141357, + "learning_rate": 5e-06, + "logits/chosen": -64401036.8, + "logits/rejected": -10171996.57142857, + "logps/chosen": -409.0129150390625, + "logps/rejected": -568.3106863839286, + "loss": 0.016, + "rewards/chosen": 10.63565444946289, + "rewards/margins": 27.179124559674943, + "rewards/rejected": -16.543470110212052, + "step": 3319 + }, + { + "epoch": 0.8307268860252721, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43862707.2, + "logits/rejected": -41969664.0, + "logps/chosen": -347.829443359375, + "logps/rejected": -663.58984375, + "loss": 0.0341, + "rewards/chosen": 8.182230377197266, + "rewards/margins": 23.074542563302177, + "rewards/rejected": -14.892312186104911, + "step": 3320 + }, + { + "epoch": 0.830977104966846, + "grad_norm": 7.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57621382.4, + "logits/rejected": -25528660.57142857, + "logps/chosen": -497.491796875, + "logps/rejected": -515.17041015625, + "loss": 0.007, + "rewards/chosen": 10.971384429931641, + "rewards/margins": 25.543521445138115, + "rewards/rejected": -14.572137015206474, + "step": 3321 + }, + { + "epoch": 0.8312273239084199, + "grad_norm": 0.019287109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26781942.153846152, + "logits/rejected": -42990711.27272727, + "logps/chosen": -468.1041917067308, + "logps/rejected": -740.1134588068181, + "loss": 0.0, + "rewards/chosen": 12.621317936823917, + "rewards/margins": 37.125490522051194, + "rewards/rejected": -24.504172585227273, + "step": 3322 + }, + { + "epoch": 0.8314775428499938, + "grad_norm": 20.375, + "kl": 4.926285743713379, + "learning_rate": 5e-06, + "logits/chosen": -40254930.28571428, + "logits/rejected": -41583564.8, + "logps/chosen": -410.936767578125, + "logps/rejected": -550.081103515625, + "loss": 0.049, + "rewards/chosen": 8.16054698399135, + "rewards/margins": 24.419534410749165, + "rewards/rejected": -16.258987426757812, + "step": 3323 + }, + { + "epoch": 0.8317277617915676, + "grad_norm": 6.09375, + "kl": 16.31495475769043, + "learning_rate": 5e-06, + "logits/chosen": -49415808.0, + "logits/rejected": -54033913.6, + "logps/chosen": -516.3753836495536, + "logps/rejected": -743.772509765625, + "loss": 0.055, + "rewards/chosen": 9.727398463657924, + "rewards/margins": 27.271038600376674, + "rewards/rejected": -17.54364013671875, + "step": 3324 + }, + { + "epoch": 0.8319779807331416, + "grad_norm": 3.15625, + "kl": 0.9522311091423035, + "learning_rate": 5e-06, + "logits/chosen": -35586806.85714286, + "logits/rejected": -50276800.0, + "logps/chosen": -338.73228236607144, + "logps/rejected": -652.16865234375, + "loss": 0.0332, + "rewards/chosen": 9.390834263392858, + "rewards/margins": 26.146693638392858, + "rewards/rejected": -16.755859375, + "step": 3325 + }, + { + "epoch": 0.8322281996747154, + "grad_norm": 1.6953125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37868125.09090909, + "logits/rejected": -54182272.0, + "logps/chosen": -447.33274147727275, + "logps/rejected": -841.9423828125, + "loss": 0.0197, + "rewards/chosen": 10.815548983487217, + "rewards/margins": 32.36075255920837, + "rewards/rejected": -21.545203575721153, + "step": 3326 + }, + { + "epoch": 0.8324784186162892, + "grad_norm": 10.1875, + "kl": 13.414175033569336, + "learning_rate": 5e-06, + "logits/chosen": -20029216.0, + "logits/rejected": -37251008.0, + "logps/chosen": -477.2794494628906, + "logps/rejected": -511.9940185546875, + "loss": 0.0664, + "rewards/chosen": 10.704218864440918, + "rewards/margins": 22.965049743652344, + "rewards/rejected": -12.260830879211426, + "step": 3327 + }, + { + "epoch": 0.8327286375578631, + "grad_norm": 1.7421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9723276.307692308, + "logits/rejected": -44815924.36363637, + "logps/chosen": -329.5674579326923, + "logps/rejected": -628.5750177556819, + "loss": 0.0366, + "rewards/chosen": 7.459197998046875, + "rewards/margins": 20.49805519797585, + "rewards/rejected": -13.038857199928977, + "step": 3328 + }, + { + "epoch": 0.832978856499437, + "grad_norm": 2.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48634698.666666664, + "logits/rejected": -43427450.666666664, + "logps/chosen": -320.67331949869794, + "logps/rejected": -709.500244140625, + "loss": 0.0227, + "rewards/chosen": 8.579489390055338, + "rewards/margins": 27.103089650472008, + "rewards/rejected": -18.523600260416668, + "step": 3329 + }, + { + "epoch": 0.8332290754410109, + "grad_norm": 0.66796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47702697.6, + "logits/rejected": -47301728.0, + "logps/chosen": -344.5995849609375, + "logps/rejected": -777.52490234375, + "loss": 0.0077, + "rewards/chosen": 7.826660919189453, + "rewards/margins": 28.068726348876954, + "rewards/rejected": -20.2420654296875, + "step": 3330 + }, + { + "epoch": 0.8334792943825847, + "grad_norm": 10.75, + "kl": 17.64159393310547, + "learning_rate": 5e-06, + "logits/chosen": -31657228.8, + "logits/rejected": -32322094.222222224, + "logps/chosen": -365.816015625, + "logps/rejected": -521.9873046875, + "loss": 0.0778, + "rewards/chosen": 9.595201619466145, + "rewards/margins": 22.148414103190106, + "rewards/rejected": -12.553212483723959, + "step": 3331 + }, + { + "epoch": 0.8337295133241587, + "grad_norm": 5.46875, + "kl": 1.3476537466049194, + "learning_rate": 5e-06, + "logits/chosen": -76895522.9090909, + "logits/rejected": -31535502.769230768, + "logps/chosen": -406.1941583806818, + "logps/rejected": -609.6141826923077, + "loss": 0.0081, + "rewards/chosen": 9.512120333584873, + "rewards/margins": 26.848871244417204, + "rewards/rejected": -17.336750910832333, + "step": 3332 + }, + { + "epoch": 0.8339797322657325, + "grad_norm": 6.625, + "kl": 8.00233268737793, + "learning_rate": 5e-06, + "logits/chosen": -41232122.18181818, + "logits/rejected": -28681048.615384616, + "logps/chosen": -482.28413529829544, + "logps/rejected": -523.8064903846154, + "loss": 0.0118, + "rewards/chosen": 10.490389043634588, + "rewards/margins": 28.5664658446412, + "rewards/rejected": -18.07607680100661, + "step": 3333 + }, + { + "epoch": 0.8342299512073064, + "grad_norm": 8.0, + "kl": 2.8511955738067627, + "learning_rate": 5e-06, + "logits/chosen": -40877555.2, + "logits/rejected": -32739520.0, + "logps/chosen": -513.68232421875, + "logps/rejected": -589.7804129464286, + "loss": 0.0132, + "rewards/chosen": 10.946174621582031, + "rewards/margins": 27.19280787876674, + "rewards/rejected": -16.24663325718471, + "step": 3334 + }, + { + "epoch": 0.8344801701488803, + "grad_norm": 2.046875, + "kl": 3.5403175354003906, + "learning_rate": 5e-06, + "logits/chosen": -53730816.0, + "logits/rejected": -46655581.09090909, + "logps/chosen": -442.96048677884613, + "logps/rejected": -580.0924183238636, + "loss": 0.0038, + "rewards/chosen": 10.939537635216347, + "rewards/margins": 29.880799620301573, + "rewards/rejected": -18.941261985085227, + "step": 3335 + }, + { + "epoch": 0.8347303890904542, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33175725.333333332, + "logits/rejected": -50051552.0, + "logps/chosen": -336.46811930338544, + "logps/rejected": -826.3871256510416, + "loss": 0.0654, + "rewards/chosen": 8.239480336507162, + "rewards/margins": 28.658510843912758, + "rewards/rejected": -20.419030507405598, + "step": 3336 + }, + { + "epoch": 0.834980608032028, + "grad_norm": 1.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14426797.0, + "logits/rejected": -27603104.0, + "logps/chosen": -346.4061279296875, + "logps/rejected": -578.5408935546875, + "loss": 0.0235, + "rewards/chosen": 8.377764701843262, + "rewards/margins": 24.47706890106201, + "rewards/rejected": -16.09930419921875, + "step": 3337 + }, + { + "epoch": 0.8352308269736018, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50160246.85714286, + "logits/rejected": -58618969.6, + "logps/chosen": -464.0360630580357, + "logps/rejected": -649.570458984375, + "loss": 0.0548, + "rewards/chosen": 9.315637860979352, + "rewards/margins": 29.90846165248326, + "rewards/rejected": -20.592823791503907, + "step": 3338 + }, + { + "epoch": 0.8354810459151758, + "grad_norm": 36.75, + "kl": 7.0385589599609375, + "learning_rate": 5e-06, + "logits/chosen": -57311556.571428575, + "logits/rejected": 50120304.0, + "logps/chosen": -441.13473074776783, + "logps/rejected": -584.90263671875, + "loss": 0.0488, + "rewards/chosen": 10.004616873604911, + "rewards/margins": 29.11538805280413, + "rewards/rejected": -19.11077117919922, + "step": 3339 + }, + { + "epoch": 0.8357312648567496, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34005351.11111111, + "logits/rejected": -45461350.4, + "logps/chosen": -309.23627387152777, + "logps/rejected": -551.9491536458333, + "loss": 0.0439, + "rewards/chosen": 7.5518747965494795, + "rewards/margins": 22.887745157877603, + "rewards/rejected": -15.335870361328125, + "step": 3340 + }, + { + "epoch": 0.8359814837983235, + "grad_norm": 3.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25305896.727272727, + "logits/rejected": -42966370.461538464, + "logps/chosen": -373.67196377840907, + "logps/rejected": -696.3227914663462, + "loss": 0.0219, + "rewards/chosen": 7.287936123934659, + "rewards/margins": 31.948669860413027, + "rewards/rejected": -24.660733736478367, + "step": 3341 + }, + { + "epoch": 0.8362317027398974, + "grad_norm": 3.0625, + "kl": 2.4891600608825684, + "learning_rate": 5e-06, + "logits/chosen": -64469094.4, + "logits/rejected": -36810346.666666664, + "logps/chosen": -505.6770833333333, + "logps/rejected": -658.6105143229166, + "loss": 0.0076, + "rewards/chosen": 10.70615946451823, + "rewards/margins": 29.716394721137153, + "rewards/rejected": -19.010235256618923, + "step": 3342 + }, + { + "epoch": 0.8364819216814713, + "grad_norm": 0.48046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45972971.63636363, + "logits/rejected": -66599616.0, + "logps/chosen": -531.6958895596591, + "logps/rejected": -798.5167518028846, + "loss": 0.0009, + "rewards/chosen": 10.915121598677201, + "rewards/margins": 36.45509493100893, + "rewards/rejected": -25.53997333233173, + "step": 3343 + }, + { + "epoch": 0.8367321406230451, + "grad_norm": 19.75, + "kl": 0.16862361133098602, + "learning_rate": 5e-06, + "logits/chosen": -68221984.0, + "logits/rejected": -33289794.666666668, + "logps/chosen": -403.5641682942708, + "logps/rejected": -794.8441569010416, + "loss": 0.0522, + "rewards/chosen": 10.30563227335612, + "rewards/margins": 32.17124048868815, + "rewards/rejected": -21.86560821533203, + "step": 3344 + }, + { + "epoch": 0.8369823595646191, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3602723.5555555555, + "logits/rejected": -29795498.666666668, + "logps/chosen": -257.42442491319446, + "logps/rejected": -762.309765625, + "loss": 0.0267, + "rewards/chosen": 7.404542711046007, + "rewards/margins": 32.21650763617622, + "rewards/rejected": -24.81196492513021, + "step": 3345 + }, + { + "epoch": 0.8372325785061929, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14663041.0, + "logits/rejected": -62652836.0, + "logps/chosen": -424.8878173828125, + "logps/rejected": -506.0917663574219, + "loss": 0.0257, + "rewards/chosen": 8.07442855834961, + "rewards/margins": 26.42715835571289, + "rewards/rejected": -18.35272979736328, + "step": 3346 + }, + { + "epoch": 0.8374827974477668, + "grad_norm": 10.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -101051221.33333333, + "logits/rejected": -22954131.2, + "logps/chosen": -468.30810546875, + "logps/rejected": -543.856640625, + "loss": 0.0904, + "rewards/chosen": 10.01444583468967, + "rewards/margins": 26.01239505343967, + "rewards/rejected": -15.99794921875, + "step": 3347 + }, + { + "epoch": 0.8377330163893407, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59324985.6, + "logits/rejected": -64914322.28571428, + "logps/chosen": -403.358642578125, + "logps/rejected": -753.4827008928571, + "loss": 0.0201, + "rewards/chosen": 8.923326110839843, + "rewards/margins": 34.99824894496373, + "rewards/rejected": -26.074922834123885, + "step": 3348 + }, + { + "epoch": 0.8379832353309146, + "grad_norm": 17.0, + "kl": 3.4943671226501465, + "learning_rate": 5e-06, + "logits/chosen": -30604476.0, + "logits/rejected": -36369264.0, + "logps/chosen": -358.021240234375, + "logps/rejected": -798.4456176757812, + "loss": 0.0402, + "rewards/chosen": 9.310311317443848, + "rewards/margins": 33.51455783843994, + "rewards/rejected": -24.204246520996094, + "step": 3349 + }, + { + "epoch": 0.8382334542724884, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29860728.615384616, + "logits/rejected": -57623877.81818182, + "logps/chosen": -369.6553485576923, + "logps/rejected": -734.7033025568181, + "loss": 0.0263, + "rewards/chosen": 9.437604464017427, + "rewards/margins": 34.07962772396061, + "rewards/rejected": -24.642023259943183, + "step": 3350 + }, + { + "epoch": 0.8384836732140623, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56266896.0, + "logits/rejected": -22380741.333333332, + "logps/chosen": -386.4521484375, + "logps/rejected": -566.9683024088541, + "loss": 0.0261, + "rewards/chosen": 7.103212356567383, + "rewards/margins": 22.67793083190918, + "rewards/rejected": -15.574718475341797, + "step": 3351 + }, + { + "epoch": 0.8387338921556362, + "grad_norm": 40.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31348704.0, + "logits/rejected": -30208700.8, + "logps/chosen": -413.88490513392856, + "logps/rejected": -876.33857421875, + "loss": 0.0184, + "rewards/chosen": 9.861567905970983, + "rewards/margins": 25.928238133021765, + "rewards/rejected": -16.066670227050782, + "step": 3352 + }, + { + "epoch": 0.83898411109721, + "grad_norm": 5.84375, + "kl": 10.686391830444336, + "learning_rate": 5e-06, + "logits/chosen": -52970804.705882356, + "logits/rejected": -31700002.285714287, + "logps/chosen": -482.0576746323529, + "logps/rejected": -525.08447265625, + "loss": 0.0167, + "rewards/chosen": 8.789918787339154, + "rewards/margins": 22.76984694024094, + "rewards/rejected": -13.979928152901786, + "step": 3353 + }, + { + "epoch": 0.8392343300387839, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32388425.6, + "logits/rejected": -43144608.0, + "logps/chosen": -539.532958984375, + "logps/rejected": -510.23880440848217, + "loss": 0.0445, + "rewards/chosen": 9.477702331542968, + "rewards/margins": 22.517141941615513, + "rewards/rejected": -13.039439610072545, + "step": 3354 + }, + { + "epoch": 0.8394845489803578, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54359680.0, + "logits/rejected": 99626029.71428572, + "logps/chosen": -478.42607421875, + "logps/rejected": -521.8127092633929, + "loss": 0.0061, + "rewards/chosen": 8.03902816772461, + "rewards/margins": 25.62876946585519, + "rewards/rejected": -17.58974129813058, + "step": 3355 + }, + { + "epoch": 0.8397347679219317, + "grad_norm": 11.625, + "kl": 6.405513763427734, + "learning_rate": 5e-06, + "logits/chosen": -56226693.81818182, + "logits/rejected": -52516263.384615384, + "logps/chosen": -366.49003462357956, + "logps/rejected": -756.3671875, + "loss": 0.1104, + "rewards/chosen": 8.124447215687145, + "rewards/margins": 32.67180345441912, + "rewards/rejected": -24.54735623873197, + "step": 3356 + }, + { + "epoch": 0.8399849868635055, + "grad_norm": 12.875, + "kl": 1.3067309856414795, + "learning_rate": 5e-06, + "logits/chosen": -54273348.0, + "logits/rejected": -39804508.0, + "logps/chosen": -447.93408203125, + "logps/rejected": -651.233642578125, + "loss": 0.028, + "rewards/chosen": 7.712504863739014, + "rewards/margins": 29.240417003631592, + "rewards/rejected": -21.527912139892578, + "step": 3357 + }, + { + "epoch": 0.8402352058050795, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63352152.615384616, + "logits/rejected": -54577326.54545455, + "logps/chosen": -305.91128305288464, + "logps/rejected": -590.3220880681819, + "loss": 0.0141, + "rewards/chosen": 7.0193657508263225, + "rewards/margins": 26.883401617303598, + "rewards/rejected": -19.864035866477273, + "step": 3358 + }, + { + "epoch": 0.8404854247466533, + "grad_norm": 7.375, + "kl": 23.675823211669922, + "learning_rate": 5e-06, + "logits/chosen": -52558448.0, + "logits/rejected": -57646680.0, + "logps/chosen": -410.6289978027344, + "logps/rejected": -712.3125, + "loss": 0.0149, + "rewards/chosen": 10.158866882324219, + "rewards/margins": 31.347515106201172, + "rewards/rejected": -21.188648223876953, + "step": 3359 + }, + { + "epoch": 0.8407356436882272, + "grad_norm": 1.03125, + "kl": 0.8565292358398438, + "learning_rate": 5e-06, + "logits/chosen": -62042185.84615385, + "logits/rejected": -52227031.27272727, + "logps/chosen": -445.56651893028845, + "logps/rejected": -682.1604225852273, + "loss": 0.0017, + "rewards/chosen": 9.523953951322115, + "rewards/margins": 32.92604246339598, + "rewards/rejected": -23.402088512073863, + "step": 3360 + }, + { + "epoch": 0.8409858626298011, + "grad_norm": 0.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32252270.933333334, + "logits/rejected": -33166065.777777776, + "logps/chosen": -471.1957682291667, + "logps/rejected": -712.1775173611111, + "loss": 0.0001, + "rewards/chosen": 11.72981465657552, + "rewards/margins": 33.282666015625, + "rewards/rejected": -21.55285135904948, + "step": 3361 + }, + { + "epoch": 0.841236081571375, + "grad_norm": 2.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53902926.76923077, + "logits/rejected": -25050996.363636363, + "logps/chosen": -419.5774113581731, + "logps/rejected": -580.96484375, + "loss": 0.0124, + "rewards/chosen": 8.75354473407452, + "rewards/margins": 27.086298695811024, + "rewards/rejected": -18.332753961736504, + "step": 3362 + }, + { + "epoch": 0.8414863005129488, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34067204.571428575, + "logits/rejected": -43753324.8, + "logps/chosen": -305.95242745535717, + "logps/rejected": -628.813671875, + "loss": 0.0857, + "rewards/chosen": 7.177069527762277, + "rewards/margins": 28.553522164481027, + "rewards/rejected": -21.37645263671875, + "step": 3363 + }, + { + "epoch": 0.8417365194545227, + "grad_norm": 2.171875, + "kl": 3.4250407218933105, + "learning_rate": 5e-06, + "logits/chosen": -37947548.8, + "logits/rejected": -47395757.71428572, + "logps/chosen": -432.73818359375, + "logps/rejected": -605.6768275669643, + "loss": 0.0626, + "rewards/chosen": 8.802084350585938, + "rewards/margins": 25.125789751325335, + "rewards/rejected": -16.323705400739396, + "step": 3364 + }, + { + "epoch": 0.8419867383960966, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33304403.2, + "logits/rejected": -42468320.0, + "logps/chosen": -393.924658203125, + "logps/rejected": -630.7925502232143, + "loss": 0.017, + "rewards/chosen": 9.261844635009766, + "rewards/margins": 27.929920196533203, + "rewards/rejected": -18.668075561523438, + "step": 3365 + }, + { + "epoch": 0.8422369573376705, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33040642.666666668, + "logits/rejected": -35234941.333333336, + "logps/chosen": -391.0833333333333, + "logps/rejected": -708.1373697916666, + "loss": 0.0306, + "rewards/chosen": 9.782527923583984, + "rewards/margins": 27.579129536946613, + "rewards/rejected": -17.79660161336263, + "step": 3366 + }, + { + "epoch": 0.8424871762792443, + "grad_norm": 1.3203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20161572.0, + "logits/rejected": -30887076.0, + "logps/chosen": -376.81903076171875, + "logps/rejected": -645.9494018554688, + "loss": 0.0132, + "rewards/chosen": 9.124106407165527, + "rewards/margins": 29.376858711242676, + "rewards/rejected": -20.25275230407715, + "step": 3367 + }, + { + "epoch": 0.8427373952208183, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26727586.46153846, + "logits/rejected": -43080910.54545455, + "logps/chosen": -378.07752403846155, + "logps/rejected": -636.8332297585227, + "loss": 0.0262, + "rewards/chosen": 8.189727783203125, + "rewards/margins": 30.29148448597301, + "rewards/rejected": -22.101756702769887, + "step": 3368 + }, + { + "epoch": 0.8429876141623921, + "grad_norm": 1.25, + "kl": 5.531114101409912, + "learning_rate": 5e-06, + "logits/chosen": -55001181.09090909, + "logits/rejected": -28199010.46153846, + "logps/chosen": -337.08194247159093, + "logps/rejected": -561.1319861778846, + "loss": 0.0319, + "rewards/chosen": 8.170166015625, + "rewards/margins": 27.628819392277645, + "rewards/rejected": -19.458653376652645, + "step": 3369 + }, + { + "epoch": 0.8432378331039659, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41164069.81818182, + "logits/rejected": -49529654.15384615, + "logps/chosen": -345.78830788352275, + "logps/rejected": -517.7381310096154, + "loss": 0.0311, + "rewards/chosen": 7.5902488014914775, + "rewards/margins": 25.039951137729457, + "rewards/rejected": -17.44970233623798, + "step": 3370 + }, + { + "epoch": 0.8434880520455399, + "grad_norm": 5.9375, + "kl": 1.780255675315857, + "learning_rate": 5e-06, + "logits/chosen": -45028153.6, + "logits/rejected": -46323634.28571428, + "logps/chosen": -371.44658203125, + "logps/rejected": -651.23291015625, + "loss": 0.0148, + "rewards/chosen": 8.251386260986328, + "rewards/margins": 25.5736453465053, + "rewards/rejected": -17.322259085518972, + "step": 3371 + }, + { + "epoch": 0.8437382709871137, + "grad_norm": 7.75, + "kl": 13.699926376342773, + "learning_rate": 5e-06, + "logits/chosen": -49671157.333333336, + "logits/rejected": -51886485.333333336, + "logps/chosen": -279.90625, + "logps/rejected": -688.4078776041666, + "loss": 0.1188, + "rewards/chosen": 6.317263921101888, + "rewards/margins": 24.756689071655273, + "rewards/rejected": -18.439425150553387, + "step": 3372 + }, + { + "epoch": 0.8439884899286876, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31615602.666666668, + "logits/rejected": -44985776.0, + "logps/chosen": -339.19956461588544, + "logps/rejected": -548.2075602213541, + "loss": 0.0997, + "rewards/chosen": 6.550669352213542, + "rewards/margins": 21.682912190755207, + "rewards/rejected": -15.132242838541666, + "step": 3373 + }, + { + "epoch": 0.8442387088702615, + "grad_norm": 23.625, + "kl": 6.391028881072998, + "learning_rate": 5e-06, + "logits/chosen": -32915116.307692308, + "logits/rejected": -35180805.81818182, + "logps/chosen": -429.98343599759613, + "logps/rejected": -518.6487926136364, + "loss": 0.0525, + "rewards/chosen": 9.470690800593449, + "rewards/margins": 23.83433826153095, + "rewards/rejected": -14.3636474609375, + "step": 3374 + }, + { + "epoch": 0.8444889278118354, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21346904.0, + "logits/rejected": -49103285.333333336, + "logps/chosen": -340.59547932942706, + "logps/rejected": -570.878662109375, + "loss": 0.0209, + "rewards/chosen": 8.27093251546224, + "rewards/margins": 20.73095703125, + "rewards/rejected": -12.46002451578776, + "step": 3375 + }, + { + "epoch": 0.8447391467534092, + "grad_norm": 8.0, + "kl": 8.937817573547363, + "learning_rate": 5e-06, + "logits/chosen": -41573956.0, + "logits/rejected": -57145512.0, + "logps/chosen": -313.9154052734375, + "logps/rejected": -934.1132202148438, + "loss": 0.0285, + "rewards/chosen": 8.866175651550293, + "rewards/margins": 34.44503688812256, + "rewards/rejected": -25.578861236572266, + "step": 3376 + }, + { + "epoch": 0.8449893656949831, + "grad_norm": 2.046875, + "kl": 5.882508277893066, + "learning_rate": 5e-06, + "logits/chosen": -41217820.44444445, + "logits/rejected": -34376661.333333336, + "logps/chosen": -386.70448133680554, + "logps/rejected": -639.5324869791667, + "loss": 0.0047, + "rewards/chosen": 10.393179999457466, + "rewards/margins": 27.323064846462675, + "rewards/rejected": -16.92988484700521, + "step": 3377 + }, + { + "epoch": 0.845239584636557, + "grad_norm": 5.6875, + "kl": 1.9702622890472412, + "learning_rate": 5e-06, + "logits/chosen": -34272320.0, + "logits/rejected": -75658624.0, + "logps/chosen": -372.7003728693182, + "logps/rejected": -683.201171875, + "loss": 0.0371, + "rewards/chosen": 8.34747314453125, + "rewards/margins": 25.32803696852464, + "rewards/rejected": -16.98056382399339, + "step": 3378 + }, + { + "epoch": 0.8454898035781309, + "grad_norm": 6.78125, + "kl": 3.3111572265625, + "learning_rate": 5e-06, + "logits/chosen": -33296197.818181816, + "logits/rejected": -50877838.76923077, + "logps/chosen": -414.98073508522725, + "logps/rejected": -443.0466496394231, + "loss": 0.0119, + "rewards/chosen": 8.852550159801137, + "rewards/margins": 20.650368857217003, + "rewards/rejected": -11.797818697415865, + "step": 3379 + }, + { + "epoch": 0.8457400225197047, + "grad_norm": 4.125, + "kl": 2.5250658988952637, + "learning_rate": 5e-06, + "logits/chosen": -15815747.2, + "logits/rejected": -33770395.428571425, + "logps/chosen": -271.12080078125, + "logps/rejected": -629.4550083705357, + "loss": 0.0613, + "rewards/chosen": 6.656166076660156, + "rewards/margins": 23.83917781284877, + "rewards/rejected": -17.183011736188615, + "step": 3380 + }, + { + "epoch": 0.8459902414612787, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12792324.57142857, + "logits/rejected": -47730063.058823526, + "logps/chosen": -353.47412109375, + "logps/rejected": -730.3363396139706, + "loss": 0.0043, + "rewards/chosen": 7.810611724853516, + "rewards/margins": 23.37103832469267, + "rewards/rejected": -15.560426599839154, + "step": 3381 + }, + { + "epoch": 0.8462404604028525, + "grad_norm": 5.09375, + "kl": 0.27969712018966675, + "learning_rate": 5e-06, + "logits/chosen": -20665609.846153848, + "logits/rejected": -19446404.363636363, + "logps/chosen": -351.96446814903845, + "logps/rejected": -438.06107954545456, + "loss": 0.0495, + "rewards/chosen": 9.330155005821815, + "rewards/margins": 20.972813586255054, + "rewards/rejected": -11.642658580433238, + "step": 3382 + }, + { + "epoch": 0.8464906793444263, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43759411.2, + "logits/rejected": -63589394.28571428, + "logps/chosen": -334.468701171875, + "logps/rejected": -642.8381696428571, + "loss": 0.0424, + "rewards/chosen": 5.877351760864258, + "rewards/margins": 20.14314787728446, + "rewards/rejected": -14.265796116420201, + "step": 3383 + }, + { + "epoch": 0.8467408982860003, + "grad_norm": 1.4140625, + "kl": 1.0901451110839844, + "learning_rate": 5e-06, + "logits/chosen": -13172873.6, + "logits/rejected": -29516626.285714287, + "logps/chosen": -436.950927734375, + "logps/rejected": -640.8581194196429, + "loss": 0.0203, + "rewards/chosen": 9.508102416992188, + "rewards/margins": 26.058019365583146, + "rewards/rejected": -16.54991694859096, + "step": 3384 + }, + { + "epoch": 0.8469911172275741, + "grad_norm": 3.09375, + "kl": 4.3086981773376465, + "learning_rate": 5e-06, + "logits/chosen": -17548857.14285714, + "logits/rejected": -56037600.0, + "logps/chosen": -300.152587890625, + "logps/rejected": -795.25048828125, + "loss": 0.0629, + "rewards/chosen": 6.721431732177734, + "rewards/margins": 28.70584945678711, + "rewards/rejected": -21.984417724609376, + "step": 3385 + }, + { + "epoch": 0.847241336169148, + "grad_norm": 10.6875, + "kl": 3.5021045207977295, + "learning_rate": 5e-06, + "logits/chosen": -36030464.0, + "logits/rejected": -28365174.153846152, + "logps/chosen": -275.283447265625, + "logps/rejected": -428.3333082932692, + "loss": 0.0399, + "rewards/chosen": 5.708704861727628, + "rewards/margins": 17.09005251797763, + "rewards/rejected": -11.38134765625, + "step": 3386 + }, + { + "epoch": 0.8474915551107218, + "grad_norm": 5.1875, + "kl": 6.609493255615234, + "learning_rate": 5e-06, + "logits/chosen": -33888103.384615384, + "logits/rejected": -38353646.54545455, + "logps/chosen": -352.31971153846155, + "logps/rejected": -579.8142755681819, + "loss": 0.0757, + "rewards/chosen": 8.765191298264723, + "rewards/margins": 24.54052643675904, + "rewards/rejected": -15.775335138494318, + "step": 3387 + }, + { + "epoch": 0.8477417740522958, + "grad_norm": 3.484375, + "kl": 0.01494344137609005, + "learning_rate": 5e-06, + "logits/chosen": -30109107.692307692, + "logits/rejected": -76994112.0, + "logps/chosen": -401.13326322115387, + "logps/rejected": -718.1029829545455, + "loss": 0.0119, + "rewards/chosen": 10.662051861102764, + "rewards/margins": 32.06918687086839, + "rewards/rejected": -21.407135009765625, + "step": 3388 + }, + { + "epoch": 0.8479919929938696, + "grad_norm": 1.3203125, + "kl": 2.4547300338745117, + "learning_rate": 5e-06, + "logits/chosen": -27921094.85714286, + "logits/rejected": -31077004.8, + "logps/chosen": -344.03037806919644, + "logps/rejected": -806.986376953125, + "loss": 0.0346, + "rewards/chosen": 8.57152611868722, + "rewards/margins": 29.660779353550502, + "rewards/rejected": -21.089253234863282, + "step": 3389 + }, + { + "epoch": 0.8482422119354435, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31753132.307692308, + "logits/rejected": -40929751.27272727, + "logps/chosen": -286.71518179086536, + "logps/rejected": -587.9979580965909, + "loss": 0.0266, + "rewards/chosen": 8.309947674091045, + "rewards/margins": 24.60431607119687, + "rewards/rejected": -16.294368397105824, + "step": 3390 + }, + { + "epoch": 0.8484924308770174, + "grad_norm": 14.875, + "kl": 6.918422698974609, + "learning_rate": 5e-06, + "logits/chosen": 10752606.666666666, + "logits/rejected": -23392357.333333332, + "logps/chosen": -355.0339762369792, + "logps/rejected": -480.7571207682292, + "loss": 0.0768, + "rewards/chosen": 8.461181640625, + "rewards/margins": 21.773256937662758, + "rewards/rejected": -13.31207529703776, + "step": 3391 + }, + { + "epoch": 0.8487426498185913, + "grad_norm": 11.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41282416.0, + "logits/rejected": -68693257.14285715, + "logps/chosen": -282.015087890625, + "logps/rejected": -579.0231584821429, + "loss": 0.0365, + "rewards/chosen": 7.52685775756836, + "rewards/margins": 23.913140542166573, + "rewards/rejected": -16.386282784598215, + "step": 3392 + }, + { + "epoch": 0.8489928687601651, + "grad_norm": 2.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51253725.86666667, + "logits/rejected": -31536117.333333332, + "logps/chosen": -369.29479166666664, + "logps/rejected": -550.6883680555555, + "loss": 0.0211, + "rewards/chosen": 8.388052368164063, + "rewards/margins": 28.02307908799913, + "rewards/rejected": -19.635026719835068, + "step": 3393 + }, + { + "epoch": 0.8492430877017391, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48584016.0, + "logits/rejected": -41139082.666666664, + "logps/chosen": -389.5467936197917, + "logps/rejected": -644.7683512369791, + "loss": 0.0161, + "rewards/chosen": 9.44810676574707, + "rewards/margins": 29.582793553670246, + "rewards/rejected": -20.134686787923176, + "step": 3394 + }, + { + "epoch": 0.8494933066433129, + "grad_norm": 17.75, + "kl": 12.753987312316895, + "learning_rate": 5e-06, + "logits/chosen": -32396240.0, + "logits/rejected": -61361912.0, + "logps/chosen": -282.0979309082031, + "logps/rejected": -349.08209228515625, + "loss": 0.177, + "rewards/chosen": 7.218841075897217, + "rewards/margins": 19.01440668106079, + "rewards/rejected": -11.795565605163574, + "step": 3395 + }, + { + "epoch": 0.8497435255848867, + "grad_norm": 13.0625, + "kl": 6.300783157348633, + "learning_rate": 5e-06, + "logits/chosen": -33034992.0, + "logits/rejected": -63137706.666666664, + "logps/chosen": -344.4127604166667, + "logps/rejected": -694.491943359375, + "loss": 0.0579, + "rewards/chosen": 6.93502934773763, + "rewards/margins": 27.62053553263346, + "rewards/rejected": -20.685506184895832, + "step": 3396 + }, + { + "epoch": 0.8499937445264607, + "grad_norm": 9.125, + "kl": 3.336113691329956, + "learning_rate": 5e-06, + "logits/chosen": -59140253.538461536, + "logits/rejected": -51570513.45454545, + "logps/chosen": -490.04244290865387, + "logps/rejected": -738.6499467329545, + "loss": 0.0529, + "rewards/chosen": 10.333992591271034, + "rewards/margins": 34.03190463406223, + "rewards/rejected": -23.69791204279119, + "step": 3397 + }, + { + "epoch": 0.8502439634680345, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64041898.666666664, + "logits/rejected": -48903648.0, + "logps/chosen": -340.8223063151042, + "logps/rejected": -436.5948893229167, + "loss": 0.0511, + "rewards/chosen": 8.212958653767904, + "rewards/margins": 22.943217595418297, + "rewards/rejected": -14.73025894165039, + "step": 3398 + }, + { + "epoch": 0.8504941824096084, + "grad_norm": 2.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63139532.8, + "logits/rejected": -38169323.78947368, + "logps/chosen": -431.964404296875, + "logps/rejected": -585.9048108552631, + "loss": 0.002, + "rewards/chosen": 11.419216918945313, + "rewards/margins": 27.988580161646794, + "rewards/rejected": -16.56936324270148, + "step": 3399 + }, + { + "epoch": 0.8507444013511822, + "grad_norm": 4.625, + "kl": 2.2456088066101074, + "learning_rate": 5e-06, + "logits/chosen": -80927916.8, + "logits/rejected": -52125974.85714286, + "logps/chosen": -325.201123046875, + "logps/rejected": -772.0938895089286, + "loss": 0.0356, + "rewards/chosen": 8.067665100097656, + "rewards/margins": 31.990841456821986, + "rewards/rejected": -23.92317635672433, + "step": 3400 + }, + { + "epoch": 0.8509946202927562, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69306117.81818181, + "logits/rejected": -77617427.6923077, + "logps/chosen": -365.25270774147725, + "logps/rejected": -747.5733924278846, + "loss": 0.0195, + "rewards/chosen": 9.313447432084518, + "rewards/margins": 30.280741978358556, + "rewards/rejected": -20.96729454627404, + "step": 3401 + }, + { + "epoch": 0.85124483923433, + "grad_norm": 13.5, + "kl": 1.60513436794281, + "learning_rate": 5e-06, + "logits/chosen": -25966421.333333332, + "logits/rejected": -63326672.0, + "logps/chosen": -387.8897298177083, + "logps/rejected": -760.96142578125, + "loss": 0.0304, + "rewards/chosen": 9.741454442342123, + "rewards/margins": 30.113690058390297, + "rewards/rejected": -20.372235616048176, + "step": 3402 + }, + { + "epoch": 0.8514950581759039, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23154827.2, + "logits/rejected": -68172278.85714285, + "logps/chosen": -280.952978515625, + "logps/rejected": -561.969970703125, + "loss": 0.0331, + "rewards/chosen": 6.843350219726562, + "rewards/margins": 26.077489798409598, + "rewards/rejected": -19.234139578683035, + "step": 3403 + }, + { + "epoch": 0.8517452771174778, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38404334.54545455, + "logits/rejected": -26405550.769230768, + "logps/chosen": -353.20339133522725, + "logps/rejected": -450.88326322115387, + "loss": 0.021, + "rewards/chosen": 9.237851229580967, + "rewards/margins": 22.709310251516065, + "rewards/rejected": -13.471459021935097, + "step": 3404 + }, + { + "epoch": 0.8519954960590517, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41984470.4, + "logits/rejected": -51389417.14285714, + "logps/chosen": -359.0143310546875, + "logps/rejected": -633.2073800223214, + "loss": 0.0097, + "rewards/chosen": 9.553411102294922, + "rewards/margins": 27.156695665631972, + "rewards/rejected": -17.603284563337052, + "step": 3405 + }, + { + "epoch": 0.8522457150006255, + "grad_norm": 3.46875, + "kl": 0.5354986190795898, + "learning_rate": 5e-06, + "logits/chosen": -34911616.0, + "logits/rejected": -46378156.8, + "logps/chosen": -362.75516183035717, + "logps/rejected": -739.907421875, + "loss": 0.0337, + "rewards/chosen": 9.54271480015346, + "rewards/margins": 30.447087969098774, + "rewards/rejected": -20.904373168945312, + "step": 3406 + }, + { + "epoch": 0.8524959339421995, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56427827.2, + "logits/rejected": -52261645.71428572, + "logps/chosen": -345.6505126953125, + "logps/rejected": -699.6164899553571, + "loss": 0.0114, + "rewards/chosen": 9.564096069335937, + "rewards/margins": 31.107489885602675, + "rewards/rejected": -21.54339381626674, + "step": 3407 + }, + { + "epoch": 0.8527461528837733, + "grad_norm": 11.375, + "kl": 10.46023178100586, + "learning_rate": 5e-06, + "logits/chosen": -53238020.266666666, + "logits/rejected": -39660728.88888889, + "logps/chosen": -449.196875, + "logps/rejected": -782.0788845486111, + "loss": 0.023, + "rewards/chosen": 9.95607401529948, + "rewards/margins": 35.84249437120226, + "rewards/rejected": -25.88642035590278, + "step": 3408 + }, + { + "epoch": 0.8529963718253472, + "grad_norm": 1.5, + "kl": 0.7935384511947632, + "learning_rate": 5e-06, + "logits/chosen": -47553068.0, + "logits/rejected": -28442418.0, + "logps/chosen": -393.3840637207031, + "logps/rejected": -804.2012329101562, + "loss": 0.0039, + "rewards/chosen": 9.483860969543457, + "rewards/margins": 32.58375644683838, + "rewards/rejected": -23.099895477294922, + "step": 3409 + }, + { + "epoch": 0.8532465907669211, + "grad_norm": 1.9140625, + "kl": 1.9362802505493164, + "learning_rate": 5e-06, + "logits/chosen": -60796621.71428572, + "logits/rejected": -35164256.0, + "logps/chosen": -384.19485909598217, + "logps/rejected": -679.923779296875, + "loss": 0.0292, + "rewards/chosen": 7.987306867327009, + "rewards/margins": 24.53445042201451, + "rewards/rejected": -16.5471435546875, + "step": 3410 + }, + { + "epoch": 0.853496809708495, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40565142.15384615, + "logits/rejected": -60314461.09090909, + "logps/chosen": -429.44722806490387, + "logps/rejected": -852.6524325284091, + "loss": 0.0157, + "rewards/chosen": 9.53851083608774, + "rewards/margins": 35.177055092124675, + "rewards/rejected": -25.638544256036933, + "step": 3411 + }, + { + "epoch": 0.8537470286500688, + "grad_norm": 1.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41066076.44444445, + "logits/rejected": -18401723.733333334, + "logps/chosen": -360.92323133680554, + "logps/rejected": -516.06015625, + "loss": 0.0418, + "rewards/chosen": 7.561635335286458, + "rewards/margins": 24.195581054687498, + "rewards/rejected": -16.63394571940104, + "step": 3412 + }, + { + "epoch": 0.8539972475916426, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38734702.222222224, + "logits/rejected": -46688665.6, + "logps/chosen": -371.51898871527777, + "logps/rejected": -629.53125, + "loss": 0.011, + "rewards/chosen": 9.22223154703776, + "rewards/margins": 30.369574483235674, + "rewards/rejected": -21.147342936197916, + "step": 3413 + }, + { + "epoch": 0.8542474665332166, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46984132.92307692, + "logits/rejected": -31063697.454545453, + "logps/chosen": -345.4401292067308, + "logps/rejected": -598.7138671875, + "loss": 0.0561, + "rewards/chosen": 8.10449453500601, + "rewards/margins": 23.045280723305016, + "rewards/rejected": -14.940786188299006, + "step": 3414 + }, + { + "epoch": 0.8544976854747904, + "grad_norm": 3.015625, + "kl": 5.003227233886719, + "learning_rate": 5e-06, + "logits/chosen": -38164480.0, + "logits/rejected": -52871031.46666667, + "logps/chosen": -609.7924262152778, + "logps/rejected": -728.512109375, + "loss": 0.0073, + "rewards/chosen": 12.935448540581596, + "rewards/margins": 36.59801974826389, + "rewards/rejected": -23.66257120768229, + "step": 3415 + }, + { + "epoch": 0.8547479044163643, + "grad_norm": 6.8125, + "kl": 0.5025972127914429, + "learning_rate": 5e-06, + "logits/chosen": -32358964.363636363, + "logits/rejected": -45221154.461538464, + "logps/chosen": -381.77920809659093, + "logps/rejected": -751.0123197115385, + "loss": 0.0565, + "rewards/chosen": 6.736111727627841, + "rewards/margins": 30.62912179373361, + "rewards/rejected": -23.89301006610577, + "step": 3416 + }, + { + "epoch": 0.8549981233579382, + "grad_norm": 2.6875, + "kl": 11.569003105163574, + "learning_rate": 5e-06, + "logits/chosen": -56561984.0, + "logits/rejected": -30339544.0, + "logps/chosen": -430.8636881510417, + "logps/rejected": -846.4625651041666, + "loss": 0.017, + "rewards/chosen": 9.835779190063477, + "rewards/margins": 37.82223192850749, + "rewards/rejected": -27.98645273844401, + "step": 3417 + }, + { + "epoch": 0.8552483422995121, + "grad_norm": 1.4140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18342086.153846152, + "logits/rejected": -46345451.63636363, + "logps/chosen": -289.98140775240387, + "logps/rejected": -770.1831498579545, + "loss": 0.0118, + "rewards/chosen": 7.640285198505108, + "rewards/margins": 27.565271604311217, + "rewards/rejected": -19.92498640580611, + "step": 3418 + }, + { + "epoch": 0.8554985612410859, + "grad_norm": 0.08251953125, + "kl": 3.4349327087402344, + "learning_rate": 5e-06, + "logits/chosen": -24507884.8, + "logits/rejected": -63311753.14285714, + "logps/chosen": -380.1271240234375, + "logps/rejected": -602.6160016741071, + "loss": 0.0002, + "rewards/chosen": 10.58471221923828, + "rewards/margins": 28.48644343784877, + "rewards/rejected": -17.90173121861049, + "step": 3419 + }, + { + "epoch": 0.8557487801826599, + "grad_norm": 7.6875, + "kl": 12.221990585327148, + "learning_rate": 5e-06, + "logits/chosen": -58555200.0, + "logits/rejected": -40902781.333333336, + "logps/chosen": -410.7368570963542, + "logps/rejected": -572.3693033854166, + "loss": 0.098, + "rewards/chosen": 9.691134770711264, + "rewards/margins": 27.45423698425293, + "rewards/rejected": -17.763102213541668, + "step": 3420 + }, + { + "epoch": 0.8559989991242337, + "grad_norm": 1.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32140489.14285714, + "logits/rejected": -58306841.6, + "logps/chosen": -332.3261021205357, + "logps/rejected": -577.804833984375, + "loss": 0.0262, + "rewards/chosen": 7.202910831996372, + "rewards/margins": 24.342908695765903, + "rewards/rejected": -17.13999786376953, + "step": 3421 + }, + { + "epoch": 0.8562492180658076, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20116464.0, + "logits/rejected": -55928763.733333334, + "logps/chosen": -344.44981553819446, + "logps/rejected": -658.9731770833333, + "loss": 0.0396, + "rewards/chosen": 8.593739827473959, + "rewards/margins": 24.407590738932292, + "rewards/rejected": -15.813850911458333, + "step": 3422 + }, + { + "epoch": 0.8564994370073814, + "grad_norm": 8.6875, + "kl": 4.1583757400512695, + "learning_rate": 5e-06, + "logits/chosen": -28254267.42857143, + "logits/rejected": -32502748.8, + "logps/chosen": -273.6908656529018, + "logps/rejected": -513.40341796875, + "loss": 0.0496, + "rewards/chosen": 7.446807861328125, + "rewards/margins": 23.383950805664064, + "rewards/rejected": -15.937142944335937, + "step": 3423 + }, + { + "epoch": 0.8567496559489554, + "grad_norm": 2.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29456314.181818184, + "logits/rejected": -3746817.230769231, + "logps/chosen": -454.02543501420456, + "logps/rejected": -617.2699068509615, + "loss": 0.0036, + "rewards/chosen": 11.79313798384233, + "rewards/margins": 30.350104912177663, + "rewards/rejected": -18.556966928335335, + "step": 3424 + }, + { + "epoch": 0.8569998748905292, + "grad_norm": 7.34375, + "kl": 24.83080291748047, + "learning_rate": 5e-06, + "logits/chosen": -48454746.35294118, + "logits/rejected": -46828608.0, + "logps/chosen": -475.0335477941176, + "logps/rejected": -519.8251953125, + "loss": 0.0881, + "rewards/chosen": 10.413762709673714, + "rewards/margins": 23.49734054693655, + "rewards/rejected": -13.083577837262835, + "step": 3425 + }, + { + "epoch": 0.857250093832103, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19395606.4, + "logits/rejected": -84248384.0, + "logps/chosen": -199.56685791015624, + "logps/rejected": -775.6018415178571, + "loss": 0.0922, + "rewards/chosen": 4.519903564453125, + "rewards/margins": 28.649723161969867, + "rewards/rejected": -24.12981959751674, + "step": 3426 + }, + { + "epoch": 0.857500312773677, + "grad_norm": 7.625, + "kl": 4.2523722648620605, + "learning_rate": 5e-06, + "logits/chosen": -58089545.14285714, + "logits/rejected": -26250043.2, + "logps/chosen": -298.81734793526783, + "logps/rejected": -717.31923828125, + "loss": 0.0633, + "rewards/chosen": 7.013817923409598, + "rewards/margins": 25.99911455426897, + "rewards/rejected": -18.985296630859374, + "step": 3427 + }, + { + "epoch": 0.8577505317152508, + "grad_norm": 13.0625, + "kl": 9.870464324951172, + "learning_rate": 5e-06, + "logits/chosen": -9495052.307692308, + "logits/rejected": -60285585.45454545, + "logps/chosen": -353.49083533653845, + "logps/rejected": -628.0487393465909, + "loss": 0.079, + "rewards/chosen": 8.568535437950722, + "rewards/margins": 24.379702374651714, + "rewards/rejected": -15.811166936700994, + "step": 3428 + }, + { + "epoch": 0.8580007506568247, + "grad_norm": 20.5, + "kl": 0.5861492156982422, + "learning_rate": 5e-06, + "logits/chosen": -31791704.0, + "logits/rejected": -34544837.333333336, + "logps/chosen": -378.1505126953125, + "logps/rejected": -515.7323404947916, + "loss": 0.0199, + "rewards/chosen": 9.098532358805338, + "rewards/margins": 23.284446716308594, + "rewards/rejected": -14.185914357503256, + "step": 3429 + }, + { + "epoch": 0.8582509695983986, + "grad_norm": 20.25, + "kl": 2.6534667015075684, + "learning_rate": 5e-06, + "logits/chosen": -73893261.71428572, + "logits/rejected": -35865728.0, + "logps/chosen": -473.5694056919643, + "logps/rejected": -513.09697265625, + "loss": 0.0314, + "rewards/chosen": 11.509350367954799, + "rewards/margins": 22.890966578892296, + "rewards/rejected": -11.3816162109375, + "step": 3430 + }, + { + "epoch": 0.8585011885399725, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16895837.53846154, + "logits/rejected": -65064750.54545455, + "logps/chosen": -263.77355018028845, + "logps/rejected": -745.2493785511364, + "loss": 0.071, + "rewards/chosen": 7.381386976975661, + "rewards/margins": 24.90931888393589, + "rewards/rejected": -17.527931906960227, + "step": 3431 + }, + { + "epoch": 0.8587514074815463, + "grad_norm": 3.75, + "kl": 3.6137466430664062, + "learning_rate": 5e-06, + "logits/chosen": -36982184.0, + "logits/rejected": -20813888.0, + "logps/chosen": -402.0264587402344, + "logps/rejected": -739.8248291015625, + "loss": 0.0098, + "rewards/chosen": 9.474686622619629, + "rewards/margins": 32.261887550354004, + "rewards/rejected": -22.787200927734375, + "step": 3432 + }, + { + "epoch": 0.8590016264231203, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46333397.333333336, + "logits/rejected": -42859933.86666667, + "logps/chosen": -484.78461371527777, + "logps/rejected": -531.9123046875, + "loss": 0.0202, + "rewards/chosen": 12.38681369357639, + "rewards/margins": 30.41376478407118, + "rewards/rejected": -18.02695109049479, + "step": 3433 + }, + { + "epoch": 0.8592518453646941, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27609774.545454547, + "logits/rejected": -47873969.23076923, + "logps/chosen": -288.84237393465907, + "logps/rejected": -832.9306640625, + "loss": 0.0253, + "rewards/chosen": 7.297658053311435, + "rewards/margins": 29.15584052025855, + "rewards/rejected": -21.858182466947117, + "step": 3434 + }, + { + "epoch": 0.859502064306268, + "grad_norm": 20.75, + "kl": 9.483383178710938, + "learning_rate": 5e-06, + "logits/chosen": -41271122.666666664, + "logits/rejected": -48698330.666666664, + "logps/chosen": -409.0347493489583, + "logps/rejected": -744.7373046875, + "loss": 0.1465, + "rewards/chosen": 9.222151438395182, + "rewards/margins": 30.983539581298828, + "rewards/rejected": -21.761388142903645, + "step": 3435 + }, + { + "epoch": 0.8597522832478418, + "grad_norm": 6.28125, + "kl": 6.486830711364746, + "learning_rate": 5e-06, + "logits/chosen": -26766176.0, + "logits/rejected": -20768977.6, + "logps/chosen": -380.53170340401783, + "logps/rejected": -700.786328125, + "loss": 0.024, + "rewards/chosen": 10.49974605015346, + "rewards/margins": 26.462389482770647, + "rewards/rejected": -15.962643432617188, + "step": 3436 + }, + { + "epoch": 0.8600025021894158, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46137204.36363637, + "logits/rejected": -76598921.84615384, + "logps/chosen": -409.7512872869318, + "logps/rejected": -795.2806490384615, + "loss": 0.0635, + "rewards/chosen": 10.477496060458096, + "rewards/margins": 31.27915293019968, + "rewards/rejected": -20.801656869741585, + "step": 3437 + }, + { + "epoch": 0.8602527211309896, + "grad_norm": 7.03125, + "kl": 11.06386947631836, + "learning_rate": 5e-06, + "logits/chosen": -41409408.0, + "logits/rejected": -27813930.666666668, + "logps/chosen": -292.15944010416666, + "logps/rejected": -589.1636284722222, + "loss": 0.0667, + "rewards/chosen": 7.225406392415365, + "rewards/margins": 18.824005296495226, + "rewards/rejected": -11.59859890407986, + "step": 3438 + }, + { + "epoch": 0.8605029400725634, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58881270.85714286, + "logits/rejected": -40243420.8, + "logps/chosen": -376.37852260044644, + "logps/rejected": -586.9955078125, + "loss": 0.0211, + "rewards/chosen": 8.889854431152344, + "rewards/margins": 25.967770385742188, + "rewards/rejected": -17.077915954589844, + "step": 3439 + }, + { + "epoch": 0.8607531590141374, + "grad_norm": 6.8125, + "kl": 0.8614501953125, + "learning_rate": 5e-06, + "logits/chosen": -54089728.0, + "logits/rejected": -37478691.2, + "logps/chosen": -388.50830078125, + "logps/rejected": -545.693994140625, + "loss": 0.0311, + "rewards/chosen": 7.5315737043108255, + "rewards/margins": 24.173086765834263, + "rewards/rejected": -16.641513061523437, + "step": 3440 + }, + { + "epoch": 0.8610033779557112, + "grad_norm": 9.3125, + "kl": 4.9176764488220215, + "learning_rate": 5e-06, + "logits/chosen": -46591466.666666664, + "logits/rejected": -46553765.333333336, + "logps/chosen": -395.4475911458333, + "logps/rejected": -537.458740234375, + "loss": 0.0564, + "rewards/chosen": 9.691715240478516, + "rewards/margins": 23.488632202148438, + "rewards/rejected": -13.796916961669922, + "step": 3441 + }, + { + "epoch": 0.8612535968972851, + "grad_norm": 6.53125, + "kl": 4.45770788192749, + "learning_rate": 5e-06, + "logits/chosen": -9623728.0, + "logits/rejected": -27709010.666666668, + "logps/chosen": -503.5404459635417, + "logps/rejected": -586.5941975911459, + "loss": 0.0218, + "rewards/chosen": 10.355766932169596, + "rewards/margins": 25.699543635050453, + "rewards/rejected": -15.34377670288086, + "step": 3442 + }, + { + "epoch": 0.861503815838859, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -75128938.66666667, + "logits/rejected": -47218250.666666664, + "logps/chosen": -546.0396321614584, + "logps/rejected": -645.4585367838541, + "loss": 0.0031, + "rewards/chosen": 11.731885274251303, + "rewards/margins": 31.201409657796226, + "rewards/rejected": -19.469524383544922, + "step": 3443 + }, + { + "epoch": 0.8617540347804329, + "grad_norm": 6.8125, + "kl": 24.890928268432617, + "learning_rate": 5e-06, + "logits/chosen": -49410232.88888889, + "logits/rejected": -34724994.666666664, + "logps/chosen": -408.79058159722223, + "logps/rejected": -742.62060546875, + "loss": 0.0831, + "rewards/chosen": 9.84573703342014, + "rewards/margins": 30.377961052788628, + "rewards/rejected": -20.53222401936849, + "step": 3444 + }, + { + "epoch": 0.8620042537220067, + "grad_norm": 4.96875, + "kl": 18.95811653137207, + "learning_rate": 5e-06, + "logits/chosen": -45254224.0, + "logits/rejected": -18760269.714285713, + "logps/chosen": -525.060791015625, + "logps/rejected": -695.8046875, + "loss": 0.0443, + "rewards/chosen": 10.7411376953125, + "rewards/margins": 29.170136369977676, + "rewards/rejected": -18.428998674665177, + "step": 3445 + }, + { + "epoch": 0.8622544726635807, + "grad_norm": 3.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6674628.8, + "logits/rejected": -26047872.0, + "logps/chosen": -308.3902099609375, + "logps/rejected": -612.9534040178571, + "loss": 0.0226, + "rewards/chosen": 7.993011474609375, + "rewards/margins": 30.906912667410715, + "rewards/rejected": -22.91390119280134, + "step": 3446 + }, + { + "epoch": 0.8625046916051545, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25127037.866666667, + "logits/rejected": -40818865.777777776, + "logps/chosen": -322.5530598958333, + "logps/rejected": -571.09423828125, + "loss": 0.0587, + "rewards/chosen": 7.622718811035156, + "rewards/margins": 24.154997931586372, + "rewards/rejected": -16.532279120551216, + "step": 3447 + }, + { + "epoch": 0.8627549105467284, + "grad_norm": 5.21875, + "kl": 3.7288360595703125, + "learning_rate": 5e-06, + "logits/chosen": -59984074.666666664, + "logits/rejected": -47882800.0, + "logps/chosen": -496.1824544270833, + "logps/rejected": -712.8194986979166, + "loss": 0.031, + "rewards/chosen": 10.47537104288737, + "rewards/margins": 31.121027628580727, + "rewards/rejected": -20.64565658569336, + "step": 3448 + }, + { + "epoch": 0.8630051294883022, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31467463.111111112, + "logits/rejected": -57018077.86666667, + "logps/chosen": -357.94447157118054, + "logps/rejected": -568.282421875, + "loss": 0.0328, + "rewards/chosen": 7.756232367621528, + "rewards/margins": 25.909349229600696, + "rewards/rejected": -18.153116861979168, + "step": 3449 + }, + { + "epoch": 0.8632553484298762, + "grad_norm": 2.75, + "kl": 6.968736171722412, + "learning_rate": 5e-06, + "logits/chosen": -60291858.28571428, + "logits/rejected": -87581926.4, + "logps/chosen": -356.03721400669644, + "logps/rejected": -668.813037109375, + "loss": 0.1175, + "rewards/chosen": 10.131209237234932, + "rewards/margins": 33.115295846121654, + "rewards/rejected": -22.98408660888672, + "step": 3450 + }, + { + "epoch": 0.86350556737145, + "grad_norm": 2.546875, + "kl": 3.6997318267822266, + "learning_rate": 5e-06, + "logits/chosen": -35512466.28571428, + "logits/rejected": -45896595.2, + "logps/chosen": -488.79638671875, + "logps/rejected": -553.56044921875, + "loss": 0.0576, + "rewards/chosen": 9.47921861921038, + "rewards/margins": 29.508159964425225, + "rewards/rejected": -20.028941345214843, + "step": 3451 + }, + { + "epoch": 0.8637557863130239, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41302523.428571425, + "logits/rejected": -66977882.35294118, + "logps/chosen": -324.8150111607143, + "logps/rejected": -735.2728630514706, + "loss": 0.0238, + "rewards/chosen": 6.967808859688895, + "rewards/margins": 27.979588644845144, + "rewards/rejected": -21.01177978515625, + "step": 3452 + }, + { + "epoch": 0.8640060052545978, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21832736.0, + "logits/rejected": -34254585.6, + "logps/chosen": -272.3030482700893, + "logps/rejected": -682.8955078125, + "loss": 0.0674, + "rewards/chosen": 7.860586983816964, + "rewards/margins": 24.07650800432478, + "rewards/rejected": -16.215921020507814, + "step": 3453 + }, + { + "epoch": 0.8642562241961717, + "grad_norm": 5.96875, + "kl": 1.8852272033691406, + "learning_rate": 5e-06, + "logits/chosen": -48892112.0, + "logits/rejected": -28840530.666666668, + "logps/chosen": -393.8746337890625, + "logps/rejected": -534.0498046875, + "loss": 0.0398, + "rewards/chosen": 8.52026621500651, + "rewards/margins": 26.391554514567055, + "rewards/rejected": -17.871288299560547, + "step": 3454 + }, + { + "epoch": 0.8645064431377455, + "grad_norm": 2.109375, + "kl": 5.647876739501953, + "learning_rate": 5e-06, + "logits/chosen": -37952635.733333334, + "logits/rejected": -40269312.0, + "logps/chosen": -342.5437825520833, + "logps/rejected": -656.0493706597222, + "loss": 0.0501, + "rewards/chosen": 8.22558135986328, + "rewards/margins": 31.050113762749564, + "rewards/rejected": -22.824532402886284, + "step": 3455 + }, + { + "epoch": 0.8647566620793194, + "grad_norm": 0.94140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40074224.0, + "logits/rejected": -58969914.666666664, + "logps/chosen": -318.78940836588544, + "logps/rejected": -624.6955973307291, + "loss": 0.0094, + "rewards/chosen": 8.334267298380533, + "rewards/margins": 27.249377568562828, + "rewards/rejected": -18.915110270182293, + "step": 3456 + }, + { + "epoch": 0.8650068810208933, + "grad_norm": 2.328125, + "kl": 6.171908855438232, + "learning_rate": 5e-06, + "logits/chosen": -44496172.8, + "logits/rejected": 32292772.57142857, + "logps/chosen": -424.0076171875, + "logps/rejected": -738.0862862723214, + "loss": 0.0035, + "rewards/chosen": 11.169922637939454, + "rewards/margins": 35.951418958391464, + "rewards/rejected": -24.78149632045201, + "step": 3457 + }, + { + "epoch": 0.8652570999624671, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49062764.8, + "logits/rejected": -57530112.0, + "logps/chosen": -461.909375, + "logps/rejected": -610.2184709821429, + "loss": 0.0337, + "rewards/chosen": 9.562892150878906, + "rewards/margins": 29.530404009137833, + "rewards/rejected": -19.967511858258927, + "step": 3458 + }, + { + "epoch": 0.8655073189040411, + "grad_norm": 0.828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 126569176.0, + "logits/rejected": -42944152.0, + "logps/chosen": -533.9003295898438, + "logps/rejected": -570.6421508789062, + "loss": 0.0008, + "rewards/chosen": 10.0740385055542, + "rewards/margins": 28.47630786895752, + "rewards/rejected": -18.40226936340332, + "step": 3459 + }, + { + "epoch": 0.8657575378456149, + "grad_norm": 1.140625, + "kl": 7.083930015563965, + "learning_rate": 5e-06, + "logits/chosen": -27082567.111111112, + "logits/rejected": -2558617.066666667, + "logps/chosen": -425.09776475694446, + "logps/rejected": -825.0471354166667, + "loss": 0.0348, + "rewards/chosen": 9.886474609375, + "rewards/margins": 28.73097127278646, + "rewards/rejected": -18.84449666341146, + "step": 3460 + }, + { + "epoch": 0.8660077567871888, + "grad_norm": 2.0, + "kl": 4.610023498535156, + "learning_rate": 5e-06, + "logits/chosen": -72936487.38461539, + "logits/rejected": -18594686.545454547, + "logps/chosen": -501.4353215144231, + "logps/rejected": -622.3713156960227, + "loss": 0.0028, + "rewards/chosen": 10.840529221754808, + "rewards/margins": 28.86564380138904, + "rewards/rejected": -18.025114579634234, + "step": 3461 + }, + { + "epoch": 0.8662579757287626, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64318257.23076923, + "logits/rejected": -48642257.45454545, + "logps/chosen": -433.83199368990387, + "logps/rejected": -735.3864080255681, + "loss": 0.0643, + "rewards/chosen": 9.543318528395433, + "rewards/margins": 31.57085093251475, + "rewards/rejected": -22.027532404119317, + "step": 3462 + }, + { + "epoch": 0.8665081946703366, + "grad_norm": 19.5, + "kl": 10.288519859313965, + "learning_rate": 5e-06, + "logits/chosen": -48177984.0, + "logits/rejected": -74726542.76923077, + "logps/chosen": -387.11629971590907, + "logps/rejected": -597.6875751201923, + "loss": 0.1029, + "rewards/chosen": 8.570510864257812, + "rewards/margins": 22.266805795522835, + "rewards/rejected": -13.696294931265024, + "step": 3463 + }, + { + "epoch": 0.8667584136119104, + "grad_norm": 0.9140625, + "kl": 12.010282516479492, + "learning_rate": 5e-06, + "logits/chosen": -38682761.84615385, + "logits/rejected": -43303889.45454545, + "logps/chosen": -457.85103665865387, + "logps/rejected": -416.4244939630682, + "loss": 0.0134, + "rewards/chosen": 11.001606867863583, + "rewards/margins": 24.626482023225797, + "rewards/rejected": -13.624875155362217, + "step": 3464 + }, + { + "epoch": 0.8670086325534843, + "grad_norm": 19.0, + "kl": 0.9963874816894531, + "learning_rate": 5e-06, + "logits/chosen": -52735607.46666667, + "logits/rejected": -30933425.777777776, + "logps/chosen": -358.932421875, + "logps/rejected": -819.0618489583334, + "loss": 0.0686, + "rewards/chosen": 7.7281646728515625, + "rewards/margins": 28.729359944661457, + "rewards/rejected": -21.001195271809895, + "step": 3465 + }, + { + "epoch": 0.8672588514950582, + "grad_norm": 1.0078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54212768.0, + "logits/rejected": -49032936.0, + "logps/chosen": -401.12689208984375, + "logps/rejected": -649.1390380859375, + "loss": 0.0061, + "rewards/chosen": 9.850912094116211, + "rewards/margins": 27.142953872680664, + "rewards/rejected": -17.292041778564453, + "step": 3466 + }, + { + "epoch": 0.8675090704366321, + "grad_norm": 7.9375, + "kl": 11.11578369140625, + "learning_rate": 5e-06, + "logits/chosen": -36691840.0, + "logits/rejected": -34298195.2, + "logps/chosen": -298.06717354910717, + "logps/rejected": -778.43212890625, + "loss": 0.0602, + "rewards/chosen": 6.480438777378628, + "rewards/margins": 30.410916682652065, + "rewards/rejected": -23.930477905273438, + "step": 3467 + }, + { + "epoch": 0.8677592893782059, + "grad_norm": 2.515625, + "kl": 7.467310905456543, + "learning_rate": 5e-06, + "logits/chosen": -66754692.92307692, + "logits/rejected": -33492797.09090909, + "logps/chosen": -423.9967698317308, + "logps/rejected": -490.36421342329544, + "loss": 0.0039, + "rewards/chosen": 10.781780536358173, + "rewards/margins": 24.545684067519396, + "rewards/rejected": -13.76390353116122, + "step": 3468 + }, + { + "epoch": 0.8680095083197799, + "grad_norm": 5.875, + "kl": 4.220156669616699, + "learning_rate": 5e-06, + "logits/chosen": -32290999.466666665, + "logits/rejected": -35008462.222222224, + "logps/chosen": -318.0935546875, + "logps/rejected": -517.5664605034722, + "loss": 0.0498, + "rewards/chosen": 8.685135904947916, + "rewards/margins": 25.74177992078993, + "rewards/rejected": -17.056644015842014, + "step": 3469 + }, + { + "epoch": 0.8682597272613537, + "grad_norm": 3.140625, + "kl": 10.112049102783203, + "learning_rate": 5e-06, + "logits/chosen": -49526680.0, + "logits/rejected": -55523500.0, + "logps/chosen": -403.1163635253906, + "logps/rejected": -870.3939208984375, + "loss": 0.0204, + "rewards/chosen": 7.833972930908203, + "rewards/margins": 37.72007942199707, + "rewards/rejected": -29.886106491088867, + "step": 3470 + }, + { + "epoch": 0.8685099462029275, + "grad_norm": 11.0, + "kl": 12.619620323181152, + "learning_rate": 5e-06, + "logits/chosen": -43282796.307692304, + "logits/rejected": -37682106.18181818, + "logps/chosen": -458.3948317307692, + "logps/rejected": -528.6837713068181, + "loss": 0.0741, + "rewards/chosen": 10.180793175330528, + "rewards/margins": 29.60916564514587, + "rewards/rejected": -19.42837246981534, + "step": 3471 + }, + { + "epoch": 0.8687601651445014, + "grad_norm": 5.8125, + "kl": 4.687972545623779, + "learning_rate": 5e-06, + "logits/chosen": -31926302.11764706, + "logits/rejected": -50416045.71428572, + "logps/chosen": -399.3623621323529, + "logps/rejected": -591.3120814732143, + "loss": 0.0526, + "rewards/chosen": 9.734816607306986, + "rewards/margins": 24.717736572778527, + "rewards/rejected": -14.98291996547154, + "step": 3472 + }, + { + "epoch": 0.8690103840860753, + "grad_norm": 11.375, + "kl": 9.203490257263184, + "learning_rate": 5e-06, + "logits/chosen": -67601427.2, + "logits/rejected": -34972205.71428572, + "logps/chosen": -488.0810546875, + "logps/rejected": -576.010986328125, + "loss": 0.077, + "rewards/chosen": 9.368679809570313, + "rewards/margins": 29.90518973214286, + "rewards/rejected": -20.536509922572545, + "step": 3473 + }, + { + "epoch": 0.8692606030276492, + "grad_norm": 8.25, + "kl": 0.7488810420036316, + "learning_rate": 5e-06, + "logits/chosen": -41686637.71428572, + "logits/rejected": -57045267.2, + "logps/chosen": -348.14773995535717, + "logps/rejected": -590.3755859375, + "loss": 0.0376, + "rewards/chosen": 9.239700317382812, + "rewards/margins": 27.616017150878907, + "rewards/rejected": -18.376316833496094, + "step": 3474 + }, + { + "epoch": 0.869510821969223, + "grad_norm": 7.21875, + "kl": 11.378637313842773, + "learning_rate": 5e-06, + "logits/chosen": -33037804.0, + "logits/rejected": -27630972.0, + "logps/chosen": -359.6537170410156, + "logps/rejected": -581.793701171875, + "loss": 0.0199, + "rewards/chosen": 9.954888343811035, + "rewards/margins": 25.091553688049316, + "rewards/rejected": -15.136665344238281, + "step": 3475 + }, + { + "epoch": 0.869761040910797, + "grad_norm": 0.5078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41297801.6, + "logits/rejected": -62777778.28571428, + "logps/chosen": -507.1861328125, + "logps/rejected": -577.0569893973214, + "loss": 0.0009, + "rewards/chosen": 10.858907318115234, + "rewards/margins": 30.113482775006972, + "rewards/rejected": -19.25457545689174, + "step": 3476 + }, + { + "epoch": 0.8700112598523708, + "grad_norm": 4.65625, + "kl": 10.955331802368164, + "learning_rate": 5e-06, + "logits/chosen": -57302702.54545455, + "logits/rejected": -34351734.15384615, + "logps/chosen": -450.57426313920456, + "logps/rejected": -592.2168719951923, + "loss": 0.0103, + "rewards/chosen": 10.468408064408736, + "rewards/margins": 26.55772597306258, + "rewards/rejected": -16.089317908653847, + "step": 3477 + }, + { + "epoch": 0.8702614787939447, + "grad_norm": 2.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23608845.333333332, + "logits/rejected": -21870264.0, + "logps/chosen": -302.7200520833333, + "logps/rejected": -719.738037109375, + "loss": 0.0227, + "rewards/chosen": 8.484755833943685, + "rewards/margins": 30.38633155822754, + "rewards/rejected": -21.901575724283855, + "step": 3478 + }, + { + "epoch": 0.8705116977355186, + "grad_norm": 11.125, + "kl": 1.7276370525360107, + "learning_rate": 5e-06, + "logits/chosen": -45869765.81818182, + "logits/rejected": -29070237.53846154, + "logps/chosen": -346.1038263494318, + "logps/rejected": -552.7812124399038, + "loss": 0.0182, + "rewards/chosen": 8.338917818936435, + "rewards/margins": 22.2284840937261, + "rewards/rejected": -13.889566274789663, + "step": 3479 + }, + { + "epoch": 0.8707619166770925, + "grad_norm": 1.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49231209.14285714, + "logits/rejected": -53019574.4, + "logps/chosen": -364.5970982142857, + "logps/rejected": -710.037060546875, + "loss": 0.039, + "rewards/chosen": 7.858834947858538, + "rewards/margins": 28.742112840924946, + "rewards/rejected": -20.883277893066406, + "step": 3480 + }, + { + "epoch": 0.8710121356186663, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4741275.428571428, + "logits/rejected": -28629722.352941178, + "logps/chosen": -299.351806640625, + "logps/rejected": -393.0396369485294, + "loss": 0.0164, + "rewards/chosen": 7.616309574672154, + "rewards/margins": 17.23380673833254, + "rewards/rejected": -9.617497163660387, + "step": 3481 + }, + { + "epoch": 0.8712623545602403, + "grad_norm": 8.0625, + "kl": 8.709150314331055, + "learning_rate": 5e-06, + "logits/chosen": -16687712.0, + "logits/rejected": -53757797.333333336, + "logps/chosen": -301.19580078125, + "logps/rejected": -922.07666015625, + "loss": 0.0651, + "rewards/chosen": 7.220244513617621, + "rewards/margins": 29.746329413519966, + "rewards/rejected": -22.526084899902344, + "step": 3482 + }, + { + "epoch": 0.8715125735018141, + "grad_norm": 0.15234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44238770.666666664, + "logits/rejected": -44029120.0, + "logps/chosen": -521.3534749348959, + "logps/rejected": -530.3155517578125, + "loss": 0.0003, + "rewards/chosen": 12.447312672932943, + "rewards/margins": 31.768147786458336, + "rewards/rejected": -19.32083511352539, + "step": 3483 + }, + { + "epoch": 0.871762792443388, + "grad_norm": 6.53125, + "kl": 17.40579605102539, + "learning_rate": 5e-06, + "logits/chosen": -60392605.86666667, + "logits/rejected": -66556309.333333336, + "logps/chosen": -448.0633138020833, + "logps/rejected": -569.46337890625, + "loss": 0.078, + "rewards/chosen": 9.13824462890625, + "rewards/margins": 23.99889458550347, + "rewards/rejected": -14.860649956597221, + "step": 3484 + }, + { + "epoch": 0.8720130113849618, + "grad_norm": 24.125, + "kl": 5.751862049102783, + "learning_rate": 5e-06, + "logits/chosen": -32837833.14285714, + "logits/rejected": -13199528.0, + "logps/chosen": -319.37869698660717, + "logps/rejected": -428.903466796875, + "loss": 0.0546, + "rewards/chosen": 5.712645939418247, + "rewards/margins": 17.216526249476843, + "rewards/rejected": -11.503880310058594, + "step": 3485 + }, + { + "epoch": 0.8722632303265357, + "grad_norm": 11.9375, + "kl": 4.841195583343506, + "learning_rate": 5e-06, + "logits/chosen": -36043133.09090909, + "logits/rejected": -61274338.461538464, + "logps/chosen": -384.2327769886364, + "logps/rejected": -634.3969350961538, + "loss": 0.0671, + "rewards/chosen": 8.661341580477627, + "rewards/margins": 27.08565329171561, + "rewards/rejected": -18.42431171123798, + "step": 3486 + }, + { + "epoch": 0.8725134492681096, + "grad_norm": 2.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68651826.28571428, + "logits/rejected": -19265931.2, + "logps/chosen": -477.54268973214283, + "logps/rejected": -629.8080078125, + "loss": 0.0378, + "rewards/chosen": 9.687103271484375, + "rewards/margins": 28.023788452148438, + "rewards/rejected": -18.336685180664062, + "step": 3487 + }, + { + "epoch": 0.8727636682096834, + "grad_norm": 12.75, + "kl": 2.053499221801758, + "learning_rate": 5e-06, + "logits/chosen": -28121618.666666668, + "logits/rejected": -34491381.333333336, + "logps/chosen": -414.3540852864583, + "logps/rejected": -740.4046223958334, + "loss": 0.0469, + "rewards/chosen": 8.724291483561197, + "rewards/margins": 29.117375691731773, + "rewards/rejected": -20.393084208170574, + "step": 3488 + }, + { + "epoch": 0.8730138871512574, + "grad_norm": 7.71875, + "kl": 0.3010028302669525, + "learning_rate": 5e-06, + "logits/chosen": -19811253.333333332, + "logits/rejected": -32091890.666666668, + "logps/chosen": -361.798583984375, + "logps/rejected": -564.289794921875, + "loss": 0.0236, + "rewards/chosen": 7.995500564575195, + "rewards/margins": 28.277644475301106, + "rewards/rejected": -20.28214391072591, + "step": 3489 + }, + { + "epoch": 0.8732641060928312, + "grad_norm": 13.8125, + "kl": 16.190898895263672, + "learning_rate": 5e-06, + "logits/chosen": -3217014.153846154, + "logits/rejected": -45372791.27272727, + "logps/chosen": -367.3820612980769, + "logps/rejected": -555.1054243607955, + "loss": 0.1296, + "rewards/chosen": 9.128287095289965, + "rewards/margins": 23.462135528351045, + "rewards/rejected": -14.33384843306108, + "step": 3490 + }, + { + "epoch": 0.8735143250344051, + "grad_norm": 5.03125, + "kl": 9.775385856628418, + "learning_rate": 5e-06, + "logits/chosen": -51088753.23076923, + "logits/rejected": -55027776.0, + "logps/chosen": -298.17003455528845, + "logps/rejected": -584.8488103693181, + "loss": 0.0397, + "rewards/chosen": 7.636959956242488, + "rewards/margins": 23.450544717428567, + "rewards/rejected": -15.81358476118608, + "step": 3491 + }, + { + "epoch": 0.873764543975979, + "grad_norm": 3.484375, + "kl": 8.500158309936523, + "learning_rate": 5e-06, + "logits/chosen": -30177106.666666668, + "logits/rejected": -31627458.666666668, + "logps/chosen": -375.7908935546875, + "logps/rejected": -891.4656575520834, + "loss": 0.0038, + "rewards/chosen": 11.170824686686197, + "rewards/margins": 32.858350118001304, + "rewards/rejected": -21.687525431315105, + "step": 3492 + }, + { + "epoch": 0.8740147629175529, + "grad_norm": 2.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30430137.6, + "logits/rejected": -34492251.428571425, + "logps/chosen": -369.1553466796875, + "logps/rejected": -519.3116629464286, + "loss": 0.0333, + "rewards/chosen": 7.280263519287109, + "rewards/margins": 23.454499271937777, + "rewards/rejected": -16.17423575265067, + "step": 3493 + }, + { + "epoch": 0.8742649818591267, + "grad_norm": 14.875, + "kl": 13.738447189331055, + "learning_rate": 5e-06, + "logits/chosen": -87834368.0, + "logits/rejected": -67089960.72727273, + "logps/chosen": -479.5993840144231, + "logps/rejected": -517.4357244318181, + "loss": 0.0949, + "rewards/chosen": 11.719855675330528, + "rewards/margins": 26.882969356083372, + "rewards/rejected": -15.163113680752842, + "step": 3494 + }, + { + "epoch": 0.8745152008007007, + "grad_norm": 2.53125, + "kl": 5.4975104331970215, + "learning_rate": 5e-06, + "logits/chosen": -44753385.14285714, + "logits/rejected": -26740112.0, + "logps/chosen": -390.4558803013393, + "logps/rejected": -551.2099609375, + "loss": 0.0325, + "rewards/chosen": 8.827649797712054, + "rewards/margins": 25.071894182477678, + "rewards/rejected": -16.244244384765626, + "step": 3495 + }, + { + "epoch": 0.8747654197422745, + "grad_norm": 4.875, + "kl": 5.394972801208496, + "learning_rate": 5e-06, + "logits/chosen": -23658592.0, + "logits/rejected": -38754269.538461536, + "logps/chosen": -362.89200106534093, + "logps/rejected": -475.9330303485577, + "loss": 0.0113, + "rewards/chosen": 9.438323974609375, + "rewards/margins": 23.45937758225661, + "rewards/rejected": -14.021053607647236, + "step": 3496 + }, + { + "epoch": 0.8750156386838484, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52933876.36363637, + "logits/rejected": -33611803.07692308, + "logps/chosen": -319.91184303977275, + "logps/rejected": -695.7135667067307, + "loss": 0.008, + "rewards/chosen": 7.543406399813565, + "rewards/margins": 27.01269163118376, + "rewards/rejected": -19.469285231370193, + "step": 3497 + }, + { + "epoch": 0.8752658576254222, + "grad_norm": 18.375, + "kl": 5.734790802001953, + "learning_rate": 5e-06, + "logits/chosen": -58964072.72727273, + "logits/rejected": -46720585.84615385, + "logps/chosen": -409.29225852272725, + "logps/rejected": -672.2512019230769, + "loss": 0.061, + "rewards/chosen": 10.009342540394176, + "rewards/margins": 24.840697708663406, + "rewards/rejected": -14.83135516826923, + "step": 3498 + }, + { + "epoch": 0.8755160765669961, + "grad_norm": 5.5, + "kl": 14.852598190307617, + "learning_rate": 5e-06, + "logits/chosen": -43296871.384615384, + "logits/rejected": -24105266.90909091, + "logps/chosen": -419.0271183894231, + "logps/rejected": -506.16264204545456, + "loss": 0.0331, + "rewards/chosen": 10.60536898099459, + "rewards/margins": 25.504981407752403, + "rewards/rejected": -14.899612426757812, + "step": 3499 + }, + { + "epoch": 0.87576629550857, + "grad_norm": 21.875, + "kl": 27.28453826904297, + "learning_rate": 5e-06, + "logits/chosen": -39126731.294117644, + "logits/rejected": -33457817.14285714, + "logps/chosen": -359.83800551470586, + "logps/rejected": -516.1396484375, + "loss": 0.2804, + "rewards/chosen": 8.156498628504137, + "rewards/margins": 20.419150344463958, + "rewards/rejected": -12.262651715959821, + "step": 3500 + }, + { + "epoch": 0.8760165144501438, + "grad_norm": 3.53125, + "kl": 0.01238250732421875, + "learning_rate": 5e-06, + "logits/chosen": -39314901.333333336, + "logits/rejected": -69900167.1111111, + "logps/chosen": -401.12291666666664, + "logps/rejected": -793.1433919270834, + "loss": 0.0313, + "rewards/chosen": 9.82814229329427, + "rewards/margins": 31.292555406358506, + "rewards/rejected": -21.464413113064236, + "step": 3501 + }, + { + "epoch": 0.8762667333917178, + "grad_norm": 0.7421875, + "kl": 0.26519775390625, + "learning_rate": 5e-06, + "logits/chosen": -64505255.384615384, + "logits/rejected": -38148221.09090909, + "logps/chosen": -458.3414963942308, + "logps/rejected": -520.1825727982955, + "loss": 0.0157, + "rewards/chosen": 10.279526930588942, + "rewards/margins": 27.63161100374235, + "rewards/rejected": -17.35208407315341, + "step": 3502 + }, + { + "epoch": 0.8765169523332916, + "grad_norm": 20.25, + "kl": 5.348047256469727, + "learning_rate": 5e-06, + "logits/chosen": 37508404.36363637, + "logits/rejected": -45389602.461538464, + "logps/chosen": -431.95938387784093, + "logps/rejected": -620.1787860576923, + "loss": 0.0327, + "rewards/chosen": 9.256132646040482, + "rewards/margins": 30.644907811304904, + "rewards/rejected": -21.388775165264423, + "step": 3503 + }, + { + "epoch": 0.8767671712748655, + "grad_norm": 0.25, + "kl": 0.12527689337730408, + "learning_rate": 5e-06, + "logits/chosen": -40986042.666666664, + "logits/rejected": -19338796.0, + "logps/chosen": -526.5663655598959, + "logps/rejected": -789.27685546875, + "loss": 0.0004, + "rewards/chosen": 10.52834383646647, + "rewards/margins": 30.603637059529625, + "rewards/rejected": -20.075293223063152, + "step": 3504 + }, + { + "epoch": 0.8770173902164394, + "grad_norm": 1.8203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32284522.666666668, + "logits/rejected": -16291065.6, + "logps/chosen": -385.85582139756946, + "logps/rejected": -654.5609375, + "loss": 0.0126, + "rewards/chosen": 9.295000712076822, + "rewards/margins": 27.57289377848307, + "rewards/rejected": -18.27789306640625, + "step": 3505 + }, + { + "epoch": 0.8772676091580133, + "grad_norm": 2.6875, + "kl": 5.3422088623046875, + "learning_rate": 5e-06, + "logits/chosen": -46049078.85714286, + "logits/rejected": -19758204.8, + "logps/chosen": -407.06996372767856, + "logps/rejected": -439.0158203125, + "loss": 0.0077, + "rewards/chosen": 9.821343558175224, + "rewards/margins": 25.3222407749721, + "rewards/rejected": -15.500897216796876, + "step": 3506 + }, + { + "epoch": 0.8775178280995871, + "grad_norm": 6.84375, + "kl": 3.8835322856903076, + "learning_rate": 5e-06, + "logits/chosen": -55850555.07692308, + "logits/rejected": 80364218.18181819, + "logps/chosen": -431.5446213942308, + "logps/rejected": -591.2766335227273, + "loss": 0.0044, + "rewards/chosen": 12.38507314828726, + "rewards/margins": 29.922022999583426, + "rewards/rejected": -17.536949851296164, + "step": 3507 + }, + { + "epoch": 0.8777680470411611, + "grad_norm": 5.5, + "kl": 0.9364904165267944, + "learning_rate": 5e-06, + "logits/chosen": -42539872.0, + "logits/rejected": -18547186.285714287, + "logps/chosen": -446.951611328125, + "logps/rejected": -405.44754464285717, + "loss": 0.0034, + "rewards/chosen": 10.959182739257812, + "rewards/margins": 23.51131875174386, + "rewards/rejected": -12.552136012486049, + "step": 3508 + }, + { + "epoch": 0.8780182659827349, + "grad_norm": 9.625, + "kl": 3.875466823577881, + "learning_rate": 5e-06, + "logits/chosen": -47632085.333333336, + "logits/rejected": -42960853.333333336, + "logps/chosen": -407.4012044270833, + "logps/rejected": -695.9214680989584, + "loss": 0.0313, + "rewards/chosen": 9.362682342529297, + "rewards/margins": 25.973944346110027, + "rewards/rejected": -16.61126200358073, + "step": 3509 + }, + { + "epoch": 0.8782684849243088, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22777596.444444444, + "logits/rejected": -55068125.86666667, + "logps/chosen": -395.72089301215277, + "logps/rejected": -720.6970052083333, + "loss": 0.0335, + "rewards/chosen": 10.574483235677084, + "rewards/margins": 29.378271484375, + "rewards/rejected": -18.803788248697916, + "step": 3510 + }, + { + "epoch": 0.8785187038658826, + "grad_norm": 9.1875, + "kl": 6.7372355461120605, + "learning_rate": 5e-06, + "logits/chosen": -64550128.0, + "logits/rejected": -17351972.0, + "logps/chosen": -512.9631958007812, + "logps/rejected": -474.2779235839844, + "loss": 0.0146, + "rewards/chosen": 11.224201202392578, + "rewards/margins": 27.438823699951172, + "rewards/rejected": -16.214622497558594, + "step": 3511 + }, + { + "epoch": 0.8787689228074566, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53243881.6, + "logits/rejected": -7391868.0, + "logps/chosen": -438.39765625, + "logps/rejected": -726.9963030133929, + "loss": 0.0225, + "rewards/chosen": 10.342338562011719, + "rewards/margins": 25.175048828125, + "rewards/rejected": -14.832710266113281, + "step": 3512 + }, + { + "epoch": 0.8790191417490304, + "grad_norm": 3.4375, + "kl": 4.361793041229248, + "learning_rate": 5e-06, + "logits/chosen": -50016885.333333336, + "logits/rejected": -32127608.0, + "logps/chosen": -329.2644449869792, + "logps/rejected": -565.9168701171875, + "loss": 0.0163, + "rewards/chosen": 6.939074198404948, + "rewards/margins": 26.60351816813151, + "rewards/rejected": -19.664443969726562, + "step": 3513 + }, + { + "epoch": 0.8792693606906042, + "grad_norm": 0.9296875, + "kl": 3.7996115684509277, + "learning_rate": 5e-06, + "logits/chosen": -18059968.0, + "logits/rejected": -32363150.545454547, + "logps/chosen": -366.8538161057692, + "logps/rejected": -578.0814541903409, + "loss": 0.0227, + "rewards/chosen": 7.9247612586388225, + "rewards/margins": 23.690773277015953, + "rewards/rejected": -15.76601201837713, + "step": 3514 + }, + { + "epoch": 0.8795195796321782, + "grad_norm": 5.5, + "kl": 23.02729606628418, + "learning_rate": 5e-06, + "logits/chosen": -26394752.0, + "logits/rejected": -56730240.0, + "logps/chosen": -518.8658272879464, + "logps/rejected": -791.687451171875, + "loss": 0.0095, + "rewards/chosen": 11.83224596296038, + "rewards/margins": 34.590635245186945, + "rewards/rejected": -22.758389282226563, + "step": 3515 + }, + { + "epoch": 0.879769798573752, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39892590.54545455, + "logits/rejected": -53648802.461538464, + "logps/chosen": -451.50577059659093, + "logps/rejected": -789.908203125, + "loss": 0.046, + "rewards/chosen": 7.973960876464844, + "rewards/margins": 34.760912968562195, + "rewards/rejected": -26.786952092097355, + "step": 3516 + }, + { + "epoch": 0.8800200175153259, + "grad_norm": 0.65234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48603776.0, + "logits/rejected": -40172689.45454545, + "logps/chosen": -444.33687650240387, + "logps/rejected": -568.6198508522727, + "loss": 0.0047, + "rewards/chosen": 10.5233400785006, + "rewards/margins": 29.807641836313103, + "rewards/rejected": -19.2843017578125, + "step": 3517 + }, + { + "epoch": 0.8802702364568998, + "grad_norm": 2.140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -64918848.0, + "logits/rejected": -57443345.45454545, + "logps/chosen": -423.8599384014423, + "logps/rejected": -683.4215198863636, + "loss": 0.0236, + "rewards/chosen": 8.508799039400541, + "rewards/margins": 29.895269487287614, + "rewards/rejected": -21.386470447887074, + "step": 3518 + }, + { + "epoch": 0.8805204553984737, + "grad_norm": 4.40625, + "kl": 6.827731132507324, + "learning_rate": 5e-06, + "logits/chosen": -51399088.0, + "logits/rejected": -46543749.333333336, + "logps/chosen": -443.26904296875, + "logps/rejected": -540.4437255859375, + "loss": 0.048, + "rewards/chosen": 9.395888010660807, + "rewards/margins": 28.04160181681315, + "rewards/rejected": -18.645713806152344, + "step": 3519 + }, + { + "epoch": 0.8807706743400475, + "grad_norm": 1.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49925192.0, + "logits/rejected": -42542104.0, + "logps/chosen": -425.741455078125, + "logps/rejected": -565.3547973632812, + "loss": 0.0301, + "rewards/chosen": 9.724347114562988, + "rewards/margins": 25.833613395690918, + "rewards/rejected": -16.10926628112793, + "step": 3520 + }, + { + "epoch": 0.8810208932816214, + "grad_norm": 1.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42885288.0, + "logits/rejected": -45824368.0, + "logps/chosen": -340.09661865234375, + "logps/rejected": -695.1806030273438, + "loss": 0.0056, + "rewards/chosen": 9.08309268951416, + "rewards/margins": 32.04166507720947, + "rewards/rejected": -22.958572387695312, + "step": 3521 + }, + { + "epoch": 0.8812711122231953, + "grad_norm": 2.859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20576410.666666668, + "logits/rejected": -43797282.666666664, + "logps/chosen": -430.1531982421875, + "logps/rejected": -861.8927408854166, + "loss": 0.0414, + "rewards/chosen": 8.912598927815756, + "rewards/margins": 35.784036000569664, + "rewards/rejected": -26.871437072753906, + "step": 3522 + }, + { + "epoch": 0.8815213311647692, + "grad_norm": 11.0, + "kl": 2.283547878265381, + "learning_rate": 5e-06, + "logits/chosen": -18514584.0, + "logits/rejected": -51793033.14285714, + "logps/chosen": -368.028857421875, + "logps/rejected": -529.9784458705357, + "loss": 0.052, + "rewards/chosen": 8.050103759765625, + "rewards/margins": 24.89565756661551, + "rewards/rejected": -16.84555380684989, + "step": 3523 + }, + { + "epoch": 0.881771550106343, + "grad_norm": 4.03125, + "kl": 2.735687255859375, + "learning_rate": 5e-06, + "logits/chosen": -60683537.06666667, + "logits/rejected": -25922286.222222224, + "logps/chosen": -280.44306640625, + "logps/rejected": -628.4701063368055, + "loss": 0.0196, + "rewards/chosen": 7.375937906901042, + "rewards/margins": 23.75389472113715, + "rewards/rejected": -16.37795681423611, + "step": 3524 + }, + { + "epoch": 0.882021769047917, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19665780.363636363, + "logits/rejected": -50662971.07692308, + "logps/chosen": -337.1357421875, + "logps/rejected": -738.5169020432693, + "loss": 0.0216, + "rewards/chosen": 8.446251609108664, + "rewards/margins": 27.382603678669962, + "rewards/rejected": -18.9363520695613, + "step": 3525 + }, + { + "epoch": 0.8822719879894908, + "grad_norm": 1.96875, + "kl": 2.773669719696045, + "learning_rate": 5e-06, + "logits/chosen": -45895060.36363637, + "logits/rejected": -30374545.230769232, + "logps/chosen": -380.40047940340907, + "logps/rejected": -613.1191030649038, + "loss": 0.0028, + "rewards/chosen": 7.972742254083807, + "rewards/margins": 30.488106680916736, + "rewards/rejected": -22.51536442683293, + "step": 3526 + }, + { + "epoch": 0.8825222069310646, + "grad_norm": 11.0, + "kl": 11.179986953735352, + "learning_rate": 5e-06, + "logits/chosen": -39017080.47058824, + "logits/rejected": 5681417.142857143, + "logps/chosen": -395.7303251378676, + "logps/rejected": -808.7589285714286, + "loss": 0.0566, + "rewards/chosen": 9.184193330652574, + "rewards/margins": 34.77713025517824, + "rewards/rejected": -25.59293692452567, + "step": 3527 + }, + { + "epoch": 0.8827724258726386, + "grad_norm": 8.625, + "kl": 12.569405555725098, + "learning_rate": 5e-06, + "logits/chosen": -41951995.07692308, + "logits/rejected": -52011659.63636363, + "logps/chosen": -350.5563777043269, + "logps/rejected": -626.0881569602273, + "loss": 0.0356, + "rewards/chosen": 9.90627699631911, + "rewards/margins": 24.86199460329709, + "rewards/rejected": -14.955717606977982, + "step": 3528 + }, + { + "epoch": 0.8830226448142124, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63248659.692307696, + "logits/rejected": -40183808.0, + "logps/chosen": -396.86868990384613, + "logps/rejected": -861.9009232954545, + "loss": 0.0316, + "rewards/chosen": 8.992201585036058, + "rewards/margins": 33.6723357514068, + "rewards/rejected": -24.68013416637074, + "step": 3529 + }, + { + "epoch": 0.8832728637557863, + "grad_norm": 1.921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24235644.444444444, + "logits/rejected": -48798442.666666664, + "logps/chosen": -405.89794921875, + "logps/rejected": -753.6374348958333, + "loss": 0.0181, + "rewards/chosen": 9.289745754665798, + "rewards/margins": 30.007999335394963, + "rewards/rejected": -20.718253580729165, + "step": 3530 + }, + { + "epoch": 0.8835230826973602, + "grad_norm": 1.0390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -69412869.81818181, + "logits/rejected": -61747584.0, + "logps/chosen": -339.38077059659093, + "logps/rejected": -760.5519831730769, + "loss": 0.0029, + "rewards/chosen": 8.985397338867188, + "rewards/margins": 32.52979102501502, + "rewards/rejected": -23.544393686147835, + "step": 3531 + }, + { + "epoch": 0.8837733016389341, + "grad_norm": 11.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20123254.153846152, + "logits/rejected": -43051490.90909091, + "logps/chosen": -351.21987680288464, + "logps/rejected": -685.9308860085227, + "loss": 0.0298, + "rewards/chosen": 8.055490347055288, + "rewards/margins": 27.828644865876313, + "rewards/rejected": -19.773154518821023, + "step": 3532 + }, + { + "epoch": 0.8840235205805079, + "grad_norm": 0.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35598186.666666664, + "logits/rejected": -48570629.333333336, + "logps/chosen": -507.4412434895833, + "logps/rejected": -647.8389485677084, + "loss": 0.0025, + "rewards/chosen": 11.869539896647135, + "rewards/margins": 28.91759490966797, + "rewards/rejected": -17.048055013020832, + "step": 3533 + }, + { + "epoch": 0.8842737395220818, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30526798.0, + "logits/rejected": -30781508.0, + "logps/chosen": -360.6224365234375, + "logps/rejected": -460.099365234375, + "loss": 0.0595, + "rewards/chosen": 9.262928009033203, + "rewards/margins": 20.12716293334961, + "rewards/rejected": -10.864234924316406, + "step": 3534 + }, + { + "epoch": 0.8845239584636557, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -83971916.8, + "logits/rejected": -62708585.14285714, + "logps/chosen": -368.1653076171875, + "logps/rejected": -715.0520368303571, + "loss": 0.0453, + "rewards/chosen": 9.799618530273438, + "rewards/margins": 27.228944178989956, + "rewards/rejected": -17.429325648716517, + "step": 3535 + }, + { + "epoch": 0.8847741774052296, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 96781124.26666667, + "logits/rejected": -56679288.88888889, + "logps/chosen": -363.45813802083336, + "logps/rejected": -670.8628472222222, + "loss": 0.0183, + "rewards/chosen": 8.462307230631511, + "rewards/margins": 27.609039137098527, + "rewards/rejected": -19.146731906467014, + "step": 3536 + }, + { + "epoch": 0.8850243963468034, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20385854.222222224, + "logits/rejected": -53205521.06666667, + "logps/chosen": -347.93402777777777, + "logps/rejected": -600.1393880208333, + "loss": 0.0163, + "rewards/chosen": 8.680105421278212, + "rewards/margins": 25.979535759819875, + "rewards/rejected": -17.299430338541665, + "step": 3537 + }, + { + "epoch": 0.8852746152883774, + "grad_norm": 13.4375, + "kl": 0.5519479513168335, + "learning_rate": 5e-06, + "logits/chosen": -39253520.0, + "logits/rejected": -51883792.0, + "logps/chosen": -343.6383463541667, + "logps/rejected": -595.2027994791666, + "loss": 0.0467, + "rewards/chosen": 7.444177627563477, + "rewards/margins": 23.721469243367512, + "rewards/rejected": -16.277291615804035, + "step": 3538 + }, + { + "epoch": 0.8855248342299512, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42127385.6, + "logits/rejected": -84539235.55555555, + "logps/chosen": -285.89026692708336, + "logps/rejected": -841.9453667534722, + "loss": 0.0469, + "rewards/chosen": 7.388307189941406, + "rewards/margins": 26.546664598253038, + "rewards/rejected": -19.15835740831163, + "step": 3539 + }, + { + "epoch": 0.885775053171525, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12394200.0, + "logits/rejected": -41413717.333333336, + "logps/chosen": -234.55257161458334, + "logps/rejected": -639.273193359375, + "loss": 0.0622, + "rewards/chosen": 6.5384572347005205, + "rewards/margins": 24.415119171142578, + "rewards/rejected": -17.87666193644206, + "step": 3540 + }, + { + "epoch": 0.886025272113099, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41635814.4, + "logits/rejected": 47349489.777777776, + "logps/chosen": -347.8814453125, + "logps/rejected": -865.4913194444445, + "loss": 0.0444, + "rewards/chosen": 9.341150919596354, + "rewards/margins": 35.84440782335069, + "rewards/rejected": -26.50325690375434, + "step": 3541 + }, + { + "epoch": 0.8862754910546728, + "grad_norm": 1.53125, + "kl": 0.7327525019645691, + "learning_rate": 5e-06, + "logits/chosen": -45679222.85714286, + "logits/rejected": -48611312.0, + "logps/chosen": -455.0830775669643, + "logps/rejected": -745.722314453125, + "loss": 0.0049, + "rewards/chosen": 9.598833356584821, + "rewards/margins": 29.53981083461216, + "rewards/rejected": -19.940977478027342, + "step": 3542 + }, + { + "epoch": 0.8865257099962467, + "grad_norm": 3.734375, + "kl": 11.181112289428711, + "learning_rate": 5e-06, + "logits/chosen": 1327896.0, + "logits/rejected": -50682245.81818182, + "logps/chosen": -469.84033203125, + "logps/rejected": -636.2815163352273, + "loss": 0.0835, + "rewards/chosen": 9.44761481651893, + "rewards/margins": 26.801549364636827, + "rewards/rejected": -17.3539345481179, + "step": 3543 + }, + { + "epoch": 0.8867759289378206, + "grad_norm": 5.03125, + "kl": 6.331469535827637, + "learning_rate": 5e-06, + "logits/chosen": -39545188.571428575, + "logits/rejected": -26938828.8, + "logps/chosen": -308.0029296875, + "logps/rejected": -528.78740234375, + "loss": 0.0408, + "rewards/chosen": 9.005165100097656, + "rewards/margins": 26.53498077392578, + "rewards/rejected": -17.529815673828125, + "step": 3544 + }, + { + "epoch": 0.8870261478793945, + "grad_norm": 25.5, + "kl": 2.7045936584472656, + "learning_rate": 5e-06, + "logits/chosen": -16624669.333333334, + "logits/rejected": -40340477.333333336, + "logps/chosen": -354.5863037109375, + "logps/rejected": -678.650390625, + "loss": 0.094, + "rewards/chosen": 7.201658248901367, + "rewards/margins": 24.39222780863444, + "rewards/rejected": -17.190569559733074, + "step": 3545 + }, + { + "epoch": 0.8872763668209683, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43928469.333333336, + "logits/rejected": -64286490.666666664, + "logps/chosen": -418.9471842447917, + "logps/rejected": -750.6517740885416, + "loss": 0.0283, + "rewards/chosen": 9.349239349365234, + "rewards/margins": 34.52497227986653, + "rewards/rejected": -25.1757329305013, + "step": 3546 + }, + { + "epoch": 0.8875265857625422, + "grad_norm": 15.0625, + "kl": 8.711920738220215, + "learning_rate": 5e-06, + "logits/chosen": -46442736.0, + "logits/rejected": -6817747.333333333, + "logps/chosen": -373.9108072916667, + "logps/rejected": -582.3116861979166, + "loss": 0.0337, + "rewards/chosen": 8.72821299235026, + "rewards/margins": 23.87250264485677, + "rewards/rejected": -15.14428965250651, + "step": 3547 + }, + { + "epoch": 0.8877768047041161, + "grad_norm": 4.21875, + "kl": 2.5194449424743652, + "learning_rate": 5e-06, + "logits/chosen": -34197618.666666664, + "logits/rejected": -55995338.666666664, + "logps/chosen": -332.61171468098956, + "logps/rejected": -837.3741861979166, + "loss": 0.0124, + "rewards/chosen": 9.442577362060547, + "rewards/margins": 31.84194819132487, + "rewards/rejected": -22.399370829264324, + "step": 3548 + }, + { + "epoch": 0.88802702364569, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20156258.90909091, + "logits/rejected": -44403633.23076923, + "logps/chosen": -340.86661044034093, + "logps/rejected": -470.1190655048077, + "loss": 0.0191, + "rewards/chosen": 8.664235201748935, + "rewards/margins": 21.325013540841482, + "rewards/rejected": -12.660778339092548, + "step": 3549 + }, + { + "epoch": 0.8882772425872638, + "grad_norm": 7.84375, + "kl": 17.180910110473633, + "learning_rate": 5e-06, + "logits/chosen": -29982848.0, + "logits/rejected": -86504721.45454545, + "logps/chosen": -279.7267503004808, + "logps/rejected": -479.09419389204544, + "loss": 0.0581, + "rewards/chosen": 7.192154517540565, + "rewards/margins": 21.77524001114852, + "rewards/rejected": -14.583085493607955, + "step": 3550 + }, + { + "epoch": 0.8885274615288378, + "grad_norm": 14.8125, + "kl": 18.139162063598633, + "learning_rate": 5e-06, + "logits/chosen": -7143572.0, + "logits/rejected": -57583088.0, + "logps/chosen": -487.1918029785156, + "logps/rejected": -633.14794921875, + "loss": 0.0577, + "rewards/chosen": 10.979626655578613, + "rewards/margins": 28.057339668273926, + "rewards/rejected": -17.077713012695312, + "step": 3551 + }, + { + "epoch": 0.8887776804704116, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53758528.0, + "logits/rejected": -56177733.81818182, + "logps/chosen": -363.1078350360577, + "logps/rejected": -664.7646484375, + "loss": 0.0224, + "rewards/chosen": 9.80609365609976, + "rewards/margins": 26.052277571671496, + "rewards/rejected": -16.246183915571734, + "step": 3552 + }, + { + "epoch": 0.8890278994119855, + "grad_norm": 8.5625, + "kl": 31.09537696838379, + "learning_rate": 5e-06, + "logits/chosen": -55373048.0, + "logits/rejected": -34496304.0, + "logps/chosen": -459.2705078125, + "logps/rejected": -495.7012634277344, + "loss": 0.028, + "rewards/chosen": 11.879775047302246, + "rewards/margins": 24.93155002593994, + "rewards/rejected": -13.051774978637695, + "step": 3553 + }, + { + "epoch": 0.8892781183535594, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3185761.5, + "logits/rejected": 2810681.0, + "logps/chosen": -312.50732421875, + "logps/rejected": -653.3818969726562, + "loss": 0.0183, + "rewards/chosen": 7.304194927215576, + "rewards/margins": 25.907958507537842, + "rewards/rejected": -18.603763580322266, + "step": 3554 + }, + { + "epoch": 0.8895283372951333, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 100725650.28571428, + "logits/rejected": -40439401.6, + "logps/chosen": -331.2613002232143, + "logps/rejected": -714.6546875, + "loss": 0.0283, + "rewards/chosen": 8.577002934047155, + "rewards/margins": 23.27026312691825, + "rewards/rejected": -14.693260192871094, + "step": 3555 + }, + { + "epoch": 0.8897785562367071, + "grad_norm": 24.625, + "kl": 9.007562637329102, + "learning_rate": 5e-06, + "logits/chosen": -49211048.0, + "logits/rejected": -60229344.0, + "logps/chosen": -506.52044677734375, + "logps/rejected": -826.952392578125, + "loss": 0.0216, + "rewards/chosen": 13.77773380279541, + "rewards/margins": 33.69819355010986, + "rewards/rejected": -19.920459747314453, + "step": 3556 + }, + { + "epoch": 0.890028775178281, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27839630.545454547, + "logits/rejected": -60112452.92307692, + "logps/chosen": -361.5335138494318, + "logps/rejected": -689.3671875, + "loss": 0.036, + "rewards/chosen": 9.827920393510299, + "rewards/margins": 26.64744178398506, + "rewards/rejected": -16.81952139047476, + "step": 3557 + }, + { + "epoch": 0.8902789941198549, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41981043.2, + "logits/rejected": 29721277.714285713, + "logps/chosen": -326.7423828125, + "logps/rejected": -613.1862444196429, + "loss": 0.0722, + "rewards/chosen": 7.103857421875, + "rewards/margins": 21.988111877441405, + "rewards/rejected": -14.884254455566406, + "step": 3558 + }, + { + "epoch": 0.8905292130614287, + "grad_norm": 3.296875, + "kl": 16.763235092163086, + "learning_rate": 5e-06, + "logits/chosen": -37970861.71428572, + "logits/rejected": -48721548.8, + "logps/chosen": -318.97792271205356, + "logps/rejected": -733.8240234375, + "loss": 0.0205, + "rewards/chosen": 9.213544573102679, + "rewards/margins": 30.55724269321987, + "rewards/rejected": -21.34369812011719, + "step": 3559 + }, + { + "epoch": 0.8907794320030026, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39386720.0, + "logits/rejected": -24556263.384615384, + "logps/chosen": -439.65083451704544, + "logps/rejected": -691.5878155048077, + "loss": 0.0295, + "rewards/chosen": 10.615115772594105, + "rewards/margins": 27.964985720761174, + "rewards/rejected": -17.34986994816707, + "step": 3560 + }, + { + "epoch": 0.8910296509445765, + "grad_norm": 5.8125, + "kl": 11.556530952453613, + "learning_rate": 5e-06, + "logits/chosen": -44487637.333333336, + "logits/rejected": -41096234.666666664, + "logps/chosen": -378.9966145833333, + "logps/rejected": -513.0971137152778, + "loss": 0.0847, + "rewards/chosen": 8.752685546875, + "rewards/margins": 21.65960015190972, + "rewards/rejected": -12.906914605034721, + "step": 3561 + }, + { + "epoch": 0.8912798698861504, + "grad_norm": 1.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35319248.0, + "logits/rejected": -48383016.0, + "logps/chosen": -411.6524658203125, + "logps/rejected": -601.8757934570312, + "loss": 0.0146, + "rewards/chosen": 10.573699951171875, + "rewards/margins": 29.511112213134766, + "rewards/rejected": -18.93741226196289, + "step": 3562 + }, + { + "epoch": 0.8915300888277242, + "grad_norm": 0.78125, + "kl": 12.529175758361816, + "learning_rate": 5e-06, + "logits/chosen": -34441112.0, + "logits/rejected": -50513808.0, + "logps/chosen": -394.88262939453125, + "logps/rejected": -753.8955078125, + "loss": 0.0368, + "rewards/chosen": 10.936822891235352, + "rewards/margins": 29.312332153320312, + "rewards/rejected": -18.37550926208496, + "step": 3563 + }, + { + "epoch": 0.8917803077692982, + "grad_norm": 9.75, + "kl": 5.645042419433594, + "learning_rate": 5e-06, + "logits/chosen": -90000665.6, + "logits/rejected": -73292814.22222222, + "logps/chosen": -316.68834635416664, + "logps/rejected": -689.0061306423611, + "loss": 0.0605, + "rewards/chosen": 7.524369303385416, + "rewards/margins": 28.736654663085936, + "rewards/rejected": -21.21228535970052, + "step": 3564 + }, + { + "epoch": 0.892030526710872, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57538554.18181818, + "logits/rejected": -50735606.15384615, + "logps/chosen": -433.54545454545456, + "logps/rejected": -576.8485576923077, + "loss": 0.0165, + "rewards/chosen": 9.260657570578836, + "rewards/margins": 24.33347838075011, + "rewards/rejected": -15.072820810171274, + "step": 3565 + }, + { + "epoch": 0.8922807456524459, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48765479.384615384, + "logits/rejected": -26493460.363636363, + "logps/chosen": -471.4169170673077, + "logps/rejected": -756.3002485795455, + "loss": 0.0264, + "rewards/chosen": 11.327290461613583, + "rewards/margins": 31.671401123900516, + "rewards/rejected": -20.344110662286933, + "step": 3566 + }, + { + "epoch": 0.8925309645940198, + "grad_norm": 10.6875, + "kl": 0.8661238551139832, + "learning_rate": 5e-06, + "logits/chosen": -43011545.6, + "logits/rejected": -69104333.71428572, + "logps/chosen": -395.7413330078125, + "logps/rejected": -597.3536551339286, + "loss": 0.0272, + "rewards/chosen": 13.063336181640626, + "rewards/margins": 34.13588082449777, + "rewards/rejected": -21.072544642857142, + "step": 3567 + }, + { + "epoch": 0.8927811835355937, + "grad_norm": 0.326171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39511264.0, + "logits/rejected": -70710256.0, + "logps/chosen": -516.6149291992188, + "logps/rejected": -776.2039184570312, + "loss": 0.0004, + "rewards/chosen": 11.192928314208984, + "rewards/margins": 42.36050605773926, + "rewards/rejected": -31.167577743530273, + "step": 3568 + }, + { + "epoch": 0.8930314024771675, + "grad_norm": 14.25, + "kl": 8.936076164245605, + "learning_rate": 5e-06, + "logits/chosen": -20995629.714285713, + "logits/rejected": -4384881.2, + "logps/chosen": -499.23733956473217, + "logps/rejected": -761.3767578125, + "loss": 0.0212, + "rewards/chosen": 11.105767386300224, + "rewards/margins": 37.356707327706474, + "rewards/rejected": -26.25093994140625, + "step": 3569 + }, + { + "epoch": 0.8932816214187413, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29882135.272727273, + "logits/rejected": -77923224.61538461, + "logps/chosen": -323.5667613636364, + "logps/rejected": -657.11328125, + "loss": 0.0203, + "rewards/chosen": 8.35981542413885, + "rewards/margins": 29.99526182588164, + "rewards/rejected": -21.63544640174279, + "step": 3570 + }, + { + "epoch": 0.8935318403603153, + "grad_norm": 1.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42454236.8, + "logits/rejected": -31103620.57142857, + "logps/chosen": -406.72548828125, + "logps/rejected": -761.6640625, + "loss": 0.0096, + "rewards/chosen": 9.425423431396485, + "rewards/margins": 32.22746244158064, + "rewards/rejected": -22.802039010184153, + "step": 3571 + }, + { + "epoch": 0.8937820593018891, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -82150376.72727273, + "logits/rejected": -64283421.538461536, + "logps/chosen": -489.23655007102275, + "logps/rejected": -552.8284254807693, + "loss": 0.0407, + "rewards/chosen": 8.74586209383878, + "rewards/margins": 26.95527371493253, + "rewards/rejected": -18.20941162109375, + "step": 3572 + }, + { + "epoch": 0.894032278243463, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38546268.0, + "logits/rejected": -58857044.0, + "logps/chosen": -389.99212646484375, + "logps/rejected": -955.5507202148438, + "loss": 0.0109, + "rewards/chosen": 9.822789192199707, + "rewards/margins": 35.163357734680176, + "rewards/rejected": -25.34056854248047, + "step": 3573 + }, + { + "epoch": 0.8942824971850369, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42028868.92307692, + "logits/rejected": -60457873.45454545, + "logps/chosen": -288.2484600360577, + "logps/rejected": -868.5490944602273, + "loss": 0.0416, + "rewards/chosen": 7.456160912146935, + "rewards/margins": 34.21430078253046, + "rewards/rejected": -26.758139870383523, + "step": 3574 + }, + { + "epoch": 0.8945327161266108, + "grad_norm": 39.5, + "kl": 2.4006075859069824, + "learning_rate": 5e-06, + "logits/chosen": -37943563.294117644, + "logits/rejected": -67948278.85714285, + "logps/chosen": -330.2010282628676, + "logps/rejected": -942.3878348214286, + "loss": 0.0512, + "rewards/chosen": 8.659520766314339, + "rewards/margins": 39.58336197027639, + "rewards/rejected": -30.923841203962052, + "step": 3575 + }, + { + "epoch": 0.8947829350681846, + "grad_norm": 1.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65529949.09090909, + "logits/rejected": -55677528.615384616, + "logps/chosen": -417.38041548295456, + "logps/rejected": -721.0374098557693, + "loss": 0.0038, + "rewards/chosen": 9.235323125665838, + "rewards/margins": 33.71860184035935, + "rewards/rejected": -24.48327871469351, + "step": 3576 + }, + { + "epoch": 0.8950331540097586, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35827665.23076923, + "logits/rejected": -20044024.727272727, + "logps/chosen": -326.26476111778845, + "logps/rejected": -913.6676136363636, + "loss": 0.0198, + "rewards/chosen": 9.159064659705528, + "rewards/margins": 39.16170026872541, + "rewards/rejected": -30.002635609019887, + "step": 3577 + }, + { + "epoch": 0.8952833729513324, + "grad_norm": 0.69140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40076157.09090909, + "logits/rejected": -33894230.15384615, + "logps/chosen": -497.5905095880682, + "logps/rejected": -931.4921123798077, + "loss": 0.012, + "rewards/chosen": 9.310053045099432, + "rewards/margins": 39.167453792545345, + "rewards/rejected": -29.857400747445915, + "step": 3578 + }, + { + "epoch": 0.8955335918929063, + "grad_norm": 2.0, + "kl": 6.568772792816162, + "learning_rate": 5e-06, + "logits/chosen": -50336547.55555555, + "logits/rejected": -63288506.666666664, + "logps/chosen": -393.36089409722223, + "logps/rejected": -699.6407877604166, + "loss": 0.0245, + "rewards/chosen": 9.618316650390625, + "rewards/margins": 34.642303466796875, + "rewards/rejected": -25.02398681640625, + "step": 3579 + }, + { + "epoch": 0.8957838108344802, + "grad_norm": 4.03125, + "kl": 4.129493236541748, + "learning_rate": 5e-06, + "logits/chosen": -45677894.4, + "logits/rejected": -70438509.71428572, + "logps/chosen": -387.7941162109375, + "logps/rejected": -742.3577008928571, + "loss": 0.0255, + "rewards/chosen": 8.383448791503906, + "rewards/margins": 33.648493957519534, + "rewards/rejected": -25.265045166015625, + "step": 3580 + }, + { + "epoch": 0.8960340297760541, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24682363.42857143, + "logits/rejected": -60028320.0, + "logps/chosen": -314.90164620535717, + "logps/rejected": -712.51005859375, + "loss": 0.0827, + "rewards/chosen": 7.976354326520648, + "rewards/margins": 27.764229692731583, + "rewards/rejected": -19.787875366210937, + "step": 3581 + }, + { + "epoch": 0.8962842487176279, + "grad_norm": 5.625, + "kl": 1.534576416015625, + "learning_rate": 5e-06, + "logits/chosen": -27526957.333333332, + "logits/rejected": -45549317.333333336, + "logps/chosen": -326.50661214192706, + "logps/rejected": -665.3781331380209, + "loss": 0.0389, + "rewards/chosen": 9.024898529052734, + "rewards/margins": 29.259749094645183, + "rewards/rejected": -20.23485056559245, + "step": 3582 + }, + { + "epoch": 0.8965344676592018, + "grad_norm": 19.25, + "kl": 12.660791397094727, + "learning_rate": 5e-06, + "logits/chosen": -39787291.428571425, + "logits/rejected": -68445964.8, + "logps/chosen": -455.52308872767856, + "logps/rejected": -910.52529296875, + "loss": 0.0194, + "rewards/chosen": 9.003792354038783, + "rewards/margins": 40.383705684116904, + "rewards/rejected": -31.379913330078125, + "step": 3583 + }, + { + "epoch": 0.8967846866007757, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47908964.571428575, + "logits/rejected": -68801305.6, + "logps/chosen": -394.45689174107144, + "logps/rejected": -709.46591796875, + "loss": 0.0098, + "rewards/chosen": 7.821954454694476, + "rewards/margins": 29.60352488926479, + "rewards/rejected": -21.781570434570312, + "step": 3584 + }, + { + "epoch": 0.8970349055423495, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20332908.307692308, + "logits/rejected": -64738897.45454545, + "logps/chosen": -291.37503756009613, + "logps/rejected": -594.1895419034091, + "loss": 0.0366, + "rewards/chosen": 7.813441936786358, + "rewards/margins": 26.79250132954204, + "rewards/rejected": -18.979059392755683, + "step": 3585 + }, + { + "epoch": 0.8972851244839234, + "grad_norm": 1.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29363378.666666668, + "logits/rejected": -66764000.0, + "logps/chosen": -345.7313639322917, + "logps/rejected": -869.4364420572916, + "loss": 0.0187, + "rewards/chosen": 9.394649505615234, + "rewards/margins": 34.39015324910481, + "rewards/rejected": -24.995503743489582, + "step": 3586 + }, + { + "epoch": 0.8975353434254973, + "grad_norm": 3.203125, + "kl": 1.6825002431869507, + "learning_rate": 5e-06, + "logits/chosen": -21098100.363636363, + "logits/rejected": 52558508.307692304, + "logps/chosen": -415.849609375, + "logps/rejected": -396.3076171875, + "loss": 0.0509, + "rewards/chosen": 5.612681302157315, + "rewards/margins": 20.47586865191693, + "rewards/rejected": -14.863187349759615, + "step": 3587 + }, + { + "epoch": 0.8977855623670712, + "grad_norm": 1.6640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36833483.63636363, + "logits/rejected": -37698227.692307696, + "logps/chosen": -515.1987748579545, + "logps/rejected": -506.27163461538464, + "loss": 0.003, + "rewards/chosen": 7.8318398215553975, + "rewards/margins": 25.51888323163653, + "rewards/rejected": -17.68704341008113, + "step": 3588 + }, + { + "epoch": 0.898035781308645, + "grad_norm": 2.09375, + "kl": 2.24967360496521, + "learning_rate": 5e-06, + "logits/chosen": -63733666.90909091, + "logits/rejected": -58205312.0, + "logps/chosen": -333.69247159090907, + "logps/rejected": -712.125, + "loss": 0.0535, + "rewards/chosen": 8.564024491743607, + "rewards/margins": 35.431791825727984, + "rewards/rejected": -26.867767333984375, + "step": 3589 + }, + { + "epoch": 0.898286000250219, + "grad_norm": 2.171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36968456.72727273, + "logits/rejected": -48292470.15384615, + "logps/chosen": -331.12051669034093, + "logps/rejected": -718.1705228365385, + "loss": 0.0432, + "rewards/chosen": 10.336235046386719, + "rewards/margins": 33.70757293701172, + "rewards/rejected": -23.371337890625, + "step": 3590 + }, + { + "epoch": 0.8985362191917928, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48777437.09090909, + "logits/rejected": -52432630.15384615, + "logps/chosen": -436.1507457386364, + "logps/rejected": -638.8423978365385, + "loss": 0.0144, + "rewards/chosen": 8.619108720259232, + "rewards/margins": 28.07390514453808, + "rewards/rejected": -19.454796424278847, + "step": 3591 + }, + { + "epoch": 0.8987864381333667, + "grad_norm": 2.78125, + "kl": 4.412869453430176, + "learning_rate": 5e-06, + "logits/chosen": -37410978.90909091, + "logits/rejected": -45998050.461538464, + "logps/chosen": -371.9358575994318, + "logps/rejected": -709.0854867788462, + "loss": 0.0106, + "rewards/chosen": 9.744924371892756, + "rewards/margins": 27.605226636766552, + "rewards/rejected": -17.8603022648738, + "step": 3592 + }, + { + "epoch": 0.8990366570749406, + "grad_norm": 4.40625, + "kl": 1.3186264038085938, + "learning_rate": 5e-06, + "logits/chosen": -30307997.333333332, + "logits/rejected": -55313328.0, + "logps/chosen": -453.0516764322917, + "logps/rejected": -718.5865071614584, + "loss": 0.0064, + "rewards/chosen": 11.269307454427084, + "rewards/margins": 31.798019409179688, + "rewards/rejected": -20.528711954752605, + "step": 3593 + }, + { + "epoch": 0.8992868760165145, + "grad_norm": 7.34375, + "kl": 26.341228485107422, + "learning_rate": 5e-06, + "logits/chosen": -43001449.4117647, + "logits/rejected": -77435117.71428572, + "logps/chosen": -420.95812270220586, + "logps/rejected": -828.7267020089286, + "loss": 0.0592, + "rewards/chosen": 10.025647331686582, + "rewards/margins": 30.83181929387966, + "rewards/rejected": -20.80617196219308, + "step": 3594 + }, + { + "epoch": 0.8995370949580883, + "grad_norm": 1.0859375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35851840.0, + "logits/rejected": -52953745.45454545, + "logps/chosen": -353.4519230769231, + "logps/rejected": -652.9990234375, + "loss": 0.0207, + "rewards/chosen": 10.44244854266827, + "rewards/margins": 34.528566373811735, + "rewards/rejected": -24.086117831143465, + "step": 3595 + }, + { + "epoch": 0.8997873138996622, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48907904.0, + "logits/rejected": -51206298.666666664, + "logps/chosen": -322.0104573567708, + "logps/rejected": -597.0435384114584, + "loss": 0.0321, + "rewards/chosen": 8.561424255371094, + "rewards/margins": 23.536322275797524, + "rewards/rejected": -14.974898020426432, + "step": 3596 + }, + { + "epoch": 0.9000375328412361, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19482838.4, + "logits/rejected": -42198464.0, + "logps/chosen": -295.7649658203125, + "logps/rejected": -571.9587053571429, + "loss": 0.0164, + "rewards/chosen": 7.997134399414063, + "rewards/margins": 26.359061976841517, + "rewards/rejected": -18.361927577427455, + "step": 3597 + }, + { + "epoch": 0.90028775178281, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22118802.285714287, + "logits/rejected": -25385037.17647059, + "logps/chosen": -319.59158761160717, + "logps/rejected": -596.5837545955883, + "loss": 0.0479, + "rewards/chosen": 9.640521458217076, + "rewards/margins": 24.789113341259355, + "rewards/rejected": -15.148591883042279, + "step": 3598 + }, + { + "epoch": 0.9005379707243838, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67340470.85714285, + "logits/rejected": -51692054.5882353, + "logps/chosen": -446.910400390625, + "logps/rejected": -569.0407284007352, + "loss": 0.0115, + "rewards/chosen": 10.921763828822545, + "rewards/margins": 28.21607355710839, + "rewards/rejected": -17.294309728285846, + "step": 3599 + }, + { + "epoch": 0.9007881896659578, + "grad_norm": 1.671875, + "kl": 9.163370132446289, + "learning_rate": 5e-06, + "logits/chosen": -48403441.777777776, + "logits/rejected": -46045120.0, + "logps/chosen": -366.2111545138889, + "logps/rejected": -452.1435139973958, + "loss": 0.0358, + "rewards/chosen": 9.702921549479166, + "rewards/margins": 21.955594380696613, + "rewards/rejected": -12.252672831217447, + "step": 3600 + }, + { + "epoch": 0.9010384086075316, + "grad_norm": 10.125, + "kl": 8.149924278259277, + "learning_rate": 5e-06, + "logits/chosen": -58110934.85714286, + "logits/rejected": -25523259.2, + "logps/chosen": -431.52779715401783, + "logps/rejected": -434.67451171875, + "loss": 0.0813, + "rewards/chosen": 10.335129874093193, + "rewards/margins": 25.767835562569758, + "rewards/rejected": -15.432705688476563, + "step": 3601 + }, + { + "epoch": 0.9012886275491054, + "grad_norm": 6.96875, + "kl": 4.318479061126709, + "learning_rate": 5e-06, + "logits/chosen": -43018368.0, + "logits/rejected": -69879901.0909091, + "logps/chosen": -302.0524338942308, + "logps/rejected": -671.2958984375, + "loss": 0.0408, + "rewards/chosen": 6.9910137469951925, + "rewards/margins": 26.914056737939795, + "rewards/rejected": -19.9230429909446, + "step": 3602 + }, + { + "epoch": 0.9015388464906794, + "grad_norm": 9.375, + "kl": 5.109602928161621, + "learning_rate": 5e-06, + "logits/chosen": -33619271.384615384, + "logits/rejected": -52030784.0, + "logps/chosen": -464.6823167067308, + "logps/rejected": -725.1413352272727, + "loss": 0.071, + "rewards/chosen": 11.125636174128605, + "rewards/margins": 28.06567745608883, + "rewards/rejected": -16.940041281960227, + "step": 3603 + }, + { + "epoch": 0.9017890654322532, + "grad_norm": 7.3125, + "kl": 6.47227668762207, + "learning_rate": 5e-06, + "logits/chosen": -30454247.384615384, + "logits/rejected": -37461268.36363637, + "logps/chosen": -390.27749399038464, + "logps/rejected": -544.8722034801136, + "loss": 0.0068, + "rewards/chosen": 9.12210669884315, + "rewards/margins": 30.323131534603093, + "rewards/rejected": -21.20102483575994, + "step": 3604 + }, + { + "epoch": 0.9020392843738271, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42158925.71428572, + "logits/rejected": -47638304.0, + "logps/chosen": -404.50732421875, + "logps/rejected": -644.544140625, + "loss": 0.0294, + "rewards/chosen": 9.399901253836495, + "rewards/margins": 24.1706547328404, + "rewards/rejected": -14.770753479003906, + "step": 3605 + }, + { + "epoch": 0.902289503315401, + "grad_norm": 0.2021484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65949144.615384616, + "logits/rejected": -76457146.18181819, + "logps/chosen": -559.2515399639423, + "logps/rejected": -533.3188920454545, + "loss": 0.0003, + "rewards/chosen": 12.600909893329327, + "rewards/margins": 33.308497395548784, + "rewards/rejected": -20.70758750221946, + "step": 3606 + }, + { + "epoch": 0.9025397222569749, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15942484.0, + "logits/rejected": -30826478.0, + "logps/chosen": -398.357177734375, + "logps/rejected": -585.0400390625, + "loss": 0.0098, + "rewards/chosen": 10.620777130126953, + "rewards/margins": 30.243654251098633, + "rewards/rejected": -19.62287712097168, + "step": 3607 + }, + { + "epoch": 0.9027899411985487, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48321320.72727273, + "logits/rejected": -41513088.0, + "logps/chosen": -285.69229403409093, + "logps/rejected": -766.4764122596154, + "loss": 0.0299, + "rewards/chosen": 7.378281333229759, + "rewards/margins": 33.396251491733366, + "rewards/rejected": -26.017970158503605, + "step": 3608 + }, + { + "epoch": 0.9030401601401226, + "grad_norm": 3.140625, + "kl": 6.833434104919434, + "learning_rate": 5e-06, + "logits/chosen": -39711660.0, + "logits/rejected": -41721856.0, + "logps/chosen": -372.8978271484375, + "logps/rejected": -607.3831176757812, + "loss": 0.0199, + "rewards/chosen": 10.83343505859375, + "rewards/margins": 33.85621643066406, + "rewards/rejected": -23.022781372070312, + "step": 3609 + }, + { + "epoch": 0.9032903790816965, + "grad_norm": 17.5, + "kl": 15.860231399536133, + "learning_rate": 5e-06, + "logits/chosen": -48352465.45454545, + "logits/rejected": -46532169.84615385, + "logps/chosen": -476.14626242897725, + "logps/rejected": -625.5582181490385, + "loss": 0.0862, + "rewards/chosen": 9.75843672318892, + "rewards/margins": 31.126220489715365, + "rewards/rejected": -21.367783766526443, + "step": 3610 + }, + { + "epoch": 0.9035405980232704, + "grad_norm": 10.625, + "kl": 13.490285873413086, + "learning_rate": 5e-06, + "logits/chosen": -47593170.28571428, + "logits/rejected": -23428704.0, + "logps/chosen": -378.02894810267856, + "logps/rejected": -619.52763671875, + "loss": 0.0982, + "rewards/chosen": 7.996178763253348, + "rewards/margins": 25.96215624128069, + "rewards/rejected": -17.965977478027344, + "step": 3611 + }, + { + "epoch": 0.9037908169648442, + "grad_norm": 5.9375, + "kl": 17.10900115966797, + "learning_rate": 5e-06, + "logits/chosen": -46087322.666666664, + "logits/rejected": -43144218.666666664, + "logps/chosen": -512.732666015625, + "logps/rejected": -537.7214762369791, + "loss": 0.0398, + "rewards/chosen": 12.541951497395834, + "rewards/margins": 30.999038696289062, + "rewards/rejected": -18.45708719889323, + "step": 3612 + }, + { + "epoch": 0.9040410359064182, + "grad_norm": 2.796875, + "kl": 1.5060895681381226, + "learning_rate": 5e-06, + "logits/chosen": -41801600.0, + "logits/rejected": -42875473.06666667, + "logps/chosen": -473.3396267361111, + "logps/rejected": -767.7358072916667, + "loss": 0.0233, + "rewards/chosen": 9.908234490288628, + "rewards/margins": 32.38747846815321, + "rewards/rejected": -22.479243977864584, + "step": 3613 + }, + { + "epoch": 0.904291254847992, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42385776.0, + "logits/rejected": -24973545.14285714, + "logps/chosen": -435.598681640625, + "logps/rejected": -756.2899693080357, + "loss": 0.0376, + "rewards/chosen": 11.386499786376953, + "rewards/margins": 30.774070848737445, + "rewards/rejected": -19.38757106236049, + "step": 3614 + }, + { + "epoch": 0.9045414737895658, + "grad_norm": 0.400390625, + "kl": 2.726717710494995, + "learning_rate": 5e-06, + "logits/chosen": -47092642.461538464, + "logits/rejected": -23503534.545454547, + "logps/chosen": -455.4560546875, + "logps/rejected": -566.7881303267045, + "loss": 0.0006, + "rewards/chosen": 10.898423414963942, + "rewards/margins": 28.542838143302006, + "rewards/rejected": -17.644414728338067, + "step": 3615 + }, + { + "epoch": 0.9047916927311398, + "grad_norm": 2.546875, + "kl": 1.0237910747528076, + "learning_rate": 5e-06, + "logits/chosen": -69458221.71428572, + "logits/rejected": -25794326.4, + "logps/chosen": -339.08778599330356, + "logps/rejected": -546.679150390625, + "loss": 0.0385, + "rewards/chosen": 7.639298575265067, + "rewards/margins": 28.26591600690569, + "rewards/rejected": -20.626617431640625, + "step": 3616 + }, + { + "epoch": 0.9050419116727136, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36360762.18181818, + "logits/rejected": -19271931.076923076, + "logps/chosen": -287.3035777698864, + "logps/rejected": -632.6897536057693, + "loss": 0.0172, + "rewards/chosen": 7.67533389004794, + "rewards/margins": 23.723748853990248, + "rewards/rejected": -16.048414963942307, + "step": 3617 + }, + { + "epoch": 0.9052921306142875, + "grad_norm": 4.1875, + "kl": 8.492765426635742, + "learning_rate": 5e-06, + "logits/chosen": -33959506.28571428, + "logits/rejected": -41394275.2, + "logps/chosen": -408.01803152901783, + "logps/rejected": -793.92158203125, + "loss": 0.0117, + "rewards/chosen": 9.264268057686943, + "rewards/margins": 34.13870348249163, + "rewards/rejected": -24.874435424804688, + "step": 3618 + }, + { + "epoch": 0.9055423495558613, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30358646.4, + "logits/rejected": -36534326.85714286, + "logps/chosen": -358.475146484375, + "logps/rejected": -636.1419503348214, + "loss": 0.038, + "rewards/chosen": 6.734801483154297, + "rewards/margins": 26.761783381870814, + "rewards/rejected": -20.026981898716517, + "step": 3619 + }, + { + "epoch": 0.9057925684974353, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27859737.6, + "logits/rejected": -28766560.0, + "logps/chosen": -361.9039794921875, + "logps/rejected": -576.7251674107143, + "loss": 0.0262, + "rewards/chosen": 8.998316192626953, + "rewards/margins": 26.857804543631417, + "rewards/rejected": -17.859488351004465, + "step": 3620 + }, + { + "epoch": 0.9060427874390091, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60639701.333333336, + "logits/rejected": -6386794.666666667, + "logps/chosen": -278.45196533203125, + "logps/rejected": -621.6043294270834, + "loss": 0.0205, + "rewards/chosen": 7.352033615112305, + "rewards/margins": 28.877785364786785, + "rewards/rejected": -21.52575174967448, + "step": 3621 + }, + { + "epoch": 0.906293006380583, + "grad_norm": 10.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49173093.333333336, + "logits/rejected": 612194.6666666666, + "logps/chosen": -446.3715006510417, + "logps/rejected": -596.6970621744791, + "loss": 0.0289, + "rewards/chosen": 11.425132751464844, + "rewards/margins": 30.438565572102863, + "rewards/rejected": -19.01343282063802, + "step": 3622 + }, + { + "epoch": 0.9065432253221569, + "grad_norm": 0.734375, + "kl": 5.22381591796875, + "learning_rate": 5e-06, + "logits/chosen": -55532432.0, + "logits/rejected": -41445882.666666664, + "logps/chosen": -523.9429117838541, + "logps/rejected": -603.30078125, + "loss": 0.0013, + "rewards/chosen": 10.956832885742188, + "rewards/margins": 30.03661855061849, + "rewards/rejected": -19.0797856648763, + "step": 3623 + }, + { + "epoch": 0.9067934442637308, + "grad_norm": 12.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20152804.923076924, + "logits/rejected": 75928913.45454545, + "logps/chosen": -331.36485877403845, + "logps/rejected": -551.1948686079545, + "loss": 0.0582, + "rewards/chosen": 6.948644197904146, + "rewards/margins": 23.60828164907602, + "rewards/rejected": -16.659637451171875, + "step": 3624 + }, + { + "epoch": 0.9070436632053046, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22142537.333333332, + "logits/rejected": -51502736.0, + "logps/chosen": -224.48514811197916, + "logps/rejected": -558.330078125, + "loss": 0.1004, + "rewards/chosen": 5.383036295572917, + "rewards/margins": 20.931939442952473, + "rewards/rejected": -15.548903147379557, + "step": 3625 + }, + { + "epoch": 0.9072938821468786, + "grad_norm": 1.5390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15878730.666666666, + "logits/rejected": -16955534.222222224, + "logps/chosen": -256.0196533203125, + "logps/rejected": -492.54448784722223, + "loss": 0.0031, + "rewards/chosen": 6.986980438232422, + "rewards/margins": 21.04261144002279, + "rewards/rejected": -14.055631001790365, + "step": 3626 + }, + { + "epoch": 0.9075441010884524, + "grad_norm": 21.75, + "kl": 4.2932658195495605, + "learning_rate": 5e-06, + "logits/chosen": -31792569.6, + "logits/rejected": -40306788.571428575, + "logps/chosen": -498.42197265625, + "logps/rejected": -399.437255859375, + "loss": 0.0294, + "rewards/chosen": 12.09811019897461, + "rewards/margins": 24.81177466256278, + "rewards/rejected": -12.71366446358817, + "step": 3627 + }, + { + "epoch": 0.9077943200300262, + "grad_norm": 6.28125, + "kl": 3.3068695068359375, + "learning_rate": 5e-06, + "logits/chosen": -56337810.28571428, + "logits/rejected": -48774720.0, + "logps/chosen": -329.12081473214283, + "logps/rejected": -639.88837890625, + "loss": 0.0778, + "rewards/chosen": 9.627698625837054, + "rewards/margins": 25.88537837437221, + "rewards/rejected": -16.257679748535157, + "step": 3628 + }, + { + "epoch": 0.9080445389716002, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2210708.3636363638, + "logits/rejected": -56477572.92307692, + "logps/chosen": -281.25390625, + "logps/rejected": -781.3757512019231, + "loss": 0.0444, + "rewards/chosen": 9.01875097101385, + "rewards/margins": 32.74205043766048, + "rewards/rejected": -23.723299466646633, + "step": 3629 + }, + { + "epoch": 0.908294757913174, + "grad_norm": 0.33203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30613158.4, + "logits/rejected": -40937609.14285714, + "logps/chosen": -337.1273681640625, + "logps/rejected": -829.2527901785714, + "loss": 0.0015, + "rewards/chosen": 9.510231018066406, + "rewards/margins": 30.49420928955078, + "rewards/rejected": -20.983978271484375, + "step": 3630 + }, + { + "epoch": 0.9085449768547479, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17137554.0, + "logits/rejected": -48536776.0, + "logps/chosen": -310.9695129394531, + "logps/rejected": -741.855712890625, + "loss": 0.0262, + "rewards/chosen": 6.718559265136719, + "rewards/margins": 34.259769439697266, + "rewards/rejected": -27.541210174560547, + "step": 3631 + }, + { + "epoch": 0.9087951957963217, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44472952.0, + "logits/rejected": -1821066.0, + "logps/chosen": -459.8052673339844, + "logps/rejected": -720.4002685546875, + "loss": 0.0114, + "rewards/chosen": 10.881375312805176, + "rewards/margins": 29.489386558532715, + "rewards/rejected": -18.60801124572754, + "step": 3632 + }, + { + "epoch": 0.9090454147378957, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12176939.733333332, + "logits/rejected": -29999893.333333332, + "logps/chosen": -397.63564453125, + "logps/rejected": -648.1135525173611, + "loss": 0.0778, + "rewards/chosen": 10.032937622070312, + "rewards/margins": 29.20462103949653, + "rewards/rejected": -19.171683417426216, + "step": 3633 + }, + { + "epoch": 0.9092956336794695, + "grad_norm": 3.046875, + "kl": 3.945934295654297, + "learning_rate": 5e-06, + "logits/chosen": -32710739.692307692, + "logits/rejected": -44079232.0, + "logps/chosen": -370.9562800480769, + "logps/rejected": -650.1444424715909, + "loss": 0.0096, + "rewards/chosen": 10.378493088942308, + "rewards/margins": 29.036490913871283, + "rewards/rejected": -18.657997824928977, + "step": 3634 + }, + { + "epoch": 0.9095458526210434, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32668514.133333333, + "logits/rejected": 91630087.1111111, + "logps/chosen": -373.2248046875, + "logps/rejected": -770.1028103298611, + "loss": 0.0371, + "rewards/chosen": 9.946636962890626, + "rewards/margins": 27.15061984592014, + "rewards/rejected": -17.203982883029514, + "step": 3635 + }, + { + "epoch": 0.9097960715626173, + "grad_norm": 3.390625, + "kl": 4.988373279571533, + "learning_rate": 5e-06, + "logits/chosen": -59898325.333333336, + "logits/rejected": -29777077.333333332, + "logps/chosen": -449.71912977430554, + "logps/rejected": -564.2541666666667, + "loss": 0.0401, + "rewards/chosen": 9.938756306966146, + "rewards/margins": 25.367634073893228, + "rewards/rejected": -15.428877766927084, + "step": 3636 + }, + { + "epoch": 0.9100462905041912, + "grad_norm": 3.09375, + "kl": 32.518924713134766, + "learning_rate": 5e-06, + "logits/chosen": 13307831.466666667, + "logits/rejected": -47695146.666666664, + "logps/chosen": -521.7168619791667, + "logps/rejected": -459.9450412326389, + "loss": 0.0912, + "rewards/chosen": 10.861112467447917, + "rewards/margins": 23.13874020046658, + "rewards/rejected": -12.277627733018663, + "step": 3637 + }, + { + "epoch": 0.910296509445765, + "grad_norm": 0.31640625, + "kl": 10.693359375, + "learning_rate": 5e-06, + "logits/chosen": 28154107.076923076, + "logits/rejected": -61624128.0, + "logps/chosen": -443.9314152644231, + "logps/rejected": -802.2942116477273, + "loss": 0.0005, + "rewards/chosen": 11.090174748347355, + "rewards/margins": 35.4824310516144, + "rewards/rejected": -24.392256303267047, + "step": 3638 + }, + { + "epoch": 0.910546728387339, + "grad_norm": 15.0625, + "kl": 20.485065460205078, + "learning_rate": 5e-06, + "logits/chosen": -11660695.466666667, + "logits/rejected": -2228178.6666666665, + "logps/chosen": -396.9975911458333, + "logps/rejected": -476.83251953125, + "loss": 0.113, + "rewards/chosen": 10.477537027994792, + "rewards/margins": 22.139443800184463, + "rewards/rejected": -11.66190677218967, + "step": 3639 + }, + { + "epoch": 0.9107969473289128, + "grad_norm": 20.625, + "kl": 18.392608642578125, + "learning_rate": 5e-06, + "logits/chosen": -41422512.0, + "logits/rejected": -67094613.333333336, + "logps/chosen": -462.5016276041667, + "logps/rejected": -658.1920572916666, + "loss": 0.0966, + "rewards/chosen": 11.088214874267578, + "rewards/margins": 26.0107790629069, + "rewards/rejected": -14.922564188639322, + "step": 3640 + }, + { + "epoch": 0.9110471662704867, + "grad_norm": 9.125, + "kl": 11.363704681396484, + "learning_rate": 5e-06, + "logits/chosen": -27223797.333333332, + "logits/rejected": -51653594.666666664, + "logps/chosen": -359.3583984375, + "logps/rejected": -554.2525227864584, + "loss": 0.0687, + "rewards/chosen": 8.567693710327148, + "rewards/margins": 24.871999740600586, + "rewards/rejected": -16.304306030273438, + "step": 3641 + }, + { + "epoch": 0.9112973852120606, + "grad_norm": 1.6796875, + "kl": 9.7670259475708, + "learning_rate": 5e-06, + "logits/chosen": -38314385.06666667, + "logits/rejected": -72142933.33333333, + "logps/chosen": -385.62766927083334, + "logps/rejected": -861.6956380208334, + "loss": 0.0274, + "rewards/chosen": 9.697329711914062, + "rewards/margins": 41.79235466851128, + "rewards/rejected": -32.09502495659722, + "step": 3642 + }, + { + "epoch": 0.9115476041536344, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23347741.714285713, + "logits/rejected": -44701702.4, + "logps/chosen": -305.405029296875, + "logps/rejected": -761.87587890625, + "loss": 0.0246, + "rewards/chosen": 8.298958369663783, + "rewards/margins": 34.73727624075754, + "rewards/rejected": -26.43831787109375, + "step": 3643 + }, + { + "epoch": 0.9117978230952083, + "grad_norm": 0.057373046875, + "kl": 7.651492118835449, + "learning_rate": 5e-06, + "logits/chosen": -41963942.4, + "logits/rejected": -73286011.42857143, + "logps/chosen": -520.45634765625, + "logps/rejected": -744.5422712053571, + "loss": 0.0002, + "rewards/chosen": 13.898173522949218, + "rewards/margins": 36.88585096086774, + "rewards/rejected": -22.987677437918528, + "step": 3644 + }, + { + "epoch": 0.9120480420367821, + "grad_norm": 1.9921875, + "kl": 0.6961174011230469, + "learning_rate": 5e-06, + "logits/chosen": -55555860.0, + "logits/rejected": -77042208.0, + "logps/chosen": -403.628662109375, + "logps/rejected": -990.4042358398438, + "loss": 0.0065, + "rewards/chosen": 9.038322448730469, + "rewards/margins": 34.12835693359375, + "rewards/rejected": -25.09003448486328, + "step": 3645 + }, + { + "epoch": 0.9122982609783561, + "grad_norm": 0.89453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49404169.84615385, + "logits/rejected": -39282385.45454545, + "logps/chosen": -375.45346304086536, + "logps/rejected": -630.9327947443181, + "loss": 0.0245, + "rewards/chosen": 9.067580003004808, + "rewards/margins": 33.58444235208151, + "rewards/rejected": -24.516862349076703, + "step": 3646 + }, + { + "epoch": 0.9125484799199299, + "grad_norm": 3.53125, + "kl": 4.686428070068359, + "learning_rate": 5e-06, + "logits/chosen": -37559077.64705882, + "logits/rejected": -34224626.28571428, + "logps/chosen": -362.3977481617647, + "logps/rejected": -644.6658761160714, + "loss": 0.0142, + "rewards/chosen": 9.443285773782168, + "rewards/margins": 29.15513880112592, + "rewards/rejected": -19.71185302734375, + "step": 3647 + }, + { + "epoch": 0.9127986988615038, + "grad_norm": 13.0, + "kl": 12.220671653747559, + "learning_rate": 5e-06, + "logits/chosen": -32977894.85714286, + "logits/rejected": -55531724.8, + "logps/chosen": -408.61819893973217, + "logps/rejected": -652.091796875, + "loss": 0.0407, + "rewards/chosen": 11.029583522251674, + "rewards/margins": 30.54897286551339, + "rewards/rejected": -19.519389343261718, + "step": 3648 + }, + { + "epoch": 0.9130489178030777, + "grad_norm": 5.71875, + "kl": 5.093777656555176, + "learning_rate": 5e-06, + "logits/chosen": -67049166.76923077, + "logits/rejected": -28293629.09090909, + "logps/chosen": -356.7760667067308, + "logps/rejected": -433.81196732954544, + "loss": 0.0505, + "rewards/chosen": 10.035098736102764, + "rewards/margins": 26.78887875430234, + "rewards/rejected": -16.753780018199574, + "step": 3649 + }, + { + "epoch": 0.9132991367446516, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6953966.4, + "logits/rejected": -33446166.85714286, + "logps/chosen": -397.96494140625, + "logps/rejected": -638.7155412946429, + "loss": 0.0553, + "rewards/chosen": 9.630237579345703, + "rewards/margins": 26.14644459315709, + "rewards/rejected": -16.516207013811385, + "step": 3650 + }, + { + "epoch": 0.9135493556862254, + "grad_norm": 0.16015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2294672.0, + "logits/rejected": -40321971.692307696, + "logps/chosen": -470.29305752840907, + "logps/rejected": -534.0484525240385, + "loss": 0.0004, + "rewards/chosen": 11.673589533025568, + "rewards/margins": 27.810924503353093, + "rewards/rejected": -16.137334970327522, + "step": 3651 + }, + { + "epoch": 0.9137995746277994, + "grad_norm": 1.484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37783133.538461536, + "logits/rejected": -62245154.90909091, + "logps/chosen": -385.44771634615387, + "logps/rejected": -664.2159978693181, + "loss": 0.0154, + "rewards/chosen": 9.045796907865084, + "rewards/margins": 28.51209024282602, + "rewards/rejected": -19.466293334960938, + "step": 3652 + }, + { + "epoch": 0.9140497935693732, + "grad_norm": 9.1875, + "kl": 7.514174461364746, + "learning_rate": 5e-06, + "logits/chosen": -34458900.0, + "logits/rejected": -33113544.0, + "logps/chosen": -403.6245422363281, + "logps/rejected": -457.1943359375, + "loss": 0.034, + "rewards/chosen": 8.788313865661621, + "rewards/margins": 21.996371269226074, + "rewards/rejected": -13.208057403564453, + "step": 3653 + }, + { + "epoch": 0.9143000125109471, + "grad_norm": 4.1875, + "kl": 13.300031661987305, + "learning_rate": 5e-06, + "logits/chosen": -34659676.0, + "logits/rejected": -69080264.0, + "logps/chosen": -438.6255187988281, + "logps/rejected": -728.2763671875, + "loss": 0.0963, + "rewards/chosen": 8.88072395324707, + "rewards/margins": 28.099050521850586, + "rewards/rejected": -19.218326568603516, + "step": 3654 + }, + { + "epoch": 0.9145502314525209, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 459860.36363636365, + "logits/rejected": -41979648.0, + "logps/chosen": -318.0738636363636, + "logps/rejected": -575.5450345552885, + "loss": 0.0649, + "rewards/chosen": 5.489539059725675, + "rewards/margins": 21.010960052063414, + "rewards/rejected": -15.52142099233774, + "step": 3655 + }, + { + "epoch": 0.9148004503940949, + "grad_norm": 18.5, + "kl": 12.720632553100586, + "learning_rate": 5e-06, + "logits/chosen": -64066588.0, + "logits/rejected": -31259294.0, + "logps/chosen": -398.6112976074219, + "logps/rejected": -696.6396484375, + "loss": 0.0334, + "rewards/chosen": 10.866814613342285, + "rewards/margins": 29.42197895050049, + "rewards/rejected": -18.555164337158203, + "step": 3656 + }, + { + "epoch": 0.9150506693356687, + "grad_norm": 11.4375, + "kl": 0.30351513624191284, + "learning_rate": 5e-06, + "logits/chosen": -27128407.272727273, + "logits/rejected": -51034756.92307692, + "logps/chosen": -329.41579367897725, + "logps/rejected": -642.1020132211538, + "loss": 0.0728, + "rewards/chosen": 9.168805902654475, + "rewards/margins": 24.575102852774666, + "rewards/rejected": -15.406296950120192, + "step": 3657 + }, + { + "epoch": 0.9153008882772425, + "grad_norm": 2.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17003632.0, + "logits/rejected": 42312621.71428572, + "logps/chosen": -330.2636474609375, + "logps/rejected": -583.6966727120536, + "loss": 0.0417, + "rewards/chosen": 8.304690551757812, + "rewards/margins": 26.36024453299386, + "rewards/rejected": -18.05555398123605, + "step": 3658 + }, + { + "epoch": 0.9155511072188165, + "grad_norm": 8.1875, + "kl": 21.391815185546875, + "learning_rate": 5e-06, + "logits/chosen": -56729629.538461536, + "logits/rejected": -32502045.09090909, + "logps/chosen": -407.1477238581731, + "logps/rejected": -594.32861328125, + "loss": 0.0268, + "rewards/chosen": 10.405545748197115, + "rewards/margins": 28.40753771375109, + "rewards/rejected": -18.001991965553977, + "step": 3659 + }, + { + "epoch": 0.9158013261603903, + "grad_norm": 2.984375, + "kl": 14.207000732421875, + "learning_rate": 5e-06, + "logits/chosen": -33471081.846153848, + "logits/rejected": -42789594.18181818, + "logps/chosen": -380.37015474759613, + "logps/rejected": -564.4909889914773, + "loss": 0.0164, + "rewards/chosen": 10.338812021108774, + "rewards/margins": 27.60155727146389, + "rewards/rejected": -17.262745250355113, + "step": 3660 + }, + { + "epoch": 0.9160515451019642, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55015842.461538464, + "logits/rejected": -43462557.09090909, + "logps/chosen": -493.2356520432692, + "logps/rejected": -748.7498224431819, + "loss": 0.0021, + "rewards/chosen": 10.249497633713942, + "rewards/margins": 33.723676988294905, + "rewards/rejected": -23.474179354580965, + "step": 3661 + }, + { + "epoch": 0.9163017640435381, + "grad_norm": 18.25, + "kl": 1.6144975423812866, + "learning_rate": 5e-06, + "logits/chosen": -56993228.0, + "logits/rejected": -75634536.0, + "logps/chosen": -485.0689697265625, + "logps/rejected": -588.8642578125, + "loss": 0.0369, + "rewards/chosen": 10.926384925842285, + "rewards/margins": 27.836487770080566, + "rewards/rejected": -16.91010284423828, + "step": 3662 + }, + { + "epoch": 0.916551982985112, + "grad_norm": 1.4921875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50506960.0, + "logits/rejected": -17406766.666666668, + "logps/chosen": -356.5105387369792, + "logps/rejected": -578.3367513020834, + "loss": 0.0153, + "rewards/chosen": 9.124540328979492, + "rewards/margins": 31.49957338968913, + "rewards/rejected": -22.375033060709637, + "step": 3663 + }, + { + "epoch": 0.9168022019266858, + "grad_norm": 12.0625, + "kl": 21.990711212158203, + "learning_rate": 5e-06, + "logits/chosen": -41240800.0, + "logits/rejected": 37801856.0, + "logps/chosen": -377.4229736328125, + "logps/rejected": -658.3776041666666, + "loss": 0.0683, + "rewards/chosen": 9.736204783121744, + "rewards/margins": 30.072887420654297, + "rewards/rejected": -20.33668263753255, + "step": 3664 + }, + { + "epoch": 0.9170524208682598, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61251483.428571425, + "logits/rejected": -72784569.6, + "logps/chosen": -363.5951450892857, + "logps/rejected": -754.01552734375, + "loss": 0.0098, + "rewards/chosen": 9.216442653111049, + "rewards/margins": 28.60182168143136, + "rewards/rejected": -19.385379028320312, + "step": 3665 + }, + { + "epoch": 0.9173026398098336, + "grad_norm": 2.8125, + "kl": 3.1362476348876953, + "learning_rate": 5e-06, + "logits/chosen": 10919082.666666666, + "logits/rejected": -53927205.333333336, + "logps/chosen": -386.0754801432292, + "logps/rejected": -849.43017578125, + "loss": 0.0256, + "rewards/chosen": 9.0714480082194, + "rewards/margins": 33.74643325805664, + "rewards/rejected": -24.67498524983724, + "step": 3666 + }, + { + "epoch": 0.9175528587514075, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37247976.0, + "logits/rejected": -30369148.0, + "logps/chosen": -330.9925231933594, + "logps/rejected": -710.7865600585938, + "loss": 0.0113, + "rewards/chosen": 9.274223327636719, + "rewards/margins": 33.214176177978516, + "rewards/rejected": -23.939952850341797, + "step": 3667 + }, + { + "epoch": 0.9178030776929813, + "grad_norm": 13.5, + "kl": 16.243453979492188, + "learning_rate": 5e-06, + "logits/chosen": -52579952.0, + "logits/rejected": -50920408.0, + "logps/chosen": -476.9180908203125, + "logps/rejected": -500.33154296875, + "loss": 0.039, + "rewards/chosen": 10.705076217651367, + "rewards/margins": 27.96270179748535, + "rewards/rejected": -17.257625579833984, + "step": 3668 + }, + { + "epoch": 0.9180532966345553, + "grad_norm": 1.1484375, + "kl": 2.0773468017578125, + "learning_rate": 5e-06, + "logits/chosen": -46412266.666666664, + "logits/rejected": -48617420.8, + "logps/chosen": -418.4727376302083, + "logps/rejected": -855.7354166666667, + "loss": 0.0015, + "rewards/chosen": 11.34043460422092, + "rewards/margins": 41.99564700656467, + "rewards/rejected": -30.65521240234375, + "step": 3669 + }, + { + "epoch": 0.9183035155761291, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18294505.333333332, + "logits/rejected": -40784709.333333336, + "logps/chosen": -211.52020263671875, + "logps/rejected": -669.8411458333334, + "loss": 0.0697, + "rewards/chosen": 5.355113983154297, + "rewards/margins": 31.322254180908203, + "rewards/rejected": -25.967140197753906, + "step": 3670 + }, + { + "epoch": 0.918553734517703, + "grad_norm": 13.0, + "kl": 13.475977897644043, + "learning_rate": 5e-06, + "logits/chosen": -49750765.71428572, + "logits/rejected": -34440809.6, + "logps/chosen": -364.71895926339283, + "logps/rejected": -726.482177734375, + "loss": 0.0522, + "rewards/chosen": 7.901856558663504, + "rewards/margins": 27.06439470563616, + "rewards/rejected": -19.162538146972658, + "step": 3671 + }, + { + "epoch": 0.9188039534592769, + "grad_norm": 1.2578125, + "kl": 0.7221651077270508, + "learning_rate": 5e-06, + "logits/chosen": -28355012.57142857, + "logits/rejected": -76347520.0, + "logps/chosen": -431.0949009486607, + "logps/rejected": -698.064599609375, + "loss": 0.0225, + "rewards/chosen": 9.873589651925224, + "rewards/margins": 31.129368155343194, + "rewards/rejected": -21.255778503417968, + "step": 3672 + }, + { + "epoch": 0.9190541724008507, + "grad_norm": 0.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51378492.8, + "logits/rejected": -56532251.428571425, + "logps/chosen": -415.995556640625, + "logps/rejected": -850.5020926339286, + "loss": 0.0008, + "rewards/chosen": 10.853074645996093, + "rewards/margins": 39.044664001464845, + "rewards/rejected": -28.19158935546875, + "step": 3673 + }, + { + "epoch": 0.9193043913424246, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73830877.0909091, + "logits/rejected": -54255222.15384615, + "logps/chosen": -464.34623579545456, + "logps/rejected": -622.6720252403846, + "loss": 0.0473, + "rewards/chosen": 10.302597739479758, + "rewards/margins": 34.7514361935062, + "rewards/rejected": -24.448838454026443, + "step": 3674 + }, + { + "epoch": 0.9195546102839985, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37260390.4, + "logits/rejected": -96066510.22222222, + "logps/chosen": -298.26689453125, + "logps/rejected": -1054.7430555555557, + "loss": 0.035, + "rewards/chosen": 8.145692952473958, + "rewards/margins": 45.22034233940972, + "rewards/rejected": -37.07464938693576, + "step": 3675 + }, + { + "epoch": 0.9198048292255724, + "grad_norm": 1.6796875, + "kl": 0.7751471400260925, + "learning_rate": 5e-06, + "logits/chosen": -60191772.44444445, + "logits/rejected": -68596509.86666666, + "logps/chosen": -461.78716362847223, + "logps/rejected": -842.7582682291667, + "loss": 0.015, + "rewards/chosen": 10.981557210286459, + "rewards/margins": 37.68365275065104, + "rewards/rejected": -26.702095540364585, + "step": 3676 + }, + { + "epoch": 0.9200550481671462, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15956401.23076923, + "logits/rejected": -68831394.9090909, + "logps/chosen": -286.7046461838942, + "logps/rejected": -594.1253995028409, + "loss": 0.0493, + "rewards/chosen": 5.6898029033954325, + "rewards/margins": 25.393933596310916, + "rewards/rejected": -19.704130692915484, + "step": 3677 + }, + { + "epoch": 0.9203052671087202, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -68665557.33333333, + "logits/rejected": -24939266.666666668, + "logps/chosen": -363.361328125, + "logps/rejected": -567.293212890625, + "loss": 0.0232, + "rewards/chosen": 11.097783406575521, + "rewards/margins": 26.18262608846029, + "rewards/rejected": -15.084842681884766, + "step": 3678 + }, + { + "epoch": 0.920555486050294, + "grad_norm": 4.125, + "kl": 4.9551544189453125, + "learning_rate": 5e-06, + "logits/chosen": -15177004.307692308, + "logits/rejected": -51948096.0, + "logps/chosen": -459.3257587139423, + "logps/rejected": -756.7235440340909, + "loss": 0.0556, + "rewards/chosen": 9.360626220703125, + "rewards/margins": 29.90814208984375, + "rewards/rejected": -20.547515869140625, + "step": 3679 + }, + { + "epoch": 0.9208057049918679, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43145842.666666664, + "logits/rejected": -25372608.0, + "logps/chosen": -328.8467610677083, + "logps/rejected": -435.873291015625, + "loss": 0.0494, + "rewards/chosen": 8.137037913004557, + "rewards/margins": 22.385472615559895, + "rewards/rejected": -14.248434702555338, + "step": 3680 + }, + { + "epoch": 0.9210559239334417, + "grad_norm": 8.5, + "kl": 2.4481773376464844, + "learning_rate": 5e-06, + "logits/chosen": -38081241.14285714, + "logits/rejected": -45931785.6, + "logps/chosen": -295.50069754464283, + "logps/rejected": -714.18046875, + "loss": 0.0583, + "rewards/chosen": 7.420160566057477, + "rewards/margins": 29.984195600237165, + "rewards/rejected": -22.56403503417969, + "step": 3681 + }, + { + "epoch": 0.9213061428750157, + "grad_norm": 1.453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40405696.0, + "logits/rejected": -41863079.384615384, + "logps/chosen": -265.56733842329544, + "logps/rejected": -695.0422175480769, + "loss": 0.024, + "rewards/chosen": 7.97051308371804, + "rewards/margins": 34.750042361812994, + "rewards/rejected": -26.77952927809495, + "step": 3682 + }, + { + "epoch": 0.9215563618165895, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65520640.0, + "logits/rejected": -46573804.307692304, + "logps/chosen": -350.3289905894886, + "logps/rejected": -611.9503455528846, + "loss": 0.0386, + "rewards/chosen": 7.926854220303622, + "rewards/margins": 29.376481436349295, + "rewards/rejected": -21.449627216045673, + "step": 3683 + }, + { + "epoch": 0.9218065807581634, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26620567.272727273, + "logits/rejected": -51018712.615384616, + "logps/chosen": -228.464599609375, + "logps/rejected": -729.2767427884615, + "loss": 0.0377, + "rewards/chosen": 6.084580854936079, + "rewards/margins": 31.388066431859155, + "rewards/rejected": -25.303485576923077, + "step": 3684 + }, + { + "epoch": 0.9220567996997373, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23915882.666666668, + "logits/rejected": -67065632.0, + "logps/chosen": -255.76859537760416, + "logps/rejected": -553.4192301432291, + "loss": 0.0687, + "rewards/chosen": 6.421606699625651, + "rewards/margins": 26.695067087809246, + "rewards/rejected": -20.273460388183594, + "step": 3685 + }, + { + "epoch": 0.9223070186413111, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -91014048.0, + "logits/rejected": -25845864.0, + "logps/chosen": -346.2259114583333, + "logps/rejected": -615.8801676432291, + "loss": 0.0414, + "rewards/chosen": 9.272621154785156, + "rewards/margins": 27.147443135579426, + "rewards/rejected": -17.87482198079427, + "step": 3686 + }, + { + "epoch": 0.922557237582885, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51465553.45454545, + "logits/rejected": -94073787.07692307, + "logps/chosen": -375.5769708806818, + "logps/rejected": -956.8509615384615, + "loss": 0.0119, + "rewards/chosen": 7.712018099698153, + "rewards/margins": 35.508726426771474, + "rewards/rejected": -27.79670832707332, + "step": 3687 + }, + { + "epoch": 0.922807456524459, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30606674.285714287, + "logits/rejected": -24823450.352941178, + "logps/chosen": -272.11033412388394, + "logps/rejected": -824.8774126838235, + "loss": 0.024, + "rewards/chosen": 8.445656912667411, + "rewards/margins": 35.98136606937697, + "rewards/rejected": -27.535709156709558, + "step": 3688 + }, + { + "epoch": 0.9230576754660328, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41163616.0, + "logits/rejected": -69796608.0, + "logps/chosen": -358.54775390625, + "logps/rejected": -701.0075334821429, + "loss": 0.0364, + "rewards/chosen": 8.676412963867188, + "rewards/margins": 28.909262084960936, + "rewards/rejected": -20.23284912109375, + "step": 3689 + }, + { + "epoch": 0.9233078944076066, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35831644.44444445, + "logits/rejected": -62976179.2, + "logps/chosen": -262.1901041666667, + "logps/rejected": -950.980078125, + "loss": 0.0439, + "rewards/chosen": 8.096754286024305, + "rewards/margins": 41.010157606336804, + "rewards/rejected": -32.9134033203125, + "step": 3690 + }, + { + "epoch": 0.9235581133491806, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34080085.333333336, + "logits/rejected": -38613273.6, + "logps/chosen": -429.8879665798611, + "logps/rejected": -636.77265625, + "loss": 0.0352, + "rewards/chosen": 10.336007859971788, + "rewards/margins": 31.821125454372833, + "rewards/rejected": -21.485117594401043, + "step": 3691 + }, + { + "epoch": 0.9238083322907544, + "grad_norm": 4.65625, + "kl": 5.238432884216309, + "learning_rate": 5e-06, + "logits/chosen": -48587785.84615385, + "logits/rejected": -18208693.818181816, + "logps/chosen": -373.20169771634613, + "logps/rejected": -531.3069069602273, + "loss": 0.0401, + "rewards/chosen": 9.612536503718449, + "rewards/margins": 24.545536548107656, + "rewards/rejected": -14.933000044389205, + "step": 3692 + }, + { + "epoch": 0.9240585512323283, + "grad_norm": 7.5, + "kl": 5.6325225830078125, + "learning_rate": 5e-06, + "logits/chosen": -47413248.0, + "logits/rejected": -62455408.0, + "logps/chosen": -387.6510416666667, + "logps/rejected": -579.4986165364584, + "loss": 0.0837, + "rewards/chosen": 9.310820897420248, + "rewards/margins": 25.36291058858236, + "rewards/rejected": -16.05208969116211, + "step": 3693 + }, + { + "epoch": 0.9243087701739021, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50646572.0, + "logits/rejected": -9453488.0, + "logps/chosen": -275.2099609375, + "logps/rejected": -644.7911376953125, + "loss": 0.0252, + "rewards/chosen": 8.35729694366455, + "rewards/margins": 29.634264945983887, + "rewards/rejected": -21.276968002319336, + "step": 3694 + }, + { + "epoch": 0.9245589891154761, + "grad_norm": 6.875, + "kl": 0.6097742915153503, + "learning_rate": 5e-06, + "logits/chosen": -36478080.0, + "logits/rejected": -73992153.6, + "logps/chosen": -395.43844168526783, + "logps/rejected": -754.53779296875, + "loss": 0.0219, + "rewards/chosen": 8.53553227015904, + "rewards/margins": 30.57603977748326, + "rewards/rejected": -22.040507507324218, + "step": 3695 + }, + { + "epoch": 0.9248092080570499, + "grad_norm": 1.28125, + "kl": 1.1071523427963257, + "learning_rate": 5e-06, + "logits/chosen": -29321816.0, + "logits/rejected": -73368826.66666667, + "logps/chosen": -344.7670084635417, + "logps/rejected": -877.466552734375, + "loss": 0.0209, + "rewards/chosen": 8.654857635498047, + "rewards/margins": 34.738338470458984, + "rewards/rejected": -26.083480834960938, + "step": 3696 + }, + { + "epoch": 0.9250594269986238, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51040896.0, + "logits/rejected": -40428378.666666664, + "logps/chosen": -256.4623209635417, + "logps/rejected": -702.9195149739584, + "loss": 0.0369, + "rewards/chosen": 7.106784820556641, + "rewards/margins": 29.26878484090169, + "rewards/rejected": -22.16200002034505, + "step": 3697 + }, + { + "epoch": 0.9253096459401977, + "grad_norm": 8.0, + "kl": 2.357769012451172, + "learning_rate": 5e-06, + "logits/chosen": -45492257.88235294, + "logits/rejected": -75074098.28571428, + "logps/chosen": -371.2243221507353, + "logps/rejected": -962.2689732142857, + "loss": 0.0354, + "rewards/chosen": 10.685268626493567, + "rewards/margins": 50.217939873703386, + "rewards/rejected": -39.53267124720982, + "step": 3698 + }, + { + "epoch": 0.9255598648817716, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61758887.384615384, + "logits/rejected": -37671226.18181818, + "logps/chosen": -402.8660231370192, + "logps/rejected": -543.7151544744319, + "loss": 0.0133, + "rewards/chosen": 9.769400963416466, + "rewards/margins": 28.42345092346618, + "rewards/rejected": -18.654049960049715, + "step": 3699 + }, + { + "epoch": 0.9258100838233454, + "grad_norm": 0.33984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54026995.2, + "logits/rejected": -46542221.71428572, + "logps/chosen": -490.822265625, + "logps/rejected": -820.87060546875, + "loss": 0.0007, + "rewards/chosen": 12.331735229492187, + "rewards/margins": 41.802253069196425, + "rewards/rejected": -29.47051783970424, + "step": 3700 + }, + { + "epoch": 0.9260603027649194, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -96762604.8, + "logits/rejected": -36792221.71428572, + "logps/chosen": -336.268505859375, + "logps/rejected": -574.1924176897321, + "loss": 0.0475, + "rewards/chosen": 6.6514404296875, + "rewards/margins": 25.226182338169643, + "rewards/rejected": -18.574741908482142, + "step": 3701 + }, + { + "epoch": 0.9263105217064932, + "grad_norm": 1.0859375, + "kl": 5.960305690765381, + "learning_rate": 5e-06, + "logits/chosen": -66248536.615384616, + "logits/rejected": 3029041.4545454546, + "logps/chosen": -454.0021784855769, + "logps/rejected": -846.1086647727273, + "loss": 0.0021, + "rewards/chosen": 10.19219501201923, + "rewards/margins": 40.507014347956726, + "rewards/rejected": -30.3148193359375, + "step": 3702 + }, + { + "epoch": 0.926560740648067, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36604312.615384616, + "logits/rejected": -37822048.0, + "logps/chosen": -295.26639498197113, + "logps/rejected": -471.5929509943182, + "loss": 0.0603, + "rewards/chosen": 7.948537386380709, + "rewards/margins": 21.292259643127867, + "rewards/rejected": -13.343722256747158, + "step": 3703 + }, + { + "epoch": 0.9268109595896409, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33856205.333333336, + "logits/rejected": -48558485.333333336, + "logps/chosen": -297.96803792317706, + "logps/rejected": -774.0730794270834, + "loss": 0.0423, + "rewards/chosen": 7.227203369140625, + "rewards/margins": 34.115529378255204, + "rewards/rejected": -26.888326009114582, + "step": 3704 + }, + { + "epoch": 0.9270611785312148, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47229600.0, + "logits/rejected": -62766129.23076923, + "logps/chosen": -386.07692649147725, + "logps/rejected": -801.9137620192307, + "loss": 0.0138, + "rewards/chosen": 9.968229814009232, + "rewards/margins": 34.25131556370875, + "rewards/rejected": -24.28308574969952, + "step": 3705 + }, + { + "epoch": 0.9273113974727887, + "grad_norm": 4.5, + "kl": 9.980598449707031, + "learning_rate": 5e-06, + "logits/chosen": -61470674.28571428, + "logits/rejected": -20664419.2, + "logps/chosen": -482.77260044642856, + "logps/rejected": -643.902490234375, + "loss": 0.0143, + "rewards/chosen": 10.542388916015625, + "rewards/margins": 26.01171875, + "rewards/rejected": -15.469329833984375, + "step": 3706 + }, + { + "epoch": 0.9275616164143625, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17888912.0, + "logits/rejected": -62861909.333333336, + "logps/chosen": -340.6318088107639, + "logps/rejected": -722.9623046875, + "loss": 0.0095, + "rewards/chosen": 7.426587422688802, + "rewards/margins": 32.73001556396484, + "rewards/rejected": -25.30342814127604, + "step": 3707 + }, + { + "epoch": 0.9278118353559365, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56353472.0, + "logits/rejected": -31678795.636363637, + "logps/chosen": -328.4323167067308, + "logps/rejected": -417.66592684659093, + "loss": 0.0114, + "rewards/chosen": 8.44217036320613, + "rewards/margins": 23.704353812691217, + "rewards/rejected": -15.262183449485086, + "step": 3708 + }, + { + "epoch": 0.9280620542975103, + "grad_norm": 0.64453125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55930984.72727273, + "logits/rejected": -70537737.84615384, + "logps/chosen": -492.87375710227275, + "logps/rejected": -946.8109975961538, + "loss": 0.0055, + "rewards/chosen": 11.676055908203125, + "rewards/margins": 46.53683941180889, + "rewards/rejected": -34.86078350360577, + "step": 3709 + }, + { + "epoch": 0.9283122732390842, + "grad_norm": 4.6875, + "kl": 9.946484565734863, + "learning_rate": 5e-06, + "logits/chosen": -30531054.769230768, + "logits/rejected": -55394978.90909091, + "logps/chosen": -399.21567007211536, + "logps/rejected": -547.6901189630681, + "loss": 0.0153, + "rewards/chosen": 9.143834627591646, + "rewards/margins": 23.717856400496476, + "rewards/rejected": -14.57402177290483, + "step": 3710 + }, + { + "epoch": 0.9285624921806581, + "grad_norm": 3.34375, + "kl": 1.688489317893982, + "learning_rate": 5e-06, + "logits/chosen": -36479645.538461536, + "logits/rejected": -64894952.72727273, + "logps/chosen": -425.63172325721155, + "logps/rejected": -746.2611860795455, + "loss": 0.0406, + "rewards/chosen": 10.068436842698317, + "rewards/margins": 34.220758398096045, + "rewards/rejected": -24.152321555397727, + "step": 3711 + }, + { + "epoch": 0.928812711122232, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38882144.0, + "logits/rejected": -33640730.18181818, + "logps/chosen": -400.70120943509613, + "logps/rejected": -835.4429154829545, + "loss": 0.0281, + "rewards/chosen": 9.450376657339243, + "rewards/margins": 36.150899340222765, + "rewards/rejected": -26.700522682883523, + "step": 3712 + }, + { + "epoch": 0.9290629300638058, + "grad_norm": 20.5, + "kl": 4.710544586181641, + "learning_rate": 5e-06, + "logits/chosen": -87594067.2, + "logits/rejected": -32256434.285714287, + "logps/chosen": -576.19033203125, + "logps/rejected": -617.7624162946429, + "loss": 0.0332, + "rewards/chosen": 11.363107299804687, + "rewards/margins": 27.951182120186942, + "rewards/rejected": -16.588074820382253, + "step": 3713 + }, + { + "epoch": 0.9293131490053798, + "grad_norm": 6.78125, + "kl": 8.423606872558594, + "learning_rate": 5e-06, + "logits/chosen": -47330408.72727273, + "logits/rejected": -27152969.846153848, + "logps/chosen": -365.7396129261364, + "logps/rejected": -658.6613581730769, + "loss": 0.071, + "rewards/chosen": 7.067854447798296, + "rewards/margins": 29.247117049210555, + "rewards/rejected": -22.17926260141226, + "step": 3714 + }, + { + "epoch": 0.9295633679469536, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47833331.2, + "logits/rejected": -45388032.0, + "logps/chosen": -304.074267578125, + "logps/rejected": -640.4561244419643, + "loss": 0.0226, + "rewards/chosen": 6.993989562988281, + "rewards/margins": 29.67812826974051, + "rewards/rejected": -22.684138706752233, + "step": 3715 + }, + { + "epoch": 0.9298135868885274, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47868002.461538464, + "logits/rejected": -32947729.454545453, + "logps/chosen": -365.99673227163464, + "logps/rejected": -587.7440962357955, + "loss": 0.0407, + "rewards/chosen": 9.957493708683895, + "rewards/margins": 34.6107085968231, + "rewards/rejected": -24.653214888139203, + "step": 3716 + }, + { + "epoch": 0.9300638058301013, + "grad_norm": 1.3125, + "kl": 13.263101577758789, + "learning_rate": 5e-06, + "logits/chosen": -56243035.428571425, + "logits/rejected": -46777529.6, + "logps/chosen": -412.14571707589283, + "logps/rejected": -647.4697265625, + "loss": 0.0293, + "rewards/chosen": 8.966896057128906, + "rewards/margins": 30.687646484375, + "rewards/rejected": -21.720750427246095, + "step": 3717 + }, + { + "epoch": 0.9303140247716752, + "grad_norm": 1.359375, + "kl": 8.214506149291992, + "learning_rate": 5e-06, + "logits/chosen": -45975008.0, + "logits/rejected": -43574688.0, + "logps/chosen": -447.02845982142856, + "logps/rejected": -699.840380859375, + "loss": 0.0011, + "rewards/chosen": 10.724009922572545, + "rewards/margins": 31.35533621651786, + "rewards/rejected": -20.631326293945314, + "step": 3718 + }, + { + "epoch": 0.9305642437132491, + "grad_norm": 3.65625, + "kl": 4.150592803955078, + "learning_rate": 5e-06, + "logits/chosen": -51912950.15384615, + "logits/rejected": -53771502.54545455, + "logps/chosen": -489.05258413461536, + "logps/rejected": -893.3158735795455, + "loss": 0.0126, + "rewards/chosen": 9.8624267578125, + "rewards/margins": 38.26237349076705, + "rewards/rejected": -28.399946732954547, + "step": 3719 + }, + { + "epoch": 0.9308144626548229, + "grad_norm": 1.3046875, + "kl": 0.007616996765136719, + "learning_rate": 5e-06, + "logits/chosen": -26754808.0, + "logits/rejected": -74242970.66666667, + "logps/chosen": -282.1256103515625, + "logps/rejected": -735.69189453125, + "loss": 0.0322, + "rewards/chosen": 7.477465311686198, + "rewards/margins": 33.24460093180338, + "rewards/rejected": -25.767135620117188, + "step": 3720 + }, + { + "epoch": 0.9310646815963969, + "grad_norm": 11.9375, + "kl": 3.1887454986572266, + "learning_rate": 5e-06, + "logits/chosen": -57972976.0, + "logits/rejected": -65607861.333333336, + "logps/chosen": -445.2245686848958, + "logps/rejected": -745.7523600260416, + "loss": 0.0187, + "rewards/chosen": 9.01176643371582, + "rewards/margins": 36.364722569783524, + "rewards/rejected": -27.352956136067707, + "step": 3721 + }, + { + "epoch": 0.9313149005379707, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51586737.777777776, + "logits/rejected": -36257745.06666667, + "logps/chosen": -512.1311306423611, + "logps/rejected": -490.779296875, + "loss": 0.0055, + "rewards/chosen": 11.203750610351562, + "rewards/margins": 29.190721638997395, + "rewards/rejected": -17.986971028645833, + "step": 3722 + }, + { + "epoch": 0.9315651194795446, + "grad_norm": 2.390625, + "kl": 0.46158599853515625, + "learning_rate": 5e-06, + "logits/chosen": -49872608.0, + "logits/rejected": -58409088.0, + "logps/chosen": -417.1673177083333, + "logps/rejected": -623.9833170572916, + "loss": 0.035, + "rewards/chosen": 10.196973164876303, + "rewards/margins": 32.75976053873698, + "rewards/rejected": -22.562787373860676, + "step": 3723 + }, + { + "epoch": 0.9318153384211185, + "grad_norm": 6.0, + "kl": 0.8521296381950378, + "learning_rate": 5e-06, + "logits/chosen": -27727817.846153848, + "logits/rejected": -10215963.636363637, + "logps/chosen": -397.4178936298077, + "logps/rejected": -429.93794389204544, + "loss": 0.0449, + "rewards/chosen": 8.25039555476262, + "rewards/margins": 23.550047654371994, + "rewards/rejected": -15.299652099609375, + "step": 3724 + }, + { + "epoch": 0.9320655573626924, + "grad_norm": 11.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23437777.777777776, + "logits/rejected": -28600893.866666667, + "logps/chosen": -393.53911675347223, + "logps/rejected": -622.2699869791667, + "loss": 0.0587, + "rewards/chosen": 8.319374932183159, + "rewards/margins": 26.646492852105034, + "rewards/rejected": -18.327117919921875, + "step": 3725 + }, + { + "epoch": 0.9323157763042662, + "grad_norm": 9.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27401349.333333332, + "logits/rejected": -52528728.88888889, + "logps/chosen": -284.50889078776044, + "logps/rejected": -671.14794921875, + "loss": 0.0351, + "rewards/chosen": 7.721078236897786, + "rewards/margins": 30.882311079237194, + "rewards/rejected": -23.16123284233941, + "step": 3726 + }, + { + "epoch": 0.9325659952458402, + "grad_norm": 6.46875, + "kl": 0.6262067556381226, + "learning_rate": 5e-06, + "logits/chosen": -49242885.81818182, + "logits/rejected": -61143522.461538464, + "logps/chosen": -408.6328125, + "logps/rejected": -773.1281550480769, + "loss": 0.0188, + "rewards/chosen": 10.500030517578125, + "rewards/margins": 40.4487046461839, + "rewards/rejected": -29.94867412860577, + "step": 3727 + }, + { + "epoch": 0.932816214187414, + "grad_norm": 10.5, + "kl": 1.0335826873779297, + "learning_rate": 5e-06, + "logits/chosen": -59183572.0, + "logits/rejected": -72408160.0, + "logps/chosen": -386.2140808105469, + "logps/rejected": -688.2503051757812, + "loss": 0.0433, + "rewards/chosen": 9.209988594055176, + "rewards/margins": 26.654969215393066, + "rewards/rejected": -17.44498062133789, + "step": 3728 + }, + { + "epoch": 0.9330664331289878, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72591808.0, + "logits/rejected": -45603655.52941176, + "logps/chosen": -368.183837890625, + "logps/rejected": -809.3220358455883, + "loss": 0.0346, + "rewards/chosen": 6.652472904750279, + "rewards/margins": 33.504270569617006, + "rewards/rejected": -26.851797664866726, + "step": 3729 + }, + { + "epoch": 0.9333166520705617, + "grad_norm": 7.96875, + "kl": 4.339824676513672, + "learning_rate": 5e-06, + "logits/chosen": -58916240.0, + "logits/rejected": 52773594.666666664, + "logps/chosen": -267.23638916015625, + "logps/rejected": -931.5350748697916, + "loss": 0.0157, + "rewards/chosen": 8.109144846598307, + "rewards/margins": 35.10693232218424, + "rewards/rejected": -26.997787475585938, + "step": 3730 + }, + { + "epoch": 0.9335668710121356, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36150306.461538464, + "logits/rejected": 9082745.454545455, + "logps/chosen": -388.4695387620192, + "logps/rejected": -666.1118607954545, + "loss": 0.04, + "rewards/chosen": 6.263761667104868, + "rewards/margins": 30.762843365435835, + "rewards/rejected": -24.499081698330965, + "step": 3731 + }, + { + "epoch": 0.9338170899537095, + "grad_norm": 6.3125, + "kl": 4.371379852294922, + "learning_rate": 5e-06, + "logits/chosen": -46694469.81818182, + "logits/rejected": -28355153.230769232, + "logps/chosen": -458.1736949573864, + "logps/rejected": -431.35160006009613, + "loss": 0.0201, + "rewards/chosen": 11.083661166104404, + "rewards/margins": 29.901189523977003, + "rewards/rejected": -18.817528357872597, + "step": 3732 + }, + { + "epoch": 0.9340673088952833, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8285928.0, + "logits/rejected": -24230909.714285713, + "logps/chosen": -437.706982421875, + "logps/rejected": -631.3773716517857, + "loss": 0.0107, + "rewards/chosen": 9.065396881103515, + "rewards/margins": 33.61173782348633, + "rewards/rejected": -24.546340942382812, + "step": 3733 + }, + { + "epoch": 0.9343175278368573, + "grad_norm": 2.515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38453646.76923077, + "logits/rejected": -74616424.72727273, + "logps/chosen": -290.60263296274036, + "logps/rejected": -844.4512606534091, + "loss": 0.0815, + "rewards/chosen": 7.211082458496094, + "rewards/margins": 40.52576307816939, + "rewards/rejected": -33.3146806196733, + "step": 3734 + }, + { + "epoch": 0.9345677467784311, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36598186.666666664, + "logits/rejected": -41392834.666666664, + "logps/chosen": -324.5240478515625, + "logps/rejected": -814.3972981770834, + "loss": 0.0125, + "rewards/chosen": 9.42100461324056, + "rewards/margins": 37.825896581014, + "rewards/rejected": -28.404891967773438, + "step": 3735 + }, + { + "epoch": 0.934817965720005, + "grad_norm": 2.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -67296723.2, + "logits/rejected": -42944714.10526316, + "logps/chosen": -427.38935546875, + "logps/rejected": -637.7658305921053, + "loss": 0.0074, + "rewards/chosen": 10.53394012451172, + "rewards/margins": 35.62255353425678, + "rewards/rejected": -25.088613409745065, + "step": 3736 + }, + { + "epoch": 0.9350681846615789, + "grad_norm": 2.390625, + "kl": 4.419887542724609, + "learning_rate": 5e-06, + "logits/chosen": -41882102.15384615, + "logits/rejected": 76265431.27272727, + "logps/chosen": -520.6777719350962, + "logps/rejected": -668.8429509943181, + "loss": 0.0028, + "rewards/chosen": 12.890312781700722, + "rewards/margins": 40.335275716714925, + "rewards/rejected": -27.444962935014203, + "step": 3737 + }, + { + "epoch": 0.9353184036031528, + "grad_norm": 1.3046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60143084.307692304, + "logits/rejected": -50749952.0, + "logps/chosen": -351.4051983173077, + "logps/rejected": -680.7304243607955, + "loss": 0.0026, + "rewards/chosen": 9.971133892352764, + "rewards/margins": 35.6921976796397, + "rewards/rejected": -25.721063787286933, + "step": 3738 + }, + { + "epoch": 0.9355686225447266, + "grad_norm": 18.25, + "kl": 3.025336265563965, + "learning_rate": 5e-06, + "logits/chosen": -31696558.769230768, + "logits/rejected": -33734301.09090909, + "logps/chosen": -426.8505108173077, + "logps/rejected": -553.9435813210227, + "loss": 0.0402, + "rewards/chosen": 10.397782545823317, + "rewards/margins": 28.228139490514366, + "rewards/rejected": -17.83035694469105, + "step": 3739 + }, + { + "epoch": 0.9358188414863006, + "grad_norm": 1.1484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47182820.571428575, + "logits/rejected": -59330508.8, + "logps/chosen": -420.0761021205357, + "logps/rejected": -798.369140625, + "loss": 0.003, + "rewards/chosen": 9.45977783203125, + "rewards/margins": 36.832254028320314, + "rewards/rejected": -27.372476196289064, + "step": 3740 + }, + { + "epoch": 0.9360690604278744, + "grad_norm": 8.5625, + "kl": 9.53818416595459, + "learning_rate": 5e-06, + "logits/chosen": -49429549.71428572, + "logits/rejected": -36252153.6, + "logps/chosen": -319.5391322544643, + "logps/rejected": -710.951904296875, + "loss": 0.0725, + "rewards/chosen": 6.999656677246094, + "rewards/margins": 23.896250915527343, + "rewards/rejected": -16.89659423828125, + "step": 3741 + }, + { + "epoch": 0.9363192793694483, + "grad_norm": 2.65625, + "kl": 2.960111141204834, + "learning_rate": 5e-06, + "logits/chosen": -32384626.285714287, + "logits/rejected": -39933443.2, + "logps/chosen": -424.60518973214283, + "logps/rejected": -655.7033203125, + "loss": 0.0248, + "rewards/chosen": 8.663845607212611, + "rewards/margins": 31.505135890415737, + "rewards/rejected": -22.841290283203126, + "step": 3742 + }, + { + "epoch": 0.9365694983110221, + "grad_norm": 1.1015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19679116.307692308, + "logits/rejected": -48105879.27272727, + "logps/chosen": -381.67202524038464, + "logps/rejected": -661.2387251420455, + "loss": 0.0179, + "rewards/chosen": 10.64020244891827, + "rewards/margins": 32.50675462842821, + "rewards/rejected": -21.86655217950994, + "step": 3743 + }, + { + "epoch": 0.936819717252596, + "grad_norm": 2.015625, + "kl": 6.166493892669678, + "learning_rate": 5e-06, + "logits/chosen": -23343220.363636363, + "logits/rejected": -22544152.615384616, + "logps/chosen": -278.97270063920456, + "logps/rejected": -705.8221153846154, + "loss": 0.0635, + "rewards/chosen": 8.095463145862926, + "rewards/margins": 30.49448986987134, + "rewards/rejected": -22.399026724008415, + "step": 3744 + }, + { + "epoch": 0.9370699361941699, + "grad_norm": 7.5, + "kl": 7.7577619552612305, + "learning_rate": 5e-06, + "logits/chosen": -43230653.333333336, + "logits/rejected": -28837216.0, + "logps/chosen": -458.3128255208333, + "logps/rejected": -679.1248372395834, + "loss": 0.0099, + "rewards/chosen": 11.211034138997396, + "rewards/margins": 34.11735280354818, + "rewards/rejected": -22.90631866455078, + "step": 3745 + }, + { + "epoch": 0.9373201551357437, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60484676.571428575, + "logits/rejected": -49555020.8, + "logps/chosen": -321.01942661830356, + "logps/rejected": -772.556396484375, + "loss": 0.0155, + "rewards/chosen": 9.171854291643415, + "rewards/margins": 34.971402631487166, + "rewards/rejected": -25.79954833984375, + "step": 3746 + }, + { + "epoch": 0.9375703740773177, + "grad_norm": 8.0625, + "kl": 4.670871734619141, + "learning_rate": 5e-06, + "logits/chosen": -59696077.71428572, + "logits/rejected": -20999092.8, + "logps/chosen": -470.66552734375, + "logps/rejected": -695.096484375, + "loss": 0.0129, + "rewards/chosen": 10.515856061662946, + "rewards/margins": 30.583046613420755, + "rewards/rejected": -20.06719055175781, + "step": 3747 + }, + { + "epoch": 0.9378205930188915, + "grad_norm": 0.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65790236.44444445, + "logits/rejected": 33705531.733333334, + "logps/chosen": -409.43028428819446, + "logps/rejected": -638.7223958333333, + "loss": 0.0325, + "rewards/chosen": 9.500165303548178, + "rewards/margins": 30.179209899902347, + "rewards/rejected": -20.679044596354167, + "step": 3748 + }, + { + "epoch": 0.9380708119604654, + "grad_norm": 0.7578125, + "kl": 0.13533911108970642, + "learning_rate": 5e-06, + "logits/chosen": -46760029.538461536, + "logits/rejected": -21868519.272727273, + "logps/chosen": -359.56107271634613, + "logps/rejected": -947.2347301136364, + "loss": 0.0064, + "rewards/chosen": 9.611349252554087, + "rewards/margins": 37.564108041616585, + "rewards/rejected": -27.9527587890625, + "step": 3749 + }, + { + "epoch": 0.9383210309020393, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16086424.0, + "logits/rejected": -54801546.10526316, + "logps/chosen": -328.6706298828125, + "logps/rejected": -545.116365131579, + "loss": 0.0265, + "rewards/chosen": 6.712380218505859, + "rewards/margins": 24.638511617560138, + "rewards/rejected": -17.926131399054277, + "step": 3750 + }, + { + "epoch": 0.9385712498436132, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34635584.0, + "logits/rejected": -25193022.0, + "logps/chosen": -425.52740478515625, + "logps/rejected": -719.9127807617188, + "loss": 0.0233, + "rewards/chosen": 10.331106185913086, + "rewards/margins": 31.494319915771484, + "rewards/rejected": -21.1632137298584, + "step": 3751 + }, + { + "epoch": 0.938821468785187, + "grad_norm": 3.265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2163763.2, + "logits/rejected": -28687675.42857143, + "logps/chosen": -312.575244140625, + "logps/rejected": -690.8864397321429, + "loss": 0.0665, + "rewards/chosen": 6.536237335205078, + "rewards/margins": 30.743754686628066, + "rewards/rejected": -24.20751735142299, + "step": 3752 + }, + { + "epoch": 0.9390716877267609, + "grad_norm": 1.2578125, + "kl": 4.232607841491699, + "learning_rate": 5e-06, + "logits/chosen": -23613708.8, + "logits/rejected": -54907004.44444445, + "logps/chosen": -430.9554036458333, + "logps/rejected": -627.6022677951389, + "loss": 0.0027, + "rewards/chosen": 8.968729654947916, + "rewards/margins": 27.106306287977432, + "rewards/rejected": -18.137576633029514, + "step": 3753 + }, + { + "epoch": 0.9393219066683348, + "grad_norm": 5.375, + "kl": 12.011497497558594, + "learning_rate": 5e-06, + "logits/chosen": -55577384.0, + "logits/rejected": -48588024.0, + "logps/chosen": -386.369873046875, + "logps/rejected": -437.56298828125, + "loss": 0.0289, + "rewards/chosen": 9.366955757141113, + "rewards/margins": 24.25215244293213, + "rewards/rejected": -14.885196685791016, + "step": 3754 + }, + { + "epoch": 0.9395721256099087, + "grad_norm": 17.5, + "kl": 3.341841459274292, + "learning_rate": 5e-06, + "logits/chosen": -46243566.54545455, + "logits/rejected": -41828952.615384616, + "logps/chosen": -371.03848544034093, + "logps/rejected": -585.87451171875, + "loss": 0.1075, + "rewards/chosen": 9.532931241122158, + "rewards/margins": 26.69799729994127, + "rewards/rejected": -17.16506605881911, + "step": 3755 + }, + { + "epoch": 0.9398223445514825, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32987854.769230768, + "logits/rejected": -24177597.09090909, + "logps/chosen": -402.4352463942308, + "logps/rejected": -724.7319779829545, + "loss": 0.0527, + "rewards/chosen": 9.85379145695613, + "rewards/margins": 30.4667023345307, + "rewards/rejected": -20.612910877574574, + "step": 3756 + }, + { + "epoch": 0.9400725634930565, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30146646.85714286, + "logits/rejected": -35832256.0, + "logps/chosen": -294.4813755580357, + "logps/rejected": -815.4399701286765, + "loss": 0.0155, + "rewards/chosen": 7.019434247698102, + "rewards/margins": 30.104617784003253, + "rewards/rejected": -23.08518353630515, + "step": 3757 + }, + { + "epoch": 0.9403227824346303, + "grad_norm": 14.0625, + "kl": 8.94985580444336, + "learning_rate": 5e-06, + "logits/chosen": -32803754.666666668, + "logits/rejected": -49997429.333333336, + "logps/chosen": -354.2955729166667, + "logps/rejected": -758.4219563802084, + "loss": 0.1008, + "rewards/chosen": 9.271331151326498, + "rewards/margins": 30.976027806599937, + "rewards/rejected": -21.704696655273438, + "step": 3758 + }, + { + "epoch": 0.9405730013762041, + "grad_norm": 1.4140625, + "kl": 7.016010284423828, + "learning_rate": 5e-06, + "logits/chosen": -3993308.923076923, + "logits/rejected": -17077384.727272727, + "logps/chosen": -403.1871995192308, + "logps/rejected": -758.7692649147727, + "loss": 0.0112, + "rewards/chosen": 10.312451876126802, + "rewards/margins": 34.32435373159555, + "rewards/rejected": -24.01190185546875, + "step": 3759 + }, + { + "epoch": 0.9408232203177781, + "grad_norm": 1.109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43842537.6, + "logits/rejected": -28131154.285714287, + "logps/chosen": -349.61201171875, + "logps/rejected": -533.9872349330357, + "loss": 0.0103, + "rewards/chosen": 10.215606689453125, + "rewards/margins": 29.725306919642858, + "rewards/rejected": -19.509700230189733, + "step": 3760 + }, + { + "epoch": 0.9410734392593519, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -86298861.71428572, + "logits/rejected": -51930057.6, + "logps/chosen": -425.345703125, + "logps/rejected": -621.291845703125, + "loss": 0.0216, + "rewards/chosen": 9.118870326450892, + "rewards/margins": 26.624976893833704, + "rewards/rejected": -17.506106567382812, + "step": 3761 + }, + { + "epoch": 0.9413236582009258, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49416675.55555555, + "logits/rejected": -63749529.6, + "logps/chosen": -444.2258572048611, + "logps/rejected": -730.9154296875, + "loss": 0.0564, + "rewards/chosen": 8.821803622775608, + "rewards/margins": 33.69162784152561, + "rewards/rejected": -24.86982421875, + "step": 3762 + }, + { + "epoch": 0.9415738771424997, + "grad_norm": 11.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40426088.72727273, + "logits/rejected": -47342759.384615384, + "logps/chosen": -449.34596946022725, + "logps/rejected": -553.3724834735577, + "loss": 0.0403, + "rewards/chosen": 10.6955302845348, + "rewards/margins": 25.70827606841401, + "rewards/rejected": -15.012745783879208, + "step": 3763 + }, + { + "epoch": 0.9418240960840736, + "grad_norm": 8.9375, + "kl": 8.958549499511719, + "learning_rate": 5e-06, + "logits/chosen": -22657958.0, + "logits/rejected": -23025836.0, + "logps/chosen": -419.4216613769531, + "logps/rejected": -569.431640625, + "loss": 0.0746, + "rewards/chosen": 9.36103343963623, + "rewards/margins": 23.90878963470459, + "rewards/rejected": -14.54775619506836, + "step": 3764 + }, + { + "epoch": 0.9420743150256474, + "grad_norm": 0.0262451171875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65783824.0, + "logits/rejected": -46325248.0, + "logps/chosen": -520.4666748046875, + "logps/rejected": -705.7529296875, + "loss": 0.0, + "rewards/chosen": 13.508212089538574, + "rewards/margins": 37.70999240875244, + "rewards/rejected": -24.201780319213867, + "step": 3765 + }, + { + "epoch": 0.9423245339672213, + "grad_norm": 1.046875, + "kl": 3.8747966289520264, + "learning_rate": 5e-06, + "logits/chosen": -53797978.666666664, + "logits/rejected": -42411189.333333336, + "logps/chosen": -458.496337890625, + "logps/rejected": -627.2190755208334, + "loss": 0.0108, + "rewards/chosen": 10.138379414876303, + "rewards/margins": 31.657127380371094, + "rewards/rejected": -21.518747965494793, + "step": 3766 + }, + { + "epoch": 0.9425747529087952, + "grad_norm": 4.28125, + "kl": 7.057338237762451, + "learning_rate": 5e-06, + "logits/chosen": -48369382.4, + "logits/rejected": -76089528.8888889, + "logps/chosen": -437.45091145833334, + "logps/rejected": -643.6836480034722, + "loss": 0.0201, + "rewards/chosen": 8.683896891276042, + "rewards/margins": 31.425699530707465, + "rewards/rejected": -22.741802639431423, + "step": 3767 + }, + { + "epoch": 0.9428249718503691, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33344704.0, + "logits/rejected": -40090994.28571428, + "logps/chosen": -311.484326171875, + "logps/rejected": -692.8228236607143, + "loss": 0.0447, + "rewards/chosen": 7.7175453186035154, + "rewards/margins": 29.06184027535575, + "rewards/rejected": -21.344294956752233, + "step": 3768 + }, + { + "epoch": 0.9430751907919429, + "grad_norm": 1.8515625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39092280.0, + "logits/rejected": -60036408.0, + "logps/chosen": -319.80633544921875, + "logps/rejected": -735.3450927734375, + "loss": 0.0412, + "rewards/chosen": 8.660524368286133, + "rewards/margins": 34.95990562438965, + "rewards/rejected": -26.299381256103516, + "step": 3769 + }, + { + "epoch": 0.9433254097335169, + "grad_norm": 0.640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40973193.6, + "logits/rejected": -55989330.28571428, + "logps/chosen": -374.49833984375, + "logps/rejected": -715.5750558035714, + "loss": 0.0135, + "rewards/chosen": 10.01400146484375, + "rewards/margins": 31.73235909598214, + "rewards/rejected": -21.718357631138392, + "step": 3770 + }, + { + "epoch": 0.9435756286750907, + "grad_norm": 2.890625, + "kl": 1.5402755737304688, + "learning_rate": 5e-06, + "logits/chosen": -41408905.84615385, + "logits/rejected": -26084264.727272727, + "logps/chosen": -401.9069260817308, + "logps/rejected": -871.4486860795455, + "loss": 0.045, + "rewards/chosen": 8.598973787747896, + "rewards/margins": 35.644078768216644, + "rewards/rejected": -27.04510498046875, + "step": 3771 + }, + { + "epoch": 0.9438258476166645, + "grad_norm": 33.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11237077.818181818, + "logits/rejected": -14647158.153846154, + "logps/chosen": -513.8740678267045, + "logps/rejected": -755.6401742788462, + "loss": 0.0299, + "rewards/chosen": 8.47186348655007, + "rewards/margins": 27.952313456501994, + "rewards/rejected": -19.480449969951923, + "step": 3772 + }, + { + "epoch": 0.9440760665582385, + "grad_norm": 0.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37644730.18181818, + "logits/rejected": -59118217.84615385, + "logps/chosen": -383.2120472301136, + "logps/rejected": -738.6766826923077, + "loss": 0.0095, + "rewards/chosen": 9.43123418634588, + "rewards/margins": 36.4097052087317, + "rewards/rejected": -26.97847102238582, + "step": 3773 + }, + { + "epoch": 0.9443262854998123, + "grad_norm": 2.796875, + "kl": 11.554250717163086, + "learning_rate": 5e-06, + "logits/chosen": -53918005.333333336, + "logits/rejected": -53164677.333333336, + "logps/chosen": -406.2027994791667, + "logps/rejected": -762.990966796875, + "loss": 0.0909, + "rewards/chosen": 10.787008921305338, + "rewards/margins": 33.900404612223305, + "rewards/rejected": -23.11339569091797, + "step": 3774 + }, + { + "epoch": 0.9445765044413862, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53935409.777777776, + "logits/rejected": -52801352.53333333, + "logps/chosen": -302.1626790364583, + "logps/rejected": -569.7979166666667, + "loss": 0.013, + "rewards/chosen": 7.197200351291233, + "rewards/margins": 25.57388492160373, + "rewards/rejected": -18.3766845703125, + "step": 3775 + }, + { + "epoch": 0.9448267233829601, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55364387.55555555, + "logits/rejected": -38565789.86666667, + "logps/chosen": -368.0413411458333, + "logps/rejected": -511.10911458333334, + "loss": 0.0366, + "rewards/chosen": 7.864317152235243, + "rewards/margins": 27.399892510308156, + "rewards/rejected": -19.535575358072915, + "step": 3776 + }, + { + "epoch": 0.945076942324534, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40192880.0, + "logits/rejected": -91861208.0, + "logps/chosen": -410.23138427734375, + "logps/rejected": -1197.84423828125, + "loss": 0.0111, + "rewards/chosen": 10.929574966430664, + "rewards/margins": 52.52945518493652, + "rewards/rejected": -41.59988021850586, + "step": 3777 + }, + { + "epoch": 0.9453271612661078, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40051108.571428575, + "logits/rejected": -39784614.4, + "logps/chosen": -316.51559012276783, + "logps/rejected": -648.75126953125, + "loss": 0.0468, + "rewards/chosen": 7.9384662083217075, + "rewards/margins": 29.15797914777483, + "rewards/rejected": -21.219512939453125, + "step": 3778 + }, + { + "epoch": 0.9455773802076817, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40503266.13333333, + "logits/rejected": -53907648.0, + "logps/chosen": -390.42766927083335, + "logps/rejected": -595.6471354166666, + "loss": 0.0354, + "rewards/chosen": 8.561541748046874, + "rewards/margins": 30.970933363172744, + "rewards/rejected": -22.40939161512587, + "step": 3779 + }, + { + "epoch": 0.9458275991492556, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59208885.333333336, + "logits/rejected": -58841787.733333334, + "logps/chosen": -327.1113009982639, + "logps/rejected": -759.5977864583333, + "loss": 0.0244, + "rewards/chosen": 9.334747314453125, + "rewards/margins": 33.580501302083334, + "rewards/rejected": -24.24575398763021, + "step": 3780 + }, + { + "epoch": 0.9460778180908295, + "grad_norm": 0.58203125, + "kl": 5.253135681152344, + "learning_rate": 5e-06, + "logits/chosen": -74490830.76923077, + "logits/rejected": -33292805.818181816, + "logps/chosen": -414.23035606971155, + "logps/rejected": -427.10964133522725, + "loss": 0.0013, + "rewards/chosen": 11.294388991135817, + "rewards/margins": 26.21016463699874, + "rewards/rejected": -14.915775645862926, + "step": 3781 + }, + { + "epoch": 0.9463280370324033, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20576890.666666668, + "logits/rejected": -14607856.0, + "logps/chosen": -334.4853515625, + "logps/rejected": -800.1109212239584, + "loss": 0.0361, + "rewards/chosen": 6.976909637451172, + "rewards/margins": 31.142009735107422, + "rewards/rejected": -24.16510009765625, + "step": 3782 + }, + { + "epoch": 0.9465782559739773, + "grad_norm": 1.09375, + "kl": 11.445440292358398, + "learning_rate": 5e-06, + "logits/chosen": -54333213.538461536, + "logits/rejected": -38881413.81818182, + "logps/chosen": -430.8821364182692, + "logps/rejected": -674.87841796875, + "loss": 0.034, + "rewards/chosen": 10.138096736027645, + "rewards/margins": 35.76309673602765, + "rewards/rejected": -25.625, + "step": 3783 + }, + { + "epoch": 0.9468284749155511, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -59167424.0, + "logits/rejected": -27012451.2, + "logps/chosen": -332.4685756138393, + "logps/rejected": -661.6291015625, + "loss": 0.0441, + "rewards/chosen": 8.515264238630023, + "rewards/margins": 26.76898433140346, + "rewards/rejected": -18.253720092773438, + "step": 3784 + }, + { + "epoch": 0.947078693857125, + "grad_norm": 5.40625, + "kl": 8.7257080078125, + "learning_rate": 5e-06, + "logits/chosen": -47117354.666666664, + "logits/rejected": 75006890.66666667, + "logps/chosen": -394.72958984375, + "logps/rejected": -714.5469835069445, + "loss": 0.0134, + "rewards/chosen": 10.296828206380209, + "rewards/margins": 31.86348876953125, + "rewards/rejected": -21.566660563151043, + "step": 3785 + }, + { + "epoch": 0.9473289127986989, + "grad_norm": 4.03125, + "kl": 5.014748573303223, + "learning_rate": 5e-06, + "logits/chosen": -52380571.428571425, + "logits/rejected": -35608057.6, + "logps/chosen": -378.90073939732144, + "logps/rejected": -600.69052734375, + "loss": 0.0804, + "rewards/chosen": 9.719670976911273, + "rewards/margins": 28.280777849469864, + "rewards/rejected": -18.561106872558593, + "step": 3786 + }, + { + "epoch": 0.9475791317402728, + "grad_norm": 0.47265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -50876924.44444445, + "logits/rejected": -48648499.2, + "logps/chosen": -385.52745225694446, + "logps/rejected": -671.1296223958333, + "loss": 0.0018, + "rewards/chosen": 9.264128790961372, + "rewards/margins": 32.44622717963325, + "rewards/rejected": -23.182098388671875, + "step": 3787 + }, + { + "epoch": 0.9478293506818466, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65908036.92307692, + "logits/rejected": -36172296.72727273, + "logps/chosen": -477.62267127403845, + "logps/rejected": -635.6352982954545, + "loss": 0.0481, + "rewards/chosen": 8.829953120304989, + "rewards/margins": 32.68953064605073, + "rewards/rejected": -23.85957752574574, + "step": 3788 + }, + { + "epoch": 0.9480795696234205, + "grad_norm": 1.28125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32335712.0, + "logits/rejected": -48316750.76923077, + "logps/chosen": -364.0311390269886, + "logps/rejected": -651.7323467548077, + "loss": 0.0566, + "rewards/chosen": 9.95227744362571, + "rewards/margins": 37.16128529368581, + "rewards/rejected": -27.209007850060097, + "step": 3789 + }, + { + "epoch": 0.9483297885649944, + "grad_norm": 1.5859375, + "kl": 8.490150451660156, + "learning_rate": 5e-06, + "logits/chosen": -19082315.42857143, + "logits/rejected": -39224995.2, + "logps/chosen": -345.82603236607144, + "logps/rejected": -895.23544921875, + "loss": 0.0599, + "rewards/chosen": 7.360896519252232, + "rewards/margins": 36.67867911202567, + "rewards/rejected": -29.317782592773437, + "step": 3790 + }, + { + "epoch": 0.9485800075065682, + "grad_norm": 3.328125, + "kl": 5.8852858543396, + "learning_rate": 5e-06, + "logits/chosen": -58795672.615384616, + "logits/rejected": -63537384.72727273, + "logps/chosen": -418.384765625, + "logps/rejected": -713.4153497869319, + "loss": 0.0087, + "rewards/chosen": 9.715853177584135, + "rewards/margins": 34.93949986171056, + "rewards/rejected": -25.22364668412642, + "step": 3791 + }, + { + "epoch": 0.9488302264481421, + "grad_norm": 5.25, + "kl": 4.169834136962891, + "learning_rate": 5e-06, + "logits/chosen": -31484921.14285714, + "logits/rejected": -78941401.6, + "logps/chosen": -295.61781529017856, + "logps/rejected": -879.0400390625, + "loss": 0.0431, + "rewards/chosen": 7.6694199698311945, + "rewards/margins": 31.34405681065151, + "rewards/rejected": -23.674636840820312, + "step": 3792 + }, + { + "epoch": 0.949080445389716, + "grad_norm": 0.91015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53337629.538461536, + "logits/rejected": -77459421.0909091, + "logps/chosen": -377.2835036057692, + "logps/rejected": -823.4643998579545, + "loss": 0.0127, + "rewards/chosen": 9.11517333984375, + "rewards/margins": 36.03184925426136, + "rewards/rejected": -26.916675914417613, + "step": 3793 + }, + { + "epoch": 0.9493306643312899, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -61373172.36363637, + "logits/rejected": -51723874.461538464, + "logps/chosen": -417.7513316761364, + "logps/rejected": -696.4618389423077, + "loss": 0.022, + "rewards/chosen": 11.831304376775568, + "rewards/margins": 36.82902729594624, + "rewards/rejected": -24.997722919170673, + "step": 3794 + }, + { + "epoch": 0.9495808832728637, + "grad_norm": 7.25, + "kl": 12.1416015625, + "learning_rate": 5e-06, + "logits/chosen": -67905614.76923077, + "logits/rejected": -23353488.0, + "logps/chosen": -387.5446965144231, + "logps/rejected": -621.1163441051136, + "loss": 0.0166, + "rewards/chosen": 9.56540738619291, + "rewards/margins": 28.625521466448593, + "rewards/rejected": -19.060114080255683, + "step": 3795 + }, + { + "epoch": 0.9498311022144377, + "grad_norm": 5.46875, + "kl": 1.939288854598999, + "learning_rate": 5e-06, + "logits/chosen": -50235898.666666664, + "logits/rejected": -58083562.666666664, + "logps/chosen": -372.0367431640625, + "logps/rejected": -579.6534830729166, + "loss": 0.0178, + "rewards/chosen": 8.998250325520834, + "rewards/margins": 28.534975687662758, + "rewards/rejected": -19.536725362141926, + "step": 3796 + }, + { + "epoch": 0.9500813211560115, + "grad_norm": 7.34375, + "kl": 0.25363922119140625, + "learning_rate": 5e-06, + "logits/chosen": -52697370.666666664, + "logits/rejected": -69250197.33333333, + "logps/chosen": -565.8408610026041, + "logps/rejected": -509.6327311197917, + "loss": 0.042, + "rewards/chosen": 10.185548146565756, + "rewards/margins": 27.066181182861328, + "rewards/rejected": -16.880633036295574, + "step": 3797 + }, + { + "epoch": 0.9503315400975854, + "grad_norm": 0.953125, + "kl": 8.429555892944336, + "learning_rate": 5e-06, + "logits/chosen": -55832167.384615384, + "logits/rejected": -27408093.09090909, + "logps/chosen": -468.6623347355769, + "logps/rejected": -892.8915127840909, + "loss": 0.0021, + "rewards/chosen": 11.061791053185097, + "rewards/margins": 35.336796046970605, + "rewards/rejected": -24.27500499378551, + "step": 3798 + }, + { + "epoch": 0.9505817590391593, + "grad_norm": 2.234375, + "kl": 11.407859802246094, + "learning_rate": 5e-06, + "logits/chosen": -51186924.307692304, + "logits/rejected": -30782376.727272727, + "logps/chosen": -343.0295973557692, + "logps/rejected": -504.5138050426136, + "loss": 0.0032, + "rewards/chosen": 9.838184650127705, + "rewards/margins": 28.28085044380668, + "rewards/rejected": -18.442665793678977, + "step": 3799 + }, + { + "epoch": 0.9508319779807332, + "grad_norm": 2.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31134425.6, + "logits/rejected": -63519701.333333336, + "logps/chosen": -409.1526692708333, + "logps/rejected": -691.6558159722222, + "loss": 0.0372, + "rewards/chosen": 9.865034993489584, + "rewards/margins": 32.599343872070314, + "rewards/rejected": -22.73430887858073, + "step": 3800 + }, + { + "epoch": 0.951082196922307, + "grad_norm": 5.15625, + "kl": 5.55655574798584, + "learning_rate": 5e-06, + "logits/chosen": -72932181.33333333, + "logits/rejected": -76402396.44444445, + "logps/chosen": -426.03297526041666, + "logps/rejected": -944.8736979166666, + "loss": 0.0183, + "rewards/chosen": 9.49543965657552, + "rewards/margins": 31.519039916992188, + "rewards/rejected": -22.023600260416668, + "step": 3801 + }, + { + "epoch": 0.9513324158638808, + "grad_norm": 4.5, + "kl": 3.1226768493652344, + "learning_rate": 5e-06, + "logits/chosen": -76730135.27272727, + "logits/rejected": -68339692.3076923, + "logps/chosen": -429.6572265625, + "logps/rejected": -692.8033353365385, + "loss": 0.0322, + "rewards/chosen": 10.942499334161932, + "rewards/margins": 33.83553757033982, + "rewards/rejected": -22.893038236177883, + "step": 3802 + }, + { + "epoch": 0.9515826348054548, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56076018.28571428, + "logits/rejected": -26332581.647058822, + "logps/chosen": -426.9267578125, + "logps/rejected": -758.7770565257352, + "loss": 0.0126, + "rewards/chosen": 9.774431501116071, + "rewards/margins": 34.82515171595982, + "rewards/rejected": -25.05072021484375, + "step": 3803 + }, + { + "epoch": 0.9518328537470286, + "grad_norm": 19.375, + "kl": 3.488145351409912, + "learning_rate": 5e-06, + "logits/chosen": -26593225.14285714, + "logits/rejected": -56995757.176470585, + "logps/chosen": -342.9404296875, + "logps/rejected": -663.2612591911765, + "loss": 0.0544, + "rewards/chosen": 10.389007568359375, + "rewards/margins": 28.248809814453125, + "rewards/rejected": -17.85980224609375, + "step": 3804 + }, + { + "epoch": 0.9520830726886025, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40587483.428571425, + "logits/rejected": -62314528.0, + "logps/chosen": -357.28909737723217, + "logps/rejected": -698.70244140625, + "loss": 0.0321, + "rewards/chosen": 9.805108206612724, + "rewards/margins": 30.95259050641741, + "rewards/rejected": -21.147482299804686, + "step": 3805 + }, + { + "epoch": 0.9523332916301764, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12591857.6, + "logits/rejected": -67513694.31578948, + "logps/chosen": -259.782861328125, + "logps/rejected": -598.796875, + "loss": 0.0277, + "rewards/chosen": 5.91339111328125, + "rewards/margins": 23.642562063116777, + "rewards/rejected": -17.729170949835527, + "step": 3806 + }, + { + "epoch": 0.9525835105717503, + "grad_norm": 8.625, + "kl": 2.2471747398376465, + "learning_rate": 5e-06, + "logits/chosen": -43714816.0, + "logits/rejected": -64175654.4, + "logps/chosen": -370.87869698660717, + "logps/rejected": -622.55576171875, + "loss": 0.0565, + "rewards/chosen": 8.174957820347377, + "rewards/margins": 26.869446345738, + "rewards/rejected": -18.694488525390625, + "step": 3807 + }, + { + "epoch": 0.9528337295133241, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24678897.777777776, + "logits/rejected": -34937514.666666664, + "logps/chosen": -460.6534830729167, + "logps/rejected": -655.025, + "loss": 0.0253, + "rewards/chosen": 9.071512858072916, + "rewards/margins": 27.85586954752604, + "rewards/rejected": -18.784356689453126, + "step": 3808 + }, + { + "epoch": 0.9530839484548981, + "grad_norm": 4.1875, + "kl": 6.0159077644348145, + "learning_rate": 5e-06, + "logits/chosen": -74684583.38461539, + "logits/rejected": -57846109.09090909, + "logps/chosen": -385.26355919471155, + "logps/rejected": -629.7611416903409, + "loss": 0.0534, + "rewards/chosen": 9.353405292217548, + "rewards/margins": 31.57022436181982, + "rewards/rejected": -22.216819069602273, + "step": 3809 + }, + { + "epoch": 0.9533341673964719, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -41737565.09090909, + "logits/rejected": -43249604.92307692, + "logps/chosen": -319.38645241477275, + "logps/rejected": -720.6691706730769, + "loss": 0.0231, + "rewards/chosen": 8.204455982555043, + "rewards/margins": 29.226794849742543, + "rewards/rejected": -21.0223388671875, + "step": 3810 + }, + { + "epoch": 0.9535843863380458, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -66808120.88888889, + "logits/rejected": -51564514.13333333, + "logps/chosen": -367.369384765625, + "logps/rejected": -730.6117838541667, + "loss": 0.009, + "rewards/chosen": 8.886767069498697, + "rewards/margins": 31.945116678873696, + "rewards/rejected": -23.058349609375, + "step": 3811 + }, + { + "epoch": 0.9538346052796197, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29688746.666666668, + "logits/rejected": -80733664.0, + "logps/chosen": -341.32375081380206, + "logps/rejected": -684.650390625, + "loss": 0.0485, + "rewards/chosen": 7.714158376057942, + "rewards/margins": 28.267192840576172, + "rewards/rejected": -20.55303446451823, + "step": 3812 + }, + { + "epoch": 0.9540848242211936, + "grad_norm": 10.9375, + "kl": 9.035867691040039, + "learning_rate": 5e-06, + "logits/chosen": -36842096.0, + "logits/rejected": -73049141.33333333, + "logps/chosen": -370.1439615885417, + "logps/rejected": -683.66357421875, + "loss": 0.11, + "rewards/chosen": 8.83652114868164, + "rewards/margins": 27.136847178141277, + "rewards/rejected": -18.300326029459637, + "step": 3813 + }, + { + "epoch": 0.9543350431627674, + "grad_norm": 5.84375, + "kl": 10.510923385620117, + "learning_rate": 5e-06, + "logits/chosen": -33939623.384615384, + "logits/rejected": -47484384.0, + "logps/chosen": -375.95838341346155, + "logps/rejected": -537.7414328835227, + "loss": 0.0416, + "rewards/chosen": 9.93990501990685, + "rewards/margins": 24.21783970119236, + "rewards/rejected": -14.277934681285512, + "step": 3814 + }, + { + "epoch": 0.9545852621043412, + "grad_norm": 1.5078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21632276.8, + "logits/rejected": -25065737.14285714, + "logps/chosen": -351.5560302734375, + "logps/rejected": -533.1035853794643, + "loss": 0.0298, + "rewards/chosen": 9.866575622558594, + "rewards/margins": 32.62342027936663, + "rewards/rejected": -22.756844656808035, + "step": 3815 + }, + { + "epoch": 0.9548354810459152, + "grad_norm": 0.99609375, + "kl": 0.12196986377239227, + "learning_rate": 5e-06, + "logits/chosen": -39988608.0, + "logits/rejected": -54998028.8, + "logps/chosen": -421.6563197544643, + "logps/rejected": -693.07021484375, + "loss": 0.0191, + "rewards/chosen": 9.631364004952568, + "rewards/margins": 36.0542979649135, + "rewards/rejected": -26.422933959960936, + "step": 3816 + }, + { + "epoch": 0.955085699987489, + "grad_norm": 1.7890625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -54795264.0, + "logits/rejected": 41952544.0, + "logps/chosen": -471.416259765625, + "logps/rejected": -682.9248046875, + "loss": 0.0039, + "rewards/chosen": 9.791301727294922, + "rewards/margins": 31.806022099086217, + "rewards/rejected": -22.014720371791295, + "step": 3817 + }, + { + "epoch": 0.9553359189290629, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13535209.333333334, + "logits/rejected": -36767306.666666664, + "logps/chosen": -404.706298828125, + "logps/rejected": -541.2242838541666, + "loss": 0.0229, + "rewards/chosen": 9.475298563639322, + "rewards/margins": 29.997957865397133, + "rewards/rejected": -20.522659301757812, + "step": 3818 + }, + { + "epoch": 0.9555861378706368, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44265088.0, + "logits/rejected": -61911642.666666664, + "logps/chosen": -340.550537109375, + "logps/rejected": -591.4286702473959, + "loss": 0.0644, + "rewards/chosen": 9.378558476765951, + "rewards/margins": 28.47223472595215, + "rewards/rejected": -19.0936762491862, + "step": 3819 + }, + { + "epoch": 0.9558363568122107, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38584206.54545455, + "logits/rejected": -30558355.692307692, + "logps/chosen": -275.25337357954544, + "logps/rejected": -556.8131009615385, + "loss": 0.0237, + "rewards/chosen": 8.516375454989346, + "rewards/margins": 29.08074876478502, + "rewards/rejected": -20.564373309795673, + "step": 3820 + }, + { + "epoch": 0.9560865757537845, + "grad_norm": 14.9375, + "kl": 0.35327786207199097, + "learning_rate": 5e-06, + "logits/chosen": -40974506.666666664, + "logits/rejected": -68638620.44444445, + "logps/chosen": -398.1684895833333, + "logps/rejected": -648.5995551215278, + "loss": 0.0447, + "rewards/chosen": 9.299566650390625, + "rewards/margins": 27.868817816840277, + "rewards/rejected": -18.569251166449654, + "step": 3821 + }, + { + "epoch": 0.9563367946953585, + "grad_norm": 3.84375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -49520516.92307692, + "logits/rejected": -29289876.363636363, + "logps/chosen": -325.5168644831731, + "logps/rejected": -375.64901455965907, + "loss": 0.0525, + "rewards/chosen": 8.785682091346153, + "rewards/margins": 22.543178318263767, + "rewards/rejected": -13.757496226917613, + "step": 3822 + }, + { + "epoch": 0.9565870136369323, + "grad_norm": 0.19140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -72421932.8, + "logits/rejected": -73482477.71428572, + "logps/chosen": -474.848388671875, + "logps/rejected": -772.5800083705357, + "loss": 0.0004, + "rewards/chosen": 9.096759796142578, + "rewards/margins": 35.16759981427874, + "rewards/rejected": -26.07084001813616, + "step": 3823 + }, + { + "epoch": 0.9568372325785062, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20718141.53846154, + "logits/rejected": -41214807.27272727, + "logps/chosen": -260.9956805889423, + "logps/rejected": -487.42764559659093, + "loss": 0.0386, + "rewards/chosen": 7.06999030480018, + "rewards/margins": 24.937384492033843, + "rewards/rejected": -17.867394187233664, + "step": 3824 + }, + { + "epoch": 0.9570874515200801, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -73609013.33333333, + "logits/rejected": -47146618.666666664, + "logps/chosen": -382.869384765625, + "logps/rejected": -720.1292317708334, + "loss": 0.0136, + "rewards/chosen": 8.288688659667969, + "rewards/margins": 35.175534566243485, + "rewards/rejected": -26.88684590657552, + "step": 3825 + }, + { + "epoch": 0.957337670461654, + "grad_norm": 33.25, + "kl": 1.5505365133285522, + "learning_rate": 5e-06, + "logits/chosen": -44374230.4, + "logits/rejected": -60039355.428571425, + "logps/chosen": -270.5136474609375, + "logps/rejected": -780.1568080357143, + "loss": 0.0542, + "rewards/chosen": 7.985226440429687, + "rewards/margins": 33.07666582380022, + "rewards/rejected": -25.091439383370535, + "step": 3826 + }, + { + "epoch": 0.9575878894032278, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38941168.0, + "logits/rejected": -46705189.333333336, + "logps/chosen": -336.78476969401044, + "logps/rejected": -744.940673828125, + "loss": 0.0538, + "rewards/chosen": 8.69411849975586, + "rewards/margins": 38.29082107543945, + "rewards/rejected": -29.596702575683594, + "step": 3827 + }, + { + "epoch": 0.9578381083448017, + "grad_norm": 0.5234375, + "kl": 11.557807922363281, + "learning_rate": 5e-06, + "logits/chosen": -52912496.0, + "logits/rejected": -35287282.666666664, + "logps/chosen": -402.6027425130208, + "logps/rejected": -813.48095703125, + "loss": 0.043, + "rewards/chosen": 11.748739878336588, + "rewards/margins": 39.648398081461586, + "rewards/rejected": -27.899658203125, + "step": 3828 + }, + { + "epoch": 0.9580883272863756, + "grad_norm": 8.4375, + "kl": 15.501982688903809, + "learning_rate": 5e-06, + "logits/chosen": -56329719.46666667, + "logits/rejected": -23905813.333333332, + "logps/chosen": -341.6076171875, + "logps/rejected": -762.4013671875, + "loss": 0.0324, + "rewards/chosen": 8.641023763020833, + "rewards/margins": 30.250497097439236, + "rewards/rejected": -21.609473334418404, + "step": 3829 + }, + { + "epoch": 0.9583385462279495, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20670630.666666668, + "logits/rejected": -68291701.33333333, + "logps/chosen": -314.68979899088544, + "logps/rejected": -653.5166015625, + "loss": 0.0022, + "rewards/chosen": 8.386489868164062, + "rewards/margins": 28.009854634602863, + "rewards/rejected": -19.6233647664388, + "step": 3830 + }, + { + "epoch": 0.9585887651695233, + "grad_norm": 12.8125, + "kl": 9.518377304077148, + "learning_rate": 5e-06, + "logits/chosen": -49091000.0, + "logits/rejected": -85204000.0, + "logps/chosen": -320.4339599609375, + "logps/rejected": -593.9530029296875, + "loss": 0.0517, + "rewards/chosen": 7.857075214385986, + "rewards/margins": 26.012341022491455, + "rewards/rejected": -18.15526580810547, + "step": 3831 + }, + { + "epoch": 0.9588389841110972, + "grad_norm": 2.828125, + "kl": 7.422418117523193, + "learning_rate": 5e-06, + "logits/chosen": -16919313.14285714, + "logits/rejected": -58099008.0, + "logps/chosen": -360.50048828125, + "logps/rejected": -800.7333984375, + "loss": 0.026, + "rewards/chosen": 9.73052978515625, + "rewards/margins": 43.454653930664065, + "rewards/rejected": -33.724124145507815, + "step": 3832 + }, + { + "epoch": 0.9590892030526711, + "grad_norm": 2.015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65289045.333333336, + "logits/rejected": -64758741.333333336, + "logps/chosen": -509.7870686848958, + "logps/rejected": -574.7178548177084, + "loss": 0.0015, + "rewards/chosen": 11.338635762532553, + "rewards/margins": 30.155426025390625, + "rewards/rejected": -18.816790262858074, + "step": 3833 + }, + { + "epoch": 0.9593394219942449, + "grad_norm": 2.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28756937.6, + "logits/rejected": -58084909.71428572, + "logps/chosen": -351.7574462890625, + "logps/rejected": -762.8815569196429, + "loss": 0.0092, + "rewards/chosen": 7.608365631103515, + "rewards/margins": 30.5991580418178, + "rewards/rejected": -22.990792410714285, + "step": 3834 + }, + { + "epoch": 0.9595896409358189, + "grad_norm": 8.875, + "kl": 3.9756338596343994, + "learning_rate": 5e-06, + "logits/chosen": -23455826.285714287, + "logits/rejected": -32992726.4, + "logps/chosen": -335.25118582589283, + "logps/rejected": -818.69130859375, + "loss": 0.037, + "rewards/chosen": 8.70469502040318, + "rewards/margins": 29.234492383684433, + "rewards/rejected": -20.52979736328125, + "step": 3835 + }, + { + "epoch": 0.9598398598773927, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37097795.55555555, + "logits/rejected": -49031654.4, + "logps/chosen": -376.55889214409723, + "logps/rejected": -617.6861979166666, + "loss": 0.0154, + "rewards/chosen": 9.517410278320312, + "rewards/margins": 30.474990844726562, + "rewards/rejected": -20.95758056640625, + "step": 3836 + }, + { + "epoch": 0.9600900788189666, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15814643.2, + "logits/rejected": -35996900.571428575, + "logps/chosen": -298.586328125, + "logps/rejected": -706.1287667410714, + "loss": 0.0528, + "rewards/chosen": 6.01234130859375, + "rewards/margins": 25.44663260323661, + "rewards/rejected": -19.434291294642858, + "step": 3837 + }, + { + "epoch": 0.9603402977605404, + "grad_norm": 0.1865234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37997083.428571425, + "logits/rejected": -23202057.6, + "logps/chosen": -374.73154994419644, + "logps/rejected": -478.361669921875, + "loss": 0.0004, + "rewards/chosen": 10.213801792689733, + "rewards/margins": 25.345128086635043, + "rewards/rejected": -15.131326293945312, + "step": 3838 + }, + { + "epoch": 0.9605905167021144, + "grad_norm": 4.28125, + "kl": 4.487115383148193, + "learning_rate": 5e-06, + "logits/chosen": -38783844.571428575, + "logits/rejected": -74535673.6, + "logps/chosen": -363.5967494419643, + "logps/rejected": -838.803515625, + "loss": 0.0202, + "rewards/chosen": 8.72662843976702, + "rewards/margins": 37.61637758527483, + "rewards/rejected": -28.889749145507814, + "step": 3839 + }, + { + "epoch": 0.9608407356436882, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28871888.0, + "logits/rejected": -38986828.0, + "logps/chosen": -309.4356689453125, + "logps/rejected": -595.8763427734375, + "loss": 0.0161, + "rewards/chosen": 7.95991849899292, + "rewards/margins": 27.782046794891357, + "rewards/rejected": -19.822128295898438, + "step": 3840 + }, + { + "epoch": 0.9610909545852621, + "grad_norm": 5.28125, + "kl": 12.016315460205078, + "learning_rate": 5e-06, + "logits/chosen": -54934240.0, + "logits/rejected": -46925210.666666664, + "logps/chosen": -405.2626546223958, + "logps/rejected": -692.5524088541666, + "loss": 0.0225, + "rewards/chosen": 10.991649627685547, + "rewards/margins": 35.87665430704753, + "rewards/rejected": -24.88500467936198, + "step": 3841 + }, + { + "epoch": 0.961341173526836, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34089619.2, + "logits/rejected": -49401321.14285714, + "logps/chosen": -452.6861328125, + "logps/rejected": -719.708984375, + "loss": 0.0087, + "rewards/chosen": 11.733026123046875, + "rewards/margins": 31.139311000279015, + "rewards/rejected": -19.406284877232142, + "step": 3842 + }, + { + "epoch": 0.9615913924684099, + "grad_norm": 10.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11760846.4, + "logits/rejected": -37050210.28571428, + "logps/chosen": -257.75185546875, + "logps/rejected": -594.1285574776786, + "loss": 0.0682, + "rewards/chosen": 6.774432373046875, + "rewards/margins": 24.45775146484375, + "rewards/rejected": -17.683319091796875, + "step": 3843 + }, + { + "epoch": 0.9618416114099837, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18305284.363636363, + "logits/rejected": -57329984.0, + "logps/chosen": -473.75803444602275, + "logps/rejected": -720.1396484375, + "loss": 0.0026, + "rewards/chosen": 11.285125038840555, + "rewards/margins": 34.99987019358815, + "rewards/rejected": -23.714745154747597, + "step": 3844 + }, + { + "epoch": 0.9620918303515577, + "grad_norm": 44.75, + "kl": 12.56527328491211, + "learning_rate": 5e-06, + "logits/chosen": -55231540.36363637, + "logits/rejected": -42694680.615384616, + "logps/chosen": -408.63108132102275, + "logps/rejected": -642.4805438701923, + "loss": 0.0399, + "rewards/chosen": 11.480680985884232, + "rewards/margins": 28.094380091953944, + "rewards/rejected": -16.61369910606971, + "step": 3845 + }, + { + "epoch": 0.9623420492931315, + "grad_norm": 2.296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40305469.09090909, + "logits/rejected": -54132647.384615384, + "logps/chosen": -389.8980823863636, + "logps/rejected": -569.9939152644231, + "loss": 0.0108, + "rewards/chosen": 9.961592934348367, + "rewards/margins": 31.403055524492597, + "rewards/rejected": -21.44146259014423, + "step": 3846 + }, + { + "epoch": 0.9625922682347053, + "grad_norm": 2.734375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -48310518.15384615, + "logits/rejected": -45653006.54545455, + "logps/chosen": -386.89547025240387, + "logps/rejected": -671.8385564630681, + "loss": 0.0083, + "rewards/chosen": 10.33529545710637, + "rewards/margins": 33.40339543269231, + "rewards/rejected": -23.068099975585938, + "step": 3847 + }, + { + "epoch": 0.9628424871762793, + "grad_norm": 6.1875, + "kl": 10.920272827148438, + "learning_rate": 5e-06, + "logits/chosen": -31669776.0, + "logits/rejected": -24730280.0, + "logps/chosen": -358.03662109375, + "logps/rejected": -574.1449584960938, + "loss": 0.0829, + "rewards/chosen": 9.713077545166016, + "rewards/margins": 25.816761016845703, + "rewards/rejected": -16.103683471679688, + "step": 3848 + }, + { + "epoch": 0.9630927061178531, + "grad_norm": 19.25, + "kl": 19.09150505065918, + "learning_rate": 5e-06, + "logits/chosen": -26879507.692307692, + "logits/rejected": -51334557.09090909, + "logps/chosen": -380.67566856971155, + "logps/rejected": -692.3667436079545, + "loss": 0.0324, + "rewards/chosen": 9.16677034818209, + "rewards/margins": 30.5244432996203, + "rewards/rejected": -21.35767295143821, + "step": 3849 + }, + { + "epoch": 0.963342925059427, + "grad_norm": 1.140625, + "kl": 8.34872055053711, + "learning_rate": 5e-06, + "logits/chosen": -37483531.428571425, + "logits/rejected": -33689395.2, + "logps/chosen": -421.77322823660717, + "logps/rejected": -548.781103515625, + "loss": 0.0485, + "rewards/chosen": 10.449754987444196, + "rewards/margins": 27.008191571916853, + "rewards/rejected": -16.558436584472656, + "step": 3850 + }, + { + "epoch": 0.9635931440010008, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28091118.545454547, + "logits/rejected": -34631202.461538464, + "logps/chosen": -322.37690873579544, + "logps/rejected": -469.8505859375, + "loss": 0.0589, + "rewards/chosen": 6.198031338778409, + "rewards/margins": 25.14949195701759, + "rewards/rejected": -18.95146061823918, + "step": 3851 + }, + { + "epoch": 0.9638433629425748, + "grad_norm": 0.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -58119040.0, + "logits/rejected": -43992969.14285714, + "logps/chosen": -559.505126953125, + "logps/rejected": -675.2760881696429, + "loss": 0.0007, + "rewards/chosen": 14.47528076171875, + "rewards/margins": 37.27697099958147, + "rewards/rejected": -22.801690237862722, + "step": 3852 + }, + { + "epoch": 0.9640935818841486, + "grad_norm": 0.08056640625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40678481.45454545, + "logits/rejected": -9905734.153846154, + "logps/chosen": -371.78946200284093, + "logps/rejected": -723.5649038461538, + "loss": 0.0001, + "rewards/chosen": 11.309200633655895, + "rewards/margins": 34.0964086039083, + "rewards/rejected": -22.787207970252403, + "step": 3853 + }, + { + "epoch": 0.9643438008257225, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55221164.307692304, + "logits/rejected": -56022690.90909091, + "logps/chosen": -383.1820537860577, + "logps/rejected": -838.6635298295455, + "loss": 0.066, + "rewards/chosen": 9.849053016075722, + "rewards/margins": 33.36286083301464, + "rewards/rejected": -23.51380781693892, + "step": 3854 + }, + { + "epoch": 0.9645940197672964, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27051552.0, + "logits/rejected": -59936955.733333334, + "logps/chosen": -302.7287326388889, + "logps/rejected": -613.5576171875, + "loss": 0.0737, + "rewards/chosen": 8.684432135687935, + "rewards/margins": 28.053491380479603, + "rewards/rejected": -19.369059244791668, + "step": 3855 + }, + { + "epoch": 0.9648442387088703, + "grad_norm": 1.2109375, + "kl": 4.168708801269531, + "learning_rate": 5e-06, + "logits/chosen": -32282508.8, + "logits/rejected": -37779138.28571428, + "logps/chosen": -463.106494140625, + "logps/rejected": -662.2996651785714, + "loss": 0.0049, + "rewards/chosen": 10.612116241455078, + "rewards/margins": 34.56391154697963, + "rewards/rejected": -23.951795305524552, + "step": 3856 + }, + { + "epoch": 0.9650944576504441, + "grad_norm": 3.328125, + "kl": 11.12745475769043, + "learning_rate": 5e-06, + "logits/chosen": -32653936.0, + "logits/rejected": 32110586.0, + "logps/chosen": -425.4703369140625, + "logps/rejected": -483.9393005371094, + "loss": 0.0174, + "rewards/chosen": 9.707277297973633, + "rewards/margins": 24.246158599853516, + "rewards/rejected": -14.538881301879883, + "step": 3857 + }, + { + "epoch": 0.9653446765920181, + "grad_norm": 4.625, + "kl": 8.11190414428711, + "learning_rate": 5e-06, + "logits/chosen": -31433381.647058822, + "logits/rejected": -70687241.14285715, + "logps/chosen": -364.31198299632354, + "logps/rejected": -484.31856863839283, + "loss": 0.0387, + "rewards/chosen": 9.439571605009192, + "rewards/margins": 23.662127582966782, + "rewards/rejected": -14.222555977957589, + "step": 3858 + }, + { + "epoch": 0.9655948955335919, + "grad_norm": 0.1298828125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56802035.2, + "logits/rejected": -58600685.71428572, + "logps/chosen": -420.225146484375, + "logps/rejected": -973.6275809151786, + "loss": 0.0004, + "rewards/chosen": 11.7372802734375, + "rewards/margins": 44.23220127650669, + "rewards/rejected": -32.494921003069194, + "step": 3859 + }, + { + "epoch": 0.9658451144751657, + "grad_norm": 12.5625, + "kl": 5.907201290130615, + "learning_rate": 5e-06, + "logits/chosen": -23306130.285714287, + "logits/rejected": -53998930.823529415, + "logps/chosen": -324.04282924107144, + "logps/rejected": -726.5144186580883, + "loss": 0.0515, + "rewards/chosen": 8.301190512520927, + "rewards/margins": 33.49217131157883, + "rewards/rejected": -25.190980799057904, + "step": 3860 + }, + { + "epoch": 0.9660953334167397, + "grad_norm": 6.15625, + "kl": 2.182804822921753, + "learning_rate": 5e-06, + "logits/chosen": -32028797.333333332, + "logits/rejected": -8360502.666666667, + "logps/chosen": -348.4102783203125, + "logps/rejected": -620.0096842447916, + "loss": 0.0432, + "rewards/chosen": 8.913494110107422, + "rewards/margins": 33.54720687866211, + "rewards/rejected": -24.633712768554688, + "step": 3861 + }, + { + "epoch": 0.9663455523583135, + "grad_norm": 2.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30353171.2, + "logits/rejected": -52253929.14285714, + "logps/chosen": -399.001708984375, + "logps/rejected": -662.9752371651786, + "loss": 0.0075, + "rewards/chosen": 9.050270080566406, + "rewards/margins": 27.36557115827288, + "rewards/rejected": -18.315301077706472, + "step": 3862 + }, + { + "epoch": 0.9665957712998874, + "grad_norm": 6.28125, + "kl": 0.22182178497314453, + "learning_rate": 5e-06, + "logits/chosen": -57446326.85714286, + "logits/rejected": -40376044.8, + "logps/chosen": -392.30831473214283, + "logps/rejected": -765.30146484375, + "loss": 0.0139, + "rewards/chosen": 9.939987182617188, + "rewards/margins": 34.026486206054685, + "rewards/rejected": -24.0864990234375, + "step": 3863 + }, + { + "epoch": 0.9668459902414612, + "grad_norm": 12.3125, + "kl": 23.338483810424805, + "learning_rate": 5e-06, + "logits/chosen": -40988644.571428575, + "logits/rejected": -47086624.0, + "logps/chosen": -399.99672154017856, + "logps/rejected": -596.09453125, + "loss": 0.089, + "rewards/chosen": 9.718152727399554, + "rewards/margins": 30.955344499860495, + "rewards/rejected": -21.23719177246094, + "step": 3864 + }, + { + "epoch": 0.9670962091830352, + "grad_norm": 13.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22877720.615384616, + "logits/rejected": -46798528.0, + "logps/chosen": -319.18299278846155, + "logps/rejected": -564.1554509943181, + "loss": 0.038, + "rewards/chosen": 8.825396024263823, + "rewards/margins": 29.192720853365387, + "rewards/rejected": -20.367324829101562, + "step": 3865 + }, + { + "epoch": 0.967346428124609, + "grad_norm": 1.6015625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -33983051.63636363, + "logits/rejected": -50839020.307692304, + "logps/chosen": -432.88858309659093, + "logps/rejected": -716.7330979567307, + "loss": 0.0159, + "rewards/chosen": 11.987903941761363, + "rewards/margins": 35.22811249419526, + "rewards/rejected": -23.240208552433895, + "step": 3866 + }, + { + "epoch": 0.9675966470661829, + "grad_norm": 3.6875, + "kl": 0.5152873992919922, + "learning_rate": 5e-06, + "logits/chosen": -37303378.28571428, + "logits/rejected": -38571161.6, + "logps/chosen": -291.78201729910717, + "logps/rejected": -690.518896484375, + "loss": 0.024, + "rewards/chosen": 8.523482186453682, + "rewards/margins": 34.44908643450056, + "rewards/rejected": -25.925604248046874, + "step": 3867 + }, + { + "epoch": 0.9678468660077568, + "grad_norm": 1.078125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35910432.0, + "logits/rejected": -69161115.42857143, + "logps/chosen": -371.057666015625, + "logps/rejected": -814.048828125, + "loss": 0.0203, + "rewards/chosen": 9.005654907226562, + "rewards/margins": 35.64681178501674, + "rewards/rejected": -26.641156877790177, + "step": 3868 + }, + { + "epoch": 0.9680970849493307, + "grad_norm": 6.84375, + "kl": 1.2629725933074951, + "learning_rate": 5e-06, + "logits/chosen": -46466568.0, + "logits/rejected": -53591456.0, + "logps/chosen": -415.37957763671875, + "logps/rejected": -570.9844360351562, + "loss": 0.0328, + "rewards/chosen": 9.832305908203125, + "rewards/margins": 26.621980667114258, + "rewards/rejected": -16.789674758911133, + "step": 3869 + }, + { + "epoch": 0.9683473038909045, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47434880.0, + "logits/rejected": -37783603.692307696, + "logps/chosen": -287.724609375, + "logps/rejected": -608.3063777043269, + "loss": 0.0359, + "rewards/chosen": 6.878135681152344, + "rewards/margins": 32.91757612961989, + "rewards/rejected": -26.03944044846755, + "step": 3870 + }, + { + "epoch": 0.9685975228324785, + "grad_norm": 0.333984375, + "kl": 2.0322751998901367, + "learning_rate": 5e-06, + "logits/chosen": -46891330.90909091, + "logits/rejected": -53090422.15384615, + "logps/chosen": -413.53178267045456, + "logps/rejected": -651.9038461538462, + "loss": 0.0409, + "rewards/chosen": 10.19251181862571, + "rewards/margins": 32.75693207854158, + "rewards/rejected": -22.564420259915867, + "step": 3871 + }, + { + "epoch": 0.9688477417740523, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28443712.0, + "logits/rejected": -38963366.4, + "logps/chosen": -245.46820746527777, + "logps/rejected": -587.5257161458334, + "loss": 0.0472, + "rewards/chosen": 6.622306399875217, + "rewards/margins": 26.38343921237522, + "rewards/rejected": -19.7611328125, + "step": 3872 + }, + { + "epoch": 0.9690979607156261, + "grad_norm": 18.0, + "kl": 9.448108673095703, + "learning_rate": 5e-06, + "logits/chosen": -25862889.411764707, + "logits/rejected": -42743067.428571425, + "logps/chosen": -322.56970932904414, + "logps/rejected": -724.9907924107143, + "loss": 0.0868, + "rewards/chosen": 8.542974135454964, + "rewards/margins": 33.611475199210545, + "rewards/rejected": -25.06850106375558, + "step": 3873 + }, + { + "epoch": 0.9693481796572001, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21654961.230769232, + "logits/rejected": -52294481.45454545, + "logps/chosen": -327.39415564903845, + "logps/rejected": -871.4437144886364, + "loss": 0.0675, + "rewards/chosen": 8.409715505746695, + "rewards/margins": 34.32598914299812, + "rewards/rejected": -25.91627363725142, + "step": 3874 + }, + { + "epoch": 0.969598398598774, + "grad_norm": 2.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40782469.81818182, + "logits/rejected": -33015707.076923076, + "logps/chosen": -392.9646661931818, + "logps/rejected": -594.09228515625, + "loss": 0.0186, + "rewards/chosen": 9.928966175426137, + "rewards/margins": 28.39434280929032, + "rewards/rejected": -18.46537663386418, + "step": 3875 + }, + { + "epoch": 0.9698486175403478, + "grad_norm": 2.15625, + "kl": 7.827136516571045, + "learning_rate": 5e-06, + "logits/chosen": -39313274.666666664, + "logits/rejected": -41018488.0, + "logps/chosen": -305.38478597005206, + "logps/rejected": -684.7875162760416, + "loss": 0.0141, + "rewards/chosen": 8.029731750488281, + "rewards/margins": 26.312808990478516, + "rewards/rejected": -18.283077239990234, + "step": 3876 + }, + { + "epoch": 0.9700988364819216, + "grad_norm": 21.375, + "kl": 3.491133451461792, + "learning_rate": 5e-06, + "logits/chosen": -29397490.285714287, + "logits/rejected": -62098118.4, + "logps/chosen": -413.77476283482144, + "logps/rejected": -873.984765625, + "loss": 0.0394, + "rewards/chosen": 10.881811959402901, + "rewards/margins": 36.18558545793806, + "rewards/rejected": -25.303773498535158, + "step": 3877 + }, + { + "epoch": 0.9703490554234956, + "grad_norm": 11.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34417018.18181818, + "logits/rejected": -27988539.076923076, + "logps/chosen": -330.96395596590907, + "logps/rejected": -464.8365009014423, + "loss": 0.0591, + "rewards/chosen": 9.189322731711648, + "rewards/margins": 23.313269875266336, + "rewards/rejected": -14.123947143554688, + "step": 3878 + }, + { + "epoch": 0.9705992743650694, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30721472.0, + "logits/rejected": -62366771.2, + "logps/chosen": -406.15001085069446, + "logps/rejected": -579.9911458333333, + "loss": 0.0339, + "rewards/chosen": 11.351082695855034, + "rewards/margins": 28.347450086805555, + "rewards/rejected": -16.99636739095052, + "step": 3879 + }, + { + "epoch": 0.9708494933066433, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36149027.2, + "logits/rejected": -75715451.42857143, + "logps/chosen": -391.9965576171875, + "logps/rejected": -921.5121372767857, + "loss": 0.055, + "rewards/chosen": 8.701179504394531, + "rewards/margins": 40.30095018659319, + "rewards/rejected": -31.59977068219866, + "step": 3880 + }, + { + "epoch": 0.9710997122482172, + "grad_norm": 1.859375, + "kl": 2.748319149017334, + "learning_rate": 5e-06, + "logits/chosen": -25719410.285714287, + "logits/rejected": -53659865.6, + "logps/chosen": -418.26572963169644, + "logps/rejected": -591.8138671875, + "loss": 0.0212, + "rewards/chosen": 10.856331961495536, + "rewards/margins": 30.9186519077846, + "rewards/rejected": -20.062319946289062, + "step": 3881 + }, + { + "epoch": 0.9713499311897911, + "grad_norm": 10.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65178725.333333336, + "logits/rejected": -22346822.666666668, + "logps/chosen": -329.09877522786456, + "logps/rejected": -509.6519368489583, + "loss": 0.0251, + "rewards/chosen": 7.801107406616211, + "rewards/margins": 24.29829216003418, + "rewards/rejected": -16.49718475341797, + "step": 3882 + }, + { + "epoch": 0.9716001501313649, + "grad_norm": 12.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30783202.46153846, + "logits/rejected": -22821837.09090909, + "logps/chosen": -384.0134840745192, + "logps/rejected": -869.0830965909091, + "loss": 0.0201, + "rewards/chosen": 9.980858435997597, + "rewards/margins": 30.422841745656687, + "rewards/rejected": -20.44198330965909, + "step": 3883 + }, + { + "epoch": 0.9718503690729389, + "grad_norm": 9.75, + "kl": 21.838794708251953, + "learning_rate": 5e-06, + "logits/chosen": -32046446.933333334, + "logits/rejected": -37591107.55555555, + "logps/chosen": -435.53118489583335, + "logps/rejected": -476.92670355902777, + "loss": 0.101, + "rewards/chosen": 9.048858642578125, + "rewards/margins": 24.97481960720486, + "rewards/rejected": -15.925960964626736, + "step": 3884 + }, + { + "epoch": 0.9721005880145127, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30191610.666666668, + "logits/rejected": -39990896.0, + "logps/chosen": -335.1617024739583, + "logps/rejected": -777.8806966145834, + "loss": 0.036, + "rewards/chosen": 7.712012608846028, + "rewards/margins": 29.219130833943684, + "rewards/rejected": -21.507118225097656, + "step": 3885 + }, + { + "epoch": 0.9723508069560866, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56472936.0, + "logits/rejected": -46947752.0, + "logps/chosen": -413.96075439453125, + "logps/rejected": -467.5013732910156, + "loss": 0.0204, + "rewards/chosen": 10.486227035522461, + "rewards/margins": 28.210168838500977, + "rewards/rejected": -17.723941802978516, + "step": 3886 + }, + { + "epoch": 0.9726010258976604, + "grad_norm": 12.6875, + "kl": 1.9150289297103882, + "learning_rate": 5e-06, + "logits/chosen": -35636491.63636363, + "logits/rejected": -31300672.0, + "logps/chosen": -275.99447354403407, + "logps/rejected": -558.6902043269231, + "loss": 0.0894, + "rewards/chosen": 7.320476878773082, + "rewards/margins": 24.61486821741491, + "rewards/rejected": -17.294391338641827, + "step": 3887 + }, + { + "epoch": 0.9728512448392344, + "grad_norm": 1.4296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37074065.23076923, + "logits/rejected": -14339342.545454545, + "logps/chosen": -360.14340444711536, + "logps/rejected": -624.1553178267045, + "loss": 0.0033, + "rewards/chosen": 9.248656052809496, + "rewards/margins": 27.727687702312338, + "rewards/rejected": -18.47903164950284, + "step": 3888 + }, + { + "epoch": 0.9731014637808082, + "grad_norm": 1.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55604992.0, + "logits/rejected": -60936850.28571428, + "logps/chosen": -310.69658203125, + "logps/rejected": -687.2011021205357, + "loss": 0.0265, + "rewards/chosen": 9.678225708007812, + "rewards/margins": 32.72256774902344, + "rewards/rejected": -23.044342041015625, + "step": 3889 + }, + { + "epoch": 0.973351682722382, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23144411.636363637, + "logits/rejected": -60289777.23076923, + "logps/chosen": -309.78635475852275, + "logps/rejected": -668.6854717548077, + "loss": 0.0509, + "rewards/chosen": 7.271749323064631, + "rewards/margins": 31.78096328415237, + "rewards/rejected": -24.50921396108774, + "step": 3890 + }, + { + "epoch": 0.973601901663956, + "grad_norm": 13.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42042023.384615384, + "logits/rejected": -47910059.63636363, + "logps/chosen": -419.7848557692308, + "logps/rejected": -693.4315074573864, + "loss": 0.0206, + "rewards/chosen": 10.259602473332333, + "rewards/margins": 33.03318146392182, + "rewards/rejected": -22.77357899058949, + "step": 3891 + }, + { + "epoch": 0.9738521206055298, + "grad_norm": 2.5, + "kl": 2.666719436645508, + "learning_rate": 5e-06, + "logits/chosen": -40335667.2, + "logits/rejected": -25222595.555555556, + "logps/chosen": -388.22294921875, + "logps/rejected": -792.2281901041666, + "loss": 0.03, + "rewards/chosen": 10.124489339192708, + "rewards/margins": 34.141579182942706, + "rewards/rejected": -24.01708984375, + "step": 3892 + }, + { + "epoch": 0.9741023395471037, + "grad_norm": 6.25, + "kl": 2.3579535484313965, + "learning_rate": 5e-06, + "logits/chosen": -9998268.666666666, + "logits/rejected": -43604661.333333336, + "logps/chosen": -529.3503011067709, + "logps/rejected": -491.5458984375, + "loss": 0.0069, + "rewards/chosen": 12.51663589477539, + "rewards/margins": 29.207677205403645, + "rewards/rejected": -16.691041310628254, + "step": 3893 + }, + { + "epoch": 0.9743525584886776, + "grad_norm": 11.4375, + "kl": 1.738030195236206, + "learning_rate": 5e-06, + "logits/chosen": -70412578.46153846, + "logits/rejected": -11246549.818181818, + "logps/chosen": -367.69936899038464, + "logps/rejected": -460.8216441761364, + "loss": 0.046, + "rewards/chosen": 8.68379387488732, + "rewards/margins": 21.446622355000954, + "rewards/rejected": -12.762828480113637, + "step": 3894 + }, + { + "epoch": 0.9746027774302515, + "grad_norm": 18.625, + "kl": 2.904693841934204, + "learning_rate": 5e-06, + "logits/chosen": -54031064.0, + "logits/rejected": -35670252.0, + "logps/chosen": -474.55517578125, + "logps/rejected": -795.219970703125, + "loss": 0.0179, + "rewards/chosen": 10.20977783203125, + "rewards/margins": 27.902626037597656, + "rewards/rejected": -17.692848205566406, + "step": 3895 + }, + { + "epoch": 0.9748529963718253, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37931451.428571425, + "logits/rejected": -41357702.4, + "logps/chosen": -309.04091099330356, + "logps/rejected": -837.5548828125, + "loss": 0.0072, + "rewards/chosen": 8.340540749686104, + "rewards/margins": 37.24057126726423, + "rewards/rejected": -28.900030517578124, + "step": 3896 + }, + { + "epoch": 0.9751032153133993, + "grad_norm": 2.0, + "kl": 5.703559875488281, + "learning_rate": 5e-06, + "logits/chosen": -23395910.4, + "logits/rejected": -52158144.0, + "logps/chosen": -377.62623697916666, + "logps/rejected": -832.6595594618055, + "loss": 0.0294, + "rewards/chosen": 9.456494140625, + "rewards/margins": 33.989771864149304, + "rewards/rejected": -24.533277723524307, + "step": 3897 + }, + { + "epoch": 0.9753534342549731, + "grad_norm": 12.8125, + "kl": 4.150864601135254, + "learning_rate": 5e-06, + "logits/chosen": -40384648.53333333, + "logits/rejected": -59752149.333333336, + "logps/chosen": -387.2018229166667, + "logps/rejected": -725.9015299479166, + "loss": 0.0694, + "rewards/chosen": 7.4949900309244795, + "rewards/margins": 32.71932813856337, + "rewards/rejected": -25.22433810763889, + "step": 3898 + }, + { + "epoch": 0.975603653196547, + "grad_norm": 1.3828125, + "kl": 13.705760955810547, + "learning_rate": 5e-06, + "logits/chosen": -62480768.0, + "logits/rejected": -27682688.0, + "logps/chosen": -411.98057338169644, + "logps/rejected": -552.70419921875, + "loss": 0.0303, + "rewards/chosen": 9.918835231236049, + "rewards/margins": 30.28584267752511, + "rewards/rejected": -20.367007446289062, + "step": 3899 + }, + { + "epoch": 0.9758538721381208, + "grad_norm": 2.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -44139254.85714286, + "logits/rejected": -78279788.8, + "logps/chosen": -419.72544642857144, + "logps/rejected": -546.863525390625, + "loss": 0.0423, + "rewards/chosen": 11.096158708844866, + "rewards/margins": 30.186109270368306, + "rewards/rejected": -19.089950561523438, + "step": 3900 + }, + { + "epoch": 0.9761040910796948, + "grad_norm": 0.94140625, + "kl": 5.185084819793701, + "learning_rate": 5e-06, + "logits/chosen": -41667108.0, + "logits/rejected": -48074820.0, + "logps/chosen": -320.8394775390625, + "logps/rejected": -763.2186889648438, + "loss": 0.0221, + "rewards/chosen": 9.117944717407227, + "rewards/margins": 27.3349666595459, + "rewards/rejected": -18.217021942138672, + "step": 3901 + }, + { + "epoch": 0.9763543100212686, + "grad_norm": 12.75, + "kl": 0.5974782705307007, + "learning_rate": 5e-06, + "logits/chosen": -40336996.92307692, + "logits/rejected": -26020381.09090909, + "logps/chosen": -335.4240534855769, + "logps/rejected": -677.9532137784091, + "loss": 0.0297, + "rewards/chosen": 8.151806171123798, + "rewards/margins": 30.634008874426357, + "rewards/rejected": -22.48220270330256, + "step": 3902 + }, + { + "epoch": 0.9766045289628424, + "grad_norm": 12.9375, + "kl": 24.269527435302734, + "learning_rate": 5e-06, + "logits/chosen": -43479261.86666667, + "logits/rejected": 18193614.222222224, + "logps/chosen": -381.3014322916667, + "logps/rejected": -502.84190538194446, + "loss": 0.0774, + "rewards/chosen": 9.11646728515625, + "rewards/margins": 23.820362006293404, + "rewards/rejected": -14.703894721137154, + "step": 3903 + }, + { + "epoch": 0.9768547479044164, + "grad_norm": 0.73046875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45116666.18181818, + "logits/rejected": -45631261.538461536, + "logps/chosen": -353.5690252130682, + "logps/rejected": -862.0533353365385, + "loss": 0.0134, + "rewards/chosen": 9.191687150435014, + "rewards/margins": 31.392621927328044, + "rewards/rejected": -22.20093477689303, + "step": 3904 + }, + { + "epoch": 0.9771049668459902, + "grad_norm": 1.3359375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -52801926.4, + "logits/rejected": -35263286.85714286, + "logps/chosen": -283.116552734375, + "logps/rejected": -716.5625697544643, + "loss": 0.044, + "rewards/chosen": 7.934872436523437, + "rewards/margins": 32.24507053920201, + "rewards/rejected": -24.310198102678573, + "step": 3905 + }, + { + "epoch": 0.9773551857875641, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -57245705.84615385, + "logits/rejected": -44357844.36363637, + "logps/chosen": -545.3950946514423, + "logps/rejected": -695.5963245738636, + "loss": 0.0078, + "rewards/chosen": 12.219673743614784, + "rewards/margins": 35.781211052741206, + "rewards/rejected": -23.56153730912642, + "step": 3906 + }, + { + "epoch": 0.977605404729138, + "grad_norm": 24.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27104283.076923076, + "logits/rejected": -55196392.72727273, + "logps/chosen": -344.4867412860577, + "logps/rejected": -601.3670987215909, + "loss": 0.0694, + "rewards/chosen": 8.338092510516827, + "rewards/margins": 28.400448245602053, + "rewards/rejected": -20.062355735085227, + "step": 3907 + }, + { + "epoch": 0.9778556236707119, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -39075859.2, + "logits/rejected": -49144667.428571425, + "logps/chosen": -371.681787109375, + "logps/rejected": -664.4563337053571, + "loss": 0.0437, + "rewards/chosen": 8.994275665283203, + "rewards/margins": 27.77451858520508, + "rewards/rejected": -18.780242919921875, + "step": 3908 + }, + { + "epoch": 0.9781058426122857, + "grad_norm": 1.1328125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32540375.272727273, + "logits/rejected": -63902168.615384616, + "logps/chosen": -351.01700106534093, + "logps/rejected": -977.1657902644231, + "loss": 0.0083, + "rewards/chosen": 9.339064858176492, + "rewards/margins": 40.9016165033087, + "rewards/rejected": -31.56255164513221, + "step": 3909 + }, + { + "epoch": 0.9783560615538597, + "grad_norm": 1.3515625, + "kl": 3.9817867279052734, + "learning_rate": 5e-06, + "logits/chosen": -30538598.4, + "logits/rejected": -56442843.428571425, + "logps/chosen": -390.345556640625, + "logps/rejected": -614.03173828125, + "loss": 0.0154, + "rewards/chosen": 9.918637084960938, + "rewards/margins": 26.445374843052456, + "rewards/rejected": -16.526737758091517, + "step": 3910 + }, + { + "epoch": 0.9786062804954335, + "grad_norm": 14.0, + "kl": 14.499471664428711, + "learning_rate": 5e-06, + "logits/chosen": -49904075.63636363, + "logits/rejected": 8439950.76923077, + "logps/chosen": -375.97469815340907, + "logps/rejected": -803.7215294471154, + "loss": 0.0565, + "rewards/chosen": 11.516175703568893, + "rewards/margins": 39.51285163505928, + "rewards/rejected": -27.996675931490383, + "step": 3911 + }, + { + "epoch": 0.9788564994370074, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42368813.333333336, + "logits/rejected": -66052773.333333336, + "logps/chosen": -330.16859944661456, + "logps/rejected": -848.3619791666666, + "loss": 0.0058, + "rewards/chosen": 8.415872573852539, + "rewards/margins": 37.10623613993327, + "rewards/rejected": -28.69036356608073, + "step": 3912 + }, + { + "epoch": 0.9791067183785812, + "grad_norm": 5.71875, + "kl": 4.7139410972595215, + "learning_rate": 5e-06, + "logits/chosen": -28923680.0, + "logits/rejected": -36269680.0, + "logps/chosen": -341.94698660714283, + "logps/rejected": -634.24462890625, + "loss": 0.0419, + "rewards/chosen": 8.72720227922712, + "rewards/margins": 33.89705396379743, + "rewards/rejected": -25.16985168457031, + "step": 3913 + }, + { + "epoch": 0.9793569373201552, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -40997098.666666664, + "logits/rejected": -52620080.0, + "logps/chosen": -400.3782552083333, + "logps/rejected": -768.3256022135416, + "loss": 0.0252, + "rewards/chosen": 10.45553716023763, + "rewards/margins": 33.093939463297524, + "rewards/rejected": -22.638402303059895, + "step": 3914 + }, + { + "epoch": 0.979607156261729, + "grad_norm": 10.5, + "kl": 20.873537063598633, + "learning_rate": 5e-06, + "logits/chosen": -50235899.07692308, + "logits/rejected": 24579252.363636363, + "logps/chosen": -438.91744290865387, + "logps/rejected": -625.6250887784091, + "loss": 0.1335, + "rewards/chosen": 9.434246356670673, + "rewards/margins": 26.384742683463998, + "rewards/rejected": -16.950496326793324, + "step": 3915 + }, + { + "epoch": 0.9798573752033028, + "grad_norm": 8.25, + "kl": 1.583831787109375, + "learning_rate": 5e-06, + "logits/chosen": -52324489.14285714, + "logits/rejected": -56690886.4, + "logps/chosen": -384.96470424107144, + "logps/rejected": -531.48134765625, + "loss": 0.0524, + "rewards/chosen": 9.045016697474889, + "rewards/margins": 28.605673435756138, + "rewards/rejected": -19.56065673828125, + "step": 3916 + }, + { + "epoch": 0.9801075941448768, + "grad_norm": 1.609375, + "kl": 10.239280700683594, + "learning_rate": 5e-06, + "logits/chosen": -45535824.0, + "logits/rejected": -44611008.0, + "logps/chosen": -447.91259765625, + "logps/rejected": -914.749755859375, + "loss": 0.0034, + "rewards/chosen": 12.477359771728516, + "rewards/margins": 36.78392219543457, + "rewards/rejected": -24.306562423706055, + "step": 3917 + }, + { + "epoch": 0.9803578130864506, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1225360.0, + "logits/rejected": -31190221.714285713, + "logps/chosen": -282.1539306640625, + "logps/rejected": -556.0284598214286, + "loss": 0.0114, + "rewards/chosen": 7.905995178222656, + "rewards/margins": 25.902686200823105, + "rewards/rejected": -17.996691022600448, + "step": 3918 + }, + { + "epoch": 0.9806080320280245, + "grad_norm": 14.8125, + "kl": 20.088218688964844, + "learning_rate": 5e-06, + "logits/chosen": -17907869.714285713, + "logits/rejected": -44431916.8, + "logps/chosen": -367.9154575892857, + "logps/rejected": -637.282177734375, + "loss": 0.073, + "rewards/chosen": 7.657320840018136, + "rewards/margins": 28.288558632986888, + "rewards/rejected": -20.63123779296875, + "step": 3919 + }, + { + "epoch": 0.9808582509695984, + "grad_norm": 2.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32359758.222222224, + "logits/rejected": -40579891.2, + "logps/chosen": -370.8440212673611, + "logps/rejected": -567.5607421875, + "loss": 0.0211, + "rewards/chosen": 11.505711873372396, + "rewards/margins": 29.10923563639323, + "rewards/rejected": -17.603523763020835, + "step": 3920 + }, + { + "epoch": 0.9811084699111723, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16098978.909090908, + "logits/rejected": -35136462.76923077, + "logps/chosen": -456.0098987926136, + "logps/rejected": -618.2701322115385, + "loss": 0.0755, + "rewards/chosen": 12.162471424449574, + "rewards/margins": 28.333258355414117, + "rewards/rejected": -16.170786930964542, + "step": 3921 + }, + { + "epoch": 0.9813586888527461, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19941942.85714286, + "logits/rejected": -51920018.823529415, + "logps/chosen": -263.69559151785717, + "logps/rejected": -710.2635569852941, + "loss": 0.0139, + "rewards/chosen": 8.911597115652901, + "rewards/margins": 24.146363458713562, + "rewards/rejected": -15.234766343060661, + "step": 3922 + }, + { + "epoch": 0.9816089077943201, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29590700.307692308, + "logits/rejected": -24099643.636363637, + "logps/chosen": -379.6750300480769, + "logps/rejected": -720.9930752840909, + "loss": 0.0308, + "rewards/chosen": 10.717064490685097, + "rewards/margins": 30.015029800521745, + "rewards/rejected": -19.29796530983665, + "step": 3923 + }, + { + "epoch": 0.9818591267358939, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -46414090.666666664, + "logits/rejected": -19778786.666666668, + "logps/chosen": -463.7242024739583, + "logps/rejected": -541.4361979166666, + "loss": 0.0068, + "rewards/chosen": 11.843819936116537, + "rewards/margins": 27.001853942871094, + "rewards/rejected": -15.158034006754557, + "step": 3924 + }, + { + "epoch": 0.9821093456774678, + "grad_norm": 1.78125, + "kl": 25.73027801513672, + "learning_rate": 5e-06, + "logits/chosen": -41849339.428571425, + "logits/rejected": 490687.6, + "logps/chosen": -412.93603515625, + "logps/rejected": -559.054052734375, + "loss": 0.0052, + "rewards/chosen": 12.351627894810267, + "rewards/margins": 30.023913356236047, + "rewards/rejected": -17.67228546142578, + "step": 3925 + }, + { + "epoch": 0.9823595646190416, + "grad_norm": 35.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -38446700.0, + "logits/rejected": -36006440.0, + "logps/chosen": -453.05517578125, + "logps/rejected": -479.0089416503906, + "loss": 0.1024, + "rewards/chosen": 10.47886848449707, + "rewards/margins": 21.55883026123047, + "rewards/rejected": -11.079961776733398, + "step": 3926 + }, + { + "epoch": 0.9826097835606156, + "grad_norm": 2.609375, + "kl": 4.016099452972412, + "learning_rate": 5e-06, + "logits/chosen": -35379386.666666664, + "logits/rejected": -29712288.0, + "logps/chosen": -424.4717610677083, + "logps/rejected": -672.4393717447916, + "loss": 0.0037, + "rewards/chosen": 11.027623494466146, + "rewards/margins": 29.700346628824867, + "rewards/rejected": -18.672723134358723, + "step": 3927 + }, + { + "epoch": 0.9828600025021894, + "grad_norm": 3.59375, + "kl": 5.973641395568848, + "learning_rate": 5e-06, + "logits/chosen": -34025917.333333336, + "logits/rejected": -6558381.333333333, + "logps/chosen": -381.3688557942708, + "logps/rejected": -845.018798828125, + "loss": 0.0515, + "rewards/chosen": 9.177377065022787, + "rewards/margins": 33.514400482177734, + "rewards/rejected": -24.33702341715495, + "step": 3928 + }, + { + "epoch": 0.9831102214437633, + "grad_norm": 16.625, + "kl": 19.98219108581543, + "learning_rate": 5e-06, + "logits/chosen": -38877200.0, + "logits/rejected": -34899512.0, + "logps/chosen": -369.6253967285156, + "logps/rejected": -837.4268798828125, + "loss": 0.0471, + "rewards/chosen": 8.4776611328125, + "rewards/margins": 35.82998085021973, + "rewards/rejected": -27.352319717407227, + "step": 3929 + }, + { + "epoch": 0.9833604403853372, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -65620986.666666664, + "logits/rejected": -42909477.333333336, + "logps/chosen": -499.5901692708333, + "logps/rejected": -730.4905598958334, + "loss": 0.0364, + "rewards/chosen": 13.0157839457194, + "rewards/margins": 36.617977142333984, + "rewards/rejected": -23.602193196614582, + "step": 3930 + }, + { + "epoch": 0.983610659326911, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -56220197.333333336, + "logits/rejected": -53672437.333333336, + "logps/chosen": -245.3282674153646, + "logps/rejected": -603.2243923611111, + "loss": 0.0502, + "rewards/chosen": 7.396557490030925, + "rewards/margins": 26.250307083129883, + "rewards/rejected": -18.853749593098957, + "step": 3931 + }, + { + "epoch": 0.9838608782684849, + "grad_norm": 12.375, + "kl": 8.447867393493652, + "learning_rate": 5e-06, + "logits/chosen": -23518124.0, + "logits/rejected": -29392884.0, + "logps/chosen": -415.67431640625, + "logps/rejected": -655.8687744140625, + "loss": 0.0207, + "rewards/chosen": 10.663930892944336, + "rewards/margins": 27.354080200195312, + "rewards/rejected": -16.690149307250977, + "step": 3932 + }, + { + "epoch": 0.9841110972100588, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37172004.92307692, + "logits/rejected": -31579182.545454547, + "logps/chosen": -215.589599609375, + "logps/rejected": -777.7120028409091, + "loss": 0.0334, + "rewards/chosen": 7.376846900353065, + "rewards/margins": 34.59776364839994, + "rewards/rejected": -27.220916748046875, + "step": 3933 + }, + { + "epoch": 0.9843613161516327, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16495175.272727273, + "logits/rejected": -34304839.384615384, + "logps/chosen": -328.09086470170456, + "logps/rejected": -702.5994591346154, + "loss": 0.0614, + "rewards/chosen": 7.887223677201704, + "rewards/margins": 29.041191901360357, + "rewards/rejected": -21.153968224158653, + "step": 3934 + }, + { + "epoch": 0.9846115350932065, + "grad_norm": 6.28125, + "kl": 2.333127975463867, + "learning_rate": 5e-06, + "logits/chosen": -49624749.176470585, + "logits/rejected": -49470162.28571428, + "logps/chosen": -320.70726102941177, + "logps/rejected": -690.4435686383929, + "loss": 0.0567, + "rewards/chosen": 8.330571791704964, + "rewards/margins": 28.948365283613448, + "rewards/rejected": -20.617793491908483, + "step": 3935 + }, + { + "epoch": 0.9848617540347804, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27207965.09090909, + "logits/rejected": -72611357.53846154, + "logps/chosen": -364.87564364346593, + "logps/rejected": -817.0207331730769, + "loss": 0.0128, + "rewards/chosen": 8.27528936212713, + "rewards/margins": 33.014716195059826, + "rewards/rejected": -24.739426832932693, + "step": 3936 + }, + { + "epoch": 0.9851119729763543, + "grad_norm": 0.984375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22068662.4, + "logits/rejected": -29371730.285714287, + "logps/chosen": -471.294189453125, + "logps/rejected": -708.8239397321429, + "loss": 0.0163, + "rewards/chosen": 10.50101547241211, + "rewards/margins": 36.74225692749023, + "rewards/rejected": -26.241241455078125, + "step": 3937 + }, + { + "epoch": 0.9853621919179282, + "grad_norm": 14.375, + "kl": 0.10368029773235321, + "learning_rate": 5e-06, + "logits/chosen": -48363296.0, + "logits/rejected": -45868937.6, + "logps/chosen": -331.83921595982144, + "logps/rejected": -943.74765625, + "loss": 0.0734, + "rewards/chosen": 7.449642726353237, + "rewards/margins": 43.6378415788923, + "rewards/rejected": -36.18819885253906, + "step": 3938 + }, + { + "epoch": 0.985612410859502, + "grad_norm": 9.875, + "kl": 3.3190131187438965, + "learning_rate": 5e-06, + "logits/chosen": -46969557.333333336, + "logits/rejected": -36867586.666666664, + "logps/chosen": -377.8224690755208, + "logps/rejected": -687.1456705729166, + "loss": 0.0845, + "rewards/chosen": 11.234382629394531, + "rewards/margins": 32.21994908650716, + "rewards/rejected": -20.98556645711263, + "step": 3939 + }, + { + "epoch": 0.985862629801076, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -45813632.0, + "logits/rejected": -35051516.0, + "logps/chosen": -415.10235595703125, + "logps/rejected": -678.3410034179688, + "loss": 0.0195, + "rewards/chosen": 9.014588356018066, + "rewards/margins": 30.950024604797363, + "rewards/rejected": -21.935436248779297, + "step": 3940 + }, + { + "epoch": 0.9861128487426498, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31144051.2, + "logits/rejected": -62658733.71428572, + "logps/chosen": -285.29208984375, + "logps/rejected": -733.9418247767857, + "loss": 0.1087, + "rewards/chosen": 4.349193572998047, + "rewards/margins": 28.8267457144601, + "rewards/rejected": -24.477552141462052, + "step": 3941 + }, + { + "epoch": 0.9863630676842237, + "grad_norm": 10.4375, + "kl": 9.000922203063965, + "learning_rate": 5e-06, + "logits/chosen": -58100829.86666667, + "logits/rejected": -36515790.222222224, + "logps/chosen": -447.3507486979167, + "logps/rejected": -619.4696723090278, + "loss": 0.0661, + "rewards/chosen": 9.913232421875, + "rewards/margins": 29.73987053765191, + "rewards/rejected": -19.82663811577691, + "step": 3942 + }, + { + "epoch": 0.9866132866257976, + "grad_norm": 10.0, + "kl": 5.4238386154174805, + "learning_rate": 5e-06, + "logits/chosen": -29101499.733333334, + "logits/rejected": -75156103.1111111, + "logps/chosen": -363.73313802083334, + "logps/rejected": -890.9756944444445, + "loss": 0.0891, + "rewards/chosen": 7.556283569335937, + "rewards/margins": 39.38298102484809, + "rewards/rejected": -31.826697455512154, + "step": 3943 + }, + { + "epoch": 0.9868635055673715, + "grad_norm": 13.4375, + "kl": 2.114741802215576, + "learning_rate": 5e-06, + "logits/chosen": -42494712.88888889, + "logits/rejected": -54532599.46666667, + "logps/chosen": -531.55712890625, + "logps/rejected": -703.9520833333333, + "loss": 0.0335, + "rewards/chosen": 12.468458387586805, + "rewards/margins": 38.12098931206597, + "rewards/rejected": -25.652530924479166, + "step": 3944 + }, + { + "epoch": 0.9871137245089453, + "grad_norm": 0.96484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -53907702.15384615, + "logits/rejected": -57878446.54545455, + "logps/chosen": -391.70616736778845, + "logps/rejected": -626.0774591619319, + "loss": 0.0014, + "rewards/chosen": 8.92250706599309, + "rewards/margins": 32.94486535345758, + "rewards/rejected": -24.02235828746449, + "step": 3945 + }, + { + "epoch": 0.9873639434505193, + "grad_norm": 1.9296875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -29963148.8, + "logits/rejected": -19879853.714285713, + "logps/chosen": -417.868017578125, + "logps/rejected": -664.2801339285714, + "loss": 0.0137, + "rewards/chosen": 10.50127716064453, + "rewards/margins": 31.27513972691127, + "rewards/rejected": -20.77386256626674, + "step": 3946 + }, + { + "epoch": 0.9876141623920931, + "grad_norm": 0.72265625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -51812809.14285714, + "logits/rejected": -58584884.705882356, + "logps/chosen": -393.245849609375, + "logps/rejected": -762.5880055147059, + "loss": 0.0023, + "rewards/chosen": 10.252015250069755, + "rewards/margins": 39.32556979395762, + "rewards/rejected": -29.073554543887866, + "step": 3947 + }, + { + "epoch": 0.9878643813336669, + "grad_norm": 3.8125, + "kl": 1.9092109203338623, + "learning_rate": 5e-06, + "logits/chosen": -28000529.230769232, + "logits/rejected": -69654946.9090909, + "logps/chosen": -329.1393479567308, + "logps/rejected": -607.4361239346591, + "loss": 0.0223, + "rewards/chosen": 8.90884047288161, + "rewards/margins": 27.891068145111724, + "rewards/rejected": -18.982227672230113, + "step": 3948 + }, + { + "epoch": 0.9881146002752408, + "grad_norm": 7.125, + "kl": 2.0589828491210938, + "learning_rate": 5e-06, + "logits/chosen": -45497109.333333336, + "logits/rejected": -76610640.0, + "logps/chosen": -333.2631564670139, + "logps/rejected": -322.0388997395833, + "loss": 0.0683, + "rewards/chosen": 8.318917168511284, + "rewards/margins": 16.71895429823134, + "rewards/rejected": -8.400037129720053, + "step": 3949 + }, + { + "epoch": 0.9883648192168147, + "grad_norm": 1.6796875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47325410.461538464, + "logits/rejected": -45818845.09090909, + "logps/chosen": -321.19542518028845, + "logps/rejected": -651.3072620738636, + "loss": 0.019, + "rewards/chosen": 8.775801438551683, + "rewards/margins": 33.89396550105168, + "rewards/rejected": -25.1181640625, + "step": 3950 + }, + { + "epoch": 0.9886150381583886, + "grad_norm": 0.130859375, + "kl": 4.493025302886963, + "learning_rate": 5e-06, + "logits/chosen": -47206528.0, + "logits/rejected": -26520050.666666668, + "logps/chosen": -453.1259765625, + "logps/rejected": -566.3223470052084, + "loss": 0.0003, + "rewards/chosen": 12.524405161539713, + "rewards/margins": 33.82267506917318, + "rewards/rejected": -21.298269907633465, + "step": 3951 + }, + { + "epoch": 0.9888652570999624, + "grad_norm": 1.09375, + "kl": 2.585041046142578, + "learning_rate": 5e-06, + "logits/chosen": -32735785.14285714, + "logits/rejected": 131204300.8, + "logps/chosen": -406.31703404017856, + "logps/rejected": -747.648388671875, + "loss": 0.0015, + "rewards/chosen": 9.250094822474889, + "rewards/margins": 32.4478273664202, + "rewards/rejected": -23.197732543945314, + "step": 3952 + }, + { + "epoch": 0.9891154760415364, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20406521.846153848, + "logits/rejected": -43217384.72727273, + "logps/chosen": -308.88037109375, + "logps/rejected": -609.6330788352273, + "loss": 0.0343, + "rewards/chosen": 7.953879136305589, + "rewards/margins": 26.958494226415674, + "rewards/rejected": -19.004615090110086, + "step": 3953 + }, + { + "epoch": 0.9893656949831102, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32097112.0, + "logits/rejected": -38692576.0, + "logps/chosen": -309.71502685546875, + "logps/rejected": -725.3766276041666, + "loss": 0.0342, + "rewards/chosen": 8.32809321085612, + "rewards/margins": 28.434861501057945, + "rewards/rejected": -20.106768290201824, + "step": 3954 + }, + { + "epoch": 0.9896159139246841, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -43478304.0, + "logits/rejected": -56707520.0, + "logps/chosen": -361.45478515625, + "logps/rejected": -623.1678292410714, + "loss": 0.0142, + "rewards/chosen": 7.855131530761719, + "rewards/margins": 30.855000741141183, + "rewards/rejected": -22.999869210379465, + "step": 3955 + }, + { + "epoch": 0.989866132866258, + "grad_norm": 3.796875, + "kl": 5.011376857757568, + "learning_rate": 5e-06, + "logits/chosen": -30940270.222222224, + "logits/rejected": -64041518.93333333, + "logps/chosen": -415.21533203125, + "logps/rejected": -659.2696614583333, + "loss": 0.0382, + "rewards/chosen": 10.734215630425346, + "rewards/margins": 34.966053602430556, + "rewards/rejected": -24.23183797200521, + "step": 3956 + }, + { + "epoch": 0.9901163518078319, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -34440978.666666664, + "logits/rejected": -43845645.333333336, + "logps/chosen": -334.89255777994794, + "logps/rejected": -638.4994710286459, + "loss": 0.0406, + "rewards/chosen": 8.733612696329752, + "rewards/margins": 29.712447484334312, + "rewards/rejected": -20.97883478800456, + "step": 3957 + }, + { + "epoch": 0.9903665707494057, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30342094.0, + "logits/rejected": -5984259.0, + "logps/chosen": -271.0609436035156, + "logps/rejected": -572.8228149414062, + "loss": 0.0851, + "rewards/chosen": 6.186017036437988, + "rewards/margins": 30.027365684509277, + "rewards/rejected": -23.84134864807129, + "step": 3958 + }, + { + "epoch": 0.9906167896909797, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21171751.384615384, + "logits/rejected": -50809890.90909091, + "logps/chosen": -336.90981820913464, + "logps/rejected": -546.9691051136364, + "loss": 0.0215, + "rewards/chosen": 8.231268075796274, + "rewards/margins": 28.721452512941163, + "rewards/rejected": -20.490184437144887, + "step": 3959 + }, + { + "epoch": 0.9908670086325535, + "grad_norm": 2.234375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37973939.2, + "logits/rejected": -38702560.0, + "logps/chosen": -346.8701904296875, + "logps/rejected": -716.72900390625, + "loss": 0.0497, + "rewards/chosen": 8.475537109375, + "rewards/margins": 34.29376133510045, + "rewards/rejected": -25.818224225725448, + "step": 3960 + }, + { + "epoch": 0.9911172275741273, + "grad_norm": 9.3125, + "kl": 4.190396785736084, + "learning_rate": 5e-06, + "logits/chosen": -6810541.6, + "logits/rejected": -45286345.14285714, + "logps/chosen": -205.781982421875, + "logps/rejected": -556.1872907366071, + "loss": 0.0575, + "rewards/chosen": 7.310523986816406, + "rewards/margins": 27.405501229422434, + "rewards/rejected": -20.094977242606028, + "step": 3961 + }, + { + "epoch": 0.9913674465157012, + "grad_norm": 18.0, + "kl": 1.38067626953125, + "learning_rate": 5e-06, + "logits/chosen": -76666453.33333333, + "logits/rejected": -48690165.333333336, + "logps/chosen": -442.7099202473958, + "logps/rejected": -673.058349609375, + "loss": 0.0531, + "rewards/chosen": 11.765276590983072, + "rewards/margins": 31.92256418863932, + "rewards/rejected": -20.15728759765625, + "step": 3962 + }, + { + "epoch": 0.9916176654572751, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36729674.666666664, + "logits/rejected": -47037354.666666664, + "logps/chosen": -320.2273356119792, + "logps/rejected": -807.5828450520834, + "loss": 0.0877, + "rewards/chosen": 7.414924621582031, + "rewards/margins": 32.027104695638016, + "rewards/rejected": -24.61218007405599, + "step": 3963 + }, + { + "epoch": 0.991867884398849, + "grad_norm": 4.15625, + "kl": 15.500907897949219, + "learning_rate": 5e-06, + "logits/chosen": -49859464.0, + "logits/rejected": -37097800.0, + "logps/chosen": -449.969482421875, + "logps/rejected": -564.3818969726562, + "loss": 0.0132, + "rewards/chosen": 10.481056213378906, + "rewards/margins": 26.549829483032227, + "rewards/rejected": -16.06877326965332, + "step": 3964 + }, + { + "epoch": 0.9921181033404228, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37106672.0, + "logits/rejected": -27229385.14285714, + "logps/chosen": -383.689111328125, + "logps/rejected": -489.959716796875, + "loss": 0.0707, + "rewards/chosen": 10.878845977783204, + "rewards/margins": 26.086871228899277, + "rewards/rejected": -15.208025251116071, + "step": 3965 + }, + { + "epoch": 0.9923683222819968, + "grad_norm": 7.75, + "kl": 4.65280294418335, + "learning_rate": 5e-06, + "logits/chosen": -15188214.153846154, + "logits/rejected": -47066257.45454545, + "logps/chosen": -324.2204777644231, + "logps/rejected": -857.1129261363636, + "loss": 0.0595, + "rewards/chosen": 8.03616685133714, + "rewards/margins": 36.50703643585419, + "rewards/rejected": -28.470869584517047, + "step": 3966 + }, + { + "epoch": 0.9926185412235706, + "grad_norm": 8.5625, + "kl": 12.692632675170898, + "learning_rate": 5e-06, + "logits/chosen": -24829265.454545453, + "logits/rejected": -45168477.538461536, + "logps/chosen": -431.27028586647725, + "logps/rejected": -725.1931340144231, + "loss": 0.0318, + "rewards/chosen": 11.118859724564986, + "rewards/margins": 37.536889509721235, + "rewards/rejected": -26.41802978515625, + "step": 3967 + }, + { + "epoch": 0.9928687601651445, + "grad_norm": 2.90625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -62926003.2, + "logits/rejected": -35090912.0, + "logps/chosen": -567.55556640625, + "logps/rejected": -787.8756277901786, + "loss": 0.017, + "rewards/chosen": 10.352887725830078, + "rewards/margins": 36.05899908883231, + "rewards/rejected": -25.706111363002233, + "step": 3968 + }, + { + "epoch": 0.9931189791067184, + "grad_norm": 1.7734375, + "kl": 6.0283002853393555, + "learning_rate": 5e-06, + "logits/chosen": -29337501.53846154, + "logits/rejected": -62914682.18181818, + "logps/chosen": -418.9519230769231, + "logps/rejected": -839.6400035511364, + "loss": 0.0214, + "rewards/chosen": 11.14601839505709, + "rewards/margins": 42.964877148608224, + "rewards/rejected": -31.818858753551137, + "step": 3969 + }, + { + "epoch": 0.9933691980482923, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42198946.461538464, + "logits/rejected": -34953393.45454545, + "logps/chosen": -377.6365309495192, + "logps/rejected": -593.7579900568181, + "loss": 0.0239, + "rewards/chosen": 7.9718757042518025, + "rewards/margins": 24.810108958424387, + "rewards/rejected": -16.838233254172586, + "step": 3970 + }, + { + "epoch": 0.9936194169898661, + "grad_norm": 14.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24040375.272727273, + "logits/rejected": -46528280.615384616, + "logps/chosen": -339.61372514204544, + "logps/rejected": -547.3669621394231, + "loss": 0.0402, + "rewards/chosen": 8.295546791770242, + "rewards/margins": 27.063933632590555, + "rewards/rejected": -18.768386840820312, + "step": 3971 + }, + { + "epoch": 0.9938696359314401, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30519040.0, + "logits/rejected": -20677053.333333332, + "logps/chosen": -312.00702582465277, + "logps/rejected": -689.5808919270834, + "loss": 0.0631, + "rewards/chosen": 8.195038689507378, + "rewards/margins": 28.54941134982639, + "rewards/rejected": -20.35437266031901, + "step": 3972 + }, + { + "epoch": 0.9941198548730139, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21888375.466666665, + "logits/rejected": -29557779.555555556, + "logps/chosen": -297.1052734375, + "logps/rejected": -721.9598524305555, + "loss": 0.0315, + "rewards/chosen": 8.641639200846354, + "rewards/margins": 31.512203979492185, + "rewards/rejected": -22.870564778645832, + "step": 3973 + }, + { + "epoch": 0.9943700738145878, + "grad_norm": 8.5, + "kl": 14.943087577819824, + "learning_rate": 5e-06, + "logits/chosen": -42088554.666666664, + "logits/rejected": -31933837.333333332, + "logps/chosen": -424.4392903645833, + "logps/rejected": -615.1873779296875, + "loss": 0.0385, + "rewards/chosen": 9.567827860514322, + "rewards/margins": 28.13157399495443, + "rewards/rejected": -18.563746134440105, + "step": 3974 + }, + { + "epoch": 0.9946202927561616, + "grad_norm": 0.87109375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -31922220.307692308, + "logits/rejected": -40276116.36363637, + "logps/chosen": -333.86902794471155, + "logps/rejected": -487.08354048295456, + "loss": 0.0154, + "rewards/chosen": 9.25815171461839, + "rewards/margins": 27.50727225350333, + "rewards/rejected": -18.24912053888494, + "step": 3975 + }, + { + "epoch": 0.9948705116977355, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -36897230.222222224, + "logits/rejected": -27221506.133333333, + "logps/chosen": -465.48480902777777, + "logps/rejected": -529.3318033854167, + "loss": 0.015, + "rewards/chosen": 10.384301079644096, + "rewards/margins": 27.93383314344618, + "rewards/rejected": -17.549532063802083, + "step": 3976 + }, + { + "epoch": 0.9951207306393094, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14568724.0, + "logits/rejected": -60527620.0, + "logps/chosen": -558.1046142578125, + "logps/rejected": -587.919921875, + "loss": 0.0008, + "rewards/chosen": 11.181229591369629, + "rewards/margins": 34.777085304260254, + "rewards/rejected": -23.595855712890625, + "step": 3977 + }, + { + "epoch": 0.9953709495808832, + "grad_norm": 5.28125, + "kl": 9.59811019897461, + "learning_rate": 5e-06, + "logits/chosen": -25749104.0, + "logits/rejected": -69065280.0, + "logps/chosen": -326.09388950892856, + "logps/rejected": -835.42861328125, + "loss": 0.0574, + "rewards/chosen": 8.926060812813896, + "rewards/margins": 36.27694582257952, + "rewards/rejected": -27.350885009765626, + "step": 3978 + }, + { + "epoch": 0.9956211685224572, + "grad_norm": 0.51171875, + "kl": 6.3562469482421875, + "learning_rate": 5e-06, + "logits/chosen": -42641000.72727273, + "logits/rejected": -62957952.0, + "logps/chosen": -383.52681107954544, + "logps/rejected": -746.2035006009615, + "loss": 0.0009, + "rewards/chosen": 10.290686867453836, + "rewards/margins": 36.80545972277234, + "rewards/rejected": -26.51477285531851, + "step": 3979 + }, + { + "epoch": 0.995871387464031, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -47848285.09090909, + "logits/rejected": -14248665.846153846, + "logps/chosen": -513.8664772727273, + "logps/rejected": -508.95639272836536, + "loss": 0.0402, + "rewards/chosen": 13.651870727539062, + "rewards/margins": 28.735745943509613, + "rewards/rejected": -15.083875215970552, + "step": 3980 + }, + { + "epoch": 0.9961216064056049, + "grad_norm": 5.21875, + "kl": 6.232570648193359, + "learning_rate": 5e-06, + "logits/chosen": -27180532.363636363, + "logits/rejected": -34019874.461538464, + "logps/chosen": -338.5, + "logps/rejected": -470.90831580528845, + "loss": 0.0186, + "rewards/chosen": 8.257475419477982, + "rewards/margins": 23.03724296943291, + "rewards/rejected": -14.779767549954927, + "step": 3981 + }, + { + "epoch": 0.9963718253471788, + "grad_norm": 0.8515625, + "kl": 0.5155544281005859, + "learning_rate": 5e-06, + "logits/chosen": -51104442.666666664, + "logits/rejected": -61470432.0, + "logps/chosen": -466.1374104817708, + "logps/rejected": -1086.4558919270833, + "loss": 0.012, + "rewards/chosen": 10.900699615478516, + "rewards/margins": 49.879258473714195, + "rewards/rejected": -38.97855885823568, + "step": 3982 + }, + { + "epoch": 0.9966220442887527, + "grad_norm": 1.046875, + "kl": 15.152315139770508, + "learning_rate": 5e-06, + "logits/chosen": -29074664.0, + "logits/rejected": -44868956.0, + "logps/chosen": -527.9139404296875, + "logps/rejected": -740.472412109375, + "loss": 0.015, + "rewards/chosen": 11.776090621948242, + "rewards/margins": 39.6702995300293, + "rewards/rejected": -27.894208908081055, + "step": 3983 + }, + { + "epoch": 0.9968722632303265, + "grad_norm": 13.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -42194412.307692304, + "logits/rejected": -28314778.181818184, + "logps/chosen": -429.6138446514423, + "logps/rejected": -571.6382723721591, + "loss": 0.0144, + "rewards/chosen": 10.024604210486778, + "rewards/margins": 26.796969433764477, + "rewards/rejected": -16.7723652232777, + "step": 3984 + }, + { + "epoch": 0.9971224821719004, + "grad_norm": 11.4375, + "kl": 10.635231971740723, + "learning_rate": 5e-06, + "logits/chosen": -30552768.0, + "logits/rejected": -47285680.0, + "logps/chosen": -429.8095296223958, + "logps/rejected": -707.5057779947916, + "loss": 0.0734, + "rewards/chosen": 10.152160008748373, + "rewards/margins": 37.80118497212728, + "rewards/rejected": -27.649024963378906, + "step": 3985 + }, + { + "epoch": 0.9973727011134743, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12818886.222222222, + "logits/rejected": -69408196.26666667, + "logps/chosen": -341.0488552517361, + "logps/rejected": -722.6271484375, + "loss": 0.0364, + "rewards/chosen": 8.758168538411459, + "rewards/margins": 36.78118896484375, + "rewards/rejected": -28.023020426432293, + "step": 3986 + }, + { + "epoch": 0.9976229200550482, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -37982288.0, + "logits/rejected": -10794621.333333334, + "logps/chosen": -451.7815755208333, + "logps/rejected": -728.7311197916666, + "loss": 0.0133, + "rewards/chosen": 9.61706797281901, + "rewards/margins": 34.14792124430338, + "rewards/rejected": -24.530853271484375, + "step": 3987 + }, + { + "epoch": 0.997873138996622, + "grad_norm": 0.021484375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60591674.18181818, + "logits/rejected": -40643539.692307696, + "logps/chosen": -449.9423828125, + "logps/rejected": -545.2093599759615, + "loss": 0.0, + "rewards/chosen": 11.91179032759233, + "rewards/margins": 32.02961400172094, + "rewards/rejected": -20.117823674128605, + "step": 3988 + }, + { + "epoch": 0.998123357938196, + "grad_norm": 12.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -55121693.09090909, + "logits/rejected": -44548253.538461536, + "logps/chosen": -414.1126598011364, + "logps/rejected": -662.9449368990385, + "loss": 0.0455, + "rewards/chosen": 6.894778164950284, + "rewards/margins": 30.14309030812937, + "rewards/rejected": -23.248312143179085, + "step": 3989 + }, + { + "epoch": 0.9983735768797698, + "grad_norm": 1.421875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28642167.272727273, + "logits/rejected": -41489836.307692304, + "logps/chosen": -458.8709161931818, + "logps/rejected": -600.6727764423077, + "loss": 0.0092, + "rewards/chosen": 8.172753073952414, + "rewards/margins": 31.718522398621886, + "rewards/rejected": -23.54576932466947, + "step": 3990 + }, + { + "epoch": 0.9986237958213436, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26861908.0, + "logits/rejected": -46560456.0, + "logps/chosen": -420.3039245605469, + "logps/rejected": -522.4829711914062, + "loss": 0.0198, + "rewards/chosen": 9.867569923400879, + "rewards/margins": 29.49724292755127, + "rewards/rejected": -19.62967300415039, + "step": 3991 + }, + { + "epoch": 0.9988740147629176, + "grad_norm": 3.15625, + "kl": 12.076005935668945, + "learning_rate": 5e-06, + "logits/chosen": -49633680.0, + "logits/rejected": -35480872.0, + "logps/chosen": -440.7423400878906, + "logps/rejected": -484.1937255859375, + "loss": 0.0153, + "rewards/chosen": 9.369802474975586, + "rewards/margins": 28.970916748046875, + "rewards/rejected": -19.60111427307129, + "step": 3992 + }, + { + "epoch": 0.9991242337044914, + "grad_norm": 1.3125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -60890150.4, + "logits/rejected": -52861408.0, + "logps/chosen": -319.358544921875, + "logps/rejected": -835.9989536830357, + "loss": 0.0092, + "rewards/chosen": 7.933580017089843, + "rewards/margins": 42.47269483293806, + "rewards/rejected": -34.539114815848215, + "step": 3993 + }, + { + "epoch": 0.9993744526460653, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -70344584.53333333, + "logits/rejected": -60286464.0, + "logps/chosen": -259.4136067708333, + "logps/rejected": -998.4607204861111, + "loss": 0.076, + "rewards/chosen": 6.816383870442708, + "rewards/margins": 44.20916544596354, + "rewards/rejected": -37.392781575520836, + "step": 3994 + }, + { + "epoch": 0.9996246715876392, + "grad_norm": 11.9375, + "kl": 5.460507869720459, + "learning_rate": 5e-06, + "logits/chosen": -43662973.09090909, + "logits/rejected": -57438281.84615385, + "logps/chosen": -341.693115234375, + "logps/rejected": -897.6317608173077, + "loss": 0.0627, + "rewards/chosen": 8.85528564453125, + "rewards/margins": 38.00040377103365, + "rewards/rejected": -29.145118126502403, + "step": 3995 + }, + { + "epoch": 0.9998748905292131, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30524667.076923076, + "logits/rejected": -32380634.181818184, + "logps/chosen": -347.27786959134613, + "logps/rejected": -583.8941761363636, + "loss": 0.0694, + "rewards/chosen": 7.406897324782151, + "rewards/margins": 29.70618043912874, + "rewards/rejected": -22.29928311434659, + "step": 3996 + }, + { + "epoch": 1.0, + "grad_norm": 0.06494140625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -35362256.0, + "logits/rejected": -50782952.0, + "logps/chosen": -445.2106119791667, + "logps/rejected": -768.860107421875, + "loss": 0.0001, + "rewards/chosen": 9.19583829243978, + "rewards/margins": 48.01045163472494, + "rewards/rejected": -38.814613342285156, + "step": 3997 + } + ], + "logging_steps": 1, + "max_steps": 3997, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 1999, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}