{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 200, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.032388663967611336, "grad_norm": 220.0, "kl": 0.0, "learning_rate": 0.0, "logits/chosen": -58430595.76470588, "logits/rejected": -87931281.06666666, "logps/chosen": -210.7810489430147, "logps/rejected": -94.23323567708333, "loss": 3.8188, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06477732793522267, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 6.666666666666667e-09, "logits/chosen": -62450601.4117647, "logits/rejected": -80818483.2, "logps/chosen": -223.95020967371323, "logps/rejected": -96.331640625, "loss": 3.7399, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.09716599190283401, "grad_norm": 220.0, "kl": 0.23693042993545532, "learning_rate": 1.3333333333333334e-08, "logits/chosen": -63959364.266666666, "logits/rejected": -94508792.47058824, "logps/chosen": -216.87962239583334, "logps/rejected": -109.44955623851104, "loss": 3.7552, "rewards/chosen": 0.020882568756739297, "rewards/margins": 0.013593461086937025, "rewards/rejected": 0.007289107669802273, "step": 3 }, { "epoch": 0.12955465587044535, "grad_norm": 231.0, "kl": 0.16312503814697266, "learning_rate": 2e-08, "logits/chosen": -63178187.48717949, "logits/rejected": -89863157.76, "logps/chosen": -234.58611278044873, "logps/rejected": -104.408388671875, "loss": 3.9892, "rewards/chosen": -0.012021300120231433, "rewards/margins": 0.008119157033088879, "rewards/rejected": -0.020140457153320312, "step": 4 }, { "epoch": 0.16194331983805668, "grad_norm": 203.0, "kl": 0.0684085488319397, "learning_rate": 2.6666666666666667e-08, "logits/chosen": -54792209.655172415, "logits/rejected": -89121280.0, "logps/chosen": -209.4049703663793, "logps/rejected": -110.37323521205357, "loss": 3.6589, "rewards/chosen": -0.01066252589225769, "rewards/margins": -0.027266709293637958, "rewards/rejected": 0.016604183401380267, "step": 5 }, { "epoch": 0.19433198380566802, "grad_norm": 236.0, "kl": 0.36925557255744934, "learning_rate": 3.3333333333333334e-08, "logits/chosen": -52978085.161290325, "logits/rejected": -86475791.51515152, "logps/chosen": -158.39511403729838, "logps/rejected": -95.0108309659091, "loss": 3.9354, "rewards/chosen": -0.018092793803061207, "rewards/margins": -0.022386104625923775, "rewards/rejected": 0.004293310822862567, "step": 6 }, { "epoch": 0.22672064777327935, "grad_norm": 235.0, "kl": 0.0888870358467102, "learning_rate": 4e-08, "logits/chosen": -60282100.36363637, "logits/rejected": -84002072.77419356, "logps/chosen": -144.59225556344697, "logps/rejected": -128.4602523311492, "loss": 3.8797, "rewards/chosen": 0.014703954711104885, "rewards/margins": 0.02637073811314789, "rewards/rejected": -0.011666783402043005, "step": 7 }, { "epoch": 0.2591093117408907, "grad_norm": 196.0, "kl": 0.044393956661224365, "learning_rate": 4.666666666666667e-08, "logits/chosen": -48575488.0, "logits/rejected": -91620359.31428571, "logps/chosen": -236.1129781788793, "logps/rejected": -113.21409737723214, "loss": 3.7969, "rewards/chosen": 0.02754283362421496, "rewards/margins": 0.055130841755514665, "rewards/rejected": -0.0275880081312997, "step": 8 }, { "epoch": 0.291497975708502, "grad_norm": 192.0, "kl": 0.08808565139770508, "learning_rate": 5.3333333333333334e-08, "logits/chosen": -70911016.42105263, "logits/rejected": -87392147.6923077, "logps/chosen": -179.357421875, "logps/rejected": -124.7702167217548, "loss": 3.823, "rewards/chosen": 0.0129009579357348, "rewards/margins": -0.008311521307176904, "rewards/rejected": 0.021212479242911704, "step": 9 }, { "epoch": 0.32388663967611336, "grad_norm": 214.0, "kl": 0.2002100646495819, "learning_rate": 6e-08, "logits/chosen": -66679952.69565217, "logits/rejected": -90590114.34146342, "logps/chosen": -140.9425632642663, "logps/rejected": -99.87619688452745, "loss": 3.7412, "rewards/chosen": -0.012947371472483095, "rewards/margins": -0.019486692999731572, "rewards/rejected": 0.006539321527248476, "step": 10 }, { "epoch": 0.3562753036437247, "grad_norm": 192.0, "kl": 0.28088629245758057, "learning_rate": 6.666666666666667e-08, "logits/chosen": -58234137.6, "logits/rejected": -85435400.8275862, "logps/chosen": -193.31727120535714, "logps/rejected": -117.9307061557112, "loss": 3.7995, "rewards/chosen": 0.007582300049918039, "rewards/margins": 0.006378676481728483, "rewards/rejected": 0.001203623568189555, "step": 11 }, { "epoch": 0.38866396761133604, "grad_norm": 205.0, "kl": 0.11991578340530396, "learning_rate": 7.333333333333333e-08, "logits/chosen": -58462386.28571428, "logits/rejected": -85638883.55555555, "logps/chosen": -282.8749476841518, "logps/rejected": -97.3487548828125, "loss": 3.7668, "rewards/chosen": -0.00441945663520268, "rewards/margins": -0.017723340836782304, "rewards/rejected": 0.013303884201579623, "step": 12 }, { "epoch": 0.42105263157894735, "grad_norm": 404.0, "kl": 0.37106019258499146, "learning_rate": 8e-08, "logits/chosen": -64843439.40740741, "logits/rejected": -90882241.72972973, "logps/chosen": -249.62418619791666, "logps/rejected": -112.53064294763513, "loss": 3.7803, "rewards/chosen": -0.011102434661653306, "rewards/margins": -0.023103111409568215, "rewards/rejected": 0.012000676747914907, "step": 13 }, { "epoch": 0.4534412955465587, "grad_norm": 199.0, "kl": 0.09121906757354736, "learning_rate": 8.666666666666666e-08, "logits/chosen": -51182941.86666667, "logits/rejected": -89462159.05882353, "logps/chosen": -254.28321940104166, "logps/rejected": -88.68126005284927, "loss": 3.7676, "rewards/chosen": -0.011877447366714478, "rewards/margins": -0.012099783210193409, "rewards/rejected": 0.0002223358434789321, "step": 14 }, { "epoch": 0.48582995951417, "grad_norm": 190.0, "kl": 0.04368013143539429, "learning_rate": 9.333333333333334e-08, "logits/chosen": -54074613.76, "logits/rejected": -91609744.41025642, "logps/chosen": -233.44771484375, "logps/rejected": -126.24091045673077, "loss": 3.8555, "rewards/chosen": 0.015118194818496704, "rewards/margins": 0.012803569940420298, "rewards/rejected": 0.0023146248780764067, "step": 15 }, { "epoch": 0.5182186234817814, "grad_norm": 197.0, "kl": 0.13226062059402466, "learning_rate": 1e-07, "logits/chosen": -61068962.90909091, "logits/rejected": -94692616.25806452, "logps/chosen": -242.7887665719697, "logps/rejected": -119.58937121975806, "loss": 3.7451, "rewards/chosen": -0.0001258962985241052, "rewards/margins": -0.01505279516253187, "rewards/rejected": 0.014926898864007766, "step": 16 }, { "epoch": 0.5506072874493927, "grad_norm": 230.0, "kl": 0.29574519395828247, "learning_rate": 1.0666666666666667e-07, "logits/chosen": -62746458.838709675, "logits/rejected": -91721153.93939394, "logps/chosen": -174.30223034274192, "logps/rejected": -130.68303148674244, "loss": 3.9933, "rewards/chosen": 0.014514055944258166, "rewards/margins": 0.03509998304053835, "rewards/rejected": -0.020585927096280186, "step": 17 }, { "epoch": 0.582995951417004, "grad_norm": 196.0, "kl": 0.14987093210220337, "learning_rate": 1.1333333333333332e-07, "logits/chosen": -56216569.2631579, "logits/rejected": -88530510.76923077, "logps/chosen": -245.4069181743421, "logps/rejected": -114.16628793569711, "loss": 3.7232, "rewards/chosen": 0.004077738445056112, "rewards/margins": 0.02320850369056709, "rewards/rejected": -0.01913076524551098, "step": 18 }, { "epoch": 0.6153846153846154, "grad_norm": 187.0, "kl": 0.3079838752746582, "learning_rate": 1.2e-07, "logits/chosen": -63619196.0, "logits/rejected": -97384832.0, "logps/chosen": -250.46109008789062, "logps/rejected": -121.32066345214844, "loss": 3.7837, "rewards/chosen": -0.007823335006833076, "rewards/margins": -0.0028944951482117176, "rewards/rejected": -0.004928839858621359, "step": 19 }, { "epoch": 0.6477732793522267, "grad_norm": 178.0, "kl": 0.2797037363052368, "learning_rate": 1.2666666666666666e-07, "logits/chosen": -72328352.91428572, "logits/rejected": -87969871.44827586, "logps/chosen": -166.88225446428572, "logps/rejected": -125.47866716056035, "loss": 3.7522, "rewards/chosen": -0.005298037188393729, "rewards/margins": -0.016832793815969832, "rewards/rejected": 0.011534756627576104, "step": 20 }, { "epoch": 0.680161943319838, "grad_norm": 169.0, "kl": 0.18229222297668457, "learning_rate": 1.3333333333333334e-07, "logits/chosen": -64960970.10526316, "logits/rejected": -91011012.92307693, "logps/chosen": -207.93300267269737, "logps/rejected": -106.05503493088942, "loss": 3.7582, "rewards/chosen": -0.0011691284414968993, "rewards/margins": 0.022455291724518725, "rewards/rejected": -0.023624420166015625, "step": 21 }, { "epoch": 0.7125506072874493, "grad_norm": 245.0, "kl": 0.2452963888645172, "learning_rate": 1.4e-07, "logits/chosen": -65512395.48717949, "logits/rejected": -87635998.72, "logps/chosen": -151.80533854166666, "logps/rejected": -93.67271484375, "loss": 3.8238, "rewards/chosen": -0.01152306718704028, "rewards/margins": -0.025706659983365965, "rewards/rejected": 0.014183592796325684, "step": 22 }, { "epoch": 0.7449392712550608, "grad_norm": 207.0, "kl": 0.26080161333084106, "learning_rate": 1.4666666666666666e-07, "logits/chosen": -67878600.0, "logits/rejected": -89875760.0, "logps/chosen": -170.92079162597656, "logps/rejected": -118.2757797241211, "loss": 3.8494, "rewards/chosen": -0.0006897321436554193, "rewards/margins": 0.015984160592779517, "rewards/rejected": -0.016673892736434937, "step": 23 }, { "epoch": 0.7773279352226721, "grad_norm": 216.0, "kl": 0.32619625329971313, "learning_rate": 1.533333333333333e-07, "logits/chosen": -62605857.88235294, "logits/rejected": -85273693.86666666, "logps/chosen": -216.76492130055146, "logps/rejected": -109.14388020833333, "loss": 3.8839, "rewards/chosen": 0.025327740346684176, "rewards/margins": 0.04310074074595582, "rewards/rejected": -0.017773000399271648, "step": 24 }, { "epoch": 0.8097165991902834, "grad_norm": 217.0, "kl": 0.17487949132919312, "learning_rate": 1.6e-07, "logits/chosen": -75539535.44827586, "logits/rejected": -90929020.34285714, "logps/chosen": -189.0615234375, "logps/rejected": -125.88074776785714, "loss": 3.8346, "rewards/chosen": -0.02288691339821651, "rewards/margins": -0.018209210698827735, "rewards/rejected": -0.004677702699388777, "step": 25 }, { "epoch": 0.8421052631578947, "grad_norm": 231.0, "kl": 0.2163093388080597, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -58751488.0, "logits/rejected": -95374936.0, "logps/chosen": -256.7890319824219, "logps/rejected": -102.30900573730469, "loss": 3.9043, "rewards/chosen": -0.02519126608967781, "rewards/margins": -0.018108618445694447, "rewards/rejected": -0.007082647643983364, "step": 26 }, { "epoch": 0.8744939271255061, "grad_norm": 185.0, "kl": 0.09094679355621338, "learning_rate": 1.7333333333333332e-07, "logits/chosen": -56075064.88888889, "logits/rejected": -88336576.0, "logps/chosen": -246.36691623263889, "logps/rejected": -113.58909388950893, "loss": 3.831, "rewards/chosen": -0.01602463920911153, "rewards/margins": -0.00417077505872363, "rewards/rejected": -0.0118538641503879, "step": 27 }, { "epoch": 0.9068825910931174, "grad_norm": 207.0, "kl": 0.12784463167190552, "learning_rate": 1.8e-07, "logits/chosen": -56007577.6, "logits/rejected": -94364580.1025641, "logps/chosen": -254.408984375, "logps/rejected": -127.31337640224359, "loss": 3.7463, "rewards/chosen": 0.016985907554626464, "rewards/margins": 0.020557051835915982, "rewards/rejected": -0.0035711442812895165, "step": 28 }, { "epoch": 0.9392712550607287, "grad_norm": 197.0, "kl": 0.04759460687637329, "learning_rate": 1.8666666666666667e-07, "logits/chosen": -73918902.85714285, "logits/rejected": -90738123.03448276, "logps/chosen": -173.3303013392857, "logps/rejected": -117.31009226831897, "loss": 3.8649, "rewards/chosen": 0.0025086658341544016, "rewards/margins": 0.007073412534638579, "rewards/rejected": -0.004564746700484177, "step": 29 }, { "epoch": 0.97165991902834, "grad_norm": 227.0, "kl": 0.10075879096984863, "learning_rate": 1.9333333333333332e-07, "logits/chosen": -67702632.2962963, "logits/rejected": -93046195.8918919, "logps/chosen": -165.40802228009258, "logps/rejected": -114.92798511402027, "loss": 3.7333, "rewards/chosen": 0.01195223574285154, "rewards/margins": 0.04874239404041607, "rewards/rejected": -0.03679015829756453, "step": 30 }, { "epoch": 1.0, "grad_norm": 213.0, "kl": 0.12372662127017975, "learning_rate": 2e-07, "logits/chosen": -53873344.0, "logits/rejected": -90412911.30434783, "logps/chosen": -127.94631476151316, "logps/rejected": -102.76524286684783, "loss": 3.7935, "rewards/chosen": -0.012544832731548109, "rewards/margins": 0.0057541689829095286, "rewards/rejected": -0.018299001714457638, "step": 31 }, { "epoch": 1.0323886639676114, "grad_norm": 212.0, "kl": 0.26420849561691284, "learning_rate": 2.0666666666666666e-07, "logits/chosen": -58409840.941176474, "logits/rejected": -87994436.26666667, "logps/chosen": -210.7363712086397, "logps/rejected": -94.37674967447917, "loss": 3.7993, "rewards/chosen": 0.004469211487209096, "rewards/margins": 0.01882150529646406, "rewards/rejected": -0.014352293809254964, "step": 32 }, { "epoch": 1.0647773279352226, "grad_norm": 205.0, "kl": 0.15617316961288452, "learning_rate": 2.1333333333333334e-07, "logits/chosen": -62545208.47058824, "logits/rejected": -80585830.4, "logps/chosen": -223.84014533547793, "logps/rejected": -96.57676595052084, "loss": 3.7213, "rewards/chosen": 0.01100671466659097, "rewards/margins": 0.03551927185526081, "rewards/rejected": -0.02451255718866984, "step": 33 }, { "epoch": 1.097165991902834, "grad_norm": 212.0, "kl": 0.06040966510772705, "learning_rate": 2.1999999999999998e-07, "logits/chosen": -63960251.733333334, "logits/rejected": -94734132.70588236, "logps/chosen": -217.113916015625, "logps/rejected": -109.65385885799633, "loss": 3.7561, "rewards/chosen": -0.002544252077738444, "rewards/margins": 0.010596939979815015, "rewards/rejected": -0.01314119205755346, "step": 34 }, { "epoch": 1.1295546558704452, "grad_norm": 227.0, "kl": 0.10720312595367432, "learning_rate": 2.2666666666666663e-07, "logits/chosen": -63211966.35897436, "logits/rejected": -89783726.08, "logps/chosen": -234.25676081730768, "logps/rejected": -104.3000390625, "loss": 3.969, "rewards/chosen": 0.020914004399226263, "rewards/margins": 0.030220148517535284, "rewards/rejected": -0.009306144118309021, "step": 35 }, { "epoch": 1.1619433198380567, "grad_norm": 197.0, "kl": 0.005383551120758057, "learning_rate": 2.3333333333333333e-07, "logits/chosen": -54717854.89655172, "logits/rejected": -89043229.25714286, "logps/chosen": -209.59223464439654, "logps/rejected": -110.57509765625, "loss": 3.6599, "rewards/chosen": -0.029390470734957992, "rewards/margins": -0.025808011898266273, "rewards/rejected": -0.00358245883669172, "step": 36 }, { "epoch": 1.194331983805668, "grad_norm": 239.0, "kl": 0.1495572328567505, "learning_rate": 2.4e-07, "logits/chosen": -52902044.90322581, "logits/rejected": -86327823.51515152, "logps/chosen": -158.13993195564515, "logps/rejected": -95.27357806581439, "loss": 3.9352, "rewards/chosen": 0.0074248765745470605, "rewards/margins": 0.029407006350663633, "rewards/rejected": -0.021982129776116573, "step": 37 }, { "epoch": 1.2267206477732793, "grad_norm": 237.0, "kl": 0.17384201288223267, "learning_rate": 2.4666666666666665e-07, "logits/chosen": -60058088.72727273, "logits/rejected": -84006499.09677419, "logps/chosen": -144.7613340435606, "logps/rejected": -128.73287865423387, "loss": 3.8516, "rewards/chosen": -0.0022030806902683144, "rewards/margins": 0.036726773979610244, "rewards/rejected": -0.03892985466987856, "step": 38 }, { "epoch": 1.2591093117408907, "grad_norm": 182.0, "kl": 0.15346676111221313, "learning_rate": 2.533333333333333e-07, "logits/chosen": -48408002.20689655, "logits/rejected": -91550485.94285715, "logps/chosen": -236.32336004849137, "logps/rejected": -113.23631417410714, "loss": 3.7963, "rewards/chosen": 0.006504716544315733, "rewards/margins": 0.03631391219904857, "rewards/rejected": -0.02980919565473284, "step": 39 }, { "epoch": 1.291497975708502, "grad_norm": 211.0, "kl": 0.1730141043663025, "learning_rate": 2.6e-07, "logits/chosen": -70837530.94736843, "logits/rejected": -87514023.38461539, "logps/chosen": -179.63252981085526, "logps/rejected": -125.02696814903847, "loss": 3.8002, "rewards/chosen": -0.01461096813804225, "rewards/margins": -0.010148805675477634, "rewards/rejected": -0.0044621624625646155, "step": 40 }, { "epoch": 1.3238866396761133, "grad_norm": 221.0, "kl": 0.11359253525733948, "learning_rate": 2.6666666666666667e-07, "logits/chosen": -66725420.52173913, "logits/rejected": -90649818.53658536, "logps/chosen": -141.18204398777175, "logps/rejected": -100.03291730182927, "loss": 3.7192, "rewards/chosen": -0.036896081074424415, "rewards/margins": -0.027763956045674623, "rewards/rejected": -0.009132125028749792, "step": 41 }, { "epoch": 1.3562753036437247, "grad_norm": 198.0, "kl": 0.043334782123565674, "learning_rate": 2.733333333333333e-07, "logits/chosen": -58113312.91428571, "logits/rejected": -85511291.5862069, "logps/chosen": -193.58042689732142, "logps/rejected": -118.14533102101294, "loss": 3.7884, "rewards/chosen": -0.01873276744570051, "rewards/margins": 0.0015272747119659293, "rewards/rejected": -0.020260042157666438, "step": 42 }, { "epoch": 1.3886639676113361, "grad_norm": 192.0, "kl": 0.09052866697311401, "learning_rate": 2.8e-07, "logits/chosen": -58638514.28571428, "logits/rejected": -85584270.22222222, "logps/chosen": -283.0006801060268, "logps/rejected": -97.80012342664931, "loss": 3.7554, "rewards/chosen": -0.01699193673474448, "rewards/margins": 0.014840672176981727, "rewards/rejected": -0.03183260891172621, "step": 43 }, { "epoch": 1.4210526315789473, "grad_norm": 328.0, "kl": 0.17926907539367676, "learning_rate": 2.866666666666667e-07, "logits/chosen": -64984988.44444445, "logits/rejected": -90971129.08108108, "logps/chosen": -249.94845920138889, "logps/rejected": -112.96752269847973, "loss": 3.7747, "rewards/chosen": -0.04352985488043891, "rewards/margins": -0.011843407118284667, "rewards/rejected": -0.03168644776215424, "step": 44 }, { "epoch": 1.4534412955465588, "grad_norm": 189.0, "kl": 0.09499406814575195, "learning_rate": 2.933333333333333e-07, "logits/chosen": -51370278.4, "logits/rejected": -89449554.8235294, "logps/chosen": -254.2802734375, "logps/rejected": -89.32239487591912, "loss": 3.7488, "rewards/chosen": -0.011582699418067933, "rewards/margins": 0.05230814341236563, "rewards/rejected": -0.06389084283043356, "step": 45 }, { "epoch": 1.48582995951417, "grad_norm": 206.0, "kl": 0.044406354427337646, "learning_rate": 3e-07, "logits/chosen": -54140912.64, "logits/rejected": -91693574.56410256, "logps/chosen": -233.1951953125, "logps/rejected": -126.72333233173077, "loss": 3.8303, "rewards/chosen": 0.040367259979248046, "rewards/margins": 0.08629294444353153, "rewards/rejected": -0.04592568446428348, "step": 46 }, { "epoch": 1.5182186234817814, "grad_norm": 185.0, "kl": 0.04556882381439209, "learning_rate": 3.066666666666666e-07, "logits/chosen": -61412363.63636363, "logits/rejected": -95139228.90322581, "logps/chosen": -242.9996152935606, "logps/rejected": -119.96295362903226, "loss": 3.7595, "rewards/chosen": -0.02121250376556859, "rewards/margins": 0.001219373586007693, "rewards/rejected": -0.02243187735157628, "step": 47 }, { "epoch": 1.5506072874493926, "grad_norm": 214.0, "kl": 0.19127535820007324, "learning_rate": 3.1333333333333333e-07, "logits/chosen": -62732676.12903226, "logits/rejected": -91854072.24242425, "logps/chosen": -174.46708039314515, "logps/rejected": -130.95345052083334, "loss": 3.9625, "rewards/chosen": -0.001973065637773083, "rewards/margins": 0.045654672390088545, "rewards/rejected": -0.047627738027861626, "step": 48 }, { "epoch": 1.582995951417004, "grad_norm": 192.0, "kl": 0.2232951521873474, "learning_rate": 3.2e-07, "logits/chosen": -56241034.10526316, "logits/rejected": -88449723.07692307, "logps/chosen": -245.80191200657896, "logps/rejected": -114.557861328125, "loss": 3.6986, "rewards/chosen": -0.0354247438280206, "rewards/margins": 0.022862803356850196, "rewards/rejected": -0.058287547184870794, "step": 49 }, { "epoch": 1.6153846153846154, "grad_norm": 187.0, "kl": 0.04990750551223755, "learning_rate": 3.2666666666666663e-07, "logits/chosen": -64088552.0, "logits/rejected": -97265312.0, "logps/chosen": -250.58773803710938, "logps/rejected": -121.63117980957031, "loss": 3.7894, "rewards/chosen": -0.020488403737545013, "rewards/margins": 0.015492349863052368, "rewards/rejected": -0.03598075360059738, "step": 50 }, { "epoch": 1.6477732793522266, "grad_norm": 180.0, "kl": 0.11040717363357544, "learning_rate": 3.333333333333333e-07, "logits/chosen": -72300434.28571428, "logits/rejected": -88177575.72413793, "logps/chosen": -167.09561941964284, "logps/rejected": -126.21796706627156, "loss": 3.7607, "rewards/chosen": -0.02663487025669643, "rewards/margins": 0.03576011869120481, "rewards/rejected": -0.062394988947901235, "step": 51 }, { "epoch": 1.680161943319838, "grad_norm": 169.0, "kl": 0.031675100326538086, "learning_rate": 3.4000000000000003e-07, "logits/chosen": -65065728.0, "logits/rejected": -90847330.46153846, "logps/chosen": -207.9890779194079, "logps/rejected": -106.46443997896634, "loss": 3.735, "rewards/chosen": -0.006777437109696238, "rewards/margins": 0.05778750161892972, "rewards/rejected": -0.06456493872862595, "step": 52 }, { "epoch": 1.7125506072874495, "grad_norm": 252.0, "kl": 0.09681445360183716, "learning_rate": 3.4666666666666665e-07, "logits/chosen": -65318413.12820513, "logits/rejected": -87948451.84, "logps/chosen": -151.71540715144232, "logps/rejected": -94.358984375, "loss": 3.8119, "rewards/chosen": -0.002530088409399375, "rewards/margins": 0.051912841812158245, "rewards/rejected": -0.05444293022155762, "step": 53 }, { "epoch": 1.7449392712550607, "grad_norm": 227.0, "kl": 0.0, "learning_rate": 3.533333333333333e-07, "logits/chosen": -68032464.0, "logits/rejected": -89987720.0, "logps/chosen": -171.3027801513672, "logps/rejected": -119.03023529052734, "loss": 3.8662, "rewards/chosen": -0.03888993337750435, "rewards/margins": 0.053229544311761856, "rewards/rejected": -0.0921194776892662, "step": 54 }, { "epoch": 1.777327935222672, "grad_norm": 229.0, "kl": 0.1575755476951599, "learning_rate": 3.6e-07, "logits/chosen": -62680342.5882353, "logits/rejected": -85439786.66666667, "logps/chosen": -216.95730411305146, "logps/rejected": -109.880810546875, "loss": 3.8791, "rewards/chosen": 0.0060911914881537944, "rewards/margins": 0.09755752297008738, "rewards/rejected": -0.09146633148193359, "step": 55 }, { "epoch": 1.8097165991902835, "grad_norm": 205.0, "kl": 0.011227190494537354, "learning_rate": 3.666666666666666e-07, "logits/chosen": -75584723.86206897, "logits/rejected": -91031215.54285714, "logps/chosen": -189.27225889008622, "logps/rejected": -126.86244419642857, "loss": 3.8159, "rewards/chosen": -0.04395907500694538, "rewards/margins": 0.05888716599036908, "rewards/rejected": -0.10284624099731446, "step": 56 }, { "epoch": 1.8421052631578947, "grad_norm": 205.0, "kl": 0.13237720727920532, "learning_rate": 3.7333333333333334e-07, "logits/chosen": -59131476.0, "logits/rejected": -95583968.0, "logps/chosen": -256.7174377441406, "logps/rejected": -103.1009750366211, "loss": 3.898, "rewards/chosen": -0.01803133636713028, "rewards/margins": 0.06824938207864761, "rewards/rejected": -0.0862807184457779, "step": 57 }, { "epoch": 1.874493927125506, "grad_norm": 188.0, "kl": 0.01799929141998291, "learning_rate": 3.7999999999999996e-07, "logits/chosen": -55775914.666666664, "logits/rejected": -88368658.28571428, "logps/chosen": -246.25482855902777, "logps/rejected": -114.36540876116071, "loss": 3.8, "rewards/chosen": -0.004815898421737883, "rewards/margins": 0.0846694488492277, "rewards/rejected": -0.08948534727096558, "step": 58 }, { "epoch": 1.9068825910931175, "grad_norm": 224.0, "kl": 0.02940082550048828, "learning_rate": 3.8666666666666664e-07, "logits/chosen": -56140933.12, "logits/rejected": -94621124.92307693, "logps/chosen": -254.40115234375, "logps/rejected": -128.3409204727564, "loss": 3.7634, "rewards/chosen": 0.017768787145614626, "rewards/margins": 0.12409534274003445, "rewards/rejected": -0.10632655559441982, "step": 59 }, { "epoch": 1.9392712550607287, "grad_norm": 230.0, "kl": 0.05236637592315674, "learning_rate": 3.933333333333333e-07, "logits/chosen": -73944693.02857143, "logits/rejected": -90982444.13793103, "logps/chosen": -173.64387555803572, "logps/rejected": -118.02784886853448, "loss": 3.8531, "rewards/chosen": -0.02884887967790876, "rewards/margins": 0.04749297132633003, "rewards/rejected": -0.07634185100423879, "step": 60 }, { "epoch": 1.97165991902834, "grad_norm": 203.0, "kl": 0.10479164123535156, "learning_rate": 4e-07, "logits/chosen": -67756994.37037037, "logits/rejected": -93152989.4054054, "logps/chosen": -165.50672743055554, "logps/rejected": -115.58003853462837, "loss": 3.6903, "rewards/chosen": 0.002080100554007071, "rewards/margins": 0.1040766963490972, "rewards/rejected": -0.10199659579509013, "step": 61 }, { "epoch": 2.0, "grad_norm": 223.0, "kl": 0.018572943285107613, "learning_rate": 4.0666666666666666e-07, "logits/chosen": -53805850.94736842, "logits/rejected": -90529268.86956522, "logps/chosen": -128.19540244654604, "logps/rejected": -103.58083177649456, "loss": 3.779, "rewards/chosen": -0.03745367025074206, "rewards/margins": 0.06240318323436536, "rewards/rejected": -0.09985685348510742, "step": 62 }, { "epoch": 2.032388663967611, "grad_norm": 229.0, "kl": 0.18244534730911255, "learning_rate": 4.1333333333333333e-07, "logits/chosen": -58454268.23529412, "logits/rejected": -88176153.6, "logps/chosen": -210.8977768841912, "logps/rejected": -94.99117838541666, "loss": 3.7903, "rewards/chosen": -0.01167152208440444, "rewards/margins": 0.06412222104914048, "rewards/rejected": -0.07579374313354492, "step": 63 }, { "epoch": 2.064777327935223, "grad_norm": 203.0, "kl": 0.10842978954315186, "learning_rate": 4.1999999999999995e-07, "logits/chosen": -62472903.52941176, "logits/rejected": -80891144.53333333, "logps/chosen": -224.19175091911765, "logps/rejected": -96.9512451171875, "loss": 3.7124, "rewards/chosen": -0.024154009187922758, "rewards/margins": 0.037806827063653986, "rewards/rejected": -0.061960836251576744, "step": 64 }, { "epoch": 2.097165991902834, "grad_norm": 244.0, "kl": 0.0, "learning_rate": 4.266666666666667e-07, "logits/chosen": -64039726.93333333, "logits/rejected": -94997970.8235294, "logps/chosen": -217.54044596354166, "logps/rejected": -110.7608283547794, "loss": 3.7247, "rewards/chosen": -0.04519684314727783, "rewards/margins": 0.07864142726449405, "rewards/rejected": -0.12383827041177188, "step": 65 }, { "epoch": 2.1295546558704452, "grad_norm": 249.0, "kl": 0.018720507621765137, "learning_rate": 4.3333333333333335e-07, "logits/chosen": -63058005.333333336, "logits/rejected": -90236334.08, "logps/chosen": -234.40567407852564, "logps/rejected": -105.177529296875, "loss": 3.9646, "rewards/chosen": 0.006023437930987432, "rewards/margins": 0.10307669873421009, "rewards/rejected": -0.09705326080322266, "step": 66 }, { "epoch": 2.161943319838057, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -55136326.62068965, "logits/rejected": -89366396.34285714, "logps/chosen": -209.48905576508622, "logps/rejected": -111.65326450892857, "loss": 3.6105, "rewards/chosen": -0.0190719879906753, "rewards/margins": 0.09232676340441398, "rewards/rejected": -0.11139875139508928, "step": 67 }, { "epoch": 2.194331983805668, "grad_norm": 190.0, "kl": 0.056352436542510986, "learning_rate": 4.4666666666666664e-07, "logits/chosen": -52990773.67741936, "logits/rejected": -86883995.15151516, "logps/chosen": -158.23761970766128, "logps/rejected": -96.30680338541667, "loss": 3.8777, "rewards/chosen": -0.002344240584681111, "rewards/margins": 0.12295980812692223, "rewards/rejected": -0.12530404871160333, "step": 68 }, { "epoch": 2.2267206477732793, "grad_norm": 242.0, "kl": 0.0054090023040771484, "learning_rate": 4.5333333333333326e-07, "logits/chosen": -60425867.63636363, "logits/rejected": -84169984.0, "logps/chosen": -144.87008759469697, "logps/rejected": -129.69174489667338, "loss": 3.8105, "rewards/chosen": -0.013077873172182026, "rewards/margins": 0.12173854267608851, "rewards/rejected": -0.13481641584827053, "step": 69 }, { "epoch": 2.2591093117408905, "grad_norm": 178.0, "kl": 0.020307421684265137, "learning_rate": 4.6e-07, "logits/chosen": -48812146.75862069, "logits/rejected": -91739165.25714286, "logps/chosen": -236.0985317887931, "logps/rejected": -114.47859933035714, "loss": 3.7725, "rewards/chosen": 0.02898802428409971, "rewards/margins": 0.18302607207462707, "rewards/rejected": -0.15403804779052735, "step": 70 }, { "epoch": 2.291497975708502, "grad_norm": 202.0, "kl": 0.08195632696151733, "learning_rate": 4.6666666666666666e-07, "logits/chosen": -70915456.0, "logits/rejected": -87808964.92307693, "logps/chosen": -180.18395353618422, "logps/rejected": -126.05295973557692, "loss": 3.7815, "rewards/chosen": -0.06975392918837697, "rewards/margins": 0.03730890577138678, "rewards/rejected": -0.10706283495976375, "step": 71 }, { "epoch": 2.3238866396761133, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 4.733333333333333e-07, "logits/chosen": -66860766.60869565, "logits/rejected": -90918362.53658536, "logps/chosen": -141.63720703125, "logps/rejected": -101.3832412347561, "loss": 3.6912, "rewards/chosen": -0.08241141360739003, "rewards/margins": 0.06175242754712716, "rewards/rejected": -0.1441638411545172, "step": 72 }, { "epoch": 2.3562753036437245, "grad_norm": 201.0, "kl": 0.05285078287124634, "learning_rate": 4.8e-07, "logits/chosen": -58148995.657142855, "logits/rejected": -85458688.0, "logps/chosen": -193.9041015625, "logps/rejected": -119.37925983297414, "loss": 3.7687, "rewards/chosen": -0.05109883717128209, "rewards/margins": 0.09255356906082832, "rewards/rejected": -0.1436524062321104, "step": 73 }, { "epoch": 2.388663967611336, "grad_norm": 189.0, "kl": 0.017002761363983154, "learning_rate": 4.866666666666666e-07, "logits/chosen": -58708155.428571425, "logits/rejected": -85895623.1111111, "logps/chosen": -283.2236328125, "logps/rejected": -98.8388671875, "loss": 3.7163, "rewards/chosen": -0.039288078035627096, "rewards/margins": 0.09641826720464797, "rewards/rejected": -0.13570634524027506, "step": 74 }, { "epoch": 2.4210526315789473, "grad_norm": 258.0, "kl": 0.09225642681121826, "learning_rate": 4.933333333333333e-07, "logits/chosen": -65111812.74074074, "logits/rejected": -91157974.48648648, "logps/chosen": -250.06803385416666, "logps/rejected": -114.29911845439189, "loss": 3.737, "rewards/chosen": -0.05548596823657, "rewards/margins": 0.10936014275173764, "rewards/rejected": -0.16484611098830765, "step": 75 }, { "epoch": 2.4534412955465585, "grad_norm": 191.0, "kl": 0.10859435796737671, "learning_rate": 5e-07, "logits/chosen": -51252386.13333333, "logits/rejected": -89624688.94117647, "logps/chosen": -254.38297526041666, "logps/rejected": -90.42824419806985, "loss": 3.7249, "rewards/chosen": -0.021853423118591307, "rewards/margins": 0.15262241503771612, "rewards/rejected": -0.17447583815630743, "step": 76 }, { "epoch": 2.48582995951417, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 5.066666666666667e-07, "logits/chosen": -54352476.16, "logits/rejected": -91919642.25641026, "logps/chosen": -233.6417578125, "logps/rejected": -128.34430088141025, "loss": 3.7901, "rewards/chosen": -0.00428692102432251, "rewards/margins": 0.20373707948586878, "rewards/rejected": -0.2080240005101913, "step": 77 }, { "epoch": 2.5182186234817814, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 5.133333333333333e-07, "logits/chosen": -61378466.90909091, "logits/rejected": -95343054.4516129, "logps/chosen": -243.4647401751894, "logps/rejected": -121.86517137096774, "loss": 3.7241, "rewards/chosen": -0.06772462526957194, "rewards/margins": 0.14492905780833254, "rewards/rejected": -0.2126536830779045, "step": 78 }, { "epoch": 2.5506072874493926, "grad_norm": 230.0, "kl": 0.15216386318206787, "learning_rate": 5.2e-07, "logits/chosen": -62781588.64516129, "logits/rejected": -91950995.39393939, "logps/chosen": -174.5715095766129, "logps/rejected": -132.68966027462122, "loss": 3.9193, "rewards/chosen": -0.01241672808124173, "rewards/margins": 0.20883350253454752, "rewards/rejected": -0.22125023061578925, "step": 79 }, { "epoch": 2.582995951417004, "grad_norm": 217.0, "kl": 0.01045989990234375, "learning_rate": 5.266666666666666e-07, "logits/chosen": -56160697.2631579, "logits/rejected": -88810771.6923077, "logps/chosen": -246.09932668585526, "logps/rejected": -116.0523681640625, "loss": 3.6824, "rewards/chosen": -0.06516703806425396, "rewards/margins": 0.14257117804245428, "rewards/rejected": -0.20773821610670823, "step": 80 }, { "epoch": 2.6153846153846154, "grad_norm": 172.0, "kl": 0.05806320905685425, "learning_rate": 5.333333333333333e-07, "logits/chosen": -63793476.0, "logits/rejected": -97707888.0, "logps/chosen": -250.8136749267578, "logps/rejected": -123.37957763671875, "loss": 3.7336, "rewards/chosen": -0.043080516159534454, "rewards/margins": 0.16773956269025803, "rewards/rejected": -0.21082007884979248, "step": 81 }, { "epoch": 2.6477732793522266, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 5.4e-07, "logits/chosen": -72679197.25714286, "logits/rejected": -88729114.48275863, "logps/chosen": -167.96812220982142, "logps/rejected": -127.9656182650862, "loss": 3.712, "rewards/chosen": -0.11388427189418247, "rewards/margins": 0.12327655876798584, "rewards/rejected": -0.2371608306621683, "step": 82 }, { "epoch": 2.6801619433198383, "grad_norm": 175.0, "kl": 0.09860539436340332, "learning_rate": 5.466666666666666e-07, "logits/chosen": -65102255.15789474, "logits/rejected": -91322948.92307693, "logps/chosen": -208.51667865953948, "logps/rejected": -108.3269512469952, "loss": 3.7181, "rewards/chosen": -0.059536714302866084, "rewards/margins": 0.1912797776311033, "rewards/rejected": -0.2508164919339694, "step": 83 }, { "epoch": 2.7125506072874495, "grad_norm": 240.0, "kl": 0.0, "learning_rate": 5.533333333333334e-07, "logits/chosen": -65572621.12820513, "logits/rejected": -88185815.04, "logps/chosen": -152.19298377403845, "logps/rejected": -95.966689453125, "loss": 3.7343, "rewards/chosen": -0.05028758904872797, "rewards/margins": 0.16492761709751227, "rewards/rejected": -0.21521520614624023, "step": 84 }, { "epoch": 2.7449392712550607, "grad_norm": 192.0, "kl": 0.00746995210647583, "learning_rate": 5.6e-07, "logits/chosen": -68171560.0, "logits/rejected": -90503168.0, "logps/chosen": -172.13671875, "logps/rejected": -121.0586929321289, "loss": 3.8099, "rewards/chosen": -0.12228179723024368, "rewards/margins": 0.17268472164869308, "rewards/rejected": -0.29496651887893677, "step": 85 }, { "epoch": 2.7773279352226723, "grad_norm": 202.0, "kl": 0.010839879512786865, "learning_rate": 5.666666666666666e-07, "logits/chosen": -62651245.176470585, "logits/rejected": -85748070.4, "logps/chosen": -217.32801011029412, "logps/rejected": -111.80550130208333, "loss": 3.813, "rewards/chosen": -0.03098060453639311, "rewards/margins": 0.2529542747665854, "rewards/rejected": -0.2839348793029785, "step": 86 }, { "epoch": 2.8097165991902835, "grad_norm": 219.0, "kl": 0.10649758577346802, "learning_rate": 5.733333333333334e-07, "logits/chosen": -75832258.20689656, "logits/rejected": -91454632.22857143, "logps/chosen": -190.46363146551724, "logps/rejected": -128.76707589285715, "loss": 3.7562, "rewards/chosen": -0.16309767755968818, "rewards/margins": 0.1302118338974826, "rewards/rejected": -0.29330951145717077, "step": 87 }, { "epoch": 2.8421052631578947, "grad_norm": 228.0, "kl": 0.046126484870910645, "learning_rate": 5.8e-07, "logits/chosen": -59232008.0, "logits/rejected": -96177024.0, "logps/chosen": -256.9981994628906, "logps/rejected": -105.48146057128906, "loss": 3.8255, "rewards/chosen": -0.04610701650381088, "rewards/margins": 0.27822083979845047, "rewards/rejected": -0.32432785630226135, "step": 88 }, { "epoch": 2.8744939271255063, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 5.866666666666666e-07, "logits/chosen": -56284231.11111111, "logits/rejected": -89122020.57142857, "logps/chosen": -246.57191297743054, "logps/rejected": -116.71786934988839, "loss": 3.7563, "rewards/chosen": -0.03652524285846286, "rewards/margins": 0.2882071288805159, "rewards/rejected": -0.3247323717389788, "step": 89 }, { "epoch": 2.9068825910931175, "grad_norm": 214.0, "kl": 0.0, "learning_rate": 5.933333333333334e-07, "logits/chosen": -56218849.28, "logits/rejected": -95044286.35897435, "logps/chosen": -255.12201171875, "logps/rejected": -130.9345703125, "loss": 3.6649, "rewards/chosen": -0.05431962013244629, "rewards/margins": 0.3113717360374255, "rewards/rejected": -0.3656913561698718, "step": 90 }, { "epoch": 2.9392712550607287, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 6e-07, "logits/chosen": -74079656.22857143, "logits/rejected": -91588793.37931034, "logps/chosen": -174.35126953125, "logps/rejected": -120.26845366379311, "loss": 3.8084, "rewards/chosen": -0.09958893230983189, "rewards/margins": 0.2008122620324196, "rewards/rejected": -0.3004011943422515, "step": 91 }, { "epoch": 2.97165991902834, "grad_norm": 222.0, "kl": 0.0, "learning_rate": 6.066666666666666e-07, "logits/chosen": -67999293.62962963, "logits/rejected": -93689011.8918919, "logps/chosen": -166.47797309027777, "logps/rejected": -118.10967852618244, "loss": 3.6621, "rewards/chosen": -0.09504372985274703, "rewards/margins": 0.25991599003712573, "rewards/rejected": -0.3549597198898728, "step": 92 }, { "epoch": 3.0, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 6.133333333333332e-07, "logits/chosen": -54103104.0, "logits/rejected": -90902861.91304348, "logps/chosen": -129.50056537828948, "logps/rejected": -105.81852921195652, "loss": 3.7126, "rewards/chosen": -0.167970105221397, "rewards/margins": 0.15565773987933756, "rewards/rejected": -0.32362784510073456, "step": 93 }, { "epoch": 3.032388663967611, "grad_norm": 227.0, "kl": 0.0727849006652832, "learning_rate": 6.2e-07, "logits/chosen": -58711220.705882356, "logits/rejected": -88564070.4, "logps/chosen": -211.5251034007353, "logps/rejected": -96.87139485677083, "loss": 3.7509, "rewards/chosen": -0.07440530552583582, "rewards/margins": 0.18940984314563228, "rewards/rejected": -0.2638151486714681, "step": 94 }, { "epoch": 3.064777327935223, "grad_norm": 215.0, "kl": 0.10124093294143677, "learning_rate": 6.266666666666667e-07, "logits/chosen": -62814102.5882353, "logits/rejected": -81403332.26666667, "logps/chosen": -224.80596564797793, "logps/rejected": -98.86597493489583, "loss": 3.6665, "rewards/chosen": -0.08557542632607852, "rewards/margins": 0.1678579601587034, "rewards/rejected": -0.2534333864847819, "step": 95 }, { "epoch": 3.097165991902834, "grad_norm": 210.0, "kl": 0.0, "learning_rate": 6.333333333333332e-07, "logits/chosen": -64167667.2, "logits/rejected": -95528929.88235295, "logps/chosen": -218.38707682291667, "logps/rejected": -113.95331887637867, "loss": 3.6635, "rewards/chosen": -0.12986040910085042, "rewards/margins": 0.31322660773408184, "rewards/rejected": -0.44308701683493223, "step": 96 }, { "epoch": 3.1295546558704452, "grad_norm": 231.0, "kl": 0.0, "learning_rate": 6.4e-07, "logits/chosen": -63470861.12820513, "logits/rejected": -90707824.64, "logps/chosen": -235.18073918269232, "logps/rejected": -107.789013671875, "loss": 3.9197, "rewards/chosen": -0.07148234049479167, "rewards/margins": 0.2867210896809896, "rewards/rejected": -0.35820343017578127, "step": 97 }, { "epoch": 3.161943319838057, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 6.466666666666666e-07, "logits/chosen": -54969630.89655172, "logits/rejected": -89939763.2, "logps/chosen": -209.63872238685346, "logps/rejected": -114.52486746651786, "loss": 3.5587, "rewards/chosen": -0.03403720773499588, "rewards/margins": 0.36452140890318774, "rewards/rejected": -0.3985586166381836, "step": 98 }, { "epoch": 3.194331983805668, "grad_norm": 240.0, "kl": 0.0, "learning_rate": 6.533333333333333e-07, "logits/chosen": -53064105.29032258, "logits/rejected": -87360318.06060606, "logps/chosen": -158.5835433467742, "logps/rejected": -99.12943892045455, "loss": 3.7963, "rewards/chosen": -0.0369358524199455, "rewards/margins": 0.3706311718110115, "rewards/rejected": -0.40756702423095703, "step": 99 }, { "epoch": 3.2267206477732793, "grad_norm": 260.0, "kl": 0.0, "learning_rate": 6.6e-07, "logits/chosen": -60587337.696969695, "logits/rejected": -84534684.90322581, "logps/chosen": -145.5372869318182, "logps/rejected": -132.50162235383064, "loss": 3.7511, "rewards/chosen": -0.07979689222393614, "rewards/margins": 0.33600750929682366, "rewards/rejected": -0.4158044015207598, "step": 100 }, { "epoch": 3.2591093117408905, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 6.666666666666666e-07, "logits/chosen": -48986266.48275862, "logits/rejected": -92440115.2, "logps/chosen": -236.63941271551724, "logps/rejected": -117.69333147321429, "loss": 3.6876, "rewards/chosen": -0.025098778050521325, "rewards/margins": 0.450413263313876, "rewards/rejected": -0.47551204136439734, "step": 101 }, { "epoch": 3.291497975708502, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 6.733333333333333e-07, "logits/chosen": -71407063.57894737, "logits/rejected": -88328044.3076923, "logps/chosen": -181.23652086759867, "logps/rejected": -129.5223670372596, "loss": 3.7141, "rewards/chosen": -0.17501012902510793, "rewards/margins": 0.2789921046268602, "rewards/rejected": -0.4540022336519681, "step": 102 }, { "epoch": 3.3238866396761133, "grad_norm": 211.0, "kl": 0.0, "learning_rate": 6.800000000000001e-07, "logits/chosen": -67130056.3478261, "logits/rejected": -91396283.31707317, "logps/chosen": -143.13606063179347, "logps/rejected": -105.06101133765245, "loss": 3.569, "rewards/chosen": -0.23229586559793222, "rewards/margins": 0.279645811588615, "rewards/rejected": -0.5119416771865473, "step": 103 }, { "epoch": 3.3562753036437245, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 6.866666666666666e-07, "logits/chosen": -58641781.02857143, "logits/rejected": -86187016.8275862, "logps/chosen": -194.0300502232143, "logps/rejected": -122.5322686557112, "loss": 3.684, "rewards/chosen": -0.06369432040623256, "rewards/margins": 0.3952588793092173, "rewards/rejected": -0.4589531997154499, "step": 104 }, { "epoch": 3.388663967611336, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 6.933333333333333e-07, "logits/chosen": -58871300.571428575, "logits/rejected": -86522055.1111111, "logps/chosen": -283.37655203683033, "logps/rejected": -101.57166883680556, "loss": 3.6468, "rewards/chosen": -0.054577767848968506, "rewards/margins": 0.3544092509481642, "rewards/rejected": -0.40898701879713273, "step": 105 }, { "epoch": 3.4210526315789473, "grad_norm": 382.0, "kl": 0.0, "learning_rate": 7e-07, "logits/chosen": -65297189.925925925, "logits/rejected": -91777037.83783785, "logps/chosen": -251.05060040509258, "logps/rejected": -117.88727301520271, "loss": 3.648, "rewards/chosen": -0.15374413243046514, "rewards/margins": 0.36991781610865015, "rewards/rejected": -0.5236619485391153, "step": 106 }, { "epoch": 3.4534412955465585, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 7.066666666666666e-07, "logits/chosen": -51531144.53333333, "logits/rejected": -90403237.64705883, "logps/chosen": -255.04498697916668, "logps/rejected": -94.00696518841912, "loss": 3.644, "rewards/chosen": -0.0880549669265747, "rewards/margins": 0.4442929927040549, "rewards/rejected": -0.5323479596306296, "step": 107 }, { "epoch": 3.48582995951417, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 7.133333333333333e-07, "logits/chosen": -54512532.48, "logits/rejected": -92732192.82051282, "logps/chosen": -234.6769921875, "logps/rejected": -131.9256810897436, "loss": 3.7004, "rewards/chosen": -0.10780903816223145, "rewards/margins": 0.4583528266808926, "rewards/rejected": -0.566161864843124, "step": 108 }, { "epoch": 3.5182186234817814, "grad_norm": 204.0, "kl": 0.0, "learning_rate": 7.2e-07, "logits/chosen": -61560777.696969695, "logits/rejected": -96084892.90322581, "logps/chosen": -244.23498165246212, "logps/rejected": -126.09141885080645, "loss": 3.6479, "rewards/chosen": -0.1447492657285748, "rewards/margins": 0.4905286361977734, "rewards/rejected": -0.6352779019263483, "step": 109 }, { "epoch": 3.5506072874493926, "grad_norm": 225.0, "kl": 0.03981220722198486, "learning_rate": 7.266666666666667e-07, "logits/chosen": -63047613.93548387, "logits/rejected": -93004520.72727273, "logps/chosen": -175.2143082157258, "logps/rejected": -136.17847419507575, "loss": 3.8344, "rewards/chosen": -0.0766956344727547, "rewards/margins": 0.4934345692367032, "rewards/rejected": -0.5701302037094579, "step": 110 }, { "epoch": 3.582995951417004, "grad_norm": 208.0, "kl": 0.0, "learning_rate": 7.333333333333332e-07, "logits/chosen": -56503208.421052635, "logits/rejected": -89910852.92307693, "logps/chosen": -246.58493523848685, "logps/rejected": -119.18206317608173, "loss": 3.6194, "rewards/chosen": -0.11372504736247815, "rewards/margins": 0.40698333018221844, "rewards/rejected": -0.5207083775446966, "step": 111 }, { "epoch": 3.6153846153846154, "grad_norm": 182.0, "kl": 0.01825159788131714, "learning_rate": 7.4e-07, "logits/chosen": -64289088.0, "logits/rejected": -98815936.0, "logps/chosen": -251.60263061523438, "logps/rejected": -127.94086456298828, "loss": 3.6212, "rewards/chosen": -0.12197733670473099, "rewards/margins": 0.5449719354510307, "rewards/rejected": -0.6669492721557617, "step": 112 }, { "epoch": 3.6477732793522266, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 7.466666666666667e-07, "logits/chosen": -73170534.4, "logits/rejected": -89516879.44827586, "logps/chosen": -170.231640625, "logps/rejected": -131.79471982758622, "loss": 3.6429, "rewards/chosen": -0.34023783547537667, "rewards/margins": 0.27983285218036824, "rewards/rejected": -0.6200706876557449, "step": 113 }, { "epoch": 3.6801619433198383, "grad_norm": 182.0, "kl": 0.06295955181121826, "learning_rate": 7.533333333333332e-07, "logits/chosen": -65289175.578947365, "logits/rejected": -92120832.0, "logps/chosen": -209.48148386101974, "logps/rejected": -112.7288348858173, "loss": 3.6405, "rewards/chosen": -0.1560162619540566, "rewards/margins": 0.5349870239674803, "rewards/rejected": -0.6910032859215369, "step": 114 }, { "epoch": 3.7125506072874495, "grad_norm": 260.0, "kl": 0.0, "learning_rate": 7.599999999999999e-07, "logits/chosen": -65811022.76923077, "logits/rejected": -88804065.28, "logps/chosen": -153.22434645432693, "logps/rejected": -100.038125, "loss": 3.6606, "rewards/chosen": -0.15342459311852089, "rewards/margins": 0.4689330805264986, "rewards/rejected": -0.6223576736450195, "step": 115 }, { "epoch": 3.7449392712550607, "grad_norm": 210.0, "kl": 0.004709720611572266, "learning_rate": 7.666666666666667e-07, "logits/chosen": -68488384.0, "logits/rejected": -91338232.0, "logps/chosen": -173.6177978515625, "logps/rejected": -125.25968933105469, "loss": 3.6747, "rewards/chosen": -0.2703918218612671, "rewards/margins": 0.44467318058013916, "rewards/rejected": -0.7150650024414062, "step": 116 }, { "epoch": 3.7773279352226723, "grad_norm": 240.0, "kl": 0.0, "learning_rate": 7.733333333333333e-07, "logits/chosen": -63149522.823529415, "logits/rejected": -86811980.8, "logps/chosen": -218.0769473805147, "logps/rejected": -115.976513671875, "loss": 3.7105, "rewards/chosen": -0.10587411768296186, "rewards/margins": 0.5951634799732881, "rewards/rejected": -0.70103759765625, "step": 117 }, { "epoch": 3.8097165991902835, "grad_norm": 199.0, "kl": 0.17148053646087646, "learning_rate": 7.799999999999999e-07, "logits/chosen": -76564162.20689656, "logits/rejected": -92245716.11428571, "logps/chosen": -192.8898336476293, "logps/rejected": -132.41061662946427, "loss": 3.6601, "rewards/chosen": -0.40571768530483904, "rewards/margins": 0.2519464774672034, "rewards/rejected": -0.6576641627720424, "step": 118 }, { "epoch": 3.8421052631578947, "grad_norm": 199.0, "kl": 0.08587014675140381, "learning_rate": 7.866666666666666e-07, "logits/chosen": -59585784.0, "logits/rejected": -97084912.0, "logps/chosen": -257.84991455078125, "logps/rejected": -110.13160705566406, "loss": 3.735, "rewards/chosen": -0.1312781572341919, "rewards/margins": 0.6580657958984375, "rewards/rejected": -0.7893439531326294, "step": 119 }, { "epoch": 3.8744939271255063, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 7.933333333333333e-07, "logits/chosen": -56471552.0, "logits/rejected": -89928557.71428572, "logps/chosen": -247.12120225694446, "logps/rejected": -120.70945521763393, "loss": 3.6722, "rewards/chosen": -0.09145498275756836, "rewards/margins": 0.6324355261666434, "rewards/rejected": -0.7238905089242118, "step": 120 }, { "epoch": 3.9068825910931175, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 8e-07, "logits/chosen": -56484792.32, "logits/rejected": -95992260.92307693, "logps/chosen": -256.19287109375, "logps/rejected": -135.6700220352564, "loss": 3.5178, "rewards/chosen": -0.1614029312133789, "rewards/margins": 0.6778336950448843, "rewards/rejected": -0.8392366262582632, "step": 121 }, { "epoch": 3.9392712550607287, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 8.066666666666666e-07, "logits/chosen": -74744685.71428572, "logits/rejected": -92417739.03448276, "logps/chosen": -175.45368303571428, "logps/rejected": -124.31517712823276, "loss": 3.6676, "rewards/chosen": -0.20983006613595145, "rewards/margins": 0.49524325760714527, "rewards/rejected": -0.7050733237430967, "step": 122 }, { "epoch": 3.97165991902834, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 8.133333333333333e-07, "logits/chosen": -68318757.92592593, "logits/rejected": -94619371.24324325, "logps/chosen": -167.68947120949073, "logps/rejected": -122.23518000422297, "loss": 3.5317, "rewards/chosen": -0.21619266933865017, "rewards/margins": 0.551317100410347, "rewards/rejected": -0.7675097697489971, "step": 123 }, { "epoch": 4.0, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 8.199999999999999e-07, "logits/chosen": -54672983.578947365, "logits/rejected": -91851993.04347827, "logps/chosen": -131.13337787828948, "logps/rejected": -109.7916843580163, "loss": 3.6239, "rewards/chosen": -0.33125076795879166, "rewards/margins": 0.38969243418433847, "rewards/rejected": -0.7209432021431301, "step": 124 }, { "epoch": 4.032388663967612, "grad_norm": 228.0, "kl": 0.0, "learning_rate": 8.266666666666667e-07, "logits/chosen": -58974637.176470585, "logits/rejected": -89414613.33333333, "logps/chosen": -212.5978573069853, "logps/rejected": -100.42489420572916, "loss": 3.6389, "rewards/chosen": -0.18168081956751206, "rewards/margins": 0.43748479169957777, "rewards/rejected": -0.6191656112670898, "step": 125 }, { "epoch": 4.064777327935222, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 8.333333333333333e-07, "logits/chosen": -63364999.52941176, "logits/rejected": -81994555.73333333, "logps/chosen": -225.94140625, "logps/rejected": -101.82618001302083, "loss": 3.5936, "rewards/chosen": -0.19911931542789235, "rewards/margins": 0.35033480139339673, "rewards/rejected": -0.5494541168212891, "step": 126 }, { "epoch": 4.097165991902834, "grad_norm": 223.0, "kl": 0.0, "learning_rate": 8.399999999999999e-07, "logits/chosen": -64454229.333333336, "logits/rejected": -96394345.41176471, "logps/chosen": -218.516796875, "logps/rejected": -119.05093204273896, "loss": 3.5279, "rewards/chosen": -0.14283286730448405, "rewards/margins": 0.8100145573709525, "rewards/rejected": -0.9528474246754366, "step": 127 }, { "epoch": 4.129554655870446, "grad_norm": 227.0, "kl": 0.0, "learning_rate": 8.466666666666667e-07, "logits/chosen": -63847345.23076923, "logits/rejected": -91306946.56, "logps/chosen": -236.07965244391025, "logps/rejected": -111.72240234375, "loss": 3.8, "rewards/chosen": -0.1613748990572416, "rewards/margins": 0.5901656282865084, "rewards/rejected": -0.75154052734375, "step": 128 }, { "epoch": 4.161943319838056, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 8.533333333333334e-07, "logits/chosen": -55260376.27586207, "logits/rejected": -91079870.17142858, "logps/chosen": -210.36469558189654, "logps/rejected": -118.82433035714286, "loss": 3.4237, "rewards/chosen": -0.10663683661099138, "rewards/margins": 0.7218695241242206, "rewards/rejected": -0.828506360735212, "step": 129 }, { "epoch": 4.194331983805668, "grad_norm": 239.0, "kl": 0.0, "learning_rate": 8.599999999999999e-07, "logits/chosen": -53269929.29032258, "logits/rejected": -88130901.33333333, "logps/chosen": -158.97380607358872, "logps/rejected": -103.46803977272727, "loss": 3.6842, "rewards/chosen": -0.07596483538227697, "rewards/margins": 0.765463054354939, "rewards/rejected": -0.8414278897372159, "step": 130 }, { "epoch": 4.22672064777328, "grad_norm": 224.0, "kl": 0.0, "learning_rate": 8.666666666666667e-07, "logits/chosen": -61057683.39393939, "logits/rejected": -85674710.70967741, "logps/chosen": -146.51667554450756, "logps/rejected": -136.4623550907258, "loss": 3.6105, "rewards/chosen": -0.1777365135424065, "rewards/margins": 0.6341409524747936, "rewards/rejected": -0.8118774660172001, "step": 131 }, { "epoch": 4.2591093117408905, "grad_norm": 185.0, "kl": 0.0, "learning_rate": 8.733333333333333e-07, "logits/chosen": -49094801.655172415, "logits/rejected": -93466389.94285715, "logps/chosen": -237.12252491918105, "logps/rejected": -122.34063895089285, "loss": 3.5546, "rewards/chosen": -0.07341302674392174, "rewards/margins": 0.8668295226073618, "rewards/rejected": -0.9402425493512835, "step": 132 }, { "epoch": 4.291497975708502, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 8.799999999999999e-07, "logits/chosen": -71895033.26315789, "logits/rejected": -89686035.6923077, "logps/chosen": -182.9316277754934, "logps/rejected": -134.27398212139423, "loss": 3.5952, "rewards/chosen": -0.3445196402700324, "rewards/margins": 0.5846443658898233, "rewards/rejected": -0.9291640061598557, "step": 133 }, { "epoch": 4.323886639676114, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 8.866666666666667e-07, "logits/chosen": -67607440.69565217, "logits/rejected": -92422218.92682926, "logps/chosen": -145.24899159307066, "logps/rejected": -109.93842892530488, "loss": 3.4658, "rewards/chosen": -0.44358871294104535, "rewards/margins": 0.5560945741445111, "rewards/rejected": -0.9996832870855564, "step": 134 }, { "epoch": 4.3562753036437245, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 8.933333333333333e-07, "logits/chosen": -58873164.8, "logits/rejected": -87104247.1724138, "logps/chosen": -194.96414620535714, "logps/rejected": -126.86558795797414, "loss": 3.5809, "rewards/chosen": -0.15710413796561104, "rewards/margins": 0.7351811103632885, "rewards/rejected": -0.8922852483288995, "step": 135 }, { "epoch": 4.388663967611336, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 9e-07, "logits/chosen": -59124073.14285714, "logits/rejected": -87515932.44444445, "logps/chosen": -284.48231724330356, "logps/rejected": -105.84238009982639, "loss": 3.5378, "rewards/chosen": -0.16515704563685826, "rewards/margins": 0.6709008822365412, "rewards/rejected": -0.8360579278733995, "step": 136 }, { "epoch": 4.421052631578947, "grad_norm": 316.0, "kl": 0.0, "learning_rate": 9.066666666666665e-07, "logits/chosen": -65584450.37037037, "logits/rejected": -93110797.83783785, "logps/chosen": -252.453125, "logps/rejected": -122.6502586570946, "loss": 3.5077, "rewards/chosen": -0.2939942677815755, "rewards/margins": 0.7059658325470246, "rewards/rejected": -0.9999601003286, "step": 137 }, { "epoch": 4.4534412955465585, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 9.133333333333333e-07, "logits/chosen": -51863415.46666667, "logits/rejected": -91196197.64705883, "logps/chosen": -255.50361328125, "logps/rejected": -98.63608685661765, "loss": 3.4997, "rewards/chosen": -0.13391879399617512, "rewards/margins": 0.8613416690452426, "rewards/rejected": -0.9952604630414177, "step": 138 }, { "epoch": 4.48582995951417, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 9.2e-07, "logits/chosen": -54972656.64, "logits/rejected": -93798885.74358974, "logps/chosen": -236.46029296875, "logps/rejected": -136.74373998397436, "loss": 3.5652, "rewards/chosen": -0.28614009857177736, "rewards/margins": 0.7618288392287035, "rewards/rejected": -1.0479689378004808, "step": 139 }, { "epoch": 4.518218623481781, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.266666666666665e-07, "logits/chosen": -62021950.06060606, "logits/rejected": -97421402.83870968, "logps/chosen": -245.36388790246212, "logps/rejected": -131.17503307711692, "loss": 3.5174, "rewards/chosen": -0.2576433528553356, "rewards/margins": 0.8859953810392588, "rewards/rejected": -1.1436387338945944, "step": 140 }, { "epoch": 4.550607287449393, "grad_norm": 230.0, "kl": 0.0, "learning_rate": 9.333333333333333e-07, "logits/chosen": -63452969.29032258, "logits/rejected": -93965459.39393939, "logps/chosen": -175.92149697580646, "logps/rejected": -140.64895537405303, "loss": 3.6806, "rewards/chosen": -0.14741454585905997, "rewards/margins": 0.8697651166720125, "rewards/rejected": -1.0171796625310725, "step": 141 }, { "epoch": 4.582995951417004, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 9.399999999999999e-07, "logits/chosen": -57006187.78947368, "logits/rejected": -90852352.0, "logps/chosen": -247.59189967105263, "logps/rejected": -123.01950307992789, "loss": 3.534, "rewards/chosen": -0.21442169892160515, "rewards/margins": 0.6900306983515319, "rewards/rejected": -0.904452397273137, "step": 142 }, { "epoch": 4.615384615384615, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 9.466666666666666e-07, "logits/chosen": -64618232.0, "logits/rejected": -100268952.0, "logps/chosen": -252.03915405273438, "logps/rejected": -133.17811584472656, "loss": 3.5107, "rewards/chosen": -0.16562682390213013, "rewards/margins": 1.025047481060028, "rewards/rejected": -1.1906743049621582, "step": 143 }, { "epoch": 4.647773279352227, "grad_norm": 230.0, "kl": 0.0, "learning_rate": 9.533333333333333e-07, "logits/chosen": -73837048.68571429, "logits/rejected": -90723380.96551724, "logps/chosen": -172.4505580357143, "logps/rejected": -135.97361597521552, "loss": 3.5333, "rewards/chosen": -0.5621269771030971, "rewards/margins": 0.4758335357816349, "rewards/rejected": -1.037960512884732, "step": 144 }, { "epoch": 4.680161943319838, "grad_norm": 174.0, "kl": 0.07423794269561768, "learning_rate": 9.6e-07, "logits/chosen": -65844446.315789476, "logits/rejected": -93227086.76923077, "logps/chosen": -210.4194464432566, "logps/rejected": -117.88577974759616, "loss": 3.5285, "rewards/chosen": -0.249812828867059, "rewards/margins": 0.9568866891899571, "rewards/rejected": -1.2066995180570161, "step": 145 }, { "epoch": 4.712550607287449, "grad_norm": 240.0, "kl": 0.0, "learning_rate": 9.666666666666666e-07, "logits/chosen": -66073245.538461536, "logits/rejected": -90012160.0, "logps/chosen": -154.33119240785257, "logps/rejected": -104.64640625, "loss": 3.536, "rewards/chosen": -0.26410897572835285, "rewards/margins": 0.8190776697794597, "rewards/rejected": -1.0831866455078125, "step": 146 }, { "epoch": 4.744939271255061, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 9.733333333333333e-07, "logits/chosen": -68960184.0, "logits/rejected": -92516768.0, "logps/chosen": -174.9327392578125, "logps/rejected": -130.5812225341797, "loss": 3.5352, "rewards/chosen": -0.4018847942352295, "rewards/margins": 0.845333456993103, "rewards/rejected": -1.2472182512283325, "step": 147 }, { "epoch": 4.777327935222672, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 9.8e-07, "logits/chosen": -63528101.64705882, "logits/rejected": -87971114.66666667, "logps/chosen": -218.85384593290442, "logps/rejected": -120.32495930989583, "loss": 3.6012, "rewards/chosen": -0.18356499952428482, "rewards/margins": 0.952316551582486, "rewards/rejected": -1.1358815511067708, "step": 148 }, { "epoch": 4.809716599190283, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 9.866666666666666e-07, "logits/chosen": -77304849.65517241, "logits/rejected": -93613787.42857143, "logps/chosen": -194.98279229525863, "logps/rejected": -136.35209263392858, "loss": 3.531, "rewards/chosen": -0.6150127937053812, "rewards/margins": 0.43679809758228616, "rewards/rejected": -1.0518108912876674, "step": 149 }, { "epoch": 4.842105263157895, "grad_norm": 202.0, "kl": 0.1953182816505432, "learning_rate": 9.933333333333333e-07, "logits/chosen": -59967328.0, "logits/rejected": -98624440.0, "logps/chosen": -258.19989013671875, "logps/rejected": -115.43824768066406, "loss": 3.5744, "rewards/chosen": -0.16627463698387146, "rewards/margins": 1.1537322103977203, "rewards/rejected": -1.3200068473815918, "step": 150 }, { "epoch": 4.874493927125506, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 1e-06, "logits/chosen": -56974087.11111111, "logits/rejected": -91352310.85714285, "logps/chosen": -247.51087782118054, "logps/rejected": -125.28658621651786, "loss": 3.553, "rewards/chosen": -0.13042267163594565, "rewards/margins": 1.0511799426305863, "rewards/rejected": -1.181602614266532, "step": 151 }, { "epoch": 4.906882591093117, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 9.999888302765345e-07, "logits/chosen": -56981550.08, "logits/rejected": -97229817.43589744, "logps/chosen": -257.08314453125, "logps/rejected": -141.33010316506412, "loss": 3.384, "rewards/chosen": -0.25043046951293946, "rewards/margins": 1.1548138197874414, "rewards/rejected": -1.4052442893003807, "step": 152 }, { "epoch": 4.939271255060729, "grad_norm": 220.0, "kl": 0.0, "learning_rate": 9.99955321605189e-07, "logits/chosen": -75437582.62857144, "logits/rejected": -94106120.8275862, "logps/chosen": -176.31854073660713, "logps/rejected": -129.12183459051724, "loss": 3.5434, "rewards/chosen": -0.2963148662022182, "rewards/margins": 0.8894236644500582, "rewards/rejected": -1.1857385306522763, "step": 153 }, { "epoch": 4.97165991902834, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 9.998994754830939e-07, "logits/chosen": -68836622.22222222, "logits/rejected": -95785305.94594595, "logps/chosen": -169.1109302662037, "logps/rejected": -126.82097233952703, "loss": 3.3998, "rewards/chosen": -0.3583377555564598, "rewards/margins": 0.8677524436820854, "rewards/rejected": -1.2260901992385451, "step": 154 }, { "epoch": 5.0, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 9.998212944053918e-07, "logits/chosen": -55163964.631578945, "logits/rejected": -93045504.0, "logps/chosen": -132.9926629317434, "logps/rejected": -114.3027980638587, "loss": 3.4854, "rewards/chosen": -0.517180041262978, "rewards/margins": 0.654873586082895, "rewards/rejected": -1.172053627345873, "step": 155 }, { "epoch": 5.032388663967612, "grad_norm": 235.0, "kl": 0.0, "learning_rate": 9.997207818651273e-07, "logits/chosen": -59630369.88235294, "logits/rejected": -90848819.2, "logps/chosen": -213.7499281939338, "logps/rejected": -103.99580078125, "loss": 3.5373, "rewards/chosen": -0.29688773435704846, "rewards/margins": 0.6793691448136872, "rewards/rejected": -0.9762568791707357, "step": 156 }, { "epoch": 5.064777327935222, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 9.995979423530892e-07, "logits/chosen": -64028664.47058824, "logits/rejected": -83074372.26666667, "logps/chosen": -227.1330135569853, "logps/rejected": -104.895263671875, "loss": 3.4498, "rewards/chosen": -0.31828005173627066, "rewards/margins": 0.5380826089896409, "rewards/rejected": -0.8563626607259115, "step": 157 }, { "epoch": 5.097165991902834, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 9.99452781357611e-07, "logits/chosen": -65066653.86666667, "logits/rejected": -98157778.8235294, "logps/chosen": -219.489990234375, "logps/rejected": -124.0337775735294, "loss": 3.4166, "rewards/chosen": -0.2401509443918864, "rewards/margins": 1.2109819982566086, "rewards/rejected": -1.451132942648495, "step": 158 }, { "epoch": 5.129554655870446, "grad_norm": 232.0, "kl": 0.0, "learning_rate": 9.992853053643257e-07, "logits/chosen": -64405714.05128205, "logits/rejected": -92928952.32, "logps/chosen": -236.69583834134616, "logps/rejected": -115.80888671875, "loss": 3.6956, "rewards/chosen": -0.22299350836338142, "rewards/margins": 0.9371969213241187, "rewards/rejected": -1.1601904296875, "step": 159 }, { "epoch": 5.161943319838056, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 9.99095521855875e-07, "logits/chosen": -55735816.827586204, "logits/rejected": -92454282.97142857, "logps/chosen": -210.5728380926724, "logps/rejected": -123.35432477678572, "loss": 3.3245, "rewards/chosen": -0.12744968512962604, "rewards/margins": 1.1540555726131194, "rewards/rejected": -1.2815052577427455, "step": 160 }, { "epoch": 5.194331983805668, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 9.988834393115767e-07, "logits/chosen": -53546334.96774194, "logits/rejected": -89768354.9090909, "logps/chosen": -159.03936176915323, "logps/rejected": -108.08769087357955, "loss": 3.5483, "rewards/chosen": -0.08251856219383978, "rewards/margins": 1.2208744838673582, "rewards/rejected": -1.303393046061198, "step": 161 }, { "epoch": 5.22672064777328, "grad_norm": 226.0, "kl": 0.0, "learning_rate": 9.986490672070435e-07, "logits/chosen": -61786100.36363637, "logits/rejected": -86724954.83870968, "logps/chosen": -147.48764500473484, "logps/rejected": -141.09959362399192, "loss": 3.4577, "rewards/chosen": -0.27483367919921875, "rewards/margins": 1.0007670002598916, "rewards/rejected": -1.2756006794591104, "step": 162 }, { "epoch": 5.2591093117408905, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 9.983924160137624e-07, "logits/chosen": -49868773.51724138, "logits/rejected": -94799111.31428571, "logps/chosen": -237.3065396012931, "logps/rejected": -127.19659598214285, "loss": 3.4325, "rewards/chosen": -0.0918145755241657, "rewards/margins": 1.3340229140126647, "rewards/rejected": -1.4258374895368304, "step": 163 }, { "epoch": 5.291497975708502, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 9.981134971986247e-07, "logits/chosen": -72578391.57894737, "logits/rejected": -91101213.53846154, "logps/chosen": -184.6579718338816, "logps/rejected": -139.31804950420673, "loss": 3.493, "rewards/chosen": -0.5171534387688888, "rewards/margins": 0.9164171296092662, "rewards/rejected": -1.433570568378155, "step": 164 }, { "epoch": 5.323886639676114, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 9.978123232234146e-07, "logits/chosen": -68418910.60869566, "logits/rejected": -93767867.31707317, "logps/chosen": -147.5511527683424, "logps/rejected": -114.96096131859755, "loss": 3.3151, "rewards/chosen": -0.6738060660984205, "rewards/margins": 0.8281306836157564, "rewards/rejected": -1.501936749714177, "step": 165 }, { "epoch": 5.3562753036437245, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 9.97488907544252e-07, "logits/chosen": -59543266.74285714, "logits/rejected": -88190243.31034483, "logps/chosen": -195.37498604910715, "logps/rejected": -131.32277074353448, "loss": 3.4582, "rewards/chosen": -0.19818906784057616, "rewards/margins": 1.1398148142058273, "rewards/rejected": -1.3380038820464035, "step": 166 }, { "epoch": 5.388663967611336, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.971432646109917e-07, "logits/chosen": -59788562.28571428, "logits/rejected": -88785742.22222222, "logps/chosen": -284.75272042410717, "logps/rejected": -109.71605088975694, "loss": 3.4192, "rewards/chosen": -0.1921968970979963, "rewards/margins": 1.0312278024734012, "rewards/rejected": -1.2234246995713975, "step": 167 }, { "epoch": 5.421052631578947, "grad_norm": 232.0, "kl": 0.0, "learning_rate": 9.967754098665778e-07, "logits/chosen": -66138832.59259259, "logits/rejected": -94453497.08108108, "logps/chosen": -253.72931134259258, "logps/rejected": -126.71872360641892, "loss": 3.4038, "rewards/chosen": -0.42161104414198136, "rewards/margins": 0.9851942520599823, "rewards/rejected": -1.4068052962019637, "step": 168 }, { "epoch": 5.4534412955465585, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 9.963853597463532e-07, "logits/chosen": -52095091.2, "logits/rejected": -92561558.58823529, "logps/chosen": -256.04205729166665, "logps/rejected": -103.12436810661765, "loss": 3.3497, "rewards/chosen": -0.18776068687438965, "rewards/margins": 1.2563272953033446, "rewards/rejected": -1.4440879821777344, "step": 169 }, { "epoch": 5.48582995951417, "grad_norm": 212.0, "kl": 0.0, "learning_rate": 9.959731316773258e-07, "logits/chosen": -55681187.84, "logits/rejected": -95117896.20512821, "logps/chosen": -237.82603515625, "logps/rejected": -141.01879256810898, "loss": 3.4183, "rewards/chosen": -0.42271514892578127, "rewards/margins": 1.0527576192220052, "rewards/rejected": -1.4754727681477864, "step": 170 }, { "epoch": 5.518218623481781, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 9.9553874407739e-07, "logits/chosen": -62956140.60606061, "logits/rejected": -98763990.70967741, "logps/chosen": -246.2182765151515, "logps/rejected": -135.6988722278226, "loss": 3.3954, "rewards/chosen": -0.34307841098669806, "rewards/margins": 1.2529449024741135, "rewards/rejected": -1.5960233134608115, "step": 171 }, { "epoch": 5.550607287449393, "grad_norm": 242.0, "kl": 0.02922797203063965, "learning_rate": 9.95082216354503e-07, "logits/chosen": -63656968.258064516, "logits/rejected": -95357548.60606061, "logps/chosen": -176.2975050403226, "logps/rejected": -144.93167021780303, "loss": 3.5123, "rewards/chosen": -0.18501430942166236, "rewards/margins": 1.2604359820674476, "rewards/rejected": -1.4454502914891099, "step": 172 }, { "epoch": 5.582995951417004, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 9.946035689058187e-07, "logits/chosen": -57274246.7368421, "logits/rejected": -92300780.3076923, "logps/chosen": -248.19785670230263, "logps/rejected": -126.5517108623798, "loss": 3.4008, "rewards/chosen": -0.2750188676934493, "rewards/margins": 0.9826543958563554, "rewards/rejected": -1.2576732635498047, "step": 173 }, { "epoch": 5.615384615384615, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 9.941028231167755e-07, "logits/chosen": -65106028.0, "logits/rejected": -101820640.0, "logps/chosen": -252.35142517089844, "logps/rejected": -138.0707550048828, "loss": 3.3482, "rewards/chosen": -0.1968565285205841, "rewards/margins": 1.4830813109874725, "rewards/rejected": -1.6799378395080566, "step": 174 }, { "epoch": 5.647773279352227, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.935800013601413e-07, "logits/chosen": -74467861.94285715, "logits/rejected": -92196722.7586207, "logps/chosen": -174.39148995535714, "logps/rejected": -139.74016702586206, "loss": 3.4035, "rewards/chosen": -0.7562218257359096, "rewards/margins": 0.6583934107437509, "rewards/rejected": -1.4146152364796605, "step": 175 }, { "epoch": 5.680161943319838, "grad_norm": 176.0, "kl": 0.25683772563934326, "learning_rate": 9.930351269950143e-07, "logits/chosen": -66604469.89473684, "logits/rejected": -94476376.61538461, "logps/chosen": -211.37924033717104, "logps/rejected": -122.16629732572116, "loss": 3.3942, "rewards/chosen": -0.34579031090987356, "rewards/margins": 1.288960348739315, "rewards/rejected": -1.6347506596491888, "step": 176 }, { "epoch": 5.712550607287449, "grad_norm": 236.0, "kl": 0.0, "learning_rate": 9.924682243657778e-07, "logits/chosen": -66897624.615384616, "logits/rejected": -91426672.64, "logps/chosen": -155.15265675080127, "logps/rejected": -108.669345703125, "loss": 3.4077, "rewards/chosen": -0.3462566718077048, "rewards/margins": 1.139224437811436, "rewards/rejected": -1.4854811096191407, "step": 177 }, { "epoch": 5.744939271255061, "grad_norm": 203.0, "kl": 0.04345065355300903, "learning_rate": 9.918793188010146e-07, "logits/chosen": -69631032.0, "logits/rejected": -93599952.0, "logps/chosen": -176.5198516845703, "logps/rejected": -135.20755004882812, "loss": 3.4123, "rewards/chosen": -0.5605951547622681, "rewards/margins": 1.1492563486099243, "rewards/rejected": -1.7098515033721924, "step": 178 }, { "epoch": 5.777327935222672, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 9.91268436612374e-07, "logits/chosen": -64168478.11764706, "logits/rejected": -89267072.0, "logps/chosen": -219.43959673713235, "logps/rejected": -124.466162109375, "loss": 3.4547, "rewards/chosen": -0.24213720770443187, "rewards/margins": 1.3078640638613233, "rewards/rejected": -1.5500012715657552, "step": 179 }, { "epoch": 5.809716599190283, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 9.906356050933962e-07, "logits/chosen": -78053764.4137931, "logits/rejected": -94502816.91428572, "logps/chosen": -197.25845231681035, "logps/rejected": -140.19140625, "loss": 3.3955, "rewards/chosen": -0.8425777040678879, "rewards/margins": 0.5931662272937193, "rewards/rejected": -1.4357439313616072, "step": 180 }, { "epoch": 5.842105263157895, "grad_norm": 202.0, "kl": 0.2528773546218872, "learning_rate": 9.899808525182934e-07, "logits/chosen": -60465256.0, "logits/rejected": -100177120.0, "logps/chosen": -258.4275817871094, "logps/rejected": -119.31683349609375, "loss": 3.4679, "rewards/chosen": -0.18904456496238708, "rewards/margins": 1.5188209116458893, "rewards/rejected": -1.7078654766082764, "step": 181 }, { "epoch": 5.874493927125506, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 9.893042081406867e-07, "logits/chosen": -57508433.777777776, "logits/rejected": -92387894.85714285, "logps/chosen": -247.75748697916666, "logps/rejected": -129.10232979910714, "loss": 3.436, "rewards/chosen": -0.15508422586652967, "rewards/margins": 1.408093700333247, "rewards/rejected": -1.5631779261997767, "step": 182 }, { "epoch": 5.906882591093117, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 9.886057021922982e-07, "logits/chosen": -57409761.28, "logits/rejected": -98912426.66666667, "logps/chosen": -257.6034375, "logps/rejected": -145.91838191105768, "loss": 3.2073, "rewards/chosen": -0.302458610534668, "rewards/margins": 1.5616145314925756, "rewards/rejected": -1.8640731420272436, "step": 183 }, { "epoch": 5.939271255060729, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 9.878853658816013e-07, "logits/chosen": -75604348.34285714, "logits/rejected": -95449511.72413793, "logps/chosen": -177.4068638392857, "logps/rejected": -132.91231142241378, "loss": 3.4123, "rewards/chosen": -0.4051459176199777, "rewards/margins": 1.159641190702692, "rewards/rejected": -1.5647871083226697, "step": 184 }, { "epoch": 5.97165991902834, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 9.871432313924253e-07, "logits/chosen": -69507555.55555555, "logits/rejected": -96964469.62162162, "logps/chosen": -169.9286205150463, "logps/rejected": -130.19913956925674, "loss": 3.2837, "rewards/chosen": -0.440108440540455, "rewards/margins": 1.1237977393515952, "rewards/rejected": -1.5639061798920502, "step": 185 }, { "epoch": 6.0, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 9.863793318825186e-07, "logits/chosen": -55615016.421052635, "logits/rejected": -94030836.86956522, "logps/chosen": -134.31357935855263, "logps/rejected": -117.63295049252717, "loss": 3.3731, "rewards/chosen": -0.649271663866545, "rewards/margins": 0.8557975712302621, "rewards/rejected": -1.5050692350968071, "step": 186 }, { "epoch": 6.032388663967612, "grad_norm": 260.0, "kl": 0.0, "learning_rate": 9.85593701482066e-07, "logits/chosen": -60041626.35294118, "logits/rejected": -91988292.26666667, "logps/chosen": -214.28821518841912, "logps/rejected": -106.65919596354166, "loss": 3.3927, "rewards/chosen": -0.35071389815386605, "rewards/margins": 0.8918824102364336, "rewards/rejected": -1.2425963083902996, "step": 187 }, { "epoch": 6.064777327935222, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 9.847863752921648e-07, "logits/chosen": -64591141.64705882, "logits/rejected": -84054843.73333333, "logps/chosen": -227.66990751378677, "logps/rejected": -106.8339599609375, "loss": 3.3231, "rewards/chosen": -0.3719707657309139, "rewards/margins": 0.6782620579588647, "rewards/rejected": -1.0502328236897787, "step": 188 }, { "epoch": 6.097165991902834, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 9.839573893832563e-07, "logits/chosen": -65403413.333333336, "logits/rejected": -99308649.41176471, "logps/chosen": -219.85328776041666, "logps/rejected": -127.46609317555146, "loss": 3.2881, "rewards/chosen": -0.2764817555745443, "rewards/margins": 1.5178822760488473, "rewards/rejected": -1.7943640316233915, "step": 189 }, { "epoch": 6.129554655870446, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 9.831067807935138e-07, "logits/chosen": -64863100.71794872, "logits/rejected": -94363463.68, "logps/chosen": -236.728515625, "logps/rejected": -119.330205078125, "loss": 3.5543, "rewards/chosen": -0.22626064985226363, "rewards/margins": 1.2860617378430488, "rewards/rejected": -1.5123223876953125, "step": 190 }, { "epoch": 6.161943319838056, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 9.822345875271883e-07, "logits/chosen": -55811464.827586204, "logits/rejected": -93839147.88571429, "logps/chosen": -210.44074959590517, "logps/rejected": -126.24461495535714, "loss": 3.1873, "rewards/chosen": -0.11424061347698343, "rewards/margins": 1.4562925940076707, "rewards/rejected": -1.570533207484654, "step": 191 }, { "epoch": 6.194331983805668, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 9.8134084855291e-07, "logits/chosen": -54031000.77419355, "logits/rejected": -90785093.81818181, "logps/chosen": -158.9200478830645, "logps/rejected": -110.96604225852273, "loss": 3.3842, "rewards/chosen": -0.07058828107772334, "rewards/margins": 1.5206403196265852, "rewards/rejected": -1.5912286007043086, "step": 192 }, { "epoch": 6.22672064777328, "grad_norm": 215.0, "kl": 0.0, "learning_rate": 9.804256038019481e-07, "logits/chosen": -62475213.57575758, "logits/rejected": -87352278.70967741, "logps/chosen": -148.17927320075756, "logps/rejected": -143.75623739919354, "loss": 3.3578, "rewards/chosen": -0.3439967126557321, "rewards/margins": 1.197268713487791, "rewards/rejected": -1.5412654261435232, "step": 193 }, { "epoch": 6.2591093117408905, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 9.794888941664253e-07, "logits/chosen": -50114127.448275864, "logits/rejected": -95931677.25714286, "logps/chosen": -237.2588900862069, "logps/rejected": -130.54501953125, "loss": 3.3044, "rewards/chosen": -0.08704933626898403, "rewards/margins": 1.6736307261612615, "rewards/rejected": -1.7606800624302454, "step": 194 }, { "epoch": 6.291497975708502, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 9.78530761497492e-07, "logits/chosen": -72950359.57894737, "logits/rejected": -92061430.15384616, "logps/chosen": -185.59851716694078, "logps/rejected": -142.77226374699518, "loss": 3.3508, "rewards/chosen": -0.6112096686112253, "rewards/margins": 1.1677829109222784, "rewards/rejected": -1.7789925795335035, "step": 195 }, { "epoch": 6.323886639676114, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 9.77551248603456e-07, "logits/chosen": -69183944.3478261, "logits/rejected": -94778099.51219513, "logps/chosen": -149.1656547214674, "logps/rejected": -117.63174066310975, "loss": 3.1596, "rewards/chosen": -0.8352551667586617, "rewards/margins": 0.9337595174067347, "rewards/rejected": -1.7690146841653964, "step": 196 }, { "epoch": 6.3562753036437245, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 9.765503992478703e-07, "logits/chosen": -60052831.08571429, "logits/rejected": -89278146.20689656, "logps/chosen": -195.72306082589284, "logps/rejected": -133.82958984375, "loss": 3.349, "rewards/chosen": -0.2329949242728097, "rewards/margins": 1.3556903416300055, "rewards/rejected": -1.5886852659028152, "step": 197 }, { "epoch": 6.388663967611336, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 9.755282581475767e-07, "logits/chosen": -60195122.28571428, "logits/rejected": -89645120.0, "logps/chosen": -285.11851283482144, "logps/rejected": -112.12190755208333, "loss": 3.3204, "rewards/chosen": -0.2287752287728446, "rewards/margins": 1.2352361754765586, "rewards/rejected": -1.4640114042494032, "step": 198 }, { "epoch": 6.421052631578947, "grad_norm": 214.0, "kl": 0.0, "learning_rate": 9.74484870970709e-07, "logits/chosen": -66635742.81481481, "logits/rejected": -95713736.64864865, "logps/chosen": -254.17954282407408, "logps/rejected": -129.53260926942568, "loss": 3.273, "rewards/chosen": -0.4666371875339084, "rewards/margins": 1.221557705967992, "rewards/rejected": -1.6881948935019004, "step": 199 }, { "epoch": 6.4534412955465585, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 9.73420284334652e-07, "logits/chosen": -52548770.13333333, "logits/rejected": -93875373.1764706, "logps/chosen": -255.763134765625, "logps/rejected": -105.7856086282169, "loss": 3.2525, "rewards/chosen": -0.15987102190653482, "rewards/margins": 1.5503407880371691, "rewards/rejected": -1.710211809943704, "step": 200 }, { "epoch": 6.4534412955465585, "eval_kl": 0.0, "eval_logits/chosen": -77142447.04934542, "eval_logits/rejected": -115986221.20686175, "eval_logps/chosen": -211.08607124874118, "eval_logps/rejected": -128.5555073789102, "eval_loss": 0.29369857907295227, "eval_rewards/chosen": -0.37724699114504656, "eval_rewards/margins": 1.3236440359109576, "eval_rewards/rejected": -1.700891027056004, "eval_runtime": 64.2138, "eval_samples_per_second": 30.694, "eval_steps_per_second": 0.966, "step": 200 }, { "epoch": 6.48582995951417, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 9.723345458039593e-07, "logits/chosen": -56222848.0, "logits/rejected": -96234167.79487179, "logps/chosen": -238.5019140625, "logps/rejected": -143.9195337540064, "loss": 3.2819, "rewards/chosen": -0.4902991485595703, "rewards/margins": 1.2752472099891077, "rewards/rejected": -1.765546358548678, "step": 201 }, { "epoch": 6.518218623481781, "grad_norm": 163.0, "kl": 0.0, "learning_rate": 9.712277038882273e-07, "logits/chosen": -63160448.0, "logits/rejected": -100012098.06451613, "logps/chosen": -246.70954663825756, "logps/rejected": -138.87134576612902, "loss": 3.2747, "rewards/chosen": -0.39220518054384174, "rewards/margins": 1.5210654621366648, "rewards/rejected": -1.9132706426805066, "step": 202 }, { "epoch": 6.550607287449393, "grad_norm": 240.0, "kl": 0.0, "learning_rate": 9.700998080399285e-07, "logits/chosen": -64144788.64516129, "logits/rejected": -96506104.24242425, "logps/chosen": -176.5855909778226, "logps/rejected": -147.65744850852272, "loss": 3.3754, "rewards/chosen": -0.21382254938925466, "rewards/margins": 1.5042050856649, "rewards/rejected": -1.7180276350541548, "step": 203 }, { "epoch": 6.582995951417004, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 9.689509086522018e-07, "logits/chosen": -57705162.10526316, "logits/rejected": -93512251.07692307, "logps/chosen": -248.44014699835526, "logps/rejected": -128.82718599759616, "loss": 3.2803, "rewards/chosen": -0.2992460602208188, "rewards/margins": 1.18597580257215, "rewards/rejected": -1.4852218627929688, "step": 204 }, { "epoch": 6.615384615384615, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 9.67781057056601e-07, "logits/chosen": -65230268.0, "logits/rejected": -103285960.0, "logps/chosen": -252.68380737304688, "logps/rejected": -141.1610107421875, "loss": 3.2449, "rewards/chosen": -0.23009414970874786, "rewards/margins": 1.758869931101799, "rewards/rejected": -1.9889640808105469, "step": 205 }, { "epoch": 6.647773279352227, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 9.665903055208012e-07, "logits/chosen": -75168343.77142857, "logits/rejected": -93463534.34482759, "logps/chosen": -175.55613839285715, "logps/rejected": -142.09524851831895, "loss": 3.2832, "rewards/chosen": -0.8726874760219029, "rewards/margins": 0.7774363832520733, "rewards/rejected": -1.6501238592739762, "step": 206 }, { "epoch": 6.680161943319838, "grad_norm": 174.0, "kl": 0.27339088916778564, "learning_rate": 9.653787072462643e-07, "logits/chosen": -67068530.526315786, "logits/rejected": -95274289.23076923, "logps/chosen": -211.9501310649671, "logps/rejected": -124.63771409254808, "loss": 3.2769, "rewards/chosen": -0.402879840449283, "rewards/margins": 1.4790127306331988, "rewards/rejected": -1.8818925710824819, "step": 207 }, { "epoch": 6.712550607287449, "grad_norm": 231.0, "kl": 0.0, "learning_rate": 9.641463163658605e-07, "logits/chosen": -67239463.38461539, "logits/rejected": -92569292.8, "logps/chosen": -155.6209935897436, "logps/rejected": -110.720048828125, "loss": 3.2899, "rewards/chosen": -0.39308939224634415, "rewards/margins": 1.2974609922751403, "rewards/rejected": -1.6905503845214844, "step": 208 }, { "epoch": 6.744939271255061, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 9.628931879414516e-07, "logits/chosen": -70281176.0, "logits/rejected": -94688808.0, "logps/chosen": -177.175048828125, "logps/rejected": -137.61363220214844, "loss": 3.2975, "rewards/chosen": -0.6261154413223267, "rewards/margins": 1.3243438005447388, "rewards/rejected": -1.9504592418670654, "step": 209 }, { "epoch": 6.777327935222672, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 9.616193779614293e-07, "logits/chosen": -64514537.4117647, "logits/rejected": -90385382.4, "logps/chosen": -219.74317842371323, "logps/rejected": -126.93536783854167, "loss": 3.3416, "rewards/chosen": -0.27249841129078584, "rewards/margins": 1.524424145268459, "rewards/rejected": -1.7969225565592448, "step": 210 }, { "epoch": 6.809716599190283, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 9.603249433382144e-07, "logits/chosen": -78874774.06896552, "logits/rejected": -95458099.2, "logps/chosen": -198.69526198814654, "logps/rejected": -142.20475725446428, "loss": 3.2742, "rewards/chosen": -0.9862608416327114, "rewards/margins": 0.6508184517545654, "rewards/rejected": -1.6370792933872769, "step": 211 }, { "epoch": 6.842105263157895, "grad_norm": 184.0, "kl": 0.2491929531097412, "learning_rate": 9.590099419057141e-07, "logits/chosen": -60872712.0, "logits/rejected": -101340408.0, "logps/chosen": -258.4055480957031, "logps/rejected": -121.7928466796875, "loss": 3.3416, "rewards/chosen": -0.18684199452400208, "rewards/margins": 1.7686249911785126, "rewards/rejected": -1.9554669857025146, "step": 212 }, { "epoch": 6.874493927125506, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 9.576744324167378e-07, "logits/chosen": -57879082.666666664, "logits/rejected": -93678509.71428572, "logps/chosen": -247.47976345486111, "logps/rejected": -131.52873883928572, "loss": 3.3345, "rewards/chosen": -0.12731069988674587, "rewards/margins": 1.6785063592214433, "rewards/rejected": -1.8058170591081892, "step": 213 }, { "epoch": 6.906882591093117, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 9.563184745403722e-07, "logits/chosen": -57542656.0, "logits/rejected": -99881235.6923077, "logps/chosen": -257.97509765625, "logps/rejected": -148.26245743189102, "loss": 3.096, "rewards/chosen": -0.3396272659301758, "rewards/margins": 1.758853349930201, "rewards/rejected": -2.098480615860377, "step": 214 }, { "epoch": 6.939271255060729, "grad_norm": 208.0, "kl": 0.0, "learning_rate": 9.549421288593157e-07, "logits/chosen": -76082892.8, "logits/rejected": -96593584.55172414, "logps/chosen": -177.5617466517857, "logps/rejected": -135.0049838362069, "loss": 3.2982, "rewards/chosen": -0.4206359318324498, "rewards/margins": 1.3534188914181564, "rewards/rejected": -1.7740548232506061, "step": 215 }, { "epoch": 6.97165991902834, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 9.535454568671704e-07, "logits/chosen": -70078094.22222222, "logits/rejected": -98170077.4054054, "logps/chosen": -170.33772786458334, "logps/rejected": -132.48622255067568, "loss": 3.163, "rewards/chosen": -0.48101958522090205, "rewards/margins": 1.3115953258327298, "rewards/rejected": -1.7926149110536318, "step": 216 }, { "epoch": 7.0, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 9.521285209656962e-07, "logits/chosen": -56098374.7368421, "logits/rejected": -94667386.43478261, "logps/chosen": -135.1068693462171, "logps/rejected": -119.83950407608695, "loss": 3.2609, "rewards/chosen": -0.7286000502736945, "rewards/margins": 0.9971250407482994, "rewards/rejected": -1.725725091021994, "step": 217 }, { "epoch": 7.032388663967612, "grad_norm": 235.0, "kl": 0.0, "learning_rate": 9.506913844620217e-07, "logits/chosen": -60832783.058823526, "logits/rejected": -92965239.46666667, "logps/chosen": -214.72449448529412, "logps/rejected": -108.35077311197917, "loss": 3.2728, "rewards/chosen": -0.39434508716358857, "rewards/margins": 1.0174091395209819, "rewards/rejected": -1.4117542266845704, "step": 218 }, { "epoch": 7.064777327935222, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 9.492341115658165e-07, "logits/chosen": -65132333.176470585, "logits/rejected": -84811306.66666667, "logps/chosen": -228.4188735064338, "logps/rejected": -108.3897705078125, "loss": 3.248, "rewards/chosen": -0.4468682233025046, "rewards/margins": 0.7589439766079773, "rewards/rejected": -1.2058121999104818, "step": 219 }, { "epoch": 7.097165991902834, "grad_norm": 216.0, "kl": 0.0, "learning_rate": 9.477567673864215e-07, "logits/chosen": -65939255.46666667, "logits/rejected": -100603309.1764706, "logps/chosen": -220.11765950520834, "logps/rejected": -130.0303452435662, "loss": 3.1895, "rewards/chosen": -0.30291754404703775, "rewards/margins": 1.7478724573172777, "rewards/rejected": -2.0507900013643154, "step": 220 }, { "epoch": 7.129554655870446, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 9.462594179299405e-07, "logits/chosen": -65328771.28205128, "logits/rejected": -95510886.4, "logps/chosen": -236.89200220352564, "logps/rejected": -121.12919921875, "loss": 3.4369, "rewards/chosen": -0.24260655427590394, "rewards/margins": 1.4496146676479242, "rewards/rejected": -1.6922212219238282, "step": 221 }, { "epoch": 7.161943319838056, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 9.44742130096291e-07, "logits/chosen": -56353438.89655172, "logits/rejected": -94957363.2, "logps/chosen": -210.51030441810346, "logps/rejected": -128.38976004464286, "loss": 3.0819, "rewards/chosen": -0.12119495457616346, "rewards/margins": 1.6638531760041937, "rewards/rejected": -1.7850481305803572, "step": 222 }, { "epoch": 7.194331983805668, "grad_norm": 220.0, "kl": 0.0, "learning_rate": 9.432049716762149e-07, "logits/chosen": -54206001.548387095, "logits/rejected": -91852435.39393939, "logps/chosen": -158.88182018649192, "logps/rejected": -113.12599875710227, "loss": 3.2916, "rewards/chosen": -0.06676443161502961, "rewards/margins": 1.7404590328884033, "rewards/rejected": -1.8072234645034329, "step": 223 }, { "epoch": 7.22672064777328, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 9.416480113482503e-07, "logits/chosen": -62940737.93939394, "logits/rejected": -88258807.74193548, "logps/chosen": -148.4831025094697, "logps/rejected": -145.8203597530242, "loss": 3.2473, "rewards/chosen": -0.3743794181130149, "rewards/margins": 1.373298292635473, "rewards/rejected": -1.747677710748488, "step": 224 }, { "epoch": 7.2591093117408905, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 9.400713186756623e-07, "logits/chosen": -50639894.06896552, "logits/rejected": -96988533.02857143, "logps/chosen": -237.03690732758622, "logps/rejected": -132.83162667410716, "loss": 3.2031, "rewards/chosen": -0.06484996450358424, "rewards/margins": 1.9244915533535585, "rewards/rejected": -1.9893415178571427, "step": 225 }, { "epoch": 7.291497975708502, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 9.384749641033357e-07, "logits/chosen": -73861248.0, "logits/rejected": -93220312.61538461, "logps/chosen": -185.95038805509867, "logps/rejected": -145.14332932692307, "loss": 3.2424, "rewards/chosen": -0.6463972392835116, "rewards/margins": 1.3697013700539284, "rewards/rejected": -2.01609860933744, "step": 226 }, { "epoch": 7.323886639676114, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 9.368590189546267e-07, "logits/chosen": -69601625.04347827, "logits/rejected": -95669123.12195122, "logps/chosen": -150.1243206521739, "logps/rejected": -119.9890672637195, "loss": 3.0651, "rewards/chosen": -0.9311220749564793, "rewards/margins": 1.073624524938719, "rewards/rejected": -2.0047465998951983, "step": 227 }, { "epoch": 7.3562753036437245, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 9.352235554281773e-07, "logits/chosen": -60476328.22857143, "logits/rejected": -90111011.31034483, "logps/chosen": -195.48328683035714, "logps/rejected": -135.86571423760776, "loss": 3.2539, "rewards/chosen": -0.20901996067592077, "rewards/margins": 1.5832775341466143, "rewards/rejected": -1.792297494822535, "step": 228 }, { "epoch": 7.388663967611336, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 9.335686465946886e-07, "logits/chosen": -60742925.71428572, "logits/rejected": -90614691.55555555, "logps/chosen": -285.0276402064732, "logps/rejected": -114.05181206597223, "loss": 3.2212, "rewards/chosen": -0.21968691689627512, "rewards/margins": 1.4373150023203047, "rewards/rejected": -1.65700191921658, "step": 229 }, { "epoch": 7.421052631578947, "grad_norm": 243.0, "kl": 0.0, "learning_rate": 9.318943663936569e-07, "logits/chosen": -67078437.925925925, "logits/rejected": -96883331.45945945, "logps/chosen": -254.83188657407408, "logps/rejected": -131.86358477618242, "loss": 3.1714, "rewards/chosen": -0.5318703828034578, "rewards/margins": 1.3894218964142366, "rewards/rejected": -1.9212922792176943, "step": 230 }, { "epoch": 7.4534412955465585, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 9.302007896300697e-07, "logits/chosen": -52654301.86666667, "logits/rejected": -94614896.94117647, "logps/chosen": -255.72353515625, "logps/rejected": -107.79805261948529, "loss": 3.1333, "rewards/chosen": -0.15591179529825847, "rewards/margins": 1.7555445932874492, "rewards/rejected": -1.9114563885857077, "step": 231 }, { "epoch": 7.48582995951417, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 9.284879919710631e-07, "logits/chosen": -56632893.44, "logits/rejected": -97366567.38461539, "logps/chosen": -238.73591796875, "logps/rejected": -145.61743790064102, "loss": 3.1966, "rewards/chosen": -0.5137019348144531, "rewards/margins": 1.4216348383976862, "rewards/rejected": -1.9353367732121394, "step": 232 }, { "epoch": 7.518218623481781, "grad_norm": 169.0, "kl": 0.0, "learning_rate": 9.267560499425424e-07, "logits/chosen": -63812239.515151516, "logits/rejected": -101234688.0, "logps/chosen": -246.65836588541666, "logps/rejected": -140.67149697580646, "loss": 3.1868, "rewards/chosen": -0.3870873306736802, "rewards/margins": 1.7061980145884168, "rewards/rejected": -2.093285345262097, "step": 233 }, { "epoch": 7.550607287449393, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 9.250050409257611e-07, "logits/chosen": -64757702.19354839, "logits/rejected": -97548730.18181819, "logps/chosen": -176.59056829637098, "logps/rejected": -149.50257457386363, "loss": 3.2909, "rewards/chosen": -0.21432172098467428, "rewards/margins": 1.6882187749167223, "rewards/rejected": -1.9025404959013967, "step": 234 }, { "epoch": 7.582995951417004, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 9.232350431538656e-07, "logits/chosen": -58187061.89473684, "logits/rejected": -94484814.76923077, "logps/chosen": -248.21343030427633, "logps/rejected": -130.44036395733173, "loss": 3.1836, "rewards/chosen": -0.27657767346030787, "rewards/margins": 1.369960107301411, "rewards/rejected": -1.6465377807617188, "step": 235 }, { "epoch": 7.615384615384615, "grad_norm": 174.0, "kl": 0.0, "learning_rate": 9.214461357083985e-07, "logits/chosen": -65923708.0, "logits/rejected": -104427760.0, "logps/chosen": -252.69491577148438, "logps/rejected": -143.23907470703125, "loss": 3.1462, "rewards/chosen": -0.23120331764221191, "rewards/margins": 1.9655675888061523, "rewards/rejected": -2.1967709064483643, "step": 236 }, { "epoch": 7.647773279352227, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.196383985157656e-07, "logits/chosen": -75513439.08571428, "logits/rejected": -94364707.31034483, "logps/chosen": -176.4595703125, "logps/rejected": -143.82543103448276, "loss": 3.192, "rewards/chosen": -0.9630287170410157, "rewards/margins": 0.8601124467520878, "rewards/rejected": -1.8231411637931034, "step": 237 }, { "epoch": 7.680161943319838, "grad_norm": 186.0, "kl": 0.2926532030105591, "learning_rate": 9.178119123436649e-07, "logits/chosen": -67436618.10526316, "logits/rejected": -96375246.76923077, "logps/chosen": -212.06575092516448, "logps/rejected": -126.75918344350961, "loss": 3.2137, "rewards/chosen": -0.4144437689530222, "rewards/margins": 1.6795955611626627, "rewards/rejected": -2.094039330115685, "step": 238 }, { "epoch": 7.712550607287449, "grad_norm": 223.0, "kl": 0.0, "learning_rate": 9.159667587974785e-07, "logits/chosen": -67735965.53846154, "logits/rejected": -93502791.68, "logps/chosen": -155.74129857772436, "logps/rejected": -112.822978515625, "loss": 3.1726, "rewards/chosen": -0.4051205806243114, "rewards/margins": 1.4957229252350634, "rewards/rejected": -1.900843505859375, "step": 239 }, { "epoch": 7.744939271255061, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 9.141030203166256e-07, "logits/chosen": -70799248.0, "logits/rejected": -95561552.0, "logps/chosen": -177.97103881835938, "logps/rejected": -139.7872314453125, "loss": 3.1999, "rewards/chosen": -0.7057160139083862, "rewards/margins": 1.462103247642517, "rewards/rejected": -2.1678192615509033, "step": 240 }, { "epoch": 7.777327935222672, "grad_norm": 212.0, "kl": 0.0, "learning_rate": 9.122207801708801e-07, "logits/chosen": -65076141.176470585, "logits/rejected": -91102412.8, "logps/chosen": -219.56000114889707, "logps/rejected": -128.51177571614582, "loss": 3.2295, "rewards/chosen": -0.2541801789227654, "rewards/margins": 1.7003829619463753, "rewards/rejected": -1.9545631408691406, "step": 241 }, { "epoch": 7.809716599190283, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.103201224566497e-07, "logits/chosen": -79426348.13793103, "logits/rejected": -96227540.11428571, "logps/chosen": -199.74701980064654, "logps/rejected": -143.68773716517856, "loss": 3.1877, "rewards/chosen": -1.091436320337756, "rewards/margins": 0.6939400921901457, "rewards/rejected": -1.7853764125279017, "step": 242 }, { "epoch": 7.842105263157895, "grad_norm": 171.0, "kl": 0.32616788148880005, "learning_rate": 9.084011320932188e-07, "logits/chosen": -60969656.0, "logits/rejected": -102558344.0, "logps/chosen": -258.640380859375, "logps/rejected": -123.86648559570312, "loss": 3.2517, "rewards/chosen": -0.21032822132110596, "rewards/margins": 1.9525035619735718, "rewards/rejected": -2.1628317832946777, "step": 243 }, { "epoch": 7.874493927125506, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 9.064638948189538e-07, "logits/chosen": -58330549.333333336, "logits/rejected": -94501805.71428572, "logps/chosen": -247.38720703125, "logps/rejected": -133.05278669084822, "loss": 3.2518, "rewards/chosen": -0.11805566151936848, "rewards/margins": 1.8401678176153276, "rewards/rejected": -1.958223479134696, "step": 244 }, { "epoch": 7.906882591093117, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.045084971874737e-07, "logits/chosen": -58010946.56, "logits/rejected": -100820814.76923077, "logps/chosen": -258.1535546875, "logps/rejected": -150.38654346955127, "loss": 3.0115, "rewards/chosen": -0.357471809387207, "rewards/margins": 1.9534176058646961, "rewards/rejected": -2.310889415251903, "step": 245 }, { "epoch": 7.939271255060729, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 9.025350265637815e-07, "logits/chosen": -76450004.11428571, "logits/rejected": -97642875.5862069, "logps/chosen": -177.54049944196427, "logps/rejected": -137.0269143992457, "loss": 3.1959, "rewards/chosen": -0.41851163591657364, "rewards/margins": 1.5577352965406597, "rewards/rejected": -1.9762469324572334, "step": 246 }, { "epoch": 7.97165991902834, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 9.005435711203618e-07, "logits/chosen": -70475733.33333333, "logits/rejected": -98785888.86486487, "logps/chosen": -170.7393844039352, "logps/rejected": -134.28969594594594, "loss": 3.0606, "rewards/chosen": -0.5211844974093967, "rewards/margins": 1.4517781655709665, "rewards/rejected": -1.9729626629803632, "step": 247 }, { "epoch": 8.0, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 8.985342198332406e-07, "logits/chosen": -56464353.684210524, "logits/rejected": -95736943.30434783, "logps/chosen": -135.89539216694078, "logps/rejected": -121.2200343919837, "loss": 3.163, "rewards/chosen": -0.8074518002961811, "rewards/margins": 1.0563260237739613, "rewards/rejected": -1.8637778240701426, "step": 248 }, { "epoch": 8.03238866396761, "grad_norm": 222.0, "kl": 0.0, "learning_rate": 8.965070624780115e-07, "logits/chosen": -61136862.11764706, "logits/rejected": -93872947.2, "logps/chosen": -214.90665211397058, "logps/rejected": -109.716455078125, "loss": 3.2072, "rewards/chosen": -0.41255920073565316, "rewards/margins": 1.1357639855029538, "rewards/rejected": -1.5483231862386069, "step": 249 }, { "epoch": 8.064777327935223, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 8.944621896258224e-07, "logits/chosen": -65452182.5882353, "logits/rejected": -85745203.2, "logps/chosen": -228.57809627757354, "logps/rejected": -109.6790771484375, "loss": 3.171, "rewards/chosen": -0.4627881330602309, "rewards/margins": 0.871955239539053, "rewards/rejected": -1.3347433725992839, "step": 250 }, { "epoch": 8.097165991902834, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 8.923996926393305e-07, "logits/chosen": -66272162.13333333, "logits/rejected": -101397797.64705883, "logps/chosen": -220.34915364583333, "logps/rejected": -131.8910414751838, "loss": 3.0849, "rewards/chosen": -0.3260688781738281, "rewards/margins": 1.9107903873219207, "rewards/rejected": -2.236859265495749, "step": 251 }, { "epoch": 8.129554655870445, "grad_norm": 227.0, "kl": 0.0, "learning_rate": 8.903196636686197e-07, "logits/chosen": -65849974.15384615, "logits/rejected": -96421734.4, "logps/chosen": -236.94503705929486, "logps/rejected": -122.508896484375, "loss": 3.371, "rewards/chosen": -0.24791299379788911, "rewards/margins": 1.5822778936532829, "rewards/rejected": -1.8301908874511719, "step": 252 }, { "epoch": 8.161943319838057, "grad_norm": 161.0, "kl": 0.0, "learning_rate": 8.882221956470836e-07, "logits/chosen": -56595155.862068966, "logits/rejected": -95794848.91428572, "logps/chosen": -210.66039197198276, "logps/rejected": -129.80171595982142, "loss": 3.0067, "rewards/chosen": -0.13620573898841595, "rewards/margins": 1.7900380703028786, "rewards/rejected": -1.9262438092912946, "step": 253 }, { "epoch": 8.194331983805668, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 8.861073822872733e-07, "logits/chosen": -54373916.90322581, "logits/rejected": -92542029.57575758, "logps/chosen": -158.83270854334677, "logps/rejected": -114.67375414299242, "loss": 3.1687, "rewards/chosen": -0.06185272432142688, "rewards/margins": 1.9001471110686063, "rewards/rejected": -1.9619998353900332, "step": 254 }, { "epoch": 8.226720647773279, "grad_norm": 214.0, "kl": 0.0, "learning_rate": 8.839753180767107e-07, "logits/chosen": -63268448.96969697, "logits/rejected": -89012306.58064516, "logps/chosen": -148.8386896306818, "logps/rejected": -147.22150642641128, "loss": 3.1673, "rewards/chosen": -0.40993794527920807, "rewards/margins": 1.4778542113094386, "rewards/rejected": -1.8877921565886466, "step": 255 }, { "epoch": 8.259109311740891, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 8.818260982736661e-07, "logits/chosen": -51074745.37931035, "logits/rejected": -97956293.48571429, "logps/chosen": -236.98644598599137, "logps/rejected": -134.256640625, "loss": 3.1121, "rewards/chosen": -0.059805117804428626, "rewards/margins": 2.072036923211196, "rewards/rejected": -2.131842041015625, "step": 256 }, { "epoch": 8.291497975708502, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 8.796598189029029e-07, "logits/chosen": -74012564.21052632, "logits/rejected": -94074112.0, "logps/chosen": -186.44219006990133, "logps/rejected": -146.62846491887018, "loss": 3.1515, "rewards/chosen": -0.6955768685591849, "rewards/margins": 1.469036415038321, "rewards/rejected": -2.164613283597506, "step": 257 }, { "epoch": 8.323886639676113, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 8.774765767513874e-07, "logits/chosen": -70070772.86956522, "logits/rejected": -96451228.09756097, "logps/chosen": -150.84981105638587, "logps/rejected": -121.71354563643293, "loss": 2.9867, "rewards/chosen": -1.0036715631899626, "rewards/margins": 1.1735239160402202, "rewards/rejected": -2.177195479230183, "step": 258 }, { "epoch": 8.356275303643725, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 8.752764693639638e-07, "logits/chosen": -61003702.85714286, "logits/rejected": -90766124.13793103, "logps/chosen": -195.6492885044643, "logps/rejected": -137.05028455010776, "loss": 3.1731, "rewards/chosen": -0.22561860765729633, "rewards/margins": 1.685136944437262, "rewards/rejected": -1.9107555520945583, "step": 259 }, { "epoch": 8.388663967611336, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 8.730595950389967e-07, "logits/chosen": -60814843.428571425, "logits/rejected": -91367274.66666667, "logps/chosen": -285.06014578683033, "logps/rejected": -115.19327799479167, "loss": 3.1373, "rewards/chosen": -0.22293782234191895, "rewards/margins": 1.5482103294796414, "rewards/rejected": -1.7711481518215604, "step": 260 }, { "epoch": 8.421052631578947, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 8.708260528239788e-07, "logits/chosen": -67363949.03703703, "logits/rejected": -98014982.91891892, "logps/chosen": -255.33013237847223, "logps/rejected": -132.92724609375, "loss": 3.056, "rewards/chosen": -0.5816964396723995, "rewards/margins": 1.4459617651021994, "rewards/rejected": -2.027658204774599, "step": 261 }, { "epoch": 8.45344129554656, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 8.685759425111054e-07, "logits/chosen": -53188189.86666667, "logits/rejected": -95383913.41176471, "logps/chosen": -255.84373372395834, "logps/rejected": -109.32036994485294, "loss": 3.0451, "rewards/chosen": -0.1679290771484375, "rewards/margins": 1.8957604352165673, "rewards/rejected": -2.0636895123650048, "step": 262 }, { "epoch": 8.48582995951417, "grad_norm": 174.0, "kl": 0.0, "learning_rate": 8.663093646328166e-07, "logits/chosen": -57241712.64, "logits/rejected": -98063478.15384616, "logps/chosen": -238.8626953125, "logps/rejected": -147.1845202323718, "loss": 3.0891, "rewards/chosen": -0.5263804626464844, "rewards/margins": 1.5656652479905349, "rewards/rejected": -2.092045710637019, "step": 263 }, { "epoch": 8.518218623481781, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 8.640264204573046e-07, "logits/chosen": -64148619.63636363, "logits/rejected": -102109117.93548387, "logps/chosen": -246.94140625, "logps/rejected": -142.4841544858871, "loss": 3.1034, "rewards/chosen": -0.41539241328383936, "rewards/margins": 1.8591594705367251, "rewards/rejected": -2.2745518838205645, "step": 264 }, { "epoch": 8.550607287449393, "grad_norm": 210.0, "kl": 0.0, "learning_rate": 8.617272119839902e-07, "logits/chosen": -64987726.451612905, "logits/rejected": -98316613.81818181, "logps/chosen": -176.54696950604838, "logps/rejected": -150.72878196022728, "loss": 3.1912, "rewards/chosen": -0.2099611682276572, "rewards/margins": 1.8152008061301788, "rewards/rejected": -2.025161974357836, "step": 265 }, { "epoch": 8.582995951417004, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 8.594118419389647e-07, "logits/chosen": -58319925.89473684, "logits/rejected": -95481777.23076923, "logps/chosen": -248.0672029194079, "logps/rejected": -131.7604041466346, "loss": 3.121, "rewards/chosen": -0.2619515469199733, "rewards/margins": 1.5165908983361864, "rewards/rejected": -1.7785424452561598, "step": 266 }, { "epoch": 8.615384615384615, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 8.570804137704003e-07, "logits/chosen": -66109800.0, "logits/rejected": -105424344.0, "logps/chosen": -252.68231201171875, "logps/rejected": -144.7725830078125, "loss": 3.1032, "rewards/chosen": -0.22994595766067505, "rewards/margins": 2.1201741099357605, "rewards/rejected": -2.3501200675964355, "step": 267 }, { "epoch": 8.647773279352228, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 8.54733031643929e-07, "logits/chosen": -76308333.71428572, "logits/rejected": -95257705.93103448, "logps/chosen": -177.07209821428572, "logps/rejected": -145.09445716594828, "loss": 3.1061, "rewards/chosen": -1.0242833818708148, "rewards/margins": 0.9257603950688404, "rewards/rejected": -1.9500437769396552, "step": 268 }, { "epoch": 8.680161943319838, "grad_norm": 171.0, "kl": 0.34954798221588135, "learning_rate": 8.523698004379875e-07, "logits/chosen": -68105216.0, "logits/rejected": -96775000.61538461, "logps/chosen": -212.20703125, "logps/rejected": -127.7769305889423, "loss": 3.1649, "rewards/chosen": -0.4285718516299599, "rewards/margins": 1.7672414007457162, "rewards/rejected": -2.195813252375676, "step": 269 }, { "epoch": 8.712550607287449, "grad_norm": 231.0, "kl": 0.0, "learning_rate": 8.499908257391323e-07, "logits/chosen": -68097083.07692307, "logits/rejected": -94308720.64, "logps/chosen": -155.97443409455127, "logps/rejected": -113.86224609375, "loss": 3.1306, "rewards/chosen": -0.42843324710161257, "rewards/margins": 1.5763363451835435, "rewards/rejected": -2.004769592285156, "step": 270 }, { "epoch": 8.744939271255062, "grad_norm": 236.0, "kl": 0.0, "learning_rate": 8.475962138373212e-07, "logits/chosen": -71194480.0, "logits/rejected": -96231992.0, "logps/chosen": -178.3039093017578, "logps/rejected": -141.05906677246094, "loss": 3.1401, "rewards/chosen": -0.7390018701553345, "rewards/margins": 1.5560022592544556, "rewards/rejected": -2.29500412940979, "step": 271 }, { "epoch": 8.777327935222672, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 8.451860717211652e-07, "logits/chosen": -65470554.35294118, "logits/rejected": -91992644.26666667, "logps/chosen": -219.72277113970588, "logps/rejected": -130.14352213541667, "loss": 3.1629, "rewards/chosen": -0.2704560055452235, "rewards/margins": 1.8472819552702064, "rewards/rejected": -2.11773796081543, "step": 272 }, { "epoch": 8.809716599190283, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 8.427605070731481e-07, "logits/chosen": -80051561.93103448, "logits/rejected": -96999665.37142856, "logps/chosen": -200.22407058189654, "logps/rejected": -144.83091517857142, "loss": 3.0988, "rewards/chosen": -1.1391406881398167, "rewards/margins": 0.7605528281827278, "rewards/rejected": -1.8996935163225446, "step": 273 }, { "epoch": 8.842105263157894, "grad_norm": 174.0, "kl": 0.33903443813323975, "learning_rate": 8.403196282648155e-07, "logits/chosen": -61416080.0, "logits/rejected": -103547584.0, "logps/chosen": -258.36090087890625, "logps/rejected": -125.1744613647461, "loss": 3.1583, "rewards/chosen": -0.18237808346748352, "rewards/margins": 2.1112501323223114, "rewards/rejected": -2.293628215789795, "step": 274 }, { "epoch": 8.874493927125506, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 8.378635443519326e-07, "logits/chosen": -58604956.44444445, "logits/rejected": -95174281.14285715, "logps/chosen": -247.49026150173611, "logps/rejected": -134.39338030133928, "loss": 3.176, "rewards/chosen": -0.12835978137122261, "rewards/margins": 1.9639225138558283, "rewards/rejected": -2.092282295227051, "step": 275 }, { "epoch": 8.906882591093117, "grad_norm": 158.0, "kl": 0.0, "learning_rate": 8.353923650696117e-07, "logits/chosen": -58184314.88, "logits/rejected": -101710185.02564102, "logps/chosen": -258.45134765625, "logps/rejected": -152.23571464342947, "loss": 2.9285, "rewards/chosen": -0.3872526550292969, "rewards/margins": 2.108552351731521, "rewards/rejected": -2.4958050067608175, "step": 276 }, { "epoch": 8.939271255060728, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 8.329062008274098e-07, "logits/chosen": -77036032.0, "logits/rejected": -98802899.86206897, "logps/chosen": -178.03736049107144, "logps/rejected": -138.2449740705819, "loss": 3.1156, "rewards/chosen": -0.4681967054094587, "rewards/margins": 1.6298560100235964, "rewards/rejected": -2.098052715433055, "step": 277 }, { "epoch": 8.97165991902834, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 8.304051627043951e-07, "logits/chosen": -70749624.8888889, "logits/rejected": -99539317.62162162, "logps/chosen": -170.75705295138889, "logps/rejected": -135.2934966216216, "loss": 3.0087, "rewards/chosen": -0.5229519384878653, "rewards/margins": 1.5503900497405976, "rewards/rejected": -2.0733419882284627, "step": 278 }, { "epoch": 9.0, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 8.278893624441847e-07, "logits/chosen": -56785340.631578945, "logits/rejected": -96406839.6521739, "logps/chosen": -136.22370990953948, "logps/rejected": -122.41340438179348, "loss": 3.1086, "rewards/chosen": -0.8402846486944902, "rewards/margins": 1.1428296767874222, "rewards/rejected": -1.9831143254819124, "step": 279 }, { "epoch": 9.03238866396761, "grad_norm": 214.0, "kl": 0.0, "learning_rate": 8.253589124499511e-07, "logits/chosen": -61582000.941176474, "logits/rejected": -94555818.66666667, "logps/chosen": -214.94738051470588, "logps/rejected": -110.60725911458333, "loss": 3.1054, "rewards/chosen": -0.41663164250990925, "rewards/margins": 1.2207698111440621, "rewards/rejected": -1.6374014536539714, "step": 280 }, { "epoch": 9.064777327935223, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 8.228139257794012e-07, "logits/chosen": -66101383.52941176, "logits/rejected": -86001049.6, "logps/chosen": -228.7639590992647, "logps/rejected": -110.32359212239584, "loss": 3.1086, "rewards/chosen": -0.4813748527975643, "rewards/margins": 0.9178199917662377, "rewards/rejected": -1.399194844563802, "step": 281 }, { "epoch": 9.097165991902834, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 8.202545161397241e-07, "logits/chosen": -66456524.8, "logits/rejected": -102268438.58823529, "logps/chosen": -220.43671875, "logps/rejected": -133.09693818933823, "loss": 3.0521, "rewards/chosen": -0.33482268651326497, "rewards/margins": 2.022627462125292, "rewards/rejected": -2.357450148638557, "step": 282 }, { "epoch": 9.129554655870445, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 8.176807978825118e-07, "logits/chosen": -66117100.307692304, "logits/rejected": -96738580.48, "logps/chosen": -237.09620392628204, "logps/rejected": -123.557412109375, "loss": 3.2924, "rewards/chosen": -0.2630309324998122, "rewards/margins": 1.6720119446974535, "rewards/rejected": -1.9350428771972656, "step": 283 }, { "epoch": 9.161943319838057, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 8.150928859986487e-07, "logits/chosen": -56772281.37931035, "logits/rejected": -96661577.14285715, "logps/chosen": -210.34304283405172, "logps/rejected": -130.77176339285714, "loss": 2.9648, "rewards/chosen": -0.10446934864438813, "rewards/margins": 1.918780686232844, "rewards/rejected": -2.023250034877232, "step": 284 }, { "epoch": 9.194331983805668, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 8.124908961131757e-07, "logits/chosen": -54764007.22580645, "logits/rejected": -92978703.51515152, "logps/chosen": -158.61907762096774, "logps/rejected": -115.82316080729167, "loss": 3.1142, "rewards/chosen": -0.040489827432940086, "rewards/margins": 2.036450073283206, "rewards/rejected": -2.076939900716146, "step": 285 }, { "epoch": 9.226720647773279, "grad_norm": 217.0, "kl": 0.0, "learning_rate": 8.098749444801224e-07, "logits/chosen": -63514360.24242424, "logits/rejected": -89530244.12903225, "logps/chosen": -148.8549952651515, "logps/rejected": -148.2848097278226, "loss": 3.1043, "rewards/chosen": -0.41156893065481476, "rewards/margins": 1.5825537283516118, "rewards/rejected": -1.9941226590064265, "step": 286 }, { "epoch": 9.259109311740891, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 8.072451479773143e-07, "logits/chosen": -51306637.24137931, "logits/rejected": -98705554.28571428, "logps/chosen": -237.14225821659483, "logps/rejected": -135.66897321428573, "loss": 3.0743, "rewards/chosen": -0.07538544720616834, "rewards/margins": 2.1976899835276487, "rewards/rejected": -2.273075430733817, "step": 287 }, { "epoch": 9.291497975708502, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 8.0460162410115e-07, "logits/chosen": -74732025.26315789, "logits/rejected": -95007300.92307693, "logps/chosen": -186.92341694078948, "logps/rejected": -147.9090857872596, "loss": 3.0931, "rewards/chosen": -0.7436988730179636, "rewards/margins": 1.54897526497783, "rewards/rejected": -2.2926741379957933, "step": 288 }, { "epoch": 9.323886639676113, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 8.019444909613521e-07, "logits/chosen": -70447159.6521739, "logits/rejected": -96842964.29268293, "logps/chosen": -151.52042289402175, "logps/rejected": -122.7792373285061, "loss": 2.9209, "rewards/chosen": -1.0707318679146145, "rewards/margins": 1.2130315523764488, "rewards/rejected": -2.2837634202910633, "step": 289 }, { "epoch": 9.356275303643725, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 7.992738672756908e-07, "logits/chosen": -61173346.74285714, "logits/rejected": -91356672.0, "logps/chosen": -195.72730189732144, "logps/rejected": -138.0972521551724, "loss": 3.1156, "rewards/chosen": -0.23342056274414064, "rewards/margins": 1.7820309343009162, "rewards/rejected": -2.0154514970450568, "step": 290 }, { "epoch": 9.388663967611336, "grad_norm": 185.0, "kl": 0.0, "learning_rate": 7.965898723646776e-07, "logits/chosen": -61117956.571428575, "logits/rejected": -91804181.33333333, "logps/chosen": -285.02743094308033, "logps/rejected": -116.11239963107639, "loss": 3.0761, "rewards/chosen": -0.21966862678527832, "rewards/margins": 1.6433917946285672, "rewards/rejected": -1.8630604214138455, "step": 291 }, { "epoch": 9.421052631578947, "grad_norm": 280.0, "kl": 0.0, "learning_rate": 7.938926261462365e-07, "logits/chosen": -67616483.55555555, "logits/rejected": -98708756.75675675, "logps/chosen": -255.13259548611111, "logps/rejected": -134.1481735641892, "loss": 3.0287, "rewards/chosen": -0.5619420652036313, "rewards/margins": 1.587807896855596, "rewards/rejected": -2.1497499620592273, "step": 292 }, { "epoch": 9.45344129554656, "grad_norm": 164.0, "kl": 0.0, "learning_rate": 7.911822491303452e-07, "logits/chosen": -53335927.46666667, "logits/rejected": -95898789.64705883, "logps/chosen": -255.715966796875, "logps/rejected": -110.31078383501838, "loss": 2.9834, "rewards/chosen": -0.15515419642130535, "rewards/margins": 2.007576301051121, "rewards/rejected": -2.1627304974724266, "step": 293 }, { "epoch": 9.48582995951417, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 7.884588624136503e-07, "logits/chosen": -57378785.28, "logits/rejected": -98681816.61538461, "logps/chosen": -239.1314453125, "logps/rejected": -148.2501502403846, "loss": 3.0397, "rewards/chosen": -0.5532550811767578, "rewards/margins": 1.6453546866392479, "rewards/rejected": -2.1986097678160057, "step": 294 }, { "epoch": 9.518218623481781, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 7.857225876740583e-07, "logits/chosen": -64545652.36363637, "logits/rejected": -102624454.19354838, "logps/chosen": -247.0961766098485, "logps/rejected": -143.7275863155242, "loss": 3.041, "rewards/chosen": -0.4308715011134292, "rewards/margins": 1.9680230242299426, "rewards/rejected": -2.3988945253433718, "step": 295 }, { "epoch": 9.550607287449393, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 7.829735471652977e-07, "logits/chosen": -65327657.29032258, "logits/rejected": -98793673.6969697, "logps/chosen": -176.57855027721774, "logps/rejected": -151.97844164299244, "loss": 3.1413, "rewards/chosen": -0.2131201067278462, "rewards/margins": 1.9370069573701651, "rewards/rejected": -2.1501270640980112, "step": 296 }, { "epoch": 9.582995951417004, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 7.802118637114573e-07, "logits/chosen": -58608431.15789474, "logits/rejected": -96020548.92307693, "logps/chosen": -248.18331106085526, "logps/rejected": -132.69559420072116, "loss": 3.08, "rewards/chosen": -0.27356298346268504, "rewards/margins": 1.5984983791706533, "rewards/rejected": -1.8720613626333384, "step": 297 }, { "epoch": 9.615384615384615, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 7.774376607014994e-07, "logits/chosen": -66192584.0, "logits/rejected": -106159160.0, "logps/chosen": -252.6024932861328, "logps/rejected": -146.11524963378906, "loss": 3.0409, "rewards/chosen": -0.2219638079404831, "rewards/margins": 2.262425258755684, "rewards/rejected": -2.484389066696167, "step": 298 }, { "epoch": 9.647773279352228, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 7.746510620837458e-07, "logits/chosen": -76464098.74285714, "logits/rejected": -95603005.79310344, "logps/chosen": -177.7017857142857, "logps/rejected": -145.97341392780172, "loss": 3.059, "rewards/chosen": -1.0872510637555803, "rewards/margins": 0.9506875475051955, "rewards/rejected": -2.037938611260776, "step": 299 }, { "epoch": 9.680161943319838, "grad_norm": 166.0, "kl": 0.3986997604370117, "learning_rate": 7.718521923603404e-07, "logits/chosen": -68340021.89473684, "logits/rejected": -97238488.61538461, "logps/chosen": -212.5210603412829, "logps/rejected": -128.78448955829327, "loss": 3.0978, "rewards/chosen": -0.45997343565288346, "rewards/margins": 1.8365966820041175, "rewards/rejected": -2.296570117657001, "step": 300 }, { "epoch": 9.712550607287449, "grad_norm": 226.0, "kl": 0.0, "learning_rate": 7.690411765816864e-07, "logits/chosen": -68242195.6923077, "logits/rejected": -94812928.0, "logps/chosen": -156.16728014823718, "logps/rejected": -114.796640625, "loss": 3.0423, "rewards/chosen": -0.44771796006422776, "rewards/margins": 1.6504920313908504, "rewards/rejected": -2.0982099914550782, "step": 301 }, { "epoch": 9.744939271255062, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 7.662181403408592e-07, "logits/chosen": -71550824.0, "logits/rejected": -96827824.0, "logps/chosen": -178.41879272460938, "logps/rejected": -142.3582763671875, "loss": 3.0433, "rewards/chosen": -0.7504904270172119, "rewards/margins": 1.6744349002838135, "rewards/rejected": -2.4249253273010254, "step": 302 }, { "epoch": 9.777327935222672, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 7.633832097679957e-07, "logits/chosen": -65655808.0, "logits/rejected": -92415803.73333333, "logps/chosen": -220.0100815716912, "logps/rejected": -130.86422526041667, "loss": 3.0974, "rewards/chosen": -0.29918743582332835, "rewards/margins": 1.8906209646486767, "rewards/rejected": -2.189808400472005, "step": 303 }, { "epoch": 9.809716599190283, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 7.60536511524658e-07, "logits/chosen": -80359909.51724137, "logits/rejected": -97658046.17142858, "logps/chosen": -200.64870689655172, "logps/rejected": -145.7033203125, "loss": 3.0474, "rewards/chosen": -1.181604056522764, "rewards/margins": 0.8053302783684191, "rewards/rejected": -1.986934334891183, "step": 304 }, { "epoch": 9.842105263157894, "grad_norm": 170.0, "kl": 0.38847053050994873, "learning_rate": 7.576781727981749e-07, "logits/chosen": -61844200.0, "logits/rejected": -104248728.0, "logps/chosen": -258.5477294921875, "logps/rejected": -126.07926940917969, "loss": 3.1096, "rewards/chosen": -0.20106235146522522, "rewards/margins": 2.1830473840236664, "rewards/rejected": -2.3841097354888916, "step": 305 }, { "epoch": 9.874493927125506, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 7.548083212959587e-07, "logits/chosen": -58909134.222222224, "logits/rejected": -95752502.85714285, "logps/chosen": -247.49329969618054, "logps/rejected": -135.36624581473214, "loss": 3.1361, "rewards/chosen": -0.1286644140879313, "rewards/margins": 2.060904786700294, "rewards/rejected": -2.1895692007882253, "step": 306 }, { "epoch": 9.906882591093117, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 7.519270852398001e-07, "logits/chosen": -58490521.6, "logits/rejected": -102195521.64102565, "logps/chosen": -258.3875, "logps/rejected": -153.12002954727564, "loss": 2.8674, "rewards/chosen": -0.38086856842041017, "rewards/margins": 2.2033691416031274, "rewards/rejected": -2.5842377100235376, "step": 307 }, { "epoch": 9.939271255060728, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 7.490345933601394e-07, "logits/chosen": -77243509.02857143, "logits/rejected": -99201341.79310344, "logps/chosen": -178.14437779017857, "logps/rejected": -139.1087015086207, "loss": 3.0598, "rewards/chosen": -0.4788997650146484, "rewards/margins": 1.7055259836131127, "rewards/rejected": -2.184425748627761, "step": 308 }, { "epoch": 9.97165991902834, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 7.461309748903137e-07, "logits/chosen": -71162638.22222222, "logits/rejected": -99941209.94594595, "logps/chosen": -171.35163483796296, "logps/rejected": -136.0713550464527, "loss": 2.9573, "rewards/chosen": -0.582410247237594, "rewards/margins": 1.5687165112347456, "rewards/rejected": -2.1511267584723397, "step": 309 }, { "epoch": 10.0, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 7.43216359560785e-07, "logits/chosen": -56970293.89473684, "logits/rejected": -96843464.3478261, "logps/chosen": -136.66714638157896, "logps/rejected": -123.33866550611413, "loss": 3.0628, "rewards/chosen": -0.884627994738127, "rewards/margins": 1.191012808059937, "rewards/rejected": -2.075640802798064, "step": 310 }, { "epoch": 10.03238866396761, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 7.402908775933419e-07, "logits/chosen": -61779237.64705882, "logits/rejected": -95183308.8, "logps/chosen": -214.87415268841912, "logps/rejected": -111.50768229166667, "loss": 3.0712, "rewards/chosen": -0.40930919086231904, "rewards/margins": 1.3181350128323426, "rewards/rejected": -1.7274442036946616, "step": 311 }, { "epoch": 10.064777327935223, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 7.373546596952827e-07, "logits/chosen": -66141816.47058824, "logits/rejected": -86566493.86666666, "logps/chosen": -229.13196518841912, "logps/rejected": -110.87522786458334, "loss": 3.0742, "rewards/chosen": -0.51817585440243, "rewards/margins": 0.9361826915366975, "rewards/rejected": -1.4543585459391275, "step": 312 }, { "epoch": 10.097165991902834, "grad_norm": 200.0, "kl": 0.0, "learning_rate": 7.344078370535755e-07, "logits/chosen": -66708241.06666667, "logits/rejected": -102690183.52941176, "logps/chosen": -220.53509114583332, "logps/rejected": -133.85311351102942, "loss": 2.9652, "rewards/chosen": -0.3446622848510742, "rewards/margins": 2.0884050369262694, "rewards/rejected": -2.4330673217773438, "step": 313 }, { "epoch": 10.129554655870445, "grad_norm": 223.0, "kl": 0.0, "learning_rate": 7.314505413289963e-07, "logits/chosen": -66415760.41025641, "logits/rejected": -97746411.52, "logps/chosen": -236.982421875, "logps/rejected": -124.301083984375, "loss": 3.2645, "rewards/chosen": -0.2516512993054512, "rewards/margins": 1.7577592536730646, "rewards/rejected": -2.009410552978516, "step": 314 }, { "epoch": 10.161943319838057, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 7.284829046502467e-07, "logits/chosen": -57094033.655172415, "logits/rejected": -97199389.25714286, "logps/chosen": -210.60376818426724, "logps/rejected": -131.71128627232142, "loss": 2.9492, "rewards/chosen": -0.13054232761777682, "rewards/margins": 1.986657815378875, "rewards/rejected": -2.117200142996652, "step": 315 }, { "epoch": 10.194331983805668, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 7.255050596080509e-07, "logits/chosen": -54756698.838709675, "logits/rejected": -93596516.84848484, "logps/chosen": -158.54352003528226, "logps/rejected": -116.5855379971591, "loss": 3.0725, "rewards/chosen": -0.03293378506937335, "rewards/margins": 2.120244112066043, "rewards/rejected": -2.1531778971354165, "step": 316 }, { "epoch": 10.226720647773279, "grad_norm": 224.0, "kl": 0.0, "learning_rate": 7.225171392492315e-07, "logits/chosen": -63881801.696969695, "logits/rejected": -89890758.19354838, "logps/chosen": -149.04974550189394, "logps/rejected": -149.1017830141129, "loss": 3.0462, "rewards/chosen": -0.43104382717248163, "rewards/margins": 1.6447761112410768, "rewards/rejected": -2.0758199384135585, "step": 317 }, { "epoch": 10.259109311740891, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 7.195192770707654e-07, "logits/chosen": -51546081.10344828, "logits/rejected": -99088603.42857143, "logps/chosen": -236.8742086476293, "logps/rejected": -136.63518415178572, "loss": 3.0227, "rewards/chosen": -0.04858063007223195, "rewards/margins": 2.321115196866942, "rewards/rejected": -2.369695826939174, "step": 318 }, { "epoch": 10.291497975708502, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 7.165116070138182e-07, "logits/chosen": -74903653.05263157, "logits/rejected": -95339264.0, "logps/chosen": -187.0942254317434, "logps/rejected": -149.06288499098557, "loss": 3.0731, "rewards/chosen": -0.7607813383403578, "rewards/margins": 1.6472719864324037, "rewards/rejected": -2.4080533247727613, "step": 319 }, { "epoch": 10.323886639676113, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 7.134942634577615e-07, "logits/chosen": -70849914.43478261, "logits/rejected": -97451944.58536585, "logps/chosen": -151.50901197350544, "logps/rejected": -123.82236089939025, "loss": 2.8709, "rewards/chosen": -1.0695920197860054, "rewards/margins": 1.318482715529829, "rewards/rejected": -2.3880747353158345, "step": 320 }, { "epoch": 10.356275303643725, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 7.104673812141675e-07, "logits/chosen": -61415014.4, "logits/rejected": -91735869.79310344, "logps/chosen": -195.57022879464284, "logps/rejected": -139.26278791756465, "loss": 3.0755, "rewards/chosen": -0.21771132605416435, "rewards/margins": 1.9142937735383734, "rewards/rejected": -2.1320050995925377, "step": 321 }, { "epoch": 10.388663967611336, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 7.074310955207868e-07, "logits/chosen": -61545709.71428572, "logits/rejected": -92193251.55555555, "logps/chosen": -285.07822963169644, "logps/rejected": -117.01312934027777, "loss": 3.0344, "rewards/chosen": -0.22474472863333567, "rewards/margins": 1.7283889603993248, "rewards/rejected": -1.9531336890326605, "step": 322 }, { "epoch": 10.421052631578947, "grad_norm": 360.0, "kl": 0.0, "learning_rate": 7.04385542035506e-07, "logits/chosen": -67899150.22222222, "logits/rejected": -99048669.4054054, "logps/chosen": -255.69565610532408, "logps/rejected": -134.87616131756758, "loss": 2.9983, "rewards/chosen": -0.6182486216227213, "rewards/margins": 1.6043022602528065, "rewards/rejected": -2.2225508818755277, "step": 323 }, { "epoch": 10.45344129554656, "grad_norm": 167.0, "kl": 0.0, "learning_rate": 7.013308568302854e-07, "logits/chosen": -53443707.733333334, "logits/rejected": -96427309.1764706, "logps/chosen": -255.70485026041666, "logps/rejected": -111.05091050091912, "loss": 2.9387, "rewards/chosen": -0.15404144922892252, "rewards/margins": 2.082700907015333, "rewards/rejected": -2.2367423562442554, "step": 324 }, { "epoch": 10.48582995951417, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 6.982671763850814e-07, "logits/chosen": -57626496.0, "logits/rejected": -99155147.48717949, "logps/chosen": -239.3466015625, "logps/rejected": -149.0160632011218, "loss": 2.9875, "rewards/chosen": -0.574769287109375, "rewards/margins": 1.700430211776342, "rewards/rejected": -2.275199498885717, "step": 325 }, { "epoch": 10.518218623481781, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 6.951946375817474e-07, "logits/chosen": -64868639.03030303, "logits/rejected": -103222321.5483871, "logps/chosen": -247.205078125, "logps/rejected": -144.45980342741936, "loss": 2.976, "rewards/chosen": -0.4417598608768348, "rewards/margins": 2.030354456001945, "rewards/rejected": -2.47211431687878, "step": 326 }, { "epoch": 10.550607287449393, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 6.921133776979186e-07, "logits/chosen": -65694720.0, "logits/rejected": -99278219.63636364, "logps/chosen": -176.5769279233871, "logps/rejected": -152.66569010416666, "loss": 3.0862, "rewards/chosen": -0.2129576744571809, "rewards/margins": 2.0058928948343673, "rewards/rejected": -2.2188505692915483, "step": 327 }, { "epoch": 10.582995951417004, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 6.890235344008781e-07, "logits/chosen": -58686329.2631579, "logits/rejected": -96480009.84615384, "logps/chosen": -247.99038856907896, "logps/rejected": -133.462890625, "loss": 3.0455, "rewards/chosen": -0.2542699512682463, "rewards/margins": 1.694520378884999, "rewards/rejected": -1.9487903301532452, "step": 328 }, { "epoch": 10.615384615384615, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 6.859252457414066e-07, "logits/chosen": -66534960.0, "logits/rejected": -106542016.0, "logps/chosen": -252.64999389648438, "logps/rejected": -146.95968627929688, "loss": 2.9967, "rewards/chosen": -0.2267126739025116, "rewards/margins": 2.3421192467212677, "rewards/rejected": -2.5688319206237793, "step": 329 }, { "epoch": 10.647773279352228, "grad_norm": 152.0, "kl": 0.0, "learning_rate": 6.828186501476144e-07, "logits/chosen": -76715066.51428571, "logits/rejected": -96186862.34482759, "logps/chosen": -177.94132254464284, "logps/rejected": -146.52976831896552, "loss": 3.0052, "rewards/chosen": -1.1112047467912947, "rewards/margins": 0.9823705663821969, "rewards/rejected": -2.0935753131734915, "step": 330 }, { "epoch": 10.680161943319838, "grad_norm": 184.0, "kl": 0.3870738744735718, "learning_rate": 6.797038864187563e-07, "logits/chosen": -68528949.89473684, "logits/rejected": -97728482.46153846, "logps/chosen": -212.32182874177633, "logps/rejected": -129.7121863731971, "loss": 3.0682, "rewards/chosen": -0.44005047647576584, "rewards/margins": 1.9492892639839696, "rewards/rejected": -2.3893397404597354, "step": 331 }, { "epoch": 10.712550607287449, "grad_norm": 236.0, "kl": 0.0, "learning_rate": 6.765810937190306e-07, "logits/chosen": -68265472.0, "logits/rejected": -95188264.96, "logps/chosen": -156.35124949919873, "logps/rejected": -115.59439453125, "loss": 2.999, "rewards/chosen": -0.46611477778508115, "rewards/margins": 1.7118698413555438, "rewards/rejected": -2.177984619140625, "step": 332 }, { "epoch": 10.744939271255062, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 6.734504115713602e-07, "logits/chosen": -71831984.0, "logits/rejected": -97308848.0, "logps/chosen": -178.6875762939453, "logps/rejected": -142.67935180664062, "loss": 3.021, "rewards/chosen": -0.7773687243461609, "rewards/margins": 1.6796627640724182, "rewards/rejected": -2.457031488418579, "step": 333 }, { "epoch": 10.777327935222672, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 6.703119798511612e-07, "logits/chosen": -65919126.5882353, "logits/rejected": -92876680.53333333, "logps/chosen": -220.09176815257354, "logps/rejected": -131.73116861979167, "loss": 3.0617, "rewards/chosen": -0.3073549551122329, "rewards/margins": 1.9691472726709702, "rewards/rejected": -2.276502227783203, "step": 334 }, { "epoch": 10.809716599190283, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 6.671659387800908e-07, "logits/chosen": -80524438.06896552, "logits/rejected": -97971295.08571428, "logps/chosen": -201.07888267780172, "logps/rejected": -146.3240234375, "loss": 2.9973, "rewards/chosen": -1.2246241076239224, "rewards/margins": 0.8243818912600063, "rewards/rejected": -2.0490059988839286, "step": 335 }, { "epoch": 10.842105263157894, "grad_norm": 181.0, "kl": 0.40821921825408936, "learning_rate": 6.640124289197845e-07, "logits/chosen": -61865056.0, "logits/rejected": -104776096.0, "logps/chosen": -258.7056579589844, "logps/rejected": -126.95762634277344, "loss": 3.0811, "rewards/chosen": -0.21685275435447693, "rewards/margins": 2.2550927698612213, "rewards/rejected": -2.4719455242156982, "step": 336 }, { "epoch": 10.874493927125506, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 6.608515911655743e-07, "logits/chosen": -59061923.55555555, "logits/rejected": -96263698.28571428, "logps/chosen": -247.29961480034723, "logps/rejected": -136.06971958705358, "loss": 3.0953, "rewards/chosen": -0.10929477214813232, "rewards/margins": 2.150621669633048, "rewards/rejected": -2.25991644178118, "step": 337 }, { "epoch": 10.906882591093117, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 6.576835667401952e-07, "logits/chosen": -58525066.24, "logits/rejected": -102586696.20512821, "logps/chosen": -258.4833984375, "logps/rejected": -154.0860877403846, "loss": 2.8202, "rewards/chosen": -0.39045482635498047, "rewards/margins": 2.2903876466017503, "rewards/rejected": -2.680842472956731, "step": 338 }, { "epoch": 10.939271255060728, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 6.545084971874736e-07, "logits/chosen": -77362176.0, "logits/rejected": -99851378.7586207, "logps/chosen": -178.48032924107142, "logps/rejected": -139.7308560075431, "loss": 3.0113, "rewards/chosen": -0.5124941689627511, "rewards/margins": 1.7341473189480787, "rewards/rejected": -2.24664148791083, "step": 339 }, { "epoch": 10.97165991902834, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 6.513265243660057e-07, "logits/chosen": -71298962.96296297, "logits/rejected": -100436203.24324325, "logps/chosen": -171.26258680555554, "logps/rejected": -137.03676625844594, "loss": 2.9469, "rewards/chosen": -0.5735058961091218, "rewards/margins": 1.6741628083619506, "rewards/rejected": -2.2476687044710726, "step": 340 }, { "epoch": 11.0, "grad_norm": 174.0, "kl": 0.0, "learning_rate": 6.48137790442817e-07, "logits/chosen": -57239383.578947365, "logits/rejected": -97043244.52173913, "logps/chosen": -136.87861071134867, "logps/rejected": -124.2124660326087, "loss": 3.0227, "rewards/chosen": -0.9057742670962685, "rewards/margins": 1.2572469864066187, "rewards/rejected": -2.163021253502887, "step": 341 }, { "epoch": 11.03238866396761, "grad_norm": 210.0, "kl": 0.0, "learning_rate": 6.449424378870122e-07, "logits/chosen": -61877609.4117647, "logits/rejected": -95455402.66666667, "logps/chosen": -215.1293227251838, "logps/rejected": -111.83929036458333, "loss": 3.0175, "rewards/chosen": -0.43482617770924287, "rewards/margins": 1.3257803337246765, "rewards/rejected": -1.7606065114339193, "step": 342 }, { "epoch": 11.064777327935223, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 6.417406094634089e-07, "logits/chosen": -66498650.35294118, "logits/rejected": -87018538.66666667, "logps/chosen": -229.10110294117646, "logps/rejected": -111.25556640625, "loss": 3.0131, "rewards/chosen": -0.5150904935948989, "rewards/margins": 0.9773019828048407, "rewards/rejected": -1.4923924763997396, "step": 343 }, { "epoch": 11.097165991902834, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 6.385324482261596e-07, "logits/chosen": -66936622.93333333, "logits/rejected": -103119728.94117647, "logps/chosen": -220.70305989583332, "logps/rejected": -134.835693359375, "loss": 2.9157, "rewards/chosen": -0.3614577611287435, "rewards/margins": 2.169866737664915, "rewards/rejected": -2.531324498793658, "step": 344 }, { "epoch": 11.129554655870445, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 6.353180975123594e-07, "logits/chosen": -66439358.35897436, "logits/rejected": -97964892.16, "logps/chosen": -237.02393830128204, "logps/rejected": -124.96294921875, "loss": 3.1997, "rewards/chosen": -0.25580261915158004, "rewards/margins": 1.8197936943249824, "rewards/rejected": -2.0755963134765625, "step": 345 }, { "epoch": 11.161943319838057, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 6.32097700935643e-07, "logits/chosen": -56968518.62068965, "logits/rejected": -97509844.11428571, "logps/chosen": -210.48178205818965, "logps/rejected": -132.1926478794643, "loss": 2.9007, "rewards/chosen": -0.11834272845038052, "rewards/margins": 2.0469949703498425, "rewards/rejected": -2.165337698800223, "step": 346 }, { "epoch": 11.194331983805668, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 6.288714023797671e-07, "logits/chosen": -54813060.12903226, "logits/rejected": -93902157.57575758, "logps/chosen": -158.27471333165323, "logps/rejected": -117.26080137310606, "loss": 3.0155, "rewards/chosen": -0.006054014928879276, "rewards/margins": 2.21465095962131, "rewards/rejected": -2.2207049745501894, "step": 347 }, { "epoch": 11.226720647773279, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 6.256393459921824e-07, "logits/chosen": -64142324.36363637, "logits/rejected": -90090743.74193548, "logps/chosen": -149.2150361032197, "logps/rejected": -149.71875, "loss": 2.9987, "rewards/chosen": -0.44757247693610913, "rewards/margins": 1.689944898394662, "rewards/rejected": -2.1375173753307712, "step": 348 }, { "epoch": 11.259109311740891, "grad_norm": 169.0, "kl": 0.0, "learning_rate": 6.224016761775932e-07, "logits/chosen": -51707489.10344828, "logits/rejected": -99570278.4, "logps/chosen": -236.94027815193965, "logps/rejected": -137.37063337053573, "loss": 2.9742, "rewards/chosen": -0.05518572905967976, "rewards/margins": 2.3880550633510347, "rewards/rejected": -2.4432407924107142, "step": 349 }, { "epoch": 11.291497975708502, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 6.191585375915055e-07, "logits/chosen": -75037689.26315789, "logits/rejected": -95690781.53846154, "logps/chosen": -187.18649773848685, "logps/rejected": -149.31526066706732, "loss": 3.0458, "rewards/chosen": -0.7700092917994449, "rewards/margins": 1.6632817766444403, "rewards/rejected": -2.4332910684438853, "step": 350 }, { "epoch": 11.323886639676113, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 6.159100751337641e-07, "logits/chosen": -70969266.08695652, "logits/rejected": -97567194.53658536, "logps/chosen": -151.87946883491847, "logps/rejected": -124.0783036394817, "loss": 2.8333, "rewards/chosen": -1.1066370425016985, "rewards/margins": 1.3070335671353062, "rewards/rejected": -2.4136706096370046, "step": 351 }, { "epoch": 11.356275303643725, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 6.126564339420783e-07, "logits/chosen": -61596979.2, "logits/rejected": -92134859.03448276, "logps/chosen": -195.90521763392857, "logps/rejected": -139.67433324353448, "loss": 3.037, "rewards/chosen": -0.251210457938058, "rewards/margins": 1.9219487796276074, "rewards/rejected": -2.1731592375656654, "step": 352 }, { "epoch": 11.388663967611336, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 6.093977593855375e-07, "logits/chosen": -61521147.428571425, "logits/rejected": -92662428.44444445, "logps/chosen": -285.13204520089283, "logps/rejected": -117.372802734375, "loss": 3.0021, "rewards/chosen": -0.2301291057041713, "rewards/margins": 1.7589708207145571, "rewards/rejected": -1.9890999264187283, "step": 353 }, { "epoch": 11.421052631578947, "grad_norm": 334.0, "kl": 0.0, "learning_rate": 6.061341970581164e-07, "logits/chosen": -67998326.51851852, "logits/rejected": -99411552.86486487, "logps/chosen": -255.69585503472223, "logps/rejected": -135.5848949535473, "loss": 2.9729, "rewards/chosen": -0.618268472177011, "rewards/margins": 1.6751539174978203, "rewards/rejected": -2.293422389674831, "step": 354 }, { "epoch": 11.45344129554656, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 6.028658927721697e-07, "logits/chosen": -53487547.733333334, "logits/rejected": -96852291.76470588, "logps/chosen": -255.83855794270832, "logps/rejected": -111.61545697380515, "loss": 2.9072, "rewards/chosen": -0.1674118995666504, "rewards/margins": 2.125784385905546, "rewards/rejected": -2.2931962854721966, "step": 355 }, { "epoch": 11.48582995951417, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 5.99592992551918e-07, "logits/chosen": -57599313.92, "logits/rejected": -99410294.15384616, "logps/chosen": -239.4209765625, "logps/rejected": -149.71891276041666, "loss": 2.9702, "rewards/chosen": -0.5822079086303711, "rewards/margins": 1.763276702685234, "rewards/rejected": -2.345484611315605, "step": 356 }, { "epoch": 11.518218623481781, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 5.963156426269227e-07, "logits/chosen": -64905968.484848484, "logits/rejected": -103663070.96774194, "logps/chosen": -247.22037760416666, "logps/rejected": -144.78213205645162, "loss": 2.959, "rewards/chosen": -0.4432914618289832, "rewards/margins": 2.061058769710835, "rewards/rejected": -2.5043502315398185, "step": 357 }, { "epoch": 11.550607287449393, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 5.930339894255532e-07, "logits/chosen": -65530826.32258064, "logits/rejected": -99848137.6969697, "logps/chosen": -176.7787613407258, "logps/rejected": -153.20124585700756, "loss": 3.0479, "rewards/chosen": -0.2331395149230957, "rewards/margins": 2.039268479202733, "rewards/rejected": -2.2724079941258286, "step": 358 }, { "epoch": 11.582995951417004, "grad_norm": 164.0, "kl": 0.0, "learning_rate": 5.897481795684446e-07, "logits/chosen": -59134396.631578945, "logits/rejected": -96720640.0, "logps/chosen": -248.07285670230263, "logps/rejected": -133.74541766826923, "loss": 3.0121, "rewards/chosen": -0.26251848120438426, "rewards/margins": 1.7145262845614662, "rewards/rejected": -1.9770447657658503, "step": 359 }, { "epoch": 11.615384615384615, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 5.864583598619467e-07, "logits/chosen": -66893916.0, "logits/rejected": -106991936.0, "logps/chosen": -252.66595458984375, "logps/rejected": -147.30953979492188, "loss": 2.9848, "rewards/chosen": -0.22830769419670105, "rewards/margins": 2.3755078613758087, "rewards/rejected": -2.6038155555725098, "step": 360 }, { "epoch": 11.647773279352228, "grad_norm": 167.0, "kl": 0.0, "learning_rate": 5.83164677291565e-07, "logits/chosen": -77062253.71428572, "logits/rejected": -96580934.62068966, "logps/chosen": -178.29112723214286, "logps/rejected": -147.08740234375, "loss": 2.9673, "rewards/chosen": -1.1461866106305802, "rewards/margins": 1.0031522102543875, "rewards/rejected": -2.1493388208849677, "step": 361 }, { "epoch": 11.680161943319838, "grad_norm": 154.0, "kl": 0.302021861076355, "learning_rate": 5.798672790153937e-07, "logits/chosen": -68697734.73684211, "logits/rejected": -97920275.6923077, "logps/chosen": -212.50377775493422, "logps/rejected": -130.27116511418268, "loss": 3.0491, "rewards/chosen": -0.4582456287584807, "rewards/margins": 1.9869927780830907, "rewards/rejected": -2.4452384068415713, "step": 362 }, { "epoch": 11.712550607287449, "grad_norm": 222.0, "kl": 0.0, "learning_rate": 5.7656631235754e-07, "logits/chosen": -68498136.61538461, "logits/rejected": -95319818.24, "logps/chosen": -156.39588341346155, "logps/rejected": -116.028857421875, "loss": 2.999, "rewards/chosen": -0.47057851155598956, "rewards/margins": 1.7508523050944012, "rewards/rejected": -2.2214308166503907, "step": 363 }, { "epoch": 11.744939271255062, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 5.732619248015434e-07, "logits/chosen": -71797296.0, "logits/rejected": -97528544.0, "logps/chosen": -178.90850830078125, "logps/rejected": -143.28900146484375, "loss": 2.9969, "rewards/chosen": -0.7994619011878967, "rewards/margins": 1.718535840511322, "rewards/rejected": -2.5179977416992188, "step": 364 }, { "epoch": 11.777327935222672, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 5.699542639837843e-07, "logits/chosen": -65975868.23529412, "logits/rejected": -93179699.2, "logps/chosen": -220.08093979779412, "logps/rejected": -131.855908203125, "loss": 3.0389, "rewards/chosen": -0.3062733762404498, "rewards/margins": 1.982702911601347, "rewards/rejected": -2.288976287841797, "step": 365 }, { "epoch": 11.809716599190283, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 5.666434776868895e-07, "logits/chosen": -80727357.79310344, "logits/rejected": -98277039.54285714, "logps/chosen": -201.17682516163794, "logps/rejected": -146.57950613839284, "loss": 2.9803, "rewards/chosen": -1.234417882458917, "rewards/margins": 0.8401341630907482, "rewards/rejected": -2.074552045549665, "step": 366 }, { "epoch": 11.842105263157894, "grad_norm": 172.0, "kl": 0.39742761850357056, "learning_rate": 5.633297138331284e-07, "logits/chosen": -61957560.0, "logits/rejected": -105042832.0, "logps/chosen": -258.6969299316406, "logps/rejected": -127.34425354003906, "loss": 3.0461, "rewards/chosen": -0.21598157286643982, "rewards/margins": 2.2946256697177887, "rewards/rejected": -2.5106072425842285, "step": 367 }, { "epoch": 11.874493927125506, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 5.600131204778048e-07, "logits/chosen": -59192512.0, "logits/rejected": -96605540.57142857, "logps/chosen": -247.47710503472223, "logps/rejected": -136.70819091796875, "loss": 3.0603, "rewards/chosen": -0.12704458501603869, "rewards/margins": 2.1967193985742233, "rewards/rejected": -2.323763983590262, "step": 368 }, { "epoch": 11.906882591093117, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 5.56693845802641e-07, "logits/chosen": -58737807.36, "logits/rejected": -103143082.66666667, "logps/chosen": -258.499140625, "logps/rejected": -154.24395282451923, "loss": 2.8116, "rewards/chosen": -0.3920318603515625, "rewards/margins": 2.304597394894331, "rewards/rejected": -2.6966292552458935, "step": 369 }, { "epoch": 11.939271255060728, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 5.533720381091582e-07, "logits/chosen": -77575789.71428572, "logits/rejected": -100264156.68965517, "logps/chosen": -178.55064174107142, "logps/rejected": -140.40645204741378, "loss": 2.9824, "rewards/chosen": -0.5195248740059989, "rewards/margins": 1.7946775333047502, "rewards/rejected": -2.314202407310749, "step": 370 }, { "epoch": 11.97165991902834, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 5.500478458120493e-07, "logits/chosen": -71578173.62962963, "logits/rejected": -100940066.5945946, "logps/chosen": -171.4014576099537, "logps/rejected": -137.14829233530406, "loss": 2.915, "rewards/chosen": -0.5873909349794741, "rewards/margins": 1.6714298822977642, "rewards/rejected": -2.2588208172772384, "step": 371 }, { "epoch": 12.0, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 5.467214174325493e-07, "logits/chosen": -57407784.421052635, "logits/rejected": -97183799.6521739, "logps/chosen": -137.0067716899671, "logps/rejected": -124.547119140625, "loss": 2.9997, "rewards/chosen": -0.9185907464278372, "rewards/margins": 1.2778963486171695, "rewards/rejected": -2.196487095045007, "step": 372 }, { "epoch": 12.03238866396761, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 5.433929015917988e-07, "logits/chosen": -62205601.88235294, "logits/rejected": -95745962.66666667, "logps/chosen": -215.14547909007354, "logps/rejected": -112.37322591145833, "loss": 3.0145, "rewards/chosen": -0.43644265567555146, "rewards/margins": 1.3775575376024434, "rewards/rejected": -1.8140001932779948, "step": 373 }, { "epoch": 12.064777327935223, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 5.400624470042037e-07, "logits/chosen": -66520688.941176474, "logits/rejected": -87154321.06666666, "logps/chosen": -229.30230353860293, "logps/rejected": -111.72171223958334, "loss": 2.9927, "rewards/chosen": -0.5352107777315027, "rewards/margins": 1.0037967906278722, "rewards/rejected": -1.539007568359375, "step": 374 }, { "epoch": 12.097165991902834, "grad_norm": 215.0, "kl": 0.0, "learning_rate": 5.36730202470791e-07, "logits/chosen": -67337612.8, "logits/rejected": -103419407.05882353, "logps/chosen": -220.56637369791667, "logps/rejected": -135.18458467371323, "loss": 2.9151, "rewards/chosen": -0.3477913538614909, "rewards/margins": 2.218422814911487, "rewards/rejected": -2.566214168772978, "step": 375 }, { "epoch": 12.129554655870445, "grad_norm": 254.0, "kl": 0.0, "learning_rate": 5.333963168725609e-07, "logits/chosen": -66774317.94871795, "logits/rejected": -98000967.68, "logps/chosen": -237.08841646634616, "logps/rejected": -125.35671875, "loss": 3.1906, "rewards/chosen": -0.26224940862411106, "rewards/margins": 1.8527251091981545, "rewards/rejected": -2.1149745178222656, "step": 376 }, { "epoch": 12.161943319838057, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 5.300609391638335e-07, "logits/chosen": -56944269.24137931, "logits/rejected": -97837143.77142857, "logps/chosen": -210.49350080818965, "logps/rejected": -132.54422433035714, "loss": 2.8653, "rewards/chosen": -0.11951449821735251, "rewards/margins": 2.080981194444478, "rewards/rejected": -2.20049569266183, "step": 377 }, { "epoch": 12.194331983805668, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 5.267242183655961e-07, "logits/chosen": -54866456.77419355, "logits/rejected": -94181081.21212122, "logps/chosen": -158.11824281754033, "logps/rejected": -117.57677112926136, "loss": 2.9811, "rewards/chosen": 0.009594423155630789, "rewards/margins": 2.261896650753879, "rewards/rejected": -2.252302227598248, "step": 378 }, { "epoch": 12.226720647773279, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 5.233863035588426e-07, "logits/chosen": -64303697.45454545, "logits/rejected": -90242295.74193548, "logps/chosen": -149.16764322916666, "logps/rejected": -150.07774697580646, "loss": 2.9796, "rewards/chosen": -0.4428321664983576, "rewards/margins": 1.7305844634159568, "rewards/rejected": -2.1734166299143145, "step": 379 }, { "epoch": 12.259109311740891, "grad_norm": 222.0, "kl": 0.0, "learning_rate": 5.200473438779146e-07, "logits/chosen": -52087825.655172415, "logits/rejected": -99907649.82857142, "logps/chosen": -237.14458176185346, "logps/rejected": -137.6441685267857, "loss": 2.9653, "rewards/chosen": -0.0756178724354711, "rewards/margins": 2.394976697179484, "rewards/rejected": -2.4705945696149554, "step": 380 }, { "epoch": 12.291497975708502, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 5.167074885038372e-07, "logits/chosen": -75272373.89473684, "logits/rejected": -96073432.61538461, "logps/chosen": -187.40669973273026, "logps/rejected": -150.01200984074518, "loss": 3.0149, "rewards/chosen": -0.7920285777041787, "rewards/margins": 1.710938376453724, "rewards/rejected": -2.5029669541579027, "step": 381 }, { "epoch": 12.323886639676113, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 5.133668866576544e-07, "logits/chosen": -71279504.69565217, "logits/rejected": -97946786.34146342, "logps/chosen": -151.93790336277175, "logps/rejected": -124.80961794969512, "loss": 2.7985, "rewards/chosen": -1.1124801635742188, "rewards/margins": 1.3743215421350992, "rewards/rejected": -2.486801705709318, "step": 382 }, { "epoch": 12.356275303643725, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 5.100256875937613e-07, "logits/chosen": -61767467.885714285, "logits/rejected": -92229190.62068966, "logps/chosen": -195.62333984375, "logps/rejected": -140.0376986799569, "loss": 3.0236, "rewards/chosen": -0.22302398681640626, "rewards/margins": 1.9864712945346175, "rewards/rejected": -2.209495281351024, "step": 383 }, { "epoch": 12.388663967611336, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 5.066840405932363e-07, "logits/chosen": -61738985.14285714, "logits/rejected": -92859832.8888889, "logps/chosen": -285.20682198660717, "logps/rejected": -117.97832573784723, "loss": 2.9566, "rewards/chosen": -0.2376061167035784, "rewards/margins": 1.8120476783268038, "rewards/rejected": -2.049653795030382, "step": 384 }, { "epoch": 12.421052631578947, "grad_norm": 216.0, "kl": 0.0, "learning_rate": 5.033420949571712e-07, "logits/chosen": -67980837.92592593, "logits/rejected": -99720150.48648648, "logps/chosen": -255.52560763888889, "logps/rejected": -136.02293602195945, "loss": 2.9316, "rewards/chosen": -0.6012460214120371, "rewards/margins": 1.7359806400638917, "rewards/rejected": -2.337226661475929, "step": 385 }, { "epoch": 12.45344129554656, "grad_norm": 157.0, "kl": 0.0, "learning_rate": 5e-07, "logits/chosen": -53652727.46666667, "logits/rejected": -96935988.70588236, "logps/chosen": -255.8826171875, "logps/rejected": -111.8104607077206, "loss": 2.8778, "rewards/chosen": -0.17181793848673502, "rewards/margins": 2.14087885501338, "rewards/rejected": -2.312696793500115, "step": 386 }, { "epoch": 12.48582995951417, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 4.96657905042829e-07, "logits/chosen": -57756974.08, "logits/rejected": -99834486.15384616, "logps/chosen": -239.43640625, "logps/rejected": -150.11487129407053, "loss": 2.9352, "rewards/chosen": -0.5837510299682617, "rewards/margins": 1.8013283783350236, "rewards/rejected": -2.385079408303285, "step": 387 }, { "epoch": 12.518218623481781, "grad_norm": 174.0, "kl": 0.0, "learning_rate": 4.933159594067636e-07, "logits/chosen": -65163364.84848485, "logits/rejected": -103761845.67741935, "logps/chosen": -247.18761837121212, "logps/rejected": -145.4609847530242, "loss": 2.9628, "rewards/chosen": -0.44001206484707917, "rewards/margins": 2.1322240270128, "rewards/rejected": -2.572236091859879, "step": 388 }, { "epoch": 12.550607287449393, "grad_norm": 215.0, "kl": 0.0, "learning_rate": 4.899743124062388e-07, "logits/chosen": -65673224.258064516, "logits/rejected": -99906544.48484848, "logps/chosen": -176.6609122983871, "logps/rejected": -153.7152284564394, "loss": 3.0232, "rewards/chosen": -0.22135422306676064, "rewards/margins": 2.102451383659684, "rewards/rejected": -2.3238056067264443, "step": 389 }, { "epoch": 12.582995951417004, "grad_norm": 149.0, "kl": 0.0, "learning_rate": 4.866331133423456e-07, "logits/chosen": -59082826.10526316, "logits/rejected": -96785811.6923077, "logps/chosen": -248.17495888157896, "logps/rejected": -134.40794020432693, "loss": 2.9573, "rewards/chosen": -0.2727272385045102, "rewards/margins": 1.7705692820220826, "rewards/rejected": -2.0432965205265927, "step": 390 }, { "epoch": 12.615384615384615, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 4.832925114961628e-07, "logits/chosen": -66944908.0, "logits/rejected": -107288816.0, "logps/chosen": -252.65260314941406, "logps/rejected": -147.99545288085938, "loss": 2.9563, "rewards/chosen": -0.22697299718856812, "rewards/margins": 2.445434868335724, "rewards/rejected": -2.672407865524292, "step": 391 }, { "epoch": 12.647773279352228, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 4.799526561220855e-07, "logits/chosen": -77148218.51428571, "logits/rejected": -96747105.10344827, "logps/chosen": -178.52057756696428, "logps/rejected": -147.5990369073276, "loss": 2.9546, "rewards/chosen": -1.1691302708217075, "rewards/margins": 1.031372453661388, "rewards/rejected": -2.2005027244830955, "step": 392 }, { "epoch": 12.680161943319838, "grad_norm": 165.0, "kl": 0.3369103670120239, "learning_rate": 4.766136964411575e-07, "logits/chosen": -68810549.89473684, "logits/rejected": -98137846.15384616, "logps/chosen": -212.39981239720396, "logps/rejected": -130.5043006310096, "loss": 3.041, "rewards/chosen": -0.4478492736816406, "rewards/margins": 2.0207017751840444, "rewards/rejected": -2.468551048865685, "step": 393 }, { "epoch": 12.712550607287449, "grad_norm": 239.0, "kl": 0.0, "learning_rate": 4.7327578163440397e-07, "logits/chosen": -68761127.38461539, "logits/rejected": -95771729.92, "logps/chosen": -156.4126477363782, "logps/rejected": -116.554169921875, "loss": 2.9397, "rewards/chosen": -0.47225526662973255, "rewards/margins": 1.80170774606558, "rewards/rejected": -2.2739630126953125, "step": 394 }, { "epoch": 12.744939271255062, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 4.699390608361665e-07, "logits/chosen": -72064720.0, "logits/rejected": -97630952.0, "logps/chosen": -179.0907440185547, "logps/rejected": -143.79690551757812, "loss": 2.9713, "rewards/chosen": -0.8176858425140381, "rewards/margins": 1.7510998249053955, "rewards/rejected": -2.5687856674194336, "step": 395 }, { "epoch": 12.777327935222672, "grad_norm": 222.0, "kl": 0.0, "learning_rate": 4.666036831274392e-07, "logits/chosen": -66282330.35294118, "logits/rejected": -93253836.8, "logps/chosen": -220.14513442095588, "logps/rejected": -132.57906901041667, "loss": 3.005, "rewards/chosen": -0.3126918568330653, "rewards/margins": 2.0485999013863356, "rewards/rejected": -2.361291758219401, "step": 396 }, { "epoch": 12.809716599190283, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 4.63269797529209e-07, "logits/chosen": -81224995.31034483, "logits/rejected": -98459845.48571429, "logps/chosen": -201.45218211206895, "logps/rejected": -147.26661551339285, "loss": 2.9455, "rewards/chosen": -1.2619523673221982, "rewards/margins": 0.8813104413413064, "rewards/rejected": -2.1432628086635046, "step": 397 }, { "epoch": 12.842105263157894, "grad_norm": 169.0, "kl": 0.367470920085907, "learning_rate": 4.599375529957962e-07, "logits/chosen": -62049564.0, "logits/rejected": -105151920.0, "logps/chosen": -258.6617736816406, "logps/rejected": -127.78885650634766, "loss": 3.0405, "rewards/chosen": -0.2124641239643097, "rewards/margins": 2.342603415250778, "rewards/rejected": -2.555067539215088, "step": 398 }, { "epoch": 12.874493927125506, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 4.566070984082013e-07, "logits/chosen": -59460174.222222224, "logits/rejected": -96761700.57142857, "logps/chosen": -247.31312391493054, "logps/rejected": -136.95270647321428, "loss": 3.0286, "rewards/chosen": -0.11064546638064915, "rewards/margins": 2.2375704542038934, "rewards/rejected": -2.3482159205845425, "step": 399 }, { "epoch": 12.906882591093117, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 4.5327858256745065e-07, "logits/chosen": -58901780.48, "logits/rejected": -103169772.3076923, "logps/chosen": -258.33087890625, "logps/rejected": -154.88575470753204, "loss": 2.7745, "rewards/chosen": -0.375203971862793, "rewards/margins": 2.385605315183982, "rewards/rejected": -2.760809287046775, "step": 400 }, { "epoch": 12.906882591093117, "eval_kl": 0.0, "eval_logits/chosen": -80514820.38267875, "eval_logits/rejected": -122508565.95761856, "eval_logps/chosen": -211.8609327794562, "eval_logps/rejected": -134.68407858224018, "eval_loss": 0.26694580912590027, "eval_rewards/chosen": -0.454732935234139, "eval_rewards/margins": 1.8590146208960325, "eval_rewards/rejected": -2.3137475561301715, "eval_runtime": 64.2083, "eval_samples_per_second": 30.697, "eval_steps_per_second": 0.966, "step": 400 }, { "epoch": 12.939271255060728, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 4.499521541879508e-07, "logits/chosen": -77762464.91428572, "logits/rejected": -100488412.68965517, "logps/chosen": -178.36229073660715, "logps/rejected": -140.72516500538794, "loss": 2.9719, "rewards/chosen": -0.5006898607526507, "rewards/margins": 1.84538289525826, "rewards/rejected": -2.3460727560109107, "step": 401 }, { "epoch": 12.97165991902834, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 4.466279618908419e-07, "logits/chosen": -71628984.8888889, "logits/rejected": -101016506.8108108, "logps/chosen": -171.49994574652777, "logps/rejected": -137.81358213682432, "loss": 2.886, "rewards/chosen": -0.5972412250660084, "rewards/margins": 1.728110180722104, "rewards/rejected": -2.3253514057881124, "step": 402 }, { "epoch": 13.0, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 4.43306154197359e-07, "logits/chosen": -57514947.368421055, "logits/rejected": -97582647.6521739, "logps/chosen": -137.25752981085526, "logps/rejected": -124.87584918478261, "loss": 2.9705, "rewards/chosen": -0.9436655546489515, "rewards/margins": 1.2856941135851687, "rewards/rejected": -2.22935966823412, "step": 403 }, { "epoch": 13.03238866396761, "grad_norm": 212.0, "kl": 0.0, "learning_rate": 4.399868795221951e-07, "logits/chosen": -62363561.4117647, "logits/rejected": -96009873.06666666, "logps/chosen": -215.01226447610293, "logps/rejected": -112.50869140625, "loss": 2.9889, "rewards/chosen": -0.42311934863819795, "rewards/margins": 1.4044259445340026, "rewards/rejected": -1.8275452931722005, "step": 404 }, { "epoch": 13.064777327935223, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 4.3667028616687156e-07, "logits/chosen": -66683655.52941176, "logits/rejected": -87116928.0, "logps/chosen": -229.189453125, "logps/rejected": -111.966552734375, "loss": 2.9998, "rewards/chosen": -0.5239222470451804, "rewards/margins": 1.039568556991278, "rewards/rejected": -1.5634908040364583, "step": 405 }, { "epoch": 13.097165991902834, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 4.333565223131107e-07, "logits/chosen": -67220795.73333333, "logits/rejected": -103755565.1764706, "logps/chosen": -220.74016927083332, "logps/rejected": -135.38880112591912, "loss": 2.8688, "rewards/chosen": -0.3651699701944987, "rewards/margins": 2.221465395011154, "rewards/rejected": -2.5866353652056526, "step": 406 }, { "epoch": 13.129554655870445, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 4.3004573601621576e-07, "logits/chosen": -66763953.23076923, "logits/rejected": -98261268.48, "logps/chosen": -237.33042868589743, "logps/rejected": -125.47279296875, "loss": 3.1564, "rewards/chosen": -0.2864517798790565, "rewards/margins": 1.840128725492037, "rewards/rejected": -2.1265805053710936, "step": 407 }, { "epoch": 13.161943319838057, "grad_norm": 164.0, "kl": 0.0, "learning_rate": 4.267380751984567e-07, "logits/chosen": -57138290.75862069, "logits/rejected": -98104619.88571429, "logps/chosen": -210.46821120689654, "logps/rejected": -132.85344587053572, "loss": 2.8493, "rewards/chosen": -0.11698566634079506, "rewards/margins": 2.1144321803388926, "rewards/rejected": -2.2314178466796877, "step": 408 }, { "epoch": 13.194331983805668, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 4.2343368764245994e-07, "logits/chosen": -55141797.161290325, "logits/rejected": -94156396.60606061, "logps/chosen": -158.40467489919354, "logps/rejected": -117.95622484611742, "loss": 2.9667, "rewards/chosen": -0.019050984613357053, "rewards/margins": 2.271195227507855, "rewards/rejected": -2.290246212121212, "step": 409 }, { "epoch": 13.226720647773279, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 4.201327209846065e-07, "logits/chosen": -64434482.42424242, "logits/rejected": -90488105.29032259, "logps/chosen": -149.4125828598485, "logps/rejected": -150.57333669354838, "loss": 2.9511, "rewards/chosen": -0.4673265977339311, "rewards/margins": 1.7556500868363814, "rewards/rejected": -2.2229766845703125, "step": 410 }, { "epoch": 13.259109311740891, "grad_norm": 192.0, "kl": 0.0, "learning_rate": 4.1683532270843495e-07, "logits/chosen": -52033964.137931034, "logits/rejected": -99901571.65714286, "logps/chosen": -236.8319302262931, "logps/rejected": -137.90398995535713, "loss": 2.956, "rewards/chosen": -0.044351536652137494, "rewards/margins": 2.452225480995742, "rewards/rejected": -2.4965770176478794, "step": 411 }, { "epoch": 13.291497975708502, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 4.135416401380534e-07, "logits/chosen": -75395577.26315789, "logits/rejected": -95964288.0, "logps/chosen": -187.52959241365133, "logps/rejected": -150.21195162259616, "loss": 2.9836, "rewards/chosen": -0.8043166712710732, "rewards/margins": 1.7186428178177187, "rewards/rejected": -2.522959489088792, "step": 412 }, { "epoch": 13.323886639676113, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 4.1025182043155545e-07, "logits/chosen": -71402568.3478261, "logits/rejected": -98190410.92682926, "logps/chosen": -152.3931194802989, "logps/rejected": -125.17225609756098, "loss": 2.8058, "rewards/chosen": -1.158001941183339, "rewards/margins": 1.3650637653953577, "rewards/rejected": -2.5230657065786968, "step": 413 }, { "epoch": 13.356275303643725, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 4.069660105744469e-07, "logits/chosen": -61850938.51428571, "logits/rejected": -92347418.48275863, "logps/chosen": -195.61358816964287, "logps/rejected": -140.38009327855605, "loss": 2.9815, "rewards/chosen": -0.22204859597342355, "rewards/margins": 2.02168635570357, "rewards/rejected": -2.2437349516769935, "step": 414 }, { "epoch": 13.388663967611336, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 4.036843573730773e-07, "logits/chosen": -61841362.28571428, "logits/rejected": -93016832.0, "logps/chosen": -285.03006417410717, "logps/rejected": -118.03655327690973, "loss": 2.9634, "rewards/chosen": -0.2199288947241647, "rewards/margins": 1.835546340261187, "rewards/rejected": -2.0554752349853516, "step": 415 }, { "epoch": 13.421052631578947, "grad_norm": 236.0, "kl": 0.0, "learning_rate": 4.0040700744808204e-07, "logits/chosen": -68229584.5925926, "logits/rejected": -100075575.35135135, "logps/chosen": -255.61593967013889, "logps/rejected": -136.12808804898648, "loss": 2.9279, "rewards/chosen": -0.6102768226906106, "rewards/margins": 1.7374660312473118, "rewards/rejected": -2.3477428539379224, "step": 416 }, { "epoch": 13.45344129554656, "grad_norm": 162.0, "kl": 0.0, "learning_rate": 3.9713410722783014e-07, "logits/chosen": -53743496.53333333, "logits/rejected": -97241682.8235294, "logps/chosen": -255.80555013020833, "logps/rejected": -112.13170668658088, "loss": 2.8831, "rewards/chosen": -0.1641114393870036, "rewards/margins": 2.1807109393325503, "rewards/rejected": -2.344822378719554, "step": 417 }, { "epoch": 13.48582995951417, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 3.9386580294188363e-07, "logits/chosen": -58015687.68, "logits/rejected": -99712177.23076923, "logps/chosen": -239.59330078125, "logps/rejected": -150.53117487980768, "loss": 2.9356, "rewards/chosen": -0.5994418334960937, "rewards/margins": 1.8272686376327125, "rewards/rejected": -2.426710471128806, "step": 418 }, { "epoch": 13.518218623481781, "grad_norm": 164.0, "kl": 0.0, "learning_rate": 3.906022406144624e-07, "logits/chosen": -65337879.27272727, "logits/rejected": -104208516.12903225, "logps/chosen": -247.24338600852272, "logps/rejected": -145.6847120715726, "loss": 2.9268, "rewards/chosen": -0.4455898169315223, "rewards/margins": 2.149017874679491, "rewards/rejected": -2.5946076916110132, "step": 419 }, { "epoch": 13.550607287449393, "grad_norm": 203.0, "kl": 0.0, "learning_rate": 3.873435660579217e-07, "logits/chosen": -65686804.64516129, "logits/rejected": -100139101.0909091, "logps/chosen": -176.82069052419354, "logps/rejected": -153.87301728219697, "loss": 3.0137, "rewards/chosen": -0.23733255940098916, "rewards/margins": 2.1022541031226853, "rewards/rejected": -2.3395866625236743, "step": 420 }, { "epoch": 13.582995951417004, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 3.840899248662358e-07, "logits/chosen": -59151023.15789474, "logits/rejected": -97065028.92307693, "logps/chosen": -248.03626130756578, "logps/rejected": -134.3540978064904, "loss": 2.971, "rewards/chosen": -0.25885752627724096, "rewards/margins": 1.7790538154632938, "rewards/rejected": -2.037911341740535, "step": 421 }, { "epoch": 13.615384615384615, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 3.8084146240849453e-07, "logits/chosen": -66869288.0, "logits/rejected": -107588216.0, "logps/chosen": -252.61904907226562, "logps/rejected": -148.1591796875, "loss": 2.9301, "rewards/chosen": -0.22361889481544495, "rewards/margins": 2.465162366628647, "rewards/rejected": -2.688781261444092, "step": 422 }, { "epoch": 13.647773279352228, "grad_norm": 155.0, "kl": 0.0, "learning_rate": 3.77598323822407e-07, "logits/chosen": -77296362.05714285, "logits/rejected": -97172427.03448276, "logps/chosen": -178.31141183035714, "logps/rejected": -147.4927768049569, "loss": 2.9457, "rewards/chosen": -1.1482150486537388, "rewards/margins": 1.0416607184950353, "rewards/rejected": -2.189875767148774, "step": 423 }, { "epoch": 13.680161943319838, "grad_norm": 180.0, "kl": 0.3856762647628784, "learning_rate": 3.743606540078177e-07, "logits/chosen": -68903215.15789473, "logits/rejected": -98439798.15384616, "logps/chosen": -212.71296772203948, "logps/rejected": -130.68912447415866, "loss": 3.0237, "rewards/chosen": -0.4791660810771741, "rewards/margins": 2.0078674694787155, "rewards/rejected": -2.4870335505558896, "step": 424 }, { "epoch": 13.712550607287449, "grad_norm": 233.0, "kl": 0.0, "learning_rate": 3.7112859762023305e-07, "logits/chosen": -68740831.17948718, "logits/rejected": -95837839.36, "logps/chosen": -156.58494841746796, "logps/rejected": -116.330234375, "loss": 2.9363, "rewards/chosen": -0.48948473808092946, "rewards/margins": 1.762084475786258, "rewards/rejected": -2.2515692138671874, "step": 425 }, { "epoch": 13.744939271255062, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 3.67902299064357e-07, "logits/chosen": -72169888.0, "logits/rejected": -97907488.0, "logps/chosen": -179.15684509277344, "logps/rejected": -144.07077026367188, "loss": 2.9644, "rewards/chosen": -0.8242971897125244, "rewards/margins": 1.7718760967254639, "rewards/rejected": -2.5961732864379883, "step": 426 }, { "epoch": 13.777327935222672, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 3.646819024876406e-07, "logits/chosen": -66282797.176470585, "logits/rejected": -93732411.73333333, "logps/chosen": -220.14165900735293, "logps/rejected": -132.7333984375, "loss": 2.9911, "rewards/chosen": -0.312345729154699, "rewards/margins": 2.064380294201421, "rewards/rejected": -2.37672602335612, "step": 427 }, { "epoch": 13.809716599190283, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 3.614675517738405e-07, "logits/chosen": -81145494.06896552, "logits/rejected": -98771792.45714286, "logps/chosen": -201.3799838362069, "logps/rejected": -147.63282645089285, "loss": 2.9303, "rewards/chosen": -1.2547336446827855, "rewards/margins": 0.925151783609625, "rewards/rejected": -2.1798854282924105, "step": 428 }, { "epoch": 13.842105263157894, "grad_norm": 188.0, "kl": 0.38798022270202637, "learning_rate": 3.582593905365912e-07, "logits/chosen": -62279860.0, "logits/rejected": -105478680.0, "logps/chosen": -258.7704772949219, "logps/rejected": -128.01951599121094, "loss": 3.0279, "rewards/chosen": -0.22333630919456482, "rewards/margins": 2.3547967970371246, "rewards/rejected": -2.5781331062316895, "step": 429 }, { "epoch": 13.874493927125506, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 3.5505756211298774e-07, "logits/chosen": -59527082.666666664, "logits/rejected": -96977956.57142857, "logps/chosen": -247.30409071180554, "logps/rejected": -136.9112548828125, "loss": 3.039, "rewards/chosen": -0.10974003209008111, "rewards/margins": 2.2343312199153598, "rewards/rejected": -2.344071252005441, "step": 430 }, { "epoch": 13.906882591093117, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 3.5186220955718303e-07, "logits/chosen": -58701363.2, "logits/rejected": -103313033.84615384, "logps/chosen": -258.4644921875, "logps/rejected": -155.13411458333334, "loss": 2.7653, "rewards/chosen": -0.3885665512084961, "rewards/margins": 2.3970807187985153, "rewards/rejected": -2.7856472700070114, "step": 431 }, { "epoch": 13.939271255060728, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 3.486734756339943e-07, "logits/chosen": -77798129.37142856, "logits/rejected": -100511867.5862069, "logps/chosen": -178.74274553571428, "logps/rejected": -140.58995319234913, "loss": 2.9644, "rewards/chosen": -0.5387344905308314, "rewards/margins": 1.793816610627574, "rewards/rejected": -2.3325511011584052, "step": 432 }, { "epoch": 13.97165991902834, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 3.454915028125263e-07, "logits/chosen": -71885468.44444445, "logits/rejected": -101091120.43243243, "logps/chosen": -171.57468894675927, "logps/rejected": -138.02104888091216, "loss": 2.8645, "rewards/chosen": -0.6047153472900391, "rewards/margins": 1.741381619427655, "rewards/rejected": -2.346096966717694, "step": 433 }, { "epoch": 14.0, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 3.4231643325980485e-07, "logits/chosen": -57590480.84210526, "logits/rejected": -97733409.39130434, "logps/chosen": -136.9785798725329, "logps/rejected": -124.88034986413044, "loss": 2.9596, "rewards/chosen": -0.9157708820543791, "rewards/margins": 1.3140384228878763, "rewards/rejected": -2.2298093049422554, "step": 434 }, { "epoch": 14.03238866396761, "grad_norm": 200.0, "kl": 0.0, "learning_rate": 3.391484088344256e-07, "logits/chosen": -62335006.11764706, "logits/rejected": -96016162.13333334, "logps/chosen": -215.0373965992647, "logps/rejected": -112.78855794270834, "loss": 2.9837, "rewards/chosen": -0.42563365487491384, "rewards/margins": 1.4298984190996955, "rewards/rejected": -1.8555320739746093, "step": 435 }, { "epoch": 14.064777327935223, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 3.359875710802154e-07, "logits/chosen": -66720323.76470588, "logits/rejected": -87461324.8, "logps/chosen": -229.37296070772058, "logps/rejected": -112.27146809895834, "loss": 2.951, "rewards/chosen": -0.5422763263477999, "rewards/margins": 1.0517066244985542, "rewards/rejected": -1.593982950846354, "step": 436 }, { "epoch": 14.097165991902834, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 3.328340612199091e-07, "logits/chosen": -67281425.06666666, "logits/rejected": -103862031.05882353, "logps/chosen": -220.54358723958333, "logps/rejected": -135.44522633272058, "loss": 2.8496, "rewards/chosen": -0.34551318486531574, "rewards/margins": 2.2467647907780663, "rewards/rejected": -2.5922779756433822, "step": 437 }, { "epoch": 14.129554655870445, "grad_norm": 212.0, "kl": 0.0, "learning_rate": 3.296880201488387e-07, "logits/chosen": -66975730.87179487, "logits/rejected": -98592235.52, "logps/chosen": -237.08318309294873, "logps/rejected": -125.760625, "loss": 3.1563, "rewards/chosen": -0.26172520564152646, "rewards/margins": 1.8936396320049578, "rewards/rejected": -2.1553648376464842, "step": 438 }, { "epoch": 14.161943319838057, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 3.2654958842863966e-07, "logits/chosen": -57296750.344827585, "logits/rejected": -98417473.82857142, "logps/chosen": -210.61644665948276, "logps/rejected": -133.2125, "loss": 2.8353, "rewards/chosen": -0.1318079685342723, "rewards/margins": 2.135515770653786, "rewards/rejected": -2.267323739188058, "step": 439 }, { "epoch": 14.194331983805668, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 3.234189062809695e-07, "logits/chosen": -55212981.67741936, "logits/rejected": -94428074.66666667, "logps/chosen": -158.2477791078629, "logps/rejected": -118.16590465198864, "loss": 2.9616, "rewards/chosen": -0.0033594177615258003, "rewards/margins": 2.3078544512755244, "rewards/rejected": -2.3112138690370503, "step": 440 }, { "epoch": 14.226720647773279, "grad_norm": 204.0, "kl": 0.0, "learning_rate": 3.2029611358124365e-07, "logits/chosen": -64443834.18181818, "logits/rejected": -90383673.80645162, "logps/chosen": -149.40093809185606, "logps/rejected": -150.58826864919354, "loss": 2.9449, "rewards/chosen": -0.46616334626168915, "rewards/margins": 1.758306484651239, "rewards/rejected": -2.2244698309129283, "step": 441 }, { "epoch": 14.259109311740891, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 3.171813498523857e-07, "logits/chosen": -52027153.655172415, "logits/rejected": -100159546.51428571, "logps/chosen": -236.96869948814654, "logps/rejected": -138.04977678571427, "loss": 2.943, "rewards/chosen": -0.05802883361947948, "rewards/margins": 2.4531279569776188, "rewards/rejected": -2.511156790597098, "step": 442 }, { "epoch": 14.291497975708502, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 3.1407475425859343e-07, "logits/chosen": -75414339.36842105, "logits/rejected": -96486921.84615384, "logps/chosen": -187.29546155427633, "logps/rejected": -150.39766751802884, "loss": 2.9584, "rewards/chosen": -0.7809036656429893, "rewards/margins": 1.7606292909938797, "rewards/rejected": -2.541532956636869, "step": 443 }, { "epoch": 14.323886639676113, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 3.1097646559912206e-07, "logits/chosen": -71349837.91304348, "logits/rejected": -98318598.24390244, "logps/chosen": -152.32198963994566, "logps/rejected": -125.24911871189025, "loss": 2.7775, "rewards/chosen": -1.1508886917777683, "rewards/margins": 1.379862792418592, "rewards/rejected": -2.5307514841963603, "step": 444 }, { "epoch": 14.356275303643725, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 3.0788662230208145e-07, "logits/chosen": -62047692.8, "logits/rejected": -92549384.8275862, "logps/chosen": -195.6550502232143, "logps/rejected": -140.62846848060346, "loss": 3.0008, "rewards/chosen": -0.2261941637311663, "rewards/margins": 2.042380254961587, "rewards/rejected": -2.2685744186927534, "step": 445 }, { "epoch": 14.388663967611336, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 3.048053624182526e-07, "logits/chosen": -61581298.28571428, "logits/rejected": -93053937.77777778, "logps/chosen": -285.24771554129467, "logps/rejected": -118.25196668836806, "loss": 2.9588, "rewards/chosen": -0.24169693674360002, "rewards/margins": 1.8353205294836135, "rewards/rejected": -2.0770174662272134, "step": 446 }, { "epoch": 14.421052631578947, "grad_norm": 326.0, "kl": 0.0, "learning_rate": 3.017328236149186e-07, "logits/chosen": -68228887.7037037, "logits/rejected": -100119116.1081081, "logps/chosen": -255.85997178819446, "logps/rejected": -136.4729465793919, "loss": 2.9033, "rewards/chosen": -0.6346799355966074, "rewards/margins": 1.7475479878224172, "rewards/rejected": -2.3822279234190247, "step": 447 }, { "epoch": 14.45344129554656, "grad_norm": 174.0, "kl": 0.0, "learning_rate": 2.986691431697148e-07, "logits/chosen": -53854459.733333334, "logits/rejected": -97388167.52941176, "logps/chosen": -255.93505859375, "logps/rejected": -112.32900103400735, "loss": 2.8598, "rewards/chosen": -0.1770633061726888, "rewards/margins": 2.187487564834894, "rewards/rejected": -2.364550871007583, "step": 448 }, { "epoch": 14.48582995951417, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 2.9561445796449414e-07, "logits/chosen": -58128983.04, "logits/rejected": -99960910.76923077, "logps/chosen": -239.3297265625, "logps/rejected": -150.4403295272436, "loss": 2.9055, "rewards/chosen": -0.573082618713379, "rewards/margins": 1.844545547289726, "rewards/rejected": -2.417628166003105, "step": 449 }, { "epoch": 14.518218623481781, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 2.9256890447921315e-07, "logits/chosen": -65360314.18181818, "logits/rejected": -104209267.61290322, "logps/chosen": -247.2750946969697, "logps/rejected": -145.80722341229838, "loss": 2.9257, "rewards/chosen": -0.44876208449854993, "rewards/margins": 2.158096445853643, "rewards/rejected": -2.6068585303521927, "step": 450 }, { "epoch": 14.550607287449393, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 2.895326187858326e-07, "logits/chosen": -65681193.29032258, "logits/rejected": -100278194.42424242, "logps/chosen": -176.77208291330646, "logps/rejected": -153.88634883996212, "loss": 3.0083, "rewards/chosen": -0.2324707277359501, "rewards/margins": 2.108446917342767, "rewards/rejected": -2.340917645078717, "step": 451 }, { "epoch": 14.582995951417004, "grad_norm": 153.0, "kl": 0.0, "learning_rate": 2.865057365422386e-07, "logits/chosen": -59178880.0, "logits/rejected": -97249112.61538461, "logps/chosen": -247.91049033717104, "logps/rejected": -134.51881760817307, "loss": 2.9599, "rewards/chosen": -0.24628260261134097, "rewards/margins": 1.8081014822369161, "rewards/rejected": -2.054384084848257, "step": 452 }, { "epoch": 14.615384615384615, "grad_norm": 169.0, "kl": 0.0, "learning_rate": 2.8348839298618177e-07, "logits/chosen": -66901720.0, "logits/rejected": -107571936.0, "logps/chosen": -252.63880920410156, "logps/rejected": -148.36029052734375, "loss": 2.9447, "rewards/chosen": -0.2255946844816208, "rewards/margins": 2.483295515179634, "rewards/rejected": -2.708890199661255, "step": 453 }, { "epoch": 14.647773279352228, "grad_norm": 163.0, "kl": 0.0, "learning_rate": 2.8048072292923464e-07, "logits/chosen": -77316527.54285714, "logits/rejected": -97096403.86206897, "logps/chosen": -178.4888950892857, "logps/rejected": -147.96482691271552, "loss": 2.9268, "rewards/chosen": -1.1659620012555802, "rewards/margins": 1.0711180137296028, "rewards/rejected": -2.237080014985183, "step": 454 }, { "epoch": 14.680161943319838, "grad_norm": 192.0, "kl": 0.372228741645813, "learning_rate": 2.774828607507683e-07, "logits/chosen": -68992552.42105263, "logits/rejected": -98634801.23076923, "logps/chosen": -212.55967310855263, "logps/rejected": -131.10398512620193, "loss": 3.0125, "rewards/chosen": -0.46383536489386307, "rewards/margins": 2.064683898740452, "rewards/rejected": -2.528519263634315, "step": 455 }, { "epoch": 14.712550607287449, "grad_norm": 234.0, "kl": 0.0, "learning_rate": 2.74494940391949e-07, "logits/chosen": -68925525.33333333, "logits/rejected": -96213186.56, "logps/chosen": -156.44189453125, "logps/rejected": -116.6107421875, "loss": 2.9425, "rewards/chosen": -0.47518011239858776, "rewards/margins": 1.8044396385779748, "rewards/rejected": -2.2796197509765626, "step": 456 }, { "epoch": 14.744939271255062, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 2.715170953497532e-07, "logits/chosen": -72270920.0, "logits/rejected": -97976480.0, "logps/chosen": -179.15814208984375, "logps/rejected": -144.1017608642578, "loss": 2.9455, "rewards/chosen": -0.8244253993034363, "rewards/margins": 1.7748473286628723, "rewards/rejected": -2.5992727279663086, "step": 457 }, { "epoch": 14.777327935222672, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 2.685494586710038e-07, "logits/chosen": -66368609.88235294, "logits/rejected": -93580117.33333333, "logps/chosen": -220.22845818014707, "logps/rejected": -132.73451334635416, "loss": 2.9813, "rewards/chosen": -0.32102542765000286, "rewards/margins": 2.0558107133005183, "rewards/rejected": -2.376836140950521, "step": 458 }, { "epoch": 14.809716599190283, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 2.655921629464245e-07, "logits/chosen": -81319688.8275862, "logits/rejected": -98513598.17142858, "logps/chosen": -201.75286233836206, "logps/rejected": -147.54061104910716, "loss": 2.9369, "rewards/chosen": -1.2920216527478448, "rewards/margins": 0.8786435432622, "rewards/rejected": -2.1706651960100447, "step": 459 }, { "epoch": 14.842105263157894, "grad_norm": 167.0, "kl": 0.3162778615951538, "learning_rate": 2.626453403047172e-07, "logits/chosen": -62228756.0, "logits/rejected": -105336456.0, "logps/chosen": -258.8127746582031, "logps/rejected": -128.1526641845703, "loss": 3.0003, "rewards/chosen": -0.22756414115428925, "rewards/margins": 2.3638841658830643, "rewards/rejected": -2.5914483070373535, "step": 460 }, { "epoch": 14.874493927125506, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 2.597091224066581e-07, "logits/chosen": -59445205.333333336, "logits/rejected": -96873947.42857143, "logps/chosen": -247.08816189236111, "logps/rejected": -137.38323102678572, "loss": 3.0211, "rewards/chosen": -0.08814783891042073, "rewards/margins": 2.3031202100572132, "rewards/rejected": -2.391268048967634, "step": 461 }, { "epoch": 14.906882591093117, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 2.5678364043921503e-07, "logits/chosen": -58916346.88, "logits/rejected": -103354164.51282051, "logps/chosen": -258.32203125, "logps/rejected": -155.28740985576923, "loss": 2.7557, "rewards/chosen": -0.374318962097168, "rewards/margins": 2.426655761523125, "rewards/rejected": -2.8009747236202927, "step": 462 }, { "epoch": 14.939271255060728, "grad_norm": 161.0, "kl": 0.0, "learning_rate": 2.538690251096862e-07, "logits/chosen": -77990078.17142858, "logits/rejected": -100813841.65517241, "logps/chosen": -178.31856863839286, "logps/rejected": -141.10777545797413, "loss": 2.9307, "rewards/chosen": -0.49631690979003906, "rewards/margins": 1.8880162074648101, "rewards/rejected": -2.384333117254849, "step": 463 }, { "epoch": 14.97165991902834, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 2.5096540663986064e-07, "logits/chosen": -71577751.7037037, "logits/rejected": -101257977.08108108, "logps/chosen": -171.62841796875, "logps/rejected": -138.1628220016892, "loss": 2.8692, "rewards/chosen": -0.6100874300356265, "rewards/margins": 1.7501876323192087, "rewards/rejected": -2.360275062354835, "step": 464 }, { "epoch": 15.0, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 2.480729147601999e-07, "logits/chosen": -57674132.21052632, "logits/rejected": -97789952.0, "logps/chosen": -137.22797594572367, "logps/rejected": -125.16964588994566, "loss": 2.9382, "rewards/chosen": -0.940712477031507, "rewards/margins": 1.31802583340922, "rewards/rejected": -2.258738310440727, "step": 465 }, { "epoch": 15.03238866396761, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 2.451916787040412e-07, "logits/chosen": -62508879.058823526, "logits/rejected": -96349806.93333334, "logps/chosen": -215.2633846507353, "logps/rejected": -113.00658365885417, "loss": 2.9556, "rewards/chosen": -0.44823427761302276, "rewards/margins": 1.4291012072095683, "rewards/rejected": -1.8773354848225912, "step": 466 }, { "epoch": 15.064777327935223, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 2.423218272018252e-07, "logits/chosen": -66979681.88235294, "logits/rejected": -87443899.73333333, "logps/chosen": -229.34899184283088, "logps/rejected": -112.09307454427083, "loss": 2.9546, "rewards/chosen": -0.539879181805779, "rewards/margins": 1.036263447181851, "rewards/rejected": -1.5761426289876301, "step": 467 }, { "epoch": 15.097165991902834, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 2.394634884753419e-07, "logits/chosen": -67305501.86666666, "logits/rejected": -104019855.05882353, "logps/chosen": -220.8958984375, "logps/rejected": -135.80086741727942, "loss": 2.8524, "rewards/chosen": -0.38074350357055664, "rewards/margins": 2.2470991190742042, "rewards/rejected": -2.627842622644761, "step": 468 }, { "epoch": 15.129554655870445, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 2.3661679023200422e-07, "logits/chosen": -66880617.025641024, "logits/rejected": -98502768.64, "logps/chosen": -237.0897435897436, "logps/rejected": -125.94109375, "loss": 3.151, "rewards/chosen": -0.26238353435809797, "rewards/margins": 1.9110278731126051, "rewards/rejected": -2.173411407470703, "step": 469 }, { "epoch": 15.161943319838057, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 2.3378185965914078e-07, "logits/chosen": -57161392.551724136, "logits/rejected": -98508946.28571428, "logps/chosen": -210.51459792564654, "logps/rejected": -133.46876395089285, "loss": 2.8341, "rewards/chosen": -0.12162569473529684, "rewards/margins": 2.1713221289254174, "rewards/rejected": -2.292947823660714, "step": 470 }, { "epoch": 15.194331983805668, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 2.309588234183137e-07, "logits/chosen": -55166220.38709678, "logits/rejected": -94622347.63636364, "logps/chosen": -158.29580393145162, "logps/rejected": -117.99982244318181, "loss": 2.9478, "rewards/chosen": -0.008161975491431451, "rewards/margins": 2.2864436264261823, "rewards/rejected": -2.2946056019176138, "step": 471 }, { "epoch": 15.226720647773279, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 2.2814780763965956e-07, "logits/chosen": -64386944.0, "logits/rejected": -90639855.48387097, "logps/chosen": -149.33127663352272, "logps/rejected": -151.09208039314515, "loss": 2.9481, "rewards/chosen": -0.459195512713808, "rewards/margins": 1.8156549019314792, "rewards/rejected": -2.2748504146452873, "step": 472 }, { "epoch": 15.259109311740891, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 2.2534893791625404e-07, "logits/chosen": -52017911.172413796, "logits/rejected": -100169230.62857144, "logps/chosen": -236.86188375538794, "logps/rejected": -138.26692243303572, "loss": 2.9403, "rewards/chosen": -0.047348754159335434, "rewards/margins": 2.4855210752909995, "rewards/rejected": -2.532869829450335, "step": 473 }, { "epoch": 15.291497975708502, "grad_norm": 162.0, "kl": 0.0, "learning_rate": 2.2256233929850044e-07, "logits/chosen": -75536161.68421052, "logits/rejected": -96382089.84615384, "logps/chosen": -187.44907740542763, "logps/rejected": -150.49297626201923, "loss": 2.9537, "rewards/chosen": -0.7962660538522821, "rewards/margins": 1.7547963099923691, "rewards/rejected": -2.5510623638446512, "step": 474 }, { "epoch": 15.323886639676113, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 2.197881362885426e-07, "logits/chosen": -71331773.2173913, "logits/rejected": -98335812.68292683, "logps/chosen": -152.35098930027175, "logps/rejected": -125.35432545731707, "loss": 2.7602, "rewards/chosen": -1.153787778771442, "rewards/margins": 1.3874846404835122, "rewards/rejected": -2.541272419254954, "step": 475 }, { "epoch": 15.356275303643725, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 2.1702645283470234e-07, "logits/chosen": -61982515.2, "logits/rejected": -92642312.8275862, "logps/chosen": -195.61316964285714, "logps/rejected": -140.80540308459052, "loss": 3.0001, "rewards/chosen": -0.22200731549944197, "rewards/margins": 2.064258616781, "rewards/rejected": -2.286265932280442, "step": 476 }, { "epoch": 15.388663967611336, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 2.1427741232594182e-07, "logits/chosen": -62008032.0, "logits/rejected": -93219576.8888889, "logps/chosen": -285.46048409598217, "logps/rejected": -118.15388997395833, "loss": 2.9731, "rewards/chosen": -0.26297320638384136, "rewards/margins": 1.8042366731734503, "rewards/rejected": -2.0672098795572915, "step": 477 }, { "epoch": 15.421052631578947, "grad_norm": 280.0, "kl": 0.0, "learning_rate": 2.1154113758634966e-07, "logits/chosen": -68126729.48148148, "logits/rejected": -100055365.1891892, "logps/chosen": -255.69733796296296, "logps/rejected": -136.45947265625, "loss": 2.8915, "rewards/chosen": -0.6184189407913773, "rewards/margins": 1.7624612603936942, "rewards/rejected": -2.3808802011850716, "step": 478 }, { "epoch": 15.45344129554656, "grad_norm": 151.0, "kl": 0.0, "learning_rate": 2.0881775086965492e-07, "logits/chosen": -53814545.06666667, "logits/rejected": -97413240.47058824, "logps/chosen": -255.84069010416667, "logps/rejected": -112.38763786764706, "loss": 2.843, "rewards/chosen": -0.167624568939209, "rewards/margins": 2.20279083812938, "rewards/rejected": -2.370415407068589, "step": 479 }, { "epoch": 15.48582995951417, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 2.0610737385376348e-07, "logits/chosen": -58100858.88, "logits/rejected": -99996987.07692307, "logps/chosen": -239.43080078125, "logps/rejected": -150.7254356971154, "loss": 2.9057, "rewards/chosen": -0.5831904602050781, "rewards/margins": 1.8629473837828026, "rewards/rejected": -2.4461378439878807, "step": 480 }, { "epoch": 15.518218623481781, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 2.0341012763532239e-07, "logits/chosen": -65494512.484848484, "logits/rejected": -104369911.74193548, "logps/chosen": -247.40511067708334, "logps/rejected": -145.88788432459677, "loss": 2.9188, "rewards/chosen": -0.46176453792687616, "rewards/margins": 2.153159246882852, "rewards/rejected": -2.614923784809728, "step": 481 }, { "epoch": 15.550607287449393, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 2.0072613272430922e-07, "logits/chosen": -65853493.67741936, "logits/rejected": -100311373.57575758, "logps/chosen": -176.9901083669355, "logps/rejected": -154.20259232954547, "loss": 3.0325, "rewards/chosen": -0.2542735069028793, "rewards/margins": 2.1182700592518318, "rewards/rejected": -2.372543566154711, "step": 482 }, { "epoch": 15.582995951417004, "grad_norm": 155.0, "kl": 0.0, "learning_rate": 1.980555090386477e-07, "logits/chosen": -59290933.89473684, "logits/rejected": -97237415.38461539, "logps/chosen": -247.99252158717104, "logps/rejected": -134.71644005408655, "loss": 2.9476, "rewards/chosen": -0.254484728762978, "rewards/margins": 1.8196621288654775, "rewards/rejected": -2.0741468576284556, "step": 483 }, { "epoch": 15.615384615384615, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 1.953983758988502e-07, "logits/chosen": -66973624.0, "logits/rejected": -107598976.0, "logps/chosen": -252.64303588867188, "logps/rejected": -148.0606231689453, "loss": 2.9289, "rewards/chosen": -0.2260175347328186, "rewards/margins": 2.452908217906952, "rewards/rejected": -2.6789257526397705, "step": 484 }, { "epoch": 15.647773279352228, "grad_norm": 161.0, "kl": 0.0, "learning_rate": 1.927548520226857e-07, "logits/chosen": -77407802.51428571, "logits/rejected": -97110713.37931034, "logps/chosen": -178.57099609375, "logps/rejected": -148.07145743534483, "loss": 2.9161, "rewards/chosen": -1.1741735185895648, "rewards/margins": 1.0735708114548856, "rewards/rejected": -2.2477443300444504, "step": 485 }, { "epoch": 15.680161943319838, "grad_norm": 175.0, "kl": 0.352486252784729, "learning_rate": 1.9012505551987762e-07, "logits/chosen": -68995590.73684211, "logits/rejected": -98562087.38461539, "logps/chosen": -212.70760947779604, "logps/rejected": -131.20113431490384, "loss": 3.0077, "rewards/chosen": -0.47862815856933594, "rewards/margins": 2.0596065521240234, "rewards/rejected": -2.5382347106933594, "step": 486 }, { "epoch": 15.712550607287449, "grad_norm": 237.0, "kl": 0.0, "learning_rate": 1.8750910388682427e-07, "logits/chosen": -68801404.71794872, "logits/rejected": -96091484.16, "logps/chosen": -156.53382912660257, "logps/rejected": -116.83669921875, "loss": 2.9114, "rewards/chosen": -0.4843734839023688, "rewards/margins": 1.8178414819179436, "rewards/rejected": -2.3022149658203124, "step": 487 }, { "epoch": 15.744939271255062, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 1.8490711400135117e-07, "logits/chosen": -72317648.0, "logits/rejected": -98281968.0, "logps/chosen": -179.27587890625, "logps/rejected": -144.23171997070312, "loss": 2.9381, "rewards/chosen": -0.8361989855766296, "rewards/margins": 1.7760685086250305, "rewards/rejected": -2.61226749420166, "step": 488 }, { "epoch": 15.777327935222672, "grad_norm": 205.0, "kl": 0.0, "learning_rate": 1.8231920211748818e-07, "logits/chosen": -66403734.5882353, "logits/rejected": -93627955.2, "logps/chosen": -220.07602826286765, "logps/rejected": -132.8814208984375, "loss": 2.9773, "rewards/chosen": -0.30578338398652916, "rewards/margins": 2.0857439190733666, "rewards/rejected": -2.391527303059896, "step": 489 }, { "epoch": 15.809716599190283, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 1.7974548386027584e-07, "logits/chosen": -81335931.5862069, "logits/rejected": -98753543.31428571, "logps/chosen": -201.59363213900863, "logps/rejected": -147.62652064732143, "loss": 2.9142, "rewards/chosen": -1.2760967386179958, "rewards/margins": 0.9031569758072273, "rewards/rejected": -2.179253714425223, "step": 490 }, { "epoch": 15.842105263157894, "grad_norm": 181.0, "kl": 0.3179197311401367, "learning_rate": 1.7718607422059879e-07, "logits/chosen": -62185552.0, "logits/rejected": -105514544.0, "logps/chosen": -258.72589111328125, "logps/rejected": -128.19235229492188, "loss": 3.0068, "rewards/chosen": -0.2188776135444641, "rewards/margins": 2.376538932323456, "rewards/rejected": -2.59541654586792, "step": 491 }, { "epoch": 15.874493927125506, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 1.746410875500488e-07, "logits/chosen": -59510869.333333336, "logits/rejected": -96976210.28571428, "logps/chosen": -247.15388997395834, "logps/rejected": -137.29727608816964, "loss": 3.0141, "rewards/chosen": -0.09472141000959608, "rewards/margins": 2.287950491148328, "rewards/rejected": -2.382671901157924, "step": 492 }, { "epoch": 15.906882591093117, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 1.7211063755581524e-07, "logits/chosen": -59008824.32, "logits/rejected": -103630204.71794872, "logps/chosen": -258.5851171875, "logps/rejected": -155.38668118990384, "loss": 2.7657, "rewards/chosen": -0.40062652587890624, "rewards/margins": 2.4102763875325524, "rewards/rejected": -2.8109029134114585, "step": 493 }, { "epoch": 15.939271255060728, "grad_norm": 185.0, "kl": 0.0, "learning_rate": 1.695948372956047e-07, "logits/chosen": -77968332.8, "logits/rejected": -100711794.7586207, "logps/chosen": -178.47477678571428, "logps/rejected": -140.9263537176724, "loss": 2.9268, "rewards/chosen": -0.5119389125279018, "rewards/margins": 1.8542520306967747, "rewards/rejected": -2.3661909432246766, "step": 494 }, { "epoch": 15.97165991902834, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 1.6709379917259025e-07, "logits/chosen": -71700868.74074075, "logits/rejected": -101220282.8108108, "logps/chosen": -171.52345558449073, "logps/rejected": -138.14950644003378, "loss": 2.8751, "rewards/chosen": -0.5995931272153501, "rewards/margins": 1.759349471694595, "rewards/rejected": -2.358942598909945, "step": 495 }, { "epoch": 16.0, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 1.6460763493038838e-07, "logits/chosen": -57558447.15789474, "logits/rejected": -97733442.7826087, "logps/chosen": -137.12277703536185, "logps/rejected": -125.16896654211956, "loss": 2.9446, "rewards/chosen": -0.9301911404258326, "rewards/margins": 1.3284801640281416, "rewards/rejected": -2.258671304453974, "step": 496 }, { "epoch": 16.032388663967613, "grad_norm": 209.0, "kl": 0.0, "learning_rate": 1.621364556480675e-07, "logits/chosen": -62457434.35294118, "logits/rejected": -96218606.93333334, "logps/chosen": -215.25729549632354, "logps/rejected": -113.06044108072916, "loss": 2.979, "rewards/chosen": -0.44762342116411996, "rewards/margins": 1.4350972717883541, "rewards/rejected": -1.882720692952474, "step": 497 }, { "epoch": 16.06477732793522, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 1.596803717351845e-07, "logits/chosen": -66882778.35294118, "logits/rejected": -87651618.13333334, "logps/chosen": -229.46245978860293, "logps/rejected": -112.18683268229167, "loss": 2.9595, "rewards/chosen": -0.5512245402616613, "rewards/margins": 1.034294105978573, "rewards/rejected": -1.5855186462402344, "step": 498 }, { "epoch": 16.097165991902834, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 1.572394929268519e-07, "logits/chosen": -67265625.6, "logits/rejected": -103988698.35294117, "logps/chosen": -220.6365234375, "logps/rejected": -135.8659237132353, "loss": 2.8599, "rewards/chosen": -0.35480489730834963, "rewards/margins": 2.279542906144086, "rewards/rejected": -2.6343478034524357, "step": 499 }, { "epoch": 16.129554655870447, "grad_norm": 216.0, "kl": 0.0, "learning_rate": 1.5481392827883488e-07, "logits/chosen": -66750713.43589743, "logits/rejected": -98567116.8, "logps/chosen": -237.20452724358975, "logps/rejected": -126.133759765625, "loss": 3.1347, "rewards/chosen": -0.27386181171123797, "rewards/margins": 1.918815648005559, "rewards/rejected": -2.192677459716797, "step": 500 }, { "epoch": 16.161943319838056, "grad_norm": 158.0, "kl": 0.0, "learning_rate": 1.5240378616267886e-07, "logits/chosen": -57390106.48275862, "logits/rejected": -98513437.25714286, "logps/chosen": -210.66638604525863, "logps/rejected": -133.52818080357142, "loss": 2.8099, "rewards/chosen": -0.13680349547287513, "rewards/margins": 2.162087408544982, "rewards/rejected": -2.298890904017857, "step": 501 }, { "epoch": 16.194331983805668, "grad_norm": 167.0, "kl": 0.0, "learning_rate": 1.5000917426086767e-07, "logits/chosen": -55172727.741935484, "logits/rejected": -94554538.66666667, "logps/chosen": -158.20306199596774, "logps/rejected": -118.24235026041667, "loss": 2.957, "rewards/chosen": 0.0011104576049312468, "rewards/margins": 2.3199692111560677, "rewards/rejected": -2.3188587535511362, "step": 502 }, { "epoch": 16.22672064777328, "grad_norm": 206.0, "kl": 0.0, "learning_rate": 1.4763019956201251e-07, "logits/chosen": -64566582.303030305, "logits/rejected": -90447368.25806452, "logps/chosen": -149.30603397253788, "logps/rejected": -150.93699596774192, "loss": 2.9521, "rewards/chosen": -0.45667295744924835, "rewards/margins": 1.8026683747710137, "rewards/rejected": -2.259341332220262, "step": 503 }, { "epoch": 16.25910931174089, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 1.4526696835607088e-07, "logits/chosen": -52211526.62068965, "logits/rejected": -100267673.6, "logps/chosen": -236.86282664331895, "logps/rejected": -138.3693359375, "loss": 2.9131, "rewards/chosen": -0.04743953408866093, "rewards/margins": 2.4956724305458255, "rewards/rejected": -2.5431119646344866, "step": 504 }, { "epoch": 16.291497975708502, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 1.429195862295997e-07, "logits/chosen": -75706199.57894737, "logits/rejected": -96506082.46153846, "logps/chosen": -187.4133943256579, "logps/rejected": -150.78190730168268, "loss": 2.9668, "rewards/chosen": -0.792698107267681, "rewards/margins": 1.7872576539815679, "rewards/rejected": -2.579955761249249, "step": 505 }, { "epoch": 16.323886639676115, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 1.405881580610354e-07, "logits/chosen": -71461665.39130434, "logits/rejected": -98170012.09756097, "logps/chosen": -152.32670261548913, "logps/rejected": -125.26076600609755, "loss": 2.754, "rewards/chosen": -1.1513595581054688, "rewards/margins": 1.3805577347918256, "rewards/rejected": -2.5319172928972944, "step": 506 }, { "epoch": 16.356275303643724, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 1.3827278801600978e-07, "logits/chosen": -61968288.91428571, "logits/rejected": -92630086.62068966, "logps/chosen": -195.53969029017858, "logps/rejected": -140.6947652882543, "loss": 3.0028, "rewards/chosen": -0.21465841020856585, "rewards/margins": 2.0605449000015637, "rewards/rejected": -2.2752033102101294, "step": 507 }, { "epoch": 16.388663967611336, "grad_norm": 175.0, "kl": 0.0, "learning_rate": 1.3597357954269534e-07, "logits/chosen": -61942454.85714286, "logits/rejected": -93099264.0, "logps/chosen": -285.23728724888394, "logps/rejected": -118.09457736545139, "loss": 2.9507, "rewards/chosen": -0.2406529358455113, "rewards/margins": 1.820625513318985, "rewards/rejected": -2.0612784491644964, "step": 508 }, { "epoch": 16.42105263157895, "grad_norm": 292.0, "kl": 0.0, "learning_rate": 1.3369063536718344e-07, "logits/chosen": -68302644.14814815, "logits/rejected": -100084355.45945945, "logps/chosen": -255.68625217013889, "logps/rejected": -136.66236011402026, "loss": 2.8921, "rewards/chosen": -0.6173092877423322, "rewards/margins": 1.783860360299264, "rewards/rejected": -2.4011696480415963, "step": 509 }, { "epoch": 16.453441295546558, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 1.3142405748889457e-07, "logits/chosen": -53853845.333333336, "logits/rejected": -97233852.23529412, "logps/chosen": -255.89596354166667, "logps/rejected": -112.433349609375, "loss": 2.8551, "rewards/chosen": -0.17315131823221844, "rewards/margins": 2.2018352181303733, "rewards/rejected": -2.374986536362592, "step": 510 }, { "epoch": 16.48582995951417, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 1.291739471760212e-07, "logits/chosen": -58027361.28, "logits/rejected": -99997787.8974359, "logps/chosen": -239.58205078125, "logps/rejected": -150.87414863782053, "loss": 2.884, "rewards/chosen": -0.598314323425293, "rewards/margins": 1.8626936017549953, "rewards/rejected": -2.4610079251802883, "step": 511 }, { "epoch": 16.518218623481783, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 1.2694040496100317e-07, "logits/chosen": -65406844.121212125, "logits/rejected": -104295952.51612903, "logps/chosen": -247.16460996685606, "logps/rejected": -145.9378465221774, "loss": 2.9182, "rewards/chosen": -0.43771379644220526, "rewards/margins": 2.1822079801139944, "rewards/rejected": -2.6199217765561995, "step": 512 }, { "epoch": 16.55060728744939, "grad_norm": 208.0, "kl": 0.0, "learning_rate": 1.2472353063603623e-07, "logits/chosen": -65801112.77419355, "logits/rejected": -100316315.15151516, "logps/chosen": -176.78066721270162, "logps/rejected": -154.41607481060606, "loss": 2.9716, "rewards/chosen": -0.23333032669559603, "rewards/margins": 2.1605609789388738, "rewards/rejected": -2.3938913056344697, "step": 513 }, { "epoch": 16.582995951417004, "grad_norm": 147.0, "kl": 0.0, "learning_rate": 1.225234232486127e-07, "logits/chosen": -59296714.10526316, "logits/rejected": -97136187.07692307, "logps/chosen": -248.07570929276315, "logps/rejected": -134.6264366736779, "loss": 2.9543, "rewards/chosen": -0.26280465878938375, "rewards/margins": 1.802341273921704, "rewards/rejected": -2.0651459327110877, "step": 514 }, { "epoch": 16.615384615384617, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 1.2034018109709716e-07, "logits/chosen": -66955652.0, "logits/rejected": -107874416.0, "logps/chosen": -252.43841552734375, "logps/rejected": -148.32968139648438, "loss": 2.9283, "rewards/chosen": -0.20555496215820312, "rewards/margins": 2.5002758502960205, "rewards/rejected": -2.7058308124542236, "step": 515 }, { "epoch": 16.647773279352226, "grad_norm": 151.0, "kl": 0.0, "learning_rate": 1.1817390172633402e-07, "logits/chosen": -77372818.28571428, "logits/rejected": -97091725.2413793, "logps/chosen": -178.37101004464284, "logps/rejected": -148.0040914601293, "loss": 2.9075, "rewards/chosen": -1.1541733877999443, "rewards/margins": 1.0868344499559823, "rewards/rejected": -2.2410078377559266, "step": 516 }, { "epoch": 16.68016194331984, "grad_norm": 179.0, "kl": 0.3875166177749634, "learning_rate": 1.1602468192328934e-07, "logits/chosen": -68933901.4736842, "logits/rejected": -98502035.6923077, "logps/chosen": -212.50601356907896, "logps/rejected": -131.29400165264423, "loss": 3.0059, "rewards/chosen": -0.45846803564774363, "rewards/margins": 2.089053115381403, "rewards/rejected": -2.5475211510291467, "step": 517 }, { "epoch": 16.71255060728745, "grad_norm": 246.0, "kl": 0.0, "learning_rate": 1.1389261771272662e-07, "logits/chosen": -68786051.28205128, "logits/rejected": -96301568.0, "logps/chosen": -156.49189953926282, "logps/rejected": -116.798447265625, "loss": 2.933, "rewards/chosen": -0.4801797133225661, "rewards/margins": 1.8182107896071216, "rewards/rejected": -2.2983905029296876, "step": 518 }, { "epoch": 16.74493927125506, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 1.117778043529164e-07, "logits/chosen": -72464488.0, "logits/rejected": -98028480.0, "logps/chosen": -179.13558959960938, "logps/rejected": -144.46353149414062, "loss": 2.9402, "rewards/chosen": -0.822170078754425, "rewards/margins": 1.8132795691490173, "rewards/rejected": -2.6354496479034424, "step": 519 }, { "epoch": 16.777327935222672, "grad_norm": 217.0, "kl": 0.0, "learning_rate": 1.096803363313803e-07, "logits/chosen": -66442465.88235294, "logits/rejected": -93625156.26666667, "logps/chosen": -220.18663832720588, "logps/rejected": -133.10751139322917, "loss": 2.9885, "rewards/chosen": -0.3168419950148639, "rewards/margins": 2.097293747172636, "rewards/rejected": -2.4141357421875, "step": 520 }, { "epoch": 16.809716599190285, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 1.076003073606695e-07, "logits/chosen": -81186727.72413793, "logits/rejected": -98583420.34285714, "logps/chosen": -201.51794854525863, "logps/rejected": -147.66849888392858, "loss": 2.9199, "rewards/chosen": -1.2685279846191406, "rewards/margins": 0.9149234226771763, "rewards/rejected": -2.183451407296317, "step": 521 }, { "epoch": 16.842105263157894, "grad_norm": 169.0, "kl": 0.41440093517303467, "learning_rate": 1.0553781037417769e-07, "logits/chosen": -62079276.0, "logits/rejected": -105588544.0, "logps/chosen": -258.708251953125, "logps/rejected": -128.41506958007812, "loss": 3.0055, "rewards/chosen": -0.2171132117509842, "rewards/margins": 2.4005759209394455, "rewards/rejected": -2.6176891326904297, "step": 522 }, { "epoch": 16.874493927125506, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 1.034929375219884e-07, "logits/chosen": -59659783.11111111, "logits/rejected": -97221988.57142857, "logps/chosen": -247.32115342881946, "logps/rejected": -137.59490094866072, "loss": 3.0208, "rewards/chosen": -0.11144917541080052, "rewards/margins": 2.300986628683787, "rewards/rejected": -2.4124358040945872, "step": 523 }, { "epoch": 16.90688259109312, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 1.0146578016675933e-07, "logits/chosen": -58962713.6, "logits/rejected": -103577193.02564102, "logps/chosen": -258.52013671875, "logps/rejected": -155.6429662459936, "loss": 2.7598, "rewards/chosen": -0.3941326522827148, "rewards/margins": 2.4423977886102137, "rewards/rejected": -2.8365304408929286, "step": 524 }, { "epoch": 16.939271255060728, "grad_norm": 184.0, "kl": 0.0, "learning_rate": 9.94564288796384e-08, "logits/chosen": -78155834.51428571, "logits/rejected": -100666977.10344827, "logps/chosen": -178.48168247767856, "logps/rejected": -141.1480334051724, "loss": 2.9247, "rewards/chosen": -0.5126292637416294, "rewards/margins": 1.8757300691651593, "rewards/rejected": -2.3883593329067887, "step": 525 }, { "epoch": 16.97165991902834, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 9.746497343621857e-08, "logits/chosen": -71768225.18518518, "logits/rejected": -101309571.45945945, "logps/chosen": -171.5868778935185, "logps/rejected": -138.16539537584458, "loss": 2.8693, "rewards/chosen": -0.6059340017813223, "rewards/margins": 1.7545971607899404, "rewards/rejected": -2.3605311625712626, "step": 526 }, { "epoch": 17.0, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 9.549150281252632e-08, "logits/chosen": -57630915.368421055, "logits/rejected": -97753054.60869566, "logps/chosen": -137.20222553453948, "logps/rejected": -125.1779148267663, "loss": 2.9593, "rewards/chosen": -0.9381364521227384, "rewards/margins": 1.3214301451938377, "rewards/rejected": -2.259566597316576, "step": 527 }, { "epoch": 17.032388663967613, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 9.35361051810461e-08, "logits/chosen": -62322898.823529415, "logits/rejected": -96247534.93333334, "logps/chosen": -215.25043083639707, "logps/rejected": -113.10892740885417, "loss": 2.9437, "rewards/chosen": -0.44693722444422107, "rewards/margins": 1.4406315672631356, "rewards/rejected": -1.8875687917073567, "step": 528 }, { "epoch": 17.06477732793522, "grad_norm": 185.0, "kl": 0.0, "learning_rate": 9.159886790678123e-08, "logits/chosen": -66784993.88235294, "logits/rejected": -87290052.26666667, "logps/chosen": -229.31448184742646, "logps/rejected": -112.194873046875, "loss": 2.9611, "rewards/chosen": -0.5364259832045611, "rewards/margins": 1.0498971826889933, "rewards/rejected": -1.5863231658935546, "step": 529 }, { "epoch": 17.097165991902834, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 8.967987754335022e-08, "logits/chosen": -67198178.13333334, "logits/rejected": -103980107.29411764, "logps/chosen": -220.67874348958333, "logps/rejected": -135.6750919117647, "loss": 2.8743, "rewards/chosen": -0.3590281804402669, "rewards/margins": 2.256236487743901, "rewards/rejected": -2.6152646681841683, "step": 530 }, { "epoch": 17.129554655870447, "grad_norm": 217.0, "kl": 0.0, "learning_rate": 8.777921982911996e-08, "logits/chosen": -66921275.07692308, "logits/rejected": -98682398.72, "logps/chosen": -236.96742287660257, "logps/rejected": -126.133935546875, "loss": 3.1426, "rewards/chosen": -0.2501518298418094, "rewards/margins": 1.9425443981855346, "rewards/rejected": -2.192696228027344, "step": 531 }, { "epoch": 17.161943319838056, "grad_norm": 162.0, "kl": 0.0, "learning_rate": 8.589697968337445e-08, "logits/chosen": -57470411.03448276, "logits/rejected": -98188390.4, "logps/chosen": -210.47636045258622, "logps/rejected": -133.27004743303573, "loss": 2.833, "rewards/chosen": -0.11780128807857118, "rewards/margins": 2.15527523256875, "rewards/rejected": -2.2730765206473214, "step": 532 }, { "epoch": 17.194331983805668, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 8.403324120252159e-08, "logits/chosen": -55086773.67741936, "logits/rejected": -94626784.96969697, "logps/chosen": -158.35012915826613, "logps/rejected": -118.15443744081439, "loss": 2.9424, "rewards/chosen": -0.013595548368269397, "rewards/margins": 2.29647183074513, "rewards/rejected": -2.3100673791133994, "step": 533 }, { "epoch": 17.22672064777328, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 8.218808765633512e-08, "logits/chosen": -64649712.484848484, "logits/rejected": -90529139.61290322, "logps/chosen": -149.3858457623106, "logps/rejected": -151.01765688004033, "loss": 2.9468, "rewards/chosen": -0.46465359312115295, "rewards/margins": 1.8027549624326524, "rewards/rejected": -2.2674085555538053, "step": 534 }, { "epoch": 17.25910931174089, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 8.036160148423449e-08, "logits/chosen": -52062287.448275864, "logits/rejected": -100227349.94285715, "logps/chosen": -236.89537311422413, "logps/rejected": -138.3841517857143, "loss": 2.9191, "rewards/chosen": -0.05069636476450953, "rewards/margins": 2.4938963563571424, "rewards/rejected": -2.544592721121652, "step": 535 }, { "epoch": 17.291497975708502, "grad_norm": 168.0, "kl": 0.0, "learning_rate": 7.85538642916015e-08, "logits/chosen": -75476372.21052632, "logits/rejected": -96351783.38461539, "logps/chosen": -187.63449578536185, "logps/rejected": -150.68473933293268, "loss": 2.9639, "rewards/chosen": -0.8148067875912315, "rewards/margins": 1.7554326462842191, "rewards/rejected": -2.5702394338754506, "step": 536 }, { "epoch": 17.323886639676115, "grad_norm": 162.0, "kl": 0.0, "learning_rate": 7.676495684613432e-08, "logits/chosen": -71513789.2173913, "logits/rejected": -98448970.92682926, "logps/chosen": -152.41903023097825, "logps/rejected": -125.38232421875, "loss": 2.7433, "rewards/chosen": -1.1605935304061226, "rewards/margins": 1.3834790627245939, "rewards/rejected": -2.5440725931307164, "step": 537 }, { "epoch": 17.356275303643724, "grad_norm": 194.0, "kl": 0.0, "learning_rate": 7.499495907423887e-08, "logits/chosen": -61937064.22857143, "logits/rejected": -92665714.7586207, "logps/chosen": -195.5723911830357, "logps/rejected": -140.7021484375, "loss": 2.9873, "rewards/chosen": -0.21792729241507394, "rewards/margins": 2.0580137013214563, "rewards/rejected": -2.2759409937365302, "step": 538 }, { "epoch": 17.388663967611336, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 7.324395005745771e-08, "logits/chosen": -61821138.28571428, "logits/rejected": -93124373.33333333, "logps/chosen": -285.289306640625, "logps/rejected": -118.33546278211806, "loss": 2.9449, "rewards/chosen": -0.2458540712084089, "rewards/margins": 1.8395124957675026, "rewards/rejected": -2.0853665669759116, "step": 539 }, { "epoch": 17.42105263157895, "grad_norm": 272.0, "kl": 0.0, "learning_rate": 7.15120080289368e-08, "logits/chosen": -68272412.44444445, "logits/rejected": -100160684.97297297, "logps/chosen": -255.66140407986111, "logps/rejected": -136.53081450591216, "loss": 2.9001, "rewards/chosen": -0.614824789541739, "rewards/margins": 1.7731905451288688, "rewards/rejected": -2.388015334670608, "step": 540 }, { "epoch": 17.453441295546558, "grad_norm": 158.0, "kl": 0.0, "learning_rate": 6.979921036993041e-08, "logits/chosen": -53865634.13333333, "logits/rejected": -97347493.64705883, "logps/chosen": -256.08116861979164, "logps/rejected": -112.34694536994485, "loss": 2.8506, "rewards/chosen": -0.19167362848917643, "rewards/margins": 2.1746717209909474, "rewards/rejected": -2.366345349480124, "step": 541 }, { "epoch": 17.48582995951417, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 6.810563360634297e-08, "logits/chosen": -58191329.28, "logits/rejected": -100113985.64102565, "logps/chosen": -239.63498046875, "logps/rejected": -150.86087740384616, "loss": 2.8875, "rewards/chosen": -0.6036064910888672, "rewards/margins": 1.856074897570488, "rewards/rejected": -2.459681388659355, "step": 542 }, { "epoch": 17.518218623481783, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 6.643135340531136e-08, "logits/chosen": -65342572.60606061, "logits/rejected": -104207797.67741935, "logps/chosen": -247.26737097537878, "logps/rejected": -146.08889868951613, "loss": 2.9088, "rewards/chosen": -0.44798634269020776, "rewards/margins": 2.1870396661618585, "rewards/rejected": -2.6350260088520665, "step": 543 }, { "epoch": 17.55060728744939, "grad_norm": 204.0, "kl": 0.0, "learning_rate": 6.477644457182274e-08, "logits/chosen": -65851994.838709675, "logits/rejected": -100249150.06060606, "logps/chosen": -176.79646547379033, "logps/rejected": -154.09309895833334, "loss": 2.9946, "rewards/chosen": -0.23491102649319556, "rewards/margins": 2.126681352990114, "rewards/rejected": -2.3615923794833096, "step": 544 }, { "epoch": 17.582995951417004, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 6.314098104537325e-08, "logits/chosen": -59303397.05263158, "logits/rejected": -97414084.92307693, "logps/chosen": -248.18045847039474, "logps/rejected": -134.71282489483173, "loss": 2.9371, "rewards/chosen": -0.27327728271484375, "rewards/margins": 1.800506298358624, "rewards/rejected": -2.0737835810734677, "step": 545 }, { "epoch": 17.615384615384617, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 6.152503589666425e-08, "logits/chosen": -66938004.0, "logits/rejected": -107663360.0, "logps/chosen": -252.8166961669922, "logps/rejected": -148.14114379882812, "loss": 2.9093, "rewards/chosen": -0.24338212609291077, "rewards/margins": 2.443595737218857, "rewards/rejected": -2.6869778633117676, "step": 546 }, { "epoch": 17.647773279352226, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 5.992868132433753e-08, "logits/chosen": -77400085.94285715, "logits/rejected": -97167430.62068966, "logps/chosen": -178.77896205357143, "logps/rejected": -147.91205886314654, "loss": 2.9139, "rewards/chosen": -1.194969940185547, "rewards/margins": 1.0368344800225617, "rewards/rejected": -2.2318044202081087, "step": 547 }, { "epoch": 17.68016194331984, "grad_norm": 175.0, "kl": 0.3301295042037964, "learning_rate": 5.835198865174956e-08, "logits/chosen": -68776070.73684211, "logits/rejected": -98493134.76923077, "logps/chosen": -212.61870374177633, "logps/rejected": -131.0840125450721, "loss": 2.9906, "rewards/chosen": -0.4697403154875103, "rewards/margins": 2.0567829811621294, "rewards/rejected": -2.5265232966496396, "step": 548 }, { "epoch": 17.71255060728745, "grad_norm": 251.0, "kl": 0.0, "learning_rate": 5.6795028323784964e-08, "logits/chosen": -68724657.23076923, "logits/rejected": -96095385.6, "logps/chosen": -156.78529397035257, "logps/rejected": -116.891171875, "loss": 2.9246, "rewards/chosen": -0.5095198704646184, "rewards/margins": 1.7981426356388972, "rewards/rejected": -2.3076625061035156, "step": 549 }, { "epoch": 17.74493927125506, "grad_norm": 197.0, "kl": 0.0, "learning_rate": 5.5257869903709006e-08, "logits/chosen": -72334520.0, "logits/rejected": -98095864.0, "logps/chosen": -179.38539123535156, "logps/rejected": -144.47195434570312, "loss": 2.932, "rewards/chosen": -0.8471524715423584, "rewards/margins": 1.7891395092010498, "rewards/rejected": -2.636291980743408, "step": 550 }, { "epoch": 17.777327935222672, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 5.3740582070059435e-08, "logits/chosen": -66278377.4117647, "logits/rejected": -93661149.86666666, "logps/chosen": -220.12833180147058, "logps/rejected": -133.17548828125, "loss": 2.9854, "rewards/chosen": -0.3110121278201832, "rewards/margins": 2.1099229307735667, "rewards/rejected": -2.42093505859375, "step": 551 }, { "epoch": 17.809716599190285, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 5.224323261357844e-08, "logits/chosen": -81292279.1724138, "logits/rejected": -98692937.14285715, "logps/chosen": -201.70048154633622, "logps/rejected": -147.60934709821427, "loss": 2.918, "rewards/chosen": -1.2867823633654365, "rewards/margins": 0.8907562631691617, "rewards/rejected": -2.177538626534598, "step": 552 }, { "epoch": 17.842105263157894, "grad_norm": 188.0, "kl": 0.37376564741134644, "learning_rate": 5.076588843418345e-08, "logits/chosen": -62283384.0, "logits/rejected": -105595032.0, "logps/chosen": -258.7069396972656, "logps/rejected": -128.3071746826172, "loss": 3.0257, "rewards/chosen": -0.21698012948036194, "rewards/margins": 2.389920324087143, "rewards/rejected": -2.606900453567505, "step": 553 }, { "epoch": 17.874493927125506, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 4.9308615537978214e-08, "logits/chosen": -59580394.666666664, "logits/rejected": -96900973.71428572, "logps/chosen": -247.07424587673611, "logps/rejected": -137.46269880022322, "loss": 3.0214, "rewards/chosen": -0.08675977918836805, "rewards/margins": 2.312455101618691, "rewards/rejected": -2.399214880807059, "step": 554 }, { "epoch": 17.90688259109312, "grad_norm": 163.0, "kl": 0.0, "learning_rate": 4.787147903430383e-08, "logits/chosen": -58961940.48, "logits/rejected": -103539252.51282051, "logps/chosen": -258.60703125, "logps/rejected": -155.64167668269232, "loss": 2.7277, "rewards/chosen": -0.40282066345214845, "rewards/margins": 2.4335822296142577, "rewards/rejected": -2.8364028930664062, "step": 555 }, { "epoch": 17.939271255060728, "grad_norm": 183.0, "kl": 0.0, "learning_rate": 4.645454313282965e-08, "logits/chosen": -77900734.17142858, "logits/rejected": -100613261.2413793, "logps/chosen": -178.32025669642857, "logps/rejected": -141.02693123653017, "loss": 2.9295, "rewards/chosen": -0.49648639133998324, "rewards/margins": 1.8797624616200113, "rewards/rejected": -2.3762488529599946, "step": 556 }, { "epoch": 17.97165991902834, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 4.5057871140684325e-08, "logits/chosen": -71784485.92592593, "logits/rejected": -100938316.1081081, "logps/chosen": -171.49191623263889, "logps/rejected": -138.17463312922297, "loss": 2.8601, "rewards/chosen": -0.596438655146846, "rewards/margins": 1.7650160765624023, "rewards/rejected": -2.3614547317092485, "step": 557 }, { "epoch": 18.0, "grad_norm": 169.0, "kl": 0.0, "learning_rate": 4.368152545962761e-08, "logits/chosen": -57664821.89473684, "logits/rejected": -97947981.91304348, "logps/chosen": -136.99324115953948, "logps/rejected": -125.08189325747283, "loss": 2.9319, "rewards/chosen": -0.917237030832391, "rewards/margins": 1.3327268124717873, "rewards/rejected": -2.249963843304178, "step": 558 }, { "epoch": 18.032388663967613, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 4.232556758326211e-08, "logits/chosen": -62489682.823529415, "logits/rejected": -96397380.26666667, "logps/chosen": -215.28403607536765, "logps/rejected": -113.0524658203125, "loss": 2.9712, "rewards/chosen": -0.45029746784883384, "rewards/margins": 1.4316262076882755, "rewards/rejected": -1.8819236755371094, "step": 559 }, { "epoch": 18.06477732793522, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 4.099005809428596e-08, "logits/chosen": -66912150.5882353, "logits/rejected": -87508428.8, "logps/chosen": -229.46840533088235, "logps/rejected": -112.22766927083333, "loss": 2.9529, "rewards/chosen": -0.5518186232622933, "rewards/margins": 1.0377846736533969, "rewards/rejected": -1.5896032969156901, "step": 560 }, { "epoch": 18.097165991902834, "grad_norm": 189.0, "kl": 0.0, "learning_rate": 3.967505666178555e-08, "logits/chosen": -67375082.66666667, "logits/rejected": -103958415.05882353, "logps/chosen": -220.66917317708334, "logps/rejected": -135.69921875, "loss": 2.8372, "rewards/chosen": -0.35807018280029296, "rewards/margins": 2.259607842389275, "rewards/rejected": -2.617678025189568, "step": 561 }, { "epoch": 18.129554655870447, "grad_norm": 219.0, "kl": 0.0, "learning_rate": 3.8380622038570734e-08, "logits/chosen": -66868309.333333336, "logits/rejected": -98590597.12, "logps/chosen": -237.05105669070514, "logps/rejected": -126.12166015625, "loss": 3.1264, "rewards/chosen": -0.2585125947609926, "rewards/margins": 1.9329551481589293, "rewards/rejected": -2.191467742919922, "step": 562 }, { "epoch": 18.161943319838056, "grad_norm": 147.0, "kl": 0.0, "learning_rate": 3.7106812058548375e-08, "logits/chosen": -57360220.68965517, "logits/rejected": -98360122.51428571, "logps/chosen": -210.47285829741378, "logps/rejected": -133.51815011160716, "loss": 2.8319, "rewards/chosen": -0.11745032770880337, "rewards/margins": 2.1804376379022457, "rewards/rejected": -2.297887965611049, "step": 563 }, { "epoch": 18.194331983805668, "grad_norm": 171.0, "kl": 0.0, "learning_rate": 3.5853683634139434e-08, "logits/chosen": -55154473.29032258, "logits/rejected": -94562575.51515152, "logps/chosen": -158.26022240423387, "logps/rejected": -118.30360736268939, "loss": 2.9236, "rewards/chosen": -0.004606052752464048, "rewards/margins": 2.320378179832171, "rewards/rejected": -2.3249842325846353, "step": 564 }, { "epoch": 18.22672064777328, "grad_norm": 210.0, "kl": 0.0, "learning_rate": 3.4621292753735765e-08, "logits/chosen": -64493164.60606061, "logits/rejected": -90325512.25806452, "logps/chosen": -149.2129941998106, "logps/rejected": -151.09390751008064, "loss": 2.9445, "rewards/chosen": -0.447367812647964, "rewards/margins": 1.8276637385900654, "rewards/rejected": -2.2750315512380292, "step": 565 }, { "epoch": 18.25910931174089, "grad_norm": 170.0, "kl": 0.0, "learning_rate": 3.3409694479198727e-08, "logits/chosen": -51881529.37931035, "logits/rejected": -100193455.54285714, "logps/chosen": -236.85349878771552, "logps/rejected": -138.38702566964287, "loss": 2.9358, "rewards/chosen": -0.04651077040310564, "rewards/margins": 2.4983694699010237, "rewards/rejected": -2.5448802403041295, "step": 566 }, { "epoch": 18.291497975708502, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 3.2218942943399105e-08, "logits/chosen": -75596227.36842105, "logits/rejected": -96579347.6923077, "logps/chosen": -187.51459703947367, "logps/rejected": -150.71299391526443, "loss": 2.9537, "rewards/chosen": -0.8028179469861483, "rewards/margins": 1.7702478841248794, "rewards/rejected": -2.5730658311110277, "step": 567 }, { "epoch": 18.323886639676115, "grad_norm": 188.0, "kl": 0.0, "learning_rate": 3.104909134779821e-08, "logits/chosen": -71561800.3478261, "logits/rejected": -98362205.65853658, "logps/chosen": -152.42496390964675, "logps/rejected": -125.33241234756098, "loss": 2.7637, "rewards/chosen": -1.16118688168733, "rewards/margins": 1.377894970923188, "rewards/rejected": -2.539081852610518, "step": 568 }, { "epoch": 18.356275303643724, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 2.990019196007154e-08, "logits/chosen": -61929976.68571428, "logits/rejected": -92608617.93103448, "logps/chosen": -195.66780133928572, "logps/rejected": -140.78766500538794, "loss": 2.987, "rewards/chosen": -0.227470098223005, "rewards/margins": 2.0570234467830564, "rewards/rejected": -2.2844935450060615, "step": 569 }, { "epoch": 18.388663967611336, "grad_norm": 187.0, "kl": 0.0, "learning_rate": 2.8772296111772677e-08, "logits/chosen": -62021385.14285714, "logits/rejected": -93179377.77777778, "logps/chosen": -285.5918666294643, "logps/rejected": -118.30512152777777, "loss": 2.9572, "rewards/chosen": -0.27610950810568674, "rewards/margins": 1.806223738761175, "rewards/rejected": -2.082333246866862, "step": 570 }, { "epoch": 18.42105263157895, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 2.766545419604066e-08, "logits/chosen": -68372252.44444445, "logits/rejected": -100200434.16216215, "logps/chosen": -255.88917824074073, "logps/rejected": -136.4621779983108, "loss": 2.8812, "rewards/chosen": -0.6375995212131076, "rewards/margins": 1.743552038977454, "rewards/rejected": -2.3811515601905615, "step": 571 }, { "epoch": 18.453441295546558, "grad_norm": 150.0, "kl": 0.0, "learning_rate": 2.657971566534789e-08, "logits/chosen": -53812787.2, "logits/rejected": -97560244.70588236, "logps/chosen": -255.74368489583333, "logps/rejected": -112.67920639935662, "loss": 2.8599, "rewards/chosen": -0.15792407989501953, "rewards/margins": 2.241647955950569, "rewards/rejected": -2.3995720358455883, "step": 572 }, { "epoch": 18.48582995951417, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 2.5515129029290984e-08, "logits/chosen": -58096752.64, "logits/rejected": -100098743.79487179, "logps/chosen": -239.57787109375, "logps/rejected": -150.77488982371796, "loss": 2.9133, "rewards/chosen": -0.5978978729248047, "rewards/margins": 1.85318479684683, "rewards/rejected": -2.4510826697716346, "step": 573 }, { "epoch": 18.518218623481783, "grad_norm": 166.0, "kl": 0.0, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -65436291.878787875, "logits/rejected": -104457348.12903225, "logps/chosen": -247.24650804924244, "logps/rejected": -146.01524697580646, "loss": 2.9285, "rewards/chosen": -0.4459011193477746, "rewards/margins": 2.1817608010151286, "rewards/rejected": -2.627661920362903, "step": 574 }, { "epoch": 18.55060728744939, "grad_norm": 213.0, "kl": 0.0, "learning_rate": 2.3449600752129596e-08, "logits/chosen": -65541793.03225806, "logits/rejected": -100370975.03030303, "logps/chosen": -176.71548954133064, "logps/rejected": -154.4198330965909, "loss": 2.9989, "rewards/chosen": -0.22681243958011751, "rewards/margins": 2.1674554807116677, "rewards/rejected": -2.394267920291785, "step": 575 }, { "epoch": 18.582995951417004, "grad_norm": 173.0, "kl": 0.0, "learning_rate": 2.2448751396543786e-08, "logits/chosen": -59313367.578947365, "logits/rejected": -97213942.15384616, "logps/chosen": -248.3287931743421, "logps/rejected": -134.81012432391827, "loss": 2.944, "rewards/chosen": -0.28811131025615494, "rewards/margins": 1.795403563541922, "rewards/rejected": -2.083514873798077, "step": 576 }, { "epoch": 18.615384615384617, "grad_norm": 164.0, "kl": 0.0, "learning_rate": 2.1469238502507926e-08, "logits/chosen": -67056376.0, "logits/rejected": -107874520.0, "logps/chosen": -252.60391235351562, "logps/rejected": -148.420654296875, "loss": 2.9308, "rewards/chosen": -0.2221033275127411, "rewards/margins": 2.492825537919998, "rewards/rejected": -2.7149288654327393, "step": 577 }, { "epoch": 18.647773279352226, "grad_norm": 167.0, "kl": 0.0, "learning_rate": 2.0511105833574684e-08, "logits/chosen": -77421992.22857143, "logits/rejected": -97286479.44827586, "logps/chosen": -178.56407645089286, "logps/rejected": -147.943359375, "loss": 2.9163, "rewards/chosen": -1.1734817504882813, "rewards/margins": 1.0614525630556304, "rewards/rejected": -2.2349343135439117, "step": 578 }, { "epoch": 18.68016194331984, "grad_norm": 187.0, "kl": 0.39462482929229736, "learning_rate": 1.9574396198051958e-08, "logits/chosen": -68922650.94736843, "logits/rejected": -98604278.15384616, "logps/chosen": -212.4785284745066, "logps/rejected": -131.06280048076923, "loss": 3.0108, "rewards/chosen": -0.4557186427869295, "rewards/margins": 2.068681921553515, "rewards/rejected": -2.5244005643404446, "step": 579 }, { "epoch": 18.71255060728745, "grad_norm": 239.0, "kl": 0.0, "learning_rate": 1.865915144708985e-08, "logits/chosen": -68730748.71794872, "logits/rejected": -96218624.0, "logps/chosen": -156.55384865785257, "logps/rejected": -117.003486328125, "loss": 2.8854, "rewards/chosen": -0.4863753196520683, "rewards/margins": 1.832518875904572, "rewards/rejected": -2.3188941955566404, "step": 580 }, { "epoch": 18.74493927125506, "grad_norm": 207.0, "kl": 0.0, "learning_rate": 1.776541247281177e-08, "logits/chosen": -72270192.0, "logits/rejected": -98189624.0, "logps/chosen": -179.48814392089844, "logps/rejected": -144.32000732421875, "loss": 2.9355, "rewards/chosen": -0.8574260473251343, "rewards/margins": 1.763670802116394, "rewards/rejected": -2.6210968494415283, "step": 581 }, { "epoch": 18.777327935222672, "grad_norm": 190.0, "kl": 0.0, "learning_rate": 1.6893219206486232e-08, "logits/chosen": -66197511.52941176, "logits/rejected": -93673710.93333334, "logps/chosen": -219.97254136029412, "logps/rejected": -133.07320963541667, "loss": 2.9689, "rewards/chosen": -0.29543253954719095, "rewards/margins": 2.1152740441116635, "rewards/rejected": -2.4107065836588544, "step": 582 }, { "epoch": 18.809716599190285, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 1.604261061674378e-08, "logits/chosen": -81222991.44827586, "logits/rejected": -98668814.62857144, "logps/chosen": -201.54125134698276, "logps/rejected": -147.55199497767856, "loss": 2.9148, "rewards/chosen": -1.2708594223548626, "rewards/margins": 0.9009423354576376, "rewards/rejected": -2.1718017578125, "step": 583 }, { "epoch": 18.842105263157894, "grad_norm": 173.0, "kl": 0.49798399209976196, "learning_rate": 1.521362470783527e-08, "logits/chosen": -62185016.0, "logits/rejected": -105551344.0, "logps/chosen": -258.7596740722656, "logps/rejected": -128.31753540039062, "loss": 3.0035, "rewards/chosen": -0.22225487232208252, "rewards/margins": 2.3856805562973022, "rewards/rejected": -2.6079354286193848, "step": 584 }, { "epoch": 18.874493927125506, "grad_norm": 198.0, "kl": 0.0, "learning_rate": 1.4406298517934067e-08, "logits/chosen": -59548643.55555555, "logits/rejected": -96952009.14285715, "logps/chosen": -247.26814778645834, "logps/rejected": -137.45335170200892, "loss": 3.0096, "rewards/chosen": -0.10614852772818671, "rewards/margins": 2.2921317522487943, "rewards/rejected": -2.398280279976981, "step": 585 }, { "epoch": 18.90688259109312, "grad_norm": 179.0, "kl": 0.0, "learning_rate": 1.3620668117481471e-08, "logits/chosen": -58998584.32, "logits/rejected": -103587006.35897435, "logps/chosen": -258.81755859375, "logps/rejected": -155.08145532852564, "loss": 2.7471, "rewards/chosen": -0.4238716125488281, "rewards/margins": 2.356509223351112, "rewards/rejected": -2.78038083589994, "step": 586 }, { "epoch": 18.939271255060728, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 1.2856768607574564e-08, "logits/chosen": -77692064.91428572, "logits/rejected": -100637643.03448276, "logps/chosen": -178.57513950892857, "logps/rejected": -141.15901131465517, "loss": 2.9353, "rewards/chosen": -0.5219748360770089, "rewards/margins": 1.8674815511468597, "rewards/rejected": -2.3894563872238685, "step": 587 }, { "epoch": 18.97165991902834, "grad_norm": 178.0, "kl": 0.0, "learning_rate": 1.2114634118398636e-08, "logits/chosen": -71724122.07407407, "logits/rejected": -101234425.08108108, "logps/chosen": -171.4249312789352, "logps/rejected": -138.18636507601352, "loss": 2.8538, "rewards/chosen": -0.5897406118887442, "rewards/margins": 1.7728880155790558, "rewards/rejected": -2.3626286274678, "step": 588 }, { "epoch": 19.0, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 1.1394297807701736e-08, "logits/chosen": -57727393.684210524, "logits/rejected": -98007841.39130434, "logps/chosen": -137.22663959703948, "logps/rejected": -125.0175144361413, "loss": 2.9374, "rewards/chosen": -0.94057886224044, "rewards/margins": 1.3029475964997945, "rewards/rejected": -2.2435264587402344, "step": 589 }, { "epoch": 19.032388663967613, "grad_norm": 223.0, "kl": 0.0, "learning_rate": 1.0695791859313297e-08, "logits/chosen": -62233856.0, "logits/rejected": -96324292.26666667, "logps/chosen": -215.00531364889707, "logps/rejected": -113.19478352864583, "loss": 2.9551, "rewards/chosen": -0.4224257749669692, "rewards/margins": 1.4737296459721585, "rewards/rejected": -1.8961554209391276, "step": 590 }, { "epoch": 19.06477732793522, "grad_norm": 193.0, "kl": 0.0, "learning_rate": 1.0019147481706625e-08, "logits/chosen": -66884080.941176474, "logits/rejected": -87609847.46666667, "logps/chosen": -229.33608111213235, "logps/rejected": -112.12271321614584, "loss": 2.97, "rewards/chosen": -0.5385867287130917, "rewards/margins": 1.040519920049929, "rewards/rejected": -1.5791066487630208, "step": 591 }, { "epoch": 19.097165991902834, "grad_norm": 199.0, "kl": 0.0, "learning_rate": 9.364394906603901e-09, "logits/chosen": -67289881.6, "logits/rejected": -103940103.52941176, "logps/chosen": -220.681787109375, "logps/rejected": -135.76351390165442, "loss": 2.8715, "rewards/chosen": -0.3593327840169271, "rewards/margins": 2.26477435242896, "rewards/rejected": -2.624107136445887, "step": 592 }, { "epoch": 19.129554655870447, "grad_norm": 224.0, "kl": 0.0, "learning_rate": 8.731563387626096e-09, "logits/chosen": -66846457.43589743, "logits/rejected": -98716825.6, "logps/chosen": -237.10003505608975, "logps/rejected": -126.0630078125, "loss": 3.1252, "rewards/chosen": -0.26341386941763073, "rewards/margins": 1.9221888527503381, "rewards/rejected": -2.185602722167969, "step": 593 }, { "epoch": 19.161943319838056, "grad_norm": 172.0, "kl": 0.0, "learning_rate": 8.12068119898529e-09, "logits/chosen": -57211475.862068966, "logits/rejected": -98426411.88571429, "logps/chosen": -210.41665544181035, "logps/rejected": -133.2943359375, "loss": 2.8466, "rewards/chosen": -0.11182959326382341, "rewards/margins": 2.163675690636846, "rewards/rejected": -2.2755052839006695, "step": 594 }, { "epoch": 19.194331983805668, "grad_norm": 177.0, "kl": 0.0, "learning_rate": 7.531775634222137e-09, "logits/chosen": -54971697.548387095, "logits/rejected": -94485984.96969697, "logps/chosen": -158.12134576612902, "logps/rejected": -118.03771602746212, "loss": 2.9513, "rewards/chosen": 0.009283790665288125, "rewards/margins": 2.3076795833085173, "rewards/rejected": -2.298395792643229, "step": 595 }, { "epoch": 19.22672064777328, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 6.964873004985716e-09, "logits/chosen": -64525327.515151516, "logits/rejected": -90796213.67741935, "logps/chosen": -149.28101325757575, "logps/rejected": -150.80052923387098, "loss": 2.9345, "rewards/chosen": -0.4541704004461115, "rewards/margins": 1.7915246521622554, "rewards/rejected": -2.245695052608367, "step": 596 }, { "epoch": 19.25910931174089, "grad_norm": 176.0, "kl": 0.0, "learning_rate": 6.419998639858537e-09, "logits/chosen": -52022404.4137931, "logits/rejected": -100259540.11428571, "logps/chosen": -237.0428845635776, "logps/rejected": -138.39637276785714, "loss": 2.9298, "rewards/chosen": -0.06544678375638764, "rewards/margins": 2.4803679483864696, "rewards/rejected": -2.545814732142857, "step": 597 }, { "epoch": 19.291497975708502, "grad_norm": 160.0, "kl": 0.0, "learning_rate": 5.897176883224442e-09, "logits/chosen": -75506405.05263157, "logits/rejected": -96458377.84615384, "logps/chosen": -187.66165964226974, "logps/rejected": -150.69378192608173, "loss": 2.9687, "rewards/chosen": -0.8175224504972759, "rewards/margins": 1.7536219469448815, "rewards/rejected": -2.5711443974421573, "step": 598 }, { "epoch": 19.323886639676115, "grad_norm": 196.0, "kl": 0.0, "learning_rate": 5.396431094181197e-09, "logits/chosen": -71299483.82608695, "logits/rejected": -98316450.34146342, "logps/chosen": -152.37476647418478, "logps/rejected": -125.3078672827744, "loss": 2.7654, "rewards/chosen": -1.1561669059421704, "rewards/margins": 1.3804597702916328, "rewards/rejected": -2.5366266762338032, "step": 599 }, { "epoch": 19.356275303643724, "grad_norm": 182.0, "kl": 0.0, "learning_rate": 4.917783645496887e-09, "logits/chosen": -61824577.82857143, "logits/rejected": -92605651.86206897, "logps/chosen": -195.48423549107142, "logps/rejected": -140.68437668372846, "loss": 2.9994, "rewards/chosen": -0.2091134752546038, "rewards/margins": 2.065050132638715, "rewards/rejected": -2.274163607893319, "step": 600 }, { "epoch": 19.356275303643724, "eval_kl": 0.0, "eval_logits/chosen": -80932414.38872105, "eval_logits/rejected": -123042566.97477296, "eval_logps/chosen": -211.94826283987916, "eval_logps/rejected": -135.2585141271443, "eval_loss": 0.2648468315601349, "eval_rewards/chosen": -0.4634648041662733, "eval_rewards/margins": 1.9077257644670516, "eval_rewards/rejected": -2.371190568633325, "eval_runtime": 64.2068, "eval_samples_per_second": 30.698, "eval_steps_per_second": 0.966, "step": 600 }, { "epoch": 19.388663967611336, "grad_norm": 202.0, "kl": 0.0, "learning_rate": 4.461255922609985e-09, "logits/chosen": -61884233.14285714, "logits/rejected": -93251349.33333333, "logps/chosen": -285.1388462611607, "logps/rejected": -118.31480577256944, "loss": 2.9312, "rewards/chosen": -0.23080904143197195, "rewards/margins": 1.8524912311917259, "rewards/rejected": -2.0833002726236978, "step": 601 }, { "epoch": 19.42105263157895, "grad_norm": 180.0, "kl": 0.0, "learning_rate": 4.026868322674126e-09, "logits/chosen": -68158340.74074075, "logits/rejected": -100057357.83783785, "logps/chosen": -255.65849247685185, "logps/rejected": -136.62403663429055, "loss": 2.8616, "rewards/chosen": -0.6145321881329572, "rewards/margins": 1.7828052356555775, "rewards/rejected": -2.3973374237885348, "step": 602 }, { "epoch": 19.453441295546558, "grad_norm": 158.0, "kl": 0.0, "learning_rate": 3.614640253646828e-09, "logits/chosen": -53845440.0, "logits/rejected": -97514646.58823529, "logps/chosen": -255.58486328125, "logps/rejected": -112.47178021599265, "loss": 2.8591, "rewards/chosen": -0.14204190572102865, "rewards/margins": 2.236787152757832, "rewards/rejected": -2.3788290584788605, "step": 603 }, { "epoch": 19.48582995951417, "grad_norm": 195.0, "kl": 0.0, "learning_rate": 3.224590133422189e-09, "logits/chosen": -58276469.76, "logits/rejected": -100134688.82051282, "logps/chosen": -239.5990625, "logps/rejected": -150.77152193509616, "loss": 2.911, "rewards/chosen": -0.60001708984375, "rewards/margins": 1.850728712815505, "rewards/rejected": -2.450745802659255, "step": 604 }, { "epoch": 19.518218623481783, "grad_norm": 185.0, "kl": 0.0, "learning_rate": 2.856735389008269e-09, "logits/chosen": -65148276.36363637, "logits/rejected": -104359539.61290322, "logps/chosen": -247.1654385653409, "logps/rejected": -145.9519279233871, "loss": 2.8979, "rewards/chosen": -0.43779532114664715, "rewards/margins": 2.1835324789888118, "rewards/rejected": -2.6213278001354587, "step": 605 }, { "epoch": 19.55060728744939, "grad_norm": 227.0, "kl": 0.0, "learning_rate": 2.511092455747932e-09, "logits/chosen": -65956707.09677419, "logits/rejected": -100433291.63636364, "logps/chosen": -176.9268995715726, "logps/rejected": -154.3150301846591, "loss": 3.0174, "rewards/chosen": -0.24795398404521327, "rewards/margins": 2.135832072935729, "rewards/rejected": -2.3837860569809424, "step": 606 }, { "epoch": 19.582995951417004, "grad_norm": 153.0, "kl": 0.0, "learning_rate": 2.1876767765853233e-09, "logits/chosen": -59242549.89473684, "logits/rejected": -97430350.76923077, "logps/chosen": -248.02847450657896, "logps/rejected": -134.95637394831732, "loss": 2.9473, "rewards/chosen": -0.2580819380910773, "rewards/margins": 1.84005701783215, "rewards/rejected": -2.0981389559232273, "step": 607 }, { "epoch": 19.615384615384617, "grad_norm": 159.0, "kl": 0.0, "learning_rate": 1.886502801375145e-09, "logits/chosen": -66908516.0, "logits/rejected": -107828312.0, "logps/chosen": -252.5504150390625, "logps/rejected": -148.5430450439453, "loss": 2.9293, "rewards/chosen": -0.21675539016723633, "rewards/margins": 2.51041316986084, "rewards/rejected": -2.727168560028076, "step": 608 }, { "epoch": 19.647773279352226, "grad_norm": 162.0, "kl": 0.0, "learning_rate": 1.6075839862374486e-09, "logits/chosen": -77311590.4, "logits/rejected": -97191715.31034483, "logps/chosen": -178.54609375, "logps/rejected": -148.06206223060346, "loss": 2.9278, "rewards/chosen": -1.171681431361607, "rewards/margins": 1.0751242200729296, "rewards/rejected": -2.2468056514345367, "step": 609 }, { "epoch": 19.68016194331984, "grad_norm": 190.0, "kl": 0.36493146419525146, "learning_rate": 1.350932792956394e-09, "logits/chosen": -68956570.94736843, "logits/rejected": -98587549.53846154, "logps/chosen": -212.4560418379934, "logps/rejected": -131.00518329326923, "loss": 2.9961, "rewards/chosen": -0.4534718362908614, "rewards/margins": 2.065167654863736, "rewards/rejected": -2.5186394911545973, "step": 610 }, { "epoch": 19.71255060728745, "grad_norm": 221.0, "kl": 0.0, "learning_rate": 1.116560688423418e-09, "logits/chosen": -68719432.20512821, "logits/rejected": -96288501.76, "logps/chosen": -156.50179036458334, "logps/rejected": -116.9461328125, "loss": 2.8878, "rewards/chosen": -0.48116830679086536, "rewards/margins": 1.831990720308744, "rewards/rejected": -2.3131590270996094, "step": 611 }, { "epoch": 19.74493927125506, "grad_norm": 225.0, "kl": 0.0, "learning_rate": 9.044781441249205e-10, "logits/chosen": -72238784.0, "logits/rejected": -98076880.0, "logps/chosen": -179.20614624023438, "logps/rejected": -144.458740234375, "loss": 2.931, "rewards/chosen": -0.829226016998291, "rewards/margins": 1.8057453632354736, "rewards/rejected": -2.6349713802337646, "step": 612 }, { "epoch": 19.777327935222672, "grad_norm": 228.0, "kl": 0.0, "learning_rate": 7.146946356743067e-10, "logits/chosen": -66387584.0, "logits/rejected": -93888068.26666667, "logps/chosen": -220.02590762867646, "logps/rejected": -132.79898274739583, "loss": 2.9749, "rewards/chosen": -0.3007695815142463, "rewards/margins": 2.082514415067785, "rewards/rejected": -2.383283996582031, "step": 613 }, { "epoch": 19.809716599190285, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 5.472186423889358e-10, "logits/chosen": -81260146.7586207, "logits/rejected": -98581621.02857143, "logps/chosen": -201.7942483836207, "logps/rejected": -147.68328683035713, "loss": 2.9374, "rewards/chosen": -1.2961596784920528, "rewards/margins": 0.8887740111703357, "rewards/rejected": -2.1849336896623885, "step": 614 }, { "epoch": 19.842105263157894, "grad_norm": 187.0, "kl": 0.4474652409553528, "learning_rate": 4.020576469108139e-10, "logits/chosen": -61966072.0, "logits/rejected": -105689056.0, "logps/chosen": -258.4248046875, "logps/rejected": -128.25125122070312, "loss": 3.0108, "rewards/chosen": -0.18876656889915466, "rewards/margins": 2.412539392709732, "rewards/rejected": -2.6013059616088867, "step": 615 }, { "epoch": 19.874493927125506, "grad_norm": 201.0, "kl": 0.0, "learning_rate": 2.7921813487269407e-10, "logits/chosen": -59527402.666666664, "logits/rejected": -96933705.14285715, "logps/chosen": -247.00466579861111, "logps/rejected": -137.4833984375, "loss": 3.0069, "rewards/chosen": -0.07980091041988796, "rewards/margins": 2.321484261088901, "rewards/rejected": -2.401285171508789, "step": 616 }, { "epoch": 19.90688259109312, "grad_norm": 165.0, "kl": 0.0, "learning_rate": 1.787055946081417e-10, "logits/chosen": -59101905.92, "logits/rejected": -103402528.82051282, "logps/chosen": -258.68126953125, "logps/rejected": -155.44696514423077, "loss": 2.7413, "rewards/chosen": -0.4102425003051758, "rewards/margins": 2.406688612913474, "rewards/rejected": -2.81693111321865, "step": 617 }, { "epoch": 19.939271255060728, "grad_norm": 191.0, "kl": 0.0, "learning_rate": 1.0052451690617525e-10, "logits/chosen": -77806496.91428572, "logits/rejected": -100760434.7586207, "logps/chosen": -178.48556082589286, "logps/rejected": -141.04423154633622, "loss": 2.9302, "rewards/chosen": -0.5130174364362444, "rewards/margins": 1.864962499834634, "rewards/rejected": -2.3779799362708784, "step": 618 }, { "epoch": 19.97165991902834, "grad_norm": 186.0, "kl": 0.0, "learning_rate": 4.46783948109819e-11, "logits/chosen": -71679924.14814815, "logits/rejected": -101188552.64864865, "logps/chosen": -171.4169017650463, "logps/rejected": -138.0185942778716, "loss": 2.8654, "rewards/chosen": -0.58893797132704, "rewards/margins": 1.7569142361660977, "rewards/rejected": -2.3458522074931376, "step": 619 }, { "epoch": 20.0, "grad_norm": 181.0, "kl": 0.0, "learning_rate": 1.1169723465487279e-11, "logits/chosen": -57547904.0, "logits/rejected": -97859750.95652173, "logps/chosen": -137.3400750411184, "logps/rejected": -125.08204186480978, "loss": 2.9397, "rewards/chosen": -0.9519195556640625, "rewards/margins": 1.298059214716372, "rewards/rejected": -2.2499787703804346, "step": 620 } ], "logging_steps": 1, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }