diff --git "a/V2/checkpoint-620/trainer_state.json" "b/V2/checkpoint-620/trainer_state.json" new file mode 100644--- /dev/null +++ "b/V2/checkpoint-620/trainer_state.json" @@ -0,0 +1,9382 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 20.0, + "eval_steps": 200, + "global_step": 620, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.032388663967611336, + "grad_norm": 220.0, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": -58430595.76470588, + "logits/rejected": -87931281.06666666, + "logps/chosen": -210.7810489430147, + "logps/rejected": -94.23323567708333, + "loss": 3.8188, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.06477732793522267, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 6.666666666666667e-09, + "logits/chosen": -62450601.4117647, + "logits/rejected": -80818483.2, + "logps/chosen": -223.95020967371323, + "logps/rejected": -96.331640625, + "loss": 3.7399, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.09716599190283401, + "grad_norm": 220.0, + "kl": 0.23693042993545532, + "learning_rate": 1.3333333333333334e-08, + "logits/chosen": -63959364.266666666, + "logits/rejected": -94508792.47058824, + "logps/chosen": -216.87962239583334, + "logps/rejected": -109.44955623851104, + "loss": 3.7552, + "rewards/chosen": 0.020882568756739297, + "rewards/margins": 0.013593461086937025, + "rewards/rejected": 0.007289107669802273, + "step": 3 + }, + { + "epoch": 0.12955465587044535, + "grad_norm": 231.0, + "kl": 0.16312503814697266, + "learning_rate": 2e-08, + "logits/chosen": -63178187.48717949, + "logits/rejected": -89863157.76, + "logps/chosen": -234.58611278044873, + "logps/rejected": -104.408388671875, + "loss": 3.9892, + "rewards/chosen": -0.012021300120231433, + "rewards/margins": 0.008119157033088879, + "rewards/rejected": -0.020140457153320312, + "step": 4 + }, + { + "epoch": 0.16194331983805668, + "grad_norm": 203.0, + "kl": 0.0684085488319397, + "learning_rate": 2.6666666666666667e-08, + "logits/chosen": -54792209.655172415, + "logits/rejected": -89121280.0, + "logps/chosen": -209.4049703663793, + "logps/rejected": -110.37323521205357, + "loss": 3.6589, + "rewards/chosen": -0.01066252589225769, + "rewards/margins": -0.027266709293637958, + "rewards/rejected": 0.016604183401380267, + "step": 5 + }, + { + "epoch": 0.19433198380566802, + "grad_norm": 236.0, + "kl": 0.36925557255744934, + "learning_rate": 3.3333333333333334e-08, + "logits/chosen": -52978085.161290325, + "logits/rejected": -86475791.51515152, + "logps/chosen": -158.39511403729838, + "logps/rejected": -95.0108309659091, + "loss": 3.9354, + "rewards/chosen": -0.018092793803061207, + "rewards/margins": -0.022386104625923775, + "rewards/rejected": 0.004293310822862567, + "step": 6 + }, + { + "epoch": 0.22672064777327935, + "grad_norm": 235.0, + "kl": 0.0888870358467102, + "learning_rate": 4e-08, + "logits/chosen": -60282100.36363637, + "logits/rejected": -84002072.77419356, + "logps/chosen": -144.59225556344697, + "logps/rejected": -128.4602523311492, + "loss": 3.8797, + "rewards/chosen": 0.014703954711104885, + "rewards/margins": 0.02637073811314789, + "rewards/rejected": -0.011666783402043005, + "step": 7 + }, + { + "epoch": 0.2591093117408907, + "grad_norm": 196.0, + "kl": 0.044393956661224365, + "learning_rate": 4.666666666666667e-08, + "logits/chosen": -48575488.0, + "logits/rejected": -91620359.31428571, + "logps/chosen": -236.1129781788793, + "logps/rejected": -113.21409737723214, + "loss": 3.7969, + "rewards/chosen": 0.02754283362421496, + "rewards/margins": 0.055130841755514665, + "rewards/rejected": -0.0275880081312997, + "step": 8 + }, + { + "epoch": 0.291497975708502, + "grad_norm": 192.0, + "kl": 0.08808565139770508, + "learning_rate": 5.3333333333333334e-08, + "logits/chosen": -70911016.42105263, + "logits/rejected": -87392147.6923077, + "logps/chosen": -179.357421875, + "logps/rejected": -124.7702167217548, + "loss": 3.823, + "rewards/chosen": 0.0129009579357348, + "rewards/margins": -0.008311521307176904, + "rewards/rejected": 0.021212479242911704, + "step": 9 + }, + { + "epoch": 0.32388663967611336, + "grad_norm": 214.0, + "kl": 0.2002100646495819, + "learning_rate": 6e-08, + "logits/chosen": -66679952.69565217, + "logits/rejected": -90590114.34146342, + "logps/chosen": -140.9425632642663, + "logps/rejected": -99.87619688452745, + "loss": 3.7412, + "rewards/chosen": -0.012947371472483095, + "rewards/margins": -0.019486692999731572, + "rewards/rejected": 0.006539321527248476, + "step": 10 + }, + { + "epoch": 0.3562753036437247, + "grad_norm": 192.0, + "kl": 0.28088629245758057, + "learning_rate": 6.666666666666667e-08, + "logits/chosen": -58234137.6, + "logits/rejected": -85435400.8275862, + "logps/chosen": -193.31727120535714, + "logps/rejected": -117.9307061557112, + "loss": 3.7995, + "rewards/chosen": 0.007582300049918039, + "rewards/margins": 0.006378676481728483, + "rewards/rejected": 0.001203623568189555, + "step": 11 + }, + { + "epoch": 0.38866396761133604, + "grad_norm": 205.0, + "kl": 0.11991578340530396, + "learning_rate": 7.333333333333333e-08, + "logits/chosen": -58462386.28571428, + "logits/rejected": -85638883.55555555, + "logps/chosen": -282.8749476841518, + "logps/rejected": -97.3487548828125, + "loss": 3.7668, + "rewards/chosen": -0.00441945663520268, + "rewards/margins": -0.017723340836782304, + "rewards/rejected": 0.013303884201579623, + "step": 12 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 404.0, + "kl": 0.37106019258499146, + "learning_rate": 8e-08, + "logits/chosen": -64843439.40740741, + "logits/rejected": -90882241.72972973, + "logps/chosen": -249.62418619791666, + "logps/rejected": -112.53064294763513, + "loss": 3.7803, + "rewards/chosen": -0.011102434661653306, + "rewards/margins": -0.023103111409568215, + "rewards/rejected": 0.012000676747914907, + "step": 13 + }, + { + "epoch": 0.4534412955465587, + "grad_norm": 199.0, + "kl": 0.09121906757354736, + "learning_rate": 8.666666666666666e-08, + "logits/chosen": -51182941.86666667, + "logits/rejected": -89462159.05882353, + "logps/chosen": -254.28321940104166, + "logps/rejected": -88.68126005284927, + "loss": 3.7676, + "rewards/chosen": -0.011877447366714478, + "rewards/margins": -0.012099783210193409, + "rewards/rejected": 0.0002223358434789321, + "step": 14 + }, + { + "epoch": 0.48582995951417, + "grad_norm": 190.0, + "kl": 0.04368013143539429, + "learning_rate": 9.333333333333334e-08, + "logits/chosen": -54074613.76, + "logits/rejected": -91609744.41025642, + "logps/chosen": -233.44771484375, + "logps/rejected": -126.24091045673077, + "loss": 3.8555, + "rewards/chosen": 0.015118194818496704, + "rewards/margins": 0.012803569940420298, + "rewards/rejected": 0.0023146248780764067, + "step": 15 + }, + { + "epoch": 0.5182186234817814, + "grad_norm": 197.0, + "kl": 0.13226062059402466, + "learning_rate": 1e-07, + "logits/chosen": -61068962.90909091, + "logits/rejected": -94692616.25806452, + "logps/chosen": -242.7887665719697, + "logps/rejected": -119.58937121975806, + "loss": 3.7451, + "rewards/chosen": -0.0001258962985241052, + "rewards/margins": -0.01505279516253187, + "rewards/rejected": 0.014926898864007766, + "step": 16 + }, + { + "epoch": 0.5506072874493927, + "grad_norm": 230.0, + "kl": 0.29574519395828247, + "learning_rate": 1.0666666666666667e-07, + "logits/chosen": -62746458.838709675, + "logits/rejected": -91721153.93939394, + "logps/chosen": -174.30223034274192, + "logps/rejected": -130.68303148674244, + "loss": 3.9933, + "rewards/chosen": 0.014514055944258166, + "rewards/margins": 0.03509998304053835, + "rewards/rejected": -0.020585927096280186, + "step": 17 + }, + { + "epoch": 0.582995951417004, + "grad_norm": 196.0, + "kl": 0.14987093210220337, + "learning_rate": 1.1333333333333332e-07, + "logits/chosen": -56216569.2631579, + "logits/rejected": -88530510.76923077, + "logps/chosen": -245.4069181743421, + "logps/rejected": -114.16628793569711, + "loss": 3.7232, + "rewards/chosen": 0.004077738445056112, + "rewards/margins": 0.02320850369056709, + "rewards/rejected": -0.01913076524551098, + "step": 18 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 187.0, + "kl": 0.3079838752746582, + "learning_rate": 1.2e-07, + "logits/chosen": -63619196.0, + "logits/rejected": -97384832.0, + "logps/chosen": -250.46109008789062, + "logps/rejected": -121.32066345214844, + "loss": 3.7837, + "rewards/chosen": -0.007823335006833076, + "rewards/margins": -0.0028944951482117176, + "rewards/rejected": -0.004928839858621359, + "step": 19 + }, + { + "epoch": 0.6477732793522267, + "grad_norm": 178.0, + "kl": 0.2797037363052368, + "learning_rate": 1.2666666666666666e-07, + "logits/chosen": -72328352.91428572, + "logits/rejected": -87969871.44827586, + "logps/chosen": -166.88225446428572, + "logps/rejected": -125.47866716056035, + "loss": 3.7522, + "rewards/chosen": -0.005298037188393729, + "rewards/margins": -0.016832793815969832, + "rewards/rejected": 0.011534756627576104, + "step": 20 + }, + { + "epoch": 0.680161943319838, + "grad_norm": 169.0, + "kl": 0.18229222297668457, + "learning_rate": 1.3333333333333334e-07, + "logits/chosen": -64960970.10526316, + "logits/rejected": -91011012.92307693, + "logps/chosen": -207.93300267269737, + "logps/rejected": -106.05503493088942, + "loss": 3.7582, + "rewards/chosen": -0.0011691284414968993, + "rewards/margins": 0.022455291724518725, + "rewards/rejected": -0.023624420166015625, + "step": 21 + }, + { + "epoch": 0.7125506072874493, + "grad_norm": 245.0, + "kl": 0.2452963888645172, + "learning_rate": 1.4e-07, + "logits/chosen": -65512395.48717949, + "logits/rejected": -87635998.72, + "logps/chosen": -151.80533854166666, + "logps/rejected": -93.67271484375, + "loss": 3.8238, + "rewards/chosen": -0.01152306718704028, + "rewards/margins": -0.025706659983365965, + "rewards/rejected": 0.014183592796325684, + "step": 22 + }, + { + "epoch": 0.7449392712550608, + "grad_norm": 207.0, + "kl": 0.26080161333084106, + "learning_rate": 1.4666666666666666e-07, + "logits/chosen": -67878600.0, + "logits/rejected": -89875760.0, + "logps/chosen": -170.92079162597656, + "logps/rejected": -118.2757797241211, + "loss": 3.8494, + "rewards/chosen": -0.0006897321436554193, + "rewards/margins": 0.015984160592779517, + "rewards/rejected": -0.016673892736434937, + "step": 23 + }, + { + "epoch": 0.7773279352226721, + "grad_norm": 216.0, + "kl": 0.32619625329971313, + "learning_rate": 1.533333333333333e-07, + "logits/chosen": -62605857.88235294, + "logits/rejected": -85273693.86666666, + "logps/chosen": -216.76492130055146, + "logps/rejected": -109.14388020833333, + "loss": 3.8839, + "rewards/chosen": 0.025327740346684176, + "rewards/margins": 0.04310074074595582, + "rewards/rejected": -0.017773000399271648, + "step": 24 + }, + { + "epoch": 0.8097165991902834, + "grad_norm": 217.0, + "kl": 0.17487949132919312, + "learning_rate": 1.6e-07, + "logits/chosen": -75539535.44827586, + "logits/rejected": -90929020.34285714, + "logps/chosen": -189.0615234375, + "logps/rejected": -125.88074776785714, + "loss": 3.8346, + "rewards/chosen": -0.02288691339821651, + "rewards/margins": -0.018209210698827735, + "rewards/rejected": -0.004677702699388777, + "step": 25 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 231.0, + "kl": 0.2163093388080597, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -58751488.0, + "logits/rejected": -95374936.0, + "logps/chosen": -256.7890319824219, + "logps/rejected": -102.30900573730469, + "loss": 3.9043, + "rewards/chosen": -0.02519126608967781, + "rewards/margins": -0.018108618445694447, + "rewards/rejected": -0.007082647643983364, + "step": 26 + }, + { + "epoch": 0.8744939271255061, + "grad_norm": 185.0, + "kl": 0.09094679355621338, + "learning_rate": 1.7333333333333332e-07, + "logits/chosen": -56075064.88888889, + "logits/rejected": -88336576.0, + "logps/chosen": -246.36691623263889, + "logps/rejected": -113.58909388950893, + "loss": 3.831, + "rewards/chosen": -0.01602463920911153, + "rewards/margins": -0.00417077505872363, + "rewards/rejected": -0.0118538641503879, + "step": 27 + }, + { + "epoch": 0.9068825910931174, + "grad_norm": 207.0, + "kl": 0.12784463167190552, + "learning_rate": 1.8e-07, + "logits/chosen": -56007577.6, + "logits/rejected": -94364580.1025641, + "logps/chosen": -254.408984375, + "logps/rejected": -127.31337640224359, + "loss": 3.7463, + "rewards/chosen": 0.016985907554626464, + "rewards/margins": 0.020557051835915982, + "rewards/rejected": -0.0035711442812895165, + "step": 28 + }, + { + "epoch": 0.9392712550607287, + "grad_norm": 197.0, + "kl": 0.04759460687637329, + "learning_rate": 1.8666666666666667e-07, + "logits/chosen": -73918902.85714285, + "logits/rejected": -90738123.03448276, + "logps/chosen": -173.3303013392857, + "logps/rejected": -117.31009226831897, + "loss": 3.8649, + "rewards/chosen": 0.0025086658341544016, + "rewards/margins": 0.007073412534638579, + "rewards/rejected": -0.004564746700484177, + "step": 29 + }, + { + "epoch": 0.97165991902834, + "grad_norm": 227.0, + "kl": 0.10075879096984863, + "learning_rate": 1.9333333333333332e-07, + "logits/chosen": -67702632.2962963, + "logits/rejected": -93046195.8918919, + "logps/chosen": -165.40802228009258, + "logps/rejected": -114.92798511402027, + "loss": 3.7333, + "rewards/chosen": 0.01195223574285154, + "rewards/margins": 0.04874239404041607, + "rewards/rejected": -0.03679015829756453, + "step": 30 + }, + { + "epoch": 1.0, + "grad_norm": 213.0, + "kl": 0.12372662127017975, + "learning_rate": 2e-07, + "logits/chosen": -53873344.0, + "logits/rejected": -90412911.30434783, + "logps/chosen": -127.94631476151316, + "logps/rejected": -102.76524286684783, + "loss": 3.7935, + "rewards/chosen": -0.012544832731548109, + "rewards/margins": 0.0057541689829095286, + "rewards/rejected": -0.018299001714457638, + "step": 31 + }, + { + "epoch": 1.0323886639676114, + "grad_norm": 212.0, + "kl": 0.26420849561691284, + "learning_rate": 2.0666666666666666e-07, + "logits/chosen": -58409840.941176474, + "logits/rejected": -87994436.26666667, + "logps/chosen": -210.7363712086397, + "logps/rejected": -94.37674967447917, + "loss": 3.7993, + "rewards/chosen": 0.004469211487209096, + "rewards/margins": 0.01882150529646406, + "rewards/rejected": -0.014352293809254964, + "step": 32 + }, + { + "epoch": 1.0647773279352226, + "grad_norm": 205.0, + "kl": 0.15617316961288452, + "learning_rate": 2.1333333333333334e-07, + "logits/chosen": -62545208.47058824, + "logits/rejected": -80585830.4, + "logps/chosen": -223.84014533547793, + "logps/rejected": -96.57676595052084, + "loss": 3.7213, + "rewards/chosen": 0.01100671466659097, + "rewards/margins": 0.03551927185526081, + "rewards/rejected": -0.02451255718866984, + "step": 33 + }, + { + "epoch": 1.097165991902834, + "grad_norm": 212.0, + "kl": 0.06040966510772705, + "learning_rate": 2.1999999999999998e-07, + "logits/chosen": -63960251.733333334, + "logits/rejected": -94734132.70588236, + "logps/chosen": -217.113916015625, + "logps/rejected": -109.65385885799633, + "loss": 3.7561, + "rewards/chosen": -0.002544252077738444, + "rewards/margins": 0.010596939979815015, + "rewards/rejected": -0.01314119205755346, + "step": 34 + }, + { + "epoch": 1.1295546558704452, + "grad_norm": 227.0, + "kl": 0.10720312595367432, + "learning_rate": 2.2666666666666663e-07, + "logits/chosen": -63211966.35897436, + "logits/rejected": -89783726.08, + "logps/chosen": -234.25676081730768, + "logps/rejected": -104.3000390625, + "loss": 3.969, + "rewards/chosen": 0.020914004399226263, + "rewards/margins": 0.030220148517535284, + "rewards/rejected": -0.009306144118309021, + "step": 35 + }, + { + "epoch": 1.1619433198380567, + "grad_norm": 197.0, + "kl": 0.005383551120758057, + "learning_rate": 2.3333333333333333e-07, + "logits/chosen": -54717854.89655172, + "logits/rejected": -89043229.25714286, + "logps/chosen": -209.59223464439654, + "logps/rejected": -110.57509765625, + "loss": 3.6599, + "rewards/chosen": -0.029390470734957992, + "rewards/margins": -0.025808011898266273, + "rewards/rejected": -0.00358245883669172, + "step": 36 + }, + { + "epoch": 1.194331983805668, + "grad_norm": 239.0, + "kl": 0.1495572328567505, + "learning_rate": 2.4e-07, + "logits/chosen": -52902044.90322581, + "logits/rejected": -86327823.51515152, + "logps/chosen": -158.13993195564515, + "logps/rejected": -95.27357806581439, + "loss": 3.9352, + "rewards/chosen": 0.0074248765745470605, + "rewards/margins": 0.029407006350663633, + "rewards/rejected": -0.021982129776116573, + "step": 37 + }, + { + "epoch": 1.2267206477732793, + "grad_norm": 237.0, + "kl": 0.17384201288223267, + "learning_rate": 2.4666666666666665e-07, + "logits/chosen": -60058088.72727273, + "logits/rejected": -84006499.09677419, + "logps/chosen": -144.7613340435606, + "logps/rejected": -128.73287865423387, + "loss": 3.8516, + "rewards/chosen": -0.0022030806902683144, + "rewards/margins": 0.036726773979610244, + "rewards/rejected": -0.03892985466987856, + "step": 38 + }, + { + "epoch": 1.2591093117408907, + "grad_norm": 182.0, + "kl": 0.15346676111221313, + "learning_rate": 2.533333333333333e-07, + "logits/chosen": -48408002.20689655, + "logits/rejected": -91550485.94285715, + "logps/chosen": -236.32336004849137, + "logps/rejected": -113.23631417410714, + "loss": 3.7963, + "rewards/chosen": 0.006504716544315733, + "rewards/margins": 0.03631391219904857, + "rewards/rejected": -0.02980919565473284, + "step": 39 + }, + { + "epoch": 1.291497975708502, + "grad_norm": 211.0, + "kl": 0.1730141043663025, + "learning_rate": 2.6e-07, + "logits/chosen": -70837530.94736843, + "logits/rejected": -87514023.38461539, + "logps/chosen": -179.63252981085526, + "logps/rejected": -125.02696814903847, + "loss": 3.8002, + "rewards/chosen": -0.01461096813804225, + "rewards/margins": -0.010148805675477634, + "rewards/rejected": -0.0044621624625646155, + "step": 40 + }, + { + "epoch": 1.3238866396761133, + "grad_norm": 221.0, + "kl": 0.11359253525733948, + "learning_rate": 2.6666666666666667e-07, + "logits/chosen": -66725420.52173913, + "logits/rejected": -90649818.53658536, + "logps/chosen": -141.18204398777175, + "logps/rejected": -100.03291730182927, + "loss": 3.7192, + "rewards/chosen": -0.036896081074424415, + "rewards/margins": -0.027763956045674623, + "rewards/rejected": -0.009132125028749792, + "step": 41 + }, + { + "epoch": 1.3562753036437247, + "grad_norm": 198.0, + "kl": 0.043334782123565674, + "learning_rate": 2.733333333333333e-07, + "logits/chosen": -58113312.91428571, + "logits/rejected": -85511291.5862069, + "logps/chosen": -193.58042689732142, + "logps/rejected": -118.14533102101294, + "loss": 3.7884, + "rewards/chosen": -0.01873276744570051, + "rewards/margins": 0.0015272747119659293, + "rewards/rejected": -0.020260042157666438, + "step": 42 + }, + { + "epoch": 1.3886639676113361, + "grad_norm": 192.0, + "kl": 0.09052866697311401, + "learning_rate": 2.8e-07, + "logits/chosen": -58638514.28571428, + "logits/rejected": -85584270.22222222, + "logps/chosen": -283.0006801060268, + "logps/rejected": -97.80012342664931, + "loss": 3.7554, + "rewards/chosen": -0.01699193673474448, + "rewards/margins": 0.014840672176981727, + "rewards/rejected": -0.03183260891172621, + "step": 43 + }, + { + "epoch": 1.4210526315789473, + "grad_norm": 328.0, + "kl": 0.17926907539367676, + "learning_rate": 2.866666666666667e-07, + "logits/chosen": -64984988.44444445, + "logits/rejected": -90971129.08108108, + "logps/chosen": -249.94845920138889, + "logps/rejected": -112.96752269847973, + "loss": 3.7747, + "rewards/chosen": -0.04352985488043891, + "rewards/margins": -0.011843407118284667, + "rewards/rejected": -0.03168644776215424, + "step": 44 + }, + { + "epoch": 1.4534412955465588, + "grad_norm": 189.0, + "kl": 0.09499406814575195, + "learning_rate": 2.933333333333333e-07, + "logits/chosen": -51370278.4, + "logits/rejected": -89449554.8235294, + "logps/chosen": -254.2802734375, + "logps/rejected": -89.32239487591912, + "loss": 3.7488, + "rewards/chosen": -0.011582699418067933, + "rewards/margins": 0.05230814341236563, + "rewards/rejected": -0.06389084283043356, + "step": 45 + }, + { + "epoch": 1.48582995951417, + "grad_norm": 206.0, + "kl": 0.044406354427337646, + "learning_rate": 3e-07, + "logits/chosen": -54140912.64, + "logits/rejected": -91693574.56410256, + "logps/chosen": -233.1951953125, + "logps/rejected": -126.72333233173077, + "loss": 3.8303, + "rewards/chosen": 0.040367259979248046, + "rewards/margins": 0.08629294444353153, + "rewards/rejected": -0.04592568446428348, + "step": 46 + }, + { + "epoch": 1.5182186234817814, + "grad_norm": 185.0, + "kl": 0.04556882381439209, + "learning_rate": 3.066666666666666e-07, + "logits/chosen": -61412363.63636363, + "logits/rejected": -95139228.90322581, + "logps/chosen": -242.9996152935606, + "logps/rejected": -119.96295362903226, + "loss": 3.7595, + "rewards/chosen": -0.02121250376556859, + "rewards/margins": 0.001219373586007693, + "rewards/rejected": -0.02243187735157628, + "step": 47 + }, + { + "epoch": 1.5506072874493926, + "grad_norm": 214.0, + "kl": 0.19127535820007324, + "learning_rate": 3.1333333333333333e-07, + "logits/chosen": -62732676.12903226, + "logits/rejected": -91854072.24242425, + "logps/chosen": -174.46708039314515, + "logps/rejected": -130.95345052083334, + "loss": 3.9625, + "rewards/chosen": -0.001973065637773083, + "rewards/margins": 0.045654672390088545, + "rewards/rejected": -0.047627738027861626, + "step": 48 + }, + { + "epoch": 1.582995951417004, + "grad_norm": 192.0, + "kl": 0.2232951521873474, + "learning_rate": 3.2e-07, + "logits/chosen": -56241034.10526316, + "logits/rejected": -88449723.07692307, + "logps/chosen": -245.80191200657896, + "logps/rejected": -114.557861328125, + "loss": 3.6986, + "rewards/chosen": -0.0354247438280206, + "rewards/margins": 0.022862803356850196, + "rewards/rejected": -0.058287547184870794, + "step": 49 + }, + { + "epoch": 1.6153846153846154, + "grad_norm": 187.0, + "kl": 0.04990750551223755, + "learning_rate": 3.2666666666666663e-07, + "logits/chosen": -64088552.0, + "logits/rejected": -97265312.0, + "logps/chosen": -250.58773803710938, + "logps/rejected": -121.63117980957031, + "loss": 3.7894, + "rewards/chosen": -0.020488403737545013, + "rewards/margins": 0.015492349863052368, + "rewards/rejected": -0.03598075360059738, + "step": 50 + }, + { + "epoch": 1.6477732793522266, + "grad_norm": 180.0, + "kl": 0.11040717363357544, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -72300434.28571428, + "logits/rejected": -88177575.72413793, + "logps/chosen": -167.09561941964284, + "logps/rejected": -126.21796706627156, + "loss": 3.7607, + "rewards/chosen": -0.02663487025669643, + "rewards/margins": 0.03576011869120481, + "rewards/rejected": -0.062394988947901235, + "step": 51 + }, + { + "epoch": 1.680161943319838, + "grad_norm": 169.0, + "kl": 0.031675100326538086, + "learning_rate": 3.4000000000000003e-07, + "logits/chosen": -65065728.0, + "logits/rejected": -90847330.46153846, + "logps/chosen": -207.9890779194079, + "logps/rejected": -106.46443997896634, + "loss": 3.735, + "rewards/chosen": -0.006777437109696238, + "rewards/margins": 0.05778750161892972, + "rewards/rejected": -0.06456493872862595, + "step": 52 + }, + { + "epoch": 1.7125506072874495, + "grad_norm": 252.0, + "kl": 0.09681445360183716, + "learning_rate": 3.4666666666666665e-07, + "logits/chosen": -65318413.12820513, + "logits/rejected": -87948451.84, + "logps/chosen": -151.71540715144232, + "logps/rejected": -94.358984375, + "loss": 3.8119, + "rewards/chosen": -0.002530088409399375, + "rewards/margins": 0.051912841812158245, + "rewards/rejected": -0.05444293022155762, + "step": 53 + }, + { + "epoch": 1.7449392712550607, + "grad_norm": 227.0, + "kl": 0.0, + "learning_rate": 3.533333333333333e-07, + "logits/chosen": -68032464.0, + "logits/rejected": -89987720.0, + "logps/chosen": -171.3027801513672, + "logps/rejected": -119.03023529052734, + "loss": 3.8662, + "rewards/chosen": -0.03888993337750435, + "rewards/margins": 0.053229544311761856, + "rewards/rejected": -0.0921194776892662, + "step": 54 + }, + { + "epoch": 1.777327935222672, + "grad_norm": 229.0, + "kl": 0.1575755476951599, + "learning_rate": 3.6e-07, + "logits/chosen": -62680342.5882353, + "logits/rejected": -85439786.66666667, + "logps/chosen": -216.95730411305146, + "logps/rejected": -109.880810546875, + "loss": 3.8791, + "rewards/chosen": 0.0060911914881537944, + "rewards/margins": 0.09755752297008738, + "rewards/rejected": -0.09146633148193359, + "step": 55 + }, + { + "epoch": 1.8097165991902835, + "grad_norm": 205.0, + "kl": 0.011227190494537354, + "learning_rate": 3.666666666666666e-07, + "logits/chosen": -75584723.86206897, + "logits/rejected": -91031215.54285714, + "logps/chosen": -189.27225889008622, + "logps/rejected": -126.86244419642857, + "loss": 3.8159, + "rewards/chosen": -0.04395907500694538, + "rewards/margins": 0.05888716599036908, + "rewards/rejected": -0.10284624099731446, + "step": 56 + }, + { + "epoch": 1.8421052631578947, + "grad_norm": 205.0, + "kl": 0.13237720727920532, + "learning_rate": 3.7333333333333334e-07, + "logits/chosen": -59131476.0, + "logits/rejected": -95583968.0, + "logps/chosen": -256.7174377441406, + "logps/rejected": -103.1009750366211, + "loss": 3.898, + "rewards/chosen": -0.01803133636713028, + "rewards/margins": 0.06824938207864761, + "rewards/rejected": -0.0862807184457779, + "step": 57 + }, + { + "epoch": 1.874493927125506, + "grad_norm": 188.0, + "kl": 0.01799929141998291, + "learning_rate": 3.7999999999999996e-07, + "logits/chosen": -55775914.666666664, + "logits/rejected": -88368658.28571428, + "logps/chosen": -246.25482855902777, + "logps/rejected": -114.36540876116071, + "loss": 3.8, + "rewards/chosen": -0.004815898421737883, + "rewards/margins": 0.0846694488492277, + "rewards/rejected": -0.08948534727096558, + "step": 58 + }, + { + "epoch": 1.9068825910931175, + "grad_norm": 224.0, + "kl": 0.02940082550048828, + "learning_rate": 3.8666666666666664e-07, + "logits/chosen": -56140933.12, + "logits/rejected": -94621124.92307693, + "logps/chosen": -254.40115234375, + "logps/rejected": -128.3409204727564, + "loss": 3.7634, + "rewards/chosen": 0.017768787145614626, + "rewards/margins": 0.12409534274003445, + "rewards/rejected": -0.10632655559441982, + "step": 59 + }, + { + "epoch": 1.9392712550607287, + "grad_norm": 230.0, + "kl": 0.05236637592315674, + "learning_rate": 3.933333333333333e-07, + "logits/chosen": -73944693.02857143, + "logits/rejected": -90982444.13793103, + "logps/chosen": -173.64387555803572, + "logps/rejected": -118.02784886853448, + "loss": 3.8531, + "rewards/chosen": -0.02884887967790876, + "rewards/margins": 0.04749297132633003, + "rewards/rejected": -0.07634185100423879, + "step": 60 + }, + { + "epoch": 1.97165991902834, + "grad_norm": 203.0, + "kl": 0.10479164123535156, + "learning_rate": 4e-07, + "logits/chosen": -67756994.37037037, + "logits/rejected": -93152989.4054054, + "logps/chosen": -165.50672743055554, + "logps/rejected": -115.58003853462837, + "loss": 3.6903, + "rewards/chosen": 0.002080100554007071, + "rewards/margins": 0.1040766963490972, + "rewards/rejected": -0.10199659579509013, + "step": 61 + }, + { + "epoch": 2.0, + "grad_norm": 223.0, + "kl": 0.018572943285107613, + "learning_rate": 4.0666666666666666e-07, + "logits/chosen": -53805850.94736842, + "logits/rejected": -90529268.86956522, + "logps/chosen": -128.19540244654604, + "logps/rejected": -103.58083177649456, + "loss": 3.779, + "rewards/chosen": -0.03745367025074206, + "rewards/margins": 0.06240318323436536, + "rewards/rejected": -0.09985685348510742, + "step": 62 + }, + { + "epoch": 2.032388663967611, + "grad_norm": 229.0, + "kl": 0.18244534730911255, + "learning_rate": 4.1333333333333333e-07, + "logits/chosen": -58454268.23529412, + "logits/rejected": -88176153.6, + "logps/chosen": -210.8977768841912, + "logps/rejected": -94.99117838541666, + "loss": 3.7903, + "rewards/chosen": -0.01167152208440444, + "rewards/margins": 0.06412222104914048, + "rewards/rejected": -0.07579374313354492, + "step": 63 + }, + { + "epoch": 2.064777327935223, + "grad_norm": 203.0, + "kl": 0.10842978954315186, + "learning_rate": 4.1999999999999995e-07, + "logits/chosen": -62472903.52941176, + "logits/rejected": -80891144.53333333, + "logps/chosen": -224.19175091911765, + "logps/rejected": -96.9512451171875, + "loss": 3.7124, + "rewards/chosen": -0.024154009187922758, + "rewards/margins": 0.037806827063653986, + "rewards/rejected": -0.061960836251576744, + "step": 64 + }, + { + "epoch": 2.097165991902834, + "grad_norm": 244.0, + "kl": 0.0, + "learning_rate": 4.266666666666667e-07, + "logits/chosen": -64039726.93333333, + "logits/rejected": -94997970.8235294, + "logps/chosen": -217.54044596354166, + "logps/rejected": -110.7608283547794, + "loss": 3.7247, + "rewards/chosen": -0.04519684314727783, + "rewards/margins": 0.07864142726449405, + "rewards/rejected": -0.12383827041177188, + "step": 65 + }, + { + "epoch": 2.1295546558704452, + "grad_norm": 249.0, + "kl": 0.018720507621765137, + "learning_rate": 4.3333333333333335e-07, + "logits/chosen": -63058005.333333336, + "logits/rejected": -90236334.08, + "logps/chosen": -234.40567407852564, + "logps/rejected": -105.177529296875, + "loss": 3.9646, + "rewards/chosen": 0.006023437930987432, + "rewards/margins": 0.10307669873421009, + "rewards/rejected": -0.09705326080322266, + "step": 66 + }, + { + "epoch": 2.161943319838057, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 4.3999999999999997e-07, + "logits/chosen": -55136326.62068965, + "logits/rejected": -89366396.34285714, + "logps/chosen": -209.48905576508622, + "logps/rejected": -111.65326450892857, + "loss": 3.6105, + "rewards/chosen": -0.0190719879906753, + "rewards/margins": 0.09232676340441398, + "rewards/rejected": -0.11139875139508928, + "step": 67 + }, + { + "epoch": 2.194331983805668, + "grad_norm": 190.0, + "kl": 0.056352436542510986, + "learning_rate": 4.4666666666666664e-07, + "logits/chosen": -52990773.67741936, + "logits/rejected": -86883995.15151516, + "logps/chosen": -158.23761970766128, + "logps/rejected": -96.30680338541667, + "loss": 3.8777, + "rewards/chosen": -0.002344240584681111, + "rewards/margins": 0.12295980812692223, + "rewards/rejected": -0.12530404871160333, + "step": 68 + }, + { + "epoch": 2.2267206477732793, + "grad_norm": 242.0, + "kl": 0.0054090023040771484, + "learning_rate": 4.5333333333333326e-07, + "logits/chosen": -60425867.63636363, + "logits/rejected": -84169984.0, + "logps/chosen": -144.87008759469697, + "logps/rejected": -129.69174489667338, + "loss": 3.8105, + "rewards/chosen": -0.013077873172182026, + "rewards/margins": 0.12173854267608851, + "rewards/rejected": -0.13481641584827053, + "step": 69 + }, + { + "epoch": 2.2591093117408905, + "grad_norm": 178.0, + "kl": 0.020307421684265137, + "learning_rate": 4.6e-07, + "logits/chosen": -48812146.75862069, + "logits/rejected": -91739165.25714286, + "logps/chosen": -236.0985317887931, + "logps/rejected": -114.47859933035714, + "loss": 3.7725, + "rewards/chosen": 0.02898802428409971, + "rewards/margins": 0.18302607207462707, + "rewards/rejected": -0.15403804779052735, + "step": 70 + }, + { + "epoch": 2.291497975708502, + "grad_norm": 202.0, + "kl": 0.08195632696151733, + "learning_rate": 4.6666666666666666e-07, + "logits/chosen": -70915456.0, + "logits/rejected": -87808964.92307693, + "logps/chosen": -180.18395353618422, + "logps/rejected": -126.05295973557692, + "loss": 3.7815, + "rewards/chosen": -0.06975392918837697, + "rewards/margins": 0.03730890577138678, + "rewards/rejected": -0.10706283495976375, + "step": 71 + }, + { + "epoch": 2.3238866396761133, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 4.733333333333333e-07, + "logits/chosen": -66860766.60869565, + "logits/rejected": -90918362.53658536, + "logps/chosen": -141.63720703125, + "logps/rejected": -101.3832412347561, + "loss": 3.6912, + "rewards/chosen": -0.08241141360739003, + "rewards/margins": 0.06175242754712716, + "rewards/rejected": -0.1441638411545172, + "step": 72 + }, + { + "epoch": 2.3562753036437245, + "grad_norm": 201.0, + "kl": 0.05285078287124634, + "learning_rate": 4.8e-07, + "logits/chosen": -58148995.657142855, + "logits/rejected": -85458688.0, + "logps/chosen": -193.9041015625, + "logps/rejected": -119.37925983297414, + "loss": 3.7687, + "rewards/chosen": -0.05109883717128209, + "rewards/margins": 0.09255356906082832, + "rewards/rejected": -0.1436524062321104, + "step": 73 + }, + { + "epoch": 2.388663967611336, + "grad_norm": 189.0, + "kl": 0.017002761363983154, + "learning_rate": 4.866666666666666e-07, + "logits/chosen": -58708155.428571425, + "logits/rejected": -85895623.1111111, + "logps/chosen": -283.2236328125, + "logps/rejected": -98.8388671875, + "loss": 3.7163, + "rewards/chosen": -0.039288078035627096, + "rewards/margins": 0.09641826720464797, + "rewards/rejected": -0.13570634524027506, + "step": 74 + }, + { + "epoch": 2.4210526315789473, + "grad_norm": 258.0, + "kl": 0.09225642681121826, + "learning_rate": 4.933333333333333e-07, + "logits/chosen": -65111812.74074074, + "logits/rejected": -91157974.48648648, + "logps/chosen": -250.06803385416666, + "logps/rejected": -114.29911845439189, + "loss": 3.737, + "rewards/chosen": -0.05548596823657, + "rewards/margins": 0.10936014275173764, + "rewards/rejected": -0.16484611098830765, + "step": 75 + }, + { + "epoch": 2.4534412955465585, + "grad_norm": 191.0, + "kl": 0.10859435796737671, + "learning_rate": 5e-07, + "logits/chosen": -51252386.13333333, + "logits/rejected": -89624688.94117647, + "logps/chosen": -254.38297526041666, + "logps/rejected": -90.42824419806985, + "loss": 3.7249, + "rewards/chosen": -0.021853423118591307, + "rewards/margins": 0.15262241503771612, + "rewards/rejected": -0.17447583815630743, + "step": 76 + }, + { + "epoch": 2.48582995951417, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 5.066666666666667e-07, + "logits/chosen": -54352476.16, + "logits/rejected": -91919642.25641026, + "logps/chosen": -233.6417578125, + "logps/rejected": -128.34430088141025, + "loss": 3.7901, + "rewards/chosen": -0.00428692102432251, + "rewards/margins": 0.20373707948586878, + "rewards/rejected": -0.2080240005101913, + "step": 77 + }, + { + "epoch": 2.5182186234817814, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 5.133333333333333e-07, + "logits/chosen": -61378466.90909091, + "logits/rejected": -95343054.4516129, + "logps/chosen": -243.4647401751894, + "logps/rejected": -121.86517137096774, + "loss": 3.7241, + "rewards/chosen": -0.06772462526957194, + "rewards/margins": 0.14492905780833254, + "rewards/rejected": -0.2126536830779045, + "step": 78 + }, + { + "epoch": 2.5506072874493926, + "grad_norm": 230.0, + "kl": 0.15216386318206787, + "learning_rate": 5.2e-07, + "logits/chosen": -62781588.64516129, + "logits/rejected": -91950995.39393939, + "logps/chosen": -174.5715095766129, + "logps/rejected": -132.68966027462122, + "loss": 3.9193, + "rewards/chosen": -0.01241672808124173, + "rewards/margins": 0.20883350253454752, + "rewards/rejected": -0.22125023061578925, + "step": 79 + }, + { + "epoch": 2.582995951417004, + "grad_norm": 217.0, + "kl": 0.01045989990234375, + "learning_rate": 5.266666666666666e-07, + "logits/chosen": -56160697.2631579, + "logits/rejected": -88810771.6923077, + "logps/chosen": -246.09932668585526, + "logps/rejected": -116.0523681640625, + "loss": 3.6824, + "rewards/chosen": -0.06516703806425396, + "rewards/margins": 0.14257117804245428, + "rewards/rejected": -0.20773821610670823, + "step": 80 + }, + { + "epoch": 2.6153846153846154, + "grad_norm": 172.0, + "kl": 0.05806320905685425, + "learning_rate": 5.333333333333333e-07, + "logits/chosen": -63793476.0, + "logits/rejected": -97707888.0, + "logps/chosen": -250.8136749267578, + "logps/rejected": -123.37957763671875, + "loss": 3.7336, + "rewards/chosen": -0.043080516159534454, + "rewards/margins": 0.16773956269025803, + "rewards/rejected": -0.21082007884979248, + "step": 81 + }, + { + "epoch": 2.6477732793522266, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 5.4e-07, + "logits/chosen": -72679197.25714286, + "logits/rejected": -88729114.48275863, + "logps/chosen": -167.96812220982142, + "logps/rejected": -127.9656182650862, + "loss": 3.712, + "rewards/chosen": -0.11388427189418247, + "rewards/margins": 0.12327655876798584, + "rewards/rejected": -0.2371608306621683, + "step": 82 + }, + { + "epoch": 2.6801619433198383, + "grad_norm": 175.0, + "kl": 0.09860539436340332, + "learning_rate": 5.466666666666666e-07, + "logits/chosen": -65102255.15789474, + "logits/rejected": -91322948.92307693, + "logps/chosen": -208.51667865953948, + "logps/rejected": -108.3269512469952, + "loss": 3.7181, + "rewards/chosen": -0.059536714302866084, + "rewards/margins": 0.1912797776311033, + "rewards/rejected": -0.2508164919339694, + "step": 83 + }, + { + "epoch": 2.7125506072874495, + "grad_norm": 240.0, + "kl": 0.0, + "learning_rate": 5.533333333333334e-07, + "logits/chosen": -65572621.12820513, + "logits/rejected": -88185815.04, + "logps/chosen": -152.19298377403845, + "logps/rejected": -95.966689453125, + "loss": 3.7343, + "rewards/chosen": -0.05028758904872797, + "rewards/margins": 0.16492761709751227, + "rewards/rejected": -0.21521520614624023, + "step": 84 + }, + { + "epoch": 2.7449392712550607, + "grad_norm": 192.0, + "kl": 0.00746995210647583, + "learning_rate": 5.6e-07, + "logits/chosen": -68171560.0, + "logits/rejected": -90503168.0, + "logps/chosen": -172.13671875, + "logps/rejected": -121.0586929321289, + "loss": 3.8099, + "rewards/chosen": -0.12228179723024368, + "rewards/margins": 0.17268472164869308, + "rewards/rejected": -0.29496651887893677, + "step": 85 + }, + { + "epoch": 2.7773279352226723, + "grad_norm": 202.0, + "kl": 0.010839879512786865, + "learning_rate": 5.666666666666666e-07, + "logits/chosen": -62651245.176470585, + "logits/rejected": -85748070.4, + "logps/chosen": -217.32801011029412, + "logps/rejected": -111.80550130208333, + "loss": 3.813, + "rewards/chosen": -0.03098060453639311, + "rewards/margins": 0.2529542747665854, + "rewards/rejected": -0.2839348793029785, + "step": 86 + }, + { + "epoch": 2.8097165991902835, + "grad_norm": 219.0, + "kl": 0.10649758577346802, + "learning_rate": 5.733333333333334e-07, + "logits/chosen": -75832258.20689656, + "logits/rejected": -91454632.22857143, + "logps/chosen": -190.46363146551724, + "logps/rejected": -128.76707589285715, + "loss": 3.7562, + "rewards/chosen": -0.16309767755968818, + "rewards/margins": 0.1302118338974826, + "rewards/rejected": -0.29330951145717077, + "step": 87 + }, + { + "epoch": 2.8421052631578947, + "grad_norm": 228.0, + "kl": 0.046126484870910645, + "learning_rate": 5.8e-07, + "logits/chosen": -59232008.0, + "logits/rejected": -96177024.0, + "logps/chosen": -256.9981994628906, + "logps/rejected": -105.48146057128906, + "loss": 3.8255, + "rewards/chosen": -0.04610701650381088, + "rewards/margins": 0.27822083979845047, + "rewards/rejected": -0.32432785630226135, + "step": 88 + }, + { + "epoch": 2.8744939271255063, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 5.866666666666666e-07, + "logits/chosen": -56284231.11111111, + "logits/rejected": -89122020.57142857, + "logps/chosen": -246.57191297743054, + "logps/rejected": -116.71786934988839, + "loss": 3.7563, + "rewards/chosen": -0.03652524285846286, + "rewards/margins": 0.2882071288805159, + "rewards/rejected": -0.3247323717389788, + "step": 89 + }, + { + "epoch": 2.9068825910931175, + "grad_norm": 214.0, + "kl": 0.0, + "learning_rate": 5.933333333333334e-07, + "logits/chosen": -56218849.28, + "logits/rejected": -95044286.35897435, + "logps/chosen": -255.12201171875, + "logps/rejected": -130.9345703125, + "loss": 3.6649, + "rewards/chosen": -0.05431962013244629, + "rewards/margins": 0.3113717360374255, + "rewards/rejected": -0.3656913561698718, + "step": 90 + }, + { + "epoch": 2.9392712550607287, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 6e-07, + "logits/chosen": -74079656.22857143, + "logits/rejected": -91588793.37931034, + "logps/chosen": -174.35126953125, + "logps/rejected": -120.26845366379311, + "loss": 3.8084, + "rewards/chosen": -0.09958893230983189, + "rewards/margins": 0.2008122620324196, + "rewards/rejected": -0.3004011943422515, + "step": 91 + }, + { + "epoch": 2.97165991902834, + "grad_norm": 222.0, + "kl": 0.0, + "learning_rate": 6.066666666666666e-07, + "logits/chosen": -67999293.62962963, + "logits/rejected": -93689011.8918919, + "logps/chosen": -166.47797309027777, + "logps/rejected": -118.10967852618244, + "loss": 3.6621, + "rewards/chosen": -0.09504372985274703, + "rewards/margins": 0.25991599003712573, + "rewards/rejected": -0.3549597198898728, + "step": 92 + }, + { + "epoch": 3.0, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 6.133333333333332e-07, + "logits/chosen": -54103104.0, + "logits/rejected": -90902861.91304348, + "logps/chosen": -129.50056537828948, + "logps/rejected": -105.81852921195652, + "loss": 3.7126, + "rewards/chosen": -0.167970105221397, + "rewards/margins": 0.15565773987933756, + "rewards/rejected": -0.32362784510073456, + "step": 93 + }, + { + "epoch": 3.032388663967611, + "grad_norm": 227.0, + "kl": 0.0727849006652832, + "learning_rate": 6.2e-07, + "logits/chosen": -58711220.705882356, + "logits/rejected": -88564070.4, + "logps/chosen": -211.5251034007353, + "logps/rejected": -96.87139485677083, + "loss": 3.7509, + "rewards/chosen": -0.07440530552583582, + "rewards/margins": 0.18940984314563228, + "rewards/rejected": -0.2638151486714681, + "step": 94 + }, + { + "epoch": 3.064777327935223, + "grad_norm": 215.0, + "kl": 0.10124093294143677, + "learning_rate": 6.266666666666667e-07, + "logits/chosen": -62814102.5882353, + "logits/rejected": -81403332.26666667, + "logps/chosen": -224.80596564797793, + "logps/rejected": -98.86597493489583, + "loss": 3.6665, + "rewards/chosen": -0.08557542632607852, + "rewards/margins": 0.1678579601587034, + "rewards/rejected": -0.2534333864847819, + "step": 95 + }, + { + "epoch": 3.097165991902834, + "grad_norm": 210.0, + "kl": 0.0, + "learning_rate": 6.333333333333332e-07, + "logits/chosen": -64167667.2, + "logits/rejected": -95528929.88235295, + "logps/chosen": -218.38707682291667, + "logps/rejected": -113.95331887637867, + "loss": 3.6635, + "rewards/chosen": -0.12986040910085042, + "rewards/margins": 0.31322660773408184, + "rewards/rejected": -0.44308701683493223, + "step": 96 + }, + { + "epoch": 3.1295546558704452, + "grad_norm": 231.0, + "kl": 0.0, + "learning_rate": 6.4e-07, + "logits/chosen": -63470861.12820513, + "logits/rejected": -90707824.64, + "logps/chosen": -235.18073918269232, + "logps/rejected": -107.789013671875, + "loss": 3.9197, + "rewards/chosen": -0.07148234049479167, + "rewards/margins": 0.2867210896809896, + "rewards/rejected": -0.35820343017578127, + "step": 97 + }, + { + "epoch": 3.161943319838057, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 6.466666666666666e-07, + "logits/chosen": -54969630.89655172, + "logits/rejected": -89939763.2, + "logps/chosen": -209.63872238685346, + "logps/rejected": -114.52486746651786, + "loss": 3.5587, + "rewards/chosen": -0.03403720773499588, + "rewards/margins": 0.36452140890318774, + "rewards/rejected": -0.3985586166381836, + "step": 98 + }, + { + "epoch": 3.194331983805668, + "grad_norm": 240.0, + "kl": 0.0, + "learning_rate": 6.533333333333333e-07, + "logits/chosen": -53064105.29032258, + "logits/rejected": -87360318.06060606, + "logps/chosen": -158.5835433467742, + "logps/rejected": -99.12943892045455, + "loss": 3.7963, + "rewards/chosen": -0.0369358524199455, + "rewards/margins": 0.3706311718110115, + "rewards/rejected": -0.40756702423095703, + "step": 99 + }, + { + "epoch": 3.2267206477732793, + "grad_norm": 260.0, + "kl": 0.0, + "learning_rate": 6.6e-07, + "logits/chosen": -60587337.696969695, + "logits/rejected": -84534684.90322581, + "logps/chosen": -145.5372869318182, + "logps/rejected": -132.50162235383064, + "loss": 3.7511, + "rewards/chosen": -0.07979689222393614, + "rewards/margins": 0.33600750929682366, + "rewards/rejected": -0.4158044015207598, + "step": 100 + }, + { + "epoch": 3.2591093117408905, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 6.666666666666666e-07, + "logits/chosen": -48986266.48275862, + "logits/rejected": -92440115.2, + "logps/chosen": -236.63941271551724, + "logps/rejected": -117.69333147321429, + "loss": 3.6876, + "rewards/chosen": -0.025098778050521325, + "rewards/margins": 0.450413263313876, + "rewards/rejected": -0.47551204136439734, + "step": 101 + }, + { + "epoch": 3.291497975708502, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 6.733333333333333e-07, + "logits/chosen": -71407063.57894737, + "logits/rejected": -88328044.3076923, + "logps/chosen": -181.23652086759867, + "logps/rejected": -129.5223670372596, + "loss": 3.7141, + "rewards/chosen": -0.17501012902510793, + "rewards/margins": 0.2789921046268602, + "rewards/rejected": -0.4540022336519681, + "step": 102 + }, + { + "epoch": 3.3238866396761133, + "grad_norm": 211.0, + "kl": 0.0, + "learning_rate": 6.800000000000001e-07, + "logits/chosen": -67130056.3478261, + "logits/rejected": -91396283.31707317, + "logps/chosen": -143.13606063179347, + "logps/rejected": -105.06101133765245, + "loss": 3.569, + "rewards/chosen": -0.23229586559793222, + "rewards/margins": 0.279645811588615, + "rewards/rejected": -0.5119416771865473, + "step": 103 + }, + { + "epoch": 3.3562753036437245, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 6.866666666666666e-07, + "logits/chosen": -58641781.02857143, + "logits/rejected": -86187016.8275862, + "logps/chosen": -194.0300502232143, + "logps/rejected": -122.5322686557112, + "loss": 3.684, + "rewards/chosen": -0.06369432040623256, + "rewards/margins": 0.3952588793092173, + "rewards/rejected": -0.4589531997154499, + "step": 104 + }, + { + "epoch": 3.388663967611336, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 6.933333333333333e-07, + "logits/chosen": -58871300.571428575, + "logits/rejected": -86522055.1111111, + "logps/chosen": -283.37655203683033, + "logps/rejected": -101.57166883680556, + "loss": 3.6468, + "rewards/chosen": -0.054577767848968506, + "rewards/margins": 0.3544092509481642, + "rewards/rejected": -0.40898701879713273, + "step": 105 + }, + { + "epoch": 3.4210526315789473, + "grad_norm": 382.0, + "kl": 0.0, + "learning_rate": 7e-07, + "logits/chosen": -65297189.925925925, + "logits/rejected": -91777037.83783785, + "logps/chosen": -251.05060040509258, + "logps/rejected": -117.88727301520271, + "loss": 3.648, + "rewards/chosen": -0.15374413243046514, + "rewards/margins": 0.36991781610865015, + "rewards/rejected": -0.5236619485391153, + "step": 106 + }, + { + "epoch": 3.4534412955465585, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 7.066666666666666e-07, + "logits/chosen": -51531144.53333333, + "logits/rejected": -90403237.64705883, + "logps/chosen": -255.04498697916668, + "logps/rejected": -94.00696518841912, + "loss": 3.644, + "rewards/chosen": -0.0880549669265747, + "rewards/margins": 0.4442929927040549, + "rewards/rejected": -0.5323479596306296, + "step": 107 + }, + { + "epoch": 3.48582995951417, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 7.133333333333333e-07, + "logits/chosen": -54512532.48, + "logits/rejected": -92732192.82051282, + "logps/chosen": -234.6769921875, + "logps/rejected": -131.9256810897436, + "loss": 3.7004, + "rewards/chosen": -0.10780903816223145, + "rewards/margins": 0.4583528266808926, + "rewards/rejected": -0.566161864843124, + "step": 108 + }, + { + "epoch": 3.5182186234817814, + "grad_norm": 204.0, + "kl": 0.0, + "learning_rate": 7.2e-07, + "logits/chosen": -61560777.696969695, + "logits/rejected": -96084892.90322581, + "logps/chosen": -244.23498165246212, + "logps/rejected": -126.09141885080645, + "loss": 3.6479, + "rewards/chosen": -0.1447492657285748, + "rewards/margins": 0.4905286361977734, + "rewards/rejected": -0.6352779019263483, + "step": 109 + }, + { + "epoch": 3.5506072874493926, + "grad_norm": 225.0, + "kl": 0.03981220722198486, + "learning_rate": 7.266666666666667e-07, + "logits/chosen": -63047613.93548387, + "logits/rejected": -93004520.72727273, + "logps/chosen": -175.2143082157258, + "logps/rejected": -136.17847419507575, + "loss": 3.8344, + "rewards/chosen": -0.0766956344727547, + "rewards/margins": 0.4934345692367032, + "rewards/rejected": -0.5701302037094579, + "step": 110 + }, + { + "epoch": 3.582995951417004, + "grad_norm": 208.0, + "kl": 0.0, + "learning_rate": 7.333333333333332e-07, + "logits/chosen": -56503208.421052635, + "logits/rejected": -89910852.92307693, + "logps/chosen": -246.58493523848685, + "logps/rejected": -119.18206317608173, + "loss": 3.6194, + "rewards/chosen": -0.11372504736247815, + "rewards/margins": 0.40698333018221844, + "rewards/rejected": -0.5207083775446966, + "step": 111 + }, + { + "epoch": 3.6153846153846154, + "grad_norm": 182.0, + "kl": 0.01825159788131714, + "learning_rate": 7.4e-07, + "logits/chosen": -64289088.0, + "logits/rejected": -98815936.0, + "logps/chosen": -251.60263061523438, + "logps/rejected": -127.94086456298828, + "loss": 3.6212, + "rewards/chosen": -0.12197733670473099, + "rewards/margins": 0.5449719354510307, + "rewards/rejected": -0.6669492721557617, + "step": 112 + }, + { + "epoch": 3.6477732793522266, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 7.466666666666667e-07, + "logits/chosen": -73170534.4, + "logits/rejected": -89516879.44827586, + "logps/chosen": -170.231640625, + "logps/rejected": -131.79471982758622, + "loss": 3.6429, + "rewards/chosen": -0.34023783547537667, + "rewards/margins": 0.27983285218036824, + "rewards/rejected": -0.6200706876557449, + "step": 113 + }, + { + "epoch": 3.6801619433198383, + "grad_norm": 182.0, + "kl": 0.06295955181121826, + "learning_rate": 7.533333333333332e-07, + "logits/chosen": -65289175.578947365, + "logits/rejected": -92120832.0, + "logps/chosen": -209.48148386101974, + "logps/rejected": -112.7288348858173, + "loss": 3.6405, + "rewards/chosen": -0.1560162619540566, + "rewards/margins": 0.5349870239674803, + "rewards/rejected": -0.6910032859215369, + "step": 114 + }, + { + "epoch": 3.7125506072874495, + "grad_norm": 260.0, + "kl": 0.0, + "learning_rate": 7.599999999999999e-07, + "logits/chosen": -65811022.76923077, + "logits/rejected": -88804065.28, + "logps/chosen": -153.22434645432693, + "logps/rejected": -100.038125, + "loss": 3.6606, + "rewards/chosen": -0.15342459311852089, + "rewards/margins": 0.4689330805264986, + "rewards/rejected": -0.6223576736450195, + "step": 115 + }, + { + "epoch": 3.7449392712550607, + "grad_norm": 210.0, + "kl": 0.004709720611572266, + "learning_rate": 7.666666666666667e-07, + "logits/chosen": -68488384.0, + "logits/rejected": -91338232.0, + "logps/chosen": -173.6177978515625, + "logps/rejected": -125.25968933105469, + "loss": 3.6747, + "rewards/chosen": -0.2703918218612671, + "rewards/margins": 0.44467318058013916, + "rewards/rejected": -0.7150650024414062, + "step": 116 + }, + { + "epoch": 3.7773279352226723, + "grad_norm": 240.0, + "kl": 0.0, + "learning_rate": 7.733333333333333e-07, + "logits/chosen": -63149522.823529415, + "logits/rejected": -86811980.8, + "logps/chosen": -218.0769473805147, + "logps/rejected": -115.976513671875, + "loss": 3.7105, + "rewards/chosen": -0.10587411768296186, + "rewards/margins": 0.5951634799732881, + "rewards/rejected": -0.70103759765625, + "step": 117 + }, + { + "epoch": 3.8097165991902835, + "grad_norm": 199.0, + "kl": 0.17148053646087646, + "learning_rate": 7.799999999999999e-07, + "logits/chosen": -76564162.20689656, + "logits/rejected": -92245716.11428571, + "logps/chosen": -192.8898336476293, + "logps/rejected": -132.41061662946427, + "loss": 3.6601, + "rewards/chosen": -0.40571768530483904, + "rewards/margins": 0.2519464774672034, + "rewards/rejected": -0.6576641627720424, + "step": 118 + }, + { + "epoch": 3.8421052631578947, + "grad_norm": 199.0, + "kl": 0.08587014675140381, + "learning_rate": 7.866666666666666e-07, + "logits/chosen": -59585784.0, + "logits/rejected": -97084912.0, + "logps/chosen": -257.84991455078125, + "logps/rejected": -110.13160705566406, + "loss": 3.735, + "rewards/chosen": -0.1312781572341919, + "rewards/margins": 0.6580657958984375, + "rewards/rejected": -0.7893439531326294, + "step": 119 + }, + { + "epoch": 3.8744939271255063, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 7.933333333333333e-07, + "logits/chosen": -56471552.0, + "logits/rejected": -89928557.71428572, + "logps/chosen": -247.12120225694446, + "logps/rejected": -120.70945521763393, + "loss": 3.6722, + "rewards/chosen": -0.09145498275756836, + "rewards/margins": 0.6324355261666434, + "rewards/rejected": -0.7238905089242118, + "step": 120 + }, + { + "epoch": 3.9068825910931175, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 8e-07, + "logits/chosen": -56484792.32, + "logits/rejected": -95992260.92307693, + "logps/chosen": -256.19287109375, + "logps/rejected": -135.6700220352564, + "loss": 3.5178, + "rewards/chosen": -0.1614029312133789, + "rewards/margins": 0.6778336950448843, + "rewards/rejected": -0.8392366262582632, + "step": 121 + }, + { + "epoch": 3.9392712550607287, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 8.066666666666666e-07, + "logits/chosen": -74744685.71428572, + "logits/rejected": -92417739.03448276, + "logps/chosen": -175.45368303571428, + "logps/rejected": -124.31517712823276, + "loss": 3.6676, + "rewards/chosen": -0.20983006613595145, + "rewards/margins": 0.49524325760714527, + "rewards/rejected": -0.7050733237430967, + "step": 122 + }, + { + "epoch": 3.97165991902834, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 8.133333333333333e-07, + "logits/chosen": -68318757.92592593, + "logits/rejected": -94619371.24324325, + "logps/chosen": -167.68947120949073, + "logps/rejected": -122.23518000422297, + "loss": 3.5317, + "rewards/chosen": -0.21619266933865017, + "rewards/margins": 0.551317100410347, + "rewards/rejected": -0.7675097697489971, + "step": 123 + }, + { + "epoch": 4.0, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 8.199999999999999e-07, + "logits/chosen": -54672983.578947365, + "logits/rejected": -91851993.04347827, + "logps/chosen": -131.13337787828948, + "logps/rejected": -109.7916843580163, + "loss": 3.6239, + "rewards/chosen": -0.33125076795879166, + "rewards/margins": 0.38969243418433847, + "rewards/rejected": -0.7209432021431301, + "step": 124 + }, + { + "epoch": 4.032388663967612, + "grad_norm": 228.0, + "kl": 0.0, + "learning_rate": 8.266666666666667e-07, + "logits/chosen": -58974637.176470585, + "logits/rejected": -89414613.33333333, + "logps/chosen": -212.5978573069853, + "logps/rejected": -100.42489420572916, + "loss": 3.6389, + "rewards/chosen": -0.18168081956751206, + "rewards/margins": 0.43748479169957777, + "rewards/rejected": -0.6191656112670898, + "step": 125 + }, + { + "epoch": 4.064777327935222, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 8.333333333333333e-07, + "logits/chosen": -63364999.52941176, + "logits/rejected": -81994555.73333333, + "logps/chosen": -225.94140625, + "logps/rejected": -101.82618001302083, + "loss": 3.5936, + "rewards/chosen": -0.19911931542789235, + "rewards/margins": 0.35033480139339673, + "rewards/rejected": -0.5494541168212891, + "step": 126 + }, + { + "epoch": 4.097165991902834, + "grad_norm": 223.0, + "kl": 0.0, + "learning_rate": 8.399999999999999e-07, + "logits/chosen": -64454229.333333336, + "logits/rejected": -96394345.41176471, + "logps/chosen": -218.516796875, + "logps/rejected": -119.05093204273896, + "loss": 3.5279, + "rewards/chosen": -0.14283286730448405, + "rewards/margins": 0.8100145573709525, + "rewards/rejected": -0.9528474246754366, + "step": 127 + }, + { + "epoch": 4.129554655870446, + "grad_norm": 227.0, + "kl": 0.0, + "learning_rate": 8.466666666666667e-07, + "logits/chosen": -63847345.23076923, + "logits/rejected": -91306946.56, + "logps/chosen": -236.07965244391025, + "logps/rejected": -111.72240234375, + "loss": 3.8, + "rewards/chosen": -0.1613748990572416, + "rewards/margins": 0.5901656282865084, + "rewards/rejected": -0.75154052734375, + "step": 128 + }, + { + "epoch": 4.161943319838056, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 8.533333333333334e-07, + "logits/chosen": -55260376.27586207, + "logits/rejected": -91079870.17142858, + "logps/chosen": -210.36469558189654, + "logps/rejected": -118.82433035714286, + "loss": 3.4237, + "rewards/chosen": -0.10663683661099138, + "rewards/margins": 0.7218695241242206, + "rewards/rejected": -0.828506360735212, + "step": 129 + }, + { + "epoch": 4.194331983805668, + "grad_norm": 239.0, + "kl": 0.0, + "learning_rate": 8.599999999999999e-07, + "logits/chosen": -53269929.29032258, + "logits/rejected": -88130901.33333333, + "logps/chosen": -158.97380607358872, + "logps/rejected": -103.46803977272727, + "loss": 3.6842, + "rewards/chosen": -0.07596483538227697, + "rewards/margins": 0.765463054354939, + "rewards/rejected": -0.8414278897372159, + "step": 130 + }, + { + "epoch": 4.22672064777328, + "grad_norm": 224.0, + "kl": 0.0, + "learning_rate": 8.666666666666667e-07, + "logits/chosen": -61057683.39393939, + "logits/rejected": -85674710.70967741, + "logps/chosen": -146.51667554450756, + "logps/rejected": -136.4623550907258, + "loss": 3.6105, + "rewards/chosen": -0.1777365135424065, + "rewards/margins": 0.6341409524747936, + "rewards/rejected": -0.8118774660172001, + "step": 131 + }, + { + "epoch": 4.2591093117408905, + "grad_norm": 185.0, + "kl": 0.0, + "learning_rate": 8.733333333333333e-07, + "logits/chosen": -49094801.655172415, + "logits/rejected": -93466389.94285715, + "logps/chosen": -237.12252491918105, + "logps/rejected": -122.34063895089285, + "loss": 3.5546, + "rewards/chosen": -0.07341302674392174, + "rewards/margins": 0.8668295226073618, + "rewards/rejected": -0.9402425493512835, + "step": 132 + }, + { + "epoch": 4.291497975708502, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 8.799999999999999e-07, + "logits/chosen": -71895033.26315789, + "logits/rejected": -89686035.6923077, + "logps/chosen": -182.9316277754934, + "logps/rejected": -134.27398212139423, + "loss": 3.5952, + "rewards/chosen": -0.3445196402700324, + "rewards/margins": 0.5846443658898233, + "rewards/rejected": -0.9291640061598557, + "step": 133 + }, + { + "epoch": 4.323886639676114, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 8.866666666666667e-07, + "logits/chosen": -67607440.69565217, + "logits/rejected": -92422218.92682926, + "logps/chosen": -145.24899159307066, + "logps/rejected": -109.93842892530488, + "loss": 3.4658, + "rewards/chosen": -0.44358871294104535, + "rewards/margins": 0.5560945741445111, + "rewards/rejected": -0.9996832870855564, + "step": 134 + }, + { + "epoch": 4.3562753036437245, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 8.933333333333333e-07, + "logits/chosen": -58873164.8, + "logits/rejected": -87104247.1724138, + "logps/chosen": -194.96414620535714, + "logps/rejected": -126.86558795797414, + "loss": 3.5809, + "rewards/chosen": -0.15710413796561104, + "rewards/margins": 0.7351811103632885, + "rewards/rejected": -0.8922852483288995, + "step": 135 + }, + { + "epoch": 4.388663967611336, + "grad_norm": 192.0, + "kl": 0.0, + "learning_rate": 9e-07, + "logits/chosen": -59124073.14285714, + "logits/rejected": -87515932.44444445, + "logps/chosen": -284.48231724330356, + "logps/rejected": -105.84238009982639, + "loss": 3.5378, + "rewards/chosen": -0.16515704563685826, + "rewards/margins": 0.6709008822365412, + "rewards/rejected": -0.8360579278733995, + "step": 136 + }, + { + "epoch": 4.421052631578947, + "grad_norm": 316.0, + "kl": 0.0, + "learning_rate": 9.066666666666665e-07, + "logits/chosen": -65584450.37037037, + "logits/rejected": -93110797.83783785, + "logps/chosen": -252.453125, + "logps/rejected": -122.6502586570946, + "loss": 3.5077, + "rewards/chosen": -0.2939942677815755, + "rewards/margins": 0.7059658325470246, + "rewards/rejected": -0.9999601003286, + "step": 137 + }, + { + "epoch": 4.4534412955465585, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 9.133333333333333e-07, + "logits/chosen": -51863415.46666667, + "logits/rejected": -91196197.64705883, + "logps/chosen": -255.50361328125, + "logps/rejected": -98.63608685661765, + "loss": 3.4997, + "rewards/chosen": -0.13391879399617512, + "rewards/margins": 0.8613416690452426, + "rewards/rejected": -0.9952604630414177, + "step": 138 + }, + { + "epoch": 4.48582995951417, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 9.2e-07, + "logits/chosen": -54972656.64, + "logits/rejected": -93798885.74358974, + "logps/chosen": -236.46029296875, + "logps/rejected": -136.74373998397436, + "loss": 3.5652, + "rewards/chosen": -0.28614009857177736, + "rewards/margins": 0.7618288392287035, + "rewards/rejected": -1.0479689378004808, + "step": 139 + }, + { + "epoch": 4.518218623481781, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.266666666666665e-07, + "logits/chosen": -62021950.06060606, + "logits/rejected": -97421402.83870968, + "logps/chosen": -245.36388790246212, + "logps/rejected": -131.17503307711692, + "loss": 3.5174, + "rewards/chosen": -0.2576433528553356, + "rewards/margins": 0.8859953810392588, + "rewards/rejected": -1.1436387338945944, + "step": 140 + }, + { + "epoch": 4.550607287449393, + "grad_norm": 230.0, + "kl": 0.0, + "learning_rate": 9.333333333333333e-07, + "logits/chosen": -63452969.29032258, + "logits/rejected": -93965459.39393939, + "logps/chosen": -175.92149697580646, + "logps/rejected": -140.64895537405303, + "loss": 3.6806, + "rewards/chosen": -0.14741454585905997, + "rewards/margins": 0.8697651166720125, + "rewards/rejected": -1.0171796625310725, + "step": 141 + }, + { + "epoch": 4.582995951417004, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 9.399999999999999e-07, + "logits/chosen": -57006187.78947368, + "logits/rejected": -90852352.0, + "logps/chosen": -247.59189967105263, + "logps/rejected": -123.01950307992789, + "loss": 3.534, + "rewards/chosen": -0.21442169892160515, + "rewards/margins": 0.6900306983515319, + "rewards/rejected": -0.904452397273137, + "step": 142 + }, + { + "epoch": 4.615384615384615, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 9.466666666666666e-07, + "logits/chosen": -64618232.0, + "logits/rejected": -100268952.0, + "logps/chosen": -252.03915405273438, + "logps/rejected": -133.17811584472656, + "loss": 3.5107, + "rewards/chosen": -0.16562682390213013, + "rewards/margins": 1.025047481060028, + "rewards/rejected": -1.1906743049621582, + "step": 143 + }, + { + "epoch": 4.647773279352227, + "grad_norm": 230.0, + "kl": 0.0, + "learning_rate": 9.533333333333333e-07, + "logits/chosen": -73837048.68571429, + "logits/rejected": -90723380.96551724, + "logps/chosen": -172.4505580357143, + "logps/rejected": -135.97361597521552, + "loss": 3.5333, + "rewards/chosen": -0.5621269771030971, + "rewards/margins": 0.4758335357816349, + "rewards/rejected": -1.037960512884732, + "step": 144 + }, + { + "epoch": 4.680161943319838, + "grad_norm": 174.0, + "kl": 0.07423794269561768, + "learning_rate": 9.6e-07, + "logits/chosen": -65844446.315789476, + "logits/rejected": -93227086.76923077, + "logps/chosen": -210.4194464432566, + "logps/rejected": -117.88577974759616, + "loss": 3.5285, + "rewards/chosen": -0.249812828867059, + "rewards/margins": 0.9568866891899571, + "rewards/rejected": -1.2066995180570161, + "step": 145 + }, + { + "epoch": 4.712550607287449, + "grad_norm": 240.0, + "kl": 0.0, + "learning_rate": 9.666666666666666e-07, + "logits/chosen": -66073245.538461536, + "logits/rejected": -90012160.0, + "logps/chosen": -154.33119240785257, + "logps/rejected": -104.64640625, + "loss": 3.536, + "rewards/chosen": -0.26410897572835285, + "rewards/margins": 0.8190776697794597, + "rewards/rejected": -1.0831866455078125, + "step": 146 + }, + { + "epoch": 4.744939271255061, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 9.733333333333333e-07, + "logits/chosen": -68960184.0, + "logits/rejected": -92516768.0, + "logps/chosen": -174.9327392578125, + "logps/rejected": -130.5812225341797, + "loss": 3.5352, + "rewards/chosen": -0.4018847942352295, + "rewards/margins": 0.845333456993103, + "rewards/rejected": -1.2472182512283325, + "step": 147 + }, + { + "epoch": 4.777327935222672, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 9.8e-07, + "logits/chosen": -63528101.64705882, + "logits/rejected": -87971114.66666667, + "logps/chosen": -218.85384593290442, + "logps/rejected": -120.32495930989583, + "loss": 3.6012, + "rewards/chosen": -0.18356499952428482, + "rewards/margins": 0.952316551582486, + "rewards/rejected": -1.1358815511067708, + "step": 148 + }, + { + "epoch": 4.809716599190283, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 9.866666666666666e-07, + "logits/chosen": -77304849.65517241, + "logits/rejected": -93613787.42857143, + "logps/chosen": -194.98279229525863, + "logps/rejected": -136.35209263392858, + "loss": 3.531, + "rewards/chosen": -0.6150127937053812, + "rewards/margins": 0.43679809758228616, + "rewards/rejected": -1.0518108912876674, + "step": 149 + }, + { + "epoch": 4.842105263157895, + "grad_norm": 202.0, + "kl": 0.1953182816505432, + "learning_rate": 9.933333333333333e-07, + "logits/chosen": -59967328.0, + "logits/rejected": -98624440.0, + "logps/chosen": -258.19989013671875, + "logps/rejected": -115.43824768066406, + "loss": 3.5744, + "rewards/chosen": -0.16627463698387146, + "rewards/margins": 1.1537322103977203, + "rewards/rejected": -1.3200068473815918, + "step": 150 + }, + { + "epoch": 4.874493927125506, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 1e-06, + "logits/chosen": -56974087.11111111, + "logits/rejected": -91352310.85714285, + "logps/chosen": -247.51087782118054, + "logps/rejected": -125.28658621651786, + "loss": 3.553, + "rewards/chosen": -0.13042267163594565, + "rewards/margins": 1.0511799426305863, + "rewards/rejected": -1.181602614266532, + "step": 151 + }, + { + "epoch": 4.906882591093117, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 9.999888302765345e-07, + "logits/chosen": -56981550.08, + "logits/rejected": -97229817.43589744, + "logps/chosen": -257.08314453125, + "logps/rejected": -141.33010316506412, + "loss": 3.384, + "rewards/chosen": -0.25043046951293946, + "rewards/margins": 1.1548138197874414, + "rewards/rejected": -1.4052442893003807, + "step": 152 + }, + { + "epoch": 4.939271255060729, + "grad_norm": 220.0, + "kl": 0.0, + "learning_rate": 9.99955321605189e-07, + "logits/chosen": -75437582.62857144, + "logits/rejected": -94106120.8275862, + "logps/chosen": -176.31854073660713, + "logps/rejected": -129.12183459051724, + "loss": 3.5434, + "rewards/chosen": -0.2963148662022182, + "rewards/margins": 0.8894236644500582, + "rewards/rejected": -1.1857385306522763, + "step": 153 + }, + { + "epoch": 4.97165991902834, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 9.998994754830939e-07, + "logits/chosen": -68836622.22222222, + "logits/rejected": -95785305.94594595, + "logps/chosen": -169.1109302662037, + "logps/rejected": -126.82097233952703, + "loss": 3.3998, + "rewards/chosen": -0.3583377555564598, + "rewards/margins": 0.8677524436820854, + "rewards/rejected": -1.2260901992385451, + "step": 154 + }, + { + "epoch": 5.0, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 9.998212944053918e-07, + "logits/chosen": -55163964.631578945, + "logits/rejected": -93045504.0, + "logps/chosen": -132.9926629317434, + "logps/rejected": -114.3027980638587, + "loss": 3.4854, + "rewards/chosen": -0.517180041262978, + "rewards/margins": 0.654873586082895, + "rewards/rejected": -1.172053627345873, + "step": 155 + }, + { + "epoch": 5.032388663967612, + "grad_norm": 235.0, + "kl": 0.0, + "learning_rate": 9.997207818651273e-07, + "logits/chosen": -59630369.88235294, + "logits/rejected": -90848819.2, + "logps/chosen": -213.7499281939338, + "logps/rejected": -103.99580078125, + "loss": 3.5373, + "rewards/chosen": -0.29688773435704846, + "rewards/margins": 0.6793691448136872, + "rewards/rejected": -0.9762568791707357, + "step": 156 + }, + { + "epoch": 5.064777327935222, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 9.995979423530892e-07, + "logits/chosen": -64028664.47058824, + "logits/rejected": -83074372.26666667, + "logps/chosen": -227.1330135569853, + "logps/rejected": -104.895263671875, + "loss": 3.4498, + "rewards/chosen": -0.31828005173627066, + "rewards/margins": 0.5380826089896409, + "rewards/rejected": -0.8563626607259115, + "step": 157 + }, + { + "epoch": 5.097165991902834, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 9.99452781357611e-07, + "logits/chosen": -65066653.86666667, + "logits/rejected": -98157778.8235294, + "logps/chosen": -219.489990234375, + "logps/rejected": -124.0337775735294, + "loss": 3.4166, + "rewards/chosen": -0.2401509443918864, + "rewards/margins": 1.2109819982566086, + "rewards/rejected": -1.451132942648495, + "step": 158 + }, + { + "epoch": 5.129554655870446, + "grad_norm": 232.0, + "kl": 0.0, + "learning_rate": 9.992853053643257e-07, + "logits/chosen": -64405714.05128205, + "logits/rejected": -92928952.32, + "logps/chosen": -236.69583834134616, + "logps/rejected": -115.80888671875, + "loss": 3.6956, + "rewards/chosen": -0.22299350836338142, + "rewards/margins": 0.9371969213241187, + "rewards/rejected": -1.1601904296875, + "step": 159 + }, + { + "epoch": 5.161943319838056, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 9.99095521855875e-07, + "logits/chosen": -55735816.827586204, + "logits/rejected": -92454282.97142857, + "logps/chosen": -210.5728380926724, + "logps/rejected": -123.35432477678572, + "loss": 3.3245, + "rewards/chosen": -0.12744968512962604, + "rewards/margins": 1.1540555726131194, + "rewards/rejected": -1.2815052577427455, + "step": 160 + }, + { + "epoch": 5.194331983805668, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 9.988834393115767e-07, + "logits/chosen": -53546334.96774194, + "logits/rejected": -89768354.9090909, + "logps/chosen": -159.03936176915323, + "logps/rejected": -108.08769087357955, + "loss": 3.5483, + "rewards/chosen": -0.08251856219383978, + "rewards/margins": 1.2208744838673582, + "rewards/rejected": -1.303393046061198, + "step": 161 + }, + { + "epoch": 5.22672064777328, + "grad_norm": 226.0, + "kl": 0.0, + "learning_rate": 9.986490672070435e-07, + "logits/chosen": -61786100.36363637, + "logits/rejected": -86724954.83870968, + "logps/chosen": -147.48764500473484, + "logps/rejected": -141.09959362399192, + "loss": 3.4577, + "rewards/chosen": -0.27483367919921875, + "rewards/margins": 1.0007670002598916, + "rewards/rejected": -1.2756006794591104, + "step": 162 + }, + { + "epoch": 5.2591093117408905, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 9.983924160137624e-07, + "logits/chosen": -49868773.51724138, + "logits/rejected": -94799111.31428571, + "logps/chosen": -237.3065396012931, + "logps/rejected": -127.19659598214285, + "loss": 3.4325, + "rewards/chosen": -0.0918145755241657, + "rewards/margins": 1.3340229140126647, + "rewards/rejected": -1.4258374895368304, + "step": 163 + }, + { + "epoch": 5.291497975708502, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 9.981134971986247e-07, + "logits/chosen": -72578391.57894737, + "logits/rejected": -91101213.53846154, + "logps/chosen": -184.6579718338816, + "logps/rejected": -139.31804950420673, + "loss": 3.493, + "rewards/chosen": -0.5171534387688888, + "rewards/margins": 0.9164171296092662, + "rewards/rejected": -1.433570568378155, + "step": 164 + }, + { + "epoch": 5.323886639676114, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 9.978123232234146e-07, + "logits/chosen": -68418910.60869566, + "logits/rejected": -93767867.31707317, + "logps/chosen": -147.5511527683424, + "logps/rejected": -114.96096131859755, + "loss": 3.3151, + "rewards/chosen": -0.6738060660984205, + "rewards/margins": 0.8281306836157564, + "rewards/rejected": -1.501936749714177, + "step": 165 + }, + { + "epoch": 5.3562753036437245, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 9.97488907544252e-07, + "logits/chosen": -59543266.74285714, + "logits/rejected": -88190243.31034483, + "logps/chosen": -195.37498604910715, + "logps/rejected": -131.32277074353448, + "loss": 3.4582, + "rewards/chosen": -0.19818906784057616, + "rewards/margins": 1.1398148142058273, + "rewards/rejected": -1.3380038820464035, + "step": 166 + }, + { + "epoch": 5.388663967611336, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.971432646109917e-07, + "logits/chosen": -59788562.28571428, + "logits/rejected": -88785742.22222222, + "logps/chosen": -284.75272042410717, + "logps/rejected": -109.71605088975694, + "loss": 3.4192, + "rewards/chosen": -0.1921968970979963, + "rewards/margins": 1.0312278024734012, + "rewards/rejected": -1.2234246995713975, + "step": 167 + }, + { + "epoch": 5.421052631578947, + "grad_norm": 232.0, + "kl": 0.0, + "learning_rate": 9.967754098665778e-07, + "logits/chosen": -66138832.59259259, + "logits/rejected": -94453497.08108108, + "logps/chosen": -253.72931134259258, + "logps/rejected": -126.71872360641892, + "loss": 3.4038, + "rewards/chosen": -0.42161104414198136, + "rewards/margins": 0.9851942520599823, + "rewards/rejected": -1.4068052962019637, + "step": 168 + }, + { + "epoch": 5.4534412955465585, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 9.963853597463532e-07, + "logits/chosen": -52095091.2, + "logits/rejected": -92561558.58823529, + "logps/chosen": -256.04205729166665, + "logps/rejected": -103.12436810661765, + "loss": 3.3497, + "rewards/chosen": -0.18776068687438965, + "rewards/margins": 1.2563272953033446, + "rewards/rejected": -1.4440879821777344, + "step": 169 + }, + { + "epoch": 5.48582995951417, + "grad_norm": 212.0, + "kl": 0.0, + "learning_rate": 9.959731316773258e-07, + "logits/chosen": -55681187.84, + "logits/rejected": -95117896.20512821, + "logps/chosen": -237.82603515625, + "logps/rejected": -141.01879256810898, + "loss": 3.4183, + "rewards/chosen": -0.42271514892578127, + "rewards/margins": 1.0527576192220052, + "rewards/rejected": -1.4754727681477864, + "step": 170 + }, + { + "epoch": 5.518218623481781, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 9.9553874407739e-07, + "logits/chosen": -62956140.60606061, + "logits/rejected": -98763990.70967741, + "logps/chosen": -246.2182765151515, + "logps/rejected": -135.6988722278226, + "loss": 3.3954, + "rewards/chosen": -0.34307841098669806, + "rewards/margins": 1.2529449024741135, + "rewards/rejected": -1.5960233134608115, + "step": 171 + }, + { + "epoch": 5.550607287449393, + "grad_norm": 242.0, + "kl": 0.02922797203063965, + "learning_rate": 9.95082216354503e-07, + "logits/chosen": -63656968.258064516, + "logits/rejected": -95357548.60606061, + "logps/chosen": -176.2975050403226, + "logps/rejected": -144.93167021780303, + "loss": 3.5123, + "rewards/chosen": -0.18501430942166236, + "rewards/margins": 1.2604359820674476, + "rewards/rejected": -1.4454502914891099, + "step": 172 + }, + { + "epoch": 5.582995951417004, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 9.946035689058187e-07, + "logits/chosen": -57274246.7368421, + "logits/rejected": -92300780.3076923, + "logps/chosen": -248.19785670230263, + "logps/rejected": -126.5517108623798, + "loss": 3.4008, + "rewards/chosen": -0.2750188676934493, + "rewards/margins": 0.9826543958563554, + "rewards/rejected": -1.2576732635498047, + "step": 173 + }, + { + "epoch": 5.615384615384615, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 9.941028231167755e-07, + "logits/chosen": -65106028.0, + "logits/rejected": -101820640.0, + "logps/chosen": -252.35142517089844, + "logps/rejected": -138.0707550048828, + "loss": 3.3482, + "rewards/chosen": -0.1968565285205841, + "rewards/margins": 1.4830813109874725, + "rewards/rejected": -1.6799378395080566, + "step": 174 + }, + { + "epoch": 5.647773279352227, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.935800013601413e-07, + "logits/chosen": -74467861.94285715, + "logits/rejected": -92196722.7586207, + "logps/chosen": -174.39148995535714, + "logps/rejected": -139.74016702586206, + "loss": 3.4035, + "rewards/chosen": -0.7562218257359096, + "rewards/margins": 0.6583934107437509, + "rewards/rejected": -1.4146152364796605, + "step": 175 + }, + { + "epoch": 5.680161943319838, + "grad_norm": 176.0, + "kl": 0.25683772563934326, + "learning_rate": 9.930351269950143e-07, + "logits/chosen": -66604469.89473684, + "logits/rejected": -94476376.61538461, + "logps/chosen": -211.37924033717104, + "logps/rejected": -122.16629732572116, + "loss": 3.3942, + "rewards/chosen": -0.34579031090987356, + "rewards/margins": 1.288960348739315, + "rewards/rejected": -1.6347506596491888, + "step": 176 + }, + { + "epoch": 5.712550607287449, + "grad_norm": 236.0, + "kl": 0.0, + "learning_rate": 9.924682243657778e-07, + "logits/chosen": -66897624.615384616, + "logits/rejected": -91426672.64, + "logps/chosen": -155.15265675080127, + "logps/rejected": -108.669345703125, + "loss": 3.4077, + "rewards/chosen": -0.3462566718077048, + "rewards/margins": 1.139224437811436, + "rewards/rejected": -1.4854811096191407, + "step": 177 + }, + { + "epoch": 5.744939271255061, + "grad_norm": 203.0, + "kl": 0.04345065355300903, + "learning_rate": 9.918793188010146e-07, + "logits/chosen": -69631032.0, + "logits/rejected": -93599952.0, + "logps/chosen": -176.5198516845703, + "logps/rejected": -135.20755004882812, + "loss": 3.4123, + "rewards/chosen": -0.5605951547622681, + "rewards/margins": 1.1492563486099243, + "rewards/rejected": -1.7098515033721924, + "step": 178 + }, + { + "epoch": 5.777327935222672, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 9.91268436612374e-07, + "logits/chosen": -64168478.11764706, + "logits/rejected": -89267072.0, + "logps/chosen": -219.43959673713235, + "logps/rejected": -124.466162109375, + "loss": 3.4547, + "rewards/chosen": -0.24213720770443187, + "rewards/margins": 1.3078640638613233, + "rewards/rejected": -1.5500012715657552, + "step": 179 + }, + { + "epoch": 5.809716599190283, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 9.906356050933962e-07, + "logits/chosen": -78053764.4137931, + "logits/rejected": -94502816.91428572, + "logps/chosen": -197.25845231681035, + "logps/rejected": -140.19140625, + "loss": 3.3955, + "rewards/chosen": -0.8425777040678879, + "rewards/margins": 0.5931662272937193, + "rewards/rejected": -1.4357439313616072, + "step": 180 + }, + { + "epoch": 5.842105263157895, + "grad_norm": 202.0, + "kl": 0.2528773546218872, + "learning_rate": 9.899808525182934e-07, + "logits/chosen": -60465256.0, + "logits/rejected": -100177120.0, + "logps/chosen": -258.4275817871094, + "logps/rejected": -119.31683349609375, + "loss": 3.4679, + "rewards/chosen": -0.18904456496238708, + "rewards/margins": 1.5188209116458893, + "rewards/rejected": -1.7078654766082764, + "step": 181 + }, + { + "epoch": 5.874493927125506, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 9.893042081406867e-07, + "logits/chosen": -57508433.777777776, + "logits/rejected": -92387894.85714285, + "logps/chosen": -247.75748697916666, + "logps/rejected": -129.10232979910714, + "loss": 3.436, + "rewards/chosen": -0.15508422586652967, + "rewards/margins": 1.408093700333247, + "rewards/rejected": -1.5631779261997767, + "step": 182 + }, + { + "epoch": 5.906882591093117, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 9.886057021922982e-07, + "logits/chosen": -57409761.28, + "logits/rejected": -98912426.66666667, + "logps/chosen": -257.6034375, + "logps/rejected": -145.91838191105768, + "loss": 3.2073, + "rewards/chosen": -0.302458610534668, + "rewards/margins": 1.5616145314925756, + "rewards/rejected": -1.8640731420272436, + "step": 183 + }, + { + "epoch": 5.939271255060729, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 9.878853658816013e-07, + "logits/chosen": -75604348.34285714, + "logits/rejected": -95449511.72413793, + "logps/chosen": -177.4068638392857, + "logps/rejected": -132.91231142241378, + "loss": 3.4123, + "rewards/chosen": -0.4051459176199777, + "rewards/margins": 1.159641190702692, + "rewards/rejected": -1.5647871083226697, + "step": 184 + }, + { + "epoch": 5.97165991902834, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 9.871432313924253e-07, + "logits/chosen": -69507555.55555555, + "logits/rejected": -96964469.62162162, + "logps/chosen": -169.9286205150463, + "logps/rejected": -130.19913956925674, + "loss": 3.2837, + "rewards/chosen": -0.440108440540455, + "rewards/margins": 1.1237977393515952, + "rewards/rejected": -1.5639061798920502, + "step": 185 + }, + { + "epoch": 6.0, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 9.863793318825186e-07, + "logits/chosen": -55615016.421052635, + "logits/rejected": -94030836.86956522, + "logps/chosen": -134.31357935855263, + "logps/rejected": -117.63295049252717, + "loss": 3.3731, + "rewards/chosen": -0.649271663866545, + "rewards/margins": 0.8557975712302621, + "rewards/rejected": -1.5050692350968071, + "step": 186 + }, + { + "epoch": 6.032388663967612, + "grad_norm": 260.0, + "kl": 0.0, + "learning_rate": 9.85593701482066e-07, + "logits/chosen": -60041626.35294118, + "logits/rejected": -91988292.26666667, + "logps/chosen": -214.28821518841912, + "logps/rejected": -106.65919596354166, + "loss": 3.3927, + "rewards/chosen": -0.35071389815386605, + "rewards/margins": 0.8918824102364336, + "rewards/rejected": -1.2425963083902996, + "step": 187 + }, + { + "epoch": 6.064777327935222, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 9.847863752921648e-07, + "logits/chosen": -64591141.64705882, + "logits/rejected": -84054843.73333333, + "logps/chosen": -227.66990751378677, + "logps/rejected": -106.8339599609375, + "loss": 3.3231, + "rewards/chosen": -0.3719707657309139, + "rewards/margins": 0.6782620579588647, + "rewards/rejected": -1.0502328236897787, + "step": 188 + }, + { + "epoch": 6.097165991902834, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 9.839573893832563e-07, + "logits/chosen": -65403413.333333336, + "logits/rejected": -99308649.41176471, + "logps/chosen": -219.85328776041666, + "logps/rejected": -127.46609317555146, + "loss": 3.2881, + "rewards/chosen": -0.2764817555745443, + "rewards/margins": 1.5178822760488473, + "rewards/rejected": -1.7943640316233915, + "step": 189 + }, + { + "epoch": 6.129554655870446, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 9.831067807935138e-07, + "logits/chosen": -64863100.71794872, + "logits/rejected": -94363463.68, + "logps/chosen": -236.728515625, + "logps/rejected": -119.330205078125, + "loss": 3.5543, + "rewards/chosen": -0.22626064985226363, + "rewards/margins": 1.2860617378430488, + "rewards/rejected": -1.5123223876953125, + "step": 190 + }, + { + "epoch": 6.161943319838056, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 9.822345875271883e-07, + "logits/chosen": -55811464.827586204, + "logits/rejected": -93839147.88571429, + "logps/chosen": -210.44074959590517, + "logps/rejected": -126.24461495535714, + "loss": 3.1873, + "rewards/chosen": -0.11424061347698343, + "rewards/margins": 1.4562925940076707, + "rewards/rejected": -1.570533207484654, + "step": 191 + }, + { + "epoch": 6.194331983805668, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 9.8134084855291e-07, + "logits/chosen": -54031000.77419355, + "logits/rejected": -90785093.81818181, + "logps/chosen": -158.9200478830645, + "logps/rejected": -110.96604225852273, + "loss": 3.3842, + "rewards/chosen": -0.07058828107772334, + "rewards/margins": 1.5206403196265852, + "rewards/rejected": -1.5912286007043086, + "step": 192 + }, + { + "epoch": 6.22672064777328, + "grad_norm": 215.0, + "kl": 0.0, + "learning_rate": 9.804256038019481e-07, + "logits/chosen": -62475213.57575758, + "logits/rejected": -87352278.70967741, + "logps/chosen": -148.17927320075756, + "logps/rejected": -143.75623739919354, + "loss": 3.3578, + "rewards/chosen": -0.3439967126557321, + "rewards/margins": 1.197268713487791, + "rewards/rejected": -1.5412654261435232, + "step": 193 + }, + { + "epoch": 6.2591093117408905, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 9.794888941664253e-07, + "logits/chosen": -50114127.448275864, + "logits/rejected": -95931677.25714286, + "logps/chosen": -237.2588900862069, + "logps/rejected": -130.54501953125, + "loss": 3.3044, + "rewards/chosen": -0.08704933626898403, + "rewards/margins": 1.6736307261612615, + "rewards/rejected": -1.7606800624302454, + "step": 194 + }, + { + "epoch": 6.291497975708502, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 9.78530761497492e-07, + "logits/chosen": -72950359.57894737, + "logits/rejected": -92061430.15384616, + "logps/chosen": -185.59851716694078, + "logps/rejected": -142.77226374699518, + "loss": 3.3508, + "rewards/chosen": -0.6112096686112253, + "rewards/margins": 1.1677829109222784, + "rewards/rejected": -1.7789925795335035, + "step": 195 + }, + { + "epoch": 6.323886639676114, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 9.77551248603456e-07, + "logits/chosen": -69183944.3478261, + "logits/rejected": -94778099.51219513, + "logps/chosen": -149.1656547214674, + "logps/rejected": -117.63174066310975, + "loss": 3.1596, + "rewards/chosen": -0.8352551667586617, + "rewards/margins": 0.9337595174067347, + "rewards/rejected": -1.7690146841653964, + "step": 196 + }, + { + "epoch": 6.3562753036437245, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 9.765503992478703e-07, + "logits/chosen": -60052831.08571429, + "logits/rejected": -89278146.20689656, + "logps/chosen": -195.72306082589284, + "logps/rejected": -133.82958984375, + "loss": 3.349, + "rewards/chosen": -0.2329949242728097, + "rewards/margins": 1.3556903416300055, + "rewards/rejected": -1.5886852659028152, + "step": 197 + }, + { + "epoch": 6.388663967611336, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 9.755282581475767e-07, + "logits/chosen": -60195122.28571428, + "logits/rejected": -89645120.0, + "logps/chosen": -285.11851283482144, + "logps/rejected": -112.12190755208333, + "loss": 3.3204, + "rewards/chosen": -0.2287752287728446, + "rewards/margins": 1.2352361754765586, + "rewards/rejected": -1.4640114042494032, + "step": 198 + }, + { + "epoch": 6.421052631578947, + "grad_norm": 214.0, + "kl": 0.0, + "learning_rate": 9.74484870970709e-07, + "logits/chosen": -66635742.81481481, + "logits/rejected": -95713736.64864865, + "logps/chosen": -254.17954282407408, + "logps/rejected": -129.53260926942568, + "loss": 3.273, + "rewards/chosen": -0.4666371875339084, + "rewards/margins": 1.221557705967992, + "rewards/rejected": -1.6881948935019004, + "step": 199 + }, + { + "epoch": 6.4534412955465585, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 9.73420284334652e-07, + "logits/chosen": -52548770.13333333, + "logits/rejected": -93875373.1764706, + "logps/chosen": -255.763134765625, + "logps/rejected": -105.7856086282169, + "loss": 3.2525, + "rewards/chosen": -0.15987102190653482, + "rewards/margins": 1.5503407880371691, + "rewards/rejected": -1.710211809943704, + "step": 200 + }, + { + "epoch": 6.4534412955465585, + "eval_kl": 0.0, + "eval_logits/chosen": -77142447.04934542, + "eval_logits/rejected": -115986221.20686175, + "eval_logps/chosen": -211.08607124874118, + "eval_logps/rejected": -128.5555073789102, + "eval_loss": 0.29369857907295227, + "eval_rewards/chosen": -0.37724699114504656, + "eval_rewards/margins": 1.3236440359109576, + "eval_rewards/rejected": -1.700891027056004, + "eval_runtime": 64.2138, + "eval_samples_per_second": 30.694, + "eval_steps_per_second": 0.966, + "step": 200 + }, + { + "epoch": 6.48582995951417, + "grad_norm": 192.0, + "kl": 0.0, + "learning_rate": 9.723345458039593e-07, + "logits/chosen": -56222848.0, + "logits/rejected": -96234167.79487179, + "logps/chosen": -238.5019140625, + "logps/rejected": -143.9195337540064, + "loss": 3.2819, + "rewards/chosen": -0.4902991485595703, + "rewards/margins": 1.2752472099891077, + "rewards/rejected": -1.765546358548678, + "step": 201 + }, + { + "epoch": 6.518218623481781, + "grad_norm": 163.0, + "kl": 0.0, + "learning_rate": 9.712277038882273e-07, + "logits/chosen": -63160448.0, + "logits/rejected": -100012098.06451613, + "logps/chosen": -246.70954663825756, + "logps/rejected": -138.87134576612902, + "loss": 3.2747, + "rewards/chosen": -0.39220518054384174, + "rewards/margins": 1.5210654621366648, + "rewards/rejected": -1.9132706426805066, + "step": 202 + }, + { + "epoch": 6.550607287449393, + "grad_norm": 240.0, + "kl": 0.0, + "learning_rate": 9.700998080399285e-07, + "logits/chosen": -64144788.64516129, + "logits/rejected": -96506104.24242425, + "logps/chosen": -176.5855909778226, + "logps/rejected": -147.65744850852272, + "loss": 3.3754, + "rewards/chosen": -0.21382254938925466, + "rewards/margins": 1.5042050856649, + "rewards/rejected": -1.7180276350541548, + "step": 203 + }, + { + "epoch": 6.582995951417004, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 9.689509086522018e-07, + "logits/chosen": -57705162.10526316, + "logits/rejected": -93512251.07692307, + "logps/chosen": -248.44014699835526, + "logps/rejected": -128.82718599759616, + "loss": 3.2803, + "rewards/chosen": -0.2992460602208188, + "rewards/margins": 1.18597580257215, + "rewards/rejected": -1.4852218627929688, + "step": 204 + }, + { + "epoch": 6.615384615384615, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 9.67781057056601e-07, + "logits/chosen": -65230268.0, + "logits/rejected": -103285960.0, + "logps/chosen": -252.68380737304688, + "logps/rejected": -141.1610107421875, + "loss": 3.2449, + "rewards/chosen": -0.23009414970874786, + "rewards/margins": 1.758869931101799, + "rewards/rejected": -1.9889640808105469, + "step": 205 + }, + { + "epoch": 6.647773279352227, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 9.665903055208012e-07, + "logits/chosen": -75168343.77142857, + "logits/rejected": -93463534.34482759, + "logps/chosen": -175.55613839285715, + "logps/rejected": -142.09524851831895, + "loss": 3.2832, + "rewards/chosen": -0.8726874760219029, + "rewards/margins": 0.7774363832520733, + "rewards/rejected": -1.6501238592739762, + "step": 206 + }, + { + "epoch": 6.680161943319838, + "grad_norm": 174.0, + "kl": 0.27339088916778564, + "learning_rate": 9.653787072462643e-07, + "logits/chosen": -67068530.526315786, + "logits/rejected": -95274289.23076923, + "logps/chosen": -211.9501310649671, + "logps/rejected": -124.63771409254808, + "loss": 3.2769, + "rewards/chosen": -0.402879840449283, + "rewards/margins": 1.4790127306331988, + "rewards/rejected": -1.8818925710824819, + "step": 207 + }, + { + "epoch": 6.712550607287449, + "grad_norm": 231.0, + "kl": 0.0, + "learning_rate": 9.641463163658605e-07, + "logits/chosen": -67239463.38461539, + "logits/rejected": -92569292.8, + "logps/chosen": -155.6209935897436, + "logps/rejected": -110.720048828125, + "loss": 3.2899, + "rewards/chosen": -0.39308939224634415, + "rewards/margins": 1.2974609922751403, + "rewards/rejected": -1.6905503845214844, + "step": 208 + }, + { + "epoch": 6.744939271255061, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 9.628931879414516e-07, + "logits/chosen": -70281176.0, + "logits/rejected": -94688808.0, + "logps/chosen": -177.175048828125, + "logps/rejected": -137.61363220214844, + "loss": 3.2975, + "rewards/chosen": -0.6261154413223267, + "rewards/margins": 1.3243438005447388, + "rewards/rejected": -1.9504592418670654, + "step": 209 + }, + { + "epoch": 6.777327935222672, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 9.616193779614293e-07, + "logits/chosen": -64514537.4117647, + "logits/rejected": -90385382.4, + "logps/chosen": -219.74317842371323, + "logps/rejected": -126.93536783854167, + "loss": 3.3416, + "rewards/chosen": -0.27249841129078584, + "rewards/margins": 1.524424145268459, + "rewards/rejected": -1.7969225565592448, + "step": 210 + }, + { + "epoch": 6.809716599190283, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 9.603249433382144e-07, + "logits/chosen": -78874774.06896552, + "logits/rejected": -95458099.2, + "logps/chosen": -198.69526198814654, + "logps/rejected": -142.20475725446428, + "loss": 3.2742, + "rewards/chosen": -0.9862608416327114, + "rewards/margins": 0.6508184517545654, + "rewards/rejected": -1.6370792933872769, + "step": 211 + }, + { + "epoch": 6.842105263157895, + "grad_norm": 184.0, + "kl": 0.2491929531097412, + "learning_rate": 9.590099419057141e-07, + "logits/chosen": -60872712.0, + "logits/rejected": -101340408.0, + "logps/chosen": -258.4055480957031, + "logps/rejected": -121.7928466796875, + "loss": 3.3416, + "rewards/chosen": -0.18684199452400208, + "rewards/margins": 1.7686249911785126, + "rewards/rejected": -1.9554669857025146, + "step": 212 + }, + { + "epoch": 6.874493927125506, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 9.576744324167378e-07, + "logits/chosen": -57879082.666666664, + "logits/rejected": -93678509.71428572, + "logps/chosen": -247.47976345486111, + "logps/rejected": -131.52873883928572, + "loss": 3.3345, + "rewards/chosen": -0.12731069988674587, + "rewards/margins": 1.6785063592214433, + "rewards/rejected": -1.8058170591081892, + "step": 213 + }, + { + "epoch": 6.906882591093117, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 9.563184745403722e-07, + "logits/chosen": -57542656.0, + "logits/rejected": -99881235.6923077, + "logps/chosen": -257.97509765625, + "logps/rejected": -148.26245743189102, + "loss": 3.096, + "rewards/chosen": -0.3396272659301758, + "rewards/margins": 1.758853349930201, + "rewards/rejected": -2.098480615860377, + "step": 214 + }, + { + "epoch": 6.939271255060729, + "grad_norm": 208.0, + "kl": 0.0, + "learning_rate": 9.549421288593157e-07, + "logits/chosen": -76082892.8, + "logits/rejected": -96593584.55172414, + "logps/chosen": -177.5617466517857, + "logps/rejected": -135.0049838362069, + "loss": 3.2982, + "rewards/chosen": -0.4206359318324498, + "rewards/margins": 1.3534188914181564, + "rewards/rejected": -1.7740548232506061, + "step": 215 + }, + { + "epoch": 6.97165991902834, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 9.535454568671704e-07, + "logits/chosen": -70078094.22222222, + "logits/rejected": -98170077.4054054, + "logps/chosen": -170.33772786458334, + "logps/rejected": -132.48622255067568, + "loss": 3.163, + "rewards/chosen": -0.48101958522090205, + "rewards/margins": 1.3115953258327298, + "rewards/rejected": -1.7926149110536318, + "step": 216 + }, + { + "epoch": 7.0, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 9.521285209656962e-07, + "logits/chosen": -56098374.7368421, + "logits/rejected": -94667386.43478261, + "logps/chosen": -135.1068693462171, + "logps/rejected": -119.83950407608695, + "loss": 3.2609, + "rewards/chosen": -0.7286000502736945, + "rewards/margins": 0.9971250407482994, + "rewards/rejected": -1.725725091021994, + "step": 217 + }, + { + "epoch": 7.032388663967612, + "grad_norm": 235.0, + "kl": 0.0, + "learning_rate": 9.506913844620217e-07, + "logits/chosen": -60832783.058823526, + "logits/rejected": -92965239.46666667, + "logps/chosen": -214.72449448529412, + "logps/rejected": -108.35077311197917, + "loss": 3.2728, + "rewards/chosen": -0.39434508716358857, + "rewards/margins": 1.0174091395209819, + "rewards/rejected": -1.4117542266845704, + "step": 218 + }, + { + "epoch": 7.064777327935222, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 9.492341115658165e-07, + "logits/chosen": -65132333.176470585, + "logits/rejected": -84811306.66666667, + "logps/chosen": -228.4188735064338, + "logps/rejected": -108.3897705078125, + "loss": 3.248, + "rewards/chosen": -0.4468682233025046, + "rewards/margins": 0.7589439766079773, + "rewards/rejected": -1.2058121999104818, + "step": 219 + }, + { + "epoch": 7.097165991902834, + "grad_norm": 216.0, + "kl": 0.0, + "learning_rate": 9.477567673864215e-07, + "logits/chosen": -65939255.46666667, + "logits/rejected": -100603309.1764706, + "logps/chosen": -220.11765950520834, + "logps/rejected": -130.0303452435662, + "loss": 3.1895, + "rewards/chosen": -0.30291754404703775, + "rewards/margins": 1.7478724573172777, + "rewards/rejected": -2.0507900013643154, + "step": 220 + }, + { + "epoch": 7.129554655870446, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 9.462594179299405e-07, + "logits/chosen": -65328771.28205128, + "logits/rejected": -95510886.4, + "logps/chosen": -236.89200220352564, + "logps/rejected": -121.12919921875, + "loss": 3.4369, + "rewards/chosen": -0.24260655427590394, + "rewards/margins": 1.4496146676479242, + "rewards/rejected": -1.6922212219238282, + "step": 221 + }, + { + "epoch": 7.161943319838056, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 9.44742130096291e-07, + "logits/chosen": -56353438.89655172, + "logits/rejected": -94957363.2, + "logps/chosen": -210.51030441810346, + "logps/rejected": -128.38976004464286, + "loss": 3.0819, + "rewards/chosen": -0.12119495457616346, + "rewards/margins": 1.6638531760041937, + "rewards/rejected": -1.7850481305803572, + "step": 222 + }, + { + "epoch": 7.194331983805668, + "grad_norm": 220.0, + "kl": 0.0, + "learning_rate": 9.432049716762149e-07, + "logits/chosen": -54206001.548387095, + "logits/rejected": -91852435.39393939, + "logps/chosen": -158.88182018649192, + "logps/rejected": -113.12599875710227, + "loss": 3.2916, + "rewards/chosen": -0.06676443161502961, + "rewards/margins": 1.7404590328884033, + "rewards/rejected": -1.8072234645034329, + "step": 223 + }, + { + "epoch": 7.22672064777328, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 9.416480113482503e-07, + "logits/chosen": -62940737.93939394, + "logits/rejected": -88258807.74193548, + "logps/chosen": -148.4831025094697, + "logps/rejected": -145.8203597530242, + "loss": 3.2473, + "rewards/chosen": -0.3743794181130149, + "rewards/margins": 1.373298292635473, + "rewards/rejected": -1.747677710748488, + "step": 224 + }, + { + "epoch": 7.2591093117408905, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 9.400713186756623e-07, + "logits/chosen": -50639894.06896552, + "logits/rejected": -96988533.02857143, + "logps/chosen": -237.03690732758622, + "logps/rejected": -132.83162667410716, + "loss": 3.2031, + "rewards/chosen": -0.06484996450358424, + "rewards/margins": 1.9244915533535585, + "rewards/rejected": -1.9893415178571427, + "step": 225 + }, + { + "epoch": 7.291497975708502, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 9.384749641033357e-07, + "logits/chosen": -73861248.0, + "logits/rejected": -93220312.61538461, + "logps/chosen": -185.95038805509867, + "logps/rejected": -145.14332932692307, + "loss": 3.2424, + "rewards/chosen": -0.6463972392835116, + "rewards/margins": 1.3697013700539284, + "rewards/rejected": -2.01609860933744, + "step": 226 + }, + { + "epoch": 7.323886639676114, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 9.368590189546267e-07, + "logits/chosen": -69601625.04347827, + "logits/rejected": -95669123.12195122, + "logps/chosen": -150.1243206521739, + "logps/rejected": -119.9890672637195, + "loss": 3.0651, + "rewards/chosen": -0.9311220749564793, + "rewards/margins": 1.073624524938719, + "rewards/rejected": -2.0047465998951983, + "step": 227 + }, + { + "epoch": 7.3562753036437245, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 9.352235554281773e-07, + "logits/chosen": -60476328.22857143, + "logits/rejected": -90111011.31034483, + "logps/chosen": -195.48328683035714, + "logps/rejected": -135.86571423760776, + "loss": 3.2539, + "rewards/chosen": -0.20901996067592077, + "rewards/margins": 1.5832775341466143, + "rewards/rejected": -1.792297494822535, + "step": 228 + }, + { + "epoch": 7.388663967611336, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 9.335686465946886e-07, + "logits/chosen": -60742925.71428572, + "logits/rejected": -90614691.55555555, + "logps/chosen": -285.0276402064732, + "logps/rejected": -114.05181206597223, + "loss": 3.2212, + "rewards/chosen": -0.21968691689627512, + "rewards/margins": 1.4373150023203047, + "rewards/rejected": -1.65700191921658, + "step": 229 + }, + { + "epoch": 7.421052631578947, + "grad_norm": 243.0, + "kl": 0.0, + "learning_rate": 9.318943663936569e-07, + "logits/chosen": -67078437.925925925, + "logits/rejected": -96883331.45945945, + "logps/chosen": -254.83188657407408, + "logps/rejected": -131.86358477618242, + "loss": 3.1714, + "rewards/chosen": -0.5318703828034578, + "rewards/margins": 1.3894218964142366, + "rewards/rejected": -1.9212922792176943, + "step": 230 + }, + { + "epoch": 7.4534412955465585, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 9.302007896300697e-07, + "logits/chosen": -52654301.86666667, + "logits/rejected": -94614896.94117647, + "logps/chosen": -255.72353515625, + "logps/rejected": -107.79805261948529, + "loss": 3.1333, + "rewards/chosen": -0.15591179529825847, + "rewards/margins": 1.7555445932874492, + "rewards/rejected": -1.9114563885857077, + "step": 231 + }, + { + "epoch": 7.48582995951417, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 9.284879919710631e-07, + "logits/chosen": -56632893.44, + "logits/rejected": -97366567.38461539, + "logps/chosen": -238.73591796875, + "logps/rejected": -145.61743790064102, + "loss": 3.1966, + "rewards/chosen": -0.5137019348144531, + "rewards/margins": 1.4216348383976862, + "rewards/rejected": -1.9353367732121394, + "step": 232 + }, + { + "epoch": 7.518218623481781, + "grad_norm": 169.0, + "kl": 0.0, + "learning_rate": 9.267560499425424e-07, + "logits/chosen": -63812239.515151516, + "logits/rejected": -101234688.0, + "logps/chosen": -246.65836588541666, + "logps/rejected": -140.67149697580646, + "loss": 3.1868, + "rewards/chosen": -0.3870873306736802, + "rewards/margins": 1.7061980145884168, + "rewards/rejected": -2.093285345262097, + "step": 233 + }, + { + "epoch": 7.550607287449393, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 9.250050409257611e-07, + "logits/chosen": -64757702.19354839, + "logits/rejected": -97548730.18181819, + "logps/chosen": -176.59056829637098, + "logps/rejected": -149.50257457386363, + "loss": 3.2909, + "rewards/chosen": -0.21432172098467428, + "rewards/margins": 1.6882187749167223, + "rewards/rejected": -1.9025404959013967, + "step": 234 + }, + { + "epoch": 7.582995951417004, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 9.232350431538656e-07, + "logits/chosen": -58187061.89473684, + "logits/rejected": -94484814.76923077, + "logps/chosen": -248.21343030427633, + "logps/rejected": -130.44036395733173, + "loss": 3.1836, + "rewards/chosen": -0.27657767346030787, + "rewards/margins": 1.369960107301411, + "rewards/rejected": -1.6465377807617188, + "step": 235 + }, + { + "epoch": 7.615384615384615, + "grad_norm": 174.0, + "kl": 0.0, + "learning_rate": 9.214461357083985e-07, + "logits/chosen": -65923708.0, + "logits/rejected": -104427760.0, + "logps/chosen": -252.69491577148438, + "logps/rejected": -143.23907470703125, + "loss": 3.1462, + "rewards/chosen": -0.23120331764221191, + "rewards/margins": 1.9655675888061523, + "rewards/rejected": -2.1967709064483643, + "step": 236 + }, + { + "epoch": 7.647773279352227, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.196383985157656e-07, + "logits/chosen": -75513439.08571428, + "logits/rejected": -94364707.31034483, + "logps/chosen": -176.4595703125, + "logps/rejected": -143.82543103448276, + "loss": 3.192, + "rewards/chosen": -0.9630287170410157, + "rewards/margins": 0.8601124467520878, + "rewards/rejected": -1.8231411637931034, + "step": 237 + }, + { + "epoch": 7.680161943319838, + "grad_norm": 186.0, + "kl": 0.2926532030105591, + "learning_rate": 9.178119123436649e-07, + "logits/chosen": -67436618.10526316, + "logits/rejected": -96375246.76923077, + "logps/chosen": -212.06575092516448, + "logps/rejected": -126.75918344350961, + "loss": 3.2137, + "rewards/chosen": -0.4144437689530222, + "rewards/margins": 1.6795955611626627, + "rewards/rejected": -2.094039330115685, + "step": 238 + }, + { + "epoch": 7.712550607287449, + "grad_norm": 223.0, + "kl": 0.0, + "learning_rate": 9.159667587974785e-07, + "logits/chosen": -67735965.53846154, + "logits/rejected": -93502791.68, + "logps/chosen": -155.74129857772436, + "logps/rejected": -112.822978515625, + "loss": 3.1726, + "rewards/chosen": -0.4051205806243114, + "rewards/margins": 1.4957229252350634, + "rewards/rejected": -1.900843505859375, + "step": 239 + }, + { + "epoch": 7.744939271255061, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 9.141030203166256e-07, + "logits/chosen": -70799248.0, + "logits/rejected": -95561552.0, + "logps/chosen": -177.97103881835938, + "logps/rejected": -139.7872314453125, + "loss": 3.1999, + "rewards/chosen": -0.7057160139083862, + "rewards/margins": 1.462103247642517, + "rewards/rejected": -2.1678192615509033, + "step": 240 + }, + { + "epoch": 7.777327935222672, + "grad_norm": 212.0, + "kl": 0.0, + "learning_rate": 9.122207801708801e-07, + "logits/chosen": -65076141.176470585, + "logits/rejected": -91102412.8, + "logps/chosen": -219.56000114889707, + "logps/rejected": -128.51177571614582, + "loss": 3.2295, + "rewards/chosen": -0.2541801789227654, + "rewards/margins": 1.7003829619463753, + "rewards/rejected": -1.9545631408691406, + "step": 241 + }, + { + "epoch": 7.809716599190283, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.103201224566497e-07, + "logits/chosen": -79426348.13793103, + "logits/rejected": -96227540.11428571, + "logps/chosen": -199.74701980064654, + "logps/rejected": -143.68773716517856, + "loss": 3.1877, + "rewards/chosen": -1.091436320337756, + "rewards/margins": 0.6939400921901457, + "rewards/rejected": -1.7853764125279017, + "step": 242 + }, + { + "epoch": 7.842105263157895, + "grad_norm": 171.0, + "kl": 0.32616788148880005, + "learning_rate": 9.084011320932188e-07, + "logits/chosen": -60969656.0, + "logits/rejected": -102558344.0, + "logps/chosen": -258.640380859375, + "logps/rejected": -123.86648559570312, + "loss": 3.2517, + "rewards/chosen": -0.21032822132110596, + "rewards/margins": 1.9525035619735718, + "rewards/rejected": -2.1628317832946777, + "step": 243 + }, + { + "epoch": 7.874493927125506, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 9.064638948189538e-07, + "logits/chosen": -58330549.333333336, + "logits/rejected": -94501805.71428572, + "logps/chosen": -247.38720703125, + "logps/rejected": -133.05278669084822, + "loss": 3.2518, + "rewards/chosen": -0.11805566151936848, + "rewards/margins": 1.8401678176153276, + "rewards/rejected": -1.958223479134696, + "step": 244 + }, + { + "epoch": 7.906882591093117, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.045084971874737e-07, + "logits/chosen": -58010946.56, + "logits/rejected": -100820814.76923077, + "logps/chosen": -258.1535546875, + "logps/rejected": -150.38654346955127, + "loss": 3.0115, + "rewards/chosen": -0.357471809387207, + "rewards/margins": 1.9534176058646961, + "rewards/rejected": -2.310889415251903, + "step": 245 + }, + { + "epoch": 7.939271255060729, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 9.025350265637815e-07, + "logits/chosen": -76450004.11428571, + "logits/rejected": -97642875.5862069, + "logps/chosen": -177.54049944196427, + "logps/rejected": -137.0269143992457, + "loss": 3.1959, + "rewards/chosen": -0.41851163591657364, + "rewards/margins": 1.5577352965406597, + "rewards/rejected": -1.9762469324572334, + "step": 246 + }, + { + "epoch": 7.97165991902834, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 9.005435711203618e-07, + "logits/chosen": -70475733.33333333, + "logits/rejected": -98785888.86486487, + "logps/chosen": -170.7393844039352, + "logps/rejected": -134.28969594594594, + "loss": 3.0606, + "rewards/chosen": -0.5211844974093967, + "rewards/margins": 1.4517781655709665, + "rewards/rejected": -1.9729626629803632, + "step": 247 + }, + { + "epoch": 8.0, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 8.985342198332406e-07, + "logits/chosen": -56464353.684210524, + "logits/rejected": -95736943.30434783, + "logps/chosen": -135.89539216694078, + "logps/rejected": -121.2200343919837, + "loss": 3.163, + "rewards/chosen": -0.8074518002961811, + "rewards/margins": 1.0563260237739613, + "rewards/rejected": -1.8637778240701426, + "step": 248 + }, + { + "epoch": 8.03238866396761, + "grad_norm": 222.0, + "kl": 0.0, + "learning_rate": 8.965070624780115e-07, + "logits/chosen": -61136862.11764706, + "logits/rejected": -93872947.2, + "logps/chosen": -214.90665211397058, + "logps/rejected": -109.716455078125, + "loss": 3.2072, + "rewards/chosen": -0.41255920073565316, + "rewards/margins": 1.1357639855029538, + "rewards/rejected": -1.5483231862386069, + "step": 249 + }, + { + "epoch": 8.064777327935223, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 8.944621896258224e-07, + "logits/chosen": -65452182.5882353, + "logits/rejected": -85745203.2, + "logps/chosen": -228.57809627757354, + "logps/rejected": -109.6790771484375, + "loss": 3.171, + "rewards/chosen": -0.4627881330602309, + "rewards/margins": 0.871955239539053, + "rewards/rejected": -1.3347433725992839, + "step": 250 + }, + { + "epoch": 8.097165991902834, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 8.923996926393305e-07, + "logits/chosen": -66272162.13333333, + "logits/rejected": -101397797.64705883, + "logps/chosen": -220.34915364583333, + "logps/rejected": -131.8910414751838, + "loss": 3.0849, + "rewards/chosen": -0.3260688781738281, + "rewards/margins": 1.9107903873219207, + "rewards/rejected": -2.236859265495749, + "step": 251 + }, + { + "epoch": 8.129554655870445, + "grad_norm": 227.0, + "kl": 0.0, + "learning_rate": 8.903196636686197e-07, + "logits/chosen": -65849974.15384615, + "logits/rejected": -96421734.4, + "logps/chosen": -236.94503705929486, + "logps/rejected": -122.508896484375, + "loss": 3.371, + "rewards/chosen": -0.24791299379788911, + "rewards/margins": 1.5822778936532829, + "rewards/rejected": -1.8301908874511719, + "step": 252 + }, + { + "epoch": 8.161943319838057, + "grad_norm": 161.0, + "kl": 0.0, + "learning_rate": 8.882221956470836e-07, + "logits/chosen": -56595155.862068966, + "logits/rejected": -95794848.91428572, + "logps/chosen": -210.66039197198276, + "logps/rejected": -129.80171595982142, + "loss": 3.0067, + "rewards/chosen": -0.13620573898841595, + "rewards/margins": 1.7900380703028786, + "rewards/rejected": -1.9262438092912946, + "step": 253 + }, + { + "epoch": 8.194331983805668, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 8.861073822872733e-07, + "logits/chosen": -54373916.90322581, + "logits/rejected": -92542029.57575758, + "logps/chosen": -158.83270854334677, + "logps/rejected": -114.67375414299242, + "loss": 3.1687, + "rewards/chosen": -0.06185272432142688, + "rewards/margins": 1.9001471110686063, + "rewards/rejected": -1.9619998353900332, + "step": 254 + }, + { + "epoch": 8.226720647773279, + "grad_norm": 214.0, + "kl": 0.0, + "learning_rate": 8.839753180767107e-07, + "logits/chosen": -63268448.96969697, + "logits/rejected": -89012306.58064516, + "logps/chosen": -148.8386896306818, + "logps/rejected": -147.22150642641128, + "loss": 3.1673, + "rewards/chosen": -0.40993794527920807, + "rewards/margins": 1.4778542113094386, + "rewards/rejected": -1.8877921565886466, + "step": 255 + }, + { + "epoch": 8.259109311740891, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 8.818260982736661e-07, + "logits/chosen": -51074745.37931035, + "logits/rejected": -97956293.48571429, + "logps/chosen": -236.98644598599137, + "logps/rejected": -134.256640625, + "loss": 3.1121, + "rewards/chosen": -0.059805117804428626, + "rewards/margins": 2.072036923211196, + "rewards/rejected": -2.131842041015625, + "step": 256 + }, + { + "epoch": 8.291497975708502, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 8.796598189029029e-07, + "logits/chosen": -74012564.21052632, + "logits/rejected": -94074112.0, + "logps/chosen": -186.44219006990133, + "logps/rejected": -146.62846491887018, + "loss": 3.1515, + "rewards/chosen": -0.6955768685591849, + "rewards/margins": 1.469036415038321, + "rewards/rejected": -2.164613283597506, + "step": 257 + }, + { + "epoch": 8.323886639676113, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 8.774765767513874e-07, + "logits/chosen": -70070772.86956522, + "logits/rejected": -96451228.09756097, + "logps/chosen": -150.84981105638587, + "logps/rejected": -121.71354563643293, + "loss": 2.9867, + "rewards/chosen": -1.0036715631899626, + "rewards/margins": 1.1735239160402202, + "rewards/rejected": -2.177195479230183, + "step": 258 + }, + { + "epoch": 8.356275303643725, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 8.752764693639638e-07, + "logits/chosen": -61003702.85714286, + "logits/rejected": -90766124.13793103, + "logps/chosen": -195.6492885044643, + "logps/rejected": -137.05028455010776, + "loss": 3.1731, + "rewards/chosen": -0.22561860765729633, + "rewards/margins": 1.685136944437262, + "rewards/rejected": -1.9107555520945583, + "step": 259 + }, + { + "epoch": 8.388663967611336, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 8.730595950389967e-07, + "logits/chosen": -60814843.428571425, + "logits/rejected": -91367274.66666667, + "logps/chosen": -285.06014578683033, + "logps/rejected": -115.19327799479167, + "loss": 3.1373, + "rewards/chosen": -0.22293782234191895, + "rewards/margins": 1.5482103294796414, + "rewards/rejected": -1.7711481518215604, + "step": 260 + }, + { + "epoch": 8.421052631578947, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 8.708260528239788e-07, + "logits/chosen": -67363949.03703703, + "logits/rejected": -98014982.91891892, + "logps/chosen": -255.33013237847223, + "logps/rejected": -132.92724609375, + "loss": 3.056, + "rewards/chosen": -0.5816964396723995, + "rewards/margins": 1.4459617651021994, + "rewards/rejected": -2.027658204774599, + "step": 261 + }, + { + "epoch": 8.45344129554656, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 8.685759425111054e-07, + "logits/chosen": -53188189.86666667, + "logits/rejected": -95383913.41176471, + "logps/chosen": -255.84373372395834, + "logps/rejected": -109.32036994485294, + "loss": 3.0451, + "rewards/chosen": -0.1679290771484375, + "rewards/margins": 1.8957604352165673, + "rewards/rejected": -2.0636895123650048, + "step": 262 + }, + { + "epoch": 8.48582995951417, + "grad_norm": 174.0, + "kl": 0.0, + "learning_rate": 8.663093646328166e-07, + "logits/chosen": -57241712.64, + "logits/rejected": -98063478.15384616, + "logps/chosen": -238.8626953125, + "logps/rejected": -147.1845202323718, + "loss": 3.0891, + "rewards/chosen": -0.5263804626464844, + "rewards/margins": 1.5656652479905349, + "rewards/rejected": -2.092045710637019, + "step": 263 + }, + { + "epoch": 8.518218623481781, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 8.640264204573046e-07, + "logits/chosen": -64148619.63636363, + "logits/rejected": -102109117.93548387, + "logps/chosen": -246.94140625, + "logps/rejected": -142.4841544858871, + "loss": 3.1034, + "rewards/chosen": -0.41539241328383936, + "rewards/margins": 1.8591594705367251, + "rewards/rejected": -2.2745518838205645, + "step": 264 + }, + { + "epoch": 8.550607287449393, + "grad_norm": 210.0, + "kl": 0.0, + "learning_rate": 8.617272119839902e-07, + "logits/chosen": -64987726.451612905, + "logits/rejected": -98316613.81818181, + "logps/chosen": -176.54696950604838, + "logps/rejected": -150.72878196022728, + "loss": 3.1912, + "rewards/chosen": -0.2099611682276572, + "rewards/margins": 1.8152008061301788, + "rewards/rejected": -2.025161974357836, + "step": 265 + }, + { + "epoch": 8.582995951417004, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 8.594118419389647e-07, + "logits/chosen": -58319925.89473684, + "logits/rejected": -95481777.23076923, + "logps/chosen": -248.0672029194079, + "logps/rejected": -131.7604041466346, + "loss": 3.121, + "rewards/chosen": -0.2619515469199733, + "rewards/margins": 1.5165908983361864, + "rewards/rejected": -1.7785424452561598, + "step": 266 + }, + { + "epoch": 8.615384615384615, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 8.570804137704003e-07, + "logits/chosen": -66109800.0, + "logits/rejected": -105424344.0, + "logps/chosen": -252.68231201171875, + "logps/rejected": -144.7725830078125, + "loss": 3.1032, + "rewards/chosen": -0.22994595766067505, + "rewards/margins": 2.1201741099357605, + "rewards/rejected": -2.3501200675964355, + "step": 267 + }, + { + "epoch": 8.647773279352228, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 8.54733031643929e-07, + "logits/chosen": -76308333.71428572, + "logits/rejected": -95257705.93103448, + "logps/chosen": -177.07209821428572, + "logps/rejected": -145.09445716594828, + "loss": 3.1061, + "rewards/chosen": -1.0242833818708148, + "rewards/margins": 0.9257603950688404, + "rewards/rejected": -1.9500437769396552, + "step": 268 + }, + { + "epoch": 8.680161943319838, + "grad_norm": 171.0, + "kl": 0.34954798221588135, + "learning_rate": 8.523698004379875e-07, + "logits/chosen": -68105216.0, + "logits/rejected": -96775000.61538461, + "logps/chosen": -212.20703125, + "logps/rejected": -127.7769305889423, + "loss": 3.1649, + "rewards/chosen": -0.4285718516299599, + "rewards/margins": 1.7672414007457162, + "rewards/rejected": -2.195813252375676, + "step": 269 + }, + { + "epoch": 8.712550607287449, + "grad_norm": 231.0, + "kl": 0.0, + "learning_rate": 8.499908257391323e-07, + "logits/chosen": -68097083.07692307, + "logits/rejected": -94308720.64, + "logps/chosen": -155.97443409455127, + "logps/rejected": -113.86224609375, + "loss": 3.1306, + "rewards/chosen": -0.42843324710161257, + "rewards/margins": 1.5763363451835435, + "rewards/rejected": -2.004769592285156, + "step": 270 + }, + { + "epoch": 8.744939271255062, + "grad_norm": 236.0, + "kl": 0.0, + "learning_rate": 8.475962138373212e-07, + "logits/chosen": -71194480.0, + "logits/rejected": -96231992.0, + "logps/chosen": -178.3039093017578, + "logps/rejected": -141.05906677246094, + "loss": 3.1401, + "rewards/chosen": -0.7390018701553345, + "rewards/margins": 1.5560022592544556, + "rewards/rejected": -2.29500412940979, + "step": 271 + }, + { + "epoch": 8.777327935222672, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 8.451860717211652e-07, + "logits/chosen": -65470554.35294118, + "logits/rejected": -91992644.26666667, + "logps/chosen": -219.72277113970588, + "logps/rejected": -130.14352213541667, + "loss": 3.1629, + "rewards/chosen": -0.2704560055452235, + "rewards/margins": 1.8472819552702064, + "rewards/rejected": -2.11773796081543, + "step": 272 + }, + { + "epoch": 8.809716599190283, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 8.427605070731481e-07, + "logits/chosen": -80051561.93103448, + "logits/rejected": -96999665.37142856, + "logps/chosen": -200.22407058189654, + "logps/rejected": -144.83091517857142, + "loss": 3.0988, + "rewards/chosen": -1.1391406881398167, + "rewards/margins": 0.7605528281827278, + "rewards/rejected": -1.8996935163225446, + "step": 273 + }, + { + "epoch": 8.842105263157894, + "grad_norm": 174.0, + "kl": 0.33903443813323975, + "learning_rate": 8.403196282648155e-07, + "logits/chosen": -61416080.0, + "logits/rejected": -103547584.0, + "logps/chosen": -258.36090087890625, + "logps/rejected": -125.1744613647461, + "loss": 3.1583, + "rewards/chosen": -0.18237808346748352, + "rewards/margins": 2.1112501323223114, + "rewards/rejected": -2.293628215789795, + "step": 274 + }, + { + "epoch": 8.874493927125506, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 8.378635443519326e-07, + "logits/chosen": -58604956.44444445, + "logits/rejected": -95174281.14285715, + "logps/chosen": -247.49026150173611, + "logps/rejected": -134.39338030133928, + "loss": 3.176, + "rewards/chosen": -0.12835978137122261, + "rewards/margins": 1.9639225138558283, + "rewards/rejected": -2.092282295227051, + "step": 275 + }, + { + "epoch": 8.906882591093117, + "grad_norm": 158.0, + "kl": 0.0, + "learning_rate": 8.353923650696117e-07, + "logits/chosen": -58184314.88, + "logits/rejected": -101710185.02564102, + "logps/chosen": -258.45134765625, + "logps/rejected": -152.23571464342947, + "loss": 2.9285, + "rewards/chosen": -0.3872526550292969, + "rewards/margins": 2.108552351731521, + "rewards/rejected": -2.4958050067608175, + "step": 276 + }, + { + "epoch": 8.939271255060728, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 8.329062008274098e-07, + "logits/chosen": -77036032.0, + "logits/rejected": -98802899.86206897, + "logps/chosen": -178.03736049107144, + "logps/rejected": -138.2449740705819, + "loss": 3.1156, + "rewards/chosen": -0.4681967054094587, + "rewards/margins": 1.6298560100235964, + "rewards/rejected": -2.098052715433055, + "step": 277 + }, + { + "epoch": 8.97165991902834, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 8.304051627043951e-07, + "logits/chosen": -70749624.8888889, + "logits/rejected": -99539317.62162162, + "logps/chosen": -170.75705295138889, + "logps/rejected": -135.2934966216216, + "loss": 3.0087, + "rewards/chosen": -0.5229519384878653, + "rewards/margins": 1.5503900497405976, + "rewards/rejected": -2.0733419882284627, + "step": 278 + }, + { + "epoch": 9.0, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 8.278893624441847e-07, + "logits/chosen": -56785340.631578945, + "logits/rejected": -96406839.6521739, + "logps/chosen": -136.22370990953948, + "logps/rejected": -122.41340438179348, + "loss": 3.1086, + "rewards/chosen": -0.8402846486944902, + "rewards/margins": 1.1428296767874222, + "rewards/rejected": -1.9831143254819124, + "step": 279 + }, + { + "epoch": 9.03238866396761, + "grad_norm": 214.0, + "kl": 0.0, + "learning_rate": 8.253589124499511e-07, + "logits/chosen": -61582000.941176474, + "logits/rejected": -94555818.66666667, + "logps/chosen": -214.94738051470588, + "logps/rejected": -110.60725911458333, + "loss": 3.1054, + "rewards/chosen": -0.41663164250990925, + "rewards/margins": 1.2207698111440621, + "rewards/rejected": -1.6374014536539714, + "step": 280 + }, + { + "epoch": 9.064777327935223, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 8.228139257794012e-07, + "logits/chosen": -66101383.52941176, + "logits/rejected": -86001049.6, + "logps/chosen": -228.7639590992647, + "logps/rejected": -110.32359212239584, + "loss": 3.1086, + "rewards/chosen": -0.4813748527975643, + "rewards/margins": 0.9178199917662377, + "rewards/rejected": -1.399194844563802, + "step": 281 + }, + { + "epoch": 9.097165991902834, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 8.202545161397241e-07, + "logits/chosen": -66456524.8, + "logits/rejected": -102268438.58823529, + "logps/chosen": -220.43671875, + "logps/rejected": -133.09693818933823, + "loss": 3.0521, + "rewards/chosen": -0.33482268651326497, + "rewards/margins": 2.022627462125292, + "rewards/rejected": -2.357450148638557, + "step": 282 + }, + { + "epoch": 9.129554655870445, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 8.176807978825118e-07, + "logits/chosen": -66117100.307692304, + "logits/rejected": -96738580.48, + "logps/chosen": -237.09620392628204, + "logps/rejected": -123.557412109375, + "loss": 3.2924, + "rewards/chosen": -0.2630309324998122, + "rewards/margins": 1.6720119446974535, + "rewards/rejected": -1.9350428771972656, + "step": 283 + }, + { + "epoch": 9.161943319838057, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 8.150928859986487e-07, + "logits/chosen": -56772281.37931035, + "logits/rejected": -96661577.14285715, + "logps/chosen": -210.34304283405172, + "logps/rejected": -130.77176339285714, + "loss": 2.9648, + "rewards/chosen": -0.10446934864438813, + "rewards/margins": 1.918780686232844, + "rewards/rejected": -2.023250034877232, + "step": 284 + }, + { + "epoch": 9.194331983805668, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 8.124908961131757e-07, + "logits/chosen": -54764007.22580645, + "logits/rejected": -92978703.51515152, + "logps/chosen": -158.61907762096774, + "logps/rejected": -115.82316080729167, + "loss": 3.1142, + "rewards/chosen": -0.040489827432940086, + "rewards/margins": 2.036450073283206, + "rewards/rejected": -2.076939900716146, + "step": 285 + }, + { + "epoch": 9.226720647773279, + "grad_norm": 217.0, + "kl": 0.0, + "learning_rate": 8.098749444801224e-07, + "logits/chosen": -63514360.24242424, + "logits/rejected": -89530244.12903225, + "logps/chosen": -148.8549952651515, + "logps/rejected": -148.2848097278226, + "loss": 3.1043, + "rewards/chosen": -0.41156893065481476, + "rewards/margins": 1.5825537283516118, + "rewards/rejected": -1.9941226590064265, + "step": 286 + }, + { + "epoch": 9.259109311740891, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 8.072451479773143e-07, + "logits/chosen": -51306637.24137931, + "logits/rejected": -98705554.28571428, + "logps/chosen": -237.14225821659483, + "logps/rejected": -135.66897321428573, + "loss": 3.0743, + "rewards/chosen": -0.07538544720616834, + "rewards/margins": 2.1976899835276487, + "rewards/rejected": -2.273075430733817, + "step": 287 + }, + { + "epoch": 9.291497975708502, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 8.0460162410115e-07, + "logits/chosen": -74732025.26315789, + "logits/rejected": -95007300.92307693, + "logps/chosen": -186.92341694078948, + "logps/rejected": -147.9090857872596, + "loss": 3.0931, + "rewards/chosen": -0.7436988730179636, + "rewards/margins": 1.54897526497783, + "rewards/rejected": -2.2926741379957933, + "step": 288 + }, + { + "epoch": 9.323886639676113, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 8.019444909613521e-07, + "logits/chosen": -70447159.6521739, + "logits/rejected": -96842964.29268293, + "logps/chosen": -151.52042289402175, + "logps/rejected": -122.7792373285061, + "loss": 2.9209, + "rewards/chosen": -1.0707318679146145, + "rewards/margins": 1.2130315523764488, + "rewards/rejected": -2.2837634202910633, + "step": 289 + }, + { + "epoch": 9.356275303643725, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 7.992738672756908e-07, + "logits/chosen": -61173346.74285714, + "logits/rejected": -91356672.0, + "logps/chosen": -195.72730189732144, + "logps/rejected": -138.0972521551724, + "loss": 3.1156, + "rewards/chosen": -0.23342056274414064, + "rewards/margins": 1.7820309343009162, + "rewards/rejected": -2.0154514970450568, + "step": 290 + }, + { + "epoch": 9.388663967611336, + "grad_norm": 185.0, + "kl": 0.0, + "learning_rate": 7.965898723646776e-07, + "logits/chosen": -61117956.571428575, + "logits/rejected": -91804181.33333333, + "logps/chosen": -285.02743094308033, + "logps/rejected": -116.11239963107639, + "loss": 3.0761, + "rewards/chosen": -0.21966862678527832, + "rewards/margins": 1.6433917946285672, + "rewards/rejected": -1.8630604214138455, + "step": 291 + }, + { + "epoch": 9.421052631578947, + "grad_norm": 280.0, + "kl": 0.0, + "learning_rate": 7.938926261462365e-07, + "logits/chosen": -67616483.55555555, + "logits/rejected": -98708756.75675675, + "logps/chosen": -255.13259548611111, + "logps/rejected": -134.1481735641892, + "loss": 3.0287, + "rewards/chosen": -0.5619420652036313, + "rewards/margins": 1.587807896855596, + "rewards/rejected": -2.1497499620592273, + "step": 292 + }, + { + "epoch": 9.45344129554656, + "grad_norm": 164.0, + "kl": 0.0, + "learning_rate": 7.911822491303452e-07, + "logits/chosen": -53335927.46666667, + "logits/rejected": -95898789.64705883, + "logps/chosen": -255.715966796875, + "logps/rejected": -110.31078383501838, + "loss": 2.9834, + "rewards/chosen": -0.15515419642130535, + "rewards/margins": 2.007576301051121, + "rewards/rejected": -2.1627304974724266, + "step": 293 + }, + { + "epoch": 9.48582995951417, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 7.884588624136503e-07, + "logits/chosen": -57378785.28, + "logits/rejected": -98681816.61538461, + "logps/chosen": -239.1314453125, + "logps/rejected": -148.2501502403846, + "loss": 3.0397, + "rewards/chosen": -0.5532550811767578, + "rewards/margins": 1.6453546866392479, + "rewards/rejected": -2.1986097678160057, + "step": 294 + }, + { + "epoch": 9.518218623481781, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 7.857225876740583e-07, + "logits/chosen": -64545652.36363637, + "logits/rejected": -102624454.19354838, + "logps/chosen": -247.0961766098485, + "logps/rejected": -143.7275863155242, + "loss": 3.041, + "rewards/chosen": -0.4308715011134292, + "rewards/margins": 1.9680230242299426, + "rewards/rejected": -2.3988945253433718, + "step": 295 + }, + { + "epoch": 9.550607287449393, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 7.829735471652977e-07, + "logits/chosen": -65327657.29032258, + "logits/rejected": -98793673.6969697, + "logps/chosen": -176.57855027721774, + "logps/rejected": -151.97844164299244, + "loss": 3.1413, + "rewards/chosen": -0.2131201067278462, + "rewards/margins": 1.9370069573701651, + "rewards/rejected": -2.1501270640980112, + "step": 296 + }, + { + "epoch": 9.582995951417004, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 7.802118637114573e-07, + "logits/chosen": -58608431.15789474, + "logits/rejected": -96020548.92307693, + "logps/chosen": -248.18331106085526, + "logps/rejected": -132.69559420072116, + "loss": 3.08, + "rewards/chosen": -0.27356298346268504, + "rewards/margins": 1.5984983791706533, + "rewards/rejected": -1.8720613626333384, + "step": 297 + }, + { + "epoch": 9.615384615384615, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 7.774376607014994e-07, + "logits/chosen": -66192584.0, + "logits/rejected": -106159160.0, + "logps/chosen": -252.6024932861328, + "logps/rejected": -146.11524963378906, + "loss": 3.0409, + "rewards/chosen": -0.2219638079404831, + "rewards/margins": 2.262425258755684, + "rewards/rejected": -2.484389066696167, + "step": 298 + }, + { + "epoch": 9.647773279352228, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 7.746510620837458e-07, + "logits/chosen": -76464098.74285714, + "logits/rejected": -95603005.79310344, + "logps/chosen": -177.7017857142857, + "logps/rejected": -145.97341392780172, + "loss": 3.059, + "rewards/chosen": -1.0872510637555803, + "rewards/margins": 0.9506875475051955, + "rewards/rejected": -2.037938611260776, + "step": 299 + }, + { + "epoch": 9.680161943319838, + "grad_norm": 166.0, + "kl": 0.3986997604370117, + "learning_rate": 7.718521923603404e-07, + "logits/chosen": -68340021.89473684, + "logits/rejected": -97238488.61538461, + "logps/chosen": -212.5210603412829, + "logps/rejected": -128.78448955829327, + "loss": 3.0978, + "rewards/chosen": -0.45997343565288346, + "rewards/margins": 1.8365966820041175, + "rewards/rejected": -2.296570117657001, + "step": 300 + }, + { + "epoch": 9.712550607287449, + "grad_norm": 226.0, + "kl": 0.0, + "learning_rate": 7.690411765816864e-07, + "logits/chosen": -68242195.6923077, + "logits/rejected": -94812928.0, + "logps/chosen": -156.16728014823718, + "logps/rejected": -114.796640625, + "loss": 3.0423, + "rewards/chosen": -0.44771796006422776, + "rewards/margins": 1.6504920313908504, + "rewards/rejected": -2.0982099914550782, + "step": 301 + }, + { + "epoch": 9.744939271255062, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 7.662181403408592e-07, + "logits/chosen": -71550824.0, + "logits/rejected": -96827824.0, + "logps/chosen": -178.41879272460938, + "logps/rejected": -142.3582763671875, + "loss": 3.0433, + "rewards/chosen": -0.7504904270172119, + "rewards/margins": 1.6744349002838135, + "rewards/rejected": -2.4249253273010254, + "step": 302 + }, + { + "epoch": 9.777327935222672, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 7.633832097679957e-07, + "logits/chosen": -65655808.0, + "logits/rejected": -92415803.73333333, + "logps/chosen": -220.0100815716912, + "logps/rejected": -130.86422526041667, + "loss": 3.0974, + "rewards/chosen": -0.29918743582332835, + "rewards/margins": 1.8906209646486767, + "rewards/rejected": -2.189808400472005, + "step": 303 + }, + { + "epoch": 9.809716599190283, + "grad_norm": 192.0, + "kl": 0.0, + "learning_rate": 7.60536511524658e-07, + "logits/chosen": -80359909.51724137, + "logits/rejected": -97658046.17142858, + "logps/chosen": -200.64870689655172, + "logps/rejected": -145.7033203125, + "loss": 3.0474, + "rewards/chosen": -1.181604056522764, + "rewards/margins": 0.8053302783684191, + "rewards/rejected": -1.986934334891183, + "step": 304 + }, + { + "epoch": 9.842105263157894, + "grad_norm": 170.0, + "kl": 0.38847053050994873, + "learning_rate": 7.576781727981749e-07, + "logits/chosen": -61844200.0, + "logits/rejected": -104248728.0, + "logps/chosen": -258.5477294921875, + "logps/rejected": -126.07926940917969, + "loss": 3.1096, + "rewards/chosen": -0.20106235146522522, + "rewards/margins": 2.1830473840236664, + "rewards/rejected": -2.3841097354888916, + "step": 305 + }, + { + "epoch": 9.874493927125506, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 7.548083212959587e-07, + "logits/chosen": -58909134.222222224, + "logits/rejected": -95752502.85714285, + "logps/chosen": -247.49329969618054, + "logps/rejected": -135.36624581473214, + "loss": 3.1361, + "rewards/chosen": -0.1286644140879313, + "rewards/margins": 2.060904786700294, + "rewards/rejected": -2.1895692007882253, + "step": 306 + }, + { + "epoch": 9.906882591093117, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 7.519270852398001e-07, + "logits/chosen": -58490521.6, + "logits/rejected": -102195521.64102565, + "logps/chosen": -258.3875, + "logps/rejected": -153.12002954727564, + "loss": 2.8674, + "rewards/chosen": -0.38086856842041017, + "rewards/margins": 2.2033691416031274, + "rewards/rejected": -2.5842377100235376, + "step": 307 + }, + { + "epoch": 9.939271255060728, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 7.490345933601394e-07, + "logits/chosen": -77243509.02857143, + "logits/rejected": -99201341.79310344, + "logps/chosen": -178.14437779017857, + "logps/rejected": -139.1087015086207, + "loss": 3.0598, + "rewards/chosen": -0.4788997650146484, + "rewards/margins": 1.7055259836131127, + "rewards/rejected": -2.184425748627761, + "step": 308 + }, + { + "epoch": 9.97165991902834, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 7.461309748903137e-07, + "logits/chosen": -71162638.22222222, + "logits/rejected": -99941209.94594595, + "logps/chosen": -171.35163483796296, + "logps/rejected": -136.0713550464527, + "loss": 2.9573, + "rewards/chosen": -0.582410247237594, + "rewards/margins": 1.5687165112347456, + "rewards/rejected": -2.1511267584723397, + "step": 309 + }, + { + "epoch": 10.0, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 7.43216359560785e-07, + "logits/chosen": -56970293.89473684, + "logits/rejected": -96843464.3478261, + "logps/chosen": -136.66714638157896, + "logps/rejected": -123.33866550611413, + "loss": 3.0628, + "rewards/chosen": -0.884627994738127, + "rewards/margins": 1.191012808059937, + "rewards/rejected": -2.075640802798064, + "step": 310 + }, + { + "epoch": 10.03238866396761, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 7.402908775933419e-07, + "logits/chosen": -61779237.64705882, + "logits/rejected": -95183308.8, + "logps/chosen": -214.87415268841912, + "logps/rejected": -111.50768229166667, + "loss": 3.0712, + "rewards/chosen": -0.40930919086231904, + "rewards/margins": 1.3181350128323426, + "rewards/rejected": -1.7274442036946616, + "step": 311 + }, + { + "epoch": 10.064777327935223, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 7.373546596952827e-07, + "logits/chosen": -66141816.47058824, + "logits/rejected": -86566493.86666666, + "logps/chosen": -229.13196518841912, + "logps/rejected": -110.87522786458334, + "loss": 3.0742, + "rewards/chosen": -0.51817585440243, + "rewards/margins": 0.9361826915366975, + "rewards/rejected": -1.4543585459391275, + "step": 312 + }, + { + "epoch": 10.097165991902834, + "grad_norm": 200.0, + "kl": 0.0, + "learning_rate": 7.344078370535755e-07, + "logits/chosen": -66708241.06666667, + "logits/rejected": -102690183.52941176, + "logps/chosen": -220.53509114583332, + "logps/rejected": -133.85311351102942, + "loss": 2.9652, + "rewards/chosen": -0.3446622848510742, + "rewards/margins": 2.0884050369262694, + "rewards/rejected": -2.4330673217773438, + "step": 313 + }, + { + "epoch": 10.129554655870445, + "grad_norm": 223.0, + "kl": 0.0, + "learning_rate": 7.314505413289963e-07, + "logits/chosen": -66415760.41025641, + "logits/rejected": -97746411.52, + "logps/chosen": -236.982421875, + "logps/rejected": -124.301083984375, + "loss": 3.2645, + "rewards/chosen": -0.2516512993054512, + "rewards/margins": 1.7577592536730646, + "rewards/rejected": -2.009410552978516, + "step": 314 + }, + { + "epoch": 10.161943319838057, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 7.284829046502467e-07, + "logits/chosen": -57094033.655172415, + "logits/rejected": -97199389.25714286, + "logps/chosen": -210.60376818426724, + "logps/rejected": -131.71128627232142, + "loss": 2.9492, + "rewards/chosen": -0.13054232761777682, + "rewards/margins": 1.986657815378875, + "rewards/rejected": -2.117200142996652, + "step": 315 + }, + { + "epoch": 10.194331983805668, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 7.255050596080509e-07, + "logits/chosen": -54756698.838709675, + "logits/rejected": -93596516.84848484, + "logps/chosen": -158.54352003528226, + "logps/rejected": -116.5855379971591, + "loss": 3.0725, + "rewards/chosen": -0.03293378506937335, + "rewards/margins": 2.120244112066043, + "rewards/rejected": -2.1531778971354165, + "step": 316 + }, + { + "epoch": 10.226720647773279, + "grad_norm": 224.0, + "kl": 0.0, + "learning_rate": 7.225171392492315e-07, + "logits/chosen": -63881801.696969695, + "logits/rejected": -89890758.19354838, + "logps/chosen": -149.04974550189394, + "logps/rejected": -149.1017830141129, + "loss": 3.0462, + "rewards/chosen": -0.43104382717248163, + "rewards/margins": 1.6447761112410768, + "rewards/rejected": -2.0758199384135585, + "step": 317 + }, + { + "epoch": 10.259109311740891, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 7.195192770707654e-07, + "logits/chosen": -51546081.10344828, + "logits/rejected": -99088603.42857143, + "logps/chosen": -236.8742086476293, + "logps/rejected": -136.63518415178572, + "loss": 3.0227, + "rewards/chosen": -0.04858063007223195, + "rewards/margins": 2.321115196866942, + "rewards/rejected": -2.369695826939174, + "step": 318 + }, + { + "epoch": 10.291497975708502, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 7.165116070138182e-07, + "logits/chosen": -74903653.05263157, + "logits/rejected": -95339264.0, + "logps/chosen": -187.0942254317434, + "logps/rejected": -149.06288499098557, + "loss": 3.0731, + "rewards/chosen": -0.7607813383403578, + "rewards/margins": 1.6472719864324037, + "rewards/rejected": -2.4080533247727613, + "step": 319 + }, + { + "epoch": 10.323886639676113, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 7.134942634577615e-07, + "logits/chosen": -70849914.43478261, + "logits/rejected": -97451944.58536585, + "logps/chosen": -151.50901197350544, + "logps/rejected": -123.82236089939025, + "loss": 2.8709, + "rewards/chosen": -1.0695920197860054, + "rewards/margins": 1.318482715529829, + "rewards/rejected": -2.3880747353158345, + "step": 320 + }, + { + "epoch": 10.356275303643725, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 7.104673812141675e-07, + "logits/chosen": -61415014.4, + "logits/rejected": -91735869.79310344, + "logps/chosen": -195.57022879464284, + "logps/rejected": -139.26278791756465, + "loss": 3.0755, + "rewards/chosen": -0.21771132605416435, + "rewards/margins": 1.9142937735383734, + "rewards/rejected": -2.1320050995925377, + "step": 321 + }, + { + "epoch": 10.388663967611336, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 7.074310955207868e-07, + "logits/chosen": -61545709.71428572, + "logits/rejected": -92193251.55555555, + "logps/chosen": -285.07822963169644, + "logps/rejected": -117.01312934027777, + "loss": 3.0344, + "rewards/chosen": -0.22474472863333567, + "rewards/margins": 1.7283889603993248, + "rewards/rejected": -1.9531336890326605, + "step": 322 + }, + { + "epoch": 10.421052631578947, + "grad_norm": 360.0, + "kl": 0.0, + "learning_rate": 7.04385542035506e-07, + "logits/chosen": -67899150.22222222, + "logits/rejected": -99048669.4054054, + "logps/chosen": -255.69565610532408, + "logps/rejected": -134.87616131756758, + "loss": 2.9983, + "rewards/chosen": -0.6182486216227213, + "rewards/margins": 1.6043022602528065, + "rewards/rejected": -2.2225508818755277, + "step": 323 + }, + { + "epoch": 10.45344129554656, + "grad_norm": 167.0, + "kl": 0.0, + "learning_rate": 7.013308568302854e-07, + "logits/chosen": -53443707.733333334, + "logits/rejected": -96427309.1764706, + "logps/chosen": -255.70485026041666, + "logps/rejected": -111.05091050091912, + "loss": 2.9387, + "rewards/chosen": -0.15404144922892252, + "rewards/margins": 2.082700907015333, + "rewards/rejected": -2.2367423562442554, + "step": 324 + }, + { + "epoch": 10.48582995951417, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 6.982671763850814e-07, + "logits/chosen": -57626496.0, + "logits/rejected": -99155147.48717949, + "logps/chosen": -239.3466015625, + "logps/rejected": -149.0160632011218, + "loss": 2.9875, + "rewards/chosen": -0.574769287109375, + "rewards/margins": 1.700430211776342, + "rewards/rejected": -2.275199498885717, + "step": 325 + }, + { + "epoch": 10.518218623481781, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 6.951946375817474e-07, + "logits/chosen": -64868639.03030303, + "logits/rejected": -103222321.5483871, + "logps/chosen": -247.205078125, + "logps/rejected": -144.45980342741936, + "loss": 2.976, + "rewards/chosen": -0.4417598608768348, + "rewards/margins": 2.030354456001945, + "rewards/rejected": -2.47211431687878, + "step": 326 + }, + { + "epoch": 10.550607287449393, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 6.921133776979186e-07, + "logits/chosen": -65694720.0, + "logits/rejected": -99278219.63636364, + "logps/chosen": -176.5769279233871, + "logps/rejected": -152.66569010416666, + "loss": 3.0862, + "rewards/chosen": -0.2129576744571809, + "rewards/margins": 2.0058928948343673, + "rewards/rejected": -2.2188505692915483, + "step": 327 + }, + { + "epoch": 10.582995951417004, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 6.890235344008781e-07, + "logits/chosen": -58686329.2631579, + "logits/rejected": -96480009.84615384, + "logps/chosen": -247.99038856907896, + "logps/rejected": -133.462890625, + "loss": 3.0455, + "rewards/chosen": -0.2542699512682463, + "rewards/margins": 1.694520378884999, + "rewards/rejected": -1.9487903301532452, + "step": 328 + }, + { + "epoch": 10.615384615384615, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 6.859252457414066e-07, + "logits/chosen": -66534960.0, + "logits/rejected": -106542016.0, + "logps/chosen": -252.64999389648438, + "logps/rejected": -146.95968627929688, + "loss": 2.9967, + "rewards/chosen": -0.2267126739025116, + "rewards/margins": 2.3421192467212677, + "rewards/rejected": -2.5688319206237793, + "step": 329 + }, + { + "epoch": 10.647773279352228, + "grad_norm": 152.0, + "kl": 0.0, + "learning_rate": 6.828186501476144e-07, + "logits/chosen": -76715066.51428571, + "logits/rejected": -96186862.34482759, + "logps/chosen": -177.94132254464284, + "logps/rejected": -146.52976831896552, + "loss": 3.0052, + "rewards/chosen": -1.1112047467912947, + "rewards/margins": 0.9823705663821969, + "rewards/rejected": -2.0935753131734915, + "step": 330 + }, + { + "epoch": 10.680161943319838, + "grad_norm": 184.0, + "kl": 0.3870738744735718, + "learning_rate": 6.797038864187563e-07, + "logits/chosen": -68528949.89473684, + "logits/rejected": -97728482.46153846, + "logps/chosen": -212.32182874177633, + "logps/rejected": -129.7121863731971, + "loss": 3.0682, + "rewards/chosen": -0.44005047647576584, + "rewards/margins": 1.9492892639839696, + "rewards/rejected": -2.3893397404597354, + "step": 331 + }, + { + "epoch": 10.712550607287449, + "grad_norm": 236.0, + "kl": 0.0, + "learning_rate": 6.765810937190306e-07, + "logits/chosen": -68265472.0, + "logits/rejected": -95188264.96, + "logps/chosen": -156.35124949919873, + "logps/rejected": -115.59439453125, + "loss": 2.999, + "rewards/chosen": -0.46611477778508115, + "rewards/margins": 1.7118698413555438, + "rewards/rejected": -2.177984619140625, + "step": 332 + }, + { + "epoch": 10.744939271255062, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 6.734504115713602e-07, + "logits/chosen": -71831984.0, + "logits/rejected": -97308848.0, + "logps/chosen": -178.6875762939453, + "logps/rejected": -142.67935180664062, + "loss": 3.021, + "rewards/chosen": -0.7773687243461609, + "rewards/margins": 1.6796627640724182, + "rewards/rejected": -2.457031488418579, + "step": 333 + }, + { + "epoch": 10.777327935222672, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 6.703119798511612e-07, + "logits/chosen": -65919126.5882353, + "logits/rejected": -92876680.53333333, + "logps/chosen": -220.09176815257354, + "logps/rejected": -131.73116861979167, + "loss": 3.0617, + "rewards/chosen": -0.3073549551122329, + "rewards/margins": 1.9691472726709702, + "rewards/rejected": -2.276502227783203, + "step": 334 + }, + { + "epoch": 10.809716599190283, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 6.671659387800908e-07, + "logits/chosen": -80524438.06896552, + "logits/rejected": -97971295.08571428, + "logps/chosen": -201.07888267780172, + "logps/rejected": -146.3240234375, + "loss": 2.9973, + "rewards/chosen": -1.2246241076239224, + "rewards/margins": 0.8243818912600063, + "rewards/rejected": -2.0490059988839286, + "step": 335 + }, + { + "epoch": 10.842105263157894, + "grad_norm": 181.0, + "kl": 0.40821921825408936, + "learning_rate": 6.640124289197845e-07, + "logits/chosen": -61865056.0, + "logits/rejected": -104776096.0, + "logps/chosen": -258.7056579589844, + "logps/rejected": -126.95762634277344, + "loss": 3.0811, + "rewards/chosen": -0.21685275435447693, + "rewards/margins": 2.2550927698612213, + "rewards/rejected": -2.4719455242156982, + "step": 336 + }, + { + "epoch": 10.874493927125506, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 6.608515911655743e-07, + "logits/chosen": -59061923.55555555, + "logits/rejected": -96263698.28571428, + "logps/chosen": -247.29961480034723, + "logps/rejected": -136.06971958705358, + "loss": 3.0953, + "rewards/chosen": -0.10929477214813232, + "rewards/margins": 2.150621669633048, + "rewards/rejected": -2.25991644178118, + "step": 337 + }, + { + "epoch": 10.906882591093117, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 6.576835667401952e-07, + "logits/chosen": -58525066.24, + "logits/rejected": -102586696.20512821, + "logps/chosen": -258.4833984375, + "logps/rejected": -154.0860877403846, + "loss": 2.8202, + "rewards/chosen": -0.39045482635498047, + "rewards/margins": 2.2903876466017503, + "rewards/rejected": -2.680842472956731, + "step": 338 + }, + { + "epoch": 10.939271255060728, + "grad_norm": 192.0, + "kl": 0.0, + "learning_rate": 6.545084971874736e-07, + "logits/chosen": -77362176.0, + "logits/rejected": -99851378.7586207, + "logps/chosen": -178.48032924107142, + "logps/rejected": -139.7308560075431, + "loss": 3.0113, + "rewards/chosen": -0.5124941689627511, + "rewards/margins": 1.7341473189480787, + "rewards/rejected": -2.24664148791083, + "step": 339 + }, + { + "epoch": 10.97165991902834, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 6.513265243660057e-07, + "logits/chosen": -71298962.96296297, + "logits/rejected": -100436203.24324325, + "logps/chosen": -171.26258680555554, + "logps/rejected": -137.03676625844594, + "loss": 2.9469, + "rewards/chosen": -0.5735058961091218, + "rewards/margins": 1.6741628083619506, + "rewards/rejected": -2.2476687044710726, + "step": 340 + }, + { + "epoch": 11.0, + "grad_norm": 174.0, + "kl": 0.0, + "learning_rate": 6.48137790442817e-07, + "logits/chosen": -57239383.578947365, + "logits/rejected": -97043244.52173913, + "logps/chosen": -136.87861071134867, + "logps/rejected": -124.2124660326087, + "loss": 3.0227, + "rewards/chosen": -0.9057742670962685, + "rewards/margins": 1.2572469864066187, + "rewards/rejected": -2.163021253502887, + "step": 341 + }, + { + "epoch": 11.03238866396761, + "grad_norm": 210.0, + "kl": 0.0, + "learning_rate": 6.449424378870122e-07, + "logits/chosen": -61877609.4117647, + "logits/rejected": -95455402.66666667, + "logps/chosen": -215.1293227251838, + "logps/rejected": -111.83929036458333, + "loss": 3.0175, + "rewards/chosen": -0.43482617770924287, + "rewards/margins": 1.3257803337246765, + "rewards/rejected": -1.7606065114339193, + "step": 342 + }, + { + "epoch": 11.064777327935223, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 6.417406094634089e-07, + "logits/chosen": -66498650.35294118, + "logits/rejected": -87018538.66666667, + "logps/chosen": -229.10110294117646, + "logps/rejected": -111.25556640625, + "loss": 3.0131, + "rewards/chosen": -0.5150904935948989, + "rewards/margins": 0.9773019828048407, + "rewards/rejected": -1.4923924763997396, + "step": 343 + }, + { + "epoch": 11.097165991902834, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 6.385324482261596e-07, + "logits/chosen": -66936622.93333333, + "logits/rejected": -103119728.94117647, + "logps/chosen": -220.70305989583332, + "logps/rejected": -134.835693359375, + "loss": 2.9157, + "rewards/chosen": -0.3614577611287435, + "rewards/margins": 2.169866737664915, + "rewards/rejected": -2.531324498793658, + "step": 344 + }, + { + "epoch": 11.129554655870445, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 6.353180975123594e-07, + "logits/chosen": -66439358.35897436, + "logits/rejected": -97964892.16, + "logps/chosen": -237.02393830128204, + "logps/rejected": -124.96294921875, + "loss": 3.1997, + "rewards/chosen": -0.25580261915158004, + "rewards/margins": 1.8197936943249824, + "rewards/rejected": -2.0755963134765625, + "step": 345 + }, + { + "epoch": 11.161943319838057, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 6.32097700935643e-07, + "logits/chosen": -56968518.62068965, + "logits/rejected": -97509844.11428571, + "logps/chosen": -210.48178205818965, + "logps/rejected": -132.1926478794643, + "loss": 2.9007, + "rewards/chosen": -0.11834272845038052, + "rewards/margins": 2.0469949703498425, + "rewards/rejected": -2.165337698800223, + "step": 346 + }, + { + "epoch": 11.194331983805668, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 6.288714023797671e-07, + "logits/chosen": -54813060.12903226, + "logits/rejected": -93902157.57575758, + "logps/chosen": -158.27471333165323, + "logps/rejected": -117.26080137310606, + "loss": 3.0155, + "rewards/chosen": -0.006054014928879276, + "rewards/margins": 2.21465095962131, + "rewards/rejected": -2.2207049745501894, + "step": 347 + }, + { + "epoch": 11.226720647773279, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 6.256393459921824e-07, + "logits/chosen": -64142324.36363637, + "logits/rejected": -90090743.74193548, + "logps/chosen": -149.2150361032197, + "logps/rejected": -149.71875, + "loss": 2.9987, + "rewards/chosen": -0.44757247693610913, + "rewards/margins": 1.689944898394662, + "rewards/rejected": -2.1375173753307712, + "step": 348 + }, + { + "epoch": 11.259109311740891, + "grad_norm": 169.0, + "kl": 0.0, + "learning_rate": 6.224016761775932e-07, + "logits/chosen": -51707489.10344828, + "logits/rejected": -99570278.4, + "logps/chosen": -236.94027815193965, + "logps/rejected": -137.37063337053573, + "loss": 2.9742, + "rewards/chosen": -0.05518572905967976, + "rewards/margins": 2.3880550633510347, + "rewards/rejected": -2.4432407924107142, + "step": 349 + }, + { + "epoch": 11.291497975708502, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 6.191585375915055e-07, + "logits/chosen": -75037689.26315789, + "logits/rejected": -95690781.53846154, + "logps/chosen": -187.18649773848685, + "logps/rejected": -149.31526066706732, + "loss": 3.0458, + "rewards/chosen": -0.7700092917994449, + "rewards/margins": 1.6632817766444403, + "rewards/rejected": -2.4332910684438853, + "step": 350 + }, + { + "epoch": 11.323886639676113, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 6.159100751337641e-07, + "logits/chosen": -70969266.08695652, + "logits/rejected": -97567194.53658536, + "logps/chosen": -151.87946883491847, + "logps/rejected": -124.0783036394817, + "loss": 2.8333, + "rewards/chosen": -1.1066370425016985, + "rewards/margins": 1.3070335671353062, + "rewards/rejected": -2.4136706096370046, + "step": 351 + }, + { + "epoch": 11.356275303643725, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 6.126564339420783e-07, + "logits/chosen": -61596979.2, + "logits/rejected": -92134859.03448276, + "logps/chosen": -195.90521763392857, + "logps/rejected": -139.67433324353448, + "loss": 3.037, + "rewards/chosen": -0.251210457938058, + "rewards/margins": 1.9219487796276074, + "rewards/rejected": -2.1731592375656654, + "step": 352 + }, + { + "epoch": 11.388663967611336, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 6.093977593855375e-07, + "logits/chosen": -61521147.428571425, + "logits/rejected": -92662428.44444445, + "logps/chosen": -285.13204520089283, + "logps/rejected": -117.372802734375, + "loss": 3.0021, + "rewards/chosen": -0.2301291057041713, + "rewards/margins": 1.7589708207145571, + "rewards/rejected": -1.9890999264187283, + "step": 353 + }, + { + "epoch": 11.421052631578947, + "grad_norm": 334.0, + "kl": 0.0, + "learning_rate": 6.061341970581164e-07, + "logits/chosen": -67998326.51851852, + "logits/rejected": -99411552.86486487, + "logps/chosen": -255.69585503472223, + "logps/rejected": -135.5848949535473, + "loss": 2.9729, + "rewards/chosen": -0.618268472177011, + "rewards/margins": 1.6751539174978203, + "rewards/rejected": -2.293422389674831, + "step": 354 + }, + { + "epoch": 11.45344129554656, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 6.028658927721697e-07, + "logits/chosen": -53487547.733333334, + "logits/rejected": -96852291.76470588, + "logps/chosen": -255.83855794270832, + "logps/rejected": -111.61545697380515, + "loss": 2.9072, + "rewards/chosen": -0.1674118995666504, + "rewards/margins": 2.125784385905546, + "rewards/rejected": -2.2931962854721966, + "step": 355 + }, + { + "epoch": 11.48582995951417, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 5.99592992551918e-07, + "logits/chosen": -57599313.92, + "logits/rejected": -99410294.15384616, + "logps/chosen": -239.4209765625, + "logps/rejected": -149.71891276041666, + "loss": 2.9702, + "rewards/chosen": -0.5822079086303711, + "rewards/margins": 1.763276702685234, + "rewards/rejected": -2.345484611315605, + "step": 356 + }, + { + "epoch": 11.518218623481781, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 5.963156426269227e-07, + "logits/chosen": -64905968.484848484, + "logits/rejected": -103663070.96774194, + "logps/chosen": -247.22037760416666, + "logps/rejected": -144.78213205645162, + "loss": 2.959, + "rewards/chosen": -0.4432914618289832, + "rewards/margins": 2.061058769710835, + "rewards/rejected": -2.5043502315398185, + "step": 357 + }, + { + "epoch": 11.550607287449393, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 5.930339894255532e-07, + "logits/chosen": -65530826.32258064, + "logits/rejected": -99848137.6969697, + "logps/chosen": -176.7787613407258, + "logps/rejected": -153.20124585700756, + "loss": 3.0479, + "rewards/chosen": -0.2331395149230957, + "rewards/margins": 2.039268479202733, + "rewards/rejected": -2.2724079941258286, + "step": 358 + }, + { + "epoch": 11.582995951417004, + "grad_norm": 164.0, + "kl": 0.0, + "learning_rate": 5.897481795684446e-07, + "logits/chosen": -59134396.631578945, + "logits/rejected": -96720640.0, + "logps/chosen": -248.07285670230263, + "logps/rejected": -133.74541766826923, + "loss": 3.0121, + "rewards/chosen": -0.26251848120438426, + "rewards/margins": 1.7145262845614662, + "rewards/rejected": -1.9770447657658503, + "step": 359 + }, + { + "epoch": 11.615384615384615, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 5.864583598619467e-07, + "logits/chosen": -66893916.0, + "logits/rejected": -106991936.0, + "logps/chosen": -252.66595458984375, + "logps/rejected": -147.30953979492188, + "loss": 2.9848, + "rewards/chosen": -0.22830769419670105, + "rewards/margins": 2.3755078613758087, + "rewards/rejected": -2.6038155555725098, + "step": 360 + }, + { + "epoch": 11.647773279352228, + "grad_norm": 167.0, + "kl": 0.0, + "learning_rate": 5.83164677291565e-07, + "logits/chosen": -77062253.71428572, + "logits/rejected": -96580934.62068966, + "logps/chosen": -178.29112723214286, + "logps/rejected": -147.08740234375, + "loss": 2.9673, + "rewards/chosen": -1.1461866106305802, + "rewards/margins": 1.0031522102543875, + "rewards/rejected": -2.1493388208849677, + "step": 361 + }, + { + "epoch": 11.680161943319838, + "grad_norm": 154.0, + "kl": 0.302021861076355, + "learning_rate": 5.798672790153937e-07, + "logits/chosen": -68697734.73684211, + "logits/rejected": -97920275.6923077, + "logps/chosen": -212.50377775493422, + "logps/rejected": -130.27116511418268, + "loss": 3.0491, + "rewards/chosen": -0.4582456287584807, + "rewards/margins": 1.9869927780830907, + "rewards/rejected": -2.4452384068415713, + "step": 362 + }, + { + "epoch": 11.712550607287449, + "grad_norm": 222.0, + "kl": 0.0, + "learning_rate": 5.7656631235754e-07, + "logits/chosen": -68498136.61538461, + "logits/rejected": -95319818.24, + "logps/chosen": -156.39588341346155, + "logps/rejected": -116.028857421875, + "loss": 2.999, + "rewards/chosen": -0.47057851155598956, + "rewards/margins": 1.7508523050944012, + "rewards/rejected": -2.2214308166503907, + "step": 363 + }, + { + "epoch": 11.744939271255062, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 5.732619248015434e-07, + "logits/chosen": -71797296.0, + "logits/rejected": -97528544.0, + "logps/chosen": -178.90850830078125, + "logps/rejected": -143.28900146484375, + "loss": 2.9969, + "rewards/chosen": -0.7994619011878967, + "rewards/margins": 1.718535840511322, + "rewards/rejected": -2.5179977416992188, + "step": 364 + }, + { + "epoch": 11.777327935222672, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 5.699542639837843e-07, + "logits/chosen": -65975868.23529412, + "logits/rejected": -93179699.2, + "logps/chosen": -220.08093979779412, + "logps/rejected": -131.855908203125, + "loss": 3.0389, + "rewards/chosen": -0.3062733762404498, + "rewards/margins": 1.982702911601347, + "rewards/rejected": -2.288976287841797, + "step": 365 + }, + { + "epoch": 11.809716599190283, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 5.666434776868895e-07, + "logits/chosen": -80727357.79310344, + "logits/rejected": -98277039.54285714, + "logps/chosen": -201.17682516163794, + "logps/rejected": -146.57950613839284, + "loss": 2.9803, + "rewards/chosen": -1.234417882458917, + "rewards/margins": 0.8401341630907482, + "rewards/rejected": -2.074552045549665, + "step": 366 + }, + { + "epoch": 11.842105263157894, + "grad_norm": 172.0, + "kl": 0.39742761850357056, + "learning_rate": 5.633297138331284e-07, + "logits/chosen": -61957560.0, + "logits/rejected": -105042832.0, + "logps/chosen": -258.6969299316406, + "logps/rejected": -127.34425354003906, + "loss": 3.0461, + "rewards/chosen": -0.21598157286643982, + "rewards/margins": 2.2946256697177887, + "rewards/rejected": -2.5106072425842285, + "step": 367 + }, + { + "epoch": 11.874493927125506, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 5.600131204778048e-07, + "logits/chosen": -59192512.0, + "logits/rejected": -96605540.57142857, + "logps/chosen": -247.47710503472223, + "logps/rejected": -136.70819091796875, + "loss": 3.0603, + "rewards/chosen": -0.12704458501603869, + "rewards/margins": 2.1967193985742233, + "rewards/rejected": -2.323763983590262, + "step": 368 + }, + { + "epoch": 11.906882591093117, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 5.56693845802641e-07, + "logits/chosen": -58737807.36, + "logits/rejected": -103143082.66666667, + "logps/chosen": -258.499140625, + "logps/rejected": -154.24395282451923, + "loss": 2.8116, + "rewards/chosen": -0.3920318603515625, + "rewards/margins": 2.304597394894331, + "rewards/rejected": -2.6966292552458935, + "step": 369 + }, + { + "epoch": 11.939271255060728, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 5.533720381091582e-07, + "logits/chosen": -77575789.71428572, + "logits/rejected": -100264156.68965517, + "logps/chosen": -178.55064174107142, + "logps/rejected": -140.40645204741378, + "loss": 2.9824, + "rewards/chosen": -0.5195248740059989, + "rewards/margins": 1.7946775333047502, + "rewards/rejected": -2.314202407310749, + "step": 370 + }, + { + "epoch": 11.97165991902834, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 5.500478458120493e-07, + "logits/chosen": -71578173.62962963, + "logits/rejected": -100940066.5945946, + "logps/chosen": -171.4014576099537, + "logps/rejected": -137.14829233530406, + "loss": 2.915, + "rewards/chosen": -0.5873909349794741, + "rewards/margins": 1.6714298822977642, + "rewards/rejected": -2.2588208172772384, + "step": 371 + }, + { + "epoch": 12.0, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 5.467214174325493e-07, + "logits/chosen": -57407784.421052635, + "logits/rejected": -97183799.6521739, + "logps/chosen": -137.0067716899671, + "logps/rejected": -124.547119140625, + "loss": 2.9997, + "rewards/chosen": -0.9185907464278372, + "rewards/margins": 1.2778963486171695, + "rewards/rejected": -2.196487095045007, + "step": 372 + }, + { + "epoch": 12.03238866396761, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 5.433929015917988e-07, + "logits/chosen": -62205601.88235294, + "logits/rejected": -95745962.66666667, + "logps/chosen": -215.14547909007354, + "logps/rejected": -112.37322591145833, + "loss": 3.0145, + "rewards/chosen": -0.43644265567555146, + "rewards/margins": 1.3775575376024434, + "rewards/rejected": -1.8140001932779948, + "step": 373 + }, + { + "epoch": 12.064777327935223, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 5.400624470042037e-07, + "logits/chosen": -66520688.941176474, + "logits/rejected": -87154321.06666666, + "logps/chosen": -229.30230353860293, + "logps/rejected": -111.72171223958334, + "loss": 2.9927, + "rewards/chosen": -0.5352107777315027, + "rewards/margins": 1.0037967906278722, + "rewards/rejected": -1.539007568359375, + "step": 374 + }, + { + "epoch": 12.097165991902834, + "grad_norm": 215.0, + "kl": 0.0, + "learning_rate": 5.36730202470791e-07, + "logits/chosen": -67337612.8, + "logits/rejected": -103419407.05882353, + "logps/chosen": -220.56637369791667, + "logps/rejected": -135.18458467371323, + "loss": 2.9151, + "rewards/chosen": -0.3477913538614909, + "rewards/margins": 2.218422814911487, + "rewards/rejected": -2.566214168772978, + "step": 375 + }, + { + "epoch": 12.129554655870445, + "grad_norm": 254.0, + "kl": 0.0, + "learning_rate": 5.333963168725609e-07, + "logits/chosen": -66774317.94871795, + "logits/rejected": -98000967.68, + "logps/chosen": -237.08841646634616, + "logps/rejected": -125.35671875, + "loss": 3.1906, + "rewards/chosen": -0.26224940862411106, + "rewards/margins": 1.8527251091981545, + "rewards/rejected": -2.1149745178222656, + "step": 376 + }, + { + "epoch": 12.161943319838057, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 5.300609391638335e-07, + "logits/chosen": -56944269.24137931, + "logits/rejected": -97837143.77142857, + "logps/chosen": -210.49350080818965, + "logps/rejected": -132.54422433035714, + "loss": 2.8653, + "rewards/chosen": -0.11951449821735251, + "rewards/margins": 2.080981194444478, + "rewards/rejected": -2.20049569266183, + "step": 377 + }, + { + "epoch": 12.194331983805668, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 5.267242183655961e-07, + "logits/chosen": -54866456.77419355, + "logits/rejected": -94181081.21212122, + "logps/chosen": -158.11824281754033, + "logps/rejected": -117.57677112926136, + "loss": 2.9811, + "rewards/chosen": 0.009594423155630789, + "rewards/margins": 2.261896650753879, + "rewards/rejected": -2.252302227598248, + "step": 378 + }, + { + "epoch": 12.226720647773279, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 5.233863035588426e-07, + "logits/chosen": -64303697.45454545, + "logits/rejected": -90242295.74193548, + "logps/chosen": -149.16764322916666, + "logps/rejected": -150.07774697580646, + "loss": 2.9796, + "rewards/chosen": -0.4428321664983576, + "rewards/margins": 1.7305844634159568, + "rewards/rejected": -2.1734166299143145, + "step": 379 + }, + { + "epoch": 12.259109311740891, + "grad_norm": 222.0, + "kl": 0.0, + "learning_rate": 5.200473438779146e-07, + "logits/chosen": -52087825.655172415, + "logits/rejected": -99907649.82857142, + "logps/chosen": -237.14458176185346, + "logps/rejected": -137.6441685267857, + "loss": 2.9653, + "rewards/chosen": -0.0756178724354711, + "rewards/margins": 2.394976697179484, + "rewards/rejected": -2.4705945696149554, + "step": 380 + }, + { + "epoch": 12.291497975708502, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 5.167074885038372e-07, + "logits/chosen": -75272373.89473684, + "logits/rejected": -96073432.61538461, + "logps/chosen": -187.40669973273026, + "logps/rejected": -150.01200984074518, + "loss": 3.0149, + "rewards/chosen": -0.7920285777041787, + "rewards/margins": 1.710938376453724, + "rewards/rejected": -2.5029669541579027, + "step": 381 + }, + { + "epoch": 12.323886639676113, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 5.133668866576544e-07, + "logits/chosen": -71279504.69565217, + "logits/rejected": -97946786.34146342, + "logps/chosen": -151.93790336277175, + "logps/rejected": -124.80961794969512, + "loss": 2.7985, + "rewards/chosen": -1.1124801635742188, + "rewards/margins": 1.3743215421350992, + "rewards/rejected": -2.486801705709318, + "step": 382 + }, + { + "epoch": 12.356275303643725, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 5.100256875937613e-07, + "logits/chosen": -61767467.885714285, + "logits/rejected": -92229190.62068966, + "logps/chosen": -195.62333984375, + "logps/rejected": -140.0376986799569, + "loss": 3.0236, + "rewards/chosen": -0.22302398681640626, + "rewards/margins": 1.9864712945346175, + "rewards/rejected": -2.209495281351024, + "step": 383 + }, + { + "epoch": 12.388663967611336, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 5.066840405932363e-07, + "logits/chosen": -61738985.14285714, + "logits/rejected": -92859832.8888889, + "logps/chosen": -285.20682198660717, + "logps/rejected": -117.97832573784723, + "loss": 2.9566, + "rewards/chosen": -0.2376061167035784, + "rewards/margins": 1.8120476783268038, + "rewards/rejected": -2.049653795030382, + "step": 384 + }, + { + "epoch": 12.421052631578947, + "grad_norm": 216.0, + "kl": 0.0, + "learning_rate": 5.033420949571712e-07, + "logits/chosen": -67980837.92592593, + "logits/rejected": -99720150.48648648, + "logps/chosen": -255.52560763888889, + "logps/rejected": -136.02293602195945, + "loss": 2.9316, + "rewards/chosen": -0.6012460214120371, + "rewards/margins": 1.7359806400638917, + "rewards/rejected": -2.337226661475929, + "step": 385 + }, + { + "epoch": 12.45344129554656, + "grad_norm": 157.0, + "kl": 0.0, + "learning_rate": 5e-07, + "logits/chosen": -53652727.46666667, + "logits/rejected": -96935988.70588236, + "logps/chosen": -255.8826171875, + "logps/rejected": -111.8104607077206, + "loss": 2.8778, + "rewards/chosen": -0.17181793848673502, + "rewards/margins": 2.14087885501338, + "rewards/rejected": -2.312696793500115, + "step": 386 + }, + { + "epoch": 12.48582995951417, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 4.96657905042829e-07, + "logits/chosen": -57756974.08, + "logits/rejected": -99834486.15384616, + "logps/chosen": -239.43640625, + "logps/rejected": -150.11487129407053, + "loss": 2.9352, + "rewards/chosen": -0.5837510299682617, + "rewards/margins": 1.8013283783350236, + "rewards/rejected": -2.385079408303285, + "step": 387 + }, + { + "epoch": 12.518218623481781, + "grad_norm": 174.0, + "kl": 0.0, + "learning_rate": 4.933159594067636e-07, + "logits/chosen": -65163364.84848485, + "logits/rejected": -103761845.67741935, + "logps/chosen": -247.18761837121212, + "logps/rejected": -145.4609847530242, + "loss": 2.9628, + "rewards/chosen": -0.44001206484707917, + "rewards/margins": 2.1322240270128, + "rewards/rejected": -2.572236091859879, + "step": 388 + }, + { + "epoch": 12.550607287449393, + "grad_norm": 215.0, + "kl": 0.0, + "learning_rate": 4.899743124062388e-07, + "logits/chosen": -65673224.258064516, + "logits/rejected": -99906544.48484848, + "logps/chosen": -176.6609122983871, + "logps/rejected": -153.7152284564394, + "loss": 3.0232, + "rewards/chosen": -0.22135422306676064, + "rewards/margins": 2.102451383659684, + "rewards/rejected": -2.3238056067264443, + "step": 389 + }, + { + "epoch": 12.582995951417004, + "grad_norm": 149.0, + "kl": 0.0, + "learning_rate": 4.866331133423456e-07, + "logits/chosen": -59082826.10526316, + "logits/rejected": -96785811.6923077, + "logps/chosen": -248.17495888157896, + "logps/rejected": -134.40794020432693, + "loss": 2.9573, + "rewards/chosen": -0.2727272385045102, + "rewards/margins": 1.7705692820220826, + "rewards/rejected": -2.0432965205265927, + "step": 390 + }, + { + "epoch": 12.615384615384615, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 4.832925114961628e-07, + "logits/chosen": -66944908.0, + "logits/rejected": -107288816.0, + "logps/chosen": -252.65260314941406, + "logps/rejected": -147.99545288085938, + "loss": 2.9563, + "rewards/chosen": -0.22697299718856812, + "rewards/margins": 2.445434868335724, + "rewards/rejected": -2.672407865524292, + "step": 391 + }, + { + "epoch": 12.647773279352228, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 4.799526561220855e-07, + "logits/chosen": -77148218.51428571, + "logits/rejected": -96747105.10344827, + "logps/chosen": -178.52057756696428, + "logps/rejected": -147.5990369073276, + "loss": 2.9546, + "rewards/chosen": -1.1691302708217075, + "rewards/margins": 1.031372453661388, + "rewards/rejected": -2.2005027244830955, + "step": 392 + }, + { + "epoch": 12.680161943319838, + "grad_norm": 165.0, + "kl": 0.3369103670120239, + "learning_rate": 4.766136964411575e-07, + "logits/chosen": -68810549.89473684, + "logits/rejected": -98137846.15384616, + "logps/chosen": -212.39981239720396, + "logps/rejected": -130.5043006310096, + "loss": 3.041, + "rewards/chosen": -0.4478492736816406, + "rewards/margins": 2.0207017751840444, + "rewards/rejected": -2.468551048865685, + "step": 393 + }, + { + "epoch": 12.712550607287449, + "grad_norm": 239.0, + "kl": 0.0, + "learning_rate": 4.7327578163440397e-07, + "logits/chosen": -68761127.38461539, + "logits/rejected": -95771729.92, + "logps/chosen": -156.4126477363782, + "logps/rejected": -116.554169921875, + "loss": 2.9397, + "rewards/chosen": -0.47225526662973255, + "rewards/margins": 1.80170774606558, + "rewards/rejected": -2.2739630126953125, + "step": 394 + }, + { + "epoch": 12.744939271255062, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 4.699390608361665e-07, + "logits/chosen": -72064720.0, + "logits/rejected": -97630952.0, + "logps/chosen": -179.0907440185547, + "logps/rejected": -143.79690551757812, + "loss": 2.9713, + "rewards/chosen": -0.8176858425140381, + "rewards/margins": 1.7510998249053955, + "rewards/rejected": -2.5687856674194336, + "step": 395 + }, + { + "epoch": 12.777327935222672, + "grad_norm": 222.0, + "kl": 0.0, + "learning_rate": 4.666036831274392e-07, + "logits/chosen": -66282330.35294118, + "logits/rejected": -93253836.8, + "logps/chosen": -220.14513442095588, + "logps/rejected": -132.57906901041667, + "loss": 3.005, + "rewards/chosen": -0.3126918568330653, + "rewards/margins": 2.0485999013863356, + "rewards/rejected": -2.361291758219401, + "step": 396 + }, + { + "epoch": 12.809716599190283, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 4.63269797529209e-07, + "logits/chosen": -81224995.31034483, + "logits/rejected": -98459845.48571429, + "logps/chosen": -201.45218211206895, + "logps/rejected": -147.26661551339285, + "loss": 2.9455, + "rewards/chosen": -1.2619523673221982, + "rewards/margins": 0.8813104413413064, + "rewards/rejected": -2.1432628086635046, + "step": 397 + }, + { + "epoch": 12.842105263157894, + "grad_norm": 169.0, + "kl": 0.367470920085907, + "learning_rate": 4.599375529957962e-07, + "logits/chosen": -62049564.0, + "logits/rejected": -105151920.0, + "logps/chosen": -258.6617736816406, + "logps/rejected": -127.78885650634766, + "loss": 3.0405, + "rewards/chosen": -0.2124641239643097, + "rewards/margins": 2.342603415250778, + "rewards/rejected": -2.555067539215088, + "step": 398 + }, + { + "epoch": 12.874493927125506, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 4.566070984082013e-07, + "logits/chosen": -59460174.222222224, + "logits/rejected": -96761700.57142857, + "logps/chosen": -247.31312391493054, + "logps/rejected": -136.95270647321428, + "loss": 3.0286, + "rewards/chosen": -0.11064546638064915, + "rewards/margins": 2.2375704542038934, + "rewards/rejected": -2.3482159205845425, + "step": 399 + }, + { + "epoch": 12.906882591093117, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 4.5327858256745065e-07, + "logits/chosen": -58901780.48, + "logits/rejected": -103169772.3076923, + "logps/chosen": -258.33087890625, + "logps/rejected": -154.88575470753204, + "loss": 2.7745, + "rewards/chosen": -0.375203971862793, + "rewards/margins": 2.385605315183982, + "rewards/rejected": -2.760809287046775, + "step": 400 + }, + { + "epoch": 12.906882591093117, + "eval_kl": 0.0, + "eval_logits/chosen": -80514820.38267875, + "eval_logits/rejected": -122508565.95761856, + "eval_logps/chosen": -211.8609327794562, + "eval_logps/rejected": -134.68407858224018, + "eval_loss": 0.26694580912590027, + "eval_rewards/chosen": -0.454732935234139, + "eval_rewards/margins": 1.8590146208960325, + "eval_rewards/rejected": -2.3137475561301715, + "eval_runtime": 64.2083, + "eval_samples_per_second": 30.697, + "eval_steps_per_second": 0.966, + "step": 400 + }, + { + "epoch": 12.939271255060728, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 4.499521541879508e-07, + "logits/chosen": -77762464.91428572, + "logits/rejected": -100488412.68965517, + "logps/chosen": -178.36229073660715, + "logps/rejected": -140.72516500538794, + "loss": 2.9719, + "rewards/chosen": -0.5006898607526507, + "rewards/margins": 1.84538289525826, + "rewards/rejected": -2.3460727560109107, + "step": 401 + }, + { + "epoch": 12.97165991902834, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 4.466279618908419e-07, + "logits/chosen": -71628984.8888889, + "logits/rejected": -101016506.8108108, + "logps/chosen": -171.49994574652777, + "logps/rejected": -137.81358213682432, + "loss": 2.886, + "rewards/chosen": -0.5972412250660084, + "rewards/margins": 1.728110180722104, + "rewards/rejected": -2.3253514057881124, + "step": 402 + }, + { + "epoch": 13.0, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 4.43306154197359e-07, + "logits/chosen": -57514947.368421055, + "logits/rejected": -97582647.6521739, + "logps/chosen": -137.25752981085526, + "logps/rejected": -124.87584918478261, + "loss": 2.9705, + "rewards/chosen": -0.9436655546489515, + "rewards/margins": 1.2856941135851687, + "rewards/rejected": -2.22935966823412, + "step": 403 + }, + { + "epoch": 13.03238866396761, + "grad_norm": 212.0, + "kl": 0.0, + "learning_rate": 4.399868795221951e-07, + "logits/chosen": -62363561.4117647, + "logits/rejected": -96009873.06666666, + "logps/chosen": -215.01226447610293, + "logps/rejected": -112.50869140625, + "loss": 2.9889, + "rewards/chosen": -0.42311934863819795, + "rewards/margins": 1.4044259445340026, + "rewards/rejected": -1.8275452931722005, + "step": 404 + }, + { + "epoch": 13.064777327935223, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 4.3667028616687156e-07, + "logits/chosen": -66683655.52941176, + "logits/rejected": -87116928.0, + "logps/chosen": -229.189453125, + "logps/rejected": -111.966552734375, + "loss": 2.9998, + "rewards/chosen": -0.5239222470451804, + "rewards/margins": 1.039568556991278, + "rewards/rejected": -1.5634908040364583, + "step": 405 + }, + { + "epoch": 13.097165991902834, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 4.333565223131107e-07, + "logits/chosen": -67220795.73333333, + "logits/rejected": -103755565.1764706, + "logps/chosen": -220.74016927083332, + "logps/rejected": -135.38880112591912, + "loss": 2.8688, + "rewards/chosen": -0.3651699701944987, + "rewards/margins": 2.221465395011154, + "rewards/rejected": -2.5866353652056526, + "step": 406 + }, + { + "epoch": 13.129554655870445, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 4.3004573601621576e-07, + "logits/chosen": -66763953.23076923, + "logits/rejected": -98261268.48, + "logps/chosen": -237.33042868589743, + "logps/rejected": -125.47279296875, + "loss": 3.1564, + "rewards/chosen": -0.2864517798790565, + "rewards/margins": 1.840128725492037, + "rewards/rejected": -2.1265805053710936, + "step": 407 + }, + { + "epoch": 13.161943319838057, + "grad_norm": 164.0, + "kl": 0.0, + "learning_rate": 4.267380751984567e-07, + "logits/chosen": -57138290.75862069, + "logits/rejected": -98104619.88571429, + "logps/chosen": -210.46821120689654, + "logps/rejected": -132.85344587053572, + "loss": 2.8493, + "rewards/chosen": -0.11698566634079506, + "rewards/margins": 2.1144321803388926, + "rewards/rejected": -2.2314178466796877, + "step": 408 + }, + { + "epoch": 13.194331983805668, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 4.2343368764245994e-07, + "logits/chosen": -55141797.161290325, + "logits/rejected": -94156396.60606061, + "logps/chosen": -158.40467489919354, + "logps/rejected": -117.95622484611742, + "loss": 2.9667, + "rewards/chosen": -0.019050984613357053, + "rewards/margins": 2.271195227507855, + "rewards/rejected": -2.290246212121212, + "step": 409 + }, + { + "epoch": 13.226720647773279, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 4.201327209846065e-07, + "logits/chosen": -64434482.42424242, + "logits/rejected": -90488105.29032259, + "logps/chosen": -149.4125828598485, + "logps/rejected": -150.57333669354838, + "loss": 2.9511, + "rewards/chosen": -0.4673265977339311, + "rewards/margins": 1.7556500868363814, + "rewards/rejected": -2.2229766845703125, + "step": 410 + }, + { + "epoch": 13.259109311740891, + "grad_norm": 192.0, + "kl": 0.0, + "learning_rate": 4.1683532270843495e-07, + "logits/chosen": -52033964.137931034, + "logits/rejected": -99901571.65714286, + "logps/chosen": -236.8319302262931, + "logps/rejected": -137.90398995535713, + "loss": 2.956, + "rewards/chosen": -0.044351536652137494, + "rewards/margins": 2.452225480995742, + "rewards/rejected": -2.4965770176478794, + "step": 411 + }, + { + "epoch": 13.291497975708502, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 4.135416401380534e-07, + "logits/chosen": -75395577.26315789, + "logits/rejected": -95964288.0, + "logps/chosen": -187.52959241365133, + "logps/rejected": -150.21195162259616, + "loss": 2.9836, + "rewards/chosen": -0.8043166712710732, + "rewards/margins": 1.7186428178177187, + "rewards/rejected": -2.522959489088792, + "step": 412 + }, + { + "epoch": 13.323886639676113, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 4.1025182043155545e-07, + "logits/chosen": -71402568.3478261, + "logits/rejected": -98190410.92682926, + "logps/chosen": -152.3931194802989, + "logps/rejected": -125.17225609756098, + "loss": 2.8058, + "rewards/chosen": -1.158001941183339, + "rewards/margins": 1.3650637653953577, + "rewards/rejected": -2.5230657065786968, + "step": 413 + }, + { + "epoch": 13.356275303643725, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 4.069660105744469e-07, + "logits/chosen": -61850938.51428571, + "logits/rejected": -92347418.48275863, + "logps/chosen": -195.61358816964287, + "logps/rejected": -140.38009327855605, + "loss": 2.9815, + "rewards/chosen": -0.22204859597342355, + "rewards/margins": 2.02168635570357, + "rewards/rejected": -2.2437349516769935, + "step": 414 + }, + { + "epoch": 13.388663967611336, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 4.036843573730773e-07, + "logits/chosen": -61841362.28571428, + "logits/rejected": -93016832.0, + "logps/chosen": -285.03006417410717, + "logps/rejected": -118.03655327690973, + "loss": 2.9634, + "rewards/chosen": -0.2199288947241647, + "rewards/margins": 1.835546340261187, + "rewards/rejected": -2.0554752349853516, + "step": 415 + }, + { + "epoch": 13.421052631578947, + "grad_norm": 236.0, + "kl": 0.0, + "learning_rate": 4.0040700744808204e-07, + "logits/chosen": -68229584.5925926, + "logits/rejected": -100075575.35135135, + "logps/chosen": -255.61593967013889, + "logps/rejected": -136.12808804898648, + "loss": 2.9279, + "rewards/chosen": -0.6102768226906106, + "rewards/margins": 1.7374660312473118, + "rewards/rejected": -2.3477428539379224, + "step": 416 + }, + { + "epoch": 13.45344129554656, + "grad_norm": 162.0, + "kl": 0.0, + "learning_rate": 3.9713410722783014e-07, + "logits/chosen": -53743496.53333333, + "logits/rejected": -97241682.8235294, + "logps/chosen": -255.80555013020833, + "logps/rejected": -112.13170668658088, + "loss": 2.8831, + "rewards/chosen": -0.1641114393870036, + "rewards/margins": 2.1807109393325503, + "rewards/rejected": -2.344822378719554, + "step": 417 + }, + { + "epoch": 13.48582995951417, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 3.9386580294188363e-07, + "logits/chosen": -58015687.68, + "logits/rejected": -99712177.23076923, + "logps/chosen": -239.59330078125, + "logps/rejected": -150.53117487980768, + "loss": 2.9356, + "rewards/chosen": -0.5994418334960937, + "rewards/margins": 1.8272686376327125, + "rewards/rejected": -2.426710471128806, + "step": 418 + }, + { + "epoch": 13.518218623481781, + "grad_norm": 164.0, + "kl": 0.0, + "learning_rate": 3.906022406144624e-07, + "logits/chosen": -65337879.27272727, + "logits/rejected": -104208516.12903225, + "logps/chosen": -247.24338600852272, + "logps/rejected": -145.6847120715726, + "loss": 2.9268, + "rewards/chosen": -0.4455898169315223, + "rewards/margins": 2.149017874679491, + "rewards/rejected": -2.5946076916110132, + "step": 419 + }, + { + "epoch": 13.550607287449393, + "grad_norm": 203.0, + "kl": 0.0, + "learning_rate": 3.873435660579217e-07, + "logits/chosen": -65686804.64516129, + "logits/rejected": -100139101.0909091, + "logps/chosen": -176.82069052419354, + "logps/rejected": -153.87301728219697, + "loss": 3.0137, + "rewards/chosen": -0.23733255940098916, + "rewards/margins": 2.1022541031226853, + "rewards/rejected": -2.3395866625236743, + "step": 420 + }, + { + "epoch": 13.582995951417004, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 3.840899248662358e-07, + "logits/chosen": -59151023.15789474, + "logits/rejected": -97065028.92307693, + "logps/chosen": -248.03626130756578, + "logps/rejected": -134.3540978064904, + "loss": 2.971, + "rewards/chosen": -0.25885752627724096, + "rewards/margins": 1.7790538154632938, + "rewards/rejected": -2.037911341740535, + "step": 421 + }, + { + "epoch": 13.615384615384615, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 3.8084146240849453e-07, + "logits/chosen": -66869288.0, + "logits/rejected": -107588216.0, + "logps/chosen": -252.61904907226562, + "logps/rejected": -148.1591796875, + "loss": 2.9301, + "rewards/chosen": -0.22361889481544495, + "rewards/margins": 2.465162366628647, + "rewards/rejected": -2.688781261444092, + "step": 422 + }, + { + "epoch": 13.647773279352228, + "grad_norm": 155.0, + "kl": 0.0, + "learning_rate": 3.77598323822407e-07, + "logits/chosen": -77296362.05714285, + "logits/rejected": -97172427.03448276, + "logps/chosen": -178.31141183035714, + "logps/rejected": -147.4927768049569, + "loss": 2.9457, + "rewards/chosen": -1.1482150486537388, + "rewards/margins": 1.0416607184950353, + "rewards/rejected": -2.189875767148774, + "step": 423 + }, + { + "epoch": 13.680161943319838, + "grad_norm": 180.0, + "kl": 0.3856762647628784, + "learning_rate": 3.743606540078177e-07, + "logits/chosen": -68903215.15789473, + "logits/rejected": -98439798.15384616, + "logps/chosen": -212.71296772203948, + "logps/rejected": -130.68912447415866, + "loss": 3.0237, + "rewards/chosen": -0.4791660810771741, + "rewards/margins": 2.0078674694787155, + "rewards/rejected": -2.4870335505558896, + "step": 424 + }, + { + "epoch": 13.712550607287449, + "grad_norm": 233.0, + "kl": 0.0, + "learning_rate": 3.7112859762023305e-07, + "logits/chosen": -68740831.17948718, + "logits/rejected": -95837839.36, + "logps/chosen": -156.58494841746796, + "logps/rejected": -116.330234375, + "loss": 2.9363, + "rewards/chosen": -0.48948473808092946, + "rewards/margins": 1.762084475786258, + "rewards/rejected": -2.2515692138671874, + "step": 425 + }, + { + "epoch": 13.744939271255062, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 3.67902299064357e-07, + "logits/chosen": -72169888.0, + "logits/rejected": -97907488.0, + "logps/chosen": -179.15684509277344, + "logps/rejected": -144.07077026367188, + "loss": 2.9644, + "rewards/chosen": -0.8242971897125244, + "rewards/margins": 1.7718760967254639, + "rewards/rejected": -2.5961732864379883, + "step": 426 + }, + { + "epoch": 13.777327935222672, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 3.646819024876406e-07, + "logits/chosen": -66282797.176470585, + "logits/rejected": -93732411.73333333, + "logps/chosen": -220.14165900735293, + "logps/rejected": -132.7333984375, + "loss": 2.9911, + "rewards/chosen": -0.312345729154699, + "rewards/margins": 2.064380294201421, + "rewards/rejected": -2.37672602335612, + "step": 427 + }, + { + "epoch": 13.809716599190283, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 3.614675517738405e-07, + "logits/chosen": -81145494.06896552, + "logits/rejected": -98771792.45714286, + "logps/chosen": -201.3799838362069, + "logps/rejected": -147.63282645089285, + "loss": 2.9303, + "rewards/chosen": -1.2547336446827855, + "rewards/margins": 0.925151783609625, + "rewards/rejected": -2.1798854282924105, + "step": 428 + }, + { + "epoch": 13.842105263157894, + "grad_norm": 188.0, + "kl": 0.38798022270202637, + "learning_rate": 3.582593905365912e-07, + "logits/chosen": -62279860.0, + "logits/rejected": -105478680.0, + "logps/chosen": -258.7704772949219, + "logps/rejected": -128.01951599121094, + "loss": 3.0279, + "rewards/chosen": -0.22333630919456482, + "rewards/margins": 2.3547967970371246, + "rewards/rejected": -2.5781331062316895, + "step": 429 + }, + { + "epoch": 13.874493927125506, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 3.5505756211298774e-07, + "logits/chosen": -59527082.666666664, + "logits/rejected": -96977956.57142857, + "logps/chosen": -247.30409071180554, + "logps/rejected": -136.9112548828125, + "loss": 3.039, + "rewards/chosen": -0.10974003209008111, + "rewards/margins": 2.2343312199153598, + "rewards/rejected": -2.344071252005441, + "step": 430 + }, + { + "epoch": 13.906882591093117, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 3.5186220955718303e-07, + "logits/chosen": -58701363.2, + "logits/rejected": -103313033.84615384, + "logps/chosen": -258.4644921875, + "logps/rejected": -155.13411458333334, + "loss": 2.7653, + "rewards/chosen": -0.3885665512084961, + "rewards/margins": 2.3970807187985153, + "rewards/rejected": -2.7856472700070114, + "step": 431 + }, + { + "epoch": 13.939271255060728, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 3.486734756339943e-07, + "logits/chosen": -77798129.37142856, + "logits/rejected": -100511867.5862069, + "logps/chosen": -178.74274553571428, + "logps/rejected": -140.58995319234913, + "loss": 2.9644, + "rewards/chosen": -0.5387344905308314, + "rewards/margins": 1.793816610627574, + "rewards/rejected": -2.3325511011584052, + "step": 432 + }, + { + "epoch": 13.97165991902834, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 3.454915028125263e-07, + "logits/chosen": -71885468.44444445, + "logits/rejected": -101091120.43243243, + "logps/chosen": -171.57468894675927, + "logps/rejected": -138.02104888091216, + "loss": 2.8645, + "rewards/chosen": -0.6047153472900391, + "rewards/margins": 1.741381619427655, + "rewards/rejected": -2.346096966717694, + "step": 433 + }, + { + "epoch": 14.0, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 3.4231643325980485e-07, + "logits/chosen": -57590480.84210526, + "logits/rejected": -97733409.39130434, + "logps/chosen": -136.9785798725329, + "logps/rejected": -124.88034986413044, + "loss": 2.9596, + "rewards/chosen": -0.9157708820543791, + "rewards/margins": 1.3140384228878763, + "rewards/rejected": -2.2298093049422554, + "step": 434 + }, + { + "epoch": 14.03238866396761, + "grad_norm": 200.0, + "kl": 0.0, + "learning_rate": 3.391484088344256e-07, + "logits/chosen": -62335006.11764706, + "logits/rejected": -96016162.13333334, + "logps/chosen": -215.0373965992647, + "logps/rejected": -112.78855794270834, + "loss": 2.9837, + "rewards/chosen": -0.42563365487491384, + "rewards/margins": 1.4298984190996955, + "rewards/rejected": -1.8555320739746093, + "step": 435 + }, + { + "epoch": 14.064777327935223, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 3.359875710802154e-07, + "logits/chosen": -66720323.76470588, + "logits/rejected": -87461324.8, + "logps/chosen": -229.37296070772058, + "logps/rejected": -112.27146809895834, + "loss": 2.951, + "rewards/chosen": -0.5422763263477999, + "rewards/margins": 1.0517066244985542, + "rewards/rejected": -1.593982950846354, + "step": 436 + }, + { + "epoch": 14.097165991902834, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 3.328340612199091e-07, + "logits/chosen": -67281425.06666666, + "logits/rejected": -103862031.05882353, + "logps/chosen": -220.54358723958333, + "logps/rejected": -135.44522633272058, + "loss": 2.8496, + "rewards/chosen": -0.34551318486531574, + "rewards/margins": 2.2467647907780663, + "rewards/rejected": -2.5922779756433822, + "step": 437 + }, + { + "epoch": 14.129554655870445, + "grad_norm": 212.0, + "kl": 0.0, + "learning_rate": 3.296880201488387e-07, + "logits/chosen": -66975730.87179487, + "logits/rejected": -98592235.52, + "logps/chosen": -237.08318309294873, + "logps/rejected": -125.760625, + "loss": 3.1563, + "rewards/chosen": -0.26172520564152646, + "rewards/margins": 1.8936396320049578, + "rewards/rejected": -2.1553648376464842, + "step": 438 + }, + { + "epoch": 14.161943319838057, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 3.2654958842863966e-07, + "logits/chosen": -57296750.344827585, + "logits/rejected": -98417473.82857142, + "logps/chosen": -210.61644665948276, + "logps/rejected": -133.2125, + "loss": 2.8353, + "rewards/chosen": -0.1318079685342723, + "rewards/margins": 2.135515770653786, + "rewards/rejected": -2.267323739188058, + "step": 439 + }, + { + "epoch": 14.194331983805668, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 3.234189062809695e-07, + "logits/chosen": -55212981.67741936, + "logits/rejected": -94428074.66666667, + "logps/chosen": -158.2477791078629, + "logps/rejected": -118.16590465198864, + "loss": 2.9616, + "rewards/chosen": -0.0033594177615258003, + "rewards/margins": 2.3078544512755244, + "rewards/rejected": -2.3112138690370503, + "step": 440 + }, + { + "epoch": 14.226720647773279, + "grad_norm": 204.0, + "kl": 0.0, + "learning_rate": 3.2029611358124365e-07, + "logits/chosen": -64443834.18181818, + "logits/rejected": -90383673.80645162, + "logps/chosen": -149.40093809185606, + "logps/rejected": -150.58826864919354, + "loss": 2.9449, + "rewards/chosen": -0.46616334626168915, + "rewards/margins": 1.758306484651239, + "rewards/rejected": -2.2244698309129283, + "step": 441 + }, + { + "epoch": 14.259109311740891, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 3.171813498523857e-07, + "logits/chosen": -52027153.655172415, + "logits/rejected": -100159546.51428571, + "logps/chosen": -236.96869948814654, + "logps/rejected": -138.04977678571427, + "loss": 2.943, + "rewards/chosen": -0.05802883361947948, + "rewards/margins": 2.4531279569776188, + "rewards/rejected": -2.511156790597098, + "step": 442 + }, + { + "epoch": 14.291497975708502, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 3.1407475425859343e-07, + "logits/chosen": -75414339.36842105, + "logits/rejected": -96486921.84615384, + "logps/chosen": -187.29546155427633, + "logps/rejected": -150.39766751802884, + "loss": 2.9584, + "rewards/chosen": -0.7809036656429893, + "rewards/margins": 1.7606292909938797, + "rewards/rejected": -2.541532956636869, + "step": 443 + }, + { + "epoch": 14.323886639676113, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 3.1097646559912206e-07, + "logits/chosen": -71349837.91304348, + "logits/rejected": -98318598.24390244, + "logps/chosen": -152.32198963994566, + "logps/rejected": -125.24911871189025, + "loss": 2.7775, + "rewards/chosen": -1.1508886917777683, + "rewards/margins": 1.379862792418592, + "rewards/rejected": -2.5307514841963603, + "step": 444 + }, + { + "epoch": 14.356275303643725, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 3.0788662230208145e-07, + "logits/chosen": -62047692.8, + "logits/rejected": -92549384.8275862, + "logps/chosen": -195.6550502232143, + "logps/rejected": -140.62846848060346, + "loss": 3.0008, + "rewards/chosen": -0.2261941637311663, + "rewards/margins": 2.042380254961587, + "rewards/rejected": -2.2685744186927534, + "step": 445 + }, + { + "epoch": 14.388663967611336, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 3.048053624182526e-07, + "logits/chosen": -61581298.28571428, + "logits/rejected": -93053937.77777778, + "logps/chosen": -285.24771554129467, + "logps/rejected": -118.25196668836806, + "loss": 2.9588, + "rewards/chosen": -0.24169693674360002, + "rewards/margins": 1.8353205294836135, + "rewards/rejected": -2.0770174662272134, + "step": 446 + }, + { + "epoch": 14.421052631578947, + "grad_norm": 326.0, + "kl": 0.0, + "learning_rate": 3.017328236149186e-07, + "logits/chosen": -68228887.7037037, + "logits/rejected": -100119116.1081081, + "logps/chosen": -255.85997178819446, + "logps/rejected": -136.4729465793919, + "loss": 2.9033, + "rewards/chosen": -0.6346799355966074, + "rewards/margins": 1.7475479878224172, + "rewards/rejected": -2.3822279234190247, + "step": 447 + }, + { + "epoch": 14.45344129554656, + "grad_norm": 174.0, + "kl": 0.0, + "learning_rate": 2.986691431697148e-07, + "logits/chosen": -53854459.733333334, + "logits/rejected": -97388167.52941176, + "logps/chosen": -255.93505859375, + "logps/rejected": -112.32900103400735, + "loss": 2.8598, + "rewards/chosen": -0.1770633061726888, + "rewards/margins": 2.187487564834894, + "rewards/rejected": -2.364550871007583, + "step": 448 + }, + { + "epoch": 14.48582995951417, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 2.9561445796449414e-07, + "logits/chosen": -58128983.04, + "logits/rejected": -99960910.76923077, + "logps/chosen": -239.3297265625, + "logps/rejected": -150.4403295272436, + "loss": 2.9055, + "rewards/chosen": -0.573082618713379, + "rewards/margins": 1.844545547289726, + "rewards/rejected": -2.417628166003105, + "step": 449 + }, + { + "epoch": 14.518218623481781, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 2.9256890447921315e-07, + "logits/chosen": -65360314.18181818, + "logits/rejected": -104209267.61290322, + "logps/chosen": -247.2750946969697, + "logps/rejected": -145.80722341229838, + "loss": 2.9257, + "rewards/chosen": -0.44876208449854993, + "rewards/margins": 2.158096445853643, + "rewards/rejected": -2.6068585303521927, + "step": 450 + }, + { + "epoch": 14.550607287449393, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 2.895326187858326e-07, + "logits/chosen": -65681193.29032258, + "logits/rejected": -100278194.42424242, + "logps/chosen": -176.77208291330646, + "logps/rejected": -153.88634883996212, + "loss": 3.0083, + "rewards/chosen": -0.2324707277359501, + "rewards/margins": 2.108446917342767, + "rewards/rejected": -2.340917645078717, + "step": 451 + }, + { + "epoch": 14.582995951417004, + "grad_norm": 153.0, + "kl": 0.0, + "learning_rate": 2.865057365422386e-07, + "logits/chosen": -59178880.0, + "logits/rejected": -97249112.61538461, + "logps/chosen": -247.91049033717104, + "logps/rejected": -134.51881760817307, + "loss": 2.9599, + "rewards/chosen": -0.24628260261134097, + "rewards/margins": 1.8081014822369161, + "rewards/rejected": -2.054384084848257, + "step": 452 + }, + { + "epoch": 14.615384615384615, + "grad_norm": 169.0, + "kl": 0.0, + "learning_rate": 2.8348839298618177e-07, + "logits/chosen": -66901720.0, + "logits/rejected": -107571936.0, + "logps/chosen": -252.63880920410156, + "logps/rejected": -148.36029052734375, + "loss": 2.9447, + "rewards/chosen": -0.2255946844816208, + "rewards/margins": 2.483295515179634, + "rewards/rejected": -2.708890199661255, + "step": 453 + }, + { + "epoch": 14.647773279352228, + "grad_norm": 163.0, + "kl": 0.0, + "learning_rate": 2.8048072292923464e-07, + "logits/chosen": -77316527.54285714, + "logits/rejected": -97096403.86206897, + "logps/chosen": -178.4888950892857, + "logps/rejected": -147.96482691271552, + "loss": 2.9268, + "rewards/chosen": -1.1659620012555802, + "rewards/margins": 1.0711180137296028, + "rewards/rejected": -2.237080014985183, + "step": 454 + }, + { + "epoch": 14.680161943319838, + "grad_norm": 192.0, + "kl": 0.372228741645813, + "learning_rate": 2.774828607507683e-07, + "logits/chosen": -68992552.42105263, + "logits/rejected": -98634801.23076923, + "logps/chosen": -212.55967310855263, + "logps/rejected": -131.10398512620193, + "loss": 3.0125, + "rewards/chosen": -0.46383536489386307, + "rewards/margins": 2.064683898740452, + "rewards/rejected": -2.528519263634315, + "step": 455 + }, + { + "epoch": 14.712550607287449, + "grad_norm": 234.0, + "kl": 0.0, + "learning_rate": 2.74494940391949e-07, + "logits/chosen": -68925525.33333333, + "logits/rejected": -96213186.56, + "logps/chosen": -156.44189453125, + "logps/rejected": -116.6107421875, + "loss": 2.9425, + "rewards/chosen": -0.47518011239858776, + "rewards/margins": 1.8044396385779748, + "rewards/rejected": -2.2796197509765626, + "step": 456 + }, + { + "epoch": 14.744939271255062, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 2.715170953497532e-07, + "logits/chosen": -72270920.0, + "logits/rejected": -97976480.0, + "logps/chosen": -179.15814208984375, + "logps/rejected": -144.1017608642578, + "loss": 2.9455, + "rewards/chosen": -0.8244253993034363, + "rewards/margins": 1.7748473286628723, + "rewards/rejected": -2.5992727279663086, + "step": 457 + }, + { + "epoch": 14.777327935222672, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 2.685494586710038e-07, + "logits/chosen": -66368609.88235294, + "logits/rejected": -93580117.33333333, + "logps/chosen": -220.22845818014707, + "logps/rejected": -132.73451334635416, + "loss": 2.9813, + "rewards/chosen": -0.32102542765000286, + "rewards/margins": 2.0558107133005183, + "rewards/rejected": -2.376836140950521, + "step": 458 + }, + { + "epoch": 14.809716599190283, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 2.655921629464245e-07, + "logits/chosen": -81319688.8275862, + "logits/rejected": -98513598.17142858, + "logps/chosen": -201.75286233836206, + "logps/rejected": -147.54061104910716, + "loss": 2.9369, + "rewards/chosen": -1.2920216527478448, + "rewards/margins": 0.8786435432622, + "rewards/rejected": -2.1706651960100447, + "step": 459 + }, + { + "epoch": 14.842105263157894, + "grad_norm": 167.0, + "kl": 0.3162778615951538, + "learning_rate": 2.626453403047172e-07, + "logits/chosen": -62228756.0, + "logits/rejected": -105336456.0, + "logps/chosen": -258.8127746582031, + "logps/rejected": -128.1526641845703, + "loss": 3.0003, + "rewards/chosen": -0.22756414115428925, + "rewards/margins": 2.3638841658830643, + "rewards/rejected": -2.5914483070373535, + "step": 460 + }, + { + "epoch": 14.874493927125506, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 2.597091224066581e-07, + "logits/chosen": -59445205.333333336, + "logits/rejected": -96873947.42857143, + "logps/chosen": -247.08816189236111, + "logps/rejected": -137.38323102678572, + "loss": 3.0211, + "rewards/chosen": -0.08814783891042073, + "rewards/margins": 2.3031202100572132, + "rewards/rejected": -2.391268048967634, + "step": 461 + }, + { + "epoch": 14.906882591093117, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 2.5678364043921503e-07, + "logits/chosen": -58916346.88, + "logits/rejected": -103354164.51282051, + "logps/chosen": -258.32203125, + "logps/rejected": -155.28740985576923, + "loss": 2.7557, + "rewards/chosen": -0.374318962097168, + "rewards/margins": 2.426655761523125, + "rewards/rejected": -2.8009747236202927, + "step": 462 + }, + { + "epoch": 14.939271255060728, + "grad_norm": 161.0, + "kl": 0.0, + "learning_rate": 2.538690251096862e-07, + "logits/chosen": -77990078.17142858, + "logits/rejected": -100813841.65517241, + "logps/chosen": -178.31856863839286, + "logps/rejected": -141.10777545797413, + "loss": 2.9307, + "rewards/chosen": -0.49631690979003906, + "rewards/margins": 1.8880162074648101, + "rewards/rejected": -2.384333117254849, + "step": 463 + }, + { + "epoch": 14.97165991902834, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 2.5096540663986064e-07, + "logits/chosen": -71577751.7037037, + "logits/rejected": -101257977.08108108, + "logps/chosen": -171.62841796875, + "logps/rejected": -138.1628220016892, + "loss": 2.8692, + "rewards/chosen": -0.6100874300356265, + "rewards/margins": 1.7501876323192087, + "rewards/rejected": -2.360275062354835, + "step": 464 + }, + { + "epoch": 15.0, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 2.480729147601999e-07, + "logits/chosen": -57674132.21052632, + "logits/rejected": -97789952.0, + "logps/chosen": -137.22797594572367, + "logps/rejected": -125.16964588994566, + "loss": 2.9382, + "rewards/chosen": -0.940712477031507, + "rewards/margins": 1.31802583340922, + "rewards/rejected": -2.258738310440727, + "step": 465 + }, + { + "epoch": 15.03238866396761, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 2.451916787040412e-07, + "logits/chosen": -62508879.058823526, + "logits/rejected": -96349806.93333334, + "logps/chosen": -215.2633846507353, + "logps/rejected": -113.00658365885417, + "loss": 2.9556, + "rewards/chosen": -0.44823427761302276, + "rewards/margins": 1.4291012072095683, + "rewards/rejected": -1.8773354848225912, + "step": 466 + }, + { + "epoch": 15.064777327935223, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 2.423218272018252e-07, + "logits/chosen": -66979681.88235294, + "logits/rejected": -87443899.73333333, + "logps/chosen": -229.34899184283088, + "logps/rejected": -112.09307454427083, + "loss": 2.9546, + "rewards/chosen": -0.539879181805779, + "rewards/margins": 1.036263447181851, + "rewards/rejected": -1.5761426289876301, + "step": 467 + }, + { + "epoch": 15.097165991902834, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 2.394634884753419e-07, + "logits/chosen": -67305501.86666666, + "logits/rejected": -104019855.05882353, + "logps/chosen": -220.8958984375, + "logps/rejected": -135.80086741727942, + "loss": 2.8524, + "rewards/chosen": -0.38074350357055664, + "rewards/margins": 2.2470991190742042, + "rewards/rejected": -2.627842622644761, + "step": 468 + }, + { + "epoch": 15.129554655870445, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 2.3661679023200422e-07, + "logits/chosen": -66880617.025641024, + "logits/rejected": -98502768.64, + "logps/chosen": -237.0897435897436, + "logps/rejected": -125.94109375, + "loss": 3.151, + "rewards/chosen": -0.26238353435809797, + "rewards/margins": 1.9110278731126051, + "rewards/rejected": -2.173411407470703, + "step": 469 + }, + { + "epoch": 15.161943319838057, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 2.3378185965914078e-07, + "logits/chosen": -57161392.551724136, + "logits/rejected": -98508946.28571428, + "logps/chosen": -210.51459792564654, + "logps/rejected": -133.46876395089285, + "loss": 2.8341, + "rewards/chosen": -0.12162569473529684, + "rewards/margins": 2.1713221289254174, + "rewards/rejected": -2.292947823660714, + "step": 470 + }, + { + "epoch": 15.194331983805668, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 2.309588234183137e-07, + "logits/chosen": -55166220.38709678, + "logits/rejected": -94622347.63636364, + "logps/chosen": -158.29580393145162, + "logps/rejected": -117.99982244318181, + "loss": 2.9478, + "rewards/chosen": -0.008161975491431451, + "rewards/margins": 2.2864436264261823, + "rewards/rejected": -2.2946056019176138, + "step": 471 + }, + { + "epoch": 15.226720647773279, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 2.2814780763965956e-07, + "logits/chosen": -64386944.0, + "logits/rejected": -90639855.48387097, + "logps/chosen": -149.33127663352272, + "logps/rejected": -151.09208039314515, + "loss": 2.9481, + "rewards/chosen": -0.459195512713808, + "rewards/margins": 1.8156549019314792, + "rewards/rejected": -2.2748504146452873, + "step": 472 + }, + { + "epoch": 15.259109311740891, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 2.2534893791625404e-07, + "logits/chosen": -52017911.172413796, + "logits/rejected": -100169230.62857144, + "logps/chosen": -236.86188375538794, + "logps/rejected": -138.26692243303572, + "loss": 2.9403, + "rewards/chosen": -0.047348754159335434, + "rewards/margins": 2.4855210752909995, + "rewards/rejected": -2.532869829450335, + "step": 473 + }, + { + "epoch": 15.291497975708502, + "grad_norm": 162.0, + "kl": 0.0, + "learning_rate": 2.2256233929850044e-07, + "logits/chosen": -75536161.68421052, + "logits/rejected": -96382089.84615384, + "logps/chosen": -187.44907740542763, + "logps/rejected": -150.49297626201923, + "loss": 2.9537, + "rewards/chosen": -0.7962660538522821, + "rewards/margins": 1.7547963099923691, + "rewards/rejected": -2.5510623638446512, + "step": 474 + }, + { + "epoch": 15.323886639676113, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 2.197881362885426e-07, + "logits/chosen": -71331773.2173913, + "logits/rejected": -98335812.68292683, + "logps/chosen": -152.35098930027175, + "logps/rejected": -125.35432545731707, + "loss": 2.7602, + "rewards/chosen": -1.153787778771442, + "rewards/margins": 1.3874846404835122, + "rewards/rejected": -2.541272419254954, + "step": 475 + }, + { + "epoch": 15.356275303643725, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 2.1702645283470234e-07, + "logits/chosen": -61982515.2, + "logits/rejected": -92642312.8275862, + "logps/chosen": -195.61316964285714, + "logps/rejected": -140.80540308459052, + "loss": 3.0001, + "rewards/chosen": -0.22200731549944197, + "rewards/margins": 2.064258616781, + "rewards/rejected": -2.286265932280442, + "step": 476 + }, + { + "epoch": 15.388663967611336, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 2.1427741232594182e-07, + "logits/chosen": -62008032.0, + "logits/rejected": -93219576.8888889, + "logps/chosen": -285.46048409598217, + "logps/rejected": -118.15388997395833, + "loss": 2.9731, + "rewards/chosen": -0.26297320638384136, + "rewards/margins": 1.8042366731734503, + "rewards/rejected": -2.0672098795572915, + "step": 477 + }, + { + "epoch": 15.421052631578947, + "grad_norm": 280.0, + "kl": 0.0, + "learning_rate": 2.1154113758634966e-07, + "logits/chosen": -68126729.48148148, + "logits/rejected": -100055365.1891892, + "logps/chosen": -255.69733796296296, + "logps/rejected": -136.45947265625, + "loss": 2.8915, + "rewards/chosen": -0.6184189407913773, + "rewards/margins": 1.7624612603936942, + "rewards/rejected": -2.3808802011850716, + "step": 478 + }, + { + "epoch": 15.45344129554656, + "grad_norm": 151.0, + "kl": 0.0, + "learning_rate": 2.0881775086965492e-07, + "logits/chosen": -53814545.06666667, + "logits/rejected": -97413240.47058824, + "logps/chosen": -255.84069010416667, + "logps/rejected": -112.38763786764706, + "loss": 2.843, + "rewards/chosen": -0.167624568939209, + "rewards/margins": 2.20279083812938, + "rewards/rejected": -2.370415407068589, + "step": 479 + }, + { + "epoch": 15.48582995951417, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 2.0610737385376348e-07, + "logits/chosen": -58100858.88, + "logits/rejected": -99996987.07692307, + "logps/chosen": -239.43080078125, + "logps/rejected": -150.7254356971154, + "loss": 2.9057, + "rewards/chosen": -0.5831904602050781, + "rewards/margins": 1.8629473837828026, + "rewards/rejected": -2.4461378439878807, + "step": 480 + }, + { + "epoch": 15.518218623481781, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 2.0341012763532239e-07, + "logits/chosen": -65494512.484848484, + "logits/rejected": -104369911.74193548, + "logps/chosen": -247.40511067708334, + "logps/rejected": -145.88788432459677, + "loss": 2.9188, + "rewards/chosen": -0.46176453792687616, + "rewards/margins": 2.153159246882852, + "rewards/rejected": -2.614923784809728, + "step": 481 + }, + { + "epoch": 15.550607287449393, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 2.0072613272430922e-07, + "logits/chosen": -65853493.67741936, + "logits/rejected": -100311373.57575758, + "logps/chosen": -176.9901083669355, + "logps/rejected": -154.20259232954547, + "loss": 3.0325, + "rewards/chosen": -0.2542735069028793, + "rewards/margins": 2.1182700592518318, + "rewards/rejected": -2.372543566154711, + "step": 482 + }, + { + "epoch": 15.582995951417004, + "grad_norm": 155.0, + "kl": 0.0, + "learning_rate": 1.980555090386477e-07, + "logits/chosen": -59290933.89473684, + "logits/rejected": -97237415.38461539, + "logps/chosen": -247.99252158717104, + "logps/rejected": -134.71644005408655, + "loss": 2.9476, + "rewards/chosen": -0.254484728762978, + "rewards/margins": 1.8196621288654775, + "rewards/rejected": -2.0741468576284556, + "step": 483 + }, + { + "epoch": 15.615384615384615, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 1.953983758988502e-07, + "logits/chosen": -66973624.0, + "logits/rejected": -107598976.0, + "logps/chosen": -252.64303588867188, + "logps/rejected": -148.0606231689453, + "loss": 2.9289, + "rewards/chosen": -0.2260175347328186, + "rewards/margins": 2.452908217906952, + "rewards/rejected": -2.6789257526397705, + "step": 484 + }, + { + "epoch": 15.647773279352228, + "grad_norm": 161.0, + "kl": 0.0, + "learning_rate": 1.927548520226857e-07, + "logits/chosen": -77407802.51428571, + "logits/rejected": -97110713.37931034, + "logps/chosen": -178.57099609375, + "logps/rejected": -148.07145743534483, + "loss": 2.9161, + "rewards/chosen": -1.1741735185895648, + "rewards/margins": 1.0735708114548856, + "rewards/rejected": -2.2477443300444504, + "step": 485 + }, + { + "epoch": 15.680161943319838, + "grad_norm": 175.0, + "kl": 0.352486252784729, + "learning_rate": 1.9012505551987762e-07, + "logits/chosen": -68995590.73684211, + "logits/rejected": -98562087.38461539, + "logps/chosen": -212.70760947779604, + "logps/rejected": -131.20113431490384, + "loss": 3.0077, + "rewards/chosen": -0.47862815856933594, + "rewards/margins": 2.0596065521240234, + "rewards/rejected": -2.5382347106933594, + "step": 486 + }, + { + "epoch": 15.712550607287449, + "grad_norm": 237.0, + "kl": 0.0, + "learning_rate": 1.8750910388682427e-07, + "logits/chosen": -68801404.71794872, + "logits/rejected": -96091484.16, + "logps/chosen": -156.53382912660257, + "logps/rejected": -116.83669921875, + "loss": 2.9114, + "rewards/chosen": -0.4843734839023688, + "rewards/margins": 1.8178414819179436, + "rewards/rejected": -2.3022149658203124, + "step": 487 + }, + { + "epoch": 15.744939271255062, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 1.8490711400135117e-07, + "logits/chosen": -72317648.0, + "logits/rejected": -98281968.0, + "logps/chosen": -179.27587890625, + "logps/rejected": -144.23171997070312, + "loss": 2.9381, + "rewards/chosen": -0.8361989855766296, + "rewards/margins": 1.7760685086250305, + "rewards/rejected": -2.61226749420166, + "step": 488 + }, + { + "epoch": 15.777327935222672, + "grad_norm": 205.0, + "kl": 0.0, + "learning_rate": 1.8231920211748818e-07, + "logits/chosen": -66403734.5882353, + "logits/rejected": -93627955.2, + "logps/chosen": -220.07602826286765, + "logps/rejected": -132.8814208984375, + "loss": 2.9773, + "rewards/chosen": -0.30578338398652916, + "rewards/margins": 2.0857439190733666, + "rewards/rejected": -2.391527303059896, + "step": 489 + }, + { + "epoch": 15.809716599190283, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 1.7974548386027584e-07, + "logits/chosen": -81335931.5862069, + "logits/rejected": -98753543.31428571, + "logps/chosen": -201.59363213900863, + "logps/rejected": -147.62652064732143, + "loss": 2.9142, + "rewards/chosen": -1.2760967386179958, + "rewards/margins": 0.9031569758072273, + "rewards/rejected": -2.179253714425223, + "step": 490 + }, + { + "epoch": 15.842105263157894, + "grad_norm": 181.0, + "kl": 0.3179197311401367, + "learning_rate": 1.7718607422059879e-07, + "logits/chosen": -62185552.0, + "logits/rejected": -105514544.0, + "logps/chosen": -258.72589111328125, + "logps/rejected": -128.19235229492188, + "loss": 3.0068, + "rewards/chosen": -0.2188776135444641, + "rewards/margins": 2.376538932323456, + "rewards/rejected": -2.59541654586792, + "step": 491 + }, + { + "epoch": 15.874493927125506, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 1.746410875500488e-07, + "logits/chosen": -59510869.333333336, + "logits/rejected": -96976210.28571428, + "logps/chosen": -247.15388997395834, + "logps/rejected": -137.29727608816964, + "loss": 3.0141, + "rewards/chosen": -0.09472141000959608, + "rewards/margins": 2.287950491148328, + "rewards/rejected": -2.382671901157924, + "step": 492 + }, + { + "epoch": 15.906882591093117, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 1.7211063755581524e-07, + "logits/chosen": -59008824.32, + "logits/rejected": -103630204.71794872, + "logps/chosen": -258.5851171875, + "logps/rejected": -155.38668118990384, + "loss": 2.7657, + "rewards/chosen": -0.40062652587890624, + "rewards/margins": 2.4102763875325524, + "rewards/rejected": -2.8109029134114585, + "step": 493 + }, + { + "epoch": 15.939271255060728, + "grad_norm": 185.0, + "kl": 0.0, + "learning_rate": 1.695948372956047e-07, + "logits/chosen": -77968332.8, + "logits/rejected": -100711794.7586207, + "logps/chosen": -178.47477678571428, + "logps/rejected": -140.9263537176724, + "loss": 2.9268, + "rewards/chosen": -0.5119389125279018, + "rewards/margins": 1.8542520306967747, + "rewards/rejected": -2.3661909432246766, + "step": 494 + }, + { + "epoch": 15.97165991902834, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 1.6709379917259025e-07, + "logits/chosen": -71700868.74074075, + "logits/rejected": -101220282.8108108, + "logps/chosen": -171.52345558449073, + "logps/rejected": -138.14950644003378, + "loss": 2.8751, + "rewards/chosen": -0.5995931272153501, + "rewards/margins": 1.759349471694595, + "rewards/rejected": -2.358942598909945, + "step": 495 + }, + { + "epoch": 16.0, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 1.6460763493038838e-07, + "logits/chosen": -57558447.15789474, + "logits/rejected": -97733442.7826087, + "logps/chosen": -137.12277703536185, + "logps/rejected": -125.16896654211956, + "loss": 2.9446, + "rewards/chosen": -0.9301911404258326, + "rewards/margins": 1.3284801640281416, + "rewards/rejected": -2.258671304453974, + "step": 496 + }, + { + "epoch": 16.032388663967613, + "grad_norm": 209.0, + "kl": 0.0, + "learning_rate": 1.621364556480675e-07, + "logits/chosen": -62457434.35294118, + "logits/rejected": -96218606.93333334, + "logps/chosen": -215.25729549632354, + "logps/rejected": -113.06044108072916, + "loss": 2.979, + "rewards/chosen": -0.44762342116411996, + "rewards/margins": 1.4350972717883541, + "rewards/rejected": -1.882720692952474, + "step": 497 + }, + { + "epoch": 16.06477732793522, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 1.596803717351845e-07, + "logits/chosen": -66882778.35294118, + "logits/rejected": -87651618.13333334, + "logps/chosen": -229.46245978860293, + "logps/rejected": -112.18683268229167, + "loss": 2.9595, + "rewards/chosen": -0.5512245402616613, + "rewards/margins": 1.034294105978573, + "rewards/rejected": -1.5855186462402344, + "step": 498 + }, + { + "epoch": 16.097165991902834, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 1.572394929268519e-07, + "logits/chosen": -67265625.6, + "logits/rejected": -103988698.35294117, + "logps/chosen": -220.6365234375, + "logps/rejected": -135.8659237132353, + "loss": 2.8599, + "rewards/chosen": -0.35480489730834963, + "rewards/margins": 2.279542906144086, + "rewards/rejected": -2.6343478034524357, + "step": 499 + }, + { + "epoch": 16.129554655870447, + "grad_norm": 216.0, + "kl": 0.0, + "learning_rate": 1.5481392827883488e-07, + "logits/chosen": -66750713.43589743, + "logits/rejected": -98567116.8, + "logps/chosen": -237.20452724358975, + "logps/rejected": -126.133759765625, + "loss": 3.1347, + "rewards/chosen": -0.27386181171123797, + "rewards/margins": 1.918815648005559, + "rewards/rejected": -2.192677459716797, + "step": 500 + }, + { + "epoch": 16.161943319838056, + "grad_norm": 158.0, + "kl": 0.0, + "learning_rate": 1.5240378616267886e-07, + "logits/chosen": -57390106.48275862, + "logits/rejected": -98513437.25714286, + "logps/chosen": -210.66638604525863, + "logps/rejected": -133.52818080357142, + "loss": 2.8099, + "rewards/chosen": -0.13680349547287513, + "rewards/margins": 2.162087408544982, + "rewards/rejected": -2.298890904017857, + "step": 501 + }, + { + "epoch": 16.194331983805668, + "grad_norm": 167.0, + "kl": 0.0, + "learning_rate": 1.5000917426086767e-07, + "logits/chosen": -55172727.741935484, + "logits/rejected": -94554538.66666667, + "logps/chosen": -158.20306199596774, + "logps/rejected": -118.24235026041667, + "loss": 2.957, + "rewards/chosen": 0.0011104576049312468, + "rewards/margins": 2.3199692111560677, + "rewards/rejected": -2.3188587535511362, + "step": 502 + }, + { + "epoch": 16.22672064777328, + "grad_norm": 206.0, + "kl": 0.0, + "learning_rate": 1.4763019956201251e-07, + "logits/chosen": -64566582.303030305, + "logits/rejected": -90447368.25806452, + "logps/chosen": -149.30603397253788, + "logps/rejected": -150.93699596774192, + "loss": 2.9521, + "rewards/chosen": -0.45667295744924835, + "rewards/margins": 1.8026683747710137, + "rewards/rejected": -2.259341332220262, + "step": 503 + }, + { + "epoch": 16.25910931174089, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 1.4526696835607088e-07, + "logits/chosen": -52211526.62068965, + "logits/rejected": -100267673.6, + "logps/chosen": -236.86282664331895, + "logps/rejected": -138.3693359375, + "loss": 2.9131, + "rewards/chosen": -0.04743953408866093, + "rewards/margins": 2.4956724305458255, + "rewards/rejected": -2.5431119646344866, + "step": 504 + }, + { + "epoch": 16.291497975708502, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 1.429195862295997e-07, + "logits/chosen": -75706199.57894737, + "logits/rejected": -96506082.46153846, + "logps/chosen": -187.4133943256579, + "logps/rejected": -150.78190730168268, + "loss": 2.9668, + "rewards/chosen": -0.792698107267681, + "rewards/margins": 1.7872576539815679, + "rewards/rejected": -2.579955761249249, + "step": 505 + }, + { + "epoch": 16.323886639676115, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 1.405881580610354e-07, + "logits/chosen": -71461665.39130434, + "logits/rejected": -98170012.09756097, + "logps/chosen": -152.32670261548913, + "logps/rejected": -125.26076600609755, + "loss": 2.754, + "rewards/chosen": -1.1513595581054688, + "rewards/margins": 1.3805577347918256, + "rewards/rejected": -2.5319172928972944, + "step": 506 + }, + { + "epoch": 16.356275303643724, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 1.3827278801600978e-07, + "logits/chosen": -61968288.91428571, + "logits/rejected": -92630086.62068966, + "logps/chosen": -195.53969029017858, + "logps/rejected": -140.6947652882543, + "loss": 3.0028, + "rewards/chosen": -0.21465841020856585, + "rewards/margins": 2.0605449000015637, + "rewards/rejected": -2.2752033102101294, + "step": 507 + }, + { + "epoch": 16.388663967611336, + "grad_norm": 175.0, + "kl": 0.0, + "learning_rate": 1.3597357954269534e-07, + "logits/chosen": -61942454.85714286, + "logits/rejected": -93099264.0, + "logps/chosen": -285.23728724888394, + "logps/rejected": -118.09457736545139, + "loss": 2.9507, + "rewards/chosen": -0.2406529358455113, + "rewards/margins": 1.820625513318985, + "rewards/rejected": -2.0612784491644964, + "step": 508 + }, + { + "epoch": 16.42105263157895, + "grad_norm": 292.0, + "kl": 0.0, + "learning_rate": 1.3369063536718344e-07, + "logits/chosen": -68302644.14814815, + "logits/rejected": -100084355.45945945, + "logps/chosen": -255.68625217013889, + "logps/rejected": -136.66236011402026, + "loss": 2.8921, + "rewards/chosen": -0.6173092877423322, + "rewards/margins": 1.783860360299264, + "rewards/rejected": -2.4011696480415963, + "step": 509 + }, + { + "epoch": 16.453441295546558, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 1.3142405748889457e-07, + "logits/chosen": -53853845.333333336, + "logits/rejected": -97233852.23529412, + "logps/chosen": -255.89596354166667, + "logps/rejected": -112.433349609375, + "loss": 2.8551, + "rewards/chosen": -0.17315131823221844, + "rewards/margins": 2.2018352181303733, + "rewards/rejected": -2.374986536362592, + "step": 510 + }, + { + "epoch": 16.48582995951417, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 1.291739471760212e-07, + "logits/chosen": -58027361.28, + "logits/rejected": -99997787.8974359, + "logps/chosen": -239.58205078125, + "logps/rejected": -150.87414863782053, + "loss": 2.884, + "rewards/chosen": -0.598314323425293, + "rewards/margins": 1.8626936017549953, + "rewards/rejected": -2.4610079251802883, + "step": 511 + }, + { + "epoch": 16.518218623481783, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 1.2694040496100317e-07, + "logits/chosen": -65406844.121212125, + "logits/rejected": -104295952.51612903, + "logps/chosen": -247.16460996685606, + "logps/rejected": -145.9378465221774, + "loss": 2.9182, + "rewards/chosen": -0.43771379644220526, + "rewards/margins": 2.1822079801139944, + "rewards/rejected": -2.6199217765561995, + "step": 512 + }, + { + "epoch": 16.55060728744939, + "grad_norm": 208.0, + "kl": 0.0, + "learning_rate": 1.2472353063603623e-07, + "logits/chosen": -65801112.77419355, + "logits/rejected": -100316315.15151516, + "logps/chosen": -176.78066721270162, + "logps/rejected": -154.41607481060606, + "loss": 2.9716, + "rewards/chosen": -0.23333032669559603, + "rewards/margins": 2.1605609789388738, + "rewards/rejected": -2.3938913056344697, + "step": 513 + }, + { + "epoch": 16.582995951417004, + "grad_norm": 147.0, + "kl": 0.0, + "learning_rate": 1.225234232486127e-07, + "logits/chosen": -59296714.10526316, + "logits/rejected": -97136187.07692307, + "logps/chosen": -248.07570929276315, + "logps/rejected": -134.6264366736779, + "loss": 2.9543, + "rewards/chosen": -0.26280465878938375, + "rewards/margins": 1.802341273921704, + "rewards/rejected": -2.0651459327110877, + "step": 514 + }, + { + "epoch": 16.615384615384617, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 1.2034018109709716e-07, + "logits/chosen": -66955652.0, + "logits/rejected": -107874416.0, + "logps/chosen": -252.43841552734375, + "logps/rejected": -148.32968139648438, + "loss": 2.9283, + "rewards/chosen": -0.20555496215820312, + "rewards/margins": 2.5002758502960205, + "rewards/rejected": -2.7058308124542236, + "step": 515 + }, + { + "epoch": 16.647773279352226, + "grad_norm": 151.0, + "kl": 0.0, + "learning_rate": 1.1817390172633402e-07, + "logits/chosen": -77372818.28571428, + "logits/rejected": -97091725.2413793, + "logps/chosen": -178.37101004464284, + "logps/rejected": -148.0040914601293, + "loss": 2.9075, + "rewards/chosen": -1.1541733877999443, + "rewards/margins": 1.0868344499559823, + "rewards/rejected": -2.2410078377559266, + "step": 516 + }, + { + "epoch": 16.68016194331984, + "grad_norm": 179.0, + "kl": 0.3875166177749634, + "learning_rate": 1.1602468192328934e-07, + "logits/chosen": -68933901.4736842, + "logits/rejected": -98502035.6923077, + "logps/chosen": -212.50601356907896, + "logps/rejected": -131.29400165264423, + "loss": 3.0059, + "rewards/chosen": -0.45846803564774363, + "rewards/margins": 2.089053115381403, + "rewards/rejected": -2.5475211510291467, + "step": 517 + }, + { + "epoch": 16.71255060728745, + "grad_norm": 246.0, + "kl": 0.0, + "learning_rate": 1.1389261771272662e-07, + "logits/chosen": -68786051.28205128, + "logits/rejected": -96301568.0, + "logps/chosen": -156.49189953926282, + "logps/rejected": -116.798447265625, + "loss": 2.933, + "rewards/chosen": -0.4801797133225661, + "rewards/margins": 1.8182107896071216, + "rewards/rejected": -2.2983905029296876, + "step": 518 + }, + { + "epoch": 16.74493927125506, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 1.117778043529164e-07, + "logits/chosen": -72464488.0, + "logits/rejected": -98028480.0, + "logps/chosen": -179.13558959960938, + "logps/rejected": -144.46353149414062, + "loss": 2.9402, + "rewards/chosen": -0.822170078754425, + "rewards/margins": 1.8132795691490173, + "rewards/rejected": -2.6354496479034424, + "step": 519 + }, + { + "epoch": 16.777327935222672, + "grad_norm": 217.0, + "kl": 0.0, + "learning_rate": 1.096803363313803e-07, + "logits/chosen": -66442465.88235294, + "logits/rejected": -93625156.26666667, + "logps/chosen": -220.18663832720588, + "logps/rejected": -133.10751139322917, + "loss": 2.9885, + "rewards/chosen": -0.3168419950148639, + "rewards/margins": 2.097293747172636, + "rewards/rejected": -2.4141357421875, + "step": 520 + }, + { + "epoch": 16.809716599190285, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 1.076003073606695e-07, + "logits/chosen": -81186727.72413793, + "logits/rejected": -98583420.34285714, + "logps/chosen": -201.51794854525863, + "logps/rejected": -147.66849888392858, + "loss": 2.9199, + "rewards/chosen": -1.2685279846191406, + "rewards/margins": 0.9149234226771763, + "rewards/rejected": -2.183451407296317, + "step": 521 + }, + { + "epoch": 16.842105263157894, + "grad_norm": 169.0, + "kl": 0.41440093517303467, + "learning_rate": 1.0553781037417769e-07, + "logits/chosen": -62079276.0, + "logits/rejected": -105588544.0, + "logps/chosen": -258.708251953125, + "logps/rejected": -128.41506958007812, + "loss": 3.0055, + "rewards/chosen": -0.2171132117509842, + "rewards/margins": 2.4005759209394455, + "rewards/rejected": -2.6176891326904297, + "step": 522 + }, + { + "epoch": 16.874493927125506, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 1.034929375219884e-07, + "logits/chosen": -59659783.11111111, + "logits/rejected": -97221988.57142857, + "logps/chosen": -247.32115342881946, + "logps/rejected": -137.59490094866072, + "loss": 3.0208, + "rewards/chosen": -0.11144917541080052, + "rewards/margins": 2.300986628683787, + "rewards/rejected": -2.4124358040945872, + "step": 523 + }, + { + "epoch": 16.90688259109312, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 1.0146578016675933e-07, + "logits/chosen": -58962713.6, + "logits/rejected": -103577193.02564102, + "logps/chosen": -258.52013671875, + "logps/rejected": -155.6429662459936, + "loss": 2.7598, + "rewards/chosen": -0.3941326522827148, + "rewards/margins": 2.4423977886102137, + "rewards/rejected": -2.8365304408929286, + "step": 524 + }, + { + "epoch": 16.939271255060728, + "grad_norm": 184.0, + "kl": 0.0, + "learning_rate": 9.94564288796384e-08, + "logits/chosen": -78155834.51428571, + "logits/rejected": -100666977.10344827, + "logps/chosen": -178.48168247767856, + "logps/rejected": -141.1480334051724, + "loss": 2.9247, + "rewards/chosen": -0.5126292637416294, + "rewards/margins": 1.8757300691651593, + "rewards/rejected": -2.3883593329067887, + "step": 525 + }, + { + "epoch": 16.97165991902834, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 9.746497343621857e-08, + "logits/chosen": -71768225.18518518, + "logits/rejected": -101309571.45945945, + "logps/chosen": -171.5868778935185, + "logps/rejected": -138.16539537584458, + "loss": 2.8693, + "rewards/chosen": -0.6059340017813223, + "rewards/margins": 1.7545971607899404, + "rewards/rejected": -2.3605311625712626, + "step": 526 + }, + { + "epoch": 17.0, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 9.549150281252632e-08, + "logits/chosen": -57630915.368421055, + "logits/rejected": -97753054.60869566, + "logps/chosen": -137.20222553453948, + "logps/rejected": -125.1779148267663, + "loss": 2.9593, + "rewards/chosen": -0.9381364521227384, + "rewards/margins": 1.3214301451938377, + "rewards/rejected": -2.259566597316576, + "step": 527 + }, + { + "epoch": 17.032388663967613, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 9.35361051810461e-08, + "logits/chosen": -62322898.823529415, + "logits/rejected": -96247534.93333334, + "logps/chosen": -215.25043083639707, + "logps/rejected": -113.10892740885417, + "loss": 2.9437, + "rewards/chosen": -0.44693722444422107, + "rewards/margins": 1.4406315672631356, + "rewards/rejected": -1.8875687917073567, + "step": 528 + }, + { + "epoch": 17.06477732793522, + "grad_norm": 185.0, + "kl": 0.0, + "learning_rate": 9.159886790678123e-08, + "logits/chosen": -66784993.88235294, + "logits/rejected": -87290052.26666667, + "logps/chosen": -229.31448184742646, + "logps/rejected": -112.194873046875, + "loss": 2.9611, + "rewards/chosen": -0.5364259832045611, + "rewards/margins": 1.0498971826889933, + "rewards/rejected": -1.5863231658935546, + "step": 529 + }, + { + "epoch": 17.097165991902834, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 8.967987754335022e-08, + "logits/chosen": -67198178.13333334, + "logits/rejected": -103980107.29411764, + "logps/chosen": -220.67874348958333, + "logps/rejected": -135.6750919117647, + "loss": 2.8743, + "rewards/chosen": -0.3590281804402669, + "rewards/margins": 2.256236487743901, + "rewards/rejected": -2.6152646681841683, + "step": 530 + }, + { + "epoch": 17.129554655870447, + "grad_norm": 217.0, + "kl": 0.0, + "learning_rate": 8.777921982911996e-08, + "logits/chosen": -66921275.07692308, + "logits/rejected": -98682398.72, + "logps/chosen": -236.96742287660257, + "logps/rejected": -126.133935546875, + "loss": 3.1426, + "rewards/chosen": -0.2501518298418094, + "rewards/margins": 1.9425443981855346, + "rewards/rejected": -2.192696228027344, + "step": 531 + }, + { + "epoch": 17.161943319838056, + "grad_norm": 162.0, + "kl": 0.0, + "learning_rate": 8.589697968337445e-08, + "logits/chosen": -57470411.03448276, + "logits/rejected": -98188390.4, + "logps/chosen": -210.47636045258622, + "logps/rejected": -133.27004743303573, + "loss": 2.833, + "rewards/chosen": -0.11780128807857118, + "rewards/margins": 2.15527523256875, + "rewards/rejected": -2.2730765206473214, + "step": 532 + }, + { + "epoch": 17.194331983805668, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 8.403324120252159e-08, + "logits/chosen": -55086773.67741936, + "logits/rejected": -94626784.96969697, + "logps/chosen": -158.35012915826613, + "logps/rejected": -118.15443744081439, + "loss": 2.9424, + "rewards/chosen": -0.013595548368269397, + "rewards/margins": 2.29647183074513, + "rewards/rejected": -2.3100673791133994, + "step": 533 + }, + { + "epoch": 17.22672064777328, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 8.218808765633512e-08, + "logits/chosen": -64649712.484848484, + "logits/rejected": -90529139.61290322, + "logps/chosen": -149.3858457623106, + "logps/rejected": -151.01765688004033, + "loss": 2.9468, + "rewards/chosen": -0.46465359312115295, + "rewards/margins": 1.8027549624326524, + "rewards/rejected": -2.2674085555538053, + "step": 534 + }, + { + "epoch": 17.25910931174089, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 8.036160148423449e-08, + "logits/chosen": -52062287.448275864, + "logits/rejected": -100227349.94285715, + "logps/chosen": -236.89537311422413, + "logps/rejected": -138.3841517857143, + "loss": 2.9191, + "rewards/chosen": -0.05069636476450953, + "rewards/margins": 2.4938963563571424, + "rewards/rejected": -2.544592721121652, + "step": 535 + }, + { + "epoch": 17.291497975708502, + "grad_norm": 168.0, + "kl": 0.0, + "learning_rate": 7.85538642916015e-08, + "logits/chosen": -75476372.21052632, + "logits/rejected": -96351783.38461539, + "logps/chosen": -187.63449578536185, + "logps/rejected": -150.68473933293268, + "loss": 2.9639, + "rewards/chosen": -0.8148067875912315, + "rewards/margins": 1.7554326462842191, + "rewards/rejected": -2.5702394338754506, + "step": 536 + }, + { + "epoch": 17.323886639676115, + "grad_norm": 162.0, + "kl": 0.0, + "learning_rate": 7.676495684613432e-08, + "logits/chosen": -71513789.2173913, + "logits/rejected": -98448970.92682926, + "logps/chosen": -152.41903023097825, + "logps/rejected": -125.38232421875, + "loss": 2.7433, + "rewards/chosen": -1.1605935304061226, + "rewards/margins": 1.3834790627245939, + "rewards/rejected": -2.5440725931307164, + "step": 537 + }, + { + "epoch": 17.356275303643724, + "grad_norm": 194.0, + "kl": 0.0, + "learning_rate": 7.499495907423887e-08, + "logits/chosen": -61937064.22857143, + "logits/rejected": -92665714.7586207, + "logps/chosen": -195.5723911830357, + "logps/rejected": -140.7021484375, + "loss": 2.9873, + "rewards/chosen": -0.21792729241507394, + "rewards/margins": 2.0580137013214563, + "rewards/rejected": -2.2759409937365302, + "step": 538 + }, + { + "epoch": 17.388663967611336, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 7.324395005745771e-08, + "logits/chosen": -61821138.28571428, + "logits/rejected": -93124373.33333333, + "logps/chosen": -285.289306640625, + "logps/rejected": -118.33546278211806, + "loss": 2.9449, + "rewards/chosen": -0.2458540712084089, + "rewards/margins": 1.8395124957675026, + "rewards/rejected": -2.0853665669759116, + "step": 539 + }, + { + "epoch": 17.42105263157895, + "grad_norm": 272.0, + "kl": 0.0, + "learning_rate": 7.15120080289368e-08, + "logits/chosen": -68272412.44444445, + "logits/rejected": -100160684.97297297, + "logps/chosen": -255.66140407986111, + "logps/rejected": -136.53081450591216, + "loss": 2.9001, + "rewards/chosen": -0.614824789541739, + "rewards/margins": 1.7731905451288688, + "rewards/rejected": -2.388015334670608, + "step": 540 + }, + { + "epoch": 17.453441295546558, + "grad_norm": 158.0, + "kl": 0.0, + "learning_rate": 6.979921036993041e-08, + "logits/chosen": -53865634.13333333, + "logits/rejected": -97347493.64705883, + "logps/chosen": -256.08116861979164, + "logps/rejected": -112.34694536994485, + "loss": 2.8506, + "rewards/chosen": -0.19167362848917643, + "rewards/margins": 2.1746717209909474, + "rewards/rejected": -2.366345349480124, + "step": 541 + }, + { + "epoch": 17.48582995951417, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 6.810563360634297e-08, + "logits/chosen": -58191329.28, + "logits/rejected": -100113985.64102565, + "logps/chosen": -239.63498046875, + "logps/rejected": -150.86087740384616, + "loss": 2.8875, + "rewards/chosen": -0.6036064910888672, + "rewards/margins": 1.856074897570488, + "rewards/rejected": -2.459681388659355, + "step": 542 + }, + { + "epoch": 17.518218623481783, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 6.643135340531136e-08, + "logits/chosen": -65342572.60606061, + "logits/rejected": -104207797.67741935, + "logps/chosen": -247.26737097537878, + "logps/rejected": -146.08889868951613, + "loss": 2.9088, + "rewards/chosen": -0.44798634269020776, + "rewards/margins": 2.1870396661618585, + "rewards/rejected": -2.6350260088520665, + "step": 543 + }, + { + "epoch": 17.55060728744939, + "grad_norm": 204.0, + "kl": 0.0, + "learning_rate": 6.477644457182274e-08, + "logits/chosen": -65851994.838709675, + "logits/rejected": -100249150.06060606, + "logps/chosen": -176.79646547379033, + "logps/rejected": -154.09309895833334, + "loss": 2.9946, + "rewards/chosen": -0.23491102649319556, + "rewards/margins": 2.126681352990114, + "rewards/rejected": -2.3615923794833096, + "step": 544 + }, + { + "epoch": 17.582995951417004, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 6.314098104537325e-08, + "logits/chosen": -59303397.05263158, + "logits/rejected": -97414084.92307693, + "logps/chosen": -248.18045847039474, + "logps/rejected": -134.71282489483173, + "loss": 2.9371, + "rewards/chosen": -0.27327728271484375, + "rewards/margins": 1.800506298358624, + "rewards/rejected": -2.0737835810734677, + "step": 545 + }, + { + "epoch": 17.615384615384617, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 6.152503589666425e-08, + "logits/chosen": -66938004.0, + "logits/rejected": -107663360.0, + "logps/chosen": -252.8166961669922, + "logps/rejected": -148.14114379882812, + "loss": 2.9093, + "rewards/chosen": -0.24338212609291077, + "rewards/margins": 2.443595737218857, + "rewards/rejected": -2.6869778633117676, + "step": 546 + }, + { + "epoch": 17.647773279352226, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 5.992868132433753e-08, + "logits/chosen": -77400085.94285715, + "logits/rejected": -97167430.62068966, + "logps/chosen": -178.77896205357143, + "logps/rejected": -147.91205886314654, + "loss": 2.9139, + "rewards/chosen": -1.194969940185547, + "rewards/margins": 1.0368344800225617, + "rewards/rejected": -2.2318044202081087, + "step": 547 + }, + { + "epoch": 17.68016194331984, + "grad_norm": 175.0, + "kl": 0.3301295042037964, + "learning_rate": 5.835198865174956e-08, + "logits/chosen": -68776070.73684211, + "logits/rejected": -98493134.76923077, + "logps/chosen": -212.61870374177633, + "logps/rejected": -131.0840125450721, + "loss": 2.9906, + "rewards/chosen": -0.4697403154875103, + "rewards/margins": 2.0567829811621294, + "rewards/rejected": -2.5265232966496396, + "step": 548 + }, + { + "epoch": 17.71255060728745, + "grad_norm": 251.0, + "kl": 0.0, + "learning_rate": 5.6795028323784964e-08, + "logits/chosen": -68724657.23076923, + "logits/rejected": -96095385.6, + "logps/chosen": -156.78529397035257, + "logps/rejected": -116.891171875, + "loss": 2.9246, + "rewards/chosen": -0.5095198704646184, + "rewards/margins": 1.7981426356388972, + "rewards/rejected": -2.3076625061035156, + "step": 549 + }, + { + "epoch": 17.74493927125506, + "grad_norm": 197.0, + "kl": 0.0, + "learning_rate": 5.5257869903709006e-08, + "logits/chosen": -72334520.0, + "logits/rejected": -98095864.0, + "logps/chosen": -179.38539123535156, + "logps/rejected": -144.47195434570312, + "loss": 2.932, + "rewards/chosen": -0.8471524715423584, + "rewards/margins": 1.7891395092010498, + "rewards/rejected": -2.636291980743408, + "step": 550 + }, + { + "epoch": 17.777327935222672, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 5.3740582070059435e-08, + "logits/chosen": -66278377.4117647, + "logits/rejected": -93661149.86666666, + "logps/chosen": -220.12833180147058, + "logps/rejected": -133.17548828125, + "loss": 2.9854, + "rewards/chosen": -0.3110121278201832, + "rewards/margins": 2.1099229307735667, + "rewards/rejected": -2.42093505859375, + "step": 551 + }, + { + "epoch": 17.809716599190285, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 5.224323261357844e-08, + "logits/chosen": -81292279.1724138, + "logits/rejected": -98692937.14285715, + "logps/chosen": -201.70048154633622, + "logps/rejected": -147.60934709821427, + "loss": 2.918, + "rewards/chosen": -1.2867823633654365, + "rewards/margins": 0.8907562631691617, + "rewards/rejected": -2.177538626534598, + "step": 552 + }, + { + "epoch": 17.842105263157894, + "grad_norm": 188.0, + "kl": 0.37376564741134644, + "learning_rate": 5.076588843418345e-08, + "logits/chosen": -62283384.0, + "logits/rejected": -105595032.0, + "logps/chosen": -258.7069396972656, + "logps/rejected": -128.3071746826172, + "loss": 3.0257, + "rewards/chosen": -0.21698012948036194, + "rewards/margins": 2.389920324087143, + "rewards/rejected": -2.606900453567505, + "step": 553 + }, + { + "epoch": 17.874493927125506, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 4.9308615537978214e-08, + "logits/chosen": -59580394.666666664, + "logits/rejected": -96900973.71428572, + "logps/chosen": -247.07424587673611, + "logps/rejected": -137.46269880022322, + "loss": 3.0214, + "rewards/chosen": -0.08675977918836805, + "rewards/margins": 2.312455101618691, + "rewards/rejected": -2.399214880807059, + "step": 554 + }, + { + "epoch": 17.90688259109312, + "grad_norm": 163.0, + "kl": 0.0, + "learning_rate": 4.787147903430383e-08, + "logits/chosen": -58961940.48, + "logits/rejected": -103539252.51282051, + "logps/chosen": -258.60703125, + "logps/rejected": -155.64167668269232, + "loss": 2.7277, + "rewards/chosen": -0.40282066345214845, + "rewards/margins": 2.4335822296142577, + "rewards/rejected": -2.8364028930664062, + "step": 555 + }, + { + "epoch": 17.939271255060728, + "grad_norm": 183.0, + "kl": 0.0, + "learning_rate": 4.645454313282965e-08, + "logits/chosen": -77900734.17142858, + "logits/rejected": -100613261.2413793, + "logps/chosen": -178.32025669642857, + "logps/rejected": -141.02693123653017, + "loss": 2.9295, + "rewards/chosen": -0.49648639133998324, + "rewards/margins": 1.8797624616200113, + "rewards/rejected": -2.3762488529599946, + "step": 556 + }, + { + "epoch": 17.97165991902834, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 4.5057871140684325e-08, + "logits/chosen": -71784485.92592593, + "logits/rejected": -100938316.1081081, + "logps/chosen": -171.49191623263889, + "logps/rejected": -138.17463312922297, + "loss": 2.8601, + "rewards/chosen": -0.596438655146846, + "rewards/margins": 1.7650160765624023, + "rewards/rejected": -2.3614547317092485, + "step": 557 + }, + { + "epoch": 18.0, + "grad_norm": 169.0, + "kl": 0.0, + "learning_rate": 4.368152545962761e-08, + "logits/chosen": -57664821.89473684, + "logits/rejected": -97947981.91304348, + "logps/chosen": -136.99324115953948, + "logps/rejected": -125.08189325747283, + "loss": 2.9319, + "rewards/chosen": -0.917237030832391, + "rewards/margins": 1.3327268124717873, + "rewards/rejected": -2.249963843304178, + "step": 558 + }, + { + "epoch": 18.032388663967613, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 4.232556758326211e-08, + "logits/chosen": -62489682.823529415, + "logits/rejected": -96397380.26666667, + "logps/chosen": -215.28403607536765, + "logps/rejected": -113.0524658203125, + "loss": 2.9712, + "rewards/chosen": -0.45029746784883384, + "rewards/margins": 1.4316262076882755, + "rewards/rejected": -1.8819236755371094, + "step": 559 + }, + { + "epoch": 18.06477732793522, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 4.099005809428596e-08, + "logits/chosen": -66912150.5882353, + "logits/rejected": -87508428.8, + "logps/chosen": -229.46840533088235, + "logps/rejected": -112.22766927083333, + "loss": 2.9529, + "rewards/chosen": -0.5518186232622933, + "rewards/margins": 1.0377846736533969, + "rewards/rejected": -1.5896032969156901, + "step": 560 + }, + { + "epoch": 18.097165991902834, + "grad_norm": 189.0, + "kl": 0.0, + "learning_rate": 3.967505666178555e-08, + "logits/chosen": -67375082.66666667, + "logits/rejected": -103958415.05882353, + "logps/chosen": -220.66917317708334, + "logps/rejected": -135.69921875, + "loss": 2.8372, + "rewards/chosen": -0.35807018280029296, + "rewards/margins": 2.259607842389275, + "rewards/rejected": -2.617678025189568, + "step": 561 + }, + { + "epoch": 18.129554655870447, + "grad_norm": 219.0, + "kl": 0.0, + "learning_rate": 3.8380622038570734e-08, + "logits/chosen": -66868309.333333336, + "logits/rejected": -98590597.12, + "logps/chosen": -237.05105669070514, + "logps/rejected": -126.12166015625, + "loss": 3.1264, + "rewards/chosen": -0.2585125947609926, + "rewards/margins": 1.9329551481589293, + "rewards/rejected": -2.191467742919922, + "step": 562 + }, + { + "epoch": 18.161943319838056, + "grad_norm": 147.0, + "kl": 0.0, + "learning_rate": 3.7106812058548375e-08, + "logits/chosen": -57360220.68965517, + "logits/rejected": -98360122.51428571, + "logps/chosen": -210.47285829741378, + "logps/rejected": -133.51815011160716, + "loss": 2.8319, + "rewards/chosen": -0.11745032770880337, + "rewards/margins": 2.1804376379022457, + "rewards/rejected": -2.297887965611049, + "step": 563 + }, + { + "epoch": 18.194331983805668, + "grad_norm": 171.0, + "kl": 0.0, + "learning_rate": 3.5853683634139434e-08, + "logits/chosen": -55154473.29032258, + "logits/rejected": -94562575.51515152, + "logps/chosen": -158.26022240423387, + "logps/rejected": -118.30360736268939, + "loss": 2.9236, + "rewards/chosen": -0.004606052752464048, + "rewards/margins": 2.320378179832171, + "rewards/rejected": -2.3249842325846353, + "step": 564 + }, + { + "epoch": 18.22672064777328, + "grad_norm": 210.0, + "kl": 0.0, + "learning_rate": 3.4621292753735765e-08, + "logits/chosen": -64493164.60606061, + "logits/rejected": -90325512.25806452, + "logps/chosen": -149.2129941998106, + "logps/rejected": -151.09390751008064, + "loss": 2.9445, + "rewards/chosen": -0.447367812647964, + "rewards/margins": 1.8276637385900654, + "rewards/rejected": -2.2750315512380292, + "step": 565 + }, + { + "epoch": 18.25910931174089, + "grad_norm": 170.0, + "kl": 0.0, + "learning_rate": 3.3409694479198727e-08, + "logits/chosen": -51881529.37931035, + "logits/rejected": -100193455.54285714, + "logps/chosen": -236.85349878771552, + "logps/rejected": -138.38702566964287, + "loss": 2.9358, + "rewards/chosen": -0.04651077040310564, + "rewards/margins": 2.4983694699010237, + "rewards/rejected": -2.5448802403041295, + "step": 566 + }, + { + "epoch": 18.291497975708502, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 3.2218942943399105e-08, + "logits/chosen": -75596227.36842105, + "logits/rejected": -96579347.6923077, + "logps/chosen": -187.51459703947367, + "logps/rejected": -150.71299391526443, + "loss": 2.9537, + "rewards/chosen": -0.8028179469861483, + "rewards/margins": 1.7702478841248794, + "rewards/rejected": -2.5730658311110277, + "step": 567 + }, + { + "epoch": 18.323886639676115, + "grad_norm": 188.0, + "kl": 0.0, + "learning_rate": 3.104909134779821e-08, + "logits/chosen": -71561800.3478261, + "logits/rejected": -98362205.65853658, + "logps/chosen": -152.42496390964675, + "logps/rejected": -125.33241234756098, + "loss": 2.7637, + "rewards/chosen": -1.16118688168733, + "rewards/margins": 1.377894970923188, + "rewards/rejected": -2.539081852610518, + "step": 568 + }, + { + "epoch": 18.356275303643724, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 2.990019196007154e-08, + "logits/chosen": -61929976.68571428, + "logits/rejected": -92608617.93103448, + "logps/chosen": -195.66780133928572, + "logps/rejected": -140.78766500538794, + "loss": 2.987, + "rewards/chosen": -0.227470098223005, + "rewards/margins": 2.0570234467830564, + "rewards/rejected": -2.2844935450060615, + "step": 569 + }, + { + "epoch": 18.388663967611336, + "grad_norm": 187.0, + "kl": 0.0, + "learning_rate": 2.8772296111772677e-08, + "logits/chosen": -62021385.14285714, + "logits/rejected": -93179377.77777778, + "logps/chosen": -285.5918666294643, + "logps/rejected": -118.30512152777777, + "loss": 2.9572, + "rewards/chosen": -0.27610950810568674, + "rewards/margins": 1.806223738761175, + "rewards/rejected": -2.082333246866862, + "step": 570 + }, + { + "epoch": 18.42105263157895, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 2.766545419604066e-08, + "logits/chosen": -68372252.44444445, + "logits/rejected": -100200434.16216215, + "logps/chosen": -255.88917824074073, + "logps/rejected": -136.4621779983108, + "loss": 2.8812, + "rewards/chosen": -0.6375995212131076, + "rewards/margins": 1.743552038977454, + "rewards/rejected": -2.3811515601905615, + "step": 571 + }, + { + "epoch": 18.453441295546558, + "grad_norm": 150.0, + "kl": 0.0, + "learning_rate": 2.657971566534789e-08, + "logits/chosen": -53812787.2, + "logits/rejected": -97560244.70588236, + "logps/chosen": -255.74368489583333, + "logps/rejected": -112.67920639935662, + "loss": 2.8599, + "rewards/chosen": -0.15792407989501953, + "rewards/margins": 2.241647955950569, + "rewards/rejected": -2.3995720358455883, + "step": 572 + }, + { + "epoch": 18.48582995951417, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 2.5515129029290984e-08, + "logits/chosen": -58096752.64, + "logits/rejected": -100098743.79487179, + "logps/chosen": -239.57787109375, + "logps/rejected": -150.77488982371796, + "loss": 2.9133, + "rewards/chosen": -0.5978978729248047, + "rewards/margins": 1.85318479684683, + "rewards/rejected": -2.4510826697716346, + "step": 573 + }, + { + "epoch": 18.518218623481783, + "grad_norm": 166.0, + "kl": 0.0, + "learning_rate": 2.4471741852423233e-08, + "logits/chosen": -65436291.878787875, + "logits/rejected": -104457348.12903225, + "logps/chosen": -247.24650804924244, + "logps/rejected": -146.01524697580646, + "loss": 2.9285, + "rewards/chosen": -0.4459011193477746, + "rewards/margins": 2.1817608010151286, + "rewards/rejected": -2.627661920362903, + "step": 574 + }, + { + "epoch": 18.55060728744939, + "grad_norm": 213.0, + "kl": 0.0, + "learning_rate": 2.3449600752129596e-08, + "logits/chosen": -65541793.03225806, + "logits/rejected": -100370975.03030303, + "logps/chosen": -176.71548954133064, + "logps/rejected": -154.4198330965909, + "loss": 2.9989, + "rewards/chosen": -0.22681243958011751, + "rewards/margins": 2.1674554807116677, + "rewards/rejected": -2.394267920291785, + "step": 575 + }, + { + "epoch": 18.582995951417004, + "grad_norm": 173.0, + "kl": 0.0, + "learning_rate": 2.2448751396543786e-08, + "logits/chosen": -59313367.578947365, + "logits/rejected": -97213942.15384616, + "logps/chosen": -248.3287931743421, + "logps/rejected": -134.81012432391827, + "loss": 2.944, + "rewards/chosen": -0.28811131025615494, + "rewards/margins": 1.795403563541922, + "rewards/rejected": -2.083514873798077, + "step": 576 + }, + { + "epoch": 18.615384615384617, + "grad_norm": 164.0, + "kl": 0.0, + "learning_rate": 2.1469238502507926e-08, + "logits/chosen": -67056376.0, + "logits/rejected": -107874520.0, + "logps/chosen": -252.60391235351562, + "logps/rejected": -148.420654296875, + "loss": 2.9308, + "rewards/chosen": -0.2221033275127411, + "rewards/margins": 2.492825537919998, + "rewards/rejected": -2.7149288654327393, + "step": 577 + }, + { + "epoch": 18.647773279352226, + "grad_norm": 167.0, + "kl": 0.0, + "learning_rate": 2.0511105833574684e-08, + "logits/chosen": -77421992.22857143, + "logits/rejected": -97286479.44827586, + "logps/chosen": -178.56407645089286, + "logps/rejected": -147.943359375, + "loss": 2.9163, + "rewards/chosen": -1.1734817504882813, + "rewards/margins": 1.0614525630556304, + "rewards/rejected": -2.2349343135439117, + "step": 578 + }, + { + "epoch": 18.68016194331984, + "grad_norm": 187.0, + "kl": 0.39462482929229736, + "learning_rate": 1.9574396198051958e-08, + "logits/chosen": -68922650.94736843, + "logits/rejected": -98604278.15384616, + "logps/chosen": -212.4785284745066, + "logps/rejected": -131.06280048076923, + "loss": 3.0108, + "rewards/chosen": -0.4557186427869295, + "rewards/margins": 2.068681921553515, + "rewards/rejected": -2.5244005643404446, + "step": 579 + }, + { + "epoch": 18.71255060728745, + "grad_norm": 239.0, + "kl": 0.0, + "learning_rate": 1.865915144708985e-08, + "logits/chosen": -68730748.71794872, + "logits/rejected": -96218624.0, + "logps/chosen": -156.55384865785257, + "logps/rejected": -117.003486328125, + "loss": 2.8854, + "rewards/chosen": -0.4863753196520683, + "rewards/margins": 1.832518875904572, + "rewards/rejected": -2.3188941955566404, + "step": 580 + }, + { + "epoch": 18.74493927125506, + "grad_norm": 207.0, + "kl": 0.0, + "learning_rate": 1.776541247281177e-08, + "logits/chosen": -72270192.0, + "logits/rejected": -98189624.0, + "logps/chosen": -179.48814392089844, + "logps/rejected": -144.32000732421875, + "loss": 2.9355, + "rewards/chosen": -0.8574260473251343, + "rewards/margins": 1.763670802116394, + "rewards/rejected": -2.6210968494415283, + "step": 581 + }, + { + "epoch": 18.777327935222672, + "grad_norm": 190.0, + "kl": 0.0, + "learning_rate": 1.6893219206486232e-08, + "logits/chosen": -66197511.52941176, + "logits/rejected": -93673710.93333334, + "logps/chosen": -219.97254136029412, + "logps/rejected": -133.07320963541667, + "loss": 2.9689, + "rewards/chosen": -0.29543253954719095, + "rewards/margins": 2.1152740441116635, + "rewards/rejected": -2.4107065836588544, + "step": 582 + }, + { + "epoch": 18.809716599190285, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 1.604261061674378e-08, + "logits/chosen": -81222991.44827586, + "logits/rejected": -98668814.62857144, + "logps/chosen": -201.54125134698276, + "logps/rejected": -147.55199497767856, + "loss": 2.9148, + "rewards/chosen": -1.2708594223548626, + "rewards/margins": 0.9009423354576376, + "rewards/rejected": -2.1718017578125, + "step": 583 + }, + { + "epoch": 18.842105263157894, + "grad_norm": 173.0, + "kl": 0.49798399209976196, + "learning_rate": 1.521362470783527e-08, + "logits/chosen": -62185016.0, + "logits/rejected": -105551344.0, + "logps/chosen": -258.7596740722656, + "logps/rejected": -128.31753540039062, + "loss": 3.0035, + "rewards/chosen": -0.22225487232208252, + "rewards/margins": 2.3856805562973022, + "rewards/rejected": -2.6079354286193848, + "step": 584 + }, + { + "epoch": 18.874493927125506, + "grad_norm": 198.0, + "kl": 0.0, + "learning_rate": 1.4406298517934067e-08, + "logits/chosen": -59548643.55555555, + "logits/rejected": -96952009.14285715, + "logps/chosen": -247.26814778645834, + "logps/rejected": -137.45335170200892, + "loss": 3.0096, + "rewards/chosen": -0.10614852772818671, + "rewards/margins": 2.2921317522487943, + "rewards/rejected": -2.398280279976981, + "step": 585 + }, + { + "epoch": 18.90688259109312, + "grad_norm": 179.0, + "kl": 0.0, + "learning_rate": 1.3620668117481471e-08, + "logits/chosen": -58998584.32, + "logits/rejected": -103587006.35897435, + "logps/chosen": -258.81755859375, + "logps/rejected": -155.08145532852564, + "loss": 2.7471, + "rewards/chosen": -0.4238716125488281, + "rewards/margins": 2.356509223351112, + "rewards/rejected": -2.78038083589994, + "step": 586 + }, + { + "epoch": 18.939271255060728, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 1.2856768607574564e-08, + "logits/chosen": -77692064.91428572, + "logits/rejected": -100637643.03448276, + "logps/chosen": -178.57513950892857, + "logps/rejected": -141.15901131465517, + "loss": 2.9353, + "rewards/chosen": -0.5219748360770089, + "rewards/margins": 1.8674815511468597, + "rewards/rejected": -2.3894563872238685, + "step": 587 + }, + { + "epoch": 18.97165991902834, + "grad_norm": 178.0, + "kl": 0.0, + "learning_rate": 1.2114634118398636e-08, + "logits/chosen": -71724122.07407407, + "logits/rejected": -101234425.08108108, + "logps/chosen": -171.4249312789352, + "logps/rejected": -138.18636507601352, + "loss": 2.8538, + "rewards/chosen": -0.5897406118887442, + "rewards/margins": 1.7728880155790558, + "rewards/rejected": -2.3626286274678, + "step": 588 + }, + { + "epoch": 19.0, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 1.1394297807701736e-08, + "logits/chosen": -57727393.684210524, + "logits/rejected": -98007841.39130434, + "logps/chosen": -137.22663959703948, + "logps/rejected": -125.0175144361413, + "loss": 2.9374, + "rewards/chosen": -0.94057886224044, + "rewards/margins": 1.3029475964997945, + "rewards/rejected": -2.2435264587402344, + "step": 589 + }, + { + "epoch": 19.032388663967613, + "grad_norm": 223.0, + "kl": 0.0, + "learning_rate": 1.0695791859313297e-08, + "logits/chosen": -62233856.0, + "logits/rejected": -96324292.26666667, + "logps/chosen": -215.00531364889707, + "logps/rejected": -113.19478352864583, + "loss": 2.9551, + "rewards/chosen": -0.4224257749669692, + "rewards/margins": 1.4737296459721585, + "rewards/rejected": -1.8961554209391276, + "step": 590 + }, + { + "epoch": 19.06477732793522, + "grad_norm": 193.0, + "kl": 0.0, + "learning_rate": 1.0019147481706625e-08, + "logits/chosen": -66884080.941176474, + "logits/rejected": -87609847.46666667, + "logps/chosen": -229.33608111213235, + "logps/rejected": -112.12271321614584, + "loss": 2.97, + "rewards/chosen": -0.5385867287130917, + "rewards/margins": 1.040519920049929, + "rewards/rejected": -1.5791066487630208, + "step": 591 + }, + { + "epoch": 19.097165991902834, + "grad_norm": 199.0, + "kl": 0.0, + "learning_rate": 9.364394906603901e-09, + "logits/chosen": -67289881.6, + "logits/rejected": -103940103.52941176, + "logps/chosen": -220.681787109375, + "logps/rejected": -135.76351390165442, + "loss": 2.8715, + "rewards/chosen": -0.3593327840169271, + "rewards/margins": 2.26477435242896, + "rewards/rejected": -2.624107136445887, + "step": 592 + }, + { + "epoch": 19.129554655870447, + "grad_norm": 224.0, + "kl": 0.0, + "learning_rate": 8.731563387626096e-09, + "logits/chosen": -66846457.43589743, + "logits/rejected": -98716825.6, + "logps/chosen": -237.10003505608975, + "logps/rejected": -126.0630078125, + "loss": 3.1252, + "rewards/chosen": -0.26341386941763073, + "rewards/margins": 1.9221888527503381, + "rewards/rejected": -2.185602722167969, + "step": 593 + }, + { + "epoch": 19.161943319838056, + "grad_norm": 172.0, + "kl": 0.0, + "learning_rate": 8.12068119898529e-09, + "logits/chosen": -57211475.862068966, + "logits/rejected": -98426411.88571429, + "logps/chosen": -210.41665544181035, + "logps/rejected": -133.2943359375, + "loss": 2.8466, + "rewards/chosen": -0.11182959326382341, + "rewards/margins": 2.163675690636846, + "rewards/rejected": -2.2755052839006695, + "step": 594 + }, + { + "epoch": 19.194331983805668, + "grad_norm": 177.0, + "kl": 0.0, + "learning_rate": 7.531775634222137e-09, + "logits/chosen": -54971697.548387095, + "logits/rejected": -94485984.96969697, + "logps/chosen": -158.12134576612902, + "logps/rejected": -118.03771602746212, + "loss": 2.9513, + "rewards/chosen": 0.009283790665288125, + "rewards/margins": 2.3076795833085173, + "rewards/rejected": -2.298395792643229, + "step": 595 + }, + { + "epoch": 19.22672064777328, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 6.964873004985716e-09, + "logits/chosen": -64525327.515151516, + "logits/rejected": -90796213.67741935, + "logps/chosen": -149.28101325757575, + "logps/rejected": -150.80052923387098, + "loss": 2.9345, + "rewards/chosen": -0.4541704004461115, + "rewards/margins": 1.7915246521622554, + "rewards/rejected": -2.245695052608367, + "step": 596 + }, + { + "epoch": 19.25910931174089, + "grad_norm": 176.0, + "kl": 0.0, + "learning_rate": 6.419998639858537e-09, + "logits/chosen": -52022404.4137931, + "logits/rejected": -100259540.11428571, + "logps/chosen": -237.0428845635776, + "logps/rejected": -138.39637276785714, + "loss": 2.9298, + "rewards/chosen": -0.06544678375638764, + "rewards/margins": 2.4803679483864696, + "rewards/rejected": -2.545814732142857, + "step": 597 + }, + { + "epoch": 19.291497975708502, + "grad_norm": 160.0, + "kl": 0.0, + "learning_rate": 5.897176883224442e-09, + "logits/chosen": -75506405.05263157, + "logits/rejected": -96458377.84615384, + "logps/chosen": -187.66165964226974, + "logps/rejected": -150.69378192608173, + "loss": 2.9687, + "rewards/chosen": -0.8175224504972759, + "rewards/margins": 1.7536219469448815, + "rewards/rejected": -2.5711443974421573, + "step": 598 + }, + { + "epoch": 19.323886639676115, + "grad_norm": 196.0, + "kl": 0.0, + "learning_rate": 5.396431094181197e-09, + "logits/chosen": -71299483.82608695, + "logits/rejected": -98316450.34146342, + "logps/chosen": -152.37476647418478, + "logps/rejected": -125.3078672827744, + "loss": 2.7654, + "rewards/chosen": -1.1561669059421704, + "rewards/margins": 1.3804597702916328, + "rewards/rejected": -2.5366266762338032, + "step": 599 + }, + { + "epoch": 19.356275303643724, + "grad_norm": 182.0, + "kl": 0.0, + "learning_rate": 4.917783645496887e-09, + "logits/chosen": -61824577.82857143, + "logits/rejected": -92605651.86206897, + "logps/chosen": -195.48423549107142, + "logps/rejected": -140.68437668372846, + "loss": 2.9994, + "rewards/chosen": -0.2091134752546038, + "rewards/margins": 2.065050132638715, + "rewards/rejected": -2.274163607893319, + "step": 600 + }, + { + "epoch": 19.356275303643724, + "eval_kl": 0.0, + "eval_logits/chosen": -80932414.38872105, + "eval_logits/rejected": -123042566.97477296, + "eval_logps/chosen": -211.94826283987916, + "eval_logps/rejected": -135.2585141271443, + "eval_loss": 0.2648468315601349, + "eval_rewards/chosen": -0.4634648041662733, + "eval_rewards/margins": 1.9077257644670516, + "eval_rewards/rejected": -2.371190568633325, + "eval_runtime": 64.2068, + "eval_samples_per_second": 30.698, + "eval_steps_per_second": 0.966, + "step": 600 + }, + { + "epoch": 19.388663967611336, + "grad_norm": 202.0, + "kl": 0.0, + "learning_rate": 4.461255922609985e-09, + "logits/chosen": -61884233.14285714, + "logits/rejected": -93251349.33333333, + "logps/chosen": -285.1388462611607, + "logps/rejected": -118.31480577256944, + "loss": 2.9312, + "rewards/chosen": -0.23080904143197195, + "rewards/margins": 1.8524912311917259, + "rewards/rejected": -2.0833002726236978, + "step": 601 + }, + { + "epoch": 19.42105263157895, + "grad_norm": 180.0, + "kl": 0.0, + "learning_rate": 4.026868322674126e-09, + "logits/chosen": -68158340.74074075, + "logits/rejected": -100057357.83783785, + "logps/chosen": -255.65849247685185, + "logps/rejected": -136.62403663429055, + "loss": 2.8616, + "rewards/chosen": -0.6145321881329572, + "rewards/margins": 1.7828052356555775, + "rewards/rejected": -2.3973374237885348, + "step": 602 + }, + { + "epoch": 19.453441295546558, + "grad_norm": 158.0, + "kl": 0.0, + "learning_rate": 3.614640253646828e-09, + "logits/chosen": -53845440.0, + "logits/rejected": -97514646.58823529, + "logps/chosen": -255.58486328125, + "logps/rejected": -112.47178021599265, + "loss": 2.8591, + "rewards/chosen": -0.14204190572102865, + "rewards/margins": 2.236787152757832, + "rewards/rejected": -2.3788290584788605, + "step": 603 + }, + { + "epoch": 19.48582995951417, + "grad_norm": 195.0, + "kl": 0.0, + "learning_rate": 3.224590133422189e-09, + "logits/chosen": -58276469.76, + "logits/rejected": -100134688.82051282, + "logps/chosen": -239.5990625, + "logps/rejected": -150.77152193509616, + "loss": 2.911, + "rewards/chosen": -0.60001708984375, + "rewards/margins": 1.850728712815505, + "rewards/rejected": -2.450745802659255, + "step": 604 + }, + { + "epoch": 19.518218623481783, + "grad_norm": 185.0, + "kl": 0.0, + "learning_rate": 2.856735389008269e-09, + "logits/chosen": -65148276.36363637, + "logits/rejected": -104359539.61290322, + "logps/chosen": -247.1654385653409, + "logps/rejected": -145.9519279233871, + "loss": 2.8979, + "rewards/chosen": -0.43779532114664715, + "rewards/margins": 2.1835324789888118, + "rewards/rejected": -2.6213278001354587, + "step": 605 + }, + { + "epoch": 19.55060728744939, + "grad_norm": 227.0, + "kl": 0.0, + "learning_rate": 2.511092455747932e-09, + "logits/chosen": -65956707.09677419, + "logits/rejected": -100433291.63636364, + "logps/chosen": -176.9268995715726, + "logps/rejected": -154.3150301846591, + "loss": 3.0174, + "rewards/chosen": -0.24795398404521327, + "rewards/margins": 2.135832072935729, + "rewards/rejected": -2.3837860569809424, + "step": 606 + }, + { + "epoch": 19.582995951417004, + "grad_norm": 153.0, + "kl": 0.0, + "learning_rate": 2.1876767765853233e-09, + "logits/chosen": -59242549.89473684, + "logits/rejected": -97430350.76923077, + "logps/chosen": -248.02847450657896, + "logps/rejected": -134.95637394831732, + "loss": 2.9473, + "rewards/chosen": -0.2580819380910773, + "rewards/margins": 1.84005701783215, + "rewards/rejected": -2.0981389559232273, + "step": 607 + }, + { + "epoch": 19.615384615384617, + "grad_norm": 159.0, + "kl": 0.0, + "learning_rate": 1.886502801375145e-09, + "logits/chosen": -66908516.0, + "logits/rejected": -107828312.0, + "logps/chosen": -252.5504150390625, + "logps/rejected": -148.5430450439453, + "loss": 2.9293, + "rewards/chosen": -0.21675539016723633, + "rewards/margins": 2.51041316986084, + "rewards/rejected": -2.727168560028076, + "step": 608 + }, + { + "epoch": 19.647773279352226, + "grad_norm": 162.0, + "kl": 0.0, + "learning_rate": 1.6075839862374486e-09, + "logits/chosen": -77311590.4, + "logits/rejected": -97191715.31034483, + "logps/chosen": -178.54609375, + "logps/rejected": -148.06206223060346, + "loss": 2.9278, + "rewards/chosen": -1.171681431361607, + "rewards/margins": 1.0751242200729296, + "rewards/rejected": -2.2468056514345367, + "step": 609 + }, + { + "epoch": 19.68016194331984, + "grad_norm": 190.0, + "kl": 0.36493146419525146, + "learning_rate": 1.350932792956394e-09, + "logits/chosen": -68956570.94736843, + "logits/rejected": -98587549.53846154, + "logps/chosen": -212.4560418379934, + "logps/rejected": -131.00518329326923, + "loss": 2.9961, + "rewards/chosen": -0.4534718362908614, + "rewards/margins": 2.065167654863736, + "rewards/rejected": -2.5186394911545973, + "step": 610 + }, + { + "epoch": 19.71255060728745, + "grad_norm": 221.0, + "kl": 0.0, + "learning_rate": 1.116560688423418e-09, + "logits/chosen": -68719432.20512821, + "logits/rejected": -96288501.76, + "logps/chosen": -156.50179036458334, + "logps/rejected": -116.9461328125, + "loss": 2.8878, + "rewards/chosen": -0.48116830679086536, + "rewards/margins": 1.831990720308744, + "rewards/rejected": -2.3131590270996094, + "step": 611 + }, + { + "epoch": 19.74493927125506, + "grad_norm": 225.0, + "kl": 0.0, + "learning_rate": 9.044781441249205e-10, + "logits/chosen": -72238784.0, + "logits/rejected": -98076880.0, + "logps/chosen": -179.20614624023438, + "logps/rejected": -144.458740234375, + "loss": 2.931, + "rewards/chosen": -0.829226016998291, + "rewards/margins": 1.8057453632354736, + "rewards/rejected": -2.6349713802337646, + "step": 612 + }, + { + "epoch": 19.777327935222672, + "grad_norm": 228.0, + "kl": 0.0, + "learning_rate": 7.146946356743067e-10, + "logits/chosen": -66387584.0, + "logits/rejected": -93888068.26666667, + "logps/chosen": -220.02590762867646, + "logps/rejected": -132.79898274739583, + "loss": 2.9749, + "rewards/chosen": -0.3007695815142463, + "rewards/margins": 2.082514415067785, + "rewards/rejected": -2.383283996582031, + "step": 613 + }, + { + "epoch": 19.809716599190285, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 5.472186423889358e-10, + "logits/chosen": -81260146.7586207, + "logits/rejected": -98581621.02857143, + "logps/chosen": -201.7942483836207, + "logps/rejected": -147.68328683035713, + "loss": 2.9374, + "rewards/chosen": -1.2961596784920528, + "rewards/margins": 0.8887740111703357, + "rewards/rejected": -2.1849336896623885, + "step": 614 + }, + { + "epoch": 19.842105263157894, + "grad_norm": 187.0, + "kl": 0.4474652409553528, + "learning_rate": 4.020576469108139e-10, + "logits/chosen": -61966072.0, + "logits/rejected": -105689056.0, + "logps/chosen": -258.4248046875, + "logps/rejected": -128.25125122070312, + "loss": 3.0108, + "rewards/chosen": -0.18876656889915466, + "rewards/margins": 2.412539392709732, + "rewards/rejected": -2.6013059616088867, + "step": 615 + }, + { + "epoch": 19.874493927125506, + "grad_norm": 201.0, + "kl": 0.0, + "learning_rate": 2.7921813487269407e-10, + "logits/chosen": -59527402.666666664, + "logits/rejected": -96933705.14285715, + "logps/chosen": -247.00466579861111, + "logps/rejected": -137.4833984375, + "loss": 3.0069, + "rewards/chosen": -0.07980091041988796, + "rewards/margins": 2.321484261088901, + "rewards/rejected": -2.401285171508789, + "step": 616 + }, + { + "epoch": 19.90688259109312, + "grad_norm": 165.0, + "kl": 0.0, + "learning_rate": 1.787055946081417e-10, + "logits/chosen": -59101905.92, + "logits/rejected": -103402528.82051282, + "logps/chosen": -258.68126953125, + "logps/rejected": -155.44696514423077, + "loss": 2.7413, + "rewards/chosen": -0.4102425003051758, + "rewards/margins": 2.406688612913474, + "rewards/rejected": -2.81693111321865, + "step": 617 + }, + { + "epoch": 19.939271255060728, + "grad_norm": 191.0, + "kl": 0.0, + "learning_rate": 1.0052451690617525e-10, + "logits/chosen": -77806496.91428572, + "logits/rejected": -100760434.7586207, + "logps/chosen": -178.48556082589286, + "logps/rejected": -141.04423154633622, + "loss": 2.9302, + "rewards/chosen": -0.5130174364362444, + "rewards/margins": 1.864962499834634, + "rewards/rejected": -2.3779799362708784, + "step": 618 + }, + { + "epoch": 19.97165991902834, + "grad_norm": 186.0, + "kl": 0.0, + "learning_rate": 4.46783948109819e-11, + "logits/chosen": -71679924.14814815, + "logits/rejected": -101188552.64864865, + "logps/chosen": -171.4169017650463, + "logps/rejected": -138.0185942778716, + "loss": 2.8654, + "rewards/chosen": -0.58893797132704, + "rewards/margins": 1.7569142361660977, + "rewards/rejected": -2.3458522074931376, + "step": 619 + }, + { + "epoch": 20.0, + "grad_norm": 181.0, + "kl": 0.0, + "learning_rate": 1.1169723465487279e-11, + "logits/chosen": -57547904.0, + "logits/rejected": -97859750.95652173, + "logps/chosen": -137.3400750411184, + "logps/rejected": -125.08204186480978, + "loss": 2.9397, + "rewards/chosen": -0.9519195556640625, + "rewards/margins": 1.298059214716372, + "rewards/rejected": -2.2499787703804346, + "step": 620 + } + ], + "logging_steps": 1, + "max_steps": 620, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 20, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}