{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1089, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "logps_train/chosen": -68.20826721191406, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -100.6875, "logps_train/rejected": -100.67741394042969, "rewards_train/accuracies": 0.546875, "rewards_train/chosen": 0.00036479232949204743, "rewards_train/margins": 0.0001866817765403539, "rewards_train/rejected": 0.00017811055295169353, "step": 0 }, { "epoch": 0.0, "learning_rate": 7.272727272727272e-09, "loss": 0.6931, "step": 1 }, { "epoch": 0.0, "logps_train/chosen": -68.90934753417969, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -103.70527648925781, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0006671309238299727, "rewards_train/margins": -0.0005823791725561023, "rewards_train/rejected": 0.001249510096386075, "step": 1 }, { "epoch": 0.01, "learning_rate": 1.4545454545454544e-08, "loss": 0.6934, "step": 2 }, { "epoch": 0.01, "logps_train/chosen": -68.79719543457031, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -103.63449096679688, "rewards_train/accuracies": 0.515625, "rewards_train/chosen": -0.0003254711045883596, "rewards_train/margins": -0.0006948648660909384, "rewards_train/rejected": 0.00036939376150257885, "step": 2 }, { "epoch": 0.01, "learning_rate": 2.1818181818181816e-08, "loss": 0.6935, "step": 3 }, { "epoch": 0.01, "logps_train/chosen": -69.8244400024414, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -106.37655639648438, "rewards_train/accuracies": 0.421875, "rewards_train/chosen": -0.0005594193935394287, "rewards_train/margins": -0.0007455529121216387, "rewards_train/rejected": 0.00018613351858220994, "step": 3 }, { "epoch": 0.01, "learning_rate": 2.9090909090909088e-08, "loss": 0.6935, "step": 4 }, { "epoch": 0.01, "logps_train/chosen": -68.92247009277344, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -108.00054931640625, "rewards_train/accuracies": 0.34375, "rewards_train/chosen": -0.0007927060942165554, "rewards_train/margins": -0.001128172967582941, "rewards_train/rejected": 0.0003354668733663857, "step": 4 }, { "epoch": 0.01, "learning_rate": 3.636363636363636e-08, "loss": 0.6937, "step": 5 }, { "epoch": 0.01, "logps_train/chosen": -70.27412414550781, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -103.69878387451172, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.0004679679695982486, "rewards_train/margins": -0.0015774848579894751, "rewards_train/rejected": 0.0020454528275877237, "step": 5 }, { "epoch": 0.02, "learning_rate": 4.363636363636363e-08, "loss": 0.694, "step": 6 }, { "epoch": 0.02, "logps_train/chosen": -68.1624984741211, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -103.00569152832031, "rewards_train/accuracies": 0.5, "rewards_train/chosen": 0.00040093655115924776, "rewards_train/margins": 0.0005797266931040213, "rewards_train/rejected": -0.0001787901419447735, "step": 6 }, { "epoch": 0.02, "learning_rate": 5.09090909090909e-08, "loss": 0.6929, "step": 7 }, { "epoch": 0.02, "logps_train/chosen": -70.56243896484375, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -105.72572326660156, "rewards_train/accuracies": 0.5, "rewards_train/chosen": -0.0002870082971639931, "rewards_train/margins": 2.024171408265829e-05, "rewards_train/rejected": -0.0003072500112466514, "step": 7 }, { "epoch": 0.02, "learning_rate": 5.8181818181818176e-08, "loss": 0.6931, "step": 8 }, { "epoch": 0.02, "logps_train/chosen": -70.049560546875, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -104.95716857910156, "rewards_train/accuracies": 0.40625, "rewards_train/chosen": -0.0002193212858401239, "rewards_train/margins": -0.0009871602524071932, "rewards_train/rejected": 0.0007678389665670693, "step": 8 }, { "epoch": 0.02, "learning_rate": 6.545454545454545e-08, "loss": 0.6936, "step": 9 }, { "epoch": 0.02, "logps_train/chosen": -71.70824432373047, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -108.16968536376953, "rewards_train/accuracies": 0.515625, "rewards_train/chosen": 0.000513565493747592, "rewards_train/margins": 0.000489652156829834, "rewards_train/rejected": 2.3913336917757988e-05, "step": 9 }, { "epoch": 0.03, "learning_rate": 7.272727272727273e-08, "loss": 0.6929, "step": 10 }, { "epoch": 0.03, "logps_train/chosen": -71.06797790527344, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -105.80705261230469, "rewards_train/accuracies": 0.59375, "rewards_train/chosen": 0.0010151267051696777, "rewards_train/margins": 0.0016916334861889482, "rewards_train/rejected": -0.0006765067810192704, "step": 10 }, { "epoch": 0.03, "learning_rate": 8e-08, "loss": 0.6923, "step": 11 }, { "epoch": 0.03, "logps_train/chosen": -68.51644897460938, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -105.06561279296875, "rewards_train/accuracies": 0.5625, "rewards_train/chosen": 0.0014318585162982345, "rewards_train/margins": 0.0004731296794489026, "rewards_train/rejected": 0.0009587288368493319, "step": 11 }, { "epoch": 0.03, "learning_rate": 8.727272727272726e-08, "loss": 0.6929, "step": 12 }, { "epoch": 0.03, "logps_train/chosen": -68.77277374267578, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.25, "logps_train/rejected": -103.26889038085938, "rewards_train/accuracies": 0.546875, "rewards_train/chosen": 0.00026169419288635254, "rewards_train/margins": 0.0010769129730761051, "rewards_train/rejected": -0.0008152187801897526, "step": 12 }, { "epoch": 0.04, "learning_rate": 9.454545454545454e-08, "loss": 0.6926, "step": 13 }, { "epoch": 0.04, "logps_train/chosen": -71.79501342773438, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -104.2459487915039, "rewards_train/accuracies": 0.734375, "rewards_train/chosen": 0.0019443272612988949, "rewards_train/margins": 0.0018328129372093827, "rewards_train/rejected": 0.00011151432408951223, "step": 13 }, { "epoch": 0.04, "learning_rate": 1.018181818181818e-07, "loss": 0.6922, "step": 14 }, { "epoch": 0.04, "logps_train/chosen": -68.43260955810547, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -106.68400573730469, "rewards_train/accuracies": 0.625, "rewards_train/chosen": 0.0006356477970257401, "rewards_train/margins": 0.0029235483380034566, "rewards_train/rejected": -0.0022879005409777164, "step": 14 }, { "epoch": 0.04, "learning_rate": 1.0909090909090908e-07, "loss": 0.6917, "step": 15 }, { "epoch": 0.04, "logps_train/chosen": -68.88594055175781, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -103.8121337890625, "rewards_train/accuracies": 0.71875, "rewards_train/chosen": 0.001884812256321311, "rewards_train/margins": 0.0038004935486242175, "rewards_train/rejected": -0.0019156812923029065, "step": 15 }, { "epoch": 0.04, "learning_rate": 1.1636363636363635e-07, "loss": 0.6913, "step": 16 }, { "epoch": 0.04, "logps_train/chosen": -69.98684692382812, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -106.99495697021484, "rewards_train/accuracies": 0.6875, "rewards_train/chosen": 0.0016573191387578845, "rewards_train/margins": 0.0032526373397558928, "rewards_train/rejected": -0.0015953182009980083, "step": 16 }, { "epoch": 0.05, "learning_rate": 1.2363636363636363e-07, "loss": 0.6915, "step": 17 }, { "epoch": 0.05, "logps_train/chosen": -68.33543395996094, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -101.96949768066406, "rewards_train/accuracies": 0.78125, "rewards_train/chosen": 0.0014662384055554867, "rewards_train/margins": 0.004372930619865656, "rewards_train/rejected": -0.0029066922143101692, "step": 17 }, { "epoch": 0.05, "learning_rate": 1.309090909090909e-07, "loss": 0.691, "step": 18 }, { "epoch": 0.05, "logps_train/chosen": -67.81690979003906, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -103.70281982421875, "rewards_train/accuracies": 0.796875, "rewards_train/chosen": 0.0020492197945713997, "rewards_train/margins": 0.004362976644188166, "rewards_train/rejected": -0.002313756849616766, "step": 18 }, { "epoch": 0.05, "learning_rate": 1.3818181818181818e-07, "loss": 0.691, "step": 19 }, { "epoch": 0.05, "logps_train/chosen": -72.88006591796875, "logps_train/ref_chosen": -72.9375, "logps_train/ref_rejected": -109.875, "logps_train/rejected": -109.89097595214844, "rewards_train/accuracies": 0.890625, "rewards_train/chosen": 0.0041325269266963005, "rewards_train/margins": 0.007976132677868009, "rewards_train/rejected": -0.003843605751171708, "step": 19 }, { "epoch": 0.06, "learning_rate": 1.4545454545454545e-07, "loss": 0.6892, "step": 20 }, { "epoch": 0.06, "logps_train/chosen": -68.0521011352539, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -104.31704711914062, "rewards_train/accuracies": 0.890625, "rewards_train/chosen": 0.005190163850784302, "rewards_train/margins": 0.007891535758972168, "rewards_train/rejected": -0.002701371908187866, "step": 20 }, { "epoch": 0.06, "learning_rate": 1.5272727272727273e-07, "loss": 0.6892, "step": 21 }, { "epoch": 0.06, "logps_train/chosen": -70.7330093383789, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -106.08378601074219, "rewards_train/accuracies": 0.9375, "rewards_train/chosen": 0.004384547472000122, "rewards_train/margins": 0.009638816118240356, "rewards_train/rejected": -0.005254268646240234, "step": 21 }, { "epoch": 0.06, "learning_rate": 1.6e-07, "loss": 0.6884, "step": 22 }, { "epoch": 0.06, "logps_train/chosen": -70.51871490478516, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -108.2440185546875, "rewards_train/accuracies": 0.953125, "rewards_train/chosen": 0.006087273359298706, "rewards_train/margins": 0.010273999068886042, "rewards_train/rejected": -0.004186725709587336, "step": 22 }, { "epoch": 0.06, "learning_rate": 1.6727272727272725e-07, "loss": 0.688, "step": 23 }, { "epoch": 0.06, "logps_train/chosen": -70.80014038085938, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -107.96000671386719, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.0069488706067204475, "rewards_train/margins": 0.01232411339879036, "rewards_train/rejected": -0.005375242792069912, "step": 23 }, { "epoch": 0.07, "learning_rate": 1.7454545454545453e-07, "loss": 0.687, "step": 24 }, { "epoch": 0.07, "logps_train/chosen": -67.912841796875, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -102.026123046875, "rewards_train/accuracies": 0.921875, "rewards_train/chosen": 0.007251209579408169, "rewards_train/margins": 0.012060618959367275, "rewards_train/rejected": -0.0048094093799591064, "step": 24 }, { "epoch": 0.07, "learning_rate": 1.818181818181818e-07, "loss": 0.6871, "step": 25 }, { "epoch": 0.07, "logps_train/chosen": -70.3351058959961, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -107.56623840332031, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.008237183094024658, "rewards_train/margins": 0.01559334434568882, "rewards_train/rejected": -0.007356161251664162, "step": 25 }, { "epoch": 0.07, "learning_rate": 1.8909090909090908e-07, "loss": 0.6854, "step": 26 }, { "epoch": 0.07, "logps_train/chosen": -70.02577209472656, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -103.64686584472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.009434301406145096, "rewards_train/margins": 0.017040712758898735, "rewards_train/rejected": -0.007606411352753639, "step": 26 }, { "epoch": 0.07, "learning_rate": 1.9636363636363635e-07, "loss": 0.6847, "step": 27 }, { "epoch": 0.07, "logps_train/chosen": -71.46843719482422, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -107.7547607421875, "rewards_train/accuracies": 0.953125, "rewards_train/chosen": 0.011408383026719093, "rewards_train/margins": 0.01959891989827156, "rewards_train/rejected": -0.008190536871552467, "step": 27 }, { "epoch": 0.08, "learning_rate": 2.036363636363636e-07, "loss": 0.6834, "step": 28 }, { "epoch": 0.08, "logps_train/chosen": -70.89143371582031, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -105.13034057617188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.012663150206208229, "rewards_train/margins": 0.020716620609164238, "rewards_train/rejected": -0.008053470402956009, "step": 28 }, { "epoch": 0.08, "learning_rate": 2.1090909090909088e-07, "loss": 0.6829, "step": 29 }, { "epoch": 0.08, "logps_train/chosen": -71.6605224609375, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -106.22760009765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.013195788487792015, "rewards_train/margins": 0.025408875197172165, "rewards_train/rejected": -0.01221308670938015, "step": 29 }, { "epoch": 0.08, "learning_rate": 2.1818181818181815e-07, "loss": 0.6805, "step": 30 }, { "epoch": 0.08, "logps_train/chosen": -70.30020904541016, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -107.31147766113281, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.01529121957719326, "rewards_train/margins": 0.028861218132078648, "rewards_train/rejected": -0.013569998554885387, "step": 30 }, { "epoch": 0.09, "learning_rate": 2.2545454545454543e-07, "loss": 0.6788, "step": 31 }, { "epoch": 0.09, "logps_train/chosen": -68.96434020996094, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -107.31681823730469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.01714061014354229, "rewards_train/margins": 0.03202534466981888, "rewards_train/rejected": -0.014884734526276588, "step": 31 }, { "epoch": 0.09, "learning_rate": 2.327272727272727e-07, "loss": 0.6773, "step": 32 }, { "epoch": 0.09, "logps_train/chosen": -70.68670654296875, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -108.9375, "logps_train/rejected": -109.07977294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.01770683564245701, "rewards_train/margins": 0.032325029373168945, "rewards_train/rejected": -0.014618193730711937, "step": 32 }, { "epoch": 0.09, "learning_rate": 2.4e-07, "loss": 0.6772, "step": 33 }, { "epoch": 0.09, "logps_train/chosen": -69.39637756347656, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -103.8118667602539, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.017101097851991653, "rewards_train/margins": 0.03324852138757706, "rewards_train/rejected": -0.016147423535585403, "step": 33 }, { "epoch": 0.09, "learning_rate": 2.4727272727272725e-07, "loss": 0.6767, "step": 34 }, { "epoch": 0.09, "logps_train/chosen": -70.11258697509766, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -106.30245971679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.02072347328066826, "rewards_train/margins": 0.037199998274445534, "rewards_train/rejected": -0.016476524993777275, "step": 34 }, { "epoch": 0.1, "learning_rate": 2.5454545454545453e-07, "loss": 0.6747, "step": 35 }, { "epoch": 0.1, "logps_train/chosen": -67.84988403320312, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -106.79217529296875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.024386346340179443, "rewards_train/margins": 0.043496690690517426, "rewards_train/rejected": -0.019110344350337982, "step": 35 }, { "epoch": 0.1, "learning_rate": 2.618181818181818e-07, "loss": 0.6717, "step": 36 }, { "epoch": 0.1, "logps_train/chosen": -68.7771987915039, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -103.85659790039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0268700011074543, "rewards_train/margins": 0.04534200578927994, "rewards_train/rejected": -0.018472004681825638, "step": 36 }, { "epoch": 0.1, "learning_rate": 2.690909090909091e-07, "loss": 0.6708, "step": 37 }, { "epoch": 0.1, "logps_train/chosen": -68.25245666503906, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -102.84796142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.028270024806261063, "rewards_train/margins": 0.050810571759939194, "rewards_train/rejected": -0.02254054695367813, "step": 37 }, { "epoch": 0.1, "learning_rate": 2.7636363636363635e-07, "loss": 0.6681, "step": 38 }, { "epoch": 0.1, "logps_train/chosen": -70.00979614257812, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -106.92184448242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.030685031786561012, "rewards_train/margins": 0.05607261694967747, "rewards_train/rejected": -0.025387585163116455, "step": 38 }, { "epoch": 0.11, "learning_rate": 2.8363636363636363e-07, "loss": 0.6655, "step": 39 }, { "epoch": 0.11, "logps_train/chosen": -69.95113372802734, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -107.75, "logps_train/rejected": -107.9970474243164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0327671654522419, "rewards_train/margins": 0.05918094515800476, "rewards_train/rejected": -0.026413779705762863, "step": 39 }, { "epoch": 0.11, "learning_rate": 2.909090909090909e-07, "loss": 0.6641, "step": 40 }, { "epoch": 0.11, "logps_train/chosen": -68.8032455444336, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -106.25089263916016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.03256603702902794, "rewards_train/margins": 0.06576059386134148, "rewards_train/rejected": -0.03319455683231354, "step": 40 }, { "epoch": 0.11, "learning_rate": 2.981818181818182e-07, "loss": 0.6609, "step": 41 }, { "epoch": 0.11, "logps_train/chosen": -71.21537017822266, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -105.4375, "logps_train/rejected": -105.82281494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.035835836082696915, "rewards_train/margins": 0.07114468887448311, "rewards_train/rejected": -0.035308852791786194, "step": 41 }, { "epoch": 0.12, "learning_rate": 3.0545454545454546e-07, "loss": 0.6583, "step": 42 }, { "epoch": 0.12, "logps_train/chosen": -67.79315948486328, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -102.82209777832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.035527877509593964, "rewards_train/margins": 0.0719371847808361, "rewards_train/rejected": -0.03640930727124214, "step": 42 }, { "epoch": 0.12, "learning_rate": 3.1272727272727273e-07, "loss": 0.6579, "step": 43 }, { "epoch": 0.12, "logps_train/chosen": -67.5360336303711, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -105.35246276855469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04356423020362854, "rewards_train/margins": 0.08769756183028221, "rewards_train/rejected": -0.04413333162665367, "step": 43 }, { "epoch": 0.12, "learning_rate": 3.2e-07, "loss": 0.6503, "step": 44 }, { "epoch": 0.12, "logps_train/chosen": -68.9448013305664, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -105.67100524902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04214154928922653, "rewards_train/margins": 0.08463196083903313, "rewards_train/rejected": -0.042490411549806595, "step": 44 }, { "epoch": 0.12, "learning_rate": 3.272727272727273e-07, "loss": 0.6519, "step": 45 }, { "epoch": 0.12, "logps_train/chosen": -69.7159423828125, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -105.74375915527344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04818149656057358, "rewards_train/margins": 0.09638512134552002, "rewards_train/rejected": -0.04820362478494644, "step": 45 }, { "epoch": 0.13, "learning_rate": 3.345454545454545e-07, "loss": 0.6463, "step": 46 }, { "epoch": 0.13, "logps_train/chosen": -67.38115692138672, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -104.35668182373047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05353473499417305, "rewards_train/margins": 0.10433956608176231, "rewards_train/rejected": -0.050804831087589264, "step": 46 }, { "epoch": 0.13, "learning_rate": 3.418181818181818e-07, "loss": 0.6425, "step": 47 }, { "epoch": 0.13, "logps_train/chosen": -68.85475158691406, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -105.32669830322266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05480801314115524, "rewards_train/margins": 0.10935314372181892, "rewards_train/rejected": -0.05454513058066368, "step": 47 }, { "epoch": 0.13, "learning_rate": 3.4909090909090905e-07, "loss": 0.6401, "step": 48 }, { "epoch": 0.13, "logps_train/chosen": -66.13357543945312, "logps_train/ref_chosen": -66.6875, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -102.56704711914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.054318591952323914, "rewards_train/margins": 0.11014498025178909, "rewards_train/rejected": -0.05582638829946518, "step": 48 }, { "epoch": 0.13, "learning_rate": 3.5636363636363633e-07, "loss": 0.6398, "step": 49 }, { "epoch": 0.13, "logps_train/chosen": -69.08721923828125, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -105.47769165039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0613958016037941, "rewards_train/margins": 0.12059043347835541, "rewards_train/rejected": -0.05919463187456131, "step": 49 }, { "epoch": 0.14, "learning_rate": 3.636363636363636e-07, "loss": 0.6349, "step": 50 }, { "epoch": 0.14, "logps_train/chosen": -68.92622375488281, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -105.63352966308594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.06460527330636978, "rewards_train/margins": 0.12922805547714233, "rewards_train/rejected": -0.06462278217077255, "step": 50 }, { "epoch": 0.14, "learning_rate": 3.709090909090909e-07, "loss": 0.6308, "step": 51 }, { "epoch": 0.14, "logps_train/chosen": -67.61592102050781, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -104.59382629394531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06365200877189636, "rewards_train/margins": 0.13270244002342224, "rewards_train/rejected": -0.06905043125152588, "step": 51 }, { "epoch": 0.14, "learning_rate": 3.7818181818181816e-07, "loss": 0.6292, "step": 52 }, { "epoch": 0.14, "logps_train/chosen": -69.05972290039062, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -107.9375, "logps_train/rejected": -108.69145202636719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07083481550216675, "rewards_train/margins": 0.1438867747783661, "rewards_train/rejected": -0.07305195927619934, "step": 52 }, { "epoch": 0.15, "learning_rate": 3.8545454545454543e-07, "loss": 0.6241, "step": 53 }, { "epoch": 0.15, "logps_train/chosen": -67.79457092285156, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -104.9063949584961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07596300542354584, "rewards_train/margins": 0.15830150991678238, "rewards_train/rejected": -0.08233850449323654, "step": 53 }, { "epoch": 0.15, "learning_rate": 3.927272727272727e-07, "loss": 0.6174, "step": 54 }, { "epoch": 0.15, "logps_train/chosen": -67.89226531982422, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -103.80994415283203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0714179277420044, "rewards_train/margins": 0.15153326094150543, "rewards_train/rejected": -0.08011533319950104, "step": 54 }, { "epoch": 0.15, "learning_rate": 4e-07, "loss": 0.6207, "step": 55 }, { "epoch": 0.15, "logps_train/chosen": -68.1738052368164, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -105.87425231933594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07329365611076355, "rewards_train/margins": 0.1675550490617752, "rewards_train/rejected": -0.09426139295101166, "step": 55 }, { "epoch": 0.15, "learning_rate": 3.999990768796313e-07, "loss": 0.6134, "step": 56 }, { "epoch": 0.15, "logps_train/chosen": -67.1877670288086, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -103.38972473144531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08752249926328659, "rewards_train/margins": 0.17844735831022263, "rewards_train/rejected": -0.09092485904693604, "step": 56 }, { "epoch": 0.16, "learning_rate": 3.999963075270469e-07, "loss": 0.6082, "step": 57 }, { "epoch": 0.16, "logps_train/chosen": -67.5474853515625, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -103.81512451171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08543713390827179, "rewards_train/margins": 0.18277043849229813, "rewards_train/rejected": -0.09733330458402634, "step": 57 }, { "epoch": 0.16, "learning_rate": 3.999916919678111e-07, "loss": 0.6063, "step": 58 }, { "epoch": 0.16, "logps_train/chosen": -66.7203369140625, "logps_train/ref_chosen": -67.5625, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -103.37339782714844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08704867959022522, "rewards_train/margins": 0.1830795854330063, "rewards_train/rejected": -0.09603090584278107, "step": 58 }, { "epoch": 0.16, "learning_rate": 3.999852302445311e-07, "loss": 0.6062, "step": 59 }, { "epoch": 0.16, "logps_train/chosen": -67.58486938476562, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -105.48396301269531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09410108625888824, "rewards_train/margins": 0.19933365285396576, "rewards_train/rejected": -0.10523256659507751, "step": 59 }, { "epoch": 0.17, "learning_rate": 3.9997692241685643e-07, "loss": 0.599, "step": 60 }, { "epoch": 0.17, "logps_train/chosen": -70.75531768798828, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -108.35310363769531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10386262089014053, "rewards_train/margins": 0.21334318071603775, "rewards_train/rejected": -0.10948055982589722, "step": 60 }, { "epoch": 0.17, "learning_rate": 3.9996676856147826e-07, "loss": 0.5929, "step": 61 }, { "epoch": 0.17, "logps_train/chosen": -66.65058898925781, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -104.74565124511719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09543925523757935, "rewards_train/margins": 0.20935913920402527, "rewards_train/rejected": -0.11391988396644592, "step": 61 }, { "epoch": 0.17, "learning_rate": 3.99954768772129e-07, "loss": 0.5947, "step": 62 }, { "epoch": 0.17, "logps_train/chosen": -68.85789489746094, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -104.5528564453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10547089576721191, "rewards_train/margins": 0.21270965784788132, "rewards_train/rejected": -0.1072387620806694, "step": 62 }, { "epoch": 0.17, "learning_rate": 3.9994092315958115e-07, "loss": 0.593, "step": 63 }, { "epoch": 0.17, "logps_train/chosen": -69.61751556396484, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -103.94869995117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11500631272792816, "rewards_train/margins": 0.22882124781608582, "rewards_train/rejected": -0.11381493508815765, "step": 63 }, { "epoch": 0.18, "learning_rate": 3.999252318516462e-07, "loss": 0.586, "step": 64 }, { "epoch": 0.18, "logps_train/chosen": -66.31761932373047, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -103.05818176269531, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.10378479212522507, "rewards_train/margins": 0.22879234701395035, "rewards_train/rejected": -0.12500755488872528, "step": 64 }, { "epoch": 0.18, "learning_rate": 3.9990769499317407e-07, "loss": 0.5865, "step": 65 }, { "epoch": 0.18, "logps_train/chosen": -68.59765625, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -108.4375, "logps_train/rejected": -109.80982208251953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12250924110412598, "rewards_train/margins": 0.26057130098342896, "rewards_train/rejected": -0.13806205987930298, "step": 65 }, { "epoch": 0.18, "learning_rate": 3.998883127460509e-07, "loss": 0.5723, "step": 66 }, { "epoch": 0.18, "logps_train/chosen": -67.88196563720703, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -108.20626068115234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12088553607463837, "rewards_train/margins": 0.26074983179569244, "rewards_train/rejected": -0.13986429572105408, "step": 66 }, { "epoch": 0.18, "learning_rate": 3.9986708528919823e-07, "loss": 0.5724, "step": 67 }, { "epoch": 0.18, "logps_train/chosen": -65.95365905761719, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -101.6875, "logps_train/rejected": -103.16552734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11840332299470901, "rewards_train/margins": 0.26425347477197647, "rewards_train/rejected": -0.14585015177726746, "step": 67 }, { "epoch": 0.19, "learning_rate": 3.9984401281857095e-07, "loss": 0.5707, "step": 68 }, { "epoch": 0.19, "logps_train/chosen": -66.28451538085938, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -102.875, "logps_train/rejected": -104.37060546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11192946135997772, "rewards_train/margins": 0.25797444581985474, "rewards_train/rejected": -0.14604498445987701, "step": 68 }, { "epoch": 0.19, "learning_rate": 3.9981909554715585e-07, "loss": 0.5739, "step": 69 }, { "epoch": 0.19, "logps_train/chosen": -67.37400817871094, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -106.39096069335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1233411654829979, "rewards_train/margins": 0.29051391035318375, "rewards_train/rejected": -0.16717274487018585, "step": 69 }, { "epoch": 0.19, "learning_rate": 3.997923337049693e-07, "loss": 0.5595, "step": 70 }, { "epoch": 0.19, "logps_train/chosen": -69.73243713378906, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -108.26152801513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13593673706054688, "rewards_train/margins": 0.29880836606025696, "rewards_train/rejected": -0.16287162899971008, "step": 70 }, { "epoch": 0.2, "learning_rate": 3.9976372753905526e-07, "loss": 0.5559, "step": 71 }, { "epoch": 0.2, "logps_train/chosen": -69.82305145263672, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -107.0625, "logps_train/rejected": -108.84854125976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13859310746192932, "rewards_train/margins": 0.3175870180130005, "rewards_train/rejected": -0.17899391055107117, "step": 71 }, { "epoch": 0.2, "learning_rate": 3.997332773134831e-07, "loss": 0.5483, "step": 72 }, { "epoch": 0.2, "logps_train/chosen": -67.85124206542969, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -105.12037658691406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13552939891815186, "rewards_train/margins": 0.29776284098625183, "rewards_train/rejected": -0.16223344206809998, "step": 72 }, { "epoch": 0.2, "learning_rate": 3.997009833093451e-07, "loss": 0.5564, "step": 73 }, { "epoch": 0.2, "logps_train/chosen": -68.6871109008789, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -106.25495910644531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1547262966632843, "rewards_train/margins": 0.32563284039497375, "rewards_train/rejected": -0.17090654373168945, "step": 73 }, { "epoch": 0.2, "learning_rate": 3.9966684582475383e-07, "loss": 0.5446, "step": 74 }, { "epoch": 0.2, "logps_train/chosen": -69.89390563964844, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -109.25, "logps_train/rejected": -111.31571197509766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1491348296403885, "rewards_train/margins": 0.35746416449546814, "rewards_train/rejected": -0.20832933485507965, "step": 74 }, { "epoch": 0.21, "learning_rate": 3.996308651748392e-07, "loss": 0.5315, "step": 75 }, { "epoch": 0.21, "logps_train/chosen": -67.97065734863281, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -104.77208709716797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14292438328266144, "rewards_train/margins": 0.32403917610645294, "rewards_train/rejected": -0.1811147928237915, "step": 75 }, { "epoch": 0.21, "learning_rate": 3.995930416917461e-07, "loss": 0.5454, "step": 76 }, { "epoch": 0.21, "logps_train/chosen": -69.47181701660156, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -109.39265441894531, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14969384670257568, "rewards_train/margins": 0.3381780982017517, "rewards_train/rejected": -0.18848425149917603, "step": 76 }, { "epoch": 0.21, "learning_rate": 3.995533757246306e-07, "loss": 0.5404, "step": 77 }, { "epoch": 0.21, "logps_train/chosen": -66.78229522705078, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -106.65013122558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14970046281814575, "rewards_train/margins": 0.34254544973373413, "rewards_train/rejected": -0.19284498691558838, "step": 77 }, { "epoch": 0.21, "learning_rate": 3.9951186763965753e-07, "loss": 0.5375, "step": 78 }, { "epoch": 0.21, "logps_train/chosen": -66.29652404785156, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -103.95500183105469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.147642582654953, "rewards_train/margins": 0.35407963395118713, "rewards_train/rejected": -0.20643705129623413, "step": 78 }, { "epoch": 0.22, "learning_rate": 3.9946851781999633e-07, "loss": 0.534, "step": 79 }, { "epoch": 0.22, "logps_train/chosen": -67.20491027832031, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -106.25450134277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16681337356567383, "rewards_train/margins": 0.38884595036506653, "rewards_train/rejected": -0.2220325767993927, "step": 79 }, { "epoch": 0.22, "learning_rate": 3.9946851781999633e-07, "loss": 0.5188, "step": 80 }, { "epoch": 0.22, "logps_train/chosen": -67.45674896240234, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -105.1875, "logps_train/rejected": -107.3750228881836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1447058916091919, "rewards_train/margins": 0.36267703771591187, "rewards_train/rejected": -0.21797114610671997, "step": 80 }, { "epoch": 0.22, "learning_rate": 3.9942332666581814e-07, "loss": 0.5303, "step": 81 }, { "epoch": 0.22, "logps_train/chosen": -66.7303466796875, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -106.87014770507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15626177191734314, "rewards_train/margins": 0.36837390065193176, "rewards_train/rejected": -0.21211212873458862, "step": 81 }, { "epoch": 0.23, "learning_rate": 3.9937629459429163e-07, "loss": 0.5279, "step": 82 }, { "epoch": 0.23, "logps_train/chosen": -68.03829956054688, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -107.97977447509766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1608675867319107, "rewards_train/margins": 0.3873605877161026, "rewards_train/rejected": -0.2264930009841919, "step": 82 }, { "epoch": 0.23, "learning_rate": 3.9932742203957945e-07, "loss": 0.5202, "step": 83 }, { "epoch": 0.23, "logps_train/chosen": -67.91797637939453, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -105.77668762207031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16645440459251404, "rewards_train/margins": 0.40838050842285156, "rewards_train/rejected": -0.24192610383033752, "step": 83 }, { "epoch": 0.23, "learning_rate": 3.99276709452834e-07, "loss": 0.511, "step": 84 }, { "epoch": 0.23, "logps_train/chosen": -67.27054595947266, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -109.02593994140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1810021698474884, "rewards_train/margins": 0.418069064617157, "rewards_train/rejected": -0.23706689476966858, "step": 84 }, { "epoch": 0.23, "learning_rate": 3.992241573021937e-07, "loss": 0.5075, "step": 85 }, { "epoch": 0.23, "logps_train/chosen": -69.11302947998047, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -106.82901000976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16794495284557343, "rewards_train/margins": 0.4099283367395401, "rewards_train/rejected": -0.24198338389396667, "step": 85 }, { "epoch": 0.24, "learning_rate": 3.991697660727781e-07, "loss": 0.5109, "step": 86 }, { "epoch": 0.24, "logps_train/chosen": -67.7979736328125, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -104.70287322998047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17606210708618164, "rewards_train/margins": 0.44254088401794434, "rewards_train/rejected": -0.2664787769317627, "step": 86 }, { "epoch": 0.24, "learning_rate": 3.991135362666836e-07, "loss": 0.4983, "step": 87 }, { "epoch": 0.24, "logps_train/chosen": -70.18103790283203, "logps_train/ref_chosen": -72.0625, "logps_train/ref_rejected": -108.75, "logps_train/rejected": -111.82156372070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19131991267204285, "rewards_train/margins": 0.4966202974319458, "rewards_train/rejected": -0.30530038475990295, "step": 87 }, { "epoch": 0.24, "learning_rate": 3.990554684029791e-07, "loss": 0.4767, "step": 88 }, { "epoch": 0.24, "logps_train/chosen": -67.24535369873047, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -105.05155944824219, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.1760995090007782, "rewards_train/margins": 0.4305230975151062, "rewards_train/rejected": -0.254423588514328, "step": 88 }, { "epoch": 0.25, "learning_rate": 3.989955630177008e-07, "loss": 0.504, "step": 89 }, { "epoch": 0.25, "logps_train/chosen": -68.52679443359375, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -109.90853881835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19458597898483276, "rewards_train/margins": 0.48202213644981384, "rewards_train/rejected": -0.2874361574649811, "step": 89 }, { "epoch": 0.25, "learning_rate": 3.989338206638476e-07, "loss": 0.4842, "step": 90 }, { "epoch": 0.25, "logps_train/chosen": -66.56008911132812, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -105.50776672363281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1899876594543457, "rewards_train/margins": 0.4462333619594574, "rewards_train/rejected": -0.2562457025051117, "step": 90 }, { "epoch": 0.25, "learning_rate": 3.988702419113757e-07, "loss": 0.4975, "step": 91 }, { "epoch": 0.25, "logps_train/chosen": -67.7061767578125, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -104.62736511230469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17073996365070343, "rewards_train/margins": 0.4512007385492325, "rewards_train/rejected": -0.28046077489852905, "step": 91 }, { "epoch": 0.25, "learning_rate": 3.988048273471935e-07, "loss": 0.4962, "step": 92 }, { "epoch": 0.25, "logps_train/chosen": -68.85783386230469, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -108.00345611572266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19370874762535095, "rewards_train/margins": 0.48428890109062195, "rewards_train/rejected": -0.290580153465271, "step": 92 }, { "epoch": 0.26, "learning_rate": 3.9873757757515616e-07, "loss": 0.4831, "step": 93 }, { "epoch": 0.26, "logps_train/chosen": -67.57554626464844, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -107.50221252441406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.178480327129364, "rewards_train/margins": 0.4949617087841034, "rewards_train/rejected": -0.3164813816547394, "step": 93 }, { "epoch": 0.26, "learning_rate": 3.986684932160601e-07, "loss": 0.48, "step": 94 }, { "epoch": 0.26, "logps_train/chosen": -66.971923828125, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -106.1875, "logps_train/rejected": -109.36359405517578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19567888975143433, "rewards_train/margins": 0.5138739347457886, "rewards_train/rejected": -0.31819504499435425, "step": 94 }, { "epoch": 0.26, "learning_rate": 3.98597574907637e-07, "loss": 0.4716, "step": 95 }, { "epoch": 0.26, "logps_train/chosen": -66.74336242675781, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -107.10295104980469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18728457391262054, "rewards_train/margins": 0.5076379328966141, "rewards_train/rejected": -0.32035335898399353, "step": 95 }, { "epoch": 0.26, "learning_rate": 3.9852482330454834e-07, "loss": 0.4735, "step": 96 }, { "epoch": 0.26, "logps_train/chosen": -66.67648315429688, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -106.68460845947266, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20583781599998474, "rewards_train/margins": 0.5046703219413757, "rewards_train/rejected": -0.298832505941391, "step": 96 }, { "epoch": 0.27, "learning_rate": 3.9845023907837886e-07, "loss": 0.4755, "step": 97 }, { "epoch": 0.27, "logps_train/chosen": -67.07733154296875, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -106.95794677734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2104305624961853, "rewards_train/margins": 0.5393310785293579, "rewards_train/rejected": -0.3289005160331726, "step": 97 }, { "epoch": 0.27, "learning_rate": 3.983738229176308e-07, "loss": 0.4637, "step": 98 }, { "epoch": 0.27, "logps_train/chosen": -65.3255615234375, "logps_train/ref_chosen": -67.3125, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -105.21673583984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1977173089981079, "rewards_train/margins": 0.5143124163150787, "rewards_train/rejected": -0.3165951073169708, "step": 98 }, { "epoch": 0.27, "learning_rate": 3.9829557552771736e-07, "loss": 0.4718, "step": 99 }, { "epoch": 0.27, "logps_train/chosen": -65.5377197265625, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -105.665283203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18553388118743896, "rewards_train/margins": 0.49483489990234375, "rewards_train/rejected": -0.3093010187149048, "step": 99 }, { "epoch": 0.28, "learning_rate": 3.98215497630956e-07, "loss": 0.4801, "step": 100 }, { "epoch": 0.28, "logps_train/chosen": -66.48321533203125, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -105.61015319824219, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2122248262166977, "rewards_train/margins": 0.5693336576223373, "rewards_train/rejected": -0.35710883140563965, "step": 100 }, { "epoch": 0.28, "learning_rate": 3.9813358996656224e-07, "loss": 0.4508, "step": 101 }, { "epoch": 0.28, "logps_train/chosen": -67.13420867919922, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -109.5264892578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.206451877951622, "rewards_train/margins": 0.5875180214643478, "rewards_train/rejected": -0.38106614351272583, "step": 101 }, { "epoch": 0.28, "learning_rate": 3.9804985329064225e-07, "loss": 0.4451, "step": 102 }, { "epoch": 0.28, "logps_train/chosen": -66.3199691772461, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -109.13957977294922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20647987723350525, "rewards_train/margins": 0.5612580180168152, "rewards_train/rejected": -0.35477814078330994, "step": 102 }, { "epoch": 0.28, "learning_rate": 3.9796428837618654e-07, "loss": 0.457, "step": 103 }, { "epoch": 0.28, "logps_train/chosen": -68.23517608642578, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -107.3125, "logps_train/rejected": -111.25909423828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22067147493362427, "rewards_train/margins": 0.6154289245605469, "rewards_train/rejected": -0.3947574496269226, "step": 103 }, { "epoch": 0.29, "learning_rate": 3.978768960130621e-07, "loss": 0.436, "step": 104 }, { "epoch": 0.29, "logps_train/chosen": -67.19078063964844, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -108.8671646118164, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22091242671012878, "rewards_train/margins": 0.598937451839447, "rewards_train/rejected": -0.37802502512931824, "step": 104 }, { "epoch": 0.29, "learning_rate": 3.9778767700800574e-07, "loss": 0.4426, "step": 105 }, { "epoch": 0.29, "logps_train/chosen": -68.67965698242188, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -109.8125, "logps_train/rejected": -113.64967346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23017846047878265, "rewards_train/margins": 0.6173134595155716, "rewards_train/rejected": -0.38713499903678894, "step": 105 }, { "epoch": 0.29, "learning_rate": 3.976966321846162e-07, "loss": 0.436, "step": 106 }, { "epoch": 0.29, "logps_train/chosen": -66.22254943847656, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -106.2476577758789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2150503545999527, "rewards_train/margins": 0.589816227555275, "rewards_train/rejected": -0.37476587295532227, "step": 106 }, { "epoch": 0.29, "learning_rate": 3.976037623833468e-07, "loss": 0.4467, "step": 107 }, { "epoch": 0.29, "logps_train/chosen": -66.51698303222656, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -108.35374450683594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.20914120972156525, "rewards_train/margins": 0.6184413582086563, "rewards_train/rejected": -0.40930014848709106, "step": 107 }, { "epoch": 0.3, "learning_rate": 3.975090684614976e-07, "loss": 0.4368, "step": 108 }, { "epoch": 0.3, "logps_train/chosen": -67.39501190185547, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -108.845703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22470778226852417, "rewards_train/margins": 0.638770192861557, "rewards_train/rejected": -0.41406241059303284, "step": 108 }, { "epoch": 0.3, "learning_rate": 3.974125512932075e-07, "loss": 0.4293, "step": 109 }, { "epoch": 0.3, "logps_train/chosen": -65.62530517578125, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -107.24943542480469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2328311800956726, "rewards_train/margins": 0.6238875985145569, "rewards_train/rejected": -0.3910564184188843, "step": 109 }, { "epoch": 0.3, "learning_rate": 3.9731421176944614e-07, "loss": 0.4342, "step": 110 }, { "epoch": 0.3, "logps_train/chosen": -64.89907836914062, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -108.16387176513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21980899572372437, "rewards_train/margins": 0.6379542946815491, "rewards_train/rejected": -0.4181452989578247, "step": 110 }, { "epoch": 0.31, "learning_rate": 3.972140507980057e-07, "loss": 0.4309, "step": 111 }, { "epoch": 0.31, "logps_train/chosen": -67.36904907226562, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -107.73622131347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2307712435722351, "rewards_train/margins": 0.6316397190093994, "rewards_train/rejected": -0.4008684754371643, "step": 111 }, { "epoch": 0.31, "learning_rate": 3.9711206930349246e-07, "loss": 0.4335, "step": 112 }, { "epoch": 0.31, "logps_train/chosen": -66.8480453491211, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -109.35238647460938, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2411227822303772, "rewards_train/margins": 0.6776802837848663, "rewards_train/rejected": -0.43655750155448914, "step": 112 }, { "epoch": 0.31, "learning_rate": 3.970082682273184e-07, "loss": 0.4184, "step": 113 }, { "epoch": 0.31, "logps_train/chosen": -66.50444030761719, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -107.12423706054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21874544024467468, "rewards_train/margins": 0.6500650942325592, "rewards_train/rejected": -0.4313196539878845, "step": 113 }, { "epoch": 0.31, "learning_rate": 3.9690264852769235e-07, "loss": 0.4256, "step": 114 }, { "epoch": 0.31, "logps_train/chosen": -64.12308502197266, "logps_train/ref_chosen": -66.5625, "logps_train/ref_rejected": -98.9375, "logps_train/rejected": -103.03860473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24569928646087646, "rewards_train/margins": 0.6537590026855469, "rewards_train/rejected": -0.4080597162246704, "step": 114 }, { "epoch": 0.32, "learning_rate": 3.967952111796114e-07, "loss": 0.4233, "step": 115 }, { "epoch": 0.32, "logps_train/chosen": -67.82093811035156, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -108.35289764404297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24173381924629211, "rewards_train/margins": 0.6923558115959167, "rewards_train/rejected": -0.45062199234962463, "step": 115 }, { "epoch": 0.32, "learning_rate": 3.9668595717485146e-07, "loss": 0.4097, "step": 116 }, { "epoch": 0.32, "logps_train/chosen": -64.75138092041016, "logps_train/ref_chosen": -67.1875, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -106.4718017578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24151208996772766, "rewards_train/margins": 0.6912312209606171, "rewards_train/rejected": -0.4497191309928894, "step": 116 }, { "epoch": 0.32, "learning_rate": 3.965748875219585e-07, "loss": 0.4136, "step": 117 }, { "epoch": 0.32, "logps_train/chosen": -66.48712921142578, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -110.81269836425781, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23995935916900635, "rewards_train/margins": 0.7128308415412903, "rewards_train/rejected": -0.47287148237228394, "step": 117 }, { "epoch": 0.33, "learning_rate": 3.9646200324623926e-07, "loss": 0.408, "step": 118 }, { "epoch": 0.33, "logps_train/chosen": -67.99176788330078, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -108.875, "logps_train/rejected": -114.07585906982422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24672174453735352, "rewards_train/margins": 0.7672961950302124, "rewards_train/rejected": -0.5205744504928589, "step": 118 }, { "epoch": 0.33, "learning_rate": 3.9634730538975135e-07, "loss": 0.3884, "step": 119 }, { "epoch": 0.33, "logps_train/chosen": -66.47265625, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -106.81314086914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22475579380989075, "rewards_train/margins": 0.7325346171855927, "rewards_train/rejected": -0.5077788233757019, "step": 119 }, { "epoch": 0.33, "learning_rate": 3.9623079501129404e-07, "loss": 0.4005, "step": 120 }, { "epoch": 0.33, "logps_train/chosen": -65.56671142578125, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -110.96617126464844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24952983856201172, "rewards_train/margins": 0.7754433751106262, "rewards_train/rejected": -0.5259135365486145, "step": 120 }, { "epoch": 0.33, "learning_rate": 3.9611247318639843e-07, "loss": 0.3849, "step": 121 }, { "epoch": 0.33, "logps_train/chosen": -66.82403564453125, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -112.54737854003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25578004121780396, "rewards_train/margins": 0.7897177934646606, "rewards_train/rejected": -0.5339377522468567, "step": 121 }, { "epoch": 0.34, "learning_rate": 3.959923410073173e-07, "loss": 0.3836, "step": 122 }, { "epoch": 0.34, "logps_train/chosen": -68.23504638671875, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -109.59066772460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25764721632003784, "rewards_train/margins": 0.7532377243041992, "rewards_train/rejected": -0.4955905079841614, "step": 122 }, { "epoch": 0.34, "learning_rate": 3.958703995830154e-07, "loss": 0.3947, "step": 123 }, { "epoch": 0.34, "logps_train/chosen": -67.43562316894531, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -113.74366760253906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2706463634967804, "rewards_train/margins": 0.8420837223529816, "rewards_train/rejected": -0.5714373588562012, "step": 123 }, { "epoch": 0.34, "learning_rate": 3.9574665003915876e-07, "loss": 0.3661, "step": 124 }, { "epoch": 0.34, "logps_train/chosen": -67.67195129394531, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -107.06608581542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2587814927101135, "rewards_train/margins": 0.7754490375518799, "rewards_train/rejected": -0.5166675448417664, "step": 124 }, { "epoch": 0.34, "learning_rate": 3.956210935181047e-07, "loss": 0.387, "step": 125 }, { "epoch": 0.34, "logps_train/chosen": -67.3424301147461, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -108.5625, "logps_train/rejected": -114.26753997802734, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26692914962768555, "rewards_train/margins": 0.8359688520431519, "rewards_train/rejected": -0.5690397024154663, "step": 125 }, { "epoch": 0.35, "learning_rate": 3.9549373117889086e-07, "loss": 0.3692, "step": 126 }, { "epoch": 0.35, "logps_train/chosen": -67.47750854492188, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -111.07245635986328, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.213332861661911, "rewards_train/margins": 0.7676487863063812, "rewards_train/rejected": -0.5543159246444702, "step": 126 }, { "epoch": 0.35, "learning_rate": 3.953645641972251e-07, "loss": 0.3928, "step": 127 }, { "epoch": 0.35, "logps_train/chosen": -67.55777740478516, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -101.6875, "logps_train/rejected": -106.74423217773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2760089635848999, "rewards_train/margins": 0.7842700481414795, "rewards_train/rejected": -0.5082610845565796, "step": 127 }, { "epoch": 0.35, "learning_rate": 3.9523359376547414e-07, "loss": 0.3828, "step": 128 }, { "epoch": 0.35, "logps_train/chosen": -68.59558868408203, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -112.00325775146484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30753064155578613, "rewards_train/margins": 0.8595166206359863, "rewards_train/rejected": -0.5519859790802002, "step": 128 }, { "epoch": 0.36, "learning_rate": 3.9510082109265264e-07, "loss": 0.3624, "step": 129 }, { "epoch": 0.36, "logps_train/chosen": -68.55984497070312, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -108.5625, "logps_train/rejected": -114.5945816040039, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2700900435447693, "rewards_train/margins": 0.8699783682823181, "rewards_train/rejected": -0.5998883247375488, "step": 129 }, { "epoch": 0.36, "learning_rate": 3.949662474044122e-07, "loss": 0.3597, "step": 130 }, { "epoch": 0.36, "logps_train/chosen": -68.42821502685547, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -112.58767700195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28584063053131104, "rewards_train/margins": 0.8949018716812134, "rewards_train/rejected": -0.6090612411499023, "step": 130 }, { "epoch": 0.36, "learning_rate": 3.9482987394303e-07, "loss": 0.3505, "step": 131 }, { "epoch": 0.36, "logps_train/chosen": -68.0450210571289, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -113.04205322265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31268560886383057, "rewards_train/margins": 0.9487262964248657, "rewards_train/rejected": -0.6360406875610352, "step": 131 }, { "epoch": 0.36, "learning_rate": 3.9469170196739715e-07, "loss": 0.3332, "step": 132 }, { "epoch": 0.36, "logps_train/chosen": -65.32820892333984, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -107.70977020263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2967684268951416, "rewards_train/margins": 0.8852254152297974, "rewards_train/rejected": -0.5884569883346558, "step": 132 }, { "epoch": 0.37, "learning_rate": 3.945517327530074e-07, "loss": 0.3515, "step": 133 }, { "epoch": 0.37, "logps_train/chosen": -65.1033935546875, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -105.3125, "logps_train/rejected": -111.46701049804688, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2595822811126709, "rewards_train/margins": 0.8774752616882324, "rewards_train/rejected": -0.6178929805755615, "step": 133 }, { "epoch": 0.37, "learning_rate": 3.9440996759194504e-07, "loss": 0.3562, "step": 134 }, { "epoch": 0.37, "logps_train/chosen": -66.87942504882812, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -110.99348449707031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2851046919822693, "rewards_train/margins": 0.9608197808265686, "rewards_train/rejected": -0.6757150888442993, "step": 134 }, { "epoch": 0.37, "learning_rate": 3.942664077928731e-07, "loss": 0.3309, "step": 135 }, { "epoch": 0.37, "logps_train/chosen": -64.57243347167969, "logps_train/ref_chosen": -67.3125, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -110.83653259277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27263951301574707, "rewards_train/margins": 0.9062925577163696, "rewards_train/rejected": -0.6336530447006226, "step": 135 }, { "epoch": 0.37, "learning_rate": 3.9412105468102137e-07, "loss": 0.3497, "step": 136 }, { "epoch": 0.37, "logps_train/chosen": -68.01128387451172, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -110.80046844482422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2861275374889374, "rewards_train/margins": 0.9387328326702118, "rewards_train/rejected": -0.6526052951812744, "step": 136 }, { "epoch": 0.38, "learning_rate": 3.9397390959817405e-07, "loss": 0.3402, "step": 137 }, { "epoch": 0.38, "logps_train/chosen": -65.0491943359375, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -110.0616683959961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27716052532196045, "rewards_train/margins": 0.9310812950134277, "rewards_train/rejected": -0.6539207696914673, "step": 137 }, { "epoch": 0.38, "learning_rate": 3.938249739026573e-07, "loss": 0.3437, "step": 138 }, { "epoch": 0.38, "logps_train/chosen": -66.88468933105469, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -111.49201202392578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3078686594963074, "rewards_train/margins": 0.9529684782028198, "rewards_train/rejected": -0.6450998187065125, "step": 138 }, { "epoch": 0.38, "learning_rate": 3.9367424896932694e-07, "loss": 0.3372, "step": 139 }, { "epoch": 0.38, "logps_train/chosen": -67.82188415527344, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -110.2417984008789, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2891005873680115, "rewards_train/margins": 0.9565420150756836, "rewards_train/rejected": -0.6674414277076721, "step": 139 }, { "epoch": 0.39, "learning_rate": 3.9352173618955553e-07, "loss": 0.3365, "step": 140 }, { "epoch": 0.39, "logps_train/chosen": -66.6313247680664, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -111.77627563476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27514904737472534, "rewards_train/margins": 1.01537424325943, "rewards_train/rejected": -0.7402251958847046, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.9336743697121957e-07, "loss": 0.3182, "step": 141 }, { "epoch": 0.39, "logps_train/chosen": -66.3953628540039, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -110.52206420898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30284640192985535, "rewards_train/margins": 1.0114006698131561, "rewards_train/rejected": -0.7085542678833008, "step": 141 }, { "epoch": 0.39, "learning_rate": 3.932113527386865e-07, "loss": 0.3228, "step": 142 }, { "epoch": 0.39, "logps_train/chosen": -67.705322265625, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -103.25, "logps_train/rejected": -110.55565643310547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30993664264678955, "rewards_train/margins": 1.037279486656189, "rewards_train/rejected": -0.7273428440093994, "step": 142 }, { "epoch": 0.39, "learning_rate": 3.9305348493280184e-07, "loss": 0.3132, "step": 143 }, { "epoch": 0.39, "logps_train/chosen": -67.63337707519531, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -114.1865234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3025798201560974, "rewards_train/margins": 1.0563890933990479, "rewards_train/rejected": -0.7538092732429504, "step": 143 }, { "epoch": 0.4, "learning_rate": 3.928938350108753e-07, "loss": 0.3104, "step": 144 }, { "epoch": 0.4, "logps_train/chosen": -65.19500732421875, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -111.509521484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2878721356391907, "rewards_train/margins": 1.0099671483039856, "rewards_train/rejected": -0.7220950126647949, "step": 144 }, { "epoch": 0.4, "learning_rate": 3.9273240444666793e-07, "loss": 0.3245, "step": 145 }, { "epoch": 0.4, "logps_train/chosen": -67.14523315429688, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -110.59342193603516, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2925075888633728, "rewards_train/margins": 0.9968202710151672, "rewards_train/rejected": -0.7043126821517944, "step": 145 }, { "epoch": 0.4, "learning_rate": 3.9256919473037805e-07, "loss": 0.3276, "step": 146 }, { "epoch": 0.4, "logps_train/chosen": -67.87757873535156, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -113.87308502197266, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.29564064741134644, "rewards_train/margins": 1.0789942741394043, "rewards_train/rejected": -0.7833536267280579, "step": 146 }, { "epoch": 0.4, "learning_rate": 3.924042073686279e-07, "loss": 0.31, "step": 147 }, { "epoch": 0.4, "logps_train/chosen": -66.71026611328125, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -106.875, "logps_train/rejected": -114.87745666503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3216000497341156, "rewards_train/margins": 1.1230181157588959, "rewards_train/rejected": -0.8014180660247803, "step": 147 }, { "epoch": 0.41, "learning_rate": 3.9223744388444937e-07, "loss": 0.2908, "step": 148 }, { "epoch": 0.41, "logps_train/chosen": -67.41333770751953, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -112.50668334960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2804920971393585, "rewards_train/margins": 1.081160694360733, "rewards_train/rejected": -0.8006685972213745, "step": 148 }, { "epoch": 0.41, "learning_rate": 3.9206890581727016e-07, "loss": 0.3041, "step": 149 }, { "epoch": 0.41, "logps_train/chosen": -68.53289031982422, "logps_train/ref_chosen": -71.875, "logps_train/ref_rejected": -109.75, "logps_train/rejected": -118.39039611816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33069556951522827, "rewards_train/margins": 1.1952234506607056, "rewards_train/rejected": -0.8645278811454773, "step": 149 }, { "epoch": 0.41, "learning_rate": 3.9189859472289947e-07, "loss": 0.274, "step": 150 }, { "epoch": 0.41, "logps_train/chosen": -67.04539489746094, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -107.375, "logps_train/rejected": -115.51918029785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3296896517276764, "rewards_train/margins": 1.1419588029384613, "rewards_train/rejected": -0.8122691512107849, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.917265121735137e-07, "loss": 0.286, "step": 151 }, { "epoch": 0.42, "logps_train/chosen": -65.2917709350586, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -112.35641479492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2758035957813263, "rewards_train/margins": 1.0735547840595245, "rewards_train/rejected": -0.7977511882781982, "step": 151 }, { "epoch": 0.42, "learning_rate": 3.91552659757642e-07, "loss": 0.3087, "step": 152 }, { "epoch": 0.42, "logps_train/chosen": -66.42459869384766, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -110.56661224365234, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.309053897857666, "rewards_train/margins": 1.1259687542915344, "rewards_train/rejected": -0.8169148564338684, "step": 152 }, { "epoch": 0.42, "learning_rate": 3.9137703908015135e-07, "loss": 0.2971, "step": 153 }, { "epoch": 0.42, "logps_train/chosen": -68.48994445800781, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -115.14610290527344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.322587788105011, "rewards_train/margins": 1.1567292213439941, "rewards_train/rejected": -0.8341414332389832, "step": 153 }, { "epoch": 0.42, "learning_rate": 3.9119965176223205e-07, "loss": 0.2846, "step": 154 }, { "epoch": 0.42, "logps_train/chosen": -63.799896240234375, "logps_train/ref_chosen": -66.5, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -110.21923065185547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2692292630672455, "rewards_train/margins": 1.107851356267929, "rewards_train/rejected": -0.8386220932006836, "step": 154 }, { "epoch": 0.43, "learning_rate": 3.910204994413825e-07, "loss": 0.3033, "step": 155 }, { "epoch": 0.43, "logps_train/chosen": -65.6150894165039, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -115.9381103515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30352982878685, "rewards_train/margins": 1.1786400973796844, "rewards_train/rejected": -0.8751102685928345, "step": 155 }, { "epoch": 0.43, "learning_rate": 3.908395837713943e-07, "loss": 0.2863, "step": 156 }, { "epoch": 0.43, "logps_train/chosen": -68.37384796142578, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -117.33738708496094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3323906362056732, "rewards_train/margins": 1.242594689130783, "rewards_train/rejected": -0.9102040529251099, "step": 156 }, { "epoch": 0.43, "learning_rate": 3.906569064223368e-07, "loss": 0.2676, "step": 157 }, { "epoch": 0.43, "logps_train/chosen": -66.83506774902344, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -112.38905334472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29461824893951416, "rewards_train/margins": 1.1498319506645203, "rewards_train/rejected": -0.8552137017250061, "step": 157 }, { "epoch": 0.44, "learning_rate": 3.9047246908054186e-07, "loss": 0.2894, "step": 158 }, { "epoch": 0.44, "logps_train/chosen": -67.91863250732422, "logps_train/ref_chosen": -70.9375, "logps_train/ref_rejected": -109.875, "logps_train/rejected": -118.97506713867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3046211004257202, "rewards_train/margins": 1.2181430459022522, "rewards_train/rejected": -0.913521945476532, "step": 158 }, { "epoch": 0.44, "learning_rate": 3.9028627344858824e-07, "loss": 0.2744, "step": 159 }, { "epoch": 0.44, "logps_train/chosen": -65.3331298828125, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -110.50358581542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3166872560977936, "rewards_train/margins": 1.1516160666942596, "rewards_train/rejected": -0.8349288105964661, "step": 159 }, { "epoch": 0.44, "learning_rate": 3.9009832124528563e-07, "loss": 0.2932, "step": 160 }, { "epoch": 0.44, "logps_train/chosen": -64.2282485961914, "logps_train/ref_chosen": -66.875, "logps_train/ref_rejected": -98.875, "logps_train/rejected": -107.28849029541016, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2655053734779358, "rewards_train/margins": 1.1069518327713013, "rewards_train/rejected": -0.8414464592933655, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.899086142056591e-07, "loss": 0.3036, "step": 161 }, { "epoch": 0.44, "logps_train/chosen": -66.19865417480469, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -110.38446044921875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.27129703760147095, "rewards_train/margins": 1.1437271237373352, "rewards_train/rejected": -0.8724300861358643, "step": 161 }, { "epoch": 0.45, "learning_rate": 3.8971715408093296e-07, "loss": 0.2978, "step": 162 }, { "epoch": 0.45, "logps_train/chosen": -63.84934997558594, "logps_train/ref_chosen": -67.1875, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -111.30949401855469, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3363536596298218, "rewards_train/margins": 1.3042654395103455, "rewards_train/rejected": -0.9679117798805237, "step": 162 }, { "epoch": 0.45, "learning_rate": 3.8952394263851473e-07, "loss": 0.2514, "step": 163 }, { "epoch": 0.45, "logps_train/chosen": -65.56851959228516, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -105.4375, "logps_train/rejected": -115.2369613647461, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2952476441860199, "rewards_train/margins": 1.274168998003006, "rewards_train/rejected": -0.9789213538169861, "step": 163 }, { "epoch": 0.45, "learning_rate": 3.893289816619785e-07, "loss": 0.2618, "step": 164 }, { "epoch": 0.45, "logps_train/chosen": -64.88726806640625, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -112.51302337646484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3217713534832001, "rewards_train/margins": 1.2337671220302582, "rewards_train/rejected": -0.9119957685470581, "step": 164 }, { "epoch": 0.45, "learning_rate": 3.891322729510488e-07, "loss": 0.2688, "step": 165 }, { "epoch": 0.45, "logps_train/chosen": -65.66615295410156, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -114.52912902832031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3406597673892975, "rewards_train/margins": 1.3229675590991974, "rewards_train/rejected": -0.9823077917098999, "step": 165 }, { "epoch": 0.46, "learning_rate": 3.8893381832158374e-07, "loss": 0.2503, "step": 166 }, { "epoch": 0.46, "logps_train/chosen": -65.76097106933594, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -112.68556213378906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3270767629146576, "rewards_train/margins": 1.2965125143527985, "rewards_train/rejected": -0.9694357514381409, "step": 166 }, { "epoch": 0.46, "learning_rate": 3.8873361960555853e-07, "loss": 0.2588, "step": 167 }, { "epoch": 0.46, "logps_train/chosen": -69.190673828125, "logps_train/ref_chosen": -72.25, "logps_train/ref_rejected": -108.9375, "logps_train/rejected": -118.93572998046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.306421160697937, "rewards_train/margins": 1.3049747943878174, "rewards_train/rejected": -0.9985536336898804, "step": 167 }, { "epoch": 0.46, "learning_rate": 3.885316786510482e-07, "loss": 0.2621, "step": 168 }, { "epoch": 0.46, "logps_train/chosen": -67.1773681640625, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -119.10082244873047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36180347204208374, "rewards_train/margins": 1.4337510466575623, "rewards_train/rejected": -1.0719475746154785, "step": 168 }, { "epoch": 0.47, "learning_rate": 3.8832799732221083e-07, "loss": 0.2253, "step": 169 }, { "epoch": 0.47, "logps_train/chosen": -65.84580993652344, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -111.2647933959961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3027724027633667, "rewards_train/margins": 1.2570351362228394, "rewards_train/rejected": -0.9542627334594727, "step": 169 }, { "epoch": 0.47, "learning_rate": 3.881225774992703e-07, "loss": 0.271, "step": 170 }, { "epoch": 0.47, "logps_train/chosen": -68.2993392944336, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -118.13156127929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3450171947479248, "rewards_train/margins": 1.4376662969589233, "rewards_train/rejected": -1.0926491022109985, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.879154210784989e-07, "loss": 0.2321, "step": 171 }, { "epoch": 0.47, "logps_train/chosen": -68.13359069824219, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -107.3125, "logps_train/rejected": -117.83003234863281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2984578609466553, "rewards_train/margins": 1.3485993146896362, "rewards_train/rejected": -1.050141453742981, "step": 171 }, { "epoch": 0.47, "learning_rate": 3.877065299721996e-07, "loss": 0.2511, "step": 172 }, { "epoch": 0.47, "logps_train/chosen": -64.36878967285156, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -114.93661499023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32894134521484375, "rewards_train/margins": 1.4316843748092651, "rewards_train/rejected": -1.1027430295944214, "step": 172 }, { "epoch": 0.48, "learning_rate": 3.8749590610868885e-07, "loss": 0.2274, "step": 173 }, { "epoch": 0.48, "logps_train/chosen": -68.22483825683594, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -116.96788787841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30827760696411133, "rewards_train/margins": 1.404480218887329, "rewards_train/rejected": -1.0962026119232178, "step": 173 }, { "epoch": 0.48, "learning_rate": 3.872835514322785e-07, "loss": 0.2379, "step": 174 }, { "epoch": 0.48, "logps_train/chosen": -65.52174377441406, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -116.16912078857422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34416329860687256, "rewards_train/margins": 1.452872395515442, "rewards_train/rejected": -1.1087090969085693, "step": 174 }, { "epoch": 0.48, "learning_rate": 3.8706946790325763e-07, "loss": 0.2268, "step": 175 }, { "epoch": 0.48, "logps_train/chosen": -66.98771667480469, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -117.7575912475586, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.2859450578689575, "rewards_train/margins": 1.4194189310073853, "rewards_train/rejected": -1.1334738731384277, "step": 175 }, { "epoch": 0.48, "learning_rate": 3.8685365749787504e-07, "loss": 0.2369, "step": 176 }, { "epoch": 0.48, "logps_train/chosen": -65.60944366455078, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -113.75597381591797, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.27977830171585083, "rewards_train/margins": 1.341215193271637, "rewards_train/rejected": -1.0614368915557861, "step": 176 }, { "epoch": 0.49, "learning_rate": 3.866361222083205e-07, "loss": 0.2538, "step": 177 }, { "epoch": 0.49, "logps_train/chosen": -66.27265930175781, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -107.0625, "logps_train/rejected": -117.65594482421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3459765911102295, "rewards_train/margins": 1.4080555438995361, "rewards_train/rejected": -1.0620789527893066, "step": 177 }, { "epoch": 0.49, "learning_rate": 3.864168640427066e-07, "loss": 0.2403, "step": 178 }, { "epoch": 0.49, "logps_train/chosen": -65.53939819335938, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -114.0365982055664, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32911744713783264, "rewards_train/margins": 1.4585579931735992, "rewards_train/rejected": -1.1294405460357666, "step": 178 }, { "epoch": 0.49, "learning_rate": 3.8619588502505005e-07, "loss": 0.2258, "step": 179 }, { "epoch": 0.49, "logps_train/chosen": -66.5870361328125, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -115.69666290283203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2786986529827118, "rewards_train/margins": 1.4782480895519257, "rewards_train/rejected": -1.1995494365692139, "step": 179 }, { "epoch": 0.5, "learning_rate": 3.859731871952533e-07, "loss": 0.227, "step": 180 }, { "epoch": 0.5, "logps_train/chosen": -65.64060974121094, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -115.1658935546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.283302366733551, "rewards_train/margins": 1.4153218865394592, "rewards_train/rejected": -1.1320195198059082, "step": 180 }, { "epoch": 0.5, "learning_rate": 3.857487726090853e-07, "loss": 0.238, "step": 181 }, { "epoch": 0.5, "logps_train/chosen": -65.44219970703125, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -116.03903198242188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.3379576802253723, "rewards_train/margins": 1.4693017601966858, "rewards_train/rejected": -1.1313440799713135, "step": 181 }, { "epoch": 0.5, "learning_rate": 3.855226433381628e-07, "loss": 0.2255, "step": 182 }, { "epoch": 0.5, "logps_train/chosen": -66.49085235595703, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -117.62995910644531, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.3029167950153351, "rewards_train/margins": 1.5004823505878448, "rewards_train/rejected": -1.1975655555725098, "step": 182 }, { "epoch": 0.5, "learning_rate": 3.8529480146993116e-07, "loss": 0.2224, "step": 183 }, { "epoch": 0.5, "logps_train/chosen": -65.6891098022461, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -116.45549774169922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3273293375968933, "rewards_train/margins": 1.538406789302826, "rewards_train/rejected": -1.2110774517059326, "step": 183 }, { "epoch": 0.51, "learning_rate": 3.8506524910764504e-07, "loss": 0.2185, "step": 184 }, { "epoch": 0.51, "logps_train/chosen": -66.42987060546875, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -101.375, "logps_train/rejected": -113.10614013671875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2214173823595047, "rewards_train/margins": 1.3926764279603958, "rewards_train/rejected": -1.1712590456008911, "step": 184 }, { "epoch": 0.51, "learning_rate": 3.8483398837034917e-07, "loss": 0.2491, "step": 185 }, { "epoch": 0.51, "logps_train/chosen": -66.41133880615234, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -103.3125, "logps_train/rejected": -115.15336608886719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3254678249359131, "rewards_train/margins": 1.5118980407714844, "rewards_train/rejected": -1.1864302158355713, "step": 185 }, { "epoch": 0.51, "learning_rate": 3.846010213928585e-07, "loss": 0.225, "step": 186 }, { "epoch": 0.51, "logps_train/chosen": -67.17622375488281, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -121.104736328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35791486501693726, "rewards_train/margins": 1.7002249360084534, "rewards_train/rejected": -1.3423100709915161, "step": 186 }, { "epoch": 0.52, "learning_rate": 3.843663503257385e-07, "loss": 0.1797, "step": 187 }, { "epoch": 0.52, "logps_train/chosen": -65.65629577636719, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -118.10845947265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3367144465446472, "rewards_train/margins": 1.6092299818992615, "rewards_train/rejected": -1.2725155353546143, "step": 187 }, { "epoch": 0.52, "learning_rate": 3.841299773352857e-07, "loss": 0.2069, "step": 188 }, { "epoch": 0.52, "logps_train/chosen": -66.4447021484375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -119.11358642578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35372307896614075, "rewards_train/margins": 1.6647887527942657, "rewards_train/rejected": -1.311065673828125, "step": 188 }, { "epoch": 0.52, "learning_rate": 3.838919046035074e-07, "loss": 0.188, "step": 189 }, { "epoch": 0.52, "logps_train/chosen": -65.93359375, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -117.13343048095703, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2958986759185791, "rewards_train/margins": 1.5937633514404297, "rewards_train/rejected": -1.2978646755218506, "step": 189 }, { "epoch": 0.52, "learning_rate": 3.8365213432810137e-07, "loss": 0.2104, "step": 190 }, { "epoch": 0.52, "logps_train/chosen": -64.42159271240234, "logps_train/ref_chosen": -66.8125, "logps_train/ref_rejected": -100.75, "logps_train/rejected": -112.67054748535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23801709711551666, "rewards_train/margins": 1.4267025142908096, "rewards_train/rejected": -1.188685417175293, "step": 190 }, { "epoch": 0.53, "learning_rate": 3.8341066872243583e-07, "loss": 0.2473, "step": 191 }, { "epoch": 0.53, "logps_train/chosen": -67.44851684570312, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -119.18185424804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3566613495349884, "rewards_train/margins": 1.6147882044315338, "rewards_train/rejected": -1.2581268548965454, "step": 191 }, { "epoch": 0.53, "learning_rate": 3.83167510015529e-07, "loss": 0.2025, "step": 192 }, { "epoch": 0.53, "logps_train/chosen": -64.89630126953125, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -117.63510131835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2903500199317932, "rewards_train/margins": 1.630129873752594, "rewards_train/rejected": -1.3397798538208008, "step": 192 }, { "epoch": 0.53, "learning_rate": 3.8292266045202856e-07, "loss": 0.206, "step": 193 }, { "epoch": 0.53, "logps_train/chosen": -66.47341918945312, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -121.96983337402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35617390275001526, "rewards_train/margins": 1.7340661585330963, "rewards_train/rejected": -1.377892255783081, "step": 193 }, { "epoch": 0.53, "learning_rate": 3.826761222921905e-07, "loss": 0.1841, "step": 194 }, { "epoch": 0.53, "logps_train/chosen": -65.3082504272461, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -100.25, "logps_train/rejected": -113.4607162475586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3069191873073578, "rewards_train/margins": 1.629504770040512, "rewards_train/rejected": -1.3225855827331543, "step": 194 }, { "epoch": 0.54, "learning_rate": 3.824278978118589e-07, "loss": 0.1996, "step": 195 }, { "epoch": 0.54, "logps_train/chosen": -66.54264831542969, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -118.18567657470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28045207262039185, "rewards_train/margins": 1.6357381939888, "rewards_train/rejected": -1.3552861213684082, "step": 195 }, { "epoch": 0.54, "learning_rate": 3.821779893024445e-07, "loss": 0.2077, "step": 196 }, { "epoch": 0.54, "logps_train/chosen": -69.48822021484375, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -108.4375, "logps_train/rejected": -122.43540954589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3486877679824829, "rewards_train/margins": 1.747208833694458, "rewards_train/rejected": -1.398521065711975, "step": 196 }, { "epoch": 0.54, "learning_rate": 3.819263990709037e-07, "loss": 0.1844, "step": 197 }, { "epoch": 0.54, "logps_train/chosen": -67.82771301269531, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -114.873779296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28324460983276367, "rewards_train/margins": 1.6238936185836792, "rewards_train/rejected": -1.3406490087509155, "step": 197 }, { "epoch": 0.55, "learning_rate": 3.81673129439717e-07, "loss": 0.2134, "step": 198 }, { "epoch": 0.55, "logps_train/chosen": -65.90495300292969, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -117.00414276123047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29998284578323364, "rewards_train/margins": 1.6630921959877014, "rewards_train/rejected": -1.3631093502044678, "step": 198 }, { "epoch": 0.55, "learning_rate": 3.8141818274686813e-07, "loss": 0.2009, "step": 199 }, { "epoch": 0.55, "logps_train/chosen": -68.09138488769531, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -120.63011169433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3309003710746765, "rewards_train/margins": 1.7492335438728333, "rewards_train/rejected": -1.4183331727981567, "step": 199 }, { "epoch": 0.55, "learning_rate": 3.811615613458219e-07, "loss": 0.1806, "step": 200 }, { "epoch": 0.55, "logps_train/chosen": -67.11273956298828, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -108.75, "logps_train/rejected": -123.14047241210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31216347217559814, "rewards_train/margins": 1.749501347541809, "rewards_train/rejected": -1.437337875366211, "step": 200 }, { "epoch": 0.55, "learning_rate": 3.8090326760550264e-07, "loss": 0.1859, "step": 201 }, { "epoch": 0.55, "logps_train/chosen": -68.36871337890625, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -108.4375, "logps_train/rejected": -122.29510498046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21845079958438873, "rewards_train/margins": 1.601183906197548, "rewards_train/rejected": -1.3827331066131592, "step": 201 }, { "epoch": 0.56, "learning_rate": 3.806433039102726e-07, "loss": 0.2218, "step": 202 }, { "epoch": 0.56, "logps_train/chosen": -65.55032348632812, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -117.14064025878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2829066216945648, "rewards_train/margins": 1.734226554632187, "rewards_train/rejected": -1.451319932937622, "step": 202 }, { "epoch": 0.56, "learning_rate": 3.803816726599095e-07, "loss": 0.1939, "step": 203 }, { "epoch": 0.56, "logps_train/chosen": -65.89816284179688, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -118.95291900634766, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3216097056865692, "rewards_train/margins": 1.7436592280864716, "rewards_train/rejected": -1.4220495223999023, "step": 203 }, { "epoch": 0.56, "learning_rate": 3.801183762695848e-07, "loss": 0.1885, "step": 204 }, { "epoch": 0.56, "logps_train/chosen": -67.71683502197266, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -120.57841491699219, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2818318009376526, "rewards_train/margins": 1.7574473023414612, "rewards_train/rejected": -1.4756155014038086, "step": 204 }, { "epoch": 0.56, "learning_rate": 3.79853417169841e-07, "loss": 0.1882, "step": 205 }, { "epoch": 0.56, "logps_train/chosen": -69.13641357421875, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -121.61457824707031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2883606255054474, "rewards_train/margins": 1.807142049074173, "rewards_train/rejected": -1.5187814235687256, "step": 205 }, { "epoch": 0.57, "learning_rate": 3.795867978065696e-07, "loss": 0.1768, "step": 206 }, { "epoch": 0.57, "logps_train/chosen": -68.05232238769531, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -121.86480712890625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.33148664236068726, "rewards_train/margins": 1.8594714999198914, "rewards_train/rejected": -1.527984857559204, "step": 206 }, { "epoch": 0.57, "learning_rate": 3.793185206409883e-07, "loss": 0.1711, "step": 207 }, { "epoch": 0.57, "logps_train/chosen": -66.64213562011719, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -123.04705810546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34037643671035767, "rewards_train/margins": 1.9772112965583801, "rewards_train/rejected": -1.6368348598480225, "step": 207 }, { "epoch": 0.57, "learning_rate": 3.790485881496181e-07, "loss": 0.1581, "step": 208 }, { "epoch": 0.57, "logps_train/chosen": -65.75901794433594, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -122.47673797607422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3647235631942749, "rewards_train/margins": 1.9748976230621338, "rewards_train/rejected": -1.6101740598678589, "step": 208 }, { "epoch": 0.58, "learning_rate": 3.7877700282426093e-07, "loss": 0.149, "step": 209 }, { "epoch": 0.58, "logps_train/chosen": -67.1761703491211, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -119.12893676757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26836949586868286, "rewards_train/margins": 1.7590953707695007, "rewards_train/rejected": -1.4907258749008179, "step": 209 }, { "epoch": 0.58, "learning_rate": 3.7850376717197624e-07, "loss": 0.1946, "step": 210 }, { "epoch": 0.58, "logps_train/chosen": -66.6448974609375, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -118.13958740234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3180783987045288, "rewards_train/margins": 1.863580584526062, "rewards_train/rejected": -1.5455021858215332, "step": 210 }, { "epoch": 0.58, "learning_rate": 3.782288837150579e-07, "loss": 0.1716, "step": 211 }, { "epoch": 0.58, "logps_train/chosen": -67.99040222167969, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -107.3125, "logps_train/rejected": -123.44499206542969, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21746429800987244, "rewards_train/margins": 1.8329595029354095, "rewards_train/rejected": -1.615495204925537, "step": 211 }, { "epoch": 0.58, "learning_rate": 3.779523549910112e-07, "loss": 0.1869, "step": 212 }, { "epoch": 0.58, "logps_train/chosen": -68.016357421875, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -124.78965759277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3181886076927185, "rewards_train/margins": 1.960338532924652, "rewards_train/rejected": -1.6421499252319336, "step": 212 }, { "epoch": 0.59, "learning_rate": 3.7767418355252907e-07, "loss": 0.1615, "step": 213 }, { "epoch": 0.59, "logps_train/chosen": -65.8499984741211, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -120.34625244140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27671903371810913, "rewards_train/margins": 1.939274251461029, "rewards_train/rejected": -1.66255521774292, "step": 213 }, { "epoch": 0.59, "learning_rate": 3.7739437196746874e-07, "loss": 0.1577, "step": 214 }, { "epoch": 0.59, "logps_train/chosen": -64.1513671875, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -122.22541809082031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3262208104133606, "rewards_train/margins": 2.0327475666999817, "rewards_train/rejected": -1.706526756286621, "step": 214 }, { "epoch": 0.59, "learning_rate": 3.7739437196746874e-07, "loss": 0.1473, "step": 215 }, { "epoch": 0.59, "logps_train/chosen": -68.11614227294922, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -125.73965454101562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35484129190444946, "rewards_train/margins": 2.114744007587433, "rewards_train/rejected": -1.7599027156829834, "step": 215 }, { "epoch": 0.6, "learning_rate": 3.771129228188279e-07, "loss": 0.1398, "step": 216 }, { "epoch": 0.6, "logps_train/chosen": -68.81795501708984, "logps_train/ref_chosen": -72.375, "logps_train/ref_rejected": -111.625, "logps_train/rejected": -128.6156768798828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35809728503227234, "rewards_train/margins": 2.058141201734543, "rewards_train/rejected": -1.7000439167022705, "step": 216 }, { "epoch": 0.6, "learning_rate": 3.76829838704721e-07, "loss": 0.1407, "step": 217 }, { "epoch": 0.6, "logps_train/chosen": -65.75769805908203, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -121.30233001708984, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40684735774993896, "rewards_train/margins": 1.983076572418213, "rewards_train/rejected": -1.576229214668274, "step": 217 }, { "epoch": 0.6, "learning_rate": 3.7654512223835513e-07, "loss": 0.1529, "step": 218 }, { "epoch": 0.6, "logps_train/chosen": -66.90615844726562, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -110.125, "logps_train/rejected": -126.88652801513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33755773305892944, "rewards_train/margins": 2.0113672614097595, "rewards_train/rejected": -1.67380952835083, "step": 218 }, { "epoch": 0.6, "learning_rate": 3.76258776048006e-07, "loss": 0.1571, "step": 219 }, { "epoch": 0.6, "logps_train/chosen": -64.61837005615234, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -122.85336303710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3095495104789734, "rewards_train/margins": 2.0090953707695007, "rewards_train/rejected": -1.6995458602905273, "step": 219 }, { "epoch": 0.61, "learning_rate": 3.7597080277699364e-07, "loss": 0.1478, "step": 220 }, { "epoch": 0.61, "logps_train/chosen": -62.8966064453125, "logps_train/ref_chosen": -66.3125, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -118.597412109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3429568409919739, "rewards_train/margins": 2.0109983086586, "rewards_train/rejected": -1.668041467666626, "step": 220 }, { "epoch": 0.61, "learning_rate": 3.75681205083658e-07, "loss": 0.1642, "step": 221 }, { "epoch": 0.61, "logps_train/chosen": -64.59232330322266, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -119.57453918457031, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.335396945476532, "rewards_train/margins": 1.9974401593208313, "rewards_train/rejected": -1.6620432138442993, "step": 221 }, { "epoch": 0.61, "learning_rate": 3.753899856413343e-07, "loss": 0.1516, "step": 222 }, { "epoch": 0.61, "logps_train/chosen": -64.88716888427734, "logps_train/ref_chosen": -67.1875, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -120.6978759765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22949601709842682, "rewards_train/margins": 1.9376629739999771, "rewards_train/rejected": -1.7081669569015503, "step": 222 }, { "epoch": 0.61, "learning_rate": 3.750971471383285e-07, "loss": 0.1768, "step": 223 }, { "epoch": 0.61, "logps_train/chosen": -65.24188232421875, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -119.3963851928711, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3348449766635895, "rewards_train/margins": 2.0215534269809723, "rewards_train/rejected": -1.6867084503173828, "step": 223 }, { "epoch": 0.62, "learning_rate": 3.7480269227789263e-07, "loss": 0.1506, "step": 224 }, { "epoch": 0.62, "logps_train/chosen": -66.01509857177734, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -120.08037567138672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3277382254600525, "rewards_train/margins": 2.052181899547577, "rewards_train/rejected": -1.7244436740875244, "step": 224 }, { "epoch": 0.62, "learning_rate": 3.7450662377819936e-07, "loss": 0.141, "step": 225 }, { "epoch": 0.62, "logps_train/chosen": -67.20602416992188, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -124.2911376953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34546205401420593, "rewards_train/margins": 2.1728664338588715, "rewards_train/rejected": -1.8274043798446655, "step": 225 }, { "epoch": 0.62, "learning_rate": 3.7420894437231724e-07, "loss": 0.1344, "step": 226 }, { "epoch": 0.62, "logps_train/chosen": -67.56610107421875, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -123.29722595214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28811630606651306, "rewards_train/margins": 2.078190177679062, "rewards_train/rejected": -1.7900738716125488, "step": 226 }, { "epoch": 0.63, "learning_rate": 3.739096568081857e-07, "loss": 0.1476, "step": 227 }, { "epoch": 0.63, "logps_train/chosen": -68.35181427001953, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -109.1875, "logps_train/rejected": -127.40929412841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3604728877544403, "rewards_train/margins": 2.1826522052288055, "rewards_train/rejected": -1.8221793174743652, "step": 227 }, { "epoch": 0.63, "learning_rate": 3.7360876384858903e-07, "loss": 0.1337, "step": 228 }, { "epoch": 0.63, "logps_train/chosen": -67.3569564819336, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -122.32514953613281, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28427544236183167, "rewards_train/margins": 2.0697201788425446, "rewards_train/rejected": -1.785444736480713, "step": 228 }, { "epoch": 0.63, "learning_rate": 3.7330626827113155e-07, "loss": 0.1499, "step": 229 }, { "epoch": 0.63, "logps_train/chosen": -64.38630676269531, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -120.96087646484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.371086448431015, "rewards_train/margins": 2.1694198548793793, "rewards_train/rejected": -1.7983334064483643, "step": 229 }, { "epoch": 0.63, "learning_rate": 3.7300217286821153e-07, "loss": 0.1323, "step": 230 }, { "epoch": 0.63, "logps_train/chosen": -68.5704345703125, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -122.4629135131836, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24163834750652313, "rewards_train/margins": 2.0911521166563034, "rewards_train/rejected": -1.8495137691497803, "step": 230 }, { "epoch": 0.64, "learning_rate": 3.726964804469954e-07, "loss": 0.1568, "step": 231 }, { "epoch": 0.64, "logps_train/chosen": -68.672607421875, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -125.5069808959961, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3246828317642212, "rewards_train/margins": 2.270156145095825, "rewards_train/rejected": -1.945473313331604, "step": 231 }, { "epoch": 0.64, "learning_rate": 3.723891938293925e-07, "loss": 0.1236, "step": 232 }, { "epoch": 0.64, "logps_train/chosen": -64.94799041748047, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -122.21669006347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35422420501708984, "rewards_train/margins": 2.2347800731658936, "rewards_train/rejected": -1.8805558681488037, "step": 232 }, { "epoch": 0.64, "learning_rate": 3.7208031585202786e-07, "loss": 0.1332, "step": 233 }, { "epoch": 0.64, "logps_train/chosen": -64.60661315917969, "logps_train/ref_chosen": -67.5625, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -120.41319274902344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29783493280410767, "rewards_train/margins": 2.0988219380378723, "rewards_train/rejected": -1.8009870052337646, "step": 233 }, { "epoch": 0.64, "learning_rate": 3.7176984936621716e-07, "loss": 0.1516, "step": 234 }, { "epoch": 0.64, "logps_train/chosen": -70.03324890136719, "logps_train/ref_chosen": -73.1875, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -127.7325439453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31200700998306274, "rewards_train/margins": 2.248932898044586, "rewards_train/rejected": -1.9369258880615234, "step": 234 }, { "epoch": 0.65, "learning_rate": 3.714577972379398e-07, "loss": 0.1348, "step": 235 }, { "epoch": 0.65, "logps_train/chosen": -65.72310638427734, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -120.07861328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2396036982536316, "rewards_train/margins": 2.054105579853058, "rewards_train/rejected": -1.8145018815994263, "step": 235 }, { "epoch": 0.65, "learning_rate": 3.7114416234781246e-07, "loss": 0.1523, "step": 236 }, { "epoch": 0.65, "logps_train/chosen": -65.64006042480469, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -106.1875, "logps_train/rejected": -125.13241577148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2765706479549408, "rewards_train/margins": 2.1701835095882416, "rewards_train/rejected": -1.8936128616333008, "step": 236 }, { "epoch": 0.65, "learning_rate": 3.708289475910627e-07, "loss": 0.1424, "step": 237 }, { "epoch": 0.65, "logps_train/chosen": -68.27067565917969, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -107.75, "logps_train/rejected": -128.1477508544922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33460235595703125, "rewards_train/margins": 2.375450849533081, "rewards_train/rejected": -2.04084849357605, "step": 237 }, { "epoch": 0.66, "learning_rate": 3.705121558775022e-07, "loss": 0.1174, "step": 238 }, { "epoch": 0.66, "logps_train/chosen": -69.55337524414062, "logps_train/ref_chosen": -72.6875, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -127.07475280761719, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31106865406036377, "rewards_train/margins": 2.3651262521743774, "rewards_train/rejected": -2.0540575981140137, "step": 238 }, { "epoch": 0.66, "learning_rate": 3.701937901314997e-07, "loss": 0.1122, "step": 239 }, { "epoch": 0.66, "logps_train/chosen": -67.24723052978516, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -124.09799194335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3198084235191345, "rewards_train/margins": 2.3136902451515198, "rewards_train/rejected": -1.9938818216323853, "step": 239 }, { "epoch": 0.66, "learning_rate": 3.6987385329195433e-07, "loss": 0.1234, "step": 240 }, { "epoch": 0.66, "logps_train/chosen": -65.5339584350586, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -122.10250854492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23312723636627197, "rewards_train/margins": 2.2783387899398804, "rewards_train/rejected": -2.0452115535736084, "step": 240 }, { "epoch": 0.66, "learning_rate": 3.695523483122682e-07, "loss": 0.1348, "step": 241 }, { "epoch": 0.66, "logps_train/chosen": -68.99905395507812, "logps_train/ref_chosen": -72.625, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -127.16194915771484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3620084524154663, "rewards_train/margins": 2.470781445503235, "rewards_train/rejected": -2.1087729930877686, "step": 241 }, { "epoch": 0.67, "learning_rate": 3.6922927816031926e-07, "loss": 0.0972, "step": 242 }, { "epoch": 0.67, "logps_train/chosen": -66.30967712402344, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -122.1876220703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2995986342430115, "rewards_train/margins": 2.326368749141693, "rewards_train/rejected": -2.0267701148986816, "step": 242 }, { "epoch": 0.67, "learning_rate": 3.6890464581843393e-07, "loss": 0.1233, "step": 243 }, { "epoch": 0.67, "logps_train/chosen": -68.20955657958984, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -124.61915588378906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26825350522994995, "rewards_train/margins": 2.302825629711151, "rewards_train/rejected": -2.034572124481201, "step": 243 }, { "epoch": 0.67, "learning_rate": 3.685784542833594e-07, "loss": 0.1427, "step": 244 }, { "epoch": 0.67, "logps_train/chosen": -69.18128967285156, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -109.0625, "logps_train/rejected": -129.1211395263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2838733494281769, "rewards_train/margins": 2.2887604534626007, "rewards_train/rejected": -2.004887104034424, "step": 244 }, { "epoch": 0.67, "learning_rate": 3.6825070656623623e-07, "loss": 0.1316, "step": 245 }, { "epoch": 0.67, "logps_train/chosen": -67.39363098144531, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.875, "logps_train/rejected": -127.90760803222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3002850413322449, "rewards_train/margins": 2.4032530188560486, "rewards_train/rejected": -2.1029679775238037, "step": 245 }, { "epoch": 0.68, "learning_rate": 3.679214056925703e-07, "loss": 0.1131, "step": 246 }, { "epoch": 0.68, "logps_train/chosen": -67.8221435546875, "logps_train/ref_chosen": -70.9375, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -129.6324920654297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3097775876522064, "rewards_train/margins": 2.5053506791591644, "rewards_train/rejected": -2.195573091506958, "step": 246 }, { "epoch": 0.68, "learning_rate": 3.675905547022051e-07, "loss": 0.1085, "step": 247 }, { "epoch": 0.68, "logps_train/chosen": -66.85993957519531, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -108.75, "logps_train/rejected": -130.54115295410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29755109548568726, "rewards_train/margins": 2.475737988948822, "rewards_train/rejected": -2.1781868934631348, "step": 247 }, { "epoch": 0.68, "learning_rate": 3.6725815664929353e-07, "loss": 0.1199, "step": 248 }, { "epoch": 0.68, "logps_train/chosen": -67.43453216552734, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -126.64222717285156, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.3077673316001892, "rewards_train/margins": 2.4207696318626404, "rewards_train/rejected": -2.113002300262451, "step": 248 }, { "epoch": 0.69, "learning_rate": 3.6692421460226964e-07, "loss": 0.1213, "step": 249 }, { "epoch": 0.69, "logps_train/chosen": -65.75762939453125, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -125.96774291992188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.34718620777130127, "rewards_train/margins": 2.4495269060134888, "rewards_train/rejected": -2.1023406982421875, "step": 249 }, { "epoch": 0.69, "learning_rate": 3.665887316438206e-07, "loss": 0.1219, "step": 250 }, { "epoch": 0.69, "logps_train/chosen": -66.62548828125, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -107.9375, "logps_train/rejected": -130.83229064941406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36054670810699463, "rewards_train/margins": 2.6494396924972534, "rewards_train/rejected": -2.288892984390259, "step": 250 }, { "epoch": 0.69, "learning_rate": 3.662517108708578e-07, "loss": 0.1003, "step": 251 }, { "epoch": 0.69, "logps_train/chosen": -63.33588409423828, "logps_train/ref_chosen": -66.5625, "logps_train/ref_rejected": -101.125, "logps_train/rejected": -121.44000244140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31909704208374023, "rewards_train/margins": 2.34713077545166, "rewards_train/rejected": -2.02803373336792, "step": 251 }, { "epoch": 0.69, "learning_rate": 3.659131553944888e-07, "loss": 0.1339, "step": 252 }, { "epoch": 0.69, "logps_train/chosen": -67.04878234863281, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -127.98023986816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3228071928024292, "rewards_train/margins": 2.4271305799484253, "rewards_train/rejected": -2.104323387145996, "step": 252 }, { "epoch": 0.7, "learning_rate": 3.65573068339988e-07, "loss": 0.1229, "step": 253 }, { "epoch": 0.7, "logps_train/chosen": -65.21377563476562, "logps_train/ref_chosen": -67.3125, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -122.9236068725586, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20889562368392944, "rewards_train/margins": 2.290660321712494, "rewards_train/rejected": -2.0817646980285645, "step": 253 }, { "epoch": 0.7, "learning_rate": 3.652314528467683e-07, "loss": 0.1395, "step": 254 }, { "epoch": 0.7, "logps_train/chosen": -66.54710388183594, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -103.1875, "logps_train/rejected": -125.13101196289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23596423864364624, "rewards_train/margins": 2.430071532726288, "rewards_train/rejected": -2.1941072940826416, "step": 254 }, { "epoch": 0.7, "learning_rate": 3.6488831206835206e-07, "loss": 0.1299, "step": 255 }, { "epoch": 0.7, "logps_train/chosen": -66.20193481445312, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -127.46617889404297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27653512358665466, "rewards_train/margins": 2.4606045186519623, "rewards_train/rejected": -2.1840693950653076, "step": 255 }, { "epoch": 0.71, "learning_rate": 3.6454364917234146e-07, "loss": 0.1158, "step": 256 }, { "epoch": 0.71, "logps_train/chosen": -68.06721496582031, "logps_train/ref_chosen": -70.9375, "logps_train/ref_rejected": -105.1875, "logps_train/rejected": -126.96392059326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2871263325214386, "rewards_train/margins": 2.464377671480179, "rewards_train/rejected": -2.1772513389587402, "step": 256 }, { "epoch": 0.71, "learning_rate": 3.6419746734039016e-07, "loss": 0.1199, "step": 257 }, { "epoch": 0.71, "logps_train/chosen": -68.26837158203125, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -128.2704315185547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28590649366378784, "rewards_train/margins": 2.541562259197235, "rewards_train/rejected": -2.2556557655334473, "step": 257 }, { "epoch": 0.71, "learning_rate": 3.6384976976817294e-07, "loss": 0.1053, "step": 258 }, { "epoch": 0.71, "logps_train/chosen": -65.9413070678711, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -103.0625, "logps_train/rejected": -125.15313720703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23462902009487152, "rewards_train/margins": 2.446719989180565, "rewards_train/rejected": -2.2120909690856934, "step": 258 }, { "epoch": 0.71, "learning_rate": 3.6350055966535694e-07, "loss": 0.1306, "step": 259 }, { "epoch": 0.71, "logps_train/chosen": -67.0677261352539, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -129.29359436035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23980918526649475, "rewards_train/margins": 2.607840210199356, "rewards_train/rejected": -2.3680310249328613, "step": 259 }, { "epoch": 0.72, "learning_rate": 3.631498402555719e-07, "loss": 0.1147, "step": 260 }, { "epoch": 0.72, "logps_train/chosen": -69.5660629272461, "logps_train/ref_chosen": -72.3125, "logps_train/ref_rejected": -110.0625, "logps_train/rejected": -133.06678771972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2747902274131775, "rewards_train/margins": 2.5755123496055603, "rewards_train/rejected": -2.300722122192383, "step": 260 }, { "epoch": 0.72, "learning_rate": 3.6279761477637994e-07, "loss": 0.1108, "step": 261 }, { "epoch": 0.72, "logps_train/chosen": -67.3389892578125, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -127.41424560546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30447980761528015, "rewards_train/margins": 2.5819389522075653, "rewards_train/rejected": -2.277459144592285, "step": 261 }, { "epoch": 0.72, "learning_rate": 3.624438864792463e-07, "loss": 0.1063, "step": 262 }, { "epoch": 0.72, "logps_train/chosen": -65.97480010986328, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -100.875, "logps_train/rejected": -123.11128234863281, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.1852349489927292, "rewards_train/margins": 2.4066175669431686, "rewards_train/rejected": -2.2213826179504395, "step": 262 }, { "epoch": 0.72, "learning_rate": 3.62088658629509e-07, "loss": 0.1356, "step": 263 }, { "epoch": 0.72, "logps_train/chosen": -66.5597152709961, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -128.99050903320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3054051995277405, "rewards_train/margins": 2.652210772037506, "rewards_train/rejected": -2.3468055725097656, "step": 263 }, { "epoch": 0.73, "learning_rate": 3.6173193450634854e-07, "loss": 0.0985, "step": 264 }, { "epoch": 0.73, "logps_train/chosen": -68.06388854980469, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -131.51055908203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3559156060218811, "rewards_train/margins": 2.823279917240143, "rewards_train/rejected": -2.4673643112182617, "step": 264 }, { "epoch": 0.73, "learning_rate": 3.613737174027581e-07, "loss": 0.0785, "step": 265 }, { "epoch": 0.73, "logps_train/chosen": -66.04621887207031, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -107.3125, "logps_train/rejected": -131.23416137695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3049972653388977, "rewards_train/margins": 2.6984809041023254, "rewards_train/rejected": -2.3934836387634277, "step": 265 }, { "epoch": 0.73, "learning_rate": 3.6101401062551257e-07, "loss": 0.091, "step": 266 }, { "epoch": 0.73, "logps_train/chosen": -68.33982849121094, "logps_train/ref_chosen": -72.25, "logps_train/ref_rejected": -108.9375, "logps_train/rejected": -133.20794677734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.39072492718696594, "rewards_train/margins": 2.821041375398636, "rewards_train/rejected": -2.43031644821167, "step": 266 }, { "epoch": 0.74, "learning_rate": 3.606528174951386e-07, "loss": 0.0841, "step": 267 }, { "epoch": 0.74, "logps_train/chosen": -65.80998229980469, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -125.50814056396484, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2731037735939026, "rewards_train/margins": 2.637541353702545, "rewards_train/rejected": -2.3644375801086426, "step": 267 }, { "epoch": 0.74, "learning_rate": 3.602901413458835e-07, "loss": 0.1018, "step": 268 }, { "epoch": 0.74, "logps_train/chosen": -68.4359130859375, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -130.5703887939453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28668296337127686, "rewards_train/margins": 2.6941128969192505, "rewards_train/rejected": -2.4074299335479736, "step": 268 }, { "epoch": 0.74, "learning_rate": 3.5992598552568476e-07, "loss": 0.0982, "step": 269 }, { "epoch": 0.74, "logps_train/chosen": -68.09016418457031, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -133.064208984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.333415150642395, "rewards_train/margins": 2.853068470954895, "rewards_train/rejected": -2.5196533203125, "step": 269 }, { "epoch": 0.74, "learning_rate": 3.5956035339613876e-07, "loss": 0.0821, "step": 270 }, { "epoch": 0.74, "logps_train/chosen": -70.06459045410156, "logps_train/ref_chosen": -73.375, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -134.149169921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3305521607398987, "rewards_train/margins": 2.9212499260902405, "rewards_train/rejected": -2.590697765350342, "step": 270 }, { "epoch": 0.75, "learning_rate": 3.591932483324703e-07, "loss": 0.0746, "step": 271 }, { "epoch": 0.75, "logps_train/chosen": -67.28622436523438, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -128.99029541015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26957035064697266, "rewards_train/margins": 2.7408652305603027, "rewards_train/rejected": -2.47129487991333, "step": 271 }, { "epoch": 0.75, "learning_rate": 3.5882467372350096e-07, "loss": 0.1004, "step": 272 }, { "epoch": 0.75, "logps_train/chosen": -69.58003997802734, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -130.30633544921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20630262792110443, "rewards_train/margins": 2.607248291373253, "rewards_train/rejected": -2.4009456634521484, "step": 272 }, { "epoch": 0.75, "learning_rate": 3.58454632971618e-07, "loss": 0.1207, "step": 273 }, { "epoch": 0.75, "logps_train/chosen": -66.97982788085938, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -107.4375, "logps_train/rejected": -131.28094482421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23942020535469055, "rewards_train/margins": 2.6232774555683136, "rewards_train/rejected": -2.383857250213623, "step": 273 }, { "epoch": 0.75, "learning_rate": 3.5808312949274295e-07, "loss": 0.1263, "step": 274 }, { "epoch": 0.75, "logps_train/chosen": -66.86262512207031, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -131.24501037597656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29166704416275024, "rewards_train/margins": 2.8173399567604065, "rewards_train/rejected": -2.5256729125976562, "step": 274 }, { "epoch": 0.76, "learning_rate": 3.5771016671630025e-07, "loss": 0.0987, "step": 275 }, { "epoch": 0.76, "logps_train/chosen": -66.28524780273438, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -100.625, "logps_train/rejected": -124.329833984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2577057480812073, "rewards_train/margins": 2.627115309238434, "rewards_train/rejected": -2.3694095611572266, "step": 275 }, { "epoch": 0.76, "learning_rate": 3.573357480851851e-07, "loss": 0.1081, "step": 276 }, { "epoch": 0.76, "logps_train/chosen": -65.4220962524414, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -133.07095336914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30788859724998474, "rewards_train/margins": 2.797895759344101, "rewards_train/rejected": -2.490007162094116, "step": 276 }, { "epoch": 0.76, "learning_rate": 3.569598770557322e-07, "loss": 0.1002, "step": 277 }, { "epoch": 0.76, "logps_train/chosen": -66.59175872802734, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -130.04037475585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.244974747300148, "rewards_train/margins": 2.695397987961769, "rewards_train/rejected": -2.450423240661621, "step": 277 }, { "epoch": 0.77, "learning_rate": 3.565825570976836e-07, "loss": 0.1066, "step": 278 }, { "epoch": 0.77, "logps_train/chosen": -69.0568618774414, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -129.68838500976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11618903279304504, "rewards_train/margins": 2.6317074596881866, "rewards_train/rejected": -2.5155184268951416, "step": 278 }, { "epoch": 0.77, "learning_rate": 3.5620379169415664e-07, "loss": 0.1371, "step": 279 }, { "epoch": 0.77, "logps_train/chosen": -67.53836822509766, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -126.67141723632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1842491328716278, "rewards_train/margins": 2.6661371886730194, "rewards_train/rejected": -2.4818880558013916, "step": 279 }, { "epoch": 0.77, "learning_rate": 3.558235843416119e-07, "loss": 0.1136, "step": 280 }, { "epoch": 0.77, "logps_train/chosen": -66.9909439086914, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -129.6933135986328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27478235960006714, "rewards_train/margins": 2.808566987514496, "rewards_train/rejected": -2.5337846279144287, "step": 280 }, { "epoch": 0.77, "learning_rate": 3.5544193854982095e-07, "loss": 0.0943, "step": 281 }, { "epoch": 0.77, "logps_train/chosen": -68.01266479492188, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -133.2298583984375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.1970731019973755, "rewards_train/margins": 2.822158932685852, "rewards_train/rejected": -2.6250858306884766, "step": 281 }, { "epoch": 0.78, "learning_rate": 3.5505885784183385e-07, "loss": 0.107, "step": 282 }, { "epoch": 0.78, "logps_train/chosen": -68.47537231445312, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -130.70187377929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22155389189720154, "rewards_train/margins": 2.8551205098629, "rewards_train/rejected": -2.6335666179656982, "step": 282 }, { "epoch": 0.78, "learning_rate": 3.5467434575394654e-07, "loss": 0.09, "step": 283 }, { "epoch": 0.78, "logps_train/chosen": -67.5008544921875, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -132.7442626953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22730672359466553, "rewards_train/margins": 2.9385489225387573, "rewards_train/rejected": -2.711242198944092, "step": 283 }, { "epoch": 0.78, "learning_rate": 3.5428840583566853e-07, "loss": 0.0902, "step": 284 }, { "epoch": 0.78, "logps_train/chosen": -66.03974914550781, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -133.53634643554688, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.18079136312007904, "rewards_train/margins": 2.805519327521324, "rewards_train/rejected": -2.624727964401245, "step": 284 }, { "epoch": 0.79, "learning_rate": 3.539010416496898e-07, "loss": 0.11, "step": 285 }, { "epoch": 0.79, "logps_train/chosen": -65.24209594726562, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -126.38204956054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27725496888160706, "rewards_train/margins": 2.735772520303726, "rewards_train/rejected": -2.458517551422119, "step": 285 }, { "epoch": 0.79, "learning_rate": 3.5351225677184795e-07, "loss": 0.0917, "step": 286 }, { "epoch": 0.79, "logps_train/chosen": -66.01216125488281, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -130.0604705810547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3484423756599426, "rewards_train/margins": 2.867575705051422, "rewards_train/rejected": -2.5191333293914795, "step": 286 }, { "epoch": 0.79, "learning_rate": 3.5312205479109554e-07, "loss": 0.0854, "step": 287 }, { "epoch": 0.79, "logps_train/chosen": -68.590576171875, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -108.0625, "logps_train/rejected": -135.56381225585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18767113983631134, "rewards_train/margins": 2.9363379031419754, "rewards_train/rejected": -2.748666763305664, "step": 287 }, { "epoch": 0.79, "learning_rate": 3.527304393094664e-07, "loss": 0.0983, "step": 288 }, { "epoch": 0.79, "logps_train/chosen": -66.76884460449219, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -133.05078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31144511699676514, "rewards_train/margins": 2.9147645235061646, "rewards_train/rejected": -2.6033194065093994, "step": 288 }, { "epoch": 0.8, "learning_rate": 3.5233741394204286e-07, "loss": 0.0862, "step": 289 }, { "epoch": 0.8, "logps_train/chosen": -65.44815826416016, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -131.27899169921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31065312027931213, "rewards_train/margins": 3.0414818823337555, "rewards_train/rejected": -2.7308287620544434, "step": 289 }, { "epoch": 0.8, "learning_rate": 3.519429823169221e-07, "loss": 0.0751, "step": 290 }, { "epoch": 0.8, "logps_train/chosen": -66.91666412353516, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -128.71873474121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2712244987487793, "rewards_train/margins": 2.9670233726501465, "rewards_train/rejected": -2.695798873901367, "step": 290 }, { "epoch": 0.8, "learning_rate": 3.5154714807518293e-07, "loss": 0.0792, "step": 291 }, { "epoch": 0.8, "logps_train/chosen": -63.89239501953125, "logps_train/ref_chosen": -66.25, "logps_train/ref_rejected": -99.6875, "logps_train/rejected": -125.11399841308594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.23405125737190247, "rewards_train/margins": 2.7743578255176544, "rewards_train/rejected": -2.540306568145752, "step": 291 }, { "epoch": 0.8, "learning_rate": 3.5114991487085164e-07, "loss": 0.1142, "step": 292 }, { "epoch": 0.8, "logps_train/chosen": -66.84786987304688, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -131.34991455078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2671661078929901, "rewards_train/margins": 3.0198332965373993, "rewards_train/rejected": -2.752667188644409, "step": 292 }, { "epoch": 0.81, "learning_rate": 3.5075128637086903e-07, "loss": 0.0792, "step": 293 }, { "epoch": 0.81, "logps_train/chosen": -67.3742904663086, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -131.2333221435547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2400124967098236, "rewards_train/margins": 2.8557280004024506, "rewards_train/rejected": -2.615715503692627, "step": 293 }, { "epoch": 0.81, "learning_rate": 3.503512662550559e-07, "loss": 0.0963, "step": 294 }, { "epoch": 0.81, "logps_train/chosen": -67.09126281738281, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -134.7032470703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31069812178611755, "rewards_train/margins": 3.1394210755825043, "rewards_train/rejected": -2.8287229537963867, "step": 294 }, { "epoch": 0.81, "learning_rate": 3.499498582160794e-07, "loss": 0.0783, "step": 295 }, { "epoch": 0.81, "logps_train/chosen": -65.33592224121094, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -131.55853271484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21421056985855103, "rewards_train/margins": 2.9111762642860413, "rewards_train/rejected": -2.6969656944274902, "step": 295 }, { "epoch": 0.82, "learning_rate": 3.4954706595941897e-07, "loss": 0.1048, "step": 296 }, { "epoch": 0.82, "logps_train/chosen": -68.13926696777344, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -133.11788940429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30101490020751953, "rewards_train/margins": 3.1451289653778076, "rewards_train/rejected": -2.844114065170288, "step": 296 }, { "epoch": 0.82, "learning_rate": 3.49142893203332e-07, "loss": 0.0642, "step": 297 }, { "epoch": 0.82, "logps_train/chosen": -67.19644165039062, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -131.61801147460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18982839584350586, "rewards_train/margins": 2.938152313232422, "rewards_train/rejected": -2.748323917388916, "step": 297 }, { "epoch": 0.82, "learning_rate": 3.487373436788194e-07, "loss": 0.0985, "step": 298 }, { "epoch": 0.82, "logps_train/chosen": -67.5679931640625, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -100.9375, "logps_train/rejected": -128.45806884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19525137543678284, "rewards_train/margins": 2.9464296400547028, "rewards_train/rejected": -2.75117826461792, "step": 298 }, { "epoch": 0.82, "learning_rate": 3.4833042112959153e-07, "loss": 0.0937, "step": 299 }, { "epoch": 0.82, "logps_train/chosen": -67.9539794921875, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -134.08505249023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18824531137943268, "rewards_train/margins": 3.0095925480127335, "rewards_train/rejected": -2.821347236633301, "step": 299 }, { "epoch": 0.83, "learning_rate": 3.4792212931203323e-07, "loss": 0.0919, "step": 300 }, { "epoch": 0.83, "logps_train/chosen": -68.53665161132812, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -107.75, "logps_train/rejected": -135.36862182617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28236985206604004, "rewards_train/margins": 3.0475523471832275, "rewards_train/rejected": -2.7651824951171875, "step": 300 }, { "epoch": 0.83, "learning_rate": 3.4751247199516957e-07, "loss": 0.0779, "step": 301 }, { "epoch": 0.83, "logps_train/chosen": -67.10432434082031, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -137.05740356445312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2928876280784607, "rewards_train/margins": 3.402094781398773, "rewards_train/rejected": -3.1092071533203125, "step": 301 }, { "epoch": 0.83, "learning_rate": 3.4710145296063055e-07, "loss": 0.057, "step": 302 }, { "epoch": 0.83, "logps_train/chosen": -67.32778930664062, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -130.89678955078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22928127646446228, "rewards_train/margins": 3.1138828694820404, "rewards_train/rejected": -2.884601593017578, "step": 302 }, { "epoch": 0.83, "learning_rate": 3.466890760026167e-07, "loss": 0.0777, "step": 303 }, { "epoch": 0.83, "logps_train/chosen": -66.2976303100586, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -131.63577270507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1613990068435669, "rewards_train/margins": 2.9494892358779907, "rewards_train/rejected": -2.788090229034424, "step": 303 }, { "epoch": 0.84, "learning_rate": 3.4627534492786366e-07, "loss": 0.0958, "step": 304 }, { "epoch": 0.84, "logps_train/chosen": -66.49894714355469, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -131.60093688964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23599360883235931, "rewards_train/margins": 3.051458105444908, "rewards_train/rejected": -2.815464496612549, "step": 304 }, { "epoch": 0.84, "learning_rate": 3.458602635556073e-07, "loss": 0.089, "step": 305 }, { "epoch": 0.84, "logps_train/chosen": -67.44700622558594, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -136.04501342773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2987073063850403, "rewards_train/margins": 3.250766098499298, "rewards_train/rejected": -2.952058792114258, "step": 305 }, { "epoch": 0.84, "learning_rate": 3.4544383571754823e-07, "loss": 0.0604, "step": 306 }, { "epoch": 0.84, "logps_train/chosen": -64.61748504638672, "logps_train/ref_chosen": -66.9375, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -133.56503295898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23063428699970245, "rewards_train/margins": 3.211649015545845, "rewards_train/rejected": -2.9810147285461426, "step": 306 }, { "epoch": 0.85, "learning_rate": 3.4502606525781674e-07, "loss": 0.087, "step": 307 }, { "epoch": 0.85, "logps_train/chosen": -66.83490753173828, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -133.4494171142578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37564000487327576, "rewards_train/margins": 3.419703096151352, "rewards_train/rejected": -3.044063091278076, "step": 307 }, { "epoch": 0.85, "learning_rate": 3.4460695603293696e-07, "loss": 0.0488, "step": 308 }, { "epoch": 0.85, "logps_train/chosen": -67.14591217041016, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -134.74688720703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31612157821655273, "rewards_train/margins": 3.2953031063079834, "rewards_train/rejected": -2.9791815280914307, "step": 308 }, { "epoch": 0.85, "learning_rate": 3.441865119117916e-07, "loss": 0.0621, "step": 309 }, { "epoch": 0.85, "logps_train/chosen": -66.0912857055664, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -132.5958251953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22666282951831818, "rewards_train/margins": 3.126282885670662, "rewards_train/rejected": -2.8996200561523438, "step": 309 }, { "epoch": 0.85, "learning_rate": 3.4376473677558585e-07, "loss": 0.0726, "step": 310 }, { "epoch": 0.85, "logps_train/chosen": -68.49051666259766, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -136.6322021484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2934287488460541, "rewards_train/margins": 3.3455158174037933, "rewards_train/rejected": -3.0520870685577393, "step": 310 }, { "epoch": 0.86, "learning_rate": 3.43341634517812e-07, "loss": 0.0686, "step": 311 }, { "epoch": 0.86, "logps_train/chosen": -64.74295043945312, "logps_train/ref_chosen": -67.5625, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -133.37338256835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28034353256225586, "rewards_train/margins": 3.3684639930725098, "rewards_train/rejected": -3.088120460510254, "step": 311 }, { "epoch": 0.86, "learning_rate": 3.4291720904421315e-07, "loss": 0.0664, "step": 312 }, { "epoch": 0.86, "logps_train/chosen": -67.50613403320312, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -104.8125, "logps_train/rejected": -133.0216064453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3377161920070648, "rewards_train/margins": 3.1588216722011566, "rewards_train/rejected": -2.821105480194092, "step": 312 }, { "epoch": 0.86, "learning_rate": 3.424914642727473e-07, "loss": 0.0587, "step": 313 }, { "epoch": 0.86, "logps_train/chosen": -67.13553619384766, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -133.82254028320312, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.14865314960479736, "rewards_train/margins": 3.12104332447052, "rewards_train/rejected": -2.9723901748657227, "step": 313 }, { "epoch": 0.87, "learning_rate": 3.420644041335512e-07, "loss": 0.0855, "step": 314 }, { "epoch": 0.87, "logps_train/chosen": -66.77447509765625, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -132.61305236816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22504323720932007, "rewards_train/margins": 3.2262901663780212, "rewards_train/rejected": -3.001246929168701, "step": 314 }, { "epoch": 0.87, "learning_rate": 3.416360325689039e-07, "loss": 0.0801, "step": 315 }, { "epoch": 0.87, "logps_train/chosen": -66.22480010986328, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -136.57968139648438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1502741575241089, "rewards_train/margins": 3.2816790342330933, "rewards_train/rejected": -3.1314048767089844, "step": 315 }, { "epoch": 0.87, "learning_rate": 3.4120635353319054e-07, "loss": 0.0794, "step": 316 }, { "epoch": 0.87, "logps_train/chosen": -69.58503723144531, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -134.70640563964844, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.21551984548568726, "rewards_train/margins": 3.1842073798179626, "rewards_train/rejected": -2.9686875343322754, "step": 316 }, { "epoch": 0.87, "learning_rate": 3.407753709928659e-07, "loss": 0.1014, "step": 317 }, { "epoch": 0.87, "logps_train/chosen": -65.6987533569336, "logps_train/ref_chosen": -67.5625, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -135.76202392578125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.18764445185661316, "rewards_train/margins": 3.1805462539196014, "rewards_train/rejected": -2.9929018020629883, "step": 317 }, { "epoch": 0.88, "learning_rate": 3.403430889264176e-07, "loss": 0.0929, "step": 318 }, { "epoch": 0.88, "logps_train/chosen": -66.62776947021484, "logps_train/ref_chosen": -67.25, "logps_train/ref_rejected": -103.3125, "logps_train/rejected": -133.08958435058594, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.06383436918258667, "rewards_train/margins": 3.042763650417328, "rewards_train/rejected": -2.978929281234741, "step": 318 }, { "epoch": 0.88, "learning_rate": 3.3990951132432945e-07, "loss": 0.1287, "step": 319 }, { "epoch": 0.88, "logps_train/chosen": -66.22761535644531, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -132.00331115722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19188672304153442, "rewards_train/margins": 3.1653626561164856, "rewards_train/rejected": -2.973475933074951, "step": 319 }, { "epoch": 0.88, "learning_rate": 3.3947464218904453e-07, "loss": 0.0898, "step": 320 }, { "epoch": 0.88, "logps_train/chosen": -67.04156494140625, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -139.52511596679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37401750683784485, "rewards_train/margins": 3.653383046388626, "rewards_train/rejected": -3.2793655395507812, "step": 320 }, { "epoch": 0.88, "learning_rate": 3.3903848553492847e-07, "loss": 0.0405, "step": 321 }, { "epoch": 0.88, "logps_train/chosen": -69.00726318359375, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -135.0195770263672, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.14170485734939575, "rewards_train/margins": 3.2646098732948303, "rewards_train/rejected": -3.1229050159454346, "step": 321 }, { "epoch": 0.89, "learning_rate": 3.3860104538823216e-07, "loss": 0.0929, "step": 322 }, { "epoch": 0.89, "logps_train/chosen": -65.16903686523438, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -133.66810607910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2553616762161255, "rewards_train/margins": 3.352153182029724, "rewards_train/rejected": -3.0967915058135986, "step": 322 }, { "epoch": 0.89, "learning_rate": 3.381623257870546e-07, "loss": 0.0676, "step": 323 }, { "epoch": 0.89, "logps_train/chosen": -67.19290924072266, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -137.26885986328125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2653283178806305, "rewards_train/margins": 3.3824972212314606, "rewards_train/rejected": -3.11716890335083, "step": 323 }, { "epoch": 0.89, "learning_rate": 3.37722330781306e-07, "loss": 0.0682, "step": 324 }, { "epoch": 0.89, "logps_train/chosen": -69.40605163574219, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -139.03497314453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22340866923332214, "rewards_train/margins": 3.4508332908153534, "rewards_train/rejected": -3.2274246215820312, "step": 324 }, { "epoch": 0.9, "learning_rate": 3.3728106443266973e-07, "loss": 0.0651, "step": 325 }, { "epoch": 0.9, "logps_train/chosen": -66.4033203125, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -138.51998901367188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.22397470474243164, "rewards_train/margins": 3.5248007774353027, "rewards_train/rejected": -3.300826072692871, "step": 325 }, { "epoch": 0.9, "learning_rate": 3.3683853081456544e-07, "loss": 0.067, "step": 326 }, { "epoch": 0.9, "logps_train/chosen": -66.38858032226562, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -140.66122436523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3411223292350769, "rewards_train/margins": 3.7560736536979675, "rewards_train/rejected": -3.4149513244628906, "step": 326 }, { "epoch": 0.9, "learning_rate": 3.3639473401211096e-07, "loss": 0.0335, "step": 327 }, { "epoch": 0.9, "logps_train/chosen": -66.67025756835938, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -135.61935424804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2577791213989258, "rewards_train/margins": 3.4193246364593506, "rewards_train/rejected": -3.161545515060425, "step": 327 }, { "epoch": 0.9, "learning_rate": 3.3594967812208507e-07, "loss": 0.0663, "step": 328 }, { "epoch": 0.9, "logps_train/chosen": -67.5455322265625, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -108.6875, "logps_train/rejected": -142.40020751953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24876704812049866, "rewards_train/margins": 3.622624546289444, "rewards_train/rejected": -3.3738574981689453, "step": 328 }, { "epoch": 0.91, "learning_rate": 3.3550336725288925e-07, "loss": 0.0613, "step": 329 }, { "epoch": 0.91, "logps_train/chosen": -67.96173095703125, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -137.1765594482422, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.06862208247184753, "rewards_train/margins": 3.379099816083908, "rewards_train/rejected": -3.3104777336120605, "step": 329 }, { "epoch": 0.91, "learning_rate": 3.350558055245102e-07, "loss": 0.085, "step": 330 }, { "epoch": 0.91, "logps_train/chosen": -69.84065246582031, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -136.17428588867188, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.15685272216796875, "rewards_train/margins": 3.2250630855560303, "rewards_train/rejected": -3.0682103633880615, "step": 330 }, { "epoch": 0.91, "learning_rate": 3.3460699706848125e-07, "loss": 0.1044, "step": 331 }, { "epoch": 0.91, "logps_train/chosen": -67.86951446533203, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -139.00511169433594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.21519652009010315, "rewards_train/margins": 3.5385102331638336, "rewards_train/rejected": -3.3233137130737305, "step": 331 }, { "epoch": 0.91, "learning_rate": 3.341569460278447e-07, "loss": 0.0705, "step": 332 }, { "epoch": 0.91, "logps_train/chosen": -68.58423614501953, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -138.9049072265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21506324410438538, "rewards_train/margins": 3.5780159533023834, "rewards_train/rejected": -3.362952709197998, "step": 332 }, { "epoch": 0.92, "learning_rate": 3.3370565655711343e-07, "loss": 0.0666, "step": 333 }, { "epoch": 0.92, "logps_train/chosen": -67.23384857177734, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -138.63101196289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2568400800228119, "rewards_train/margins": 3.4952336251735687, "rewards_train/rejected": -3.238393545150757, "step": 333 }, { "epoch": 0.92, "learning_rate": 3.3325313282223243e-07, "loss": 0.0517, "step": 334 }, { "epoch": 0.92, "logps_train/chosen": -67.66128540039062, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -106.875, "logps_train/rejected": -140.31362915039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2520357668399811, "rewards_train/margins": 3.5924326479434967, "rewards_train/rejected": -3.3403968811035156, "step": 334 }, { "epoch": 0.92, "learning_rate": 3.3279937900054045e-07, "loss": 0.0519, "step": 335 }, { "epoch": 0.92, "logps_train/chosen": -68.50994873046875, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -139.92880249023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16741366684436798, "rewards_train/margins": 3.4828540831804276, "rewards_train/rejected": -3.3154404163360596, "step": 335 }, { "epoch": 0.93, "learning_rate": 3.3234439928073143e-07, "loss": 0.0727, "step": 336 }, { "epoch": 0.93, "logps_train/chosen": -66.80670166015625, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -134.8261260986328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2211846113204956, "rewards_train/margins": 3.390954852104187, "rewards_train/rejected": -3.1697702407836914, "step": 336 }, { "epoch": 0.93, "learning_rate": 3.318881978628159e-07, "loss": 0.0731, "step": 337 }, { "epoch": 0.93, "logps_train/chosen": -69.88027954101562, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -109.4375, "logps_train/rejected": -142.9235076904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1428806185722351, "rewards_train/margins": 3.489236533641815, "rewards_train/rejected": -3.34635591506958, "step": 337 }, { "epoch": 0.93, "learning_rate": 3.3143077895808205e-07, "loss": 0.0623, "step": 338 }, { "epoch": 0.93, "logps_train/chosen": -69.09772491455078, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -137.1251220703125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.1300717443227768, "rewards_train/margins": 3.4371646493673325, "rewards_train/rejected": -3.3070929050445557, "step": 338 }, { "epoch": 0.93, "learning_rate": 3.3097214678905704e-07, "loss": 0.0832, "step": 339 }, { "epoch": 0.93, "logps_train/chosen": -65.15505981445312, "logps_train/ref_chosen": -67.25, "logps_train/ref_rejected": -99.75, "logps_train/rejected": -131.45098876953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2109096646308899, "rewards_train/margins": 3.381204903125763, "rewards_train/rejected": -3.170295238494873, "step": 339 }, { "epoch": 0.94, "learning_rate": 3.3051230558946765e-07, "loss": 0.0777, "step": 340 }, { "epoch": 0.94, "logps_train/chosen": -67.87702941894531, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -101.3125, "logps_train/rejected": -134.67642211914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17572447657585144, "rewards_train/margins": 3.508796662092209, "rewards_train/rejected": -3.3330721855163574, "step": 340 }, { "epoch": 0.94, "learning_rate": 3.3005125960420177e-07, "loss": 0.065, "step": 341 }, { "epoch": 0.94, "logps_train/chosen": -65.6467514038086, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -136.78076171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22385023534297943, "rewards_train/margins": 3.456369861960411, "rewards_train/rejected": -3.2325196266174316, "step": 341 }, { "epoch": 0.94, "learning_rate": 3.295890130892688e-07, "loss": 0.0747, "step": 342 }, { "epoch": 0.94, "logps_train/chosen": -67.6991958618164, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -134.4911651611328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12524637579917908, "rewards_train/margins": 3.3357881009578705, "rewards_train/rejected": -3.2105417251586914, "step": 342 }, { "epoch": 0.94, "learning_rate": 3.2912557031176044e-07, "loss": 0.0919, "step": 343 }, { "epoch": 0.94, "logps_train/chosen": -66.79322814941406, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -137.11495971679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2970927953720093, "rewards_train/margins": 3.5521925687789917, "rewards_train/rejected": -3.2550997734069824, "step": 343 }, { "epoch": 0.95, "learning_rate": 3.2866093554981145e-07, "loss": 0.0523, "step": 344 }, { "epoch": 0.95, "logps_train/chosen": -65.90947723388672, "logps_train/ref_chosen": -67.0625, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -135.97750854492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11525347828865051, "rewards_train/margins": 3.4664221107959747, "rewards_train/rejected": -3.351168632507324, "step": 344 }, { "epoch": 0.95, "learning_rate": 3.2819511309255984e-07, "loss": 0.0785, "step": 345 }, { "epoch": 0.95, "logps_train/chosen": -69.99095153808594, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -137.08047485351562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.03288719803094864, "rewards_train/margins": 3.315250851213932, "rewards_train/rejected": -3.2823636531829834, "step": 345 }, { "epoch": 0.95, "learning_rate": 3.2772810724010763e-07, "loss": 0.099, "step": 346 }, { "epoch": 0.95, "logps_train/chosen": -70.64158630371094, "logps_train/ref_chosen": -72.625, "logps_train/ref_rejected": -108.9375, "logps_train/rejected": -145.43893432617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19887876510620117, "rewards_train/margins": 3.850975275039673, "rewards_train/rejected": -3.6520965099334717, "step": 346 }, { "epoch": 0.96, "learning_rate": 3.2725992230348107e-07, "loss": 0.0454, "step": 347 }, { "epoch": 0.96, "logps_train/chosen": -66.86836242675781, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -140.80630493164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2484665811061859, "rewards_train/margins": 3.6804166734218597, "rewards_train/rejected": -3.431950092315674, "step": 347 }, { "epoch": 0.96, "learning_rate": 3.267905626045905e-07, "loss": 0.0639, "step": 348 }, { "epoch": 0.96, "logps_train/chosen": -64.71005249023438, "logps_train/ref_chosen": -66.3125, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -135.2750701904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1576562523841858, "rewards_train/margins": 3.4324281811714172, "rewards_train/rejected": -3.2747719287872314, "step": 348 }, { "epoch": 0.96, "learning_rate": 3.263200324761911e-07, "loss": 0.0892, "step": 349 }, { "epoch": 0.96, "logps_train/chosen": -68.00846862792969, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -138.52120971679688, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.21736648678779602, "rewards_train/margins": 3.615581303834915, "rewards_train/rejected": -3.398214817047119, "step": 349 }, { "epoch": 0.96, "learning_rate": 3.258483362618421e-07, "loss": 0.0727, "step": 350 }, { "epoch": 0.96, "logps_train/chosen": -68.8123779296875, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -140.05181884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18834275007247925, "rewards_train/margins": 3.551043212413788, "rewards_train/rejected": -3.3627004623413086, "step": 350 }, { "epoch": 0.97, "learning_rate": 3.253754783158675e-07, "loss": 0.079, "step": 351 }, { "epoch": 0.97, "logps_train/chosen": -66.83012390136719, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -137.41770935058594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06972173601388931, "rewards_train/margins": 3.387420929968357, "rewards_train/rejected": -3.3176991939544678, "step": 351 }, { "epoch": 0.97, "learning_rate": 3.2490146300331525e-07, "loss": 0.0878, "step": 352 }, { "epoch": 0.97, "logps_train/chosen": -66.83062744140625, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -136.5624237060547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08349047601222992, "rewards_train/margins": 3.486998662352562, "rewards_train/rejected": -3.403508186340332, "step": 352 }, { "epoch": 0.97, "learning_rate": 3.2442629469991725e-07, "loss": 0.0739, "step": 353 }, { "epoch": 0.97, "logps_train/chosen": -68.31663513183594, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -139.09585571289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17946994304656982, "rewards_train/margins": 3.663861632347107, "rewards_train/rejected": -3.484391689300537, "step": 353 }, { "epoch": 0.98, "learning_rate": 3.23949977792049e-07, "loss": 0.0529, "step": 354 }, { "epoch": 0.98, "logps_train/chosen": -72.23503112792969, "logps_train/ref_chosen": -73.375, "logps_train/ref_rejected": -109.0625, "logps_train/rejected": -144.73007202148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11331292986869812, "rewards_train/margins": 3.680362194776535, "rewards_train/rejected": -3.567049264907837, "step": 354 }, { "epoch": 0.98, "learning_rate": 3.2347251667668874e-07, "loss": 0.0618, "step": 355 }, { "epoch": 0.98, "logps_train/chosen": -69.28684997558594, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -142.75164794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2251724898815155, "rewards_train/margins": 3.798775166273117, "rewards_train/rejected": -3.5736026763916016, "step": 355 }, { "epoch": 0.98, "learning_rate": 3.2299391576137735e-07, "loss": 0.0614, "step": 356 }, { "epoch": 0.98, "logps_train/chosen": -66.52472686767578, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -99.6875, "logps_train/rejected": -133.35296630859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24415791034698486, "rewards_train/margins": 3.6125584840774536, "rewards_train/rejected": -3.3684005737304688, "step": 356 }, { "epoch": 0.98, "learning_rate": 3.2251417946417735e-07, "loss": 0.0674, "step": 357 }, { "epoch": 0.98, "logps_train/chosen": -66.7098388671875, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -140.3497314453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18736621737480164, "rewards_train/margins": 3.6944582164287567, "rewards_train/rejected": -3.507091999053955, "step": 357 }, { "epoch": 0.99, "learning_rate": 3.2203331221363217e-07, "loss": 0.0568, "step": 358 }, { "epoch": 0.99, "logps_train/chosen": -66.38276672363281, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -138.66773986816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22681081295013428, "rewards_train/margins": 3.663995385169983, "rewards_train/rejected": -3.4371845722198486, "step": 358 }, { "epoch": 0.99, "learning_rate": 3.215513184487254e-07, "loss": 0.0542, "step": 359 }, { "epoch": 0.99, "logps_train/chosen": -67.81857299804688, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -138.86468505859375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2029077559709549, "rewards_train/margins": 3.5293185263872147, "rewards_train/rejected": -3.3264107704162598, "step": 359 }, { "epoch": 0.99, "learning_rate": 3.210682026188398e-07, "loss": 0.0732, "step": 360 }, { "epoch": 0.99, "logps_train/chosen": -67.75372314453125, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -142.95162963867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27487218379974365, "rewards_train/margins": 3.844059109687805, "rewards_train/rejected": -3.5691869258880615, "step": 360 }, { "epoch": 0.99, "learning_rate": 3.2058396918371576e-07, "loss": 0.0486, "step": 361 }, { "epoch": 0.99, "logps_train/chosen": -67.27276611328125, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -141.90377807617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18165849149227142, "rewards_train/margins": 3.769985184073448, "rewards_train/rejected": -3.5883266925811768, "step": 361 }, { "epoch": 1.0, "learning_rate": 3.200986226134109e-07, "loss": 0.0611, "step": 362 }, { "epoch": 1.0, "logps_train/chosen": -66.00161743164062, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -141.49349975585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3390958607196808, "rewards_train/margins": 4.0148623287677765, "rewards_train/rejected": -3.6757664680480957, "step": 362 }, { "epoch": 1.0, "learning_rate": 3.1961216738825816e-07, "loss": 0.044, "step": 363 }, { "epoch": 1.0, "logps_train/chosen": -68.56130981445312, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -102.875, "logps_train/rejected": -139.26051330566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20065590739250183, "rewards_train/margins": 3.8435034453868866, "rewards_train/rejected": -3.6428475379943848, "step": 363 }, { "epoch": 1.0, "learning_rate": 3.1912460799882485e-07, "loss": 0.0547, "step": 364 }, { "epoch": 1.0, "logps_train/chosen": -68.38632202148438, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -141.16909790039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1995023638010025, "rewards_train/margins": 3.6873608976602554, "rewards_train/rejected": -3.487858533859253, "step": 364 }, { "epoch": 1.01, "learning_rate": 3.18635948945871e-07, "loss": 0.0533, "step": 365 }, { "epoch": 1.01, "logps_train/chosen": -66.7513427734375, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -141.15634155273438, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.32662394642829895, "rewards_train/margins": 3.9353237450122833, "rewards_train/rejected": -3.6086997985839844, "step": 365 }, { "epoch": 1.01, "learning_rate": 3.1814619474030795e-07, "loss": 0.0469, "step": 366 }, { "epoch": 1.01, "logps_train/chosen": -67.3671875, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -139.67283630371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21274414658546448, "rewards_train/margins": 3.810692220926285, "rewards_train/rejected": -3.5979480743408203, "step": 366 }, { "epoch": 1.01, "learning_rate": 3.1765534990315647e-07, "loss": 0.0635, "step": 367 }, { "epoch": 1.01, "logps_train/chosen": -66.82073974609375, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -140.1304931640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3327697515487671, "rewards_train/margins": 3.975408911705017, "rewards_train/rejected": -3.64263916015625, "step": 367 }, { "epoch": 1.01, "learning_rate": 3.1716341896550514e-07, "loss": 0.044, "step": 368 }, { "epoch": 1.01, "logps_train/chosen": -67.87286376953125, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -141.09197998046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18029235303401947, "rewards_train/margins": 3.687782272696495, "rewards_train/rejected": -3.5074899196624756, "step": 368 }, { "epoch": 1.02, "learning_rate": 3.166704064684688e-07, "loss": 0.0613, "step": 369 }, { "epoch": 1.02, "logps_train/chosen": -64.04429626464844, "logps_train/ref_chosen": -66.3125, "logps_train/ref_rejected": -101.3125, "logps_train/rejected": -135.58489990234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2267722487449646, "rewards_train/margins": 3.654842793941498, "rewards_train/rejected": -3.428070545196533, "step": 369 }, { "epoch": 1.02, "learning_rate": 3.161763169631461e-07, "loss": 0.0734, "step": 370 }, { "epoch": 1.02, "logps_train/chosen": -65.69500732421875, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -137.751708984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26677805185317993, "rewards_train/margins": 3.840680181980133, "rewards_train/rejected": -3.573902130126953, "step": 370 }, { "epoch": 1.02, "learning_rate": 3.15681155010578e-07, "loss": 0.0611, "step": 371 }, { "epoch": 1.02, "logps_train/chosen": -68.19982147216797, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -140.50332641601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05755724012851715, "rewards_train/margins": 3.602616921067238, "rewards_train/rejected": -3.5450596809387207, "step": 371 }, { "epoch": 1.02, "learning_rate": 3.1518492518170526e-07, "loss": 0.0887, "step": 372 }, { "epoch": 1.02, "logps_train/chosen": -67.7779312133789, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -138.68540954589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04193387180566788, "rewards_train/margins": 3.526588775217533, "rewards_train/rejected": -3.4846549034118652, "step": 372 }, { "epoch": 1.03, "learning_rate": 3.146876320573266e-07, "loss": 0.0895, "step": 373 }, { "epoch": 1.03, "logps_train/chosen": -69.6133041381836, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -142.595703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21064220368862152, "rewards_train/margins": 4.025389298796654, "rewards_train/rejected": -3.8147470951080322, "step": 373 }, { "epoch": 1.03, "learning_rate": 3.14189280228056e-07, "loss": 0.0472, "step": 374 }, { "epoch": 1.03, "logps_train/chosen": -69.12132263183594, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -108.8125, "logps_train/rejected": -146.25961303710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1453382670879364, "rewards_train/margins": 3.889463633298874, "rewards_train/rejected": -3.7441253662109375, "step": 374 }, { "epoch": 1.03, "learning_rate": 3.1368987429428086e-07, "loss": 0.0544, "step": 375 }, { "epoch": 1.03, "logps_train/chosen": -67.7657470703125, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -136.505126953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24403147399425507, "rewards_train/margins": 3.6839979141950607, "rewards_train/rejected": -3.4399664402008057, "step": 375 }, { "epoch": 1.04, "learning_rate": 3.1318941886611906e-07, "loss": 0.0566, "step": 376 }, { "epoch": 1.04, "logps_train/chosen": -68.50090026855469, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -140.54978942871094, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.29292744398117065, "rewards_train/margins": 3.9061099886894226, "rewards_train/rejected": -3.613182544708252, "step": 376 }, { "epoch": 1.04, "learning_rate": 3.126879185633764e-07, "loss": 0.0564, "step": 377 }, { "epoch": 1.04, "logps_train/chosen": -68.75784301757812, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -141.7568359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1416471004486084, "rewards_train/margins": 3.8022918701171875, "rewards_train/rejected": -3.660644769668579, "step": 377 }, { "epoch": 1.04, "learning_rate": 3.121853780155046e-07, "loss": 0.0609, "step": 378 }, { "epoch": 1.04, "logps_train/chosen": -65.40780639648438, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -143.24688720703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26385724544525146, "rewards_train/margins": 3.9508503675460815, "rewards_train/rejected": -3.68699312210083, "step": 378 }, { "epoch": 1.04, "learning_rate": 3.1168180186155757e-07, "loss": 0.0378, "step": 379 }, { "epoch": 1.04, "logps_train/chosen": -68.79998779296875, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -140.04110717773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12581130862236023, "rewards_train/margins": 3.708632677793503, "rewards_train/rejected": -3.5828213691711426, "step": 379 }, { "epoch": 1.05, "learning_rate": 3.111771947501496e-07, "loss": 0.0659, "step": 380 }, { "epoch": 1.05, "logps_train/chosen": -69.84382629394531, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -144.40672302246094, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2156175971031189, "rewards_train/margins": 3.9050206542015076, "rewards_train/rejected": -3.6894030570983887, "step": 380 }, { "epoch": 1.05, "learning_rate": 3.1067156133941146e-07, "loss": 0.0534, "step": 381 }, { "epoch": 1.05, "logps_train/chosen": -67.57372283935547, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -136.64511108398438, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.0044440701603889465, "rewards_train/margins": 3.5163170769810677, "rewards_train/rejected": -3.5118730068206787, "step": 381 }, { "epoch": 1.05, "learning_rate": 3.1016490629694834e-07, "loss": 0.0938, "step": 382 }, { "epoch": 1.05, "logps_train/chosen": -65.22940063476562, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -104.8125, "logps_train/rejected": -140.64578247070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3125094473361969, "rewards_train/margins": 3.897498518228531, "rewards_train/rejected": -3.584989070892334, "step": 382 }, { "epoch": 1.06, "learning_rate": 3.096572342997961e-07, "loss": 0.0517, "step": 383 }, { "epoch": 1.06, "logps_train/chosen": -67.75238037109375, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -134.0069122314453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.024847261607646942, "rewards_train/margins": 3.373597390949726, "rewards_train/rejected": -3.398444652557373, "step": 383 }, { "epoch": 1.06, "learning_rate": 3.091485500343783e-07, "loss": 0.0928, "step": 384 }, { "epoch": 1.06, "logps_train/chosen": -68.06730651855469, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -139.97264099121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16875799000263214, "rewards_train/margins": 3.906452015042305, "rewards_train/rejected": -3.737694025039673, "step": 384 }, { "epoch": 1.06, "learning_rate": 3.08638858196463e-07, "loss": 0.063, "step": 385 }, { "epoch": 1.06, "logps_train/chosen": -65.75227355957031, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -136.98385620117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1247720718383789, "rewards_train/margins": 3.724329948425293, "rewards_train/rejected": -3.599557876586914, "step": 385 }, { "epoch": 1.06, "learning_rate": 3.081281634911195e-07, "loss": 0.067, "step": 386 }, { "epoch": 1.06, "logps_train/chosen": -65.16221618652344, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -144.6802215576172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24598507583141327, "rewards_train/margins": 3.992913767695427, "rewards_train/rejected": -3.7469286918640137, "step": 386 }, { "epoch": 1.07, "learning_rate": 3.0761647063267454e-07, "loss": 0.0486, "step": 387 }, { "epoch": 1.07, "logps_train/chosen": -69.76686096191406, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -146.12939453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09538350999355316, "rewards_train/margins": 4.07013975083828, "rewards_train/rejected": -3.9747562408447266, "step": 387 }, { "epoch": 1.07, "learning_rate": 3.0710378434466915e-07, "loss": 0.0545, "step": 388 }, { "epoch": 1.07, "logps_train/chosen": -67.57146453857422, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -145.77467346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21961142122745514, "rewards_train/margins": 4.044148936867714, "rewards_train/rejected": -3.824537515640259, "step": 388 }, { "epoch": 1.07, "learning_rate": 3.065901093598148e-07, "loss": 0.0551, "step": 389 }, { "epoch": 1.07, "logps_train/chosen": -67.05436706542969, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -142.23619079589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25657451152801514, "rewards_train/margins": 3.9574393033981323, "rewards_train/rejected": -3.700864791870117, "step": 389 }, { "epoch": 1.07, "learning_rate": 3.0607545041995e-07, "loss": 0.0433, "step": 390 }, { "epoch": 1.07, "logps_train/chosen": -68.13766479492188, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -143.06292724609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09707401692867279, "rewards_train/margins": 3.930807575583458, "rewards_train/rejected": -3.833733558654785, "step": 390 }, { "epoch": 1.08, "learning_rate": 3.0555981227599626e-07, "loss": 0.0538, "step": 391 }, { "epoch": 1.08, "logps_train/chosen": -67.59823608398438, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -142.85888671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16664119064807892, "rewards_train/margins": 4.009171441197395, "rewards_train/rejected": -3.8425302505493164, "step": 391 }, { "epoch": 1.08, "learning_rate": 3.0504319968791426e-07, "loss": 0.0512, "step": 392 }, { "epoch": 1.08, "logps_train/chosen": -67.70074462890625, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -147.58787536621094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28969159722328186, "rewards_train/margins": 4.324748665094376, "rewards_train/rejected": -4.035057067871094, "step": 392 }, { "epoch": 1.08, "learning_rate": 3.045256174246601e-07, "loss": 0.0399, "step": 393 }, { "epoch": 1.08, "logps_train/chosen": -67.54145812988281, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -143.26846313476562, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.23252442479133606, "rewards_train/margins": 4.205072432756424, "rewards_train/rejected": -3.972548007965088, "step": 393 }, { "epoch": 1.09, "learning_rate": 3.040070702641409e-07, "loss": 0.0458, "step": 394 }, { "epoch": 1.09, "logps_train/chosen": -67.4007339477539, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -144.0384063720703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1940578818321228, "rewards_train/margins": 4.1210426688194275, "rewards_train/rejected": -3.9269847869873047, "step": 394 }, { "epoch": 1.09, "learning_rate": 3.0348756299317135e-07, "loss": 0.0468, "step": 395 }, { "epoch": 1.09, "logps_train/chosen": -68.01539611816406, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -138.61019897460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09933947026729584, "rewards_train/margins": 3.7538161128759384, "rewards_train/rejected": -3.6544766426086426, "step": 395 }, { "epoch": 1.09, "learning_rate": 3.029671004074288e-07, "loss": 0.0678, "step": 396 }, { "epoch": 1.09, "logps_train/chosen": -71.08706665039062, "logps_train/ref_chosen": -72.3125, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -147.60275268554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12146936357021332, "rewards_train/margins": 4.062114581465721, "rewards_train/rejected": -3.940645217895508, "step": 396 }, { "epoch": 1.09, "learning_rate": 3.024456873114093e-07, "loss": 0.0597, "step": 397 }, { "epoch": 1.09, "logps_train/chosen": -66.07621765136719, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -143.4192657470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2822704017162323, "rewards_train/margins": 4.12263485789299, "rewards_train/rejected": -3.840364456176758, "step": 397 }, { "epoch": 1.1, "learning_rate": 3.0192332851838347e-07, "loss": 0.0412, "step": 398 }, { "epoch": 1.1, "logps_train/chosen": -68.2171859741211, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -140.95687866210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07945317029953003, "rewards_train/margins": 3.7791454195976257, "rewards_train/rejected": -3.6996922492980957, "step": 398 }, { "epoch": 1.1, "learning_rate": 3.014000288503516e-07, "loss": 0.0741, "step": 399 }, { "epoch": 1.1, "logps_train/chosen": -64.15694427490234, "logps_train/ref_chosen": -65.8125, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -138.21337890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16521337628364563, "rewards_train/margins": 3.9344504177570343, "rewards_train/rejected": -3.7692370414733887, "step": 399 }, { "epoch": 1.1, "learning_rate": 3.0087579313799974e-07, "loss": 0.0628, "step": 400 }, { "epoch": 1.1, "logps_train/chosen": -67.6678466796875, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -143.8508758544922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12984654307365417, "rewards_train/margins": 3.957121819257736, "rewards_train/rejected": -3.827275276184082, "step": 400 }, { "epoch": 1.1, "learning_rate": 3.0035062622065426e-07, "loss": 0.0646, "step": 401 }, { "epoch": 1.1, "logps_train/chosen": -68.72688293457031, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -141.52206420898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16876749694347382, "rewards_train/margins": 3.9659941643476486, "rewards_train/rejected": -3.797226667404175, "step": 401 }, { "epoch": 1.11, "learning_rate": 2.9982453294623805e-07, "loss": 0.056, "step": 402 }, { "epoch": 1.11, "logps_train/chosen": -67.03195190429688, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -100.9375, "logps_train/rejected": -138.79888916015625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.04602382332086563, "rewards_train/margins": 3.834213949739933, "rewards_train/rejected": -3.7881901264190674, "step": 402 }, { "epoch": 1.11, "learning_rate": 2.992975181712254e-07, "loss": 0.0783, "step": 403 }, { "epoch": 1.11, "logps_train/chosen": -71.15237426757812, "logps_train/ref_chosen": -73.375, "logps_train/ref_rejected": -112.125, "logps_train/rejected": -154.114990234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2251434326171875, "rewards_train/margins": 4.424924373626709, "rewards_train/rejected": -4.1997809410095215, "step": 403 }, { "epoch": 1.11, "learning_rate": 2.987695867605969e-07, "loss": 0.0333, "step": 404 }, { "epoch": 1.11, "logps_train/chosen": -68.90428161621094, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -110.875, "logps_train/rejected": -151.5803985595703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3125010132789612, "rewards_train/margins": 4.381478369235992, "rewards_train/rejected": -4.068977355957031, "step": 404 }, { "epoch": 1.12, "learning_rate": 2.982407435877949e-07, "loss": 0.0403, "step": 405 }, { "epoch": 1.12, "logps_train/chosen": -68.2847900390625, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -140.83175659179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21800532937049866, "rewards_train/margins": 4.076278358697891, "rewards_train/rejected": -3.8582730293273926, "step": 405 }, { "epoch": 1.12, "learning_rate": 2.977109935346786e-07, "loss": 0.0485, "step": 406 }, { "epoch": 1.12, "logps_train/chosen": -71.52619934082031, "logps_train/ref_chosen": -73.875, "logps_train/ref_rejected": -109.3125, "logps_train/rejected": -150.80264282226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23536819219589233, "rewards_train/margins": 4.382917821407318, "rewards_train/rejected": -4.147549629211426, "step": 406 }, { "epoch": 1.12, "learning_rate": 2.9718034149147846e-07, "loss": 0.0392, "step": 407 }, { "epoch": 1.12, "logps_train/chosen": -68.84717559814453, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -146.27133178710938, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.19340752065181732, "rewards_train/margins": 4.166341051459312, "rewards_train/rejected": -3.972933530807495, "step": 407 }, { "epoch": 1.12, "learning_rate": 2.966487923567517e-07, "loss": 0.0701, "step": 408 }, { "epoch": 1.12, "logps_train/chosen": -69.78475189208984, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -147.05006408691406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09007957577705383, "rewards_train/margins": 4.029851466417313, "rewards_train/rejected": -3.939771890640259, "step": 408 }, { "epoch": 1.13, "learning_rate": 2.9611635103733677e-07, "loss": 0.0596, "step": 409 }, { "epoch": 1.13, "logps_train/chosen": -67.79942321777344, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -148.49984741210938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2799696922302246, "rewards_train/margins": 4.319406032562256, "rewards_train/rejected": -4.039436340332031, "step": 409 }, { "epoch": 1.13, "learning_rate": 2.9558302244830765e-07, "loss": 0.0408, "step": 410 }, { "epoch": 1.13, "logps_train/chosen": -67.3544921875, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -103.0625, "logps_train/rejected": -141.63778686523438, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.22773495316505432, "rewards_train/margins": 4.083213239908218, "rewards_train/rejected": -3.855478286743164, "step": 410 }, { "epoch": 1.13, "learning_rate": 2.9504881151292944e-07, "loss": 0.0557, "step": 411 }, { "epoch": 1.13, "logps_train/chosen": -69.97514343261719, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -143.96722412109375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.02021033689379692, "rewards_train/margins": 3.8819719068706036, "rewards_train/rejected": -3.8617615699768066, "step": 411 }, { "epoch": 1.13, "learning_rate": 2.945137231626119e-07, "loss": 0.0817, "step": 412 }, { "epoch": 1.13, "logps_train/chosen": -68.45214080810547, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -143.090087890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23129934072494507, "rewards_train/margins": 4.107690274715424, "rewards_train/rejected": -3.8763909339904785, "step": 412 }, { "epoch": 1.14, "learning_rate": 2.9397776233686476e-07, "loss": 0.045, "step": 413 }, { "epoch": 1.14, "logps_train/chosen": -65.42911529541016, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -146.6817626953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3136802017688751, "rewards_train/margins": 4.350898295640945, "rewards_train/rejected": -4.03721809387207, "step": 413 }, { "epoch": 1.14, "learning_rate": 2.9344093398325147e-07, "loss": 0.0461, "step": 414 }, { "epoch": 1.14, "logps_train/chosen": -66.49281311035156, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -101.5625, "logps_train/rejected": -140.83453369140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11331621557474136, "rewards_train/margins": 4.038615860044956, "rewards_train/rejected": -3.925299644470215, "step": 414 }, { "epoch": 1.14, "learning_rate": 2.9290324305734386e-07, "loss": 0.0564, "step": 415 }, { "epoch": 1.14, "logps_train/chosen": -66.36939239501953, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -144.52540588378906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21472102403640747, "rewards_train/margins": 4.2030046582221985, "rewards_train/rejected": -3.988283634185791, "step": 415 }, { "epoch": 1.15, "learning_rate": 2.923646945226765e-07, "loss": 0.0472, "step": 416 }, { "epoch": 1.15, "logps_train/chosen": -66.76016235351562, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -143.42495727539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.236532524228096, "rewards_train/margins": 4.326293781399727, "rewards_train/rejected": -4.089761257171631, "step": 416 }, { "epoch": 1.15, "learning_rate": 2.9182529335070045e-07, "loss": 0.0398, "step": 417 }, { "epoch": 1.15, "logps_train/chosen": -65.48939514160156, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -144.1302032470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2791858911514282, "rewards_train/margins": 4.308515191078186, "rewards_train/rejected": -4.029329299926758, "step": 417 }, { "epoch": 1.15, "learning_rate": 2.91285044520738e-07, "loss": 0.045, "step": 418 }, { "epoch": 1.15, "logps_train/chosen": -66.08970642089844, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -142.92567443847656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22330400347709656, "rewards_train/margins": 4.243801325559616, "rewards_train/rejected": -4.0204973220825195, "step": 418 }, { "epoch": 1.15, "learning_rate": 2.90743953019936e-07, "loss": 0.046, "step": 419 }, { "epoch": 1.15, "logps_train/chosen": -70.30278778076172, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -107.5625, "logps_train/rejected": -149.048828125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.16381344199180603, "rewards_train/margins": 4.3107379376888275, "rewards_train/rejected": -4.1469244956970215, "step": 419 }, { "epoch": 1.16, "learning_rate": 2.9020202384322037e-07, "loss": 0.0473, "step": 420 }, { "epoch": 1.16, "logps_train/chosen": -66.52884674072266, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -149.80517578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18686161935329437, "rewards_train/margins": 4.328073129057884, "rewards_train/rejected": -4.14121150970459, "step": 420 }, { "epoch": 1.16, "learning_rate": 2.896592619932497e-07, "loss": 0.041, "step": 421 }, { "epoch": 1.16, "logps_train/chosen": -66.80413818359375, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -146.0087127685547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23223333060741425, "rewards_train/margins": 4.379883095622063, "rewards_train/rejected": -4.147649765014648, "step": 421 }, { "epoch": 1.16, "learning_rate": 2.891156724803692e-07, "loss": 0.038, "step": 422 }, { "epoch": 1.16, "logps_train/chosen": -66.68498229980469, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -101.6875, "logps_train/rejected": -142.43418884277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19185340404510498, "rewards_train/margins": 4.264618754386902, "rewards_train/rejected": -4.072765350341797, "step": 422 }, { "epoch": 1.17, "learning_rate": 2.8857126032256424e-07, "loss": 0.0635, "step": 423 }, { "epoch": 1.17, "logps_train/chosen": -68.22858428955078, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -150.6878662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2254817932844162, "rewards_train/margins": 4.443095967173576, "rewards_train/rejected": -4.21761417388916, "step": 423 }, { "epoch": 1.17, "learning_rate": 2.880260305454146e-07, "loss": 0.0467, "step": 424 }, { "epoch": 1.17, "logps_train/chosen": -69.40853881835938, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -147.53033447265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22301360964775085, "rewards_train/margins": 4.422239035367966, "rewards_train/rejected": -4.199225425720215, "step": 424 }, { "epoch": 1.17, "learning_rate": 2.874799881820472e-07, "loss": 0.0401, "step": 425 }, { "epoch": 1.17, "logps_train/chosen": -67.6269302368164, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -145.17034912109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20801027119159698, "rewards_train/margins": 4.249946668744087, "rewards_train/rejected": -4.04193639755249, "step": 425 }, { "epoch": 1.17, "learning_rate": 2.869331382730905e-07, "loss": 0.0636, "step": 426 }, { "epoch": 1.17, "logps_train/chosen": -66.05145263671875, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -101.4375, "logps_train/rejected": -140.43582153320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1536923199892044, "rewards_train/margins": 4.051278427243233, "rewards_train/rejected": -3.8975861072540283, "step": 426 }, { "epoch": 1.18, "learning_rate": 2.863854858666272e-07, "loss": 0.0738, "step": 427 }, { "epoch": 1.18, "logps_train/chosen": -67.84514617919922, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -144.6107177734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.045598823577165604, "rewards_train/margins": 4.073674838989973, "rewards_train/rejected": -4.119273662567139, "step": 427 }, { "epoch": 1.18, "learning_rate": 2.8583703601814833e-07, "loss": 0.0612, "step": 428 }, { "epoch": 1.18, "logps_train/chosen": -68.24170684814453, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -145.59219360351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.030370891094207764, "rewards_train/margins": 4.120889842510223, "rewards_train/rejected": -4.090518951416016, "step": 428 }, { "epoch": 1.18, "learning_rate": 2.852877937905061e-07, "loss": 0.0642, "step": 429 }, { "epoch": 1.18, "logps_train/chosen": -67.56143951416016, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -110.5, "logps_train/rejected": -153.239013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33438336849212646, "rewards_train/margins": 4.609309554100037, "rewards_train/rejected": -4.27492618560791, "step": 429 }, { "epoch": 1.18, "learning_rate": 2.8473776425386746e-07, "loss": 0.0269, "step": 430 }, { "epoch": 1.18, "logps_train/chosen": -67.678466796875, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -141.31805419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07229048013687134, "rewards_train/margins": 3.8615668416023254, "rewards_train/rejected": -3.789276361465454, "step": 430 }, { "epoch": 1.19, "learning_rate": 2.84186952485667e-07, "loss": 0.0724, "step": 431 }, { "epoch": 1.19, "logps_train/chosen": -64.65007019042969, "logps_train/ref_chosen": -65.9375, "logps_train/ref_rejected": -98.75, "logps_train/rejected": -137.392822265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12810799479484558, "rewards_train/margins": 3.990046948194504, "rewards_train/rejected": -3.861938953399658, "step": 431 }, { "epoch": 1.19, "learning_rate": 2.836353635705604e-07, "loss": 0.0731, "step": 432 }, { "epoch": 1.19, "logps_train/chosen": -70.71253967285156, "logps_train/ref_chosen": -72.875, "logps_train/ref_rejected": -109.75, "logps_train/rejected": -153.11529541015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2159041464328766, "rewards_train/margins": 4.55028572678566, "rewards_train/rejected": -4.334381580352783, "step": 432 }, { "epoch": 1.19, "learning_rate": 2.8308300260037727e-07, "loss": 0.0404, "step": 433 }, { "epoch": 1.19, "logps_train/chosen": -68.47380065917969, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -141.15786743164062, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": -0.02672600746154785, "rewards_train/margins": 3.8214826583862305, "rewards_train/rejected": -3.8482086658477783, "step": 433 }, { "epoch": 1.2, "learning_rate": 2.825298746740743e-07, "loss": 0.0896, "step": 434 }, { "epoch": 1.2, "logps_train/chosen": -67.75090789794922, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -145.23406982421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2988349497318268, "rewards_train/margins": 4.566478878259659, "rewards_train/rejected": -4.267643928527832, "step": 434 }, { "epoch": 1.2, "learning_rate": 2.8197598489768793e-07, "loss": 0.0303, "step": 435 }, { "epoch": 1.2, "logps_train/chosen": -65.41513061523438, "logps_train/ref_chosen": -66.6875, "logps_train/ref_rejected": -100.875, "logps_train/rejected": -142.21762084960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12523497641086578, "rewards_train/margins": 4.26247538626194, "rewards_train/rejected": -4.137240409851074, "step": 435 }, { "epoch": 1.2, "learning_rate": 2.8142133838428754e-07, "loss": 0.0541, "step": 436 }, { "epoch": 1.2, "logps_train/chosen": -70.45610809326172, "logps_train/ref_chosen": -72.75, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -149.36294555664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23251411318778992, "rewards_train/margins": 4.619493216276169, "rewards_train/rejected": -4.386979103088379, "step": 436 }, { "epoch": 1.2, "learning_rate": 2.8086594025392813e-07, "loss": 0.0349, "step": 437 }, { "epoch": 1.2, "logps_train/chosen": -67.05702209472656, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -142.78958129882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18384842574596405, "rewards_train/margins": 4.200306937098503, "rewards_train/rejected": -4.016458511352539, "step": 437 }, { "epoch": 1.21, "learning_rate": 2.8030979563360293e-07, "loss": 0.0649, "step": 438 }, { "epoch": 1.21, "logps_train/chosen": -68.00106811523438, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -152.56570434570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21737366914749146, "rewards_train/margins": 4.4722355008125305, "rewards_train/rejected": -4.254861831665039, "step": 438 }, { "epoch": 1.21, "learning_rate": 2.797529096571962e-07, "loss": 0.0406, "step": 439 }, { "epoch": 1.21, "logps_train/chosen": -69.27908325195312, "logps_train/ref_chosen": -72.1875, "logps_train/ref_rejected": -112.125, "logps_train/rejected": -155.35812377929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2925994396209717, "rewards_train/margins": 4.613960027694702, "rewards_train/rejected": -4.3213605880737305, "step": 439 }, { "epoch": 1.21, "learning_rate": 2.7919528746543583e-07, "loss": 0.0328, "step": 440 }, { "epoch": 1.21, "logps_train/chosen": -70.037841796875, "logps_train/ref_chosen": -73.0, "logps_train/ref_rejected": -107.8125, "logps_train/rejected": -150.87832641601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29792505502700806, "rewards_train/margins": 4.604604780673981, "rewards_train/rejected": -4.306679725646973, "step": 440 }, { "epoch": 1.21, "learning_rate": 2.786369342058459e-07, "loss": 0.0363, "step": 441 }, { "epoch": 1.21, "logps_train/chosen": -67.52095031738281, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -147.97525024414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1457565873861313, "rewards_train/margins": 4.340694293379784, "rewards_train/rejected": -4.194937705993652, "step": 441 }, { "epoch": 1.22, "learning_rate": 2.780778550326989e-07, "loss": 0.0547, "step": 442 }, { "epoch": 1.22, "logps_train/chosen": -69.66340637207031, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -109.125, "logps_train/rejected": -152.74951171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14479249715805054, "rewards_train/margins": 4.509147584438324, "rewards_train/rejected": -4.364355087280273, "step": 442 }, { "epoch": 1.22, "learning_rate": 2.775180551069688e-07, "loss": 0.0493, "step": 443 }, { "epoch": 1.22, "logps_train/chosen": -68.1794662475586, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -149.6548309326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13039332628250122, "rewards_train/margins": 4.445094406604767, "rewards_train/rejected": -4.314701080322266, "step": 443 }, { "epoch": 1.22, "learning_rate": 2.769575395962826e-07, "loss": 0.0549, "step": 444 }, { "epoch": 1.22, "logps_train/chosen": -67.76611328125, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -148.75982666015625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.15859344601631165, "rewards_train/margins": 4.435357362031937, "rewards_train/rejected": -4.276763916015625, "step": 444 }, { "epoch": 1.23, "learning_rate": 2.7639631367487316e-07, "loss": 0.0568, "step": 445 }, { "epoch": 1.23, "logps_train/chosen": -67.33597564697266, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -146.01861572265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.015865158289670944, "rewards_train/margins": 4.262745212763548, "rewards_train/rejected": -4.246880054473877, "step": 445 }, { "epoch": 1.23, "learning_rate": 2.758343825235313e-07, "loss": 0.0573, "step": 446 }, { "epoch": 1.23, "logps_train/chosen": -67.9198989868164, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -150.83026123046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32978755235671997, "rewards_train/margins": 4.698555290699005, "rewards_train/rejected": -4.368767738342285, "step": 446 }, { "epoch": 1.23, "learning_rate": 2.7527175132955786e-07, "loss": 0.0312, "step": 447 }, { "epoch": 1.23, "logps_train/chosen": -69.00973510742188, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -152.2115020751953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2072780728340149, "rewards_train/margins": 4.564072906970978, "rewards_train/rejected": -4.356794834136963, "step": 447 }, { "epoch": 1.23, "learning_rate": 2.747084252867161e-07, "loss": 0.0464, "step": 448 }, { "epoch": 1.23, "logps_train/chosen": -69.63262939453125, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -146.45040893554688, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.03327004984021187, "rewards_train/margins": 4.325626540929079, "rewards_train/rejected": -4.292356491088867, "step": 448 }, { "epoch": 1.24, "learning_rate": 2.7414440959518337e-07, "loss": 0.0618, "step": 449 }, { "epoch": 1.24, "logps_train/chosen": -66.91033935546875, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -101.375, "logps_train/rejected": -141.32388305664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2849920392036438, "rewards_train/margins": 4.2802694439888, "rewards_train/rejected": -3.9952774047851562, "step": 449 }, { "epoch": 1.24, "learning_rate": 2.735797094615035e-07, "loss": 0.0362, "step": 450 }, { "epoch": 1.24, "logps_train/chosen": -69.53321075439453, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -147.13729858398438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1535639464855194, "rewards_train/margins": 4.367293804883957, "rewards_train/rejected": -4.2137298583984375, "step": 450 }, { "epoch": 1.24, "learning_rate": 2.730143300985384e-07, "loss": 0.0535, "step": 451 }, { "epoch": 1.24, "logps_train/chosen": -67.26081848144531, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -147.74249267578125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2449142187833786, "rewards_train/margins": 4.59299199283123, "rewards_train/rejected": -4.348077774047852, "step": 451 }, { "epoch": 1.25, "learning_rate": 2.7244827672542006e-07, "loss": 0.0449, "step": 452 }, { "epoch": 1.25, "logps_train/chosen": -67.3250732421875, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -148.29379272460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36905527114868164, "rewards_train/margins": 4.662839412689209, "rewards_train/rejected": -4.293784141540527, "step": 452 }, { "epoch": 1.25, "learning_rate": 2.7188155456750254e-07, "loss": 0.038, "step": 453 }, { "epoch": 1.25, "logps_train/chosen": -67.43411254882812, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -149.56356811523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.294039249420166, "rewards_train/margins": 4.736137866973877, "rewards_train/rejected": -4.442098617553711, "step": 453 }, { "epoch": 1.25, "learning_rate": 2.713141688563135e-07, "loss": 0.0185, "step": 454 }, { "epoch": 1.25, "logps_train/chosen": -65.80799865722656, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -147.2124481201172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3147561848163605, "rewards_train/margins": 4.723793298006058, "rewards_train/rejected": -4.409037113189697, "step": 454 }, { "epoch": 1.25, "learning_rate": 2.7074612482950607e-07, "loss": 0.0209, "step": 455 }, { "epoch": 1.25, "logps_train/chosen": -70.75172424316406, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -107.3125, "logps_train/rejected": -151.5996856689453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.019996993243694305, "rewards_train/margins": 4.405792362987995, "rewards_train/rejected": -4.4257893562316895, "step": 455 }, { "epoch": 1.26, "learning_rate": 2.7017742773081027e-07, "loss": 0.0527, "step": 456 }, { "epoch": 1.26, "logps_train/chosen": -66.3563232421875, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -148.058349609375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.20660440623760223, "rewards_train/margins": 4.542420789599419, "rewards_train/rejected": -4.335816383361816, "step": 456 }, { "epoch": 1.26, "learning_rate": 2.6960808280998486e-07, "loss": 0.0429, "step": 457 }, { "epoch": 1.26, "logps_train/chosen": -67.74063873291016, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -142.02783203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06449323892593384, "rewards_train/margins": 3.9497637152671814, "rewards_train/rejected": -4.014256954193115, "step": 457 }, { "epoch": 1.26, "learning_rate": 2.6903809532276884e-07, "loss": 0.0798, "step": 458 }, { "epoch": 1.26, "logps_train/chosen": -69.42481994628906, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -147.48207092285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.018162120133638382, "rewards_train/margins": 4.202552188187838, "rewards_train/rejected": -4.184390068054199, "step": 458 }, { "epoch": 1.26, "learning_rate": 2.684674705308327e-07, "loss": 0.0702, "step": 459 }, { "epoch": 1.26, "logps_train/chosen": -68.33694458007812, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -146.77366638183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20634397864341736, "rewards_train/margins": 4.3099797666072845, "rewards_train/rejected": -4.103635787963867, "step": 459 }, { "epoch": 1.27, "learning_rate": 2.6789621370173025e-07, "loss": 0.0624, "step": 460 }, { "epoch": 1.27, "logps_train/chosen": -69.23265075683594, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -107.625, "logps_train/rejected": -155.04776000976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21223296225070953, "rewards_train/margins": 4.955973997712135, "rewards_train/rejected": -4.743741035461426, "step": 460 }, { "epoch": 1.27, "learning_rate": 2.6732433010884953e-07, "loss": 0.0251, "step": 461 }, { "epoch": 1.27, "logps_train/chosen": -66.15486907958984, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -144.66513061523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14979617297649384, "rewards_train/margins": 4.34501950442791, "rewards_train/rejected": -4.195223331451416, "step": 461 }, { "epoch": 1.27, "learning_rate": 2.667518250313645e-07, "loss": 0.0598, "step": 462 }, { "epoch": 1.27, "logps_train/chosen": -68.98805236816406, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -111.0, "logps_train/rejected": -156.0538787841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16447632014751434, "rewards_train/margins": 4.67132942378521, "rewards_train/rejected": -4.506853103637695, "step": 462 }, { "epoch": 1.28, "learning_rate": 2.6617870375418625e-07, "loss": 0.0362, "step": 463 }, { "epoch": 1.28, "logps_train/chosen": -66.52449035644531, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -146.34027099609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18046161532402039, "rewards_train/margins": 4.4243514239788055, "rewards_train/rejected": -4.243889808654785, "step": 463 }, { "epoch": 1.28, "learning_rate": 2.656049715679138e-07, "loss": 0.0464, "step": 464 }, { "epoch": 1.28, "logps_train/chosen": -70.00262451171875, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -150.48062133789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12078265845775604, "rewards_train/margins": 4.584859654307365, "rewards_train/rejected": -4.464076995849609, "step": 464 }, { "epoch": 1.28, "learning_rate": 2.65030633768786e-07, "loss": 0.039, "step": 465 }, { "epoch": 1.28, "logps_train/chosen": -70.23699188232422, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -150.22210693359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10613442957401276, "rewards_train/margins": 4.341627135872841, "rewards_train/rejected": -4.235492706298828, "step": 465 }, { "epoch": 1.28, "learning_rate": 2.6445569565863204e-07, "loss": 0.0654, "step": 466 }, { "epoch": 1.28, "logps_train/chosen": -68.28746795654297, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -147.16229248046875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.019593127071857452, "rewards_train/margins": 4.3636530712246895, "rewards_train/rejected": -4.344059944152832, "step": 466 }, { "epoch": 1.29, "learning_rate": 2.6388016254482267e-07, "loss": 0.0675, "step": 467 }, { "epoch": 1.29, "logps_train/chosen": -69.61527252197266, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -151.91232299804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0967247486114502, "rewards_train/margins": 4.5034849643707275, "rewards_train/rejected": -4.406760215759277, "step": 467 }, { "epoch": 1.29, "learning_rate": 2.633040397402214e-07, "loss": 0.0568, "step": 468 }, { "epoch": 1.29, "logps_train/chosen": -68.61021423339844, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -153.71710205078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30177104473114014, "rewards_train/margins": 4.820648550987244, "rewards_train/rejected": -4.5188775062561035, "step": 468 }, { "epoch": 1.29, "learning_rate": 2.6272733256313507e-07, "loss": 0.0281, "step": 469 }, { "epoch": 1.29, "logps_train/chosen": -66.93436431884766, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -145.83148193359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04137769341468811, "rewards_train/margins": 4.39249387383461, "rewards_train/rejected": -4.351116180419922, "step": 469 }, { "epoch": 1.29, "learning_rate": 2.621500463372651e-07, "loss": 0.0561, "step": 470 }, { "epoch": 1.29, "logps_train/chosen": -69.10176849365234, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -153.04412841796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24358288943767548, "rewards_train/margins": 4.96567265689373, "rewards_train/rejected": -4.722089767456055, "step": 470 }, { "epoch": 1.3, "learning_rate": 2.615721863916582e-07, "loss": 0.0254, "step": 471 }, { "epoch": 1.3, "logps_train/chosen": -68.58969116210938, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -152.84939575195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2092437446117401, "rewards_train/margins": 4.797698944807053, "rewards_train/rejected": -4.5884552001953125, "step": 471 }, { "epoch": 1.3, "learning_rate": 2.609937580606574e-07, "loss": 0.0328, "step": 472 }, { "epoch": 1.3, "logps_train/chosen": -69.78268432617188, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -148.4790802001953, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.10283519327640533, "rewards_train/margins": 4.489316955208778, "rewards_train/rejected": -4.386481761932373, "step": 472 }, { "epoch": 1.3, "learning_rate": 2.604147666838523e-07, "loss": 0.0656, "step": 473 }, { "epoch": 1.3, "logps_train/chosen": -67.78311157226562, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -107.9375, "logps_train/rejected": -152.14306640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26060447096824646, "rewards_train/margins": 4.678719133138657, "rewards_train/rejected": -4.41811466217041, "step": 473 }, { "epoch": 1.31, "learning_rate": 2.598352176060302e-07, "loss": 0.0387, "step": 474 }, { "epoch": 1.31, "logps_train/chosen": -68.21180725097656, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -151.91964721679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19322359561920166, "rewards_train/margins": 4.685091614723206, "rewards_train/rejected": -4.491868019104004, "step": 474 }, { "epoch": 1.31, "learning_rate": 2.592551161771268e-07, "loss": 0.0443, "step": 475 }, { "epoch": 1.31, "logps_train/chosen": -66.26914978027344, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -145.91592407226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2697159945964813, "rewards_train/margins": 4.502763479948044, "rewards_train/rejected": -4.2330474853515625, "step": 475 }, { "epoch": 1.31, "learning_rate": 2.586744677521765e-07, "loss": 0.047, "step": 476 }, { "epoch": 1.31, "logps_train/chosen": -66.99176788330078, "logps_train/ref_chosen": -67.0625, "logps_train/ref_rejected": -101.1875, "logps_train/rejected": -145.7929229736328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.004387697204947472, "rewards_train/margins": 4.464245637878776, "rewards_train/rejected": -4.459857940673828, "step": 476 }, { "epoch": 1.31, "learning_rate": 2.5809327769126314e-07, "loss": 0.0535, "step": 477 }, { "epoch": 1.31, "logps_train/chosen": -70.02531433105469, "logps_train/ref_chosen": -73.0625, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -151.34332275390625, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.304157555103302, "rewards_train/margins": 4.636439144611359, "rewards_train/rejected": -4.332281589508057, "step": 477 }, { "epoch": 1.32, "learning_rate": 2.5751155135947067e-07, "loss": 0.0542, "step": 478 }, { "epoch": 1.32, "logps_train/chosen": -68.37837982177734, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -149.3774871826172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0268511064350605, "rewards_train/margins": 4.449667524546385, "rewards_train/rejected": -4.476518630981445, "step": 478 }, { "epoch": 1.32, "learning_rate": 2.5692929412683317e-07, "loss": 0.0584, "step": 479 }, { "epoch": 1.32, "logps_train/chosen": -67.2031021118164, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -147.3956298828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15571478009223938, "rewards_train/margins": 4.435805112123489, "rewards_train/rejected": -4.28009033203125, "step": 479 }, { "epoch": 1.32, "learning_rate": 2.5634651136828594e-07, "loss": 0.0553, "step": 480 }, { "epoch": 1.32, "logps_train/chosen": -68.10614776611328, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -99.375, "logps_train/rejected": -143.15652465820312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.03904347866773605, "rewards_train/margins": 4.413144163787365, "rewards_train/rejected": -4.374100685119629, "step": 480 }, { "epoch": 1.33, "learning_rate": 2.557632084636152e-07, "loss": 0.0591, "step": 481 }, { "epoch": 1.33, "logps_train/chosen": -65.8223648071289, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -147.26168823242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21913033723831177, "rewards_train/margins": 4.557066261768341, "rewards_train/rejected": -4.337935924530029, "step": 481 }, { "epoch": 1.33, "learning_rate": 2.5517939079740894e-07, "loss": 0.0423, "step": 482 }, { "epoch": 1.33, "logps_train/chosen": -65.68840789794922, "logps_train/ref_chosen": -66.75, "logps_train/ref_rejected": -102.875, "logps_train/rejected": -145.3392333984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10801500082015991, "rewards_train/margins": 4.35458368062973, "rewards_train/rejected": -4.24656867980957, "step": 482 }, { "epoch": 1.33, "learning_rate": 2.545950637590069e-07, "loss": 0.0625, "step": 483 }, { "epoch": 1.33, "logps_train/chosen": -68.27511596679688, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -152.12376403808594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": -0.06481601297855377, "rewards_train/margins": 4.431594356894493, "rewards_train/rejected": -4.496410369873047, "step": 483 }, { "epoch": 1.33, "learning_rate": 2.5401023274245104e-07, "loss": 0.0839, "step": 484 }, { "epoch": 1.33, "logps_train/chosen": -67.32601928710938, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -146.61143493652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.001968424767255783, "rewards_train/margins": 4.433717291802168, "rewards_train/rejected": -4.431748867034912, "step": 484 }, { "epoch": 1.34, "learning_rate": 2.5342490314643553e-07, "loss": 0.0735, "step": 485 }, { "epoch": 1.34, "logps_train/chosen": -68.60681915283203, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -150.8391876220703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2639273405075073, "rewards_train/margins": 4.778071284294128, "rewards_train/rejected": -4.514143943786621, "step": 485 }, { "epoch": 1.34, "learning_rate": 2.5283908037425725e-07, "loss": 0.0444, "step": 486 }, { "epoch": 1.34, "logps_train/chosen": -70.05635070800781, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -150.86654663085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11633762717247009, "rewards_train/margins": 4.545667499303818, "rewards_train/rejected": -4.429329872131348, "step": 486 }, { "epoch": 1.34, "learning_rate": 2.522527698337653e-07, "loss": 0.0438, "step": 487 }, { "epoch": 1.34, "logps_train/chosen": -70.10136413574219, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -151.64080810546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1656450778245926, "rewards_train/margins": 4.639199689030647, "rewards_train/rejected": -4.473554611206055, "step": 487 }, { "epoch": 1.34, "learning_rate": 2.5166597693731193e-07, "loss": 0.0382, "step": 488 }, { "epoch": 1.34, "logps_train/chosen": -67.95059967041016, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -153.69508361816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1913173645734787, "rewards_train/margins": 5.038902565836906, "rewards_train/rejected": -4.847585201263428, "step": 488 }, { "epoch": 1.35, "learning_rate": 2.510787071017017e-07, "loss": 0.0313, "step": 489 }, { "epoch": 1.35, "logps_train/chosen": -66.21101379394531, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -147.03457641601562, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.1947184056043625, "rewards_train/margins": 4.4241519421339035, "rewards_train/rejected": -4.229433536529541, "step": 489 }, { "epoch": 1.35, "learning_rate": 2.504909657481422e-07, "loss": 0.0554, "step": 490 }, { "epoch": 1.35, "logps_train/chosen": -66.50759887695312, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -148.03277587890625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2343474179506302, "rewards_train/margins": 4.387576177716255, "rewards_train/rejected": -4.153228759765625, "step": 490 }, { "epoch": 1.35, "learning_rate": 2.4990275830219354e-07, "loss": 0.0728, "step": 491 }, { "epoch": 1.35, "logps_train/chosen": -68.69912719726562, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -152.53062438964844, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.11714731156826019, "rewards_train/margins": 4.6539991945028305, "rewards_train/rejected": -4.53685188293457, "step": 491 }, { "epoch": 1.36, "learning_rate": 2.493140901937184e-07, "loss": 0.062, "step": 492 }, { "epoch": 1.36, "logps_train/chosen": -65.18254089355469, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -146.9124755859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22847440838813782, "rewards_train/margins": 4.626119881868362, "rewards_train/rejected": -4.397645473480225, "step": 492 }, { "epoch": 1.36, "learning_rate": 2.487249668568322e-07, "loss": 0.0355, "step": 493 }, { "epoch": 1.36, "logps_train/chosen": -69.07369995117188, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -151.69366455078125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.23911452293395996, "rewards_train/margins": 4.7430031299591064, "rewards_train/rejected": -4.5038886070251465, "step": 493 }, { "epoch": 1.36, "learning_rate": 2.4813539372985225e-07, "loss": 0.0493, "step": 494 }, { "epoch": 1.36, "logps_train/chosen": -68.58980560302734, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -152.3354949951172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2548862397670746, "rewards_train/margins": 4.765389055013657, "rewards_train/rejected": -4.510502815246582, "step": 494 }, { "epoch": 1.36, "learning_rate": 2.475453762552482e-07, "loss": 0.035, "step": 495 }, { "epoch": 1.36, "logps_train/chosen": -64.58512115478516, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -148.400390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.308040976524353, "rewards_train/margins": 4.892612338066101, "rewards_train/rejected": -4.584571361541748, "step": 495 }, { "epoch": 1.37, "learning_rate": 2.469549198795917e-07, "loss": 0.0274, "step": 496 }, { "epoch": 1.37, "logps_train/chosen": -67.02243041992188, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -150.6532440185547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22578442096710205, "rewards_train/margins": 4.845601677894592, "rewards_train/rejected": -4.61981725692749, "step": 496 }, { "epoch": 1.37, "learning_rate": 2.4636403005350564e-07, "loss": 0.0278, "step": 497 }, { "epoch": 1.37, "logps_train/chosen": -68.67863464355469, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -152.31295776367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23731274902820587, "rewards_train/margins": 4.766265347599983, "rewards_train/rejected": -4.528952598571777, "step": 497 }, { "epoch": 1.37, "learning_rate": 2.4577271223161444e-07, "loss": 0.0368, "step": 498 }, { "epoch": 1.37, "logps_train/chosen": -68.31184387207031, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -105.9375, "logps_train/rejected": -152.58505249023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1685224324464798, "rewards_train/margins": 4.833374574780464, "rewards_train/rejected": -4.664852142333984, "step": 498 }, { "epoch": 1.37, "learning_rate": 2.4518097187249333e-07, "loss": 0.0352, "step": 499 }, { "epoch": 1.37, "logps_train/chosen": -69.51658630371094, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -149.35508728027344, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.08667135238647461, "rewards_train/margins": 4.684386730194092, "rewards_train/rejected": -4.597715377807617, "step": 499 }, { "epoch": 1.38, "learning_rate": 2.445888144386181e-07, "loss": 0.0594, "step": 500 }, { "epoch": 1.38, "logps_train/chosen": -67.93736267089844, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -151.5426025390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2386365681886673, "rewards_train/margins": 4.75129659473896, "rewards_train/rejected": -4.512660026550293, "step": 500 }, { "epoch": 1.38, "learning_rate": 2.439962453963147e-07, "loss": 0.0298, "step": 501 }, { "epoch": 1.38, "logps_train/chosen": -65.73873138427734, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -152.092041015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31728896498680115, "rewards_train/margins": 4.955936998128891, "rewards_train/rejected": -4.63864803314209, "step": 501 }, { "epoch": 1.38, "learning_rate": 2.4340327021570856e-07, "loss": 0.0439, "step": 502 }, { "epoch": 1.38, "logps_train/chosen": -68.51842498779297, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -146.16217041015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.015540264546871185, "rewards_train/margins": 4.393867157399654, "rewards_train/rejected": -4.378326892852783, "step": 502 }, { "epoch": 1.39, "learning_rate": 2.4280989437067435e-07, "loss": 0.0755, "step": 503 }, { "epoch": 1.39, "logps_train/chosen": -67.36289978027344, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.1875, "logps_train/rejected": -147.10391235351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07899351418018341, "rewards_train/margins": 4.467217639088631, "rewards_train/rejected": -4.388224124908447, "step": 503 }, { "epoch": 1.39, "learning_rate": 2.4221612333878546e-07, "loss": 0.0499, "step": 504 }, { "epoch": 1.39, "logps_train/chosen": -66.972900390625, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -145.0050048828125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.11096148937940598, "rewards_train/margins": 4.397400431334972, "rewards_train/rejected": -4.286438941955566, "step": 504 }, { "epoch": 1.39, "learning_rate": 2.416219626012632e-07, "loss": 0.0589, "step": 505 }, { "epoch": 1.39, "logps_train/chosen": -65.83431243896484, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -152.67074584960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36910775303840637, "rewards_train/margins": 5.0869636833667755, "rewards_train/rejected": -4.717855930328369, "step": 505 }, { "epoch": 1.39, "learning_rate": 2.4102741764292626e-07, "loss": 0.0164, "step": 506 }, { "epoch": 1.39, "logps_train/chosen": -66.69345092773438, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -150.14572143554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12118268013000488, "rewards_train/margins": 4.750988245010376, "rewards_train/rejected": -4.629805564880371, "step": 506 }, { "epoch": 1.4, "learning_rate": 2.4043249395214035e-07, "loss": 0.0465, "step": 507 }, { "epoch": 1.4, "logps_train/chosen": -66.62055969238281, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -107.4375, "logps_train/rejected": -152.92657470703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2708534002304077, "rewards_train/margins": 4.821910262107849, "rewards_train/rejected": -4.551056861877441, "step": 507 }, { "epoch": 1.4, "learning_rate": 2.398371970207672e-07, "loss": 0.0294, "step": 508 }, { "epoch": 1.4, "logps_train/chosen": -64.6028823852539, "logps_train/ref_chosen": -65.9375, "logps_train/ref_rejected": -101.1875, "logps_train/rejected": -147.0994110107422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13238734006881714, "rewards_train/margins": 4.720453441143036, "rewards_train/rejected": -4.588066101074219, "step": 508 }, { "epoch": 1.4, "learning_rate": 2.392415323441141e-07, "loss": 0.0349, "step": 509 }, { "epoch": 1.4, "logps_train/chosen": -70.35090637207031, "logps_train/ref_chosen": -72.6875, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -155.79490661621094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2360028624534607, "rewards_train/margins": 5.000453889369965, "rewards_train/rejected": -4.764451026916504, "step": 509 }, { "epoch": 1.4, "learning_rate": 2.386455054208829e-07, "loss": 0.0458, "step": 510 }, { "epoch": 1.4, "logps_train/chosen": -67.539794921875, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -149.8653564453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2575928568840027, "rewards_train/margins": 4.790466010570526, "rewards_train/rejected": -4.532873153686523, "step": 510 }, { "epoch": 1.41, "learning_rate": 2.3804912175311963e-07, "loss": 0.0393, "step": 511 }, { "epoch": 1.41, "logps_train/chosen": -68.96965026855469, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -148.3440704345703, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.13062281906604767, "rewards_train/margins": 4.544913247227669, "rewards_train/rejected": -4.414290428161621, "step": 511 }, { "epoch": 1.41, "learning_rate": 2.3745238684616336e-07, "loss": 0.0618, "step": 512 }, { "epoch": 1.41, "logps_train/chosen": -67.14103698730469, "logps_train/ref_chosen": -67.0, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -151.39630126953125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": -0.015910685062408447, "rewards_train/margins": 4.672986447811127, "rewards_train/rejected": -4.688897132873535, "step": 512 }, { "epoch": 1.41, "learning_rate": 2.3685530620859553e-07, "loss": 0.0516, "step": 513 }, { "epoch": 1.41, "logps_train/chosen": -66.8527603149414, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -109.0, "logps_train/rejected": -158.0223388671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24441145360469818, "rewards_train/margins": 5.14850090444088, "rewards_train/rejected": -4.904089450836182, "step": 513 }, { "epoch": 1.42, "learning_rate": 2.3625788535218924e-07, "loss": 0.0223, "step": 514 }, { "epoch": 1.42, "logps_train/chosen": -64.8753662109375, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -149.1715850830078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29849860072135925, "rewards_train/margins": 4.72493502497673, "rewards_train/rejected": -4.426436424255371, "step": 514 }, { "epoch": 1.42, "learning_rate": 2.356601297918579e-07, "loss": 0.0367, "step": 515 }, { "epoch": 1.42, "logps_train/chosen": -67.87754821777344, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -152.56114196777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18612241744995117, "rewards_train/margins": 4.714697360992432, "rewards_train/rejected": -4.5285749435424805, "step": 515 }, { "epoch": 1.42, "learning_rate": 2.35062045045605e-07, "loss": 0.051, "step": 516 }, { "epoch": 1.42, "logps_train/chosen": -64.33438110351562, "logps_train/ref_chosen": -66.5625, "logps_train/ref_rejected": -101.0625, "logps_train/rejected": -145.5874786376953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22383761405944824, "rewards_train/margins": 4.676726579666138, "rewards_train/rejected": -4.4528889656066895, "step": 516 }, { "epoch": 1.42, "learning_rate": 2.3446363663447263e-07, "loss": 0.0459, "step": 517 }, { "epoch": 1.42, "logps_train/chosen": -66.06584167480469, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -153.07691955566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3282797932624817, "rewards_train/margins": 5.007260620594025, "rewards_train/rejected": -4.678980827331543, "step": 517 }, { "epoch": 1.43, "learning_rate": 2.3386491008249071e-07, "loss": 0.0308, "step": 518 }, { "epoch": 1.43, "logps_train/chosen": -69.39799499511719, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -150.10272216796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06254462152719498, "rewards_train/margins": 4.60045412927866, "rewards_train/rejected": -4.537909507751465, "step": 518 }, { "epoch": 1.43, "learning_rate": 2.3326587091662602e-07, "loss": 0.0481, "step": 519 }, { "epoch": 1.43, "logps_train/chosen": -68.55818176269531, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -107.4375, "logps_train/rejected": -157.0637969970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18632090091705322, "rewards_train/margins": 5.151196122169495, "rewards_train/rejected": -4.964875221252441, "step": 519 }, { "epoch": 1.43, "learning_rate": 2.3266652466673102e-07, "loss": 0.0144, "step": 520 }, { "epoch": 1.43, "logps_train/chosen": -66.79846954345703, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -151.59393310546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2402697652578354, "rewards_train/margins": 4.84131346642971, "rewards_train/rejected": -4.601043701171875, "step": 520 }, { "epoch": 1.44, "learning_rate": 2.3206687686549311e-07, "loss": 0.0345, "step": 521 }, { "epoch": 1.44, "logps_train/chosen": -67.40617370605469, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -104.8125, "logps_train/rejected": -150.42599487304688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13116027414798737, "rewards_train/margins": 4.692706599831581, "rewards_train/rejected": -4.561546325683594, "step": 521 }, { "epoch": 1.44, "learning_rate": 2.3146693304838333e-07, "loss": 0.0561, "step": 522 }, { "epoch": 1.44, "logps_train/chosen": -68.24472045898438, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -150.555419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10555725544691086, "rewards_train/margins": 4.778726391494274, "rewards_train/rejected": -4.673169136047363, "step": 522 }, { "epoch": 1.44, "learning_rate": 2.3086669875360512e-07, "loss": 0.0479, "step": 523 }, { "epoch": 1.44, "logps_train/chosen": -65.86939239501953, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -153.81790161132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3329337239265442, "rewards_train/margins": 5.03767329454422, "rewards_train/rejected": -4.704739570617676, "step": 523 }, { "epoch": 1.44, "learning_rate": 2.302661795220436e-07, "loss": 0.0271, "step": 524 }, { "epoch": 1.44, "logps_train/chosen": -67.38636779785156, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -105.1875, "logps_train/rejected": -151.43017578125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.19163642823696136, "rewards_train/margins": 4.812877997756004, "rewards_train/rejected": -4.621241569519043, "step": 524 }, { "epoch": 1.45, "learning_rate": 2.2966538089721408e-07, "loss": 0.0449, "step": 525 }, { "epoch": 1.45, "logps_train/chosen": -67.33525085449219, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -153.8387451171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20192384719848633, "rewards_train/margins": 4.972565174102783, "rewards_train/rejected": -4.770641326904297, "step": 525 }, { "epoch": 1.45, "learning_rate": 2.2906430842521112e-07, "loss": 0.0287, "step": 526 }, { "epoch": 1.45, "logps_train/chosen": -69.02153778076172, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -150.67572021484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07875456660985947, "rewards_train/margins": 4.735389851033688, "rewards_train/rejected": -4.656635284423828, "step": 526 }, { "epoch": 1.45, "learning_rate": 2.2846296765465703e-07, "loss": 0.0403, "step": 527 }, { "epoch": 1.45, "logps_train/chosen": -67.4461669921875, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -147.787353515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.01749308407306671, "rewards_train/margins": 4.648766353726387, "rewards_train/rejected": -4.63127326965332, "step": 527 }, { "epoch": 1.45, "learning_rate": 2.27861364136651e-07, "loss": 0.0408, "step": 528 }, { "epoch": 1.45, "logps_train/chosen": -69.89947509765625, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -151.57705688476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.144769549369812, "rewards_train/margins": 4.8112629652023315, "rewards_train/rejected": -4.6664934158325195, "step": 528 }, { "epoch": 1.46, "learning_rate": 2.2725950342471765e-07, "loss": 0.0405, "step": 529 }, { "epoch": 1.46, "logps_train/chosen": -69.33050537109375, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -109.5, "logps_train/rejected": -158.69984436035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2299860715866089, "rewards_train/margins": 5.147334456443787, "rewards_train/rejected": -4.917348384857178, "step": 529 }, { "epoch": 1.46, "learning_rate": 2.266573910747558e-07, "loss": 0.0261, "step": 530 }, { "epoch": 1.46, "logps_train/chosen": -64.34436798095703, "logps_train/ref_chosen": -65.6875, "logps_train/ref_rejected": -98.625, "logps_train/rejected": -141.57810974121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13504567742347717, "rewards_train/margins": 4.4268414080142975, "rewards_train/rejected": -4.29179573059082, "step": 530 }, { "epoch": 1.46, "learning_rate": 2.2605503264498714e-07, "loss": 0.0513, "step": 531 }, { "epoch": 1.46, "logps_train/chosen": -70.02984619140625, "logps_train/ref_chosen": -72.125, "logps_train/ref_rejected": -109.625, "logps_train/rejected": -158.73687744140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2100520133972168, "rewards_train/margins": 5.119236946105957, "rewards_train/rejected": -4.90918493270874, "step": 531 }, { "epoch": 1.47, "learning_rate": 2.2545243369590513e-07, "loss": 0.0311, "step": 532 }, { "epoch": 1.47, "logps_train/chosen": -64.47931671142578, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -101.1875, "logps_train/rejected": -146.59197998046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3010430932044983, "rewards_train/margins": 4.838758051395416, "rewards_train/rejected": -4.537714958190918, "step": 532 }, { "epoch": 1.47, "learning_rate": 2.248495997902233e-07, "loss": 0.0244, "step": 533 }, { "epoch": 1.47, "logps_train/chosen": -65.97037506103516, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -150.28067016601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25354838371276855, "rewards_train/margins": 4.9284422397613525, "rewards_train/rejected": -4.674893856048584, "step": 533 }, { "epoch": 1.47, "learning_rate": 2.2424653649282428e-07, "loss": 0.0341, "step": 534 }, { "epoch": 1.47, "logps_train/chosen": -69.75567626953125, "logps_train/ref_chosen": -72.75, "logps_train/ref_rejected": -109.875, "logps_train/rejected": -160.06954956054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29806584119796753, "rewards_train/margins": 5.318303763866425, "rewards_train/rejected": -5.020237922668457, "step": 534 }, { "epoch": 1.47, "learning_rate": 2.2364324937070825e-07, "loss": 0.0326, "step": 535 }, { "epoch": 1.47, "logps_train/chosen": -68.35952758789062, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -107.375, "logps_train/rejected": -156.30970764160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2337743639945984, "rewards_train/margins": 5.127978503704071, "rewards_train/rejected": -4.894204139709473, "step": 535 }, { "epoch": 1.48, "learning_rate": 2.230397439929414e-07, "loss": 0.032, "step": 536 }, { "epoch": 1.48, "logps_train/chosen": -66.20968627929688, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -149.7369384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2151152342557907, "rewards_train/margins": 4.785488769412041, "rewards_train/rejected": -4.57037353515625, "step": 536 }, { "epoch": 1.48, "learning_rate": 2.2243602593060493e-07, "loss": 0.0398, "step": 537 }, { "epoch": 1.48, "logps_train/chosen": -67.14309692382812, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -149.96603393554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20204748213291168, "rewards_train/margins": 4.736150875687599, "rewards_train/rejected": -4.5341033935546875, "step": 537 }, { "epoch": 1.48, "learning_rate": 2.2183210075674314e-07, "loss": 0.0467, "step": 538 }, { "epoch": 1.48, "logps_train/chosen": -64.81015014648438, "logps_train/ref_chosen": -67.5625, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -149.4559326171875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27709007263183594, "rewards_train/margins": 4.786111354827881, "rewards_train/rejected": -4.509021282196045, "step": 538 }, { "epoch": 1.48, "learning_rate": 2.2122797404631236e-07, "loss": 0.0465, "step": 539 }, { "epoch": 1.48, "logps_train/chosen": -66.21721649169922, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -152.19195556640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2551828622817993, "rewards_train/margins": 5.028527855873108, "rewards_train/rejected": -4.773344993591309, "step": 539 }, { "epoch": 1.49, "learning_rate": 2.2062365137612936e-07, "loss": 0.0379, "step": 540 }, { "epoch": 1.49, "logps_train/chosen": -66.98933410644531, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -149.30764770507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15135952830314636, "rewards_train/margins": 4.715668171644211, "rewards_train/rejected": -4.5643086433410645, "step": 540 }, { "epoch": 1.49, "learning_rate": 2.200191383248197e-07, "loss": 0.0323, "step": 541 }, { "epoch": 1.49, "logps_train/chosen": -68.69373321533203, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -151.01942443847656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1539178192615509, "rewards_train/margins": 4.808692961931229, "rewards_train/rejected": -4.654775142669678, "step": 541 }, { "epoch": 1.49, "learning_rate": 2.194144404727665e-07, "loss": 0.0516, "step": 542 }, { "epoch": 1.49, "logps_train/chosen": -67.15787506103516, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -149.5911865234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24075552821159363, "rewards_train/margins": 4.947726219892502, "rewards_train/rejected": -4.706970691680908, "step": 542 }, { "epoch": 1.5, "learning_rate": 2.188095634020589e-07, "loss": 0.0398, "step": 543 }, { "epoch": 1.5, "logps_train/chosen": -66.5037612915039, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -150.8788299560547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.296841025352478, "rewards_train/margins": 4.98125684261322, "rewards_train/rejected": -4.684415817260742, "step": 543 }, { "epoch": 1.5, "learning_rate": 2.182045126964402e-07, "loss": 0.0285, "step": 544 }, { "epoch": 1.5, "logps_train/chosen": -66.99253845214844, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -152.08169555664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17037542164325714, "rewards_train/margins": 4.8250299245119095, "rewards_train/rejected": -4.654654502868652, "step": 544 }, { "epoch": 1.5, "learning_rate": 2.1759929394125688e-07, "loss": 0.0419, "step": 545 }, { "epoch": 1.5, "logps_train/chosen": -67.56024932861328, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -148.12893676757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10256870836019516, "rewards_train/margins": 4.690414033830166, "rewards_train/rejected": -4.587845325469971, "step": 545 }, { "epoch": 1.5, "learning_rate": 2.1699391272340639e-07, "loss": 0.0403, "step": 546 }, { "epoch": 1.5, "logps_train/chosen": -68.40435791015625, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -107.375, "logps_train/rejected": -157.12063598632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2205992043018341, "rewards_train/margins": 5.191257506608963, "rewards_train/rejected": -4.970658302307129, "step": 546 }, { "epoch": 1.51, "learning_rate": 2.1638837463128614e-07, "loss": 0.0376, "step": 547 }, { "epoch": 1.51, "logps_train/chosen": -72.22874450683594, "logps_train/ref_chosen": -72.125, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -154.3566131591797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.011546269059181213, "rewards_train/margins": 4.758881434798241, "rewards_train/rejected": -4.770427703857422, "step": 547 }, { "epoch": 1.51, "learning_rate": 2.1578268525474152e-07, "loss": 0.0537, "step": 548 }, { "epoch": 1.51, "logps_train/chosen": -66.26715087890625, "logps_train/ref_chosen": -66.875, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -147.31875610351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06107774004340172, "rewards_train/margins": 4.6171709559857845, "rewards_train/rejected": -4.556093215942383, "step": 548 }, { "epoch": 1.51, "learning_rate": 2.1517685018501463e-07, "loss": 0.0527, "step": 549 }, { "epoch": 1.51, "logps_train/chosen": -65.90951538085938, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -154.63058471679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20821872353553772, "rewards_train/margins": 5.061900764703751, "rewards_train/rejected": -4.853682041168213, "step": 549 }, { "epoch": 1.52, "learning_rate": 2.145708750146924e-07, "loss": 0.0269, "step": 550 }, { "epoch": 1.52, "logps_train/chosen": -69.69514465332031, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -151.86135864257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0356125608086586, "rewards_train/margins": 4.777021862566471, "rewards_train/rejected": -4.7414093017578125, "step": 550 }, { "epoch": 1.52, "learning_rate": 2.1396476533765499e-07, "loss": 0.0571, "step": 551 }, { "epoch": 1.52, "logps_train/chosen": -71.34486389160156, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -154.53158569335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06910524517297745, "rewards_train/margins": 4.73454274982214, "rewards_train/rejected": -4.803647994995117, "step": 551 }, { "epoch": 1.52, "learning_rate": 2.1335852674902433e-07, "loss": 0.0428, "step": 552 }, { "epoch": 1.52, "logps_train/chosen": -65.45652770996094, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -105.3125, "logps_train/rejected": -151.87994384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3242204785346985, "rewards_train/margins": 4.9790104031562805, "rewards_train/rejected": -4.654789924621582, "step": 552 }, { "epoch": 1.52, "learning_rate": 2.1275216484511226e-07, "loss": 0.0267, "step": 553 }, { "epoch": 1.52, "logps_train/chosen": -69.12800598144531, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -148.69796752929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06752169132232666, "rewards_train/margins": 4.598401665687561, "rewards_train/rejected": -4.530879974365234, "step": 553 }, { "epoch": 1.53, "learning_rate": 2.121456852233691e-07, "loss": 0.0559, "step": 554 }, { "epoch": 1.53, "logps_train/chosen": -71.19970703125, "logps_train/ref_chosen": -72.625, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -158.11013793945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1403321772813797, "rewards_train/margins": 5.1030063182115555, "rewards_train/rejected": -4.962674140930176, "step": 554 }, { "epoch": 1.53, "learning_rate": 2.115390934823317e-07, "loss": 0.0338, "step": 555 }, { "epoch": 1.53, "logps_train/chosen": -68.44865417480469, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -110.0, "logps_train/rejected": -161.03179931640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3320392966270447, "rewards_train/margins": 5.435023248195648, "rewards_train/rejected": -5.1029839515686035, "step": 555 }, { "epoch": 1.53, "learning_rate": 2.1093239522157202e-07, "loss": 0.0159, "step": 556 }, { "epoch": 1.53, "logps_train/chosen": -70.40196228027344, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -149.20956420898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12928655743598938, "rewards_train/margins": 4.713524609804153, "rewards_train/rejected": -4.584238052368164, "step": 556 }, { "epoch": 1.53, "learning_rate": 2.1032559604164524e-07, "loss": 0.0511, "step": 557 }, { "epoch": 1.53, "logps_train/chosen": -67.90544128417969, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -153.672119140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18225881457328796, "rewards_train/margins": 4.889312952756882, "rewards_train/rejected": -4.707054138183594, "step": 557 }, { "epoch": 1.54, "learning_rate": 2.0971870154403825e-07, "loss": 0.0438, "step": 558 }, { "epoch": 1.54, "logps_train/chosen": -67.03142547607422, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -153.99539184570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21057796478271484, "rewards_train/margins": 4.983946323394775, "rewards_train/rejected": -4.7733683586120605, "step": 558 }, { "epoch": 1.54, "learning_rate": 2.091117173311177e-07, "loss": 0.0333, "step": 559 }, { "epoch": 1.54, "logps_train/chosen": -66.75933074951172, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -151.78871154785156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1498480886220932, "rewards_train/margins": 4.870906934142113, "rewards_train/rejected": -4.7210588455200195, "step": 559 }, { "epoch": 1.54, "learning_rate": 2.0850464900607857e-07, "loss": 0.0388, "step": 560 }, { "epoch": 1.54, "logps_train/chosen": -69.20394897460938, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -108.6875, "logps_train/rejected": -158.17037963867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2627592086791992, "rewards_train/margins": 5.209582328796387, "rewards_train/rejected": -4.9468231201171875, "step": 560 }, { "epoch": 1.55, "learning_rate": 2.0789750217289215e-07, "loss": 0.0308, "step": 561 }, { "epoch": 1.55, "logps_train/chosen": -66.70816040039062, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -151.68460083007812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3383638858795166, "rewards_train/margins": 5.072646379470825, "rewards_train/rejected": -4.734282493591309, "step": 561 }, { "epoch": 1.55, "learning_rate": 2.0729028243625457e-07, "loss": 0.0287, "step": 562 }, { "epoch": 1.55, "logps_train/chosen": -65.20228576660156, "logps_train/ref_chosen": -66.25, "logps_train/ref_rejected": -98.625, "logps_train/rejected": -143.93148803710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10662718862295151, "rewards_train/margins": 4.6348840817809105, "rewards_train/rejected": -4.528256893157959, "step": 562 }, { "epoch": 1.55, "learning_rate": 2.0668299540153492e-07, "loss": 0.0583, "step": 563 }, { "epoch": 1.55, "logps_train/chosen": -67.98037719726562, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -108.875, "logps_train/rejected": -157.74313354492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12163927406072617, "rewards_train/margins": 5.006597064435482, "rewards_train/rejected": -4.884957790374756, "step": 563 }, { "epoch": 1.55, "learning_rate": 2.060756466747234e-07, "loss": 0.0408, "step": 564 }, { "epoch": 1.55, "logps_train/chosen": -67.50233459472656, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -151.947021484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22828249633312225, "rewards_train/margins": 5.0193715542554855, "rewards_train/rejected": -4.791089057922363, "step": 564 }, { "epoch": 1.56, "learning_rate": 2.0546824186238002e-07, "loss": 0.0379, "step": 565 }, { "epoch": 1.56, "logps_train/chosen": -66.10317993164062, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -100.75, "logps_train/rejected": -147.6089324951172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30838263034820557, "rewards_train/margins": 4.996522545814514, "rewards_train/rejected": -4.688139915466309, "step": 565 }, { "epoch": 1.56, "learning_rate": 2.048607865715821e-07, "loss": 0.0261, "step": 566 }, { "epoch": 1.56, "logps_train/chosen": -68.58525085449219, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -103.3125, "logps_train/rejected": -149.39697265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08498136699199677, "rewards_train/margins": 4.69469778239727, "rewards_train/rejected": -4.609716415405273, "step": 566 }, { "epoch": 1.56, "learning_rate": 2.0425328640987334e-07, "loss": 0.0525, "step": 567 }, { "epoch": 1.56, "logps_train/chosen": -68.21597290039062, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -155.17820739746094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16936977207660675, "rewards_train/margins": 5.038605764508247, "rewards_train/rejected": -4.869235992431641, "step": 567 }, { "epoch": 1.56, "learning_rate": 2.036457469852113e-07, "loss": 0.0334, "step": 568 }, { "epoch": 1.56, "logps_train/chosen": -69.56260681152344, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -151.39627075195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.006309933960437775, "rewards_train/margins": 4.9022622630000114, "rewards_train/rejected": -4.908572196960449, "step": 568 }, { "epoch": 1.57, "learning_rate": 2.0303817390591634e-07, "loss": 0.0372, "step": 569 }, { "epoch": 1.57, "logps_train/chosen": -67.63655853271484, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -151.24978637695312, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.17237913608551025, "rewards_train/margins": 4.82328474521637, "rewards_train/rejected": -4.650905609130859, "step": 569 }, { "epoch": 1.57, "learning_rate": 2.0243057278061916e-07, "loss": 0.0434, "step": 570 }, { "epoch": 1.57, "logps_train/chosen": -69.53721618652344, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -150.9163818359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15526266396045685, "rewards_train/margins": 4.847682669758797, "rewards_train/rejected": -4.69242000579834, "step": 570 }, { "epoch": 1.57, "learning_rate": 2.018229492182096e-07, "loss": 0.0356, "step": 571 }, { "epoch": 1.57, "logps_train/chosen": -68.70756530761719, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -154.38804626464844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.04199691116809845, "rewards_train/margins": 4.718389555811882, "rewards_train/rejected": -4.7603864669799805, "step": 571 }, { "epoch": 1.58, "learning_rate": 2.0121530882778447e-07, "loss": 0.0553, "step": 572 }, { "epoch": 1.58, "logps_train/chosen": -66.46324920654297, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -148.71823120117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22657543420791626, "rewards_train/margins": 4.906210958957672, "rewards_train/rejected": -4.679635524749756, "step": 572 }, { "epoch": 1.58, "learning_rate": 2.0060765721859606e-07, "loss": 0.0342, "step": 573 }, { "epoch": 1.58, "logps_train/chosen": -67.97417449951172, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -154.540771484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1706002652645111, "rewards_train/margins": 5.038544982671738, "rewards_train/rejected": -4.867944717407227, "step": 573 }, { "epoch": 1.58, "learning_rate": 2e-07, "loss": 0.045, "step": 574 }, { "epoch": 1.58, "logps_train/chosen": -67.37968444824219, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -149.82113647460938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15163162350654602, "rewards_train/margins": 4.87759330868721, "rewards_train/rejected": -4.725961685180664, "step": 574 }, { "epoch": 1.58, "learning_rate": 1.9939234278140403e-07, "loss": 0.0433, "step": 575 }, { "epoch": 1.58, "logps_train/chosen": -68.02896118164062, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -155.63455200195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20252388715744019, "rewards_train/margins": 5.120862185955048, "rewards_train/rejected": -4.918338298797607, "step": 575 }, { "epoch": 1.59, "learning_rate": 1.987846911722155e-07, "loss": 0.0228, "step": 576 }, { "epoch": 1.59, "logps_train/chosen": -68.30584716796875, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -109.8125, "logps_train/rejected": -161.10647583007812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24431705474853516, "rewards_train/margins": 5.370687484741211, "rewards_train/rejected": -5.126370429992676, "step": 576 }, { "epoch": 1.59, "learning_rate": 1.981770507817904e-07, "loss": 0.0271, "step": 577 }, { "epoch": 1.59, "logps_train/chosen": -67.79449462890625, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -151.7444305419922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0964290127158165, "rewards_train/margins": 4.823215626180172, "rewards_train/rejected": -4.7267866134643555, "step": 577 }, { "epoch": 1.59, "learning_rate": 1.9756942721938085e-07, "loss": 0.0592, "step": 578 }, { "epoch": 1.59, "logps_train/chosen": -66.50116729736328, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -154.13026428222656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25300827622413635, "rewards_train/margins": 5.067401319742203, "rewards_train/rejected": -4.814393043518066, "step": 578 }, { "epoch": 1.6, "learning_rate": 1.9696182609408367e-07, "loss": 0.0299, "step": 579 }, { "epoch": 1.6, "logps_train/chosen": -67.51536560058594, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -154.65066528320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26628631353378296, "rewards_train/margins": 5.079594552516937, "rewards_train/rejected": -4.813308238983154, "step": 579 }, { "epoch": 1.6, "learning_rate": 1.963542530147887e-07, "loss": 0.0315, "step": 580 }, { "epoch": 1.6, "logps_train/chosen": -65.85713195800781, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -156.05996704101562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35701215267181396, "rewards_train/margins": 5.245968222618103, "rewards_train/rejected": -4.888956069946289, "step": 580 }, { "epoch": 1.6, "learning_rate": 1.957467135901267e-07, "loss": 0.0308, "step": 581 }, { "epoch": 1.6, "logps_train/chosen": -67.76951599121094, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -152.1795654296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25088047981262207, "rewards_train/margins": 5.018348932266235, "rewards_train/rejected": -4.767468452453613, "step": 581 }, { "epoch": 1.6, "learning_rate": 1.951392134284179e-07, "loss": 0.0322, "step": 582 }, { "epoch": 1.6, "logps_train/chosen": -69.37258911132812, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -156.94586181640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23310218751430511, "rewards_train/margins": 5.257374152541161, "rewards_train/rejected": -5.0242719650268555, "step": 582 }, { "epoch": 1.61, "learning_rate": 1.9453175813762e-07, "loss": 0.028, "step": 583 }, { "epoch": 1.61, "logps_train/chosen": -69.00342559814453, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -152.60720825195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29506736993789673, "rewards_train/margins": 5.111990034580231, "rewards_train/rejected": -4.816922664642334, "step": 583 }, { "epoch": 1.61, "learning_rate": 1.9392435332527658e-07, "loss": 0.0296, "step": 584 }, { "epoch": 1.61, "logps_train/chosen": -68.92576599121094, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -107.5625, "logps_train/rejected": -158.80264282226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15410269796848297, "rewards_train/margins": 5.278313055634499, "rewards_train/rejected": -5.124210357666016, "step": 584 }, { "epoch": 1.61, "learning_rate": 1.9331700459846514e-07, "loss": 0.0306, "step": 585 }, { "epoch": 1.61, "logps_train/chosen": -65.53682708740234, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -149.7218017578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3573525846004486, "rewards_train/margins": 5.0906170308589935, "rewards_train/rejected": -4.733264446258545, "step": 585 }, { "epoch": 1.61, "learning_rate": 1.927097175637454e-07, "loss": 0.0324, "step": 586 }, { "epoch": 1.61, "logps_train/chosen": -68.18551635742188, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -148.484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.033156827092170715, "rewards_train/margins": 4.586574032902718, "rewards_train/rejected": -4.553417205810547, "step": 586 }, { "epoch": 1.62, "learning_rate": 1.9210249782710783e-07, "loss": 0.0529, "step": 587 }, { "epoch": 1.62, "logps_train/chosen": -68.81388854980469, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -107.0625, "logps_train/rejected": -154.78558349609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3117749094963074, "rewards_train/margins": 5.084668815135956, "rewards_train/rejected": -4.772893905639648, "step": 587 }, { "epoch": 1.62, "learning_rate": 1.9149535099392146e-07, "loss": 0.0234, "step": 588 }, { "epoch": 1.62, "logps_train/chosen": -68.586669921875, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -151.01019287109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13493679463863373, "rewards_train/margins": 4.944257244467735, "rewards_train/rejected": -4.809320449829102, "step": 588 }, { "epoch": 1.62, "learning_rate": 1.9088828266888225e-07, "loss": 0.0487, "step": 589 }, { "epoch": 1.62, "logps_train/chosen": -67.08953094482422, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -150.29989624023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12586113810539246, "rewards_train/margins": 4.947939842939377, "rewards_train/rejected": -4.822078704833984, "step": 589 }, { "epoch": 1.63, "learning_rate": 1.9028129845596176e-07, "loss": 0.0399, "step": 590 }, { "epoch": 1.63, "logps_train/chosen": -68.61676025390625, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -153.79580688476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.032367028295993805, "rewards_train/margins": 5.00852993875742, "rewards_train/rejected": -4.976162910461426, "step": 590 }, { "epoch": 1.63, "learning_rate": 1.8967440395835477e-07, "loss": 0.0395, "step": 591 }, { "epoch": 1.63, "logps_train/chosen": -68.04627990722656, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -153.07911682128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1502063125371933, "rewards_train/margins": 4.947473272681236, "rewards_train/rejected": -4.797266960144043, "step": 591 }, { "epoch": 1.63, "learning_rate": 1.89067604778428e-07, "loss": 0.0296, "step": 592 }, { "epoch": 1.63, "logps_train/chosen": -69.74247741699219, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -154.0406951904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.004268094897270203, "rewards_train/margins": 4.716442510485649, "rewards_train/rejected": -4.712174415588379, "step": 592 }, { "epoch": 1.63, "learning_rate": 1.8846090651766827e-07, "loss": 0.0518, "step": 593 }, { "epoch": 1.63, "logps_train/chosen": -65.17233276367188, "logps_train/ref_chosen": -66.625, "logps_train/ref_rejected": -101.4375, "logps_train/rejected": -146.11351013183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14463156461715698, "rewards_train/margins": 4.613600432872772, "rewards_train/rejected": -4.468968868255615, "step": 593 }, { "epoch": 1.64, "learning_rate": 1.8785431477663091e-07, "loss": 0.0656, "step": 594 }, { "epoch": 1.64, "logps_train/chosen": -69.57219696044922, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -106.1875, "logps_train/rejected": -154.47189331054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21690160036087036, "rewards_train/margins": 5.042900383472443, "rewards_train/rejected": -4.825998783111572, "step": 594 }, { "epoch": 1.64, "learning_rate": 1.8724783515488775e-07, "loss": 0.0392, "step": 595 }, { "epoch": 1.64, "logps_train/chosen": -66.16594696044922, "logps_train/ref_chosen": -66.625, "logps_train/ref_rejected": -100.3125, "logps_train/rejected": -146.78350830078125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.045270346105098724, "rewards_train/margins": 4.690516851842403, "rewards_train/rejected": -4.645246505737305, "step": 595 }, { "epoch": 1.64, "learning_rate": 1.8664147325097568e-07, "loss": 0.0518, "step": 596 }, { "epoch": 1.64, "logps_train/chosen": -67.10777282714844, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -105.9375, "logps_train/rejected": -155.64187622070312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2970848083496094, "rewards_train/margins": 5.266302108764648, "rewards_train/rejected": -4.969217300415039, "step": 596 }, { "epoch": 1.64, "learning_rate": 1.8603523466234502e-07, "loss": 0.029, "step": 597 }, { "epoch": 1.64, "logps_train/chosen": -70.48933410644531, "logps_train/ref_chosen": -72.5625, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -157.37063598632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20492395758628845, "rewards_train/margins": 5.153608173131943, "rewards_train/rejected": -4.948684215545654, "step": 597 }, { "epoch": 1.65, "learning_rate": 1.8542912498530765e-07, "loss": 0.0274, "step": 598 }, { "epoch": 1.65, "logps_train/chosen": -70.74288177490234, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -155.40188598632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07604613900184631, "rewards_train/margins": 4.838116496801376, "rewards_train/rejected": -4.914162635803223, "step": 598 }, { "epoch": 1.65, "learning_rate": 1.8482314981498532e-07, "loss": 0.047, "step": 599 }, { "epoch": 1.65, "logps_train/chosen": -66.38411712646484, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -154.96253967285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09576807171106339, "rewards_train/margins": 5.034209825098515, "rewards_train/rejected": -4.938441753387451, "step": 599 }, { "epoch": 1.65, "learning_rate": 1.8421731474525846e-07, "loss": 0.0391, "step": 600 }, { "epoch": 1.65, "logps_train/chosen": -66.82028198242188, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -154.81939697265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35923147201538086, "rewards_train/margins": 5.3824782371521, "rewards_train/rejected": -5.023246765136719, "step": 600 }, { "epoch": 1.66, "learning_rate": 1.836116253687139e-07, "loss": 0.0153, "step": 601 }, { "epoch": 1.66, "logps_train/chosen": -66.31555938720703, "logps_train/ref_chosen": -65.625, "logps_train/ref_rejected": -99.25, "logps_train/rejected": -145.79156494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06690751761198044, "rewards_train/margins": 4.584416754543781, "rewards_train/rejected": -4.651324272155762, "step": 601 }, { "epoch": 1.66, "learning_rate": 1.830060872765936e-07, "loss": 0.0563, "step": 602 }, { "epoch": 1.66, "logps_train/chosen": -67.24017333984375, "logps_train/ref_chosen": -67.5, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -151.50149536132812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.026470743119716644, "rewards_train/margins": 4.733163438737392, "rewards_train/rejected": -4.706692695617676, "step": 602 }, { "epoch": 1.66, "learning_rate": 1.8240070605874315e-07, "loss": 0.0469, "step": 603 }, { "epoch": 1.66, "logps_train/chosen": -67.59569549560547, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -99.9375, "logps_train/rejected": -146.8134765625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.029590528458356857, "rewards_train/margins": 4.716358583420515, "rewards_train/rejected": -4.686768054962158, "step": 603 }, { "epoch": 1.66, "learning_rate": 1.817954873035598e-07, "loss": 0.0656, "step": 604 }, { "epoch": 1.66, "logps_train/chosen": -67.47167205810547, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -153.41482543945312, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.16562503576278687, "rewards_train/margins": 5.014237344264984, "rewards_train/rejected": -4.848612308502197, "step": 604 }, { "epoch": 1.67, "learning_rate": 1.8119043659794117e-07, "loss": 0.0525, "step": 605 }, { "epoch": 1.67, "logps_train/chosen": -72.03549194335938, "logps_train/ref_chosen": -72.875, "logps_train/ref_rejected": -106.875, "logps_train/rejected": -156.07630920410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08419498801231384, "rewards_train/margins": 5.007158130407333, "rewards_train/rejected": -4.9229631423950195, "step": 605 }, { "epoch": 1.67, "learning_rate": 1.8058555952723348e-07, "loss": 0.0468, "step": 606 }, { "epoch": 1.67, "logps_train/chosen": -66.6108169555664, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -111.0625, "logps_train/rejected": -164.11416625976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.385549396276474, "rewards_train/margins": 5.6885689198970795, "rewards_train/rejected": -5.3030195236206055, "step": 606 }, { "epoch": 1.67, "learning_rate": 1.799808616751803e-07, "loss": 0.0157, "step": 607 }, { "epoch": 1.67, "logps_train/chosen": -65.29801940917969, "logps_train/ref_chosen": -64.6875, "logps_train/ref_rejected": -95.625, "logps_train/rejected": -141.98545837402344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.062174633145332336, "rewards_train/margins": 4.576800510287285, "rewards_train/rejected": -4.638975143432617, "step": 607 }, { "epoch": 1.67, "learning_rate": 1.793763486238707e-07, "loss": 0.0576, "step": 608 }, { "epoch": 1.67, "logps_train/chosen": -68.87055969238281, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -153.54226684570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0659225657582283, "rewards_train/margins": 4.991049595177174, "rewards_train/rejected": -4.925127029418945, "step": 608 }, { "epoch": 1.68, "learning_rate": 1.7877202595368762e-07, "loss": 0.0431, "step": 609 }, { "epoch": 1.68, "logps_train/chosen": -68.72001647949219, "logps_train/ref_chosen": -72.625, "logps_train/ref_rejected": -108.9375, "logps_train/rejected": -161.05560302734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38781261444091797, "rewards_train/margins": 5.602357864379883, "rewards_train/rejected": -5.214545249938965, "step": 609 }, { "epoch": 1.68, "learning_rate": 1.781678992432569e-07, "loss": 0.015, "step": 610 }, { "epoch": 1.68, "logps_train/chosen": -65.04176330566406, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -154.9681396484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43102923035621643, "rewards_train/margins": 5.321985155344009, "rewards_train/rejected": -4.890955924987793, "step": 610 }, { "epoch": 1.68, "learning_rate": 1.775639740693951e-07, "loss": 0.0315, "step": 611 }, { "epoch": 1.68, "logps_train/chosen": -68.51319885253906, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -153.91441345214844, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.17104355990886688, "rewards_train/margins": 5.2154151648283005, "rewards_train/rejected": -5.044371604919434, "step": 611 }, { "epoch": 1.69, "learning_rate": 1.7696025600705858e-07, "loss": 0.0417, "step": 612 }, { "epoch": 1.69, "logps_train/chosen": -66.7509765625, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -150.87060546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18691468238830566, "rewards_train/margins": 4.937648057937622, "rewards_train/rejected": -4.750733375549316, "step": 612 }, { "epoch": 1.69, "learning_rate": 1.763567506292918e-07, "loss": 0.0447, "step": 613 }, { "epoch": 1.69, "logps_train/chosen": -67.93217468261719, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -149.592041015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1022413969039917, "rewards_train/margins": 4.793965220451355, "rewards_train/rejected": -4.691723823547363, "step": 613 }, { "epoch": 1.69, "learning_rate": 1.7575346350717576e-07, "loss": 0.0585, "step": 614 }, { "epoch": 1.69, "logps_train/chosen": -68.40090942382812, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -108.3125, "logps_train/rejected": -158.2354278564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10805407166481018, "rewards_train/margins": 5.098198801279068, "rewards_train/rejected": -4.990144729614258, "step": 614 }, { "epoch": 1.69, "learning_rate": 1.7515040020977668e-07, "loss": 0.0421, "step": 615 }, { "epoch": 1.69, "logps_train/chosen": -66.92945098876953, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -152.22735595703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26433029770851135, "rewards_train/margins": 5.1003455221652985, "rewards_train/rejected": -4.836015224456787, "step": 615 }, { "epoch": 1.7, "learning_rate": 1.745475663040949e-07, "loss": 0.0359, "step": 616 }, { "epoch": 1.7, "logps_train/chosen": -67.68791198730469, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -155.65744018554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17769290506839752, "rewards_train/margins": 5.173711314797401, "rewards_train/rejected": -4.996018409729004, "step": 616 }, { "epoch": 1.7, "learning_rate": 1.7394496735501287e-07, "loss": 0.0319, "step": 617 }, { "epoch": 1.7, "logps_train/chosen": -64.89759826660156, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -154.00180053710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24788661301136017, "rewards_train/margins": 5.289277985692024, "rewards_train/rejected": -5.041391372680664, "step": 617 }, { "epoch": 1.7, "learning_rate": 1.733426089252443e-07, "loss": 0.0401, "step": 618 }, { "epoch": 1.7, "logps_train/chosen": -69.88397979736328, "logps_train/ref_chosen": -72.0625, "logps_train/ref_rejected": -109.375, "logps_train/rejected": -161.08572387695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21550817787647247, "rewards_train/margins": 5.388240531086922, "rewards_train/rejected": -5.172732353210449, "step": 618 }, { "epoch": 1.71, "learning_rate": 1.7274049657528233e-07, "loss": 0.026, "step": 619 }, { "epoch": 1.71, "logps_train/chosen": -70.68059539794922, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -105.1875, "logps_train/rejected": -154.58151245117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10669669508934021, "rewards_train/margins": 5.046001046895981, "rewards_train/rejected": -4.939304351806641, "step": 619 }, { "epoch": 1.71, "learning_rate": 1.72138635863349e-07, "loss": 0.0341, "step": 620 }, { "epoch": 1.71, "logps_train/chosen": -69.17831420898438, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -159.5664825439453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22611397504806519, "rewards_train/margins": 5.36757630109787, "rewards_train/rejected": -5.141462326049805, "step": 620 }, { "epoch": 1.71, "learning_rate": 1.7153703234534298e-07, "loss": 0.0243, "step": 621 }, { "epoch": 1.71, "logps_train/chosen": -67.41352844238281, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -101.0625, "logps_train/rejected": -147.98678588867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11665550619363785, "rewards_train/margins": 4.808108486235142, "rewards_train/rejected": -4.691452980041504, "step": 621 }, { "epoch": 1.71, "learning_rate": 1.709356915747889e-07, "loss": 0.0343, "step": 622 }, { "epoch": 1.71, "logps_train/chosen": -66.41160583496094, "logps_train/ref_chosen": -66.875, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -149.49655151367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04555816203355789, "rewards_train/margins": 4.7286111786961555, "rewards_train/rejected": -4.683053016662598, "step": 622 }, { "epoch": 1.72, "learning_rate": 1.703346191027859e-07, "loss": 0.0565, "step": 623 }, { "epoch": 1.72, "logps_train/chosen": -68.77906799316406, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -153.89395141601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22072641551494598, "rewards_train/margins": 5.1057280749082565, "rewards_train/rejected": -4.8850016593933105, "step": 623 }, { "epoch": 1.72, "learning_rate": 1.6973382047795638e-07, "loss": 0.0391, "step": 624 }, { "epoch": 1.72, "logps_train/chosen": -67.4591064453125, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -155.97787475585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20902064442634583, "rewards_train/margins": 5.322530776262283, "rewards_train/rejected": -5.1135101318359375, "step": 624 }, { "epoch": 1.72, "learning_rate": 1.6913330124639483e-07, "loss": 0.0295, "step": 625 }, { "epoch": 1.72, "logps_train/chosen": -65.7410888671875, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -151.6112060546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.272327184677124, "rewards_train/margins": 5.267822027206421, "rewards_train/rejected": -4.995494842529297, "step": 625 }, { "epoch": 1.72, "learning_rate": 1.6853306695161668e-07, "loss": 0.0257, "step": 626 }, { "epoch": 1.72, "logps_train/chosen": -68.141357421875, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -154.33074951171875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.18156763911247253, "rewards_train/margins": 5.175092190504074, "rewards_train/rejected": -4.993524551391602, "step": 626 }, { "epoch": 1.73, "learning_rate": 1.6793312313450687e-07, "loss": 0.048, "step": 627 }, { "epoch": 1.73, "logps_train/chosen": -62.76592254638672, "logps_train/ref_chosen": -66.875, "logps_train/ref_rejected": -99.375, "logps_train/rejected": -150.01548767089844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41149383783340454, "rewards_train/margins": 5.4778857827186584, "rewards_train/rejected": -5.066391944885254, "step": 627 }, { "epoch": 1.73, "learning_rate": 1.673334753332689e-07, "loss": 0.0182, "step": 628 }, { "epoch": 1.73, "logps_train/chosen": -66.07769012451172, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -151.58436584472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32216256856918335, "rewards_train/margins": 5.256526410579681, "rewards_train/rejected": -4.934363842010498, "step": 628 }, { "epoch": 1.73, "learning_rate": 1.66734129083374e-07, "loss": 0.0343, "step": 629 }, { "epoch": 1.73, "logps_train/chosen": -66.43489074707031, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -105.4375, "logps_train/rejected": -154.22369384765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2425459772348404, "rewards_train/margins": 5.121361628174782, "rewards_train/rejected": -4.878815650939941, "step": 629 }, { "epoch": 1.74, "learning_rate": 1.661350899175093e-07, "loss": 0.0306, "step": 630 }, { "epoch": 1.74, "logps_train/chosen": -68.18148803710938, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -157.4860076904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26861923933029175, "rewards_train/margins": 5.322103202342987, "rewards_train/rejected": -5.053483963012695, "step": 630 }, { "epoch": 1.74, "learning_rate": 1.6553636336552738e-07, "loss": 0.0261, "step": 631 }, { "epoch": 1.74, "logps_train/chosen": -66.84745788574219, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -153.12570190429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10031276196241379, "rewards_train/margins": 5.162296824157238, "rewards_train/rejected": -5.061984062194824, "step": 631 }, { "epoch": 1.74, "learning_rate": 1.6493795495439497e-07, "loss": 0.0418, "step": 632 }, { "epoch": 1.74, "logps_train/chosen": -67.9469223022461, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -156.2662811279297, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.16961507499217987, "rewards_train/margins": 5.1958029717206955, "rewards_train/rejected": -5.026187896728516, "step": 632 }, { "epoch": 1.74, "learning_rate": 1.643398702081421e-07, "loss": 0.044, "step": 633 }, { "epoch": 1.74, "logps_train/chosen": -67.54681396484375, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -155.93218994140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2848207354545593, "rewards_train/margins": 5.290831744670868, "rewards_train/rejected": -5.006011009216309, "step": 633 }, { "epoch": 1.75, "learning_rate": 1.6374211464781085e-07, "loss": 0.0331, "step": 634 }, { "epoch": 1.75, "logps_train/chosen": -68.85943603515625, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -157.91734313964844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21171268820762634, "rewards_train/margins": 5.380791515111923, "rewards_train/rejected": -5.169078826904297, "step": 634 }, { "epoch": 1.75, "learning_rate": 1.6314469379140448e-07, "loss": 0.031, "step": 635 }, { "epoch": 1.75, "logps_train/chosen": -70.56250762939453, "logps_train/ref_chosen": -71.875, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -152.93521118164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1307607740163803, "rewards_train/margins": 4.956215485930443, "rewards_train/rejected": -4.8254547119140625, "step": 635 }, { "epoch": 1.75, "learning_rate": 1.6254761315383665e-07, "loss": 0.0497, "step": 636 }, { "epoch": 1.75, "logps_train/chosen": -70.68528747558594, "logps_train/ref_chosen": -73.125, "logps_train/ref_rejected": -112.6875, "logps_train/rejected": -168.46441650390625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2439224272966385, "rewards_train/margins": 5.822883799672127, "rewards_train/rejected": -5.578961372375488, "step": 636 }, { "epoch": 1.75, "learning_rate": 1.6195087824688038e-07, "loss": 0.0236, "step": 637 }, { "epoch": 1.75, "logps_train/chosen": -68.55054473876953, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -154.02542114257812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10724980384111404, "rewards_train/margins": 5.10617882758379, "rewards_train/rejected": -4.998929023742676, "step": 637 }, { "epoch": 1.76, "learning_rate": 1.6135449457911708e-07, "loss": 0.0274, "step": 638 }, { "epoch": 1.76, "logps_train/chosen": -66.96701049804688, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -154.5872344970703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19626794755458832, "rewards_train/margins": 5.192978724837303, "rewards_train/rejected": -4.996710777282715, "step": 638 }, { "epoch": 1.76, "learning_rate": 1.6075846765588592e-07, "loss": 0.0449, "step": 639 }, { "epoch": 1.76, "logps_train/chosen": -67.77047729492188, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -101.0625, "logps_train/rejected": -149.1805419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08667309582233429, "rewards_train/margins": 4.901211574673653, "rewards_train/rejected": -4.814538478851318, "step": 639 }, { "epoch": 1.76, "learning_rate": 1.6016280297923282e-07, "loss": 0.0638, "step": 640 }, { "epoch": 1.76, "logps_train/chosen": -65.9714126586914, "logps_train/ref_chosen": -66.75, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -149.30638122558594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07341481000185013, "rewards_train/margins": 4.831152446568012, "rewards_train/rejected": -4.757737636566162, "step": 640 }, { "epoch": 1.77, "learning_rate": 1.5956750604785972e-07, "loss": 0.0521, "step": 641 }, { "epoch": 1.77, "logps_train/chosen": -68.8818130493164, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -158.82928466796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2417500913143158, "rewards_train/margins": 5.534443706274033, "rewards_train/rejected": -5.292693614959717, "step": 641 }, { "epoch": 1.77, "learning_rate": 1.5897258235707375e-07, "loss": 0.0215, "step": 642 }, { "epoch": 1.77, "logps_train/chosen": -69.52482604980469, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -152.6689453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19922664761543274, "rewards_train/margins": 5.11519268155098, "rewards_train/rejected": -4.915966033935547, "step": 642 }, { "epoch": 1.77, "learning_rate": 1.5837803739873686e-07, "loss": 0.037, "step": 643 }, { "epoch": 1.77, "logps_train/chosen": -65.5746841430664, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -154.06985473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31836172938346863, "rewards_train/margins": 5.334527462720871, "rewards_train/rejected": -5.016165733337402, "step": 643 }, { "epoch": 1.77, "learning_rate": 1.5778387666121457e-07, "loss": 0.0308, "step": 644 }, { "epoch": 1.77, "logps_train/chosen": -66.72906494140625, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -155.55471801757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18578557670116425, "rewards_train/margins": 5.134714886546135, "rewards_train/rejected": -4.948929309844971, "step": 644 }, { "epoch": 1.78, "learning_rate": 1.5719010562932563e-07, "loss": 0.0438, "step": 645 }, { "epoch": 1.78, "logps_train/chosen": -68.69331359863281, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -155.39971923828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17417441308498383, "rewards_train/margins": 5.116588220000267, "rewards_train/rejected": -4.942413806915283, "step": 645 }, { "epoch": 1.78, "learning_rate": 1.5659672978429145e-07, "loss": 0.0286, "step": 646 }, { "epoch": 1.78, "logps_train/chosen": -66.33013916015625, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -101.4375, "logps_train/rejected": -150.15916442871094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19022829830646515, "rewards_train/margins": 5.064102962613106, "rewards_train/rejected": -4.873874664306641, "step": 646 }, { "epoch": 1.78, "learning_rate": 1.5600375460368534e-07, "loss": 0.0365, "step": 647 }, { "epoch": 1.78, "logps_train/chosen": -67.93759155273438, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -107.5625, "logps_train/rejected": -158.9217071533203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12884798645973206, "rewards_train/margins": 5.265062242746353, "rewards_train/rejected": -5.136214256286621, "step": 647 }, { "epoch": 1.79, "learning_rate": 1.5541118556138183e-07, "loss": 0.032, "step": 648 }, { "epoch": 1.79, "logps_train/chosen": -68.27298736572266, "logps_train/ref_chosen": -72.3125, "logps_train/ref_rejected": -109.375, "logps_train/rejected": -159.85726928710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.402730792760849, "rewards_train/margins": 5.453107684850693, "rewards_train/rejected": -5.050376892089844, "step": 648 }, { "epoch": 1.79, "learning_rate": 1.5481902812750665e-07, "loss": 0.0219, "step": 649 }, { "epoch": 1.79, "logps_train/chosen": -66.46004486083984, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -154.86102294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28456154465675354, "rewards_train/margins": 5.299179941415787, "rewards_train/rejected": -5.014618396759033, "step": 649 }, { "epoch": 1.79, "learning_rate": 1.5422728776838557e-07, "loss": 0.041, "step": 650 }, { "epoch": 1.79, "logps_train/chosen": -69.22164916992188, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -152.2090301513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05591179057955742, "rewards_train/margins": 4.924177896231413, "rewards_train/rejected": -4.8682661056518555, "step": 650 }, { "epoch": 1.79, "learning_rate": 1.5363596994649432e-07, "loss": 0.0452, "step": 651 }, { "epoch": 1.79, "logps_train/chosen": -63.732093811035156, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -153.44186401367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47908511757850647, "rewards_train/margins": 5.583331257104874, "rewards_train/rejected": -5.104246139526367, "step": 651 }, { "epoch": 1.8, "learning_rate": 1.530450801204083e-07, "loss": 0.0176, "step": 652 }, { "epoch": 1.8, "logps_train/chosen": -66.20587158203125, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -150.32037353515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22785049676895142, "rewards_train/margins": 5.033861696720123, "rewards_train/rejected": -4.806011199951172, "step": 652 }, { "epoch": 1.8, "learning_rate": 1.5245462374475177e-07, "loss": 0.0447, "step": 653 }, { "epoch": 1.8, "logps_train/chosen": -66.40668487548828, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -156.1258544921875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.36777883768081665, "rewards_train/margins": 5.38974004983902, "rewards_train/rejected": -5.021961212158203, "step": 653 }, { "epoch": 1.8, "learning_rate": 1.518646062701478e-07, "loss": 0.0277, "step": 654 }, { "epoch": 1.8, "logps_train/chosen": -70.05567169189453, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -154.43612670898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.014551609754562378, "rewards_train/margins": 5.176229029893875, "rewards_train/rejected": -5.1907806396484375, "step": 654 }, { "epoch": 1.8, "learning_rate": 1.512750331431678e-07, "loss": 0.0402, "step": 655 }, { "epoch": 1.8, "logps_train/chosen": -63.99253463745117, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -155.39747619628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43941840529441833, "rewards_train/margins": 5.520181268453598, "rewards_train/rejected": -5.08076286315918, "step": 655 }, { "epoch": 1.81, "learning_rate": 1.5068590980628155e-07, "loss": 0.0249, "step": 656 }, { "epoch": 1.81, "logps_train/chosen": -67.2586669921875, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -156.755615234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2429322600364685, "rewards_train/margins": 5.434509694576263, "rewards_train/rejected": -5.191577434539795, "step": 656 }, { "epoch": 1.81, "learning_rate": 1.5009724169780652e-07, "loss": 0.0235, "step": 657 }, { "epoch": 1.81, "logps_train/chosen": -67.5926284790039, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -162.0375518798828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3225734531879425, "rewards_train/margins": 5.676963597536087, "rewards_train/rejected": -5.3543901443481445, "step": 657 }, { "epoch": 1.81, "learning_rate": 1.4950903425185782e-07, "loss": 0.0285, "step": 658 }, { "epoch": 1.81, "logps_train/chosen": -69.58660888671875, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -160.1326446533203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07888771593570709, "rewards_train/margins": 5.253577962517738, "rewards_train/rejected": -5.174690246582031, "step": 658 }, { "epoch": 1.82, "learning_rate": 1.489212928982983e-07, "loss": 0.0418, "step": 659 }, { "epoch": 1.82, "logps_train/chosen": -68.12442016601562, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -153.74026489257812, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.17276215553283691, "rewards_train/margins": 5.152648210525513, "rewards_train/rejected": -4.979886054992676, "step": 659 }, { "epoch": 1.82, "learning_rate": 1.4833402306268815e-07, "loss": 0.0534, "step": 660 }, { "epoch": 1.82, "logps_train/chosen": -66.64434814453125, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -154.85806274414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22501827776432037, "rewards_train/margins": 5.255160108208656, "rewards_train/rejected": -5.030141830444336, "step": 660 }, { "epoch": 1.82, "learning_rate": 1.4774723016623468e-07, "loss": 0.0295, "step": 661 }, { "epoch": 1.82, "logps_train/chosen": -66.07440185546875, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -149.96253967285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1659487932920456, "rewards_train/margins": 4.951167985796928, "rewards_train/rejected": -4.785219192504883, "step": 661 }, { "epoch": 1.82, "learning_rate": 1.4716091962574278e-07, "loss": 0.0419, "step": 662 }, { "epoch": 1.82, "logps_train/chosen": -64.60162353515625, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -153.42678833007812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4506775438785553, "rewards_train/margins": 5.549998432397842, "rewards_train/rejected": -5.099320888519287, "step": 662 }, { "epoch": 1.83, "learning_rate": 1.4657509685356445e-07, "loss": 0.014, "step": 663 }, { "epoch": 1.83, "logps_train/chosen": -66.431884765625, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -159.69699096679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31042489409446716, "rewards_train/margins": 5.597799390554428, "rewards_train/rejected": -5.287374496459961, "step": 663 }, { "epoch": 1.83, "learning_rate": 1.4598976725754897e-07, "loss": 0.0203, "step": 664 }, { "epoch": 1.83, "logps_train/chosen": -64.82017517089844, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -154.09164428710938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.261976957321167, "rewards_train/margins": 5.432175397872925, "rewards_train/rejected": -5.170198440551758, "step": 664 }, { "epoch": 1.83, "learning_rate": 1.4540493624099313e-07, "loss": 0.0335, "step": 665 }, { "epoch": 1.83, "logps_train/chosen": -64.81145477294922, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -154.10226440429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2285228669643402, "rewards_train/margins": 5.2946105897426605, "rewards_train/rejected": -5.06608772277832, "step": 665 }, { "epoch": 1.83, "learning_rate": 1.4482060920259112e-07, "loss": 0.0379, "step": 666 }, { "epoch": 1.83, "logps_train/chosen": -68.55516052246094, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -109.4375, "logps_train/rejected": -162.34706115722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32226741313934326, "rewards_train/margins": 5.61078155040741, "rewards_train/rejected": -5.288514137268066, "step": 666 }, { "epoch": 1.84, "learning_rate": 1.442367915363848e-07, "loss": 0.0287, "step": 667 }, { "epoch": 1.84, "logps_train/chosen": -65.55398559570312, "logps_train/ref_chosen": -66.9375, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -152.53945922851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13810738921165466, "rewards_train/margins": 5.209730714559555, "rewards_train/rejected": -5.0716233253479, "step": 667 }, { "epoch": 1.84, "learning_rate": 1.4365348863171404e-07, "loss": 0.0464, "step": 668 }, { "epoch": 1.84, "logps_train/chosen": -69.75843811035156, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -109.9375, "logps_train/rejected": -162.87734985351562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17781856656074524, "rewards_train/margins": 5.473951131105423, "rewards_train/rejected": -5.296132564544678, "step": 668 }, { "epoch": 1.84, "learning_rate": 1.4307070587316684e-07, "loss": 0.0228, "step": 669 }, { "epoch": 1.84, "logps_train/chosen": -66.94766998291016, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -151.16619873046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12012533843517303, "rewards_train/margins": 4.926003023982048, "rewards_train/rejected": -4.805877685546875, "step": 669 }, { "epoch": 1.85, "learning_rate": 1.4248844864052937e-07, "loss": 0.048, "step": 670 }, { "epoch": 1.85, "logps_train/chosen": -65.20796203613281, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -154.52940368652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3284708261489868, "rewards_train/margins": 5.324184060096741, "rewards_train/rejected": -4.995713233947754, "step": 670 }, { "epoch": 1.85, "learning_rate": 1.419067223087368e-07, "loss": 0.0259, "step": 671 }, { "epoch": 1.85, "logps_train/chosen": -68.08456420898438, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -158.35867309570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23197366297245026, "rewards_train/margins": 5.516963019967079, "rewards_train/rejected": -5.284989356994629, "step": 671 }, { "epoch": 1.85, "learning_rate": 1.413255322478235e-07, "loss": 0.0341, "step": 672 }, { "epoch": 1.85, "logps_train/chosen": -68.64926147460938, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -159.49928283691406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21827656030654907, "rewards_train/margins": 5.554827272891998, "rewards_train/rejected": -5.336550712585449, "step": 672 }, { "epoch": 1.85, "learning_rate": 1.4074488382287322e-07, "loss": 0.0239, "step": 673 }, { "epoch": 1.85, "logps_train/chosen": -68.15715026855469, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -155.8306121826172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17437249422073364, "rewards_train/margins": 5.188195049762726, "rewards_train/rejected": -5.013822555541992, "step": 673 }, { "epoch": 1.86, "learning_rate": 1.4016478239396978e-07, "loss": 0.0519, "step": 674 }, { "epoch": 1.86, "logps_train/chosen": -68.92044067382812, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -161.84632873535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16142329573631287, "rewards_train/margins": 5.494298070669174, "rewards_train/rejected": -5.332874774932861, "step": 674 }, { "epoch": 1.86, "learning_rate": 1.3958523331614776e-07, "loss": 0.0375, "step": 675 }, { "epoch": 1.86, "logps_train/chosen": -67.187255859375, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -156.73077392578125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.26149922609329224, "rewards_train/margins": 5.380377113819122, "rewards_train/rejected": -5.11887788772583, "step": 675 }, { "epoch": 1.86, "learning_rate": 1.3900624193934265e-07, "loss": 0.0356, "step": 676 }, { "epoch": 1.86, "logps_train/chosen": -69.04873657226562, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -157.7906494140625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.22730404138565063, "rewards_train/margins": 5.512033998966217, "rewards_train/rejected": -5.284729957580566, "step": 676 }, { "epoch": 1.87, "learning_rate": 1.3842781360834183e-07, "loss": 0.0316, "step": 677 }, { "epoch": 1.87, "logps_train/chosen": -66.90245819091797, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -157.7968292236328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34847474098205566, "rewards_train/margins": 5.623079538345337, "rewards_train/rejected": -5.274604797363281, "step": 677 }, { "epoch": 1.87, "learning_rate": 1.3784995366273498e-07, "loss": 0.0164, "step": 678 }, { "epoch": 1.87, "logps_train/chosen": -66.83355712890625, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -153.55908203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2940370440483093, "rewards_train/margins": 5.2452585101127625, "rewards_train/rejected": -4.951221466064453, "step": 678 }, { "epoch": 1.87, "learning_rate": 1.37272667436865e-07, "loss": 0.0323, "step": 679 }, { "epoch": 1.87, "logps_train/chosen": -68.21736907958984, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -158.65133666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1928144097328186, "rewards_train/margins": 5.420154631137848, "rewards_train/rejected": -5.227340221405029, "step": 679 }, { "epoch": 1.87, "learning_rate": 1.3669596025977863e-07, "loss": 0.0238, "step": 680 }, { "epoch": 1.87, "logps_train/chosen": -66.30073547363281, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -107.5625, "logps_train/rejected": -159.83924865722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4546433985233307, "rewards_train/margins": 5.679779678583145, "rewards_train/rejected": -5.2251362800598145, "step": 680 }, { "epoch": 1.88, "learning_rate": 1.361198374551773e-07, "loss": 0.0127, "step": 681 }, { "epoch": 1.88, "logps_train/chosen": -68.89916229248047, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -155.60401916503906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.215796560049057, "rewards_train/margins": 5.368239492177963, "rewards_train/rejected": -5.152442932128906, "step": 681 }, { "epoch": 1.88, "learning_rate": 1.3554430434136794e-07, "loss": 0.0304, "step": 682 }, { "epoch": 1.88, "logps_train/chosen": -66.52230834960938, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -103.25, "logps_train/rejected": -154.01437377929688, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.21837469935417175, "rewards_train/margins": 5.292663067579269, "rewards_train/rejected": -5.074288368225098, "step": 682 }, { "epoch": 1.88, "learning_rate": 1.34969366231214e-07, "loss": 0.0483, "step": 683 }, { "epoch": 1.88, "logps_train/chosen": -66.08643341064453, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -154.66793823242188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22504760324954987, "rewards_train/margins": 5.450629726052284, "rewards_train/rejected": -5.225582122802734, "step": 683 }, { "epoch": 1.88, "learning_rate": 1.3439502843208617e-07, "loss": 0.0326, "step": 684 }, { "epoch": 1.88, "logps_train/chosen": -68.8004150390625, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -157.05950927734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.023205885663628578, "rewards_train/margins": 5.123028626665473, "rewards_train/rejected": -5.146234512329102, "step": 684 }, { "epoch": 1.89, "learning_rate": 1.3382129624581378e-07, "loss": 0.052, "step": 685 }, { "epoch": 1.89, "logps_train/chosen": -65.67825317382812, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -156.79147338867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3124484717845917, "rewards_train/margins": 5.5236272513866425, "rewards_train/rejected": -5.211178779602051, "step": 685 }, { "epoch": 1.89, "learning_rate": 1.3324817496863548e-07, "loss": 0.0302, "step": 686 }, { "epoch": 1.89, "logps_train/chosen": -67.13638305664062, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -101.5625, "logps_train/rejected": -150.29537963867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17376360297203064, "rewards_train/margins": 5.045098632574081, "rewards_train/rejected": -4.871335029602051, "step": 686 }, { "epoch": 1.89, "learning_rate": 1.3267566989115043e-07, "loss": 0.0388, "step": 687 }, { "epoch": 1.89, "logps_train/chosen": -66.1181869506836, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -160.791748046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.375485360622406, "rewards_train/margins": 5.606514871120453, "rewards_train/rejected": -5.231029510498047, "step": 687 }, { "epoch": 1.9, "learning_rate": 1.3210378629826976e-07, "loss": 0.0174, "step": 688 }, { "epoch": 1.9, "logps_train/chosen": -67.21061706542969, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -152.86993408203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0927075445652008, "rewards_train/margins": 4.987269788980484, "rewards_train/rejected": -4.894562244415283, "step": 688 }, { "epoch": 1.9, "learning_rate": 1.315325294691673e-07, "loss": 0.0492, "step": 689 }, { "epoch": 1.9, "logps_train/chosen": -66.3368911743164, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -152.84999084472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14609605073928833, "rewards_train/margins": 5.159463703632355, "rewards_train/rejected": -5.013367652893066, "step": 689 }, { "epoch": 1.9, "learning_rate": 1.3096190467723122e-07, "loss": 0.0453, "step": 690 }, { "epoch": 1.9, "logps_train/chosen": -68.64492797851562, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -158.117919921875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2024511694908142, "rewards_train/margins": 5.321862637996674, "rewards_train/rejected": -5.119411468505859, "step": 690 }, { "epoch": 1.9, "learning_rate": 1.303919171900151e-07, "loss": 0.037, "step": 691 }, { "epoch": 1.9, "logps_train/chosen": -68.42428588867188, "logps_train/ref_chosen": -70.8125, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -152.22137451171875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.23974891006946564, "rewards_train/margins": 5.167354539036751, "rewards_train/rejected": -4.927605628967285, "step": 691 }, { "epoch": 1.91, "learning_rate": 1.2982257226918974e-07, "loss": 0.0543, "step": 692 }, { "epoch": 1.91, "logps_train/chosen": -65.11691284179688, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -155.6824188232422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33518409729003906, "rewards_train/margins": 5.475496292114258, "rewards_train/rejected": -5.140312194824219, "step": 692 }, { "epoch": 1.91, "learning_rate": 1.2925387517049397e-07, "loss": 0.0263, "step": 693 }, { "epoch": 1.91, "logps_train/chosen": -69.20176696777344, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -155.97244262695312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11932533234357834, "rewards_train/margins": 5.273502998054028, "rewards_train/rejected": -5.154177665710449, "step": 693 }, { "epoch": 1.91, "learning_rate": 1.2868583114368645e-07, "loss": 0.0342, "step": 694 }, { "epoch": 1.91, "logps_train/chosen": -68.79548645019531, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -155.31800842285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2086845189332962, "rewards_train/margins": 5.276130273938179, "rewards_train/rejected": -5.067445755004883, "step": 694 }, { "epoch": 1.91, "learning_rate": 1.2811844543249747e-07, "loss": 0.0367, "step": 695 }, { "epoch": 1.91, "logps_train/chosen": -67.7977294921875, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -157.3505096435547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20528598129749298, "rewards_train/margins": 5.605472519993782, "rewards_train/rejected": -5.400186538696289, "step": 695 }, { "epoch": 1.92, "learning_rate": 1.2755172327457997e-07, "loss": 0.0325, "step": 696 }, { "epoch": 1.92, "logps_train/chosen": -66.20695495605469, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -155.7821044921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.202717125415802, "rewards_train/margins": 5.389521896839142, "rewards_train/rejected": -5.18680477142334, "step": 696 }, { "epoch": 1.92, "learning_rate": 1.269856699014616e-07, "loss": 0.0349, "step": 697 }, { "epoch": 1.92, "logps_train/chosen": -67.29386901855469, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -102.1875, "logps_train/rejected": -153.4443359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25591665506362915, "rewards_train/margins": 5.381405889987946, "rewards_train/rejected": -5.125489234924316, "step": 697 }, { "epoch": 1.92, "learning_rate": 1.264202905384965e-07, "loss": 0.0437, "step": 698 }, { "epoch": 1.92, "logps_train/chosen": -64.83659362792969, "logps_train/ref_chosen": -66.5625, "logps_train/ref_rejected": -99.75, "logps_train/rejected": -151.36622619628906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1711745709180832, "rewards_train/margins": 5.329769179224968, "rewards_train/rejected": -5.158594608306885, "step": 698 }, { "epoch": 1.93, "learning_rate": 1.2585559040481664e-07, "loss": 0.0354, "step": 699 }, { "epoch": 1.93, "logps_train/chosen": -67.01655578613281, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -153.96841430664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11616604775190353, "rewards_train/margins": 5.267988137900829, "rewards_train/rejected": -5.151822090148926, "step": 699 }, { "epoch": 1.93, "learning_rate": 1.2529157471328388e-07, "loss": 0.0459, "step": 700 }, { "epoch": 1.93, "logps_train/chosen": -66.21332550048828, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -158.45706176757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31318914890289307, "rewards_train/margins": 5.521004319190979, "rewards_train/rejected": -5.207815170288086, "step": 700 }, { "epoch": 1.93, "learning_rate": 1.2472824867044212e-07, "loss": 0.0264, "step": 701 }, { "epoch": 1.93, "logps_train/chosen": -69.52742004394531, "logps_train/ref_chosen": -73.5, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -159.56651306152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.39808785915374756, "rewards_train/margins": 5.564993262290955, "rewards_train/rejected": -5.166905403137207, "step": 701 }, { "epoch": 1.93, "learning_rate": 1.2416561747646875e-07, "loss": 0.0261, "step": 702 }, { "epoch": 1.93, "logps_train/chosen": -67.85823059082031, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -155.56272888183594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3498213291168213, "rewards_train/margins": 5.2714269161224365, "rewards_train/rejected": -4.921605587005615, "step": 702 }, { "epoch": 1.94, "learning_rate": 1.2360368632512687e-07, "loss": 0.0332, "step": 703 }, { "epoch": 1.94, "logps_train/chosen": -67.1106948852539, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -104.8125, "logps_train/rejected": -157.79937744140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29444801807403564, "rewards_train/margins": 5.591330170631409, "rewards_train/rejected": -5.296882152557373, "step": 703 }, { "epoch": 1.94, "learning_rate": 1.2304246040371737e-07, "loss": 0.0215, "step": 704 }, { "epoch": 1.94, "logps_train/chosen": -67.25715637207031, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -158.6771697998047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3158857524394989, "rewards_train/margins": 5.560897082090378, "rewards_train/rejected": -5.245011329650879, "step": 704 }, { "epoch": 1.94, "learning_rate": 1.2248194489303116e-07, "loss": 0.0368, "step": 705 }, { "epoch": 1.94, "logps_train/chosen": -65.3885726928711, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -161.0911407470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4298441708087921, "rewards_train/margins": 5.749845772981644, "rewards_train/rejected": -5.320001602172852, "step": 705 }, { "epoch": 1.94, "learning_rate": 1.2192214496730106e-07, "loss": 0.013, "step": 706 }, { "epoch": 1.94, "logps_train/chosen": -68.89302825927734, "logps_train/ref_chosen": -72.75, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -163.0194091796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3838905394077301, "rewards_train/margins": 5.820402532815933, "rewards_train/rejected": -5.436511993408203, "step": 706 }, { "epoch": 1.95, "learning_rate": 1.213630657941541e-07, "loss": 0.015, "step": 707 }, { "epoch": 1.95, "logps_train/chosen": -67.93257141113281, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -156.6305694580078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1857956200838089, "rewards_train/margins": 5.3883045464754105, "rewards_train/rejected": -5.202508926391602, "step": 707 }, { "epoch": 1.95, "learning_rate": 1.2080471253456415e-07, "loss": 0.0375, "step": 708 }, { "epoch": 1.95, "logps_train/chosen": -65.13835906982422, "logps_train/ref_chosen": -66.125, "logps_train/ref_rejected": -101.75, "logps_train/rejected": -151.37620544433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09671102464199066, "rewards_train/margins": 5.061284884810448, "rewards_train/rejected": -4.964573860168457, "step": 708 }, { "epoch": 1.95, "learning_rate": 1.202470903428038e-07, "loss": 0.0453, "step": 709 }, { "epoch": 1.95, "logps_train/chosen": -66.22064208984375, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -159.99583435058594, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.36245760321617126, "rewards_train/margins": 5.78928741812706, "rewards_train/rejected": -5.426829814910889, "step": 709 }, { "epoch": 1.96, "learning_rate": 1.1969020436639703e-07, "loss": 0.0277, "step": 710 }, { "epoch": 1.96, "logps_train/chosen": -66.31906127929688, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -161.05633544921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24157997965812683, "rewards_train/margins": 5.675925225019455, "rewards_train/rejected": -5.434345245361328, "step": 710 }, { "epoch": 1.96, "learning_rate": 1.1913405974607185e-07, "loss": 0.0214, "step": 711 }, { "epoch": 1.96, "logps_train/chosen": -67.51802062988281, "logps_train/ref_chosen": -71.75, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -163.41209411621094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4213421940803528, "rewards_train/margins": 5.935892403125763, "rewards_train/rejected": -5.51455020904541, "step": 711 }, { "epoch": 1.96, "learning_rate": 1.1857866161571247e-07, "loss": 0.0111, "step": 712 }, { "epoch": 1.96, "logps_train/chosen": -66.44335174560547, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -156.16783142089844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38164132833480835, "rewards_train/margins": 5.722838342189789, "rewards_train/rejected": -5.3411970138549805, "step": 712 }, { "epoch": 1.96, "learning_rate": 1.1802401510231214e-07, "loss": 0.0197, "step": 713 }, { "epoch": 1.96, "logps_train/chosen": -67.3542709350586, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -157.60638427734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3030005991458893, "rewards_train/margins": 5.509731441736221, "rewards_train/rejected": -5.206730842590332, "step": 713 }, { "epoch": 1.97, "learning_rate": 1.1747012532592573e-07, "loss": 0.0278, "step": 714 }, { "epoch": 1.97, "logps_train/chosen": -68.0119400024414, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -155.99676513671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2148706465959549, "rewards_train/margins": 5.42831726372242, "rewards_train/rejected": -5.213446617126465, "step": 714 }, { "epoch": 1.97, "learning_rate": 1.1691699739962274e-07, "loss": 0.0278, "step": 715 }, { "epoch": 1.97, "logps_train/chosen": -66.28433227539062, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -156.32081604003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18768025934696198, "rewards_train/margins": 5.366832748055458, "rewards_train/rejected": -5.179152488708496, "step": 715 }, { "epoch": 1.97, "learning_rate": 1.1636463642943963e-07, "loss": 0.0313, "step": 716 }, { "epoch": 1.97, "logps_train/chosen": -66.2294921875, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -156.17750549316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14418922364711761, "rewards_train/margins": 5.46657882630825, "rewards_train/rejected": -5.322389602661133, "step": 716 }, { "epoch": 1.98, "learning_rate": 1.1581304751433303e-07, "loss": 0.0338, "step": 717 }, { "epoch": 1.98, "logps_train/chosen": -67.73876190185547, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -163.69989013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33432692289352417, "rewards_train/margins": 5.887031733989716, "rewards_train/rejected": -5.552704811096191, "step": 717 }, { "epoch": 1.98, "learning_rate": 1.1526223574613254e-07, "loss": 0.023, "step": 718 }, { "epoch": 1.98, "logps_train/chosen": -64.64842224121094, "logps_train/ref_chosen": -66.4375, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -153.2304229736328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18095825612545013, "rewards_train/margins": 5.244332775473595, "rewards_train/rejected": -5.0633745193481445, "step": 718 }, { "epoch": 1.98, "learning_rate": 1.1471220620949393e-07, "loss": 0.0311, "step": 719 }, { "epoch": 1.98, "logps_train/chosen": -67.86856079101562, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -153.21185302734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08951133489608765, "rewards_train/margins": 4.9358925223350525, "rewards_train/rejected": -4.846381187438965, "step": 719 }, { "epoch": 1.98, "learning_rate": 1.1416296398185163e-07, "loss": 0.0368, "step": 720 }, { "epoch": 1.98, "logps_train/chosen": -66.470947265625, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -161.264892578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4542238712310791, "rewards_train/margins": 5.8126490116119385, "rewards_train/rejected": -5.358425140380859, "step": 720 }, { "epoch": 1.99, "learning_rate": 1.1361451413337281e-07, "loss": 0.0171, "step": 721 }, { "epoch": 1.99, "logps_train/chosen": -68.20703887939453, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -159.54428100585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24863187968730927, "rewards_train/margins": 5.629231855273247, "rewards_train/rejected": -5.3805999755859375, "step": 721 }, { "epoch": 1.99, "learning_rate": 1.1306686172690952e-07, "loss": 0.0323, "step": 722 }, { "epoch": 1.99, "logps_train/chosen": -64.36468505859375, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -159.63539123535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.529840350151062, "rewards_train/margins": 5.985127806663513, "rewards_train/rejected": -5.455287456512451, "step": 722 }, { "epoch": 1.99, "learning_rate": 1.1252001181795272e-07, "loss": 0.0082, "step": 723 }, { "epoch": 1.99, "logps_train/chosen": -68.40699768066406, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -109.1875, "logps_train/rejected": -161.4206085205078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13019871711730957, "rewards_train/margins": 5.351166009902954, "rewards_train/rejected": -5.2209672927856445, "step": 723 }, { "epoch": 1.99, "learning_rate": 1.1197396945458535e-07, "loss": 0.0415, "step": 724 }, { "epoch": 1.99, "logps_train/chosen": -68.13179016113281, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -157.51124572753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1760788857936859, "rewards_train/margins": 5.4393130242824554, "rewards_train/rejected": -5.2632341384887695, "step": 724 }, { "epoch": 2.0, "learning_rate": 1.1142873967743574e-07, "loss": 0.0382, "step": 725 }, { "epoch": 2.0, "logps_train/chosen": -64.61801147460938, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -101.375, "logps_train/rejected": -152.19354248046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30724236369132996, "rewards_train/margins": 5.388119667768478, "rewards_train/rejected": -5.080877304077148, "step": 725 }, { "epoch": 2.0, "learning_rate": 1.1088432751963086e-07, "loss": 0.0313, "step": 726 }, { "epoch": 2.0, "logps_train/chosen": -65.82510375976562, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -101.4375, "logps_train/rejected": -152.36842346191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27393442392349243, "rewards_train/margins": 5.365902960300446, "rewards_train/rejected": -5.091968536376953, "step": 726 }, { "epoch": 2.0, "learning_rate": 1.103407380067503e-07, "loss": 0.0267, "step": 727 }, { "epoch": 2.0, "logps_train/chosen": -68.26959991455078, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -158.39505004882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21185792982578278, "rewards_train/margins": 5.395210400223732, "rewards_train/rejected": -5.183352470397949, "step": 727 }, { "epoch": 2.01, "learning_rate": 1.097979761567796e-07, "loss": 0.0379, "step": 728 }, { "epoch": 2.01, "logps_train/chosen": -66.43197631835938, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -153.7588348388672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2584132254123688, "rewards_train/margins": 5.350556284189224, "rewards_train/rejected": -5.0921430587768555, "step": 728 }, { "epoch": 2.01, "learning_rate": 1.0925604698006403e-07, "loss": 0.0209, "step": 729 }, { "epoch": 2.01, "logps_train/chosen": -66.16560363769531, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -153.41873168945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2663987874984741, "rewards_train/margins": 5.25456178188324, "rewards_train/rejected": -4.988162994384766, "step": 729 }, { "epoch": 2.01, "learning_rate": 1.0871495547926201e-07, "loss": 0.0323, "step": 730 }, { "epoch": 2.01, "logps_train/chosen": -69.2822265625, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -153.10931396484375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.11206108331680298, "rewards_train/margins": 5.213812410831451, "rewards_train/rejected": -5.101751327514648, "step": 730 }, { "epoch": 2.01, "learning_rate": 1.081747066492995e-07, "loss": 0.0568, "step": 731 }, { "epoch": 2.01, "logps_train/chosen": -69.55679321289062, "logps_train/ref_chosen": -74.125, "logps_train/ref_rejected": -109.625, "logps_train/rejected": -165.6373291015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4540378451347351, "rewards_train/margins": 6.056930720806122, "rewards_train/rejected": -5.602892875671387, "step": 731 }, { "epoch": 2.02, "learning_rate": 1.0763530547732359e-07, "loss": 0.0206, "step": 732 }, { "epoch": 2.02, "logps_train/chosen": -64.73431396484375, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -152.79811096191406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.269585520029068, "rewards_train/margins": 5.389337450265884, "rewards_train/rejected": -5.119751930236816, "step": 732 }, { "epoch": 2.02, "learning_rate": 1.070967569426561e-07, "loss": 0.0372, "step": 733 }, { "epoch": 2.02, "logps_train/chosen": -66.8555908203125, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -153.58950805664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1873902678489685, "rewards_train/margins": 5.179789483547211, "rewards_train/rejected": -4.992399215698242, "step": 733 }, { "epoch": 2.02, "learning_rate": 1.0655906601674858e-07, "loss": 0.0424, "step": 734 }, { "epoch": 2.02, "logps_train/chosen": -69.12783813476562, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -156.80230712890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1103116124868393, "rewards_train/margins": 5.323354348540306, "rewards_train/rejected": -5.213042736053467, "step": 734 }, { "epoch": 2.02, "learning_rate": 1.0602223766313523e-07, "loss": 0.0377, "step": 735 }, { "epoch": 2.02, "logps_train/chosen": -67.0743408203125, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -161.13226318359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28221505880355835, "rewards_train/margins": 5.78655618429184, "rewards_train/rejected": -5.504341125488281, "step": 735 }, { "epoch": 2.03, "learning_rate": 1.0548627683738804e-07, "loss": 0.0249, "step": 736 }, { "epoch": 2.03, "logps_train/chosen": -68.35066223144531, "logps_train/ref_chosen": -70.9375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -158.41574096679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25790277123451233, "rewards_train/margins": 5.6228653490543365, "rewards_train/rejected": -5.364962577819824, "step": 736 }, { "epoch": 2.03, "learning_rate": 1.0495118848707053e-07, "loss": 0.036, "step": 737 }, { "epoch": 2.03, "logps_train/chosen": -66.64083862304688, "logps_train/ref_chosen": -66.8125, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -161.09228515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.015652552247047424, "rewards_train/margins": 5.312870875000954, "rewards_train/rejected": -5.297218322753906, "step": 737 }, { "epoch": 2.03, "learning_rate": 1.0441697755169232e-07, "loss": 0.0343, "step": 738 }, { "epoch": 2.03, "logps_train/chosen": -67.95858001708984, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -153.8871307373047, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.1629307121038437, "rewards_train/margins": 5.362094148993492, "rewards_train/rejected": -5.199163436889648, "step": 738 }, { "epoch": 2.04, "learning_rate": 1.0388364896266324e-07, "loss": 0.0444, "step": 739 }, { "epoch": 2.04, "logps_train/chosen": -69.4432373046875, "logps_train/ref_chosen": -72.125, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -160.12257385253906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2664186656475067, "rewards_train/margins": 5.652601450681686, "rewards_train/rejected": -5.38618278503418, "step": 739 }, { "epoch": 2.04, "learning_rate": 1.0335120764324821e-07, "loss": 0.0481, "step": 740 }, { "epoch": 2.04, "logps_train/chosen": -63.1064453125, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -155.42391967773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4829103350639343, "rewards_train/margins": 5.6415130496025085, "rewards_train/rejected": -5.158602714538574, "step": 740 }, { "epoch": 2.04, "learning_rate": 1.0281965850852148e-07, "loss": 0.0141, "step": 741 }, { "epoch": 2.04, "logps_train/chosen": -68.38813781738281, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -153.79676818847656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1925337165594101, "rewards_train/margins": 5.2610771507024765, "rewards_train/rejected": -5.068543434143066, "step": 741 }, { "epoch": 2.04, "learning_rate": 1.0228900646532149e-07, "loss": 0.0294, "step": 742 }, { "epoch": 2.04, "logps_train/chosen": -69.05306243896484, "logps_train/ref_chosen": -71.875, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -162.70944213867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28492820262908936, "rewards_train/margins": 5.88712203502655, "rewards_train/rejected": -5.602193832397461, "step": 742 }, { "epoch": 2.05, "learning_rate": 1.0175925641220514e-07, "loss": 0.0194, "step": 743 }, { "epoch": 2.05, "logps_train/chosen": -64.99679565429688, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -106.1875, "logps_train/rejected": -159.8814697265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3645777106285095, "rewards_train/margins": 5.732313096523285, "rewards_train/rejected": -5.367735385894775, "step": 743 }, { "epoch": 2.05, "learning_rate": 1.0123041323940316e-07, "loss": 0.0147, "step": 744 }, { "epoch": 2.05, "logps_train/chosen": -67.80470275878906, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -161.69784545898438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40009617805480957, "rewards_train/margins": 5.781111001968384, "rewards_train/rejected": -5.381014823913574, "step": 744 }, { "epoch": 2.05, "learning_rate": 1.0070248182877467e-07, "loss": 0.0243, "step": 745 }, { "epoch": 2.05, "logps_train/chosen": -69.47105407714844, "logps_train/ref_chosen": -72.125, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -159.24508666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26466232538223267, "rewards_train/margins": 5.52471786737442, "rewards_train/rejected": -5.2600555419921875, "step": 745 }, { "epoch": 2.06, "learning_rate": 1.0017546705376188e-07, "loss": 0.0369, "step": 746 }, { "epoch": 2.06, "logps_train/chosen": -68.42433166503906, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -99.875, "logps_train/rejected": -149.3594970703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09028119593858719, "rewards_train/margins": 5.039317317306995, "rewards_train/rejected": -4.949036121368408, "step": 746 }, { "epoch": 2.06, "learning_rate": 9.964937377934577e-08, "loss": 0.0456, "step": 747 }, { "epoch": 2.06, "logps_train/chosen": -68.53237915039062, "logps_train/ref_chosen": -71.6875, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -159.13076782226562, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.3153662085533142, "rewards_train/margins": 5.641822278499603, "rewards_train/rejected": -5.326456069946289, "step": 747 }, { "epoch": 2.06, "learning_rate": 9.91242068620003e-08, "loss": 0.0252, "step": 748 }, { "epoch": 2.06, "logps_train/chosen": -64.95510864257812, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -101.1875, "logps_train/rejected": -153.11102294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24018247425556183, "rewards_train/margins": 5.43429334461689, "rewards_train/rejected": -5.194110870361328, "step": 748 }, { "epoch": 2.06, "learning_rate": 9.859997114964839e-08, "loss": 0.0364, "step": 749 }, { "epoch": 2.06, "logps_train/chosen": -67.74374389648438, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -103.3125, "logps_train/rejected": -155.88619995117188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18338893353939056, "rewards_train/margins": 5.440709337592125, "rewards_train/rejected": -5.257320404052734, "step": 749 }, { "epoch": 2.07, "learning_rate": 9.807667148161648e-08, "loss": 0.0366, "step": 750 }, { "epoch": 2.07, "logps_train/chosen": -65.69606018066406, "logps_train/ref_chosen": -66.6875, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -151.90748596191406, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.09680064767599106, "rewards_train/margins": 5.052832446992397, "rewards_train/rejected": -4.956031799316406, "step": 750 }, { "epoch": 2.07, "learning_rate": 9.755431268859069e-08, "loss": 0.0539, "step": 751 }, { "epoch": 2.07, "logps_train/chosen": -65.71572875976562, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -158.44955444335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4544041156768799, "rewards_train/margins": 5.626800775527954, "rewards_train/rejected": -5.172396659851074, "step": 751 }, { "epoch": 2.07, "learning_rate": 9.703289959257121e-08, "loss": 0.0275, "step": 752 }, { "epoch": 2.07, "logps_train/chosen": -63.95400619506836, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -158.55172729492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4208106994628906, "rewards_train/margins": 5.643268585205078, "rewards_train/rejected": -5.2224578857421875, "step": 752 }, { "epoch": 2.07, "learning_rate": 9.65124370068286e-08, "loss": 0.0332, "step": 753 }, { "epoch": 2.07, "logps_train/chosen": -66.50859832763672, "logps_train/ref_chosen": -66.8125, "logps_train/ref_rejected": -101.0, "logps_train/rejected": -151.88482666015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.032977886497974396, "rewards_train/margins": 5.124489612877369, "rewards_train/rejected": -5.0915117263793945, "step": 753 }, { "epoch": 2.08, "learning_rate": 9.599292973585904e-08, "loss": 0.0388, "step": 754 }, { "epoch": 2.08, "logps_train/chosen": -70.0636215209961, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -162.07559204101562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19231918454170227, "rewards_train/margins": 5.53425195813179, "rewards_train/rejected": -5.341932773590088, "step": 754 }, { "epoch": 2.08, "learning_rate": 9.547438257533998e-08, "loss": 0.0299, "step": 755 }, { "epoch": 2.08, "logps_train/chosen": -65.30667114257812, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -101.3125, "logps_train/rejected": -151.63839721679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36952829360961914, "rewards_train/margins": 5.402705669403076, "rewards_train/rejected": -5.033177375793457, "step": 755 }, { "epoch": 2.08, "learning_rate": 9.495680031208575e-08, "loss": 0.0235, "step": 756 }, { "epoch": 2.08, "logps_train/chosen": -65.88661193847656, "logps_train/ref_chosen": -67.0625, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -153.29776000976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11685679852962494, "rewards_train/margins": 5.214943155646324, "rewards_train/rejected": -5.098086357116699, "step": 756 }, { "epoch": 2.09, "learning_rate": 9.444018772400375e-08, "loss": 0.0433, "step": 757 }, { "epoch": 2.09, "logps_train/chosen": -66.23023223876953, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -153.78529357910156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.236156165599823, "rewards_train/margins": 5.2023321986198425, "rewards_train/rejected": -4.9661760330200195, "step": 757 }, { "epoch": 2.09, "learning_rate": 9.392454958005005e-08, "loss": 0.0314, "step": 758 }, { "epoch": 2.09, "logps_train/chosen": -66.19554901123047, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -152.59068298339844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18020112812519073, "rewards_train/margins": 5.25215919315815, "rewards_train/rejected": -5.071958065032959, "step": 758 }, { "epoch": 2.09, "learning_rate": 9.340989064018516e-08, "loss": 0.0373, "step": 759 }, { "epoch": 2.09, "logps_train/chosen": -68.74402618408203, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -108.3125, "logps_train/rejected": -160.59202575683594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.05738448724150658, "rewards_train/margins": 5.2832870446145535, "rewards_train/rejected": -5.225902557373047, "step": 759 }, { "epoch": 2.09, "learning_rate": 9.28962156553309e-08, "loss": 0.0471, "step": 760 }, { "epoch": 2.09, "logps_train/chosen": -68.75921630859375, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -110.875, "logps_train/rejected": -166.97674560546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1524476855993271, "rewards_train/margins": 5.7631101459264755, "rewards_train/rejected": -5.610662460327148, "step": 760 }, { "epoch": 2.1, "learning_rate": 9.238352936732547e-08, "loss": 0.0408, "step": 761 }, { "epoch": 2.1, "logps_train/chosen": -67.25343322753906, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -154.67938232421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3009253442287445, "rewards_train/margins": 5.452506631612778, "rewards_train/rejected": -5.151581287384033, "step": 761 }, { "epoch": 2.1, "learning_rate": 9.187183650888055e-08, "loss": 0.0345, "step": 762 }, { "epoch": 2.1, "logps_train/chosen": -67.68775939941406, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -159.49679565429688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14679975807666779, "rewards_train/margins": 5.68339367210865, "rewards_train/rejected": -5.536593914031982, "step": 762 }, { "epoch": 2.1, "learning_rate": 9.136114180353691e-08, "loss": 0.0207, "step": 763 }, { "epoch": 2.1, "logps_train/chosen": -69.24821472167969, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -159.9560089111328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1789388209581375, "rewards_train/margins": 5.59426636993885, "rewards_train/rejected": -5.415327548980713, "step": 763 }, { "epoch": 2.1, "learning_rate": 9.085144996562173e-08, "loss": 0.0175, "step": 764 }, { "epoch": 2.1, "logps_train/chosen": -66.85502624511719, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -157.34014892578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3177686333656311, "rewards_train/margins": 5.579224646091461, "rewards_train/rejected": -5.26145601272583, "step": 764 }, { "epoch": 2.11, "learning_rate": 9.034276570020391e-08, "loss": 0.0337, "step": 765 }, { "epoch": 2.11, "logps_train/chosen": -65.65994262695312, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -100.8125, "logps_train/rejected": -150.78036499023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17116355895996094, "rewards_train/margins": 5.16726541519165, "rewards_train/rejected": -4.9961018562316895, "step": 765 }, { "epoch": 2.11, "learning_rate": 8.983509370305164e-08, "loss": 0.0344, "step": 766 }, { "epoch": 2.11, "logps_train/chosen": -66.52761840820312, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -157.77731323242188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.18674004077911377, "rewards_train/margins": 5.500800251960754, "rewards_train/rejected": -5.314060211181641, "step": 766 }, { "epoch": 2.11, "learning_rate": 8.932843866058851e-08, "loss": 0.0359, "step": 767 }, { "epoch": 2.11, "logps_train/chosen": -68.10135650634766, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -110.625, "logps_train/rejected": -168.2027130126953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2936725616455078, "rewards_train/margins": 6.050174713134766, "rewards_train/rejected": -5.756502151489258, "step": 767 }, { "epoch": 2.12, "learning_rate": 8.882280524985047e-08, "loss": 0.0163, "step": 768 }, { "epoch": 2.12, "logps_train/chosen": -68.92106628417969, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -158.35687255859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23723949491977692, "rewards_train/margins": 5.477322235703468, "rewards_train/rejected": -5.240082740783691, "step": 768 }, { "epoch": 2.12, "learning_rate": 8.831819813844241e-08, "loss": 0.0289, "step": 769 }, { "epoch": 2.12, "logps_train/chosen": -66.82981872558594, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -108.5625, "logps_train/rejected": -161.819091796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29792651534080505, "rewards_train/margins": 5.6257825791835785, "rewards_train/rejected": -5.327856063842773, "step": 769 }, { "epoch": 2.12, "learning_rate": 8.781462198449541e-08, "loss": 0.0262, "step": 770 }, { "epoch": 2.12, "logps_train/chosen": -67.6777114868164, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -159.66159057617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19419163465499878, "rewards_train/margins": 5.57578057050705, "rewards_train/rejected": -5.381588935852051, "step": 770 }, { "epoch": 2.12, "learning_rate": 8.73120814366236e-08, "loss": 0.0233, "step": 771 }, { "epoch": 2.12, "logps_train/chosen": -64.41299438476562, "logps_train/ref_chosen": -67.1875, "logps_train/ref_rejected": -103.1875, "logps_train/rejected": -157.31390380859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2780856192111969, "rewards_train/margins": 5.6894568502902985, "rewards_train/rejected": -5.411371231079102, "step": 771 }, { "epoch": 2.13, "learning_rate": 8.681058113388094e-08, "loss": 0.0244, "step": 772 }, { "epoch": 2.13, "logps_train/chosen": -66.52317810058594, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -159.64208984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27663689851760864, "rewards_train/margins": 5.664965927600861, "rewards_train/rejected": -5.388329029083252, "step": 772 }, { "epoch": 2.13, "learning_rate": 8.631012570571912e-08, "loss": 0.0216, "step": 773 }, { "epoch": 2.13, "logps_train/chosen": -69.38488006591797, "logps_train/ref_chosen": -72.8125, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -162.48330688476562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34466657042503357, "rewards_train/margins": 5.730301946401596, "rewards_train/rejected": -5.3856353759765625, "step": 773 }, { "epoch": 2.13, "learning_rate": 8.581071977194397e-08, "loss": 0.0209, "step": 774 }, { "epoch": 2.13, "logps_train/chosen": -68.5250473022461, "logps_train/ref_chosen": -71.8125, "logps_train/ref_rejected": -108.3125, "logps_train/rejected": -161.99842834472656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32771986722946167, "rewards_train/margins": 5.697093188762665, "rewards_train/rejected": -5.369373321533203, "step": 774 }, { "epoch": 2.13, "learning_rate": 8.531236794267347e-08, "loss": 0.0323, "step": 775 }, { "epoch": 2.13, "logps_train/chosen": -67.34577178955078, "logps_train/ref_chosen": -72.125, "logps_train/ref_rejected": -108.5625, "logps_train/rejected": -164.61488342285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4786067008972168, "rewards_train/margins": 6.083356857299805, "rewards_train/rejected": -5.604750156402588, "step": 775 }, { "epoch": 2.14, "learning_rate": 8.481507481829468e-08, "loss": 0.009, "step": 776 }, { "epoch": 2.14, "logps_train/chosen": -66.40730285644531, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -102.25, "logps_train/rejected": -153.30409240722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37303900718688965, "rewards_train/margins": 5.476006269454956, "rewards_train/rejected": -5.102967262268066, "step": 776 }, { "epoch": 2.14, "learning_rate": 8.431884498942201e-08, "loss": 0.0275, "step": 777 }, { "epoch": 2.14, "logps_train/chosen": -65.10470581054688, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -160.8020782470703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3961210250854492, "rewards_train/margins": 6.012364387512207, "rewards_train/rejected": -5.616243362426758, "step": 777 }, { "epoch": 2.14, "learning_rate": 8.382368303685387e-08, "loss": 0.0195, "step": 778 }, { "epoch": 2.14, "logps_train/chosen": -66.85968017578125, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -107.25, "logps_train/rejected": -160.5249786376953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27799665927886963, "rewards_train/margins": 5.603150010108948, "rewards_train/rejected": -5.325153350830078, "step": 778 }, { "epoch": 2.15, "learning_rate": 8.33295935315312e-08, "loss": 0.0246, "step": 779 }, { "epoch": 2.15, "logps_train/chosen": -65.76838684082031, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -161.42864990234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4953293800354004, "rewards_train/margins": 6.031553268432617, "rewards_train/rejected": -5.536223888397217, "step": 779 }, { "epoch": 2.15, "learning_rate": 8.28365810344948e-08, "loss": 0.0111, "step": 780 }, { "epoch": 2.15, "logps_train/chosen": -62.91313934326172, "logps_train/ref_chosen": -64.75, "logps_train/ref_rejected": -96.5625, "logps_train/rejected": -148.03457641601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18544384837150574, "rewards_train/margins": 5.333629101514816, "rewards_train/rejected": -5.1481852531433105, "step": 780 }, { "epoch": 2.15, "learning_rate": 8.234465009684357e-08, "loss": 0.0269, "step": 781 }, { "epoch": 2.15, "logps_train/chosen": -66.10012817382812, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -156.49258422851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3573208451271057, "rewards_train/margins": 5.643590033054352, "rewards_train/rejected": -5.286269187927246, "step": 781 }, { "epoch": 2.15, "learning_rate": 8.185380525969205e-08, "loss": 0.0198, "step": 782 }, { "epoch": 2.15, "logps_train/chosen": -64.36170196533203, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -100.0, "logps_train/rejected": -150.977294921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27969852089881897, "rewards_train/margins": 5.377916485071182, "rewards_train/rejected": -5.098217964172363, "step": 782 }, { "epoch": 2.16, "learning_rate": 8.136405105412895e-08, "loss": 0.0294, "step": 783 }, { "epoch": 2.16, "logps_train/chosen": -68.329833984375, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -159.29058837890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1569090485572815, "rewards_train/margins": 5.41780287027359, "rewards_train/rejected": -5.260893821716309, "step": 783 }, { "epoch": 2.16, "learning_rate": 8.087539200117515e-08, "loss": 0.0365, "step": 784 }, { "epoch": 2.16, "logps_train/chosen": -65.18717956542969, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -160.3211212158203, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3347492218017578, "rewards_train/margins": 5.874772548675537, "rewards_train/rejected": -5.540023326873779, "step": 784 }, { "epoch": 2.16, "learning_rate": 8.038783261174192e-08, "loss": 0.0107, "step": 785 }, { "epoch": 2.16, "logps_train/chosen": -66.13192749023438, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -152.18870544433594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21107470989227295, "rewards_train/margins": 5.196449160575867, "rewards_train/rejected": -4.985374450683594, "step": 785 }, { "epoch": 2.17, "learning_rate": 7.990137738658917e-08, "loss": 0.0242, "step": 786 }, { "epoch": 2.17, "logps_train/chosen": -68.00529479980469, "logps_train/ref_chosen": -72.6875, "logps_train/ref_rejected": -109.1875, "logps_train/rejected": -166.75836181640625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47061365842819214, "rewards_train/margins": 6.231020152568817, "rewards_train/rejected": -5.760406494140625, "step": 786 }, { "epoch": 2.17, "learning_rate": 7.941603081628425e-08, "loss": 0.0036, "step": 787 }, { "epoch": 2.17, "logps_train/chosen": -67.29344177246094, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -163.58294677734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33213067054748535, "rewards_train/margins": 5.873335123062134, "rewards_train/rejected": -5.541204452514648, "step": 787 }, { "epoch": 2.17, "learning_rate": 7.893179738116027e-08, "loss": 0.0141, "step": 788 }, { "epoch": 2.17, "logps_train/chosen": -68.72358703613281, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -160.2436981201172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1370161771774292, "rewards_train/margins": 5.466269373893738, "rewards_train/rejected": -5.329253196716309, "step": 788 }, { "epoch": 2.17, "learning_rate": 7.84486815512745e-08, "loss": 0.0435, "step": 789 }, { "epoch": 2.17, "logps_train/chosen": -68.15737915039062, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -155.7804412841797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.283578485250473, "rewards_train/margins": 5.457326501607895, "rewards_train/rejected": -5.173748016357422, "step": 789 }, { "epoch": 2.18, "learning_rate": 7.796668778636784e-08, "loss": 0.0296, "step": 790 }, { "epoch": 2.18, "logps_train/chosen": -69.15058898925781, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -159.57022094726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1181926429271698, "rewards_train/margins": 5.412615746259689, "rewards_train/rejected": -5.2944231033325195, "step": 790 }, { "epoch": 2.18, "learning_rate": 7.748582053582268e-08, "loss": 0.0408, "step": 791 }, { "epoch": 2.18, "logps_train/chosen": -68.1909408569336, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -161.68536376953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19907009601593018, "rewards_train/margins": 5.7570120096206665, "rewards_train/rejected": -5.557941913604736, "step": 791 }, { "epoch": 2.18, "learning_rate": 7.700608423862268e-08, "loss": 0.0235, "step": 792 }, { "epoch": 2.18, "logps_train/chosen": -66.46942138671875, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -105.4375, "logps_train/rejected": -158.95819091796875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.381426602602005, "rewards_train/margins": 5.732324868440628, "rewards_train/rejected": -5.350898265838623, "step": 792 }, { "epoch": 2.18, "learning_rate": 7.652748332331127e-08, "loss": 0.0339, "step": 793 }, { "epoch": 2.18, "logps_train/chosen": -68.14637756347656, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -107.6875, "logps_train/rejected": -164.32974243164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3390243649482727, "rewards_train/margins": 6.001589596271515, "rewards_train/rejected": -5.662565231323242, "step": 793 }, { "epoch": 2.19, "learning_rate": 7.605002220795105e-08, "loss": 0.0209, "step": 794 }, { "epoch": 2.19, "logps_train/chosen": -66.8729476928711, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -160.19281005859375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22212913632392883, "rewards_train/margins": 5.733206063508987, "rewards_train/rejected": -5.511076927185059, "step": 794 }, { "epoch": 2.19, "learning_rate": 7.557370530008273e-08, "loss": 0.0204, "step": 795 }, { "epoch": 2.19, "logps_train/chosen": -68.265869140625, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -156.47735595703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10720254480838776, "rewards_train/margins": 5.35864831507206, "rewards_train/rejected": -5.251445770263672, "step": 795 }, { "epoch": 2.19, "learning_rate": 7.509853699668473e-08, "loss": 0.0412, "step": 796 }, { "epoch": 2.19, "logps_train/chosen": -64.06623840332031, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -161.21926879882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4257488548755646, "rewards_train/margins": 5.917988330125809, "rewards_train/rejected": -5.492239475250244, "step": 796 }, { "epoch": 2.2, "learning_rate": 7.462452168413249e-08, "loss": 0.0146, "step": 797 }, { "epoch": 2.2, "logps_train/chosen": -70.24516296386719, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -155.10914611816406, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": -0.008451398462057114, "rewards_train/margins": 5.126242700964212, "rewards_train/rejected": -5.1346940994262695, "step": 797 }, { "epoch": 2.2, "learning_rate": 7.415166373815794e-08, "loss": 0.0695, "step": 798 }, { "epoch": 2.2, "logps_train/chosen": -66.43734741210938, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -157.9674835205078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28790581226348877, "rewards_train/margins": 5.532407879829407, "rewards_train/rejected": -5.244502067565918, "step": 798 }, { "epoch": 2.2, "learning_rate": 7.367996752380897e-08, "loss": 0.0406, "step": 799 }, { "epoch": 2.2, "logps_train/chosen": -67.6162109375, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -104.5, "logps_train/rejected": -157.430419921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27387627959251404, "rewards_train/margins": 5.564720243215561, "rewards_train/rejected": -5.290843963623047, "step": 799 }, { "epoch": 2.2, "learning_rate": 7.320943739540947e-08, "loss": 0.0385, "step": 800 }, { "epoch": 2.2, "logps_train/chosen": -65.72269439697266, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -154.99295043945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3076134920120239, "rewards_train/margins": 5.50871479511261, "rewards_train/rejected": -5.201101303100586, "step": 800 }, { "epoch": 2.21, "learning_rate": 7.2740077696519e-08, "loss": 0.03, "step": 801 }, { "epoch": 2.21, "logps_train/chosen": -68.80709838867188, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -103.125, "logps_train/rejected": -157.19921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10840180516242981, "rewards_train/margins": 5.519145041704178, "rewards_train/rejected": -5.410743236541748, "step": 801 }, { "epoch": 2.21, "learning_rate": 7.22718927598923e-08, "loss": 0.0247, "step": 802 }, { "epoch": 2.21, "logps_train/chosen": -64.68287658691406, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -158.0079345703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3447003662586212, "rewards_train/margins": 5.643346339464188, "rewards_train/rejected": -5.298645973205566, "step": 802 }, { "epoch": 2.21, "learning_rate": 7.180488690744017e-08, "loss": 0.0276, "step": 803 }, { "epoch": 2.21, "logps_train/chosen": -66.2870101928711, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -153.552001953125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.26846691966056824, "rewards_train/margins": 5.457506150007248, "rewards_train/rejected": -5.18903923034668, "step": 803 }, { "epoch": 2.21, "learning_rate": 7.133906445018856e-08, "loss": 0.0419, "step": 804 }, { "epoch": 2.21, "logps_train/chosen": -70.08931732177734, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -164.20765686035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.04414437711238861, "rewards_train/margins": 5.640251860022545, "rewards_train/rejected": -5.596107482910156, "step": 804 }, { "epoch": 2.22, "learning_rate": 7.08744296882395e-08, "loss": 0.0404, "step": 805 }, { "epoch": 2.22, "logps_train/chosen": -68.02159118652344, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -160.72952270507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3168836534023285, "rewards_train/margins": 5.899211496114731, "rewards_train/rejected": -5.582327842712402, "step": 805 }, { "epoch": 2.22, "learning_rate": 7.041098691073117e-08, "loss": 0.0136, "step": 806 }, { "epoch": 2.22, "logps_train/chosen": -65.33316040039062, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -106.5625, "logps_train/rejected": -160.52304077148438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4197603464126587, "rewards_train/margins": 5.817182183265686, "rewards_train/rejected": -5.397421836853027, "step": 806 }, { "epoch": 2.22, "learning_rate": 6.994874039579826e-08, "loss": 0.0234, "step": 807 }, { "epoch": 2.22, "logps_train/chosen": -67.2214126586914, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -103.25, "logps_train/rejected": -155.3375701904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10617858916521072, "rewards_train/margins": 5.317865677177906, "rewards_train/rejected": -5.211687088012695, "step": 807 }, { "epoch": 2.23, "learning_rate": 6.94876944105323e-08, "loss": 0.0504, "step": 808 }, { "epoch": 2.23, "logps_train/chosen": -67.53368377685547, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -160.0167694091797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27724695205688477, "rewards_train/margins": 5.650603771209717, "rewards_train/rejected": -5.373356819152832, "step": 808 }, { "epoch": 2.23, "learning_rate": 6.9027853210943e-08, "loss": 0.025, "step": 809 }, { "epoch": 2.23, "logps_train/chosen": -65.79275512695312, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -153.42738342285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29108622670173645, "rewards_train/margins": 5.440465003252029, "rewards_train/rejected": -5.149378776550293, "step": 809 }, { "epoch": 2.23, "learning_rate": 6.856922104191789e-08, "loss": 0.0291, "step": 810 }, { "epoch": 2.23, "logps_train/chosen": -67.04986572265625, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -105.1875, "logps_train/rejected": -159.3077850341797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4098574221134186, "rewards_train/margins": 5.8229595720767975, "rewards_train/rejected": -5.413102149963379, "step": 810 }, { "epoch": 2.23, "learning_rate": 6.811180213718411e-08, "loss": 0.0116, "step": 811 }, { "epoch": 2.23, "logps_train/chosen": -65.72881317138672, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -101.5, "logps_train/rejected": -150.80633544921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20299755036830902, "rewards_train/margins": 5.1326063722372055, "rewards_train/rejected": -4.9296088218688965, "step": 811 }, { "epoch": 2.24, "learning_rate": 6.765560071926857e-08, "loss": 0.0555, "step": 812 }, { "epoch": 2.24, "logps_train/chosen": -66.43477630615234, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -160.47494506835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22327035665512085, "rewards_train/margins": 5.618810594081879, "rewards_train/rejected": -5.395540237426758, "step": 812 }, { "epoch": 2.24, "learning_rate": 6.720062099945958e-08, "loss": 0.0305, "step": 813 }, { "epoch": 2.24, "logps_train/chosen": -66.42425537109375, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -158.44007873535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3433166742324829, "rewards_train/margins": 5.632247090339661, "rewards_train/rejected": -5.288930416107178, "step": 813 }, { "epoch": 2.24, "learning_rate": 6.674686717776767e-08, "loss": 0.0271, "step": 814 }, { "epoch": 2.24, "logps_train/chosen": -68.10212707519531, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -102.8125, "logps_train/rejected": -156.2907257080078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.010066017508506775, "rewards_train/margins": 5.338831439614296, "rewards_train/rejected": -5.348897457122803, "step": 814 }, { "epoch": 2.25, "learning_rate": 6.629434344288658e-08, "loss": 0.0339, "step": 815 }, { "epoch": 2.25, "logps_train/chosen": -65.96024322509766, "logps_train/ref_chosen": -66.875, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -154.05328369140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.08776455372571945, "rewards_train/margins": 5.294460587203503, "rewards_train/rejected": -5.206696033477783, "step": 815 }, { "epoch": 2.25, "learning_rate": 6.584305397215535e-08, "loss": 0.0401, "step": 816 }, { "epoch": 2.25, "logps_train/chosen": -69.81639099121094, "logps_train/ref_chosen": -72.6875, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -163.81851196289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2845231592655182, "rewards_train/margins": 5.803580433130264, "rewards_train/rejected": -5.519057273864746, "step": 816 }, { "epoch": 2.25, "learning_rate": 6.53930029315188e-08, "loss": 0.017, "step": 817 }, { "epoch": 2.25, "logps_train/chosen": -69.94316101074219, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -160.01028442382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25524479150772095, "rewards_train/margins": 5.576098024845123, "rewards_train/rejected": -5.320853233337402, "step": 817 }, { "epoch": 2.25, "learning_rate": 6.494419447548982e-08, "loss": 0.0298, "step": 818 }, { "epoch": 2.25, "logps_train/chosen": -66.77123260498047, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -155.50820922851562, "rewards_train/accuracies": 0.96875, "rewards_train/chosen": 0.2610117495059967, "rewards_train/margins": 5.470230251550674, "rewards_train/rejected": -5.209218502044678, "step": 818 }, { "epoch": 2.26, "learning_rate": 6.449663274711071e-08, "loss": 0.0445, "step": 819 }, { "epoch": 2.26, "logps_train/chosen": -66.75633239746094, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -158.61805725097656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2515157461166382, "rewards_train/margins": 5.676212668418884, "rewards_train/rejected": -5.424696922302246, "step": 819 }, { "epoch": 2.26, "learning_rate": 6.405032187791497e-08, "loss": 0.0302, "step": 820 }, { "epoch": 2.26, "logps_train/chosen": -68.00981140136719, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -161.31472778320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19521014392375946, "rewards_train/margins": 5.574241802096367, "rewards_train/rejected": -5.379031658172607, "step": 820 }, { "epoch": 2.26, "learning_rate": 6.360526598788905e-08, "loss": 0.0278, "step": 821 }, { "epoch": 2.26, "logps_train/chosen": -68.58769226074219, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -110.125, "logps_train/rejected": -166.50384521484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2537306845188141, "rewards_train/margins": 5.894250780344009, "rewards_train/rejected": -5.640520095825195, "step": 821 }, { "epoch": 2.26, "learning_rate": 6.316146918543457e-08, "loss": 0.0328, "step": 822 }, { "epoch": 2.26, "logps_train/chosen": -67.60225677490234, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -155.14024353027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15481317043304443, "rewards_train/margins": 5.325039267539978, "rewards_train/rejected": -5.170226097106934, "step": 822 }, { "epoch": 2.27, "learning_rate": 6.271893556733021e-08, "loss": 0.0501, "step": 823 }, { "epoch": 2.27, "logps_train/chosen": -64.37535095214844, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -157.69354248046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4696430563926697, "rewards_train/margins": 5.976888597011566, "rewards_train/rejected": -5.5072455406188965, "step": 823 }, { "epoch": 2.27, "learning_rate": 6.2277669218694e-08, "loss": 0.0155, "step": 824 }, { "epoch": 2.27, "logps_train/chosen": -69.2836685180664, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -108.625, "logps_train/rejected": -164.62612915039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22832269966602325, "rewards_train/margins": 5.830194666981697, "rewards_train/rejected": -5.601871967315674, "step": 824 }, { "epoch": 2.27, "learning_rate": 6.183767421294536e-08, "loss": 0.0216, "step": 825 }, { "epoch": 2.27, "logps_train/chosen": -68.05274963378906, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -103.1875, "logps_train/rejected": -155.38104248046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17577962577342987, "rewards_train/margins": 5.396794602274895, "rewards_train/rejected": -5.221014976501465, "step": 825 }, { "epoch": 2.28, "learning_rate": 6.139895461176788e-08, "loss": 0.0319, "step": 826 }, { "epoch": 2.28, "logps_train/chosen": -68.3321304321289, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -163.8220672607422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06742218136787415, "rewards_train/margins": 5.801240235567093, "rewards_train/rejected": -5.733818054199219, "step": 826 }, { "epoch": 2.28, "learning_rate": 6.096151446507154e-08, "loss": 0.0371, "step": 827 }, { "epoch": 2.28, "logps_train/chosen": -67.9520263671875, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -102.0, "logps_train/rejected": -154.09317016601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11173079907894135, "rewards_train/margins": 5.323732599616051, "rewards_train/rejected": -5.212001800537109, "step": 827 }, { "epoch": 2.28, "learning_rate": 6.052535781095545e-08, "loss": 0.0419, "step": 828 }, { "epoch": 2.28, "logps_train/chosen": -65.09171295166016, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -100.25, "logps_train/rejected": -153.37843322753906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23330944776535034, "rewards_train/margins": 5.5483004450798035, "rewards_train/rejected": -5.314990997314453, "step": 828 }, { "epoch": 2.28, "learning_rate": 6.009048867567059e-08, "loss": 0.0243, "step": 829 }, { "epoch": 2.28, "logps_train/chosen": -65.40660095214844, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -159.988525390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.39937859773635864, "rewards_train/margins": 5.8721564412117, "rewards_train/rejected": -5.472777843475342, "step": 829 }, { "epoch": 2.29, "learning_rate": 5.965691107358237e-08, "loss": 0.0143, "step": 830 }, { "epoch": 2.29, "logps_train/chosen": -65.53184509277344, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -154.88967895507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2712292969226837, "rewards_train/margins": 5.484026461839676, "rewards_train/rejected": -5.212797164916992, "step": 830 }, { "epoch": 2.29, "learning_rate": 5.9224629007134056e-08, "loss": 0.0245, "step": 831 }, { "epoch": 2.29, "logps_train/chosen": -64.6622085571289, "logps_train/ref_chosen": -66.6875, "logps_train/ref_rejected": -101.125, "logps_train/rejected": -152.27587890625, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.19989269971847534, "rewards_train/margins": 5.313663184642792, "rewards_train/rejected": -5.113770484924316, "step": 831 }, { "epoch": 2.29, "learning_rate": 5.879364646680942e-08, "loss": 0.0583, "step": 832 }, { "epoch": 2.29, "logps_train/chosen": -66.94700622558594, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -158.822265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13532909750938416, "rewards_train/margins": 5.636695712804794, "rewards_train/rejected": -5.50136661529541, "step": 832 }, { "epoch": 2.29, "learning_rate": 5.836396743109615e-08, "loss": 0.0255, "step": 833 }, { "epoch": 2.29, "logps_train/chosen": -66.92247009277344, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -100.9375, "logps_train/rejected": -155.9794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15765532851219177, "rewards_train/margins": 5.661757081747055, "rewards_train/rejected": -5.504101753234863, "step": 833 }, { "epoch": 2.3, "learning_rate": 5.7935595866448825e-08, "loss": 0.0295, "step": 834 }, { "epoch": 2.3, "logps_train/chosen": -66.34773254394531, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -156.11322021484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2344648391008377, "rewards_train/margins": 5.553209498524666, "rewards_train/rejected": -5.318744659423828, "step": 834 }, { "epoch": 2.3, "learning_rate": 5.750853572725267e-08, "loss": 0.0271, "step": 835 }, { "epoch": 2.3, "logps_train/chosen": -67.67308044433594, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.4375, "logps_train/rejected": -156.78781127929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09445920586585999, "rewards_train/margins": 5.330174118280411, "rewards_train/rejected": -5.235714912414551, "step": 835 }, { "epoch": 2.3, "learning_rate": 5.708279095578681e-08, "loss": 0.0453, "step": 836 }, { "epoch": 2.3, "logps_train/chosen": -65.75440979003906, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -156.58306884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1975575089454651, "rewards_train/margins": 5.281353056430817, "rewards_train/rejected": -5.083795547485352, "step": 836 }, { "epoch": 2.31, "learning_rate": 5.665836548218801e-08, "loss": 0.0394, "step": 837 }, { "epoch": 2.31, "logps_train/chosen": -66.34302520751953, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -156.52919006347656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2414790391921997, "rewards_train/margins": 5.600549817085266, "rewards_train/rejected": -5.359070777893066, "step": 837 }, { "epoch": 2.31, "learning_rate": 5.623526322441417e-08, "loss": 0.0236, "step": 838 }, { "epoch": 2.31, "logps_train/chosen": -62.99601745605469, "logps_train/ref_chosen": -64.75, "logps_train/ref_rejected": -101.0625, "logps_train/rejected": -152.2332763671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1782791018486023, "rewards_train/margins": 5.298726260662079, "rewards_train/rejected": -5.120447158813477, "step": 838 }, { "epoch": 2.31, "learning_rate": 5.5813488088208425e-08, "loss": 0.039, "step": 839 }, { "epoch": 2.31, "logps_train/chosen": -69.02832794189453, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -162.67689514160156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2223624587059021, "rewards_train/margins": 5.80831378698349, "rewards_train/rejected": -5.585951328277588, "step": 839 }, { "epoch": 2.31, "learning_rate": 5.539304396706301e-08, "loss": 0.0342, "step": 840 }, { "epoch": 2.31, "logps_train/chosen": -66.79438781738281, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -159.81736755371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3022997975349426, "rewards_train/margins": 5.700149834156036, "rewards_train/rejected": -5.397850036621094, "step": 840 }, { "epoch": 2.32, "learning_rate": 5.497393474218324e-08, "loss": 0.0355, "step": 841 }, { "epoch": 2.32, "logps_train/chosen": -67.872314453125, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -157.11126708984375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2071535289287567, "rewards_train/margins": 5.414813727140427, "rewards_train/rejected": -5.20766019821167, "step": 841 }, { "epoch": 2.32, "learning_rate": 5.4556164282451754e-08, "loss": 0.0385, "step": 842 }, { "epoch": 2.32, "logps_train/chosen": -66.70671081542969, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -155.25405883789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21653613448143005, "rewards_train/margins": 5.496825784444809, "rewards_train/rejected": -5.280289649963379, "step": 842 }, { "epoch": 2.32, "learning_rate": 5.4139736444392715e-08, "loss": 0.0342, "step": 843 }, { "epoch": 2.32, "logps_train/chosen": -67.99958801269531, "logps_train/ref_chosen": -69.375, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -160.4158935546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13788332045078278, "rewards_train/margins": 5.724687710404396, "rewards_train/rejected": -5.586804389953613, "step": 843 }, { "epoch": 2.33, "learning_rate": 5.37246550721363e-08, "loss": 0.0339, "step": 844 }, { "epoch": 2.33, "logps_train/chosen": -68.988037109375, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -162.3740997314453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12829598784446716, "rewards_train/margins": 5.579377740621567, "rewards_train/rejected": -5.4510817527771, "step": 844 }, { "epoch": 2.33, "learning_rate": 5.331092399738326e-08, "loss": 0.0351, "step": 845 }, { "epoch": 2.33, "logps_train/chosen": -65.1112289428711, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -158.20350646972656, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.20337918400764465, "rewards_train/margins": 5.546532183885574, "rewards_train/rejected": -5.34315299987793, "step": 845 }, { "epoch": 2.33, "learning_rate": 5.2898547039369443e-08, "loss": 0.0392, "step": 846 }, { "epoch": 2.33, "logps_train/chosen": -67.14300537109375, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -156.4827423095703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16777949035167694, "rewards_train/margins": 5.348427340388298, "rewards_train/rejected": -5.180647850036621, "step": 846 }, { "epoch": 2.33, "learning_rate": 5.248752800483045e-08, "loss": 0.0291, "step": 847 }, { "epoch": 2.33, "logps_train/chosen": -63.50425720214844, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -154.04415893554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5101215243339539, "rewards_train/margins": 5.670396506786346, "rewards_train/rejected": -5.160274982452393, "step": 847 }, { "epoch": 2.34, "learning_rate": 5.207787068796676e-08, "loss": 0.0193, "step": 848 }, { "epoch": 2.34, "logps_train/chosen": -65.9396743774414, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -102.875, "logps_train/rejected": -155.9588623046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26994866132736206, "rewards_train/margins": 5.578041732311249, "rewards_train/rejected": -5.308093070983887, "step": 848 }, { "epoch": 2.34, "learning_rate": 5.166957887040848e-08, "loss": 0.0221, "step": 849 }, { "epoch": 2.34, "logps_train/chosen": -72.01116943359375, "logps_train/ref_chosen": -73.6875, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -163.59371948242188, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.16680245101451874, "rewards_train/margins": 5.716068312525749, "rewards_train/rejected": -5.5492658615112305, "step": 849 }, { "epoch": 2.34, "learning_rate": 5.126265632118063e-08, "loss": 0.0344, "step": 850 }, { "epoch": 2.34, "logps_train/chosen": -68.3447265625, "logps_train/ref_chosen": -72.75, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -169.51174926757812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43813556432724, "rewards_train/margins": 6.192533910274506, "rewards_train/rejected": -5.754398345947266, "step": 850 }, { "epoch": 2.34, "learning_rate": 5.085710679666797e-08, "loss": 0.0163, "step": 851 }, { "epoch": 2.34, "logps_train/chosen": -65.53709411621094, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -158.50244140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3627457618713379, "rewards_train/margins": 5.648342132568359, "rewards_train/rejected": -5.2855963706970215, "step": 851 }, { "epoch": 2.35, "learning_rate": 5.0452934040581e-08, "loss": 0.0313, "step": 852 }, { "epoch": 2.35, "logps_train/chosen": -63.950653076171875, "logps_train/ref_chosen": -67.375, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -155.97732543945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33970049023628235, "rewards_train/margins": 5.729717046022415, "rewards_train/rejected": -5.390016555786133, "step": 852 }, { "epoch": 2.35, "learning_rate": 5.005014178392055e-08, "loss": 0.022, "step": 853 }, { "epoch": 2.35, "logps_train/chosen": -67.5860595703125, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -161.13418579101562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15950891375541687, "rewards_train/margins": 5.635526388883591, "rewards_train/rejected": -5.476017475128174, "step": 853 }, { "epoch": 2.35, "learning_rate": 4.964873374494407e-08, "loss": 0.0242, "step": 854 }, { "epoch": 2.35, "logps_train/chosen": -66.56703186035156, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -162.35316467285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3386584520339966, "rewards_train/margins": 5.824072957038879, "rewards_train/rejected": -5.485414505004883, "step": 854 }, { "epoch": 2.36, "learning_rate": 4.9248713629130986e-08, "loss": 0.0262, "step": 855 }, { "epoch": 2.36, "logps_train/chosen": -64.9847412109375, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -161.20455932617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3739376664161682, "rewards_train/margins": 5.862361967563629, "rewards_train/rejected": -5.488424301147461, "step": 855 }, { "epoch": 2.36, "learning_rate": 4.8850085129148365e-08, "loss": 0.0198, "step": 856 }, { "epoch": 2.36, "logps_train/chosen": -68.59393310546875, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -162.2159881591797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1907534897327423, "rewards_train/margins": 5.781932383775711, "rewards_train/rejected": -5.591178894042969, "step": 856 }, { "epoch": 2.36, "learning_rate": 4.845285192481716e-08, "loss": 0.0257, "step": 857 }, { "epoch": 2.36, "logps_train/chosen": -66.3919677734375, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -159.44546508789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27081239223480225, "rewards_train/margins": 5.578543066978455, "rewards_train/rejected": -5.307730674743652, "step": 857 }, { "epoch": 2.36, "learning_rate": 4.805701768307786e-08, "loss": 0.0254, "step": 858 }, { "epoch": 2.36, "logps_train/chosen": -67.58547973632812, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -157.89013671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06288722157478333, "rewards_train/margins": 5.39726260304451, "rewards_train/rejected": -5.334375381469727, "step": 858 }, { "epoch": 2.37, "learning_rate": 4.7662586057957166e-08, "loss": 0.0483, "step": 859 }, { "epoch": 2.37, "logps_train/chosen": -67.68645477294922, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -160.58114624023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33819061517715454, "rewards_train/margins": 5.820766985416412, "rewards_train/rejected": -5.482576370239258, "step": 859 }, { "epoch": 2.37, "learning_rate": 4.72695606905336e-08, "loss": 0.0173, "step": 860 }, { "epoch": 2.37, "logps_train/chosen": -63.236568450927734, "logps_train/ref_chosen": -65.875, "logps_train/ref_rejected": -97.625, "logps_train/rejected": -149.51089477539062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26315975189208984, "rewards_train/margins": 5.452043056488037, "rewards_train/rejected": -5.188883304595947, "step": 860 }, { "epoch": 2.37, "learning_rate": 4.687794520890447e-08, "loss": 0.0306, "step": 861 }, { "epoch": 2.37, "logps_train/chosen": -70.21781921386719, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -161.46551513671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.03847181797027588, "rewards_train/margins": 5.514565110206604, "rewards_train/rejected": -5.476093292236328, "step": 861 }, { "epoch": 2.37, "learning_rate": 4.648774322815201e-08, "loss": 0.0414, "step": 862 }, { "epoch": 2.37, "logps_train/chosen": -68.66389465332031, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -105.0, "logps_train/rejected": -160.3599853515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20651063323020935, "rewards_train/margins": 5.740947335958481, "rewards_train/rejected": -5.5344367027282715, "step": 862 }, { "epoch": 2.38, "learning_rate": 4.6098958350310236e-08, "loss": 0.0321, "step": 863 }, { "epoch": 2.38, "logps_train/chosen": -68.29707336425781, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -160.0120086669922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27507781982421875, "rewards_train/margins": 5.673250198364258, "rewards_train/rejected": -5.398172378540039, "step": 863 }, { "epoch": 2.38, "learning_rate": 4.5711594164331415e-08, "loss": 0.0274, "step": 864 }, { "epoch": 2.38, "logps_train/chosen": -66.62042236328125, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -158.53158569335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2642759680747986, "rewards_train/margins": 5.644095838069916, "rewards_train/rejected": -5.379819869995117, "step": 864 }, { "epoch": 2.38, "learning_rate": 4.532565424605344e-08, "loss": 0.0228, "step": 865 }, { "epoch": 2.38, "logps_train/chosen": -67.52391052246094, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -160.79478454589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25659385323524475, "rewards_train/margins": 5.708923488855362, "rewards_train/rejected": -5.452329635620117, "step": 865 }, { "epoch": 2.39, "learning_rate": 4.4941142158166155e-08, "loss": 0.0326, "step": 866 }, { "epoch": 2.39, "logps_train/chosen": -70.00852966308594, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -161.1269073486328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1525159776210785, "rewards_train/margins": 5.615011781454086, "rewards_train/rejected": -5.462495803833008, "step": 866 }, { "epoch": 2.39, "learning_rate": 4.455806145017902e-08, "loss": 0.0346, "step": 867 }, { "epoch": 2.39, "logps_train/chosen": -69.13497924804688, "logps_train/ref_chosen": -72.3125, "logps_train/ref_rejected": -111.6875, "logps_train/rejected": -167.89096069335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31697097420692444, "rewards_train/margins": 5.940832763910294, "rewards_train/rejected": -5.623861789703369, "step": 867 }, { "epoch": 2.39, "learning_rate": 4.417641565838812e-08, "loss": 0.0239, "step": 868 }, { "epoch": 2.39, "logps_train/chosen": -69.01119995117188, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -165.671142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2925812900066376, "rewards_train/margins": 6.036355704069138, "rewards_train/rejected": -5.7437744140625, "step": 868 }, { "epoch": 2.39, "learning_rate": 4.3796208305843405e-08, "loss": 0.0245, "step": 869 }, { "epoch": 2.39, "logps_train/chosen": -68.11665344238281, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -157.62841796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22588381171226501, "rewards_train/margins": 5.415972083806992, "rewards_train/rejected": -5.190088272094727, "step": 869 }, { "epoch": 2.4, "learning_rate": 4.341744290231644e-08, "loss": 0.0336, "step": 870 }, { "epoch": 2.4, "logps_train/chosen": -66.62077331542969, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -104.5625, "logps_train/rejected": -156.58477783203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18128180503845215, "rewards_train/margins": 5.385169267654419, "rewards_train/rejected": -5.203887462615967, "step": 870 }, { "epoch": 2.4, "learning_rate": 4.3040122944267796e-08, "loss": 0.0367, "step": 871 }, { "epoch": 2.4, "logps_train/chosen": -68.87993621826172, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -108.875, "logps_train/rejected": -163.5870361328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30619579553604126, "rewards_train/margins": 5.774566233158112, "rewards_train/rejected": -5.46837043762207, "step": 871 }, { "epoch": 2.4, "learning_rate": 4.266425191481493e-08, "loss": 0.0218, "step": 872 }, { "epoch": 2.4, "logps_train/chosen": -66.37095642089844, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -105.9375, "logps_train/rejected": -162.3633270263672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21568778157234192, "rewards_train/margins": 5.858661204576492, "rewards_train/rejected": -5.64297342300415, "step": 872 }, { "epoch": 2.4, "learning_rate": 4.228983328369977e-08, "loss": 0.0252, "step": 873 }, { "epoch": 2.4, "logps_train/chosen": -69.1755599975586, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -106.3125, "logps_train/rejected": -160.260986328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3355693221092224, "rewards_train/margins": 5.728172242641449, "rewards_train/rejected": -5.392602920532227, "step": 873 }, { "epoch": 2.41, "learning_rate": 4.1916870507257005e-08, "loss": 0.0361, "step": 874 }, { "epoch": 2.41, "logps_train/chosen": -68.4031982421875, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -104.875, "logps_train/rejected": -158.344482421875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1877078413963318, "rewards_train/margins": 5.53680557012558, "rewards_train/rejected": -5.349097728729248, "step": 874 }, { "epoch": 2.41, "learning_rate": 4.1545367028382e-08, "loss": 0.0336, "step": 875 }, { "epoch": 2.41, "logps_train/chosen": -67.48086547851562, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -159.1671905517578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27168869972229004, "rewards_train/margins": 5.599931001663208, "rewards_train/rejected": -5.328242301940918, "step": 875 }, { "epoch": 2.41, "learning_rate": 4.117532627649909e-08, "loss": 0.0394, "step": 876 }, { "epoch": 2.41, "logps_train/chosen": -67.48236083984375, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -107.8125, "logps_train/rejected": -164.12181091308594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3284236788749695, "rewards_train/margins": 5.9578897356987, "rewards_train/rejected": -5.6294660568237305, "step": 876 }, { "epoch": 2.42, "learning_rate": 4.080675166752967e-08, "loss": 0.016, "step": 877 }, { "epoch": 2.42, "logps_train/chosen": -67.85269165039062, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -157.38150024414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12547284364700317, "rewards_train/margins": 5.462451279163361, "rewards_train/rejected": -5.336978435516357, "step": 877 }, { "epoch": 2.42, "learning_rate": 4.043964660386125e-08, "loss": 0.0334, "step": 878 }, { "epoch": 2.42, "logps_train/chosen": -64.97225952148438, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -158.1437225341797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33163151144981384, "rewards_train/margins": 5.642341464757919, "rewards_train/rejected": -5.3107099533081055, "step": 878 }, { "epoch": 2.42, "learning_rate": 4.0074014474315266e-08, "loss": 0.0225, "step": 879 }, { "epoch": 2.42, "logps_train/chosen": -65.94577026367188, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -101.5625, "logps_train/rejected": -155.89620971679688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23667266964912415, "rewards_train/margins": 5.668383866548538, "rewards_train/rejected": -5.431711196899414, "step": 879 }, { "epoch": 2.42, "learning_rate": 3.9709858654116466e-08, "loss": 0.0233, "step": 880 }, { "epoch": 2.42, "logps_train/chosen": -70.08140563964844, "logps_train/ref_chosen": -72.25, "logps_train/ref_rejected": -108.0625, "logps_train/rejected": -161.75506591796875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.21627308428287506, "rewards_train/margins": 5.588361039757729, "rewards_train/rejected": -5.3720879554748535, "step": 880 }, { "epoch": 2.43, "learning_rate": 3.934718250486142e-08, "loss": 0.0432, "step": 881 }, { "epoch": 2.43, "logps_train/chosen": -68.12062072753906, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -157.91273498535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28578972816467285, "rewards_train/margins": 5.774230718612671, "rewards_train/rejected": -5.488440990447998, "step": 881 }, { "epoch": 2.43, "learning_rate": 3.898598937448743e-08, "loss": 0.0391, "step": 882 }, { "epoch": 2.43, "logps_train/chosen": -68.11199951171875, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -108.0625, "logps_train/rejected": -163.65113830566406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18543091416358948, "rewards_train/margins": 5.744977384805679, "rewards_train/rejected": -5.55954647064209, "step": 882 }, { "epoch": 2.43, "learning_rate": 3.862628259724192e-08, "loss": 0.0312, "step": 883 }, { "epoch": 2.43, "logps_train/chosen": -66.33497619628906, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -104.0625, "logps_train/rejected": -155.94329833984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.204685777425766, "rewards_train/margins": 5.391789048910141, "rewards_train/rejected": -5.187103271484375, "step": 883 }, { "epoch": 2.44, "learning_rate": 3.8268065493651424e-08, "loss": 0.0343, "step": 884 }, { "epoch": 2.44, "logps_train/chosen": -68.28138732910156, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -161.12290954589844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30052345991134644, "rewards_train/margins": 5.742307364940643, "rewards_train/rejected": -5.441783905029297, "step": 884 }, { "epoch": 2.44, "learning_rate": 3.7911341370491034e-08, "loss": 0.0282, "step": 885 }, { "epoch": 2.44, "logps_train/chosen": -66.68991088867188, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -158.58883666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1430206596851349, "rewards_train/margins": 5.625438719987869, "rewards_train/rejected": -5.482418060302734, "step": 885 }, { "epoch": 2.44, "learning_rate": 3.7556113520753674e-08, "loss": 0.0285, "step": 886 }, { "epoch": 2.44, "logps_train/chosen": -64.60972595214844, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -155.44546508789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3329722285270691, "rewards_train/margins": 5.693436801433563, "rewards_train/rejected": -5.360464572906494, "step": 886 }, { "epoch": 2.44, "learning_rate": 3.7202385223620045e-08, "loss": 0.0221, "step": 887 }, { "epoch": 2.44, "logps_train/chosen": -66.58460998535156, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -109.75, "logps_train/rejected": -167.18902587890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3348005712032318, "rewards_train/margins": 6.082512229681015, "rewards_train/rejected": -5.747711658477783, "step": 887 }, { "epoch": 2.45, "learning_rate": 3.685015974442811e-08, "loss": 0.0147, "step": 888 }, { "epoch": 2.45, "logps_train/chosen": -66.40040588378906, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -99.1875, "logps_train/rejected": -151.26315307617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1423325538635254, "rewards_train/margins": 5.347651958465576, "rewards_train/rejected": -5.205319404602051, "step": 888 }, { "epoch": 2.45, "learning_rate": 3.649944033464307e-08, "loss": 0.0433, "step": 889 }, { "epoch": 2.45, "logps_train/chosen": -68.73011779785156, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -101.1875, "logps_train/rejected": -152.17706298828125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": -0.11231820285320282, "rewards_train/margins": 4.9860523492097855, "rewards_train/rejected": -5.098370552062988, "step": 889 }, { "epoch": 2.45, "learning_rate": 3.6150230231827065e-08, "loss": 0.0773, "step": 890 }, { "epoch": 2.45, "logps_train/chosen": -66.27803039550781, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -157.3416748046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13669857382774353, "rewards_train/margins": 5.518035262823105, "rewards_train/rejected": -5.381336688995361, "step": 890 }, { "epoch": 2.45, "learning_rate": 3.5802532659609864e-08, "loss": 0.0371, "step": 891 }, { "epoch": 2.45, "logps_train/chosen": -65.20626068115234, "logps_train/ref_chosen": -67.0625, "logps_train/ref_rejected": -99.0, "logps_train/rejected": -149.09796142578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18899263441562653, "rewards_train/margins": 5.199177876114845, "rewards_train/rejected": -5.010185241699219, "step": 891 }, { "epoch": 2.46, "learning_rate": 3.5456350827658474e-08, "loss": 0.0378, "step": 892 }, { "epoch": 2.46, "logps_train/chosen": -67.26744079589844, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -161.66732788085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41964295506477356, "rewards_train/margins": 5.885886698961258, "rewards_train/rejected": -5.466243743896484, "step": 892 }, { "epoch": 2.46, "learning_rate": 3.5111687931647985e-08, "loss": 0.0275, "step": 893 }, { "epoch": 2.46, "logps_train/chosen": -66.64231872558594, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -159.32540893554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2274182140827179, "rewards_train/margins": 5.762206822633743, "rewards_train/rejected": -5.534788608551025, "step": 893 }, { "epoch": 2.46, "learning_rate": 3.476854715323161e-08, "loss": 0.0319, "step": 894 }, { "epoch": 2.46, "logps_train/chosen": -66.20166015625, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -156.32711791992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.27534177899360657, "rewards_train/margins": 5.550826340913773, "rewards_train/rejected": -5.275484561920166, "step": 894 }, { "epoch": 2.47, "learning_rate": 3.4426931660012006e-08, "loss": 0.0221, "step": 895 }, { "epoch": 2.47, "logps_train/chosen": -66.94076538085938, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -160.95816040039062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31490784883499146, "rewards_train/margins": 5.845928966999054, "rewards_train/rejected": -5.5310211181640625, "step": 895 }, { "epoch": 2.47, "learning_rate": 3.408684460551121e-08, "loss": 0.0315, "step": 896 }, { "epoch": 2.47, "logps_train/chosen": -66.15296936035156, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -157.52243041992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20960527658462524, "rewards_train/margins": 5.5671215653419495, "rewards_train/rejected": -5.357516288757324, "step": 896 }, { "epoch": 2.47, "learning_rate": 3.374828912914214e-08, "loss": 0.0401, "step": 897 }, { "epoch": 2.47, "logps_train/chosen": -64.65728759765625, "logps_train/ref_chosen": -66.9375, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -158.1363983154297, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.2312926948070526, "rewards_train/margins": 5.68526503443718, "rewards_train/rejected": -5.453972339630127, "step": 897 }, { "epoch": 2.47, "learning_rate": 3.34112683561794e-08, "loss": 0.0385, "step": 898 }, { "epoch": 2.47, "logps_train/chosen": -67.69975280761719, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -159.52828979492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3491167426109314, "rewards_train/margins": 5.773722589015961, "rewards_train/rejected": -5.424605846405029, "step": 898 }, { "epoch": 2.48, "learning_rate": 3.307578539773033e-08, "loss": 0.0249, "step": 899 }, { "epoch": 2.48, "logps_train/chosen": -67.38185119628906, "logps_train/ref_chosen": -70.125, "logps_train/ref_rejected": -101.8125, "logps_train/rejected": -155.73895263671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2731427848339081, "rewards_train/margins": 5.666861981153488, "rewards_train/rejected": -5.39371919631958, "step": 899 }, { "epoch": 2.48, "learning_rate": 3.2741843350706444e-08, "loss": 0.0229, "step": 900 }, { "epoch": 2.48, "logps_train/chosen": -65.38573455810547, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -160.32286071777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4165043234825134, "rewards_train/margins": 5.952695786952972, "rewards_train/rejected": -5.536191463470459, "step": 900 }, { "epoch": 2.48, "learning_rate": 3.2409445297794856e-08, "loss": 0.0183, "step": 901 }, { "epoch": 2.48, "logps_train/chosen": -67.60276794433594, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -105.9375, "logps_train/rejected": -159.60443115234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3031995892524719, "rewards_train/margins": 5.66847687959671, "rewards_train/rejected": -5.365277290344238, "step": 901 }, { "epoch": 2.48, "learning_rate": 3.20785943074297e-08, "loss": 0.0307, "step": 902 }, { "epoch": 2.48, "logps_train/chosen": -67.3558349609375, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -161.13064575195312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2871708869934082, "rewards_train/margins": 5.686856746673584, "rewards_train/rejected": -5.399685859680176, "step": 902 }, { "epoch": 2.49, "learning_rate": 3.174929343376373e-08, "loss": 0.0188, "step": 903 }, { "epoch": 2.49, "logps_train/chosen": -68.52336883544922, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -162.06195068359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.11153022944927216, "rewards_train/margins": 5.6767095774412155, "rewards_train/rejected": -5.565179347991943, "step": 903 }, { "epoch": 2.49, "learning_rate": 3.1421545716640595e-08, "loss": 0.038, "step": 904 }, { "epoch": 2.49, "logps_train/chosen": -67.38668060302734, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -102.1875, "logps_train/rejected": -155.7990264892578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14341214299201965, "rewards_train/margins": 5.505151301622391, "rewards_train/rejected": -5.361739158630371, "step": 904 }, { "epoch": 2.49, "learning_rate": 3.109535418156606e-08, "loss": 0.0313, "step": 905 }, { "epoch": 2.49, "logps_train/chosen": -66.96540832519531, "logps_train/ref_chosen": -67.875, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -158.9186248779297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09217948466539383, "rewards_train/margins": 5.556210704147816, "rewards_train/rejected": -5.464031219482422, "step": 905 }, { "epoch": 2.5, "learning_rate": 3.0770721839680746e-08, "loss": 0.0341, "step": 906 }, { "epoch": 2.5, "logps_train/chosen": -65.26129150390625, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -156.35086059570312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18851979076862335, "rewards_train/margins": 5.478292778134346, "rewards_train/rejected": -5.289772987365723, "step": 906 }, { "epoch": 2.5, "learning_rate": 3.044765168773176e-08, "loss": 0.0541, "step": 907 }, { "epoch": 2.5, "logps_train/chosen": -64.78651428222656, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -105.4375, "logps_train/rejected": -159.40228271484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42960047721862793, "rewards_train/margins": 5.828714609146118, "rewards_train/rejected": -5.39911413192749, "step": 907 }, { "epoch": 2.5, "learning_rate": 3.0126146708045675e-08, "loss": 0.0222, "step": 908 }, { "epoch": 2.5, "logps_train/chosen": -66.48110961914062, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -157.72494506835938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3022313117980957, "rewards_train/margins": 5.6045098304748535, "rewards_train/rejected": -5.302278518676758, "step": 908 }, { "epoch": 2.5, "learning_rate": 2.980620986850029e-08, "loss": 0.0186, "step": 909 }, { "epoch": 2.5, "logps_train/chosen": -67.77129364013672, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -161.72341918945312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3612983524799347, "rewards_train/margins": 5.822312623262405, "rewards_train/rejected": -5.461014270782471, "step": 909 }, { "epoch": 2.51, "learning_rate": 2.9487844122497808e-08, "loss": 0.0172, "step": 910 }, { "epoch": 2.51, "logps_train/chosen": -66.66883850097656, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -102.9375, "logps_train/rejected": -156.6935577392578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10713982582092285, "rewards_train/margins": 5.481671571731567, "rewards_train/rejected": -5.3745317459106445, "step": 910 }, { "epoch": 2.51, "learning_rate": 2.9171052408937313e-08, "loss": 0.0421, "step": 911 }, { "epoch": 2.51, "logps_train/chosen": -66.64204406738281, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -165.11090087890625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.337602823972702, "rewards_train/margins": 6.03375169634819, "rewards_train/rejected": -5.696148872375488, "step": 911 }, { "epoch": 2.51, "learning_rate": 2.8855837652187574e-08, "loss": 0.0176, "step": 912 }, { "epoch": 2.51, "logps_train/chosen": -65.69207000732422, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -161.63528442382812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40828314423561096, "rewards_train/margins": 6.011752516031265, "rewards_train/rejected": -5.603469371795654, "step": 912 }, { "epoch": 2.52, "learning_rate": 2.8542202762060207e-08, "loss": 0.0163, "step": 913 }, { "epoch": 2.52, "logps_train/chosen": -68.29422760009766, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -106.125, "logps_train/rejected": -159.53048706054688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1341513693332672, "rewards_train/margins": 5.472160249948502, "rewards_train/rejected": -5.338008880615234, "step": 913 }, { "epoch": 2.52, "learning_rate": 2.82301506337828e-08, "loss": 0.035, "step": 914 }, { "epoch": 2.52, "logps_train/chosen": -70.25897216796875, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.25, "logps_train/rejected": -156.02833557128906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.1500673145055771, "rewards_train/margins": 5.125275626778603, "rewards_train/rejected": -5.27534294128418, "step": 914 }, { "epoch": 2.52, "learning_rate": 2.7919684147972166e-08, "loss": 0.0467, "step": 915 }, { "epoch": 2.52, "logps_train/chosen": -67.3016586303711, "logps_train/ref_chosen": -69.5625, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -159.79002380371094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2249128222465515, "rewards_train/margins": 5.5765716433525085, "rewards_train/rejected": -5.351658821105957, "step": 915 }, { "epoch": 2.52, "learning_rate": 2.7610806170607493e-08, "loss": 0.0249, "step": 916 }, { "epoch": 2.52, "logps_train/chosen": -63.541343688964844, "logps_train/ref_chosen": -67.25, "logps_train/ref_rejected": -101.3125, "logps_train/rejected": -156.61514282226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37145134806632996, "rewards_train/margins": 5.904351204633713, "rewards_train/rejected": -5.532899856567383, "step": 916 }, { "epoch": 2.53, "learning_rate": 2.730351955300454e-08, "loss": 0.0263, "step": 917 }, { "epoch": 2.53, "logps_train/chosen": -68.00364685058594, "logps_train/ref_chosen": -67.1875, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -156.88677978515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.07980860024690628, "rewards_train/margins": 5.243683449923992, "rewards_train/rejected": -5.323492050170898, "step": 917 }, { "epoch": 2.53, "learning_rate": 2.6997827131788486e-08, "loss": 0.0356, "step": 918 }, { "epoch": 2.53, "logps_train/chosen": -67.15682983398438, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -103.6875, "logps_train/rejected": -155.9263916015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.208096444606781, "rewards_train/margins": 5.433841645717621, "rewards_train/rejected": -5.22574520111084, "step": 918 }, { "epoch": 2.53, "learning_rate": 2.6693731728868464e-08, "loss": 0.0373, "step": 919 }, { "epoch": 2.53, "logps_train/chosen": -66.37696838378906, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -155.85928344726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16674628853797913, "rewards_train/margins": 5.5463763773441315, "rewards_train/rejected": -5.379630088806152, "step": 919 }, { "epoch": 2.53, "learning_rate": 2.6391236151410902e-08, "loss": 0.0323, "step": 920 }, { "epoch": 2.53, "logps_train/chosen": -65.16275787353516, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -155.99484252929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38230764865875244, "rewards_train/margins": 5.558062195777893, "rewards_train/rejected": -5.175754547119141, "step": 920 }, { "epoch": 2.54, "learning_rate": 2.609034319181431e-08, "loss": 0.0201, "step": 921 }, { "epoch": 2.54, "logps_train/chosen": -68.24360656738281, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -112.0, "logps_train/rejected": -167.92588806152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2650429606437683, "rewards_train/margins": 5.854115188121796, "rewards_train/rejected": -5.589072227478027, "step": 921 }, { "epoch": 2.54, "learning_rate": 2.579105562768271e-08, "loss": 0.0335, "step": 922 }, { "epoch": 2.54, "logps_train/chosen": -66.55358123779297, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -158.9915771484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43043333292007446, "rewards_train/margins": 5.8924813866615295, "rewards_train/rejected": -5.462048053741455, "step": 922 }, { "epoch": 2.54, "learning_rate": 2.5493376221800656e-08, "loss": 0.0145, "step": 923 }, { "epoch": 2.54, "logps_train/chosen": -65.18876647949219, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -158.60707092285156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3722856640815735, "rewards_train/margins": 5.845004856586456, "rewards_train/rejected": -5.472719192504883, "step": 923 }, { "epoch": 2.55, "learning_rate": 2.5197307722107396e-08, "loss": 0.0185, "step": 924 }, { "epoch": 2.55, "logps_train/chosen": -66.80719757080078, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -158.781494140625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34701478481292725, "rewards_train/margins": 5.885125041007996, "rewards_train/rejected": -5.538110256195068, "step": 924 }, { "epoch": 2.55, "learning_rate": 2.4902852861671464e-08, "loss": 0.0192, "step": 925 }, { "epoch": 2.55, "logps_train/chosen": -68.286865234375, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -164.04998779296875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19392070174217224, "rewards_train/margins": 6.010833829641342, "rewards_train/rejected": -5.81691312789917, "step": 925 }, { "epoch": 2.55, "learning_rate": 2.4610014358665697e-08, "loss": 0.0187, "step": 926 }, { "epoch": 2.55, "logps_train/chosen": -67.72999572753906, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -163.0071563720703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37265461683273315, "rewards_train/margins": 5.970928370952606, "rewards_train/rejected": -5.598273754119873, "step": 926 }, { "epoch": 2.55, "learning_rate": 2.431879491634199e-08, "loss": 0.0167, "step": 927 }, { "epoch": 2.55, "logps_train/chosen": -66.64955139160156, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -157.69418334960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4352882504463196, "rewards_train/margins": 5.856268227100372, "rewards_train/rejected": -5.420979976654053, "step": 927 }, { "epoch": 2.56, "learning_rate": 2.4029197223006338e-08, "loss": 0.0232, "step": 928 }, { "epoch": 2.56, "logps_train/chosen": -68.13321685791016, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.0, "logps_train/rejected": -155.4695281982422, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0027744509279727936, "rewards_train/margins": 5.246619012206793, "rewards_train/rejected": -5.249393463134766, "step": 928 }, { "epoch": 2.56, "learning_rate": 2.3741223951993983e-08, "loss": 0.0471, "step": 929 }, { "epoch": 2.56, "logps_train/chosen": -65.72279357910156, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -157.5675811767578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31561189889907837, "rewards_train/margins": 5.63941103219986, "rewards_train/rejected": -5.323799133300781, "step": 929 }, { "epoch": 2.56, "learning_rate": 2.3454877761644854e-08, "loss": 0.0191, "step": 930 }, { "epoch": 2.56, "logps_train/chosen": -65.09967803955078, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -99.6875, "logps_train/rejected": -152.4776153564453, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3763605058193207, "rewards_train/margins": 5.652344316244125, "rewards_train/rejected": -5.275983810424805, "step": 930 }, { "epoch": 2.56, "learning_rate": 2.3170161295279e-08, "loss": 0.0194, "step": 931 }, { "epoch": 2.56, "logps_train/chosen": -65.669189453125, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -157.30020141601562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.310522198677063, "rewards_train/margins": 5.654116749763489, "rewards_train/rejected": -5.343594551086426, "step": 931 }, { "epoch": 2.57, "learning_rate": 2.2887077181172132e-08, "loss": 0.0208, "step": 932 }, { "epoch": 2.57, "logps_train/chosen": -63.402427673339844, "logps_train/ref_chosen": -66.0, "logps_train/ref_rejected": -99.875, "logps_train/rejected": -149.414794921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26307791471481323, "rewards_train/margins": 5.218327701091766, "rewards_train/rejected": -4.955249786376953, "step": 932 }, { "epoch": 2.57, "learning_rate": 2.260562803253123e-08, "loss": 0.0405, "step": 933 }, { "epoch": 2.57, "logps_train/chosen": -64.07797241210938, "logps_train/ref_chosen": -65.0625, "logps_train/ref_rejected": -98.375, "logps_train/rejected": -148.80894470214844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09493710100650787, "rewards_train/margins": 5.137160077691078, "rewards_train/rejected": -5.04222297668457, "step": 933 }, { "epoch": 2.57, "learning_rate": 2.2325816447470915e-08, "loss": 0.0459, "step": 934 }, { "epoch": 2.57, "logps_train/chosen": -67.02873992919922, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -101.375, "logps_train/rejected": -157.2313995361328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10244841128587723, "rewards_train/margins": 5.684376187622547, "rewards_train/rejected": -5.58192777633667, "step": 934 }, { "epoch": 2.58, "learning_rate": 2.204764500898877e-08, "loss": 0.0254, "step": 935 }, { "epoch": 2.58, "logps_train/chosen": -66.39512634277344, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -105.8125, "logps_train/rejected": -157.972900390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.40179574489593506, "rewards_train/margins": 5.620569109916687, "rewards_train/rejected": -5.218773365020752, "step": 935 }, { "epoch": 2.58, "learning_rate": 2.177111628494206e-08, "loss": 0.0166, "step": 936 }, { "epoch": 2.58, "logps_train/chosen": -67.72613525390625, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -163.76133728027344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2108820527791977, "rewards_train/margins": 5.997709140181541, "rewards_train/rejected": -5.786827087402344, "step": 936 }, { "epoch": 2.58, "learning_rate": 2.1496232828023775e-08, "loss": 0.0239, "step": 937 }, { "epoch": 2.58, "logps_train/chosen": -66.23889923095703, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -159.5063934326172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3776240050792694, "rewards_train/margins": 5.7037515342235565, "rewards_train/rejected": -5.326127529144287, "step": 937 }, { "epoch": 2.58, "learning_rate": 2.1222997175739054e-08, "loss": 0.0178, "step": 938 }, { "epoch": 2.58, "logps_train/chosen": -66.9578628540039, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -110.1875, "logps_train/rejected": -166.93955993652344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3443993926048279, "rewards_train/margins": 6.020876348018646, "rewards_train/rejected": -5.676476955413818, "step": 938 }, { "epoch": 2.59, "learning_rate": 2.0951411850381873e-08, "loss": 0.0117, "step": 939 }, { "epoch": 2.59, "logps_train/chosen": -65.58403015136719, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -154.7783203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25575754046440125, "rewards_train/margins": 5.476655691862106, "rewards_train/rejected": -5.220898151397705, "step": 939 }, { "epoch": 2.59, "learning_rate": 2.06814793590117e-08, "loss": 0.0297, "step": 940 }, { "epoch": 2.59, "logps_train/chosen": -67.73765563964844, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -156.41065979003906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.0006208382546901703, "rewards_train/margins": 5.2113933600485325, "rewards_train/rejected": -5.212014198303223, "step": 940 }, { "epoch": 2.59, "learning_rate": 2.0413202193430367e-08, "loss": 0.0496, "step": 941 }, { "epoch": 2.59, "logps_train/chosen": -70.766845703125, "logps_train/ref_chosen": -72.1875, "logps_train/ref_rejected": -107.75, "logps_train/rejected": -161.66372680664062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.14113816618919373, "rewards_train/margins": 5.535147279500961, "rewards_train/rejected": -5.394009113311768, "step": 941 }, { "epoch": 2.6, "learning_rate": 2.0146582830158976e-08, "loss": 0.0269, "step": 942 }, { "epoch": 2.6, "logps_train/chosen": -66.67913818359375, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.8125, "logps_train/rejected": -160.298828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36948925256729126, "rewards_train/margins": 5.71646124124527, "rewards_train/rejected": -5.3469719886779785, "step": 942 }, { "epoch": 2.6, "learning_rate": 1.988162373041522e-08, "loss": 0.0208, "step": 943 }, { "epoch": 2.6, "logps_train/chosen": -65.36445617675781, "logps_train/ref_chosen": -66.3125, "logps_train/ref_rejected": -102.5625, "logps_train/rejected": -153.73443603515625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.09680570662021637, "rewards_train/margins": 5.215854778885841, "rewards_train/rejected": -5.119049072265625, "step": 943 }, { "epoch": 2.6, "learning_rate": 1.961832734009048e-08, "loss": 0.035, "step": 944 }, { "epoch": 2.6, "logps_train/chosen": -64.94390869140625, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -157.84844970703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.29774796962738037, "rewards_train/margins": 5.608569502830505, "rewards_train/rejected": -5.310821533203125, "step": 944 }, { "epoch": 2.6, "learning_rate": 1.9356696089727433e-08, "loss": 0.0284, "step": 945 }, { "epoch": 2.6, "logps_train/chosen": -67.39112091064453, "logps_train/ref_chosen": -67.6875, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -156.22610473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.029003415256738663, "rewards_train/margins": 5.379933629184961, "rewards_train/rejected": -5.350930213928223, "step": 945 }, { "epoch": 2.61, "learning_rate": 1.909673239449734e-08, "loss": 0.0311, "step": 946 }, { "epoch": 2.61, "logps_train/chosen": -67.5015640258789, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -164.5861358642578, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28524473309516907, "rewards_train/margins": 6.040244370698929, "rewards_train/rejected": -5.75499963760376, "step": 946 }, { "epoch": 2.61, "learning_rate": 1.8838438654178134e-08, "loss": 0.0225, "step": 947 }, { "epoch": 2.61, "logps_train/chosen": -66.43592071533203, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -104.8125, "logps_train/rejected": -158.1378631591797, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.22813601791858673, "rewards_train/margins": 5.559988930821419, "rewards_train/rejected": -5.331852912902832, "step": 947 }, { "epoch": 2.61, "learning_rate": 1.858181725313186e-08, "loss": 0.0314, "step": 948 }, { "epoch": 2.61, "logps_train/chosen": -67.11196899414062, "logps_train/ref_chosen": -72.1875, "logps_train/ref_rejected": -105.3125, "logps_train/rejected": -163.28184509277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5089690685272217, "rewards_train/margins": 6.307416677474976, "rewards_train/rejected": -5.798447608947754, "step": 948 }, { "epoch": 2.61, "learning_rate": 1.832687056028297e-08, "loss": 0.0054, "step": 949 }, { "epoch": 2.61, "logps_train/chosen": -68.96331024169922, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -107.5, "logps_train/rejected": -161.08213806152344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2077217400074005, "rewards_train/margins": 5.566863566637039, "rewards_train/rejected": -5.359141826629639, "step": 949 }, { "epoch": 2.62, "learning_rate": 1.807360092909631e-08, "loss": 0.0256, "step": 950 }, { "epoch": 2.62, "logps_train/chosen": -70.64167785644531, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -110.0625, "logps_train/rejected": -166.25924682617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.10091976821422577, "rewards_train/margins": 5.721766993403435, "rewards_train/rejected": -5.620847225189209, "step": 950 }, { "epoch": 2.62, "learning_rate": 1.782201069755549e-08, "loss": 0.0404, "step": 951 }, { "epoch": 2.62, "logps_train/chosen": -68.61839294433594, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -108.5, "logps_train/rejected": -162.82861328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2150653898715973, "rewards_train/margins": 5.646558195352554, "rewards_train/rejected": -5.431492805480957, "step": 951 }, { "epoch": 2.62, "learning_rate": 1.7572102188141092e-08, "loss": 0.0347, "step": 952 }, { "epoch": 2.62, "logps_train/chosen": -66.7886962890625, "logps_train/ref_chosen": -69.9375, "logps_train/ref_rejected": -107.875, "logps_train/rejected": -160.56573486328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3174196183681488, "rewards_train/margins": 5.587371438741684, "rewards_train/rejected": -5.269951820373535, "step": 952 }, { "epoch": 2.63, "learning_rate": 1.7323877707809497e-08, "loss": 0.0333, "step": 953 }, { "epoch": 2.63, "logps_train/chosen": -67.3975601196289, "logps_train/ref_chosen": -70.0, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -163.29681396484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25946277379989624, "rewards_train/margins": 5.876058995723724, "rewards_train/rejected": -5.616596221923828, "step": 953 }, { "epoch": 2.63, "learning_rate": 1.7077339547971505e-08, "loss": 0.0216, "step": 954 }, { "epoch": 2.63, "logps_train/chosen": -66.0711669921875, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -156.9792022705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3089480400085449, "rewards_train/margins": 5.624740123748779, "rewards_train/rejected": -5.315792083740234, "step": 954 }, { "epoch": 2.63, "learning_rate": 1.6832489984470977e-08, "loss": 0.0327, "step": 955 }, { "epoch": 2.63, "logps_train/chosen": -68.6166000366211, "logps_train/ref_chosen": -71.1875, "logps_train/ref_rejected": -106.75, "logps_train/rejected": -162.99649047851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25855523347854614, "rewards_train/margins": 5.8847673535346985, "rewards_train/rejected": -5.626212120056152, "step": 955 }, { "epoch": 2.63, "learning_rate": 1.658933127756419e-08, "loss": 0.0209, "step": 956 }, { "epoch": 2.63, "logps_train/chosen": -66.59550476074219, "logps_train/ref_chosen": -68.3125, "logps_train/ref_rejected": -99.875, "logps_train/rejected": -153.30360412597656, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.17223641276359558, "rewards_train/margins": 5.51861235499382, "rewards_train/rejected": -5.346375942230225, "step": 956 }, { "epoch": 2.64, "learning_rate": 1.6347865671898677e-08, "loss": 0.0483, "step": 957 }, { "epoch": 2.64, "logps_train/chosen": -67.30067443847656, "logps_train/ref_chosen": -71.5, "logps_train/ref_rejected": -108.0625, "logps_train/rejected": -166.29574584960938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42217814922332764, "rewards_train/margins": 6.2488240003585815, "rewards_train/rejected": -5.826645851135254, "step": 957 }, { "epoch": 2.64, "learning_rate": 1.6108095396492627e-08, "loss": 0.0094, "step": 958 }, { "epoch": 2.64, "logps_train/chosen": -63.79620361328125, "logps_train/ref_chosen": -66.8125, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -160.5796356201172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3049010932445526, "rewards_train/margins": 5.792650192975998, "rewards_train/rejected": -5.487749099731445, "step": 958 }, { "epoch": 2.64, "learning_rate": 1.5870022664714224e-08, "loss": 0.0378, "step": 959 }, { "epoch": 2.64, "logps_train/chosen": -67.66763305664062, "logps_train/ref_chosen": -70.5625, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -162.58189392089844, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28802138566970825, "rewards_train/margins": 5.849091827869415, "rewards_train/rejected": -5.561070442199707, "step": 959 }, { "epoch": 2.64, "learning_rate": 1.5633649674261483e-08, "loss": 0.022, "step": 960 }, { "epoch": 2.64, "logps_train/chosen": -67.87935638427734, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -157.5625762939453, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.26079481840133667, "rewards_train/margins": 5.5834590792655945, "rewards_train/rejected": -5.322664260864258, "step": 960 }, { "epoch": 2.65, "learning_rate": 1.539897860714152e-08, "loss": 0.0429, "step": 961 }, { "epoch": 2.65, "logps_train/chosen": -68.51922607421875, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -158.18048095703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21199369430541992, "rewards_train/margins": 5.645277500152588, "rewards_train/rejected": -5.433283805847168, "step": 961 }, { "epoch": 2.65, "learning_rate": 1.5166011629650787e-08, "loss": 0.0197, "step": 962 }, { "epoch": 2.65, "logps_train/chosen": -68.522705078125, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -158.91708374023438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1651121824979782, "rewards_train/margins": 5.6766931265592575, "rewards_train/rejected": -5.511580944061279, "step": 962 }, { "epoch": 2.65, "learning_rate": 1.49347508923549e-08, "loss": 0.0283, "step": 963 }, { "epoch": 2.65, "logps_train/chosen": -65.30668640136719, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -160.29412841796875, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.41430145502090454, "rewards_train/margins": 5.732483446598053, "rewards_train/rejected": -5.318181991577148, "step": 963 }, { "epoch": 2.66, "learning_rate": 1.4705198530068863e-08, "loss": 0.041, "step": 964 }, { "epoch": 2.66, "logps_train/chosen": -64.08869171142578, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -103.5625, "logps_train/rejected": -157.97860717773438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4363456666469574, "rewards_train/margins": 5.876101106405258, "rewards_train/rejected": -5.439755439758301, "step": 964 }, { "epoch": 2.66, "learning_rate": 1.4477356661837226e-08, "loss": 0.0223, "step": 965 }, { "epoch": 2.66, "logps_train/chosen": -66.37642669677734, "logps_train/ref_chosen": -67.125, "logps_train/ref_rejected": -106.0, "logps_train/rejected": -158.12942504882812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07451537251472473, "rewards_train/margins": 5.288337737321854, "rewards_train/rejected": -5.213822364807129, "step": 965 }, { "epoch": 2.66, "learning_rate": 1.425122739091471e-08, "loss": 0.0414, "step": 966 }, { "epoch": 2.66, "logps_train/chosen": -66.03927612304688, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -105.75, "logps_train/rejected": -160.48556518554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24958783388137817, "rewards_train/margins": 5.721484005451202, "rewards_train/rejected": -5.471896171569824, "step": 966 }, { "epoch": 2.66, "learning_rate": 1.4026812804746713e-08, "loss": 0.0245, "step": 967 }, { "epoch": 2.66, "logps_train/chosen": -66.02252960205078, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -103.4375, "logps_train/rejected": -158.0391082763672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.43612605333328247, "rewards_train/margins": 5.894822418689728, "rewards_train/rejected": -5.458696365356445, "step": 967 }, { "epoch": 2.67, "learning_rate": 1.3804114974949931e-08, "loss": 0.0211, "step": 968 }, { "epoch": 2.67, "logps_train/chosen": -62.05565643310547, "logps_train/ref_chosen": -65.875, "logps_train/ref_rejected": -100.25, "logps_train/rejected": -153.15841674804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37968820333480835, "rewards_train/margins": 5.672433793544769, "rewards_train/rejected": -5.292745590209961, "step": 968 }, { "epoch": 2.67, "learning_rate": 1.3583135957293413e-08, "loss": 0.0161, "step": 969 }, { "epoch": 2.67, "logps_train/chosen": -65.84346008300781, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -106.4375, "logps_train/rejected": -159.1611328125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.46658167243003845, "rewards_train/margins": 5.736795753240585, "rewards_train/rejected": -5.270214080810547, "step": 969 }, { "epoch": 2.67, "learning_rate": 1.336387779167949e-08, "loss": 0.0178, "step": 970 }, { "epoch": 2.67, "logps_train/chosen": -68.22756958007812, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -110.375, "logps_train/rejected": -164.423095703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1528044044971466, "rewards_train/margins": 5.556637316942215, "rewards_train/rejected": -5.403832912445068, "step": 970 }, { "epoch": 2.67, "learning_rate": 1.3146342502124963e-08, "loss": 0.047, "step": 971 }, { "epoch": 2.67, "logps_train/chosen": -66.01213836669922, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -157.1708221435547, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.349274218082428, "rewards_train/margins": 5.67563396692276, "rewards_train/rejected": -5.326359748840332, "step": 971 }, { "epoch": 2.68, "learning_rate": 1.2930532096742331e-08, "loss": 0.0232, "step": 972 }, { "epoch": 2.68, "logps_train/chosen": -67.2785873413086, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -103.1875, "logps_train/rejected": -156.26412963867188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.31901615858078003, "rewards_train/margins": 5.624726951122284, "rewards_train/rejected": -5.305710792541504, "step": 972 }, { "epoch": 2.68, "learning_rate": 1.2716448567721517e-08, "loss": 0.0257, "step": 973 }, { "epoch": 2.68, "logps_train/chosen": -64.79254913330078, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -102.3125, "logps_train/rejected": -157.568359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.42553040385246277, "rewards_train/margins": 5.94891831278801, "rewards_train/rejected": -5.523387908935547, "step": 973 }, { "epoch": 2.68, "learning_rate": 1.2504093891311086e-08, "loss": 0.0231, "step": 974 }, { "epoch": 2.68, "logps_train/chosen": -67.51920318603516, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -162.36752319335938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4063318371772766, "rewards_train/margins": 6.1091983914375305, "rewards_train/rejected": -5.702866554260254, "step": 974 }, { "epoch": 2.69, "learning_rate": 1.2293470027800345e-08, "loss": 0.0129, "step": 975 }, { "epoch": 2.69, "logps_train/chosen": -67.88999938964844, "logps_train/ref_chosen": -68.625, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -159.38412475585938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07042372226715088, "rewards_train/margins": 5.589744210243225, "rewards_train/rejected": -5.519320487976074, "step": 975 }, { "epoch": 2.69, "learning_rate": 1.2084578921501076e-08, "loss": 0.0438, "step": 976 }, { "epoch": 2.69, "logps_train/chosen": -65.986328125, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -156.7150115966797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30097702145576477, "rewards_train/margins": 5.502799779176712, "rewards_train/rejected": -5.201822757720947, "step": 976 }, { "epoch": 2.69, "learning_rate": 1.1877422500729673e-08, "loss": 0.0303, "step": 977 }, { "epoch": 2.69, "logps_train/chosen": -68.20723724365234, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -155.52252197265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22317251563072205, "rewards_train/margins": 5.3427108228206635, "rewards_train/rejected": -5.119538307189941, "step": 977 }, { "epoch": 2.69, "learning_rate": 1.1672002677789161e-08, "loss": 0.0338, "step": 978 }, { "epoch": 2.69, "logps_train/chosen": -66.48420715332031, "logps_train/ref_chosen": -68.9375, "logps_train/ref_rejected": -104.0, "logps_train/rejected": -159.4398193359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24386411905288696, "rewards_train/margins": 5.787260591983795, "rewards_train/rejected": -5.543396472930908, "step": 978 }, { "epoch": 2.7, "learning_rate": 1.1468321348951814e-08, "loss": 0.0289, "step": 979 }, { "epoch": 2.7, "logps_train/chosen": -65.70230102539062, "logps_train/ref_chosen": -68.6875, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -158.2545166015625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3002777099609375, "rewards_train/margins": 5.764889717102051, "rewards_train/rejected": -5.464612007141113, "step": 979 }, { "epoch": 2.7, "learning_rate": 1.1266380394441522e-08, "loss": 0.0224, "step": 980 }, { "epoch": 2.7, "logps_train/chosen": -66.95060729980469, "logps_train/ref_chosen": -69.5, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -158.42984008789062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25860118865966797, "rewards_train/margins": 5.7171125411987305, "rewards_train/rejected": -5.4585113525390625, "step": 980 }, { "epoch": 2.7, "learning_rate": 1.1066181678416265e-08, "loss": 0.0288, "step": 981 }, { "epoch": 2.7, "logps_train/chosen": -67.72517395019531, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -158.29678344726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20033428072929382, "rewards_train/margins": 5.6143395602703094, "rewards_train/rejected": -5.414005279541016, "step": 981 }, { "epoch": 2.71, "learning_rate": 1.0867727048951203e-08, "loss": 0.0316, "step": 982 }, { "epoch": 2.71, "logps_train/chosen": -67.29257202148438, "logps_train/ref_chosen": -70.875, "logps_train/ref_rejected": -107.75, "logps_train/rejected": -163.58070373535156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3567289710044861, "rewards_train/margins": 5.941460072994232, "rewards_train/rejected": -5.584731101989746, "step": 982 }, { "epoch": 2.71, "learning_rate": 1.0671018338021487e-08, "loss": 0.0149, "step": 983 }, { "epoch": 2.71, "logps_train/chosen": -68.70735931396484, "logps_train/ref_chosen": -71.5625, "logps_train/ref_rejected": -106.6875, "logps_train/rejected": -163.541748046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28468459844589233, "rewards_train/margins": 5.9721121191978455, "rewards_train/rejected": -5.687427520751953, "step": 983 }, { "epoch": 2.71, "learning_rate": 1.0476057361485269e-08, "loss": 0.0258, "step": 984 }, { "epoch": 2.71, "logps_train/chosen": -67.72982025146484, "logps_train/ref_chosen": -69.0, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -159.14039611816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12442944198846817, "rewards_train/margins": 5.5009701028466225, "rewards_train/rejected": -5.376540660858154, "step": 984 }, { "epoch": 2.71, "learning_rate": 1.0282845919066985e-08, "loss": 0.0403, "step": 985 }, { "epoch": 2.71, "logps_train/chosen": -64.50677490234375, "logps_train/ref_chosen": -67.75, "logps_train/ref_rejected": -101.9375, "logps_train/rejected": -155.68896484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32700780034065247, "rewards_train/margins": 5.703228980302811, "rewards_train/rejected": -5.376221179962158, "step": 985 }, { "epoch": 2.72, "learning_rate": 1.0091385794340923e-08, "loss": 0.0176, "step": 986 }, { "epoch": 2.72, "logps_train/chosen": -66.46505737304688, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -158.77633666992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32737061381340027, "rewards_train/margins": 5.831665486097336, "rewards_train/rejected": -5.5042948722839355, "step": 986 }, { "epoch": 2.72, "learning_rate": 9.901678754714393e-09, "loss": 0.0167, "step": 987 }, { "epoch": 2.72, "logps_train/chosen": -67.74182891845703, "logps_train/ref_chosen": -68.75, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -160.14944458007812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1027219146490097, "rewards_train/margins": 5.569326147437096, "rewards_train/rejected": -5.466604232788086, "step": 987 }, { "epoch": 2.72, "learning_rate": 9.71372655141176e-09, "loss": 0.034, "step": 988 }, { "epoch": 2.72, "logps_train/chosen": -66.6724853515625, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -162.09503173828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3097047209739685, "rewards_train/margins": 5.953583657741547, "rewards_train/rejected": -5.643878936767578, "step": 988 }, { "epoch": 2.72, "learning_rate": 9.527530919458083e-09, "loss": 0.0144, "step": 989 }, { "epoch": 2.72, "logps_train/chosen": -65.98908996582031, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -102.5, "logps_train/rejected": -155.7503662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.18175484240055084, "rewards_train/margins": 5.505131497979164, "rewards_train/rejected": -5.323376655578613, "step": 989 }, { "epoch": 2.73, "learning_rate": 9.343093577663208e-09, "loss": 0.0279, "step": 990 }, { "epoch": 2.73, "logps_train/chosen": -64.65237426757812, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -102.4375, "logps_train/rejected": -156.08102416992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3483854830265045, "rewards_train/margins": 5.711859375238419, "rewards_train/rejected": -5.363473892211914, "step": 990 }, { "epoch": 2.73, "learning_rate": 9.160416228605728e-09, "loss": 0.0243, "step": 991 }, { "epoch": 2.73, "logps_train/chosen": -67.72705078125, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -162.22938537597656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2720701992511749, "rewards_train/margins": 6.1013562977313995, "rewards_train/rejected": -5.829286098480225, "step": 991 }, { "epoch": 2.73, "learning_rate": 8.979500558617515e-09, "loss": 0.0298, "step": 992 }, { "epoch": 2.73, "logps_train/chosen": -66.05540466308594, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -159.4921875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3654550313949585, "rewards_train/margins": 5.753980994224548, "rewards_train/rejected": -5.38852596282959, "step": 992 }, { "epoch": 2.74, "learning_rate": 8.800348237767963e-09, "loss": 0.0301, "step": 993 }, { "epoch": 2.74, "logps_train/chosen": -64.54524993896484, "logps_train/ref_chosen": -67.625, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -157.56539916992188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30826812982559204, "rewards_train/margins": 5.689369738101959, "rewards_train/rejected": -5.381101608276367, "step": 993 }, { "epoch": 2.74, "learning_rate": 8.622960919848643e-09, "loss": 0.0276, "step": 994 }, { "epoch": 2.74, "logps_train/chosen": -66.37980651855469, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -111.375, "logps_train/rejected": -169.14056396484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.47408002614974976, "rewards_train/margins": 6.2522963881492615, "rewards_train/rejected": -5.778216361999512, "step": 994 }, { "epoch": 2.74, "learning_rate": 8.44734024235798e-09, "loss": 0.0148, "step": 995 }, { "epoch": 2.74, "logps_train/chosen": -67.22234344482422, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -156.2835693359375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34475818276405334, "rewards_train/margins": 5.596650272607803, "rewards_train/rejected": -5.25189208984375, "step": 995 }, { "epoch": 2.74, "learning_rate": 8.273487826486248e-09, "loss": 0.0356, "step": 996 }, { "epoch": 2.74, "logps_train/chosen": -72.28166198730469, "logps_train/ref_chosen": -71.625, "logps_train/ref_rejected": -109.6875, "logps_train/rejected": -165.7756805419922, "rewards_train/accuracies": 1.0, "rewards_train/chosen": -0.06664297729730606, "rewards_train/margins": 5.5437381491065025, "rewards_train/rejected": -5.610381126403809, "step": 996 }, { "epoch": 2.75, "learning_rate": 8.101405277100548e-09, "loss": 0.0375, "step": 997 }, { "epoch": 2.75, "logps_train/chosen": -67.22645568847656, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -108.25, "logps_train/rejected": -163.69754028320312, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2632427215576172, "rewards_train/margins": 5.805458068847656, "rewards_train/rejected": -5.542215347290039, "step": 997 }, { "epoch": 2.75, "learning_rate": 7.931094182729836e-09, "loss": 0.0211, "step": 998 }, { "epoch": 2.75, "logps_train/chosen": -65.62165832519531, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -160.88937377929688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3220633268356323, "rewards_train/margins": 5.802797198295593, "rewards_train/rejected": -5.480733871459961, "step": 998 }, { "epoch": 2.75, "learning_rate": 7.762556115550634e-09, "loss": 0.0306, "step": 999 }, { "epoch": 2.75, "logps_train/chosen": -64.41771697998047, "logps_train/ref_chosen": -67.4375, "logps_train/ref_rejected": -104.25, "logps_train/rejected": -158.27783203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3031500577926636, "rewards_train/margins": 5.702808260917664, "rewards_train/rejected": -5.399658203125, "step": 999 }, { "epoch": 2.75, "learning_rate": 7.59579263137209e-09, "loss": 0.0192, "step": 1000 }, { "epoch": 2.75, "logps_train/chosen": -69.35594177246094, "logps_train/ref_chosen": -71.9375, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -163.1025390625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2577155828475952, "rewards_train/margins": 6.001073718070984, "rewards_train/rejected": -5.743358135223389, "step": 1000 }, { "epoch": 2.76, "learning_rate": 7.430805269621942e-09, "loss": 0.0202, "step": 1001 }, { "epoch": 2.76, "logps_train/chosen": -66.04875183105469, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -160.9725341796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4012773633003235, "rewards_train/margins": 5.862496197223663, "rewards_train/rejected": -5.46121883392334, "step": 1001 }, { "epoch": 2.76, "learning_rate": 7.267595553332073e-09, "loss": 0.0269, "step": 1002 }, { "epoch": 2.76, "logps_train/chosen": -66.07994842529297, "logps_train/ref_chosen": -69.4375, "logps_train/ref_rejected": -101.25, "logps_train/rejected": -154.02825927734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.33638954162597656, "rewards_train/margins": 5.6122636795043945, "rewards_train/rejected": -5.275874137878418, "step": 1002 }, { "epoch": 2.76, "learning_rate": 7.106164989124708e-09, "loss": 0.0287, "step": 1003 }, { "epoch": 2.76, "logps_train/chosen": -65.5354995727539, "logps_train/ref_chosen": -67.8125, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -157.45428466796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22555187344551086, "rewards_train/margins": 5.5399267971515656, "rewards_train/rejected": -5.314374923706055, "step": 1003 }, { "epoch": 2.77, "learning_rate": 6.946515067198166e-09, "loss": 0.031, "step": 1004 }, { "epoch": 2.77, "logps_train/chosen": -65.39405822753906, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -159.14125061035156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.28300631046295166, "rewards_train/margins": 5.625208020210266, "rewards_train/rejected": -5.3422017097473145, "step": 1004 }, { "epoch": 2.77, "learning_rate": 6.7886472613134515e-09, "loss": 0.0303, "step": 1005 }, { "epoch": 2.77, "logps_train/chosen": -69.4130630493164, "logps_train/ref_chosen": -70.1875, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -162.28411865234375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07754149287939072, "rewards_train/margins": 5.586618088185787, "rewards_train/rejected": -5.5090765953063965, "step": 1005 }, { "epoch": 2.77, "learning_rate": 6.632563028780436e-09, "loss": 0.0232, "step": 1006 }, { "epoch": 2.77, "logps_train/chosen": -65.9798812866211, "logps_train/ref_chosen": -66.75, "logps_train/ref_rejected": -101.4375, "logps_train/rejected": -153.5218963623047, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.07901342213153839, "rewards_train/margins": 5.285207822918892, "rewards_train/rejected": -5.2061944007873535, "step": 1006 }, { "epoch": 2.77, "learning_rate": 6.478263810444473e-09, "loss": 0.0386, "step": 1007 }, { "epoch": 2.77, "logps_train/chosen": -65.0595932006836, "logps_train/ref_chosen": -67.25, "logps_train/ref_rejected": -100.5625, "logps_train/rejected": -152.28184509277344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22177501022815704, "rewards_train/margins": 5.396443322300911, "rewards_train/rejected": -5.174668312072754, "step": 1007 }, { "epoch": 2.78, "learning_rate": 6.32575103067301e-09, "loss": 0.0358, "step": 1008 }, { "epoch": 2.78, "logps_train/chosen": -64.91993713378906, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -107.0, "logps_train/rejected": -159.63778686523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4919912815093994, "rewards_train/margins": 5.757138013839722, "rewards_train/rejected": -5.265146732330322, "step": 1008 }, { "epoch": 2.78, "learning_rate": 6.1750260973426615e-09, "loss": 0.0144, "step": 1009 }, { "epoch": 2.78, "logps_train/chosen": -66.87361145019531, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -102.125, "logps_train/rejected": -156.05799865722656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.15204289555549622, "rewards_train/margins": 5.546808749437332, "rewards_train/rejected": -5.394765853881836, "step": 1009 }, { "epoch": 2.78, "learning_rate": 6.026090401825956e-09, "loss": 0.0352, "step": 1010 }, { "epoch": 2.78, "logps_train/chosen": -66.37083435058594, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -161.05963134765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2966082692146301, "rewards_train/margins": 5.795784294605255, "rewards_train/rejected": -5.499176025390625, "step": 1010 }, { "epoch": 2.79, "learning_rate": 5.87894531897859e-09, "loss": 0.0215, "step": 1011 }, { "epoch": 2.79, "logps_train/chosen": -68.3606948852539, "logps_train/ref_chosen": -71.25, "logps_train/ref_rejected": -106.9375, "logps_train/rejected": -162.72061157226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.291225790977478, "rewards_train/margins": 5.871830344200134, "rewards_train/rejected": -5.580604553222656, "step": 1011 }, { "epoch": 2.79, "learning_rate": 5.733592207126881e-09, "loss": 0.0248, "step": 1012 }, { "epoch": 2.79, "logps_train/chosen": -68.74098205566406, "logps_train/ref_chosen": -71.875, "logps_train/ref_rejected": -106.625, "logps_train/rejected": -162.24990844726562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3120833933353424, "rewards_train/margins": 5.876918941736221, "rewards_train/rejected": -5.564835548400879, "step": 1012 }, { "epoch": 2.79, "learning_rate": 5.590032408054957e-09, "loss": 0.029, "step": 1013 }, { "epoch": 2.79, "logps_train/chosen": -68.06318664550781, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -160.1543426513672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1139935553073883, "rewards_train/margins": 5.50144949555397, "rewards_train/rejected": -5.387455940246582, "step": 1013 }, { "epoch": 2.79, "learning_rate": 5.448267246992588e-09, "loss": 0.0328, "step": 1014 }, { "epoch": 2.79, "logps_train/chosen": -69.82548522949219, "logps_train/ref_chosen": -72.5, "logps_train/ref_rejected": -110.875, "logps_train/rejected": -166.0260009765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2665727734565735, "rewards_train/margins": 5.778841316699982, "rewards_train/rejected": -5.512268543243408, "step": 1014 }, { "epoch": 2.8, "learning_rate": 5.308298032602798e-09, "loss": 0.0277, "step": 1015 }, { "epoch": 2.8, "logps_train/chosen": -65.53713989257812, "logps_train/ref_chosen": -69.125, "logps_train/ref_rejected": -102.1875, "logps_train/rejected": -153.14483642578125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36142271757125854, "rewards_train/margins": 5.457254111766815, "rewards_train/rejected": -5.095831394195557, "step": 1015 }, { "epoch": 2.8, "learning_rate": 5.170126056970003e-09, "loss": 0.039, "step": 1016 }, { "epoch": 2.8, "logps_train/chosen": -67.48880767822266, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -157.65301513671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.06396063417196274, "rewards_train/margins": 5.354458414018154, "rewards_train/rejected": -5.290497779846191, "step": 1016 }, { "epoch": 2.8, "learning_rate": 5.033752595587782e-09, "loss": 0.0404, "step": 1017 }, { "epoch": 2.8, "logps_train/chosen": -66.76911163330078, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -104.75, "logps_train/rejected": -158.7862548828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.12919208407402039, "rewards_train/margins": 5.529058665037155, "rewards_train/rejected": -5.399866580963135, "step": 1017 }, { "epoch": 2.8, "learning_rate": 4.8991789073473454e-09, "loss": 0.039, "step": 1018 }, { "epoch": 2.8, "logps_train/chosen": -66.20808410644531, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -161.91355895996094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23207230865955353, "rewards_train/margins": 5.771769002079964, "rewards_train/rejected": -5.53969669342041, "step": 1018 }, { "epoch": 2.81, "learning_rate": 4.766406234525844e-09, "loss": 0.024, "step": 1019 }, { "epoch": 2.81, "logps_train/chosen": -67.0869140625, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -103.9375, "logps_train/rejected": -157.76724243164062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3545405864715576, "rewards_train/margins": 5.733900785446167, "rewards_train/rejected": -5.379360198974609, "step": 1019 }, { "epoch": 2.81, "learning_rate": 4.635435802774879e-09, "loss": 0.0212, "step": 1020 }, { "epoch": 2.81, "logps_train/chosen": -65.53138732910156, "logps_train/ref_chosen": -68.0625, "logps_train/ref_rejected": -102.6875, "logps_train/rejected": -156.45751953125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.25159773230552673, "rewards_train/margins": 5.628795474767685, "rewards_train/rejected": -5.377197742462158, "step": 1020 }, { "epoch": 2.81, "learning_rate": 4.506268821109116e-09, "loss": 0.0154, "step": 1021 }, { "epoch": 2.81, "logps_train/chosen": -68.42332458496094, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -103.625, "logps_train/rejected": -157.4024658203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20283368229866028, "rewards_train/margins": 5.579408198595047, "rewards_train/rejected": -5.376574516296387, "step": 1021 }, { "epoch": 2.82, "learning_rate": 4.378906481895339e-09, "loss": 0.0462, "step": 1022 }, { "epoch": 2.82, "logps_train/chosen": -65.69317626953125, "logps_train/ref_chosen": -70.4375, "logps_train/ref_rejected": -108.0, "logps_train/rejected": -164.07110595703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.472772479057312, "rewards_train/margins": 6.077538371086121, "rewards_train/rejected": -5.604765892028809, "step": 1022 }, { "epoch": 2.82, "learning_rate": 4.253349960841235e-09, "loss": 0.0142, "step": 1023 }, { "epoch": 2.82, "logps_train/chosen": -64.75332641601562, "logps_train/ref_chosen": -68.1875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -159.77423095703125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3401457667350769, "rewards_train/margins": 5.9376861453056335, "rewards_train/rejected": -5.597540378570557, "step": 1023 }, { "epoch": 2.82, "learning_rate": 4.129600416984558e-09, "loss": 0.0197, "step": 1024 }, { "epoch": 2.82, "logps_train/chosen": -65.35295104980469, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -105.9375, "logps_train/rejected": -159.25001525878906, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3072826564311981, "rewards_train/margins": 5.640926092863083, "rewards_train/rejected": -5.333643436431885, "step": 1024 }, { "epoch": 2.82, "learning_rate": 4.00765899268265e-09, "loss": 0.0403, "step": 1025 }, { "epoch": 2.82, "logps_train/chosen": -67.77906799316406, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -108.125, "logps_train/rejected": -163.9713897705078, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3517806828022003, "rewards_train/margins": 5.936517030000687, "rewards_train/rejected": -5.584736347198486, "step": 1025 }, { "epoch": 2.83, "learning_rate": 3.88752681360156e-09, "loss": 0.0196, "step": 1026 }, { "epoch": 2.83, "logps_train/chosen": -65.76188659667969, "logps_train/ref_chosen": -69.6875, "logps_train/ref_rejected": -106.5, "logps_train/rejected": -161.30514526367188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.39280596375465393, "rewards_train/margins": 5.875664621591568, "rewards_train/rejected": -5.482858657836914, "step": 1026 }, { "epoch": 2.83, "learning_rate": 3.769204988705965e-09, "loss": 0.0244, "step": 1027 }, { "epoch": 2.83, "logps_train/chosen": -66.12217712402344, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -104.125, "logps_train/rejected": -158.49525451660156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23363128304481506, "rewards_train/margins": 5.670462757349014, "rewards_train/rejected": -5.436831474304199, "step": 1027 }, { "epoch": 2.83, "learning_rate": 3.652694610248641e-09, "loss": 0.0195, "step": 1028 }, { "epoch": 2.83, "logps_train/chosen": -71.14165496826172, "logps_train/ref_chosen": -72.25, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -164.03790283203125, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.11381334066390991, "rewards_train/margins": 5.798364579677582, "rewards_train/rejected": -5.684551239013672, "step": 1028 }, { "epoch": 2.83, "learning_rate": 3.5379967537607413e-09, "loss": 0.0345, "step": 1029 }, { "epoch": 2.83, "logps_train/chosen": -68.50759887695312, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -108.75, "logps_train/rejected": -164.90170288085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24670083820819855, "rewards_train/margins": 5.861089780926704, "rewards_train/rejected": -5.614388942718506, "step": 1029 }, { "epoch": 2.84, "learning_rate": 3.4251124780414475e-09, "loss": 0.0237, "step": 1030 }, { "epoch": 2.84, "logps_train/chosen": -66.07243347167969, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -106.0625, "logps_train/rejected": -160.98934936523438, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.30022764205932617, "rewards_train/margins": 5.7911553382873535, "rewards_train/rejected": -5.490927696228027, "step": 1030 }, { "epoch": 2.84, "learning_rate": 3.314042825148533e-09, "loss": 0.0229, "step": 1031 }, { "epoch": 2.84, "logps_train/chosen": -69.19380187988281, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -110.0625, "logps_train/rejected": -167.20962524414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.21548277139663696, "rewards_train/margins": 5.92916876077652, "rewards_train/rejected": -5.713685989379883, "step": 1031 }, { "epoch": 2.84, "learning_rate": 3.204788820388593e-09, "loss": 0.0157, "step": 1032 }, { "epoch": 2.84, "logps_train/chosen": -67.21359252929688, "logps_train/ref_chosen": -68.25, "logps_train/ref_rejected": -104.375, "logps_train/rejected": -158.0453643798828, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.10559362173080444, "rewards_train/margins": 5.476145684719086, "rewards_train/rejected": -5.370552062988281, "step": 1032 }, { "epoch": 2.85, "learning_rate": 3.0973514723076077e-09, "loss": 0.0434, "step": 1033 }, { "epoch": 2.85, "logps_train/chosen": -65.68002319335938, "logps_train/ref_chosen": -68.0, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -159.06689453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2334621548652649, "rewards_train/margins": 5.627993881702423, "rewards_train/rejected": -5.394531726837158, "step": 1033 }, { "epoch": 2.85, "learning_rate": 2.991731772681594e-09, "loss": 0.0243, "step": 1034 }, { "epoch": 2.85, "logps_train/chosen": -65.43231964111328, "logps_train/ref_chosen": -69.0625, "logps_train/ref_rejected": -102.75, "logps_train/rejected": -156.43397521972656, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.36360403895378113, "rewards_train/margins": 5.729170769453049, "rewards_train/rejected": -5.365566730499268, "step": 1034 }, { "epoch": 2.85, "learning_rate": 2.8879306965075233e-09, "loss": 0.021, "step": 1035 }, { "epoch": 2.85, "logps_train/chosen": -67.49330139160156, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -157.3006591796875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.22801393270492554, "rewards_train/margins": 5.406128346920013, "rewards_train/rejected": -5.178114414215088, "step": 1035 }, { "epoch": 2.85, "learning_rate": 2.7859492019942866e-09, "loss": 0.0463, "step": 1036 }, { "epoch": 2.85, "logps_train/chosen": -66.43163299560547, "logps_train/ref_chosen": -71.4375, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -158.079833984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.5023443698883057, "rewards_train/margins": 5.799975156784058, "rewards_train/rejected": -5.297630786895752, "step": 1036 }, { "epoch": 2.86, "learning_rate": 2.6857882305538316e-09, "loss": 0.0282, "step": 1037 }, { "epoch": 2.86, "logps_train/chosen": -68.30293273925781, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -108.5625, "logps_train/rejected": -163.56814575195312, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.24485307931900024, "rewards_train/margins": 5.742828905582428, "rewards_train/rejected": -5.497975826263428, "step": 1037 }, { "epoch": 2.86, "learning_rate": 2.5874487067924612e-09, "loss": 0.0408, "step": 1038 }, { "epoch": 2.86, "logps_train/chosen": -70.20140075683594, "logps_train/ref_chosen": -72.0, "logps_train/ref_rejected": -108.375, "logps_train/rejected": -162.54898071289062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.17966493964195251, "rewards_train/margins": 5.598480552434921, "rewards_train/rejected": -5.418815612792969, "step": 1038 }, { "epoch": 2.86, "learning_rate": 2.490931538502372e-09, "loss": 0.0346, "step": 1039 }, { "epoch": 2.86, "logps_train/chosen": -67.66616821289062, "logps_train/ref_chosen": -70.5, "logps_train/ref_rejected": -104.625, "logps_train/rejected": -159.050048828125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2831394374370575, "rewards_train/margins": 5.727695196866989, "rewards_train/rejected": -5.444555759429932, "step": 1039 }, { "epoch": 2.87, "learning_rate": 2.396237616653196e-09, "loss": 0.0246, "step": 1040 }, { "epoch": 2.87, "logps_train/chosen": -63.951290130615234, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -159.1787109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.41883575916290283, "rewards_train/margins": 5.958875298500061, "rewards_train/rejected": -5.540039539337158, "step": 1040 }, { "epoch": 2.87, "learning_rate": 2.3033678153837832e-09, "loss": 0.0181, "step": 1041 }, { "epoch": 2.87, "logps_train/chosen": -66.80366516113281, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -160.47720336914062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2968311905860901, "rewards_train/margins": 5.838399946689606, "rewards_train/rejected": -5.541568756103516, "step": 1041 }, { "epoch": 2.87, "learning_rate": 2.2123229919942534e-09, "loss": 0.0221, "step": 1042 }, { "epoch": 2.87, "logps_train/chosen": -67.03445434570312, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -158.81118774414062, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2808317542076111, "rewards_train/margins": 5.62288898229599, "rewards_train/rejected": -5.342057228088379, "step": 1042 }, { "epoch": 2.87, "learning_rate": 2.123103986937869e-09, "loss": 0.037, "step": 1043 }, { "epoch": 2.87, "logps_train/chosen": -65.30138397216797, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -159.37176513671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2827523648738861, "rewards_train/margins": 5.682332366704941, "rewards_train/rejected": -5.399580001831055, "step": 1043 }, { "epoch": 2.88, "learning_rate": 2.035711623813463e-09, "loss": 0.0225, "step": 1044 }, { "epoch": 2.88, "logps_train/chosen": -65.8865966796875, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -105.875, "logps_train/rejected": -159.1226043701172, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2621219754219055, "rewards_train/margins": 5.585711300373077, "rewards_train/rejected": -5.323589324951172, "step": 1044 }, { "epoch": 2.88, "learning_rate": 1.9501467093576917e-09, "loss": 0.0247, "step": 1045 }, { "epoch": 2.88, "logps_train/chosen": -65.25044250488281, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.625, "logps_train/rejected": -161.9442138671875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4501018226146698, "rewards_train/margins": 6.085147827863693, "rewards_train/rejected": -5.635046005249023, "step": 1045 }, { "epoch": 2.88, "learning_rate": 1.866410033437793e-09, "loss": 0.0157, "step": 1046 }, { "epoch": 2.88, "logps_train/chosen": -68.27830505371094, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -157.29505920410156, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19951283931732178, "rewards_train/margins": 5.542251944541931, "rewards_train/rejected": -5.342739105224609, "step": 1046 }, { "epoch": 2.88, "learning_rate": 1.7845023690439943e-09, "loss": 0.0303, "step": 1047 }, { "epoch": 2.88, "logps_train/chosen": -69.25877380371094, "logps_train/ref_chosen": -72.5625, "logps_train/ref_rejected": -110.125, "logps_train/rejected": -167.5377197265625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.32846900820732117, "rewards_train/margins": 6.072280615568161, "rewards_train/rejected": -5.74381160736084, "step": 1047 }, { "epoch": 2.89, "learning_rate": 1.7044244722826507e-09, "loss": 0.0141, "step": 1048 }, { "epoch": 2.89, "logps_train/chosen": -67.42340087890625, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -105.3125, "logps_train/rejected": -162.70916748046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2955504059791565, "rewards_train/margins": 6.035022795200348, "rewards_train/rejected": -5.739472389221191, "step": 1048 }, { "epoch": 2.89, "learning_rate": 1.6261770823691622e-09, "loss": 0.0195, "step": 1049 }, { "epoch": 2.89, "logps_train/chosen": -66.29232025146484, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -106.25, "logps_train/rejected": -161.75735473632812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4077795743942261, "rewards_train/margins": 5.959737181663513, "rewards_train/rejected": -5.551957607269287, "step": 1049 }, { "epoch": 2.89, "learning_rate": 1.5497609216211348e-09, "loss": 0.0177, "step": 1050 }, { "epoch": 2.89, "logps_train/chosen": -66.96209716796875, "logps_train/ref_chosen": -70.9375, "logps_train/ref_rejected": -106.375, "logps_train/rejected": -160.64820861816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.397197961807251, "rewards_train/margins": 5.825886011123657, "rewards_train/rejected": -5.428688049316406, "step": 1050 }, { "epoch": 2.9, "learning_rate": 1.4751766954516742e-09, "loss": 0.024, "step": 1051 }, { "epoch": 2.9, "logps_train/chosen": -68.46621704101562, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -108.1875, "logps_train/rejected": -165.1266326904297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2828211784362793, "rewards_train/margins": 5.978882789611816, "rewards_train/rejected": -5.696061611175537, "step": 1051 }, { "epoch": 2.9, "learning_rate": 1.4024250923629687e-09, "loss": 0.0203, "step": 1052 }, { "epoch": 2.9, "logps_train/chosen": -65.94908142089844, "logps_train/ref_chosen": -69.1875, "logps_train/ref_rejected": -103.5, "logps_train/rejected": -156.06031799316406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3263804316520691, "rewards_train/margins": 5.582168281078339, "rewards_train/rejected": -5.2557878494262695, "step": 1052 }, { "epoch": 2.9, "learning_rate": 1.3315067839398952e-09, "loss": 0.0244, "step": 1053 }, { "epoch": 2.9, "logps_train/chosen": -66.15184020996094, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -154.923583984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2723645567893982, "rewards_train/margins": 5.384742200374603, "rewards_train/rejected": -5.112377643585205, "step": 1053 }, { "epoch": 2.9, "learning_rate": 1.2624224248438008e-09, "loss": 0.0244, "step": 1054 }, { "epoch": 2.9, "logps_train/chosen": -69.36329650878906, "logps_train/ref_chosen": -71.3125, "logps_train/ref_rejected": -105.5, "logps_train/rejected": -159.44113159179688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.19545669853687286, "rewards_train/margins": 5.588398173451424, "rewards_train/rejected": -5.392941474914551, "step": 1054 }, { "epoch": 2.91, "learning_rate": 1.1951726528065088e-09, "loss": 0.0347, "step": 1055 }, { "epoch": 2.91, "logps_train/chosen": -67.33541107177734, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -105.6875, "logps_train/rejected": -161.0606689453125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2538124918937683, "rewards_train/margins": 5.790251433849335, "rewards_train/rejected": -5.536438941955566, "step": 1055 }, { "epoch": 2.91, "learning_rate": 1.129758088624322e-09, "loss": 0.032, "step": 1056 }, { "epoch": 2.91, "logps_train/chosen": -67.12294006347656, "logps_train/ref_chosen": -71.125, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -161.15061950683594, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3971787393093109, "rewards_train/margins": 5.9541846215724945, "rewards_train/rejected": -5.557005882263184, "step": 1056 }, { "epoch": 2.91, "learning_rate": 1.0661793361524063e-09, "loss": 0.0224, "step": 1057 }, { "epoch": 2.91, "logps_train/chosen": -68.76046752929688, "logps_train/ref_chosen": -69.75, "logps_train/ref_rejected": -105.125, "logps_train/rejected": -157.646240234375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.09592632204294205, "rewards_train/margins": 5.346047438681126, "rewards_train/rejected": -5.250121116638184, "step": 1057 }, { "epoch": 2.91, "learning_rate": 1.0044369822991728e-09, "loss": 0.0577, "step": 1058 }, { "epoch": 2.91, "logps_train/chosen": -68.08665466308594, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -105.5625, "logps_train/rejected": -160.27383422851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.26941096782684326, "rewards_train/margins": 5.739469885826111, "rewards_train/rejected": -5.470058917999268, "step": 1058 }, { "epoch": 2.92, "learning_rate": 9.445315970209255e-10, "loss": 0.0232, "step": 1059 }, { "epoch": 2.92, "logps_train/chosen": -66.83868408203125, "logps_train/ref_chosen": -69.25, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -157.00828552246094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24127799272537231, "rewards_train/margins": 5.561052143573761, "rewards_train/rejected": -5.319774150848389, "step": 1059 }, { "epoch": 2.92, "learning_rate": 8.864637333164005e-10, "loss": 0.0314, "step": 1060 }, { "epoch": 2.92, "logps_train/chosen": -66.75032043457031, "logps_train/ref_chosen": -70.0625, "logps_train/ref_rejected": -107.125, "logps_train/rejected": -160.6194610595703, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3331223726272583, "rewards_train/margins": 5.681055188179016, "rewards_train/rejected": -5.347932815551758, "step": 1060 }, { "epoch": 2.92, "learning_rate": 8.302339272219017e-10, "loss": 0.0289, "step": 1061 }, { "epoch": 2.92, "logps_train/chosen": -68.88334655761719, "logps_train/ref_chosen": -70.3125, "logps_train/ref_rejected": -109.4375, "logps_train/rejected": -166.00900268554688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.13998563587665558, "rewards_train/margins": 5.800700262188911, "rewards_train/rejected": -5.660714626312256, "step": 1061 }, { "epoch": 2.93, "learning_rate": 7.758426978062394e-10, "loss": 0.0257, "step": 1062 }, { "epoch": 2.93, "logps_train/chosen": -67.68728637695312, "logps_train/ref_chosen": -70.25, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -159.71482849121094, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2592008411884308, "rewards_train/margins": 5.848798543214798, "rewards_train/rejected": -5.589597702026367, "step": 1062 }, { "epoch": 2.93, "learning_rate": 7.232905471659334e-10, "loss": 0.0271, "step": 1063 }, { "epoch": 2.93, "logps_train/chosen": -66.87930297851562, "logps_train/ref_chosen": -70.6875, "logps_train/ref_rejected": -107.5625, "logps_train/rejected": -164.74795532226562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.37935546040534973, "rewards_train/margins": 6.0962410271167755, "rewards_train/rejected": -5.716885566711426, "step": 1063 }, { "epoch": 2.93, "learning_rate": 6.725779604205728e-10, "loss": 0.0148, "step": 1064 }, { "epoch": 2.93, "logps_train/chosen": -66.20487976074219, "logps_train/ref_chosen": -68.875, "logps_train/ref_rejected": -100.5, "logps_train/rejected": -154.2837677001953, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2633987069129944, "rewards_train/margins": 5.6417747139930725, "rewards_train/rejected": -5.378376007080078, "step": 1064 }, { "epoch": 2.93, "learning_rate": 6.237054057083746e-10, "loss": 0.0185, "step": 1065 }, { "epoch": 2.93, "logps_train/chosen": -66.80791473388672, "logps_train/ref_chosen": -68.375, "logps_train/ref_rejected": -101.875, "logps_train/rejected": -154.04440307617188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1590522974729538, "rewards_train/margins": 5.374234601855278, "rewards_train/rejected": -5.215182304382324, "step": 1065 }, { "epoch": 2.94, "learning_rate": 5.766733341818542e-10, "loss": 0.0419, "step": 1066 }, { "epoch": 2.94, "logps_train/chosen": -65.75324249267578, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -103.8125, "logps_train/rejected": -159.25758361816406, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4105156362056732, "rewards_train/margins": 5.955072730779648, "rewards_train/rejected": -5.544557094573975, "step": 1066 }, { "epoch": 2.94, "learning_rate": 5.314821800036284e-10, "loss": 0.0204, "step": 1067 }, { "epoch": 2.94, "logps_train/chosen": -67.14183044433594, "logps_train/ref_chosen": -70.625, "logps_train/ref_rejected": -105.375, "logps_train/rejected": -161.542724609375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.34514302015304565, "rewards_train/margins": 5.962404906749725, "rewards_train/rejected": -5.61726188659668, "step": 1067 }, { "epoch": 2.94, "learning_rate": 4.881323603424636e-10, "loss": 0.0139, "step": 1068 }, { "epoch": 2.94, "logps_train/chosen": -67.1773681640625, "logps_train/ref_chosen": -69.875, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -158.3732147216797, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2663453221321106, "rewards_train/margins": 5.713724434375763, "rewards_train/rejected": -5.447379112243652, "step": 1068 }, { "epoch": 2.94, "learning_rate": 4.466242753693672e-10, "loss": 0.0221, "step": 1069 }, { "epoch": 2.94, "logps_train/chosen": -67.81536865234375, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -103.75, "logps_train/rejected": -158.9730987548828, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.2554258108139038, "rewards_train/margins": 5.78203284740448, "rewards_train/rejected": -5.526607036590576, "step": 1069 }, { "epoch": 2.95, "learning_rate": 4.069583082539463e-10, "loss": 0.0194, "step": 1070 }, { "epoch": 2.95, "logps_train/chosen": -68.37708282470703, "logps_train/ref_chosen": -72.25, "logps_train/ref_rejected": -111.0625, "logps_train/rejected": -166.9571533203125, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.38572919368743896, "rewards_train/margins": 5.976854681968689, "rewards_train/rejected": -5.59112548828125, "step": 1070 }, { "epoch": 2.95, "learning_rate": 3.6913482516078844e-10, "loss": 0.0214, "step": 1071 }, { "epoch": 2.95, "logps_train/chosen": -66.10365295410156, "logps_train/ref_chosen": -69.8125, "logps_train/ref_rejected": -103.875, "logps_train/rejected": -160.59124755859375, "rewards_train/accuracies": 0.984375, "rewards_train/chosen": 0.37015172839164734, "rewards_train/margins": 6.041579931974411, "rewards_train/rejected": -5.671428203582764, "step": 1071 }, { "epoch": 2.95, "learning_rate": 3.331541752461975e-10, "loss": 0.029, "step": 1072 }, { "epoch": 2.95, "logps_train/chosen": -65.5573501586914, "logps_train/ref_chosen": -68.8125, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -158.50717163085938, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3251243233680725, "rewards_train/margins": 5.753917753696442, "rewards_train/rejected": -5.428793430328369, "step": 1072 }, { "epoch": 2.96, "learning_rate": 2.9901669065486303e-10, "loss": 0.0237, "step": 1073 }, { "epoch": 2.96, "logps_train/chosen": -65.62014770507812, "logps_train/ref_chosen": -68.125, "logps_train/ref_rejected": -101.125, "logps_train/rejected": -152.81936645507812, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.24945932626724243, "rewards_train/margins": 5.419531881809235, "rewards_train/rejected": -5.170072555541992, "step": 1073 }, { "epoch": 2.96, "learning_rate": 2.6672268651686256e-10, "loss": 0.0249, "step": 1074 }, { "epoch": 2.96, "logps_train/chosen": -65.84650421142578, "logps_train/ref_chosen": -67.9375, "logps_train/ref_rejected": -103.0625, "logps_train/rejected": -156.275146484375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.20660920441150665, "rewards_train/margins": 5.5259213000535965, "rewards_train/rejected": -5.31931209564209, "step": 1074 }, { "epoch": 2.96, "learning_rate": 2.3627246094473084e-10, "loss": 0.0255, "step": 1075 }, { "epoch": 2.96, "logps_train/chosen": -64.96188354492188, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.375, "logps_train/rejected": -158.02383422851562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.35434919595718384, "rewards_train/margins": 5.916694700717926, "rewards_train/rejected": -5.562345504760742, "step": 1075 }, { "epoch": 2.96, "learning_rate": 2.0766629503070622e-10, "loss": 0.0197, "step": 1076 }, { "epoch": 2.96, "logps_train/chosen": -69.35052490234375, "logps_train/ref_chosen": -71.0625, "logps_train/ref_rejected": -107.1875, "logps_train/rejected": -163.83297729492188, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.16948775947093964, "rewards_train/margins": 5.832179829478264, "rewards_train/rejected": -5.662692070007324, "step": 1076 }, { "epoch": 2.97, "learning_rate": 1.809044528441328e-10, "loss": 0.0224, "step": 1077 }, { "epoch": 2.97, "logps_train/chosen": -67.32605743408203, "logps_train/ref_chosen": -71.0, "logps_train/ref_rejected": -105.25, "logps_train/rejected": -160.6119842529297, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3687610328197479, "rewards_train/margins": 5.902030438184738, "rewards_train/rejected": -5.53326940536499, "step": 1077 }, { "epoch": 2.97, "learning_rate": 1.5598718142901812e-10, "loss": 0.0202, "step": 1078 }, { "epoch": 2.97, "logps_train/chosen": -67.9366455078125, "logps_train/ref_chosen": -71.375, "logps_train/ref_rejected": -104.3125, "logps_train/rejected": -159.32510375976562, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3405146300792694, "rewards_train/margins": 5.842360943555832, "rewards_train/rejected": -5.5018463134765625, "step": 1078 }, { "epoch": 2.97, "learning_rate": 1.3291471080176807e-10, "loss": 0.016, "step": 1079 }, { "epoch": 2.97, "logps_train/chosen": -67.74932861328125, "logps_train/ref_chosen": -71.875, "logps_train/ref_rejected": -105.0625, "logps_train/rejected": -161.033935546875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.4151546359062195, "rewards_train/margins": 6.0136653780937195, "rewards_train/rejected": -5.5985107421875, "step": 1079 }, { "epoch": 2.98, "learning_rate": 1.1168725394907763e-10, "loss": 0.0194, "step": 1080 }, { "epoch": 2.98, "logps_train/chosen": -66.41375732421875, "logps_train/ref_chosen": -70.375, "logps_train/ref_rejected": -107.8125, "logps_train/rejected": -161.66552734375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3930968642234802, "rewards_train/margins": 5.779572904109955, "rewards_train/rejected": -5.386476039886475, "step": 1080 }, { "epoch": 2.98, "learning_rate": 9.230500682591014e-11, "loss": 0.0224, "step": 1081 }, { "epoch": 2.98, "logps_train/chosen": -67.20576477050781, "logps_train/ref_chosen": -70.75, "logps_train/ref_rejected": -104.1875, "logps_train/rejected": -158.3662109375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3520306348800659, "rewards_train/margins": 5.767753958702087, "rewards_train/rejected": -5.4157233238220215, "step": 1081 }, { "epoch": 2.98, "learning_rate": 7.476814835374323e-11, "loss": 0.0299, "step": 1082 }, { "epoch": 2.98, "logps_train/chosen": -67.21371459960938, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -104.9375, "logps_train/rejected": -161.53379821777344, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.23976166546344757, "rewards_train/margins": 5.897341087460518, "rewards_train/rejected": -5.65757942199707, "step": 1082 }, { "epoch": 2.98, "learning_rate": 5.907684041885907e-11, "loss": 0.0194, "step": 1083 }, { "epoch": 2.98, "logps_train/chosen": -68.00108337402344, "logps_train/ref_chosen": -69.3125, "logps_train/ref_rejected": -101.625, "logps_train/rejected": -154.83306884765625, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.1306048035621643, "rewards_train/margins": 5.452046453952789, "rewards_train/rejected": -5.321441650390625, "step": 1083 }, { "epoch": 2.99, "learning_rate": 4.523122787096767e-11, "loss": 0.0357, "step": 1084 }, { "epoch": 2.99, "logps_train/chosen": -68.83242797851562, "logps_train/ref_chosen": -69.625, "logps_train/ref_rejected": -104.6875, "logps_train/rejected": -159.0712432861328, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.0814545676112175, "rewards_train/margins": 5.5192926451563835, "rewards_train/rejected": -5.437838077545166, "step": 1084 }, { "epoch": 2.99, "learning_rate": 3.323143852171917e-11, "loss": 0.0374, "step": 1085 }, { "epoch": 2.99, "logps_train/chosen": -66.8816909790039, "logps_train/ref_chosen": -68.4375, "logps_train/ref_rejected": -99.8125, "logps_train/rejected": -154.588623046875, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.154067263007164, "rewards_train/margins": 5.631971582770348, "rewards_train/rejected": -5.477904319763184, "step": 1085 }, { "epoch": 2.99, "learning_rate": 2.307758314359365e-11, "loss": 0.0304, "step": 1086 }, { "epoch": 2.99, "logps_train/chosen": -65.09921264648438, "logps_train/ref_chosen": -68.5, "logps_train/ref_rejected": -102.0625, "logps_train/rejected": -155.20919799804688, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3426177501678467, "rewards_train/margins": 5.655921697616577, "rewards_train/rejected": -5.3133039474487305, "step": 1086 }, { "epoch": 2.99, "learning_rate": 1.476975546890191e-11, "loss": 0.0263, "step": 1087 }, { "epoch": 2.99, "logps_train/chosen": -65.20054626464844, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -103.375, "logps_train/rejected": -156.48236083984375, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3376111686229706, "rewards_train/margins": 5.647518128156662, "rewards_train/rejected": -5.309906959533691, "step": 1087 }, { "epoch": 3.0, "learning_rate": 8.308032188919512e-12, "loss": 0.0226, "step": 1088 }, { "epoch": 3.0, "logps_train/chosen": -64.72383880615234, "logps_train/ref_chosen": -68.5625, "logps_train/ref_rejected": -102.625, "logps_train/rejected": -153.7519989013672, "rewards_train/accuracies": 1.0, "rewards_train/chosen": 0.3870885968208313, "rewards_train/margins": 5.500082314014435, "rewards_train/rejected": -5.1129937171936035, "step": 1088 }, { "epoch": 3.0, "learning_rate": 3.692472953109593e-12, "loss": 0.0224, "step": 1089 }, { "epoch": 3.0, "step": 1089, "total_flos": 0.0, "train_loss": 0.1240052535659159, "train_runtime": 17518.8057, "train_samples_per_second": 3.972, "train_steps_per_second": 0.062 } ], "logging_steps": 1.0, "max_steps": 1089, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }