{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995965030262273, "eval_steps": 500, "global_step": 1858, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 2.6881720430107528e-09, "logits/chosen": -2.4663572311401367, "logits/rejected": -2.057170867919922, "logps/chosen": -246.4422607421875, "logps/rejected": -173.7652587890625, "loss": 0.5938, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "rewards/safe_rewards": 0.0, "rewards/unsafe_rewards": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 2.6881720430107527e-08, "logits/chosen": -2.3338096141815186, "logits/rejected": -2.1100988388061523, "logps/chosen": -199.19329833984375, "logps/rejected": -169.358642578125, "loss": 1.134, "rewards/accuracies": 0.4097222089767456, "rewards/chosen": -0.03551425039768219, "rewards/margins": -0.041799187660217285, "rewards/rejected": 0.006284935399889946, "rewards/safe_rewards": -0.01677405834197998, "rewards/unsafe_rewards": -0.0542544424533844, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.3763440860215054e-08, "logits/chosen": -2.329479694366455, "logits/rejected": -2.0858876705169678, "logps/chosen": -215.32296752929688, "logps/rejected": -176.8864288330078, "loss": 1.1266, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.031086910516023636, "rewards/margins": -0.04154179245233536, "rewards/rejected": 0.010454884730279446, "rewards/safe_rewards": -0.04110833257436752, "rewards/unsafe_rewards": -0.021065494045615196, "step": 20 }, { "epoch": 0.02, "learning_rate": 8.064516129032257e-08, "logits/chosen": -2.322885036468506, "logits/rejected": -2.1038832664489746, "logps/chosen": -199.3030242919922, "logps/rejected": -180.7991943359375, "loss": 1.1716, "rewards/accuracies": 0.515625, "rewards/chosen": 0.0008645713096484542, "rewards/margins": 0.027558892965316772, "rewards/rejected": -0.026694318279623985, "rewards/safe_rewards": -0.0032820613123476505, "rewards/unsafe_rewards": 0.005011203698813915, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0752688172043011e-07, "logits/chosen": -2.268714427947998, "logits/rejected": -1.9988443851470947, "logps/chosen": -197.72109985351562, "logps/rejected": -177.70603942871094, "loss": 1.1036, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.023218240588903427, "rewards/margins": 0.022794129326939583, "rewards/rejected": 0.0004241138813085854, "rewards/safe_rewards": 0.03502867370843887, "rewards/unsafe_rewards": 0.011407810263335705, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3440860215053762e-07, "logits/chosen": -2.374366283416748, "logits/rejected": -2.07818603515625, "logps/chosen": -191.63714599609375, "logps/rejected": -162.17771911621094, "loss": 1.1473, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.015013009309768677, "rewards/margins": 0.0906001627445221, "rewards/rejected": -0.10561318695545197, "rewards/safe_rewards": -0.018471335992217064, "rewards/unsafe_rewards": -0.011554678902029991, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.6129032258064515e-07, "logits/chosen": -2.346019983291626, "logits/rejected": -2.1285576820373535, "logps/chosen": -186.499755859375, "logps/rejected": -175.0586700439453, "loss": 1.0107, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.009731076657772064, "rewards/margins": 0.04699288681149483, "rewards/rejected": -0.037261806428432465, "rewards/safe_rewards": -0.01583387330174446, "rewards/unsafe_rewards": 0.03529602661728859, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8817204301075268e-07, "logits/chosen": -2.3234503269195557, "logits/rejected": -2.110891819000244, "logps/chosen": -221.27426147460938, "logps/rejected": -179.11380004882812, "loss": 2.1985, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.11594200134277344, "rewards/margins": 0.07270021736621857, "rewards/rejected": 0.04324179142713547, "rewards/safe_rewards": 0.0875079482793808, "rewards/unsafe_rewards": 0.14437603950500488, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1505376344086022e-07, "logits/chosen": -2.3453927040100098, "logits/rejected": -2.1327505111694336, "logps/chosen": -197.19949340820312, "logps/rejected": -176.77151489257812, "loss": 2.7155, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 0.15263572335243225, "rewards/margins": 0.012048400938510895, "rewards/rejected": 0.14058732986450195, "rewards/safe_rewards": 0.18941155076026917, "rewards/unsafe_rewards": 0.11585988849401474, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.4193548387096775e-07, "logits/chosen": -2.3641719818115234, "logits/rejected": -2.137413263320923, "logps/chosen": -216.1211395263672, "logps/rejected": -168.5092315673828, "loss": 2.721, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.008618640713393688, "rewards/margins": 0.006397470831871033, "rewards/rejected": 0.002221171511337161, "rewards/safe_rewards": 0.031759221106767654, "rewards/unsafe_rewards": -0.014521944336593151, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6881720430107523e-07, "logits/chosen": -2.360917568206787, "logits/rejected": -2.153608798980713, "logps/chosen": -201.7233428955078, "logps/rejected": -190.54605102539062, "loss": 1.4712, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.07169636338949203, "rewards/margins": 0.08422265201807022, "rewards/rejected": -0.012526283040642738, "rewards/safe_rewards": 0.09221886098384857, "rewards/unsafe_rewards": 0.0511738546192646, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.956989247311828e-07, "logits/chosen": -2.3796088695526123, "logits/rejected": -2.148357629776001, "logps/chosen": -207.0086212158203, "logps/rejected": -176.24658203125, "loss": 4.9646, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0011837140191346407, "rewards/margins": 0.02744489349424839, "rewards/rejected": -0.02626117691397667, "rewards/safe_rewards": -0.013010969385504723, "rewards/unsafe_rewards": 0.015378397889435291, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.225806451612903e-07, "logits/chosen": -2.378938913345337, "logits/rejected": -2.1289939880371094, "logps/chosen": -203.86172485351562, "logps/rejected": -168.72509765625, "loss": 5.8793, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.11708948761224747, "rewards/margins": -0.0013303399318829179, "rewards/rejected": 0.1184198409318924, "rewards/safe_rewards": 0.14375139772891998, "rewards/unsafe_rewards": 0.09042758494615555, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.4946236559139783e-07, "logits/chosen": -2.4672460556030273, "logits/rejected": -2.235044479370117, "logps/chosen": -211.15414428710938, "logps/rejected": -167.7396697998047, "loss": 2.9066, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": -0.141743004322052, "rewards/margins": 0.018900588154792786, "rewards/rejected": -0.1606435775756836, "rewards/safe_rewards": -0.20868048071861267, "rewards/unsafe_rewards": -0.07480548322200775, "step": 130 }, { "epoch": 0.08, "learning_rate": 3.7634408602150537e-07, "logits/chosen": -2.469130516052246, "logits/rejected": -2.2549142837524414, "logps/chosen": -219.2992401123047, "logps/rejected": -180.1728057861328, "loss": 14.0865, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.04582630842924118, "rewards/margins": 0.07086005806922913, "rewards/rejected": -0.025033747777342796, "rewards/safe_rewards": 0.05242709070444107, "rewards/unsafe_rewards": 0.03922552615404129, "step": 140 }, { "epoch": 0.08, "learning_rate": 4.0322580645161285e-07, "logits/chosen": -2.4029898643493652, "logits/rejected": -2.2180120944976807, "logps/chosen": -205.2784881591797, "logps/rejected": -167.4949951171875, "loss": 1531.726, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.9918906092643738, "rewards/margins": 0.18755348026752472, "rewards/rejected": -1.1794440746307373, "rewards/safe_rewards": -0.9263350367546082, "rewards/unsafe_rewards": -1.0574461221694946, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.3010752688172043e-07, "logits/chosen": -2.3345110416412354, "logits/rejected": -2.1149539947509766, "logps/chosen": -209.245849609375, "logps/rejected": -186.2938995361328, "loss": 76.4742, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": -0.40810996294021606, "rewards/margins": -0.12766215205192566, "rewards/rejected": -0.280447781085968, "rewards/safe_rewards": -0.5669787526130676, "rewards/unsafe_rewards": -0.2492411583662033, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.569892473118279e-07, "logits/chosen": -2.364396810531616, "logits/rejected": -2.153006076812744, "logps/chosen": -193.48985290527344, "logps/rejected": -157.84793090820312, "loss": 366.3049, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.4611927568912506, "rewards/margins": -0.09943069517612457, "rewards/rejected": -0.36176207661628723, "rewards/safe_rewards": -0.3680972456932068, "rewards/unsafe_rewards": -0.5542882680892944, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.838709677419355e-07, "logits/chosen": -2.379281520843506, "logits/rejected": -2.1527695655822754, "logps/chosen": -201.44384765625, "logps/rejected": -176.66293334960938, "loss": 212.1672, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -1.0156188011169434, "rewards/margins": 0.32616162300109863, "rewards/rejected": -1.341780424118042, "rewards/safe_rewards": -0.7996016144752502, "rewards/unsafe_rewards": -1.2316361665725708, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999929391798331e-07, "logits/chosen": -2.4363088607788086, "logits/rejected": -2.1664328575134277, "logps/chosen": -214.74972534179688, "logps/rejected": -172.83267211914062, "loss": 281.1637, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -2.0342252254486084, "rewards/margins": -1.3259267807006836, "rewards/rejected": -0.7082984447479248, "rewards/safe_rewards": -1.9764223098754883, "rewards/unsafe_rewards": -2.0920281410217285, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.9991350953333e-07, "logits/chosen": -2.399965763092041, "logits/rejected": -2.1533687114715576, "logps/chosen": -211.14138793945312, "logps/rejected": -183.2847442626953, "loss": 37.4693, "rewards/accuracies": 0.453125, "rewards/chosen": -0.5885945558547974, "rewards/margins": -0.33234477043151855, "rewards/rejected": -0.2562498152256012, "rewards/safe_rewards": 0.10301212966442108, "rewards/unsafe_rewards": -1.2802014350891113, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.997458523498236e-07, "logits/chosen": -2.4136548042297363, "logits/rejected": -2.1710681915283203, "logps/chosen": -192.46209716796875, "logps/rejected": -160.3273468017578, "loss": 19.4933, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 0.7500754594802856, "rewards/margins": 0.07740475982427597, "rewards/rejected": 0.6726706624031067, "rewards/safe_rewards": 0.8147931098937988, "rewards/unsafe_rewards": 0.6853577494621277, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.99490026817712e-07, "logits/chosen": -2.3793249130249023, "logits/rejected": -2.126897096633911, "logps/chosen": -206.8174591064453, "logps/rejected": -174.28512573242188, "loss": 618.2743, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.6119144558906555, "rewards/margins": 1.0083643198013306, "rewards/rejected": -0.3964497447013855, "rewards/safe_rewards": 0.269029825925827, "rewards/unsafe_rewards": 0.9547992944717407, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.991461232516674e-07, "logits/chosen": -2.278285503387451, "logits/rejected": -2.0165598392486572, "logps/chosen": -220.05496215820312, "logps/rejected": -191.4230499267578, "loss": 117.4644, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": -2.331136465072632, "rewards/margins": -0.27771270275115967, "rewards/rejected": -2.053424119949341, "rewards/safe_rewards": -1.6258525848388672, "rewards/unsafe_rewards": -3.0364208221435547, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.98714263060751e-07, "logits/chosen": -2.2665092945098877, "logits/rejected": -1.9782488346099854, "logps/chosen": -189.6136016845703, "logps/rejected": -156.85269165039062, "loss": 123.5274, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.6799499988555908, "rewards/margins": -0.4719271659851074, "rewards/rejected": -1.2080228328704834, "rewards/safe_rewards": -1.867531418800354, "rewards/unsafe_rewards": -1.4923683404922485, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.98194598705552e-07, "logits/chosen": -2.2388875484466553, "logits/rejected": -2.0419199466705322, "logps/chosen": -203.91488647460938, "logps/rejected": -175.87570190429688, "loss": 29.462, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -0.5732041597366333, "rewards/margins": 0.3381038308143616, "rewards/rejected": -0.9113079905509949, "rewards/safe_rewards": -0.5594094395637512, "rewards/unsafe_rewards": -0.5869989395141602, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.975873136443648e-07, "logits/chosen": -2.323503017425537, "logits/rejected": -2.1084866523742676, "logps/chosen": -219.4092254638672, "logps/rejected": -188.0467071533203, "loss": 514.7106, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": 0.01872560940682888, "rewards/margins": 0.048060666769742966, "rewards/rejected": -0.029335061088204384, "rewards/safe_rewards": -0.101626917719841, "rewards/unsafe_rewards": 0.13907812535762787, "step": 260 }, { "epoch": 0.15, "learning_rate": 4.968926222684212e-07, "logits/chosen": -2.3192670345306396, "logits/rejected": -2.128873586654663, "logps/chosen": -195.8466796875, "logps/rejected": -173.4759063720703, "loss": 62.0019, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.5205889940261841, "rewards/margins": 0.10105878114700317, "rewards/rejected": 0.4195302128791809, "rewards/safe_rewards": 0.4973847270011902, "rewards/unsafe_rewards": 0.5437930822372437, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.961107698262044e-07, "logits/chosen": -2.3513216972351074, "logits/rejected": -2.1132161617279053, "logps/chosen": -209.58480834960938, "logps/rejected": -173.8505096435547, "loss": 19.9099, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": 1.4319963455200195, "rewards/margins": -0.04141209274530411, "rewards/rejected": 1.4734083414077759, "rewards/safe_rewards": 0.7625109553337097, "rewards/unsafe_rewards": 2.1014816761016846, "step": 280 }, { "epoch": 0.16, "learning_rate": 4.952420323368673e-07, "logits/chosen": -2.327949047088623, "logits/rejected": -2.081421136856079, "logps/chosen": -202.83131408691406, "logps/rejected": -173.12339782714844, "loss": 166.1931, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 1.1239261627197266, "rewards/margins": 0.29456058144569397, "rewards/rejected": 0.8293657302856445, "rewards/safe_rewards": 0.95171719789505, "rewards/unsafe_rewards": 1.2961351871490479, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.942867164927899e-07, "logits/chosen": -2.3304100036621094, "logits/rejected": -2.148871898651123, "logps/chosen": -200.2861785888672, "logps/rejected": -173.5687713623047, "loss": 83.8678, "rewards/accuracies": 0.546875, "rewards/chosen": 1.1026077270507812, "rewards/margins": 0.16524335741996765, "rewards/rejected": 0.9373642206192017, "rewards/safe_rewards": 1.20353102684021, "rewards/unsafe_rewards": 1.001684308052063, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.932451595513062e-07, "logits/chosen": -2.3603804111480713, "logits/rejected": -2.1054179668426514, "logps/chosen": -222.5138702392578, "logps/rejected": -189.41696166992188, "loss": 125.375, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 1.2356212139129639, "rewards/margins": 0.4028751254081726, "rewards/rejected": 0.8327462077140808, "rewards/safe_rewards": 1.2047992944717407, "rewards/unsafe_rewards": 1.2664434909820557, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.921177292156419e-07, "logits/chosen": -2.4207069873809814, "logits/rejected": -2.131692409515381, "logps/chosen": -197.57579040527344, "logps/rejected": -173.03189086914062, "loss": 32.4693, "rewards/accuracies": 0.4375, "rewards/chosen": 1.0067864656448364, "rewards/margins": -0.07762779295444489, "rewards/rejected": 1.0844142436981201, "rewards/safe_rewards": 0.9899358749389648, "rewards/unsafe_rewards": 1.0236369371414185, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.909048235051033e-07, "logits/chosen": -2.3886237144470215, "logits/rejected": -2.2095794677734375, "logps/chosen": -201.99131774902344, "logps/rejected": -180.18301391601562, "loss": 165.1989, "rewards/accuracies": 0.546875, "rewards/chosen": 0.8952449560165405, "rewards/margins": 0.1524442732334137, "rewards/rejected": 0.7428006529808044, "rewards/safe_rewards": 0.9638195037841797, "rewards/unsafe_rewards": 0.8266702890396118, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.896068706145631e-07, "logits/chosen": -2.4264276027679443, "logits/rejected": -2.1699893474578857, "logps/chosen": -209.13687133789062, "logps/rejected": -161.4777374267578, "loss": 63.6332, "rewards/accuracies": 0.515625, "rewards/chosen": 0.5357077717781067, "rewards/margins": 0.20826852321624756, "rewards/rejected": 0.32743921875953674, "rewards/safe_rewards": 0.6318890452384949, "rewards/unsafe_rewards": 0.4395265579223633, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.882243287632946e-07, "logits/chosen": -2.4155266284942627, "logits/rejected": -2.1885287761688232, "logps/chosen": -190.31680297851562, "logps/rejected": -167.34011840820312, "loss": 22.5493, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.2450559437274933, "rewards/margins": 0.11199624836444855, "rewards/rejected": 0.13305969536304474, "rewards/safe_rewards": 0.32091599702835083, "rewards/unsafe_rewards": 0.16919586062431335, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.867576860332048e-07, "logits/chosen": -2.4087131023406982, "logits/rejected": -2.1696860790252686, "logps/chosen": -182.63320922851562, "logps/rejected": -157.3323974609375, "loss": 39.9616, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.33797144889831543, "rewards/margins": 0.2170281708240509, "rewards/rejected": 0.12094320356845856, "rewards/safe_rewards": 0.7084277868270874, "rewards/unsafe_rewards": -0.0324850007891655, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.85207460196526e-07, "logits/chosen": -2.3588593006134033, "logits/rejected": -2.1359121799468994, "logps/chosen": -201.29721069335938, "logps/rejected": -180.4462432861328, "loss": 18.4967, "rewards/accuracies": 0.421875, "rewards/chosen": -0.20872633159160614, "rewards/margins": -0.07106774300336838, "rewards/rejected": -0.13765858113765717, "rewards/safe_rewards": -0.24322757124900818, "rewards/unsafe_rewards": -0.1742250919342041, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.835741985330259e-07, "logits/chosen": -2.393688678741455, "logits/rejected": -2.1949095726013184, "logps/chosen": -196.72280883789062, "logps/rejected": -164.93276977539062, "loss": 13.0753, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.13964949548244476, "rewards/margins": 0.004895883612334728, "rewards/rejected": -0.14454536139965057, "rewards/safe_rewards": -0.14425238966941833, "rewards/unsafe_rewards": -0.1350466012954712, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.818584776367992e-07, "logits/chosen": -2.348188638687134, "logits/rejected": -2.183293342590332, "logps/chosen": -207.3245086669922, "logps/rejected": -185.33078002929688, "loss": 405.7585, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.9341610074043274, "rewards/margins": -0.25985628366470337, "rewards/rejected": 1.1940172910690308, "rewards/safe_rewards": 1.5165033340454102, "rewards/unsafe_rewards": 0.3518185615539551, "step": 390 }, { "epoch": 0.22, "learning_rate": 4.800609032127122e-07, "logits/chosen": -2.362936496734619, "logits/rejected": -2.117405652999878, "logps/chosen": -205.0863037109375, "logps/rejected": -173.82562255859375, "loss": 250.8796, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 0.8158755302429199, "rewards/margins": 0.04819601774215698, "rewards/rejected": 0.7676795721054077, "rewards/safe_rewards": 0.8238789439201355, "rewards/unsafe_rewards": 0.8078721761703491, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78182109862569e-07, "logits/chosen": -2.334447145462036, "logits/rejected": -2.1603846549987793, "logps/chosen": -193.15878295898438, "logps/rejected": -169.64031982421875, "loss": 43.271, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 1.1323336362838745, "rewards/margins": -0.2824760377407074, "rewards/rejected": 1.4148097038269043, "rewards/safe_rewards": 1.1305078268051147, "rewards/unsafe_rewards": 1.1341596841812134, "step": 410 }, { "epoch": 0.23, "learning_rate": 4.7622276086107677e-07, "logits/chosen": -2.4567148685455322, "logits/rejected": -2.2268338203430176, "logps/chosen": -221.8797149658203, "logps/rejected": -183.58682250976562, "loss": 170.0915, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 1.5294605493545532, "rewards/margins": -0.26579660177230835, "rewards/rejected": 1.7952572107315063, "rewards/safe_rewards": 1.6476377248764038, "rewards/unsafe_rewards": 1.4112837314605713, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.741835479216879e-07, "logits/chosen": -2.4018983840942383, "logits/rejected": -2.1745998859405518, "logps/chosen": -224.1997833251953, "logps/rejected": -202.8693084716797, "loss": 318.6482, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 1.9682689905166626, "rewards/margins": 1.3845123052597046, "rewards/rejected": 0.5837565660476685, "rewards/safe_rewards": 1.9361345767974854, "rewards/unsafe_rewards": 2.0004029273986816, "step": 430 }, { "epoch": 0.24, "learning_rate": 4.720651909524036e-07, "logits/chosen": -2.368582248687744, "logits/rejected": -2.1598029136657715, "logps/chosen": -199.04641723632812, "logps/rejected": -171.59878540039062, "loss": 20.6844, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.34873148798942566, "rewards/margins": -0.22367699444293976, "rewards/rejected": 0.5724084973335266, "rewards/safe_rewards": 0.4507713317871094, "rewards/unsafe_rewards": 0.24669162929058075, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.698684378016222e-07, "logits/chosen": -2.4238266944885254, "logits/rejected": -2.1877074241638184, "logps/chosen": -206.9587860107422, "logps/rejected": -166.5978546142578, "loss": 36.0619, "rewards/accuracies": 0.484375, "rewards/chosen": -0.48088520765304565, "rewards/margins": -0.556584894657135, "rewards/rejected": 0.07569964975118637, "rewards/safe_rewards": -0.8808043599128723, "rewards/unsafe_rewards": -0.08096615970134735, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.675940639941256e-07, "logits/chosen": -2.381782054901123, "logits/rejected": -2.2072319984436035, "logps/chosen": -202.72836303710938, "logps/rejected": -178.13565063476562, "loss": 19.0221, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.561229407787323, "rewards/margins": 0.22027714550495148, "rewards/rejected": 0.3409522473812103, "rewards/safe_rewards": 0.481137752532959, "rewards/unsafe_rewards": 0.641321063041687, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.6524287245729286e-07, "logits/chosen": -2.3484253883361816, "logits/rejected": -2.134340524673462, "logps/chosen": -198.06240844726562, "logps/rejected": -166.09368896484375, "loss": 26.6374, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.22380805015563965, "rewards/margins": -0.03325925022363663, "rewards/rejected": 0.2570672631263733, "rewards/safe_rewards": 0.1073969230055809, "rewards/unsafe_rewards": 0.3402191996574402, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.628156932376418e-07, "logits/chosen": -2.3849387168884277, "logits/rejected": -2.1502578258514404, "logps/chosen": -202.72006225585938, "logps/rejected": -165.7488555908203, "loss": 163.8104, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 0.22546739876270294, "rewards/margins": 0.1902145892381668, "rewards/rejected": 0.03525285795331001, "rewards/safe_rewards": -0.18428334593772888, "rewards/unsafe_rewards": 0.6352182030677795, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-07, "logits/chosen": -2.3536932468414307, "logits/rejected": -2.1680946350097656, "logps/chosen": -209.31454467773438, "logps/rejected": -199.3543701171875, "loss": 75.0922, "rewards/accuracies": 0.453125, "rewards/chosen": 0.6691935658454895, "rewards/margins": 0.05490832403302193, "rewards/rejected": 0.6142852902412415, "rewards/safe_rewards": 0.21514494717121124, "rewards/unsafe_rewards": 1.1232421398162842, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.5773682576397776e-07, "logits/chosen": -2.360821008682251, "logits/rejected": -2.1603407859802246, "logps/chosen": -201.4778594970703, "logps/rejected": -169.81484985351562, "loss": 131.6857, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.10828091204166412, "rewards/margins": -0.5544548034667969, "rewards/rejected": 0.662735641002655, "rewards/safe_rewards": -0.3237845301628113, "rewards/unsafe_rewards": 0.5403462648391724, "step": 500 }, { "epoch": 0.27, "eval_logits/chosen": -2.056485414505005, "eval_logits/rejected": -1.803229808807373, "eval_logps/chosen": -130.9681396484375, "eval_logps/rejected": -92.36480712890625, "eval_loss": 0.8894476294517517, "eval_rewards/accuracies": 0.45462244749069214, "eval_rewards/chosen": -0.10225697606801987, "eval_rewards/margins": -0.08933582156896591, "eval_rewards/rejected": -0.012921147979795933, "eval_rewards/safe_rewards": -0.10428992658853531, "eval_rewards/unsafe_rewards": -0.10168781876564026, "eval_runtime": 2237.5747, "eval_samples_per_second": 14.768, "eval_steps_per_second": 0.923, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.5508693051414774e-07, "logits/chosen": -2.3876683712005615, "logits/rejected": -2.2101075649261475, "logps/chosen": -197.6197509765625, "logps/rejected": -179.2535858154297, "loss": 10.2996, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.15894845128059387, "rewards/margins": -0.012749219313263893, "rewards/rejected": 0.17169766128063202, "rewards/safe_rewards": 0.22355195879936218, "rewards/unsafe_rewards": 0.09434493631124496, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.52364632956877e-07, "logits/chosen": -2.3795700073242188, "logits/rejected": -2.167722225189209, "logps/chosen": -209.80477905273438, "logps/rejected": -170.1284942626953, "loss": 102.0474, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.054634951055049896, "rewards/margins": -0.5382941961288452, "rewards/rejected": 0.48365920782089233, "rewards/safe_rewards": 0.07545175403356552, "rewards/unsafe_rewards": -0.1847216635942459, "step": 520 }, { "epoch": 0.29, "learning_rate": 4.4957089415108895e-07, "logits/chosen": -2.3528215885162354, "logits/rejected": -2.1418814659118652, "logps/chosen": -187.97207641601562, "logps/rejected": -165.0076446533203, "loss": 120.2137, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.6140010952949524, "rewards/margins": 0.31187087297439575, "rewards/rejected": 0.30213022232055664, "rewards/safe_rewards": 0.45020800828933716, "rewards/unsafe_rewards": 0.7777942419052124, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.467067003767745e-07, "logits/chosen": -2.441636800765991, "logits/rejected": -2.219637870788574, "logps/chosen": -215.01718139648438, "logps/rejected": -178.1621856689453, "loss": 31.3751, "rewards/accuracies": 0.578125, "rewards/chosen": 0.5192718505859375, "rewards/margins": 0.13771791756153107, "rewards/rejected": 0.3815539479255676, "rewards/safe_rewards": 0.3229585587978363, "rewards/unsafe_rewards": 0.7155852317810059, "step": 540 }, { "epoch": 0.3, "learning_rate": 4.437730627868027e-07, "logits/chosen": -2.378955602645874, "logits/rejected": -2.138523578643799, "logps/chosen": -181.02993774414062, "logps/rejected": -161.35678100585938, "loss": 48.7052, "rewards/accuracies": 0.484375, "rewards/chosen": 0.13924112915992737, "rewards/margins": 0.37750715017318726, "rewards/rejected": -0.23826603591442108, "rewards/safe_rewards": 0.4204103946685791, "rewards/unsafe_rewards": -0.14192816615104675, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.4077101704995163e-07, "logits/chosen": -2.4157960414886475, "logits/rejected": -2.1959304809570312, "logps/chosen": -204.2389373779297, "logps/rejected": -188.56707763671875, "loss": 23.5436, "rewards/accuracies": 0.484375, "rewards/chosen": -0.36682039499282837, "rewards/margins": -0.1311464011669159, "rewards/rejected": -0.23567399382591248, "rewards/safe_rewards": -0.4130277633666992, "rewards/unsafe_rewards": -0.3206130862236023, "step": 560 }, { "epoch": 0.31, "learning_rate": 4.3770162298528356e-07, "logits/chosen": -2.4378573894500732, "logits/rejected": -2.243499994277954, "logps/chosen": -201.71572875976562, "logps/rejected": -169.5461883544922, "loss": 48.0924, "rewards/accuracies": 0.565625011920929, "rewards/chosen": 0.21430592238903046, "rewards/margins": 0.6119717359542847, "rewards/rejected": -0.397665798664093, "rewards/safe_rewards": -0.27201324701309204, "rewards/unsafe_rewards": 0.7006251811981201, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.3456596418799476e-07, "logits/chosen": -2.383977174758911, "logits/rejected": -2.204479694366455, "logps/chosen": -208.63818359375, "logps/rejected": -172.74533081054688, "loss": 40.887, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.31667083501815796, "rewards/margins": 0.20149526000022888, "rewards/rejected": 0.11517556756734848, "rewards/safe_rewards": 0.08531586080789566, "rewards/unsafe_rewards": 0.5480257868766785, "step": 580 }, { "epoch": 0.32, "learning_rate": 4.313651476468715e-07, "logits/chosen": -2.452789783477783, "logits/rejected": -2.2367706298828125, "logps/chosen": -206.00991821289062, "logps/rejected": -181.455810546875, "loss": 17.7462, "rewards/accuracies": 0.46875, "rewards/chosen": 0.025053849443793297, "rewards/margins": -0.13409826159477234, "rewards/rejected": 0.1591521054506302, "rewards/safe_rewards": -0.05954737588763237, "rewards/unsafe_rewards": 0.10965506732463837, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.2810030335348693e-07, "logits/chosen": -2.4099035263061523, "logits/rejected": -2.218843936920166, "logps/chosen": -218.79177856445312, "logps/rejected": -168.67431640625, "loss": 74.9431, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.23495905101299286, "rewards/margins": -0.6415296196937561, "rewards/rejected": 0.40657052397727966, "rewards/safe_rewards": -0.5243301391601562, "rewards/unsafe_rewards": 0.05441205948591232, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.2477258390327806e-07, "logits/chosen": -2.4378225803375244, "logits/rejected": -2.2049014568328857, "logps/chosen": -190.23843383789062, "logps/rejected": -167.7356719970703, "loss": 24.3453, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.24785485863685608, "rewards/margins": -0.3185795247554779, "rewards/rejected": 0.07072468847036362, "rewards/safe_rewards": -0.1916726529598236, "rewards/unsafe_rewards": -0.30403703451156616, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.2138316408864197e-07, "logits/chosen": -2.4828572273254395, "logits/rejected": -2.251974105834961, "logps/chosen": -195.9208221435547, "logps/rejected": -162.9341278076172, "loss": 47.0644, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.2951027750968933, "rewards/margins": 0.2895166277885437, "rewards/rejected": 0.005586123559623957, "rewards/safe_rewards": 0.2697201073169708, "rewards/unsafe_rewards": 0.3204854130744934, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.179332404841962e-07, "logits/chosen": -2.4540035724639893, "logits/rejected": -2.223843812942505, "logps/chosen": -208.46463012695312, "logps/rejected": -176.60848999023438, "loss": 25.2961, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": 0.14698375761508942, "rewards/margins": 0.1325828731060028, "rewards/rejected": 0.014400847256183624, "rewards/safe_rewards": 0.04335422068834305, "rewards/unsafe_rewards": 0.250613272190094, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.1442403102434954e-07, "logits/chosen": -2.4651191234588623, "logits/rejected": -2.252150535583496, "logps/chosen": -212.79736328125, "logps/rejected": -179.38711547851562, "loss": 117.4084, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.48046717047691345, "rewards/margins": -0.18567809462547302, "rewards/rejected": -0.29478907585144043, "rewards/safe_rewards": -0.6295033693313599, "rewards/unsafe_rewards": -0.3314310312271118, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.108567745733318e-07, "logits/chosen": -2.447937488555908, "logits/rejected": -2.201697826385498, "logps/chosen": -184.49168395996094, "logps/rejected": -166.9139404296875, "loss": 10.7524, "rewards/accuracies": 0.5625, "rewards/chosen": -0.005866925232112408, "rewards/margins": 0.17292609810829163, "rewards/rejected": -0.1787930279970169, "rewards/safe_rewards": 0.03122568130493164, "rewards/unsafe_rewards": -0.042959537357091904, "step": 650 }, { "epoch": 0.36, "learning_rate": 4.0723273048783426e-07, "logits/chosen": -2.44038462638855, "logits/rejected": -2.2175660133361816, "logps/chosen": -211.3206787109375, "logps/rejected": -165.2122802734375, "loss": 81.9566, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.5619795918464661, "rewards/margins": 0.545897364616394, "rewards/rejected": 0.016082104295492172, "rewards/safe_rewards": 1.0618271827697754, "rewards/unsafe_rewards": 0.06213190406560898, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.0355317817241697e-07, "logits/chosen": -2.3970015048980713, "logits/rejected": -2.163048267364502, "logps/chosen": -229.952880859375, "logps/rejected": -176.55599975585938, "loss": 26.2558, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.38434115052223206, "rewards/margins": 0.2878434658050537, "rewards/rejected": 0.09649765491485596, "rewards/safe_rewards": 0.5011194944381714, "rewards/unsafe_rewards": 0.26756277680397034, "step": 670 }, { "epoch": 0.37, "learning_rate": 3.998194166278367e-07, "logits/chosen": -2.4422953128814697, "logits/rejected": -2.2152860164642334, "logps/chosen": -193.12109375, "logps/rejected": -156.7648162841797, "loss": 157.1721, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": -0.3413035273551941, "rewards/margins": -0.26979130506515503, "rewards/rejected": -0.07151220738887787, "rewards/safe_rewards": -0.49346867203712463, "rewards/unsafe_rewards": -0.18913838267326355, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.9603276399245855e-07, "logits/chosen": -2.4512076377868652, "logits/rejected": -2.217556953430176, "logps/chosen": -212.5731658935547, "logps/rejected": -172.98239135742188, "loss": 140.5213, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.045460961759090424, "rewards/margins": 0.4976281523704529, "rewards/rejected": -0.5430891513824463, "rewards/safe_rewards": 0.1586223542690277, "rewards/unsafe_rewards": -0.24954433739185333, "step": 690 }, { "epoch": 0.38, "learning_rate": 3.9219455707691e-07, "logits/chosen": -2.443801164627075, "logits/rejected": -2.217026710510254, "logps/chosen": -223.50064086914062, "logps/rejected": -188.3572998046875, "loss": 239.7127, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -1.1688416004180908, "rewards/margins": -0.6104832291603088, "rewards/rejected": -0.5583583116531372, "rewards/safe_rewards": -0.5376420021057129, "rewards/unsafe_rewards": -1.8000411987304688, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.883061508921439e-07, "logits/chosen": -2.4577882289886475, "logits/rejected": -2.289802074432373, "logps/chosen": -199.79066467285156, "logps/rejected": -191.25059509277344, "loss": 127.1414, "rewards/accuracies": 0.484375, "rewards/chosen": -0.8074787855148315, "rewards/margins": -0.5068421363830566, "rewards/rejected": -0.30063679814338684, "rewards/safe_rewards": -1.0235812664031982, "rewards/unsafe_rewards": -0.5913764238357544, "step": 710 }, { "epoch": 0.39, "learning_rate": 3.8436891817107555e-07, "logits/chosen": -2.384692668914795, "logits/rejected": -2.2363414764404297, "logps/chosen": -192.9431915283203, "logps/rejected": -173.0110626220703, "loss": 88.3357, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.845304012298584, "rewards/margins": 0.22771115601062775, "rewards/rejected": -1.0730152130126953, "rewards/safe_rewards": -0.7142607569694519, "rewards/unsafe_rewards": -0.9763473272323608, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8038424888396414e-07, "logits/chosen": -2.4334444999694824, "logits/rejected": -2.2202000617980957, "logps/chosen": -190.13265991210938, "logps/rejected": -173.72535705566406, "loss": 46.5741, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.9059289693832397, "rewards/margins": 0.07719539105892181, "rewards/rejected": -0.9831243753433228, "rewards/safe_rewards": -1.5265599489212036, "rewards/unsafe_rewards": -0.2852979004383087, "step": 730 }, { "epoch": 0.4, "learning_rate": 3.763535497477079e-07, "logits/chosen": -2.428952693939209, "logits/rejected": -2.205458641052246, "logps/chosen": -203.35873413085938, "logps/rejected": -178.9982452392578, "loss": 30.0399, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.06326188147068024, "rewards/margins": 0.4762391149997711, "rewards/rejected": -0.4129772186279297, "rewards/safe_rewards": 0.01683131232857704, "rewards/unsafe_rewards": 0.10969245433807373, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.7227824372922795e-07, "logits/chosen": -2.4341301918029785, "logits/rejected": -2.2008628845214844, "logps/chosen": -189.18417358398438, "logps/rejected": -167.0784454345703, "loss": 12.3092, "rewards/accuracies": 0.46875, "rewards/chosen": 0.09529106318950653, "rewards/margins": 0.02994244359433651, "rewards/rejected": 0.06534863263368607, "rewards/safe_rewards": 0.103404700756073, "rewards/unsafe_rewards": 0.08717743307352066, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.681597695431148e-07, "logits/chosen": -2.397660732269287, "logits/rejected": -2.248548984527588, "logps/chosen": -201.36961364746094, "logps/rejected": -183.10923767089844, "loss": 44.1826, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.05803655833005905, "rewards/margins": 0.0876794308423996, "rewards/rejected": -0.14571599662303925, "rewards/safe_rewards": -0.19255781173706055, "rewards/unsafe_rewards": 0.07648466527462006, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.639995811437159e-07, "logits/chosen": -2.3755042552948, "logits/rejected": -2.191373348236084, "logps/chosen": -197.1927032470703, "logps/rejected": -179.4755859375, "loss": 154.7574, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.14616283774375916, "rewards/margins": 0.41009521484375, "rewards/rejected": -0.26393240690231323, "rewards/safe_rewards": 0.3775586485862732, "rewards/unsafe_rewards": -0.08523297309875488, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.597991472118426e-07, "logits/chosen": -2.4273521900177, "logits/rejected": -2.192534923553467, "logps/chosen": -206.8874053955078, "logps/rejected": -176.24118041992188, "loss": 36.5319, "rewards/accuracies": 0.515625, "rewards/chosen": -0.164406418800354, "rewards/margins": -0.1354350596666336, "rewards/rejected": -0.028971344232559204, "rewards/safe_rewards": 0.2160978764295578, "rewards/unsafe_rewards": -0.5449106097221375, "step": 780 }, { "epoch": 0.43, "learning_rate": 3.5555995063627836e-07, "logits/chosen": -2.415065050125122, "logits/rejected": -2.194133758544922, "logps/chosen": -222.50820922851562, "logps/rejected": -191.37088012695312, "loss": 16.1129, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.1634823977947235, "rewards/margins": 0.29501140117645264, "rewards/rejected": -0.1315290331840515, "rewards/safe_rewards": -0.023002928122878075, "rewards/unsafe_rewards": 0.34996774792671204, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.512834879902715e-07, "logits/chosen": -2.446582794189453, "logits/rejected": -2.2151386737823486, "logps/chosen": -193.52993774414062, "logps/rejected": -169.22207641601562, "loss": 17.2298, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.15118324756622314, "rewards/margins": 0.320087730884552, "rewards/rejected": -0.16890448331832886, "rewards/safe_rewards": 0.15818454325199127, "rewards/unsafe_rewards": 0.14418195188045502, "step": 800 }, { "epoch": 0.44, "learning_rate": 3.4697126900319616e-07, "logits/chosen": -2.4158897399902344, "logits/rejected": -2.180227756500244, "logps/chosen": -200.93173217773438, "logps/rejected": -167.99073791503906, "loss": 22.7375, "rewards/accuracies": 0.484375, "rewards/chosen": 0.10535750538110733, "rewards/margins": -0.006962819490581751, "rewards/rejected": 0.11232032626867294, "rewards/safe_rewards": -0.18741589784622192, "rewards/unsafe_rewards": 0.3981309235095978, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.426248160275693e-07, "logits/chosen": -2.4130988121032715, "logits/rejected": -2.223747730255127, "logps/chosen": -196.2846221923828, "logps/rejected": -177.1783447265625, "loss": 62.6098, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": 0.08924231678247452, "rewards/margins": -0.5832756757736206, "rewards/rejected": 0.6725180745124817, "rewards/safe_rewards": 0.08092136681079865, "rewards/unsafe_rewards": 0.09756331145763397, "step": 820 }, { "epoch": 0.45, "learning_rate": 3.3824566350161094e-07, "logits/chosen": -2.4248764514923096, "logits/rejected": -2.1799604892730713, "logps/chosen": -211.0237274169922, "logps/rejected": -165.1766815185547, "loss": 8.7437, "rewards/accuracies": 0.515625, "rewards/chosen": 0.5879140496253967, "rewards/margins": 0.23111946880817413, "rewards/rejected": 0.356794536113739, "rewards/safe_rewards": 0.5438351631164551, "rewards/unsafe_rewards": 0.6319928765296936, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.338353574075381e-07, "logits/chosen": -2.3919012546539307, "logits/rejected": -2.212056875228882, "logps/chosen": -188.0956268310547, "logps/rejected": -166.2266387939453, "loss": 23.4515, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.4208035469055176, "rewards/margins": 0.025497043505311012, "rewards/rejected": 0.3953064978122711, "rewards/safe_rewards": 0.5599286556243896, "rewards/unsafe_rewards": 0.2816784679889679, "step": 840 }, { "epoch": 0.46, "learning_rate": 3.2939545472578314e-07, "logits/chosen": -2.4613280296325684, "logits/rejected": -2.1779792308807373, "logps/chosen": -220.7722625732422, "logps/rejected": -177.66567993164062, "loss": 71.1367, "rewards/accuracies": 0.46875, "rewards/chosen": 0.6730166077613831, "rewards/margins": 0.4062492251396179, "rewards/rejected": 0.26676732301712036, "rewards/safe_rewards": 0.1429261863231659, "rewards/unsafe_rewards": 1.2031069993972778, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2492752288532916e-07, "logits/chosen": -2.4267163276672363, "logits/rejected": -2.2031116485595703, "logps/chosen": -192.3984832763672, "logps/rejected": -171.2382354736328, "loss": 46.0145, "rewards/accuracies": 0.484375, "rewards/chosen": 0.1684725284576416, "rewards/margins": -0.08331739902496338, "rewards/rejected": 0.25178998708724976, "rewards/safe_rewards": 0.12440772354602814, "rewards/unsafe_rewards": 0.21253737807273865, "step": 860 }, { "epoch": 0.47, "learning_rate": 3.204331392103574e-07, "logits/chosen": -2.483734369277954, "logits/rejected": -2.2113869190216064, "logps/chosen": -211.577880859375, "logps/rejected": -163.4304656982422, "loss": 155.5441, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.19171550869941711, "rewards/margins": -0.2531381845474243, "rewards/rejected": 0.44485369324684143, "rewards/safe_rewards": 0.10490121692419052, "rewards/unsafe_rewards": 0.2785297632217407, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.159138903634006e-07, "logits/chosen": -2.409116744995117, "logits/rejected": -2.2290921211242676, "logps/chosen": -203.94369506835938, "logps/rejected": -173.5029754638672, "loss": 9.3153, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.4058583676815033, "rewards/margins": 0.18885207176208496, "rewards/rejected": 0.21700629591941833, "rewards/safe_rewards": 0.3220987915992737, "rewards/unsafe_rewards": 0.48961788415908813, "step": 880 }, { "epoch": 0.48, "learning_rate": 3.1137137178519977e-07, "logits/chosen": -2.4068942070007324, "logits/rejected": -2.212474822998047, "logps/chosen": -184.1978759765625, "logps/rejected": -157.02920532226562, "loss": 47.3581, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.041205547749996185, "rewards/margins": -0.1057499423623085, "rewards/rejected": 0.06454440206289291, "rewards/safe_rewards": -0.18809974193572998, "rewards/unsafe_rewards": 0.10568861663341522, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.068071871314626e-07, "logits/chosen": -2.3744447231292725, "logits/rejected": -2.1711204051971436, "logps/chosen": -193.363525390625, "logps/rejected": -157.98092651367188, "loss": 36.4272, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.1274958997964859, "rewards/margins": -0.06520196795463562, "rewards/rejected": 0.19269786775112152, "rewards/safe_rewards": 0.27050352096557617, "rewards/unsafe_rewards": -0.015511776320636272, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.022229477067205e-07, "logits/chosen": -2.4298009872436523, "logits/rejected": -2.2137274742126465, "logps/chosen": -212.06454467773438, "logps/rejected": -162.7147216796875, "loss": 22.3251, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": 0.2818406820297241, "rewards/margins": 0.29059693217277527, "rewards/rejected": -0.008756252937018871, "rewards/safe_rewards": 0.12103135883808136, "rewards/unsafe_rewards": 0.4426499903202057, "step": 910 }, { "epoch": 0.49, "learning_rate": 2.976202718954869e-07, "logits/chosen": -2.4414241313934326, "logits/rejected": -2.214113235473633, "logps/chosen": -208.3417510986328, "logps/rejected": -185.30526733398438, "loss": 15.9322, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.002296045422554016, "rewards/margins": 0.06845332682132721, "rewards/rejected": -0.0661572739481926, "rewards/safe_rewards": 0.11791107803583145, "rewards/unsafe_rewards": -0.11331899464130402, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.930007845909146e-07, "logits/chosen": -2.465981960296631, "logits/rejected": -2.2979178428649902, "logps/chosen": -220.63400268554688, "logps/rejected": -194.15982055664062, "loss": 20.6631, "rewards/accuracies": 0.44062501192092896, "rewards/chosen": 0.008412945084273815, "rewards/margins": -0.06071774289011955, "rewards/rejected": 0.06913068145513535, "rewards/safe_rewards": 0.15175995230674744, "rewards/unsafe_rewards": -0.13493406772613525, "step": 930 }, { "epoch": 0.51, "learning_rate": 2.8836611662115634e-07, "logits/chosen": -2.411681890487671, "logits/rejected": -2.184065818786621, "logps/chosen": -201.34774780273438, "logps/rejected": -158.77896118164062, "loss": 53.4563, "rewards/accuracies": 0.484375, "rewards/chosen": 0.30983632802963257, "rewards/margins": 0.02700033411383629, "rewards/rejected": 0.282835990190506, "rewards/safe_rewards": 0.16020536422729492, "rewards/unsafe_rewards": 0.4594673216342926, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8371790417362986e-07, "logits/chosen": -2.4363036155700684, "logits/rejected": -2.2508435249328613, "logps/chosen": -194.97052001953125, "logps/rejected": -184.87435913085938, "loss": 15.1437, "rewards/accuracies": 0.5, "rewards/chosen": 0.3793017268180847, "rewards/margins": 0.050546444952487946, "rewards/rejected": 0.328755259513855, "rewards/safe_rewards": 0.453242689371109, "rewards/unsafe_rewards": 0.30536073446273804, "step": 950 }, { "epoch": 0.52, "learning_rate": 2.7905778821739056e-07, "logits/chosen": -2.430182456970215, "logits/rejected": -2.181687116622925, "logps/chosen": -207.5760955810547, "logps/rejected": -161.82400512695312, "loss": 36.165, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.2813587188720703, "rewards/margins": -0.009190035052597523, "rewards/rejected": 0.29054874181747437, "rewards/safe_rewards": 0.5300852060317993, "rewards/unsafe_rewards": 0.03263214975595474, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.74387413923817e-07, "logits/chosen": -2.3779215812683105, "logits/rejected": -2.2126498222351074, "logps/chosen": -216.20980834960938, "logps/rejected": -191.72068786621094, "loss": 35.9574, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.40009957551956177, "rewards/margins": 0.11698710918426514, "rewards/rejected": 0.28311246633529663, "rewards/safe_rewards": 0.3544066548347473, "rewards/unsafe_rewards": 0.4457924962043762, "step": 970 }, { "epoch": 0.53, "learning_rate": 2.69708430085812e-07, "logits/chosen": -2.442641496658325, "logits/rejected": -2.2196171283721924, "logps/chosen": -210.2590789794922, "logps/rejected": -178.38427734375, "loss": 143.7915, "rewards/accuracies": 0.46875, "rewards/chosen": 0.8005061149597168, "rewards/margins": 0.7907932996749878, "rewards/rejected": 0.009712839499115944, "rewards/safe_rewards": 1.2023097276687622, "rewards/unsafe_rewards": 0.39870262145996094, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6502248853572504e-07, "logits/chosen": -2.397225856781006, "logits/rejected": -2.1839497089385986, "logps/chosen": -191.41046142578125, "logps/rejected": -162.9442901611328, "loss": 12.2808, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.014359796419739723, "rewards/margins": -0.05102130025625229, "rewards/rejected": 0.06538109481334686, "rewards/safe_rewards": -0.18064935505390167, "rewards/unsafe_rewards": 0.20936894416809082, "step": 990 }, { "epoch": 0.54, "learning_rate": 2.6033124356220325e-07, "logits/chosen": -2.364447593688965, "logits/rejected": -2.1461973190307617, "logps/chosen": -199.1238555908203, "logps/rejected": -159.5116729736328, "loss": 34.7958, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.03608076646924019, "rewards/margins": -0.22421510517597198, "rewards/rejected": 0.1881343126296997, "rewards/safe_rewards": 0.28278595209121704, "rewards/unsafe_rewards": -0.3549474775791168, "step": 1000 }, { "epoch": 0.54, "eval_logits/chosen": -2.0551209449768066, "eval_logits/rejected": -1.7989723682403564, "eval_logps/chosen": -130.9921875, "eval_logps/rejected": -92.4808578491211, "eval_loss": 0.7397361993789673, "eval_rewards/accuracies": 0.5028436779975891, "eval_rewards/chosen": -0.12634092569351196, "eval_rewards/margins": 0.00263192574493587, "eval_rewards/rejected": -0.1289728581905365, "eval_rewards/safe_rewards": -0.12365306168794632, "eval_rewards/unsafe_rewards": -0.1263761818408966, "eval_runtime": 1869.3277, "eval_samples_per_second": 17.677, "eval_steps_per_second": 1.105, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.55636351326173e-07, "logits/chosen": -2.4121344089508057, "logits/rejected": -2.2164716720581055, "logps/chosen": -214.9409637451172, "logps/rejected": -175.6654815673828, "loss": 72.6154, "rewards/accuracies": 0.53125, "rewards/chosen": 0.20922379195690155, "rewards/margins": -0.15886008739471436, "rewards/rejected": 0.3680838942527771, "rewards/safe_rewards": 0.6282423734664917, "rewards/unsafe_rewards": -0.20979471504688263, "step": 1010 }, { "epoch": 0.55, "learning_rate": 2.509394692761622e-07, "logits/chosen": -2.39310884475708, "logits/rejected": -2.1510488986968994, "logps/chosen": -218.1635284423828, "logps/rejected": -180.8001251220703, "loss": 79.5377, "rewards/accuracies": 0.5, "rewards/chosen": 0.365112841129303, "rewards/margins": 0.28045016527175903, "rewards/rejected": 0.08466275036334991, "rewards/safe_rewards": 0.08056111633777618, "rewards/unsafe_rewards": 0.649664580821991, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.462422555631674e-07, "logits/chosen": -2.4212746620178223, "logits/rejected": -2.187579393386841, "logps/chosen": -197.0594024658203, "logps/rejected": -160.92257690429688, "loss": 30.6297, "rewards/accuracies": 0.515625, "rewards/chosen": 0.516227126121521, "rewards/margins": 0.2513945698738098, "rewards/rejected": 0.2648325562477112, "rewards/safe_rewards": 0.3707699179649353, "rewards/unsafe_rewards": 0.6616843938827515, "step": 1030 }, { "epoch": 0.56, "learning_rate": 2.415463684552728e-07, "logits/chosen": -2.3526053428649902, "logits/rejected": -2.168795585632324, "logps/chosen": -187.2362518310547, "logps/rejected": -158.90509033203125, "loss": 16.6677, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": 0.2424931526184082, "rewards/margins": -0.007419240660965443, "rewards/rejected": 0.2499123513698578, "rewards/safe_rewards": 0.3042396008968353, "rewards/unsafe_rewards": 0.1807466745376587, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.3685346575222807e-07, "logits/chosen": -2.388552188873291, "logits/rejected": -2.140934467315674, "logps/chosen": -206.6807098388672, "logps/rejected": -170.2689666748047, "loss": 9.8385, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.22210577130317688, "rewards/margins": -0.04671960324048996, "rewards/rejected": 0.26882538199424744, "rewards/safe_rewards": 0.3220168948173523, "rewards/unsafe_rewards": 0.12219462543725967, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.321652042001919e-07, "logits/chosen": -2.390388011932373, "logits/rejected": -2.10972261428833, "logps/chosen": -209.7392120361328, "logps/rejected": -183.13662719726562, "loss": 9.9643, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.16931462287902832, "rewards/margins": 0.08294131606817245, "rewards/rejected": 0.08637328445911407, "rewards/safe_rewards": 0.11830408871173859, "rewards/unsafe_rewards": 0.22032511234283447, "step": 1060 }, { "epoch": 0.58, "learning_rate": 2.2748323890684662e-07, "logits/chosen": -2.3839309215545654, "logits/rejected": -2.1623384952545166, "logps/chosen": -198.49668884277344, "logps/rejected": -169.58737182617188, "loss": 11.8899, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.18948496878147125, "rewards/margins": -0.021000146865844727, "rewards/rejected": 0.2104850709438324, "rewards/safe_rewards": 0.07809984683990479, "rewards/unsafe_rewards": 0.3008700907230377, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.2280922275709213e-07, "logits/chosen": -2.402510166168213, "logits/rejected": -2.1689133644104004, "logps/chosen": -204.17782592773438, "logps/rejected": -179.0993194580078, "loss": 46.4965, "rewards/accuracies": 0.484375, "rewards/chosen": -0.004160255193710327, "rewards/margins": 0.15431872010231018, "rewards/rejected": -0.1584789752960205, "rewards/safe_rewards": -0.011111170053482056, "rewards/unsafe_rewards": 0.0027906596660614014, "step": 1080 }, { "epoch": 0.59, "learning_rate": 2.1814480582952375e-07, "logits/chosen": -2.410515308380127, "logits/rejected": -2.184720993041992, "logps/chosen": -203.24267578125, "logps/rejected": -181.4256134033203, "loss": 102.4097, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.45176035165786743, "rewards/margins": 0.40241655707359314, "rewards/rejected": 0.04934380576014519, "rewards/safe_rewards": 0.11663278192281723, "rewards/unsafe_rewards": 0.7868879437446594, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1349163481390187e-07, "logits/chosen": -2.397282600402832, "logits/rejected": -2.194654703140259, "logps/chosen": -193.00746154785156, "logps/rejected": -171.80690002441406, "loss": 7.6309, "rewards/accuracies": 0.515625, "rewards/chosen": 0.22561688721179962, "rewards/margins": 0.030238542705774307, "rewards/rejected": 0.1953783482313156, "rewards/safe_rewards": 0.10761779546737671, "rewards/unsafe_rewards": 0.34361597895622253, "step": 1100 }, { "epoch": 0.6, "learning_rate": 2.0885135242981647e-07, "logits/chosen": -2.398287057876587, "logits/rejected": -2.1465389728546143, "logps/chosen": -213.0477752685547, "logps/rejected": -162.02694702148438, "loss": 7.6341, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.43572482466697693, "rewards/margins": 0.21323783695697784, "rewards/rejected": 0.22248701751232147, "rewards/safe_rewards": 0.5539884567260742, "rewards/unsafe_rewards": 0.3174612522125244, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.0422559684675494e-07, "logits/chosen": -2.4309935569763184, "logits/rejected": -2.1530261039733887, "logps/chosen": -217.1282958984375, "logps/rejected": -168.8966522216797, "loss": 12.2909, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.23469574749469757, "rewards/margins": -0.31583258509635925, "rewards/rejected": 0.08113676309585571, "rewards/safe_rewards": -0.3013322949409485, "rewards/unsafe_rewards": -0.16805927455425262, "step": 1120 }, { "epoch": 0.61, "learning_rate": 1.9961600110577457e-07, "logits/chosen": -2.349834680557251, "logits/rejected": -2.1397252082824707, "logps/chosen": -207.71615600585938, "logps/rejected": -192.11148071289062, "loss": 83.4484, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3409779667854309, "rewards/margins": -0.09031665325164795, "rewards/rejected": -0.25066131353378296, "rewards/safe_rewards": -0.046450722962617874, "rewards/unsafe_rewards": -0.6355050802230835, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.950241925429867e-07, "logits/chosen": -2.4354217052459717, "logits/rejected": -2.2282073497772217, "logps/chosen": -202.4095458984375, "logps/rejected": -172.94119262695312, "loss": 10.2059, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13358193635940552, "rewards/margins": 0.09321316331624985, "rewards/rejected": -0.22679507732391357, "rewards/safe_rewards": -0.3772156536579132, "rewards/unsafe_rewards": 0.110051728785038, "step": 1140 }, { "epoch": 0.62, "learning_rate": 1.9045179221505495e-07, "logits/chosen": -2.385145664215088, "logits/rejected": -2.1816518306732178, "logps/chosen": -222.2650909423828, "logps/rejected": -183.89297485351562, "loss": 70.6764, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.3544660210609436, "rewards/margins": -0.31712788343429565, "rewards/rejected": -0.037338145077228546, "rewards/safe_rewards": -0.14638884365558624, "rewards/unsafe_rewards": -0.5625432729721069, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.8590041432690893e-07, "logits/chosen": -2.3393194675445557, "logits/rejected": -2.157670736312866, "logps/chosen": -191.87765502929688, "logps/rejected": -167.9620819091797, "loss": 15.8742, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.08758888393640518, "rewards/margins": 0.09322256594896317, "rewards/rejected": -0.005633688066154718, "rewards/safe_rewards": 0.3342171907424927, "rewards/unsafe_rewards": -0.1590394526720047, "step": 1160 }, { "epoch": 0.63, "learning_rate": 1.813716656618788e-07, "logits/chosen": -2.371502637863159, "logits/rejected": -2.179802417755127, "logps/chosen": -185.43954467773438, "logps/rejected": -159.95692443847656, "loss": 31.7421, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.07055462151765823, "rewards/margins": -0.016725819557905197, "rewards/rejected": 0.08728043735027313, "rewards/safe_rewards": 0.0999542772769928, "rewards/unsafe_rewards": 0.041154973208904266, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.7686714501444788e-07, "logits/chosen": -2.408245086669922, "logits/rejected": -2.111708402633667, "logps/chosen": -220.0321807861328, "logps/rejected": -177.4727783203125, "loss": 30.3254, "rewards/accuracies": 0.46875, "rewards/chosen": 0.08441750705242157, "rewards/margins": -0.06825534999370575, "rewards/rejected": 0.15267284214496613, "rewards/safe_rewards": -0.1991117298603058, "rewards/unsafe_rewards": 0.36794668436050415, "step": 1180 }, { "epoch": 0.64, "learning_rate": 1.7238844262582768e-07, "logits/chosen": -2.3922970294952393, "logits/rejected": -2.2358450889587402, "logps/chosen": -214.66928100585938, "logps/rejected": -185.44973754882812, "loss": 25.5158, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.1254725158214569, "rewards/margins": -0.16257312893867493, "rewards/rejected": 0.28804564476013184, "rewards/safe_rewards": -0.2854710817337036, "rewards/unsafe_rewards": 0.5364161133766174, "step": 1190 }, { "epoch": 0.65, "learning_rate": 1.679371396225504e-07, "logits/chosen": -2.381708860397339, "logits/rejected": -2.1555020809173584, "logps/chosen": -204.30628967285156, "logps/rejected": -180.3300323486328, "loss": 22.5219, "rewards/accuracies": 0.453125, "rewards/chosen": 0.1503904014825821, "rewards/margins": -0.30795037746429443, "rewards/rejected": 0.45834073424339294, "rewards/safe_rewards": 0.4240906834602356, "rewards/unsafe_rewards": -0.12330994755029678, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6351480745828096e-07, "logits/chosen": -2.4050099849700928, "logits/rejected": -2.1825802326202393, "logps/chosen": -198.45777893066406, "logps/rejected": -172.17959594726562, "loss": 37.0212, "rewards/accuracies": 0.484375, "rewards/chosen": 0.6139317750930786, "rewards/margins": 0.5727913975715637, "rewards/rejected": 0.04114028066396713, "rewards/safe_rewards": 0.8607079386711121, "rewards/unsafe_rewards": 0.3671554923057556, "step": 1210 }, { "epoch": 0.66, "learning_rate": 1.5912300735904248e-07, "logits/chosen": -2.4449119567871094, "logits/rejected": -2.174882173538208, "logps/chosen": -223.2691192626953, "logps/rejected": -173.9079132080078, "loss": 21.6142, "rewards/accuracies": 0.503125011920929, "rewards/chosen": 0.29134517908096313, "rewards/margins": 0.21831652522087097, "rewards/rejected": 0.07302861660718918, "rewards/safe_rewards": 0.24711818993091583, "rewards/unsafe_rewards": 0.33557215332984924, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5476328977205395e-07, "logits/chosen": -2.383089542388916, "logits/rejected": -2.1814026832580566, "logps/chosen": -195.18643188476562, "logps/rejected": -165.39920043945312, "loss": 279.4912, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.45090776681900024, "rewards/margins": -0.30230337381362915, "rewards/rejected": -0.14860430359840393, "rewards/safe_rewards": 0.5930166840553284, "rewards/unsafe_rewards": -1.4948322772979736, "step": 1230 }, { "epoch": 0.67, "learning_rate": 1.5043719381837112e-07, "logits/chosen": -2.4133849143981934, "logits/rejected": -2.2195193767547607, "logps/chosen": -219.2970428466797, "logps/rejected": -189.27816772460938, "loss": 29.5997, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.13977238535881042, "rewards/margins": 0.13596734404563904, "rewards/rejected": 0.0038050352595746517, "rewards/safe_rewards": 0.10270917415618896, "rewards/unsafe_rewards": 0.17683559656143188, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.461462467495284e-07, "logits/chosen": -2.3971149921417236, "logits/rejected": -2.2017760276794434, "logps/chosen": -195.2748260498047, "logps/rejected": -168.03221130371094, "loss": 29.824, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.27122873067855835, "rewards/margins": 0.40271610021591187, "rewards/rejected": -0.13148736953735352, "rewards/safe_rewards": 0.28425708413124084, "rewards/unsafe_rewards": 0.25820040702819824, "step": 1250 }, { "epoch": 0.68, "learning_rate": 1.4189196340836865e-07, "logits/chosen": -2.4611334800720215, "logits/rejected": -2.2188827991485596, "logps/chosen": -199.0708465576172, "logps/rejected": -166.50717163085938, "loss": 42.7807, "rewards/accuracies": 0.484375, "rewards/chosen": 0.1606270670890808, "rewards/margins": 0.13977651298046112, "rewards/rejected": 0.020850548520684242, "rewards/safe_rewards": 0.23887856304645538, "rewards/unsafe_rewards": 0.08237558603286743, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.3767584569425561e-07, "logits/chosen": -2.5276553630828857, "logits/rejected": -2.2806928157806396, "logps/chosen": -214.76614379882812, "logps/rejected": -178.0789031982422, "loss": 7.7411, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.06127176806330681, "rewards/margins": 0.017500977963209152, "rewards/rejected": 0.043770790100097656, "rewards/safe_rewards": 0.12450921535491943, "rewards/unsafe_rewards": -0.0019656748045235872, "step": 1270 }, { "epoch": 0.69, "learning_rate": 1.334993820328541e-07, "logits/chosen": -2.461317539215088, "logits/rejected": -2.2503418922424316, "logps/chosen": -204.41952514648438, "logps/rejected": -171.56008911132812, "loss": 88.8508, "rewards/accuracies": 0.453125, "rewards/chosen": -0.142044335603714, "rewards/margins": -0.2231227457523346, "rewards/rejected": 0.0810784175992012, "rewards/safe_rewards": -0.2867421507835388, "rewards/unsafe_rewards": 0.002653457224369049, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.2936404685066852e-07, "logits/chosen": -2.3843283653259277, "logits/rejected": -2.1979799270629883, "logps/chosen": -205.46273803710938, "logps/rejected": -180.91793823242188, "loss": 66.3165, "rewards/accuracies": 0.484375, "rewards/chosen": -0.1233854666352272, "rewards/margins": -0.03703648969531059, "rewards/rejected": -0.0863489955663681, "rewards/safe_rewards": -0.44312816858291626, "rewards/unsafe_rewards": 0.19635725021362305, "step": 1290 }, { "epoch": 0.7, "learning_rate": 1.252713000545221e-07, "logits/chosen": -2.455895185470581, "logits/rejected": -2.2126731872558594, "logps/chosen": -211.90866088867188, "logps/rejected": -172.7696533203125, "loss": 8.9746, "rewards/accuracies": 0.4375, "rewards/chosen": 0.057679928839206696, "rewards/margins": -0.1852763295173645, "rewards/rejected": 0.2429562509059906, "rewards/safe_rewards": 0.14237050712108612, "rewards/unsafe_rewards": -0.027010658755898476, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2122258651616304e-07, "logits/chosen": -2.445269823074341, "logits/rejected": -2.224661350250244, "logps/chosen": -209.90713500976562, "logps/rejected": -173.6033935546875, "loss": 63.3258, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.0619967095553875, "rewards/margins": -0.29980406165122986, "rewards/rejected": 0.23780739307403564, "rewards/safe_rewards": -0.2617853283882141, "rewards/unsafe_rewards": 0.13779191672801971, "step": 1310 }, { "epoch": 0.71, "learning_rate": 1.1721933556217792e-07, "logits/chosen": -2.4175376892089844, "logits/rejected": -2.23214054107666, "logps/chosen": -195.77786254882812, "logps/rejected": -175.40225219726562, "loss": 11.5399, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.27427220344543457, "rewards/margins": 0.10898621380329132, "rewards/rejected": 0.16528600454330444, "rewards/safe_rewards": 0.21214981377124786, "rewards/unsafe_rewards": 0.3363945782184601, "step": 1320 }, { "epoch": 0.72, "learning_rate": 1.1326296046939333e-07, "logits/chosen": -2.3801956176757812, "logits/rejected": -2.162496328353882, "logps/chosen": -184.91856384277344, "logps/rejected": -153.4582061767578, "loss": 63.4268, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12642014026641846, "rewards/margins": -0.3640880286693573, "rewards/rejected": 0.23766788840293884, "rewards/safe_rewards": -0.23899023234844208, "rewards/unsafe_rewards": -0.013849982991814613, "step": 1330 }, { "epoch": 0.72, "learning_rate": 1.0935485796594351e-07, "logits/chosen": -2.4861056804656982, "logits/rejected": -2.239741086959839, "logps/chosen": -222.3768310546875, "logps/rejected": -176.0164337158203, "loss": 21.3914, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.18739810585975647, "rewards/margins": 0.09455545991659164, "rewards/rejected": 0.09284263849258423, "rewards/safe_rewards": 0.19133667647838593, "rewards/unsafe_rewards": 0.1834595501422882, "step": 1340 }, { "epoch": 0.73, "learning_rate": 1.0549640773818028e-07, "logits/chosen": -2.4289638996124268, "logits/rejected": -2.237046003341675, "logps/chosen": -204.95181274414062, "logps/rejected": -158.824951171875, "loss": 8.4938, "rewards/accuracies": 0.484375, "rewards/chosen": 0.01927146315574646, "rewards/margins": -0.0841434970498085, "rewards/rejected": 0.10341496765613556, "rewards/safe_rewards": 0.04857074096798897, "rewards/unsafe_rewards": -0.010027825832366943, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0168897194359921e-07, "logits/chosen": -2.4466030597686768, "logits/rejected": -2.194831132888794, "logps/chosen": -222.03775024414062, "logps/rejected": -183.56564331054688, "loss": 82.0212, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.2902601659297943, "rewards/margins": 0.6036115884780884, "rewards/rejected": -0.3133513927459717, "rewards/safe_rewards": 0.4282899498939514, "rewards/unsafe_rewards": 0.152230367064476, "step": 1360 }, { "epoch": 0.74, "learning_rate": 9.793389472995392e-08, "logits/chosen": -2.4077987670898438, "logits/rejected": -2.1739821434020996, "logps/chosen": -209.699951171875, "logps/rejected": -166.60293579101562, "loss": 11.3477, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.014321346767246723, "rewards/margins": -0.030286794528365135, "rewards/rejected": 0.015965450555086136, "rewards/safe_rewards": 0.03987512364983559, "rewards/unsafe_rewards": -0.06851781159639359, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.423250176072874e-08, "logits/chosen": -2.401275634765625, "logits/rejected": -2.192737340927124, "logps/chosen": -181.48147583007812, "logps/rejected": -154.23431396484375, "loss": 15.9486, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0320340096950531, "rewards/margins": 0.20615582168102264, "rewards/rejected": -0.23818981647491455, "rewards/safe_rewards": 0.015104318037629128, "rewards/unsafe_rewards": -0.07917235046625137, "step": 1380 }, { "epoch": 0.75, "learning_rate": 9.058609974713654e-08, "logits/chosen": -2.4539401531219482, "logits/rejected": -2.1792826652526855, "logps/chosen": -206.2873992919922, "logps/rejected": -171.813232421875, "loss": 27.4047, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.08179975301027298, "rewards/margins": -0.07024437934160233, "rewards/rejected": -0.011555373668670654, "rewards/safe_rewards": 0.006575888488441706, "rewards/unsafe_rewards": -0.17017540335655212, "step": 1390 }, { "epoch": 0.75, "learning_rate": 8.699597598680753e-08, "logits/chosen": -2.3884987831115723, "logits/rejected": -2.1706833839416504, "logps/chosen": -183.61544799804688, "logps/rejected": -168.7871856689453, "loss": 34.4575, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.44646185636520386, "rewards/margins": 0.1550489217042923, "rewards/rejected": 0.29141297936439514, "rewards/safe_rewards": 0.32389289140701294, "rewards/unsafe_rewards": 0.5690308809280396, "step": 1400 }, { "epoch": 0.76, "learning_rate": 8.346339790933166e-08, "logits/chosen": -2.4721839427948, "logits/rejected": -2.2297019958496094, "logps/chosen": -200.0784149169922, "logps/rejected": -159.7423858642578, "loss": 6.7397, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.12982772290706635, "rewards/margins": -0.028519075363874435, "rewards/rejected": 0.15834678709506989, "rewards/safe_rewards": -0.03010488487780094, "rewards/unsafe_rewards": 0.2897603511810303, "step": 1410 }, { "epoch": 0.76, "learning_rate": 7.998961262881506e-08, "logits/chosen": -2.418222665786743, "logits/rejected": -2.1581873893737793, "logps/chosen": -220.6064453125, "logps/rejected": -172.82266235351562, "loss": 6.4288, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.35179823637008667, "rewards/margins": 0.23103070259094238, "rewards/rejected": 0.12076754868030548, "rewards/safe_rewards": 0.3257552981376648, "rewards/unsafe_rewards": 0.37784117460250854, "step": 1420 }, { "epoch": 0.77, "learning_rate": 7.657584650360846e-08, "logits/chosen": -2.396697521209717, "logits/rejected": -2.2003862857818604, "logps/chosen": -199.44009399414062, "logps/rejected": -172.6617431640625, "loss": 35.7268, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.024801719933748245, "rewards/margins": 0.26587414741516113, "rewards/rejected": -0.2906758785247803, "rewards/safe_rewards": 0.05263688042759895, "rewards/unsafe_rewards": -0.10224030166864395, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.322330470336313e-08, "logits/chosen": -2.3913733959198, "logits/rejected": -2.189946413040161, "logps/chosen": -190.08120727539062, "logps/rejected": -170.0216522216797, "loss": 9.582, "rewards/accuracies": 0.546875, "rewards/chosen": -0.11784199625253677, "rewards/margins": -0.031096214428544044, "rewards/rejected": -0.08674577623605728, "rewards/safe_rewards": -0.293480783700943, "rewards/unsafe_rewards": 0.05779681354761124, "step": 1440 }, { "epoch": 0.78, "learning_rate": 6.993317078356709e-08, "logits/chosen": -2.3910608291625977, "logits/rejected": -2.2192938327789307, "logps/chosen": -199.07406616210938, "logps/rejected": -170.1977996826172, "loss": 45.9652, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.10676655918359756, "rewards/margins": -0.08676379173994064, "rewards/rejected": -0.02000277489423752, "rewards/safe_rewards": -0.0720798522233963, "rewards/unsafe_rewards": -0.14145328104496002, "step": 1450 }, { "epoch": 0.79, "learning_rate": 6.67066062677118e-08, "logits/chosen": -2.4357597827911377, "logits/rejected": -2.2244791984558105, "logps/chosen": -208.4618682861328, "logps/rejected": -167.52764892578125, "loss": 20.8808, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.2480795830488205, "rewards/margins": -0.08884197473526001, "rewards/rejected": 0.3369216322898865, "rewards/safe_rewards": 0.1537085473537445, "rewards/unsafe_rewards": 0.34245067834854126, "step": 1460 }, { "epoch": 0.79, "learning_rate": 6.354475023723685e-08, "logits/chosen": -2.3960747718811035, "logits/rejected": -2.1642906665802, "logps/chosen": -216.65756225585938, "logps/rejected": -171.6775665283203, "loss": 59.1855, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.5152679681777954, "rewards/margins": 0.2937913239002228, "rewards/rejected": 0.22147664427757263, "rewards/safe_rewards": 0.2597041726112366, "rewards/unsafe_rewards": 0.7708317041397095, "step": 1470 }, { "epoch": 0.8, "learning_rate": 6.044871892939746e-08, "logits/chosen": -2.4158756732940674, "logits/rejected": -2.2148139476776123, "logps/chosen": -225.4951171875, "logps/rejected": -189.0193328857422, "loss": 30.3887, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.14366625249385834, "rewards/margins": 0.25630486011505127, "rewards/rejected": -0.11263859272003174, "rewards/safe_rewards": 0.08603324741125107, "rewards/unsafe_rewards": 0.20129923522472382, "step": 1480 }, { "epoch": 0.8, "learning_rate": 5.741960534319676e-08, "logits/chosen": -2.391890525817871, "logits/rejected": -2.2089953422546387, "logps/chosen": -190.7472686767578, "logps/rejected": -160.5789031982422, "loss": 29.5828, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.04550771787762642, "rewards/margins": -0.21935884654521942, "rewards/rejected": 0.1738511323928833, "rewards/safe_rewards": -0.12179826200008392, "rewards/unsafe_rewards": 0.030782824382185936, "step": 1490 }, { "epoch": 0.81, "learning_rate": 5.44584788535217e-08, "logits/chosen": -2.4144439697265625, "logits/rejected": -2.209897994995117, "logps/chosen": -213.13632202148438, "logps/rejected": -176.97024536132812, "loss": 15.9924, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.15160061419010162, "rewards/margins": -0.01647660695016384, "rewards/rejected": -0.13512399792671204, "rewards/safe_rewards": -0.37716132402420044, "rewards/unsafe_rewards": 0.07396010309457779, "step": 1500 }, { "epoch": 0.81, "eval_logits/chosen": -2.084881544113159, "eval_logits/rejected": -1.833509922027588, "eval_logps/chosen": -131.02365112304688, "eval_logps/rejected": -92.45955657958984, "eval_loss": 0.6823093295097351, "eval_rewards/accuracies": 0.4713214039802551, "eval_rewards/chosen": -0.1577797532081604, "eval_rewards/margins": -0.05011267587542534, "eval_rewards/rejected": -0.10766706615686417, "eval_rewards/safe_rewards": -0.15565218031406403, "eval_rewards/unsafe_rewards": -0.15351000428199768, "eval_runtime": 1880.4558, "eval_samples_per_second": 17.572, "eval_steps_per_second": 1.099, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.156638483361933e-08, "logits/chosen": -2.435300827026367, "logits/rejected": -2.1943700313568115, "logps/chosen": -206.97384643554688, "logps/rejected": -174.73373413085938, "loss": 6.0946, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": -0.2385288029909134, "rewards/margins": 0.05665416270494461, "rewards/rejected": -0.295183002948761, "rewards/safe_rewards": -0.1594325453042984, "rewards/unsafe_rewards": -0.3176250755786896, "step": 1510 }, { "epoch": 0.82, "learning_rate": 4.8744344286046236e-08, "logits/chosen": -2.4003233909606934, "logits/rejected": -2.177899122238159, "logps/chosen": -207.0956573486328, "logps/rejected": -169.01504516601562, "loss": 45.2569, "rewards/accuracies": 0.4906249940395355, "rewards/chosen": 0.26545172929763794, "rewards/margins": 0.4598563611507416, "rewards/rejected": -0.19440460205078125, "rewards/safe_rewards": 0.313471257686615, "rewards/unsafe_rewards": 0.21743226051330566, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.599335348222169e-08, "logits/chosen": -2.4335553646087646, "logits/rejected": -2.246596574783325, "logps/chosen": -207.1642608642578, "logps/rejected": -186.18722534179688, "loss": 5.5153, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0844786986708641, "rewards/margins": 0.10317480564117432, "rewards/rejected": -0.1876535415649414, "rewards/safe_rewards": -0.22987417876720428, "rewards/unsafe_rewards": 0.06091681867837906, "step": 1530 }, { "epoch": 0.83, "learning_rate": 4.331438361071163e-08, "logits/chosen": -2.3511147499084473, "logits/rejected": -2.206602096557617, "logps/chosen": -212.7078857421875, "logps/rejected": -194.2686767578125, "loss": 21.5544, "rewards/accuracies": 0.47187501192092896, "rewards/chosen": -0.17899686098098755, "rewards/margins": 0.10844133794307709, "rewards/rejected": -0.28743821382522583, "rewards/safe_rewards": -0.30973827838897705, "rewards/unsafe_rewards": -0.04825545474886894, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.0708380434367864e-08, "logits/chosen": -2.4302127361297607, "logits/rejected": -2.1903905868530273, "logps/chosen": -199.45376586914062, "logps/rejected": -171.4712677001953, "loss": 11.7548, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.16563379764556885, "rewards/margins": -0.16774305701255798, "rewards/rejected": 0.0021092891693115234, "rewards/safe_rewards": -0.17964015901088715, "rewards/unsafe_rewards": -0.15162742137908936, "step": 1550 }, { "epoch": 0.84, "learning_rate": 3.817626395644305e-08, "logits/chosen": -2.428711414337158, "logits/rejected": -2.232553005218506, "logps/chosen": -206.1396942138672, "logps/rejected": -177.48374938964844, "loss": 20.469, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.2369929999113083, "rewards/margins": -0.06113150715827942, "rewards/rejected": -0.17586149275302887, "rewards/safe_rewards": -0.15375518798828125, "rewards/unsafe_rewards": -0.32023078203201294, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.571892809580013e-08, "logits/chosen": -2.395301580429077, "logits/rejected": -2.1873881816864014, "logps/chosen": -195.25765991210938, "logps/rejected": -175.76754760742188, "loss": 47.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.030344385653734207, "rewards/margins": 0.31907138228416443, "rewards/rejected": -0.34941577911376953, "rewards/safe_rewards": -0.09232734888792038, "rewards/unsafe_rewards": 0.03163857385516167, "step": 1570 }, { "epoch": 0.85, "learning_rate": 3.333724037132976e-08, "logits/chosen": -2.4109716415405273, "logits/rejected": -2.1891541481018066, "logps/chosen": -198.6385040283203, "logps/rejected": -170.99563598632812, "loss": 5.2251, "rewards/accuracies": 0.4281249940395355, "rewards/chosen": -0.1846873015165329, "rewards/margins": -0.06299707293510437, "rewards/rejected": -0.12169022858142853, "rewards/safe_rewards": -0.1805131733417511, "rewards/unsafe_rewards": -0.18886145949363708, "step": 1580 }, { "epoch": 0.86, "learning_rate": 3.1032041595688506e-08, "logits/chosen": -2.3785929679870605, "logits/rejected": -2.171466827392578, "logps/chosen": -216.2442626953125, "logps/rejected": -185.38406372070312, "loss": 21.8766, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.09203994274139404, "rewards/margins": -0.34605592489242554, "rewards/rejected": 0.2540159523487091, "rewards/safe_rewards": -0.014932965859770775, "rewards/unsafe_rewards": -0.16914694011211395, "step": 1590 }, { "epoch": 0.86, "learning_rate": 2.880414557846453e-08, "logits/chosen": -2.4211525917053223, "logits/rejected": -2.259765863418579, "logps/chosen": -200.02296447753906, "logps/rejected": -164.5922393798828, "loss": 78.4789, "rewards/accuracies": 0.503125011920929, "rewards/chosen": -0.19741728901863098, "rewards/margins": 0.0014421313535422087, "rewards/rejected": -0.19885942339897156, "rewards/safe_rewards": -0.07222781330347061, "rewards/unsafe_rewards": -0.32260677218437195, "step": 1600 }, { "epoch": 0.87, "learning_rate": 2.6654338838876662e-08, "logits/chosen": -2.4327399730682373, "logits/rejected": -2.1489098072052, "logps/chosen": -206.57406616210938, "logps/rejected": -162.18191528320312, "loss": 31.8357, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.03170815855264664, "rewards/margins": 0.1048579216003418, "rewards/rejected": -0.13656608760356903, "rewards/safe_rewards": -0.28655681014060974, "rewards/unsafe_rewards": 0.22314047813415527, "step": 1610 }, { "epoch": 0.87, "learning_rate": 2.4583380328107805e-08, "logits/chosen": -2.4065799713134766, "logits/rejected": -2.168668508529663, "logps/chosen": -219.0827178955078, "logps/rejected": -174.2585906982422, "loss": 19.5756, "rewards/accuracies": 0.53125, "rewards/chosen": -0.28594768047332764, "rewards/margins": -0.2515542805194855, "rewards/rejected": -0.034393392503261566, "rewards/safe_rewards": -0.40954461693763733, "rewards/unsafe_rewards": -0.16235077381134033, "step": 1620 }, { "epoch": 0.88, "learning_rate": 2.259200116137039e-08, "logits/chosen": -2.381093740463257, "logits/rejected": -2.1939659118652344, "logps/chosen": -204.22921752929688, "logps/rejected": -187.06576538085938, "loss": 169.2986, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.762048602104187, "rewards/margins": 0.7863933444023132, "rewards/rejected": -0.024344712495803833, "rewards/safe_rewards": 0.9233208894729614, "rewards/unsafe_rewards": 0.6007765531539917, "step": 1630 }, { "epoch": 0.88, "learning_rate": 2.068090435979958e-08, "logits/chosen": -2.3571343421936035, "logits/rejected": -2.1805238723754883, "logps/chosen": -194.33248901367188, "logps/rejected": -165.1163787841797, "loss": 56.9496, "rewards/accuracies": 0.484375, "rewards/chosen": -0.3039317727088928, "rewards/margins": -0.40204334259033203, "rewards/rejected": 0.09811154752969742, "rewards/safe_rewards": 0.22571036219596863, "rewards/unsafe_rewards": -0.8335739374160767, "step": 1640 }, { "epoch": 0.89, "learning_rate": 1.8850764602263423e-08, "logits/chosen": -2.415358304977417, "logits/rejected": -2.1450016498565674, "logps/chosen": -200.4285888671875, "logps/rejected": -173.5339813232422, "loss": 27.6237, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.12354181706905365, "rewards/margins": -0.11882360279560089, "rewards/rejected": -0.0047182366251945496, "rewards/safe_rewards": -0.05137089639902115, "rewards/unsafe_rewards": -0.19571277499198914, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.710222798718028e-08, "logits/chosen": -2.4396722316741943, "logits/rejected": -2.2350778579711914, "logps/chosen": -203.3378448486328, "logps/rejected": -178.9970245361328, "loss": 22.9552, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.003960543777793646, "rewards/margins": 0.2720267176628113, "rewards/rejected": -0.2680661380290985, "rewards/safe_rewards": 0.06284536421298981, "rewards/unsafe_rewards": -0.054924286901950836, "step": 1660 }, { "epoch": 0.9, "learning_rate": 1.5435911804424356e-08, "logits/chosen": -2.4028568267822266, "logits/rejected": -2.2349255084991455, "logps/chosen": -229.4883270263672, "logps/rejected": -185.6358642578125, "loss": 31.5896, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.2171170711517334, "rewards/margins": 0.017191190272569656, "rewards/rejected": -0.23430824279785156, "rewards/safe_rewards": 0.15241694450378418, "rewards/unsafe_rewards": -0.586651086807251, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.3852404317403199e-08, "logits/chosen": -2.395153284072876, "logits/rejected": -2.2008633613586426, "logps/chosen": -220.5502471923828, "logps/rejected": -194.44186401367188, "loss": 26.1714, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.21780619025230408, "rewards/margins": -0.0533115491271019, "rewards/rejected": -0.16449466347694397, "rewards/safe_rewards": -0.4852879047393799, "rewards/unsafe_rewards": 0.04967564344406128, "step": 1680 }, { "epoch": 0.91, "learning_rate": 1.235226455538113e-08, "logits/chosen": -2.4504330158233643, "logits/rejected": -2.2494871616363525, "logps/chosen": -201.50564575195312, "logps/rejected": -167.95364379882812, "loss": 5.2467, "rewards/accuracies": 0.46875, "rewards/chosen": -0.037842657417058945, "rewards/margins": -0.06888096779584885, "rewards/rejected": 0.031038302928209305, "rewards/safe_rewards": -0.037054188549518585, "rewards/unsafe_rewards": -0.0386311374604702, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.0936022116124321e-08, "logits/chosen": -2.4290854930877686, "logits/rejected": -2.204906463623047, "logps/chosen": -199.54847717285156, "logps/rejected": -165.02816772460938, "loss": 41.987, "rewards/accuracies": 0.5093749761581421, "rewards/chosen": -0.15811631083488464, "rewards/margins": -0.3054047226905823, "rewards/rejected": 0.14728839695453644, "rewards/safe_rewards": -0.39381590485572815, "rewards/unsafe_rewards": 0.07758323848247528, "step": 1700 }, { "epoch": 0.92, "learning_rate": 9.60417697893534e-09, "logits/chosen": -2.4069314002990723, "logits/rejected": -2.2242488861083984, "logps/chosen": -199.82015991210938, "logps/rejected": -173.9343719482422, "loss": 22.6453, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": 0.1551128476858139, "rewards/margins": -0.032198842614889145, "rewards/rejected": 0.18731167912483215, "rewards/safe_rewards": 0.21557751297950745, "rewards/unsafe_rewards": 0.09464815258979797, "step": 1710 }, { "epoch": 0.93, "learning_rate": 8.357199328144576e-09, "logits/chosen": -2.4046077728271484, "logits/rejected": -2.2161166667938232, "logps/chosen": -216.55093383789062, "logps/rejected": -187.6187286376953, "loss": 56.4505, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.08073421567678452, "rewards/margins": -0.025125902146100998, "rewards/rejected": 0.10586012899875641, "rewards/safe_rewards": 0.07104392349720001, "rewards/unsafe_rewards": 0.09042453020811081, "step": 1720 }, { "epoch": 0.93, "learning_rate": 7.1955293871198144e-09, "logits/chosen": -2.4008450508117676, "logits/rejected": -2.261340379714966, "logps/chosen": -187.19436645507812, "logps/rejected": -169.91722106933594, "loss": 18.4483, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.12119190394878387, "rewards/margins": -0.34029996395111084, "rewards/rejected": 0.21910807490348816, "rewards/safe_rewards": -0.20904748141765594, "rewards/unsafe_rewards": -0.03333630412817001, "step": 1730 }, { "epoch": 0.94, "learning_rate": 6.119577262853254e-09, "logits/chosen": -2.4227774143218994, "logits/rejected": -2.1880428791046143, "logps/chosen": -193.263671875, "logps/rejected": -162.72183227539062, "loss": 27.596, "rewards/accuracies": 0.5, "rewards/chosen": -0.09984199702739716, "rewards/margins": -0.025053083896636963, "rewards/rejected": -0.07478892058134079, "rewards/safe_rewards": -0.26086345314979553, "rewards/unsafe_rewards": 0.06117943674325943, "step": 1740 }, { "epoch": 0.94, "learning_rate": 5.129722801180542e-09, "logits/chosen": -2.3443946838378906, "logits/rejected": -2.1799635887145996, "logps/chosen": -197.2679443359375, "logps/rejected": -180.6214599609375, "loss": 19.3736, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3499890863895416, "rewards/margins": -0.13464350998401642, "rewards/rejected": -0.2153455764055252, "rewards/safe_rewards": -0.1743151694536209, "rewards/unsafe_rewards": -0.5256629586219788, "step": 1750 }, { "epoch": 0.95, "learning_rate": 4.226315452682816e-09, "logits/chosen": -2.413181781768799, "logits/rejected": -2.187439441680908, "logps/chosen": -196.54916381835938, "logps/rejected": -173.30929565429688, "loss": 31.0958, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.147268146276474, "rewards/margins": 0.10999511182308197, "rewards/rejected": -0.2572632431983948, "rewards/safe_rewards": -0.11902125179767609, "rewards/unsafe_rewards": -0.17551502585411072, "step": 1760 }, { "epoch": 0.95, "learning_rate": 3.4096741493194193e-09, "logits/chosen": -2.443580389022827, "logits/rejected": -2.2651684284210205, "logps/chosen": -199.41049194335938, "logps/rejected": -173.95718383789062, "loss": 9.8716, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.056076280772686005, "rewards/margins": 0.028754467144608498, "rewards/rejected": 0.027321819216012955, "rewards/safe_rewards": 0.034864675253629684, "rewards/unsafe_rewards": 0.07728789001703262, "step": 1770 }, { "epoch": 0.96, "learning_rate": 2.6800871918346846e-09, "logits/chosen": -2.4057886600494385, "logits/rejected": -2.155165672302246, "logps/chosen": -203.48025512695312, "logps/rejected": -172.94015502929688, "loss": 41.8074, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.21421018242835999, "rewards/margins": -0.43174242973327637, "rewards/rejected": 0.2175322324037552, "rewards/safe_rewards": -0.06811753660440445, "rewards/unsafe_rewards": -0.36030280590057373, "step": 1780 }, { "epoch": 0.96, "learning_rate": 2.0378121479783796e-09, "logits/chosen": -2.389869213104248, "logits/rejected": -2.1555044651031494, "logps/chosen": -196.02059936523438, "logps/rejected": -167.43655395507812, "loss": 61.0971, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.008689677342772484, "rewards/margins": 0.18058201670646667, "rewards/rejected": -0.17189235985279083, "rewards/safe_rewards": 0.042548321187496185, "rewards/unsafe_rewards": -0.025168979540467262, "step": 1790 }, { "epoch": 0.97, "learning_rate": 1.4830757615760247e-09, "logits/chosen": -2.4289557933807373, "logits/rejected": -2.1850333213806152, "logps/chosen": -207.24124145507812, "logps/rejected": -170.49305725097656, "loss": 144.1229, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.07967615127563477, "rewards/margins": 0.1154344230890274, "rewards/rejected": -0.19511058926582336, "rewards/safe_rewards": -0.10584060847759247, "rewards/unsafe_rewards": -0.05351167917251587, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.0160738724809548e-09, "logits/chosen": -2.4409990310668945, "logits/rejected": -2.207919120788574, "logps/chosen": -196.10601806640625, "logps/rejected": -171.36843872070312, "loss": 18.7773, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.08281825482845306, "rewards/margins": 0.23606376349925995, "rewards/rejected": -0.3188820481300354, "rewards/safe_rewards": 0.05616650730371475, "rewards/unsafe_rewards": -0.2218029946088791, "step": 1810 }, { "epoch": 0.98, "learning_rate": 6.369713474366212e-10, "logits/chosen": -2.420626640319824, "logits/rejected": -2.1977345943450928, "logps/chosen": -219.4222869873047, "logps/rejected": -181.95010375976562, "loss": 17.5266, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.16265834867954254, "rewards/margins": -0.11877261102199554, "rewards/rejected": -0.0438857302069664, "rewards/safe_rewards": -0.035406678915023804, "rewards/unsafe_rewards": -0.2899099886417389, "step": 1820 }, { "epoch": 0.98, "learning_rate": 3.459020218731512e-10, "logits/chosen": -2.4327456951141357, "logits/rejected": -2.220496654510498, "logps/chosen": -202.61898803710938, "logps/rejected": -167.197021484375, "loss": 43.7242, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.15349379181861877, "rewards/margins": 0.14935937523841858, "rewards/rejected": 0.004134447779506445, "rewards/safe_rewards": 0.08035097271203995, "rewards/unsafe_rewards": 0.2266366183757782, "step": 1830 }, { "epoch": 0.99, "learning_rate": 1.429686526593088e-10, "logits/chosen": -2.398090124130249, "logits/rejected": -2.192744255065918, "logps/chosen": -206.80520629882812, "logps/rejected": -175.9212646484375, "loss": 23.3409, "rewards/accuracies": 0.453125, "rewards/chosen": 0.3794136941432953, "rewards/margins": 0.2774004638195038, "rewards/rejected": 0.10201327502727509, "rewards/safe_rewards": 0.5806846022605896, "rewards/unsafe_rewards": 0.17814283072948456, "step": 1840 }, { "epoch": 1.0, "learning_rate": 2.824288182584622e-11, "logits/chosen": -2.4241063594818115, "logits/rejected": -2.2421114444732666, "logps/chosen": -206.7459716796875, "logps/rejected": -168.176513671875, "loss": 19.5817, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.24208331108093262, "rewards/margins": -0.22288334369659424, "rewards/rejected": -0.019199971109628677, "rewards/safe_rewards": 0.039939720183610916, "rewards/unsafe_rewards": -0.5241063237190247, "step": 1850 }, { "epoch": 1.0, "step": 1858, "total_flos": 0.0, "train_loss": 67.04043597990268, "train_runtime": 46860.0347, "train_samples_per_second": 1.269, "train_steps_per_second": 0.04 } ], "logging_steps": 10, "max_steps": 1858, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }