diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -15,7 +15,7 @@ "logits/rejected": -2.057170867919922, "logps/chosen": -246.4422607421875, "logps/rejected": -173.7652587890625, - "loss": 7612.5, + "loss": 0.5938, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -27,3025 +27,3025 @@ { "epoch": 0.01, "learning_rate": 2.6881720430107527e-08, - "logits/chosen": -2.3338747024536133, - "logits/rejected": -2.1101577281951904, - "logps/chosen": -199.17718505859375, - "logps/rejected": -169.33853149414062, - "loss": 8542.1936, - "rewards/accuracies": 0.4236111044883728, - "rewards/chosen": -0.00019407388754189014, - "rewards/margins": -0.0004579208616632968, - "rewards/rejected": 0.00026384700322523713, - "rewards/safe_rewards": -7.85427400842309e-05, - "rewards/unsafe_rewards": -0.0003096049767918885, + "logits/chosen": -2.3338096141815186, + "logits/rejected": -2.1100988388061523, + "logps/chosen": -199.19329833984375, + "logps/rejected": -169.358642578125, + "loss": 1.134, + "rewards/accuracies": 0.4097222089767456, + "rewards/chosen": -0.03551425039768219, + "rewards/margins": -0.041799187660217285, + "rewards/rejected": 0.006284935399889946, + "rewards/safe_rewards": -0.01677405834197998, + "rewards/unsafe_rewards": -0.0542544424533844, "step": 10 }, { "epoch": 0.01, "learning_rate": 5.3763440860215054e-08, - "logits/chosen": -2.3297791481018066, - "logits/rejected": -2.0859668254852295, - "logps/chosen": -215.31082153320312, - "logps/rejected": -176.90184020996094, - "loss": 8615.6992, - "rewards/accuracies": 0.503125011920929, - "rewards/chosen": -0.0001893683074740693, - "rewards/margins": -0.00013989376020617783, - "rewards/rejected": -4.9474612751509994e-05, - "rewards/safe_rewards": 2.0141320419497788e-05, - "rewards/unsafe_rewards": -0.00039887792081572115, + "logits/chosen": -2.329479694366455, + "logits/rejected": -2.0858876705169678, + "logps/chosen": -215.32296752929688, + "logps/rejected": -176.8864288330078, + "loss": 1.1266, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.031086910516023636, + "rewards/margins": -0.04154179245233536, + "rewards/rejected": 0.010454884730279446, + "rewards/safe_rewards": -0.04110833257436752, + "rewards/unsafe_rewards": -0.021065494045615196, "step": 20 }, { "epoch": 0.02, "learning_rate": 8.064516129032257e-08, - "logits/chosen": -2.3232197761535645, - "logits/rejected": -2.104114532470703, - "logps/chosen": -199.29653930664062, - "logps/rejected": -180.81167602539062, - "loss": 8655.0875, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 7.340547745116055e-05, - "rewards/margins": 0.00046515712165273726, - "rewards/rejected": -0.00039175161509774625, - "rewards/safe_rewards": 0.00016999247600324452, - "rewards/unsafe_rewards": -2.318151564395521e-05, + "logits/chosen": -2.322885036468506, + "logits/rejected": -2.1038832664489746, + "logps/chosen": -199.3030242919922, + "logps/rejected": -180.7991943359375, + "loss": 1.1716, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.0008645713096484542, + "rewards/margins": 0.027558892965316772, + "rewards/rejected": -0.026694318279623985, + "rewards/safe_rewards": -0.0032820613123476505, + "rewards/unsafe_rewards": 0.005011203698813915, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.0752688172043011e-07, - "logits/chosen": -2.26924991607666, - "logits/rejected": -1.9993360042572021, - "logps/chosen": -197.77706909179688, - "logps/rejected": -177.79495239257812, - "loss": 8307.5406, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": -0.0003278615477029234, - "rewards/margins": 0.000557084393221885, - "rewards/rejected": -0.000884945853613317, - "rewards/safe_rewards": -0.00021670451678801328, - "rewards/unsafe_rewards": -0.0004390186513774097, + "logits/chosen": -2.268714427947998, + "logits/rejected": -1.9988443851470947, + "logps/chosen": -197.72109985351562, + "logps/rejected": -177.70603942871094, + "loss": 1.1036, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.023218240588903427, + "rewards/margins": 0.022794129326939583, + "rewards/rejected": 0.0004241138813085854, + "rewards/safe_rewards": 0.03502867370843887, + "rewards/unsafe_rewards": 0.011407810263335705, "step": 40 }, { "epoch": 0.03, "learning_rate": 1.3440860215053762e-07, - "logits/chosen": -2.3771767616271973, - "logits/rejected": -2.081153392791748, - "logps/chosen": -191.57861328125, - "logps/rejected": -162.315185546875, - "loss": 8305.5875, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0004353286640252918, - "rewards/margins": 0.0028661643154919147, - "rewards/rejected": -0.0024308357387781143, - "rewards/safe_rewards": -0.00014582322910428047, - "rewards/unsafe_rewards": 0.0010164804989472032, + "logits/chosen": -2.374366283416748, + "logits/rejected": -2.07818603515625, + "logps/chosen": -191.63714599609375, + "logps/rejected": -162.17771911621094, + "loss": 1.1473, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -0.015013009309768677, + "rewards/margins": 0.0906001627445221, + "rewards/rejected": -0.10561318695545197, + "rewards/safe_rewards": -0.018471335992217064, + "rewards/unsafe_rewards": -0.011554678902029991, "step": 50 }, { "epoch": 0.03, "learning_rate": 1.6129032258064515e-07, - "logits/chosen": -2.350670576095581, - "logits/rejected": -2.13375186920166, - "logps/chosen": -186.54489135742188, - "logps/rejected": -175.369873046875, - "loss": 8113.0641, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.00035417350591160357, - "rewards/margins": 0.0031304885633289814, - "rewards/rejected": -0.0034846621565520763, - "rewards/safe_rewards": -0.0002569267526268959, - "rewards/unsafe_rewards": -0.0004514198808465153, + "logits/chosen": -2.346019983291626, + "logits/rejected": -2.1285576820373535, + "logps/chosen": -186.499755859375, + "logps/rejected": -175.0586700439453, + "loss": 1.0107, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.009731076657772064, + "rewards/margins": 0.04699288681149483, + "rewards/rejected": -0.037261806428432465, + "rewards/safe_rewards": -0.01583387330174446, + "rewards/unsafe_rewards": 0.03529602661728859, "step": 60 }, { "epoch": 0.04, "learning_rate": 1.8817204301075268e-07, - "logits/chosen": -2.322838306427002, - "logits/rejected": -2.109368085861206, - "logps/chosen": -221.5357208251953, - "logps/rejected": -179.95974731445312, - "loss": 8132.6656, - "rewards/accuracies": 0.5718749761581421, - "rewards/chosen": -0.0014549014158546925, - "rewards/margins": 0.006572114769369364, - "rewards/rejected": -0.008027016185224056, - "rewards/safe_rewards": -0.00250421604141593, - "rewards/unsafe_rewards": -0.00040558649925515056, + "logits/chosen": -2.3234503269195557, + "logits/rejected": -2.110891819000244, + "logps/chosen": -221.27426147460938, + "logps/rejected": -179.11380004882812, + "loss": 2.1985, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.11594200134277344, + "rewards/margins": 0.07270021736621857, + "rewards/rejected": 0.04324179142713547, + "rewards/safe_rewards": 0.0875079482793808, + "rewards/unsafe_rewards": 0.14437603950500488, "step": 70 }, { "epoch": 0.04, "learning_rate": 2.1505376344086022e-07, - "logits/chosen": -2.331857919692993, - "logits/rejected": -2.11533784866333, - "logps/chosen": -198.42869567871094, - "logps/rejected": -178.8989715576172, - "loss": 8433.2156, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.010765710845589638, - "rewards/margins": 0.009102851152420044, - "rewards/rejected": -0.019868558272719383, - "rewards/safe_rewards": -0.01374301128089428, - "rewards/unsafe_rewards": -0.007788407150655985, + "logits/chosen": -2.3453927040100098, + "logits/rejected": -2.1327505111694336, + "logps/chosen": -197.19949340820312, + "logps/rejected": -176.77151489257812, + "loss": 2.7155, + "rewards/accuracies": 0.4468750059604645, + "rewards/chosen": 0.15263572335243225, + "rewards/margins": 0.012048400938510895, + "rewards/rejected": 0.14058732986450195, + "rewards/safe_rewards": 0.18941155076026917, + "rewards/unsafe_rewards": 0.11585988849401474, "step": 80 }, { "epoch": 0.05, "learning_rate": 2.4193548387096775e-07, - "logits/chosen": -2.335395097732544, - "logits/rejected": -2.1030659675598145, - "logps/chosen": -218.33627319335938, - "logps/rejected": -172.68142700195312, - "loss": 8180.8656, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.022065121680498123, - "rewards/margins": 0.01963471807539463, - "rewards/rejected": -0.0416998453438282, - "rewards/safe_rewards": -0.02183777093887329, - "rewards/unsafe_rewards": -0.022292476147413254, + "logits/chosen": -2.3641719818115234, + "logits/rejected": -2.137413263320923, + "logps/chosen": -216.1211395263672, + "logps/rejected": -168.5092315673828, + "loss": 2.721, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.008618640713393688, + "rewards/margins": 0.006397470831871033, + "rewards/rejected": 0.002221171511337161, + "rewards/safe_rewards": 0.031759221106767654, + "rewards/unsafe_rewards": -0.014521944336593151, "step": 90 }, { "epoch": 0.05, "learning_rate": 2.6881720430107523e-07, - "logits/chosen": -2.324553966522217, - "logits/rejected": -2.1137032508850098, - "logps/chosen": -206.8110809326172, - "logps/rejected": -198.6154327392578, - "loss": 8117.1062, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.050160281360149384, - "rewards/margins": 0.0306589063256979, - "rewards/rejected": -0.08081920444965363, - "rewards/safe_rewards": -0.04988502338528633, - "rewards/unsafe_rewards": -0.05043555423617363, + "logits/chosen": -2.360917568206787, + "logits/rejected": -2.153608798980713, + "logps/chosen": -201.7233428955078, + "logps/rejected": -190.54605102539062, + "loss": 1.4712, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.07169636338949203, + "rewards/margins": 0.08422265201807022, + "rewards/rejected": -0.012526283040642738, + "rewards/safe_rewards": 0.09221886098384857, + "rewards/unsafe_rewards": 0.0511738546192646, "step": 100 }, { "epoch": 0.06, "learning_rate": 2.956989247311828e-07, - "logits/chosen": -2.341404914855957, - "logits/rejected": -2.1051011085510254, - "logps/chosen": -215.013427734375, - "logps/rejected": -187.58670043945312, - "loss": 8260.3953, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.0800362378358841, - "rewards/margins": 0.03362729027867317, - "rewards/rejected": -0.11366353929042816, - "rewards/safe_rewards": -0.079714335501194, - "rewards/unsafe_rewards": -0.08035816252231598, + "logits/chosen": -2.3796088695526123, + "logits/rejected": -2.148357629776001, + "logps/chosen": -207.0086212158203, + "logps/rejected": -176.24658203125, + "loss": 4.9646, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.0011837140191346407, + "rewards/margins": 0.02744489349424839, + "rewards/rejected": -0.02626117691397667, + "rewards/safe_rewards": -0.013010969385504723, + "rewards/unsafe_rewards": 0.015378397889435291, "step": 110 }, { "epoch": 0.06, "learning_rate": 3.225806451612903e-07, - "logits/chosen": -2.317996025085449, - "logits/rejected": -2.05900502204895, - "logps/chosen": -213.1697235107422, - "logps/rejected": -182.4001922607422, - "loss": 7881.807, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.09190914034843445, - "rewards/margins": 0.043657559901475906, - "rewards/rejected": -0.13556669652462006, - "rewards/safe_rewards": -0.08523599803447723, - "rewards/unsafe_rewards": -0.09858228266239166, + "logits/chosen": -2.378938913345337, + "logits/rejected": -2.1289939880371094, + "logps/chosen": -203.86172485351562, + "logps/rejected": -168.72509765625, + "loss": 5.8793, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.11708948761224747, + "rewards/margins": -0.0013303399318829179, + "rewards/rejected": 0.1184198409318924, + "rewards/safe_rewards": 0.14375139772891998, + "rewards/unsafe_rewards": 0.09042758494615555, "step": 120 }, { "epoch": 0.07, "learning_rate": 3.4946236559139783e-07, - "logits/chosen": -2.354461193084717, - "logits/rejected": -2.1032989025115967, - "logps/chosen": -220.25735473632812, - "logps/rejected": -181.8567657470703, - "loss": 7824.5234, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.09244942665100098, - "rewards/margins": 0.05032774806022644, - "rewards/rejected": -0.14277717471122742, - "rewards/safe_rewards": -0.07864584028720856, - "rewards/unsafe_rewards": -0.1062530130147934, + "logits/chosen": -2.4672460556030273, + "logits/rejected": -2.235044479370117, + "logps/chosen": -211.15414428710938, + "logps/rejected": -167.7396697998047, + "loss": 2.9066, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": -0.141743004322052, + "rewards/margins": 0.018900588154792786, + "rewards/rejected": -0.1606435775756836, + "rewards/safe_rewards": -0.20868048071861267, + "rewards/unsafe_rewards": -0.07480548322200775, "step": 130 }, { "epoch": 0.08, "learning_rate": 3.7634408602150537e-07, - "logits/chosen": -2.331094741821289, - "logits/rejected": -2.0899605751037598, - "logps/chosen": -230.6691131591797, - "logps/rejected": -196.5701141357422, - "loss": 7728.4891, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.1132405549287796, - "rewards/margins": 0.05098314210772514, - "rewards/rejected": -0.16422370076179504, - "rewards/safe_rewards": -0.10501746088266373, - "rewards/unsafe_rewards": -0.12146364152431488, + "logits/chosen": -2.469130516052246, + "logits/rejected": -2.2549142837524414, + "logps/chosen": -219.2992401123047, + "logps/rejected": -180.1728057861328, + "loss": 14.0865, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.04582630842924118, + "rewards/margins": 0.07086005806922913, + "rewards/rejected": -0.025033747777342796, + "rewards/safe_rewards": 0.05242709070444107, + "rewards/unsafe_rewards": 0.03922552615404129, "step": 140 }, { "epoch": 0.08, "learning_rate": 4.0322580645161285e-07, - "logits/chosen": -2.230912208557129, - "logits/rejected": -2.0194005966186523, - "logps/chosen": -217.6083526611328, - "logps/rejected": -185.86074829101562, - "loss": 7080.1617, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.1332172453403473, - "rewards/margins": 0.062234263867139816, - "rewards/rejected": -0.1954515129327774, - "rewards/safe_rewards": -0.13880565762519836, - "rewards/unsafe_rewards": -0.1276288479566574, + "logits/chosen": -2.4029898643493652, + "logits/rejected": -2.2180120944976807, + "logps/chosen": -205.2784881591797, + "logps/rejected": -167.4949951171875, + "loss": 1531.726, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -0.9918906092643738, + "rewards/margins": 0.18755348026752472, + "rewards/rejected": -1.1794440746307373, + "rewards/safe_rewards": -0.9263350367546082, + "rewards/unsafe_rewards": -1.0574461221694946, "step": 150 }, { "epoch": 0.09, "learning_rate": 4.3010752688172043e-07, - "logits/chosen": -2.15290904045105, - "logits/rejected": -1.8807637691497803, - "logps/chosen": -228.8234405517578, - "logps/rejected": -211.19802856445312, - "loss": 7557.0047, - "rewards/accuracies": 0.5406249761581421, - "rewards/chosen": -0.1998569667339325, - "rewards/margins": 0.051988668739795685, - "rewards/rejected": -0.2518456280231476, - "rewards/safe_rewards": -0.20650985836982727, - "rewards/unsafe_rewards": -0.19320407509803772, + "logits/chosen": -2.3345110416412354, + "logits/rejected": -2.1149539947509766, + "logps/chosen": -209.245849609375, + "logps/rejected": -186.2938995361328, + "loss": 76.4742, + "rewards/accuracies": 0.4468750059604645, + "rewards/chosen": -0.40810996294021606, + "rewards/margins": -0.12766215205192566, + "rewards/rejected": -0.280447781085968, + "rewards/safe_rewards": -0.5669787526130676, + "rewards/unsafe_rewards": -0.2492411583662033, "step": 160 }, { "epoch": 0.09, "learning_rate": 4.569892473118279e-07, - "logits/chosen": -2.163496494293213, - "logits/rejected": -1.8812412023544312, - "logps/chosen": -209.8403778076172, - "logps/rejected": -179.97496032714844, - "loss": 7154.7875, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.16811709105968475, - "rewards/margins": 0.05677107721567154, - "rewards/rejected": -0.2248881608247757, - "rewards/safe_rewards": -0.1681157797574997, - "rewards/unsafe_rewards": -0.16811838746070862, + "logits/chosen": -2.364396810531616, + "logits/rejected": -2.153006076812744, + "logps/chosen": -193.48985290527344, + "logps/rejected": -157.84793090820312, + "loss": 366.3049, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.4611927568912506, + "rewards/margins": -0.09943069517612457, + "rewards/rejected": -0.36176207661628723, + "rewards/safe_rewards": -0.3680972456932068, + "rewards/unsafe_rewards": -0.5542882680892944, "step": 170 }, { "epoch": 0.1, "learning_rate": 4.838709677419355e-07, - "logits/chosen": -2.079279899597168, - "logits/rejected": -1.7519344091415405, - "logps/chosen": -228.12905883789062, - "logps/rejected": -209.23184204101562, - "loss": 7375.8891, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.2770082652568817, - "rewards/margins": 0.06209835410118103, - "rewards/rejected": -0.33910661935806274, - "rewards/safe_rewards": -0.27290791273117065, - "rewards/unsafe_rewards": -0.2811085879802704, + "logits/chosen": -2.379281520843506, + "logits/rejected": -2.1527695655822754, + "logps/chosen": -201.44384765625, + "logps/rejected": -176.66293334960938, + "loss": 212.1672, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -1.0156188011169434, + "rewards/margins": 0.32616162300109863, + "rewards/rejected": -1.341780424118042, + "rewards/safe_rewards": -0.7996016144752502, + "rewards/unsafe_rewards": -1.2316361665725708, "step": 180 }, { "epoch": 0.1, "learning_rate": 4.999929391798331e-07, - "logits/chosen": -2.132680892944336, - "logits/rejected": -1.7472947835922241, - "logps/chosen": -237.369384765625, - "logps/rejected": -204.32150268554688, - "loss": 7349.0242, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.24653896689414978, - "rewards/margins": 0.07543239742517471, - "rewards/rejected": -0.3219713568687439, - "rewards/safe_rewards": -0.24422617256641388, - "rewards/unsafe_rewards": -0.24885177612304688, + "logits/chosen": -2.4363088607788086, + "logits/rejected": -2.1664328575134277, + "logps/chosen": -214.74972534179688, + "logps/rejected": -172.83267211914062, + "loss": 281.1637, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": -2.0342252254486084, + "rewards/margins": -1.3259267807006836, + "rewards/rejected": -0.7082984447479248, + "rewards/safe_rewards": -1.9764223098754883, + "rewards/unsafe_rewards": -2.0920281410217285, "step": 190 }, { "epoch": 0.11, "learning_rate": 4.9991350953333e-07, - "logits/chosen": -2.0583910942077637, - "logits/rejected": -1.7268041372299194, - "logps/chosen": -248.58944702148438, - "logps/rejected": -226.4416046142578, - "loss": 7112.2336, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": -0.38036662340164185, - "rewards/margins": 0.05376458168029785, - "rewards/rejected": -0.4341312348842621, - "rewards/safe_rewards": -0.3542497754096985, - "rewards/unsafe_rewards": -0.40648356080055237, + "logits/chosen": -2.399965763092041, + "logits/rejected": -2.1533687114715576, + "logps/chosen": -211.14138793945312, + "logps/rejected": -183.2847442626953, + "loss": 37.4693, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.5885945558547974, + "rewards/margins": -0.33234477043151855, + "rewards/rejected": -0.2562498152256012, + "rewards/safe_rewards": 0.10301212966442108, + "rewards/unsafe_rewards": -1.2802014350891113, "step": 200 }, { "epoch": 0.11, "learning_rate": 4.997458523498236e-07, - "logits/chosen": -2.1265130043029785, - "logits/rejected": -1.8143631219863892, - "logps/chosen": -225.8239288330078, - "logps/rejected": -198.24595642089844, - "loss": 7020.6945, - "rewards/accuracies": 0.559374988079071, - "rewards/chosen": -0.32611721754074097, - "rewards/margins": 0.04634212702512741, - "rewards/rejected": -0.372459352016449, - "rewards/safe_rewards": -0.33249932527542114, - "rewards/unsafe_rewards": -0.3197351396083832, + "logits/chosen": -2.4136548042297363, + "logits/rejected": -2.1710681915283203, + "logps/chosen": -192.46209716796875, + "logps/rejected": -160.3273468017578, + "loss": 19.4933, + "rewards/accuracies": 0.46562498807907104, + "rewards/chosen": 0.7500754594802856, + "rewards/margins": 0.07740475982427597, + "rewards/rejected": 0.6726706624031067, + "rewards/safe_rewards": 0.8147931098937988, + "rewards/unsafe_rewards": 0.6853577494621277, "step": 210 }, { "epoch": 0.12, "learning_rate": 4.99490026817712e-07, - "logits/chosen": -2.1002330780029297, - "logits/rejected": -1.7667055130004883, - "logps/chosen": -240.4268798828125, - "logps/rejected": -214.17312622070312, - "loss": 6918.2156, - "rewards/accuracies": 0.565625011920929, - "rewards/chosen": -0.3299749791622162, - "rewards/margins": 0.07286922633647919, - "rewards/rejected": -0.40284425020217896, - "rewards/safe_rewards": -0.33103418350219727, - "rewards/unsafe_rewards": -0.3289158344268799, + "logits/chosen": -2.3793249130249023, + "logits/rejected": -2.126897096633911, + "logps/chosen": -206.8174591064453, + "logps/rejected": -174.28512573242188, + "loss": 618.2743, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": 0.6119144558906555, + "rewards/margins": 1.0083643198013306, + "rewards/rejected": -0.3964497447013855, + "rewards/safe_rewards": 0.269029825925827, + "rewards/unsafe_rewards": 0.9547992944717407, "step": 220 }, { "epoch": 0.12, "learning_rate": 4.991461232516674e-07, - "logits/chosen": -2.132354259490967, - "logits/rejected": -1.7921921014785767, - "logps/chosen": -249.4021453857422, - "logps/rejected": -229.88247680664062, - "loss": 7642.9477, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.3167831003665924, - "rewards/margins": 0.08834515511989594, - "rewards/rejected": -0.4051283001899719, - "rewards/safe_rewards": -0.3021748661994934, - "rewards/unsafe_rewards": -0.3313913643360138, + "logits/chosen": -2.278285503387451, + "logits/rejected": -2.0165598392486572, + "logps/chosen": -220.05496215820312, + "logps/rejected": -191.4230499267578, + "loss": 117.4644, + "rewards/accuracies": 0.44062501192092896, + "rewards/chosen": -2.331136465072632, + "rewards/margins": -0.27771270275115967, + "rewards/rejected": -2.053424119949341, + "rewards/safe_rewards": -1.6258525848388672, + "rewards/unsafe_rewards": -3.0364208221435547, "step": 230 }, { "epoch": 0.13, "learning_rate": 4.98714263060751e-07, - "logits/chosen": -2.251471519470215, - "logits/rejected": -1.886639952659607, - "logps/chosen": -207.30184936523438, - "logps/rejected": -179.5860595703125, - "loss": 7244.568, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.19368217885494232, - "rewards/margins": 0.04573160782456398, - "rewards/rejected": -0.239413782954216, - "rewards/safe_rewards": -0.2037370651960373, - "rewards/unsafe_rewards": -0.18362729251384735, + "logits/chosen": -2.2665092945098877, + "logits/rejected": -1.9782488346099854, + "logps/chosen": -189.6136016845703, + "logps/rejected": -156.85269165039062, + "loss": 123.5274, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -1.6799499988555908, + "rewards/margins": -0.4719271659851074, + "rewards/rejected": -1.2080228328704834, + "rewards/safe_rewards": -1.867531418800354, + "rewards/unsafe_rewards": -1.4923683404922485, "step": 240 }, { "epoch": 0.13, "learning_rate": 4.98194598705552e-07, - "logits/chosen": -2.288367748260498, - "logits/rejected": -2.053013563156128, - "logps/chosen": -224.88290405273438, - "logps/rejected": -203.2853240966797, - "loss": 7454.4141, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": -0.21541249752044678, - "rewards/margins": 0.06779678165912628, - "rewards/rejected": -0.28320926427841187, - "rewards/safe_rewards": -0.22131112217903137, - "rewards/unsafe_rewards": -0.20951387286186218, + "logits/chosen": -2.2388875484466553, + "logits/rejected": -2.0419199466705322, + "logps/chosen": -203.91488647460938, + "logps/rejected": -175.87570190429688, + "loss": 29.462, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": -0.5732041597366333, + "rewards/margins": 0.3381038308143616, + "rewards/rejected": -0.9113079905509949, + "rewards/safe_rewards": -0.5594094395637512, + "rewards/unsafe_rewards": -0.5869989395141602, "step": 250 }, { "epoch": 0.14, "learning_rate": 4.975873136443648e-07, - "logits/chosen": -2.33178973197937, - "logits/rejected": -2.110666036605835, - "logps/chosen": -250.9014434814453, - "logps/rejected": -225.95675659179688, - "loss": 6933.1133, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.3147348463535309, - "rewards/margins": 0.06465913355350494, - "rewards/rejected": -0.379393994808197, - "rewards/safe_rewards": -0.3217589557170868, - "rewards/unsafe_rewards": -0.307710736989975, + "logits/chosen": -2.323503017425537, + "logits/rejected": -2.1084866523742676, + "logps/chosen": -219.4092254638672, + "logps/rejected": -188.0467071533203, + "loss": 514.7106, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": 0.01872560940682888, + "rewards/margins": 0.048060666769742966, + "rewards/rejected": -0.029335061088204384, + "rewards/safe_rewards": -0.101626917719841, + "rewards/unsafe_rewards": 0.13907812535762787, "step": 260 }, { "epoch": 0.15, "learning_rate": 4.968926222684212e-07, - "logits/chosen": -2.2538371086120605, - "logits/rejected": -2.0441083908081055, - "logps/chosen": -231.07321166992188, - "logps/rejected": -215.10226440429688, - "loss": 7070.5437, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.347059428691864, - "rewards/margins": 0.0650092363357544, - "rewards/rejected": -0.4120686650276184, - "rewards/safe_rewards": -0.3425368070602417, - "rewards/unsafe_rewards": -0.3515821099281311, + "logits/chosen": -2.3192670345306396, + "logits/rejected": -2.128873586654663, + "logps/chosen": -195.8466796875, + "logps/rejected": -173.4759063720703, + "loss": 62.0019, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": 0.5205889940261841, + "rewards/margins": 0.10105878114700317, + "rewards/rejected": 0.4195302128791809, + "rewards/safe_rewards": 0.4973847270011902, + "rewards/unsafe_rewards": 0.5437930822372437, "step": 270 }, { "epoch": 0.15, "learning_rate": 4.961107698262044e-07, - "logits/chosen": -2.240346670150757, - "logits/rejected": -1.963796615600586, - "logps/chosen": -245.887939453125, - "logps/rejected": -214.9673309326172, - "loss": 7118.2719, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.3487110733985901, - "rewards/margins": 0.047723181545734406, - "rewards/rejected": -0.3964342474937439, - "rewards/safe_rewards": -0.36569300293922424, - "rewards/unsafe_rewards": -0.33172911405563354, + "logits/chosen": -2.3513216972351074, + "logits/rejected": -2.1132161617279053, + "logps/chosen": -209.58480834960938, + "logps/rejected": -173.8505096435547, + "loss": 19.9099, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": 1.4319963455200195, + "rewards/margins": -0.04141209274530411, + "rewards/rejected": 1.4734083414077759, + "rewards/safe_rewards": 0.7625109553337097, + "rewards/unsafe_rewards": 2.1014816761016846, "step": 280 }, { "epoch": 0.16, "learning_rate": 4.952420323368673e-07, - "logits/chosen": -2.258303642272949, - "logits/rejected": -1.9825427532196045, - "logps/chosen": -243.5653839111328, - "logps/rejected": -221.63369750976562, - "loss": 7056.0906, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.3961013853549957, - "rewards/margins": 0.0807078555226326, - "rewards/rejected": -0.4768092632293701, - "rewards/safe_rewards": -0.39416471123695374, - "rewards/unsafe_rewards": -0.3980380594730377, + "logits/chosen": -2.327949047088623, + "logits/rejected": -2.081421136856079, + "logps/chosen": -202.83131408691406, + "logps/rejected": -173.12339782714844, + "loss": 166.1931, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 1.1239261627197266, + "rewards/margins": 0.29456058144569397, + "rewards/rejected": 0.8293657302856445, + "rewards/safe_rewards": 0.95171719789505, + "rewards/unsafe_rewards": 1.2961351871490479, "step": 290 }, { "epoch": 0.16, "learning_rate": 4.942867164927899e-07, - "logits/chosen": -2.2862093448638916, - "logits/rejected": -2.087953805923462, - "logps/chosen": -232.6704864501953, - "logps/rejected": -214.80374145507812, - "loss": 7359.575, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.3128170967102051, - "rewards/margins": 0.09015890210866928, - "rewards/rejected": -0.40297597646713257, - "rewards/safe_rewards": -0.2959223985671997, - "rewards/unsafe_rewards": -0.3297117352485657, + "logits/chosen": -2.3304100036621094, + "logits/rejected": -2.148871898651123, + "logps/chosen": -200.2861785888672, + "logps/rejected": -173.5687713623047, + "loss": 83.8678, + "rewards/accuracies": 0.546875, + "rewards/chosen": 1.1026077270507812, + "rewards/margins": 0.16524335741996765, + "rewards/rejected": 0.9373642206192017, + "rewards/safe_rewards": 1.20353102684021, + "rewards/unsafe_rewards": 1.001684308052063, "step": 300 }, { "epoch": 0.17, "learning_rate": 4.932451595513062e-07, - "logits/chosen": -2.278350591659546, - "logits/rejected": -1.9919618368148804, - "logps/chosen": -261.5721740722656, - "logps/rejected": -237.3791961669922, - "loss": 6861.6203, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.3782269060611725, - "rewards/margins": 0.09306754171848297, - "rewards/rejected": -0.4712944030761719, - "rewards/safe_rewards": -0.37359195947647095, - "rewards/unsafe_rewards": -0.38286182284355164, + "logits/chosen": -2.3603804111480713, + "logits/rejected": -2.1054179668426514, + "logps/chosen": -222.5138702392578, + "logps/rejected": -189.41696166992188, + "loss": 125.375, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 1.2356212139129639, + "rewards/margins": 0.4028751254081726, + "rewards/rejected": 0.8327462077140808, + "rewards/safe_rewards": 1.2047992944717407, + "rewards/unsafe_rewards": 1.2664434909820557, "step": 310 }, { "epoch": 0.17, "learning_rate": 4.921177292156419e-07, - "logits/chosen": -2.2868103981018066, - "logits/rejected": -1.9530357122421265, - "logps/chosen": -238.9241485595703, - "logps/rejected": -221.6173095703125, - "loss": 6779.8594, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4034157395362854, - "rewards/margins": 0.0715942531824112, - "rewards/rejected": -0.4750100076198578, - "rewards/safe_rewards": -0.39607614278793335, - "rewards/unsafe_rewards": -0.41075533628463745, + "logits/chosen": -2.4207069873809814, + "logits/rejected": -2.131692409515381, + "logps/chosen": -197.57579040527344, + "logps/rejected": -173.03189086914062, + "loss": 32.4693, + "rewards/accuracies": 0.4375, + "rewards/chosen": 1.0067864656448364, + "rewards/margins": -0.07762779295444489, + "rewards/rejected": 1.0844142436981201, + "rewards/safe_rewards": 0.9899358749389648, + "rewards/unsafe_rewards": 1.0236369371414185, "step": 320 }, { "epoch": 0.18, "learning_rate": 4.909048235051033e-07, - "logits/chosen": -2.248591184616089, - "logits/rejected": -2.041801929473877, - "logps/chosen": -233.26199340820312, - "logps/rejected": -221.5611572265625, - "loss": 6971.7297, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.30375435948371887, - "rewards/margins": 0.10259900242090225, - "rewards/rejected": -0.40635329484939575, - "rewards/safe_rewards": -0.3013189435005188, - "rewards/unsafe_rewards": -0.30618971586227417, + "logits/chosen": -2.3886237144470215, + "logits/rejected": -2.2095794677734375, + "logps/chosen": -201.99131774902344, + "logps/rejected": -180.18301391601562, + "loss": 165.1989, + "rewards/accuracies": 0.546875, + "rewards/chosen": 0.8952449560165405, + "rewards/margins": 0.1524442732334137, + "rewards/rejected": 0.7428006529808044, + "rewards/safe_rewards": 0.9638195037841797, + "rewards/unsafe_rewards": 0.8266702890396118, "step": 330 }, { "epoch": 0.18, "learning_rate": 4.896068706145631e-07, - "logits/chosen": -2.239377975463867, - "logits/rejected": -1.9611393213272095, - "logps/chosen": -252.949951171875, - "logps/rejected": -212.8416290283203, - "loss": 7127.9594, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.43277350068092346, - "rewards/margins": 0.07759107649326324, - "rewards/rejected": -0.5103645920753479, - "rewards/safe_rewards": -0.42603859305381775, - "rewards/unsafe_rewards": -0.4395083785057068, + "logits/chosen": -2.4264276027679443, + "logits/rejected": -2.1699893474578857, + "logps/chosen": -209.13687133789062, + "logps/rejected": -161.4777374267578, + "loss": 63.6332, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.5357077717781067, + "rewards/margins": 0.20826852321624756, + "rewards/rejected": 0.32743921875953674, + "rewards/safe_rewards": 0.6318890452384949, + "rewards/unsafe_rewards": 0.4395265579223633, "step": 340 }, { "epoch": 0.19, "learning_rate": 4.882243287632946e-07, - "logits/chosen": -2.258354425430298, - "logits/rejected": -1.9823997020721436, - "logps/chosen": -240.0575714111328, - "logps/rejected": -224.93618774414062, - "loss": 6857.2828, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.4949573576450348, - "rewards/margins": 0.07967236638069153, - "rewards/rejected": -0.5746296644210815, - "rewards/safe_rewards": -0.4892745912075043, - "rewards/unsafe_rewards": -0.5006400346755981, + "logits/chosen": -2.4155266284942627, + "logits/rejected": -2.1885287761688232, + "logps/chosen": -190.31680297851562, + "logps/rejected": -167.34011840820312, + "loss": 22.5493, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.2450559437274933, + "rewards/margins": 0.11199624836444855, + "rewards/rejected": 0.13305969536304474, + "rewards/safe_rewards": 0.32091599702835083, + "rewards/unsafe_rewards": 0.16919586062431335, "step": 350 }, { "epoch": 0.19, "learning_rate": 4.867576860332048e-07, - "logits/chosen": -2.325746774673462, - "logits/rejected": -2.0531835556030273, - "logps/chosen": -223.86062622070312, - "logps/rejected": -206.43603515625, - "loss": 6847.6039, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.4088946282863617, - "rewards/margins": 0.08093228191137314, - "rewards/rejected": -0.48982691764831543, - "rewards/safe_rewards": -0.41694265604019165, - "rewards/unsafe_rewards": -0.4008466303348541, + "logits/chosen": -2.4087131023406982, + "logits/rejected": -2.1696860790252686, + "logps/chosen": -182.63320922851562, + "logps/rejected": -157.3323974609375, + "loss": 39.9616, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.33797144889831543, + "rewards/margins": 0.2170281708240509, + "rewards/rejected": 0.12094320356845856, + "rewards/safe_rewards": 0.7084277868270874, + "rewards/unsafe_rewards": -0.0324850007891655, "step": 360 }, { "epoch": 0.2, "learning_rate": 4.85207460196526e-07, - "logits/chosen": -2.2898755073547363, - "logits/rejected": -2.0400357246398926, - "logps/chosen": -238.4681396484375, - "logps/rejected": -224.0597686767578, - "loss": 6861.7656, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.373796671628952, - "rewards/margins": 0.06371524930000305, - "rewards/rejected": -0.4375119209289551, - "rewards/safe_rewards": -0.3857375979423523, - "rewards/unsafe_rewards": -0.361855685710907, + "logits/chosen": -2.3588593006134033, + "logits/rejected": -2.1359121799468994, + "logps/chosen": -201.29721069335938, + "logps/rejected": -180.4462432861328, + "loss": 18.4967, + "rewards/accuracies": 0.421875, + "rewards/chosen": -0.20872633159160614, + "rewards/margins": -0.07106774300336838, + "rewards/rejected": -0.13765858113765717, + "rewards/safe_rewards": -0.24322757124900818, + "rewards/unsafe_rewards": -0.1742250919342041, "step": 370 }, { "epoch": 0.2, "learning_rate": 4.835741985330259e-07, - "logits/chosen": -2.275742292404175, - "logits/rejected": -2.050511121749878, - "logps/chosen": -232.7757110595703, - "logps/rejected": -209.2894287109375, - "loss": 6446.7797, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.3619254231452942, - "rewards/margins": 0.08308672159910202, - "rewards/rejected": -0.4450121521949768, - "rewards/safe_rewards": -0.3546380400657654, - "rewards/unsafe_rewards": -0.3692127764225006, + "logits/chosen": -2.393688678741455, + "logits/rejected": -2.1949095726013184, + "logps/chosen": -196.72280883789062, + "logps/rejected": -164.93276977539062, + "loss": 13.0753, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": -0.13964949548244476, + "rewards/margins": 0.004895883612334728, + "rewards/rejected": -0.14454536139965057, + "rewards/safe_rewards": -0.14425238966941833, + "rewards/unsafe_rewards": -0.1350466012954712, "step": 380 }, { "epoch": 0.21, "learning_rate": 4.818584776367992e-07, - "logits/chosen": -2.1700243949890137, - "logits/rejected": -1.9761574268341064, - "logps/chosen": -248.4570770263672, - "logps/rejected": -236.026611328125, - "loss": 6785.3953, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": -0.4019840657711029, - "rewards/margins": 0.09303420037031174, - "rewards/rejected": -0.49501824378967285, - "rewards/safe_rewards": -0.41988474130630493, - "rewards/unsafe_rewards": -0.38408344984054565, + "logits/chosen": -2.348188638687134, + "logits/rejected": -2.183293342590332, + "logps/chosen": -207.3245086669922, + "logps/rejected": -185.33078002929688, + "loss": 405.7585, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": 0.9341610074043274, + "rewards/margins": -0.25985628366470337, + "rewards/rejected": 1.1940172910690308, + "rewards/safe_rewards": 1.5165033340454102, + "rewards/unsafe_rewards": 0.3518185615539551, "step": 390 }, { "epoch": 0.22, "learning_rate": 4.800609032127122e-07, - "logits/chosen": -2.213772773742676, - "logits/rejected": -1.9254707098007202, - "logps/chosen": -244.1571044921875, - "logps/rejected": -219.7991943359375, - "loss": 6917.5641, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.3825494050979614, - "rewards/margins": 0.06950954347848892, - "rewards/rejected": -0.45205894112586975, - "rewards/safe_rewards": -0.3747442066669464, - "rewards/unsafe_rewards": -0.39035457372665405, + "logits/chosen": -2.362936496734619, + "logits/rejected": -2.117405652999878, + "logps/chosen": -205.0863037109375, + "logps/rejected": -173.82562255859375, + "loss": 250.8796, + "rewards/accuracies": 0.46562498807907104, + "rewards/chosen": 0.8158755302429199, + "rewards/margins": 0.04819601774215698, + "rewards/rejected": 0.7676795721054077, + "rewards/safe_rewards": 0.8238789439201355, + "rewards/unsafe_rewards": 0.8078721761703491, "step": 400 }, { "epoch": 0.22, "learning_rate": 4.78182109862569e-07, - "logits/chosen": -2.149578809738159, - "logits/rejected": -1.9344444274902344, - "logps/chosen": -230.39987182617188, - "logps/rejected": -214.0001678466797, - "loss": 7505.7875, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.3610875904560089, - "rewards/margins": 0.06836280971765518, - "rewards/rejected": -0.4294503629207611, - "rewards/safe_rewards": -0.3561205565929413, - "rewards/unsafe_rewards": -0.36605459451675415, + "logits/chosen": -2.334447145462036, + "logits/rejected": -2.1603846549987793, + "logps/chosen": -193.15878295898438, + "logps/rejected": -169.64031982421875, + "loss": 43.271, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 1.1323336362838745, + "rewards/margins": -0.2824760377407074, + "rewards/rejected": 1.4148097038269043, + "rewards/safe_rewards": 1.1305078268051147, + "rewards/unsafe_rewards": 1.1341596841812134, "step": 410 }, { "epoch": 0.23, "learning_rate": 4.7622276086107677e-07, - "logits/chosen": -2.2380170822143555, - "logits/rejected": -1.9596973657608032, - "logps/chosen": -260.88958740234375, - "logps/rejected": -231.9978485107422, - "loss": 7023.0203, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.3748038709163666, - "rewards/margins": 0.09135393798351288, - "rewards/rejected": -0.46615785360336304, - "rewards/safe_rewards": -0.37403541803359985, - "rewards/unsafe_rewards": -0.3755723834037781, + "logits/chosen": -2.4567148685455322, + "logits/rejected": -2.2268338203430176, + "logps/chosen": -221.8797149658203, + "logps/rejected": -183.58682250976562, + "loss": 170.0915, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 1.5294605493545532, + "rewards/margins": -0.26579660177230835, + "rewards/rejected": 1.7952572107315063, + "rewards/safe_rewards": 1.6476377248764038, + "rewards/unsafe_rewards": 1.4112837314605713, "step": 420 }, { "epoch": 0.23, "learning_rate": 4.741835479216879e-07, - "logits/chosen": -2.1961891651153564, - "logits/rejected": -1.9379104375839233, - "logps/chosen": -265.2579650878906, - "logps/rejected": -250.7912139892578, - "loss": 6681.5836, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.3908992409706116, - "rewards/margins": 0.08248193562030792, - "rewards/rejected": -0.4733811914920807, - "rewards/safe_rewards": -0.40423211455345154, - "rewards/unsafe_rewards": -0.3775663375854492, + "logits/chosen": -2.4018983840942383, + "logits/rejected": -2.1745998859405518, + "logps/chosen": -224.1997833251953, + "logps/rejected": -202.8693084716797, + "loss": 318.6482, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 1.9682689905166626, + "rewards/margins": 1.3845123052597046, + "rewards/rejected": 0.5837565660476685, + "rewards/safe_rewards": 1.9361345767974854, + "rewards/unsafe_rewards": 2.0004029273986816, "step": 430 }, { "epoch": 0.24, "learning_rate": 4.720651909524036e-07, - "logits/chosen": -2.185619831085205, - "logits/rejected": -1.9523022174835205, - "logps/chosen": -230.352294921875, - "logps/rejected": -210.71206665039062, - "loss": 6920.7953, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.30957138538360596, - "rewards/margins": 0.0758373960852623, - "rewards/rejected": -0.38540878891944885, - "rewards/safe_rewards": -0.3266102075576782, - "rewards/unsafe_rewards": -0.2925325632095337, + "logits/chosen": -2.368582248687744, + "logits/rejected": -2.1598029136657715, + "logps/chosen": -199.04641723632812, + "logps/rejected": -171.59878540039062, + "loss": 20.6844, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.34873148798942566, + "rewards/margins": -0.22367699444293976, + "rewards/rejected": 0.5724084973335266, + "rewards/safe_rewards": 0.4507713317871094, + "rewards/unsafe_rewards": 0.24669162929058075, "step": 440 }, { "epoch": 0.24, "learning_rate": 4.698684378016222e-07, - "logits/chosen": -2.2177813053131104, - "logits/rejected": -1.9565246105194092, - "logps/chosen": -239.7762451171875, - "logps/rejected": -208.4960479736328, - "loss": 6838.5781, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.3329833149909973, - "rewards/margins": 0.08524159342050552, - "rewards/rejected": -0.41822490096092224, - "rewards/safe_rewards": -0.32857412099838257, - "rewards/unsafe_rewards": -0.3373924791812897, + "logits/chosen": -2.4238266944885254, + "logits/rejected": -2.1877074241638184, + "logps/chosen": -206.9587860107422, + "logps/rejected": -166.5978546142578, + "loss": 36.0619, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.48088520765304565, + "rewards/margins": -0.556584894657135, + "rewards/rejected": 0.07569964975118637, + "rewards/safe_rewards": -0.8808043599128723, + "rewards/unsafe_rewards": -0.08096615970134735, "step": 450 }, { "epoch": 0.25, "learning_rate": 4.675940639941256e-07, - "logits/chosen": -2.1189966201782227, - "logits/rejected": -1.9002841711044312, - "logps/chosen": -242.6001739501953, - "logps/rejected": -224.87594604492188, - "loss": 6790.9219, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.393105685710907, - "rewards/margins": 0.07088775932788849, - "rewards/rejected": -0.46399348974227905, - "rewards/safe_rewards": -0.3847719132900238, - "rewards/unsafe_rewards": -0.40143948793411255, + "logits/chosen": -2.381782054901123, + "logits/rejected": -2.2072319984436035, + "logps/chosen": -202.72836303710938, + "logps/rejected": -178.13565063476562, + "loss": 19.0221, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.561229407787323, + "rewards/margins": 0.22027714550495148, + "rewards/rejected": 0.3409522473812103, + "rewards/safe_rewards": 0.481137752532959, + "rewards/unsafe_rewards": 0.641321063041687, "step": 460 }, { "epoch": 0.25, "learning_rate": 4.6524287245729286e-07, - "logits/chosen": -2.0616540908813477, - "logits/rejected": -1.8031222820281982, - "logps/chosen": -235.3325958251953, - "logps/rejected": -215.1667022705078, - "loss": 6658.0063, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.37046387791633606, - "rewards/margins": 0.11769552528858185, - "rewards/rejected": -0.4881593585014343, - "rewards/safe_rewards": -0.37195223569869995, - "rewards/unsafe_rewards": -0.3689754605293274, + "logits/chosen": -2.3484253883361816, + "logits/rejected": -2.134340524673462, + "logps/chosen": -198.06240844726562, + "logps/rejected": -166.09368896484375, + "loss": 26.6374, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.22380805015563965, + "rewards/margins": -0.03325925022363663, + "rewards/rejected": 0.2570672631263733, + "rewards/safe_rewards": 0.1073969230055809, + "rewards/unsafe_rewards": 0.3402191996574402, "step": 470 }, { "epoch": 0.26, "learning_rate": 4.628156932376418e-07, - "logits/chosen": -2.127551555633545, - "logits/rejected": -1.8270717859268188, - "logps/chosen": -238.7838592529297, - "logps/rejected": -211.7913360595703, - "loss": 6571.0922, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.358383446931839, - "rewards/margins": 0.10168886184692383, - "rewards/rejected": -0.4600723385810852, - "rewards/safe_rewards": -0.3699692189693451, - "rewards/unsafe_rewards": -0.3467976748943329, + "logits/chosen": -2.3849387168884277, + "logits/rejected": -2.1502578258514404, + "logps/chosen": -202.72006225585938, + "logps/rejected": -165.7488555908203, + "loss": 163.8104, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": 0.22546739876270294, + "rewards/margins": 0.1902145892381668, + "rewards/rejected": 0.03525285795331001, + "rewards/safe_rewards": -0.18428334593772888, + "rewards/unsafe_rewards": 0.6352182030677795, "step": 480 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-07, - "logits/chosen": -2.1131837368011475, - "logits/rejected": -1.8676449060440063, - "logps/chosen": -252.4139404296875, - "logps/rejected": -249.065185546875, - "loss": 6808.1812, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.42430177330970764, - "rewards/margins": 0.06666339188814163, - "rewards/rejected": -0.49096518754959106, - "rewards/safe_rewards": -0.4318160116672516, - "rewards/unsafe_rewards": -0.4167875647544861, + "logits/chosen": -2.3536932468414307, + "logits/rejected": -2.1680946350097656, + "logps/chosen": -209.31454467773438, + "logps/rejected": -199.3543701171875, + "loss": 75.0922, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.6691935658454895, + "rewards/margins": 0.05490832403302193, + "rewards/rejected": 0.6142852902412415, + "rewards/safe_rewards": 0.21514494717121124, + "rewards/unsafe_rewards": 1.1232421398162842, "step": 490 }, { "epoch": 0.27, "learning_rate": 4.5773682576397776e-07, - "logits/chosen": -2.147376537322998, - "logits/rejected": -1.9110462665557861, - "logps/chosen": -240.01022338867188, - "logps/rejected": -216.8660125732422, - "loss": 6785.8539, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.38424068689346313, - "rewards/margins": 0.07964359223842621, - "rewards/rejected": -0.46388429403305054, - "rewards/safe_rewards": -0.376841276884079, - "rewards/unsafe_rewards": -0.39164015650749207, + "logits/chosen": -2.360821008682251, + "logits/rejected": -2.1603407859802246, + "logps/chosen": -201.4778594970703, + "logps/rejected": -169.81484985351562, + "loss": 131.6857, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.10828091204166412, + "rewards/margins": -0.5544548034667969, + "rewards/rejected": 0.662735641002655, + "rewards/safe_rewards": -0.3237845301628113, + "rewards/unsafe_rewards": 0.5403462648391724, "step": 500 }, { "epoch": 0.27, - "eval_logits/chosen": -1.7600023746490479, - "eval_logits/rejected": -1.4655544757843018, - "eval_logps/chosen": -195.498779296875, - "eval_logps/rejected": -155.7913818359375, - "eval_loss": 3119.06689453125, - "eval_rewards/accuracies": 0.47652468085289, - "eval_rewards/chosen": -0.6463291049003601, - "eval_rewards/margins": -0.011934175156056881, - "eval_rewards/rejected": -0.6343949437141418, - "eval_rewards/safe_rewards": -0.6474616527557373, - "eval_rewards/unsafe_rewards": -0.6439096927642822, - "eval_runtime": 1793.7735, - "eval_samples_per_second": 18.422, - "eval_steps_per_second": 1.152, + "eval_logits/chosen": -2.056485414505005, + "eval_logits/rejected": -1.803229808807373, + "eval_logps/chosen": -130.9681396484375, + "eval_logps/rejected": -92.36480712890625, + "eval_loss": 0.8894476294517517, + "eval_rewards/accuracies": 0.45462244749069214, + "eval_rewards/chosen": -0.10225697606801987, + "eval_rewards/margins": -0.08933582156896591, + "eval_rewards/rejected": -0.012921147979795933, + "eval_rewards/safe_rewards": -0.10428992658853531, + "eval_rewards/unsafe_rewards": -0.10168781876564026, + "eval_runtime": 2237.5747, + "eval_samples_per_second": 14.768, + "eval_steps_per_second": 0.923, "step": 500 }, { "epoch": 0.27, "learning_rate": 4.5508693051414774e-07, - "logits/chosen": -2.199183225631714, - "logits/rejected": -1.9916718006134033, - "logps/chosen": -239.27005004882812, - "logps/rejected": -231.099853515625, - "loss": 6512.4289, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4149134159088135, - "rewards/margins": 0.10183224827051163, - "rewards/rejected": -0.5167456865310669, - "rewards/safe_rewards": -0.4191102981567383, - "rewards/unsafe_rewards": -0.4107164740562439, + "logits/chosen": -2.3876683712005615, + "logits/rejected": -2.2101075649261475, + "logps/chosen": -197.6197509765625, + "logps/rejected": -179.2535858154297, + "loss": 10.2996, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.15894845128059387, + "rewards/margins": -0.012749219313263893, + "rewards/rejected": 0.17169766128063202, + "rewards/safe_rewards": 0.22355195879936218, + "rewards/unsafe_rewards": 0.09434493631124496, "step": 510 }, { "epoch": 0.28, "learning_rate": 4.52364632956877e-07, - "logits/chosen": -2.1832046508789062, - "logits/rejected": -1.9507207870483398, - "logps/chosen": -252.33029174804688, - "logps/rejected": -222.3158416748047, - "loss": 7180.6562, - "rewards/accuracies": 0.6468750238418579, - "rewards/chosen": -0.42580121755599976, - "rewards/margins": 0.09123551845550537, - "rewards/rejected": -0.5170367956161499, - "rewards/safe_rewards": -0.4184451103210449, - "rewards/unsafe_rewards": -0.43315738439559937, + "logits/chosen": -2.3795700073242188, + "logits/rejected": -2.167722225189209, + "logps/chosen": -209.80477905273438, + "logps/rejected": -170.1284942626953, + "loss": 102.0474, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.054634951055049896, + "rewards/margins": -0.5382941961288452, + "rewards/rejected": 0.48365920782089233, + "rewards/safe_rewards": 0.07545175403356552, + "rewards/unsafe_rewards": -0.1847216635942459, "step": 520 }, { "epoch": 0.29, "learning_rate": 4.4957089415108895e-07, - "logits/chosen": -2.1672799587249756, - "logits/rejected": -1.9240920543670654, - "logps/chosen": -227.26931762695312, - "logps/rejected": -213.41995239257812, - "loss": 6686.6555, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.38683217763900757, - "rewards/margins": 0.09426978975534439, - "rewards/rejected": -0.481101930141449, - "rewards/safe_rewards": -0.37474173307418823, - "rewards/unsafe_rewards": -0.3989226520061493, + "logits/chosen": -2.3528215885162354, + "logits/rejected": -2.1418814659118652, + "logps/chosen": -187.97207641601562, + "logps/rejected": -165.0076446533203, + "loss": 120.2137, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.6140010952949524, + "rewards/margins": 0.31187087297439575, + "rewards/rejected": 0.30213022232055664, + "rewards/safe_rewards": 0.45020800828933716, + "rewards/unsafe_rewards": 0.7777942419052124, "step": 530 }, { "epoch": 0.29, "learning_rate": 4.467067003767745e-07, - "logits/chosen": -2.2515788078308105, - "logits/rejected": -1.981323003768921, - "logps/chosen": -250.74972534179688, - "logps/rejected": -223.9951629638672, - "loss": 6660.8516, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.35213276743888855, - "rewards/margins": 0.10238126665353775, - "rewards/rejected": -0.4545140266418457, - "rewards/safe_rewards": -0.35352325439453125, - "rewards/unsafe_rewards": -0.35074225068092346, + "logits/chosen": -2.441636800765991, + "logits/rejected": -2.219637870788574, + "logps/chosen": -215.01718139648438, + "logps/rejected": -178.1621856689453, + "loss": 31.3751, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.5192718505859375, + "rewards/margins": 0.13771791756153107, + "rewards/rejected": 0.3815539479255676, + "rewards/safe_rewards": 0.3229585587978363, + "rewards/unsafe_rewards": 0.7155852317810059, "step": 540 }, { "epoch": 0.3, "learning_rate": 4.437730627868027e-07, - "logits/chosen": -2.167005777359009, - "logits/rejected": -1.8684250116348267, - "logps/chosen": -221.5258026123047, - "logps/rejected": -211.6287384033203, - "loss": 6479.7195, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.4035661816596985, - "rewards/margins": 0.10153625160455704, - "rewards/rejected": -0.5051023960113525, - "rewards/safe_rewards": -0.40968450903892517, - "rewards/unsafe_rewards": -0.39744776487350464, + "logits/chosen": -2.378955602645874, + "logits/rejected": -2.138523578643799, + "logps/chosen": -181.02993774414062, + "logps/rejected": -161.35678100585938, + "loss": 48.7052, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.13924112915992737, + "rewards/margins": 0.37750715017318726, + "rewards/rejected": -0.23826603591442108, + "rewards/safe_rewards": 0.4204103946685791, + "rewards/unsafe_rewards": -0.14192816615104675, "step": 550 }, { "epoch": 0.3, "learning_rate": 4.4077101704995163e-07, - "logits/chosen": -2.207162380218506, - "logits/rejected": -1.9421132802963257, - "logps/chosen": -243.27017211914062, - "logps/rejected": -237.16085815429688, - "loss": 6695.0641, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.3939805030822754, - "rewards/margins": 0.09431411325931549, - "rewards/rejected": -0.4882946014404297, - "rewards/safe_rewards": -0.3932721018791199, - "rewards/unsafe_rewards": -0.3946888744831085, + "logits/chosen": -2.4157960414886475, + "logits/rejected": -2.1959304809570312, + "logps/chosen": -204.2389373779297, + "logps/rejected": -188.56707763671875, + "loss": 23.5436, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.36682039499282837, + "rewards/margins": -0.1311464011669159, + "rewards/rejected": -0.23567399382591248, + "rewards/safe_rewards": -0.4130277633666992, + "rewards/unsafe_rewards": -0.3206130862236023, "step": 560 }, { "epoch": 0.31, "learning_rate": 4.3770162298528356e-07, - "logits/chosen": -2.230370283126831, - "logits/rejected": -2.010568857192993, - "logps/chosen": -239.39120483398438, - "logps/rejected": -215.85934448242188, - "loss": 6883.2531, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.37461167573928833, - "rewards/margins": 0.09249675273895264, - "rewards/rejected": -0.4671083986759186, - "rewards/safe_rewards": -0.37565773725509644, - "rewards/unsafe_rewards": -0.3735656142234802, + "logits/chosen": -2.4378573894500732, + "logits/rejected": -2.243499994277954, + "logps/chosen": -201.71572875976562, + "logps/rejected": -169.5461883544922, + "loss": 48.0924, + "rewards/accuracies": 0.565625011920929, + "rewards/chosen": 0.21430592238903046, + "rewards/margins": 0.6119717359542847, + "rewards/rejected": -0.397665798664093, + "rewards/safe_rewards": -0.27201324701309204, + "rewards/unsafe_rewards": 0.7006251811981201, "step": 570 }, { "epoch": 0.31, "learning_rate": 4.3456596418799476e-07, - "logits/chosen": -2.1258435249328613, - "logits/rejected": -1.9077991247177124, - "logps/chosen": -251.86349487304688, - "logps/rejected": -226.2504425048828, - "loss": 6485.532, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4290865361690521, - "rewards/margins": 0.10481268167495728, - "rewards/rejected": -0.533899188041687, - "rewards/safe_rewards": -0.4373076856136322, - "rewards/unsafe_rewards": -0.42086538672447205, + "logits/chosen": -2.383977174758911, + "logits/rejected": -2.204479694366455, + "logps/chosen": -208.63818359375, + "logps/rejected": -172.74533081054688, + "loss": 40.887, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.31667083501815796, + "rewards/margins": 0.20149526000022888, + "rewards/rejected": 0.11517556756734848, + "rewards/safe_rewards": 0.08531586080789566, + "rewards/unsafe_rewards": 0.5480257868766785, "step": 580 }, { "epoch": 0.32, "learning_rate": 4.313651476468715e-07, - "logits/chosen": -2.182234764099121, - "logits/rejected": -1.9257800579071045, - "logps/chosen": -245.7446746826172, - "logps/rejected": -230.3665313720703, - "loss": 6464.5875, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.3970971703529358, - "rewards/margins": 0.09041866660118103, - "rewards/rejected": -0.4875158369541168, - "rewards/safe_rewards": -0.4161418378353119, - "rewards/unsafe_rewards": -0.3780525028705597, + "logits/chosen": -2.452789783477783, + "logits/rejected": -2.2367706298828125, + "logps/chosen": -206.00991821289062, + "logps/rejected": -181.455810546875, + "loss": 17.7462, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.025053849443793297, + "rewards/margins": -0.13409826159477234, + "rewards/rejected": 0.1591521054506302, + "rewards/safe_rewards": -0.05954737588763237, + "rewards/unsafe_rewards": 0.10965506732463837, "step": 590 }, { "epoch": 0.32, "learning_rate": 4.2810030335348693e-07, - "logits/chosen": -2.113201379776001, - "logits/rejected": -1.8778488636016846, - "logps/chosen": -256.13543701171875, - "logps/rejected": -213.571044921875, - "loss": 6756.0938, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.37578636407852173, - "rewards/margins": 0.06911512464284897, - "rewards/rejected": -0.4449015259742737, - "rewards/safe_rewards": -0.38628315925598145, - "rewards/unsafe_rewards": -0.3652896285057068, + "logits/chosen": -2.4099035263061523, + "logits/rejected": -2.218843936920166, + "logps/chosen": -218.79177856445312, + "logps/rejected": -168.67431640625, + "loss": 74.9431, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.23495905101299286, + "rewards/margins": -0.6415296196937561, + "rewards/rejected": 0.40657052397727966, + "rewards/safe_rewards": -0.5243301391601562, + "rewards/unsafe_rewards": 0.05441205948591232, "step": 600 }, { "epoch": 0.33, "learning_rate": 4.2477258390327806e-07, - "logits/chosen": -2.132603645324707, - "logits/rejected": -1.8388347625732422, - "logps/chosen": -228.72323608398438, - "logps/rejected": -217.3914031982422, - "loss": 6564.557, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.3873269259929657, - "rewards/margins": 0.10852308571338654, - "rewards/rejected": -0.4958500266075134, - "rewards/safe_rewards": -0.3940195143222809, - "rewards/unsafe_rewards": -0.3806343674659729, + "logits/chosen": -2.4378225803375244, + "logits/rejected": -2.2049014568328857, + "logps/chosen": -190.23843383789062, + "logps/rejected": -167.7356719970703, + "loss": 24.3453, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.24785485863685608, + "rewards/margins": -0.3185795247554779, + "rewards/rejected": 0.07072468847036362, + "rewards/safe_rewards": -0.1916726529598236, + "rewards/unsafe_rewards": -0.30403703451156616, "step": 610 }, { "epoch": 0.33, "learning_rate": 4.2138316408864197e-07, - "logits/chosen": -2.144538402557373, - "logits/rejected": -1.8722187280654907, - "logps/chosen": -231.070068359375, - "logps/rejected": -211.3180694580078, - "loss": 5815.3586, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.3485415577888489, - "rewards/margins": 0.13524214923381805, - "rewards/rejected": -0.48378363251686096, - "rewards/safe_rewards": -0.36353689432144165, - "rewards/unsafe_rewards": -0.33354613184928894, + "logits/chosen": -2.4828572273254395, + "logits/rejected": -2.251974105834961, + "logps/chosen": -195.9208221435547, + "logps/rejected": -162.9341278076172, + "loss": 47.0644, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.2951027750968933, + "rewards/margins": 0.2895166277885437, + "rewards/rejected": 0.005586123559623957, + "rewards/safe_rewards": 0.2697201073169708, + "rewards/unsafe_rewards": 0.3204854130744934, "step": 620 }, { "epoch": 0.34, "learning_rate": 4.179332404841962e-07, - "logits/chosen": -2.1126229763031006, - "logits/rejected": -1.8241170644760132, - "logps/chosen": -250.9884490966797, - "logps/rejected": -230.13394165039062, - "loss": 6280.5078, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.4237682819366455, - "rewards/margins": 0.11134223639965057, - "rewards/rejected": -0.5351104736328125, - "rewards/safe_rewards": -0.4349561631679535, - "rewards/unsafe_rewards": -0.4125804305076599, + "logits/chosen": -2.4540035724639893, + "logits/rejected": -2.223843812942505, + "logps/chosen": -208.46463012695312, + "logps/rejected": -176.60848999023438, + "loss": 25.2961, + "rewards/accuracies": 0.46562498807907104, + "rewards/chosen": 0.14698375761508942, + "rewards/margins": 0.1325828731060028, + "rewards/rejected": 0.014400847256183624, + "rewards/safe_rewards": 0.04335422068834305, + "rewards/unsafe_rewards": 0.250613272190094, "step": 630 }, { "epoch": 0.34, "learning_rate": 4.1442403102434954e-07, - "logits/chosen": -2.112366199493408, - "logits/rejected": -1.851994276046753, - "logps/chosen": -256.3794860839844, - "logps/rejected": -232.27554321289062, - "loss": 6645.3945, - "rewards/accuracies": 0.565625011920929, - "rewards/chosen": -0.44062596559524536, - "rewards/margins": 0.09120626747608185, - "rewards/rejected": -0.531832218170166, - "rewards/safe_rewards": -0.4353967607021332, - "rewards/unsafe_rewards": -0.4458550810813904, + "logits/chosen": -2.4651191234588623, + "logits/rejected": -2.252150535583496, + "logps/chosen": -212.79736328125, + "logps/rejected": -179.38711547851562, + "loss": 117.4084, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.48046717047691345, + "rewards/margins": -0.18567809462547302, + "rewards/rejected": -0.29478907585144043, + "rewards/safe_rewards": -0.6295033693313599, + "rewards/unsafe_rewards": -0.3314310312271118, "step": 640 }, { "epoch": 0.35, "learning_rate": 4.108567745733318e-07, - "logits/chosen": -2.0930869579315186, - "logits/rejected": -1.8053241968154907, - "logps/chosen": -219.0164794921875, - "logps/rejected": -208.75341796875, - "loss": 6823.0883, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.34530672430992126, - "rewards/margins": 0.07487599551677704, - "rewards/rejected": -0.4201827645301819, - "rewards/safe_rewards": -0.3363416790962219, - "rewards/unsafe_rewards": -0.354271799325943, + "logits/chosen": -2.447937488555908, + "logits/rejected": -2.201697826385498, + "logps/chosen": -184.49168395996094, + "logps/rejected": -166.9139404296875, + "loss": 10.7524, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.005866925232112408, + "rewards/margins": 0.17292609810829163, + "rewards/rejected": -0.1787930279970169, + "rewards/safe_rewards": 0.03122568130493164, + "rewards/unsafe_rewards": -0.042959537357091904, "step": 650 }, { "epoch": 0.36, "learning_rate": 4.0723273048783426e-07, - "logits/chosen": -2.1528728008270264, - "logits/rejected": -1.9104461669921875, - "logps/chosen": -246.50015258789062, - "logps/rejected": -206.0511932373047, - "loss": 6722.1141, - "rewards/accuracies": 0.528124988079071, - "rewards/chosen": -0.34617477655410767, - "rewards/margins": 0.06205359101295471, - "rewards/rejected": -0.40822833776474, - "rewards/safe_rewards": -0.33560293912887573, - "rewards/unsafe_rewards": -0.3567466139793396, + "logits/chosen": -2.44038462638855, + "logits/rejected": -2.2175660133361816, + "logps/chosen": -211.3206787109375, + "logps/rejected": -165.2122802734375, + "loss": 81.9566, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.5619795918464661, + "rewards/margins": 0.545897364616394, + "rewards/rejected": 0.016082104295492172, + "rewards/safe_rewards": 1.0618271827697754, + "rewards/unsafe_rewards": 0.06213190406560898, "step": 660 }, { "epoch": 0.36, "learning_rate": 4.0355317817241697e-07, - "logits/chosen": -2.108217716217041, - "logits/rejected": -1.8410425186157227, - "logps/chosen": -265.3013916015625, - "logps/rejected": -219.53402709960938, - "loss": 6793.2109, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.3496417999267578, - "rewards/margins": 0.07917375862598419, - "rewards/rejected": -0.4288156032562256, - "rewards/safe_rewards": -0.3285156488418579, - "rewards/unsafe_rewards": -0.3707680106163025, + "logits/chosen": -2.3970015048980713, + "logits/rejected": -2.163048267364502, + "logps/chosen": -229.952880859375, + "logps/rejected": -176.55599975585938, + "loss": 26.2558, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.38434115052223206, + "rewards/margins": 0.2878434658050537, + "rewards/rejected": 0.09649765491485596, + "rewards/safe_rewards": 0.5011194944381714, + "rewards/unsafe_rewards": 0.26756277680397034, "step": 670 }, { "epoch": 0.37, "learning_rate": 3.998194166278367e-07, - "logits/chosen": -2.1824183464050293, - "logits/rejected": -1.9256740808486938, - "logps/chosen": -228.2741241455078, - "logps/rejected": -197.99484252929688, - "loss": 6763.4398, - "rewards/accuracies": 0.546875, - "rewards/chosen": -0.35494309663772583, - "rewards/margins": 0.058072127401828766, - "rewards/rejected": -0.4130152761936188, - "rewards/safe_rewards": -0.353218674659729, - "rewards/unsafe_rewards": -0.3566676080226898, + "logits/chosen": -2.4422953128814697, + "logits/rejected": -2.2152860164642334, + "logps/chosen": -193.12109375, + "logps/rejected": -156.7648162841797, + "loss": 157.1721, + "rewards/accuracies": 0.4593749940395355, + "rewards/chosen": -0.3413035273551941, + "rewards/margins": -0.26979130506515503, + "rewards/rejected": -0.07151220738887787, + "rewards/safe_rewards": -0.49346867203712463, + "rewards/unsafe_rewards": -0.18913838267326355, "step": 680 }, { "epoch": 0.37, "learning_rate": 3.9603276399245855e-07, - "logits/chosen": -2.2138686180114746, - "logits/rejected": -1.9390268325805664, - "logps/chosen": -246.8724822998047, - "logps/rejected": -219.68814086914062, - "loss": 6987.0063, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": -0.34344762563705444, - "rewards/margins": 0.12904086709022522, - "rewards/rejected": -0.47248849272727966, - "rewards/safe_rewards": -0.3371937572956085, - "rewards/unsafe_rewards": -0.34970152378082275, + "logits/chosen": -2.4512076377868652, + "logits/rejected": -2.217556953430176, + "logps/chosen": -212.5731658935547, + "logps/rejected": -172.98239135742188, + "loss": 140.5213, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.045460961759090424, + "rewards/margins": 0.4976281523704529, + "rewards/rejected": -0.5430891513824463, + "rewards/safe_rewards": 0.1586223542690277, + "rewards/unsafe_rewards": -0.24954433739185333, "step": 690 }, { "epoch": 0.38, "learning_rate": 3.9219455707691e-07, - "logits/chosen": -2.201855421066284, - "logits/rejected": -1.927875280380249, - "logps/chosen": -269.231201171875, - "logps/rejected": -243.1079559326172, - "loss": 6437.2254, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.4689943194389343, - "rewards/margins": 0.08409595489501953, - "rewards/rejected": -0.5530902147293091, - "rewards/safe_rewards": -0.4955593943595886, - "rewards/unsafe_rewards": -0.44242924451828003, + "logits/chosen": -2.443801164627075, + "logits/rejected": -2.217026710510254, + "logps/chosen": -223.50064086914062, + "logps/rejected": -188.3572998046875, + "loss": 239.7127, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -1.1688416004180908, + "rewards/margins": -0.6104832291603088, + "rewards/rejected": -0.5583583116531372, + "rewards/safe_rewards": -0.5376420021057129, + "rewards/unsafe_rewards": -1.8000411987304688, "step": 700 }, { "epoch": 0.38, "learning_rate": 3.883061508921439e-07, - "logits/chosen": -2.2111687660217285, - "logits/rejected": -2.002533197402954, - "logps/chosen": -251.5044403076172, - "logps/rejected": -248.65231323242188, - "loss": 6540.6648, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": -0.5252124071121216, - "rewards/margins": 0.051811158657073975, - "rewards/rejected": -0.5770235061645508, - "rewards/safe_rewards": -0.5454440712928772, - "rewards/unsafe_rewards": -0.5049806833267212, + "logits/chosen": -2.4577882289886475, + "logits/rejected": -2.289802074432373, + "logps/chosen": -199.79066467285156, + "logps/rejected": -191.25059509277344, + "loss": 127.1414, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.8074787855148315, + "rewards/margins": -0.5068421363830566, + "rewards/rejected": -0.30063679814338684, + "rewards/safe_rewards": -1.0235812664031982, + "rewards/unsafe_rewards": -0.5913764238357544, "step": 710 }, { "epoch": 0.39, "learning_rate": 3.8436891817107555e-07, - "logits/chosen": -2.166321277618408, - "logits/rejected": -2.001189947128296, - "logps/chosen": -240.1176300048828, - "logps/rejected": -228.7688446044922, - "loss": 6719.8211, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.48019713163375854, - "rewards/margins": 0.08811075985431671, - "rewards/rejected": -0.5683078765869141, - "rewards/safe_rewards": -0.48789653182029724, - "rewards/unsafe_rewards": -0.47249770164489746, + "logits/chosen": -2.384692668914795, + "logits/rejected": -2.2363414764404297, + "logps/chosen": -192.9431915283203, + "logps/rejected": -173.0110626220703, + "loss": 88.3357, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": -0.845304012298584, + "rewards/margins": 0.22771115601062775, + "rewards/rejected": -1.0730152130126953, + "rewards/safe_rewards": -0.7142607569694519, + "rewards/unsafe_rewards": -0.9763473272323608, "step": 720 }, { "epoch": 0.39, "learning_rate": 3.8038424888396414e-07, - "logits/chosen": -2.277644157409668, - "logits/rejected": -2.064286708831787, - "logps/chosen": -227.665771484375, - "logps/rejected": -219.80184936523438, - "loss": 6400.2289, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": -0.384390264749527, - "rewards/margins": 0.08620595932006836, - "rewards/rejected": -0.47059616446495056, - "rewards/safe_rewards": -0.39066264033317566, - "rewards/unsafe_rewards": -0.3781178593635559, + "logits/chosen": -2.4334444999694824, + "logits/rejected": -2.2202000617980957, + "logps/chosen": -190.13265991210938, + "logps/rejected": -173.72535705566406, + "loss": 46.5741, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": -0.9059289693832397, + "rewards/margins": 0.07719539105892181, + "rewards/rejected": -0.9831243753433228, + "rewards/safe_rewards": -1.5265599489212036, + "rewards/unsafe_rewards": -0.2852979004383087, "step": 730 }, { "epoch": 0.4, "learning_rate": 3.763535497477079e-07, - "logits/chosen": -2.284943103790283, - "logits/rejected": -2.068763017654419, - "logps/chosen": -244.8645782470703, - "logps/rejected": -230.4162139892578, - "loss": 6407.4703, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4144255518913269, - "rewards/margins": 0.10388362407684326, - "rewards/rejected": -0.5183092355728149, - "rewards/safe_rewards": -0.4147794246673584, - "rewards/unsafe_rewards": -0.4140717387199402, + "logits/chosen": -2.428952693939209, + "logits/rejected": -2.205458641052246, + "logps/chosen": -203.35873413085938, + "logps/rejected": -178.9982452392578, + "loss": 30.0399, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.06326188147068024, + "rewards/margins": 0.4762391149997711, + "rewards/rejected": -0.4129772186279297, + "rewards/safe_rewards": 0.01683131232857704, + "rewards/unsafe_rewards": 0.10969245433807373, "step": 740 }, { "epoch": 0.4, "learning_rate": 3.7227824372922795e-07, - "logits/chosen": -2.2572951316833496, - "logits/rejected": -2.0249342918395996, - "logps/chosen": -233.4899444580078, - "logps/rejected": -220.7420654296875, - "loss": 6312.2859, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4421048164367676, - "rewards/margins": 0.09387814998626709, - "rewards/rejected": -0.5359830260276794, - "rewards/safe_rewards": -0.43796879053115845, - "rewards/unsafe_rewards": -0.4462409019470215, + "logits/chosen": -2.4341301918029785, + "logits/rejected": -2.2008628845214844, + "logps/chosen": -189.18417358398438, + "logps/rejected": -167.0784454345703, + "loss": 12.3092, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.09529106318950653, + "rewards/margins": 0.02994244359433651, + "rewards/rejected": 0.06534863263368607, + "rewards/safe_rewards": 0.103404700756073, + "rewards/unsafe_rewards": 0.08717743307352066, "step": 750 }, { "epoch": 0.41, "learning_rate": 3.681597695431148e-07, - "logits/chosen": -2.2042198181152344, - "logits/rejected": -2.0483179092407227, - "logps/chosen": -239.17190551757812, - "logps/rejected": -231.75057983398438, - "loss": 6550.8, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.37860310077667236, - "rewards/margins": 0.10926772654056549, - "rewards/rejected": -0.4878707826137543, - "rewards/safe_rewards": -0.39898186922073364, - "rewards/unsafe_rewards": -0.3582242727279663, + "logits/chosen": -2.397660732269287, + "logits/rejected": -2.248548984527588, + "logps/chosen": -201.36961364746094, + "logps/rejected": -183.10923767089844, + "loss": 44.1826, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.05803655833005905, + "rewards/margins": 0.0876794308423996, + "rewards/rejected": -0.14571599662303925, + "rewards/safe_rewards": -0.19255781173706055, + "rewards/unsafe_rewards": 0.07648466527462006, "step": 760 }, { "epoch": 0.41, "learning_rate": 3.639995811437159e-07, - "logits/chosen": -2.19938063621521, - "logits/rejected": -2.0218288898468018, - "logps/chosen": -238.9772186279297, - "logps/rejected": -230.42684936523438, - "loss": 6573.4984, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.41638341546058655, - "rewards/margins": 0.0957685336470604, - "rewards/rejected": -0.5121519565582275, - "rewards/safe_rewards": -0.4294784963130951, - "rewards/unsafe_rewards": -0.4032882750034332, + "logits/chosen": -2.3755042552948, + "logits/rejected": -2.191373348236084, + "logps/chosen": -197.1927032470703, + "logps/rejected": -179.4755859375, + "loss": 154.7574, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.14616283774375916, + "rewards/margins": 0.41009521484375, + "rewards/rejected": -0.26393240690231323, + "rewards/safe_rewards": 0.3775586485862732, + "rewards/unsafe_rewards": -0.08523297309875488, "step": 770 }, { "epoch": 0.42, "learning_rate": 3.597991472118426e-07, - "logits/chosen": -2.2364182472229004, - "logits/rejected": -2.0072903633117676, - "logps/chosen": -250.245361328125, - "logps/rejected": -227.9338836669922, - "loss": 6494.7406, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.43522390723228455, - "rewards/margins": 0.08199247717857361, - "rewards/rejected": -0.5172163844108582, - "rewards/safe_rewards": -0.4500158727169037, - "rewards/unsafe_rewards": -0.4204320013523102, + "logits/chosen": -2.4273521900177, + "logits/rejected": -2.192534923553467, + "logps/chosen": -206.8874053955078, + "logps/rejected": -176.24118041992188, + "loss": 36.5319, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.164406418800354, + "rewards/margins": -0.1354350596666336, + "rewards/rejected": -0.028971344232559204, + "rewards/safe_rewards": 0.2160978764295578, + "rewards/unsafe_rewards": -0.5449106097221375, "step": 780 }, { "epoch": 0.43, "learning_rate": 3.5555995063627836e-07, - "logits/chosen": -2.1975796222686768, - "logits/rejected": -1.9783557653427124, - "logps/chosen": -269.57159423828125, - "logps/rejected": -248.4347381591797, - "loss": 6381.1195, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.46899929642677307, - "rewards/margins": 0.10295481979846954, - "rewards/rejected": -0.5719541311264038, - "rewards/safe_rewards": -0.4677364230155945, - "rewards/unsafe_rewards": -0.47026222944259644, + "logits/chosen": -2.415065050125122, + "logits/rejected": -2.194133758544922, + "logps/chosen": -222.50820922851562, + "logps/rejected": -191.37088012695312, + "loss": 16.1129, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.1634823977947235, + "rewards/margins": 0.29501140117645264, + "rewards/rejected": -0.1315290331840515, + "rewards/safe_rewards": -0.023002928122878075, + "rewards/unsafe_rewards": 0.34996774792671204, "step": 790 }, { "epoch": 0.43, "learning_rate": 3.512834879902715e-07, - "logits/chosen": -2.1973164081573486, - "logits/rejected": -1.9568490982055664, - "logps/chosen": -239.0543975830078, - "logps/rejected": -227.7292938232422, - "loss": 6457.2102, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.4537328779697418, - "rewards/margins": 0.13302846252918243, - "rewards/rejected": -0.5867613554000854, - "rewards/safe_rewards": -0.43677186965942383, - "rewards/unsafe_rewards": -0.4706939160823822, + "logits/chosen": -2.446582794189453, + "logits/rejected": -2.2151386737823486, + "logps/chosen": -193.52993774414062, + "logps/rejected": -169.22207641601562, + "loss": 17.2298, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.15118324756622314, + "rewards/margins": 0.320087730884552, + "rewards/rejected": -0.16890448331832886, + "rewards/safe_rewards": 0.15818454325199127, + "rewards/unsafe_rewards": 0.14418195188045502, "step": 800 }, { "epoch": 0.44, "learning_rate": 3.4697126900319616e-07, - "logits/chosen": -2.1789050102233887, - "logits/rejected": -1.9364265203475952, - "logps/chosen": -245.84500122070312, - "logps/rejected": -226.31884765625, - "loss": 6765.2641, - "rewards/accuracies": 0.653124988079071, - "rewards/chosen": -0.4480791985988617, - "rewards/margins": 0.13407856225967407, - "rewards/rejected": -0.5821577310562134, - "rewards/safe_rewards": -0.4408470690250397, - "rewards/unsafe_rewards": -0.4553113579750061, + "logits/chosen": -2.4158897399902344, + "logits/rejected": -2.180227756500244, + "logps/chosen": -200.93173217773438, + "logps/rejected": -167.99073791503906, + "loss": 22.7375, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.10535750538110733, + "rewards/margins": -0.006962819490581751, + "rewards/rejected": 0.11232032626867294, + "rewards/safe_rewards": -0.18741589784622192, + "rewards/unsafe_rewards": 0.3981309235095978, "step": 810 }, { "epoch": 0.44, "learning_rate": 3.426248160275693e-07, - "logits/chosen": -2.1804230213165283, - "logits/rejected": -1.9869539737701416, - "logps/chosen": -239.50259399414062, - "logps/rejected": -229.2792510986328, - "loss": 6714.4187, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.43128710985183716, - "rewards/margins": 0.08299657702445984, - "rewards/rejected": -0.5142837166786194, - "rewards/safe_rewards": -0.4506538510322571, - "rewards/unsafe_rewards": -0.4119204580783844, + "logits/chosen": -2.4130988121032715, + "logits/rejected": -2.223747730255127, + "logps/chosen": -196.2846221923828, + "logps/rejected": -177.1783447265625, + "loss": 62.6098, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": 0.08924231678247452, + "rewards/margins": -0.5832756757736206, + "rewards/rejected": 0.6725180745124817, + "rewards/safe_rewards": 0.08092136681079865, + "rewards/unsafe_rewards": 0.09756331145763397, "step": 820 }, { "epoch": 0.45, "learning_rate": 3.3824566350161094e-07, - "logits/chosen": -2.2021470069885254, - "logits/rejected": -1.9547889232635498, - "logps/chosen": -247.32504272460938, - "logps/rejected": -212.7538604736328, - "loss": 6502.4563, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.3571341633796692, - "rewards/margins": 0.11506947129964828, - "rewards/rejected": -0.47220364212989807, - "rewards/safe_rewards": -0.3600516617298126, - "rewards/unsafe_rewards": -0.35421669483184814, + "logits/chosen": -2.4248764514923096, + "logits/rejected": -2.1799604892730713, + "logps/chosen": -211.0237274169922, + "logps/rejected": -165.1766815185547, + "loss": 8.7437, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.5879140496253967, + "rewards/margins": 0.23111946880817413, + "rewards/rejected": 0.356794536113739, + "rewards/safe_rewards": 0.5438351631164551, + "rewards/unsafe_rewards": 0.6319928765296936, "step": 830 }, { "epoch": 0.45, "learning_rate": 3.338353574075381e-07, - "logits/chosen": -2.1721179485321045, - "logits/rejected": -1.9909629821777344, - "logps/chosen": -226.5969696044922, - "logps/rejected": -212.7884063720703, - "loss": 7247.418, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.38080552220344543, - "rewards/margins": 0.08085910975933075, - "rewards/rejected": -0.46166467666625977, - "rewards/safe_rewards": -0.3966561555862427, - "rewards/unsafe_rewards": -0.3649549186229706, + "logits/chosen": -2.3919012546539307, + "logits/rejected": -2.212056875228882, + "logps/chosen": -188.0956268310547, + "logps/rejected": -166.2266387939453, + "loss": 23.4515, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 0.4208035469055176, + "rewards/margins": 0.025497043505311012, + "rewards/rejected": 0.3953064978122711, + "rewards/safe_rewards": 0.5599286556243896, + "rewards/unsafe_rewards": 0.2816784679889679, "step": 840 }, { "epoch": 0.46, "learning_rate": 3.2939545472578314e-07, - "logits/chosen": -2.2441887855529785, - "logits/rejected": -1.9507675170898438, - "logps/chosen": -263.19268798828125, - "logps/rejected": -226.2810516357422, - "loss": 6521.1922, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.41747385263442993, - "rewards/margins": 0.06601213663816452, - "rewards/rejected": -0.48348602652549744, - "rewards/safe_rewards": -0.4087367653846741, - "rewards/unsafe_rewards": -0.42621102929115295, + "logits/chosen": -2.4613280296325684, + "logits/rejected": -2.1779792308807373, + "logps/chosen": -220.7722625732422, + "logps/rejected": -177.66567993164062, + "loss": 71.1367, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.6730166077613831, + "rewards/margins": 0.4062492251396179, + "rewards/rejected": 0.26676732301712036, + "rewards/safe_rewards": 0.1429261863231659, + "rewards/unsafe_rewards": 1.2031069993972778, "step": 850 }, { "epoch": 0.46, "learning_rate": 3.2492752288532916e-07, - "logits/chosen": -2.2073259353637695, - "logits/rejected": -1.9797918796539307, - "logps/chosen": -234.3251495361328, - "logps/rejected": -222.50390625, - "loss": 6354.6242, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.41758179664611816, - "rewards/margins": 0.0925571545958519, - "rewards/rejected": -0.5101389288902283, - "rewards/safe_rewards": -0.40995293855667114, - "rewards/unsafe_rewards": -0.4252106547355652, + "logits/chosen": -2.4267163276672363, + "logits/rejected": -2.2031116485595703, + "logps/chosen": -192.3984832763672, + "logps/rejected": -171.2382354736328, + "loss": 46.0145, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.1684725284576416, + "rewards/margins": -0.08331739902496338, + "rewards/rejected": 0.25178998708724976, + "rewards/safe_rewards": 0.12440772354602814, + "rewards/unsafe_rewards": 0.21253737807273865, "step": 860 }, { "epoch": 0.47, "learning_rate": 3.204331392103574e-07, - "logits/chosen": -2.2528116703033447, - "logits/rejected": -1.9719661474227905, - "logps/chosen": -254.8304443359375, - "logps/rejected": -214.4097137451172, - "loss": 6612.2297, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.4306084215641022, - "rewards/margins": 0.07473549991846085, - "rewards/rejected": -0.5053439736366272, - "rewards/safe_rewards": -0.4317397475242615, - "rewards/unsafe_rewards": -0.42947712540626526, + "logits/chosen": -2.483734369277954, + "logits/rejected": -2.2113869190216064, + "logps/chosen": -211.577880859375, + "logps/rejected": -163.4304656982422, + "loss": 155.5441, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": 0.19171550869941711, + "rewards/margins": -0.2531381845474243, + "rewards/rejected": 0.44485369324684143, + "rewards/safe_rewards": 0.10490121692419052, + "rewards/unsafe_rewards": 0.2785297632217407, "step": 870 }, { "epoch": 0.47, "learning_rate": 3.159138903634006e-07, - "logits/chosen": -2.15206241607666, - "logits/rejected": -1.9694172143936157, - "logps/chosen": -251.68887329101562, - "logps/rejected": -230.56942749023438, - "loss": 6409.4055, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.473393052816391, - "rewards/margins": 0.0951012596487999, - "rewards/rejected": -0.5684942603111267, - "rewards/safe_rewards": -0.4793036878108978, - "rewards/unsafe_rewards": -0.46748247742652893, + "logits/chosen": -2.409116744995117, + "logits/rejected": -2.2290921211242676, + "logps/chosen": -203.94369506835938, + "logps/rejected": -173.5029754638672, + "loss": 9.3153, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": 0.4058583676815033, + "rewards/margins": 0.18885207176208496, + "rewards/rejected": 0.21700629591941833, + "rewards/safe_rewards": 0.3220987915992737, + "rewards/unsafe_rewards": 0.48961788415908813, "step": 880 }, { "epoch": 0.48, "learning_rate": 3.1137137178519977e-07, - "logits/chosen": -2.1355667114257812, - "logits/rejected": -1.9253263473510742, - "logps/chosen": -233.0535888671875, - "logps/rejected": -214.96810913085938, - "loss": 6620.7125, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.4889693260192871, - "rewards/margins": 0.08977462351322174, - "rewards/rejected": -0.5787439942359924, - "rewards/safe_rewards": -0.4807513654232025, - "rewards/unsafe_rewards": -0.49718719720840454, + "logits/chosen": -2.4068942070007324, + "logits/rejected": -2.212474822998047, + "logps/chosen": -184.1978759765625, + "logps/rejected": -157.02920532226562, + "loss": 47.3581, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.041205547749996185, + "rewards/margins": -0.1057499423623085, + "rewards/rejected": 0.06454440206289291, + "rewards/safe_rewards": -0.18809974193572998, + "rewards/unsafe_rewards": 0.10568861663341522, "step": 890 }, { "epoch": 0.48, "learning_rate": 3.068071871314626e-07, - "logits/chosen": -2.1069016456604004, - "logits/rejected": -1.892960548400879, - "logps/chosen": -236.0681915283203, - "logps/rejected": -210.6871337890625, - "loss": 6417.7578, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.42577171325683594, - "rewards/margins": 0.09936337172985077, - "rewards/rejected": -0.5251351594924927, - "rewards/safe_rewards": -0.43935972452163696, - "rewards/unsafe_rewards": -0.4121837019920349, + "logits/chosen": -2.3744447231292725, + "logits/rejected": -2.1711204051971436, + "logps/chosen": -193.363525390625, + "logps/rejected": -157.98092651367188, + "loss": 36.4272, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.1274958997964859, + "rewards/margins": -0.06520196795463562, + "rewards/rejected": 0.19269786775112152, + "rewards/safe_rewards": 0.27050352096557617, + "rewards/unsafe_rewards": -0.015511776320636272, "step": 900 }, { "epoch": 0.49, "learning_rate": 3.022229477067205e-07, - "logits/chosen": -2.155628204345703, - "logits/rejected": -1.9340698719024658, - "logps/chosen": -254.1663055419922, - "logps/rejected": -214.5603790283203, - "loss": 6326.4648, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4181991219520569, - "rewards/margins": 0.10034485161304474, - "rewards/rejected": -0.5185439586639404, - "rewards/safe_rewards": -0.40648728609085083, - "rewards/unsafe_rewards": -0.4299109876155853, + "logits/chosen": -2.4298009872436523, + "logits/rejected": -2.2137274742126465, + "logps/chosen": -212.06454467773438, + "logps/rejected": -162.7147216796875, + "loss": 22.3251, + "rewards/accuracies": 0.5406249761581421, + "rewards/chosen": 0.2818406820297241, + "rewards/margins": 0.29059693217277527, + "rewards/rejected": -0.008756252937018871, + "rewards/safe_rewards": 0.12103135883808136, + "rewards/unsafe_rewards": 0.4426499903202057, "step": 910 }, { "epoch": 0.49, "learning_rate": 2.976202718954869e-07, - "logits/chosen": -2.1675617694854736, - "logits/rejected": -1.925453543663025, - "logps/chosen": -253.51510620117188, - "logps/rejected": -238.5836181640625, - "loss": 6792.7578, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.4517107903957367, - "rewards/margins": 0.08173434436321259, - "rewards/rejected": -0.5334451794624329, - "rewards/safe_rewards": -0.4674789309501648, - "rewards/unsafe_rewards": -0.4359425902366638, + "logits/chosen": -2.4414241313934326, + "logits/rejected": -2.214113235473633, + "logps/chosen": -208.3417510986328, + "logps/rejected": -185.30526733398438, + "loss": 15.9322, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.002296045422554016, + "rewards/margins": 0.06845332682132721, + "rewards/rejected": -0.0661572739481926, + "rewards/safe_rewards": 0.11791107803583145, + "rewards/unsafe_rewards": -0.11331899464130402, "step": 920 }, { "epoch": 0.5, "learning_rate": 2.930007845909146e-07, - "logits/chosen": -2.1799418926239014, - "logits/rejected": -2.0040533542633057, - "logps/chosen": -266.9742126464844, - "logps/rejected": -249.785888671875, - "loss": 6767.8273, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.46331778168678284, - "rewards/margins": 0.09225159883499146, - "rewards/rejected": -0.5555693507194519, - "rewards/safe_rewards": -0.45066460967063904, - "rewards/unsafe_rewards": -0.47597089409828186, + "logits/chosen": -2.465981960296631, + "logits/rejected": -2.2979178428649902, + "logps/chosen": -220.63400268554688, + "logps/rejected": -194.15982055664062, + "loss": 20.6631, + "rewards/accuracies": 0.44062501192092896, + "rewards/chosen": 0.008412945084273815, + "rewards/margins": -0.06071774289011955, + "rewards/rejected": 0.06913068145513535, + "rewards/safe_rewards": 0.15175995230674744, + "rewards/unsafe_rewards": -0.13493406772613525, "step": 930 }, { "epoch": 0.51, "learning_rate": 2.8836611662115634e-07, - "logits/chosen": -2.0794804096221924, - "logits/rejected": -1.8282148838043213, - "logps/chosen": -250.9180450439453, - "logps/rejected": -217.40322875976562, - "loss": 6787.9852, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.49260464310646057, - "rewards/margins": 0.09080933034420013, - "rewards/rejected": -0.5834139585494995, - "rewards/safe_rewards": -0.484276682138443, - "rewards/unsafe_rewards": -0.5009325742721558, + "logits/chosen": -2.411681890487671, + "logits/rejected": -2.184065818786621, + "logps/chosen": -201.34774780273438, + "logps/rejected": -158.77896118164062, + "loss": 53.4563, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.30983632802963257, + "rewards/margins": 0.02700033411383629, + "rewards/rejected": 0.282835990190506, + "rewards/safe_rewards": 0.16020536422729492, + "rewards/unsafe_rewards": 0.4594673216342926, "step": 940 }, { "epoch": 0.51, "learning_rate": 2.8371790417362986e-07, - "logits/chosen": -2.115889310836792, - "logits/rejected": -1.9069702625274658, - "logps/chosen": -242.36587524414062, - "logps/rejected": -239.2466278076172, - "loss": 7101.5695, - "rewards/accuracies": 0.5531250238418579, - "rewards/chosen": -0.47016048431396484, - "rewards/margins": 0.0702749639749527, - "rewards/rejected": -0.5404354333877563, - "rewards/safe_rewards": -0.46828895807266235, - "rewards/unsafe_rewards": -0.47203201055526733, + "logits/chosen": -2.4363036155700684, + "logits/rejected": -2.2508435249328613, + "logps/chosen": -194.97052001953125, + "logps/rejected": -184.87435913085938, + "loss": 15.1437, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3793017268180847, + "rewards/margins": 0.050546444952487946, + "rewards/rejected": 0.328755259513855, + "rewards/safe_rewards": 0.453242689371109, + "rewards/unsafe_rewards": 0.30536073446273804, "step": 950 }, { "epoch": 0.52, "learning_rate": 2.7905778821739056e-07, - "logits/chosen": -2.122325897216797, - "logits/rejected": -1.8465118408203125, - "logps/chosen": -248.68179321289062, - "logps/rejected": -213.2493133544922, - "loss": 6604.2383, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.40824347734451294, - "rewards/margins": 0.10310404002666473, - "rewards/rejected": -0.5113475322723389, - "rewards/safe_rewards": -0.390902042388916, - "rewards/unsafe_rewards": -0.42558494210243225, + "logits/chosen": -2.430182456970215, + "logits/rejected": -2.181687116622925, + "logps/chosen": -207.5760955810547, + "logps/rejected": -161.82400512695312, + "loss": 36.165, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.2813587188720703, + "rewards/margins": -0.009190035052597523, + "rewards/rejected": 0.29054874181747437, + "rewards/safe_rewards": 0.5300852060317993, + "rewards/unsafe_rewards": 0.03263214975595474, "step": 960 }, { "epoch": 0.52, "learning_rate": 2.74387413923817e-07, - "logits/chosen": -2.0418407917022705, - "logits/rejected": -1.8530842065811157, - "logps/chosen": -263.1913757324219, - "logps/rejected": -247.3702392578125, - "loss": 6765.8367, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.4658144414424896, - "rewards/margins": 0.08784971386194229, - "rewards/rejected": -0.5536641478538513, - "rewards/safe_rewards": -0.4485572874546051, - "rewards/unsafe_rewards": -0.4830716550350189, + "logits/chosen": -2.3779215812683105, + "logits/rejected": -2.2126498222351074, + "logps/chosen": -216.20980834960938, + "logps/rejected": -191.72068786621094, + "loss": 35.9574, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": 0.40009957551956177, + "rewards/margins": 0.11698710918426514, + "rewards/rejected": 0.28311246633529663, + "rewards/safe_rewards": 0.3544066548347473, + "rewards/unsafe_rewards": 0.4457924962043762, "step": 970 }, { "epoch": 0.53, "learning_rate": 2.69708430085812e-07, - "logits/chosen": -2.1047558784484863, - "logits/rejected": -1.8455266952514648, - "logps/chosen": -260.87078857421875, - "logps/rejected": -240.1245574951172, - "loss": 6804.8938, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.4981120526790619, - "rewards/margins": 0.11919406801462173, - "rewards/rejected": -0.617306113243103, - "rewards/safe_rewards": -0.5019406080245972, - "rewards/unsafe_rewards": -0.494283527135849, + "logits/chosen": -2.442641496658325, + "logits/rejected": -2.2196171283721924, + "logps/chosen": -210.2590789794922, + "logps/rejected": -178.38427734375, + "loss": 143.7915, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.8005061149597168, + "rewards/margins": 0.7907932996749878, + "rewards/rejected": 0.009712839499115944, + "rewards/safe_rewards": 1.2023097276687622, + "rewards/unsafe_rewards": 0.39870262145996094, "step": 980 }, { "epoch": 0.53, "learning_rate": 2.6502248853572504e-07, - "logits/chosen": -2.065930128097534, - "logits/rejected": -1.8151044845581055, - "logps/chosen": -241.413818359375, - "logps/rejected": -220.83212280273438, - "loss": 6964.4953, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4998900294303894, - "rewards/margins": 0.0783344954252243, - "rewards/rejected": -0.5782245397567749, - "rewards/safe_rewards": -0.5021571516990662, - "rewards/unsafe_rewards": -0.49762290716171265, + "logits/chosen": -2.397225856781006, + "logits/rejected": -2.1839497089385986, + "logps/chosen": -191.41046142578125, + "logps/rejected": -162.9442901611328, + "loss": 12.2808, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.014359796419739723, + "rewards/margins": -0.05102130025625229, + "rewards/rejected": 0.06538109481334686, + "rewards/safe_rewards": -0.18064935505390167, + "rewards/unsafe_rewards": 0.20936894416809082, "step": 990 }, { "epoch": 0.54, "learning_rate": 2.6033124356220325e-07, - "logits/chosen": -2.0734333992004395, - "logits/rejected": -1.8190828561782837, - "logps/chosen": -241.6118621826172, - "logps/rejected": -212.5392303466797, - "loss": 6458.732, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4252408444881439, - "rewards/margins": 0.10315348953008652, - "rewards/rejected": -0.528394341468811, - "rewards/safe_rewards": -0.4195174276828766, - "rewards/unsafe_rewards": -0.43096423149108887, + "logits/chosen": -2.364447593688965, + "logits/rejected": -2.1461973190307617, + "logps/chosen": -199.1238555908203, + "logps/rejected": -159.5116729736328, + "loss": 34.7958, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.03608076646924019, + "rewards/margins": -0.22421510517597198, + "rewards/rejected": 0.1881343126296997, + "rewards/safe_rewards": 0.28278595209121704, + "rewards/unsafe_rewards": -0.3549474775791168, "step": 1000 }, { "epoch": 0.54, - "eval_logits/chosen": -1.7035794258117676, - "eval_logits/rejected": -1.4110270738601685, - "eval_logps/chosen": -194.92967224121094, - "eval_logps/rejected": -155.5638885498047, - "eval_loss": 3057.427490234375, - "eval_rewards/accuracies": 0.4847531318664551, - "eval_rewards/chosen": -0.6406379342079163, - "eval_rewards/margins": -0.008517943322658539, - "eval_rewards/rejected": -0.6321200728416443, - "eval_rewards/safe_rewards": -0.6396505832672119, - "eval_rewards/unsafe_rewards": -0.6373612284660339, - "eval_runtime": 1793.558, - "eval_samples_per_second": 18.424, - "eval_steps_per_second": 1.152, + "eval_logits/chosen": -2.0551209449768066, + "eval_logits/rejected": -1.7989723682403564, + "eval_logps/chosen": -130.9921875, + "eval_logps/rejected": -92.4808578491211, + "eval_loss": 0.7397361993789673, + "eval_rewards/accuracies": 0.5028436779975891, + "eval_rewards/chosen": -0.12634092569351196, + "eval_rewards/margins": 0.00263192574493587, + "eval_rewards/rejected": -0.1289728581905365, + "eval_rewards/safe_rewards": -0.12365306168794632, + "eval_rewards/unsafe_rewards": -0.1263761818408966, + "eval_runtime": 1869.3277, + "eval_samples_per_second": 17.677, + "eval_steps_per_second": 1.105, "step": 1000 }, { "epoch": 0.54, "learning_rate": 2.55636351326173e-07, - "logits/chosen": -2.128598690032959, - "logits/rejected": -1.909714937210083, - "logps/chosen": -256.15692138671875, - "logps/rejected": -227.8979949951172, - "loss": 6005.793, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4100671708583832, - "rewards/margins": 0.1085771769285202, - "rewards/rejected": -0.518644392490387, - "rewards/safe_rewards": -0.4100631773471832, - "rewards/unsafe_rewards": -0.4100712239742279, + "logits/chosen": -2.4121344089508057, + "logits/rejected": -2.2164716720581055, + "logps/chosen": -214.9409637451172, + "logps/rejected": -175.6654815673828, + "loss": 72.6154, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.20922379195690155, + "rewards/margins": -0.15886008739471436, + "rewards/rejected": 0.3680838942527771, + "rewards/safe_rewards": 0.6282423734664917, + "rewards/unsafe_rewards": -0.20979471504688263, "step": 1010 }, { "epoch": 0.55, "learning_rate": 2.509394692761622e-07, - "logits/chosen": -2.1080050468444824, - "logits/rejected": -1.8461472988128662, - "logps/chosen": -260.11492919921875, - "logps/rejected": -234.23159790039062, - "loss": 6730.5984, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.4158630967140198, - "rewards/margins": 0.11760497093200684, - "rewards/rejected": -0.5334680676460266, - "rewards/safe_rewards": -0.413133442401886, - "rewards/unsafe_rewards": -0.41859275102615356, + "logits/chosen": -2.39310884475708, + "logits/rejected": -2.1510488986968994, + "logps/chosen": -218.1635284423828, + "logps/rejected": -180.8001251220703, + "loss": 79.5377, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.365112841129303, + "rewards/margins": 0.28045016527175903, + "rewards/rejected": 0.08466275036334991, + "rewards/safe_rewards": 0.08056111633777618, + "rewards/unsafe_rewards": 0.649664580821991, "step": 1020 }, { "epoch": 0.55, "learning_rate": 2.462422555631674e-07, - "logits/chosen": -2.129202127456665, - "logits/rejected": -1.863739252090454, - "logps/chosen": -240.8564910888672, - "logps/rejected": -216.32406616210938, - "loss": 6471.7312, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": -0.4328088164329529, - "rewards/margins": 0.11855790764093399, - "rewards/rejected": -0.5513667464256287, - "rewards/safe_rewards": -0.4395027160644531, - "rewards/unsafe_rewards": -0.426114946603775, + "logits/chosen": -2.4212746620178223, + "logits/rejected": -2.187579393386841, + "logps/chosen": -197.0594024658203, + "logps/rejected": -160.92257690429688, + "loss": 30.6297, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.516227126121521, + "rewards/margins": 0.2513945698738098, + "rewards/rejected": 0.2648325562477112, + "rewards/safe_rewards": 0.3707699179649353, + "rewards/unsafe_rewards": 0.6616843938827515, "step": 1030 }, { "epoch": 0.56, "learning_rate": 2.415463684552728e-07, - "logits/chosen": -2.0589680671691895, - "logits/rejected": -1.8485314846038818, - "logps/chosen": -236.1913299560547, - "logps/rejected": -216.6231231689453, - "loss": 6823.7234, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.48712578415870667, - "rewards/margins": 0.08755507320165634, - "rewards/rejected": -0.5746808648109436, - "rewards/safe_rewards": -0.4804520010948181, - "rewards/unsafe_rewards": -0.4937995970249176, + "logits/chosen": -2.3526053428649902, + "logits/rejected": -2.168795585632324, + "logps/chosen": -187.2362518310547, + "logps/rejected": -158.90509033203125, + "loss": 16.6677, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": 0.2424931526184082, + "rewards/margins": -0.007419240660965443, + "rewards/rejected": 0.2499123513698578, + "rewards/safe_rewards": 0.3042396008968353, + "rewards/unsafe_rewards": 0.1807466745376587, "step": 1040 }, { "epoch": 0.56, "learning_rate": 2.3685346575222807e-07, - "logits/chosen": -2.0947399139404297, - "logits/rejected": -1.8131996393203735, - "logps/chosen": -252.2243194580078, - "logps/rejected": -225.3450164794922, - "loss": 6464.8258, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.4532151222229004, - "rewards/margins": 0.09485702961683273, - "rewards/rejected": -0.5480721592903137, - "rewards/safe_rewards": -0.464500367641449, - "rewards/unsafe_rewards": -0.4419298768043518, + "logits/chosen": -2.388552188873291, + "logits/rejected": -2.140934467315674, + "logps/chosen": -206.6807098388672, + "logps/rejected": -170.2689666748047, + "loss": 9.8385, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.22210577130317688, + "rewards/margins": -0.04671960324048996, + "rewards/rejected": 0.26882538199424744, + "rewards/safe_rewards": 0.3220168948173523, + "rewards/unsafe_rewards": 0.12219462543725967, "step": 1050 }, { "epoch": 0.57, "learning_rate": 2.321652042001919e-07, - "logits/chosen": -2.109151601791382, - "logits/rejected": -1.7986654043197632, - "logps/chosen": -256.6636657714844, - "logps/rejected": -242.2730255126953, - "loss": 6561.9937, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.46755141019821167, - "rewards/margins": 0.12294862419366837, - "rewards/rejected": -0.590499997138977, - "rewards/safe_rewards": -0.4698343276977539, - "rewards/unsafe_rewards": -0.4652685225009918, + "logits/chosen": -2.390388011932373, + "logits/rejected": -2.10972261428833, + "logps/chosen": -209.7392120361328, + "logps/rejected": -183.13662719726562, + "loss": 9.9643, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.16931462287902832, + "rewards/margins": 0.08294131606817245, + "rewards/rejected": 0.08637328445911407, + "rewards/safe_rewards": 0.11830408871173859, + "rewards/unsafe_rewards": 0.22032511234283447, "step": 1060 }, { "epoch": 0.58, "learning_rate": 2.2748323890684662e-07, - "logits/chosen": -2.102909803390503, - "logits/rejected": -1.8447599411010742, - "logps/chosen": -245.5310821533203, - "logps/rejected": -227.4581756591797, - "loss": 6596.2875, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.46844926476478577, - "rewards/margins": 0.10815373808145523, - "rewards/rejected": -0.5766030550003052, - "rewards/safe_rewards": -0.4830314517021179, - "rewards/unsafe_rewards": -0.4538671374320984, + "logits/chosen": -2.3839309215545654, + "logits/rejected": -2.1623384952545166, + "logps/chosen": -198.49668884277344, + "logps/rejected": -169.58737182617188, + "loss": 11.8899, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.18948496878147125, + "rewards/margins": -0.021000146865844727, + "rewards/rejected": 0.2104850709438324, + "rewards/safe_rewards": 0.07809984683990479, + "rewards/unsafe_rewards": 0.3008700907230377, "step": 1070 }, { "epoch": 0.58, "learning_rate": 2.2280922275709213e-07, - "logits/chosen": -2.111426830291748, - "logits/rejected": -1.8440996408462524, - "logps/chosen": -249.52572631835938, - "logps/rejected": -235.2782745361328, - "loss": 6598.8516, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4535207152366638, - "rewards/margins": 0.10985337197780609, - "rewards/rejected": -0.5633742213249207, - "rewards/safe_rewards": -0.4334963262081146, - "rewards/unsafe_rewards": -0.47354525327682495, + "logits/chosen": -2.402510166168213, + "logits/rejected": -2.1689133644104004, + "logps/chosen": -204.17782592773438, + "logps/rejected": -179.0993194580078, + "loss": 46.4965, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.004160255193710327, + "rewards/margins": 0.15431872010231018, + "rewards/rejected": -0.1584789752960205, + "rewards/safe_rewards": -0.011111170053482056, + "rewards/unsafe_rewards": 0.0027906596660614014, "step": 1080 }, { "epoch": 0.59, "learning_rate": 2.1814480582952375e-07, - "logits/chosen": -2.1081595420837402, - "logits/rejected": -1.845463514328003, - "logps/chosen": -252.86978149414062, - "logps/rejected": -241.583984375, - "loss": 6642.643, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.49175339937210083, - "rewards/margins": 0.10933689773082733, - "rewards/rejected": -0.6010903716087341, - "rewards/safe_rewards": -0.49363985657691956, - "rewards/unsafe_rewards": -0.48986703157424927, + "logits/chosen": -2.410515308380127, + "logits/rejected": -2.184720993041992, + "logps/chosen": -203.24267578125, + "logps/rejected": -181.4256134033203, + "loss": 102.4097, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 0.45176035165786743, + "rewards/margins": 0.40241655707359314, + "rewards/rejected": 0.04934380576014519, + "rewards/safe_rewards": 0.11663278192281723, + "rewards/unsafe_rewards": 0.7868879437446594, "step": 1090 }, { "epoch": 0.59, "learning_rate": 2.1349163481390187e-07, - "logits/chosen": -2.0954482555389404, - "logits/rejected": -1.8629337549209595, - "logps/chosen": -238.1507568359375, - "logps/rejected": -226.63101196289062, - "loss": 6691.6672, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.4491768479347229, - "rewards/margins": 0.09711066633462906, - "rewards/rejected": -0.5462875366210938, - "rewards/safe_rewards": -0.4436133801937103, - "rewards/unsafe_rewards": -0.45474034547805786, + "logits/chosen": -2.397282600402832, + "logits/rejected": -2.194654703140259, + "logps/chosen": -193.00746154785156, + "logps/rejected": -171.80690002441406, + "loss": 7.6309, + "rewards/accuracies": 0.515625, + "rewards/chosen": 0.22561688721179962, + "rewards/margins": 0.030238542705774307, + "rewards/rejected": 0.1953783482313156, + "rewards/safe_rewards": 0.10761779546737671, + "rewards/unsafe_rewards": 0.34361597895622253, "step": 1100 }, { "epoch": 0.6, "learning_rate": 2.0885135242981647e-07, - "logits/chosen": -2.0993576049804688, - "logits/rejected": -1.8268959522247314, - "logps/chosen": -258.9464111328125, - "logps/rejected": -218.21286010742188, - "loss": 6055.6734, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.45462924242019653, - "rewards/margins": 0.10500512272119522, - "rewards/rejected": -0.5596343278884888, - "rewards/safe_rewards": -0.4565979540348053, - "rewards/unsafe_rewards": -0.4526605010032654, + "logits/chosen": -2.398287057876587, + "logits/rejected": -2.1465389728546143, + "logps/chosen": -213.0477752685547, + "logps/rejected": -162.02694702148438, + "loss": 7.6341, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.43572482466697693, + "rewards/margins": 0.21323783695697784, + "rewards/rejected": 0.22248701751232147, + "rewards/safe_rewards": 0.5539884567260742, + "rewards/unsafe_rewards": 0.3174612522125244, "step": 1110 }, { "epoch": 0.6, "learning_rate": 2.0422559684675494e-07, - "logits/chosen": -2.1483044624328613, - "logits/rejected": -1.8406091928482056, - "logps/chosen": -258.66229248046875, - "logps/rejected": -219.83975219726562, - "loss": 6445.002, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.4176865220069885, - "rewards/margins": 0.09093281626701355, - "rewards/rejected": -0.5086194276809692, - "rewards/safe_rewards": -0.3987955152988434, - "rewards/unsafe_rewards": -0.43657755851745605, + "logits/chosen": -2.4309935569763184, + "logits/rejected": -2.1530261039733887, + "logps/chosen": -217.1282958984375, + "logps/rejected": -168.8966522216797, + "loss": 12.2909, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": -0.23469574749469757, + "rewards/margins": -0.31583258509635925, + "rewards/rejected": 0.08113676309585571, + "rewards/safe_rewards": -0.3013322949409485, + "rewards/unsafe_rewards": -0.16805927455425262, "step": 1120 }, { "epoch": 0.61, "learning_rate": 1.9961600110577457e-07, - "logits/chosen": -2.055497646331787, - "logits/rejected": -1.8122695684432983, - "logps/chosen": -249.71914672851562, - "logps/rejected": -244.0706024169922, - "loss": 6471.9266, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4234393239021301, - "rewards/margins": 0.09865859895944595, - "rewards/rejected": -0.5220978856086731, - "rewards/safe_rewards": -0.39991191029548645, - "rewards/unsafe_rewards": -0.4469667077064514, + "logits/chosen": -2.349834680557251, + "logits/rejected": -2.1397252082824707, + "logps/chosen": -207.71615600585938, + "logps/rejected": -192.11148071289062, + "loss": 83.4484, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.3409779667854309, + "rewards/margins": -0.09031665325164795, + "rewards/rejected": -0.25066131353378296, + "rewards/safe_rewards": -0.046450722962617874, + "rewards/unsafe_rewards": -0.6355050802230835, "step": 1130 }, { "epoch": 0.61, "learning_rate": 1.950241925429867e-07, - "logits/chosen": -2.1423842906951904, - "logits/rejected": -1.8863725662231445, - "logps/chosen": -242.42935180664062, - "logps/rejected": -222.77133178710938, - "loss": 6285.3195, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": -0.4015336036682129, - "rewards/margins": 0.09903542697429657, - "rewards/rejected": -0.5005691051483154, - "rewards/safe_rewards": -0.4006478786468506, - "rewards/unsafe_rewards": -0.40241941809654236, + "logits/chosen": -2.4354217052459717, + "logits/rejected": -2.2282073497772217, + "logps/chosen": -202.4095458984375, + "logps/rejected": -172.94119262695312, + "loss": 10.2059, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.13358193635940552, + "rewards/margins": 0.09321316331624985, + "rewards/rejected": -0.22679507732391357, + "rewards/safe_rewards": -0.3772156536579132, + "rewards/unsafe_rewards": 0.110051728785038, "step": 1140 }, { "epoch": 0.62, "learning_rate": 1.9045179221505495e-07, - "logits/chosen": -2.0579514503479004, - "logits/rejected": -1.8176921606063843, - "logps/chosen": -267.66314697265625, - "logps/rejected": -238.9211883544922, - "loss": 6360.9383, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.45752495527267456, - "rewards/margins": 0.0931306928396225, - "rewards/rejected": -0.550655722618103, - "rewards/safe_rewards": -0.4510927200317383, - "rewards/unsafe_rewards": -0.4639572501182556, + "logits/chosen": -2.385145664215088, + "logits/rejected": -2.1816518306732178, + "logps/chosen": -222.2650909423828, + "logps/rejected": -183.89297485351562, + "loss": 70.6764, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.3544660210609436, + "rewards/margins": -0.31712788343429565, + "rewards/rejected": -0.037338145077228546, + "rewards/safe_rewards": -0.14638884365558624, + "rewards/unsafe_rewards": -0.5625432729721069, "step": 1150 }, { "epoch": 0.62, "learning_rate": 1.8590041432690893e-07, - "logits/chosen": -1.9864181280136108, - "logits/rejected": -1.7629731893539429, - "logps/chosen": -241.68258666992188, - "logps/rejected": -226.5916748046875, - "loss": 6329.5895, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": -0.497173547744751, - "rewards/margins": 0.08917877823114395, - "rewards/rejected": -0.5863522887229919, - "rewards/safe_rewards": -0.49702000617980957, - "rewards/unsafe_rewards": -0.49732694029808044, + "logits/chosen": -2.3393194675445557, + "logits/rejected": -2.157670736312866, + "logps/chosen": -191.87765502929688, + "logps/rejected": -167.9620819091797, + "loss": 15.8742, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.08758888393640518, + "rewards/margins": 0.09322256594896317, + "rewards/rejected": -0.005633688066154718, + "rewards/safe_rewards": 0.3342171907424927, + "rewards/unsafe_rewards": -0.1590394526720047, "step": 1160 }, { "epoch": 0.63, "learning_rate": 1.813716656618788e-07, - "logits/chosen": -2.0296473503112793, - "logits/rejected": -1.793302297592163, - "logps/chosen": -234.65371704101562, - "logps/rejected": -217.74581909179688, - "loss": 7077.6555, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.49143609404563904, - "rewards/margins": 0.08558019250631332, - "rewards/rejected": -0.577016294002533, - "rewards/safe_rewards": -0.499739408493042, - "rewards/unsafe_rewards": -0.48313283920288086, + "logits/chosen": -2.371502637863159, + "logits/rejected": -2.179802417755127, + "logps/chosen": -185.43954467773438, + "logps/rejected": -159.95692443847656, + "loss": 31.7421, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.07055462151765823, + "rewards/margins": -0.016725819557905197, + "rewards/rejected": 0.08728043735027313, + "rewards/safe_rewards": 0.0999542772769928, + "rewards/unsafe_rewards": 0.041154973208904266, "step": 1170 }, { "epoch": 0.63, "learning_rate": 1.7686714501444788e-07, - "logits/chosen": -2.088947296142578, - "logits/rejected": -1.7341728210449219, - "logps/chosen": -270.2713317871094, - "logps/rejected": -237.0913543701172, - "loss": 6750.0367, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.5015474557876587, - "rewards/margins": 0.09311170130968094, - "rewards/rejected": -0.594659149646759, - "rewards/safe_rewards": -0.5031787157058716, - "rewards/unsafe_rewards": -0.499916136264801, + "logits/chosen": -2.408245086669922, + "logits/rejected": -2.111708402633667, + "logps/chosen": -220.0321807861328, + "logps/rejected": -177.4727783203125, + "loss": 30.3254, + "rewards/accuracies": 0.46875, + "rewards/chosen": 0.08441750705242157, + "rewards/margins": -0.06825534999370575, + "rewards/rejected": 0.15267284214496613, + "rewards/safe_rewards": -0.1991117298603058, + "rewards/unsafe_rewards": 0.36794668436050415, "step": 1180 }, { "epoch": 0.64, "learning_rate": 1.7238844262582768e-07, - "logits/chosen": -2.0681557655334473, - "logits/rejected": -1.87997567653656, - "logps/chosen": -261.2225036621094, - "logps/rejected": -239.202880859375, - "loss": 6169.3504, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.4642775058746338, - "rewards/margins": 0.07037371397018433, - "rewards/rejected": -0.5346512198448181, - "rewards/safe_rewards": -0.4574803411960602, - "rewards/unsafe_rewards": -0.4710747301578522, + "logits/chosen": -2.3922970294952393, + "logits/rejected": -2.2358450889587402, + "logps/chosen": -214.66928100585938, + "logps/rejected": -185.44973754882812, + "loss": 25.5158, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.1254725158214569, + "rewards/margins": -0.16257312893867493, + "rewards/rejected": 0.28804564476013184, + "rewards/safe_rewards": -0.2854710817337036, + "rewards/unsafe_rewards": 0.5364161133766174, "step": 1190 }, { "epoch": 0.65, "learning_rate": 1.679371396225504e-07, - "logits/chosen": -2.0639595985412598, - "logits/rejected": -1.796931266784668, - "logps/chosen": -248.5899658203125, - "logps/rejected": -232.2371368408203, - "loss": 6636.9758, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.4413328170776367, - "rewards/margins": 0.07315496355295181, - "rewards/rejected": -0.5144877433776855, - "rewards/safe_rewards": -0.4346230924129486, - "rewards/unsafe_rewards": -0.44804254174232483, + "logits/chosen": -2.381708860397339, + "logits/rejected": -2.1555020809173584, + "logps/chosen": -204.30628967285156, + "logps/rejected": -180.3300323486328, + "loss": 22.5219, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.1503904014825821, + "rewards/margins": -0.30795037746429443, + "rewards/rejected": 0.45834073424339294, + "rewards/safe_rewards": 0.4240906834602356, + "rewards/unsafe_rewards": -0.12330994755029678, "step": 1200 }, { "epoch": 0.65, "learning_rate": 1.6351480745828096e-07, - "logits/chosen": -2.096024513244629, - "logits/rejected": -1.840306043624878, - "logps/chosen": -239.71939086914062, - "logps/rejected": -224.69808959960938, - "loss": 6078.8516, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.40647679567337036, - "rewards/margins": 0.11829674243927002, - "rewards/rejected": -0.5247735381126404, - "rewards/safe_rewards": -0.4001832604408264, - "rewards/unsafe_rewards": -0.4127703607082367, + "logits/chosen": -2.4050099849700928, + "logits/rejected": -2.1825802326202393, + "logps/chosen": -198.45777893066406, + "logps/rejected": -172.17959594726562, + "loss": 37.0212, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.6139317750930786, + "rewards/margins": 0.5727913975715637, + "rewards/rejected": 0.04114028066396713, + "rewards/safe_rewards": 0.8607079386711121, + "rewards/unsafe_rewards": 0.3671554923057556, "step": 1210 }, { "epoch": 0.66, "learning_rate": 1.5912300735904248e-07, - "logits/chosen": -2.1436526775360107, - "logits/rejected": -1.8284460306167603, - "logps/chosen": -269.2972106933594, - "logps/rejected": -228.4302215576172, - "loss": 6714.8125, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.45736759901046753, - "rewards/margins": 0.08712475001811981, - "rewards/rejected": -0.5444923639297485, - "rewards/safe_rewards": -0.44737473130226135, - "rewards/unsafe_rewards": -0.46736055612564087, + "logits/chosen": -2.4449119567871094, + "logits/rejected": -2.174882173538208, + "logps/chosen": -223.2691192626953, + "logps/rejected": -173.9079132080078, + "loss": 21.6142, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": 0.29134517908096313, + "rewards/margins": 0.21831652522087097, + "rewards/rejected": 0.07302861660718918, + "rewards/safe_rewards": 0.24711818993091583, + "rewards/unsafe_rewards": 0.33557215332984924, "step": 1220 }, { "epoch": 0.66, "learning_rate": 1.5476328977205395e-07, - "logits/chosen": -2.0719504356384277, - "logits/rejected": -1.8282197713851929, - "logps/chosen": -240.5180206298828, - "logps/rejected": -223.3226318359375, - "loss": 6474.3016, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.4578246474266052, - "rewards/margins": 0.12289600074291229, - "rewards/rejected": -0.5807207226753235, - "rewards/safe_rewards": -0.479422003030777, - "rewards/unsafe_rewards": -0.43622738122940063, + "logits/chosen": -2.383089542388916, + "logits/rejected": -2.1814026832580566, + "logps/chosen": -195.18643188476562, + "logps/rejected": -165.39920043945312, + "loss": 279.4912, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.45090776681900024, + "rewards/margins": -0.30230337381362915, + "rewards/rejected": -0.14860430359840393, + "rewards/safe_rewards": 0.5930166840553284, + "rewards/unsafe_rewards": -1.4948322772979736, "step": 1230 }, { "epoch": 0.67, "learning_rate": 1.5043719381837112e-07, - "logits/chosen": -2.0963845252990723, - "logits/rejected": -1.8657476902008057, - "logps/chosen": -265.12103271484375, - "logps/rejected": -245.16470336914062, - "loss": 6423.2137, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.4568420350551605, - "rewards/margins": 0.10198511928319931, - "rewards/rejected": -0.5588272213935852, - "rewards/safe_rewards": -0.44269007444381714, - "rewards/unsafe_rewards": -0.47099393606185913, + "logits/chosen": -2.4133849143981934, + "logits/rejected": -2.2195193767547607, + "logps/chosen": -219.2970428466797, + "logps/rejected": -189.27816772460938, + "loss": 29.5997, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.13977238535881042, + "rewards/margins": 0.13596734404563904, + "rewards/rejected": 0.0038050352595746517, + "rewards/safe_rewards": 0.10270917415618896, + "rewards/unsafe_rewards": 0.17683559656143188, "step": 1240 }, { "epoch": 0.67, "learning_rate": 1.461462467495284e-07, - "logits/chosen": -2.0682883262634277, - "logits/rejected": -1.8371692895889282, - "logps/chosen": -242.6766815185547, - "logps/rejected": -227.84335327148438, - "loss": 6296.4098, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.4713061451911926, - "rewards/margins": 0.12812045216560364, - "rewards/rejected": -0.5994266271591187, - "rewards/safe_rewards": -0.4832456111907959, - "rewards/unsafe_rewards": -0.45936664938926697, + "logits/chosen": -2.3971149921417236, + "logits/rejected": -2.2017760276794434, + "logps/chosen": -195.2748260498047, + "logps/rejected": -168.03221130371094, + "loss": 29.824, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.27122873067855835, + "rewards/margins": 0.40271610021591187, + "rewards/rejected": -0.13148736953735352, + "rewards/safe_rewards": 0.28425708413124084, + "rewards/unsafe_rewards": 0.25820040702819824, "step": 1250 }, { "epoch": 0.68, "learning_rate": 1.4189196340836865e-07, - "logits/chosen": -2.127166986465454, - "logits/rejected": -1.8461487293243408, - "logps/chosen": -246.42178344726562, - "logps/rejected": -222.8474578857422, - "loss": 6083.118, - "rewards/accuracies": 0.5718749761581421, - "rewards/chosen": -0.4719027876853943, - "rewards/margins": 0.09129159897565842, - "rewards/rejected": -0.5631943941116333, - "rewards/safe_rewards": -0.47717374563217163, - "rewards/unsafe_rewards": -0.4666318893432617, + "logits/chosen": -2.4611334800720215, + "logits/rejected": -2.2188827991485596, + "logps/chosen": -199.0708465576172, + "logps/rejected": -166.50717163085938, + "loss": 42.7807, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.1606270670890808, + "rewards/margins": 0.13977651298046112, + "rewards/rejected": 0.020850548520684242, + "rewards/safe_rewards": 0.23887856304645538, + "rewards/unsafe_rewards": 0.08237558603286743, "step": 1260 }, { "epoch": 0.68, "learning_rate": 1.3767584569425561e-07, - "logits/chosen": -2.1966021060943604, - "logits/rejected": -1.8988735675811768, - "logps/chosen": -259.76678466796875, - "logps/rejected": -234.09518432617188, - "loss": 6448.4398, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.44939374923706055, - "rewards/margins": 0.11033139377832413, - "rewards/rejected": -0.5597251653671265, - "rewards/safe_rewards": -0.4581433832645416, - "rewards/unsafe_rewards": -0.44064411520957947, + "logits/chosen": -2.5276553630828857, + "logits/rejected": -2.2806928157806396, + "logps/chosen": -214.76614379882812, + "logps/rejected": -178.0789031982422, + "loss": 7.7411, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.06127176806330681, + "rewards/margins": 0.017500977963209152, + "rewards/rejected": 0.043770790100097656, + "rewards/safe_rewards": 0.12450921535491943, + "rewards/unsafe_rewards": -0.0019656748045235872, "step": 1270 }, { "epoch": 0.69, "learning_rate": 1.334993820328541e-07, - "logits/chosen": -2.136979579925537, - "logits/rejected": -1.8722255229949951, - "logps/chosen": -250.0802001953125, - "logps/rejected": -230.7494659423828, - "loss": 5991.9031, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.45802736282348633, - "rewards/margins": 0.1330554187297821, - "rewards/rejected": -0.5910828113555908, - "rewards/safe_rewards": -0.43893709778785706, - "rewards/unsafe_rewards": -0.4771176874637604, + "logits/chosen": -2.461317539215088, + "logits/rejected": -2.2503418922424316, + "logps/chosen": -204.41952514648438, + "logps/rejected": -171.56008911132812, + "loss": 88.8508, + "rewards/accuracies": 0.453125, + "rewards/chosen": -0.142044335603714, + "rewards/margins": -0.2231227457523346, + "rewards/rejected": 0.0810784175992012, + "rewards/safe_rewards": -0.2867421507835388, + "rewards/unsafe_rewards": 0.002653457224369049, "step": 1280 }, { "epoch": 0.69, "learning_rate": 1.2936404685066852e-07, - "logits/chosen": -2.0627663135528564, - "logits/rejected": -1.8343334197998047, - "logps/chosen": -250.0009307861328, - "logps/rejected": -234.5965118408203, - "loss": 6591.9164, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.44661563634872437, - "rewards/margins": 0.09103361517190933, - "rewards/rejected": -0.5376492738723755, - "rewards/safe_rewards": -0.4586601257324219, - "rewards/unsafe_rewards": -0.43457117676734924, + "logits/chosen": -2.3843283653259277, + "logits/rejected": -2.1979799270629883, + "logps/chosen": -205.46273803710938, + "logps/rejected": -180.91793823242188, + "loss": 66.3165, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.1233854666352272, + "rewards/margins": -0.03703648969531059, + "rewards/rejected": -0.0863489955663681, + "rewards/safe_rewards": -0.44312816858291626, + "rewards/unsafe_rewards": 0.19635725021362305, "step": 1290 }, { "epoch": 0.7, "learning_rate": 1.252713000545221e-07, - "logits/chosen": -2.161848545074463, - "logits/rejected": -1.8758856058120728, - "logps/chosen": -255.42941284179688, - "logps/rejected": -227.38516235351562, - "loss": 5795.3254, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.43463054299354553, - "rewards/margins": 0.10909499228000641, - "rewards/rejected": -0.5437254309654236, - "rewards/safe_rewards": -0.43680882453918457, - "rewards/unsafe_rewards": -0.43245211243629456, + "logits/chosen": -2.455895185470581, + "logits/rejected": -2.2126731872558594, + "logps/chosen": -211.90866088867188, + "logps/rejected": -172.7696533203125, + "loss": 8.9746, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.057679928839206696, + "rewards/margins": -0.1852763295173645, + "rewards/rejected": 0.2429562509059906, + "rewards/safe_rewards": 0.14237050712108612, + "rewards/unsafe_rewards": -0.027010658755898476, "step": 1300 }, { "epoch": 0.7, "learning_rate": 1.2122258651616304e-07, - "logits/chosen": -2.141693115234375, - "logits/rejected": -1.872849702835083, - "logps/chosen": -255.8201141357422, - "logps/rejected": -228.45068359375, - "loss": 6160.4359, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4597500264644623, - "rewards/margins": 0.08634473383426666, - "rewards/rejected": -0.5460947751998901, - "rewards/safe_rewards": -0.47202786803245544, - "rewards/unsafe_rewards": -0.44747209548950195, + "logits/chosen": -2.445269823074341, + "logits/rejected": -2.224661350250244, + "logps/chosen": -209.90713500976562, + "logps/rejected": -173.6033935546875, + "loss": 63.3258, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.0619967095553875, + "rewards/margins": -0.29980406165122986, + "rewards/rejected": 0.23780739307403564, + "rewards/safe_rewards": -0.2617853283882141, + "rewards/unsafe_rewards": 0.13779191672801971, "step": 1310 }, { "epoch": 0.71, "learning_rate": 1.1721933556217792e-07, - "logits/chosen": -2.105320453643799, - "logits/rejected": -1.879272699356079, - "logps/chosen": -239.11795043945312, - "logps/rejected": -230.5526885986328, - "loss": 6437.4898, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.43065792322158813, - "rewards/margins": 0.1191934198141098, - "rewards/rejected": -0.5498512983322144, - "rewards/safe_rewards": -0.4450756907463074, - "rewards/unsafe_rewards": -0.4162401258945465, + "logits/chosen": -2.4175376892089844, + "logits/rejected": -2.23214054107666, + "logps/chosen": -195.77786254882812, + "logps/rejected": -175.40225219726562, + "loss": 11.5399, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.27427220344543457, + "rewards/margins": 0.10898621380329132, + "rewards/rejected": 0.16528600454330444, + "rewards/safe_rewards": 0.21214981377124786, + "rewards/unsafe_rewards": 0.3363945782184601, "step": 1320 }, { "epoch": 0.72, "learning_rate": 1.1326296046939333e-07, - "logits/chosen": -2.050401210784912, - "logits/rejected": -1.7805709838867188, - "logps/chosen": -227.81747436523438, - "logps/rejected": -207.8745574951172, - "loss": 5906.4801, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.43025341629981995, - "rewards/margins": 0.11153332889080048, - "rewards/rejected": -0.5417866706848145, - "rewards/safe_rewards": -0.41498079895973206, - "rewards/unsafe_rewards": -0.4455259442329407, + "logits/chosen": -2.3801956176757812, + "logits/rejected": -2.162496328353882, + "logps/chosen": -184.91856384277344, + "logps/rejected": -153.4582061767578, + "loss": 63.4268, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.12642014026641846, + "rewards/margins": -0.3640880286693573, + "rewards/rejected": 0.23766788840293884, + "rewards/safe_rewards": -0.23899023234844208, + "rewards/unsafe_rewards": -0.013849982991814613, "step": 1330 }, { "epoch": 0.72, "learning_rate": 1.0935485796594351e-07, - "logits/chosen": -2.155824661254883, - "logits/rejected": -1.8547401428222656, - "logps/chosen": -268.284423828125, - "logps/rejected": -232.23007202148438, - "loss": 6847.3039, - "rewards/accuracies": 0.6156250238418579, - "rewards/chosen": -0.4572017788887024, - "rewards/margins": 0.10400652885437012, - "rewards/rejected": -0.5612083673477173, - "rewards/safe_rewards": -0.4599490165710449, - "rewards/unsafe_rewards": -0.4544545114040375, + "logits/chosen": -2.4861056804656982, + "logits/rejected": -2.239741086959839, + "logps/chosen": -222.3768310546875, + "logps/rejected": -176.0164337158203, + "loss": 21.3914, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.18739810585975647, + "rewards/margins": 0.09455545991659164, + "rewards/rejected": 0.09284263849258423, + "rewards/safe_rewards": 0.19133667647838593, + "rewards/unsafe_rewards": 0.1834595501422882, "step": 1340 }, { "epoch": 0.73, "learning_rate": 1.0549640773818028e-07, - "logits/chosen": -2.066615581512451, - "logits/rejected": -1.831392526626587, - "logps/chosen": -254.32418823242188, - "logps/rejected": -215.6810760498047, - "loss": 6832.7969, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.49353137612342834, - "rewards/margins": 0.0739959180355072, - "rewards/rejected": -0.5675273537635803, - "rewards/safe_rewards": -0.505325198173523, - "rewards/unsafe_rewards": -0.4817374646663666, + "logits/chosen": -2.4289638996124268, + "logits/rejected": -2.237046003341675, + "logps/chosen": -204.95181274414062, + "logps/rejected": -158.824951171875, + "loss": 8.4938, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.01927146315574646, + "rewards/margins": -0.0841434970498085, + "rewards/rejected": 0.10341496765613556, + "rewards/safe_rewards": 0.04857074096798897, + "rewards/unsafe_rewards": -0.010027825832366943, "step": 1350 }, { "epoch": 0.73, "learning_rate": 1.0168897194359921e-07, - "logits/chosen": -2.1050429344177246, - "logits/rejected": -1.8025562763214111, - "logps/chosen": -274.6943054199219, - "logps/rejected": -244.18777465820312, - "loss": 6459.3121, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.523662805557251, - "rewards/margins": 0.08569201081991196, - "rewards/rejected": -0.6093548536300659, - "rewards/safe_rewards": -0.5233367681503296, - "rewards/unsafe_rewards": -0.5239888429641724, + "logits/chosen": -2.4466030597686768, + "logits/rejected": -2.194831132888794, + "logps/chosen": -222.03775024414062, + "logps/rejected": -183.56564331054688, + "loss": 82.0212, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.2902601659297943, + "rewards/margins": 0.6036115884780884, + "rewards/rejected": -0.3133513927459717, + "rewards/safe_rewards": 0.4282899498939514, + "rewards/unsafe_rewards": 0.152230367064476, "step": 1360 }, { "epoch": 0.74, "learning_rate": 9.793389472995392e-08, - "logits/chosen": -2.055671453475952, - "logits/rejected": -1.7598520517349243, - "logps/chosen": -256.25750732421875, - "logps/rejected": -225.249267578125, - "loss": 5955.8453, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.4657188951969147, - "rewards/margins": 0.12058509886264801, - "rewards/rejected": -0.5863040089607239, - "rewards/safe_rewards": -0.44779258966445923, - "rewards/unsafe_rewards": -0.4836452007293701, + "logits/chosen": -2.4077987670898438, + "logits/rejected": -2.1739821434020996, + "logps/chosen": -209.699951171875, + "logps/rejected": -166.60293579101562, + "loss": 11.3477, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": -0.014321346767246723, + "rewards/margins": -0.030286794528365135, + "rewards/rejected": 0.015965450555086136, + "rewards/safe_rewards": 0.03987512364983559, + "rewards/unsafe_rewards": -0.06851781159639359, "step": 1370 }, { "epoch": 0.74, "learning_rate": 9.423250176072874e-08, - "logits/chosen": -2.042423725128174, - "logits/rejected": -1.7933346033096313, - "logps/chosen": -229.9665985107422, - "logps/rejected": -209.86416625976562, - "loss": 7156.8875, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4851716160774231, - "rewards/margins": 0.07350887358188629, - "rewards/rejected": -0.5586804747581482, - "rewards/safe_rewards": -0.48476117849349976, - "rewards/unsafe_rewards": -0.4855819642543793, + "logits/chosen": -2.401275634765625, + "logits/rejected": -2.192737340927124, + "logps/chosen": -181.48147583007812, + "logps/rejected": -154.23431396484375, + "loss": 15.9486, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.0320340096950531, + "rewards/margins": 0.20615582168102264, + "rewards/rejected": -0.23818981647491455, + "rewards/safe_rewards": 0.015104318037629128, + "rewards/unsafe_rewards": -0.07917235046625137, "step": 1380 }, { "epoch": 0.75, "learning_rate": 9.058609974713654e-08, - "logits/chosen": -2.1118245124816895, - "logits/rejected": -1.7858221530914307, - "logps/chosen": -252.89590454101562, - "logps/rejected": -229.27572631835938, - "loss": 6091.018, - "rewards/accuracies": 0.6031249761581421, - "rewards/chosen": -0.46690288186073303, - "rewards/margins": 0.10783787816762924, - "rewards/rejected": -0.5747407674789429, - "rewards/safe_rewards": -0.4574856758117676, - "rewards/unsafe_rewards": -0.4763200283050537, + "logits/chosen": -2.4539401531219482, + "logits/rejected": -2.1792826652526855, + "logps/chosen": -206.2873992919922, + "logps/rejected": -171.813232421875, + "loss": 27.4047, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.08179975301027298, + "rewards/margins": -0.07024437934160233, + "rewards/rejected": -0.011555373668670654, + "rewards/safe_rewards": 0.006575888488441706, + "rewards/unsafe_rewards": -0.17017540335655212, "step": 1390 }, { "epoch": 0.75, "learning_rate": 8.699597598680753e-08, - "logits/chosen": -2.0331668853759766, - "logits/rejected": -1.7570956945419312, - "logps/chosen": -229.2872314453125, - "logps/rejected": -227.269287109375, - "loss": 5937.1336, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.4522533416748047, - "rewards/margins": 0.12965361773967743, - "rewards/rejected": -0.5819069743156433, - "rewards/safe_rewards": -0.4643983840942383, - "rewards/unsafe_rewards": -0.44010835886001587, + "logits/chosen": -2.3884987831115723, + "logits/rejected": -2.1706833839416504, + "logps/chosen": -183.61544799804688, + "logps/rejected": -168.7871856689453, + "loss": 34.4575, + "rewards/accuracies": 0.4937500059604645, + "rewards/chosen": 0.44646185636520386, + "rewards/margins": 0.1550489217042923, + "rewards/rejected": 0.29141297936439514, + "rewards/safe_rewards": 0.32389289140701294, + "rewards/unsafe_rewards": 0.5690308809280396, "step": 1400 }, { "epoch": 0.76, "learning_rate": 8.346339790933166e-08, - "logits/chosen": -2.1295323371887207, - "logits/rejected": -1.8313289880752563, - "logps/chosen": -246.0663299560547, - "logps/rejected": -216.1509552001953, - "loss": 6727.4266, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": -0.4585806727409363, - "rewards/margins": 0.10392139852046967, - "rewards/rejected": -0.5625020265579224, - "rewards/safe_rewards": -0.45531362295150757, - "rewards/unsafe_rewards": -0.461847722530365, + "logits/chosen": -2.4721839427948, + "logits/rejected": -2.2297019958496094, + "logps/chosen": -200.0784149169922, + "logps/rejected": -159.7423858642578, + "loss": 6.7397, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": 0.12982772290706635, + "rewards/margins": -0.028519075363874435, + "rewards/rejected": 0.15834678709506989, + "rewards/safe_rewards": -0.03010488487780094, + "rewards/unsafe_rewards": 0.2897603511810303, "step": 1410 }, { "epoch": 0.76, "learning_rate": 7.998961262881506e-08, - "logits/chosen": -2.066490650177002, - "logits/rejected": -1.7475519180297852, - "logps/chosen": -263.6619567871094, - "logps/rejected": -227.02685546875, - "loss": 6193.2656, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": -0.42703723907470703, - "rewards/margins": 0.11379702389240265, - "rewards/rejected": -0.5408341884613037, - "rewards/safe_rewards": -0.42232245206832886, - "rewards/unsafe_rewards": -0.43175190687179565, + "logits/chosen": -2.418222665786743, + "logits/rejected": -2.1581873893737793, + "logps/chosen": -220.6064453125, + "logps/rejected": -172.82266235351562, + "loss": 6.4288, + "rewards/accuracies": 0.559374988079071, + "rewards/chosen": 0.35179823637008667, + "rewards/margins": 0.23103070259094238, + "rewards/rejected": 0.12076754868030548, + "rewards/safe_rewards": 0.3257552981376648, + "rewards/unsafe_rewards": 0.37784117460250854, "step": 1420 }, { "epoch": 0.77, "learning_rate": 7.657584650360846e-08, - "logits/chosen": -2.0444042682647705, - "logits/rejected": -1.795877456665039, - "logps/chosen": -242.45947265625, - "logps/rejected": -228.25601196289062, - "loss": 6729.2211, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.43044179677963257, - "rewards/margins": 0.12840771675109863, - "rewards/rejected": -0.5588495135307312, - "rewards/safe_rewards": -0.4417741894721985, - "rewards/unsafe_rewards": -0.4191093444824219, + "logits/chosen": -2.396697521209717, + "logits/rejected": -2.2003862857818604, + "logps/chosen": -199.44009399414062, + "logps/rejected": -172.6617431640625, + "loss": 35.7268, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.024801719933748245, + "rewards/margins": 0.26587414741516113, + "rewards/rejected": -0.2906758785247803, + "rewards/safe_rewards": 0.05263688042759895, + "rewards/unsafe_rewards": -0.10224030166864395, "step": 1430 }, { "epoch": 0.77, "learning_rate": 7.322330470336313e-08, - "logits/chosen": -2.052246570587158, - "logits/rejected": -1.7939999103546143, - "logps/chosen": -232.52175903320312, - "logps/rejected": -225.86880493164062, - "loss": 6284.6258, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.42558392882347107, - "rewards/margins": 0.13375507295131683, - "rewards/rejected": -0.5593389272689819, - "rewards/safe_rewards": -0.4226798117160797, - "rewards/unsafe_rewards": -0.4284881055355072, + "logits/chosen": -2.3913733959198, + "logits/rejected": -2.189946413040161, + "logps/chosen": -190.08120727539062, + "logps/rejected": -170.0216522216797, + "loss": 9.582, + "rewards/accuracies": 0.546875, + "rewards/chosen": -0.11784199625253677, + "rewards/margins": -0.031096214428544044, + "rewards/rejected": -0.08674577623605728, + "rewards/safe_rewards": -0.293480783700943, + "rewards/unsafe_rewards": 0.05779681354761124, "step": 1440 }, { "epoch": 0.78, "learning_rate": 6.993317078356709e-08, - "logits/chosen": -2.0569279193878174, - "logits/rejected": -1.8582913875579834, - "logps/chosen": -244.96859741210938, - "logps/rejected": -223.7991485595703, - "loss": 6497.4699, - "rewards/accuracies": 0.5625, - "rewards/chosen": -0.460013210773468, - "rewards/margins": 0.07620026171207428, - "rewards/rejected": -0.5362134575843811, - "rewards/safe_rewards": -0.47913751006126404, - "rewards/unsafe_rewards": -0.4408888816833496, + "logits/chosen": -2.3910608291625977, + "logits/rejected": -2.2192938327789307, + "logps/chosen": -199.07406616210938, + "logps/rejected": -170.1977996826172, + "loss": 45.9652, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.10676655918359756, + "rewards/margins": -0.08676379173994064, + "rewards/rejected": -0.02000277489423752, + "rewards/safe_rewards": -0.0720798522233963, + "rewards/unsafe_rewards": -0.14145328104496002, "step": 1450 }, { "epoch": 0.79, "learning_rate": 6.67066062677118e-08, - "logits/chosen": -2.1144156455993652, - "logits/rejected": -1.8631536960601807, - "logps/chosen": -254.3730010986328, - "logps/rejected": -221.9481201171875, - "loss": 6792.3, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.456630140542984, - "rewards/margins": 0.08420534431934357, - "rewards/rejected": -0.5408354997634888, - "rewards/safe_rewards": -0.4593842029571533, - "rewards/unsafe_rewards": -0.4538760185241699, + "logits/chosen": -2.4357597827911377, + "logits/rejected": -2.2244791984558105, + "logps/chosen": -208.4618682861328, + "logps/rejected": -167.52764892578125, + "loss": 20.8808, + "rewards/accuracies": 0.48124998807907104, + "rewards/chosen": 0.2480795830488205, + "rewards/margins": -0.08884197473526001, + "rewards/rejected": 0.3369216322898865, + "rewards/safe_rewards": 0.1537085473537445, + "rewards/unsafe_rewards": 0.34245067834854126, "step": 1460 }, { "epoch": 0.79, "learning_rate": 6.354475023723685e-08, - "logits/chosen": -2.077465534210205, - "logits/rejected": -1.8094123601913452, - "logps/chosen": -262.92901611328125, - "logps/rejected": -229.3378448486328, - "loss": 6367.159, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.457561731338501, - "rewards/margins": 0.11682651191949844, - "rewards/rejected": -0.5743882060050964, - "rewards/safe_rewards": -0.45188575983047485, - "rewards/unsafe_rewards": -0.4632377028465271, + "logits/chosen": -2.3960747718811035, + "logits/rejected": -2.1642906665802, + "logps/chosen": -216.65756225585938, + "logps/rejected": -171.6775665283203, + "loss": 59.1855, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.5152679681777954, + "rewards/margins": 0.2937913239002228, + "rewards/rejected": 0.22147664427757263, + "rewards/safe_rewards": 0.2597041726112366, + "rewards/unsafe_rewards": 0.7708317041397095, "step": 1470 }, { "epoch": 0.8, "learning_rate": 6.044871892939746e-08, - "logits/chosen": -2.107585906982422, - "logits/rejected": -1.8727686405181885, - "logps/chosen": -272.7564392089844, - "logps/rejected": -245.385498046875, - "loss": 6379.8508, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.47117677330970764, - "rewards/margins": 0.0936112180352211, - "rewards/rejected": -0.5647879242897034, - "rewards/safe_rewards": -0.46114739775657654, - "rewards/unsafe_rewards": -0.4812060296535492, + "logits/chosen": -2.4158756732940674, + "logits/rejected": -2.2148139476776123, + "logps/chosen": -225.4951171875, + "logps/rejected": -189.0193328857422, + "loss": 30.3887, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.14366625249385834, + "rewards/margins": 0.25630486011505127, + "rewards/rejected": -0.11263859272003174, + "rewards/safe_rewards": 0.08603324741125107, + "rewards/unsafe_rewards": 0.20129923522472382, "step": 1480 }, { "epoch": 0.8, "learning_rate": 5.741960534319676e-08, - "logits/chosen": -2.0843305587768555, - "logits/rejected": -1.864058494567871, - "logps/chosen": -237.5848846435547, - "logps/rejected": -216.74258422851562, - "loss": 6329.8926, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4688313901424408, - "rewards/margins": 0.09106676280498505, - "rewards/rejected": -0.5598980784416199, - "rewards/safe_rewards": -0.48697489500045776, - "rewards/unsafe_rewards": -0.45068782567977905, + "logits/chosen": -2.391890525817871, + "logits/rejected": -2.2089953422546387, + "logps/chosen": -190.7472686767578, + "logps/rejected": -160.5789031982422, + "loss": 29.5828, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.04550771787762642, + "rewards/margins": -0.21935884654521942, + "rewards/rejected": 0.1738511323928833, + "rewards/safe_rewards": -0.12179826200008392, + "rewards/unsafe_rewards": 0.030782824382185936, "step": 1490 }, { "epoch": 0.81, "learning_rate": 5.44584788535217e-08, - "logits/chosen": -2.1008994579315186, - "logits/rejected": -1.84600031375885, - "logps/chosen": -259.45599365234375, - "logps/rejected": -233.75082397460938, - "loss": 5925.3711, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.464712917804718, - "rewards/margins": 0.1044440045952797, - "rewards/rejected": -0.56915682554245, - "rewards/safe_rewards": -0.47588276863098145, - "rewards/unsafe_rewards": -0.4535430073738098, + "logits/chosen": -2.4144439697265625, + "logits/rejected": -2.209897994995117, + "logps/chosen": -213.13632202148438, + "logps/rejected": -176.97024536132812, + "loss": 15.9924, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.15160061419010162, + "rewards/margins": -0.01647660695016384, + "rewards/rejected": -0.13512399792671204, + "rewards/safe_rewards": -0.37716132402420044, + "rewards/unsafe_rewards": 0.07396010309457779, "step": 1500 }, { "epoch": 0.81, - "eval_logits/chosen": -1.6828917264938354, - "eval_logits/rejected": -1.3719456195831299, - "eval_logps/chosen": -201.3945770263672, - "eval_logps/rejected": -161.33763122558594, - "eval_loss": 3026.4443359375, - "eval_rewards/accuracies": 0.46672314405441284, - "eval_rewards/chosen": -0.7052872776985168, - "eval_rewards/margins": -0.01542994100600481, - "eval_rewards/rejected": -0.6898572444915771, - "eval_rewards/safe_rewards": -0.705478847026825, - "eval_rewards/unsafe_rewards": -0.7033840417861938, - "eval_runtime": 1797.98, - "eval_samples_per_second": 18.378, - "eval_steps_per_second": 1.149, + "eval_logits/chosen": -2.084881544113159, + "eval_logits/rejected": -1.833509922027588, + "eval_logps/chosen": -131.02365112304688, + "eval_logps/rejected": -92.45955657958984, + "eval_loss": 0.6823093295097351, + "eval_rewards/accuracies": 0.4713214039802551, + "eval_rewards/chosen": -0.1577797532081604, + "eval_rewards/margins": -0.05011267587542534, + "eval_rewards/rejected": -0.10766706615686417, + "eval_rewards/safe_rewards": -0.15565218031406403, + "eval_rewards/unsafe_rewards": -0.15351000428199768, + "eval_runtime": 1880.4558, + "eval_samples_per_second": 17.572, + "eval_steps_per_second": 1.099, "step": 1500 }, { "epoch": 0.81, "learning_rate": 5.156638483361933e-08, - "logits/chosen": -2.107635021209717, - "logits/rejected": -1.8341983556747437, - "logps/chosen": -250.5362091064453, - "logps/rejected": -231.42312622070312, - "loss": 6448.6469, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.43800896406173706, - "rewards/margins": 0.13183660805225372, - "rewards/rejected": -0.5698455572128296, - "rewards/safe_rewards": -0.4343613088130951, - "rewards/unsafe_rewards": -0.44165658950805664, + "logits/chosen": -2.435300827026367, + "logits/rejected": -2.1943700313568115, + "logps/chosen": -206.97384643554688, + "logps/rejected": -174.73373413085938, + "loss": 6.0946, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": -0.2385288029909134, + "rewards/margins": 0.05665416270494461, + "rewards/rejected": -0.295183002948761, + "rewards/safe_rewards": -0.1594325453042984, + "rewards/unsafe_rewards": -0.3176250755786896, "step": 1510 }, { "epoch": 0.82, "learning_rate": 4.8744344286046236e-08, - "logits/chosen": -2.0658557415008545, - "logits/rejected": -1.8039467334747314, - "logps/chosen": -254.8303680419922, - "logps/rejected": -224.9630584716797, - "loss": 6587.8852, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.4746926724910736, - "rewards/margins": 0.08673126995563507, - "rewards/rejected": -0.5614239573478699, - "rewards/safe_rewards": -0.4753999710083008, - "rewards/unsafe_rewards": -0.4739854335784912, + "logits/chosen": -2.4003233909606934, + "logits/rejected": -2.177899122238159, + "logps/chosen": -207.0956573486328, + "logps/rejected": -169.01504516601562, + "loss": 45.2569, + "rewards/accuracies": 0.4906249940395355, + "rewards/chosen": 0.26545172929763794, + "rewards/margins": 0.4598563611507416, + "rewards/rejected": -0.19440460205078125, + "rewards/safe_rewards": 0.313471257686615, + "rewards/unsafe_rewards": 0.21743226051330566, "step": 1520 }, { "epoch": 0.82, "learning_rate": 4.599335348222169e-08, - "logits/chosen": -2.104322910308838, - "logits/rejected": -1.8812888860702515, - "logps/chosen": -253.0201416015625, - "logps/rejected": -242.67636108398438, - "loss": 6177.6988, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4594038128852844, - "rewards/margins": 0.10736414045095444, - "rewards/rejected": -0.5667679309844971, - "rewards/safe_rewards": -0.45568904280662537, - "rewards/unsafe_rewards": -0.46311864256858826, + "logits/chosen": -2.4335553646087646, + "logits/rejected": -2.246596574783325, + "logps/chosen": -207.1642608642578, + "logps/rejected": -186.18722534179688, + "loss": 5.5153, + "rewards/accuracies": 0.515625, + "rewards/chosen": -0.0844786986708641, + "rewards/margins": 0.10317480564117432, + "rewards/rejected": -0.1876535415649414, + "rewards/safe_rewards": -0.22987417876720428, + "rewards/unsafe_rewards": 0.06091681867837906, "step": 1530 }, { "epoch": 0.83, "learning_rate": 4.331438361071163e-08, - "logits/chosen": -2.03129243850708, - "logits/rejected": -1.8628809452056885, - "logps/chosen": -258.9869079589844, - "logps/rejected": -249.06881713867188, - "loss": 6718.5445, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": -0.4645802080631256, - "rewards/margins": 0.08629541099071503, - "rewards/rejected": -0.5508756637573242, - "rewards/safe_rewards": -0.45355597138404846, - "rewards/unsafe_rewards": -0.4756045341491699, + "logits/chosen": -2.3511147499084473, + "logits/rejected": -2.206602096557617, + "logps/chosen": -212.7078857421875, + "logps/rejected": -194.2686767578125, + "loss": 21.5544, + "rewards/accuracies": 0.47187501192092896, + "rewards/chosen": -0.17899686098098755, + "rewards/margins": 0.10844133794307709, + "rewards/rejected": -0.28743821382522583, + "rewards/safe_rewards": -0.30973827838897705, + "rewards/unsafe_rewards": -0.04825545474886894, "step": 1540 }, { "epoch": 0.83, "learning_rate": 4.0708380434367864e-08, - "logits/chosen": -2.1100175380706787, - "logits/rejected": -1.8269599676132202, - "logps/chosen": -244.19241333007812, - "logps/rejected": -228.1028594970703, - "loss": 6020.5559, - "rewards/accuracies": 0.6343749761581421, - "rewards/chosen": -0.4490428566932678, - "rewards/margins": 0.117251917719841, - "rewards/rejected": -0.5662947297096252, - "rewards/safe_rewards": -0.4383305609226227, - "rewards/unsafe_rewards": -0.4597550928592682, + "logits/chosen": -2.4302127361297607, + "logits/rejected": -2.1903905868530273, + "logps/chosen": -199.45376586914062, + "logps/rejected": -171.4712677001953, + "loss": 11.7548, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.16563379764556885, + "rewards/margins": -0.16774305701255798, + "rewards/rejected": 0.0021092891693115234, + "rewards/safe_rewards": -0.17964015901088715, + "rewards/unsafe_rewards": -0.15162742137908936, "step": 1550 }, { "epoch": 0.84, "learning_rate": 3.817626395644305e-08, - "logits/chosen": -2.1250922679901123, - "logits/rejected": -1.9055827856063843, - "logps/chosen": -252.9553985595703, - "logps/rejected": -230.5366668701172, - "loss": 6467.1945, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.4705273509025574, - "rewards/margins": 0.061760313808918, - "rewards/rejected": -0.5322877168655396, - "rewards/safe_rewards": -0.46299856901168823, - "rewards/unsafe_rewards": -0.4780561327934265, + "logits/chosen": -2.428711414337158, + "logits/rejected": -2.232553005218506, + "logps/chosen": -206.1396942138672, + "logps/rejected": -177.48374938964844, + "loss": 20.469, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.2369929999113083, + "rewards/margins": -0.06113150715827942, + "rewards/rejected": -0.17586149275302887, + "rewards/safe_rewards": -0.15375518798828125, + "rewards/unsafe_rewards": -0.32023078203201294, "step": 1560 }, { "epoch": 0.84, "learning_rate": 3.571892809580013e-08, - "logits/chosen": -2.091125965118408, - "logits/rejected": -1.8502193689346313, - "logps/chosen": -240.99044799804688, - "logps/rejected": -229.93167114257812, - "loss": 6833.8594, - "rewards/accuracies": 0.565625011920929, - "rewards/chosen": -0.45763105154037476, - "rewards/margins": 0.08750450611114502, - "rewards/rejected": -0.5451356172561646, - "rewards/safe_rewards": -0.4449933171272278, - "rewards/unsafe_rewards": -0.4702689051628113, + "logits/chosen": -2.395301580429077, + "logits/rejected": -2.1873881816864014, + "logps/chosen": -195.25765991210938, + "logps/rejected": -175.76754760742188, + "loss": 47.6931, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": -0.030344385653734207, + "rewards/margins": 0.31907138228416443, + "rewards/rejected": -0.34941577911376953, + "rewards/safe_rewards": -0.09232734888792038, + "rewards/unsafe_rewards": 0.03163857385516167, "step": 1570 }, { "epoch": 0.85, "learning_rate": 3.333724037132976e-08, - "logits/chosen": -2.106806516647339, - "logits/rejected": -1.8567653894424438, - "logps/chosen": -245.6099853515625, - "logps/rejected": -225.84719848632812, - "loss": 6472.2758, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.47156161069869995, - "rewards/margins": 0.07817086577415466, - "rewards/rejected": -0.549732506275177, - "rewards/safe_rewards": -0.4597776532173157, - "rewards/unsafe_rewards": -0.4833455979824066, + "logits/chosen": -2.4109716415405273, + "logits/rejected": -2.1891541481018066, + "logps/chosen": -198.6385040283203, + "logps/rejected": -170.99563598632812, + "loss": 5.2251, + "rewards/accuracies": 0.4281249940395355, + "rewards/chosen": -0.1846873015165329, + "rewards/margins": -0.06299707293510437, + "rewards/rejected": -0.12169022858142853, + "rewards/safe_rewards": -0.1805131733417511, + "rewards/unsafe_rewards": -0.18886145949363708, "step": 1580 }, { "epoch": 0.86, "learning_rate": 3.1032041595688506e-08, - "logits/chosen": -2.086425304412842, - "logits/rejected": -1.8307304382324219, - "logps/chosen": -259.83050537109375, - "logps/rejected": -240.1065673828125, - "loss": 6203.0508, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.43678250908851624, - "rewards/margins": 0.10790238529443741, - "rewards/rejected": -0.544684886932373, - "rewards/safe_rewards": -0.4430006444454193, - "rewards/unsafe_rewards": -0.4305643141269684, + "logits/chosen": -2.3785929679870605, + "logits/rejected": -2.171466827392578, + "logps/chosen": -216.2442626953125, + "logps/rejected": -185.38406372070312, + "loss": 21.8766, + "rewards/accuracies": 0.4312500059604645, + "rewards/chosen": -0.09203994274139404, + "rewards/margins": -0.34605592489242554, + "rewards/rejected": 0.2540159523487091, + "rewards/safe_rewards": -0.014932965859770775, + "rewards/unsafe_rewards": -0.16914694011211395, "step": 1590 }, { "epoch": 0.86, "learning_rate": 2.880414557846453e-08, - "logits/chosen": -2.116537570953369, - "logits/rejected": -1.9354660511016846, - "logps/chosen": -241.5932159423828, - "logps/rejected": -219.1776123046875, - "loss": 5802.0645, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.4176766276359558, - "rewards/margins": 0.13016578555107117, - "rewards/rejected": -0.5478425025939941, - "rewards/safe_rewards": -0.4161531925201416, - "rewards/unsafe_rewards": -0.4192000925540924, + "logits/chosen": -2.4211525917053223, + "logits/rejected": -2.259765863418579, + "logps/chosen": -200.02296447753906, + "logps/rejected": -164.5922393798828, + "loss": 78.4789, + "rewards/accuracies": 0.503125011920929, + "rewards/chosen": -0.19741728901863098, + "rewards/margins": 0.0014421313535422087, + "rewards/rejected": -0.19885942339897156, + "rewards/safe_rewards": -0.07222781330347061, + "rewards/unsafe_rewards": -0.32260677218437195, "step": 1600 }, { "epoch": 0.87, "learning_rate": 2.6654338838876662e-08, - "logits/chosen": -2.127192497253418, - "logits/rejected": -1.8054416179656982, - "logps/chosen": -251.4619598388672, - "logps/rejected": -217.2257537841797, - "loss": 6674.4672, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.4491957128047943, - "rewards/margins": 0.10260852426290512, - "rewards/rejected": -0.5518041849136353, - "rewards/safe_rewards": -0.45763081312179565, - "rewards/unsafe_rewards": -0.44076067209243774, + "logits/chosen": -2.4327399730682373, + "logits/rejected": -2.1489098072052, + "logps/chosen": -206.57406616210938, + "logps/rejected": -162.18191528320312, + "loss": 31.8357, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.03170815855264664, + "rewards/margins": 0.1048579216003418, + "rewards/rejected": -0.13656608760356903, + "rewards/safe_rewards": -0.28655681014060974, + "rewards/unsafe_rewards": 0.22314047813415527, "step": 1610 }, { "epoch": 0.87, "learning_rate": 2.4583380328107805e-08, - "logits/chosen": -2.1055760383605957, - "logits/rejected": -1.831613540649414, - "logps/chosen": -264.2273864746094, - "logps/rejected": -228.84228515625, - "loss": 6595.832, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4543065130710602, - "rewards/margins": 0.09187433123588562, - "rewards/rejected": -0.5461808443069458, - "rewards/safe_rewards": -0.4405900835990906, - "rewards/unsafe_rewards": -0.46802282333374023, + "logits/chosen": -2.4065799713134766, + "logits/rejected": -2.168668508529663, + "logps/chosen": -219.0827178955078, + "logps/rejected": -174.2585906982422, + "loss": 19.5756, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.28594768047332764, + "rewards/margins": -0.2515542805194855, + "rewards/rejected": -0.034393392503261566, + "rewards/safe_rewards": -0.40954461693763733, + "rewards/unsafe_rewards": -0.16235077381134033, "step": 1620 }, { "epoch": 0.88, "learning_rate": 2.259200116137039e-08, - "logits/chosen": -2.0728936195373535, - "logits/rejected": -1.8516597747802734, - "logps/chosen": -248.5038299560547, - "logps/rejected": -240.140625, - "loss": 6611.8016, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.43512576818466187, - "rewards/margins": 0.09586648643016815, - "rewards/rejected": -0.530992329120636, - "rewards/safe_rewards": -0.42405596375465393, - "rewards/unsafe_rewards": -0.4461956024169922, + "logits/chosen": -2.381093740463257, + "logits/rejected": -2.1939659118652344, + "logps/chosen": -204.22921752929688, + "logps/rejected": -187.06576538085938, + "loss": 169.2986, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.762048602104187, + "rewards/margins": 0.7863933444023132, + "rewards/rejected": -0.024344712495803833, + "rewards/safe_rewards": 0.9233208894729614, + "rewards/unsafe_rewards": 0.6007765531539917, "step": 1630 }, { "epoch": 0.88, "learning_rate": 2.068090435979958e-08, - "logits/chosen": -2.0459280014038086, - "logits/rejected": -1.8485805988311768, - "logps/chosen": -236.850341796875, - "logps/rejected": -217.1259307861328, - "loss": 6589.3367, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": -0.42821797728538513, - "rewards/margins": 0.0908961072564125, - "rewards/rejected": -0.519114077091217, - "rewards/safe_rewards": -0.4395659565925598, - "rewards/unsafe_rewards": -0.4168699383735657, + "logits/chosen": -2.3571343421936035, + "logits/rejected": -2.1805238723754883, + "logps/chosen": -194.33248901367188, + "logps/rejected": -165.1163787841797, + "loss": 56.9496, + "rewards/accuracies": 0.484375, + "rewards/chosen": -0.3039317727088928, + "rewards/margins": -0.40204334259033203, + "rewards/rejected": 0.09811154752969742, + "rewards/safe_rewards": 0.22571036219596863, + "rewards/unsafe_rewards": -0.8335739374160767, "step": 1640 }, { "epoch": 0.89, "learning_rate": 1.8850764602263423e-08, - "logits/chosen": -2.1169512271881104, - "logits/rejected": -1.8033020496368408, - "logps/chosen": -244.875732421875, - "logps/rejected": -228.0574188232422, - "loss": 6550.1773, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4457070231437683, - "rewards/margins": 0.09957444667816162, - "rewards/rejected": -0.5452814698219299, - "rewards/safe_rewards": -0.43225640058517456, - "rewards/unsafe_rewards": -0.4591576159000397, + "logits/chosen": -2.415358304977417, + "logits/rejected": -2.1450016498565674, + "logps/chosen": -200.4285888671875, + "logps/rejected": -173.5339813232422, + "loss": 27.6237, + "rewards/accuracies": 0.5531250238418579, + "rewards/chosen": -0.12354181706905365, + "rewards/margins": -0.11882360279560089, + "rewards/rejected": -0.0047182366251945496, + "rewards/safe_rewards": -0.05137089639902115, + "rewards/unsafe_rewards": -0.19571277499198914, "step": 1650 }, { "epoch": 0.89, "learning_rate": 1.710222798718028e-08, - "logits/chosen": -2.144412040710449, - "logits/rejected": -1.9019086360931396, - "logps/chosen": -245.9793243408203, - "logps/rejected": -231.61734008789062, - "loss": 6516.768, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.426375150680542, - "rewards/margins": 0.102508544921875, - "rewards/rejected": -0.5288837552070618, - "rewards/safe_rewards": -0.41905418038368225, - "rewards/unsafe_rewards": -0.4336961805820465, + "logits/chosen": -2.4396722316741943, + "logits/rejected": -2.2350778579711914, + "logps/chosen": -203.3378448486328, + "logps/rejected": -178.9970245361328, + "loss": 22.9552, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.003960543777793646, + "rewards/margins": 0.2720267176628113, + "rewards/rejected": -0.2680661380290985, + "rewards/safe_rewards": 0.06284536421298981, + "rewards/unsafe_rewards": -0.054924286901950836, "step": 1660 }, { "epoch": 0.9, "learning_rate": 1.5435911804424356e-08, - "logits/chosen": -2.104454755783081, - "logits/rejected": -1.9044666290283203, - "logps/chosen": -274.7130432128906, - "logps/rejected": -240.73812866210938, - "loss": 6860.0828, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4544183611869812, - "rewards/margins": 0.0989476665854454, - "rewards/rejected": -0.5533660650253296, - "rewards/safe_rewards": -0.447795569896698, - "rewards/unsafe_rewards": -0.461041122674942, + "logits/chosen": -2.4028568267822266, + "logits/rejected": -2.2349255084991455, + "logps/chosen": -229.4883270263672, + "logps/rejected": -185.6358642578125, + "loss": 31.5896, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": -0.2171170711517334, + "rewards/margins": 0.017191190272569656, + "rewards/rejected": -0.23430824279785156, + "rewards/safe_rewards": 0.15241694450378418, + "rewards/unsafe_rewards": -0.586651086807251, "step": 1670 }, { "epoch": 0.9, "learning_rate": 1.3852404317403199e-08, - "logits/chosen": -2.104949474334717, - "logits/rejected": -1.8842833042144775, - "logps/chosen": -263.541259765625, - "logps/rejected": -248.37600708007812, - "loss": 6686.3102, - "rewards/accuracies": 0.596875011920929, - "rewards/chosen": -0.43208789825439453, - "rewards/margins": 0.10889849811792374, - "rewards/rejected": -0.5409864187240601, - "rewards/safe_rewards": -0.44288283586502075, - "rewards/unsafe_rewards": -0.4212929606437683, + "logits/chosen": -2.395153284072876, + "logits/rejected": -2.2008633613586426, + "logps/chosen": -220.5502471923828, + "logps/rejected": -194.44186401367188, + "loss": 26.1714, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.21780619025230408, + "rewards/margins": -0.0533115491271019, + "rewards/rejected": -0.16449466347694397, + "rewards/safe_rewards": -0.4852879047393799, + "rewards/unsafe_rewards": 0.04967564344406128, "step": 1680 }, { "epoch": 0.91, "learning_rate": 1.235226455538113e-08, - "logits/chosen": -2.1549041271209717, - "logits/rejected": -1.9211156368255615, - "logps/chosen": -245.10513305664062, - "logps/rejected": -219.42529296875, - "loss": 6679.3547, - "rewards/accuracies": 0.5718749761581421, - "rewards/chosen": -0.4363730847835541, - "rewards/margins": 0.07803308963775635, - "rewards/rejected": -0.5144062042236328, - "rewards/safe_rewards": -0.4496842324733734, - "rewards/unsafe_rewards": -0.42306193709373474, + "logits/chosen": -2.4504330158233643, + "logits/rejected": -2.2494871616363525, + "logps/chosen": -201.50564575195312, + "logps/rejected": -167.95364379882812, + "loss": 5.2467, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.037842657417058945, + "rewards/margins": -0.06888096779584885, + "rewards/rejected": 0.031038302928209305, + "rewards/safe_rewards": -0.037054188549518585, + "rewards/unsafe_rewards": -0.0386311374604702, "step": 1690 }, { "epoch": 0.91, "learning_rate": 1.0936022116124321e-08, - "logits/chosen": -2.13358211517334, - "logits/rejected": -1.880497694015503, - "logps/chosen": -243.0107421875, - "logps/rejected": -220.0526123046875, - "loss": 6255.5586, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.4362039566040039, - "rewards/margins": 0.11256744712591171, - "rewards/rejected": -0.5487713813781738, - "rewards/safe_rewards": -0.44225654006004333, - "rewards/unsafe_rewards": -0.43015122413635254, + "logits/chosen": -2.4290854930877686, + "logits/rejected": -2.204906463623047, + "logps/chosen": -199.54847717285156, + "logps/rejected": -165.02816772460938, + "loss": 41.987, + "rewards/accuracies": 0.5093749761581421, + "rewards/chosen": -0.15811631083488464, + "rewards/margins": -0.3054047226905823, + "rewards/rejected": 0.14728839695453644, + "rewards/safe_rewards": -0.39381590485572815, + "rewards/unsafe_rewards": 0.07758323848247528, "step": 1700 }, { "epoch": 0.92, "learning_rate": 9.60417697893534e-09, - "logits/chosen": -2.1104042530059814, - "logits/rejected": -1.9010612964630127, - "logps/chosen": -243.5632781982422, - "logps/rejected": -227.34716796875, - "loss": 6634.2828, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": -0.4358798861503601, - "rewards/margins": 0.0963749811053276, - "rewards/rejected": -0.5322549939155579, - "rewards/safe_rewards": -0.42670002579689026, - "rewards/unsafe_rewards": -0.44505977630615234, + "logits/chosen": -2.4069314002990723, + "logits/rejected": -2.2242488861083984, + "logps/chosen": -199.82015991210938, + "logps/rejected": -173.9343719482422, + "loss": 22.6453, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": 0.1551128476858139, + "rewards/margins": -0.032198842614889145, + "rewards/rejected": 0.18731167912483215, + "rewards/safe_rewards": 0.21557751297950745, + "rewards/unsafe_rewards": 0.09464815258979797, "step": 1710 }, { "epoch": 0.93, "learning_rate": 8.357199328144576e-09, - "logits/chosen": -2.1054680347442627, - "logits/rejected": -1.8882167339324951, - "logps/chosen": -263.0108947753906, - "logps/rejected": -243.1034698486328, - "loss": 6000.1617, - "rewards/accuracies": 0.5843750238418579, - "rewards/chosen": -0.46379217505455017, - "rewards/margins": 0.08999677002429962, - "rewards/rejected": -0.553788959980011, - "rewards/safe_rewards": -0.4631151258945465, - "rewards/unsafe_rewards": -0.4644692540168762, + "logits/chosen": -2.4046077728271484, + "logits/rejected": -2.2161166667938232, + "logps/chosen": -216.55093383789062, + "logps/rejected": -187.6187286376953, + "loss": 56.4505, + "rewards/accuracies": 0.528124988079071, + "rewards/chosen": 0.08073421567678452, + "rewards/margins": -0.025125902146100998, + "rewards/rejected": 0.10586012899875641, + "rewards/safe_rewards": 0.07104392349720001, + "rewards/unsafe_rewards": 0.09042453020811081, "step": 1720 }, { "epoch": 0.93, "learning_rate": 7.1955293871198144e-09, - "logits/chosen": -2.101811170578003, - "logits/rejected": -1.945387840270996, - "logps/chosen": -230.8267059326172, - "logps/rejected": -220.24111938476562, - "loss": 6856.0781, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.4375353455543518, - "rewards/margins": 0.06351236999034882, - "rewards/rejected": -0.5010477304458618, - "rewards/safe_rewards": -0.4362742006778717, - "rewards/unsafe_rewards": -0.4387964606285095, + "logits/chosen": -2.4008450508117676, + "logits/rejected": -2.261340379714966, + "logps/chosen": -187.19436645507812, + "logps/rejected": -169.91722106933594, + "loss": 18.4483, + "rewards/accuracies": 0.45625001192092896, + "rewards/chosen": -0.12119190394878387, + "rewards/margins": -0.34029996395111084, + "rewards/rejected": 0.21910807490348816, + "rewards/safe_rewards": -0.20904748141765594, + "rewards/unsafe_rewards": -0.03333630412817001, "step": 1730 }, { "epoch": 0.94, "learning_rate": 6.119577262853254e-09, - "logits/chosen": -2.128836154937744, - "logits/rejected": -1.8593677282333374, - "logps/chosen": -237.20468139648438, - "logps/rejected": -215.84890747070312, - "loss": 6559.5469, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.44040852785110474, - "rewards/margins": 0.09161019325256348, - "rewards/rejected": -0.5320187211036682, - "rewards/safe_rewards": -0.4399905204772949, - "rewards/unsafe_rewards": -0.4408264756202698, + "logits/chosen": -2.4227774143218994, + "logits/rejected": -2.1880428791046143, + "logps/chosen": -193.263671875, + "logps/rejected": -162.72183227539062, + "loss": 27.596, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.09984199702739716, + "rewards/margins": -0.025053083896636963, + "rewards/rejected": -0.07478892058134079, + "rewards/safe_rewards": -0.26086345314979553, + "rewards/unsafe_rewards": 0.06117943674325943, "step": 1740 }, { "epoch": 0.94, "learning_rate": 5.129722801180542e-09, - "logits/chosen": -2.041652202606201, - "logits/rejected": -1.8507925271987915, - "logps/chosen": -242.01846313476562, - "logps/rejected": -237.16769409179688, - "loss": 6117.1355, - "rewards/accuracies": 0.590624988079071, - "rewards/chosen": -0.4510047435760498, - "rewards/margins": 0.11661112308502197, - "rewards/rejected": -0.5676159262657166, - "rewards/safe_rewards": -0.45727530121803284, - "rewards/unsafe_rewards": -0.44473427534103394, + "logits/chosen": -2.3443946838378906, + "logits/rejected": -2.1799635887145996, + "logps/chosen": -197.2679443359375, + "logps/rejected": -180.6214599609375, + "loss": 19.3736, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.3499890863895416, + "rewards/margins": -0.13464350998401642, + "rewards/rejected": -0.2153455764055252, + "rewards/safe_rewards": -0.1743151694536209, + "rewards/unsafe_rewards": -0.5256629586219788, "step": 1750 }, { "epoch": 0.95, "learning_rate": 4.226315452682816e-09, - "logits/chosen": -2.111886501312256, - "logits/rejected": -1.8561007976531982, - "logps/chosen": -241.124267578125, - "logps/rejected": -226.9755096435547, - "loss": 6254.3047, - "rewards/accuracies": 0.628125011920929, - "rewards/chosen": -0.44722381234169006, - "rewards/margins": 0.092010997235775, - "rewards/rejected": -0.5392348766326904, - "rewards/safe_rewards": -0.4598180651664734, - "rewards/unsafe_rewards": -0.4346295893192291, + "logits/chosen": -2.413181781768799, + "logits/rejected": -2.187439441680908, + "logps/chosen": -196.54916381835938, + "logps/rejected": -173.30929565429688, + "loss": 31.0958, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": -0.147268146276474, + "rewards/margins": 0.10999511182308197, + "rewards/rejected": -0.2572632431983948, + "rewards/safe_rewards": -0.11902125179767609, + "rewards/unsafe_rewards": -0.17551502585411072, "step": 1760 }, { "epoch": 0.95, "learning_rate": 3.4096741493194193e-09, - "logits/chosen": -2.14597749710083, - "logits/rejected": -1.940734624862671, - "logps/chosen": -241.82864379882812, - "logps/rejected": -225.2042999267578, - "loss": 6777.9141, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.4236208498477936, - "rewards/margins": 0.08857716619968414, - "rewards/rejected": -0.5121980309486389, - "rewards/safe_rewards": -0.4265195429325104, - "rewards/unsafe_rewards": -0.4207221567630768, + "logits/chosen": -2.443580389022827, + "logits/rejected": -2.2651684284210205, + "logps/chosen": -199.41049194335938, + "logps/rejected": -173.95718383789062, + "loss": 9.8716, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.056076280772686005, + "rewards/margins": 0.028754467144608498, + "rewards/rejected": 0.027321819216012955, + "rewards/safe_rewards": 0.034864675253629684, + "rewards/unsafe_rewards": 0.07728789001703262, "step": 1770 }, { "epoch": 0.96, "learning_rate": 2.6800871918346846e-09, - "logits/chosen": -2.1110918521881104, - "logits/rejected": -1.8164072036743164, - "logps/chosen": -244.0395965576172, - "logps/rejected": -226.79354858398438, - "loss": 6527.3641, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": -0.4077354073524475, - "rewards/margins": 0.12862348556518555, - "rewards/rejected": -0.5363588333129883, - "rewards/safe_rewards": -0.4131808876991272, - "rewards/unsafe_rewards": -0.4022899568080902, + "logits/chosen": -2.4057886600494385, + "logits/rejected": -2.155165672302246, + "logps/chosen": -203.48025512695312, + "logps/rejected": -172.94015502929688, + "loss": 41.8074, + "rewards/accuracies": 0.5218750238418579, + "rewards/chosen": -0.21421018242835999, + "rewards/margins": -0.43174242973327637, + "rewards/rejected": 0.2175322324037552, + "rewards/safe_rewards": -0.06811753660440445, + "rewards/unsafe_rewards": -0.36030280590057373, "step": 1780 }, { "epoch": 0.96, "learning_rate": 2.0378121479783796e-09, - "logits/chosen": -2.0832958221435547, - "logits/rejected": -1.8225599527359009, - "logps/chosen": -240.6083984375, - "logps/rejected": -222.42355346679688, - "loss": 6531.5406, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": -0.4457913041114807, - "rewards/margins": 0.10579758882522583, - "rewards/rejected": -0.5515888929367065, - "rewards/safe_rewards": -0.4444062113761902, - "rewards/unsafe_rewards": -0.44717639684677124, + "logits/chosen": -2.389869213104248, + "logits/rejected": -2.1555044651031494, + "logps/chosen": -196.02059936523438, + "logps/rejected": -167.43655395507812, + "loss": 61.0971, + "rewards/accuracies": 0.4781250059604645, + "rewards/chosen": 0.008689677342772484, + "rewards/margins": 0.18058201670646667, + "rewards/rejected": -0.17189235985279083, + "rewards/safe_rewards": 0.042548321187496185, + "rewards/unsafe_rewards": -0.025168979540467262, "step": 1790 }, { "epoch": 0.97, "learning_rate": 1.4830757615760247e-09, - "logits/chosen": -2.1382036209106445, - "logits/rejected": -1.8611648082733154, - "logps/chosen": -252.05618286132812, - "logps/rejected": -223.10189819335938, - "loss": 6842.4117, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": -0.44894617795944214, - "rewards/margins": 0.07909347116947174, - "rewards/rejected": -0.5280395746231079, - "rewards/safe_rewards": -0.4768086075782776, - "rewards/unsafe_rewards": -0.4210837781429291, + "logits/chosen": -2.4289557933807373, + "logits/rejected": -2.1850333213806152, + "logps/chosen": -207.24124145507812, + "logps/rejected": -170.49305725097656, + "loss": 144.1229, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.07967615127563477, + "rewards/margins": 0.1154344230890274, + "rewards/rejected": -0.19511058926582336, + "rewards/safe_rewards": -0.10584060847759247, + "rewards/unsafe_rewards": -0.05351167917251587, "step": 1800 }, { "epoch": 0.97, "learning_rate": 1.0160738724809548e-09, - "logits/chosen": -2.136545181274414, - "logits/rejected": -1.8660471439361572, - "logps/chosen": -239.1907958984375, - "logps/rejected": -225.1981964111328, - "loss": 6138.1797, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": -0.4316760003566742, - "rewards/margins": 0.10981061309576035, - "rewards/rejected": -0.5414865612983704, - "rewards/safe_rewards": -0.43494072556495667, - "rewards/unsafe_rewards": -0.42841118574142456, + "logits/chosen": -2.4409990310668945, + "logits/rejected": -2.207919120788574, + "logps/chosen": -196.10601806640625, + "logps/rejected": -171.36843872070312, + "loss": 18.7773, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.08281825482845306, + "rewards/margins": 0.23606376349925995, + "rewards/rejected": -0.3188820481300354, + "rewards/safe_rewards": 0.05616650730371475, + "rewards/unsafe_rewards": -0.2218029946088791, "step": 1810 }, { "epoch": 0.98, "learning_rate": 6.369713474366212e-10, - "logits/chosen": -2.122779369354248, - "logits/rejected": -1.8656489849090576, - "logps/chosen": -265.56866455078125, - "logps/rejected": -237.6756591796875, - "loss": 5787.684, - "rewards/accuracies": 0.578125, - "rewards/chosen": -0.46309009194374084, - "rewards/margins": 0.09460426867008209, - "rewards/rejected": -0.5576944351196289, - "rewards/safe_rewards": -0.4682251513004303, - "rewards/unsafe_rewards": -0.4579550623893738, + "logits/chosen": -2.420626640319824, + "logits/rejected": -2.1977345943450928, + "logps/chosen": -219.4222869873047, + "logps/rejected": -181.95010375976562, + "loss": 17.5266, + "rewards/accuracies": 0.49687498807907104, + "rewards/chosen": -0.16265834867954254, + "rewards/margins": -0.11877261102199554, + "rewards/rejected": -0.0438857302069664, + "rewards/safe_rewards": -0.035406678915023804, + "rewards/unsafe_rewards": -0.2899099886417389, "step": 1820 }, { "epoch": 0.98, "learning_rate": 3.459020218731512e-10, - "logits/chosen": -2.134183883666992, - "logits/rejected": -1.8942667245864868, - "logps/chosen": -247.1185302734375, - "logps/rejected": -223.71078491210938, - "loss": 5796.5938, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.443460613489151, - "rewards/margins": 0.12163563072681427, - "rewards/rejected": -0.5650962591171265, - "rewards/safe_rewards": -0.43802452087402344, - "rewards/unsafe_rewards": -0.44889673590660095, + "logits/chosen": -2.4327456951141357, + "logits/rejected": -2.220496654510498, + "logps/chosen": -202.61898803710938, + "logps/rejected": -167.197021484375, + "loss": 43.7242, + "rewards/accuracies": 0.534375011920929, + "rewards/chosen": 0.15349379181861877, + "rewards/margins": 0.14935937523841858, + "rewards/rejected": 0.004134447779506445, + "rewards/safe_rewards": 0.08035097271203995, + "rewards/unsafe_rewards": 0.2266366183757782, "step": 1830 }, { "epoch": 0.99, "learning_rate": 1.429686526593088e-10, - "logits/chosen": -2.093444347381592, - "logits/rejected": -1.8680245876312256, - "logps/chosen": -251.08969116210938, - "logps/rejected": -229.55322265625, - "loss": 6394.6027, - "rewards/accuracies": 0.621874988079071, - "rewards/chosen": -0.43905067443847656, - "rewards/margins": 0.0962488204240799, - "rewards/rejected": -0.5352994799613953, - "rewards/safe_rewards": -0.4522920250892639, - "rewards/unsafe_rewards": -0.4258092939853668, + "logits/chosen": -2.398090124130249, + "logits/rejected": -2.192744255065918, + "logps/chosen": -206.80520629882812, + "logps/rejected": -175.9212646484375, + "loss": 23.3409, + "rewards/accuracies": 0.453125, + "rewards/chosen": 0.3794136941432953, + "rewards/margins": 0.2774004638195038, + "rewards/rejected": 0.10201327502727509, + "rewards/safe_rewards": 0.5806846022605896, + "rewards/unsafe_rewards": 0.17814283072948456, "step": 1840 }, { "epoch": 1.0, "learning_rate": 2.824288182584622e-11, - "logits/chosen": -2.135525703430176, - "logits/rejected": -1.925082802772522, - "logps/chosen": -252.08755493164062, - "logps/rejected": -223.11550903320312, - "loss": 5975.7648, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": -0.45583659410476685, - "rewards/margins": 0.09374500811100006, - "rewards/rejected": -0.5495815873146057, - "rewards/safe_rewards": -0.4470265805721283, - "rewards/unsafe_rewards": -0.464646577835083, + "logits/chosen": -2.4241063594818115, + "logits/rejected": -2.2421114444732666, + "logps/chosen": -206.7459716796875, + "logps/rejected": -168.176513671875, + "loss": 19.5817, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": -0.24208331108093262, + "rewards/margins": -0.22288334369659424, + "rewards/rejected": -0.019199971109628677, + "rewards/safe_rewards": 0.039939720183610916, + "rewards/unsafe_rewards": -0.5241063237190247, "step": 1850 }, { "epoch": 1.0, "step": 1858, "total_flos": 0.0, - "train_loss": 6725.912355355221, - "train_runtime": 39534.0439, - "train_samples_per_second": 1.504, - "train_steps_per_second": 0.047 + "train_loss": 67.04043597990268, + "train_runtime": 46860.0347, + "train_samples_per_second": 1.269, + "train_steps_per_second": 0.04 } ], "logging_steps": 10,