dpo-selective-buffer-spo-shift / trainer_state.json
wxzhang's picture
Model save
5d84c8e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9995965030262273,
"eval_steps": 500,
"global_step": 1858,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 2.6881720430107528e-09,
"logits/chosen": -2.4663572311401367,
"logits/rejected": -2.057170867919922,
"logps/chosen": -246.4422607421875,
"logps/rejected": -173.7652587890625,
"loss": 0.5938,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"rewards/safe_rewards": 0.0,
"rewards/unsafe_rewards": 0.0,
"step": 1
},
{
"epoch": 0.01,
"learning_rate": 2.6881720430107527e-08,
"logits/chosen": -2.3338096141815186,
"logits/rejected": -2.1100988388061523,
"logps/chosen": -199.19329833984375,
"logps/rejected": -169.358642578125,
"loss": 1.134,
"rewards/accuracies": 0.4097222089767456,
"rewards/chosen": -0.03551425039768219,
"rewards/margins": -0.041799187660217285,
"rewards/rejected": 0.006284935399889946,
"rewards/safe_rewards": -0.01677405834197998,
"rewards/unsafe_rewards": -0.0542544424533844,
"step": 10
},
{
"epoch": 0.01,
"learning_rate": 5.3763440860215054e-08,
"logits/chosen": -2.329479694366455,
"logits/rejected": -2.0858876705169678,
"logps/chosen": -215.32296752929688,
"logps/rejected": -176.8864288330078,
"loss": 1.1266,
"rewards/accuracies": 0.47187501192092896,
"rewards/chosen": -0.031086910516023636,
"rewards/margins": -0.04154179245233536,
"rewards/rejected": 0.010454884730279446,
"rewards/safe_rewards": -0.04110833257436752,
"rewards/unsafe_rewards": -0.021065494045615196,
"step": 20
},
{
"epoch": 0.02,
"learning_rate": 8.064516129032257e-08,
"logits/chosen": -2.322885036468506,
"logits/rejected": -2.1038832664489746,
"logps/chosen": -199.3030242919922,
"logps/rejected": -180.7991943359375,
"loss": 1.1716,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.0008645713096484542,
"rewards/margins": 0.027558892965316772,
"rewards/rejected": -0.026694318279623985,
"rewards/safe_rewards": -0.0032820613123476505,
"rewards/unsafe_rewards": 0.005011203698813915,
"step": 30
},
{
"epoch": 0.02,
"learning_rate": 1.0752688172043011e-07,
"logits/chosen": -2.268714427947998,
"logits/rejected": -1.9988443851470947,
"logps/chosen": -197.72109985351562,
"logps/rejected": -177.70603942871094,
"loss": 1.1036,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.023218240588903427,
"rewards/margins": 0.022794129326939583,
"rewards/rejected": 0.0004241138813085854,
"rewards/safe_rewards": 0.03502867370843887,
"rewards/unsafe_rewards": 0.011407810263335705,
"step": 40
},
{
"epoch": 0.03,
"learning_rate": 1.3440860215053762e-07,
"logits/chosen": -2.374366283416748,
"logits/rejected": -2.07818603515625,
"logps/chosen": -191.63714599609375,
"logps/rejected": -162.17771911621094,
"loss": 1.1473,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -0.015013009309768677,
"rewards/margins": 0.0906001627445221,
"rewards/rejected": -0.10561318695545197,
"rewards/safe_rewards": -0.018471335992217064,
"rewards/unsafe_rewards": -0.011554678902029991,
"step": 50
},
{
"epoch": 0.03,
"learning_rate": 1.6129032258064515e-07,
"logits/chosen": -2.346019983291626,
"logits/rejected": -2.1285576820373535,
"logps/chosen": -186.499755859375,
"logps/rejected": -175.0586700439453,
"loss": 1.0107,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.009731076657772064,
"rewards/margins": 0.04699288681149483,
"rewards/rejected": -0.037261806428432465,
"rewards/safe_rewards": -0.01583387330174446,
"rewards/unsafe_rewards": 0.03529602661728859,
"step": 60
},
{
"epoch": 0.04,
"learning_rate": 1.8817204301075268e-07,
"logits/chosen": -2.3234503269195557,
"logits/rejected": -2.110891819000244,
"logps/chosen": -221.27426147460938,
"logps/rejected": -179.11380004882812,
"loss": 2.1985,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.11594200134277344,
"rewards/margins": 0.07270021736621857,
"rewards/rejected": 0.04324179142713547,
"rewards/safe_rewards": 0.0875079482793808,
"rewards/unsafe_rewards": 0.14437603950500488,
"step": 70
},
{
"epoch": 0.04,
"learning_rate": 2.1505376344086022e-07,
"logits/chosen": -2.3453927040100098,
"logits/rejected": -2.1327505111694336,
"logps/chosen": -197.19949340820312,
"logps/rejected": -176.77151489257812,
"loss": 2.7155,
"rewards/accuracies": 0.4468750059604645,
"rewards/chosen": 0.15263572335243225,
"rewards/margins": 0.012048400938510895,
"rewards/rejected": 0.14058732986450195,
"rewards/safe_rewards": 0.18941155076026917,
"rewards/unsafe_rewards": 0.11585988849401474,
"step": 80
},
{
"epoch": 0.05,
"learning_rate": 2.4193548387096775e-07,
"logits/chosen": -2.3641719818115234,
"logits/rejected": -2.137413263320923,
"logps/chosen": -216.1211395263672,
"logps/rejected": -168.5092315673828,
"loss": 2.721,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 0.008618640713393688,
"rewards/margins": 0.006397470831871033,
"rewards/rejected": 0.002221171511337161,
"rewards/safe_rewards": 0.031759221106767654,
"rewards/unsafe_rewards": -0.014521944336593151,
"step": 90
},
{
"epoch": 0.05,
"learning_rate": 2.6881720430107523e-07,
"logits/chosen": -2.360917568206787,
"logits/rejected": -2.153608798980713,
"logps/chosen": -201.7233428955078,
"logps/rejected": -190.54605102539062,
"loss": 1.4712,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.07169636338949203,
"rewards/margins": 0.08422265201807022,
"rewards/rejected": -0.012526283040642738,
"rewards/safe_rewards": 0.09221886098384857,
"rewards/unsafe_rewards": 0.0511738546192646,
"step": 100
},
{
"epoch": 0.06,
"learning_rate": 2.956989247311828e-07,
"logits/chosen": -2.3796088695526123,
"logits/rejected": -2.148357629776001,
"logps/chosen": -207.0086212158203,
"logps/rejected": -176.24658203125,
"loss": 4.9646,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.0011837140191346407,
"rewards/margins": 0.02744489349424839,
"rewards/rejected": -0.02626117691397667,
"rewards/safe_rewards": -0.013010969385504723,
"rewards/unsafe_rewards": 0.015378397889435291,
"step": 110
},
{
"epoch": 0.06,
"learning_rate": 3.225806451612903e-07,
"logits/chosen": -2.378938913345337,
"logits/rejected": -2.1289939880371094,
"logps/chosen": -203.86172485351562,
"logps/rejected": -168.72509765625,
"loss": 5.8793,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.11708948761224747,
"rewards/margins": -0.0013303399318829179,
"rewards/rejected": 0.1184198409318924,
"rewards/safe_rewards": 0.14375139772891998,
"rewards/unsafe_rewards": 0.09042758494615555,
"step": 120
},
{
"epoch": 0.07,
"learning_rate": 3.4946236559139783e-07,
"logits/chosen": -2.4672460556030273,
"logits/rejected": -2.235044479370117,
"logps/chosen": -211.15414428710938,
"logps/rejected": -167.7396697998047,
"loss": 2.9066,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": -0.141743004322052,
"rewards/margins": 0.018900588154792786,
"rewards/rejected": -0.1606435775756836,
"rewards/safe_rewards": -0.20868048071861267,
"rewards/unsafe_rewards": -0.07480548322200775,
"step": 130
},
{
"epoch": 0.08,
"learning_rate": 3.7634408602150537e-07,
"logits/chosen": -2.469130516052246,
"logits/rejected": -2.2549142837524414,
"logps/chosen": -219.2992401123047,
"logps/rejected": -180.1728057861328,
"loss": 14.0865,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.04582630842924118,
"rewards/margins": 0.07086005806922913,
"rewards/rejected": -0.025033747777342796,
"rewards/safe_rewards": 0.05242709070444107,
"rewards/unsafe_rewards": 0.03922552615404129,
"step": 140
},
{
"epoch": 0.08,
"learning_rate": 4.0322580645161285e-07,
"logits/chosen": -2.4029898643493652,
"logits/rejected": -2.2180120944976807,
"logps/chosen": -205.2784881591797,
"logps/rejected": -167.4949951171875,
"loss": 1531.726,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.9918906092643738,
"rewards/margins": 0.18755348026752472,
"rewards/rejected": -1.1794440746307373,
"rewards/safe_rewards": -0.9263350367546082,
"rewards/unsafe_rewards": -1.0574461221694946,
"step": 150
},
{
"epoch": 0.09,
"learning_rate": 4.3010752688172043e-07,
"logits/chosen": -2.3345110416412354,
"logits/rejected": -2.1149539947509766,
"logps/chosen": -209.245849609375,
"logps/rejected": -186.2938995361328,
"loss": 76.4742,
"rewards/accuracies": 0.4468750059604645,
"rewards/chosen": -0.40810996294021606,
"rewards/margins": -0.12766215205192566,
"rewards/rejected": -0.280447781085968,
"rewards/safe_rewards": -0.5669787526130676,
"rewards/unsafe_rewards": -0.2492411583662033,
"step": 160
},
{
"epoch": 0.09,
"learning_rate": 4.569892473118279e-07,
"logits/chosen": -2.364396810531616,
"logits/rejected": -2.153006076812744,
"logps/chosen": -193.48985290527344,
"logps/rejected": -157.84793090820312,
"loss": 366.3049,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.4611927568912506,
"rewards/margins": -0.09943069517612457,
"rewards/rejected": -0.36176207661628723,
"rewards/safe_rewards": -0.3680972456932068,
"rewards/unsafe_rewards": -0.5542882680892944,
"step": 170
},
{
"epoch": 0.1,
"learning_rate": 4.838709677419355e-07,
"logits/chosen": -2.379281520843506,
"logits/rejected": -2.1527695655822754,
"logps/chosen": -201.44384765625,
"logps/rejected": -176.66293334960938,
"loss": 212.1672,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -1.0156188011169434,
"rewards/margins": 0.32616162300109863,
"rewards/rejected": -1.341780424118042,
"rewards/safe_rewards": -0.7996016144752502,
"rewards/unsafe_rewards": -1.2316361665725708,
"step": 180
},
{
"epoch": 0.1,
"learning_rate": 4.999929391798331e-07,
"logits/chosen": -2.4363088607788086,
"logits/rejected": -2.1664328575134277,
"logps/chosen": -214.74972534179688,
"logps/rejected": -172.83267211914062,
"loss": 281.1637,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": -2.0342252254486084,
"rewards/margins": -1.3259267807006836,
"rewards/rejected": -0.7082984447479248,
"rewards/safe_rewards": -1.9764223098754883,
"rewards/unsafe_rewards": -2.0920281410217285,
"step": 190
},
{
"epoch": 0.11,
"learning_rate": 4.9991350953333e-07,
"logits/chosen": -2.399965763092041,
"logits/rejected": -2.1533687114715576,
"logps/chosen": -211.14138793945312,
"logps/rejected": -183.2847442626953,
"loss": 37.4693,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.5885945558547974,
"rewards/margins": -0.33234477043151855,
"rewards/rejected": -0.2562498152256012,
"rewards/safe_rewards": 0.10301212966442108,
"rewards/unsafe_rewards": -1.2802014350891113,
"step": 200
},
{
"epoch": 0.11,
"learning_rate": 4.997458523498236e-07,
"logits/chosen": -2.4136548042297363,
"logits/rejected": -2.1710681915283203,
"logps/chosen": -192.46209716796875,
"logps/rejected": -160.3273468017578,
"loss": 19.4933,
"rewards/accuracies": 0.46562498807907104,
"rewards/chosen": 0.7500754594802856,
"rewards/margins": 0.07740475982427597,
"rewards/rejected": 0.6726706624031067,
"rewards/safe_rewards": 0.8147931098937988,
"rewards/unsafe_rewards": 0.6853577494621277,
"step": 210
},
{
"epoch": 0.12,
"learning_rate": 4.99490026817712e-07,
"logits/chosen": -2.3793249130249023,
"logits/rejected": -2.126897096633911,
"logps/chosen": -206.8174591064453,
"logps/rejected": -174.28512573242188,
"loss": 618.2743,
"rewards/accuracies": 0.4593749940395355,
"rewards/chosen": 0.6119144558906555,
"rewards/margins": 1.0083643198013306,
"rewards/rejected": -0.3964497447013855,
"rewards/safe_rewards": 0.269029825925827,
"rewards/unsafe_rewards": 0.9547992944717407,
"step": 220
},
{
"epoch": 0.12,
"learning_rate": 4.991461232516674e-07,
"logits/chosen": -2.278285503387451,
"logits/rejected": -2.0165598392486572,
"logps/chosen": -220.05496215820312,
"logps/rejected": -191.4230499267578,
"loss": 117.4644,
"rewards/accuracies": 0.44062501192092896,
"rewards/chosen": -2.331136465072632,
"rewards/margins": -0.27771270275115967,
"rewards/rejected": -2.053424119949341,
"rewards/safe_rewards": -1.6258525848388672,
"rewards/unsafe_rewards": -3.0364208221435547,
"step": 230
},
{
"epoch": 0.13,
"learning_rate": 4.98714263060751e-07,
"logits/chosen": -2.2665092945098877,
"logits/rejected": -1.9782488346099854,
"logps/chosen": -189.6136016845703,
"logps/rejected": -156.85269165039062,
"loss": 123.5274,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -1.6799499988555908,
"rewards/margins": -0.4719271659851074,
"rewards/rejected": -1.2080228328704834,
"rewards/safe_rewards": -1.867531418800354,
"rewards/unsafe_rewards": -1.4923683404922485,
"step": 240
},
{
"epoch": 0.13,
"learning_rate": 4.98194598705552e-07,
"logits/chosen": -2.2388875484466553,
"logits/rejected": -2.0419199466705322,
"logps/chosen": -203.91488647460938,
"logps/rejected": -175.87570190429688,
"loss": 29.462,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": -0.5732041597366333,
"rewards/margins": 0.3381038308143616,
"rewards/rejected": -0.9113079905509949,
"rewards/safe_rewards": -0.5594094395637512,
"rewards/unsafe_rewards": -0.5869989395141602,
"step": 250
},
{
"epoch": 0.14,
"learning_rate": 4.975873136443648e-07,
"logits/chosen": -2.323503017425537,
"logits/rejected": -2.1084866523742676,
"logps/chosen": -219.4092254638672,
"logps/rejected": -188.0467071533203,
"loss": 514.7106,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": 0.01872560940682888,
"rewards/margins": 0.048060666769742966,
"rewards/rejected": -0.029335061088204384,
"rewards/safe_rewards": -0.101626917719841,
"rewards/unsafe_rewards": 0.13907812535762787,
"step": 260
},
{
"epoch": 0.15,
"learning_rate": 4.968926222684212e-07,
"logits/chosen": -2.3192670345306396,
"logits/rejected": -2.128873586654663,
"logps/chosen": -195.8466796875,
"logps/rejected": -173.4759063720703,
"loss": 62.0019,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": 0.5205889940261841,
"rewards/margins": 0.10105878114700317,
"rewards/rejected": 0.4195302128791809,
"rewards/safe_rewards": 0.4973847270011902,
"rewards/unsafe_rewards": 0.5437930822372437,
"step": 270
},
{
"epoch": 0.15,
"learning_rate": 4.961107698262044e-07,
"logits/chosen": -2.3513216972351074,
"logits/rejected": -2.1132161617279053,
"logps/chosen": -209.58480834960938,
"logps/rejected": -173.8505096435547,
"loss": 19.9099,
"rewards/accuracies": 0.47187501192092896,
"rewards/chosen": 1.4319963455200195,
"rewards/margins": -0.04141209274530411,
"rewards/rejected": 1.4734083414077759,
"rewards/safe_rewards": 0.7625109553337097,
"rewards/unsafe_rewards": 2.1014816761016846,
"step": 280
},
{
"epoch": 0.16,
"learning_rate": 4.952420323368673e-07,
"logits/chosen": -2.327949047088623,
"logits/rejected": -2.081421136856079,
"logps/chosen": -202.83131408691406,
"logps/rejected": -173.12339782714844,
"loss": 166.1931,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 1.1239261627197266,
"rewards/margins": 0.29456058144569397,
"rewards/rejected": 0.8293657302856445,
"rewards/safe_rewards": 0.95171719789505,
"rewards/unsafe_rewards": 1.2961351871490479,
"step": 290
},
{
"epoch": 0.16,
"learning_rate": 4.942867164927899e-07,
"logits/chosen": -2.3304100036621094,
"logits/rejected": -2.148871898651123,
"logps/chosen": -200.2861785888672,
"logps/rejected": -173.5687713623047,
"loss": 83.8678,
"rewards/accuracies": 0.546875,
"rewards/chosen": 1.1026077270507812,
"rewards/margins": 0.16524335741996765,
"rewards/rejected": 0.9373642206192017,
"rewards/safe_rewards": 1.20353102684021,
"rewards/unsafe_rewards": 1.001684308052063,
"step": 300
},
{
"epoch": 0.17,
"learning_rate": 4.932451595513062e-07,
"logits/chosen": -2.3603804111480713,
"logits/rejected": -2.1054179668426514,
"logps/chosen": -222.5138702392578,
"logps/rejected": -189.41696166992188,
"loss": 125.375,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 1.2356212139129639,
"rewards/margins": 0.4028751254081726,
"rewards/rejected": 0.8327462077140808,
"rewards/safe_rewards": 1.2047992944717407,
"rewards/unsafe_rewards": 1.2664434909820557,
"step": 310
},
{
"epoch": 0.17,
"learning_rate": 4.921177292156419e-07,
"logits/chosen": -2.4207069873809814,
"logits/rejected": -2.131692409515381,
"logps/chosen": -197.57579040527344,
"logps/rejected": -173.03189086914062,
"loss": 32.4693,
"rewards/accuracies": 0.4375,
"rewards/chosen": 1.0067864656448364,
"rewards/margins": -0.07762779295444489,
"rewards/rejected": 1.0844142436981201,
"rewards/safe_rewards": 0.9899358749389648,
"rewards/unsafe_rewards": 1.0236369371414185,
"step": 320
},
{
"epoch": 0.18,
"learning_rate": 4.909048235051033e-07,
"logits/chosen": -2.3886237144470215,
"logits/rejected": -2.2095794677734375,
"logps/chosen": -201.99131774902344,
"logps/rejected": -180.18301391601562,
"loss": 165.1989,
"rewards/accuracies": 0.546875,
"rewards/chosen": 0.8952449560165405,
"rewards/margins": 0.1524442732334137,
"rewards/rejected": 0.7428006529808044,
"rewards/safe_rewards": 0.9638195037841797,
"rewards/unsafe_rewards": 0.8266702890396118,
"step": 330
},
{
"epoch": 0.18,
"learning_rate": 4.896068706145631e-07,
"logits/chosen": -2.4264276027679443,
"logits/rejected": -2.1699893474578857,
"logps/chosen": -209.13687133789062,
"logps/rejected": -161.4777374267578,
"loss": 63.6332,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.5357077717781067,
"rewards/margins": 0.20826852321624756,
"rewards/rejected": 0.32743921875953674,
"rewards/safe_rewards": 0.6318890452384949,
"rewards/unsafe_rewards": 0.4395265579223633,
"step": 340
},
{
"epoch": 0.19,
"learning_rate": 4.882243287632946e-07,
"logits/chosen": -2.4155266284942627,
"logits/rejected": -2.1885287761688232,
"logps/chosen": -190.31680297851562,
"logps/rejected": -167.34011840820312,
"loss": 22.5493,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.2450559437274933,
"rewards/margins": 0.11199624836444855,
"rewards/rejected": 0.13305969536304474,
"rewards/safe_rewards": 0.32091599702835083,
"rewards/unsafe_rewards": 0.16919586062431335,
"step": 350
},
{
"epoch": 0.19,
"learning_rate": 4.867576860332048e-07,
"logits/chosen": -2.4087131023406982,
"logits/rejected": -2.1696860790252686,
"logps/chosen": -182.63320922851562,
"logps/rejected": -157.3323974609375,
"loss": 39.9616,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.33797144889831543,
"rewards/margins": 0.2170281708240509,
"rewards/rejected": 0.12094320356845856,
"rewards/safe_rewards": 0.7084277868270874,
"rewards/unsafe_rewards": -0.0324850007891655,
"step": 360
},
{
"epoch": 0.2,
"learning_rate": 4.85207460196526e-07,
"logits/chosen": -2.3588593006134033,
"logits/rejected": -2.1359121799468994,
"logps/chosen": -201.29721069335938,
"logps/rejected": -180.4462432861328,
"loss": 18.4967,
"rewards/accuracies": 0.421875,
"rewards/chosen": -0.20872633159160614,
"rewards/margins": -0.07106774300336838,
"rewards/rejected": -0.13765858113765717,
"rewards/safe_rewards": -0.24322757124900818,
"rewards/unsafe_rewards": -0.1742250919342041,
"step": 370
},
{
"epoch": 0.2,
"learning_rate": 4.835741985330259e-07,
"logits/chosen": -2.393688678741455,
"logits/rejected": -2.1949095726013184,
"logps/chosen": -196.72280883789062,
"logps/rejected": -164.93276977539062,
"loss": 13.0753,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": -0.13964949548244476,
"rewards/margins": 0.004895883612334728,
"rewards/rejected": -0.14454536139965057,
"rewards/safe_rewards": -0.14425238966941833,
"rewards/unsafe_rewards": -0.1350466012954712,
"step": 380
},
{
"epoch": 0.21,
"learning_rate": 4.818584776367992e-07,
"logits/chosen": -2.348188638687134,
"logits/rejected": -2.183293342590332,
"logps/chosen": -207.3245086669922,
"logps/rejected": -185.33078002929688,
"loss": 405.7585,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": 0.9341610074043274,
"rewards/margins": -0.25985628366470337,
"rewards/rejected": 1.1940172910690308,
"rewards/safe_rewards": 1.5165033340454102,
"rewards/unsafe_rewards": 0.3518185615539551,
"step": 390
},
{
"epoch": 0.22,
"learning_rate": 4.800609032127122e-07,
"logits/chosen": -2.362936496734619,
"logits/rejected": -2.117405652999878,
"logps/chosen": -205.0863037109375,
"logps/rejected": -173.82562255859375,
"loss": 250.8796,
"rewards/accuracies": 0.46562498807907104,
"rewards/chosen": 0.8158755302429199,
"rewards/margins": 0.04819601774215698,
"rewards/rejected": 0.7676795721054077,
"rewards/safe_rewards": 0.8238789439201355,
"rewards/unsafe_rewards": 0.8078721761703491,
"step": 400
},
{
"epoch": 0.22,
"learning_rate": 4.78182109862569e-07,
"logits/chosen": -2.334447145462036,
"logits/rejected": -2.1603846549987793,
"logps/chosen": -193.15878295898438,
"logps/rejected": -169.64031982421875,
"loss": 43.271,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 1.1323336362838745,
"rewards/margins": -0.2824760377407074,
"rewards/rejected": 1.4148097038269043,
"rewards/safe_rewards": 1.1305078268051147,
"rewards/unsafe_rewards": 1.1341596841812134,
"step": 410
},
{
"epoch": 0.23,
"learning_rate": 4.7622276086107677e-07,
"logits/chosen": -2.4567148685455322,
"logits/rejected": -2.2268338203430176,
"logps/chosen": -221.8797149658203,
"logps/rejected": -183.58682250976562,
"loss": 170.0915,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 1.5294605493545532,
"rewards/margins": -0.26579660177230835,
"rewards/rejected": 1.7952572107315063,
"rewards/safe_rewards": 1.6476377248764038,
"rewards/unsafe_rewards": 1.4112837314605713,
"step": 420
},
{
"epoch": 0.23,
"learning_rate": 4.741835479216879e-07,
"logits/chosen": -2.4018983840942383,
"logits/rejected": -2.1745998859405518,
"logps/chosen": -224.1997833251953,
"logps/rejected": -202.8693084716797,
"loss": 318.6482,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 1.9682689905166626,
"rewards/margins": 1.3845123052597046,
"rewards/rejected": 0.5837565660476685,
"rewards/safe_rewards": 1.9361345767974854,
"rewards/unsafe_rewards": 2.0004029273986816,
"step": 430
},
{
"epoch": 0.24,
"learning_rate": 4.720651909524036e-07,
"logits/chosen": -2.368582248687744,
"logits/rejected": -2.1598029136657715,
"logps/chosen": -199.04641723632812,
"logps/rejected": -171.59878540039062,
"loss": 20.6844,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.34873148798942566,
"rewards/margins": -0.22367699444293976,
"rewards/rejected": 0.5724084973335266,
"rewards/safe_rewards": 0.4507713317871094,
"rewards/unsafe_rewards": 0.24669162929058075,
"step": 440
},
{
"epoch": 0.24,
"learning_rate": 4.698684378016222e-07,
"logits/chosen": -2.4238266944885254,
"logits/rejected": -2.1877074241638184,
"logps/chosen": -206.9587860107422,
"logps/rejected": -166.5978546142578,
"loss": 36.0619,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.48088520765304565,
"rewards/margins": -0.556584894657135,
"rewards/rejected": 0.07569964975118637,
"rewards/safe_rewards": -0.8808043599128723,
"rewards/unsafe_rewards": -0.08096615970134735,
"step": 450
},
{
"epoch": 0.25,
"learning_rate": 4.675940639941256e-07,
"logits/chosen": -2.381782054901123,
"logits/rejected": -2.2072319984436035,
"logps/chosen": -202.72836303710938,
"logps/rejected": -178.13565063476562,
"loss": 19.0221,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.561229407787323,
"rewards/margins": 0.22027714550495148,
"rewards/rejected": 0.3409522473812103,
"rewards/safe_rewards": 0.481137752532959,
"rewards/unsafe_rewards": 0.641321063041687,
"step": 460
},
{
"epoch": 0.25,
"learning_rate": 4.6524287245729286e-07,
"logits/chosen": -2.3484253883361816,
"logits/rejected": -2.134340524673462,
"logps/chosen": -198.06240844726562,
"logps/rejected": -166.09368896484375,
"loss": 26.6374,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.22380805015563965,
"rewards/margins": -0.03325925022363663,
"rewards/rejected": 0.2570672631263733,
"rewards/safe_rewards": 0.1073969230055809,
"rewards/unsafe_rewards": 0.3402191996574402,
"step": 470
},
{
"epoch": 0.26,
"learning_rate": 4.628156932376418e-07,
"logits/chosen": -2.3849387168884277,
"logits/rejected": -2.1502578258514404,
"logps/chosen": -202.72006225585938,
"logps/rejected": -165.7488555908203,
"loss": 163.8104,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": 0.22546739876270294,
"rewards/margins": 0.1902145892381668,
"rewards/rejected": 0.03525285795331001,
"rewards/safe_rewards": -0.18428334593772888,
"rewards/unsafe_rewards": 0.6352182030677795,
"step": 480
},
{
"epoch": 0.26,
"learning_rate": 4.603133832077953e-07,
"logits/chosen": -2.3536932468414307,
"logits/rejected": -2.1680946350097656,
"logps/chosen": -209.31454467773438,
"logps/rejected": -199.3543701171875,
"loss": 75.0922,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.6691935658454895,
"rewards/margins": 0.05490832403302193,
"rewards/rejected": 0.6142852902412415,
"rewards/safe_rewards": 0.21514494717121124,
"rewards/unsafe_rewards": 1.1232421398162842,
"step": 490
},
{
"epoch": 0.27,
"learning_rate": 4.5773682576397776e-07,
"logits/chosen": -2.360821008682251,
"logits/rejected": -2.1603407859802246,
"logps/chosen": -201.4778594970703,
"logps/rejected": -169.81484985351562,
"loss": 131.6857,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.10828091204166412,
"rewards/margins": -0.5544548034667969,
"rewards/rejected": 0.662735641002655,
"rewards/safe_rewards": -0.3237845301628113,
"rewards/unsafe_rewards": 0.5403462648391724,
"step": 500
},
{
"epoch": 0.27,
"eval_logits/chosen": -2.056485414505005,
"eval_logits/rejected": -1.803229808807373,
"eval_logps/chosen": -130.9681396484375,
"eval_logps/rejected": -92.36480712890625,
"eval_loss": 0.8894476294517517,
"eval_rewards/accuracies": 0.45462244749069214,
"eval_rewards/chosen": -0.10225697606801987,
"eval_rewards/margins": -0.08933582156896591,
"eval_rewards/rejected": -0.012921147979795933,
"eval_rewards/safe_rewards": -0.10428992658853531,
"eval_rewards/unsafe_rewards": -0.10168781876564026,
"eval_runtime": 2237.5747,
"eval_samples_per_second": 14.768,
"eval_steps_per_second": 0.923,
"step": 500
},
{
"epoch": 0.27,
"learning_rate": 4.5508693051414774e-07,
"logits/chosen": -2.3876683712005615,
"logits/rejected": -2.2101075649261475,
"logps/chosen": -197.6197509765625,
"logps/rejected": -179.2535858154297,
"loss": 10.2996,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.15894845128059387,
"rewards/margins": -0.012749219313263893,
"rewards/rejected": 0.17169766128063202,
"rewards/safe_rewards": 0.22355195879936218,
"rewards/unsafe_rewards": 0.09434493631124496,
"step": 510
},
{
"epoch": 0.28,
"learning_rate": 4.52364632956877e-07,
"logits/chosen": -2.3795700073242188,
"logits/rejected": -2.167722225189209,
"logps/chosen": -209.80477905273438,
"logps/rejected": -170.1284942626953,
"loss": 102.0474,
"rewards/accuracies": 0.47187501192092896,
"rewards/chosen": -0.054634951055049896,
"rewards/margins": -0.5382941961288452,
"rewards/rejected": 0.48365920782089233,
"rewards/safe_rewards": 0.07545175403356552,
"rewards/unsafe_rewards": -0.1847216635942459,
"step": 520
},
{
"epoch": 0.29,
"learning_rate": 4.4957089415108895e-07,
"logits/chosen": -2.3528215885162354,
"logits/rejected": -2.1418814659118652,
"logps/chosen": -187.97207641601562,
"logps/rejected": -165.0076446533203,
"loss": 120.2137,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.6140010952949524,
"rewards/margins": 0.31187087297439575,
"rewards/rejected": 0.30213022232055664,
"rewards/safe_rewards": 0.45020800828933716,
"rewards/unsafe_rewards": 0.7777942419052124,
"step": 530
},
{
"epoch": 0.29,
"learning_rate": 4.467067003767745e-07,
"logits/chosen": -2.441636800765991,
"logits/rejected": -2.219637870788574,
"logps/chosen": -215.01718139648438,
"logps/rejected": -178.1621856689453,
"loss": 31.3751,
"rewards/accuracies": 0.578125,
"rewards/chosen": 0.5192718505859375,
"rewards/margins": 0.13771791756153107,
"rewards/rejected": 0.3815539479255676,
"rewards/safe_rewards": 0.3229585587978363,
"rewards/unsafe_rewards": 0.7155852317810059,
"step": 540
},
{
"epoch": 0.3,
"learning_rate": 4.437730627868027e-07,
"logits/chosen": -2.378955602645874,
"logits/rejected": -2.138523578643799,
"logps/chosen": -181.02993774414062,
"logps/rejected": -161.35678100585938,
"loss": 48.7052,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.13924112915992737,
"rewards/margins": 0.37750715017318726,
"rewards/rejected": -0.23826603591442108,
"rewards/safe_rewards": 0.4204103946685791,
"rewards/unsafe_rewards": -0.14192816615104675,
"step": 550
},
{
"epoch": 0.3,
"learning_rate": 4.4077101704995163e-07,
"logits/chosen": -2.4157960414886475,
"logits/rejected": -2.1959304809570312,
"logps/chosen": -204.2389373779297,
"logps/rejected": -188.56707763671875,
"loss": 23.5436,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.36682039499282837,
"rewards/margins": -0.1311464011669159,
"rewards/rejected": -0.23567399382591248,
"rewards/safe_rewards": -0.4130277633666992,
"rewards/unsafe_rewards": -0.3206130862236023,
"step": 560
},
{
"epoch": 0.31,
"learning_rate": 4.3770162298528356e-07,
"logits/chosen": -2.4378573894500732,
"logits/rejected": -2.243499994277954,
"logps/chosen": -201.71572875976562,
"logps/rejected": -169.5461883544922,
"loss": 48.0924,
"rewards/accuracies": 0.565625011920929,
"rewards/chosen": 0.21430592238903046,
"rewards/margins": 0.6119717359542847,
"rewards/rejected": -0.397665798664093,
"rewards/safe_rewards": -0.27201324701309204,
"rewards/unsafe_rewards": 0.7006251811981201,
"step": 570
},
{
"epoch": 0.31,
"learning_rate": 4.3456596418799476e-07,
"logits/chosen": -2.383977174758911,
"logits/rejected": -2.204479694366455,
"logps/chosen": -208.63818359375,
"logps/rejected": -172.74533081054688,
"loss": 40.887,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.31667083501815796,
"rewards/margins": 0.20149526000022888,
"rewards/rejected": 0.11517556756734848,
"rewards/safe_rewards": 0.08531586080789566,
"rewards/unsafe_rewards": 0.5480257868766785,
"step": 580
},
{
"epoch": 0.32,
"learning_rate": 4.313651476468715e-07,
"logits/chosen": -2.452789783477783,
"logits/rejected": -2.2367706298828125,
"logps/chosen": -206.00991821289062,
"logps/rejected": -181.455810546875,
"loss": 17.7462,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.025053849443793297,
"rewards/margins": -0.13409826159477234,
"rewards/rejected": 0.1591521054506302,
"rewards/safe_rewards": -0.05954737588763237,
"rewards/unsafe_rewards": 0.10965506732463837,
"step": 590
},
{
"epoch": 0.32,
"learning_rate": 4.2810030335348693e-07,
"logits/chosen": -2.4099035263061523,
"logits/rejected": -2.218843936920166,
"logps/chosen": -218.79177856445312,
"logps/rejected": -168.67431640625,
"loss": 74.9431,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.23495905101299286,
"rewards/margins": -0.6415296196937561,
"rewards/rejected": 0.40657052397727966,
"rewards/safe_rewards": -0.5243301391601562,
"rewards/unsafe_rewards": 0.05441205948591232,
"step": 600
},
{
"epoch": 0.33,
"learning_rate": 4.2477258390327806e-07,
"logits/chosen": -2.4378225803375244,
"logits/rejected": -2.2049014568328857,
"logps/chosen": -190.23843383789062,
"logps/rejected": -167.7356719970703,
"loss": 24.3453,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.24785485863685608,
"rewards/margins": -0.3185795247554779,
"rewards/rejected": 0.07072468847036362,
"rewards/safe_rewards": -0.1916726529598236,
"rewards/unsafe_rewards": -0.30403703451156616,
"step": 610
},
{
"epoch": 0.33,
"learning_rate": 4.2138316408864197e-07,
"logits/chosen": -2.4828572273254395,
"logits/rejected": -2.251974105834961,
"logps/chosen": -195.9208221435547,
"logps/rejected": -162.9341278076172,
"loss": 47.0644,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.2951027750968933,
"rewards/margins": 0.2895166277885437,
"rewards/rejected": 0.005586123559623957,
"rewards/safe_rewards": 0.2697201073169708,
"rewards/unsafe_rewards": 0.3204854130744934,
"step": 620
},
{
"epoch": 0.34,
"learning_rate": 4.179332404841962e-07,
"logits/chosen": -2.4540035724639893,
"logits/rejected": -2.223843812942505,
"logps/chosen": -208.46463012695312,
"logps/rejected": -176.60848999023438,
"loss": 25.2961,
"rewards/accuracies": 0.46562498807907104,
"rewards/chosen": 0.14698375761508942,
"rewards/margins": 0.1325828731060028,
"rewards/rejected": 0.014400847256183624,
"rewards/safe_rewards": 0.04335422068834305,
"rewards/unsafe_rewards": 0.250613272190094,
"step": 630
},
{
"epoch": 0.34,
"learning_rate": 4.1442403102434954e-07,
"logits/chosen": -2.4651191234588623,
"logits/rejected": -2.252150535583496,
"logps/chosen": -212.79736328125,
"logps/rejected": -179.38711547851562,
"loss": 117.4084,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.48046717047691345,
"rewards/margins": -0.18567809462547302,
"rewards/rejected": -0.29478907585144043,
"rewards/safe_rewards": -0.6295033693313599,
"rewards/unsafe_rewards": -0.3314310312271118,
"step": 640
},
{
"epoch": 0.35,
"learning_rate": 4.108567745733318e-07,
"logits/chosen": -2.447937488555908,
"logits/rejected": -2.201697826385498,
"logps/chosen": -184.49168395996094,
"logps/rejected": -166.9139404296875,
"loss": 10.7524,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.005866925232112408,
"rewards/margins": 0.17292609810829163,
"rewards/rejected": -0.1787930279970169,
"rewards/safe_rewards": 0.03122568130493164,
"rewards/unsafe_rewards": -0.042959537357091904,
"step": 650
},
{
"epoch": 0.36,
"learning_rate": 4.0723273048783426e-07,
"logits/chosen": -2.44038462638855,
"logits/rejected": -2.2175660133361816,
"logps/chosen": -211.3206787109375,
"logps/rejected": -165.2122802734375,
"loss": 81.9566,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 0.5619795918464661,
"rewards/margins": 0.545897364616394,
"rewards/rejected": 0.016082104295492172,
"rewards/safe_rewards": 1.0618271827697754,
"rewards/unsafe_rewards": 0.06213190406560898,
"step": 660
},
{
"epoch": 0.36,
"learning_rate": 4.0355317817241697e-07,
"logits/chosen": -2.3970015048980713,
"logits/rejected": -2.163048267364502,
"logps/chosen": -229.952880859375,
"logps/rejected": -176.55599975585938,
"loss": 26.2558,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.38434115052223206,
"rewards/margins": 0.2878434658050537,
"rewards/rejected": 0.09649765491485596,
"rewards/safe_rewards": 0.5011194944381714,
"rewards/unsafe_rewards": 0.26756277680397034,
"step": 670
},
{
"epoch": 0.37,
"learning_rate": 3.998194166278367e-07,
"logits/chosen": -2.4422953128814697,
"logits/rejected": -2.2152860164642334,
"logps/chosen": -193.12109375,
"logps/rejected": -156.7648162841797,
"loss": 157.1721,
"rewards/accuracies": 0.4593749940395355,
"rewards/chosen": -0.3413035273551941,
"rewards/margins": -0.26979130506515503,
"rewards/rejected": -0.07151220738887787,
"rewards/safe_rewards": -0.49346867203712463,
"rewards/unsafe_rewards": -0.18913838267326355,
"step": 680
},
{
"epoch": 0.37,
"learning_rate": 3.9603276399245855e-07,
"logits/chosen": -2.4512076377868652,
"logits/rejected": -2.217556953430176,
"logps/chosen": -212.5731658935547,
"logps/rejected": -172.98239135742188,
"loss": 140.5213,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.045460961759090424,
"rewards/margins": 0.4976281523704529,
"rewards/rejected": -0.5430891513824463,
"rewards/safe_rewards": 0.1586223542690277,
"rewards/unsafe_rewards": -0.24954433739185333,
"step": 690
},
{
"epoch": 0.38,
"learning_rate": 3.9219455707691e-07,
"logits/chosen": -2.443801164627075,
"logits/rejected": -2.217026710510254,
"logps/chosen": -223.50064086914062,
"logps/rejected": -188.3572998046875,
"loss": 239.7127,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -1.1688416004180908,
"rewards/margins": -0.6104832291603088,
"rewards/rejected": -0.5583583116531372,
"rewards/safe_rewards": -0.5376420021057129,
"rewards/unsafe_rewards": -1.8000411987304688,
"step": 700
},
{
"epoch": 0.38,
"learning_rate": 3.883061508921439e-07,
"logits/chosen": -2.4577882289886475,
"logits/rejected": -2.289802074432373,
"logps/chosen": -199.79066467285156,
"logps/rejected": -191.25059509277344,
"loss": 127.1414,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.8074787855148315,
"rewards/margins": -0.5068421363830566,
"rewards/rejected": -0.30063679814338684,
"rewards/safe_rewards": -1.0235812664031982,
"rewards/unsafe_rewards": -0.5913764238357544,
"step": 710
},
{
"epoch": 0.39,
"learning_rate": 3.8436891817107555e-07,
"logits/chosen": -2.384692668914795,
"logits/rejected": -2.2363414764404297,
"logps/chosen": -192.9431915283203,
"logps/rejected": -173.0110626220703,
"loss": 88.3357,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": -0.845304012298584,
"rewards/margins": 0.22771115601062775,
"rewards/rejected": -1.0730152130126953,
"rewards/safe_rewards": -0.7142607569694519,
"rewards/unsafe_rewards": -0.9763473272323608,
"step": 720
},
{
"epoch": 0.39,
"learning_rate": 3.8038424888396414e-07,
"logits/chosen": -2.4334444999694824,
"logits/rejected": -2.2202000617980957,
"logps/chosen": -190.13265991210938,
"logps/rejected": -173.72535705566406,
"loss": 46.5741,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.9059289693832397,
"rewards/margins": 0.07719539105892181,
"rewards/rejected": -0.9831243753433228,
"rewards/safe_rewards": -1.5265599489212036,
"rewards/unsafe_rewards": -0.2852979004383087,
"step": 730
},
{
"epoch": 0.4,
"learning_rate": 3.763535497477079e-07,
"logits/chosen": -2.428952693939209,
"logits/rejected": -2.205458641052246,
"logps/chosen": -203.35873413085938,
"logps/rejected": -178.9982452392578,
"loss": 30.0399,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.06326188147068024,
"rewards/margins": 0.4762391149997711,
"rewards/rejected": -0.4129772186279297,
"rewards/safe_rewards": 0.01683131232857704,
"rewards/unsafe_rewards": 0.10969245433807373,
"step": 740
},
{
"epoch": 0.4,
"learning_rate": 3.7227824372922795e-07,
"logits/chosen": -2.4341301918029785,
"logits/rejected": -2.2008628845214844,
"logps/chosen": -189.18417358398438,
"logps/rejected": -167.0784454345703,
"loss": 12.3092,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.09529106318950653,
"rewards/margins": 0.02994244359433651,
"rewards/rejected": 0.06534863263368607,
"rewards/safe_rewards": 0.103404700756073,
"rewards/unsafe_rewards": 0.08717743307352066,
"step": 750
},
{
"epoch": 0.41,
"learning_rate": 3.681597695431148e-07,
"logits/chosen": -2.397660732269287,
"logits/rejected": -2.248548984527588,
"logps/chosen": -201.36961364746094,
"logps/rejected": -183.10923767089844,
"loss": 44.1826,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.05803655833005905,
"rewards/margins": 0.0876794308423996,
"rewards/rejected": -0.14571599662303925,
"rewards/safe_rewards": -0.19255781173706055,
"rewards/unsafe_rewards": 0.07648466527462006,
"step": 760
},
{
"epoch": 0.41,
"learning_rate": 3.639995811437159e-07,
"logits/chosen": -2.3755042552948,
"logits/rejected": -2.191373348236084,
"logps/chosen": -197.1927032470703,
"logps/rejected": -179.4755859375,
"loss": 154.7574,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": 0.14616283774375916,
"rewards/margins": 0.41009521484375,
"rewards/rejected": -0.26393240690231323,
"rewards/safe_rewards": 0.3775586485862732,
"rewards/unsafe_rewards": -0.08523297309875488,
"step": 770
},
{
"epoch": 0.42,
"learning_rate": 3.597991472118426e-07,
"logits/chosen": -2.4273521900177,
"logits/rejected": -2.192534923553467,
"logps/chosen": -206.8874053955078,
"logps/rejected": -176.24118041992188,
"loss": 36.5319,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.164406418800354,
"rewards/margins": -0.1354350596666336,
"rewards/rejected": -0.028971344232559204,
"rewards/safe_rewards": 0.2160978764295578,
"rewards/unsafe_rewards": -0.5449106097221375,
"step": 780
},
{
"epoch": 0.43,
"learning_rate": 3.5555995063627836e-07,
"logits/chosen": -2.415065050125122,
"logits/rejected": -2.194133758544922,
"logps/chosen": -222.50820922851562,
"logps/rejected": -191.37088012695312,
"loss": 16.1129,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.1634823977947235,
"rewards/margins": 0.29501140117645264,
"rewards/rejected": -0.1315290331840515,
"rewards/safe_rewards": -0.023002928122878075,
"rewards/unsafe_rewards": 0.34996774792671204,
"step": 790
},
{
"epoch": 0.43,
"learning_rate": 3.512834879902715e-07,
"logits/chosen": -2.446582794189453,
"logits/rejected": -2.2151386737823486,
"logps/chosen": -193.52993774414062,
"logps/rejected": -169.22207641601562,
"loss": 17.2298,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.15118324756622314,
"rewards/margins": 0.320087730884552,
"rewards/rejected": -0.16890448331832886,
"rewards/safe_rewards": 0.15818454325199127,
"rewards/unsafe_rewards": 0.14418195188045502,
"step": 800
},
{
"epoch": 0.44,
"learning_rate": 3.4697126900319616e-07,
"logits/chosen": -2.4158897399902344,
"logits/rejected": -2.180227756500244,
"logps/chosen": -200.93173217773438,
"logps/rejected": -167.99073791503906,
"loss": 22.7375,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.10535750538110733,
"rewards/margins": -0.006962819490581751,
"rewards/rejected": 0.11232032626867294,
"rewards/safe_rewards": -0.18741589784622192,
"rewards/unsafe_rewards": 0.3981309235095978,
"step": 810
},
{
"epoch": 0.44,
"learning_rate": 3.426248160275693e-07,
"logits/chosen": -2.4130988121032715,
"logits/rejected": -2.223747730255127,
"logps/chosen": -196.2846221923828,
"logps/rejected": -177.1783447265625,
"loss": 62.6098,
"rewards/accuracies": 0.47187501192092896,
"rewards/chosen": 0.08924231678247452,
"rewards/margins": -0.5832756757736206,
"rewards/rejected": 0.6725180745124817,
"rewards/safe_rewards": 0.08092136681079865,
"rewards/unsafe_rewards": 0.09756331145763397,
"step": 820
},
{
"epoch": 0.45,
"learning_rate": 3.3824566350161094e-07,
"logits/chosen": -2.4248764514923096,
"logits/rejected": -2.1799604892730713,
"logps/chosen": -211.0237274169922,
"logps/rejected": -165.1766815185547,
"loss": 8.7437,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.5879140496253967,
"rewards/margins": 0.23111946880817413,
"rewards/rejected": 0.356794536113739,
"rewards/safe_rewards": 0.5438351631164551,
"rewards/unsafe_rewards": 0.6319928765296936,
"step": 830
},
{
"epoch": 0.45,
"learning_rate": 3.338353574075381e-07,
"logits/chosen": -2.3919012546539307,
"logits/rejected": -2.212056875228882,
"logps/chosen": -188.0956268310547,
"logps/rejected": -166.2266387939453,
"loss": 23.4515,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 0.4208035469055176,
"rewards/margins": 0.025497043505311012,
"rewards/rejected": 0.3953064978122711,
"rewards/safe_rewards": 0.5599286556243896,
"rewards/unsafe_rewards": 0.2816784679889679,
"step": 840
},
{
"epoch": 0.46,
"learning_rate": 3.2939545472578314e-07,
"logits/chosen": -2.4613280296325684,
"logits/rejected": -2.1779792308807373,
"logps/chosen": -220.7722625732422,
"logps/rejected": -177.66567993164062,
"loss": 71.1367,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.6730166077613831,
"rewards/margins": 0.4062492251396179,
"rewards/rejected": 0.26676732301712036,
"rewards/safe_rewards": 0.1429261863231659,
"rewards/unsafe_rewards": 1.2031069993972778,
"step": 850
},
{
"epoch": 0.46,
"learning_rate": 3.2492752288532916e-07,
"logits/chosen": -2.4267163276672363,
"logits/rejected": -2.2031116485595703,
"logps/chosen": -192.3984832763672,
"logps/rejected": -171.2382354736328,
"loss": 46.0145,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.1684725284576416,
"rewards/margins": -0.08331739902496338,
"rewards/rejected": 0.25178998708724976,
"rewards/safe_rewards": 0.12440772354602814,
"rewards/unsafe_rewards": 0.21253737807273865,
"step": 860
},
{
"epoch": 0.47,
"learning_rate": 3.204331392103574e-07,
"logits/chosen": -2.483734369277954,
"logits/rejected": -2.2113869190216064,
"logps/chosen": -211.577880859375,
"logps/rejected": -163.4304656982422,
"loss": 155.5441,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": 0.19171550869941711,
"rewards/margins": -0.2531381845474243,
"rewards/rejected": 0.44485369324684143,
"rewards/safe_rewards": 0.10490121692419052,
"rewards/unsafe_rewards": 0.2785297632217407,
"step": 870
},
{
"epoch": 0.47,
"learning_rate": 3.159138903634006e-07,
"logits/chosen": -2.409116744995117,
"logits/rejected": -2.2290921211242676,
"logps/chosen": -203.94369506835938,
"logps/rejected": -173.5029754638672,
"loss": 9.3153,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": 0.4058583676815033,
"rewards/margins": 0.18885207176208496,
"rewards/rejected": 0.21700629591941833,
"rewards/safe_rewards": 0.3220987915992737,
"rewards/unsafe_rewards": 0.48961788415908813,
"step": 880
},
{
"epoch": 0.48,
"learning_rate": 3.1137137178519977e-07,
"logits/chosen": -2.4068942070007324,
"logits/rejected": -2.212474822998047,
"logps/chosen": -184.1978759765625,
"logps/rejected": -157.02920532226562,
"loss": 47.3581,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.041205547749996185,
"rewards/margins": -0.1057499423623085,
"rewards/rejected": 0.06454440206289291,
"rewards/safe_rewards": -0.18809974193572998,
"rewards/unsafe_rewards": 0.10568861663341522,
"step": 890
},
{
"epoch": 0.48,
"learning_rate": 3.068071871314626e-07,
"logits/chosen": -2.3744447231292725,
"logits/rejected": -2.1711204051971436,
"logps/chosen": -193.363525390625,
"logps/rejected": -157.98092651367188,
"loss": 36.4272,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.1274958997964859,
"rewards/margins": -0.06520196795463562,
"rewards/rejected": 0.19269786775112152,
"rewards/safe_rewards": 0.27050352096557617,
"rewards/unsafe_rewards": -0.015511776320636272,
"step": 900
},
{
"epoch": 0.49,
"learning_rate": 3.022229477067205e-07,
"logits/chosen": -2.4298009872436523,
"logits/rejected": -2.2137274742126465,
"logps/chosen": -212.06454467773438,
"logps/rejected": -162.7147216796875,
"loss": 22.3251,
"rewards/accuracies": 0.5406249761581421,
"rewards/chosen": 0.2818406820297241,
"rewards/margins": 0.29059693217277527,
"rewards/rejected": -0.008756252937018871,
"rewards/safe_rewards": 0.12103135883808136,
"rewards/unsafe_rewards": 0.4426499903202057,
"step": 910
},
{
"epoch": 0.49,
"learning_rate": 2.976202718954869e-07,
"logits/chosen": -2.4414241313934326,
"logits/rejected": -2.214113235473633,
"logps/chosen": -208.3417510986328,
"logps/rejected": -185.30526733398438,
"loss": 15.9322,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.002296045422554016,
"rewards/margins": 0.06845332682132721,
"rewards/rejected": -0.0661572739481926,
"rewards/safe_rewards": 0.11791107803583145,
"rewards/unsafe_rewards": -0.11331899464130402,
"step": 920
},
{
"epoch": 0.5,
"learning_rate": 2.930007845909146e-07,
"logits/chosen": -2.465981960296631,
"logits/rejected": -2.2979178428649902,
"logps/chosen": -220.63400268554688,
"logps/rejected": -194.15982055664062,
"loss": 20.6631,
"rewards/accuracies": 0.44062501192092896,
"rewards/chosen": 0.008412945084273815,
"rewards/margins": -0.06071774289011955,
"rewards/rejected": 0.06913068145513535,
"rewards/safe_rewards": 0.15175995230674744,
"rewards/unsafe_rewards": -0.13493406772613525,
"step": 930
},
{
"epoch": 0.51,
"learning_rate": 2.8836611662115634e-07,
"logits/chosen": -2.411681890487671,
"logits/rejected": -2.184065818786621,
"logps/chosen": -201.34774780273438,
"logps/rejected": -158.77896118164062,
"loss": 53.4563,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.30983632802963257,
"rewards/margins": 0.02700033411383629,
"rewards/rejected": 0.282835990190506,
"rewards/safe_rewards": 0.16020536422729492,
"rewards/unsafe_rewards": 0.4594673216342926,
"step": 940
},
{
"epoch": 0.51,
"learning_rate": 2.8371790417362986e-07,
"logits/chosen": -2.4363036155700684,
"logits/rejected": -2.2508435249328613,
"logps/chosen": -194.97052001953125,
"logps/rejected": -184.87435913085938,
"loss": 15.1437,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.3793017268180847,
"rewards/margins": 0.050546444952487946,
"rewards/rejected": 0.328755259513855,
"rewards/safe_rewards": 0.453242689371109,
"rewards/unsafe_rewards": 0.30536073446273804,
"step": 950
},
{
"epoch": 0.52,
"learning_rate": 2.7905778821739056e-07,
"logits/chosen": -2.430182456970215,
"logits/rejected": -2.181687116622925,
"logps/chosen": -207.5760955810547,
"logps/rejected": -161.82400512695312,
"loss": 36.165,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.2813587188720703,
"rewards/margins": -0.009190035052597523,
"rewards/rejected": 0.29054874181747437,
"rewards/safe_rewards": 0.5300852060317993,
"rewards/unsafe_rewards": 0.03263214975595474,
"step": 960
},
{
"epoch": 0.52,
"learning_rate": 2.74387413923817e-07,
"logits/chosen": -2.3779215812683105,
"logits/rejected": -2.2126498222351074,
"logps/chosen": -216.20980834960938,
"logps/rejected": -191.72068786621094,
"loss": 35.9574,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": 0.40009957551956177,
"rewards/margins": 0.11698710918426514,
"rewards/rejected": 0.28311246633529663,
"rewards/safe_rewards": 0.3544066548347473,
"rewards/unsafe_rewards": 0.4457924962043762,
"step": 970
},
{
"epoch": 0.53,
"learning_rate": 2.69708430085812e-07,
"logits/chosen": -2.442641496658325,
"logits/rejected": -2.2196171283721924,
"logps/chosen": -210.2590789794922,
"logps/rejected": -178.38427734375,
"loss": 143.7915,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.8005061149597168,
"rewards/margins": 0.7907932996749878,
"rewards/rejected": 0.009712839499115944,
"rewards/safe_rewards": 1.2023097276687622,
"rewards/unsafe_rewards": 0.39870262145996094,
"step": 980
},
{
"epoch": 0.53,
"learning_rate": 2.6502248853572504e-07,
"logits/chosen": -2.397225856781006,
"logits/rejected": -2.1839497089385986,
"logps/chosen": -191.41046142578125,
"logps/rejected": -162.9442901611328,
"loss": 12.2808,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.014359796419739723,
"rewards/margins": -0.05102130025625229,
"rewards/rejected": 0.06538109481334686,
"rewards/safe_rewards": -0.18064935505390167,
"rewards/unsafe_rewards": 0.20936894416809082,
"step": 990
},
{
"epoch": 0.54,
"learning_rate": 2.6033124356220325e-07,
"logits/chosen": -2.364447593688965,
"logits/rejected": -2.1461973190307617,
"logps/chosen": -199.1238555908203,
"logps/rejected": -159.5116729736328,
"loss": 34.7958,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.03608076646924019,
"rewards/margins": -0.22421510517597198,
"rewards/rejected": 0.1881343126296997,
"rewards/safe_rewards": 0.28278595209121704,
"rewards/unsafe_rewards": -0.3549474775791168,
"step": 1000
},
{
"epoch": 0.54,
"eval_logits/chosen": -2.0551209449768066,
"eval_logits/rejected": -1.7989723682403564,
"eval_logps/chosen": -130.9921875,
"eval_logps/rejected": -92.4808578491211,
"eval_loss": 0.7397361993789673,
"eval_rewards/accuracies": 0.5028436779975891,
"eval_rewards/chosen": -0.12634092569351196,
"eval_rewards/margins": 0.00263192574493587,
"eval_rewards/rejected": -0.1289728581905365,
"eval_rewards/safe_rewards": -0.12365306168794632,
"eval_rewards/unsafe_rewards": -0.1263761818408966,
"eval_runtime": 1869.3277,
"eval_samples_per_second": 17.677,
"eval_steps_per_second": 1.105,
"step": 1000
},
{
"epoch": 0.54,
"learning_rate": 2.55636351326173e-07,
"logits/chosen": -2.4121344089508057,
"logits/rejected": -2.2164716720581055,
"logps/chosen": -214.9409637451172,
"logps/rejected": -175.6654815673828,
"loss": 72.6154,
"rewards/accuracies": 0.53125,
"rewards/chosen": 0.20922379195690155,
"rewards/margins": -0.15886008739471436,
"rewards/rejected": 0.3680838942527771,
"rewards/safe_rewards": 0.6282423734664917,
"rewards/unsafe_rewards": -0.20979471504688263,
"step": 1010
},
{
"epoch": 0.55,
"learning_rate": 2.509394692761622e-07,
"logits/chosen": -2.39310884475708,
"logits/rejected": -2.1510488986968994,
"logps/chosen": -218.1635284423828,
"logps/rejected": -180.8001251220703,
"loss": 79.5377,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.365112841129303,
"rewards/margins": 0.28045016527175903,
"rewards/rejected": 0.08466275036334991,
"rewards/safe_rewards": 0.08056111633777618,
"rewards/unsafe_rewards": 0.649664580821991,
"step": 1020
},
{
"epoch": 0.55,
"learning_rate": 2.462422555631674e-07,
"logits/chosen": -2.4212746620178223,
"logits/rejected": -2.187579393386841,
"logps/chosen": -197.0594024658203,
"logps/rejected": -160.92257690429688,
"loss": 30.6297,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.516227126121521,
"rewards/margins": 0.2513945698738098,
"rewards/rejected": 0.2648325562477112,
"rewards/safe_rewards": 0.3707699179649353,
"rewards/unsafe_rewards": 0.6616843938827515,
"step": 1030
},
{
"epoch": 0.56,
"learning_rate": 2.415463684552728e-07,
"logits/chosen": -2.3526053428649902,
"logits/rejected": -2.168795585632324,
"logps/chosen": -187.2362518310547,
"logps/rejected": -158.90509033203125,
"loss": 16.6677,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": 0.2424931526184082,
"rewards/margins": -0.007419240660965443,
"rewards/rejected": 0.2499123513698578,
"rewards/safe_rewards": 0.3042396008968353,
"rewards/unsafe_rewards": 0.1807466745376587,
"step": 1040
},
{
"epoch": 0.56,
"learning_rate": 2.3685346575222807e-07,
"logits/chosen": -2.388552188873291,
"logits/rejected": -2.140934467315674,
"logps/chosen": -206.6807098388672,
"logps/rejected": -170.2689666748047,
"loss": 9.8385,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.22210577130317688,
"rewards/margins": -0.04671960324048996,
"rewards/rejected": 0.26882538199424744,
"rewards/safe_rewards": 0.3220168948173523,
"rewards/unsafe_rewards": 0.12219462543725967,
"step": 1050
},
{
"epoch": 0.57,
"learning_rate": 2.321652042001919e-07,
"logits/chosen": -2.390388011932373,
"logits/rejected": -2.10972261428833,
"logps/chosen": -209.7392120361328,
"logps/rejected": -183.13662719726562,
"loss": 9.9643,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.16931462287902832,
"rewards/margins": 0.08294131606817245,
"rewards/rejected": 0.08637328445911407,
"rewards/safe_rewards": 0.11830408871173859,
"rewards/unsafe_rewards": 0.22032511234283447,
"step": 1060
},
{
"epoch": 0.58,
"learning_rate": 2.2748323890684662e-07,
"logits/chosen": -2.3839309215545654,
"logits/rejected": -2.1623384952545166,
"logps/chosen": -198.49668884277344,
"logps/rejected": -169.58737182617188,
"loss": 11.8899,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.18948496878147125,
"rewards/margins": -0.021000146865844727,
"rewards/rejected": 0.2104850709438324,
"rewards/safe_rewards": 0.07809984683990479,
"rewards/unsafe_rewards": 0.3008700907230377,
"step": 1070
},
{
"epoch": 0.58,
"learning_rate": 2.2280922275709213e-07,
"logits/chosen": -2.402510166168213,
"logits/rejected": -2.1689133644104004,
"logps/chosen": -204.17782592773438,
"logps/rejected": -179.0993194580078,
"loss": 46.4965,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.004160255193710327,
"rewards/margins": 0.15431872010231018,
"rewards/rejected": -0.1584789752960205,
"rewards/safe_rewards": -0.011111170053482056,
"rewards/unsafe_rewards": 0.0027906596660614014,
"step": 1080
},
{
"epoch": 0.59,
"learning_rate": 2.1814480582952375e-07,
"logits/chosen": -2.410515308380127,
"logits/rejected": -2.184720993041992,
"logps/chosen": -203.24267578125,
"logps/rejected": -181.4256134033203,
"loss": 102.4097,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 0.45176035165786743,
"rewards/margins": 0.40241655707359314,
"rewards/rejected": 0.04934380576014519,
"rewards/safe_rewards": 0.11663278192281723,
"rewards/unsafe_rewards": 0.7868879437446594,
"step": 1090
},
{
"epoch": 0.59,
"learning_rate": 2.1349163481390187e-07,
"logits/chosen": -2.397282600402832,
"logits/rejected": -2.194654703140259,
"logps/chosen": -193.00746154785156,
"logps/rejected": -171.80690002441406,
"loss": 7.6309,
"rewards/accuracies": 0.515625,
"rewards/chosen": 0.22561688721179962,
"rewards/margins": 0.030238542705774307,
"rewards/rejected": 0.1953783482313156,
"rewards/safe_rewards": 0.10761779546737671,
"rewards/unsafe_rewards": 0.34361597895622253,
"step": 1100
},
{
"epoch": 0.6,
"learning_rate": 2.0885135242981647e-07,
"logits/chosen": -2.398287057876587,
"logits/rejected": -2.1465389728546143,
"logps/chosen": -213.0477752685547,
"logps/rejected": -162.02694702148438,
"loss": 7.6341,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": 0.43572482466697693,
"rewards/margins": 0.21323783695697784,
"rewards/rejected": 0.22248701751232147,
"rewards/safe_rewards": 0.5539884567260742,
"rewards/unsafe_rewards": 0.3174612522125244,
"step": 1110
},
{
"epoch": 0.6,
"learning_rate": 2.0422559684675494e-07,
"logits/chosen": -2.4309935569763184,
"logits/rejected": -2.1530261039733887,
"logps/chosen": -217.1282958984375,
"logps/rejected": -168.8966522216797,
"loss": 12.2909,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.23469574749469757,
"rewards/margins": -0.31583258509635925,
"rewards/rejected": 0.08113676309585571,
"rewards/safe_rewards": -0.3013322949409485,
"rewards/unsafe_rewards": -0.16805927455425262,
"step": 1120
},
{
"epoch": 0.61,
"learning_rate": 1.9961600110577457e-07,
"logits/chosen": -2.349834680557251,
"logits/rejected": -2.1397252082824707,
"logps/chosen": -207.71615600585938,
"logps/rejected": -192.11148071289062,
"loss": 83.4484,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.3409779667854309,
"rewards/margins": -0.09031665325164795,
"rewards/rejected": -0.25066131353378296,
"rewards/safe_rewards": -0.046450722962617874,
"rewards/unsafe_rewards": -0.6355050802230835,
"step": 1130
},
{
"epoch": 0.61,
"learning_rate": 1.950241925429867e-07,
"logits/chosen": -2.4354217052459717,
"logits/rejected": -2.2282073497772217,
"logps/chosen": -202.4095458984375,
"logps/rejected": -172.94119262695312,
"loss": 10.2059,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13358193635940552,
"rewards/margins": 0.09321316331624985,
"rewards/rejected": -0.22679507732391357,
"rewards/safe_rewards": -0.3772156536579132,
"rewards/unsafe_rewards": 0.110051728785038,
"step": 1140
},
{
"epoch": 0.62,
"learning_rate": 1.9045179221505495e-07,
"logits/chosen": -2.385145664215088,
"logits/rejected": -2.1816518306732178,
"logps/chosen": -222.2650909423828,
"logps/rejected": -183.89297485351562,
"loss": 70.6764,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.3544660210609436,
"rewards/margins": -0.31712788343429565,
"rewards/rejected": -0.037338145077228546,
"rewards/safe_rewards": -0.14638884365558624,
"rewards/unsafe_rewards": -0.5625432729721069,
"step": 1150
},
{
"epoch": 0.62,
"learning_rate": 1.8590041432690893e-07,
"logits/chosen": -2.3393194675445557,
"logits/rejected": -2.157670736312866,
"logps/chosen": -191.87765502929688,
"logps/rejected": -167.9620819091797,
"loss": 15.8742,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.08758888393640518,
"rewards/margins": 0.09322256594896317,
"rewards/rejected": -0.005633688066154718,
"rewards/safe_rewards": 0.3342171907424927,
"rewards/unsafe_rewards": -0.1590394526720047,
"step": 1160
},
{
"epoch": 0.63,
"learning_rate": 1.813716656618788e-07,
"logits/chosen": -2.371502637863159,
"logits/rejected": -2.179802417755127,
"logps/chosen": -185.43954467773438,
"logps/rejected": -159.95692443847656,
"loss": 31.7421,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 0.07055462151765823,
"rewards/margins": -0.016725819557905197,
"rewards/rejected": 0.08728043735027313,
"rewards/safe_rewards": 0.0999542772769928,
"rewards/unsafe_rewards": 0.041154973208904266,
"step": 1170
},
{
"epoch": 0.63,
"learning_rate": 1.7686714501444788e-07,
"logits/chosen": -2.408245086669922,
"logits/rejected": -2.111708402633667,
"logps/chosen": -220.0321807861328,
"logps/rejected": -177.4727783203125,
"loss": 30.3254,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.08441750705242157,
"rewards/margins": -0.06825534999370575,
"rewards/rejected": 0.15267284214496613,
"rewards/safe_rewards": -0.1991117298603058,
"rewards/unsafe_rewards": 0.36794668436050415,
"step": 1180
},
{
"epoch": 0.64,
"learning_rate": 1.7238844262582768e-07,
"logits/chosen": -2.3922970294952393,
"logits/rejected": -2.2358450889587402,
"logps/chosen": -214.66928100585938,
"logps/rejected": -185.44973754882812,
"loss": 25.5158,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.1254725158214569,
"rewards/margins": -0.16257312893867493,
"rewards/rejected": 0.28804564476013184,
"rewards/safe_rewards": -0.2854710817337036,
"rewards/unsafe_rewards": 0.5364161133766174,
"step": 1190
},
{
"epoch": 0.65,
"learning_rate": 1.679371396225504e-07,
"logits/chosen": -2.381708860397339,
"logits/rejected": -2.1555020809173584,
"logps/chosen": -204.30628967285156,
"logps/rejected": -180.3300323486328,
"loss": 22.5219,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.1503904014825821,
"rewards/margins": -0.30795037746429443,
"rewards/rejected": 0.45834073424339294,
"rewards/safe_rewards": 0.4240906834602356,
"rewards/unsafe_rewards": -0.12330994755029678,
"step": 1200
},
{
"epoch": 0.65,
"learning_rate": 1.6351480745828096e-07,
"logits/chosen": -2.4050099849700928,
"logits/rejected": -2.1825802326202393,
"logps/chosen": -198.45777893066406,
"logps/rejected": -172.17959594726562,
"loss": 37.0212,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.6139317750930786,
"rewards/margins": 0.5727913975715637,
"rewards/rejected": 0.04114028066396713,
"rewards/safe_rewards": 0.8607079386711121,
"rewards/unsafe_rewards": 0.3671554923057556,
"step": 1210
},
{
"epoch": 0.66,
"learning_rate": 1.5912300735904248e-07,
"logits/chosen": -2.4449119567871094,
"logits/rejected": -2.174882173538208,
"logps/chosen": -223.2691192626953,
"logps/rejected": -173.9079132080078,
"loss": 21.6142,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": 0.29134517908096313,
"rewards/margins": 0.21831652522087097,
"rewards/rejected": 0.07302861660718918,
"rewards/safe_rewards": 0.24711818993091583,
"rewards/unsafe_rewards": 0.33557215332984924,
"step": 1220
},
{
"epoch": 0.66,
"learning_rate": 1.5476328977205395e-07,
"logits/chosen": -2.383089542388916,
"logits/rejected": -2.1814026832580566,
"logps/chosen": -195.18643188476562,
"logps/rejected": -165.39920043945312,
"loss": 279.4912,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.45090776681900024,
"rewards/margins": -0.30230337381362915,
"rewards/rejected": -0.14860430359840393,
"rewards/safe_rewards": 0.5930166840553284,
"rewards/unsafe_rewards": -1.4948322772979736,
"step": 1230
},
{
"epoch": 0.67,
"learning_rate": 1.5043719381837112e-07,
"logits/chosen": -2.4133849143981934,
"logits/rejected": -2.2195193767547607,
"logps/chosen": -219.2970428466797,
"logps/rejected": -189.27816772460938,
"loss": 29.5997,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.13977238535881042,
"rewards/margins": 0.13596734404563904,
"rewards/rejected": 0.0038050352595746517,
"rewards/safe_rewards": 0.10270917415618896,
"rewards/unsafe_rewards": 0.17683559656143188,
"step": 1240
},
{
"epoch": 0.67,
"learning_rate": 1.461462467495284e-07,
"logits/chosen": -2.3971149921417236,
"logits/rejected": -2.2017760276794434,
"logps/chosen": -195.2748260498047,
"logps/rejected": -168.03221130371094,
"loss": 29.824,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.27122873067855835,
"rewards/margins": 0.40271610021591187,
"rewards/rejected": -0.13148736953735352,
"rewards/safe_rewards": 0.28425708413124084,
"rewards/unsafe_rewards": 0.25820040702819824,
"step": 1250
},
{
"epoch": 0.68,
"learning_rate": 1.4189196340836865e-07,
"logits/chosen": -2.4611334800720215,
"logits/rejected": -2.2188827991485596,
"logps/chosen": -199.0708465576172,
"logps/rejected": -166.50717163085938,
"loss": 42.7807,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.1606270670890808,
"rewards/margins": 0.13977651298046112,
"rewards/rejected": 0.020850548520684242,
"rewards/safe_rewards": 0.23887856304645538,
"rewards/unsafe_rewards": 0.08237558603286743,
"step": 1260
},
{
"epoch": 0.68,
"learning_rate": 1.3767584569425561e-07,
"logits/chosen": -2.5276553630828857,
"logits/rejected": -2.2806928157806396,
"logps/chosen": -214.76614379882812,
"logps/rejected": -178.0789031982422,
"loss": 7.7411,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.06127176806330681,
"rewards/margins": 0.017500977963209152,
"rewards/rejected": 0.043770790100097656,
"rewards/safe_rewards": 0.12450921535491943,
"rewards/unsafe_rewards": -0.0019656748045235872,
"step": 1270
},
{
"epoch": 0.69,
"learning_rate": 1.334993820328541e-07,
"logits/chosen": -2.461317539215088,
"logits/rejected": -2.2503418922424316,
"logps/chosen": -204.41952514648438,
"logps/rejected": -171.56008911132812,
"loss": 88.8508,
"rewards/accuracies": 0.453125,
"rewards/chosen": -0.142044335603714,
"rewards/margins": -0.2231227457523346,
"rewards/rejected": 0.0810784175992012,
"rewards/safe_rewards": -0.2867421507835388,
"rewards/unsafe_rewards": 0.002653457224369049,
"step": 1280
},
{
"epoch": 0.69,
"learning_rate": 1.2936404685066852e-07,
"logits/chosen": -2.3843283653259277,
"logits/rejected": -2.1979799270629883,
"logps/chosen": -205.46273803710938,
"logps/rejected": -180.91793823242188,
"loss": 66.3165,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.1233854666352272,
"rewards/margins": -0.03703648969531059,
"rewards/rejected": -0.0863489955663681,
"rewards/safe_rewards": -0.44312816858291626,
"rewards/unsafe_rewards": 0.19635725021362305,
"step": 1290
},
{
"epoch": 0.7,
"learning_rate": 1.252713000545221e-07,
"logits/chosen": -2.455895185470581,
"logits/rejected": -2.2126731872558594,
"logps/chosen": -211.90866088867188,
"logps/rejected": -172.7696533203125,
"loss": 8.9746,
"rewards/accuracies": 0.4375,
"rewards/chosen": 0.057679928839206696,
"rewards/margins": -0.1852763295173645,
"rewards/rejected": 0.2429562509059906,
"rewards/safe_rewards": 0.14237050712108612,
"rewards/unsafe_rewards": -0.027010658755898476,
"step": 1300
},
{
"epoch": 0.7,
"learning_rate": 1.2122258651616304e-07,
"logits/chosen": -2.445269823074341,
"logits/rejected": -2.224661350250244,
"logps/chosen": -209.90713500976562,
"logps/rejected": -173.6033935546875,
"loss": 63.3258,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -0.0619967095553875,
"rewards/margins": -0.29980406165122986,
"rewards/rejected": 0.23780739307403564,
"rewards/safe_rewards": -0.2617853283882141,
"rewards/unsafe_rewards": 0.13779191672801971,
"step": 1310
},
{
"epoch": 0.71,
"learning_rate": 1.1721933556217792e-07,
"logits/chosen": -2.4175376892089844,
"logits/rejected": -2.23214054107666,
"logps/chosen": -195.77786254882812,
"logps/rejected": -175.40225219726562,
"loss": 11.5399,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.27427220344543457,
"rewards/margins": 0.10898621380329132,
"rewards/rejected": 0.16528600454330444,
"rewards/safe_rewards": 0.21214981377124786,
"rewards/unsafe_rewards": 0.3363945782184601,
"step": 1320
},
{
"epoch": 0.72,
"learning_rate": 1.1326296046939333e-07,
"logits/chosen": -2.3801956176757812,
"logits/rejected": -2.162496328353882,
"logps/chosen": -184.91856384277344,
"logps/rejected": -153.4582061767578,
"loss": 63.4268,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.12642014026641846,
"rewards/margins": -0.3640880286693573,
"rewards/rejected": 0.23766788840293884,
"rewards/safe_rewards": -0.23899023234844208,
"rewards/unsafe_rewards": -0.013849982991814613,
"step": 1330
},
{
"epoch": 0.72,
"learning_rate": 1.0935485796594351e-07,
"logits/chosen": -2.4861056804656982,
"logits/rejected": -2.239741086959839,
"logps/chosen": -222.3768310546875,
"logps/rejected": -176.0164337158203,
"loss": 21.3914,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.18739810585975647,
"rewards/margins": 0.09455545991659164,
"rewards/rejected": 0.09284263849258423,
"rewards/safe_rewards": 0.19133667647838593,
"rewards/unsafe_rewards": 0.1834595501422882,
"step": 1340
},
{
"epoch": 0.73,
"learning_rate": 1.0549640773818028e-07,
"logits/chosen": -2.4289638996124268,
"logits/rejected": -2.237046003341675,
"logps/chosen": -204.95181274414062,
"logps/rejected": -158.824951171875,
"loss": 8.4938,
"rewards/accuracies": 0.484375,
"rewards/chosen": 0.01927146315574646,
"rewards/margins": -0.0841434970498085,
"rewards/rejected": 0.10341496765613556,
"rewards/safe_rewards": 0.04857074096798897,
"rewards/unsafe_rewards": -0.010027825832366943,
"step": 1350
},
{
"epoch": 0.73,
"learning_rate": 1.0168897194359921e-07,
"logits/chosen": -2.4466030597686768,
"logits/rejected": -2.194831132888794,
"logps/chosen": -222.03775024414062,
"logps/rejected": -183.56564331054688,
"loss": 82.0212,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.2902601659297943,
"rewards/margins": 0.6036115884780884,
"rewards/rejected": -0.3133513927459717,
"rewards/safe_rewards": 0.4282899498939514,
"rewards/unsafe_rewards": 0.152230367064476,
"step": 1360
},
{
"epoch": 0.74,
"learning_rate": 9.793389472995392e-08,
"logits/chosen": -2.4077987670898438,
"logits/rejected": -2.1739821434020996,
"logps/chosen": -209.699951171875,
"logps/rejected": -166.60293579101562,
"loss": 11.3477,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.014321346767246723,
"rewards/margins": -0.030286794528365135,
"rewards/rejected": 0.015965450555086136,
"rewards/safe_rewards": 0.03987512364983559,
"rewards/unsafe_rewards": -0.06851781159639359,
"step": 1370
},
{
"epoch": 0.74,
"learning_rate": 9.423250176072874e-08,
"logits/chosen": -2.401275634765625,
"logits/rejected": -2.192737340927124,
"logps/chosen": -181.48147583007812,
"logps/rejected": -154.23431396484375,
"loss": 15.9486,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.0320340096950531,
"rewards/margins": 0.20615582168102264,
"rewards/rejected": -0.23818981647491455,
"rewards/safe_rewards": 0.015104318037629128,
"rewards/unsafe_rewards": -0.07917235046625137,
"step": 1380
},
{
"epoch": 0.75,
"learning_rate": 9.058609974713654e-08,
"logits/chosen": -2.4539401531219482,
"logits/rejected": -2.1792826652526855,
"logps/chosen": -206.2873992919922,
"logps/rejected": -171.813232421875,
"loss": 27.4047,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.08179975301027298,
"rewards/margins": -0.07024437934160233,
"rewards/rejected": -0.011555373668670654,
"rewards/safe_rewards": 0.006575888488441706,
"rewards/unsafe_rewards": -0.17017540335655212,
"step": 1390
},
{
"epoch": 0.75,
"learning_rate": 8.699597598680753e-08,
"logits/chosen": -2.3884987831115723,
"logits/rejected": -2.1706833839416504,
"logps/chosen": -183.61544799804688,
"logps/rejected": -168.7871856689453,
"loss": 34.4575,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": 0.44646185636520386,
"rewards/margins": 0.1550489217042923,
"rewards/rejected": 0.29141297936439514,
"rewards/safe_rewards": 0.32389289140701294,
"rewards/unsafe_rewards": 0.5690308809280396,
"step": 1400
},
{
"epoch": 0.76,
"learning_rate": 8.346339790933166e-08,
"logits/chosen": -2.4721839427948,
"logits/rejected": -2.2297019958496094,
"logps/chosen": -200.0784149169922,
"logps/rejected": -159.7423858642578,
"loss": 6.7397,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.12982772290706635,
"rewards/margins": -0.028519075363874435,
"rewards/rejected": 0.15834678709506989,
"rewards/safe_rewards": -0.03010488487780094,
"rewards/unsafe_rewards": 0.2897603511810303,
"step": 1410
},
{
"epoch": 0.76,
"learning_rate": 7.998961262881506e-08,
"logits/chosen": -2.418222665786743,
"logits/rejected": -2.1581873893737793,
"logps/chosen": -220.6064453125,
"logps/rejected": -172.82266235351562,
"loss": 6.4288,
"rewards/accuracies": 0.559374988079071,
"rewards/chosen": 0.35179823637008667,
"rewards/margins": 0.23103070259094238,
"rewards/rejected": 0.12076754868030548,
"rewards/safe_rewards": 0.3257552981376648,
"rewards/unsafe_rewards": 0.37784117460250854,
"step": 1420
},
{
"epoch": 0.77,
"learning_rate": 7.657584650360846e-08,
"logits/chosen": -2.396697521209717,
"logits/rejected": -2.2003862857818604,
"logps/chosen": -199.44009399414062,
"logps/rejected": -172.6617431640625,
"loss": 35.7268,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.024801719933748245,
"rewards/margins": 0.26587414741516113,
"rewards/rejected": -0.2906758785247803,
"rewards/safe_rewards": 0.05263688042759895,
"rewards/unsafe_rewards": -0.10224030166864395,
"step": 1430
},
{
"epoch": 0.77,
"learning_rate": 7.322330470336313e-08,
"logits/chosen": -2.3913733959198,
"logits/rejected": -2.189946413040161,
"logps/chosen": -190.08120727539062,
"logps/rejected": -170.0216522216797,
"loss": 9.582,
"rewards/accuracies": 0.546875,
"rewards/chosen": -0.11784199625253677,
"rewards/margins": -0.031096214428544044,
"rewards/rejected": -0.08674577623605728,
"rewards/safe_rewards": -0.293480783700943,
"rewards/unsafe_rewards": 0.05779681354761124,
"step": 1440
},
{
"epoch": 0.78,
"learning_rate": 6.993317078356709e-08,
"logits/chosen": -2.3910608291625977,
"logits/rejected": -2.2192938327789307,
"logps/chosen": -199.07406616210938,
"logps/rejected": -170.1977996826172,
"loss": 45.9652,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.10676655918359756,
"rewards/margins": -0.08676379173994064,
"rewards/rejected": -0.02000277489423752,
"rewards/safe_rewards": -0.0720798522233963,
"rewards/unsafe_rewards": -0.14145328104496002,
"step": 1450
},
{
"epoch": 0.79,
"learning_rate": 6.67066062677118e-08,
"logits/chosen": -2.4357597827911377,
"logits/rejected": -2.2244791984558105,
"logps/chosen": -208.4618682861328,
"logps/rejected": -167.52764892578125,
"loss": 20.8808,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": 0.2480795830488205,
"rewards/margins": -0.08884197473526001,
"rewards/rejected": 0.3369216322898865,
"rewards/safe_rewards": 0.1537085473537445,
"rewards/unsafe_rewards": 0.34245067834854126,
"step": 1460
},
{
"epoch": 0.79,
"learning_rate": 6.354475023723685e-08,
"logits/chosen": -2.3960747718811035,
"logits/rejected": -2.1642906665802,
"logps/chosen": -216.65756225585938,
"logps/rejected": -171.6775665283203,
"loss": 59.1855,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.5152679681777954,
"rewards/margins": 0.2937913239002228,
"rewards/rejected": 0.22147664427757263,
"rewards/safe_rewards": 0.2597041726112366,
"rewards/unsafe_rewards": 0.7708317041397095,
"step": 1470
},
{
"epoch": 0.8,
"learning_rate": 6.044871892939746e-08,
"logits/chosen": -2.4158756732940674,
"logits/rejected": -2.2148139476776123,
"logps/chosen": -225.4951171875,
"logps/rejected": -189.0193328857422,
"loss": 30.3887,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.14366625249385834,
"rewards/margins": 0.25630486011505127,
"rewards/rejected": -0.11263859272003174,
"rewards/safe_rewards": 0.08603324741125107,
"rewards/unsafe_rewards": 0.20129923522472382,
"step": 1480
},
{
"epoch": 0.8,
"learning_rate": 5.741960534319676e-08,
"logits/chosen": -2.391890525817871,
"logits/rejected": -2.2089953422546387,
"logps/chosen": -190.7472686767578,
"logps/rejected": -160.5789031982422,
"loss": 29.5828,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.04550771787762642,
"rewards/margins": -0.21935884654521942,
"rewards/rejected": 0.1738511323928833,
"rewards/safe_rewards": -0.12179826200008392,
"rewards/unsafe_rewards": 0.030782824382185936,
"step": 1490
},
{
"epoch": 0.81,
"learning_rate": 5.44584788535217e-08,
"logits/chosen": -2.4144439697265625,
"logits/rejected": -2.209897994995117,
"logps/chosen": -213.13632202148438,
"logps/rejected": -176.97024536132812,
"loss": 15.9924,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.15160061419010162,
"rewards/margins": -0.01647660695016384,
"rewards/rejected": -0.13512399792671204,
"rewards/safe_rewards": -0.37716132402420044,
"rewards/unsafe_rewards": 0.07396010309457779,
"step": 1500
},
{
"epoch": 0.81,
"eval_logits/chosen": -2.084881544113159,
"eval_logits/rejected": -1.833509922027588,
"eval_logps/chosen": -131.02365112304688,
"eval_logps/rejected": -92.45955657958984,
"eval_loss": 0.6823093295097351,
"eval_rewards/accuracies": 0.4713214039802551,
"eval_rewards/chosen": -0.1577797532081604,
"eval_rewards/margins": -0.05011267587542534,
"eval_rewards/rejected": -0.10766706615686417,
"eval_rewards/safe_rewards": -0.15565218031406403,
"eval_rewards/unsafe_rewards": -0.15351000428199768,
"eval_runtime": 1880.4558,
"eval_samples_per_second": 17.572,
"eval_steps_per_second": 1.099,
"step": 1500
},
{
"epoch": 0.81,
"learning_rate": 5.156638483361933e-08,
"logits/chosen": -2.435300827026367,
"logits/rejected": -2.1943700313568115,
"logps/chosen": -206.97384643554688,
"logps/rejected": -174.73373413085938,
"loss": 6.0946,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": -0.2385288029909134,
"rewards/margins": 0.05665416270494461,
"rewards/rejected": -0.295183002948761,
"rewards/safe_rewards": -0.1594325453042984,
"rewards/unsafe_rewards": -0.3176250755786896,
"step": 1510
},
{
"epoch": 0.82,
"learning_rate": 4.8744344286046236e-08,
"logits/chosen": -2.4003233909606934,
"logits/rejected": -2.177899122238159,
"logps/chosen": -207.0956573486328,
"logps/rejected": -169.01504516601562,
"loss": 45.2569,
"rewards/accuracies": 0.4906249940395355,
"rewards/chosen": 0.26545172929763794,
"rewards/margins": 0.4598563611507416,
"rewards/rejected": -0.19440460205078125,
"rewards/safe_rewards": 0.313471257686615,
"rewards/unsafe_rewards": 0.21743226051330566,
"step": 1520
},
{
"epoch": 0.82,
"learning_rate": 4.599335348222169e-08,
"logits/chosen": -2.4335553646087646,
"logits/rejected": -2.246596574783325,
"logps/chosen": -207.1642608642578,
"logps/rejected": -186.18722534179688,
"loss": 5.5153,
"rewards/accuracies": 0.515625,
"rewards/chosen": -0.0844786986708641,
"rewards/margins": 0.10317480564117432,
"rewards/rejected": -0.1876535415649414,
"rewards/safe_rewards": -0.22987417876720428,
"rewards/unsafe_rewards": 0.06091681867837906,
"step": 1530
},
{
"epoch": 0.83,
"learning_rate": 4.331438361071163e-08,
"logits/chosen": -2.3511147499084473,
"logits/rejected": -2.206602096557617,
"logps/chosen": -212.7078857421875,
"logps/rejected": -194.2686767578125,
"loss": 21.5544,
"rewards/accuracies": 0.47187501192092896,
"rewards/chosen": -0.17899686098098755,
"rewards/margins": 0.10844133794307709,
"rewards/rejected": -0.28743821382522583,
"rewards/safe_rewards": -0.30973827838897705,
"rewards/unsafe_rewards": -0.04825545474886894,
"step": 1540
},
{
"epoch": 0.83,
"learning_rate": 4.0708380434367864e-08,
"logits/chosen": -2.4302127361297607,
"logits/rejected": -2.1903905868530273,
"logps/chosen": -199.45376586914062,
"logps/rejected": -171.4712677001953,
"loss": 11.7548,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.16563379764556885,
"rewards/margins": -0.16774305701255798,
"rewards/rejected": 0.0021092891693115234,
"rewards/safe_rewards": -0.17964015901088715,
"rewards/unsafe_rewards": -0.15162742137908936,
"step": 1550
},
{
"epoch": 0.84,
"learning_rate": 3.817626395644305e-08,
"logits/chosen": -2.428711414337158,
"logits/rejected": -2.232553005218506,
"logps/chosen": -206.1396942138672,
"logps/rejected": -177.48374938964844,
"loss": 20.469,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.2369929999113083,
"rewards/margins": -0.06113150715827942,
"rewards/rejected": -0.17586149275302887,
"rewards/safe_rewards": -0.15375518798828125,
"rewards/unsafe_rewards": -0.32023078203201294,
"step": 1560
},
{
"epoch": 0.84,
"learning_rate": 3.571892809580013e-08,
"logits/chosen": -2.395301580429077,
"logits/rejected": -2.1873881816864014,
"logps/chosen": -195.25765991210938,
"logps/rejected": -175.76754760742188,
"loss": 47.6931,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.030344385653734207,
"rewards/margins": 0.31907138228416443,
"rewards/rejected": -0.34941577911376953,
"rewards/safe_rewards": -0.09232734888792038,
"rewards/unsafe_rewards": 0.03163857385516167,
"step": 1570
},
{
"epoch": 0.85,
"learning_rate": 3.333724037132976e-08,
"logits/chosen": -2.4109716415405273,
"logits/rejected": -2.1891541481018066,
"logps/chosen": -198.6385040283203,
"logps/rejected": -170.99563598632812,
"loss": 5.2251,
"rewards/accuracies": 0.4281249940395355,
"rewards/chosen": -0.1846873015165329,
"rewards/margins": -0.06299707293510437,
"rewards/rejected": -0.12169022858142853,
"rewards/safe_rewards": -0.1805131733417511,
"rewards/unsafe_rewards": -0.18886145949363708,
"step": 1580
},
{
"epoch": 0.86,
"learning_rate": 3.1032041595688506e-08,
"logits/chosen": -2.3785929679870605,
"logits/rejected": -2.171466827392578,
"logps/chosen": -216.2442626953125,
"logps/rejected": -185.38406372070312,
"loss": 21.8766,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.09203994274139404,
"rewards/margins": -0.34605592489242554,
"rewards/rejected": 0.2540159523487091,
"rewards/safe_rewards": -0.014932965859770775,
"rewards/unsafe_rewards": -0.16914694011211395,
"step": 1590
},
{
"epoch": 0.86,
"learning_rate": 2.880414557846453e-08,
"logits/chosen": -2.4211525917053223,
"logits/rejected": -2.259765863418579,
"logps/chosen": -200.02296447753906,
"logps/rejected": -164.5922393798828,
"loss": 78.4789,
"rewards/accuracies": 0.503125011920929,
"rewards/chosen": -0.19741728901863098,
"rewards/margins": 0.0014421313535422087,
"rewards/rejected": -0.19885942339897156,
"rewards/safe_rewards": -0.07222781330347061,
"rewards/unsafe_rewards": -0.32260677218437195,
"step": 1600
},
{
"epoch": 0.87,
"learning_rate": 2.6654338838876662e-08,
"logits/chosen": -2.4327399730682373,
"logits/rejected": -2.1489098072052,
"logps/chosen": -206.57406616210938,
"logps/rejected": -162.18191528320312,
"loss": 31.8357,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -0.03170815855264664,
"rewards/margins": 0.1048579216003418,
"rewards/rejected": -0.13656608760356903,
"rewards/safe_rewards": -0.28655681014060974,
"rewards/unsafe_rewards": 0.22314047813415527,
"step": 1610
},
{
"epoch": 0.87,
"learning_rate": 2.4583380328107805e-08,
"logits/chosen": -2.4065799713134766,
"logits/rejected": -2.168668508529663,
"logps/chosen": -219.0827178955078,
"logps/rejected": -174.2585906982422,
"loss": 19.5756,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.28594768047332764,
"rewards/margins": -0.2515542805194855,
"rewards/rejected": -0.034393392503261566,
"rewards/safe_rewards": -0.40954461693763733,
"rewards/unsafe_rewards": -0.16235077381134033,
"step": 1620
},
{
"epoch": 0.88,
"learning_rate": 2.259200116137039e-08,
"logits/chosen": -2.381093740463257,
"logits/rejected": -2.1939659118652344,
"logps/chosen": -204.22921752929688,
"logps/rejected": -187.06576538085938,
"loss": 169.2986,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.762048602104187,
"rewards/margins": 0.7863933444023132,
"rewards/rejected": -0.024344712495803833,
"rewards/safe_rewards": 0.9233208894729614,
"rewards/unsafe_rewards": 0.6007765531539917,
"step": 1630
},
{
"epoch": 0.88,
"learning_rate": 2.068090435979958e-08,
"logits/chosen": -2.3571343421936035,
"logits/rejected": -2.1805238723754883,
"logps/chosen": -194.33248901367188,
"logps/rejected": -165.1163787841797,
"loss": 56.9496,
"rewards/accuracies": 0.484375,
"rewards/chosen": -0.3039317727088928,
"rewards/margins": -0.40204334259033203,
"rewards/rejected": 0.09811154752969742,
"rewards/safe_rewards": 0.22571036219596863,
"rewards/unsafe_rewards": -0.8335739374160767,
"step": 1640
},
{
"epoch": 0.89,
"learning_rate": 1.8850764602263423e-08,
"logits/chosen": -2.415358304977417,
"logits/rejected": -2.1450016498565674,
"logps/chosen": -200.4285888671875,
"logps/rejected": -173.5339813232422,
"loss": 27.6237,
"rewards/accuracies": 0.5531250238418579,
"rewards/chosen": -0.12354181706905365,
"rewards/margins": -0.11882360279560089,
"rewards/rejected": -0.0047182366251945496,
"rewards/safe_rewards": -0.05137089639902115,
"rewards/unsafe_rewards": -0.19571277499198914,
"step": 1650
},
{
"epoch": 0.89,
"learning_rate": 1.710222798718028e-08,
"logits/chosen": -2.4396722316741943,
"logits/rejected": -2.2350778579711914,
"logps/chosen": -203.3378448486328,
"logps/rejected": -178.9970245361328,
"loss": 22.9552,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.003960543777793646,
"rewards/margins": 0.2720267176628113,
"rewards/rejected": -0.2680661380290985,
"rewards/safe_rewards": 0.06284536421298981,
"rewards/unsafe_rewards": -0.054924286901950836,
"step": 1660
},
{
"epoch": 0.9,
"learning_rate": 1.5435911804424356e-08,
"logits/chosen": -2.4028568267822266,
"logits/rejected": -2.2349255084991455,
"logps/chosen": -229.4883270263672,
"logps/rejected": -185.6358642578125,
"loss": 31.5896,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.2171170711517334,
"rewards/margins": 0.017191190272569656,
"rewards/rejected": -0.23430824279785156,
"rewards/safe_rewards": 0.15241694450378418,
"rewards/unsafe_rewards": -0.586651086807251,
"step": 1670
},
{
"epoch": 0.9,
"learning_rate": 1.3852404317403199e-08,
"logits/chosen": -2.395153284072876,
"logits/rejected": -2.2008633613586426,
"logps/chosen": -220.5502471923828,
"logps/rejected": -194.44186401367188,
"loss": 26.1714,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -0.21780619025230408,
"rewards/margins": -0.0533115491271019,
"rewards/rejected": -0.16449466347694397,
"rewards/safe_rewards": -0.4852879047393799,
"rewards/unsafe_rewards": 0.04967564344406128,
"step": 1680
},
{
"epoch": 0.91,
"learning_rate": 1.235226455538113e-08,
"logits/chosen": -2.4504330158233643,
"logits/rejected": -2.2494871616363525,
"logps/chosen": -201.50564575195312,
"logps/rejected": -167.95364379882812,
"loss": 5.2467,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.037842657417058945,
"rewards/margins": -0.06888096779584885,
"rewards/rejected": 0.031038302928209305,
"rewards/safe_rewards": -0.037054188549518585,
"rewards/unsafe_rewards": -0.0386311374604702,
"step": 1690
},
{
"epoch": 0.91,
"learning_rate": 1.0936022116124321e-08,
"logits/chosen": -2.4290854930877686,
"logits/rejected": -2.204906463623047,
"logps/chosen": -199.54847717285156,
"logps/rejected": -165.02816772460938,
"loss": 41.987,
"rewards/accuracies": 0.5093749761581421,
"rewards/chosen": -0.15811631083488464,
"rewards/margins": -0.3054047226905823,
"rewards/rejected": 0.14728839695453644,
"rewards/safe_rewards": -0.39381590485572815,
"rewards/unsafe_rewards": 0.07758323848247528,
"step": 1700
},
{
"epoch": 0.92,
"learning_rate": 9.60417697893534e-09,
"logits/chosen": -2.4069314002990723,
"logits/rejected": -2.2242488861083984,
"logps/chosen": -199.82015991210938,
"logps/rejected": -173.9343719482422,
"loss": 22.6453,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": 0.1551128476858139,
"rewards/margins": -0.032198842614889145,
"rewards/rejected": 0.18731167912483215,
"rewards/safe_rewards": 0.21557751297950745,
"rewards/unsafe_rewards": 0.09464815258979797,
"step": 1710
},
{
"epoch": 0.93,
"learning_rate": 8.357199328144576e-09,
"logits/chosen": -2.4046077728271484,
"logits/rejected": -2.2161166667938232,
"logps/chosen": -216.55093383789062,
"logps/rejected": -187.6187286376953,
"loss": 56.4505,
"rewards/accuracies": 0.528124988079071,
"rewards/chosen": 0.08073421567678452,
"rewards/margins": -0.025125902146100998,
"rewards/rejected": 0.10586012899875641,
"rewards/safe_rewards": 0.07104392349720001,
"rewards/unsafe_rewards": 0.09042453020811081,
"step": 1720
},
{
"epoch": 0.93,
"learning_rate": 7.1955293871198144e-09,
"logits/chosen": -2.4008450508117676,
"logits/rejected": -2.261340379714966,
"logps/chosen": -187.19436645507812,
"logps/rejected": -169.91722106933594,
"loss": 18.4483,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.12119190394878387,
"rewards/margins": -0.34029996395111084,
"rewards/rejected": 0.21910807490348816,
"rewards/safe_rewards": -0.20904748141765594,
"rewards/unsafe_rewards": -0.03333630412817001,
"step": 1730
},
{
"epoch": 0.94,
"learning_rate": 6.119577262853254e-09,
"logits/chosen": -2.4227774143218994,
"logits/rejected": -2.1880428791046143,
"logps/chosen": -193.263671875,
"logps/rejected": -162.72183227539062,
"loss": 27.596,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.09984199702739716,
"rewards/margins": -0.025053083896636963,
"rewards/rejected": -0.07478892058134079,
"rewards/safe_rewards": -0.26086345314979553,
"rewards/unsafe_rewards": 0.06117943674325943,
"step": 1740
},
{
"epoch": 0.94,
"learning_rate": 5.129722801180542e-09,
"logits/chosen": -2.3443946838378906,
"logits/rejected": -2.1799635887145996,
"logps/chosen": -197.2679443359375,
"logps/rejected": -180.6214599609375,
"loss": 19.3736,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.3499890863895416,
"rewards/margins": -0.13464350998401642,
"rewards/rejected": -0.2153455764055252,
"rewards/safe_rewards": -0.1743151694536209,
"rewards/unsafe_rewards": -0.5256629586219788,
"step": 1750
},
{
"epoch": 0.95,
"learning_rate": 4.226315452682816e-09,
"logits/chosen": -2.413181781768799,
"logits/rejected": -2.187439441680908,
"logps/chosen": -196.54916381835938,
"logps/rejected": -173.30929565429688,
"loss": 31.0958,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.147268146276474,
"rewards/margins": 0.10999511182308197,
"rewards/rejected": -0.2572632431983948,
"rewards/safe_rewards": -0.11902125179767609,
"rewards/unsafe_rewards": -0.17551502585411072,
"step": 1760
},
{
"epoch": 0.95,
"learning_rate": 3.4096741493194193e-09,
"logits/chosen": -2.443580389022827,
"logits/rejected": -2.2651684284210205,
"logps/chosen": -199.41049194335938,
"logps/rejected": -173.95718383789062,
"loss": 9.8716,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": 0.056076280772686005,
"rewards/margins": 0.028754467144608498,
"rewards/rejected": 0.027321819216012955,
"rewards/safe_rewards": 0.034864675253629684,
"rewards/unsafe_rewards": 0.07728789001703262,
"step": 1770
},
{
"epoch": 0.96,
"learning_rate": 2.6800871918346846e-09,
"logits/chosen": -2.4057886600494385,
"logits/rejected": -2.155165672302246,
"logps/chosen": -203.48025512695312,
"logps/rejected": -172.94015502929688,
"loss": 41.8074,
"rewards/accuracies": 0.5218750238418579,
"rewards/chosen": -0.21421018242835999,
"rewards/margins": -0.43174242973327637,
"rewards/rejected": 0.2175322324037552,
"rewards/safe_rewards": -0.06811753660440445,
"rewards/unsafe_rewards": -0.36030280590057373,
"step": 1780
},
{
"epoch": 0.96,
"learning_rate": 2.0378121479783796e-09,
"logits/chosen": -2.389869213104248,
"logits/rejected": -2.1555044651031494,
"logps/chosen": -196.02059936523438,
"logps/rejected": -167.43655395507812,
"loss": 61.0971,
"rewards/accuracies": 0.4781250059604645,
"rewards/chosen": 0.008689677342772484,
"rewards/margins": 0.18058201670646667,
"rewards/rejected": -0.17189235985279083,
"rewards/safe_rewards": 0.042548321187496185,
"rewards/unsafe_rewards": -0.025168979540467262,
"step": 1790
},
{
"epoch": 0.97,
"learning_rate": 1.4830757615760247e-09,
"logits/chosen": -2.4289557933807373,
"logits/rejected": -2.1850333213806152,
"logps/chosen": -207.24124145507812,
"logps/rejected": -170.49305725097656,
"loss": 144.1229,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.07967615127563477,
"rewards/margins": 0.1154344230890274,
"rewards/rejected": -0.19511058926582336,
"rewards/safe_rewards": -0.10584060847759247,
"rewards/unsafe_rewards": -0.05351167917251587,
"step": 1800
},
{
"epoch": 0.97,
"learning_rate": 1.0160738724809548e-09,
"logits/chosen": -2.4409990310668945,
"logits/rejected": -2.207919120788574,
"logps/chosen": -196.10601806640625,
"logps/rejected": -171.36843872070312,
"loss": 18.7773,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.08281825482845306,
"rewards/margins": 0.23606376349925995,
"rewards/rejected": -0.3188820481300354,
"rewards/safe_rewards": 0.05616650730371475,
"rewards/unsafe_rewards": -0.2218029946088791,
"step": 1810
},
{
"epoch": 0.98,
"learning_rate": 6.369713474366212e-10,
"logits/chosen": -2.420626640319824,
"logits/rejected": -2.1977345943450928,
"logps/chosen": -219.4222869873047,
"logps/rejected": -181.95010375976562,
"loss": 17.5266,
"rewards/accuracies": 0.49687498807907104,
"rewards/chosen": -0.16265834867954254,
"rewards/margins": -0.11877261102199554,
"rewards/rejected": -0.0438857302069664,
"rewards/safe_rewards": -0.035406678915023804,
"rewards/unsafe_rewards": -0.2899099886417389,
"step": 1820
},
{
"epoch": 0.98,
"learning_rate": 3.459020218731512e-10,
"logits/chosen": -2.4327456951141357,
"logits/rejected": -2.220496654510498,
"logps/chosen": -202.61898803710938,
"logps/rejected": -167.197021484375,
"loss": 43.7242,
"rewards/accuracies": 0.534375011920929,
"rewards/chosen": 0.15349379181861877,
"rewards/margins": 0.14935937523841858,
"rewards/rejected": 0.004134447779506445,
"rewards/safe_rewards": 0.08035097271203995,
"rewards/unsafe_rewards": 0.2266366183757782,
"step": 1830
},
{
"epoch": 0.99,
"learning_rate": 1.429686526593088e-10,
"logits/chosen": -2.398090124130249,
"logits/rejected": -2.192744255065918,
"logps/chosen": -206.80520629882812,
"logps/rejected": -175.9212646484375,
"loss": 23.3409,
"rewards/accuracies": 0.453125,
"rewards/chosen": 0.3794136941432953,
"rewards/margins": 0.2774004638195038,
"rewards/rejected": 0.10201327502727509,
"rewards/safe_rewards": 0.5806846022605896,
"rewards/unsafe_rewards": 0.17814283072948456,
"step": 1840
},
{
"epoch": 1.0,
"learning_rate": 2.824288182584622e-11,
"logits/chosen": -2.4241063594818115,
"logits/rejected": -2.2421114444732666,
"logps/chosen": -206.7459716796875,
"logps/rejected": -168.176513671875,
"loss": 19.5817,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.24208331108093262,
"rewards/margins": -0.22288334369659424,
"rewards/rejected": -0.019199971109628677,
"rewards/safe_rewards": 0.039939720183610916,
"rewards/unsafe_rewards": -0.5241063237190247,
"step": 1850
},
{
"epoch": 1.0,
"step": 1858,
"total_flos": 0.0,
"train_loss": 67.04043597990268,
"train_runtime": 46860.0347,
"train_samples_per_second": 1.269,
"train_steps_per_second": 0.04
}
],
"logging_steps": 10,
"max_steps": 1858,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}