Falcon-7B-Instruct-ORPO-SALT-HALF / trainer_state.json
chchen's picture
End of training
5677209 verified
{
"best_metric": 1.4967381954193115,
"best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo-salt-half/checkpoint-1500",
"epoch": 2.9974597798475866,
"eval_steps": 500,
"global_step": 1770,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01693480101608806,
"grad_norm": 0.6027132868766785,
"learning_rate": 4.999614014035063e-06,
"logits/chosen": -14.201833724975586,
"logits/rejected": -14.270045280456543,
"logps/chosen": -1.961771011352539,
"logps/rejected": -2.1497561931610107,
"loss": 2.0361,
"odds_ratio_loss": 0.7429978251457214,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.19617711007595062,
"rewards/margins": 0.01879853382706642,
"rewards/rejected": -0.21497564017772675,
"sft_loss": 1.961771011352539,
"step": 10
},
{
"epoch": 0.03386960203217612,
"grad_norm": 0.4791746735572815,
"learning_rate": 4.998440543386042e-06,
"logits/chosen": -14.17326545715332,
"logits/rejected": -14.03160572052002,
"logps/chosen": -1.9260406494140625,
"logps/rejected": -2.0053372383117676,
"loss": 2.0019,
"odds_ratio_loss": 0.7586489915847778,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.19260406494140625,
"rewards/margins": 0.007929656654596329,
"rewards/rejected": -0.20053371787071228,
"sft_loss": 1.9260406494140625,
"step": 20
},
{
"epoch": 0.05080440304826418,
"grad_norm": 0.3785243630409241,
"learning_rate": 4.996479918381253e-06,
"logits/chosen": -14.245376586914062,
"logits/rejected": -14.222900390625,
"logps/chosen": -1.8398857116699219,
"logps/rejected": -1.8666032552719116,
"loss": 1.9146,
"odds_ratio_loss": 0.7475350499153137,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.18398860096931458,
"rewards/margins": 0.00267172628082335,
"rewards/rejected": -0.18666031956672668,
"sft_loss": 1.8398857116699219,
"step": 30
},
{
"epoch": 0.06773920406435224,
"grad_norm": 0.637917697429657,
"learning_rate": 4.993732756731818e-06,
"logits/chosen": -14.213427543640137,
"logits/rejected": -14.385249137878418,
"logps/chosen": -1.8162885904312134,
"logps/rejected": -1.9234222173690796,
"loss": 1.889,
"odds_ratio_loss": 0.7271509766578674,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.18162885308265686,
"rewards/margins": 0.010713383555412292,
"rewards/rejected": -0.19234223663806915,
"sft_loss": 1.8162885904312134,
"step": 40
},
{
"epoch": 0.0846740050804403,
"grad_norm": 0.6790710091590881,
"learning_rate": 4.9901999239537345e-06,
"logits/chosen": -14.203392028808594,
"logits/rejected": -14.118731498718262,
"logps/chosen": -1.9451831579208374,
"logps/rejected": -1.9480127096176147,
"loss": 2.0255,
"odds_ratio_loss": 0.8034948110580444,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.1945182979106903,
"rewards/margins": 0.0002829456643667072,
"rewards/rejected": -0.19480125606060028,
"sft_loss": 1.9451831579208374,
"step": 50
},
{
"epoch": 0.10160880609652836,
"grad_norm": 0.38820621371269226,
"learning_rate": 4.985882533095186e-06,
"logits/chosen": -14.125239372253418,
"logits/rejected": -14.241134643554688,
"logps/chosen": -1.7669858932495117,
"logps/rejected": -1.818566918373108,
"loss": 1.8465,
"odds_ratio_loss": 0.7950754761695862,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.17669859528541565,
"rewards/margins": 0.005158091429620981,
"rewards/rejected": -0.1818566769361496,
"sft_loss": 1.7669858932495117,
"step": 60
},
{
"epoch": 0.11854360711261643,
"grad_norm": 1.485378384590149,
"learning_rate": 4.9807819443858705e-06,
"logits/chosen": -14.16772174835205,
"logits/rejected": -14.14952564239502,
"logps/chosen": -1.7974742650985718,
"logps/rejected": -1.8876419067382812,
"loss": 1.8722,
"odds_ratio_loss": 0.7475281953811646,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.17974743247032166,
"rewards/margins": 0.009016749449074268,
"rewards/rejected": -0.18876421451568604,
"sft_loss": 1.7974742650985718,
"step": 70
},
{
"epoch": 0.1354784081287045,
"grad_norm": 0.6158199310302734,
"learning_rate": 4.9748997648084404e-06,
"logits/chosen": -14.09917163848877,
"logits/rejected": -14.224530220031738,
"logps/chosen": -1.7899717092514038,
"logps/rejected": -1.8508037328720093,
"loss": 1.8688,
"odds_ratio_loss": 0.7882196307182312,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.17899715900421143,
"rewards/margins": 0.0060832141898572445,
"rewards/rejected": -0.1850803941488266,
"sft_loss": 1.7899717092514038,
"step": 80
},
{
"epoch": 0.15241320914479256,
"grad_norm": 1.0399421453475952,
"learning_rate": 4.96823784759222e-06,
"logits/chosen": -14.11219596862793,
"logits/rejected": -14.099919319152832,
"logps/chosen": -1.7365163564682007,
"logps/rejected": -1.7418320178985596,
"loss": 1.8161,
"odds_ratio_loss": 0.7956770658493042,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.17365165054798126,
"rewards/margins": 0.0005315736052580178,
"rewards/rejected": -0.174183189868927,
"sft_loss": 1.7365163564682007,
"step": 90
},
{
"epoch": 0.1693480101608806,
"grad_norm": 0.9482620358467102,
"learning_rate": 4.960798291629323e-06,
"logits/chosen": -14.198771476745605,
"logits/rejected": -14.24067497253418,
"logps/chosen": -1.8019222021102905,
"logps/rejected": -1.7944272756576538,
"loss": 1.8785,
"odds_ratio_loss": 0.765292227268219,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.18019220232963562,
"rewards/margins": -0.000749480735976249,
"rewards/rejected": -0.17944273352622986,
"sft_loss": 1.8019222021102905,
"step": 100
},
{
"epoch": 0.18628281117696868,
"grad_norm": 1.496517539024353,
"learning_rate": 4.952583440813383e-06,
"logits/chosen": -14.270334243774414,
"logits/rejected": -14.252988815307617,
"logps/chosen": -1.8082859516143799,
"logps/rejected": -1.8689155578613281,
"loss": 1.885,
"odds_ratio_loss": 0.7666890025138855,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.18082860112190247,
"rewards/margins": 0.0060629709623754025,
"rewards/rejected": -0.186891570687294,
"sft_loss": 1.8082859516143799,
"step": 110
},
{
"epoch": 0.20321761219305673,
"grad_norm": 0.8162474036216736,
"learning_rate": 4.943595883301086e-06,
"logits/chosen": -14.396245002746582,
"logits/rejected": -14.407267570495605,
"logps/chosen": -1.8202846050262451,
"logps/rejected": -1.8238685131072998,
"loss": 1.8966,
"odds_ratio_loss": 0.7631626129150391,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.18202845752239227,
"rewards/margins": 0.0003583906218409538,
"rewards/rejected": -0.1823868602514267,
"sft_loss": 1.8202846050262451,
"step": 120
},
{
"epoch": 0.2201524132091448,
"grad_norm": 0.9815341234207153,
"learning_rate": 4.933838450696757e-06,
"logits/chosen": -14.14527702331543,
"logits/rejected": -14.11426830291748,
"logps/chosen": -1.6691076755523682,
"logps/rejected": -1.7151718139648438,
"loss": 1.7441,
"odds_ratio_loss": 0.7502495050430298,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.16691075265407562,
"rewards/margins": 0.004606431350111961,
"rewards/rejected": -0.17151719331741333,
"sft_loss": 1.6691076755523682,
"step": 130
},
{
"epoch": 0.23708721422523285,
"grad_norm": 1.7414650917053223,
"learning_rate": 4.923314217160234e-06,
"logits/chosen": -14.14660358428955,
"logits/rejected": -14.196474075317383,
"logps/chosen": -1.7544286251068115,
"logps/rejected": -1.7217376232147217,
"loss": 1.8341,
"odds_ratio_loss": 0.7964597344398499,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.1754428595304489,
"rewards/margins": -0.003269097302109003,
"rewards/rejected": -0.17217376828193665,
"sft_loss": 1.7544286251068115,
"step": 140
},
{
"epoch": 0.2540220152413209,
"grad_norm": 0.6009025573730469,
"learning_rate": 4.9120264984383285e-06,
"logits/chosen": -14.155496597290039,
"logits/rejected": -14.008768081665039,
"logps/chosen": -1.5715187788009644,
"logps/rejected": -1.608656644821167,
"loss": 1.6472,
"odds_ratio_loss": 0.7572886347770691,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15715190768241882,
"rewards/margins": 0.003713789861649275,
"rewards/rejected": -0.1608656644821167,
"sft_loss": 1.5715187788009644,
"step": 150
},
{
"epoch": 0.270956816257409,
"grad_norm": 0.7238659858703613,
"learning_rate": 4.899978850820176e-06,
"logits/chosen": -14.257448196411133,
"logits/rejected": -14.187673568725586,
"logps/chosen": -1.7162948846817017,
"logps/rejected": -1.7536369562149048,
"loss": 1.7925,
"odds_ratio_loss": 0.7625432014465332,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.17162947356700897,
"rewards/margins": 0.0037342351861298084,
"rewards/rejected": -0.1753637045621872,
"sft_loss": 1.7162948846817017,
"step": 160
},
{
"epoch": 0.28789161727349705,
"grad_norm": 0.9593597650527954,
"learning_rate": 4.887175070016795e-06,
"logits/chosen": -14.389033317565918,
"logits/rejected": -14.29101276397705,
"logps/chosen": -1.514937162399292,
"logps/rejected": -1.5708329677581787,
"loss": 1.5883,
"odds_ratio_loss": 0.7331644296646118,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15149369835853577,
"rewards/margins": 0.005589589010924101,
"rewards/rejected": -0.15708328783512115,
"sft_loss": 1.514937162399292,
"step": 170
},
{
"epoch": 0.3048264182895851,
"grad_norm": 1.0034801959991455,
"learning_rate": 4.873619189965217e-06,
"logits/chosen": -14.039607048034668,
"logits/rejected": -14.147199630737305,
"logps/chosen": -1.5949114561080933,
"logps/rejected": -1.746072769165039,
"loss": 1.6635,
"odds_ratio_loss": 0.6863279938697815,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.1594911515712738,
"rewards/margins": 0.015116140246391296,
"rewards/rejected": -0.1746072769165039,
"sft_loss": 1.5949114561080933,
"step": 180
},
{
"epoch": 0.32176121930567314,
"grad_norm": 0.89156574010849,
"learning_rate": 4.859315481557563e-06,
"logits/chosen": -14.219070434570312,
"logits/rejected": -14.151147842407227,
"logps/chosen": -1.5719926357269287,
"logps/rejected": -1.6470130681991577,
"loss": 1.6487,
"odds_ratio_loss": 0.767541766166687,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.15719927847385406,
"rewards/margins": 0.0075020515359938145,
"rewards/rejected": -0.16470131278038025,
"sft_loss": 1.5719926357269287,
"step": 190
},
{
"epoch": 0.3386960203217612,
"grad_norm": 0.587933361530304,
"learning_rate": 4.84426845129546e-06,
"logits/chosen": -14.344035148620605,
"logits/rejected": -14.321207046508789,
"logps/chosen": -1.6490224599838257,
"logps/rejected": -1.638528823852539,
"loss": 1.7261,
"odds_ratio_loss": 0.7703002095222473,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": -0.1649022400379181,
"rewards/margins": -0.0010493483860045671,
"rewards/rejected": -0.16385288536548615,
"sft_loss": 1.6490224599838257,
"step": 200
},
{
"epoch": 0.3556308213378493,
"grad_norm": 2.0271973609924316,
"learning_rate": 4.828482839870233e-06,
"logits/chosen": -14.22668170928955,
"logits/rejected": -14.1005220413208,
"logps/chosen": -1.5818629264831543,
"logps/rejected": -1.5753711462020874,
"loss": 1.6618,
"odds_ratio_loss": 0.7996558547019958,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15818628668785095,
"rewards/margins": -0.0006491712993010879,
"rewards/rejected": -0.15753711760044098,
"sft_loss": 1.5818629264831543,
"step": 210
},
{
"epoch": 0.37256562235393736,
"grad_norm": 0.809647262096405,
"learning_rate": 4.811963620669314e-06,
"logits/chosen": -14.262086868286133,
"logits/rejected": -14.35071849822998,
"logps/chosen": -1.5450419187545776,
"logps/rejected": -1.599981665611267,
"loss": 1.6187,
"odds_ratio_loss": 0.7366654276847839,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.15450419485569,
"rewards/margins": 0.005493967793881893,
"rewards/rejected": -0.15999816358089447,
"sft_loss": 1.5450419187545776,
"step": 220
},
{
"epoch": 0.3895004233700254,
"grad_norm": 0.9206905961036682,
"learning_rate": 4.794715998209328e-06,
"logits/chosen": -14.026702880859375,
"logits/rejected": -14.009126663208008,
"logps/chosen": -1.5401651859283447,
"logps/rejected": -1.6259161233901978,
"loss": 1.6132,
"odds_ratio_loss": 0.7308396100997925,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.15401650965213776,
"rewards/margins": 0.008575108833611012,
"rewards/rejected": -0.1625916212797165,
"sft_loss": 1.5401651859283447,
"step": 230
},
{
"epoch": 0.40643522438611346,
"grad_norm": 1.0553600788116455,
"learning_rate": 4.7767454064963724e-06,
"logits/chosen": -14.294774055480957,
"logits/rejected": -14.33879280090332,
"logps/chosen": -1.571942925453186,
"logps/rejected": -1.6219526529312134,
"loss": 1.6455,
"odds_ratio_loss": 0.7359451651573181,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.15719430148601532,
"rewards/margins": 0.005000968463718891,
"rewards/rejected": -0.16219528019428253,
"sft_loss": 1.571942925453186,
"step": 240
},
{
"epoch": 0.42337002540220153,
"grad_norm": 1.133743166923523,
"learning_rate": 4.758057507313987e-06,
"logits/chosen": -14.3100004196167,
"logits/rejected": -14.21064567565918,
"logps/chosen": -1.4966617822647095,
"logps/rejected": -1.5281431674957275,
"loss": 1.5708,
"odds_ratio_loss": 0.7413426041603088,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1496661901473999,
"rewards/margins": 0.0031481466721743345,
"rewards/rejected": -0.1528143286705017,
"sft_loss": 1.4966617822647095,
"step": 250
},
{
"epoch": 0.4403048264182896,
"grad_norm": 1.6055690050125122,
"learning_rate": 4.73865818843936e-06,
"logits/chosen": -14.18690299987793,
"logits/rejected": -14.250242233276367,
"logps/chosen": -1.5969842672348022,
"logps/rejected": -1.7042526006698608,
"loss": 1.6715,
"odds_ratio_loss": 0.744690477848053,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1596984565258026,
"rewards/margins": 0.010726812295615673,
"rewards/rejected": -0.17042526602745056,
"sft_loss": 1.5969842672348022,
"step": 260
},
{
"epoch": 0.4572396274343776,
"grad_norm": 0.7864425778388977,
"learning_rate": 4.718553561788339e-06,
"logits/chosen": -14.111845016479492,
"logits/rejected": -14.31633186340332,
"logps/chosen": -1.487687110900879,
"logps/rejected": -1.5424432754516602,
"loss": 1.5596,
"odds_ratio_loss": 0.7193279266357422,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14876870810985565,
"rewards/margins": 0.00547564122825861,
"rewards/rejected": -0.1542443484067917,
"sft_loss": 1.487687110900879,
"step": 270
},
{
"epoch": 0.4741744284504657,
"grad_norm": 1.302501916885376,
"learning_rate": 4.697749961489822e-06,
"logits/chosen": -14.314417839050293,
"logits/rejected": -14.266924858093262,
"logps/chosen": -1.6229807138442993,
"logps/rejected": -1.7468087673187256,
"loss": 1.6957,
"odds_ratio_loss": 0.7271685004234314,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1622980535030365,
"rewards/margins": 0.012382803484797478,
"rewards/rejected": -0.17468087375164032,
"sft_loss": 1.6229807138442993,
"step": 280
},
{
"epoch": 0.4911092294665538,
"grad_norm": 0.9335818886756897,
"learning_rate": 4.67625394189013e-06,
"logits/chosen": -14.308195114135742,
"logits/rejected": -14.264862060546875,
"logps/chosen": -1.47157883644104,
"logps/rejected": -1.6349776983261108,
"loss": 1.5387,
"odds_ratio_loss": 0.6707261204719543,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.14715787768363953,
"rewards/margins": 0.01633988879621029,
"rewards/rejected": -0.16349777579307556,
"sft_loss": 1.47157883644104,
"step": 290
},
{
"epoch": 0.5080440304826418,
"grad_norm": 1.5830973386764526,
"learning_rate": 4.654072275488016e-06,
"logits/chosen": -14.484451293945312,
"logits/rejected": -14.427891731262207,
"logps/chosen": -1.4168641567230225,
"logps/rejected": -1.4915310144424438,
"loss": 1.4878,
"odds_ratio_loss": 0.7094072103500366,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.14168642461299896,
"rewards/margins": 0.0074666752479970455,
"rewards/rejected": -0.14915308356285095,
"sft_loss": 1.4168641567230225,
"step": 300
},
{
"epoch": 0.5249788314987299,
"grad_norm": 1.3539669513702393,
"learning_rate": 4.631211950800925e-06,
"logits/chosen": -14.32929515838623,
"logits/rejected": -14.424825668334961,
"logps/chosen": -1.4027061462402344,
"logps/rejected": -1.481377363204956,
"loss": 1.4748,
"odds_ratio_loss": 0.7213728427886963,
"rewards/accuracies": 0.59375,
"rewards/chosen": -0.14027062058448792,
"rewards/margins": 0.007867120206356049,
"rewards/rejected": -0.14813774824142456,
"sft_loss": 1.4027061462402344,
"step": 310
},
{
"epoch": 0.541913632514818,
"grad_norm": 2.352029323577881,
"learning_rate": 4.6076801701632095e-06,
"logits/chosen": -14.217028617858887,
"logits/rejected": -14.44648551940918,
"logps/chosen": -1.513146162033081,
"logps/rejected": -1.49079430103302,
"loss": 1.5925,
"odds_ratio_loss": 0.7936692833900452,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1513146311044693,
"rewards/margins": -0.002235203282907605,
"rewards/rejected": -0.14907941222190857,
"sft_loss": 1.513146162033081,
"step": 320
},
{
"epoch": 0.558848433530906,
"grad_norm": 0.966873288154602,
"learning_rate": 4.583484347456972e-06,
"logits/chosen": -14.30597972869873,
"logits/rejected": -14.244359016418457,
"logps/chosen": -1.5698734521865845,
"logps/rejected": -1.5634009838104248,
"loss": 1.648,
"odds_ratio_loss": 0.781231164932251,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.15698735415935516,
"rewards/margins": -0.0006472375243902206,
"rewards/rejected": -0.1563401073217392,
"sft_loss": 1.5698734521865845,
"step": 330
},
{
"epoch": 0.5757832345469941,
"grad_norm": 0.9054247140884399,
"learning_rate": 4.55863210577626e-06,
"logits/chosen": -14.461858749389648,
"logits/rejected": -14.340890884399414,
"logps/chosen": -1.5450735092163086,
"logps/rejected": -1.656599760055542,
"loss": 1.6172,
"odds_ratio_loss": 0.7215217351913452,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.1545073539018631,
"rewards/margins": 0.011152632534503937,
"rewards/rejected": -0.16565999388694763,
"sft_loss": 1.5450735092163086,
"step": 340
},
{
"epoch": 0.5927180355630821,
"grad_norm": 0.9015621542930603,
"learning_rate": 4.5331312750253465e-06,
"logits/chosen": -14.178003311157227,
"logits/rejected": -14.2726411819458,
"logps/chosen": -1.487000584602356,
"logps/rejected": -1.4908943176269531,
"loss": 1.5652,
"odds_ratio_loss": 0.7824643850326538,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1487000733613968,
"rewards/margins": 0.00038935727206990123,
"rewards/rejected": -0.14908942580223083,
"sft_loss": 1.487000584602356,
"step": 350
},
{
"epoch": 0.6096528365791702,
"grad_norm": 2.001441717147827,
"learning_rate": 4.506989889451858e-06,
"logits/chosen": -14.397753715515137,
"logits/rejected": -14.500781059265137,
"logps/chosen": -1.4975332021713257,
"logps/rejected": -1.5102782249450684,
"loss": 1.5735,
"odds_ratio_loss": 0.7592841982841492,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.14975331723690033,
"rewards/margins": 0.0012745079584419727,
"rewards/rejected": -0.1510278284549713,
"sft_loss": 1.4975332021713257,
"step": 360
},
{
"epoch": 0.6265876375952583,
"grad_norm": 1.57513427734375,
"learning_rate": 4.480216185115512e-06,
"logits/chosen": -14.3065767288208,
"logits/rejected": -14.306581497192383,
"logps/chosen": -1.4990990161895752,
"logps/rejected": -1.6238371133804321,
"loss": 1.5673,
"odds_ratio_loss": 0.6823247671127319,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.14990989863872528,
"rewards/margins": 0.0124738160520792,
"rewards/rejected": -0.16238370537757874,
"sft_loss": 1.4990990161895752,
"step": 370
},
{
"epoch": 0.6435224386113463,
"grad_norm": 1.0783131122589111,
"learning_rate": 4.4528185972932856e-06,
"logits/chosen": -14.319122314453125,
"logits/rejected": -14.488665580749512,
"logps/chosen": -1.5176422595977783,
"logps/rejected": -1.656542420387268,
"loss": 1.5915,
"odds_ratio_loss": 0.7389153242111206,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.15176422894001007,
"rewards/margins": 0.013890010304749012,
"rewards/rejected": -0.1656542271375656,
"sft_loss": 1.5176422595977783,
"step": 380
},
{
"epoch": 0.6604572396274344,
"grad_norm": 1.4694324731826782,
"learning_rate": 4.424805757821803e-06,
"logits/chosen": -14.226755142211914,
"logits/rejected": -14.333894729614258,
"logps/chosen": -1.574268102645874,
"logps/rejected": -1.6511255502700806,
"loss": 1.6513,
"odds_ratio_loss": 0.7702363133430481,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.15742680430412292,
"rewards/margins": 0.007685736753046513,
"rewards/rejected": -0.16511255502700806,
"sft_loss": 1.574268102645874,
"step": 390
},
{
"epoch": 0.6773920406435224,
"grad_norm": 0.8252859711647034,
"learning_rate": 4.396186492377812e-06,
"logits/chosen": -14.237678527832031,
"logits/rejected": -14.311739921569824,
"logps/chosen": -1.508466124534607,
"logps/rejected": -1.5852457284927368,
"loss": 1.5797,
"odds_ratio_loss": 0.7126177549362183,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.15084661543369293,
"rewards/margins": 0.0076779513619840145,
"rewards/rejected": -0.15852457284927368,
"sft_loss": 1.508466124534607,
"step": 400
},
{
"epoch": 0.6943268416596104,
"grad_norm": 1.2841962575912476,
"learning_rate": 4.366969817697578e-06,
"logits/chosen": -14.2535400390625,
"logits/rejected": -14.371434211730957,
"logps/chosen": -1.5005015134811401,
"logps/rejected": -1.5292456150054932,
"loss": 1.5766,
"odds_ratio_loss": 0.7610150575637817,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.15005014836788177,
"rewards/margins": 0.002874411642551422,
"rewards/rejected": -0.1529245674610138,
"sft_loss": 1.5005015134811401,
"step": 410
},
{
"epoch": 0.7112616426756986,
"grad_norm": 1.2207895517349243,
"learning_rate": 4.337164938736086e-06,
"logits/chosen": -14.3642578125,
"logits/rejected": -14.369051933288574,
"logps/chosen": -1.5299899578094482,
"logps/rejected": -1.4911963939666748,
"loss": 1.6108,
"odds_ratio_loss": 0.8085638284683228,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.15299901366233826,
"rewards/margins": -0.0038793571293354034,
"rewards/rejected": -0.14911964535713196,
"sft_loss": 1.5299899578094482,
"step": 420
},
{
"epoch": 0.7281964436917866,
"grad_norm": 0.8184213042259216,
"learning_rate": 4.306781245766945e-06,
"logits/chosen": -14.233909606933594,
"logits/rejected": -14.245084762573242,
"logps/chosen": -1.3620591163635254,
"logps/rejected": -1.4749568700790405,
"loss": 1.4336,
"odds_ratio_loss": 0.7158304452896118,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13620591163635254,
"rewards/margins": 0.011289774440228939,
"rewards/rejected": -0.14749568700790405,
"sft_loss": 1.3620591163635254,
"step": 430
},
{
"epoch": 0.7451312447078747,
"grad_norm": 2.0060269832611084,
"learning_rate": 4.275828311423903e-06,
"logits/chosen": -14.381686210632324,
"logits/rejected": -14.249435424804688,
"logps/chosen": -1.6260135173797607,
"logps/rejected": -1.5776515007019043,
"loss": 1.709,
"odds_ratio_loss": 0.829800009727478,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.16260136663913727,
"rewards/margins": -0.004836211446672678,
"rewards/rejected": -0.15776515007019043,
"sft_loss": 1.6260135173797607,
"step": 440
},
{
"epoch": 0.7620660457239627,
"grad_norm": 4.041975498199463,
"learning_rate": 4.244315887684912e-06,
"logits/chosen": -14.30778980255127,
"logits/rejected": -14.218801498413086,
"logps/chosen": -1.483784794807434,
"logps/rejected": -1.5452721118927002,
"loss": 1.5595,
"odds_ratio_loss": 0.7568337917327881,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14837847650051117,
"rewards/margins": 0.006148716900497675,
"rewards/rejected": -0.1545272022485733,
"sft_loss": 1.483784794807434,
"step": 450
},
{
"epoch": 0.7790008467400508,
"grad_norm": 0.7099826335906982,
"learning_rate": 4.212253902799685e-06,
"logits/chosen": -14.486287117004395,
"logits/rejected": -14.316320419311523,
"logps/chosen": -1.4297285079956055,
"logps/rejected": -1.5128008127212524,
"loss": 1.5023,
"odds_ratio_loss": 0.7252711057662964,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14297285676002502,
"rewards/margins": 0.008307242766022682,
"rewards/rejected": -0.15128009021282196,
"sft_loss": 1.4297285079956055,
"step": 460
},
{
"epoch": 0.7959356477561389,
"grad_norm": 1.2492146492004395,
"learning_rate": 4.179652458161718e-06,
"logits/chosen": -14.241589546203613,
"logits/rejected": -14.272315979003906,
"logps/chosen": -1.4517958164215088,
"logps/rejected": -1.4656177759170532,
"loss": 1.5259,
"odds_ratio_loss": 0.7411133050918579,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1451795995235443,
"rewards/margins": 0.0013821950415149331,
"rewards/rejected": -0.14656177163124084,
"sft_loss": 1.4517958164215088,
"step": 470
},
{
"epoch": 0.8128704487722269,
"grad_norm": 0.9384155869483948,
"learning_rate": 4.146521825125765e-06,
"logits/chosen": -14.420669555664062,
"logits/rejected": -14.434637069702148,
"logps/chosen": -1.4806429147720337,
"logps/rejected": -1.5676599740982056,
"loss": 1.5509,
"odds_ratio_loss": 0.7023881673812866,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14806430041790009,
"rewards/margins": 0.008701696991920471,
"rewards/rejected": -0.15676598250865936,
"sft_loss": 1.4806429147720337,
"step": 480
},
{
"epoch": 0.8298052497883149,
"grad_norm": 1.070791244506836,
"learning_rate": 4.11287244177176e-06,
"logits/chosen": -14.464094161987305,
"logits/rejected": -14.335436820983887,
"logps/chosen": -1.455758810043335,
"logps/rejected": -1.5940083265304565,
"loss": 1.5245,
"odds_ratio_loss": 0.6876194477081299,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1455758810043335,
"rewards/margins": 0.01382494904100895,
"rewards/rejected": -0.1594008356332779,
"sft_loss": 1.455758810043335,
"step": 490
},
{
"epoch": 0.8467400508044031,
"grad_norm": 2.7851524353027344,
"learning_rate": 4.078714909616215e-06,
"logits/chosen": -14.458696365356445,
"logits/rejected": -14.464262008666992,
"logps/chosen": -1.531051754951477,
"logps/rejected": -1.6913106441497803,
"loss": 1.5988,
"odds_ratio_loss": 0.6771985292434692,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.15310516953468323,
"rewards/margins": 0.01602589525282383,
"rewards/rejected": -0.1691310703754425,
"sft_loss": 1.531051754951477,
"step": 500
},
{
"epoch": 0.8467400508044031,
"eval_logits/chosen": -14.385932922363281,
"eval_logits/rejected": -14.353007316589355,
"eval_logps/chosen": -1.491932988166809,
"eval_logps/rejected": -1.5724329948425293,
"eval_loss": 1.5655477046966553,
"eval_odds_ratio_loss": 0.736146092414856,
"eval_rewards/accuracies": 0.49619048833847046,
"eval_rewards/chosen": -0.14919330179691315,
"eval_rewards/margins": 0.008049987256526947,
"eval_rewards/rejected": -0.1572432965040207,
"eval_runtime": 207.7292,
"eval_samples_per_second": 5.055,
"eval_sft_loss": 1.491932988166809,
"eval_steps_per_second": 2.527,
"step": 500
},
{
"epoch": 0.8636748518204911,
"grad_norm": 2.8025050163269043,
"learning_rate": 4.044059990272125e-06,
"logits/chosen": -14.447216987609863,
"logits/rejected": -14.498886108398438,
"logps/chosen": -1.528641700744629,
"logps/rejected": -1.6202799081802368,
"loss": 1.6018,
"odds_ratio_loss": 0.732014536857605,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.15286414325237274,
"rewards/margins": 0.009163827635347843,
"rewards/rejected": -0.1620279997587204,
"sft_loss": 1.528641700744629,
"step": 510
},
{
"epoch": 0.8806096528365792,
"grad_norm": 1.3604254722595215,
"learning_rate": 4.0089186020584345e-06,
"logits/chosen": -14.258474349975586,
"logits/rejected": -14.413030624389648,
"logps/chosen": -1.5629048347473145,
"logps/rejected": -1.5826667547225952,
"loss": 1.6364,
"odds_ratio_loss": 0.7350566387176514,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.15629048645496368,
"rewards/margins": 0.001976185943931341,
"rewards/rejected": -0.15826669335365295,
"sft_loss": 1.5629048347473145,
"step": 520
},
{
"epoch": 0.8975444538526672,
"grad_norm": 2.011760711669922,
"learning_rate": 3.973301816560124e-06,
"logits/chosen": -14.397709846496582,
"logits/rejected": -14.129496574401855,
"logps/chosen": -1.4165706634521484,
"logps/rejected": -1.5228968858718872,
"loss": 1.4866,
"odds_ratio_loss": 0.6998282670974731,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.14165706932544708,
"rewards/margins": 0.010632617399096489,
"rewards/rejected": -0.15228970348834991,
"sft_loss": 1.4165706634521484,
"step": 530
},
{
"epoch": 0.9144792548687553,
"grad_norm": 1.5524851083755493,
"learning_rate": 3.937220855140021e-06,
"logits/chosen": -14.287254333496094,
"logits/rejected": -14.5077543258667,
"logps/chosen": -1.445703148841858,
"logps/rejected": -1.4684772491455078,
"loss": 1.5204,
"odds_ratio_loss": 0.7468188405036926,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.14457032084465027,
"rewards/margins": 0.0022774008102715015,
"rewards/rejected": -0.14684772491455078,
"sft_loss": 1.445703148841858,
"step": 540
},
{
"epoch": 0.9314140558848434,
"grad_norm": 1.5534979104995728,
"learning_rate": 3.900687085403418e-06,
"logits/chosen": -14.357900619506836,
"logits/rejected": -14.454984664916992,
"logps/chosen": -1.386063575744629,
"logps/rejected": -1.3658872842788696,
"loss": 1.4644,
"odds_ratio_loss": 0.7831361293792725,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.13860636949539185,
"rewards/margins": -0.0020176374819129705,
"rewards/rejected": -0.13658872246742249,
"sft_loss": 1.386063575744629,
"step": 550
},
{
"epoch": 0.9483488569009314,
"grad_norm": 1.1890796422958374,
"learning_rate": 3.863712017616614e-06,
"logits/chosen": -14.284517288208008,
"logits/rejected": -14.413591384887695,
"logps/chosen": -1.4638760089874268,
"logps/rejected": -1.5988643169403076,
"loss": 1.5324,
"odds_ratio_loss": 0.6851348876953125,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14638759195804596,
"rewards/margins": 0.013498829677700996,
"rewards/rejected": -0.1598864495754242,
"sft_loss": 1.4638760089874268,
"step": 560
},
{
"epoch": 0.9652836579170194,
"grad_norm": 6.166572570800781,
"learning_rate": 3.826307301080504e-06,
"logits/chosen": -14.168184280395508,
"logits/rejected": -14.155644416809082,
"logps/chosen": -1.4714304208755493,
"logps/rejected": -1.577530860900879,
"loss": 1.5501,
"odds_ratio_loss": 0.7865978479385376,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14714303612709045,
"rewards/margins": 0.010610053315758705,
"rewards/rejected": -0.1577531099319458,
"sft_loss": 1.4714304208755493,
"step": 570
},
{
"epoch": 0.9822184589331076,
"grad_norm": 1.6688357591629028,
"learning_rate": 3.7884847204603775e-06,
"logits/chosen": -14.45263385772705,
"logits/rejected": -14.489707946777344,
"logps/chosen": -1.519616961479187,
"logps/rejected": -1.4644415378570557,
"loss": 1.5989,
"odds_ratio_loss": 0.7931729555130005,
"rewards/accuracies": 0.39375001192092896,
"rewards/chosen": -0.15196169912815094,
"rewards/margins": -0.0055175526067614555,
"rewards/rejected": -0.1464441567659378,
"sft_loss": 1.519616961479187,
"step": 580
},
{
"epoch": 0.9991532599491956,
"grad_norm": 1.3263885974884033,
"learning_rate": 3.750256192073058e-06,
"logits/chosen": -14.519624710083008,
"logits/rejected": -14.511543273925781,
"logps/chosen": -1.6179249286651611,
"logps/rejected": -1.6542644500732422,
"loss": 1.6929,
"odds_ratio_loss": 0.7493273019790649,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.16179248690605164,
"rewards/margins": 0.0036339648067951202,
"rewards/rejected": -0.16542646288871765,
"sft_loss": 1.6179249286651611,
"step": 590
},
{
"epoch": 1.0160880609652836,
"grad_norm": 2.145953893661499,
"learning_rate": 3.7116337601325715e-06,
"logits/chosen": -14.438863754272461,
"logits/rejected": -14.496429443359375,
"logps/chosen": -1.4121149778366089,
"logps/rejected": -1.4823601245880127,
"loss": 1.4826,
"odds_ratio_loss": 0.7051838636398315,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.14121152460575104,
"rewards/margins": 0.007024504244327545,
"rewards/rejected": -0.1482360064983368,
"sft_loss": 1.4121149778366089,
"step": 600
},
{
"epoch": 1.0330228619813717,
"grad_norm": 1.4814651012420654,
"learning_rate": 3.6726295929555154e-06,
"logits/chosen": -14.25225830078125,
"logits/rejected": -14.299070358276367,
"logps/chosen": -1.333702802658081,
"logps/rejected": -1.4111506938934326,
"loss": 1.4074,
"odds_ratio_loss": 0.7373310327529907,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1333702653646469,
"rewards/margins": 0.007744790520519018,
"rewards/rejected": -0.14111506938934326,
"sft_loss": 1.333702802658081,
"step": 610
},
{
"epoch": 1.0499576629974599,
"grad_norm": 1.6669461727142334,
"learning_rate": 3.6332559791273307e-06,
"logits/chosen": -14.348184585571289,
"logits/rejected": -14.468172073364258,
"logps/chosen": -1.3673087358474731,
"logps/rejected": -1.4689829349517822,
"loss": 1.4376,
"odds_ratio_loss": 0.703393280506134,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1367308795452118,
"rewards/margins": 0.01016741432249546,
"rewards/rejected": -0.1468982994556427,
"sft_loss": 1.3673087358474731,
"step": 620
},
{
"epoch": 1.0668924640135478,
"grad_norm": 1.9912712574005127,
"learning_rate": 3.593525323630681e-06,
"logits/chosen": -14.204243659973145,
"logits/rejected": -14.313570976257324,
"logps/chosen": -1.4642140865325928,
"logps/rejected": -1.5515140295028687,
"loss": 1.5346,
"odds_ratio_loss": 0.7034581899642944,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1464214026927948,
"rewards/margins": 0.008729999884963036,
"rewards/rejected": -0.15515141189098358,
"sft_loss": 1.4642140865325928,
"step": 630
},
{
"epoch": 1.083827265029636,
"grad_norm": 1.084834098815918,
"learning_rate": 3.5534501439371615e-06,
"logits/chosen": -14.336616516113281,
"logits/rejected": -14.360015869140625,
"logps/chosen": -1.431004285812378,
"logps/rejected": -1.5110365152359009,
"loss": 1.5053,
"odds_ratio_loss": 0.7428441047668457,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14310042560100555,
"rewards/margins": 0.008003225550055504,
"rewards/rejected": -0.1511036604642868,
"sft_loss": 1.431004285812378,
"step": 640
},
{
"epoch": 1.100762066045724,
"grad_norm": 7.101503372192383,
"learning_rate": 3.5130430660635633e-06,
"logits/chosen": -14.246923446655273,
"logits/rejected": -14.310781478881836,
"logps/chosen": -1.4178617000579834,
"logps/rejected": -1.4921773672103882,
"loss": 1.4902,
"odds_ratio_loss": 0.7228954434394836,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14178617298603058,
"rewards/margins": 0.0074315681122243404,
"rewards/rejected": -0.14921775460243225,
"sft_loss": 1.4178617000579834,
"step": 650
},
{
"epoch": 1.117696867061812,
"grad_norm": 0.7868030071258545,
"learning_rate": 3.4723168205939444e-06,
"logits/chosen": -14.346036911010742,
"logits/rejected": -14.401220321655273,
"logps/chosen": -1.4435014724731445,
"logps/rejected": -1.4272395372390747,
"loss": 1.5198,
"odds_ratio_loss": 0.7628483772277832,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14435014128684998,
"rewards/margins": -0.0016262030694633722,
"rewards/rejected": -0.142723947763443,
"sft_loss": 1.4435014724731445,
"step": 660
},
{
"epoch": 1.1346316680779,
"grad_norm": 0.8476426601409912,
"learning_rate": 3.431284238668754e-06,
"logits/chosen": -14.173054695129395,
"logits/rejected": -14.25976276397705,
"logps/chosen": -1.5427913665771484,
"logps/rejected": -1.51954984664917,
"loss": 1.6222,
"odds_ratio_loss": 0.7936299443244934,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.15427914261817932,
"rewards/margins": -0.002324149012565613,
"rewards/rejected": -0.15195497870445251,
"sft_loss": 1.5427913665771484,
"step": 670
},
{
"epoch": 1.1515664690939882,
"grad_norm": 3.089587688446045,
"learning_rate": 3.389958247942274e-06,
"logits/chosen": -14.338518142700195,
"logits/rejected": -14.398809432983398,
"logps/chosen": -1.508374810218811,
"logps/rejected": -1.6098997592926025,
"loss": 1.5859,
"odds_ratio_loss": 0.7754709720611572,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1508374661207199,
"rewards/margins": 0.010152501054108143,
"rewards/rejected": -0.16098996996879578,
"sft_loss": 1.508374810218811,
"step": 680
},
{
"epoch": 1.168501270110076,
"grad_norm": 1.2698506116867065,
"learning_rate": 3.3483518685096588e-06,
"logits/chosen": -14.310267448425293,
"logits/rejected": -14.27270221710205,
"logps/chosen": -1.4493352174758911,
"logps/rejected": -1.5172946453094482,
"loss": 1.5243,
"odds_ratio_loss": 0.7492562532424927,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1449335366487503,
"rewards/margins": 0.006795944180339575,
"rewards/rejected": -0.15172946453094482,
"sft_loss": 1.4493352174758911,
"step": 690
},
{
"epoch": 1.1854360711261642,
"grad_norm": 1.6422189474105835,
"learning_rate": 3.306478208804839e-06,
"logits/chosen": -14.337800979614258,
"logits/rejected": -14.443319320678711,
"logps/chosen": -1.3992269039154053,
"logps/rejected": -1.4721871614456177,
"loss": 1.4736,
"odds_ratio_loss": 0.7440443634986877,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13992270827293396,
"rewards/margins": 0.007296019699424505,
"rewards/rejected": -0.1472187042236328,
"sft_loss": 1.3992269039154053,
"step": 700
},
{
"epoch": 1.2023708721422524,
"grad_norm": 1.635892391204834,
"learning_rate": 3.264350461470608e-06,
"logits/chosen": -14.11363410949707,
"logits/rejected": -14.23077392578125,
"logps/chosen": -1.4146158695220947,
"logps/rejected": -1.5160566568374634,
"loss": 1.4872,
"odds_ratio_loss": 0.726182222366333,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1414615958929062,
"rewards/margins": 0.010144074447453022,
"rewards/rejected": -0.15160568058490753,
"sft_loss": 1.4146158695220947,
"step": 710
},
{
"epoch": 1.2193056731583405,
"grad_norm": 2.807609796524048,
"learning_rate": 3.2219818992021685e-06,
"logits/chosen": -14.307601928710938,
"logits/rejected": -14.457585334777832,
"logps/chosen": -1.3360792398452759,
"logps/rejected": -1.5054932832717896,
"loss": 1.4058,
"odds_ratio_loss": 0.6972737312316895,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13360795378684998,
"rewards/margins": 0.016941383481025696,
"rewards/rejected": -0.15054932236671448,
"sft_loss": 1.3360792398452759,
"step": 720
},
{
"epoch": 1.2362404741744284,
"grad_norm": 4.885401248931885,
"learning_rate": 3.1793858705654595e-06,
"logits/chosen": -14.334493637084961,
"logits/rejected": -14.283819198608398,
"logps/chosen": -1.435250997543335,
"logps/rejected": -1.4584420919418335,
"loss": 1.5098,
"odds_ratio_loss": 0.7454192638397217,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.14352509379386902,
"rewards/margins": 0.0023191256914287806,
"rewards/rejected": -0.1458442211151123,
"sft_loss": 1.435250997543335,
"step": 730
},
{
"epoch": 1.2531752751905165,
"grad_norm": 2.119098424911499,
"learning_rate": 3.1365757957915787e-06,
"logits/chosen": -14.451696395874023,
"logits/rejected": -14.478349685668945,
"logps/chosen": -1.4766839742660522,
"logps/rejected": -1.5273820161819458,
"loss": 1.5488,
"odds_ratio_loss": 0.7213960289955139,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14766840636730194,
"rewards/margins": 0.005069802515208721,
"rewards/rejected": -0.15273821353912354,
"sft_loss": 1.4766839742660522,
"step": 740
},
{
"epoch": 1.2701100762066047,
"grad_norm": 2.60243821144104,
"learning_rate": 3.093565162548633e-06,
"logits/chosen": -14.26720905303955,
"logits/rejected": -14.301678657531738,
"logps/chosen": -1.4956939220428467,
"logps/rejected": -1.5772297382354736,
"loss": 1.5741,
"odds_ratio_loss": 0.7844332456588745,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14956940710544586,
"rewards/margins": 0.008153588511049747,
"rewards/rejected": -0.15772297978401184,
"sft_loss": 1.4956939220428467,
"step": 750
},
{
"epoch": 1.2870448772226926,
"grad_norm": 1.4909660816192627,
"learning_rate": 3.0503675216923294e-06,
"logits/chosen": -14.459734916687012,
"logits/rejected": -14.364084243774414,
"logps/chosen": -1.3072437047958374,
"logps/rejected": -1.4731833934783936,
"loss": 1.3741,
"odds_ratio_loss": 0.6682445406913757,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13072435557842255,
"rewards/margins": 0.01659397967159748,
"rewards/rejected": -0.14731833338737488,
"sft_loss": 1.3072437047958374,
"step": 760
},
{
"epoch": 1.3039796782387807,
"grad_norm": 1.1245403289794922,
"learning_rate": 3.0069964829966748e-06,
"logits/chosen": -14.397039413452148,
"logits/rejected": -14.449551582336426,
"logps/chosen": -1.3757708072662354,
"logps/rejected": -1.4135478734970093,
"loss": 1.4523,
"odds_ratio_loss": 0.7652989029884338,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.137577086687088,
"rewards/margins": 0.003777713282033801,
"rewards/rejected": -0.1413547843694687,
"sft_loss": 1.3757708072662354,
"step": 770
},
{
"epoch": 1.3209144792548688,
"grad_norm": 1.2307573556900024,
"learning_rate": 2.963465710866094e-06,
"logits/chosen": -14.386013984680176,
"logits/rejected": -14.34870719909668,
"logps/chosen": -1.4350049495697021,
"logps/rejected": -1.5495213270187378,
"loss": 1.5065,
"odds_ratio_loss": 0.7147475481033325,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14350050687789917,
"rewards/margins": 0.01145164854824543,
"rewards/rejected": -0.15495215356349945,
"sft_loss": 1.4350049495697021,
"step": 780
},
{
"epoch": 1.337849280270957,
"grad_norm": 2.506805181503296,
"learning_rate": 2.919788920030357e-06,
"logits/chosen": -14.521794319152832,
"logits/rejected": -14.562520027160645,
"logps/chosen": -1.5004112720489502,
"logps/rejected": -1.5385072231292725,
"loss": 1.5749,
"odds_ratio_loss": 0.7447755336761475,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1500411331653595,
"rewards/margins": 0.003809594316408038,
"rewards/rejected": -0.1538507342338562,
"sft_loss": 1.5004112720489502,
"step": 790
},
{
"epoch": 1.3547840812870449,
"grad_norm": 2.221041440963745,
"learning_rate": 2.8759798712236303e-06,
"logits/chosen": -14.375375747680664,
"logits/rejected": -14.200535774230957,
"logps/chosen": -1.3673021793365479,
"logps/rejected": -1.4980638027191162,
"loss": 1.4391,
"odds_ratio_loss": 0.7180894017219543,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.13673020899295807,
"rewards/margins": 0.013076169416308403,
"rewards/rejected": -0.14980638027191162,
"sft_loss": 1.3673021793365479,
"step": 800
},
{
"epoch": 1.371718882303133,
"grad_norm": 1.1964547634124756,
"learning_rate": 2.8320523668490507e-06,
"logits/chosen": -14.326695442199707,
"logits/rejected": -14.330057144165039,
"logps/chosen": -1.4386107921600342,
"logps/rejected": -1.4542288780212402,
"loss": 1.516,
"odds_ratio_loss": 0.7743045091629028,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1438610851764679,
"rewards/margins": 0.00156181410420686,
"rewards/rejected": -0.14542289078235626,
"sft_loss": 1.4386107921600342,
"step": 810
},
{
"epoch": 1.388653683319221,
"grad_norm": 1.63833749294281,
"learning_rate": 2.7880202466301597e-06,
"logits/chosen": -14.170251846313477,
"logits/rejected": -14.376757621765137,
"logps/chosen": -1.4189726114273071,
"logps/rejected": -1.4344730377197266,
"loss": 1.4949,
"odds_ratio_loss": 0.7592172026634216,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.1418972760438919,
"rewards/margins": 0.00155004789121449,
"rewards/rejected": -0.14344730973243713,
"sft_loss": 1.4189726114273071,
"step": 820
},
{
"epoch": 1.405588484335309,
"grad_norm": 1.4605140686035156,
"learning_rate": 2.7438973832505854e-06,
"logits/chosen": -14.213847160339355,
"logits/rejected": -14.075439453125,
"logps/chosen": -1.394853115081787,
"logps/rejected": -1.4763586521148682,
"loss": 1.4703,
"odds_ratio_loss": 0.7543301582336426,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13948531448841095,
"rewards/margins": 0.008150560781359673,
"rewards/rejected": -0.14763586223125458,
"sft_loss": 1.394853115081787,
"step": 830
},
{
"epoch": 1.4225232853513972,
"grad_norm": 6.998382091522217,
"learning_rate": 2.699697677983341e-06,
"logits/chosen": -14.502642631530762,
"logits/rejected": -14.471555709838867,
"logps/chosen": -1.3794063329696655,
"logps/rejected": -1.3286025524139404,
"loss": 1.4577,
"odds_ratio_loss": 0.7826226353645325,
"rewards/accuracies": 0.40625,
"rewards/chosen": -0.1379406601190567,
"rewards/margins": -0.005080387927591801,
"rewards/rejected": -0.13286025822162628,
"sft_loss": 1.3794063329696655,
"step": 840
},
{
"epoch": 1.4394580863674853,
"grad_norm": 6.508487224578857,
"learning_rate": 2.6554350563111115e-06,
"logits/chosen": -14.415182113647461,
"logits/rejected": -14.4021577835083,
"logps/chosen": -1.4343197345733643,
"logps/rejected": -1.389868140220642,
"loss": 1.5147,
"odds_ratio_loss": 0.803573489189148,
"rewards/accuracies": 0.4000000059604645,
"rewards/chosen": -0.14343199133872986,
"rewards/margins": -0.00444516446441412,
"rewards/rejected": -0.13898679614067078,
"sft_loss": 1.4343197345733643,
"step": 850
},
{
"epoch": 1.4563928873835732,
"grad_norm": 3.286094903945923,
"learning_rate": 2.611123463538913e-06,
"logits/chosen": -14.409162521362305,
"logits/rejected": -14.423065185546875,
"logps/chosen": -1.3563302755355835,
"logps/rejected": -1.470460295677185,
"loss": 1.4284,
"odds_ratio_loss": 0.7211607694625854,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.13563302159309387,
"rewards/margins": 0.011413001455366611,
"rewards/rejected": -0.1470460146665573,
"sft_loss": 1.3563302755355835,
"step": 860
},
{
"epoch": 1.4733276883996613,
"grad_norm": 1.353800654411316,
"learning_rate": 2.566776860400514e-06,
"logits/chosen": -14.359599113464355,
"logits/rejected": -14.388442993164062,
"logps/chosen": -1.4657598733901978,
"logps/rejected": -1.5304598808288574,
"loss": 1.5387,
"odds_ratio_loss": 0.7289360761642456,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14657600224018097,
"rewards/margins": 0.006469997111707926,
"rewards/rejected": -0.15304598212242126,
"sft_loss": 1.4657598733901978,
"step": 870
},
{
"epoch": 1.4902624894157492,
"grad_norm": 0.8999080657958984,
"learning_rate": 2.522409218659989e-06,
"logits/chosen": -14.522372245788574,
"logits/rejected": -14.516871452331543,
"logps/chosen": -1.5183885097503662,
"logps/rejected": -1.5601129531860352,
"loss": 1.5903,
"odds_ratio_loss": 0.7187842130661011,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.15183886885643005,
"rewards/margins": 0.004172446206212044,
"rewards/rejected": -0.15601131319999695,
"sft_loss": 1.5183885097503662,
"step": 880
},
{
"epoch": 1.5071972904318374,
"grad_norm": 1.7247016429901123,
"learning_rate": 2.4780345167097976e-06,
"logits/chosen": -14.4078369140625,
"logits/rejected": -14.206354141235352,
"logps/chosen": -1.422533392906189,
"logps/rejected": -1.617108941078186,
"loss": 1.4925,
"odds_ratio_loss": 0.6991701126098633,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1422533541917801,
"rewards/margins": 0.019457560032606125,
"rewards/rejected": -0.16171090304851532,
"sft_loss": 1.422533392906189,
"step": 890
},
{
"epoch": 1.5241320914479255,
"grad_norm": 1.1559327840805054,
"learning_rate": 2.4336667351667747e-06,
"logits/chosen": -14.479301452636719,
"logits/rejected": -14.487524032592773,
"logps/chosen": -1.5677707195281982,
"logps/rejected": -1.654937982559204,
"loss": 1.6407,
"odds_ratio_loss": 0.7297645807266235,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.1567770540714264,
"rewards/margins": 0.00871671736240387,
"rewards/rejected": -0.16549380123615265,
"sft_loss": 1.5677707195281982,
"step": 900
},
{
"epoch": 1.5410668924640136,
"grad_norm": 2.899705171585083,
"learning_rate": 2.3893198524674264e-06,
"logits/chosen": -14.416735649108887,
"logits/rejected": -14.323824882507324,
"logps/chosen": -1.3870880603790283,
"logps/rejected": -1.490903615951538,
"loss": 1.4593,
"odds_ratio_loss": 0.7216765284538269,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13870880007743835,
"rewards/margins": 0.010381558910012245,
"rewards/rejected": -0.14909036457538605,
"sft_loss": 1.3870880603790283,
"step": 910
},
{
"epoch": 1.5580016934801018,
"grad_norm": 1.2076252698898315,
"learning_rate": 2.345007840463904e-06,
"logits/chosen": -14.292505264282227,
"logits/rejected": -14.244054794311523,
"logps/chosen": -1.4259792566299438,
"logps/rejected": -1.4341694116592407,
"loss": 1.5022,
"odds_ratio_loss": 0.7626054883003235,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14259792864322662,
"rewards/margins": 0.0008190165390260518,
"rewards/rejected": -0.14341694116592407,
"sft_loss": 1.4259792566299438,
"step": 920
},
{
"epoch": 1.5749364944961897,
"grad_norm": 2.6530520915985107,
"learning_rate": 2.3007446600220572e-06,
"logits/chosen": -14.440101623535156,
"logits/rejected": -14.175987243652344,
"logps/chosen": -1.361826777458191,
"logps/rejected": -1.4479329586029053,
"loss": 1.4351,
"odds_ratio_loss": 0.7332156300544739,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13618269562721252,
"rewards/margins": 0.008610614575445652,
"rewards/rejected": -0.144793301820755,
"sft_loss": 1.361826777458191,
"step": 930
},
{
"epoch": 1.5918712955122776,
"grad_norm": 3.269102096557617,
"learning_rate": 2.2565442566229507e-06,
"logits/chosen": -14.330474853515625,
"logits/rejected": -14.3932466506958,
"logps/chosen": -1.4583683013916016,
"logps/rejected": -1.4522769451141357,
"loss": 1.5392,
"odds_ratio_loss": 0.8081096410751343,
"rewards/accuracies": 0.3812499940395355,
"rewards/chosen": -0.14583681523799896,
"rewards/margins": -0.0006091395625844598,
"rewards/rejected": -0.14522768557071686,
"sft_loss": 1.4583683013916016,
"step": 940
},
{
"epoch": 1.6088060965283657,
"grad_norm": 1.2394914627075195,
"learning_rate": 2.2124205559692195e-06,
"logits/chosen": -14.25177001953125,
"logits/rejected": -14.32116985321045,
"logps/chosen": -1.4207613468170166,
"logps/rejected": -1.5083825588226318,
"loss": 1.4919,
"odds_ratio_loss": 0.7114149928092957,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.14207611978054047,
"rewards/margins": 0.008762138895690441,
"rewards/rejected": -0.15083825588226318,
"sft_loss": 1.4207613468170166,
"step": 950
},
{
"epoch": 1.6257408975444538,
"grad_norm": 1.6583099365234375,
"learning_rate": 2.168387459597666e-06,
"logits/chosen": -14.210861206054688,
"logits/rejected": -14.444610595703125,
"logps/chosen": -1.5090281963348389,
"logps/rejected": -1.5863807201385498,
"loss": 1.5813,
"odds_ratio_loss": 0.7230504155158997,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.15090280771255493,
"rewards/margins": 0.007735258433967829,
"rewards/rejected": -0.15863807499408722,
"sft_loss": 1.5090281963348389,
"step": 960
},
{
"epoch": 1.642675698560542,
"grad_norm": 1.3439754247665405,
"learning_rate": 2.1244588404994648e-06,
"logits/chosen": -14.237951278686523,
"logits/rejected": -14.269018173217773,
"logps/chosen": -1.376792073249817,
"logps/rejected": -1.4212851524353027,
"loss": 1.453,
"odds_ratio_loss": 0.7622562646865845,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13767921924591064,
"rewards/margins": 0.0044493041932582855,
"rewards/rejected": -0.14212851226329803,
"sft_loss": 1.376792073249817,
"step": 970
},
{
"epoch": 1.65961049957663,
"grad_norm": 2.962531328201294,
"learning_rate": 2.08064853874936e-06,
"logits/chosen": -14.473817825317383,
"logits/rejected": -14.631460189819336,
"logps/chosen": -1.4066752195358276,
"logps/rejected": -1.455766201019287,
"loss": 1.4788,
"odds_ratio_loss": 0.7210047245025635,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.14066752791404724,
"rewards/margins": 0.004909100476652384,
"rewards/rejected": -0.1455766260623932,
"sft_loss": 1.4066752195358276,
"step": 980
},
{
"epoch": 1.676545300592718,
"grad_norm": 3.2846462726593018,
"learning_rate": 2.0369703571452387e-06,
"logits/chosen": -14.20033073425293,
"logits/rejected": -14.109931945800781,
"logps/chosen": -1.309378743171692,
"logps/rejected": -1.4727327823638916,
"loss": 1.3763,
"odds_ratio_loss": 0.6690842509269714,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13093788921833038,
"rewards/margins": 0.01633540540933609,
"rewards/rejected": -0.14727327227592468,
"sft_loss": 1.309378743171692,
"step": 990
},
{
"epoch": 1.6934801016088061,
"grad_norm": 1.1083016395568848,
"learning_rate": 1.993438056859441e-06,
"logits/chosen": -14.497441291809082,
"logits/rejected": -14.366804122924805,
"logps/chosen": -1.353459119796753,
"logps/rejected": -1.469897985458374,
"loss": 1.4213,
"odds_ratio_loss": 0.6788592338562012,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13534590601921082,
"rewards/margins": 0.011643897742033005,
"rewards/rejected": -0.14698980748653412,
"sft_loss": 1.353459119796753,
"step": 1000
},
{
"epoch": 1.6934801016088061,
"eval_logits/chosen": -14.432435989379883,
"eval_logits/rejected": -14.399744987487793,
"eval_logps/chosen": -1.4366357326507568,
"eval_logps/rejected": -1.5239636898040771,
"eval_loss": 1.509663701057434,
"eval_odds_ratio_loss": 0.7302786707878113,
"eval_rewards/accuracies": 0.5038095116615295,
"eval_rewards/chosen": -0.14366357028484344,
"eval_rewards/margins": 0.00873279757797718,
"eval_rewards/rejected": -0.15239638090133667,
"eval_runtime": 445.7589,
"eval_samples_per_second": 2.356,
"eval_sft_loss": 1.4366357326507568,
"eval_steps_per_second": 1.178,
"step": 1000
},
{
"epoch": 1.710414902624894,
"grad_norm": 1.8078409433364868,
"learning_rate": 1.9500653531031917e-06,
"logits/chosen": -14.443731307983398,
"logits/rejected": -14.476076126098633,
"logps/chosen": -1.361530065536499,
"logps/rejected": -1.5223243236541748,
"loss": 1.4302,
"odds_ratio_loss": 0.6869481205940247,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.136152982711792,
"rewards/margins": 0.016079427674412727,
"rewards/rejected": -0.15223243832588196,
"sft_loss": 1.361530065536499,
"step": 1010
},
{
"epoch": 1.7273497036409822,
"grad_norm": 1.288388729095459,
"learning_rate": 1.9068659108055117e-06,
"logits/chosen": -14.475682258605957,
"logits/rejected": -14.473660469055176,
"logps/chosen": -1.4284050464630127,
"logps/rejected": -1.4647681713104248,
"loss": 1.5008,
"odds_ratio_loss": 0.7240586280822754,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.14284051954746246,
"rewards/margins": 0.003636319888755679,
"rewards/rejected": -0.1464768350124359,
"sft_loss": 1.4284050464630127,
"step": 1020
},
{
"epoch": 1.7442845046570703,
"grad_norm": 1.2943964004516602,
"learning_rate": 1.863853340307962e-06,
"logits/chosen": -14.312501907348633,
"logits/rejected": -14.362284660339355,
"logps/chosen": -1.2968519926071167,
"logps/rejected": -1.579993486404419,
"loss": 1.3634,
"odds_ratio_loss": 0.6657830476760864,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -0.1296851933002472,
"rewards/margins": 0.0283141378313303,
"rewards/rejected": -0.15799932181835175,
"sft_loss": 1.2968519926071167,
"step": 1030
},
{
"epoch": 1.7612193056731584,
"grad_norm": 1.1572942733764648,
"learning_rate": 1.8210411930766019e-06,
"logits/chosen": -14.294156074523926,
"logits/rejected": -14.323614120483398,
"logps/chosen": -1.479034662246704,
"logps/rejected": -1.6268787384033203,
"loss": 1.547,
"odds_ratio_loss": 0.6801426410675049,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1479034721851349,
"rewards/margins": 0.014784415252506733,
"rewards/rejected": -0.16268786787986755,
"sft_loss": 1.479034662246704,
"step": 1040
},
{
"epoch": 1.7781541066892466,
"grad_norm": 1.574400782585144,
"learning_rate": 1.7784429574324803e-06,
"logits/chosen": -14.368769645690918,
"logits/rejected": -14.502416610717773,
"logps/chosen": -1.3905737400054932,
"logps/rejected": -1.5777366161346436,
"loss": 1.4567,
"odds_ratio_loss": 0.6612822413444519,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13905738294124603,
"rewards/margins": 0.018716301769018173,
"rewards/rejected": -0.1577736884355545,
"sft_loss": 1.3905737400054932,
"step": 1050
},
{
"epoch": 1.7950889077053345,
"grad_norm": 1.195115089416504,
"learning_rate": 1.7360720543020327e-06,
"logits/chosen": -14.439001083374023,
"logits/rejected": -14.227216720581055,
"logps/chosen": -1.3061621189117432,
"logps/rejected": -1.3979461193084717,
"loss": 1.3747,
"odds_ratio_loss": 0.6853240728378296,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1306162178516388,
"rewards/margins": 0.009178402833640575,
"rewards/rejected": -0.13979461789131165,
"sft_loss": 1.3061621189117432,
"step": 1060
},
{
"epoch": 1.8120237087214224,
"grad_norm": 3.909592390060425,
"learning_rate": 1.6939418329887042e-06,
"logits/chosen": -14.45744514465332,
"logits/rejected": -14.5038423538208,
"logps/chosen": -1.4311087131500244,
"logps/rejected": -1.4849843978881836,
"loss": 1.5049,
"odds_ratio_loss": 0.7381945848464966,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.14311087131500244,
"rewards/margins": 0.005387583281844854,
"rewards/rejected": -0.14849844574928284,
"sft_loss": 1.4311087131500244,
"step": 1070
},
{
"epoch": 1.8289585097375105,
"grad_norm": 1.7437409162521362,
"learning_rate": 1.6520655669671467e-06,
"logits/chosen": -14.631024360656738,
"logits/rejected": -14.515978813171387,
"logps/chosen": -1.4438676834106445,
"logps/rejected": -1.4797852039337158,
"loss": 1.519,
"odds_ratio_loss": 0.7515386343002319,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14438676834106445,
"rewards/margins": 0.0035917561035603285,
"rewards/rejected": -0.1479785144329071,
"sft_loss": 1.4438676834106445,
"step": 1080
},
{
"epoch": 1.8458933107535986,
"grad_norm": 3.1396241188049316,
"learning_rate": 1.610456449701294e-06,
"logits/chosen": -14.319239616394043,
"logits/rejected": -14.346944808959961,
"logps/chosen": -1.4771324396133423,
"logps/rejected": -1.537941336631775,
"loss": 1.5548,
"odds_ratio_loss": 0.776719331741333,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.14771324396133423,
"rewards/margins": 0.0060809021815657616,
"rewards/rejected": -0.15379413962364197,
"sft_loss": 1.4771324396133423,
"step": 1090
},
{
"epoch": 1.8628281117696868,
"grad_norm": 1.4689712524414062,
"learning_rate": 1.5691275904876545e-06,
"logits/chosen": -14.461804389953613,
"logits/rejected": -14.278103828430176,
"logps/chosen": -1.407566785812378,
"logps/rejected": -1.5848530530929565,
"loss": 1.474,
"odds_ratio_loss": 0.6638895869255066,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14075669646263123,
"rewards/margins": 0.017728609964251518,
"rewards/rejected": -0.1584853082895279,
"sft_loss": 1.407566785812378,
"step": 1100
},
{
"epoch": 1.879762912785775,
"grad_norm": 0.9537128210067749,
"learning_rate": 1.5280920103251235e-06,
"logits/chosen": -14.299761772155762,
"logits/rejected": -14.347249984741211,
"logps/chosen": -1.3132389783859253,
"logps/rejected": -1.451719880104065,
"loss": 1.3829,
"odds_ratio_loss": 0.6965407133102417,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.131323903799057,
"rewards/margins": 0.013848078437149525,
"rewards/rejected": -0.14517197012901306,
"sft_loss": 1.3132389783859253,
"step": 1110
},
{
"epoch": 1.8966977138018628,
"grad_norm": 1.3030270338058472,
"learning_rate": 1.4873626378126015e-06,
"logits/chosen": -14.38860034942627,
"logits/rejected": -14.277740478515625,
"logps/chosen": -1.3292900323867798,
"logps/rejected": -1.5040452480316162,
"loss": 1.3984,
"odds_ratio_loss": 0.6911579966545105,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.1329289972782135,
"rewards/margins": 0.017475521191954613,
"rewards/rejected": -0.15040451288223267,
"sft_loss": 1.3292900323867798,
"step": 1120
},
{
"epoch": 1.913632514817951,
"grad_norm": 2.765397071838379,
"learning_rate": 1.446952305075738e-06,
"logits/chosen": -14.399679183959961,
"logits/rejected": -14.427862167358398,
"logps/chosen": -1.3543717861175537,
"logps/rejected": -1.3891161680221558,
"loss": 1.4306,
"odds_ratio_loss": 0.7619328498840332,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13543717563152313,
"rewards/margins": 0.003474441124126315,
"rewards/rejected": -0.13891161978244781,
"sft_loss": 1.3543717861175537,
"step": 1130
},
{
"epoch": 1.9305673158340388,
"grad_norm": 1.730094075202942,
"learning_rate": 1.406873743724065e-06,
"logits/chosen": -14.437395095825195,
"logits/rejected": -14.322535514831543,
"logps/chosen": -1.4621553421020508,
"logps/rejected": -1.6176691055297852,
"loss": 1.5314,
"odds_ratio_loss": 0.692920982837677,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.1462155282497406,
"rewards/margins": 0.015551361255347729,
"rewards/rejected": -0.1617669016122818,
"sft_loss": 1.4621553421020508,
"step": 1140
},
{
"epoch": 1.947502116850127,
"grad_norm": 1.5328463315963745,
"learning_rate": 1.3671395808397898e-06,
"logits/chosen": -14.267127990722656,
"logits/rejected": -14.463046073913574,
"logps/chosen": -1.335663080215454,
"logps/rejected": -1.3676198720932007,
"loss": 1.4094,
"odds_ratio_loss": 0.7378238439559937,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13356631994247437,
"rewards/margins": 0.0031956590246409178,
"rewards/rejected": -0.13676197826862335,
"sft_loss": 1.335663080215454,
"step": 1150
},
{
"epoch": 1.964436917866215,
"grad_norm": 3.9082131385803223,
"learning_rate": 1.3277623349995418e-06,
"logits/chosen": -14.250445365905762,
"logits/rejected": -14.258328437805176,
"logps/chosen": -1.386776089668274,
"logps/rejected": -1.3914397954940796,
"loss": 1.4653,
"odds_ratio_loss": 0.7851333618164062,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13867759704589844,
"rewards/margins": 0.00046637197374366224,
"rewards/rejected": -0.13914397358894348,
"sft_loss": 1.386776089668274,
"step": 1160
},
{
"epoch": 1.9813717188823032,
"grad_norm": 3.576561450958252,
"learning_rate": 1.2887544123302781e-06,
"logits/chosen": -14.434526443481445,
"logits/rejected": -14.393232345581055,
"logps/chosen": -1.4019829034805298,
"logps/rejected": -1.4435473680496216,
"loss": 1.4772,
"odds_ratio_loss": 0.752662181854248,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.14019827544689178,
"rewards/margins": 0.004156465642154217,
"rewards/rejected": -0.14435474574565887,
"sft_loss": 1.4019829034805298,
"step": 1170
},
{
"epoch": 1.9983065198983911,
"grad_norm": 1.4880342483520508,
"learning_rate": 1.2501281026006393e-06,
"logits/chosen": -14.47376823425293,
"logits/rejected": -14.513628005981445,
"logps/chosen": -1.420966386795044,
"logps/rejected": -1.4258407354354858,
"loss": 1.5002,
"odds_ratio_loss": 0.7924087643623352,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.1420966386795044,
"rewards/margins": 0.000487445795442909,
"rewards/rejected": -0.14258407056331635,
"sft_loss": 1.420966386795044,
"step": 1180
},
{
"epoch": 2.015241320914479,
"grad_norm": 1.0734080076217651,
"learning_rate": 1.2118955753489523e-06,
"logits/chosen": -14.561826705932617,
"logits/rejected": -14.332305908203125,
"logps/chosen": -1.3783150911331177,
"logps/rejected": -1.4396107196807861,
"loss": 1.4511,
"odds_ratio_loss": 0.7278788685798645,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.13783150911331177,
"rewards/margins": 0.006129562854766846,
"rewards/rejected": -0.1439610719680786,
"sft_loss": 1.3783150911331177,
"step": 1190
},
{
"epoch": 2.032176121930567,
"grad_norm": 1.3539475202560425,
"learning_rate": 1.1740688760491189e-06,
"logits/chosen": -14.37562370300293,
"logits/rejected": -14.43455696105957,
"logps/chosen": -1.3733515739440918,
"logps/rejected": -1.4605834484100342,
"loss": 1.4435,
"odds_ratio_loss": 0.7019113302230835,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1373351514339447,
"rewards/margins": 0.00872319657355547,
"rewards/rejected": -0.1460583508014679,
"sft_loss": 1.3733515739440918,
"step": 1200
},
{
"epoch": 2.0491109229466553,
"grad_norm": 1.5765854120254517,
"learning_rate": 1.1366599223155847e-06,
"logits/chosen": -14.275134086608887,
"logits/rejected": -14.2963228225708,
"logps/chosen": -1.371392011642456,
"logps/rejected": -1.4632259607315063,
"loss": 1.4461,
"odds_ratio_loss": 0.7467167377471924,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.1371392160654068,
"rewards/margins": 0.009183400310575962,
"rewards/rejected": -0.1463226079940796,
"sft_loss": 1.371392011642456,
"step": 1210
},
{
"epoch": 2.0660457239627434,
"grad_norm": 1.6226162910461426,
"learning_rate": 1.0996805001486067e-06,
"logits/chosen": -14.387079238891602,
"logits/rejected": -14.525866508483887,
"logps/chosen": -1.3380024433135986,
"logps/rejected": -1.4540449380874634,
"loss": 1.4055,
"odds_ratio_loss": 0.6752744913101196,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -0.13380023837089539,
"rewards/margins": 0.011604254133999348,
"rewards/rejected": -0.14540448784828186,
"sft_loss": 1.3380024433135986,
"step": 1220
},
{
"epoch": 2.0829805249788316,
"grad_norm": 2.682673454284668,
"learning_rate": 1.0631422602209608e-06,
"logits/chosen": -14.46452808380127,
"logits/rejected": -14.45245361328125,
"logps/chosen": -1.52396559715271,
"logps/rejected": -1.5300567150115967,
"loss": 1.6016,
"odds_ratio_loss": 0.7762898802757263,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.15239658951759338,
"rewards/margins": 0.0006091115646995604,
"rewards/rejected": -0.1530056893825531,
"sft_loss": 1.52396559715271,
"step": 1230
},
{
"epoch": 2.0999153259949197,
"grad_norm": 0.9156871438026428,
"learning_rate": 1.027056714207319e-06,
"logits/chosen": -14.493863105773926,
"logits/rejected": -14.539648056030273,
"logps/chosen": -1.4326021671295166,
"logps/rejected": -1.5681862831115723,
"loss": 1.5034,
"odds_ratio_loss": 0.7082626223564148,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14326021075248718,
"rewards/margins": 0.0135584007948637,
"rewards/rejected": -0.15681862831115723,
"sft_loss": 1.4326021671295166,
"step": 1240
},
{
"epoch": 2.116850127011008,
"grad_norm": 3.18613600730896,
"learning_rate": 9.914352311573838e-07,
"logits/chosen": -14.396720886230469,
"logits/rejected": -14.398330688476562,
"logps/chosen": -1.3194880485534668,
"logps/rejected": -1.4313329458236694,
"loss": 1.3887,
"odds_ratio_loss": 0.6922628283500671,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1319488137960434,
"rewards/margins": 0.011184502393007278,
"rewards/rejected": -0.14313331246376038,
"sft_loss": 1.3194880485534668,
"step": 1250
},
{
"epoch": 2.1337849280270955,
"grad_norm": 1.0878351926803589,
"learning_rate": 9.562890339139877e-07,
"logits/chosen": -14.146682739257812,
"logits/rejected": -14.353192329406738,
"logps/chosen": -1.3349636793136597,
"logps/rejected": -1.379267930984497,
"loss": 1.4097,
"odds_ratio_loss": 0.7469658255577087,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13349637389183044,
"rewards/margins": 0.004430420231074095,
"rewards/rejected": -0.13792680203914642,
"sft_loss": 1.3349636793136597,
"step": 1260
},
{
"epoch": 2.1507197290431836,
"grad_norm": 1.177203893661499,
"learning_rate": 9.216291955772374e-07,
"logits/chosen": -14.328463554382324,
"logits/rejected": -14.295025825500488,
"logps/chosen": -1.3897377252578735,
"logps/rejected": -1.4198486804962158,
"loss": 1.4659,
"odds_ratio_loss": 0.7619088292121887,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13897378742694855,
"rewards/margins": 0.0030110946390777826,
"rewards/rejected": -0.14198487997055054,
"sft_loss": 1.3897377252578735,
"step": 1270
},
{
"epoch": 2.167654530059272,
"grad_norm": 2.2964181900024414,
"learning_rate": 8.874666360158457e-07,
"logits/chosen": -14.346217155456543,
"logits/rejected": -14.197412490844727,
"logps/chosen": -1.3614085912704468,
"logps/rejected": -1.4674574136734009,
"loss": 1.4341,
"odds_ratio_loss": 0.7273774147033691,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1361408680677414,
"rewards/margins": 0.010604878887534142,
"rewards/rejected": -0.1467457413673401,
"sft_loss": 1.3614085912704468,
"step": 1280
},
{
"epoch": 2.18458933107536,
"grad_norm": 3.246114492416382,
"learning_rate": 8.538121184267315e-07,
"logits/chosen": -14.440536499023438,
"logits/rejected": -14.329854965209961,
"logps/chosen": -1.2875430583953857,
"logps/rejected": -1.3775211572647095,
"loss": 1.3574,
"odds_ratio_loss": 0.6986570954322815,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.12875431776046753,
"rewards/margins": 0.008997795172035694,
"rewards/rejected": -0.13775211572647095,
"sft_loss": 1.2875430583953857,
"step": 1290
},
{
"epoch": 2.201524132091448,
"grad_norm": 1.6076223850250244,
"learning_rate": 8.206762459439907e-07,
"logits/chosen": -14.393684387207031,
"logits/rejected": -14.419075012207031,
"logps/chosen": -1.4106100797653198,
"logps/rejected": -1.4857350587844849,
"loss": 1.4865,
"odds_ratio_loss": 0.758701741695404,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.14106100797653198,
"rewards/margins": 0.0075125014409422874,
"rewards/rejected": -0.14857350289821625,
"sft_loss": 1.4106100797653198,
"step": 1300
},
{
"epoch": 2.218458933107536,
"grad_norm": 1.4635405540466309,
"learning_rate": 7.880694582982898e-07,
"logits/chosen": -14.465181350708008,
"logits/rejected": -14.500001907348633,
"logps/chosen": -1.4319560527801514,
"logps/rejected": -1.5127556324005127,
"loss": 1.506,
"odds_ratio_loss": 0.7399921417236328,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14319561421871185,
"rewards/margins": 0.00807994045317173,
"rewards/rejected": -0.15127556025981903,
"sft_loss": 1.4319560527801514,
"step": 1310
},
{
"epoch": 2.235393734123624,
"grad_norm": 3.1588046550750732,
"learning_rate": 7.560020285277401e-07,
"logits/chosen": -14.269197463989258,
"logits/rejected": -14.49077320098877,
"logps/chosen": -1.3981552124023438,
"logps/rejected": -1.4313172101974487,
"loss": 1.4741,
"odds_ratio_loss": 0.7590950727462769,
"rewards/accuracies": 0.4437499940395355,
"rewards/chosen": -0.1398155391216278,
"rewards/margins": 0.0033162026666104794,
"rewards/rejected": -0.14313173294067383,
"sft_loss": 1.3981552124023438,
"step": 1320
},
{
"epoch": 2.252328535139712,
"grad_norm": 3.586276054382324,
"learning_rate": 7.244840597412956e-07,
"logits/chosen": -14.22734546661377,
"logits/rejected": -14.291172981262207,
"logps/chosen": -1.514716625213623,
"logps/rejected": -1.4183883666992188,
"loss": 1.5981,
"odds_ratio_loss": 0.8342422246932983,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.15147167444229126,
"rewards/margins": -0.009632834233343601,
"rewards/rejected": -0.14183883368968964,
"sft_loss": 1.514716625213623,
"step": 1330
},
{
"epoch": 2.2692633361558,
"grad_norm": 2.3110530376434326,
"learning_rate": 6.935254819356796e-07,
"logits/chosen": -14.419351577758789,
"logits/rejected": -14.297566413879395,
"logps/chosen": -1.4030816555023193,
"logps/rejected": -1.4476964473724365,
"loss": 1.4773,
"odds_ratio_loss": 0.7421059012413025,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1403081715106964,
"rewards/margins": 0.0044614695943892,
"rewards/rejected": -0.14476963877677917,
"sft_loss": 1.4030816555023193,
"step": 1340
},
{
"epoch": 2.2861981371718882,
"grad_norm": 1.1914503574371338,
"learning_rate": 6.631360488668662e-07,
"logits/chosen": -14.460253715515137,
"logits/rejected": -14.41465950012207,
"logps/chosen": -1.2984880208969116,
"logps/rejected": -1.4945783615112305,
"loss": 1.3662,
"odds_ratio_loss": 0.6775275468826294,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12984880805015564,
"rewards/margins": 0.019609034061431885,
"rewards/rejected": -0.14945784211158752,
"sft_loss": 1.2984880208969116,
"step": 1350
},
{
"epoch": 2.3031329381879764,
"grad_norm": 2.2295608520507812,
"learning_rate": 6.333253349770672e-07,
"logits/chosen": -14.249277114868164,
"logits/rejected": -14.275445938110352,
"logps/chosen": -1.4399076700210571,
"logps/rejected": -1.4462318420410156,
"loss": 1.5184,
"odds_ratio_loss": 0.7848686575889587,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14399076998233795,
"rewards/margins": 0.0006324196001514792,
"rewards/rejected": -0.14462319016456604,
"sft_loss": 1.4399076700210571,
"step": 1360
},
{
"epoch": 2.3200677392040645,
"grad_norm": 0.9200133681297302,
"learning_rate": 6.041027323782364e-07,
"logits/chosen": -14.550092697143555,
"logits/rejected": -14.5205717086792,
"logps/chosen": -1.3879852294921875,
"logps/rejected": -1.5073843002319336,
"loss": 1.4568,
"odds_ratio_loss": 0.6877447366714478,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.1387985199689865,
"rewards/margins": 0.011939908377826214,
"rewards/rejected": -0.1507384330034256,
"sft_loss": 1.3879852294921875,
"step": 1370
},
{
"epoch": 2.337002540220152,
"grad_norm": 1.757595181465149,
"learning_rate": 5.754774478929969e-07,
"logits/chosen": -14.518872261047363,
"logits/rejected": -14.515436172485352,
"logps/chosen": -1.4030746221542358,
"logps/rejected": -1.525309443473816,
"loss": 1.4726,
"odds_ratio_loss": 0.6956244707107544,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.1403074562549591,
"rewards/margins": 0.012223480269312859,
"rewards/rejected": -0.1525309532880783,
"sft_loss": 1.4030746221542358,
"step": 1380
},
{
"epoch": 2.3539373412362403,
"grad_norm": 1.9958380460739136,
"learning_rate": 5.474585001539634e-07,
"logits/chosen": -14.516281127929688,
"logits/rejected": -14.449725151062012,
"logps/chosen": -1.3020037412643433,
"logps/rejected": -1.4323627948760986,
"loss": 1.3692,
"odds_ratio_loss": 0.671482503414154,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13020040094852448,
"rewards/margins": 0.013035891577601433,
"rewards/rejected": -0.14323627948760986,
"sft_loss": 1.3020037412643433,
"step": 1390
},
{
"epoch": 2.3708721422523285,
"grad_norm": 0.9711344242095947,
"learning_rate": 5.200547167623424e-07,
"logits/chosen": -14.532658576965332,
"logits/rejected": -14.446354866027832,
"logps/chosen": -1.4261430501937866,
"logps/rejected": -1.6040065288543701,
"loss": 1.4923,
"odds_ratio_loss": 0.6615304946899414,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": -0.14261427521705627,
"rewards/margins": 0.01778637059032917,
"rewards/rejected": -0.1604006588459015,
"sft_loss": 1.4261430501937866,
"step": 1400
},
{
"epoch": 2.3878069432684166,
"grad_norm": 0.9917483925819397,
"learning_rate": 4.932747315067271e-07,
"logits/chosen": -14.57470417022705,
"logits/rejected": -14.438740730285645,
"logps/chosen": -1.4024930000305176,
"logps/rejected": -1.469939947128296,
"loss": 1.4755,
"odds_ratio_loss": 0.7300769090652466,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.14024929702281952,
"rewards/margins": 0.006744695361703634,
"rewards/rejected": -0.1469939947128296,
"sft_loss": 1.4024930000305176,
"step": 1410
},
{
"epoch": 2.4047417442845047,
"grad_norm": 2.87284779548645,
"learning_rate": 4.6712698164294553e-07,
"logits/chosen": -14.489944458007812,
"logits/rejected": -14.394497871398926,
"logps/chosen": -1.4407953023910522,
"logps/rejected": -1.4559253454208374,
"loss": 1.5183,
"odds_ratio_loss": 0.7750439047813416,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.1440795361995697,
"rewards/margins": 0.0015129944076761603,
"rewards/rejected": -0.14559254050254822,
"sft_loss": 1.4407953023910522,
"step": 1420
},
{
"epoch": 2.421676545300593,
"grad_norm": 3.170734167098999,
"learning_rate": 4.41619705235842e-07,
"logits/chosen": -14.575798034667969,
"logits/rejected": -14.610578536987305,
"logps/chosen": -1.375421404838562,
"logps/rejected": -1.5859653949737549,
"loss": 1.4439,
"odds_ratio_loss": 0.6848722696304321,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13754215836524963,
"rewards/margins": 0.02105441316962242,
"rewards/rejected": -0.15859656035900116,
"sft_loss": 1.375421404838562,
"step": 1430
},
{
"epoch": 2.438611346316681,
"grad_norm": 0.8895889520645142,
"learning_rate": 4.167609385637961e-07,
"logits/chosen": -14.474627494812012,
"logits/rejected": -14.27497386932373,
"logps/chosen": -1.3773252964019775,
"logps/rejected": -1.4834753274917603,
"loss": 1.4478,
"odds_ratio_loss": 0.7047079205513,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.13773252069950104,
"rewards/margins": 0.010615019127726555,
"rewards/rejected": -0.14834752678871155,
"sft_loss": 1.3773252964019775,
"step": 1440
},
{
"epoch": 2.4555461473327687,
"grad_norm": 1.5126135349273682,
"learning_rate": 3.9255851358683567e-07,
"logits/chosen": -14.236564636230469,
"logits/rejected": -14.380549430847168,
"logps/chosen": -1.3431507349014282,
"logps/rejected": -1.4221420288085938,
"loss": 1.4184,
"odds_ratio_loss": 0.7521894574165344,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.13431507349014282,
"rewards/margins": 0.007899129763245583,
"rewards/rejected": -0.14221420884132385,
"sft_loss": 1.3431507349014282,
"step": 1450
},
{
"epoch": 2.472480948348857,
"grad_norm": 2.2620511054992676,
"learning_rate": 3.690200554791082e-07,
"logits/chosen": -14.424779891967773,
"logits/rejected": -14.354517936706543,
"logps/chosen": -1.3713457584381104,
"logps/rejected": -1.483659267425537,
"loss": 1.44,
"odds_ratio_loss": 0.6865109205245972,
"rewards/accuracies": 0.4937500059604645,
"rewards/chosen": -0.1371345818042755,
"rewards/margins": 0.01123136654496193,
"rewards/rejected": -0.14836594462394714,
"sft_loss": 1.3713457584381104,
"step": 1460
},
{
"epoch": 2.489415749364945,
"grad_norm": 2.7279679775238037,
"learning_rate": 3.461529802265079e-07,
"logits/chosen": -14.534950256347656,
"logits/rejected": -14.408660888671875,
"logps/chosen": -1.3657411336898804,
"logps/rejected": -1.4428269863128662,
"loss": 1.4382,
"odds_ratio_loss": 0.724717915058136,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": -0.13657411932945251,
"rewards/margins": 0.007708588149398565,
"rewards/rejected": -0.14428271353244781,
"sft_loss": 1.3657411336898804,
"step": 1470
},
{
"epoch": 2.506350550381033,
"grad_norm": 1.4955379962921143,
"learning_rate": 3.2396449229020883e-07,
"logits/chosen": -14.613665580749512,
"logits/rejected": -14.357098579406738,
"logps/chosen": -1.430061936378479,
"logps/rejected": -1.4436513185501099,
"loss": 1.5066,
"odds_ratio_loss": 0.7651657462120056,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.14300617575645447,
"rewards/margins": 0.001358934328891337,
"rewards/rejected": -0.1443651169538498,
"sft_loss": 1.430061936378479,
"step": 1480
},
{
"epoch": 2.523285351397121,
"grad_norm": 2.4484000205993652,
"learning_rate": 3.024615823368371e-07,
"logits/chosen": -14.309808731079102,
"logits/rejected": -14.362199783325195,
"logps/chosen": -1.368744134902954,
"logps/rejected": -1.4789055585861206,
"loss": 1.4407,
"odds_ratio_loss": 0.7196033596992493,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.13687442243099213,
"rewards/margins": 0.011016142554581165,
"rewards/rejected": -0.14789055287837982,
"sft_loss": 1.368744134902954,
"step": 1490
},
{
"epoch": 2.5402201524132093,
"grad_norm": 1.3006510734558105,
"learning_rate": 2.8165102503600716e-07,
"logits/chosen": -14.335368156433105,
"logits/rejected": -14.394729614257812,
"logps/chosen": -1.3518388271331787,
"logps/rejected": -1.5090402364730835,
"loss": 1.4234,
"odds_ratio_loss": 0.7160680890083313,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.1351839005947113,
"rewards/margins": 0.015720132738351822,
"rewards/rejected": -0.15090402960777283,
"sft_loss": 1.3518388271331787,
"step": 1500
},
{
"epoch": 2.5402201524132093,
"eval_logits/chosen": -14.433335304260254,
"eval_logits/rejected": -14.40054702758789,
"eval_logps/chosen": -1.4238022565841675,
"eval_logps/rejected": -1.5122665166854858,
"eval_loss": 1.4967381954193115,
"eval_odds_ratio_loss": 0.7293583154678345,
"eval_rewards/accuracies": 0.5038095116615295,
"eval_rewards/chosen": -0.1423802226781845,
"eval_rewards/margins": 0.008846436627209187,
"eval_rewards/rejected": -0.15122665464878082,
"eval_runtime": 445.9302,
"eval_samples_per_second": 2.355,
"eval_sft_loss": 1.4238022565841675,
"eval_steps_per_second": 1.177,
"step": 1500
},
{
"epoch": 2.557154953429297,
"grad_norm": 1.7379106283187866,
"learning_rate": 2.615393769259039e-07,
"logits/chosen": -14.186014175415039,
"logits/rejected": -14.296531677246094,
"logps/chosen": -1.5615041255950928,
"logps/rejected": -1.4595506191253662,
"loss": 1.6479,
"odds_ratio_loss": 0.8642258644104004,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.1561504304409027,
"rewards/margins": -0.01019534282386303,
"rewards/rejected": -0.14595508575439453,
"sft_loss": 1.5615041255950928,
"step": 1510
},
{
"epoch": 2.574089754445385,
"grad_norm": 1.4174609184265137,
"learning_rate": 2.421329743475917e-07,
"logits/chosen": -14.357484817504883,
"logits/rejected": -14.365758895874023,
"logps/chosen": -1.3432402610778809,
"logps/rejected": -1.4273216724395752,
"loss": 1.4165,
"odds_ratio_loss": 0.7326976656913757,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13432399928569794,
"rewards/margins": 0.008408156223595142,
"rewards/rejected": -0.142732173204422,
"sft_loss": 1.3432402610778809,
"step": 1520
},
{
"epoch": 2.5910245554614733,
"grad_norm": 2.1974966526031494,
"learning_rate": 2.234379314486973e-07,
"logits/chosen": -14.357455253601074,
"logits/rejected": -14.430908203125,
"logps/chosen": -1.3924636840820312,
"logps/rejected": -1.438753366470337,
"loss": 1.4659,
"odds_ratio_loss": 0.734772801399231,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.1392463743686676,
"rewards/margins": 0.004628963768482208,
"rewards/rejected": -0.1438753306865692,
"sft_loss": 1.3924636840820312,
"step": 1530
},
{
"epoch": 2.6079593564775614,
"grad_norm": 1.687161922454834,
"learning_rate": 2.0546013825709783e-07,
"logits/chosen": -14.250285148620605,
"logits/rejected": -14.199666976928711,
"logps/chosen": -1.3859349489212036,
"logps/rejected": -1.6351137161254883,
"loss": 1.4537,
"odds_ratio_loss": 0.6773584485054016,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.13859349489212036,
"rewards/margins": 0.024917880073189735,
"rewards/rejected": -0.16351138055324554,
"sft_loss": 1.3859349489212036,
"step": 1540
},
{
"epoch": 2.6248941574936495,
"grad_norm": 1.5129095315933228,
"learning_rate": 1.88205258825217e-07,
"logits/chosen": -14.429784774780273,
"logits/rejected": -14.17693042755127,
"logps/chosen": -1.2826873064041138,
"logps/rejected": -1.4500634670257568,
"loss": 1.35,
"odds_ratio_loss": 0.6734637022018433,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.12826873362064362,
"rewards/margins": 0.016737615689635277,
"rewards/rejected": -0.14500637352466583,
"sft_loss": 1.2826873064041138,
"step": 1550
},
{
"epoch": 2.6418289585097376,
"grad_norm": 2.0735878944396973,
"learning_rate": 1.7167872944552245e-07,
"logits/chosen": -14.309649467468262,
"logits/rejected": -14.5745849609375,
"logps/chosen": -1.3819622993469238,
"logps/rejected": -1.4382798671722412,
"loss": 1.4545,
"odds_ratio_loss": 0.7257741689682007,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.1381962150335312,
"rewards/margins": 0.005631768610328436,
"rewards/rejected": -0.14382800459861755,
"sft_loss": 1.3819622993469238,
"step": 1560
},
{
"epoch": 2.6587637595258258,
"grad_norm": 1.331615924835205,
"learning_rate": 1.5588575693777142e-07,
"logits/chosen": -14.269506454467773,
"logits/rejected": -14.277575492858887,
"logps/chosen": -1.3485890626907349,
"logps/rejected": -1.391801118850708,
"loss": 1.42,
"odds_ratio_loss": 0.714430034160614,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": -0.1348589062690735,
"rewards/margins": 0.004321185871958733,
"rewards/rejected": -0.13918009400367737,
"sft_loss": 1.3485890626907349,
"step": 1570
},
{
"epoch": 2.675698560541914,
"grad_norm": 1.4459912776947021,
"learning_rate": 1.4083131700856428e-07,
"logits/chosen": -14.257006645202637,
"logits/rejected": -14.398195266723633,
"logps/chosen": -1.4757592678070068,
"logps/rejected": -1.4755744934082031,
"loss": 1.553,
"odds_ratio_loss": 0.7721089124679565,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.14757592976093292,
"rewards/margins": -1.848684587457683e-05,
"rewards/rejected": -0.14755743741989136,
"sft_loss": 1.4757592678070068,
"step": 1580
},
{
"epoch": 2.6926333615580016,
"grad_norm": 1.7114406824111938,
"learning_rate": 1.2652015268370315e-07,
"logits/chosen": -14.462023735046387,
"logits/rejected": -14.4578218460083,
"logps/chosen": -1.3610906600952148,
"logps/rejected": -1.4776142835617065,
"loss": 1.4335,
"odds_ratio_loss": 0.7242997884750366,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.13610906898975372,
"rewards/margins": 0.011652367189526558,
"rewards/rejected": -0.14776143431663513,
"sft_loss": 1.3610906600952148,
"step": 1590
},
{
"epoch": 2.7095681625740897,
"grad_norm": 1.469370722770691,
"learning_rate": 1.1295677281386502e-07,
"logits/chosen": -14.559967041015625,
"logits/rejected": -14.478399276733398,
"logps/chosen": -1.4620916843414307,
"logps/rejected": -1.5956697463989258,
"loss": 1.5327,
"odds_ratio_loss": 0.706096351146698,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.14620915055274963,
"rewards/margins": 0.013357831165194511,
"rewards/rejected": -0.1595669686794281,
"sft_loss": 1.4620916843414307,
"step": 1600
},
{
"epoch": 2.726502963590178,
"grad_norm": 3.563047409057617,
"learning_rate": 1.0014545065404973e-07,
"logits/chosen": -14.436056137084961,
"logits/rejected": -14.507670402526855,
"logps/chosen": -1.4244582653045654,
"logps/rejected": -1.5525462627410889,
"loss": 1.4981,
"odds_ratio_loss": 0.7365024089813232,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.14244583249092102,
"rewards/margins": 0.012808804400265217,
"rewards/rejected": -0.15525463223457336,
"sft_loss": 1.4244582653045654,
"step": 1610
},
{
"epoch": 2.743437764606266,
"grad_norm": 1.1012893915176392,
"learning_rate": 8.809022251725502e-08,
"logits/chosen": -14.58587646484375,
"logits/rejected": -14.311334609985352,
"logps/chosen": -1.3465197086334229,
"logps/rejected": -1.5383667945861816,
"loss": 1.4143,
"odds_ratio_loss": 0.6780072450637817,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -0.13465197384357452,
"rewards/margins": 0.01918472908437252,
"rewards/rejected": -0.1538366973400116,
"sft_loss": 1.3465197086334229,
"step": 1620
},
{
"epoch": 2.7603725656223537,
"grad_norm": 1.1277046203613281,
"learning_rate": 7.679488650280509e-08,
"logits/chosen": -14.479377746582031,
"logits/rejected": -14.5874605178833,
"logps/chosen": -1.3598499298095703,
"logps/rejected": -1.5038646459579468,
"loss": 1.4265,
"odds_ratio_loss": 0.6669132113456726,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13598500192165375,
"rewards/margins": 0.014401474967598915,
"rewards/rejected": -0.1503864824771881,
"sft_loss": 1.3598499298095703,
"step": 1630
},
{
"epoch": 2.777307366638442,
"grad_norm": 4.1279425621032715,
"learning_rate": 6.626300129972563e-08,
"logits/chosen": -14.374710083007812,
"logits/rejected": -14.649663925170898,
"logps/chosen": -1.337192177772522,
"logps/rejected": -1.399910569190979,
"loss": 1.4103,
"odds_ratio_loss": 0.7308821678161621,
"rewards/accuracies": 0.4312500059604645,
"rewards/chosen": -0.13371922075748444,
"rewards/margins": 0.0062718503177165985,
"rewards/rejected": -0.13999105989933014,
"sft_loss": 1.337192177772522,
"step": 1640
},
{
"epoch": 2.79424216765453,
"grad_norm": 1.395706057548523,
"learning_rate": 5.649788506555065e-08,
"logits/chosen": -14.170741081237793,
"logits/rejected": -14.524632453918457,
"logps/chosen": -1.359508752822876,
"logps/rejected": -1.4829118251800537,
"loss": 1.4274,
"odds_ratio_loss": 0.6786811351776123,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.13595086336135864,
"rewards/margins": 0.012340312823653221,
"rewards/rejected": -0.1482912003993988,
"sft_loss": 1.359508752822876,
"step": 1650
},
{
"epoch": 2.811176968670618,
"grad_norm": 1.766761302947998,
"learning_rate": 4.7502614380908474e-08,
"logits/chosen": -14.416241645812988,
"logits/rejected": -14.220751762390137,
"logps/chosen": -1.3510209321975708,
"logps/rejected": -1.4324430227279663,
"loss": 1.4234,
"odds_ratio_loss": 0.7241480946540833,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.13510209321975708,
"rewards/margins": 0.008142213337123394,
"rewards/rejected": -0.14324429631233215,
"sft_loss": 1.3510209321975708,
"step": 1660
},
{
"epoch": 2.828111769686706,
"grad_norm": 1.6919310092926025,
"learning_rate": 3.9280023280222066e-08,
"logits/chosen": -14.29878044128418,
"logits/rejected": -14.355636596679688,
"logps/chosen": -1.3545790910720825,
"logps/rejected": -1.4631725549697876,
"loss": 1.4267,
"odds_ratio_loss": 0.7212874293327332,
"rewards/accuracies": 0.48124998807907104,
"rewards/chosen": -0.13545790314674377,
"rewards/margins": 0.010859351605176926,
"rewards/rejected": -0.146317258477211,
"sft_loss": 1.3545790910720825,
"step": 1670
},
{
"epoch": 2.8450465707027943,
"grad_norm": 1.2037099599838257,
"learning_rate": 3.1832702358818855e-08,
"logits/chosen": -14.370442390441895,
"logits/rejected": -14.418550491333008,
"logps/chosen": -1.509386658668518,
"logps/rejected": -1.5371757745742798,
"loss": 1.5849,
"odds_ratio_loss": 0.7552896738052368,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -0.15093867480754852,
"rewards/margins": 0.0027789073064923286,
"rewards/rejected": -0.15371759235858917,
"sft_loss": 1.509386658668518,
"step": 1680
},
{
"epoch": 2.8619813717188824,
"grad_norm": 1.7988624572753906,
"learning_rate": 2.5162997956746647e-08,
"logits/chosen": -14.56567096710205,
"logits/rejected": -14.401374816894531,
"logps/chosen": -1.374145746231079,
"logps/rejected": -1.5657732486724854,
"loss": 1.4407,
"odds_ratio_loss": 0.6658385992050171,
"rewards/accuracies": 0.48750001192092896,
"rewards/chosen": -0.1374145746231079,
"rewards/margins": 0.019162729382514954,
"rewards/rejected": -0.15657731890678406,
"sft_loss": 1.374145746231079,
"step": 1690
},
{
"epoch": 2.8789161727349706,
"grad_norm": 1.8519299030303955,
"learning_rate": 1.9273011419536914e-08,
"logits/chosen": -14.358851432800293,
"logits/rejected": -14.361642837524414,
"logps/chosen": -1.3464272022247314,
"logps/rejected": -1.403352975845337,
"loss": 1.4203,
"odds_ratio_loss": 0.7389532327651978,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -0.13464272022247314,
"rewards/margins": 0.005692584905773401,
"rewards/rejected": -0.1403352916240692,
"sft_loss": 1.3464272022247314,
"step": 1700
},
{
"epoch": 2.8958509737510583,
"grad_norm": 1.5958627462387085,
"learning_rate": 1.4164598436159083e-08,
"logits/chosen": -14.45777416229248,
"logits/rejected": -14.55150318145752,
"logps/chosen": -1.3691927194595337,
"logps/rejected": -1.3762314319610596,
"loss": 1.4457,
"odds_ratio_loss": 0.7649668455123901,
"rewards/accuracies": 0.45625001192092896,
"rewards/chosen": -0.13691926002502441,
"rewards/margins": 0.0007038834737613797,
"rewards/rejected": -0.1376231610774994,
"sft_loss": 1.3691927194595337,
"step": 1710
},
{
"epoch": 2.9127857747671464,
"grad_norm": 1.1447230577468872,
"learning_rate": 9.839368454371556e-09,
"logits/chosen": -14.424572944641113,
"logits/rejected": -14.471136093139648,
"logps/chosen": -1.3717620372772217,
"logps/rejected": -1.5175390243530273,
"loss": 1.4405,
"odds_ratio_loss": 0.6872409582138062,
"rewards/accuracies": 0.53125,
"rewards/chosen": -0.13717620074748993,
"rewards/margins": 0.014577709138393402,
"rewards/rejected": -0.15175390243530273,
"sft_loss": 1.3717620372772217,
"step": 1720
},
{
"epoch": 2.9297205757832345,
"grad_norm": 1.2689136266708374,
"learning_rate": 6.298684173650649e-09,
"logits/chosen": -14.209467887878418,
"logits/rejected": -14.251020431518555,
"logps/chosen": -1.3433691263198853,
"logps/rejected": -1.4693882465362549,
"loss": 1.4164,
"odds_ratio_loss": 0.7302767038345337,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.134336918592453,
"rewards/margins": 0.012601924128830433,
"rewards/rejected": -0.14693884551525116,
"sft_loss": 1.3433691263198853,
"step": 1730
},
{
"epoch": 2.9466553767993227,
"grad_norm": 1.0975892543792725,
"learning_rate": 3.543661115860686e-09,
"logits/chosen": -14.267629623413086,
"logits/rejected": -14.19848918914795,
"logps/chosen": -1.3776047229766846,
"logps/rejected": -1.4311275482177734,
"loss": 1.4519,
"odds_ratio_loss": 0.7429286241531372,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.1377604901790619,
"rewards/margins": 0.0053522614762187,
"rewards/rejected": -0.14311274886131287,
"sft_loss": 1.3776047229766846,
"step": 1740
},
{
"epoch": 2.963590177815411,
"grad_norm": 1.3392242193222046,
"learning_rate": 1.575167273800693e-09,
"logits/chosen": -14.299784660339355,
"logits/rejected": -14.385360717773438,
"logps/chosen": -1.3382477760314941,
"logps/rejected": -1.3448528051376343,
"loss": 1.4132,
"odds_ratio_loss": 0.7496879696846008,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -0.13382478058338165,
"rewards/margins": 0.0006605213275179267,
"rewards/rejected": -0.13448528945446014,
"sft_loss": 1.3382477760314941,
"step": 1750
},
{
"epoch": 2.9805249788314985,
"grad_norm": 1.3686504364013672,
"learning_rate": 3.9382283773564676e-10,
"logits/chosen": -14.456472396850586,
"logits/rejected": -14.480894088745117,
"logps/chosen": -1.4318442344665527,
"logps/rejected": -1.5163114070892334,
"loss": 1.5085,
"odds_ratio_loss": 0.7666203379631042,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.14318443834781647,
"rewards/margins": 0.0084467101842165,
"rewards/rejected": -0.15163113176822662,
"sft_loss": 1.4318442344665527,
"step": 1760
},
{
"epoch": 2.9974597798475866,
"grad_norm": 8.027430534362793,
"learning_rate": 0.0,
"logits/chosen": -14.504228591918945,
"logits/rejected": -14.523704528808594,
"logps/chosen": -1.456779956817627,
"logps/rejected": -1.5364240407943726,
"loss": 1.5332,
"odds_ratio_loss": 0.7639864683151245,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.14567799866199493,
"rewards/margins": 0.007964405231177807,
"rewards/rejected": -0.1536424160003662,
"sft_loss": 1.456779956817627,
"step": 1770
},
{
"epoch": 2.9974597798475866,
"step": 1770,
"total_flos": 1.8624482718096753e+18,
"train_loss": 1.5362868001905539,
"train_runtime": 27766.9561,
"train_samples_per_second": 1.021,
"train_steps_per_second": 0.064
}
],
"logging_steps": 10,
"max_steps": 1770,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"total_flos": 1.8624482718096753e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}