{ "best_metric": 1.4967381954193115, "best_model_checkpoint": "saves/Falcon-7B-Instruct/lora/orpo-salt-half/checkpoint-1500", "epoch": 2.9974597798475866, "eval_steps": 500, "global_step": 1770, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01693480101608806, "grad_norm": 0.6027132868766785, "learning_rate": 4.999614014035063e-06, "logits/chosen": -14.201833724975586, "logits/rejected": -14.270045280456543, "logps/chosen": -1.961771011352539, "logps/rejected": -2.1497561931610107, "loss": 2.0361, "odds_ratio_loss": 0.7429978251457214, "rewards/accuracies": 0.5, "rewards/chosen": -0.19617711007595062, "rewards/margins": 0.01879853382706642, "rewards/rejected": -0.21497564017772675, "sft_loss": 1.961771011352539, "step": 10 }, { "epoch": 0.03386960203217612, "grad_norm": 0.4791746735572815, "learning_rate": 4.998440543386042e-06, "logits/chosen": -14.17326545715332, "logits/rejected": -14.03160572052002, "logps/chosen": -1.9260406494140625, "logps/rejected": -2.0053372383117676, "loss": 2.0019, "odds_ratio_loss": 0.7586489915847778, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.19260406494140625, "rewards/margins": 0.007929656654596329, "rewards/rejected": -0.20053371787071228, "sft_loss": 1.9260406494140625, "step": 20 }, { "epoch": 0.05080440304826418, "grad_norm": 0.3785243630409241, "learning_rate": 4.996479918381253e-06, "logits/chosen": -14.245376586914062, "logits/rejected": -14.222900390625, "logps/chosen": -1.8398857116699219, "logps/rejected": -1.8666032552719116, "loss": 1.9146, "odds_ratio_loss": 0.7475350499153137, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.18398860096931458, "rewards/margins": 0.00267172628082335, "rewards/rejected": -0.18666031956672668, "sft_loss": 1.8398857116699219, "step": 30 }, { "epoch": 0.06773920406435224, "grad_norm": 0.637917697429657, "learning_rate": 4.993732756731818e-06, "logits/chosen": -14.213427543640137, "logits/rejected": -14.385249137878418, "logps/chosen": -1.8162885904312134, "logps/rejected": -1.9234222173690796, "loss": 1.889, "odds_ratio_loss": 0.7271509766578674, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.18162885308265686, "rewards/margins": 0.010713383555412292, "rewards/rejected": -0.19234223663806915, "sft_loss": 1.8162885904312134, "step": 40 }, { "epoch": 0.0846740050804403, "grad_norm": 0.6790710091590881, "learning_rate": 4.9901999239537345e-06, "logits/chosen": -14.203392028808594, "logits/rejected": -14.118731498718262, "logps/chosen": -1.9451831579208374, "logps/rejected": -1.9480127096176147, "loss": 2.0255, "odds_ratio_loss": 0.8034948110580444, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1945182979106903, "rewards/margins": 0.0002829456643667072, "rewards/rejected": -0.19480125606060028, "sft_loss": 1.9451831579208374, "step": 50 }, { "epoch": 0.10160880609652836, "grad_norm": 0.38820621371269226, "learning_rate": 4.985882533095186e-06, "logits/chosen": -14.125239372253418, "logits/rejected": -14.241134643554688, "logps/chosen": -1.7669858932495117, "logps/rejected": -1.818566918373108, "loss": 1.8465, "odds_ratio_loss": 0.7950754761695862, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.17669859528541565, "rewards/margins": 0.005158091429620981, "rewards/rejected": -0.1818566769361496, "sft_loss": 1.7669858932495117, "step": 60 }, { "epoch": 0.11854360711261643, "grad_norm": 1.485378384590149, "learning_rate": 4.9807819443858705e-06, "logits/chosen": -14.16772174835205, "logits/rejected": -14.14952564239502, "logps/chosen": -1.7974742650985718, "logps/rejected": -1.8876419067382812, "loss": 1.8722, "odds_ratio_loss": 0.7475281953811646, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.17974743247032166, "rewards/margins": 0.009016749449074268, "rewards/rejected": -0.18876421451568604, "sft_loss": 1.7974742650985718, "step": 70 }, { "epoch": 0.1354784081287045, "grad_norm": 0.6158199310302734, "learning_rate": 4.9748997648084404e-06, "logits/chosen": -14.09917163848877, "logits/rejected": -14.224530220031738, "logps/chosen": -1.7899717092514038, "logps/rejected": -1.8508037328720093, "loss": 1.8688, "odds_ratio_loss": 0.7882196307182312, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.17899715900421143, "rewards/margins": 0.0060832141898572445, "rewards/rejected": -0.1850803941488266, "sft_loss": 1.7899717092514038, "step": 80 }, { "epoch": 0.15241320914479256, "grad_norm": 1.0399421453475952, "learning_rate": 4.96823784759222e-06, "logits/chosen": -14.11219596862793, "logits/rejected": -14.099919319152832, "logps/chosen": -1.7365163564682007, "logps/rejected": -1.7418320178985596, "loss": 1.8161, "odds_ratio_loss": 0.7956770658493042, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.17365165054798126, "rewards/margins": 0.0005315736052580178, "rewards/rejected": -0.174183189868927, "sft_loss": 1.7365163564682007, "step": 90 }, { "epoch": 0.1693480101608806, "grad_norm": 0.9482620358467102, "learning_rate": 4.960798291629323e-06, "logits/chosen": -14.198771476745605, "logits/rejected": -14.24067497253418, "logps/chosen": -1.8019222021102905, "logps/rejected": -1.7944272756576538, "loss": 1.8785, "odds_ratio_loss": 0.765292227268219, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.18019220232963562, "rewards/margins": -0.000749480735976249, "rewards/rejected": -0.17944273352622986, "sft_loss": 1.8019222021102905, "step": 100 }, { "epoch": 0.18628281117696868, "grad_norm": 1.496517539024353, "learning_rate": 4.952583440813383e-06, "logits/chosen": -14.270334243774414, "logits/rejected": -14.252988815307617, "logps/chosen": -1.8082859516143799, "logps/rejected": -1.8689155578613281, "loss": 1.885, "odds_ratio_loss": 0.7666890025138855, "rewards/accuracies": 0.46875, "rewards/chosen": -0.18082860112190247, "rewards/margins": 0.0060629709623754025, "rewards/rejected": -0.186891570687294, "sft_loss": 1.8082859516143799, "step": 110 }, { "epoch": 0.20321761219305673, "grad_norm": 0.8162474036216736, "learning_rate": 4.943595883301086e-06, "logits/chosen": -14.396245002746582, "logits/rejected": -14.407267570495605, "logps/chosen": -1.8202846050262451, "logps/rejected": -1.8238685131072998, "loss": 1.8966, "odds_ratio_loss": 0.7631626129150391, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.18202845752239227, "rewards/margins": 0.0003583906218409538, "rewards/rejected": -0.1823868602514267, "sft_loss": 1.8202846050262451, "step": 120 }, { "epoch": 0.2201524132091448, "grad_norm": 0.9815341234207153, "learning_rate": 4.933838450696757e-06, "logits/chosen": -14.14527702331543, "logits/rejected": -14.11426830291748, "logps/chosen": -1.6691076755523682, "logps/rejected": -1.7151718139648438, "loss": 1.7441, "odds_ratio_loss": 0.7502495050430298, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.16691075265407562, "rewards/margins": 0.004606431350111961, "rewards/rejected": -0.17151719331741333, "sft_loss": 1.6691076755523682, "step": 130 }, { "epoch": 0.23708721422523285, "grad_norm": 1.7414650917053223, "learning_rate": 4.923314217160234e-06, "logits/chosen": -14.14660358428955, "logits/rejected": -14.196474075317383, "logps/chosen": -1.7544286251068115, "logps/rejected": -1.7217376232147217, "loss": 1.8341, "odds_ratio_loss": 0.7964597344398499, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.1754428595304489, "rewards/margins": -0.003269097302109003, "rewards/rejected": -0.17217376828193665, "sft_loss": 1.7544286251068115, "step": 140 }, { "epoch": 0.2540220152413209, "grad_norm": 0.6009025573730469, "learning_rate": 4.9120264984383285e-06, "logits/chosen": -14.155496597290039, "logits/rejected": -14.008768081665039, "logps/chosen": -1.5715187788009644, "logps/rejected": -1.608656644821167, "loss": 1.6472, "odds_ratio_loss": 0.7572886347770691, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.15715190768241882, "rewards/margins": 0.003713789861649275, "rewards/rejected": -0.1608656644821167, "sft_loss": 1.5715187788009644, "step": 150 }, { "epoch": 0.270956816257409, "grad_norm": 0.7238659858703613, "learning_rate": 4.899978850820176e-06, "logits/chosen": -14.257448196411133, "logits/rejected": -14.187673568725586, "logps/chosen": -1.7162948846817017, "logps/rejected": -1.7536369562149048, "loss": 1.7925, "odds_ratio_loss": 0.7625432014465332, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.17162947356700897, "rewards/margins": 0.0037342351861298084, "rewards/rejected": -0.1753637045621872, "sft_loss": 1.7162948846817017, "step": 160 }, { "epoch": 0.28789161727349705, "grad_norm": 0.9593597650527954, "learning_rate": 4.887175070016795e-06, "logits/chosen": -14.389033317565918, "logits/rejected": -14.29101276397705, "logps/chosen": -1.514937162399292, "logps/rejected": -1.5708329677581787, "loss": 1.5883, "odds_ratio_loss": 0.7331644296646118, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15149369835853577, "rewards/margins": 0.005589589010924101, "rewards/rejected": -0.15708328783512115, "sft_loss": 1.514937162399292, "step": 170 }, { "epoch": 0.3048264182895851, "grad_norm": 1.0034801959991455, "learning_rate": 4.873619189965217e-06, "logits/chosen": -14.039607048034668, "logits/rejected": -14.147199630737305, "logps/chosen": -1.5949114561080933, "logps/rejected": -1.746072769165039, "loss": 1.6635, "odds_ratio_loss": 0.6863279938697815, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.1594911515712738, "rewards/margins": 0.015116140246391296, "rewards/rejected": -0.1746072769165039, "sft_loss": 1.5949114561080933, "step": 180 }, { "epoch": 0.32176121930567314, "grad_norm": 0.89156574010849, "learning_rate": 4.859315481557563e-06, "logits/chosen": -14.219070434570312, "logits/rejected": -14.151147842407227, "logps/chosen": -1.5719926357269287, "logps/rejected": -1.6470130681991577, "loss": 1.6487, "odds_ratio_loss": 0.767541766166687, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.15719927847385406, "rewards/margins": 0.0075020515359938145, "rewards/rejected": -0.16470131278038025, "sft_loss": 1.5719926357269287, "step": 190 }, { "epoch": 0.3386960203217612, "grad_norm": 0.587933361530304, "learning_rate": 4.84426845129546e-06, "logits/chosen": -14.344035148620605, "logits/rejected": -14.321207046508789, "logps/chosen": -1.6490224599838257, "logps/rejected": -1.638528823852539, "loss": 1.7261, "odds_ratio_loss": 0.7703002095222473, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.1649022400379181, "rewards/margins": -0.0010493483860045671, "rewards/rejected": -0.16385288536548615, "sft_loss": 1.6490224599838257, "step": 200 }, { "epoch": 0.3556308213378493, "grad_norm": 2.0271973609924316, "learning_rate": 4.828482839870233e-06, "logits/chosen": -14.22668170928955, "logits/rejected": -14.1005220413208, "logps/chosen": -1.5818629264831543, "logps/rejected": -1.5753711462020874, "loss": 1.6618, "odds_ratio_loss": 0.7996558547019958, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.15818628668785095, "rewards/margins": -0.0006491712993010879, "rewards/rejected": -0.15753711760044098, "sft_loss": 1.5818629264831543, "step": 210 }, { "epoch": 0.37256562235393736, "grad_norm": 0.809647262096405, "learning_rate": 4.811963620669314e-06, "logits/chosen": -14.262086868286133, "logits/rejected": -14.35071849822998, "logps/chosen": -1.5450419187545776, "logps/rejected": -1.599981665611267, "loss": 1.6187, "odds_ratio_loss": 0.7366654276847839, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.15450419485569, "rewards/margins": 0.005493967793881893, "rewards/rejected": -0.15999816358089447, "sft_loss": 1.5450419187545776, "step": 220 }, { "epoch": 0.3895004233700254, "grad_norm": 0.9206905961036682, "learning_rate": 4.794715998209328e-06, "logits/chosen": -14.026702880859375, "logits/rejected": -14.009126663208008, "logps/chosen": -1.5401651859283447, "logps/rejected": -1.6259161233901978, "loss": 1.6132, "odds_ratio_loss": 0.7308396100997925, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15401650965213776, "rewards/margins": 0.008575108833611012, "rewards/rejected": -0.1625916212797165, "sft_loss": 1.5401651859283447, "step": 230 }, { "epoch": 0.40643522438611346, "grad_norm": 1.0553600788116455, "learning_rate": 4.7767454064963724e-06, "logits/chosen": -14.294774055480957, "logits/rejected": -14.33879280090332, "logps/chosen": -1.571942925453186, "logps/rejected": -1.6219526529312134, "loss": 1.6455, "odds_ratio_loss": 0.7359451651573181, "rewards/accuracies": 0.53125, "rewards/chosen": -0.15719430148601532, "rewards/margins": 0.005000968463718891, "rewards/rejected": -0.16219528019428253, "sft_loss": 1.571942925453186, "step": 240 }, { "epoch": 0.42337002540220153, "grad_norm": 1.133743166923523, "learning_rate": 4.758057507313987e-06, "logits/chosen": -14.3100004196167, "logits/rejected": -14.21064567565918, "logps/chosen": -1.4966617822647095, "logps/rejected": -1.5281431674957275, "loss": 1.5708, "odds_ratio_loss": 0.7413426041603088, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1496661901473999, "rewards/margins": 0.0031481466721743345, "rewards/rejected": -0.1528143286705017, "sft_loss": 1.4966617822647095, "step": 250 }, { "epoch": 0.4403048264182896, "grad_norm": 1.6055690050125122, "learning_rate": 4.73865818843936e-06, "logits/chosen": -14.18690299987793, "logits/rejected": -14.250242233276367, "logps/chosen": -1.5969842672348022, "logps/rejected": -1.7042526006698608, "loss": 1.6715, "odds_ratio_loss": 0.744690477848053, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1596984565258026, "rewards/margins": 0.010726812295615673, "rewards/rejected": -0.17042526602745056, "sft_loss": 1.5969842672348022, "step": 260 }, { "epoch": 0.4572396274343776, "grad_norm": 0.7864425778388977, "learning_rate": 4.718553561788339e-06, "logits/chosen": -14.111845016479492, "logits/rejected": -14.31633186340332, "logps/chosen": -1.487687110900879, "logps/rejected": -1.5424432754516602, "loss": 1.5596, "odds_ratio_loss": 0.7193279266357422, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14876870810985565, "rewards/margins": 0.00547564122825861, "rewards/rejected": -0.1542443484067917, "sft_loss": 1.487687110900879, "step": 270 }, { "epoch": 0.4741744284504657, "grad_norm": 1.302501916885376, "learning_rate": 4.697749961489822e-06, "logits/chosen": -14.314417839050293, "logits/rejected": -14.266924858093262, "logps/chosen": -1.6229807138442993, "logps/rejected": -1.7468087673187256, "loss": 1.6957, "odds_ratio_loss": 0.7271685004234314, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1622980535030365, "rewards/margins": 0.012382803484797478, "rewards/rejected": -0.17468087375164032, "sft_loss": 1.6229807138442993, "step": 280 }, { "epoch": 0.4911092294665538, "grad_norm": 0.9335818886756897, "learning_rate": 4.67625394189013e-06, "logits/chosen": -14.308195114135742, "logits/rejected": -14.264862060546875, "logps/chosen": -1.47157883644104, "logps/rejected": -1.6349776983261108, "loss": 1.5387, "odds_ratio_loss": 0.6707261204719543, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14715787768363953, "rewards/margins": 0.01633988879621029, "rewards/rejected": -0.16349777579307556, "sft_loss": 1.47157883644104, "step": 290 }, { "epoch": 0.5080440304826418, "grad_norm": 1.5830973386764526, "learning_rate": 4.654072275488016e-06, "logits/chosen": -14.484451293945312, "logits/rejected": -14.427891731262207, "logps/chosen": -1.4168641567230225, "logps/rejected": -1.4915310144424438, "loss": 1.4878, "odds_ratio_loss": 0.7094072103500366, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.14168642461299896, "rewards/margins": 0.0074666752479970455, "rewards/rejected": -0.14915308356285095, "sft_loss": 1.4168641567230225, "step": 300 }, { "epoch": 0.5249788314987299, "grad_norm": 1.3539669513702393, "learning_rate": 4.631211950800925e-06, "logits/chosen": -14.32929515838623, "logits/rejected": -14.424825668334961, "logps/chosen": -1.4027061462402344, "logps/rejected": -1.481377363204956, "loss": 1.4748, "odds_ratio_loss": 0.7213728427886963, "rewards/accuracies": 0.59375, "rewards/chosen": -0.14027062058448792, "rewards/margins": 0.007867120206356049, "rewards/rejected": -0.14813774824142456, "sft_loss": 1.4027061462402344, "step": 310 }, { "epoch": 0.541913632514818, "grad_norm": 2.352029323577881, "learning_rate": 4.6076801701632095e-06, "logits/chosen": -14.217028617858887, "logits/rejected": -14.44648551940918, "logps/chosen": -1.513146162033081, "logps/rejected": -1.49079430103302, "loss": 1.5925, "odds_ratio_loss": 0.7936692833900452, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1513146311044693, "rewards/margins": -0.002235203282907605, "rewards/rejected": -0.14907941222190857, "sft_loss": 1.513146162033081, "step": 320 }, { "epoch": 0.558848433530906, "grad_norm": 0.966873288154602, "learning_rate": 4.583484347456972e-06, "logits/chosen": -14.30597972869873, "logits/rejected": -14.244359016418457, "logps/chosen": -1.5698734521865845, "logps/rejected": -1.5634009838104248, "loss": 1.648, "odds_ratio_loss": 0.781231164932251, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.15698735415935516, "rewards/margins": -0.0006472375243902206, "rewards/rejected": -0.1563401073217392, "sft_loss": 1.5698734521865845, "step": 330 }, { "epoch": 0.5757832345469941, "grad_norm": 0.9054247140884399, "learning_rate": 4.55863210577626e-06, "logits/chosen": -14.461858749389648, "logits/rejected": -14.340890884399414, "logps/chosen": -1.5450735092163086, "logps/rejected": -1.656599760055542, "loss": 1.6172, "odds_ratio_loss": 0.7215217351913452, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.1545073539018631, "rewards/margins": 0.011152632534503937, "rewards/rejected": -0.16565999388694763, "sft_loss": 1.5450735092163086, "step": 340 }, { "epoch": 0.5927180355630821, "grad_norm": 0.9015621542930603, "learning_rate": 4.5331312750253465e-06, "logits/chosen": -14.178003311157227, "logits/rejected": -14.2726411819458, "logps/chosen": -1.487000584602356, "logps/rejected": -1.4908943176269531, "loss": 1.5652, "odds_ratio_loss": 0.7824643850326538, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1487000733613968, "rewards/margins": 0.00038935727206990123, "rewards/rejected": -0.14908942580223083, "sft_loss": 1.487000584602356, "step": 350 }, { "epoch": 0.6096528365791702, "grad_norm": 2.001441717147827, "learning_rate": 4.506989889451858e-06, "logits/chosen": -14.397753715515137, "logits/rejected": -14.500781059265137, "logps/chosen": -1.4975332021713257, "logps/rejected": -1.5102782249450684, "loss": 1.5735, "odds_ratio_loss": 0.7592841982841492, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14975331723690033, "rewards/margins": 0.0012745079584419727, "rewards/rejected": -0.1510278284549713, "sft_loss": 1.4975332021713257, "step": 360 }, { "epoch": 0.6265876375952583, "grad_norm": 1.57513427734375, "learning_rate": 4.480216185115512e-06, "logits/chosen": -14.3065767288208, "logits/rejected": -14.306581497192383, "logps/chosen": -1.4990990161895752, "logps/rejected": -1.6238371133804321, "loss": 1.5673, "odds_ratio_loss": 0.6823247671127319, "rewards/accuracies": 0.53125, "rewards/chosen": -0.14990989863872528, "rewards/margins": 0.0124738160520792, "rewards/rejected": -0.16238370537757874, "sft_loss": 1.4990990161895752, "step": 370 }, { "epoch": 0.6435224386113463, "grad_norm": 1.0783131122589111, "learning_rate": 4.4528185972932856e-06, "logits/chosen": -14.319122314453125, "logits/rejected": -14.488665580749512, "logps/chosen": -1.5176422595977783, "logps/rejected": -1.656542420387268, "loss": 1.5915, "odds_ratio_loss": 0.7389153242111206, "rewards/accuracies": 0.5, "rewards/chosen": -0.15176422894001007, "rewards/margins": 0.013890010304749012, "rewards/rejected": -0.1656542271375656, "sft_loss": 1.5176422595977783, "step": 380 }, { "epoch": 0.6604572396274344, "grad_norm": 1.4694324731826782, "learning_rate": 4.424805757821803e-06, "logits/chosen": -14.226755142211914, "logits/rejected": -14.333894729614258, "logps/chosen": -1.574268102645874, "logps/rejected": -1.6511255502700806, "loss": 1.6513, "odds_ratio_loss": 0.7702363133430481, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.15742680430412292, "rewards/margins": 0.007685736753046513, "rewards/rejected": -0.16511255502700806, "sft_loss": 1.574268102645874, "step": 390 }, { "epoch": 0.6773920406435224, "grad_norm": 0.8252859711647034, "learning_rate": 4.396186492377812e-06, "logits/chosen": -14.237678527832031, "logits/rejected": -14.311739921569824, "logps/chosen": -1.508466124534607, "logps/rejected": -1.5852457284927368, "loss": 1.5797, "odds_ratio_loss": 0.7126177549362183, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.15084661543369293, "rewards/margins": 0.0076779513619840145, "rewards/rejected": -0.15852457284927368, "sft_loss": 1.508466124534607, "step": 400 }, { "epoch": 0.6943268416596104, "grad_norm": 1.2841962575912476, "learning_rate": 4.366969817697578e-06, "logits/chosen": -14.2535400390625, "logits/rejected": -14.371434211730957, "logps/chosen": -1.5005015134811401, "logps/rejected": -1.5292456150054932, "loss": 1.5766, "odds_ratio_loss": 0.7610150575637817, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.15005014836788177, "rewards/margins": 0.002874411642551422, "rewards/rejected": -0.1529245674610138, "sft_loss": 1.5005015134811401, "step": 410 }, { "epoch": 0.7112616426756986, "grad_norm": 1.2207895517349243, "learning_rate": 4.337164938736086e-06, "logits/chosen": -14.3642578125, "logits/rejected": -14.369051933288574, "logps/chosen": -1.5299899578094482, "logps/rejected": -1.4911963939666748, "loss": 1.6108, "odds_ratio_loss": 0.8085638284683228, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.15299901366233826, "rewards/margins": -0.0038793571293354034, "rewards/rejected": -0.14911964535713196, "sft_loss": 1.5299899578094482, "step": 420 }, { "epoch": 0.7281964436917866, "grad_norm": 0.8184213042259216, "learning_rate": 4.306781245766945e-06, "logits/chosen": -14.233909606933594, "logits/rejected": -14.245084762573242, "logps/chosen": -1.3620591163635254, "logps/rejected": -1.4749568700790405, "loss": 1.4336, "odds_ratio_loss": 0.7158304452896118, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13620591163635254, "rewards/margins": 0.011289774440228939, "rewards/rejected": -0.14749568700790405, "sft_loss": 1.3620591163635254, "step": 430 }, { "epoch": 0.7451312447078747, "grad_norm": 2.0060269832611084, "learning_rate": 4.275828311423903e-06, "logits/chosen": -14.381686210632324, "logits/rejected": -14.249435424804688, "logps/chosen": -1.6260135173797607, "logps/rejected": -1.5776515007019043, "loss": 1.709, "odds_ratio_loss": 0.829800009727478, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.16260136663913727, "rewards/margins": -0.004836211446672678, "rewards/rejected": -0.15776515007019043, "sft_loss": 1.6260135173797607, "step": 440 }, { "epoch": 0.7620660457239627, "grad_norm": 4.041975498199463, "learning_rate": 4.244315887684912e-06, "logits/chosen": -14.30778980255127, "logits/rejected": -14.218801498413086, "logps/chosen": -1.483784794807434, "logps/rejected": -1.5452721118927002, "loss": 1.5595, "odds_ratio_loss": 0.7568337917327881, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14837847650051117, "rewards/margins": 0.006148716900497675, "rewards/rejected": -0.1545272022485733, "sft_loss": 1.483784794807434, "step": 450 }, { "epoch": 0.7790008467400508, "grad_norm": 0.7099826335906982, "learning_rate": 4.212253902799685e-06, "logits/chosen": -14.486287117004395, "logits/rejected": -14.316320419311523, "logps/chosen": -1.4297285079956055, "logps/rejected": -1.5128008127212524, "loss": 1.5023, "odds_ratio_loss": 0.7252711057662964, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14297285676002502, "rewards/margins": 0.008307242766022682, "rewards/rejected": -0.15128009021282196, "sft_loss": 1.4297285079956055, "step": 460 }, { "epoch": 0.7959356477561389, "grad_norm": 1.2492146492004395, "learning_rate": 4.179652458161718e-06, "logits/chosen": -14.241589546203613, "logits/rejected": -14.272315979003906, "logps/chosen": -1.4517958164215088, "logps/rejected": -1.4656177759170532, "loss": 1.5259, "odds_ratio_loss": 0.7411133050918579, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1451795995235443, "rewards/margins": 0.0013821950415149331, "rewards/rejected": -0.14656177163124084, "sft_loss": 1.4517958164215088, "step": 470 }, { "epoch": 0.8128704487722269, "grad_norm": 0.9384155869483948, "learning_rate": 4.146521825125765e-06, "logits/chosen": -14.420669555664062, "logits/rejected": -14.434637069702148, "logps/chosen": -1.4806429147720337, "logps/rejected": -1.5676599740982056, "loss": 1.5509, "odds_ratio_loss": 0.7023881673812866, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14806430041790009, "rewards/margins": 0.008701696991920471, "rewards/rejected": -0.15676598250865936, "sft_loss": 1.4806429147720337, "step": 480 }, { "epoch": 0.8298052497883149, "grad_norm": 1.070791244506836, "learning_rate": 4.11287244177176e-06, "logits/chosen": -14.464094161987305, "logits/rejected": -14.335436820983887, "logps/chosen": -1.455758810043335, "logps/rejected": -1.5940083265304565, "loss": 1.5245, "odds_ratio_loss": 0.6876194477081299, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1455758810043335, "rewards/margins": 0.01382494904100895, "rewards/rejected": -0.1594008356332779, "sft_loss": 1.455758810043335, "step": 490 }, { "epoch": 0.8467400508044031, "grad_norm": 2.7851524353027344, "learning_rate": 4.078714909616215e-06, "logits/chosen": -14.458696365356445, "logits/rejected": -14.464262008666992, "logps/chosen": -1.531051754951477, "logps/rejected": -1.6913106441497803, "loss": 1.5988, "odds_ratio_loss": 0.6771985292434692, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.15310516953468323, "rewards/margins": 0.01602589525282383, "rewards/rejected": -0.1691310703754425, "sft_loss": 1.531051754951477, "step": 500 }, { "epoch": 0.8467400508044031, "eval_logits/chosen": -14.385932922363281, "eval_logits/rejected": -14.353007316589355, "eval_logps/chosen": -1.491932988166809, "eval_logps/rejected": -1.5724329948425293, "eval_loss": 1.5655477046966553, "eval_odds_ratio_loss": 0.736146092414856, "eval_rewards/accuracies": 0.49619048833847046, "eval_rewards/chosen": -0.14919330179691315, "eval_rewards/margins": 0.008049987256526947, "eval_rewards/rejected": -0.1572432965040207, "eval_runtime": 207.7292, "eval_samples_per_second": 5.055, "eval_sft_loss": 1.491932988166809, "eval_steps_per_second": 2.527, "step": 500 }, { "epoch": 0.8636748518204911, "grad_norm": 2.8025050163269043, "learning_rate": 4.044059990272125e-06, "logits/chosen": -14.447216987609863, "logits/rejected": -14.498886108398438, "logps/chosen": -1.528641700744629, "logps/rejected": -1.6202799081802368, "loss": 1.6018, "odds_ratio_loss": 0.732014536857605, "rewards/accuracies": 0.5625, "rewards/chosen": -0.15286414325237274, "rewards/margins": 0.009163827635347843, "rewards/rejected": -0.1620279997587204, "sft_loss": 1.528641700744629, "step": 510 }, { "epoch": 0.8806096528365792, "grad_norm": 1.3604254722595215, "learning_rate": 4.0089186020584345e-06, "logits/chosen": -14.258474349975586, "logits/rejected": -14.413030624389648, "logps/chosen": -1.5629048347473145, "logps/rejected": -1.5826667547225952, "loss": 1.6364, "odds_ratio_loss": 0.7350566387176514, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.15629048645496368, "rewards/margins": 0.001976185943931341, "rewards/rejected": -0.15826669335365295, "sft_loss": 1.5629048347473145, "step": 520 }, { "epoch": 0.8975444538526672, "grad_norm": 2.011760711669922, "learning_rate": 3.973301816560124e-06, "logits/chosen": -14.397709846496582, "logits/rejected": -14.129496574401855, "logps/chosen": -1.4165706634521484, "logps/rejected": -1.5228968858718872, "loss": 1.4866, "odds_ratio_loss": 0.6998282670974731, "rewards/accuracies": 0.53125, "rewards/chosen": -0.14165706932544708, "rewards/margins": 0.010632617399096489, "rewards/rejected": -0.15228970348834991, "sft_loss": 1.4165706634521484, "step": 530 }, { "epoch": 0.9144792548687553, "grad_norm": 1.5524851083755493, "learning_rate": 3.937220855140021e-06, "logits/chosen": -14.287254333496094, "logits/rejected": -14.5077543258667, "logps/chosen": -1.445703148841858, "logps/rejected": -1.4684772491455078, "loss": 1.5204, "odds_ratio_loss": 0.7468188405036926, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.14457032084465027, "rewards/margins": 0.0022774008102715015, "rewards/rejected": -0.14684772491455078, "sft_loss": 1.445703148841858, "step": 540 }, { "epoch": 0.9314140558848434, "grad_norm": 1.5534979104995728, "learning_rate": 3.900687085403418e-06, "logits/chosen": -14.357900619506836, "logits/rejected": -14.454984664916992, "logps/chosen": -1.386063575744629, "logps/rejected": -1.3658872842788696, "loss": 1.4644, "odds_ratio_loss": 0.7831361293792725, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13860636949539185, "rewards/margins": -0.0020176374819129705, "rewards/rejected": -0.13658872246742249, "sft_loss": 1.386063575744629, "step": 550 }, { "epoch": 0.9483488569009314, "grad_norm": 1.1890796422958374, "learning_rate": 3.863712017616614e-06, "logits/chosen": -14.284517288208008, "logits/rejected": -14.413591384887695, "logps/chosen": -1.4638760089874268, "logps/rejected": -1.5988643169403076, "loss": 1.5324, "odds_ratio_loss": 0.6851348876953125, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14638759195804596, "rewards/margins": 0.013498829677700996, "rewards/rejected": -0.1598864495754242, "sft_loss": 1.4638760089874268, "step": 560 }, { "epoch": 0.9652836579170194, "grad_norm": 6.166572570800781, "learning_rate": 3.826307301080504e-06, "logits/chosen": -14.168184280395508, "logits/rejected": -14.155644416809082, "logps/chosen": -1.4714304208755493, "logps/rejected": -1.577530860900879, "loss": 1.5501, "odds_ratio_loss": 0.7865978479385376, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14714303612709045, "rewards/margins": 0.010610053315758705, "rewards/rejected": -0.1577531099319458, "sft_loss": 1.4714304208755493, "step": 570 }, { "epoch": 0.9822184589331076, "grad_norm": 1.6688357591629028, "learning_rate": 3.7884847204603775e-06, "logits/chosen": -14.45263385772705, "logits/rejected": -14.489707946777344, "logps/chosen": -1.519616961479187, "logps/rejected": -1.4644415378570557, "loss": 1.5989, "odds_ratio_loss": 0.7931729555130005, "rewards/accuracies": 0.39375001192092896, "rewards/chosen": -0.15196169912815094, "rewards/margins": -0.0055175526067614555, "rewards/rejected": -0.1464441567659378, "sft_loss": 1.519616961479187, "step": 580 }, { "epoch": 0.9991532599491956, "grad_norm": 1.3263885974884033, "learning_rate": 3.750256192073058e-06, "logits/chosen": -14.519624710083008, "logits/rejected": -14.511543273925781, "logps/chosen": -1.6179249286651611, "logps/rejected": -1.6542644500732422, "loss": 1.6929, "odds_ratio_loss": 0.7493273019790649, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.16179248690605164, "rewards/margins": 0.0036339648067951202, "rewards/rejected": -0.16542646288871765, "sft_loss": 1.6179249286651611, "step": 590 }, { "epoch": 1.0160880609652836, "grad_norm": 2.145953893661499, "learning_rate": 3.7116337601325715e-06, "logits/chosen": -14.438863754272461, "logits/rejected": -14.496429443359375, "logps/chosen": -1.4121149778366089, "logps/rejected": -1.4823601245880127, "loss": 1.4826, "odds_ratio_loss": 0.7051838636398315, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.14121152460575104, "rewards/margins": 0.007024504244327545, "rewards/rejected": -0.1482360064983368, "sft_loss": 1.4121149778366089, "step": 600 }, { "epoch": 1.0330228619813717, "grad_norm": 1.4814651012420654, "learning_rate": 3.6726295929555154e-06, "logits/chosen": -14.25225830078125, "logits/rejected": -14.299070358276367, "logps/chosen": -1.333702802658081, "logps/rejected": -1.4111506938934326, "loss": 1.4074, "odds_ratio_loss": 0.7373310327529907, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1333702653646469, "rewards/margins": 0.007744790520519018, "rewards/rejected": -0.14111506938934326, "sft_loss": 1.333702802658081, "step": 610 }, { "epoch": 1.0499576629974599, "grad_norm": 1.6669461727142334, "learning_rate": 3.6332559791273307e-06, "logits/chosen": -14.348184585571289, "logits/rejected": -14.468172073364258, "logps/chosen": -1.3673087358474731, "logps/rejected": -1.4689829349517822, "loss": 1.4376, "odds_ratio_loss": 0.703393280506134, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1367308795452118, "rewards/margins": 0.01016741432249546, "rewards/rejected": -0.1468982994556427, "sft_loss": 1.3673087358474731, "step": 620 }, { "epoch": 1.0668924640135478, "grad_norm": 1.9912712574005127, "learning_rate": 3.593525323630681e-06, "logits/chosen": -14.204243659973145, "logits/rejected": -14.313570976257324, "logps/chosen": -1.4642140865325928, "logps/rejected": -1.5515140295028687, "loss": 1.5346, "odds_ratio_loss": 0.7034581899642944, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1464214026927948, "rewards/margins": 0.008729999884963036, "rewards/rejected": -0.15515141189098358, "sft_loss": 1.4642140865325928, "step": 630 }, { "epoch": 1.083827265029636, "grad_norm": 1.084834098815918, "learning_rate": 3.5534501439371615e-06, "logits/chosen": -14.336616516113281, "logits/rejected": -14.360015869140625, "logps/chosen": -1.431004285812378, "logps/rejected": -1.5110365152359009, "loss": 1.5053, "odds_ratio_loss": 0.7428441047668457, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14310042560100555, "rewards/margins": 0.008003225550055504, "rewards/rejected": -0.1511036604642868, "sft_loss": 1.431004285812378, "step": 640 }, { "epoch": 1.100762066045724, "grad_norm": 7.101503372192383, "learning_rate": 3.5130430660635633e-06, "logits/chosen": -14.246923446655273, "logits/rejected": -14.310781478881836, "logps/chosen": -1.4178617000579834, "logps/rejected": -1.4921773672103882, "loss": 1.4902, "odds_ratio_loss": 0.7228954434394836, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14178617298603058, "rewards/margins": 0.0074315681122243404, "rewards/rejected": -0.14921775460243225, "sft_loss": 1.4178617000579834, "step": 650 }, { "epoch": 1.117696867061812, "grad_norm": 0.7868030071258545, "learning_rate": 3.4723168205939444e-06, "logits/chosen": -14.346036911010742, "logits/rejected": -14.401220321655273, "logps/chosen": -1.4435014724731445, "logps/rejected": -1.4272395372390747, "loss": 1.5198, "odds_ratio_loss": 0.7628483772277832, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14435014128684998, "rewards/margins": -0.0016262030694633722, "rewards/rejected": -0.142723947763443, "sft_loss": 1.4435014724731445, "step": 660 }, { "epoch": 1.1346316680779, "grad_norm": 0.8476426601409912, "learning_rate": 3.431284238668754e-06, "logits/chosen": -14.173054695129395, "logits/rejected": -14.25976276397705, "logps/chosen": -1.5427913665771484, "logps/rejected": -1.51954984664917, "loss": 1.6222, "odds_ratio_loss": 0.7936299443244934, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.15427914261817932, "rewards/margins": -0.002324149012565613, "rewards/rejected": -0.15195497870445251, "sft_loss": 1.5427913665771484, "step": 670 }, { "epoch": 1.1515664690939882, "grad_norm": 3.089587688446045, "learning_rate": 3.389958247942274e-06, "logits/chosen": -14.338518142700195, "logits/rejected": -14.398809432983398, "logps/chosen": -1.508374810218811, "logps/rejected": -1.6098997592926025, "loss": 1.5859, "odds_ratio_loss": 0.7754709720611572, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1508374661207199, "rewards/margins": 0.010152501054108143, "rewards/rejected": -0.16098996996879578, "sft_loss": 1.508374810218811, "step": 680 }, { "epoch": 1.168501270110076, "grad_norm": 1.2698506116867065, "learning_rate": 3.3483518685096588e-06, "logits/chosen": -14.310267448425293, "logits/rejected": -14.27270221710205, "logps/chosen": -1.4493352174758911, "logps/rejected": -1.5172946453094482, "loss": 1.5243, "odds_ratio_loss": 0.7492562532424927, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1449335366487503, "rewards/margins": 0.006795944180339575, "rewards/rejected": -0.15172946453094482, "sft_loss": 1.4493352174758911, "step": 690 }, { "epoch": 1.1854360711261642, "grad_norm": 1.6422189474105835, "learning_rate": 3.306478208804839e-06, "logits/chosen": -14.337800979614258, "logits/rejected": -14.443319320678711, "logps/chosen": -1.3992269039154053, "logps/rejected": -1.4721871614456177, "loss": 1.4736, "odds_ratio_loss": 0.7440443634986877, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13992270827293396, "rewards/margins": 0.007296019699424505, "rewards/rejected": -0.1472187042236328, "sft_loss": 1.3992269039154053, "step": 700 }, { "epoch": 1.2023708721422524, "grad_norm": 1.635892391204834, "learning_rate": 3.264350461470608e-06, "logits/chosen": -14.11363410949707, "logits/rejected": -14.23077392578125, "logps/chosen": -1.4146158695220947, "logps/rejected": -1.5160566568374634, "loss": 1.4872, "odds_ratio_loss": 0.726182222366333, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1414615958929062, "rewards/margins": 0.010144074447453022, "rewards/rejected": -0.15160568058490753, "sft_loss": 1.4146158695220947, "step": 710 }, { "epoch": 1.2193056731583405, "grad_norm": 2.807609796524048, "learning_rate": 3.2219818992021685e-06, "logits/chosen": -14.307601928710938, "logits/rejected": -14.457585334777832, "logps/chosen": -1.3360792398452759, "logps/rejected": -1.5054932832717896, "loss": 1.4058, "odds_ratio_loss": 0.6972737312316895, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13360795378684998, "rewards/margins": 0.016941383481025696, "rewards/rejected": -0.15054932236671448, "sft_loss": 1.3360792398452759, "step": 720 }, { "epoch": 1.2362404741744284, "grad_norm": 4.885401248931885, "learning_rate": 3.1793858705654595e-06, "logits/chosen": -14.334493637084961, "logits/rejected": -14.283819198608398, "logps/chosen": -1.435250997543335, "logps/rejected": -1.4584420919418335, "loss": 1.5098, "odds_ratio_loss": 0.7454192638397217, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14352509379386902, "rewards/margins": 0.0023191256914287806, "rewards/rejected": -0.1458442211151123, "sft_loss": 1.435250997543335, "step": 730 }, { "epoch": 1.2531752751905165, "grad_norm": 2.119098424911499, "learning_rate": 3.1365757957915787e-06, "logits/chosen": -14.451696395874023, "logits/rejected": -14.478349685668945, "logps/chosen": -1.4766839742660522, "logps/rejected": -1.5273820161819458, "loss": 1.5488, "odds_ratio_loss": 0.7213960289955139, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14766840636730194, "rewards/margins": 0.005069802515208721, "rewards/rejected": -0.15273821353912354, "sft_loss": 1.4766839742660522, "step": 740 }, { "epoch": 1.2701100762066047, "grad_norm": 2.60243821144104, "learning_rate": 3.093565162548633e-06, "logits/chosen": -14.26720905303955, "logits/rejected": -14.301678657531738, "logps/chosen": -1.4956939220428467, "logps/rejected": -1.5772297382354736, "loss": 1.5741, "odds_ratio_loss": 0.7844332456588745, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14956940710544586, "rewards/margins": 0.008153588511049747, "rewards/rejected": -0.15772297978401184, "sft_loss": 1.4956939220428467, "step": 750 }, { "epoch": 1.2870448772226926, "grad_norm": 1.4909660816192627, "learning_rate": 3.0503675216923294e-06, "logits/chosen": -14.459734916687012, "logits/rejected": -14.364084243774414, "logps/chosen": -1.3072437047958374, "logps/rejected": -1.4731833934783936, "loss": 1.3741, "odds_ratio_loss": 0.6682445406913757, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13072435557842255, "rewards/margins": 0.01659397967159748, "rewards/rejected": -0.14731833338737488, "sft_loss": 1.3072437047958374, "step": 760 }, { "epoch": 1.3039796782387807, "grad_norm": 1.1245403289794922, "learning_rate": 3.0069964829966748e-06, "logits/chosen": -14.397039413452148, "logits/rejected": -14.449551582336426, "logps/chosen": -1.3757708072662354, "logps/rejected": -1.4135478734970093, "loss": 1.4523, "odds_ratio_loss": 0.7652989029884338, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.137577086687088, "rewards/margins": 0.003777713282033801, "rewards/rejected": -0.1413547843694687, "sft_loss": 1.3757708072662354, "step": 770 }, { "epoch": 1.3209144792548688, "grad_norm": 1.2307573556900024, "learning_rate": 2.963465710866094e-06, "logits/chosen": -14.386013984680176, "logits/rejected": -14.34870719909668, "logps/chosen": -1.4350049495697021, "logps/rejected": -1.5495213270187378, "loss": 1.5065, "odds_ratio_loss": 0.7147475481033325, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14350050687789917, "rewards/margins": 0.01145164854824543, "rewards/rejected": -0.15495215356349945, "sft_loss": 1.4350049495697021, "step": 780 }, { "epoch": 1.337849280270957, "grad_norm": 2.506805181503296, "learning_rate": 2.919788920030357e-06, "logits/chosen": -14.521794319152832, "logits/rejected": -14.562520027160645, "logps/chosen": -1.5004112720489502, "logps/rejected": -1.5385072231292725, "loss": 1.5749, "odds_ratio_loss": 0.7447755336761475, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1500411331653595, "rewards/margins": 0.003809594316408038, "rewards/rejected": -0.1538507342338562, "sft_loss": 1.5004112720489502, "step": 790 }, { "epoch": 1.3547840812870449, "grad_norm": 2.221041440963745, "learning_rate": 2.8759798712236303e-06, "logits/chosen": -14.375375747680664, "logits/rejected": -14.200535774230957, "logps/chosen": -1.3673021793365479, "logps/rejected": -1.4980638027191162, "loss": 1.4391, "odds_ratio_loss": 0.7180894017219543, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13673020899295807, "rewards/margins": 0.013076169416308403, "rewards/rejected": -0.14980638027191162, "sft_loss": 1.3673021793365479, "step": 800 }, { "epoch": 1.371718882303133, "grad_norm": 1.1964547634124756, "learning_rate": 2.8320523668490507e-06, "logits/chosen": -14.326695442199707, "logits/rejected": -14.330057144165039, "logps/chosen": -1.4386107921600342, "logps/rejected": -1.4542288780212402, "loss": 1.516, "odds_ratio_loss": 0.7743045091629028, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1438610851764679, "rewards/margins": 0.00156181410420686, "rewards/rejected": -0.14542289078235626, "sft_loss": 1.4386107921600342, "step": 810 }, { "epoch": 1.388653683319221, "grad_norm": 1.63833749294281, "learning_rate": 2.7880202466301597e-06, "logits/chosen": -14.170251846313477, "logits/rejected": -14.376757621765137, "logps/chosen": -1.4189726114273071, "logps/rejected": -1.4344730377197266, "loss": 1.4949, "odds_ratio_loss": 0.7592172026634216, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.1418972760438919, "rewards/margins": 0.00155004789121449, "rewards/rejected": -0.14344730973243713, "sft_loss": 1.4189726114273071, "step": 820 }, { "epoch": 1.405588484335309, "grad_norm": 1.4605140686035156, "learning_rate": 2.7438973832505854e-06, "logits/chosen": -14.213847160339355, "logits/rejected": -14.075439453125, "logps/chosen": -1.394853115081787, "logps/rejected": -1.4763586521148682, "loss": 1.4703, "odds_ratio_loss": 0.7543301582336426, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13948531448841095, "rewards/margins": 0.008150560781359673, "rewards/rejected": -0.14763586223125458, "sft_loss": 1.394853115081787, "step": 830 }, { "epoch": 1.4225232853513972, "grad_norm": 6.998382091522217, "learning_rate": 2.699697677983341e-06, "logits/chosen": -14.502642631530762, "logits/rejected": -14.471555709838867, "logps/chosen": -1.3794063329696655, "logps/rejected": -1.3286025524139404, "loss": 1.4577, "odds_ratio_loss": 0.7826226353645325, "rewards/accuracies": 0.40625, "rewards/chosen": -0.1379406601190567, "rewards/margins": -0.005080387927591801, "rewards/rejected": -0.13286025822162628, "sft_loss": 1.3794063329696655, "step": 840 }, { "epoch": 1.4394580863674853, "grad_norm": 6.508487224578857, "learning_rate": 2.6554350563111115e-06, "logits/chosen": -14.415182113647461, "logits/rejected": -14.4021577835083, "logps/chosen": -1.4343197345733643, "logps/rejected": -1.389868140220642, "loss": 1.5147, "odds_ratio_loss": 0.803573489189148, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -0.14343199133872986, "rewards/margins": -0.00444516446441412, "rewards/rejected": -0.13898679614067078, "sft_loss": 1.4343197345733643, "step": 850 }, { "epoch": 1.4563928873835732, "grad_norm": 3.286094903945923, "learning_rate": 2.611123463538913e-06, "logits/chosen": -14.409162521362305, "logits/rejected": -14.423065185546875, "logps/chosen": -1.3563302755355835, "logps/rejected": -1.470460295677185, "loss": 1.4284, "odds_ratio_loss": 0.7211607694625854, "rewards/accuracies": 0.46875, "rewards/chosen": -0.13563302159309387, "rewards/margins": 0.011413001455366611, "rewards/rejected": -0.1470460146665573, "sft_loss": 1.3563302755355835, "step": 860 }, { "epoch": 1.4733276883996613, "grad_norm": 1.353800654411316, "learning_rate": 2.566776860400514e-06, "logits/chosen": -14.359599113464355, "logits/rejected": -14.388442993164062, "logps/chosen": -1.4657598733901978, "logps/rejected": -1.5304598808288574, "loss": 1.5387, "odds_ratio_loss": 0.7289360761642456, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14657600224018097, "rewards/margins": 0.006469997111707926, "rewards/rejected": -0.15304598212242126, "sft_loss": 1.4657598733901978, "step": 870 }, { "epoch": 1.4902624894157492, "grad_norm": 0.8999080657958984, "learning_rate": 2.522409218659989e-06, "logits/chosen": -14.522372245788574, "logits/rejected": -14.516871452331543, "logps/chosen": -1.5183885097503662, "logps/rejected": -1.5601129531860352, "loss": 1.5903, "odds_ratio_loss": 0.7187842130661011, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.15183886885643005, "rewards/margins": 0.004172446206212044, "rewards/rejected": -0.15601131319999695, "sft_loss": 1.5183885097503662, "step": 880 }, { "epoch": 1.5071972904318374, "grad_norm": 1.7247016429901123, "learning_rate": 2.4780345167097976e-06, "logits/chosen": -14.4078369140625, "logits/rejected": -14.206354141235352, "logps/chosen": -1.422533392906189, "logps/rejected": -1.617108941078186, "loss": 1.4925, "odds_ratio_loss": 0.6991701126098633, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1422533541917801, "rewards/margins": 0.019457560032606125, "rewards/rejected": -0.16171090304851532, "sft_loss": 1.422533392906189, "step": 890 }, { "epoch": 1.5241320914479255, "grad_norm": 1.1559327840805054, "learning_rate": 2.4336667351667747e-06, "logits/chosen": -14.479301452636719, "logits/rejected": -14.487524032592773, "logps/chosen": -1.5677707195281982, "logps/rejected": -1.654937982559204, "loss": 1.6407, "odds_ratio_loss": 0.7297645807266235, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1567770540714264, "rewards/margins": 0.00871671736240387, "rewards/rejected": -0.16549380123615265, "sft_loss": 1.5677707195281982, "step": 900 }, { "epoch": 1.5410668924640136, "grad_norm": 2.899705171585083, "learning_rate": 2.3893198524674264e-06, "logits/chosen": -14.416735649108887, "logits/rejected": -14.323824882507324, "logps/chosen": -1.3870880603790283, "logps/rejected": -1.490903615951538, "loss": 1.4593, "odds_ratio_loss": 0.7216765284538269, "rewards/accuracies": 0.5, "rewards/chosen": -0.13870880007743835, "rewards/margins": 0.010381558910012245, "rewards/rejected": -0.14909036457538605, "sft_loss": 1.3870880603790283, "step": 910 }, { "epoch": 1.5580016934801018, "grad_norm": 1.2076252698898315, "learning_rate": 2.345007840463904e-06, "logits/chosen": -14.292505264282227, "logits/rejected": -14.244054794311523, "logps/chosen": -1.4259792566299438, "logps/rejected": -1.4341694116592407, "loss": 1.5022, "odds_ratio_loss": 0.7626054883003235, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14259792864322662, "rewards/margins": 0.0008190165390260518, "rewards/rejected": -0.14341694116592407, "sft_loss": 1.4259792566299438, "step": 920 }, { "epoch": 1.5749364944961897, "grad_norm": 2.6530520915985107, "learning_rate": 2.3007446600220572e-06, "logits/chosen": -14.440101623535156, "logits/rejected": -14.175987243652344, "logps/chosen": -1.361826777458191, "logps/rejected": -1.4479329586029053, "loss": 1.4351, "odds_ratio_loss": 0.7332156300544739, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13618269562721252, "rewards/margins": 0.008610614575445652, "rewards/rejected": -0.144793301820755, "sft_loss": 1.361826777458191, "step": 930 }, { "epoch": 1.5918712955122776, "grad_norm": 3.269102096557617, "learning_rate": 2.2565442566229507e-06, "logits/chosen": -14.330474853515625, "logits/rejected": -14.3932466506958, "logps/chosen": -1.4583683013916016, "logps/rejected": -1.4522769451141357, "loss": 1.5392, "odds_ratio_loss": 0.8081096410751343, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -0.14583681523799896, "rewards/margins": -0.0006091395625844598, "rewards/rejected": -0.14522768557071686, "sft_loss": 1.4583683013916016, "step": 940 }, { "epoch": 1.6088060965283657, "grad_norm": 1.2394914627075195, "learning_rate": 2.2124205559692195e-06, "logits/chosen": -14.25177001953125, "logits/rejected": -14.32116985321045, "logps/chosen": -1.4207613468170166, "logps/rejected": -1.5083825588226318, "loss": 1.4919, "odds_ratio_loss": 0.7114149928092957, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14207611978054047, "rewards/margins": 0.008762138895690441, "rewards/rejected": -0.15083825588226318, "sft_loss": 1.4207613468170166, "step": 950 }, { "epoch": 1.6257408975444538, "grad_norm": 1.6583099365234375, "learning_rate": 2.168387459597666e-06, "logits/chosen": -14.210861206054688, "logits/rejected": -14.444610595703125, "logps/chosen": -1.5090281963348389, "logps/rejected": -1.5863807201385498, "loss": 1.5813, "odds_ratio_loss": 0.7230504155158997, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.15090280771255493, "rewards/margins": 0.007735258433967829, "rewards/rejected": -0.15863807499408722, "sft_loss": 1.5090281963348389, "step": 960 }, { "epoch": 1.642675698560542, "grad_norm": 1.3439754247665405, "learning_rate": 2.1244588404994648e-06, "logits/chosen": -14.237951278686523, "logits/rejected": -14.269018173217773, "logps/chosen": -1.376792073249817, "logps/rejected": -1.4212851524353027, "loss": 1.453, "odds_ratio_loss": 0.7622562646865845, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13767921924591064, "rewards/margins": 0.0044493041932582855, "rewards/rejected": -0.14212851226329803, "sft_loss": 1.376792073249817, "step": 970 }, { "epoch": 1.65961049957663, "grad_norm": 2.962531328201294, "learning_rate": 2.08064853874936e-06, "logits/chosen": -14.473817825317383, "logits/rejected": -14.631460189819336, "logps/chosen": -1.4066752195358276, "logps/rejected": -1.455766201019287, "loss": 1.4788, "odds_ratio_loss": 0.7210047245025635, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.14066752791404724, "rewards/margins": 0.004909100476652384, "rewards/rejected": -0.1455766260623932, "sft_loss": 1.4066752195358276, "step": 980 }, { "epoch": 1.676545300592718, "grad_norm": 3.2846462726593018, "learning_rate": 2.0369703571452387e-06, "logits/chosen": -14.20033073425293, "logits/rejected": -14.109931945800781, "logps/chosen": -1.309378743171692, "logps/rejected": -1.4727327823638916, "loss": 1.3763, "odds_ratio_loss": 0.6690842509269714, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13093788921833038, "rewards/margins": 0.01633540540933609, "rewards/rejected": -0.14727327227592468, "sft_loss": 1.309378743171692, "step": 990 }, { "epoch": 1.6934801016088061, "grad_norm": 1.1083016395568848, "learning_rate": 1.993438056859441e-06, "logits/chosen": -14.497441291809082, "logits/rejected": -14.366804122924805, "logps/chosen": -1.353459119796753, "logps/rejected": -1.469897985458374, "loss": 1.4213, "odds_ratio_loss": 0.6788592338562012, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13534590601921082, "rewards/margins": 0.011643897742033005, "rewards/rejected": -0.14698980748653412, "sft_loss": 1.353459119796753, "step": 1000 }, { "epoch": 1.6934801016088061, "eval_logits/chosen": -14.432435989379883, "eval_logits/rejected": -14.399744987487793, "eval_logps/chosen": -1.4366357326507568, "eval_logps/rejected": -1.5239636898040771, "eval_loss": 1.509663701057434, "eval_odds_ratio_loss": 0.7302786707878113, "eval_rewards/accuracies": 0.5038095116615295, "eval_rewards/chosen": -0.14366357028484344, "eval_rewards/margins": 0.00873279757797718, "eval_rewards/rejected": -0.15239638090133667, "eval_runtime": 445.7589, "eval_samples_per_second": 2.356, "eval_sft_loss": 1.4366357326507568, "eval_steps_per_second": 1.178, "step": 1000 }, { "epoch": 1.710414902624894, "grad_norm": 1.8078409433364868, "learning_rate": 1.9500653531031917e-06, "logits/chosen": -14.443731307983398, "logits/rejected": -14.476076126098633, "logps/chosen": -1.361530065536499, "logps/rejected": -1.5223243236541748, "loss": 1.4302, "odds_ratio_loss": 0.6869481205940247, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.136152982711792, "rewards/margins": 0.016079427674412727, "rewards/rejected": -0.15223243832588196, "sft_loss": 1.361530065536499, "step": 1010 }, { "epoch": 1.7273497036409822, "grad_norm": 1.288388729095459, "learning_rate": 1.9068659108055117e-06, "logits/chosen": -14.475682258605957, "logits/rejected": -14.473660469055176, "logps/chosen": -1.4284050464630127, "logps/rejected": -1.4647681713104248, "loss": 1.5008, "odds_ratio_loss": 0.7240586280822754, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14284051954746246, "rewards/margins": 0.003636319888755679, "rewards/rejected": -0.1464768350124359, "sft_loss": 1.4284050464630127, "step": 1020 }, { "epoch": 1.7442845046570703, "grad_norm": 1.2943964004516602, "learning_rate": 1.863853340307962e-06, "logits/chosen": -14.312501907348633, "logits/rejected": -14.362284660339355, "logps/chosen": -1.2968519926071167, "logps/rejected": -1.579993486404419, "loss": 1.3634, "odds_ratio_loss": 0.6657830476760864, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1296851933002472, "rewards/margins": 0.0283141378313303, "rewards/rejected": -0.15799932181835175, "sft_loss": 1.2968519926071167, "step": 1030 }, { "epoch": 1.7612193056731584, "grad_norm": 1.1572942733764648, "learning_rate": 1.8210411930766019e-06, "logits/chosen": -14.294156074523926, "logits/rejected": -14.323614120483398, "logps/chosen": -1.479034662246704, "logps/rejected": -1.6268787384033203, "loss": 1.547, "odds_ratio_loss": 0.6801426410675049, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1479034721851349, "rewards/margins": 0.014784415252506733, "rewards/rejected": -0.16268786787986755, "sft_loss": 1.479034662246704, "step": 1040 }, { "epoch": 1.7781541066892466, "grad_norm": 1.574400782585144, "learning_rate": 1.7784429574324803e-06, "logits/chosen": -14.368769645690918, "logits/rejected": -14.502416610717773, "logps/chosen": -1.3905737400054932, "logps/rejected": -1.5777366161346436, "loss": 1.4567, "odds_ratio_loss": 0.6612822413444519, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13905738294124603, "rewards/margins": 0.018716301769018173, "rewards/rejected": -0.1577736884355545, "sft_loss": 1.3905737400054932, "step": 1050 }, { "epoch": 1.7950889077053345, "grad_norm": 1.195115089416504, "learning_rate": 1.7360720543020327e-06, "logits/chosen": -14.439001083374023, "logits/rejected": -14.227216720581055, "logps/chosen": -1.3061621189117432, "logps/rejected": -1.3979461193084717, "loss": 1.3747, "odds_ratio_loss": 0.6853240728378296, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1306162178516388, "rewards/margins": 0.009178402833640575, "rewards/rejected": -0.13979461789131165, "sft_loss": 1.3061621189117432, "step": 1060 }, { "epoch": 1.8120237087214224, "grad_norm": 3.909592390060425, "learning_rate": 1.6939418329887042e-06, "logits/chosen": -14.45744514465332, "logits/rejected": -14.5038423538208, "logps/chosen": -1.4311087131500244, "logps/rejected": -1.4849843978881836, "loss": 1.5049, "odds_ratio_loss": 0.7381945848464966, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14311087131500244, "rewards/margins": 0.005387583281844854, "rewards/rejected": -0.14849844574928284, "sft_loss": 1.4311087131500244, "step": 1070 }, { "epoch": 1.8289585097375105, "grad_norm": 1.7437409162521362, "learning_rate": 1.6520655669671467e-06, "logits/chosen": -14.631024360656738, "logits/rejected": -14.515978813171387, "logps/chosen": -1.4438676834106445, "logps/rejected": -1.4797852039337158, "loss": 1.519, "odds_ratio_loss": 0.7515386343002319, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14438676834106445, "rewards/margins": 0.0035917561035603285, "rewards/rejected": -0.1479785144329071, "sft_loss": 1.4438676834106445, "step": 1080 }, { "epoch": 1.8458933107535986, "grad_norm": 3.1396241188049316, "learning_rate": 1.610456449701294e-06, "logits/chosen": -14.319239616394043, "logits/rejected": -14.346944808959961, "logps/chosen": -1.4771324396133423, "logps/rejected": -1.537941336631775, "loss": 1.5548, "odds_ratio_loss": 0.776719331741333, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14771324396133423, "rewards/margins": 0.0060809021815657616, "rewards/rejected": -0.15379413962364197, "sft_loss": 1.4771324396133423, "step": 1090 }, { "epoch": 1.8628281117696868, "grad_norm": 1.4689712524414062, "learning_rate": 1.5691275904876545e-06, "logits/chosen": -14.461804389953613, "logits/rejected": -14.278103828430176, "logps/chosen": -1.407566785812378, "logps/rejected": -1.5848530530929565, "loss": 1.474, "odds_ratio_loss": 0.6638895869255066, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14075669646263123, "rewards/margins": 0.017728609964251518, "rewards/rejected": -0.1584853082895279, "sft_loss": 1.407566785812378, "step": 1100 }, { "epoch": 1.879762912785775, "grad_norm": 0.9537128210067749, "learning_rate": 1.5280920103251235e-06, "logits/chosen": -14.299761772155762, "logits/rejected": -14.347249984741211, "logps/chosen": -1.3132389783859253, "logps/rejected": -1.451719880104065, "loss": 1.3829, "odds_ratio_loss": 0.6965407133102417, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.131323903799057, "rewards/margins": 0.013848078437149525, "rewards/rejected": -0.14517197012901306, "sft_loss": 1.3132389783859253, "step": 1110 }, { "epoch": 1.8966977138018628, "grad_norm": 1.3030270338058472, "learning_rate": 1.4873626378126015e-06, "logits/chosen": -14.38860034942627, "logits/rejected": -14.277740478515625, "logps/chosen": -1.3292900323867798, "logps/rejected": -1.5040452480316162, "loss": 1.3984, "odds_ratio_loss": 0.6911579966545105, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1329289972782135, "rewards/margins": 0.017475521191954613, "rewards/rejected": -0.15040451288223267, "sft_loss": 1.3292900323867798, "step": 1120 }, { "epoch": 1.913632514817951, "grad_norm": 2.765397071838379, "learning_rate": 1.446952305075738e-06, "logits/chosen": -14.399679183959961, "logits/rejected": -14.427862167358398, "logps/chosen": -1.3543717861175537, "logps/rejected": -1.3891161680221558, "loss": 1.4306, "odds_ratio_loss": 0.7619328498840332, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13543717563152313, "rewards/margins": 0.003474441124126315, "rewards/rejected": -0.13891161978244781, "sft_loss": 1.3543717861175537, "step": 1130 }, { "epoch": 1.9305673158340388, "grad_norm": 1.730094075202942, "learning_rate": 1.406873743724065e-06, "logits/chosen": -14.437395095825195, "logits/rejected": -14.322535514831543, "logps/chosen": -1.4621553421020508, "logps/rejected": -1.6176691055297852, "loss": 1.5314, "odds_ratio_loss": 0.692920982837677, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1462155282497406, "rewards/margins": 0.015551361255347729, "rewards/rejected": -0.1617669016122818, "sft_loss": 1.4621553421020508, "step": 1140 }, { "epoch": 1.947502116850127, "grad_norm": 1.5328463315963745, "learning_rate": 1.3671395808397898e-06, "logits/chosen": -14.267127990722656, "logits/rejected": -14.463046073913574, "logps/chosen": -1.335663080215454, "logps/rejected": -1.3676198720932007, "loss": 1.4094, "odds_ratio_loss": 0.7378238439559937, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13356631994247437, "rewards/margins": 0.0031956590246409178, "rewards/rejected": -0.13676197826862335, "sft_loss": 1.335663080215454, "step": 1150 }, { "epoch": 1.964436917866215, "grad_norm": 3.9082131385803223, "learning_rate": 1.3277623349995418e-06, "logits/chosen": -14.250445365905762, "logits/rejected": -14.258328437805176, "logps/chosen": -1.386776089668274, "logps/rejected": -1.3914397954940796, "loss": 1.4653, "odds_ratio_loss": 0.7851333618164062, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13867759704589844, "rewards/margins": 0.00046637197374366224, "rewards/rejected": -0.13914397358894348, "sft_loss": 1.386776089668274, "step": 1160 }, { "epoch": 1.9813717188823032, "grad_norm": 3.576561450958252, "learning_rate": 1.2887544123302781e-06, "logits/chosen": -14.434526443481445, "logits/rejected": -14.393232345581055, "logps/chosen": -1.4019829034805298, "logps/rejected": -1.4435473680496216, "loss": 1.4772, "odds_ratio_loss": 0.752662181854248, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.14019827544689178, "rewards/margins": 0.004156465642154217, "rewards/rejected": -0.14435474574565887, "sft_loss": 1.4019829034805298, "step": 1170 }, { "epoch": 1.9983065198983911, "grad_norm": 1.4880342483520508, "learning_rate": 1.2501281026006393e-06, "logits/chosen": -14.47376823425293, "logits/rejected": -14.513628005981445, "logps/chosen": -1.420966386795044, "logps/rejected": -1.4258407354354858, "loss": 1.5002, "odds_ratio_loss": 0.7924087643623352, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.1420966386795044, "rewards/margins": 0.000487445795442909, "rewards/rejected": -0.14258407056331635, "sft_loss": 1.420966386795044, "step": 1180 }, { "epoch": 2.015241320914479, "grad_norm": 1.0734080076217651, "learning_rate": 1.2118955753489523e-06, "logits/chosen": -14.561826705932617, "logits/rejected": -14.332305908203125, "logps/chosen": -1.3783150911331177, "logps/rejected": -1.4396107196807861, "loss": 1.4511, "odds_ratio_loss": 0.7278788685798645, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13783150911331177, "rewards/margins": 0.006129562854766846, "rewards/rejected": -0.1439610719680786, "sft_loss": 1.3783150911331177, "step": 1190 }, { "epoch": 2.032176121930567, "grad_norm": 1.3539475202560425, "learning_rate": 1.1740688760491189e-06, "logits/chosen": -14.37562370300293, "logits/rejected": -14.43455696105957, "logps/chosen": -1.3733515739440918, "logps/rejected": -1.4605834484100342, "loss": 1.4435, "odds_ratio_loss": 0.7019113302230835, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1373351514339447, "rewards/margins": 0.00872319657355547, "rewards/rejected": -0.1460583508014679, "sft_loss": 1.3733515739440918, "step": 1200 }, { "epoch": 2.0491109229466553, "grad_norm": 1.5765854120254517, "learning_rate": 1.1366599223155847e-06, "logits/chosen": -14.275134086608887, "logits/rejected": -14.2963228225708, "logps/chosen": -1.371392011642456, "logps/rejected": -1.4632259607315063, "loss": 1.4461, "odds_ratio_loss": 0.7467167377471924, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1371392160654068, "rewards/margins": 0.009183400310575962, "rewards/rejected": -0.1463226079940796, "sft_loss": 1.371392011642456, "step": 1210 }, { "epoch": 2.0660457239627434, "grad_norm": 1.6226162910461426, "learning_rate": 1.0996805001486067e-06, "logits/chosen": -14.387079238891602, "logits/rejected": -14.525866508483887, "logps/chosen": -1.3380024433135986, "logps/rejected": -1.4540449380874634, "loss": 1.4055, "odds_ratio_loss": 0.6752744913101196, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.13380023837089539, "rewards/margins": 0.011604254133999348, "rewards/rejected": -0.14540448784828186, "sft_loss": 1.3380024433135986, "step": 1220 }, { "epoch": 2.0829805249788316, "grad_norm": 2.682673454284668, "learning_rate": 1.0631422602209608e-06, "logits/chosen": -14.46452808380127, "logits/rejected": -14.45245361328125, "logps/chosen": -1.52396559715271, "logps/rejected": -1.5300567150115967, "loss": 1.6016, "odds_ratio_loss": 0.7762898802757263, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.15239658951759338, "rewards/margins": 0.0006091115646995604, "rewards/rejected": -0.1530056893825531, "sft_loss": 1.52396559715271, "step": 1230 }, { "epoch": 2.0999153259949197, "grad_norm": 0.9156871438026428, "learning_rate": 1.027056714207319e-06, "logits/chosen": -14.493863105773926, "logits/rejected": -14.539648056030273, "logps/chosen": -1.4326021671295166, "logps/rejected": -1.5681862831115723, "loss": 1.5034, "odds_ratio_loss": 0.7082626223564148, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14326021075248718, "rewards/margins": 0.0135584007948637, "rewards/rejected": -0.15681862831115723, "sft_loss": 1.4326021671295166, "step": 1240 }, { "epoch": 2.116850127011008, "grad_norm": 3.18613600730896, "learning_rate": 9.914352311573838e-07, "logits/chosen": -14.396720886230469, "logits/rejected": -14.398330688476562, "logps/chosen": -1.3194880485534668, "logps/rejected": -1.4313329458236694, "loss": 1.3887, "odds_ratio_loss": 0.6922628283500671, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1319488137960434, "rewards/margins": 0.011184502393007278, "rewards/rejected": -0.14313331246376038, "sft_loss": 1.3194880485534668, "step": 1250 }, { "epoch": 2.1337849280270955, "grad_norm": 1.0878351926803589, "learning_rate": 9.562890339139877e-07, "logits/chosen": -14.146682739257812, "logits/rejected": -14.353192329406738, "logps/chosen": -1.3349636793136597, "logps/rejected": -1.379267930984497, "loss": 1.4097, "odds_ratio_loss": 0.7469658255577087, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13349637389183044, "rewards/margins": 0.004430420231074095, "rewards/rejected": -0.13792680203914642, "sft_loss": 1.3349636793136597, "step": 1260 }, { "epoch": 2.1507197290431836, "grad_norm": 1.177203893661499, "learning_rate": 9.216291955772374e-07, "logits/chosen": -14.328463554382324, "logits/rejected": -14.295025825500488, "logps/chosen": -1.3897377252578735, "logps/rejected": -1.4198486804962158, "loss": 1.4659, "odds_ratio_loss": 0.7619088292121887, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13897378742694855, "rewards/margins": 0.0030110946390777826, "rewards/rejected": -0.14198487997055054, "sft_loss": 1.3897377252578735, "step": 1270 }, { "epoch": 2.167654530059272, "grad_norm": 2.2964181900024414, "learning_rate": 8.874666360158457e-07, "logits/chosen": -14.346217155456543, "logits/rejected": -14.197412490844727, "logps/chosen": -1.3614085912704468, "logps/rejected": -1.4674574136734009, "loss": 1.4341, "odds_ratio_loss": 0.7273774147033691, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1361408680677414, "rewards/margins": 0.010604878887534142, "rewards/rejected": -0.1467457413673401, "sft_loss": 1.3614085912704468, "step": 1280 }, { "epoch": 2.18458933107536, "grad_norm": 3.246114492416382, "learning_rate": 8.538121184267315e-07, "logits/chosen": -14.440536499023438, "logits/rejected": -14.329854965209961, "logps/chosen": -1.2875430583953857, "logps/rejected": -1.3775211572647095, "loss": 1.3574, "odds_ratio_loss": 0.6986570954322815, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.12875431776046753, "rewards/margins": 0.008997795172035694, "rewards/rejected": -0.13775211572647095, "sft_loss": 1.2875430583953857, "step": 1290 }, { "epoch": 2.201524132091448, "grad_norm": 1.6076223850250244, "learning_rate": 8.206762459439907e-07, "logits/chosen": -14.393684387207031, "logits/rejected": -14.419075012207031, "logps/chosen": -1.4106100797653198, "logps/rejected": -1.4857350587844849, "loss": 1.4865, "odds_ratio_loss": 0.758701741695404, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.14106100797653198, "rewards/margins": 0.0075125014409422874, "rewards/rejected": -0.14857350289821625, "sft_loss": 1.4106100797653198, "step": 1300 }, { "epoch": 2.218458933107536, "grad_norm": 1.4635405540466309, "learning_rate": 7.880694582982898e-07, "logits/chosen": -14.465181350708008, "logits/rejected": -14.500001907348633, "logps/chosen": -1.4319560527801514, "logps/rejected": -1.5127556324005127, "loss": 1.506, "odds_ratio_loss": 0.7399921417236328, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14319561421871185, "rewards/margins": 0.00807994045317173, "rewards/rejected": -0.15127556025981903, "sft_loss": 1.4319560527801514, "step": 1310 }, { "epoch": 2.235393734123624, "grad_norm": 3.1588046550750732, "learning_rate": 7.560020285277401e-07, "logits/chosen": -14.269197463989258, "logits/rejected": -14.49077320098877, "logps/chosen": -1.3981552124023438, "logps/rejected": -1.4313172101974487, "loss": 1.4741, "odds_ratio_loss": 0.7590950727462769, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.1398155391216278, "rewards/margins": 0.0033162026666104794, "rewards/rejected": -0.14313173294067383, "sft_loss": 1.3981552124023438, "step": 1320 }, { "epoch": 2.252328535139712, "grad_norm": 3.586276054382324, "learning_rate": 7.244840597412956e-07, "logits/chosen": -14.22734546661377, "logits/rejected": -14.291172981262207, "logps/chosen": -1.514716625213623, "logps/rejected": -1.4183883666992188, "loss": 1.5981, "odds_ratio_loss": 0.8342422246932983, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.15147167444229126, "rewards/margins": -0.009632834233343601, "rewards/rejected": -0.14183883368968964, "sft_loss": 1.514716625213623, "step": 1330 }, { "epoch": 2.2692633361558, "grad_norm": 2.3110530376434326, "learning_rate": 6.935254819356796e-07, "logits/chosen": -14.419351577758789, "logits/rejected": -14.297566413879395, "logps/chosen": -1.4030816555023193, "logps/rejected": -1.4476964473724365, "loss": 1.4773, "odds_ratio_loss": 0.7421059012413025, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1403081715106964, "rewards/margins": 0.0044614695943892, "rewards/rejected": -0.14476963877677917, "sft_loss": 1.4030816555023193, "step": 1340 }, { "epoch": 2.2861981371718882, "grad_norm": 1.1914503574371338, "learning_rate": 6.631360488668662e-07, "logits/chosen": -14.460253715515137, "logits/rejected": -14.41465950012207, "logps/chosen": -1.2984880208969116, "logps/rejected": -1.4945783615112305, "loss": 1.3662, "odds_ratio_loss": 0.6775275468826294, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12984880805015564, "rewards/margins": 0.019609034061431885, "rewards/rejected": -0.14945784211158752, "sft_loss": 1.2984880208969116, "step": 1350 }, { "epoch": 2.3031329381879764, "grad_norm": 2.2295608520507812, "learning_rate": 6.333253349770672e-07, "logits/chosen": -14.249277114868164, "logits/rejected": -14.275445938110352, "logps/chosen": -1.4399076700210571, "logps/rejected": -1.4462318420410156, "loss": 1.5184, "odds_ratio_loss": 0.7848686575889587, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14399076998233795, "rewards/margins": 0.0006324196001514792, "rewards/rejected": -0.14462319016456604, "sft_loss": 1.4399076700210571, "step": 1360 }, { "epoch": 2.3200677392040645, "grad_norm": 0.9200133681297302, "learning_rate": 6.041027323782364e-07, "logits/chosen": -14.550092697143555, "logits/rejected": -14.5205717086792, "logps/chosen": -1.3879852294921875, "logps/rejected": -1.5073843002319336, "loss": 1.4568, "odds_ratio_loss": 0.6877447366714478, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1387985199689865, "rewards/margins": 0.011939908377826214, "rewards/rejected": -0.1507384330034256, "sft_loss": 1.3879852294921875, "step": 1370 }, { "epoch": 2.337002540220152, "grad_norm": 1.757595181465149, "learning_rate": 5.754774478929969e-07, "logits/chosen": -14.518872261047363, "logits/rejected": -14.515436172485352, "logps/chosen": -1.4030746221542358, "logps/rejected": -1.525309443473816, "loss": 1.4726, "odds_ratio_loss": 0.6956244707107544, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1403074562549591, "rewards/margins": 0.012223480269312859, "rewards/rejected": -0.1525309532880783, "sft_loss": 1.4030746221542358, "step": 1380 }, { "epoch": 2.3539373412362403, "grad_norm": 1.9958380460739136, "learning_rate": 5.474585001539634e-07, "logits/chosen": -14.516281127929688, "logits/rejected": -14.449725151062012, "logps/chosen": -1.3020037412643433, "logps/rejected": -1.4323627948760986, "loss": 1.3692, "odds_ratio_loss": 0.671482503414154, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13020040094852448, "rewards/margins": 0.013035891577601433, "rewards/rejected": -0.14323627948760986, "sft_loss": 1.3020037412643433, "step": 1390 }, { "epoch": 2.3708721422523285, "grad_norm": 0.9711344242095947, "learning_rate": 5.200547167623424e-07, "logits/chosen": -14.532658576965332, "logits/rejected": -14.446354866027832, "logps/chosen": -1.4261430501937866, "logps/rejected": -1.6040065288543701, "loss": 1.4923, "odds_ratio_loss": 0.6615304946899414, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14261427521705627, "rewards/margins": 0.01778637059032917, "rewards/rejected": -0.1604006588459015, "sft_loss": 1.4261430501937866, "step": 1400 }, { "epoch": 2.3878069432684166, "grad_norm": 0.9917483925819397, "learning_rate": 4.932747315067271e-07, "logits/chosen": -14.57470417022705, "logits/rejected": -14.438740730285645, "logps/chosen": -1.4024930000305176, "logps/rejected": -1.469939947128296, "loss": 1.4755, "odds_ratio_loss": 0.7300769090652466, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14024929702281952, "rewards/margins": 0.006744695361703634, "rewards/rejected": -0.1469939947128296, "sft_loss": 1.4024930000305176, "step": 1410 }, { "epoch": 2.4047417442845047, "grad_norm": 2.87284779548645, "learning_rate": 4.6712698164294553e-07, "logits/chosen": -14.489944458007812, "logits/rejected": -14.394497871398926, "logps/chosen": -1.4407953023910522, "logps/rejected": -1.4559253454208374, "loss": 1.5183, "odds_ratio_loss": 0.7750439047813416, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1440795361995697, "rewards/margins": 0.0015129944076761603, "rewards/rejected": -0.14559254050254822, "sft_loss": 1.4407953023910522, "step": 1420 }, { "epoch": 2.421676545300593, "grad_norm": 3.170734167098999, "learning_rate": 4.41619705235842e-07, "logits/chosen": -14.575798034667969, "logits/rejected": -14.610578536987305, "logps/chosen": -1.375421404838562, "logps/rejected": -1.5859653949737549, "loss": 1.4439, "odds_ratio_loss": 0.6848722696304321, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13754215836524963, "rewards/margins": 0.02105441316962242, "rewards/rejected": -0.15859656035900116, "sft_loss": 1.375421404838562, "step": 1430 }, { "epoch": 2.438611346316681, "grad_norm": 0.8895889520645142, "learning_rate": 4.167609385637961e-07, "logits/chosen": -14.474627494812012, "logits/rejected": -14.27497386932373, "logps/chosen": -1.3773252964019775, "logps/rejected": -1.4834753274917603, "loss": 1.4478, "odds_ratio_loss": 0.7047079205513, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13773252069950104, "rewards/margins": 0.010615019127726555, "rewards/rejected": -0.14834752678871155, "sft_loss": 1.3773252964019775, "step": 1440 }, { "epoch": 2.4555461473327687, "grad_norm": 1.5126135349273682, "learning_rate": 3.9255851358683567e-07, "logits/chosen": -14.236564636230469, "logits/rejected": -14.380549430847168, "logps/chosen": -1.3431507349014282, "logps/rejected": -1.4221420288085938, "loss": 1.4184, "odds_ratio_loss": 0.7521894574165344, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.13431507349014282, "rewards/margins": 0.007899129763245583, "rewards/rejected": -0.14221420884132385, "sft_loss": 1.3431507349014282, "step": 1450 }, { "epoch": 2.472480948348857, "grad_norm": 2.2620511054992676, "learning_rate": 3.690200554791082e-07, "logits/chosen": -14.424779891967773, "logits/rejected": -14.354517936706543, "logps/chosen": -1.3713457584381104, "logps/rejected": -1.483659267425537, "loss": 1.44, "odds_ratio_loss": 0.6865109205245972, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1371345818042755, "rewards/margins": 0.01123136654496193, "rewards/rejected": -0.14836594462394714, "sft_loss": 1.3713457584381104, "step": 1460 }, { "epoch": 2.489415749364945, "grad_norm": 2.7279679775238037, "learning_rate": 3.461529802265079e-07, "logits/chosen": -14.534950256347656, "logits/rejected": -14.408660888671875, "logps/chosen": -1.3657411336898804, "logps/rejected": -1.4428269863128662, "loss": 1.4382, "odds_ratio_loss": 0.724717915058136, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13657411932945251, "rewards/margins": 0.007708588149398565, "rewards/rejected": -0.14428271353244781, "sft_loss": 1.3657411336898804, "step": 1470 }, { "epoch": 2.506350550381033, "grad_norm": 1.4955379962921143, "learning_rate": 3.2396449229020883e-07, "logits/chosen": -14.613665580749512, "logits/rejected": -14.357098579406738, "logps/chosen": -1.430061936378479, "logps/rejected": -1.4436513185501099, "loss": 1.5066, "odds_ratio_loss": 0.7651657462120056, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14300617575645447, "rewards/margins": 0.001358934328891337, "rewards/rejected": -0.1443651169538498, "sft_loss": 1.430061936378479, "step": 1480 }, { "epoch": 2.523285351397121, "grad_norm": 2.4484000205993652, "learning_rate": 3.024615823368371e-07, "logits/chosen": -14.309808731079102, "logits/rejected": -14.362199783325195, "logps/chosen": -1.368744134902954, "logps/rejected": -1.4789055585861206, "loss": 1.4407, "odds_ratio_loss": 0.7196033596992493, "rewards/accuracies": 0.5, "rewards/chosen": -0.13687442243099213, "rewards/margins": 0.011016142554581165, "rewards/rejected": -0.14789055287837982, "sft_loss": 1.368744134902954, "step": 1490 }, { "epoch": 2.5402201524132093, "grad_norm": 1.3006510734558105, "learning_rate": 2.8165102503600716e-07, "logits/chosen": -14.335368156433105, "logits/rejected": -14.394729614257812, "logps/chosen": -1.3518388271331787, "logps/rejected": -1.5090402364730835, "loss": 1.4234, "odds_ratio_loss": 0.7160680890083313, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1351839005947113, "rewards/margins": 0.015720132738351822, "rewards/rejected": -0.15090402960777283, "sft_loss": 1.3518388271331787, "step": 1500 }, { "epoch": 2.5402201524132093, "eval_logits/chosen": -14.433335304260254, "eval_logits/rejected": -14.40054702758789, "eval_logps/chosen": -1.4238022565841675, "eval_logps/rejected": -1.5122665166854858, "eval_loss": 1.4967381954193115, "eval_odds_ratio_loss": 0.7293583154678345, "eval_rewards/accuracies": 0.5038095116615295, "eval_rewards/chosen": -0.1423802226781845, "eval_rewards/margins": 0.008846436627209187, "eval_rewards/rejected": -0.15122665464878082, "eval_runtime": 445.9302, "eval_samples_per_second": 2.355, "eval_sft_loss": 1.4238022565841675, "eval_steps_per_second": 1.177, "step": 1500 }, { "epoch": 2.557154953429297, "grad_norm": 1.7379106283187866, "learning_rate": 2.615393769259039e-07, "logits/chosen": -14.186014175415039, "logits/rejected": -14.296531677246094, "logps/chosen": -1.5615041255950928, "logps/rejected": -1.4595506191253662, "loss": 1.6479, "odds_ratio_loss": 0.8642258644104004, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.1561504304409027, "rewards/margins": -0.01019534282386303, "rewards/rejected": -0.14595508575439453, "sft_loss": 1.5615041255950928, "step": 1510 }, { "epoch": 2.574089754445385, "grad_norm": 1.4174609184265137, "learning_rate": 2.421329743475917e-07, "logits/chosen": -14.357484817504883, "logits/rejected": -14.365758895874023, "logps/chosen": -1.3432402610778809, "logps/rejected": -1.4273216724395752, "loss": 1.4165, "odds_ratio_loss": 0.7326976656913757, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13432399928569794, "rewards/margins": 0.008408156223595142, "rewards/rejected": -0.142732173204422, "sft_loss": 1.3432402610778809, "step": 1520 }, { "epoch": 2.5910245554614733, "grad_norm": 2.1974966526031494, "learning_rate": 2.234379314486973e-07, "logits/chosen": -14.357455253601074, "logits/rejected": -14.430908203125, "logps/chosen": -1.3924636840820312, "logps/rejected": -1.438753366470337, "loss": 1.4659, "odds_ratio_loss": 0.734772801399231, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.1392463743686676, "rewards/margins": 0.004628963768482208, "rewards/rejected": -0.1438753306865692, "sft_loss": 1.3924636840820312, "step": 1530 }, { "epoch": 2.6079593564775614, "grad_norm": 1.687161922454834, "learning_rate": 2.0546013825709783e-07, "logits/chosen": -14.250285148620605, "logits/rejected": -14.199666976928711, "logps/chosen": -1.3859349489212036, "logps/rejected": -1.6351137161254883, "loss": 1.4537, "odds_ratio_loss": 0.6773584485054016, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13859349489212036, "rewards/margins": 0.024917880073189735, "rewards/rejected": -0.16351138055324554, "sft_loss": 1.3859349489212036, "step": 1540 }, { "epoch": 2.6248941574936495, "grad_norm": 1.5129095315933228, "learning_rate": 1.88205258825217e-07, "logits/chosen": -14.429784774780273, "logits/rejected": -14.17693042755127, "logps/chosen": -1.2826873064041138, "logps/rejected": -1.4500634670257568, "loss": 1.35, "odds_ratio_loss": 0.6734637022018433, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12826873362064362, "rewards/margins": 0.016737615689635277, "rewards/rejected": -0.14500637352466583, "sft_loss": 1.2826873064041138, "step": 1550 }, { "epoch": 2.6418289585097376, "grad_norm": 2.0735878944396973, "learning_rate": 1.7167872944552245e-07, "logits/chosen": -14.309649467468262, "logits/rejected": -14.5745849609375, "logps/chosen": -1.3819622993469238, "logps/rejected": -1.4382798671722412, "loss": 1.4545, "odds_ratio_loss": 0.7257741689682007, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1381962150335312, "rewards/margins": 0.005631768610328436, "rewards/rejected": -0.14382800459861755, "sft_loss": 1.3819622993469238, "step": 1560 }, { "epoch": 2.6587637595258258, "grad_norm": 1.331615924835205, "learning_rate": 1.5588575693777142e-07, "logits/chosen": -14.269506454467773, "logits/rejected": -14.277575492858887, "logps/chosen": -1.3485890626907349, "logps/rejected": -1.391801118850708, "loss": 1.42, "odds_ratio_loss": 0.714430034160614, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1348589062690735, "rewards/margins": 0.004321185871958733, "rewards/rejected": -0.13918009400367737, "sft_loss": 1.3485890626907349, "step": 1570 }, { "epoch": 2.675698560541914, "grad_norm": 1.4459912776947021, "learning_rate": 1.4083131700856428e-07, "logits/chosen": -14.257006645202637, "logits/rejected": -14.398195266723633, "logps/chosen": -1.4757592678070068, "logps/rejected": -1.4755744934082031, "loss": 1.553, "odds_ratio_loss": 0.7721089124679565, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.14757592976093292, "rewards/margins": -1.848684587457683e-05, "rewards/rejected": -0.14755743741989136, "sft_loss": 1.4757592678070068, "step": 1580 }, { "epoch": 2.6926333615580016, "grad_norm": 1.7114406824111938, "learning_rate": 1.2652015268370315e-07, "logits/chosen": -14.462023735046387, "logits/rejected": -14.4578218460083, "logps/chosen": -1.3610906600952148, "logps/rejected": -1.4776142835617065, "loss": 1.4335, "odds_ratio_loss": 0.7242997884750366, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13610906898975372, "rewards/margins": 0.011652367189526558, "rewards/rejected": -0.14776143431663513, "sft_loss": 1.3610906600952148, "step": 1590 }, { "epoch": 2.7095681625740897, "grad_norm": 1.469370722770691, "learning_rate": 1.1295677281386502e-07, "logits/chosen": -14.559967041015625, "logits/rejected": -14.478399276733398, "logps/chosen": -1.4620916843414307, "logps/rejected": -1.5956697463989258, "loss": 1.5327, "odds_ratio_loss": 0.706096351146698, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.14620915055274963, "rewards/margins": 0.013357831165194511, "rewards/rejected": -0.1595669686794281, "sft_loss": 1.4620916843414307, "step": 1600 }, { "epoch": 2.726502963590178, "grad_norm": 3.563047409057617, "learning_rate": 1.0014545065404973e-07, "logits/chosen": -14.436056137084961, "logits/rejected": -14.507670402526855, "logps/chosen": -1.4244582653045654, "logps/rejected": -1.5525462627410889, "loss": 1.4981, "odds_ratio_loss": 0.7365024089813232, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.14244583249092102, "rewards/margins": 0.012808804400265217, "rewards/rejected": -0.15525463223457336, "sft_loss": 1.4244582653045654, "step": 1610 }, { "epoch": 2.743437764606266, "grad_norm": 1.1012893915176392, "learning_rate": 8.809022251725502e-08, "logits/chosen": -14.58587646484375, "logits/rejected": -14.311334609985352, "logps/chosen": -1.3465197086334229, "logps/rejected": -1.5383667945861816, "loss": 1.4143, "odds_ratio_loss": 0.6780072450637817, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13465197384357452, "rewards/margins": 0.01918472908437252, "rewards/rejected": -0.1538366973400116, "sft_loss": 1.3465197086334229, "step": 1620 }, { "epoch": 2.7603725656223537, "grad_norm": 1.1277046203613281, "learning_rate": 7.679488650280509e-08, "logits/chosen": -14.479377746582031, "logits/rejected": -14.5874605178833, "logps/chosen": -1.3598499298095703, "logps/rejected": -1.5038646459579468, "loss": 1.4265, "odds_ratio_loss": 0.6669132113456726, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13598500192165375, "rewards/margins": 0.014401474967598915, "rewards/rejected": -0.1503864824771881, "sft_loss": 1.3598499298095703, "step": 1630 }, { "epoch": 2.777307366638442, "grad_norm": 4.1279425621032715, "learning_rate": 6.626300129972563e-08, "logits/chosen": -14.374710083007812, "logits/rejected": -14.649663925170898, "logps/chosen": -1.337192177772522, "logps/rejected": -1.399910569190979, "loss": 1.4103, "odds_ratio_loss": 0.7308821678161621, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.13371922075748444, "rewards/margins": 0.0062718503177165985, "rewards/rejected": -0.13999105989933014, "sft_loss": 1.337192177772522, "step": 1640 }, { "epoch": 2.79424216765453, "grad_norm": 1.395706057548523, "learning_rate": 5.649788506555065e-08, "logits/chosen": -14.170741081237793, "logits/rejected": -14.524632453918457, "logps/chosen": -1.359508752822876, "logps/rejected": -1.4829118251800537, "loss": 1.4274, "odds_ratio_loss": 0.6786811351776123, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13595086336135864, "rewards/margins": 0.012340312823653221, "rewards/rejected": -0.1482912003993988, "sft_loss": 1.359508752822876, "step": 1650 }, { "epoch": 2.811176968670618, "grad_norm": 1.766761302947998, "learning_rate": 4.7502614380908474e-08, "logits/chosen": -14.416241645812988, "logits/rejected": -14.220751762390137, "logps/chosen": -1.3510209321975708, "logps/rejected": -1.4324430227279663, "loss": 1.4234, "odds_ratio_loss": 0.7241480946540833, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13510209321975708, "rewards/margins": 0.008142213337123394, "rewards/rejected": -0.14324429631233215, "sft_loss": 1.3510209321975708, "step": 1660 }, { "epoch": 2.828111769686706, "grad_norm": 1.6919310092926025, "learning_rate": 3.9280023280222066e-08, "logits/chosen": -14.29878044128418, "logits/rejected": -14.355636596679688, "logps/chosen": -1.3545790910720825, "logps/rejected": -1.4631725549697876, "loss": 1.4267, "odds_ratio_loss": 0.7212874293327332, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13545790314674377, "rewards/margins": 0.010859351605176926, "rewards/rejected": -0.146317258477211, "sft_loss": 1.3545790910720825, "step": 1670 }, { "epoch": 2.8450465707027943, "grad_norm": 1.2037099599838257, "learning_rate": 3.1832702358818855e-08, "logits/chosen": -14.370442390441895, "logits/rejected": -14.418550491333008, "logps/chosen": -1.509386658668518, "logps/rejected": -1.5371757745742798, "loss": 1.5849, "odds_ratio_loss": 0.7552896738052368, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.15093867480754852, "rewards/margins": 0.0027789073064923286, "rewards/rejected": -0.15371759235858917, "sft_loss": 1.509386658668518, "step": 1680 }, { "epoch": 2.8619813717188824, "grad_norm": 1.7988624572753906, "learning_rate": 2.5162997956746647e-08, "logits/chosen": -14.56567096710205, "logits/rejected": -14.401374816894531, "logps/chosen": -1.374145746231079, "logps/rejected": -1.5657732486724854, "loss": 1.4407, "odds_ratio_loss": 0.6658385992050171, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1374145746231079, "rewards/margins": 0.019162729382514954, "rewards/rejected": -0.15657731890678406, "sft_loss": 1.374145746231079, "step": 1690 }, { "epoch": 2.8789161727349706, "grad_norm": 1.8519299030303955, "learning_rate": 1.9273011419536914e-08, "logits/chosen": -14.358851432800293, "logits/rejected": -14.361642837524414, "logps/chosen": -1.3464272022247314, "logps/rejected": -1.403352975845337, "loss": 1.4203, "odds_ratio_loss": 0.7389532327651978, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13464272022247314, "rewards/margins": 0.005692584905773401, "rewards/rejected": -0.1403352916240692, "sft_loss": 1.3464272022247314, "step": 1700 }, { "epoch": 2.8958509737510583, "grad_norm": 1.5958627462387085, "learning_rate": 1.4164598436159083e-08, "logits/chosen": -14.45777416229248, "logits/rejected": -14.55150318145752, "logps/chosen": -1.3691927194595337, "logps/rejected": -1.3762314319610596, "loss": 1.4457, "odds_ratio_loss": 0.7649668455123901, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.13691926002502441, "rewards/margins": 0.0007038834737613797, "rewards/rejected": -0.1376231610774994, "sft_loss": 1.3691927194595337, "step": 1710 }, { "epoch": 2.9127857747671464, "grad_norm": 1.1447230577468872, "learning_rate": 9.839368454371556e-09, "logits/chosen": -14.424572944641113, "logits/rejected": -14.471136093139648, "logps/chosen": -1.3717620372772217, "logps/rejected": -1.5175390243530273, "loss": 1.4405, "odds_ratio_loss": 0.6872409582138062, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13717620074748993, "rewards/margins": 0.014577709138393402, "rewards/rejected": -0.15175390243530273, "sft_loss": 1.3717620372772217, "step": 1720 }, { "epoch": 2.9297205757832345, "grad_norm": 1.2689136266708374, "learning_rate": 6.298684173650649e-09, "logits/chosen": -14.209467887878418, "logits/rejected": -14.251020431518555, "logps/chosen": -1.3433691263198853, "logps/rejected": -1.4693882465362549, "loss": 1.4164, "odds_ratio_loss": 0.7302767038345337, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.134336918592453, "rewards/margins": 0.012601924128830433, "rewards/rejected": -0.14693884551525116, "sft_loss": 1.3433691263198853, "step": 1730 }, { "epoch": 2.9466553767993227, "grad_norm": 1.0975892543792725, "learning_rate": 3.543661115860686e-09, "logits/chosen": -14.267629623413086, "logits/rejected": -14.19848918914795, "logps/chosen": -1.3776047229766846, "logps/rejected": -1.4311275482177734, "loss": 1.4519, "odds_ratio_loss": 0.7429286241531372, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1377604901790619, "rewards/margins": 0.0053522614762187, "rewards/rejected": -0.14311274886131287, "sft_loss": 1.3776047229766846, "step": 1740 }, { "epoch": 2.963590177815411, "grad_norm": 1.3392242193222046, "learning_rate": 1.575167273800693e-09, "logits/chosen": -14.299784660339355, "logits/rejected": -14.385360717773438, "logps/chosen": -1.3382477760314941, "logps/rejected": -1.3448528051376343, "loss": 1.4132, "odds_ratio_loss": 0.7496879696846008, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.13382478058338165, "rewards/margins": 0.0006605213275179267, "rewards/rejected": -0.13448528945446014, "sft_loss": 1.3382477760314941, "step": 1750 }, { "epoch": 2.9805249788314985, "grad_norm": 1.3686504364013672, "learning_rate": 3.9382283773564676e-10, "logits/chosen": -14.456472396850586, "logits/rejected": -14.480894088745117, "logps/chosen": -1.4318442344665527, "logps/rejected": -1.5163114070892334, "loss": 1.5085, "odds_ratio_loss": 0.7666203379631042, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.14318443834781647, "rewards/margins": 0.0084467101842165, "rewards/rejected": -0.15163113176822662, "sft_loss": 1.4318442344665527, "step": 1760 }, { "epoch": 2.9974597798475866, "grad_norm": 8.027430534362793, "learning_rate": 0.0, "logits/chosen": -14.504228591918945, "logits/rejected": -14.523704528808594, "logps/chosen": -1.456779956817627, "logps/rejected": -1.5364240407943726, "loss": 1.5332, "odds_ratio_loss": 0.7639864683151245, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.14567799866199493, "rewards/margins": 0.007964405231177807, "rewards/rejected": -0.1536424160003662, "sft_loss": 1.456779956817627, "step": 1770 }, { "epoch": 2.9974597798475866, "step": 1770, "total_flos": 1.8624482718096753e+18, "train_loss": 1.5362868001905539, "train_runtime": 27766.9561, "train_samples_per_second": 1.021, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 1770, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.8624482718096753e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }