{ "best_metric": null, "best_model_checkpoint": null, "epoch": 6.112054329371817, "eval_steps": 100, "global_step": 3600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1697792869269949, "grad_norm": 1439668.0, "learning_rate": 9.811356347858896e-07, "logits/chosen": -5.761106014251709, "logits/rejected": -8.760143280029297, "logps/chosen": -5.761106014251709, "logps/rejected": -8.760143280029297, "loss": 0.641, "rewards/accuracies": 0.6431249976158142, "rewards/chosen": -0.3449641466140747, "rewards/margins": 0.22517701983451843, "rewards/rejected": -0.5701411962509155, "step": 100 }, { "epoch": 0.1697792869269949, "eval_logits/chosen": -5.319983005523682, "eval_logits/rejected": -8.8311128616333, "eval_logps/chosen": -5.319983005523682, "eval_logps/rejected": -8.8311128616333, "eval_loss": 0.625819981098175, "eval_rewards/accuracies": 0.6587603092193604, "eval_rewards/chosen": -0.3068196475505829, "eval_rewards/margins": 0.27513614296913147, "eval_rewards/rejected": -0.5819559097290039, "eval_runtime": 209.2855, "eval_samples_per_second": 22.519, "eval_steps_per_second": 0.707, "step": 100 }, { "epoch": 0.3395585738539898, "grad_norm": 720618.0, "learning_rate": 9.62271269571779e-07, "logits/chosen": -6.1459760665893555, "logits/rejected": -11.072888374328613, "logps/chosen": -6.1459760665893555, "logps/rejected": -11.072888374328613, "loss": 0.5998, "rewards/accuracies": 0.6946874856948853, "rewards/chosen": -0.38635188341140747, "rewards/margins": 0.42572271823883057, "rewards/rejected": -0.8120746612548828, "step": 200 }, { "epoch": 0.3395585738539898, "eval_logits/chosen": -6.499293327331543, "eval_logits/rejected": -11.455465316772461, "eval_logps/chosen": -6.499293327331543, "eval_logps/rejected": -11.455465316772461, "eval_loss": 0.6120086908340454, "eval_rewards/accuracies": 0.6800863146781921, "eval_rewards/chosen": -0.4247507154941559, "eval_rewards/margins": 0.4196403920650482, "eval_rewards/rejected": -0.8443910479545593, "eval_runtime": 208.8859, "eval_samples_per_second": 22.563, "eval_steps_per_second": 0.709, "step": 200 }, { "epoch": 0.5093378607809848, "grad_norm": 881687.4375, "learning_rate": 9.434069043576683e-07, "logits/chosen": -6.092313289642334, "logits/rejected": -11.18380355834961, "logps/chosen": -6.092313289642334, "logps/rejected": -11.18380355834961, "loss": 0.5987, "rewards/accuracies": 0.6971874833106995, "rewards/chosen": -0.37863293290138245, "rewards/margins": 0.43722429871559143, "rewards/rejected": -0.8158571720123291, "step": 300 }, { "epoch": 0.5093378607809848, "eval_logits/chosen": -7.781611442565918, "eval_logits/rejected": -13.130339622497559, "eval_logps/chosen": -7.781611442565918, "eval_logps/rejected": -13.130339622497559, "eval_loss": 0.6120378971099854, "eval_rewards/accuracies": 0.6751126646995544, "eval_rewards/chosen": -0.5529825091362, "eval_rewards/margins": 0.4588959217071533, "eval_rewards/rejected": -1.011878490447998, "eval_runtime": 208.9508, "eval_samples_per_second": 22.556, "eval_steps_per_second": 0.708, "step": 300 }, { "epoch": 0.6791171477079796, "grad_norm": 1323350.0, "learning_rate": 9.245425391435577e-07, "logits/chosen": -6.091500282287598, "logits/rejected": -12.107943534851074, "logps/chosen": -6.091500282287598, "logps/rejected": -12.107943534851074, "loss": 0.5766, "rewards/accuracies": 0.7225000262260437, "rewards/chosen": -0.384408175945282, "rewards/margins": 0.5248246788978577, "rewards/rejected": -0.9092329144477844, "step": 400 }, { "epoch": 0.6791171477079796, "eval_logits/chosen": -6.126616954803467, "eval_logits/rejected": -11.5726900100708, "eval_logps/chosen": -6.126616954803467, "eval_logps/rejected": -11.5726900100708, "eval_loss": 0.6009896993637085, "eval_rewards/accuracies": 0.6990897059440613, "eval_rewards/chosen": -0.38748303055763245, "eval_rewards/margins": 0.46863046288490295, "eval_rewards/rejected": -0.8561134934425354, "eval_runtime": 208.8946, "eval_samples_per_second": 22.562, "eval_steps_per_second": 0.708, "step": 400 }, { "epoch": 0.8488964346349746, "grad_norm": 1345789.5, "learning_rate": 9.056781739294472e-07, "logits/chosen": -6.241028308868408, "logits/rejected": -12.876534461975098, "logps/chosen": -6.241028308868408, "logps/rejected": -12.876534461975098, "loss": 0.5682, "rewards/accuracies": 0.7256249785423279, "rewards/chosen": -0.3934747576713562, "rewards/margins": 0.5985441207885742, "rewards/rejected": -0.9920188784599304, "step": 500 }, { "epoch": 0.8488964346349746, "eval_logits/chosen": -5.7960638999938965, "eval_logits/rejected": -11.697932243347168, "eval_logps/chosen": -5.7960638999938965, "eval_logps/rejected": -11.697932243347168, "eval_loss": 0.6155823469161987, "eval_rewards/accuracies": 0.6845439076423645, "eval_rewards/chosen": -0.354427695274353, "eval_rewards/margins": 0.5142099261283875, "eval_rewards/rejected": -0.8686376810073853, "eval_runtime": 209.0577, "eval_samples_per_second": 22.544, "eval_steps_per_second": 0.708, "step": 500 }, { "epoch": 1.0186757215619695, "grad_norm": 1157107.25, "learning_rate": 8.868138087153367e-07, "logits/chosen": -6.348308086395264, "logits/rejected": -14.041864395141602, "logps/chosen": -6.348308086395264, "logps/rejected": -14.041864395141602, "loss": 0.5489, "rewards/accuracies": 0.7470967769622803, "rewards/chosen": -0.41100218892097473, "rewards/margins": 0.6878145337104797, "rewards/rejected": -1.0988166332244873, "step": 600 }, { "epoch": 1.0186757215619695, "eval_logits/chosen": -6.673037052154541, "eval_logits/rejected": -13.226807594299316, "eval_logps/chosen": -6.673037052154541, "eval_logps/rejected": -13.226807594299316, "eval_loss": 0.5992106199264526, "eval_rewards/accuracies": 0.7024680972099304, "eval_rewards/chosen": -0.4421250522136688, "eval_rewards/margins": 0.5794002413749695, "eval_rewards/rejected": -1.021525263786316, "eval_runtime": 208.9678, "eval_samples_per_second": 22.554, "eval_steps_per_second": 0.708, "step": 600 }, { "epoch": 1.1884550084889645, "grad_norm": 4128357.25, "learning_rate": 8.679494435012261e-07, "logits/chosen": -6.268939018249512, "logits/rejected": -16.667015075683594, "logps/chosen": -6.268939018249512, "logps/rejected": -16.667015075683594, "loss": 0.4857, "rewards/accuracies": 0.7893750071525574, "rewards/chosen": -0.39871758222579956, "rewards/margins": 0.9612747430801392, "rewards/rejected": -1.3599923849105835, "step": 700 }, { "epoch": 1.1884550084889645, "eval_logits/chosen": -8.286482810974121, "eval_logits/rejected": -16.864917755126953, "eval_logps/chosen": -8.286482810974121, "eval_logps/rejected": -16.864917755126953, "eval_loss": 0.6289598941802979, "eval_rewards/accuracies": 0.703007698059082, "eval_rewards/chosen": -0.6034695506095886, "eval_rewards/margins": 0.7818668484687805, "eval_rewards/rejected": -1.3853363990783691, "eval_runtime": 208.8912, "eval_samples_per_second": 22.562, "eval_steps_per_second": 0.709, "step": 700 }, { "epoch": 1.3582342954159592, "grad_norm": 2473335.0, "learning_rate": 8.490850782871156e-07, "logits/chosen": -7.054203510284424, "logits/rejected": -20.1887264251709, "logps/chosen": -7.054203510284424, "logps/rejected": -20.1887264251709, "loss": 0.4511, "rewards/accuracies": 0.8128125071525574, "rewards/chosen": -0.47764578461647034, "rewards/margins": 1.2407335042953491, "rewards/rejected": -1.718379259109497, "step": 800 }, { "epoch": 1.3582342954159592, "eval_logits/chosen": -8.789167404174805, "eval_logits/rejected": -17.3386173248291, "eval_logps/chosen": -8.789167404174805, "eval_logps/rejected": -17.3386173248291, "eval_loss": 0.6369263529777527, "eval_rewards/accuracies": 0.6990897059440613, "eval_rewards/chosen": -0.6537380218505859, "eval_rewards/margins": 0.778968334197998, "eval_rewards/rejected": -1.432706356048584, "eval_runtime": 208.899, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 800 }, { "epoch": 1.5280135823429541, "grad_norm": 3142982.75, "learning_rate": 8.30220713073005e-07, "logits/chosen": -7.051361083984375, "logits/rejected": -21.3233585357666, "logps/chosen": -7.051361083984375, "logps/rejected": -21.3233585357666, "loss": 0.4269, "rewards/accuracies": 0.8178125023841858, "rewards/chosen": -0.48277050256729126, "rewards/margins": 1.3506759405136108, "rewards/rejected": -1.8334465026855469, "step": 900 }, { "epoch": 1.5280135823429541, "eval_logits/chosen": -9.925564765930176, "eval_logits/rejected": -19.528337478637695, "eval_logps/chosen": -9.925564765930176, "eval_logps/rejected": -19.528337478637695, "eval_loss": 0.6703615784645081, "eval_rewards/accuracies": 0.7046968340873718, "eval_rewards/chosen": -0.7673779129981995, "eval_rewards/margins": 0.8843004107475281, "eval_rewards/rejected": -1.6516783237457275, "eval_runtime": 208.9359, "eval_samples_per_second": 22.557, "eval_steps_per_second": 0.708, "step": 900 }, { "epoch": 1.697792869269949, "grad_norm": 4501669.0, "learning_rate": 8.113563478588945e-07, "logits/chosen": -6.532191753387451, "logits/rejected": -20.091066360473633, "logps/chosen": -6.532191753387451, "logps/rejected": -20.091066360473633, "loss": 0.4419, "rewards/accuracies": 0.8090624809265137, "rewards/chosen": -0.42273572087287903, "rewards/margins": 1.2897990942001343, "rewards/rejected": -1.7125346660614014, "step": 1000 }, { "epoch": 1.697792869269949, "eval_logits/chosen": -9.914738655090332, "eval_logits/rejected": -19.7497501373291, "eval_logps/chosen": -9.914738655090332, "eval_logps/rejected": -19.7497501373291, "eval_loss": 0.677839457988739, "eval_rewards/accuracies": 0.697494387626648, "eval_rewards/chosen": -0.7662952542304993, "eval_rewards/margins": 0.9075242280960083, "eval_rewards/rejected": -1.6738194227218628, "eval_runtime": 208.901, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 1000 }, { "epoch": 1.8675721561969438, "grad_norm": 3824196.75, "learning_rate": 7.92491982644784e-07, "logits/chosen": -7.085892200469971, "logits/rejected": -22.929262161254883, "logps/chosen": -7.085892200469971, "logps/rejected": -22.929262161254883, "loss": 0.4115, "rewards/accuracies": 0.8321874737739563, "rewards/chosen": -0.475479394197464, "rewards/margins": 1.510648488998413, "rewards/rejected": -1.9861279726028442, "step": 1100 }, { "epoch": 1.8675721561969438, "eval_logits/chosen": -9.49491024017334, "eval_logits/rejected": -19.15648651123047, "eval_logps/chosen": -9.49491024017334, "eval_logps/rejected": -19.15648651123047, "eval_loss": 0.7029697895050049, "eval_rewards/accuracies": 0.6816582679748535, "eval_rewards/chosen": -0.7243123650550842, "eval_rewards/margins": 0.8901805281639099, "eval_rewards/rejected": -1.6144930124282837, "eval_runtime": 208.9527, "eval_samples_per_second": 22.555, "eval_steps_per_second": 0.708, "step": 1100 }, { "epoch": 2.037351443123939, "grad_norm": 3300928.75, "learning_rate": 7.736276174306734e-07, "logits/chosen": -6.721697807312012, "logits/rejected": -25.793066024780273, "logps/chosen": -6.721697807312012, "logps/rejected": -25.793066024780273, "loss": 0.3554, "rewards/accuracies": 0.8589919209480286, "rewards/chosen": -0.44800105690956116, "rewards/margins": 1.8316749334335327, "rewards/rejected": -2.2796759605407715, "step": 1200 }, { "epoch": 2.037351443123939, "eval_logits/chosen": -13.165043830871582, "eval_logits/rejected": -25.12335777282715, "eval_logps/chosen": -13.165043830871582, "eval_logps/rejected": -25.12335777282715, "eval_loss": 0.783903956413269, "eval_rewards/accuracies": 0.6831362843513489, "eval_rewards/chosen": -1.0913257598876953, "eval_rewards/margins": 1.1198549270629883, "eval_rewards/rejected": -2.2111806869506836, "eval_runtime": 209.1437, "eval_samples_per_second": 22.535, "eval_steps_per_second": 0.708, "step": 1200 }, { "epoch": 2.2071307300509337, "grad_norm": 1768611.75, "learning_rate": 7.547632522165629e-07, "logits/chosen": -7.6104841232299805, "logits/rejected": -33.324466705322266, "logps/chosen": -7.6104841232299805, "logps/rejected": -33.324466705322266, "loss": 0.2664, "rewards/accuracies": 0.9059374928474426, "rewards/chosen": -0.5307304263114929, "rewards/margins": 2.5041840076446533, "rewards/rejected": -3.034914493560791, "step": 1300 }, { "epoch": 2.2071307300509337, "eval_logits/chosen": -13.735506057739258, "eval_logits/rejected": -26.433338165283203, "eval_logps/chosen": -13.735506057739258, "eval_logps/rejected": -26.433338165283203, "eval_loss": 0.8584771156311035, "eval_rewards/accuracies": 0.6738457679748535, "eval_rewards/chosen": -1.148371934890747, "eval_rewards/margins": 1.1938064098358154, "eval_rewards/rejected": -2.3421783447265625, "eval_runtime": 209.1729, "eval_samples_per_second": 22.532, "eval_steps_per_second": 0.708, "step": 1300 }, { "epoch": 2.376910016977929, "grad_norm": 1666495.375, "learning_rate": 7.358988870024523e-07, "logits/chosen": -8.515084266662598, "logits/rejected": -37.48921203613281, "logps/chosen": -8.515084266662598, "logps/rejected": -37.48921203613281, "loss": 0.2585, "rewards/accuracies": 0.9043750166893005, "rewards/chosen": -0.6187115907669067, "rewards/margins": 2.8392581939697266, "rewards/rejected": -3.4579696655273438, "step": 1400 }, { "epoch": 2.376910016977929, "eval_logits/chosen": -14.21739387512207, "eval_logits/rejected": -27.55201530456543, "eval_logps/chosen": -14.21739387512207, "eval_logps/rejected": -27.55201530456543, "eval_loss": 0.8983097672462463, "eval_rewards/accuracies": 0.6670889854431152, "eval_rewards/chosen": -1.1965607404708862, "eval_rewards/margins": 1.2574853897094727, "eval_rewards/rejected": -2.4540457725524902, "eval_runtime": 209.0344, "eval_samples_per_second": 22.547, "eval_steps_per_second": 0.708, "step": 1400 }, { "epoch": 2.5466893039049237, "grad_norm": 4300686.5, "learning_rate": 7.170345217883418e-07, "logits/chosen": -8.596078872680664, "logits/rejected": -41.23643493652344, "logps/chosen": -8.596078872680664, "logps/rejected": -41.23643493652344, "loss": 0.2118, "rewards/accuracies": 0.9256250262260437, "rewards/chosen": -0.6369072198867798, "rewards/margins": 3.18845272064209, "rewards/rejected": -3.825360059738159, "step": 1500 }, { "epoch": 2.5466893039049237, "eval_logits/chosen": -17.494094848632812, "eval_logits/rejected": -31.98635482788086, "eval_logps/chosen": -17.494094848632812, "eval_logps/rejected": -31.98635482788086, "eval_loss": 0.9939661622047424, "eval_rewards/accuracies": 0.6700450778007507, "eval_rewards/chosen": -1.524230718612671, "eval_rewards/margins": 1.3732486963272095, "eval_rewards/rejected": -2.89747953414917, "eval_runtime": 209.0953, "eval_samples_per_second": 22.54, "eval_steps_per_second": 0.708, "step": 1500 }, { "epoch": 2.7164685908319184, "grad_norm": 2803856.75, "learning_rate": 6.981701565742313e-07, "logits/chosen": -9.222362518310547, "logits/rejected": -43.080440521240234, "logps/chosen": -9.222362518310547, "logps/rejected": -43.080440521240234, "loss": 0.2303, "rewards/accuracies": 0.9228125214576721, "rewards/chosen": -0.6898964643478394, "rewards/margins": 3.3144190311431885, "rewards/rejected": -4.004315376281738, "step": 1600 }, { "epoch": 2.7164685908319184, "eval_logits/chosen": -17.349721908569336, "eval_logits/rejected": -31.9420108795166, "eval_logps/chosen": -17.349721908569336, "eval_logps/rejected": -31.9420108795166, "eval_loss": 1.0111446380615234, "eval_rewards/accuracies": 0.6772241592407227, "eval_rewards/chosen": -1.5097935199737549, "eval_rewards/margins": 1.3832521438598633, "eval_rewards/rejected": -2.8930459022521973, "eval_runtime": 208.9712, "eval_samples_per_second": 22.553, "eval_steps_per_second": 0.708, "step": 1600 }, { "epoch": 2.8862478777589136, "grad_norm": 4121151.5, "learning_rate": 6.793057913601207e-07, "logits/chosen": -9.143606185913086, "logits/rejected": -45.31938171386719, "logps/chosen": -9.143606185913086, "logps/rejected": -45.31938171386719, "loss": 0.1983, "rewards/accuracies": 0.9259374737739563, "rewards/chosen": -0.6908590197563171, "rewards/margins": 3.5353012084960938, "rewards/rejected": -4.226160049438477, "step": 1700 }, { "epoch": 2.8862478777589136, "eval_logits/chosen": -16.322399139404297, "eval_logits/rejected": -31.206256866455078, "eval_logps/chosen": -16.322399139404297, "eval_logps/rejected": -31.206256866455078, "eval_loss": 1.0371383428573608, "eval_rewards/accuracies": 0.6601210832595825, "eval_rewards/chosen": -1.407061219215393, "eval_rewards/margins": 1.4124088287353516, "eval_rewards/rejected": -2.819470167160034, "eval_runtime": 209.0101, "eval_samples_per_second": 22.549, "eval_steps_per_second": 0.708, "step": 1700 }, { "epoch": 3.0560271646859083, "grad_norm": 5452789.5, "learning_rate": 6.604414261460102e-07, "logits/chosen": -9.218870162963867, "logits/rejected": -47.04511260986328, "logps/chosen": -9.218870162963867, "logps/rejected": -47.04511260986328, "loss": 0.1806, "rewards/accuracies": 0.9396673440933228, "rewards/chosen": -0.689750611782074, "rewards/margins": 3.7056329250335693, "rewards/rejected": -4.395383834838867, "step": 1800 }, { "epoch": 3.0560271646859083, "eval_logits/chosen": -17.876995086669922, "eval_logits/rejected": -33.25433349609375, "eval_logps/chosen": -17.876995086669922, "eval_logps/rejected": -33.25433349609375, "eval_loss": 1.10402250289917, "eval_rewards/accuracies": 0.6537866592407227, "eval_rewards/chosen": -1.5625207424163818, "eval_rewards/margins": 1.4617576599121094, "eval_rewards/rejected": -3.024278402328491, "eval_runtime": 209.0247, "eval_samples_per_second": 22.548, "eval_steps_per_second": 0.708, "step": 1800 }, { "epoch": 3.225806451612903, "grad_norm": 5950445.5, "learning_rate": 6.415770609318995e-07, "logits/chosen": -9.886687278747559, "logits/rejected": -55.296512603759766, "logps/chosen": -9.886687278747559, "logps/rejected": -55.296512603759766, "loss": 0.1156, "rewards/accuracies": 0.9618750214576721, "rewards/chosen": -0.7571214437484741, "rewards/margins": 4.463034152984619, "rewards/rejected": -5.220155715942383, "step": 1900 }, { "epoch": 3.225806451612903, "eval_logits/chosen": -20.19826316833496, "eval_logits/rejected": -35.85367202758789, "eval_logps/chosen": -20.19826316833496, "eval_logps/rejected": -35.85367202758789, "eval_loss": 1.301727533340454, "eval_rewards/accuracies": 0.6404842734336853, "eval_rewards/chosen": -1.7946478128433228, "eval_rewards/margins": 1.489564061164856, "eval_rewards/rejected": -3.2842113971710205, "eval_runtime": 208.9047, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 1900 }, { "epoch": 3.395585738539898, "grad_norm": 851176.3125, "learning_rate": 6.22712695717789e-07, "logits/chosen": -10.0634183883667, "logits/rejected": -57.99760818481445, "logps/chosen": -10.0634183883667, "logps/rejected": -57.99760818481445, "loss": 0.1305, "rewards/accuracies": 0.9524999856948853, "rewards/chosen": -0.7792974710464478, "rewards/margins": 4.724158763885498, "rewards/rejected": -5.5034565925598145, "step": 2000 }, { "epoch": 3.395585738539898, "eval_logits/chosen": -22.325353622436523, "eval_logits/rejected": -41.12445068359375, "eval_logps/chosen": -22.325353622436523, "eval_logps/rejected": -41.12445068359375, "eval_loss": 1.3581064939498901, "eval_rewards/accuracies": 0.6620214581489563, "eval_rewards/chosen": -2.007356643676758, "eval_rewards/margins": 1.8039321899414062, "eval_rewards/rejected": -3.8112893104553223, "eval_runtime": 208.9047, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 2000 }, { "epoch": 3.565365025466893, "grad_norm": 2612348.0, "learning_rate": 6.038483305036785e-07, "logits/chosen": -10.501612663269043, "logits/rejected": -61.6521110534668, "logps/chosen": -10.501612663269043, "logps/rejected": -61.6521110534668, "loss": 0.1125, "rewards/accuracies": 0.9612500071525574, "rewards/chosen": -0.8246511220932007, "rewards/margins": 5.043383598327637, "rewards/rejected": -5.868035316467285, "step": 2100 }, { "epoch": 3.565365025466893, "eval_logits/chosen": -23.211849212646484, "eval_logits/rejected": -42.190460205078125, "eval_logps/chosen": -23.211849212646484, "eval_logps/rejected": -42.190460205078125, "eval_loss": 1.3938747644424438, "eval_rewards/accuracies": 0.6575872898101807, "eval_rewards/chosen": -2.096006393432617, "eval_rewards/margins": 1.8218843936920166, "eval_rewards/rejected": -3.917890787124634, "eval_runtime": 208.8957, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 2100 }, { "epoch": 3.735144312393888, "grad_norm": 1196963.375, "learning_rate": 5.849839652895679e-07, "logits/chosen": -11.40926742553711, "logits/rejected": -63.774147033691406, "logps/chosen": -11.40926742553711, "logps/rejected": -63.774147033691406, "loss": 0.1247, "rewards/accuracies": 0.9581249952316284, "rewards/chosen": -0.917951226234436, "rewards/margins": 5.163402557373047, "rewards/rejected": -6.081354141235352, "step": 2200 }, { "epoch": 3.735144312393888, "eval_logits/chosen": -22.39486312866211, "eval_logits/rejected": -41.15921401977539, "eval_logps/chosen": -22.39486312866211, "eval_logps/rejected": -41.15921401977539, "eval_loss": 1.4013347625732422, "eval_rewards/accuracies": 0.6580095887184143, "eval_rewards/chosen": -2.0143074989318848, "eval_rewards/margins": 1.800458312034607, "eval_rewards/rejected": -3.814765691757202, "eval_runtime": 208.8861, "eval_samples_per_second": 22.563, "eval_steps_per_second": 0.709, "step": 2200 }, { "epoch": 3.904923599320883, "grad_norm": 5717707.0, "learning_rate": 5.661196000754574e-07, "logits/chosen": -10.704707145690918, "logits/rejected": -61.2271614074707, "logps/chosen": -10.704707145690918, "logps/rejected": -61.2271614074707, "loss": 0.1432, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8354018926620483, "rewards/margins": 4.986200332641602, "rewards/rejected": -5.821602821350098, "step": 2300 }, { "epoch": 3.904923599320883, "eval_logits/chosen": -22.799854278564453, "eval_logits/rejected": -40.016639709472656, "eval_logps/chosen": -22.799854278564453, "eval_logps/rejected": -40.016639709472656, "eval_loss": 1.422210693359375, "eval_rewards/accuracies": 0.640906572341919, "eval_rewards/chosen": -2.05480694770813, "eval_rewards/margins": 1.64570152759552, "eval_rewards/rejected": -3.700507879257202, "eval_runtime": 208.8717, "eval_samples_per_second": 22.564, "eval_steps_per_second": 0.709, "step": 2300 }, { "epoch": 4.074702886247878, "grad_norm": 3006169.25, "learning_rate": 5.472552348613468e-07, "logits/chosen": -11.348149299621582, "logits/rejected": -65.989501953125, "logps/chosen": -11.348149299621582, "logps/rejected": -65.989501953125, "loss": 0.0807, "rewards/accuracies": 0.9731149077415466, "rewards/chosen": -0.9088042974472046, "rewards/margins": 5.384040355682373, "rewards/rejected": -6.292844772338867, "step": 2400 }, { "epoch": 4.074702886247878, "eval_logits/chosen": -25.678863525390625, "eval_logits/rejected": -45.5009880065918, "eval_logps/chosen": -25.678863525390625, "eval_logps/rejected": -45.5009880065918, "eval_loss": 1.5664551258087158, "eval_rewards/accuracies": 0.6468187570571899, "eval_rewards/chosen": -2.342707633972168, "eval_rewards/margins": 1.9062355756759644, "eval_rewards/rejected": -4.248943328857422, "eval_runtime": 208.9116, "eval_samples_per_second": 22.56, "eval_steps_per_second": 0.708, "step": 2400 }, { "epoch": 4.244482173174872, "grad_norm": 832362.5, "learning_rate": 5.283908696472363e-07, "logits/chosen": -11.587979316711426, "logits/rejected": -70.81718444824219, "logps/chosen": -11.587979316711426, "logps/rejected": -70.81718444824219, "loss": 0.0877, "rewards/accuracies": 0.9759374856948853, "rewards/chosen": -0.9369795918464661, "rewards/margins": 5.846475124359131, "rewards/rejected": -6.783454418182373, "step": 2500 }, { "epoch": 4.244482173174872, "eval_logits/chosen": -26.959436416625977, "eval_logits/rejected": -48.20720291137695, "eval_logps/chosen": -26.959436416625977, "eval_logps/rejected": -48.20720291137695, "eval_loss": 1.609139323234558, "eval_rewards/accuracies": 0.6615991592407227, "eval_rewards/chosen": -2.4707648754119873, "eval_rewards/margins": 2.048799753189087, "eval_rewards/rejected": -4.519564628601074, "eval_runtime": 208.9321, "eval_samples_per_second": 22.558, "eval_steps_per_second": 0.708, "step": 2500 }, { "epoch": 4.4142614601018675, "grad_norm": 1446953.875, "learning_rate": 5.095265044331258e-07, "logits/chosen": -12.649277687072754, "logits/rejected": -72.97715759277344, "logps/chosen": -12.649277687072754, "logps/rejected": -72.97715759277344, "loss": 0.0931, "rewards/accuracies": 0.9690625071525574, "rewards/chosen": -1.0309958457946777, "rewards/margins": 5.968573570251465, "rewards/rejected": -6.999568939208984, "step": 2600 }, { "epoch": 4.4142614601018675, "eval_logits/chosen": -26.158695220947266, "eval_logits/rejected": -47.53694534301758, "eval_logps/chosen": -26.158695220947266, "eval_logps/rejected": -47.53694534301758, "eval_loss": 1.6116589307785034, "eval_rewards/accuracies": 0.6586430668830872, "eval_rewards/chosen": -2.390690803527832, "eval_rewards/margins": 2.0618481636047363, "eval_rewards/rejected": -4.452538967132568, "eval_runtime": 208.8998, "eval_samples_per_second": 22.561, "eval_steps_per_second": 0.708, "step": 2600 }, { "epoch": 4.584040747028863, "grad_norm": 853237.6875, "learning_rate": 4.906621392190153e-07, "logits/chosen": -12.076998710632324, "logits/rejected": -74.51419067382812, "logps/chosen": -12.076998710632324, "logps/rejected": -74.51419067382812, "loss": 0.0769, "rewards/accuracies": 0.9715625047683716, "rewards/chosen": -0.9808881878852844, "rewards/margins": 6.173401355743408, "rewards/rejected": -7.154289722442627, "step": 2700 }, { "epoch": 4.584040747028863, "eval_logits/chosen": -29.247961044311523, "eval_logits/rejected": -51.42329788208008, "eval_logps/chosen": -29.247961044311523, "eval_logps/rejected": -51.42329788208008, "eval_loss": 1.7785123586654663, "eval_rewards/accuracies": 0.6575872898101807, "eval_rewards/chosen": -2.699617624282837, "eval_rewards/margins": 2.14155650138855, "eval_rewards/rejected": -4.8411736488342285, "eval_runtime": 208.92, "eval_samples_per_second": 22.559, "eval_steps_per_second": 0.708, "step": 2700 }, { "epoch": 4.753820033955858, "grad_norm": 64613.96484375, "learning_rate": 4.7179777400490475e-07, "logits/chosen": -13.13824462890625, "logits/rejected": -77.2339859008789, "logps/chosen": -13.13824462890625, "logps/rejected": -77.2339859008789, "loss": 0.0671, "rewards/accuracies": 0.9771875143051147, "rewards/chosen": -1.0873844623565674, "rewards/margins": 6.331437587738037, "rewards/rejected": -7.418821334838867, "step": 2800 }, { "epoch": 4.753820033955858, "eval_logits/chosen": -29.356685638427734, "eval_logits/rejected": -52.42674255371094, "eval_logps/chosen": -29.356685638427734, "eval_logps/rejected": -52.42674255371094, "eval_loss": 1.7627665996551514, "eval_rewards/accuracies": 0.6613879799842834, "eval_rewards/chosen": -2.7104897499084473, "eval_rewards/margins": 2.2310285568237305, "eval_rewards/rejected": -4.941518306732178, "eval_runtime": 208.9366, "eval_samples_per_second": 22.557, "eval_steps_per_second": 0.708, "step": 2800 }, { "epoch": 4.923599320882852, "grad_norm": 1532345.25, "learning_rate": 4.529334087907942e-07, "logits/chosen": -13.333649635314941, "logits/rejected": -78.9609603881836, "logps/chosen": -13.333649635314941, "logps/rejected": -78.9609603881836, "loss": 0.0704, "rewards/accuracies": 0.9737499952316284, "rewards/chosen": -1.1064611673355103, "rewards/margins": 6.48252010345459, "rewards/rejected": -7.588980674743652, "step": 2900 }, { "epoch": 4.923599320882852, "eval_logits/chosen": -28.425640106201172, "eval_logits/rejected": -49.83637237548828, "eval_logps/chosen": -28.425640106201172, "eval_logps/rejected": -49.83637237548828, "eval_loss": 1.750953197479248, "eval_rewards/accuracies": 0.6472409963607788, "eval_rewards/chosen": -2.6173858642578125, "eval_rewards/margins": 2.065096139907837, "eval_rewards/rejected": -4.68248176574707, "eval_runtime": 209.0155, "eval_samples_per_second": 22.549, "eval_steps_per_second": 0.708, "step": 2900 }, { "epoch": 5.093378607809847, "grad_norm": 301765.34375, "learning_rate": 4.3406904357668363e-07, "logits/chosen": -13.260400772094727, "logits/rejected": -81.56822204589844, "logps/chosen": -13.260400772094727, "logps/rejected": -81.56822204589844, "loss": 0.052, "rewards/accuracies": 0.9834374785423279, "rewards/chosen": -1.090865969657898, "rewards/margins": 6.772265434265137, "rewards/rejected": -7.863131523132324, "step": 3000 }, { "epoch": 5.093378607809847, "eval_logits/chosen": -32.38473892211914, "eval_logits/rejected": -57.836265563964844, "eval_logps/chosen": -32.38473892211914, "eval_logps/rejected": -57.836265563964844, "eval_loss": 1.8803811073303223, "eval_rewards/accuracies": 0.6659393906593323, "eval_rewards/chosen": -3.0132949352264404, "eval_rewards/margins": 2.4691765308380127, "eval_rewards/rejected": -5.482471466064453, "eval_runtime": 209.0035, "eval_samples_per_second": 22.55, "eval_steps_per_second": 0.708, "step": 3000 }, { "epoch": 5.2631578947368425, "grad_norm": 2405187.25, "learning_rate": 4.152046783625731e-07, "logits/chosen": -13.827689170837402, "logits/rejected": -86.41466522216797, "logps/chosen": -13.827689170837402, "logps/rejected": -86.41466522216797, "loss": 0.0433, "rewards/accuracies": 0.9871875047683716, "rewards/chosen": -1.158460259437561, "rewards/margins": 7.181718826293945, "rewards/rejected": -8.340180397033691, "step": 3100 }, { "epoch": 5.2631578947368425, "eval_logits/chosen": -32.28849411010742, "eval_logits/rejected": -57.642208099365234, "eval_logps/chosen": -32.28849411010742, "eval_logps/rejected": -57.642208099365234, "eval_loss": 1.9446017742156982, "eval_rewards/accuracies": 0.6600272059440613, "eval_rewards/chosen": -3.0036704540252686, "eval_rewards/margins": 2.459395170211792, "eval_rewards/rejected": -5.463066101074219, "eval_runtime": 208.9454, "eval_samples_per_second": 22.556, "eval_steps_per_second": 0.708, "step": 3100 }, { "epoch": 5.432937181663837, "grad_norm": 582192.5, "learning_rate": 3.9634031314846257e-07, "logits/chosen": -13.921838760375977, "logits/rejected": -87.0871810913086, "logps/chosen": -13.921838760375977, "logps/rejected": -87.0871810913086, "loss": 0.0501, "rewards/accuracies": 0.9846875071525574, "rewards/chosen": -1.164242148399353, "rewards/margins": 7.242315292358398, "rewards/rejected": -8.4065580368042, "step": 3200 }, { "epoch": 5.432937181663837, "eval_logits/chosen": -34.68353271484375, "eval_logits/rejected": -61.13566970825195, "eval_logps/chosen": -34.68353271484375, "eval_logps/rejected": -61.13566970825195, "eval_loss": 2.0483999252319336, "eval_rewards/accuracies": 0.6620214581489563, "eval_rewards/chosen": -3.2431745529174805, "eval_rewards/margins": 2.5692365169525146, "eval_rewards/rejected": -5.812410831451416, "eval_runtime": 208.9578, "eval_samples_per_second": 22.555, "eval_steps_per_second": 0.708, "step": 3200 }, { "epoch": 5.602716468590832, "grad_norm": 2969641.25, "learning_rate": 3.77475947934352e-07, "logits/chosen": -14.580632209777832, "logits/rejected": -90.8200454711914, "logps/chosen": -14.580632209777832, "logps/rejected": -90.8200454711914, "loss": 0.0564, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -1.224000096321106, "rewards/margins": 7.565284252166748, "rewards/rejected": -8.789283752441406, "step": 3300 }, { "epoch": 5.602716468590832, "eval_logits/chosen": -35.181819915771484, "eval_logits/rejected": -62.393310546875, "eval_logps/chosen": -35.181819915771484, "eval_logps/rejected": -62.393310546875, "eval_loss": 2.0540757179260254, "eval_rewards/accuracies": 0.6639217734336853, "eval_rewards/chosen": -3.2930033206939697, "eval_rewards/margins": 2.645172119140625, "eval_rewards/rejected": -5.938176155090332, "eval_runtime": 485.1257, "eval_samples_per_second": 9.715, "eval_steps_per_second": 0.305, "step": 3300 }, { "epoch": 5.772495755517827, "grad_norm": 651674.5625, "learning_rate": 3.5861158272024146e-07, "logits/chosen": -14.960144996643066, "logits/rejected": -92.49775695800781, "logps/chosen": -14.960144996643066, "logps/rejected": -92.49775695800781, "loss": 0.0449, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -1.2752126455307007, "rewards/margins": 7.666284561157227, "rewards/rejected": -8.941495895385742, "step": 3400 }, { "epoch": 5.772495755517827, "eval_logits/chosen": -35.62704086303711, "eval_logits/rejected": -61.88102340698242, "eval_logps/chosen": -35.62704086303711, "eval_logps/rejected": -61.88102340698242, "eval_loss": 2.136263847351074, "eval_rewards/accuracies": 0.6544200778007507, "eval_rewards/chosen": -3.3375253677368164, "eval_rewards/margins": 2.549421787261963, "eval_rewards/rejected": -5.886947154998779, "eval_runtime": 208.9278, "eval_samples_per_second": 22.558, "eval_steps_per_second": 0.708, "step": 3400 }, { "epoch": 5.942275042444821, "grad_norm": 238421.484375, "learning_rate": 3.397472175061309e-07, "logits/chosen": -15.450565338134766, "logits/rejected": -94.02888488769531, "logps/chosen": -15.450565338134766, "logps/rejected": -94.02888488769531, "loss": 0.0502, "rewards/accuracies": 0.9834374785423279, "rewards/chosen": -1.313481092453003, "rewards/margins": 7.784170150756836, "rewards/rejected": -9.097650527954102, "step": 3500 }, { "epoch": 5.942275042444821, "eval_logits/chosen": -34.725975036621094, "eval_logits/rejected": -59.86567687988281, "eval_logps/chosen": -34.725975036621094, "eval_logps/rejected": -59.86567687988281, "eval_loss": 2.154789924621582, "eval_rewards/accuracies": 0.6511589884757996, "eval_rewards/chosen": -3.2474188804626465, "eval_rewards/margins": 2.437992811203003, "eval_rewards/rejected": -5.68541145324707, "eval_runtime": 208.9288, "eval_samples_per_second": 22.558, "eval_steps_per_second": 0.708, "step": 3500 }, { "epoch": 6.112054329371817, "grad_norm": 355115.0, "learning_rate": 3.2088285229202035e-07, "logits/chosen": -14.956696510314941, "logits/rejected": -96.87789154052734, "logps/chosen": -14.956696510314941, "logps/rejected": -96.87789154052734, "loss": 0.0237, "rewards/accuracies": 0.9918649196624756, "rewards/chosen": -1.269476056098938, "rewards/margins": 8.1198148727417, "rewards/rejected": -9.389290809631348, "step": 3600 }, { "epoch": 6.112054329371817, "eval_logits/chosen": -36.520957946777344, "eval_logits/rejected": -62.83375930786133, "eval_logps/chosen": -36.520957946777344, "eval_logps/rejected": -62.83375930786133, "eval_loss": 2.1961820125579834, "eval_rewards/accuracies": 0.6507366895675659, "eval_rewards/chosen": -3.42691707611084, "eval_rewards/margins": 2.5553040504455566, "eval_rewards/rejected": -5.9822211265563965, "eval_runtime": 208.9997, "eval_samples_per_second": 22.55, "eval_steps_per_second": 0.708, "step": 3600 } ], "logging_steps": 100, "max_steps": 5301, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 32, "trial_name": null, "trial_params": null }