{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9871794871794872, "eval_steps": 500, "global_step": 699, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.021367521367521368, "grad_norm": 16.125, "learning_rate": 1.7556090538745385e-06, "logits/chosen": -3.5723044872283936, "logits/rejected": -3.5033211708068848, "logps/chosen": -41.10147476196289, "logps/rejected": -79.84379577636719, "loss": 0.6923, "rewards/accuracies": 0.3291666507720947, "rewards/chosen": 0.0008107174071483314, "rewards/margins": 0.0017922676634043455, "rewards/rejected": -0.0009815500816330314, "step": 5 }, { "epoch": 0.042735042735042736, "grad_norm": 16.75, "learning_rate": 3.950120371217711e-06, "logits/chosen": -3.5865039825439453, "logits/rejected": -3.506371021270752, "logps/chosen": -40.01554870605469, "logps/rejected": -78.2536849975586, "loss": 0.679, "rewards/accuracies": 0.8291667103767395, "rewards/chosen": -0.0026048908475786448, "rewards/margins": 0.028653645887970924, "rewards/rejected": -0.031258534640073776, "step": 10 }, { "epoch": 0.0641025641025641, "grad_norm": 14.6875, "learning_rate": 6.144631688560886e-06, "logits/chosen": -3.5993950366973877, "logits/rejected": -3.5078125, "logps/chosen": -39.8319206237793, "logps/rejected": -81.44599151611328, "loss": 0.6296, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.006084696389734745, "rewards/margins": 0.1335393637418747, "rewards/rejected": -0.13962405920028687, "step": 15 }, { "epoch": 0.08547008547008547, "grad_norm": 13.75, "learning_rate": 8.339143005904057e-06, "logits/chosen": -3.5526351928710938, "logits/rejected": -3.4919040203094482, "logps/chosen": -39.764862060546875, "logps/rejected": -80.73335266113281, "loss": 0.5434, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.015795540064573288, "rewards/margins": 0.33912044763565063, "rewards/rejected": -0.3549160361289978, "step": 20 }, { "epoch": 0.10683760683760683, "grad_norm": 11.1875, "learning_rate": 1.0533654323247232e-05, "logits/chosen": -3.5133590698242188, "logits/rejected": -3.472569227218628, "logps/chosen": -40.0025749206543, "logps/rejected": -85.53997802734375, "loss": 0.3966, "rewards/accuracies": 0.9791666865348816, "rewards/chosen": -0.04189577326178551, "rewards/margins": 0.7760334014892578, "rewards/rejected": -0.8179291486740112, "step": 25 }, { "epoch": 0.1282051282051282, "grad_norm": 7.59375, "learning_rate": 1.2728165640590407e-05, "logits/chosen": -3.44189190864563, "logits/rejected": -3.4465813636779785, "logps/chosen": -42.446632385253906, "logps/rejected": -93.0277328491211, "loss": 0.2592, "rewards/accuracies": 0.9958332777023315, "rewards/chosen": -0.1323380172252655, "rewards/margins": 1.4258702993392944, "rewards/rejected": -1.5582085847854614, "step": 30 }, { "epoch": 0.14957264957264957, "grad_norm": 4.28125, "learning_rate": 1.4922676957933578e-05, "logits/chosen": -3.223677158355713, "logits/rejected": -3.2687485218048096, "logps/chosen": -44.17692947387695, "logps/rejected": -108.72818756103516, "loss": 0.1195, "rewards/accuracies": 1.0, "rewards/chosen": -0.43831247091293335, "rewards/margins": 2.755509853363037, "rewards/rejected": -3.1938223838806152, "step": 35 }, { "epoch": 0.17094017094017094, "grad_norm": 2.03125, "learning_rate": 1.5360556888469565e-05, "logits/chosen": -2.780639410018921, "logits/rejected": -2.814786911010742, "logps/chosen": -52.43251419067383, "logps/rejected": -143.02699279785156, "loss": 0.0425, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -1.2548245191574097, "rewards/margins": 5.2223920822143555, "rewards/rejected": -6.477217197418213, "step": 40 }, { "epoch": 0.19230769230769232, "grad_norm": 1.6953125, "learning_rate": 1.535640428282884e-05, "logits/chosen": -2.3407504558563232, "logits/rejected": -2.3020548820495605, "logps/chosen": -51.26860427856445, "logps/rejected": -161.84654235839844, "loss": 0.0168, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.2653954029083252, "rewards/margins": 7.260884761810303, "rewards/rejected": -8.526280403137207, "step": 45 }, { "epoch": 0.21367521367521367, "grad_norm": 9.3125, "learning_rate": 1.5349059809872097e-05, "logits/chosen": -1.9962679147720337, "logits/rejected": -1.9376245737075806, "logps/chosen": -57.276153564453125, "logps/rejected": -180.43075561523438, "loss": 0.0199, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.6868196725845337, "rewards/margins": 8.579647064208984, "rewards/rejected": -10.266467094421387, "step": 50 }, { "epoch": 0.23504273504273504, "grad_norm": 1.484375, "learning_rate": 1.5338527542732884e-05, "logits/chosen": -1.9884917736053467, "logits/rejected": -1.9206396341323853, "logps/chosen": -58.84391403198242, "logps/rejected": -196.67623901367188, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": -1.8948497772216797, "rewards/margins": 9.876777648925781, "rewards/rejected": -11.771627426147461, "step": 55 }, { "epoch": 0.2564102564102564, "grad_norm": 5.25, "learning_rate": 1.532481332244717e-05, "logits/chosen": -1.9375540018081665, "logits/rejected": -1.8217322826385498, "logps/chosen": -57.795997619628906, "logps/rejected": -198.72848510742188, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.8656346797943115, "rewards/margins": 10.204580307006836, "rewards/rejected": -12.070215225219727, "step": 60 }, { "epoch": 0.2777777777777778, "grad_norm": 16.125, "learning_rate": 1.5307924754713968e-05, "logits/chosen": -1.9060790538787842, "logits/rejected": -1.8332831859588623, "logps/chosen": -58.628753662109375, "logps/rejected": -194.50588989257812, "loss": 0.0115, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -1.797798752784729, "rewards/margins": 9.891462326049805, "rewards/rejected": -11.689262390136719, "step": 65 }, { "epoch": 0.29914529914529914, "grad_norm": 0.2470703125, "learning_rate": 1.528787120567736e-05, "logits/chosen": -1.974802017211914, "logits/rejected": -1.877560019493103, "logps/chosen": -52.279327392578125, "logps/rejected": -189.9191131591797, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -1.2973945140838623, "rewards/margins": 10.00140380859375, "rewards/rejected": -11.298797607421875, "step": 70 }, { "epoch": 0.32051282051282054, "grad_norm": 0.640625, "learning_rate": 1.526466379673215e-05, "logits/chosen": -1.9735548496246338, "logits/rejected": -1.870365858078003, "logps/chosen": -53.06673049926758, "logps/rejected": -199.06517028808594, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.3975543975830078, "rewards/margins": 10.824033737182617, "rewards/rejected": -12.221589088439941, "step": 75 }, { "epoch": 0.3418803418803419, "grad_norm": 10.5, "learning_rate": 1.5238315398356126e-05, "logits/chosen": -1.9223344326019287, "logits/rejected": -1.7829482555389404, "logps/chosen": -58.65262985229492, "logps/rejected": -205.0301055908203, "loss": 0.0075, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9159812927246094, "rewards/margins": 10.972046852111816, "rewards/rejected": -12.888028144836426, "step": 80 }, { "epoch": 0.36324786324786323, "grad_norm": 0.055419921875, "learning_rate": 1.5208840622972272e-05, "logits/chosen": -1.9187581539154053, "logits/rejected": -1.742376685142517, "logps/chosen": -60.388633728027344, "logps/rejected": -209.2400360107422, "loss": 0.0215, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -1.904120683670044, "rewards/margins": 11.195540428161621, "rewards/rejected": -13.099660873413086, "step": 85 }, { "epoch": 0.38461538461538464, "grad_norm": 0.97265625, "learning_rate": 1.5176255816844948e-05, "logits/chosen": -1.870398759841919, "logits/rejected": -1.6664142608642578, "logps/chosen": -52.21706008911133, "logps/rejected": -206.40274047851562, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.3197202682495117, "rewards/margins": 11.56867504119873, "rewards/rejected": -12.888395309448242, "step": 90 }, { "epoch": 0.405982905982906, "grad_norm": 0.609375, "learning_rate": 1.5140579051014502e-05, "logits/chosen": -1.8215786218643188, "logits/rejected": -1.587632179260254, "logps/chosen": -56.239166259765625, "logps/rejected": -221.23486328125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -1.6193969249725342, "rewards/margins": 12.424324035644531, "rewards/rejected": -14.043721199035645, "step": 95 }, { "epoch": 0.42735042735042733, "grad_norm": 0.19140625, "learning_rate": 1.5101830111275334e-05, "logits/chosen": -1.761718511581421, "logits/rejected": -1.5578025579452515, "logps/chosen": -58.93622970581055, "logps/rejected": -211.36245727539062, "loss": 0.0134, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9031003713607788, "rewards/margins": 11.43458366394043, "rewards/rejected": -13.337686538696289, "step": 100 }, { "epoch": 0.44871794871794873, "grad_norm": 5.25, "learning_rate": 1.5060030487203004e-05, "logits/chosen": -1.7170766592025757, "logits/rejected": -1.4669135808944702, "logps/chosen": -59.824798583984375, "logps/rejected": -224.89785766601562, "loss": 0.0058, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9724791049957275, "rewards/margins": 12.608641624450684, "rewards/rejected": -14.581120491027832, "step": 105 }, { "epoch": 0.4700854700854701, "grad_norm": 0.06396484375, "learning_rate": 1.501520336023643e-05, "logits/chosen": -1.6807842254638672, "logits/rejected": -1.377798318862915, "logps/chosen": -60.823036193847656, "logps/rejected": -220.12026977539062, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -2.0682151317596436, "rewards/margins": 12.238883972167969, "rewards/rejected": -14.307100296020508, "step": 110 }, { "epoch": 0.49145299145299143, "grad_norm": 1.8984375, "learning_rate": 1.4967373590821828e-05, "logits/chosen": -1.6894537210464478, "logits/rejected": -1.387304425239563, "logps/chosen": -70.60054779052734, "logps/rejected": -224.40921020507812, "loss": 0.0135, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -3.0002286434173584, "rewards/margins": 11.61396598815918, "rewards/rejected": -14.6141939163208, "step": 115 }, { "epoch": 0.5128205128205128, "grad_norm": 0.07080078125, "learning_rate": 1.491656770462546e-05, "logits/chosen": -1.549338936805725, "logits/rejected": -1.1871496438980103, "logps/chosen": -62.74003219604492, "logps/rejected": -226.72341918945312, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -2.2881884574890137, "rewards/margins": 12.516637802124023, "rewards/rejected": -14.804829597473145, "step": 120 }, { "epoch": 0.5341880341880342, "grad_norm": 9.5625, "learning_rate": 1.4862813877822923e-05, "logits/chosen": -1.5509467124938965, "logits/rejected": -1.188998818397522, "logps/chosen": -58.65150833129883, "logps/rejected": -226.0467071533203, "loss": 0.0104, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.832358956336975, "rewards/margins": 13.06682300567627, "rewards/rejected": -14.89918327331543, "step": 125 }, { "epoch": 0.5555555555555556, "grad_norm": 0.1904296875, "learning_rate": 1.4806141921473063e-05, "logits/chosen": -1.520416498184204, "logits/rejected": -1.0859997272491455, "logps/chosen": -57.10784149169922, "logps/rejected": -232.3367156982422, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.6952478885650635, "rewards/margins": 13.670125961303711, "rewards/rejected": -15.365373611450195, "step": 130 }, { "epoch": 0.5769230769230769, "grad_norm": 0.0791015625, "learning_rate": 1.4746583264985202e-05, "logits/chosen": -1.515005350112915, "logits/rejected": -1.0654128789901733, "logps/chosen": -57.121559143066406, "logps/rejected": -238.92703247070312, "loss": 0.0058, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.67409348487854, "rewards/margins": 14.308405876159668, "rewards/rejected": -15.982501029968262, "step": 135 }, { "epoch": 0.5982905982905983, "grad_norm": 0.033203125, "learning_rate": 1.468417093868888e-05, "logits/chosen": -1.547180414199829, "logits/rejected": -1.1224654912948608, "logps/chosen": -55.69971466064453, "logps/rejected": -235.5997772216797, "loss": 0.004, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.6552823781967163, "rewards/margins": 14.057337760925293, "rewards/rejected": -15.712621688842773, "step": 140 }, { "epoch": 0.6196581196581197, "grad_norm": 0.09326171875, "learning_rate": 1.4618939555515721e-05, "logits/chosen": -1.4791333675384521, "logits/rejected": -1.0679481029510498, "logps/chosen": -60.25483322143555, "logps/rejected": -235.65640258789062, "loss": 0.0038, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.0124049186706543, "rewards/margins": 13.800189018249512, "rewards/rejected": -15.812593460083008, "step": 145 }, { "epoch": 0.6410256410256411, "grad_norm": 0.0203857421875, "learning_rate": 1.455092529180363e-05, "logits/chosen": -1.5142757892608643, "logits/rejected": -1.0721313953399658, "logps/chosen": -58.559837341308594, "logps/rejected": -238.7608184814453, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.9865844249725342, "rewards/margins": 14.183688163757324, "rewards/rejected": -16.170270919799805, "step": 150 }, { "epoch": 0.6623931623931624, "grad_norm": 4.375, "learning_rate": 1.4480165867233946e-05, "logits/chosen": -1.5100654363632202, "logits/rejected": -1.0968310832977295, "logps/chosen": -63.598304748535156, "logps/rejected": -246.39779663085938, "loss": 0.0081, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.3885955810546875, "rewards/margins": 14.358621597290039, "rewards/rejected": -16.747217178344727, "step": 155 }, { "epoch": 0.6837606837606838, "grad_norm": 6.375, "learning_rate": 1.440670052391267e-05, "logits/chosen": -1.5425626039505005, "logits/rejected": -1.1030757427215576, "logps/chosen": -59.99821090698242, "logps/rejected": -236.77932739257812, "loss": 0.0057, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.146557569503784, "rewards/margins": 13.929641723632812, "rewards/rejected": -16.07619857788086, "step": 160 }, { "epoch": 0.7051282051282052, "grad_norm": 2.140625, "learning_rate": 1.4330570004607398e-05, "logits/chosen": -1.5993397235870361, "logits/rejected": -1.1524969339370728, "logps/chosen": -58.561241149902344, "logps/rejected": -247.20388793945312, "loss": 0.0033, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.9490623474121094, "rewards/margins": 14.975687026977539, "rewards/rejected": -16.924747467041016, "step": 165 }, { "epoch": 0.7264957264957265, "grad_norm": 1.0, "learning_rate": 1.4251816530151986e-05, "logits/chosen": -1.550431489944458, "logits/rejected": -1.151962399482727, "logps/chosen": -62.471397399902344, "logps/rejected": -246.4335479736328, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -2.2203402519226074, "rewards/margins": 14.611509323120117, "rewards/rejected": -16.831850051879883, "step": 170 }, { "epoch": 0.7478632478632479, "grad_norm": 0.10791015625, "learning_rate": 1.4170483776031526e-05, "logits/chosen": -1.5044711828231812, "logits/rejected": -1.0406298637390137, "logps/chosen": -64.19065856933594, "logps/rejected": -249.743408203125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.455766201019287, "rewards/margins": 14.67004680633545, "rewards/rejected": -17.125812530517578, "step": 175 }, { "epoch": 0.7692307692307693, "grad_norm": 0.30078125, "learning_rate": 1.4086616848160574e-05, "logits/chosen": -1.512284278869629, "logits/rejected": -1.0526224374771118, "logps/chosen": -68.66128540039062, "logps/rejected": -245.33120727539062, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -2.8888022899627686, "rewards/margins": 13.867166519165039, "rewards/rejected": -16.75596809387207, "step": 180 }, { "epoch": 0.7905982905982906, "grad_norm": 0.1064453125, "learning_rate": 1.4000262257868096e-05, "logits/chosen": -1.442132830619812, "logits/rejected": -0.9388168454170227, "logps/chosen": -61.00693893432617, "logps/rejected": -247.60067749023438, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.1700937747955322, "rewards/margins": 14.933375358581543, "rewards/rejected": -17.103466033935547, "step": 185 }, { "epoch": 0.811965811965812, "grad_norm": 0.65625, "learning_rate": 1.3911467896102994e-05, "logits/chosen": -1.3841662406921387, "logits/rejected": -0.9196082353591919, "logps/chosen": -56.78447341918945, "logps/rejected": -246.0129852294922, "loss": 0.0049, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -1.7434440851211548, "rewards/margins": 15.205429077148438, "rewards/rejected": -16.94887351989746, "step": 190 }, { "epoch": 0.8333333333333334, "grad_norm": 0.047119140625, "learning_rate": 1.3820283006874503e-05, "logits/chosen": -1.36122727394104, "logits/rejected": -0.899361789226532, "logps/chosen": -64.0156478881836, "logps/rejected": -245.7220458984375, "loss": 0.0077, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.193249464035034, "rewards/margins": 14.599599838256836, "rewards/rejected": -16.792850494384766, "step": 195 }, { "epoch": 0.8547008547008547, "grad_norm": 0.220703125, "learning_rate": 1.372675815994221e-05, "logits/chosen": -1.2733592987060547, "logits/rejected": -0.7683244347572327, "logps/chosen": -54.458702087402344, "logps/rejected": -250.61141967773438, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.6900008916854858, "rewards/margins": 15.71391773223877, "rewards/rejected": -17.403919219970703, "step": 200 }, { "epoch": 0.8760683760683761, "grad_norm": 41.75, "learning_rate": 1.3630945222770829e-05, "logits/chosen": -1.3039919137954712, "logits/rejected": -0.7636764645576477, "logps/chosen": -61.440521240234375, "logps/rejected": -256.41033935546875, "loss": 0.0309, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.126218795776367, "rewards/margins": 15.685290336608887, "rewards/rejected": -17.811508178710938, "step": 205 }, { "epoch": 0.8974358974358975, "grad_norm": 0.8046875, "learning_rate": 1.3532897331765301e-05, "logits/chosen": -1.339553713798523, "logits/rejected": -0.7716963291168213, "logps/chosen": -62.3395881652832, "logps/rejected": -254.98672485351562, "loss": 0.0111, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.235722064971924, "rewards/margins": 15.472686767578125, "rewards/rejected": -17.70840835571289, "step": 210 }, { "epoch": 0.9188034188034188, "grad_norm": 0.314453125, "learning_rate": 1.3432668862802134e-05, "logits/chosen": -1.256614089012146, "logits/rejected": -0.6890861988067627, "logps/chosen": -62.10491943359375, "logps/rejected": -249.8031005859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -2.166273593902588, "rewards/margins": 14.911642074584961, "rewards/rejected": -17.07791519165039, "step": 215 }, { "epoch": 0.9401709401709402, "grad_norm": 0.68359375, "learning_rate": 1.3330315401073371e-05, "logits/chosen": -1.2784340381622314, "logits/rejected": -0.7194468379020691, "logps/chosen": -64.89430236816406, "logps/rejected": -246.3287811279297, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -2.396063804626465, "rewards/margins": 14.473241806030273, "rewards/rejected": -16.869304656982422, "step": 220 }, { "epoch": 0.9615384615384616, "grad_norm": 11.5625, "learning_rate": 1.3225893710259887e-05, "logits/chosen": -1.1662753820419312, "logits/rejected": -0.5432911515235901, "logps/chosen": -66.24633026123047, "logps/rejected": -253.39212036132812, "loss": 0.01, "rewards/accuracies": 0.9916666746139526, "rewards/chosen": -2.7053072452545166, "rewards/margins": 14.921958923339844, "rewards/rejected": -17.62726402282715, "step": 225 }, { "epoch": 0.9829059829059829, "grad_norm": 1.5859375, "learning_rate": 1.3119461701051105e-05, "logits/chosen": -1.161176323890686, "logits/rejected": -0.45327743887901306, "logps/chosen": -62.70893096923828, "logps/rejected": -251.8865966796875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.506155252456665, "rewards/margins": 15.214022636413574, "rewards/rejected": -17.720178604125977, "step": 230 }, { "epoch": 0.9957264957264957, "eval_logits/chosen": -1.1960320472717285, "eval_logits/rejected": -0.4940774440765381, "eval_logps/chosen": -66.11089324951172, "eval_logps/rejected": -261.5491943359375, "eval_loss": 0.0009878784185275435, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.6667160987854004, "eval_rewards/margins": 15.686513900756836, "eval_rewards/rejected": -18.353229522705078, "eval_runtime": 9.4343, "eval_samples_per_second": 21.199, "eval_steps_per_second": 21.199, "step": 233 }, { "epoch": 1.0042735042735043, "grad_norm": 0.11669921875, "learning_rate": 1.3011078399028605e-05, "logits/chosen": -1.1641021966934204, "logits/rejected": -0.48939043283462524, "logps/chosen": -65.07087707519531, "logps/rejected": -266.9398498535156, "loss": 0.0032, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -2.5494394302368164, "rewards/margins": 16.291004180908203, "rewards/rejected": -18.840442657470703, "step": 235 }, { "epoch": 1.0256410256410255, "grad_norm": 0.130859375, "learning_rate": 1.2900803911931431e-05, "logits/chosen": -1.1983628273010254, "logits/rejected": -0.48308929800987244, "logps/chosen": -64.48258209228516, "logps/rejected": -265.9141540527344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.472846746444702, "rewards/margins": 16.26525115966797, "rewards/rejected": -18.73809814453125, "step": 240 }, { "epoch": 1.047008547008547, "grad_norm": 0.052978515625, "learning_rate": 1.2788699396321252e-05, "logits/chosen": -1.1283172369003296, "logits/rejected": -0.419720321893692, "logps/chosen": -62.42974853515625, "logps/rejected": -256.7882385253906, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.3851864337921143, "rewards/margins": 15.727444648742676, "rewards/rejected": -18.11263084411621, "step": 245 }, { "epoch": 1.0683760683760684, "grad_norm": 0.000896453857421875, "learning_rate": 1.2674827023665853e-05, "logits/chosen": -1.1639864444732666, "logits/rejected": -0.4902980327606201, "logps/chosen": -67.04655456542969, "logps/rejected": -274.08026123046875, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -2.747849941253662, "rewards/margins": 16.89676284790039, "rewards/rejected": -19.64461326599121, "step": 250 }, { "epoch": 1.0897435897435896, "grad_norm": 0.208984375, "learning_rate": 1.255924994585978e-05, "logits/chosen": -1.1226640939712524, "logits/rejected": -0.4087928235530853, "logps/chosen": -69.7280044555664, "logps/rejected": -270.5373840332031, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.9504196643829346, "rewards/margins": 16.320491790771484, "rewards/rejected": -19.27090835571289, "step": 255 }, { "epoch": 1.1111111111111112, "grad_norm": 0.00885009765625, "learning_rate": 1.2442032260201255e-05, "logits/chosen": -1.1412135362625122, "logits/rejected": -0.4527861475944519, "logps/chosen": -69.53722381591797, "logps/rejected": -270.5766296386719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.999143123626709, "rewards/margins": 16.23387336730957, "rewards/rejected": -19.233016967773438, "step": 260 }, { "epoch": 1.1324786324786325, "grad_norm": 0.263671875, "learning_rate": 1.2323238973844796e-05, "logits/chosen": -1.1780011653900146, "logits/rejected": -0.45843830704689026, "logps/chosen": -71.02696228027344, "logps/rejected": -280.1233825683594, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0659804344177246, "rewards/margins": 16.907106399536133, "rewards/rejected": -19.973085403442383, "step": 265 }, { "epoch": 1.1538461538461537, "grad_norm": 0.02197265625, "learning_rate": 1.2202935967749212e-05, "logits/chosen": -1.0917080640792847, "logits/rejected": -0.38812121748924255, "logps/chosen": -74.89707946777344, "logps/rejected": -274.8170166015625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.3719239234924316, "rewards/margins": 16.13779067993164, "rewards/rejected": -19.509714126586914, "step": 270 }, { "epoch": 1.1752136752136753, "grad_norm": 0.72265625, "learning_rate": 1.2081189960141038e-05, "logits/chosen": -1.1228703260421753, "logits/rejected": -0.39134496450424194, "logps/chosen": -69.5420913696289, "logps/rejected": -274.7674255371094, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.9194247722625732, "rewards/margins": 16.717750549316406, "rewards/rejected": -19.637174606323242, "step": 275 }, { "epoch": 1.1965811965811965, "grad_norm": 1.171875, "learning_rate": 1.1958068469513604e-05, "logits/chosen": -1.1384648084640503, "logits/rejected": -0.39668112993240356, "logps/chosen": -72.6049575805664, "logps/rejected": -283.6707458496094, "loss": 0.0092, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.210206985473633, "rewards/margins": 17.243507385253906, "rewards/rejected": -20.453712463378906, "step": 280 }, { "epoch": 1.217948717948718, "grad_norm": 0.013916015625, "learning_rate": 1.1833639777182316e-05, "logits/chosen": -1.0187714099884033, "logits/rejected": -0.2573033571243286, "logps/chosen": -70.54331970214844, "logps/rejected": -280.8934020996094, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.204702377319336, "rewards/margins": 17.17652702331543, "rewards/rejected": -20.3812313079834, "step": 285 }, { "epoch": 1.2393162393162394, "grad_norm": 0.012451171875, "learning_rate": 1.170797288941685e-05, "logits/chosen": -0.9472154378890991, "logits/rejected": -0.2180492877960205, "logps/chosen": -79.2474594116211, "logps/rejected": -289.6960144042969, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.919898271560669, "rewards/margins": 17.01470947265625, "rewards/rejected": -20.934606552124023, "step": 290 }, { "epoch": 1.2606837606837606, "grad_norm": 0.3359375, "learning_rate": 1.1581137499171342e-05, "logits/chosen": -0.9705740213394165, "logits/rejected": -0.21009401977062225, "logps/chosen": -79.57368469238281, "logps/rejected": -278.9507751464844, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.9748988151550293, "rewards/margins": 16.132137298583984, "rewards/rejected": -20.10703468322754, "step": 295 }, { "epoch": 1.282051282051282, "grad_norm": 0.017822265625, "learning_rate": 1.145320394743371e-05, "logits/chosen": -0.9654294848442078, "logits/rejected": -0.22467419505119324, "logps/chosen": -74.53304290771484, "logps/rejected": -270.533447265625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.5043253898620605, "rewards/margins": 15.963613510131836, "rewards/rejected": -19.467941284179688, "step": 300 }, { "epoch": 1.3034188034188035, "grad_norm": 0.0078125, "learning_rate": 1.1324243184215622e-05, "logits/chosen": -0.969558835029602, "logits/rejected": -0.21970291435718536, "logps/chosen": -73.47796630859375, "logps/rejected": -285.55706787109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.5001537799835205, "rewards/margins": 17.251827239990234, "rewards/rejected": -20.75197982788086, "step": 305 }, { "epoch": 1.3247863247863247, "grad_norm": 0.0084228515625, "learning_rate": 1.1194326729204686e-05, "logits/chosen": -1.0194365978240967, "logits/rejected": -0.25868645310401917, "logps/chosen": -73.23660278320312, "logps/rejected": -280.81256103515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4368844032287598, "rewards/margins": 16.979907989501953, "rewards/rejected": -20.416793823242188, "step": 310 }, { "epoch": 1.3461538461538463, "grad_norm": 0.00933837890625, "learning_rate": 1.1063526632100717e-05, "logits/chosen": -1.0158953666687012, "logits/rejected": -0.3026788830757141, "logps/chosen": -75.67848205566406, "logps/rejected": -277.49310302734375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -3.546459197998047, "rewards/margins": 16.51190948486328, "rewards/rejected": -20.058368682861328, "step": 315 }, { "epoch": 1.3675213675213675, "grad_norm": 0.005615234375, "learning_rate": 1.0931915432658055e-05, "logits/chosen": -1.023720383644104, "logits/rejected": -0.28601619601249695, "logps/chosen": -71.98751068115234, "logps/rejected": -279.75103759765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2338900566101074, "rewards/margins": 16.945287704467773, "rewards/rejected": -20.17917823791504, "step": 320 }, { "epoch": 1.3888888888888888, "grad_norm": 0.09130859375, "learning_rate": 1.0799566120456133e-05, "logits/chosen": -1.0158692598342896, "logits/rejected": -0.25889790058135986, "logps/chosen": -74.53166198730469, "logps/rejected": -277.46502685546875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4598422050476074, "rewards/margins": 16.516708374023438, "rewards/rejected": -19.976551055908203, "step": 325 }, { "epoch": 1.4102564102564101, "grad_norm": 0.0211181640625, "learning_rate": 1.066655209442054e-05, "logits/chosen": -1.0357223749160767, "logits/rejected": -0.3092297315597534, "logps/chosen": -74.45147705078125, "logps/rejected": -280.28125, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -3.444270372390747, "rewards/margins": 16.66710090637207, "rewards/rejected": -20.111371994018555, "step": 330 }, { "epoch": 1.4316239316239316, "grad_norm": 0.033447265625, "learning_rate": 1.0532947122117101e-05, "logits/chosen": -1.0367907285690308, "logits/rejected": -0.30450788140296936, "logps/chosen": -74.00047302246094, "logps/rejected": -280.2533264160156, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4167351722717285, "rewards/margins": 16.851306915283203, "rewards/rejected": -20.268043518066406, "step": 335 }, { "epoch": 1.452991452991453, "grad_norm": 0.05126953125, "learning_rate": 1.0398825298841499e-05, "logits/chosen": -1.0600817203521729, "logits/rejected": -0.41858386993408203, "logps/chosen": -77.15374755859375, "logps/rejected": -283.68585205078125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.601487636566162, "rewards/margins": 16.883991241455078, "rewards/rejected": -20.4854793548584, "step": 340 }, { "epoch": 1.4743589743589745, "grad_norm": 0.0087890625, "learning_rate": 1.0264261006527144e-05, "logits/chosen": -1.1149675846099854, "logits/rejected": -0.4605252742767334, "logps/chosen": -73.10884094238281, "logps/rejected": -273.56329345703125, "loss": 0.0043, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.302705764770508, "rewards/margins": 16.43352508544922, "rewards/rejected": -19.73623275756836, "step": 345 }, { "epoch": 1.4957264957264957, "grad_norm": 0.056396484375, "learning_rate": 1.0129328872494075e-05, "logits/chosen": -1.2343705892562866, "logits/rejected": -0.5823469161987305, "logps/chosen": -70.9728775024414, "logps/rejected": -277.93389892578125, "loss": 0.0029, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.130044460296631, "rewards/margins": 16.93770980834961, "rewards/rejected": -20.067752838134766, "step": 350 }, { "epoch": 1.517094017094017, "grad_norm": 0.166015625, "learning_rate": 9.994103728061786e-06, "logits/chosen": -1.1977897882461548, "logits/rejected": -0.5562113523483276, "logps/chosen": -70.82041931152344, "logps/rejected": -274.3385009765625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.062682867050171, "rewards/margins": 16.56722640991211, "rewards/rejected": -19.62990951538086, "step": 355 }, { "epoch": 1.5384615384615383, "grad_norm": 0.00537109375, "learning_rate": 9.858660567048902e-06, "logits/chosen": -1.2067844867706299, "logits/rejected": -0.5067782998085022, "logps/chosen": -74.36605834960938, "logps/rejected": -289.45306396484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4267430305480957, "rewards/margins": 17.64672088623047, "rewards/rejected": -21.073461532592773, "step": 360 }, { "epoch": 1.5598290598290598, "grad_norm": 0.10302734375, "learning_rate": 9.72307450418274e-06, "logits/chosen": -1.2308647632598877, "logits/rejected": -0.5679563283920288, "logps/chosen": -69.85784912109375, "logps/rejected": -274.14263916015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9802072048187256, "rewards/margins": 16.728424072265625, "rewards/rejected": -19.708629608154297, "step": 365 }, { "epoch": 1.5811965811965814, "grad_norm": 0.041015625, "learning_rate": 9.587420733441835e-06, "logits/chosen": -1.227565050125122, "logits/rejected": -0.5805907249450684, "logps/chosen": -68.70391845703125, "logps/rejected": -279.85845947265625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.8893535137176514, "rewards/margins": 17.282146453857422, "rewards/rejected": -20.171499252319336, "step": 370 }, { "epoch": 1.6025641025641026, "grad_norm": 0.000698089599609375, "learning_rate": 9.45177448635447e-06, "logits/chosen": -1.251037836074829, "logits/rejected": -0.515083372592926, "logps/chosen": -68.63763427734375, "logps/rejected": -280.8656921386719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9085798263549805, "rewards/margins": 17.550891876220703, "rewards/rejected": -20.459474563598633, "step": 375 }, { "epoch": 1.623931623931624, "grad_norm": 0.00811767578125, "learning_rate": 9.316210990276434e-06, "logits/chosen": -1.174579381942749, "logits/rejected": -0.5305734276771545, "logps/chosen": -68.64695739746094, "logps/rejected": -270.04510498046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.869532346725464, "rewards/margins": 16.443248748779297, "rewards/rejected": -19.31278419494629, "step": 380 }, { "epoch": 1.6452991452991452, "grad_norm": 0.002349853515625, "learning_rate": 9.18080542667105e-06, "logits/chosen": -1.2183802127838135, "logits/rejected": -0.5133947134017944, "logps/chosen": -69.98764038085938, "logps/rejected": -290.1512756347656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.9592034816741943, "rewards/margins": 18.04782485961914, "rewards/rejected": -21.007028579711914, "step": 385 }, { "epoch": 1.6666666666666665, "grad_norm": 0.01348876953125, "learning_rate": 9.045632889414686e-06, "logits/chosen": -1.2081528902053833, "logits/rejected": -0.5276485681533813, "logps/chosen": -68.10844421386719, "logps/rejected": -281.4985046386719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.8550543785095215, "rewards/margins": 17.489383697509766, "rewards/rejected": -20.344438552856445, "step": 390 }, { "epoch": 1.688034188034188, "grad_norm": 0.00848388671875, "learning_rate": 8.910768343150828e-06, "logits/chosen": -1.1923013925552368, "logits/rejected": -0.5268815755844116, "logps/chosen": -69.50736999511719, "logps/rejected": -282.33880615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0535500049591064, "rewards/margins": 17.47011375427246, "rewards/rejected": -20.523662567138672, "step": 395 }, { "epoch": 1.7094017094017095, "grad_norm": 0.001434326171875, "learning_rate": 8.77628658171581e-06, "logits/chosen": -1.2103922367095947, "logits/rejected": -0.5478588342666626, "logps/chosen": -69.14801025390625, "logps/rejected": -277.53424072265625, "loss": 0.006, "rewards/accuracies": 0.9958332777023315, "rewards/chosen": -2.929593563079834, "rewards/margins": 16.97675895690918, "rewards/rejected": -19.906352996826172, "step": 400 }, { "epoch": 1.7307692307692308, "grad_norm": 0.091796875, "learning_rate": 8.642262186659298e-06, "logits/chosen": -1.1698075532913208, "logits/rejected": -0.520237147808075, "logps/chosen": -69.62812805175781, "logps/rejected": -279.01776123046875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.9473519325256348, "rewards/margins": 17.070568084716797, "rewards/rejected": -20.01791763305664, "step": 405 }, { "epoch": 1.7521367521367521, "grad_norm": 0.004058837890625, "learning_rate": 8.508769485882487e-06, "logits/chosen": -1.2100353240966797, "logits/rejected": -0.566001296043396, "logps/chosen": -73.57793426513672, "logps/rejected": -278.8910217285156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.203927516937256, "rewards/margins": 16.867618560791016, "rewards/rejected": -20.07154655456543, "step": 410 }, { "epoch": 1.7735042735042734, "grad_norm": 0.78515625, "learning_rate": 8.375882512416969e-06, "logits/chosen": -1.1643860340118408, "logits/rejected": -0.4784732758998871, "logps/chosen": -72.14900207519531, "logps/rejected": -282.3783264160156, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.1646692752838135, "rewards/margins": 17.262126922607422, "rewards/rejected": -20.426794052124023, "step": 415 }, { "epoch": 1.7948717948717947, "grad_norm": 0.0245361328125, "learning_rate": 8.243674963367137e-06, "logits/chosen": -1.170971393585205, "logits/rejected": -0.4777548909187317, "logps/chosen": -74.64592742919922, "logps/rejected": -281.1402587890625, "loss": 0.0031, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.446406602859497, "rewards/margins": 16.716421127319336, "rewards/rejected": -20.162830352783203, "step": 420 }, { "epoch": 1.8162393162393162, "grad_norm": 0.00127410888671875, "learning_rate": 8.11222015903888e-06, "logits/chosen": -1.1845993995666504, "logits/rejected": -0.47238603234291077, "logps/chosen": -75.17137145996094, "logps/rejected": -286.2430725097656, "loss": 0.0046, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.5714709758758545, "rewards/margins": 17.298736572265625, "rewards/rejected": -20.87020492553711, "step": 425 }, { "epoch": 1.8376068376068377, "grad_norm": 0.0174560546875, "learning_rate": 7.981591002277265e-06, "logits/chosen": -1.1712948083877563, "logits/rejected": -0.4820891320705414, "logps/chosen": -74.11454010009766, "logps/rejected": -281.625732421875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.407923460006714, "rewards/margins": 17.11835479736328, "rewards/rejected": -20.526281356811523, "step": 430 }, { "epoch": 1.858974358974359, "grad_norm": 0.01104736328125, "learning_rate": 7.851859938035712e-06, "logits/chosen": -1.149280309677124, "logits/rejected": -0.46685323119163513, "logps/chosen": -75.06929016113281, "logps/rejected": -291.274169921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.454303026199341, "rewards/margins": 17.817852020263672, "rewards/rejected": -21.272151947021484, "step": 435 }, { "epoch": 1.8803418803418803, "grad_norm": 0.0277099609375, "learning_rate": 7.723098913199118e-06, "logits/chosen": -1.1872258186340332, "logits/rejected": -0.4977366328239441, "logps/chosen": -72.54876708984375, "logps/rejected": -279.2344665527344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3534111976623535, "rewards/margins": 17.049516677856445, "rewards/rejected": -20.402929306030273, "step": 440 }, { "epoch": 1.9017094017094016, "grad_norm": 0.01104736328125, "learning_rate": 7.595379336683204e-06, "logits/chosen": -1.1503307819366455, "logits/rejected": -0.40190115571022034, "logps/chosen": -70.95316314697266, "logps/rejected": -287.2573547363281, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2571163177490234, "rewards/margins": 17.867298126220703, "rewards/rejected": -21.124412536621094, "step": 445 }, { "epoch": 1.9230769230769231, "grad_norm": 0.00060272216796875, "learning_rate": 7.468772039832218e-06, "logits/chosen": -1.1172707080841064, "logits/rejected": -0.3832516074180603, "logps/chosen": -66.7195816040039, "logps/rejected": -281.6941833496094, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.839646816253662, "rewards/margins": 17.725854873657227, "rewards/rejected": -20.56549835205078, "step": 450 }, { "epoch": 1.9444444444444444, "grad_norm": 0.0028839111328125, "learning_rate": 7.3433472371369404e-06, "logits/chosen": -1.1930923461914062, "logits/rejected": -0.48473644256591797, "logps/chosen": -72.68934631347656, "logps/rejected": -284.6398010253906, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.318150281906128, "rewards/margins": 17.357219696044922, "rewards/rejected": -20.675371170043945, "step": 455 }, { "epoch": 1.965811965811966, "grad_norm": 0.000492095947265625, "learning_rate": 7.219174487294784e-06, "logits/chosen": -1.1722862720489502, "logits/rejected": -0.47326725721359253, "logps/chosen": -70.49462890625, "logps/rejected": -287.0985107421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0771074295043945, "rewards/margins": 17.8206844329834, "rewards/rejected": -20.89779281616211, "step": 460 }, { "epoch": 1.9871794871794872, "grad_norm": 0.00872802734375, "learning_rate": 7.0963226546336e-06, "logits/chosen": -1.20412278175354, "logits/rejected": -0.5105618238449097, "logps/chosen": -70.91376495361328, "logps/rejected": -280.29852294921875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.1506786346435547, "rewards/margins": 17.16250228881836, "rewards/rejected": -20.313182830810547, "step": 465 }, { "epoch": 1.9914529914529915, "eval_logits/chosen": -1.2244133949279785, "eval_logits/rejected": -0.5135009288787842, "eval_logps/chosen": -72.20535278320312, "eval_logps/rejected": -285.3635559082031, "eval_loss": 4.7980323870433494e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.276163101196289, "eval_rewards/margins": 17.45850372314453, "eval_rewards/rejected": -20.734668731689453, "eval_runtime": 9.4345, "eval_samples_per_second": 21.199, "eval_steps_per_second": 21.199, "step": 466 }, { "epoch": 2.0085470085470085, "grad_norm": 0.024169921875, "learning_rate": 6.974859870920561e-06, "logits/chosen": -1.1253283023834229, "logits/rejected": -0.4625110626220703, "logps/chosen": -72.798095703125, "logps/rejected": -279.7697448730469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.3227851390838623, "rewards/margins": 17.012950897216797, "rewards/rejected": -20.335737228393555, "step": 470 }, { "epoch": 2.02991452991453, "grad_norm": 0.00131988525390625, "learning_rate": 6.8548534975773135e-06, "logits/chosen": -1.179386854171753, "logits/rejected": -0.49411916732788086, "logps/chosen": -73.65827178955078, "logps/rejected": -285.2862243652344, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.295319080352783, "rewards/margins": 17.375682830810547, "rewards/rejected": -20.671003341674805, "step": 475 }, { "epoch": 2.051282051282051, "grad_norm": 0.189453125, "learning_rate": 6.736370088322359e-06, "logits/chosen": -1.1767910718917847, "logits/rejected": -0.4558785557746887, "logps/chosen": -72.45469665527344, "logps/rejected": -282.46148681640625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -3.309537410736084, "rewards/margins": 17.340744018554688, "rewards/rejected": -20.65028190612793, "step": 480 }, { "epoch": 2.072649572649573, "grad_norm": 0.016357421875, "learning_rate": 6.619475352261356e-06, "logits/chosen": -1.1392714977264404, "logits/rejected": -0.45718201994895935, "logps/chosen": -76.55453491210938, "logps/rejected": -285.7558898925781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4542293548583984, "rewards/margins": 17.28127670288086, "rewards/rejected": -20.73550796508789, "step": 485 }, { "epoch": 2.094017094017094, "grad_norm": 0.0069580078125, "learning_rate": 6.504234117445857e-06, "logits/chosen": -1.1605439186096191, "logits/rejected": -0.4634723663330078, "logps/chosen": -72.77029418945312, "logps/rejected": -282.7125549316406, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3726367950439453, "rewards/margins": 17.24945068359375, "rewards/rejected": -20.622087478637695, "step": 490 }, { "epoch": 2.1153846153846154, "grad_norm": 0.007354736328125, "learning_rate": 6.39071029492065e-06, "logits/chosen": -1.1251627206802368, "logits/rejected": -0.4109951853752136, "logps/chosen": -72.30722045898438, "logps/rejected": -281.81280517578125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.365689754486084, "rewards/margins": 17.244686126708984, "rewards/rejected": -20.610374450683594, "step": 495 }, { "epoch": 2.1367521367521367, "grad_norm": 0.13671875, "learning_rate": 6.2789668432796535e-06, "logits/chosen": -1.1276848316192627, "logits/rejected": -0.4581735134124756, "logps/chosen": -73.38197326660156, "logps/rejected": -283.4115295410156, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4196181297302246, "rewards/margins": 17.356618881225586, "rewards/rejected": -20.7762393951416, "step": 500 }, { "epoch": 2.1367521367521367, "eval_logits/chosen": -1.2213764190673828, "eval_logits/rejected": -0.508858859539032, "eval_logps/chosen": -72.31729125976562, "eval_logps/rejected": -285.5926818847656, "eval_loss": 4.8542991862632334e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.2873549461364746, "eval_rewards/margins": 17.470226287841797, "eval_rewards/rejected": -20.757583618164062, "eval_runtime": 9.4725, "eval_samples_per_second": 21.114, "eval_steps_per_second": 21.114, "step": 500 }, { "epoch": 2.158119658119658, "grad_norm": 0.006744384765625, "learning_rate": 6.16906573375004e-06, "logits/chosen": -1.1647155284881592, "logits/rejected": -0.5106357336044312, "logps/chosen": -72.9385986328125, "logps/rejected": -281.58062744140625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.3363139629364014, "rewards/margins": 17.115955352783203, "rewards/rejected": -20.452272415161133, "step": 505 }, { "epoch": 2.1794871794871793, "grad_norm": 0.033203125, "learning_rate": 6.061067915823923e-06, "logits/chosen": -1.102561593055725, "logits/rejected": -0.4104360044002533, "logps/chosen": -70.00373077392578, "logps/rejected": -283.4765319824219, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.082702159881592, "rewards/margins": 17.501201629638672, "rewards/rejected": -20.583904266357422, "step": 510 }, { "epoch": 2.200854700854701, "grad_norm": 0.04150390625, "learning_rate": 5.955033283456711e-06, "logits/chosen": -1.136200189590454, "logits/rejected": -0.4127614498138428, "logps/chosen": -77.00238037109375, "logps/rejected": -293.89617919921875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.632338762283325, "rewards/margins": 17.84152603149414, "rewards/rejected": -21.473865509033203, "step": 515 }, { "epoch": 2.2222222222222223, "grad_norm": 0.00136566162109375, "learning_rate": 5.8510206418507914e-06, "logits/chosen": -1.2033765316009521, "logits/rejected": -0.5239652395248413, "logps/chosen": -74.33930969238281, "logps/rejected": -299.2998352050781, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.386591672897339, "rewards/margins": 18.531057357788086, "rewards/rejected": -21.91765022277832, "step": 520 }, { "epoch": 2.2435897435897436, "grad_norm": 0.003875732421875, "learning_rate": 5.749087674843095e-06, "logits/chosen": -1.139147400856018, "logits/rejected": -0.46916326880455017, "logps/chosen": -68.5783920288086, "logps/rejected": -284.41552734375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0137269496917725, "rewards/margins": 17.694652557373047, "rewards/rejected": -20.708377838134766, "step": 525 }, { "epoch": 2.264957264957265, "grad_norm": 0.01904296875, "learning_rate": 5.649290912914482e-06, "logits/chosen": -1.1451623439788818, "logits/rejected": -0.47735461592674255, "logps/chosen": -77.09215545654297, "logps/rejected": -298.22796630859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.572205066680908, "rewards/margins": 18.168258666992188, "rewards/rejected": -21.740463256835938, "step": 530 }, { "epoch": 2.286324786324786, "grad_norm": 0.036865234375, "learning_rate": 5.5516857018388144e-06, "logits/chosen": -1.187374472618103, "logits/rejected": -0.5284038782119751, "logps/chosen": -71.90570831298828, "logps/rejected": -280.32940673828125, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.206554412841797, "rewards/margins": 17.105445861816406, "rewards/rejected": -20.312000274658203, "step": 535 }, { "epoch": 2.3076923076923075, "grad_norm": 0.00921630859375, "learning_rate": 5.456326171989005e-06, "logits/chosen": -1.15079927444458, "logits/rejected": -0.4923780858516693, "logps/chosen": -70.75343322753906, "logps/rejected": -297.9629821777344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.109229564666748, "rewards/margins": 18.49727439880371, "rewards/rejected": -21.60650634765625, "step": 540 }, { "epoch": 2.3290598290598292, "grad_norm": 0.00830078125, "learning_rate": 5.363265208317156e-06, "logits/chosen": -1.1182730197906494, "logits/rejected": -0.4831443727016449, "logps/chosen": -71.95735931396484, "logps/rejected": -281.10089111328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.26196551322937, "rewards/margins": 17.25033950805664, "rewards/rejected": -20.512304306030273, "step": 545 }, { "epoch": 2.3504273504273505, "grad_norm": 0.006072998046875, "learning_rate": 5.272554421025347e-06, "logits/chosen": -1.1618000268936157, "logits/rejected": -0.45643243193626404, "logps/chosen": -72.98902893066406, "logps/rejected": -289.97998046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.336233139038086, "rewards/margins": 17.833538055419922, "rewards/rejected": -21.169771194458008, "step": 550 }, { "epoch": 2.371794871794872, "grad_norm": 0.008544921875, "learning_rate": 5.184244116943411e-06, "logits/chosen": -1.151729941368103, "logits/rejected": -0.4680960774421692, "logps/chosen": -72.05565643310547, "logps/rejected": -286.6762390136719, "loss": 0.003, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.189112901687622, "rewards/margins": 17.703588485717773, "rewards/rejected": -20.892702102661133, "step": 555 }, { "epoch": 2.393162393162393, "grad_norm": 0.00457763671875, "learning_rate": 5.098383271629512e-06, "logits/chosen": -1.169447660446167, "logits/rejected": -0.4685635566711426, "logps/chosen": -72.01811218261719, "logps/rejected": -279.0551452636719, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2659707069396973, "rewards/margins": 17.044937133789062, "rewards/rejected": -20.3109073638916, "step": 560 }, { "epoch": 2.4145299145299144, "grad_norm": 0.002227783203125, "learning_rate": 5.015019502209056e-06, "logits/chosen": -1.1631286144256592, "logits/rejected": -0.46378326416015625, "logps/chosen": -70.7663803100586, "logps/rejected": -276.07537841796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2191970348358154, "rewards/margins": 16.797992706298828, "rewards/rejected": -20.017189025878906, "step": 565 }, { "epoch": 2.435897435897436, "grad_norm": 0.000713348388671875, "learning_rate": 4.934199040966955e-06, "logits/chosen": -1.185304045677185, "logits/rejected": -0.4863820970058441, "logps/chosen": -71.80037689208984, "logps/rejected": -280.21148681640625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.2629001140594482, "rewards/margins": 17.172740936279297, "rewards/rejected": -20.435644149780273, "step": 570 }, { "epoch": 2.4572649572649574, "grad_norm": 0.00372314453125, "learning_rate": 4.855966709707881e-06, "logits/chosen": -1.1501516103744507, "logits/rejected": -0.5059275031089783, "logps/chosen": -75.91123962402344, "logps/rejected": -286.08929443359375, "loss": 0.0029, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.5484020709991455, "rewards/margins": 17.334753036499023, "rewards/rejected": -20.88315773010254, "step": 575 }, { "epoch": 2.4786324786324787, "grad_norm": 0.01190185546875, "learning_rate": 4.780365894898799e-06, "logits/chosen": -1.1519519090652466, "logits/rejected": -0.47328823804855347, "logps/chosen": -73.0169906616211, "logps/rejected": -286.8009948730469, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.299699068069458, "rewards/margins": 17.556556701660156, "rewards/rejected": -20.856258392333984, "step": 580 }, { "epoch": 2.5, "grad_norm": 0.007354736328125, "learning_rate": 4.7074385236074684e-06, "logits/chosen": -1.1783647537231445, "logits/rejected": -0.45893925428390503, "logps/chosen": -77.3306655883789, "logps/rejected": -294.39862060546875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -3.5838446617126465, "rewards/margins": 17.903881072998047, "rewards/rejected": -21.487728118896484, "step": 585 }, { "epoch": 2.5213675213675213, "grad_norm": 0.0615234375, "learning_rate": 4.63722504025034e-06, "logits/chosen": -1.1400740146636963, "logits/rejected": -0.455253541469574, "logps/chosen": -70.24212646484375, "logps/rejected": -285.1997985839844, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -3.0748324394226074, "rewards/margins": 17.681386947631836, "rewards/rejected": -20.7562198638916, "step": 590 }, { "epoch": 2.5427350427350426, "grad_norm": 0.00384521484375, "learning_rate": 4.569764384162676e-06, "logits/chosen": -1.1541723012924194, "logits/rejected": -0.43652376532554626, "logps/chosen": -65.93037414550781, "logps/rejected": -284.7110290527344, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -2.755821943283081, "rewards/margins": 17.971193313598633, "rewards/rejected": -20.727014541625977, "step": 595 }, { "epoch": 2.564102564102564, "grad_norm": 0.0198974609375, "learning_rate": 4.50509396800341e-06, "logits/chosen": -1.0580496788024902, "logits/rejected": -0.334017276763916, "logps/chosen": -72.97813415527344, "logps/rejected": -287.36724853515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.329857587814331, "rewards/margins": 17.7404727935791, "rewards/rejected": -21.070331573486328, "step": 600 }, { "epoch": 2.5854700854700856, "grad_norm": 0.02490234375, "learning_rate": 4.443249657006627e-06, "logits/chosen": -1.0786378383636475, "logits/rejected": -0.3501695990562439, "logps/chosen": -70.572509765625, "logps/rejected": -291.306396484375, "loss": 0.0029, "rewards/accuracies": 0.9958333969116211, "rewards/chosen": -3.165311813354492, "rewards/margins": 18.223491668701172, "rewards/rejected": -21.388805389404297, "step": 605 }, { "epoch": 2.606837606837607, "grad_norm": 0.00946044921875, "learning_rate": 4.384265749091266e-06, "logits/chosen": -1.050445318222046, "logits/rejected": -0.34452471137046814, "logps/chosen": -78.69644927978516, "logps/rejected": -289.40362548828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -3.7829060554504395, "rewards/margins": 17.385984420776367, "rewards/rejected": -21.16888999938965, "step": 610 }, { "epoch": 2.628205128205128, "grad_norm": 0.054443359375, "learning_rate": 4.328174955840002e-06, "logits/chosen": -1.0618460178375244, "logits/rejected": -0.329507052898407, "logps/chosen": -67.98468017578125, "logps/rejected": -282.68719482421875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.0549356937408447, "rewards/margins": 17.611309051513672, "rewards/rejected": -20.666244506835938, "step": 615 }, { "epoch": 2.6495726495726495, "grad_norm": 0.006683349609375, "learning_rate": 4.275008384357902e-06, "logits/chosen": -1.0847241878509521, "logits/rejected": -0.37132930755615234, "logps/chosen": -72.32841491699219, "logps/rejected": -284.59033203125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2704505920410156, "rewards/margins": 17.539974212646484, "rewards/rejected": -20.810426712036133, "step": 620 }, { "epoch": 2.6709401709401708, "grad_norm": 0.0023651123046875, "learning_rate": 4.224795520020898e-06, "logits/chosen": -1.0495777130126953, "logits/rejected": -0.3218225836753845, "logps/chosen": -75.63634490966797, "logps/rejected": -283.78515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.4399490356445312, "rewards/margins": 17.156658172607422, "rewards/rejected": -20.596609115600586, "step": 625 }, { "epoch": 2.6923076923076925, "grad_norm": 0.00848388671875, "learning_rate": 4.177564210123634e-06, "logits/chosen": -1.0822218656539917, "logits/rejected": -0.37178927659988403, "logps/chosen": -71.94633483886719, "logps/rejected": -292.45965576171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.2282912731170654, "rewards/margins": 18.118127822875977, "rewards/rejected": -21.346416473388672, "step": 630 }, { "epoch": 2.713675213675214, "grad_norm": 0.0595703125, "learning_rate": 4.133340648435789e-06, "logits/chosen": -1.0735520124435425, "logits/rejected": -0.33007147908210754, "logps/chosen": -74.36048889160156, "logps/rejected": -289.21795654296875, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.496561050415039, "rewards/margins": 17.668859481811523, "rewards/rejected": -21.16541862487793, "step": 635 }, { "epoch": 2.735042735042735, "grad_norm": 0.0281982421875, "learning_rate": 4.092149360675402e-06, "logits/chosen": -1.0645346641540527, "logits/rejected": -0.37463390827178955, "logps/chosen": -78.34007263183594, "logps/rejected": -292.8553771972656, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6229121685028076, "rewards/margins": 17.589675903320312, "rewards/rejected": -21.212587356567383, "step": 640 }, { "epoch": 2.7564102564102564, "grad_norm": 0.034912109375, "learning_rate": 4.054013190907282e-06, "logits/chosen": -1.039671540260315, "logits/rejected": -0.3333393931388855, "logps/chosen": -69.4139404296875, "logps/rejected": -283.67291259765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.0979743003845215, "rewards/margins": 17.693653106689453, "rewards/rejected": -20.7916259765625, "step": 645 }, { "epoch": 2.7777777777777777, "grad_norm": 0.0034332275390625, "learning_rate": 4.018953288874035e-06, "logits/chosen": -1.0656228065490723, "logits/rejected": -0.3747131824493408, "logps/chosen": -73.5749282836914, "logps/rejected": -289.0091247558594, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.367324113845825, "rewards/margins": 17.781917572021484, "rewards/rejected": -21.149242401123047, "step": 650 }, { "epoch": 2.799145299145299, "grad_norm": 0.00098419189453125, "learning_rate": 3.9869890982667385e-06, "logits/chosen": -1.0841352939605713, "logits/rejected": -0.30320778489112854, "logps/chosen": -71.91044616699219, "logps/rejected": -290.2735595703125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.256523609161377, "rewards/margins": 17.801498413085938, "rewards/rejected": -21.058025360107422, "step": 655 }, { "epoch": 2.8205128205128203, "grad_norm": 0.00970458984375, "learning_rate": 3.9581383459417625e-06, "logits/chosen": -1.052257776260376, "logits/rejected": -0.34810546040534973, "logps/chosen": -78.00955963134766, "logps/rejected": -298.2610168457031, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.769831418991089, "rewards/margins": 18.104028701782227, "rewards/rejected": -21.873859405517578, "step": 660 }, { "epoch": 2.841880341880342, "grad_norm": 0.004974365234375, "learning_rate": 3.932417032089722e-06, "logits/chosen": -1.109933614730835, "logits/rejected": -0.38616496324539185, "logps/chosen": -75.1268081665039, "logps/rejected": -294.04522705078125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.484327793121338, "rewards/margins": 18.015382766723633, "rewards/rejected": -21.499710083007812, "step": 665 }, { "epoch": 2.8632478632478633, "grad_norm": 0.00125885009765625, "learning_rate": 3.909839421362017e-06, "logits/chosen": -1.0365312099456787, "logits/rejected": -0.3303161561489105, "logps/chosen": -75.15225219726562, "logps/rejected": -289.403076171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.5623269081115723, "rewards/margins": 17.46319007873535, "rewards/rejected": -21.025516510009766, "step": 670 }, { "epoch": 2.8846153846153846, "grad_norm": 0.039794921875, "learning_rate": 3.890418034959871e-06, "logits/chosen": -1.0185749530792236, "logits/rejected": -0.24934515357017517, "logps/chosen": -73.84525299072266, "logps/rejected": -287.80670166015625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.451679229736328, "rewards/margins": 17.665843963623047, "rewards/rejected": -21.117523193359375, "step": 675 }, { "epoch": 2.905982905982906, "grad_norm": 0.05517578125, "learning_rate": 3.874163643690263e-06, "logits/chosen": -1.0194957256317139, "logits/rejected": -0.2993074655532837, "logps/chosen": -80.23220825195312, "logps/rejected": -291.32550048828125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9078011512756348, "rewards/margins": 17.403505325317383, "rewards/rejected": -21.31130599975586, "step": 680 }, { "epoch": 2.9273504273504276, "grad_norm": 0.01055908203125, "learning_rate": 3.861085261992599e-06, "logits/chosen": -1.0856374502182007, "logits/rejected": -0.4421593248844147, "logps/chosen": -77.59326171875, "logps/rejected": -291.57843017578125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.7359116077423096, "rewards/margins": 17.531463623046875, "rewards/rejected": -21.267375946044922, "step": 685 }, { "epoch": 2.948717948717949, "grad_norm": 0.068359375, "learning_rate": 3.851190142939442e-06, "logits/chosen": -1.0751783847808838, "logits/rejected": -0.3822060823440552, "logps/chosen": -71.5597915649414, "logps/rejected": -287.9521179199219, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.1508712768554688, "rewards/margins": 17.788990020751953, "rewards/rejected": -20.939861297607422, "step": 690 }, { "epoch": 2.97008547008547, "grad_norm": 0.005615234375, "learning_rate": 3.844483774214069e-06, "logits/chosen": -1.044440507888794, "logits/rejected": -0.2854236960411072, "logps/chosen": -70.91199493408203, "logps/rejected": -288.72283935546875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.184840679168701, "rewards/margins": 17.9717960357666, "rewards/rejected": -21.156635284423828, "step": 695 }, { "epoch": 2.9871794871794872, "eval_logits/chosen": -1.1473212242126465, "eval_logits/rejected": -0.4121095538139343, "eval_logps/chosen": -73.0550308227539, "eval_logps/rejected": -287.1957092285156, "eval_loss": 4.4729193177772686e-05, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -3.3611295223236084, "eval_rewards/margins": 17.5567569732666, "eval_rewards/rejected": -20.917884826660156, "eval_runtime": 9.4413, "eval_samples_per_second": 21.183, "eval_steps_per_second": 21.183, "step": 699 } ], "logging_steps": 5, "max_steps": 702, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }