{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.6058631921824107, "eval_steps": 50, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013029315960912053, "grad_norm": 41.75, "learning_rate": 1.6666666666666667e-06, "logits/chosen": 0.4338657259941101, "logits/rejected": 0.4453325867652893, "logps/chosen": -67.76948547363281, "logps/rejected": -152.9691162109375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.026058631921824105, "grad_norm": 36.25, "learning_rate": 3.3333333333333333e-06, "logits/chosen": 0.3402215540409088, "logits/rejected": 0.3878844380378723, "logps/chosen": -98.9161148071289, "logps/rejected": -155.82638549804688, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.03908794788273615, "grad_norm": 64.5, "learning_rate": 5e-06, "logits/chosen": 0.38514813780784607, "logits/rejected": 0.36703822016716003, "logps/chosen": -93.1368408203125, "logps/rejected": -161.52493286132812, "loss": 0.6983, "rewards/accuracies": 0.5, "rewards/chosen": -0.0311676524579525, "rewards/margins": -0.002570953220129013, "rewards/rejected": -0.028596699237823486, "step": 3 }, { "epoch": 0.05211726384364821, "grad_norm": 29.875, "learning_rate": 6.666666666666667e-06, "logits/chosen": 0.4961632192134857, "logits/rejected": 0.49073392152786255, "logps/chosen": -94.36677551269531, "logps/rejected": -176.82952880859375, "loss": 0.7247, "rewards/accuracies": 0.40625, "rewards/chosen": -0.008490505628287792, "rewards/margins": -0.055457405745983124, "rewards/rejected": 0.04696689918637276, "step": 4 }, { "epoch": 0.06514657980456026, "grad_norm": 27.5, "learning_rate": 8.333333333333334e-06, "logits/chosen": 0.3893408179283142, "logits/rejected": 0.41501885652542114, "logps/chosen": -91.56944274902344, "logps/rejected": -141.12969970703125, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": 0.02342848852276802, "rewards/margins": 0.03252270072698593, "rewards/rejected": -0.009094213135540485, "step": 5 }, { "epoch": 0.0781758957654723, "grad_norm": 28.375, "learning_rate": 1e-05, "logits/chosen": 0.4950886070728302, "logits/rejected": 0.5048765540122986, "logps/chosen": -79.60177612304688, "logps/rejected": -174.52386474609375, "loss": 0.6915, "rewards/accuracies": 0.625, "rewards/chosen": 0.0013483259826898575, "rewards/margins": 0.008842225186526775, "rewards/rejected": -0.007493901532143354, "step": 6 }, { "epoch": 0.09120521172638436, "grad_norm": 44.25, "learning_rate": 1.1666666666666668e-05, "logits/chosen": 0.3866894543170929, "logits/rejected": 0.4369007349014282, "logps/chosen": -73.19027709960938, "logps/rejected": -144.08810424804688, "loss": 0.7116, "rewards/accuracies": 0.46875, "rewards/chosen": -0.015920385718345642, "rewards/margins": -0.026944227516651154, "rewards/rejected": 0.011023844592273235, "step": 7 }, { "epoch": 0.10423452768729642, "grad_norm": 63.75, "learning_rate": 1.3333333333333333e-05, "logits/chosen": 0.45976200699806213, "logits/rejected": 0.426272988319397, "logps/chosen": -71.57977294921875, "logps/rejected": -137.3433074951172, "loss": 0.6707, "rewards/accuracies": 0.71875, "rewards/chosen": 0.01595836505293846, "rewards/margins": 0.04949212074279785, "rewards/rejected": -0.03353375196456909, "step": 8 }, { "epoch": 0.11726384364820847, "grad_norm": 54.75, "learning_rate": 1.5e-05, "logits/chosen": 0.49033746123313904, "logits/rejected": 0.48075181245803833, "logps/chosen": -91.1353759765625, "logps/rejected": -167.73594665527344, "loss": 0.6547, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04745086282491684, "rewards/margins": 0.08511507511138916, "rewards/rejected": -0.03766421973705292, "step": 9 }, { "epoch": 0.13029315960912052, "grad_norm": 33.5, "learning_rate": 1.6666666666666667e-05, "logits/chosen": 0.5154792070388794, "logits/rejected": 0.4838900566101074, "logps/chosen": -96.14872741699219, "logps/rejected": -157.02932739257812, "loss": 0.6958, "rewards/accuracies": 0.46875, "rewards/chosen": 0.024153033271431923, "rewards/margins": 0.006197445094585419, "rewards/rejected": 0.017955590039491653, "step": 10 }, { "epoch": 0.14332247557003258, "grad_norm": 35.0, "learning_rate": 1.8333333333333333e-05, "logits/chosen": 0.45827457308769226, "logits/rejected": 0.5124724507331848, "logps/chosen": -93.97823333740234, "logps/rejected": -138.24327087402344, "loss": 0.699, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01763225719332695, "rewards/margins": 0.0005271416157484055, "rewards/rejected": -0.018159402534365654, "step": 11 }, { "epoch": 0.1563517915309446, "grad_norm": 34.5, "learning_rate": 2e-05, "logits/chosen": 0.4826943874359131, "logits/rejected": 0.43963971734046936, "logps/chosen": -98.74089050292969, "logps/rejected": -145.690185546875, "loss": 0.7101, "rewards/accuracies": 0.4375, "rewards/chosen": -0.030411405488848686, "rewards/margins": -0.028361458331346512, "rewards/rejected": -0.0020499457605183125, "step": 12 }, { "epoch": 0.16938110749185667, "grad_norm": 33.25, "learning_rate": 2.1666666666666667e-05, "logits/chosen": 0.384093314409256, "logits/rejected": 0.4154108166694641, "logps/chosen": -110.437744140625, "logps/rejected": -170.55215454101562, "loss": 0.7018, "rewards/accuracies": 0.53125, "rewards/chosen": -0.000756765715777874, "rewards/margins": -0.005527975037693977, "rewards/rejected": 0.004771207459270954, "step": 13 }, { "epoch": 0.18241042345276873, "grad_norm": 32.0, "learning_rate": 2.3333333333333336e-05, "logits/chosen": 0.3536284565925598, "logits/rejected": 0.4306492209434509, "logps/chosen": -87.72677612304688, "logps/rejected": -135.49493408203125, "loss": 0.7118, "rewards/accuracies": 0.375, "rewards/chosen": -0.030064944177865982, "rewards/margins": -0.03109516017138958, "rewards/rejected": 0.001030217856168747, "step": 14 }, { "epoch": 0.19543973941368079, "grad_norm": 32.5, "learning_rate": 2.5e-05, "logits/chosen": 0.4092313051223755, "logits/rejected": 0.5090660452842712, "logps/chosen": -95.63008117675781, "logps/rejected": -135.93472290039062, "loss": 0.6946, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0016081184148788452, "rewards/margins": 0.002313855104148388, "rewards/rejected": -0.003921976778656244, "step": 15 }, { "epoch": 0.20846905537459284, "grad_norm": 33.25, "learning_rate": 2.6666666666666667e-05, "logits/chosen": 0.4373230040073395, "logits/rejected": 0.5158215761184692, "logps/chosen": -115.45347595214844, "logps/rejected": -160.17929077148438, "loss": 0.6503, "rewards/accuracies": 0.625, "rewards/chosen": 0.015219582244753838, "rewards/margins": 0.10696868598461151, "rewards/rejected": -0.09174911677837372, "step": 16 }, { "epoch": 0.22149837133550487, "grad_norm": 31.25, "learning_rate": 2.8333333333333335e-05, "logits/chosen": 0.5184516906738281, "logits/rejected": 0.5677393674850464, "logps/chosen": -128.66629028320312, "logps/rejected": -172.19888305664062, "loss": 0.635, "rewards/accuracies": 0.78125, "rewards/chosen": -0.0017459085211157799, "rewards/margins": 0.12914448976516724, "rewards/rejected": -0.1308903992176056, "step": 17 }, { "epoch": 0.23452768729641693, "grad_norm": 60.5, "learning_rate": 3e-05, "logits/chosen": 0.43745332956314087, "logits/rejected": 0.4682745337486267, "logps/chosen": -108.17106628417969, "logps/rejected": -155.61282348632812, "loss": 0.6391, "rewards/accuracies": 0.75, "rewards/chosen": -0.010563232935965061, "rewards/margins": 0.1288895308971405, "rewards/rejected": -0.13945278525352478, "step": 18 }, { "epoch": 0.247557003257329, "grad_norm": 28.375, "learning_rate": 3.1666666666666666e-05, "logits/chosen": 0.4536093473434448, "logits/rejected": 0.4597874581813812, "logps/chosen": -80.29083251953125, "logps/rejected": -146.64483642578125, "loss": 0.6456, "rewards/accuracies": 0.78125, "rewards/chosen": -0.004713800735771656, "rewards/margins": 0.10411291569471359, "rewards/rejected": -0.10882672667503357, "step": 19 }, { "epoch": 0.26058631921824105, "grad_norm": 50.0, "learning_rate": 3.3333333333333335e-05, "logits/chosen": 0.46047478914260864, "logits/rejected": 0.5494062304496765, "logps/chosen": -103.00077056884766, "logps/rejected": -168.70933532714844, "loss": 0.6503, "rewards/accuracies": 0.6875, "rewards/chosen": -0.048282139003276825, "rewards/margins": 0.09549374878406525, "rewards/rejected": -0.14377588033676147, "step": 20 }, { "epoch": 0.2736156351791531, "grad_norm": 31.0, "learning_rate": 3.5e-05, "logits/chosen": 0.5022985935211182, "logits/rejected": 0.5251904726028442, "logps/chosen": -82.43826293945312, "logps/rejected": -148.17120361328125, "loss": 0.6247, "rewards/accuracies": 0.75, "rewards/chosen": 0.02719825878739357, "rewards/margins": 0.15447314083576202, "rewards/rejected": -0.12727488577365875, "step": 21 }, { "epoch": 0.28664495114006516, "grad_norm": 30.75, "learning_rate": 3.6666666666666666e-05, "logits/chosen": 0.4817676544189453, "logits/rejected": 0.4860598146915436, "logps/chosen": -101.01628875732422, "logps/rejected": -146.12977600097656, "loss": 0.622, "rewards/accuracies": 0.875, "rewards/chosen": -0.027572251856327057, "rewards/margins": 0.15721869468688965, "rewards/rejected": -0.1847909688949585, "step": 22 }, { "epoch": 0.2996742671009772, "grad_norm": 21.75, "learning_rate": 3.8333333333333334e-05, "logits/chosen": 0.48463064432144165, "logits/rejected": 0.5631467700004578, "logps/chosen": -81.53482055664062, "logps/rejected": -135.9483184814453, "loss": 0.5766, "rewards/accuracies": 0.90625, "rewards/chosen": 0.021168498322367668, "rewards/margins": 0.2705130875110626, "rewards/rejected": -0.2493445873260498, "step": 23 }, { "epoch": 0.3127035830618892, "grad_norm": 35.25, "learning_rate": 4e-05, "logits/chosen": 0.38634905219078064, "logits/rejected": 0.42648378014564514, "logps/chosen": -97.1165771484375, "logps/rejected": -161.6883087158203, "loss": 0.5806, "rewards/accuracies": 0.875, "rewards/chosen": -0.008925480768084526, "rewards/margins": 0.2537250518798828, "rewards/rejected": -0.2626505196094513, "step": 24 }, { "epoch": 0.3257328990228013, "grad_norm": 27.25, "learning_rate": 4.166666666666667e-05, "logits/chosen": 0.41833925247192383, "logits/rejected": 0.4584392011165619, "logps/chosen": -89.66869354248047, "logps/rejected": -150.55813598632812, "loss": 0.5952, "rewards/accuracies": 0.90625, "rewards/chosen": -0.019657809287309647, "rewards/margins": 0.21433238685131073, "rewards/rejected": -0.23399019241333008, "step": 25 }, { "epoch": 0.33876221498371334, "grad_norm": 50.5, "learning_rate": 4.3333333333333334e-05, "logits/chosen": 0.46740618348121643, "logits/rejected": 0.4832380712032318, "logps/chosen": -62.494773864746094, "logps/rejected": -146.53067016601562, "loss": 0.5411, "rewards/accuracies": 1.0, "rewards/chosen": -0.0009484302718192339, "rewards/margins": 0.3466818928718567, "rewards/rejected": -0.3476303815841675, "step": 26 }, { "epoch": 0.3517915309446254, "grad_norm": 23.0, "learning_rate": 4.5e-05, "logits/chosen": 0.45530009269714355, "logits/rejected": 0.5172832012176514, "logps/chosen": -85.00700378417969, "logps/rejected": -136.05020141601562, "loss": 0.5479, "rewards/accuracies": 0.90625, "rewards/chosen": -0.001965973526239395, "rewards/margins": 0.34270864725112915, "rewards/rejected": -0.34467458724975586, "step": 27 }, { "epoch": 0.36482084690553745, "grad_norm": 23.375, "learning_rate": 4.666666666666667e-05, "logits/chosen": 0.46558958292007446, "logits/rejected": 0.5210444331169128, "logps/chosen": -105.98873901367188, "logps/rejected": -163.59945678710938, "loss": 0.519, "rewards/accuracies": 1.0, "rewards/chosen": -0.022590279579162598, "rewards/margins": 0.4247127175331116, "rewards/rejected": -0.44730299711227417, "step": 28 }, { "epoch": 0.3778501628664495, "grad_norm": 22.75, "learning_rate": 4.8333333333333334e-05, "logits/chosen": 0.4795917868614197, "logits/rejected": 0.47115039825439453, "logps/chosen": -107.12705993652344, "logps/rejected": -142.822509765625, "loss": 0.5271, "rewards/accuracies": 0.9375, "rewards/chosen": -0.037488676607608795, "rewards/margins": 0.3942331075668335, "rewards/rejected": -0.4317218065261841, "step": 29 }, { "epoch": 0.39087947882736157, "grad_norm": 19.625, "learning_rate": 5e-05, "logits/chosen": 0.4289873242378235, "logits/rejected": 0.5595239996910095, "logps/chosen": -86.29112243652344, "logps/rejected": -172.88059997558594, "loss": 0.459, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05108689144253731, "rewards/margins": 0.5967621803283691, "rewards/rejected": -0.5456752777099609, "step": 30 }, { "epoch": 0.40390879478827363, "grad_norm": 24.375, "learning_rate": 4.993150684931507e-05, "logits/chosen": 0.39370930194854736, "logits/rejected": 0.42319971323013306, "logps/chosen": -102.44596862792969, "logps/rejected": -169.67660522460938, "loss": 0.4393, "rewards/accuracies": 0.96875, "rewards/chosen": -0.0332549586892128, "rewards/margins": 0.642684817314148, "rewards/rejected": -0.6759397387504578, "step": 31 }, { "epoch": 0.4169381107491857, "grad_norm": 20.0, "learning_rate": 4.986301369863014e-05, "logits/chosen": 0.49218329787254333, "logits/rejected": 0.5275806784629822, "logps/chosen": -74.05796813964844, "logps/rejected": -133.33255004882812, "loss": 0.4407, "rewards/accuracies": 1.0, "rewards/chosen": 0.02741517871618271, "rewards/margins": 0.6402420997619629, "rewards/rejected": -0.612826943397522, "step": 32 }, { "epoch": 0.42996742671009774, "grad_norm": 24.75, "learning_rate": 4.979452054794521e-05, "logits/chosen": 0.35451555252075195, "logits/rejected": 0.40355199575424194, "logps/chosen": -104.55900573730469, "logps/rejected": -151.27711486816406, "loss": 0.4234, "rewards/accuracies": 0.96875, "rewards/chosen": 0.023018483072519302, "rewards/margins": 0.6792783737182617, "rewards/rejected": -0.6562598943710327, "step": 33 }, { "epoch": 0.44299674267100975, "grad_norm": 17.5, "learning_rate": 4.972602739726028e-05, "logits/chosen": 0.40463435649871826, "logits/rejected": 0.5144488215446472, "logps/chosen": -72.91780090332031, "logps/rejected": -145.31849670410156, "loss": 0.4111, "rewards/accuracies": 0.96875, "rewards/chosen": 0.02165827713906765, "rewards/margins": 0.7402617931365967, "rewards/rejected": -0.7186034917831421, "step": 34 }, { "epoch": 0.4560260586319218, "grad_norm": 16.5, "learning_rate": 4.9657534246575346e-05, "logits/chosen": 0.4734452962875366, "logits/rejected": 0.5330387353897095, "logps/chosen": -83.89728546142578, "logps/rejected": -147.41265869140625, "loss": 0.3853, "rewards/accuracies": 1.0, "rewards/chosen": -0.009855479001998901, "rewards/margins": 0.8149614930152893, "rewards/rejected": -0.8248169422149658, "step": 35 }, { "epoch": 0.46905537459283386, "grad_norm": 24.75, "learning_rate": 4.958904109589041e-05, "logits/chosen": 0.3432111144065857, "logits/rejected": 0.39720407128334045, "logps/chosen": -84.57624053955078, "logps/rejected": -131.17434692382812, "loss": 0.4056, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020672655664384365, "rewards/margins": 0.7789303064346313, "rewards/rejected": -0.7809975743293762, "step": 36 }, { "epoch": 0.4820846905537459, "grad_norm": 68.5, "learning_rate": 4.952054794520548e-05, "logits/chosen": 0.3694133758544922, "logits/rejected": 0.42799627780914307, "logps/chosen": -85.02811431884766, "logps/rejected": -169.74673461914062, "loss": 0.3145, "rewards/accuracies": 0.96875, "rewards/chosen": 0.07135964930057526, "rewards/margins": 1.2428215742111206, "rewards/rejected": -1.171462059020996, "step": 37 }, { "epoch": 0.495114006514658, "grad_norm": 18.0, "learning_rate": 4.945205479452055e-05, "logits/chosen": 0.4724690318107605, "logits/rejected": 0.5161466598510742, "logps/chosen": -79.45156860351562, "logps/rejected": -183.5731201171875, "loss": 0.281, "rewards/accuracies": 1.0, "rewards/chosen": 0.08071783930063248, "rewards/margins": 1.4206629991531372, "rewards/rejected": -1.3399451971054077, "step": 38 }, { "epoch": 0.50814332247557, "grad_norm": 10.9375, "learning_rate": 4.938356164383562e-05, "logits/chosen": 0.570473313331604, "logits/rejected": 0.5667930841445923, "logps/chosen": -67.05783081054688, "logps/rejected": -160.54501342773438, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": 0.012804888188838959, "rewards/margins": 1.2680517435073853, "rewards/rejected": -1.255246877670288, "step": 39 }, { "epoch": 0.5211726384364821, "grad_norm": 15.0, "learning_rate": 4.9315068493150684e-05, "logits/chosen": 0.3750945031642914, "logits/rejected": 0.5399055480957031, "logps/chosen": -80.3337631225586, "logps/rejected": -150.540771484375, "loss": 0.2555, "rewards/accuracies": 1.0, "rewards/chosen": 0.09064020216464996, "rewards/margins": 1.4325942993164062, "rewards/rejected": -1.3419541120529175, "step": 40 }, { "epoch": 0.5342019543973942, "grad_norm": 17.625, "learning_rate": 4.9246575342465756e-05, "logits/chosen": 0.40898123383522034, "logits/rejected": 0.3948415219783783, "logps/chosen": -120.64512634277344, "logps/rejected": -172.23046875, "loss": 0.2607, "rewards/accuracies": 1.0, "rewards/chosen": 0.03111358918249607, "rewards/margins": 1.4379582405090332, "rewards/rejected": -1.4068448543548584, "step": 41 }, { "epoch": 0.5472312703583062, "grad_norm": 11.0, "learning_rate": 4.917808219178082e-05, "logits/chosen": 0.44859111309051514, "logits/rejected": 0.4527463912963867, "logps/chosen": -111.03682708740234, "logps/rejected": -175.25076293945312, "loss": 0.23, "rewards/accuracies": 1.0, "rewards/chosen": 0.07664196938276291, "rewards/margins": 1.6669435501098633, "rewards/rejected": -1.590301513671875, "step": 42 }, { "epoch": 0.5602605863192183, "grad_norm": 15.75, "learning_rate": 4.9109589041095895e-05, "logits/chosen": 0.4859389662742615, "logits/rejected": 0.5201914310455322, "logps/chosen": -78.25588989257812, "logps/rejected": -162.362548828125, "loss": 0.2227, "rewards/accuracies": 1.0, "rewards/chosen": 0.0701964795589447, "rewards/margins": 1.5760339498519897, "rewards/rejected": -1.5058374404907227, "step": 43 }, { "epoch": 0.5732899022801303, "grad_norm": 12.125, "learning_rate": 4.904109589041096e-05, "logits/chosen": 0.5065852403640747, "logits/rejected": 0.5527216196060181, "logps/chosen": -78.39152526855469, "logps/rejected": -183.5028839111328, "loss": 0.197, "rewards/accuracies": 1.0, "rewards/chosen": 0.0381561741232872, "rewards/margins": 1.9460369348526, "rewards/rejected": -1.9078807830810547, "step": 44 }, { "epoch": 0.5863192182410424, "grad_norm": 15.75, "learning_rate": 4.8972602739726034e-05, "logits/chosen": 0.5216741561889648, "logits/rejected": 0.6273947954177856, "logps/chosen": -74.12837982177734, "logps/rejected": -167.24652099609375, "loss": 0.1831, "rewards/accuracies": 1.0, "rewards/chosen": 0.13984212279319763, "rewards/margins": 2.0655643939971924, "rewards/rejected": -1.9257222414016724, "step": 45 }, { "epoch": 0.5993485342019544, "grad_norm": 15.125, "learning_rate": 4.89041095890411e-05, "logits/chosen": 0.5224686861038208, "logits/rejected": 0.5461165308952332, "logps/chosen": -101.55109405517578, "logps/rejected": -163.4028778076172, "loss": 0.1841, "rewards/accuracies": 1.0, "rewards/chosen": 0.01885811612010002, "rewards/margins": 1.9022661447525024, "rewards/rejected": -1.8834080696105957, "step": 46 }, { "epoch": 0.6123778501628665, "grad_norm": 10.3125, "learning_rate": 4.8835616438356167e-05, "logits/chosen": 0.438764363527298, "logits/rejected": 0.5729016661643982, "logps/chosen": -73.1627426147461, "logps/rejected": -153.8181610107422, "loss": 0.1734, "rewards/accuracies": 1.0, "rewards/chosen": 0.08434567600488663, "rewards/margins": 2.0392439365386963, "rewards/rejected": -1.9548982381820679, "step": 47 }, { "epoch": 0.6254071661237784, "grad_norm": 8.9375, "learning_rate": 4.876712328767123e-05, "logits/chosen": 0.40418195724487305, "logits/rejected": 0.4241870045661926, "logps/chosen": -143.9720001220703, "logps/rejected": -195.26536560058594, "loss": 0.1135, "rewards/accuracies": 1.0, "rewards/chosen": -0.0051138997077941895, "rewards/margins": 2.4568700790405273, "rewards/rejected": -2.461984157562256, "step": 48 }, { "epoch": 0.6384364820846905, "grad_norm": 16.5, "learning_rate": 4.8698630136986305e-05, "logits/chosen": 0.5531054735183716, "logits/rejected": 0.5722475051879883, "logps/chosen": -80.95619201660156, "logps/rejected": -174.85643005371094, "loss": 0.1363, "rewards/accuracies": 1.0, "rewards/chosen": 0.057745300233364105, "rewards/margins": 2.4418563842773438, "rewards/rejected": -2.3841114044189453, "step": 49 }, { "epoch": 0.6514657980456026, "grad_norm": 6.6875, "learning_rate": 4.863013698630137e-05, "logits/chosen": 0.3978479504585266, "logits/rejected": 0.575504720211029, "logps/chosen": -111.10527038574219, "logps/rejected": -194.09478759765625, "loss": 0.0979, "rewards/accuracies": 1.0, "rewards/chosen": -0.024841848760843277, "rewards/margins": 2.8879756927490234, "rewards/rejected": -2.9128177165985107, "step": 50 }, { "epoch": 0.6514657980456026, "eval_logits/chosen": 0.40171119570732117, "eval_logits/rejected": 0.4472416043281555, "eval_logps/chosen": -94.96456909179688, "eval_logps/rejected": -177.69801330566406, "eval_loss": 0.10980458557605743, "eval_rewards/accuracies": 0.9985119104385376, "eval_rewards/chosen": 0.048970796167850494, "eval_rewards/margins": 2.70963716506958, "eval_rewards/rejected": -2.6606662273406982, "eval_runtime": 53.1051, "eval_samples_per_second": 12.635, "eval_steps_per_second": 0.791, "step": 50 }, { "epoch": 0.6644951140065146, "grad_norm": 9.9375, "learning_rate": 4.856164383561644e-05, "logits/chosen": 0.5971242189407349, "logits/rejected": 0.5052528381347656, "logps/chosen": -100.87618255615234, "logps/rejected": -183.73324584960938, "loss": 0.1258, "rewards/accuracies": 1.0, "rewards/chosen": 0.029725002124905586, "rewards/margins": 2.650700330734253, "rewards/rejected": -2.6209752559661865, "step": 51 }, { "epoch": 0.6775244299674267, "grad_norm": 9.5, "learning_rate": 4.849315068493151e-05, "logits/chosen": 0.46090734004974365, "logits/rejected": 0.5332375168800354, "logps/chosen": -83.30604553222656, "logps/rejected": -198.97483825683594, "loss": 0.0769, "rewards/accuracies": 1.0, "rewards/chosen": 0.07736861705780029, "rewards/margins": 3.2583978176116943, "rewards/rejected": -3.1810293197631836, "step": 52 }, { "epoch": 0.6905537459283387, "grad_norm": 10.5, "learning_rate": 4.8424657534246577e-05, "logits/chosen": 0.5082046985626221, "logits/rejected": 0.5545482635498047, "logps/chosen": -96.28477478027344, "logps/rejected": -153.87228393554688, "loss": 0.1285, "rewards/accuracies": 1.0, "rewards/chosen": 0.049273423850536346, "rewards/margins": 2.481055974960327, "rewards/rejected": -2.4317827224731445, "step": 53 }, { "epoch": 0.7035830618892508, "grad_norm": 11.6875, "learning_rate": 4.835616438356165e-05, "logits/chosen": 0.4179171621799469, "logits/rejected": 0.40184441208839417, "logps/chosen": -138.70870971679688, "logps/rejected": -198.06478881835938, "loss": 0.0812, "rewards/accuracies": 1.0, "rewards/chosen": 0.07567030191421509, "rewards/margins": 3.1052422523498535, "rewards/rejected": -3.0295724868774414, "step": 54 }, { "epoch": 0.7166123778501629, "grad_norm": 4.34375, "learning_rate": 4.8287671232876716e-05, "logits/chosen": 0.5138372182846069, "logits/rejected": 0.5542392730712891, "logps/chosen": -93.45801544189453, "logps/rejected": -196.15989685058594, "loss": 0.0619, "rewards/accuracies": 1.0, "rewards/chosen": 0.008173711597919464, "rewards/margins": 3.3829448223114014, "rewards/rejected": -3.3747713565826416, "step": 55 }, { "epoch": 0.7296416938110749, "grad_norm": 5.9375, "learning_rate": 4.821917808219178e-05, "logits/chosen": 0.4723089337348938, "logits/rejected": 0.5142194032669067, "logps/chosen": -101.18618774414062, "logps/rejected": -202.30770874023438, "loss": 0.0698, "rewards/accuracies": 1.0, "rewards/chosen": -0.20023450255393982, "rewards/margins": 3.583833694458008, "rewards/rejected": -3.7840681076049805, "step": 56 }, { "epoch": 0.742671009771987, "grad_norm": 4.84375, "learning_rate": 4.815068493150685e-05, "logits/chosen": 0.6098852157592773, "logits/rejected": 0.5306227207183838, "logps/chosen": -92.79605102539062, "logps/rejected": -194.44285583496094, "loss": 0.0694, "rewards/accuracies": 1.0, "rewards/chosen": 0.09434399008750916, "rewards/margins": 3.6242706775665283, "rewards/rejected": -3.5299267768859863, "step": 57 }, { "epoch": 0.755700325732899, "grad_norm": 3.09375, "learning_rate": 4.808219178082192e-05, "logits/chosen": 0.596287190914154, "logits/rejected": 0.5526207685470581, "logps/chosen": -80.8297348022461, "logps/rejected": -199.17770385742188, "loss": 0.0424, "rewards/accuracies": 1.0, "rewards/chosen": 0.16401855647563934, "rewards/margins": 4.359023094177246, "rewards/rejected": -4.195004463195801, "step": 58 }, { "epoch": 0.7687296416938111, "grad_norm": 4.78125, "learning_rate": 4.801369863013699e-05, "logits/chosen": 0.5375024080276489, "logits/rejected": 0.5418161153793335, "logps/chosen": -94.54348754882812, "logps/rejected": -179.93148803710938, "loss": 0.065, "rewards/accuracies": 1.0, "rewards/chosen": 0.02510090172290802, "rewards/margins": 3.8914875984191895, "rewards/rejected": -3.866386890411377, "step": 59 }, { "epoch": 0.7817589576547231, "grad_norm": 3.953125, "learning_rate": 4.794520547945205e-05, "logits/chosen": 0.5458413362503052, "logits/rejected": 0.5163211226463318, "logps/chosen": -102.55235290527344, "logps/rejected": -192.88011169433594, "loss": 0.0422, "rewards/accuracies": 1.0, "rewards/chosen": 0.03571543097496033, "rewards/margins": 4.136109352111816, "rewards/rejected": -4.100393772125244, "step": 60 }, { "epoch": 0.7947882736156352, "grad_norm": 3.390625, "learning_rate": 4.7876712328767126e-05, "logits/chosen": 0.44991570711135864, "logits/rejected": 0.47752076387405396, "logps/chosen": -71.73591613769531, "logps/rejected": -166.39166259765625, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": 0.09097965061664581, "rewards/margins": 3.9125423431396484, "rewards/rejected": -3.8215625286102295, "step": 61 }, { "epoch": 0.8078175895765473, "grad_norm": 5.4375, "learning_rate": 4.780821917808219e-05, "logits/chosen": 0.5184447169303894, "logits/rejected": 0.49066781997680664, "logps/chosen": -96.78662109375, "logps/rejected": -220.57266235351562, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": -0.07911338657140732, "rewards/margins": 4.811767101287842, "rewards/rejected": -4.890880584716797, "step": 62 }, { "epoch": 0.8208469055374593, "grad_norm": 3.0, "learning_rate": 4.7739726027397265e-05, "logits/chosen": 0.5503054857254028, "logits/rejected": 0.7354578971862793, "logps/chosen": -76.80421447753906, "logps/rejected": -210.28140258789062, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 0.024801086634397507, "rewards/margins": 5.572457790374756, "rewards/rejected": -5.547656536102295, "step": 63 }, { "epoch": 0.8338762214983714, "grad_norm": 3.46875, "learning_rate": 4.767123287671233e-05, "logits/chosen": 0.5171054005622864, "logits/rejected": 0.512793242931366, "logps/chosen": -131.59396362304688, "logps/rejected": -217.56964111328125, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": -0.10994181782007217, "rewards/margins": 5.379184246063232, "rewards/rejected": -5.489125728607178, "step": 64 }, { "epoch": 0.8469055374592834, "grad_norm": 4.03125, "learning_rate": 4.7602739726027403e-05, "logits/chosen": 0.44678860902786255, "logits/rejected": 0.5419712662696838, "logps/chosen": -104.75637817382812, "logps/rejected": -201.79806518554688, "loss": 0.0296, "rewards/accuracies": 1.0, "rewards/chosen": -0.1282435804605484, "rewards/margins": 4.831494331359863, "rewards/rejected": -4.959737777709961, "step": 65 }, { "epoch": 0.8599348534201955, "grad_norm": 2.40625, "learning_rate": 4.753424657534247e-05, "logits/chosen": 0.582385778427124, "logits/rejected": 0.6422931551933289, "logps/chosen": -94.39370727539062, "logps/rejected": -199.6475830078125, "loss": 0.0258, "rewards/accuracies": 1.0, "rewards/chosen": -0.04259404167532921, "rewards/margins": 5.368470191955566, "rewards/rejected": -5.411064147949219, "step": 66 }, { "epoch": 0.8729641693811075, "grad_norm": 2.6875, "learning_rate": 4.7465753424657536e-05, "logits/chosen": 0.5766660571098328, "logits/rejected": 0.6043537855148315, "logps/chosen": -102.68363952636719, "logps/rejected": -214.7265625, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -0.2821919322013855, "rewards/margins": 5.887226581573486, "rewards/rejected": -6.169419288635254, "step": 67 }, { "epoch": 0.8859934853420195, "grad_norm": 1.0859375, "learning_rate": 4.73972602739726e-05, "logits/chosen": 0.4715408682823181, "logits/rejected": 0.5762664079666138, "logps/chosen": -86.6288070678711, "logps/rejected": -225.074951171875, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.10806949436664581, "rewards/margins": 6.162431716918945, "rewards/rejected": -6.270501136779785, "step": 68 }, { "epoch": 0.8990228013029316, "grad_norm": 1.71875, "learning_rate": 4.7328767123287675e-05, "logits/chosen": 0.613117516040802, "logits/rejected": 0.5737402439117432, "logps/chosen": -71.23908996582031, "logps/rejected": -197.6245880126953, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": -0.04037155583500862, "rewards/margins": 5.633227825164795, "rewards/rejected": -5.6735992431640625, "step": 69 }, { "epoch": 0.9120521172638436, "grad_norm": 0.98828125, "learning_rate": 4.726027397260274e-05, "logits/chosen": 0.6605570912361145, "logits/rejected": 0.6310275197029114, "logps/chosen": -123.74465942382812, "logps/rejected": -249.78793334960938, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -0.3881164789199829, "rewards/margins": 7.08071231842041, "rewards/rejected": -7.4688286781311035, "step": 70 }, { "epoch": 0.9250814332247557, "grad_norm": 23.375, "learning_rate": 4.719178082191781e-05, "logits/chosen": 0.5911487936973572, "logits/rejected": 0.6923888325691223, "logps/chosen": -161.05184936523438, "logps/rejected": -264.648193359375, "loss": 0.058, "rewards/accuracies": 0.96875, "rewards/chosen": -0.7978758811950684, "rewards/margins": 8.156850814819336, "rewards/rejected": -8.954728126525879, "step": 71 }, { "epoch": 0.9381107491856677, "grad_norm": 4.5625, "learning_rate": 4.712328767123288e-05, "logits/chosen": 0.6496680378913879, "logits/rejected": 0.6733301281929016, "logps/chosen": -121.81378173828125, "logps/rejected": -239.56304931640625, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.4053517282009125, "rewards/margins": 7.532309532165527, "rewards/rejected": -7.937661647796631, "step": 72 }, { "epoch": 0.9511400651465798, "grad_norm": 17.625, "learning_rate": 4.7054794520547946e-05, "logits/chosen": 0.5184324383735657, "logits/rejected": 0.6415278911590576, "logps/chosen": -105.58231353759766, "logps/rejected": -222.8607940673828, "loss": 0.0298, "rewards/accuracies": 1.0, "rewards/chosen": -0.48813995718955994, "rewards/margins": 7.293839454650879, "rewards/rejected": -7.781979560852051, "step": 73 }, { "epoch": 0.9641693811074918, "grad_norm": 0.5234375, "learning_rate": 4.698630136986302e-05, "logits/chosen": 0.5843162536621094, "logits/rejected": 0.5905658602714539, "logps/chosen": -100.66535949707031, "logps/rejected": -242.2615203857422, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.5956183075904846, "rewards/margins": 7.831192493438721, "rewards/rejected": -8.426811218261719, "step": 74 }, { "epoch": 0.9771986970684039, "grad_norm": 0.90625, "learning_rate": 4.6917808219178085e-05, "logits/chosen": 0.6023251414299011, "logits/rejected": 0.6175463199615479, "logps/chosen": -74.83623504638672, "logps/rejected": -226.2584228515625, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -0.32583388686180115, "rewards/margins": 7.1260175704956055, "rewards/rejected": -7.4518513679504395, "step": 75 }, { "epoch": 0.990228013029316, "grad_norm": 1.5859375, "learning_rate": 4.684931506849316e-05, "logits/chosen": 0.549035906791687, "logits/rejected": 0.5604692697525024, "logps/chosen": -106.24671936035156, "logps/rejected": -224.1392059326172, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -0.48925158381462097, "rewards/margins": 8.133434295654297, "rewards/rejected": -8.622686386108398, "step": 76 }, { "epoch": 1.003257328990228, "grad_norm": 3.21875, "learning_rate": 4.6780821917808224e-05, "logits/chosen": 0.4611436426639557, "logits/rejected": 0.5326769948005676, "logps/chosen": -122.00413513183594, "logps/rejected": -225.345703125, "loss": 0.0104, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694484233856201, "rewards/margins": 8.14291763305664, "rewards/rejected": -8.512365341186523, "step": 77 }, { "epoch": 1.01628664495114, "grad_norm": 1.0390625, "learning_rate": 4.671232876712329e-05, "logits/chosen": 0.5869070887565613, "logits/rejected": 0.6033880710601807, "logps/chosen": -82.62848663330078, "logps/rejected": -218.4529571533203, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -0.45080143213272095, "rewards/margins": 7.709619522094727, "rewards/rejected": -8.160421371459961, "step": 78 }, { "epoch": 1.0293159609120521, "grad_norm": 0.5, "learning_rate": 4.6643835616438356e-05, "logits/chosen": 0.6383049488067627, "logits/rejected": 0.6318773031234741, "logps/chosen": -85.02655029296875, "logps/rejected": -236.74661254882812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.6345354318618774, "rewards/margins": 8.56661319732666, "rewards/rejected": -9.201148986816406, "step": 79 }, { "epoch": 1.0423452768729642, "grad_norm": 6.03125, "learning_rate": 4.657534246575342e-05, "logits/chosen": 0.5868783593177795, "logits/rejected": 0.6521725654602051, "logps/chosen": -72.04723358154297, "logps/rejected": -230.14759826660156, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.3693377375602722, "rewards/margins": 8.969406127929688, "rewards/rejected": -9.3387451171875, "step": 80 }, { "epoch": 1.0553745928338762, "grad_norm": 3.875, "learning_rate": 4.6506849315068495e-05, "logits/chosen": 0.6232761144638062, "logits/rejected": 0.7092280983924866, "logps/chosen": -79.42515563964844, "logps/rejected": -243.50372314453125, "loss": 0.0114, "rewards/accuracies": 1.0, "rewards/chosen": -0.4384864568710327, "rewards/margins": 9.8868408203125, "rewards/rejected": -10.325326919555664, "step": 81 }, { "epoch": 1.0684039087947883, "grad_norm": 17.625, "learning_rate": 4.643835616438356e-05, "logits/chosen": 0.5587644577026367, "logits/rejected": 0.507000744342804, "logps/chosen": -107.61006164550781, "logps/rejected": -269.83843994140625, "loss": 0.0298, "rewards/accuracies": 0.96875, "rewards/chosen": -0.767175555229187, "rewards/margins": 10.086366653442383, "rewards/rejected": -10.85354232788086, "step": 82 }, { "epoch": 1.0814332247557004, "grad_norm": 1.765625, "learning_rate": 4.6369863013698634e-05, "logits/chosen": 0.7217209339141846, "logits/rejected": 0.6606077551841736, "logps/chosen": -112.81648254394531, "logps/rejected": -288.869384765625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.637757420539856, "rewards/margins": 11.375957489013672, "rewards/rejected": -12.013714790344238, "step": 83 }, { "epoch": 1.0944625407166124, "grad_norm": 0.134765625, "learning_rate": 4.63013698630137e-05, "logits/chosen": 0.598381757736206, "logits/rejected": 0.7315313816070557, "logps/chosen": -107.20101928710938, "logps/rejected": -281.4562683105469, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.0282131433486938, "rewards/margins": 10.506587982177734, "rewards/rejected": -11.534799575805664, "step": 84 }, { "epoch": 1.1074918566775245, "grad_norm": 0.341796875, "learning_rate": 4.623287671232877e-05, "logits/chosen": 0.5361148118972778, "logits/rejected": 0.625439465045929, "logps/chosen": -94.30006408691406, "logps/rejected": -247.62734985351562, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.6392572522163391, "rewards/margins": 9.514155387878418, "rewards/rejected": -10.153412818908691, "step": 85 }, { "epoch": 1.1205211726384365, "grad_norm": 0.1318359375, "learning_rate": 4.616438356164384e-05, "logits/chosen": 0.4699576199054718, "logits/rejected": 0.5327920317649841, "logps/chosen": -99.83711242675781, "logps/rejected": -277.1376953125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.7559419870376587, "rewards/margins": 10.79163932800293, "rewards/rejected": -11.54758071899414, "step": 86 }, { "epoch": 1.1335504885993486, "grad_norm": 21.625, "learning_rate": 4.609589041095891e-05, "logits/chosen": 0.5424385666847229, "logits/rejected": 0.5994272232055664, "logps/chosen": -126.75860595703125, "logps/rejected": -259.98785400390625, "loss": 0.027, "rewards/accuracies": 0.96875, "rewards/chosen": -1.0079156160354614, "rewards/margins": 9.790204048156738, "rewards/rejected": -10.79811954498291, "step": 87 }, { "epoch": 1.1465798045602607, "grad_norm": 0.1923828125, "learning_rate": 4.602739726027398e-05, "logits/chosen": 0.4807354509830475, "logits/rejected": 0.5597364902496338, "logps/chosen": -106.52574157714844, "logps/rejected": -272.2024841308594, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.9496315717697144, "rewards/margins": 10.869487762451172, "rewards/rejected": -11.81911849975586, "step": 88 }, { "epoch": 1.1596091205211727, "grad_norm": 0.1513671875, "learning_rate": 4.5958904109589044e-05, "logits/chosen": 0.4442989230155945, "logits/rejected": 0.5743086338043213, "logps/chosen": -126.14883422851562, "logps/rejected": -257.60479736328125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.766221821308136, "rewards/margins": 10.424041748046875, "rewards/rejected": -11.190263748168945, "step": 89 }, { "epoch": 1.1726384364820848, "grad_norm": 1.109375, "learning_rate": 4.589041095890411e-05, "logits/chosen": 0.6463179588317871, "logits/rejected": 0.7357967495918274, "logps/chosen": -111.60262298583984, "logps/rejected": -257.9665222167969, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.7475589513778687, "rewards/margins": 9.678692817687988, "rewards/rejected": -10.426251411437988, "step": 90 }, { "epoch": 1.1856677524429968, "grad_norm": 6.6875, "learning_rate": 4.5821917808219176e-05, "logits/chosen": 0.48268792033195496, "logits/rejected": 0.5555750131607056, "logps/chosen": -109.53272247314453, "logps/rejected": -254.43492126464844, "loss": 0.021, "rewards/accuracies": 1.0, "rewards/chosen": -1.1768321990966797, "rewards/margins": 10.083324432373047, "rewards/rejected": -11.260156631469727, "step": 91 }, { "epoch": 1.1986970684039089, "grad_norm": 1.3515625, "learning_rate": 4.575342465753425e-05, "logits/chosen": 0.4292120337486267, "logits/rejected": 0.521615743637085, "logps/chosen": -95.94520568847656, "logps/rejected": -253.99993896484375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.871029257774353, "rewards/margins": 10.545323371887207, "rewards/rejected": -11.416353225708008, "step": 92 }, { "epoch": 1.211726384364821, "grad_norm": 0.80859375, "learning_rate": 4.5684931506849315e-05, "logits/chosen": 0.599204421043396, "logits/rejected": 0.6558493375778198, "logps/chosen": -81.71524047851562, "logps/rejected": -286.6025390625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.6884966492652893, "rewards/margins": 12.463363647460938, "rewards/rejected": -13.151860237121582, "step": 93 }, { "epoch": 1.224755700325733, "grad_norm": 18.0, "learning_rate": 4.561643835616439e-05, "logits/chosen": 0.5306810140609741, "logits/rejected": 0.6242883801460266, "logps/chosen": -123.8375244140625, "logps/rejected": -301.3340759277344, "loss": 0.115, "rewards/accuracies": 0.96875, "rewards/chosen": -1.471944808959961, "rewards/margins": 11.491494178771973, "rewards/rejected": -12.963438034057617, "step": 94 }, { "epoch": 1.237785016286645, "grad_norm": 3.6875, "learning_rate": 4.5547945205479454e-05, "logits/chosen": 0.307037353515625, "logits/rejected": 0.4169548749923706, "logps/chosen": -104.93318176269531, "logps/rejected": -298.0616455078125, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -0.8200819492340088, "rewards/margins": 12.302337646484375, "rewards/rejected": -13.122421264648438, "step": 95 }, { "epoch": 1.2508143322475571, "grad_norm": 0.357421875, "learning_rate": 4.547945205479453e-05, "logits/chosen": 0.44628292322158813, "logits/rejected": 0.5122686624526978, "logps/chosen": -138.44715881347656, "logps/rejected": -297.4310302734375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.1169642210006714, "rewards/margins": 12.18505859375, "rewards/rejected": -13.302022933959961, "step": 96 }, { "epoch": 1.2638436482084692, "grad_norm": 0.734375, "learning_rate": 4.541095890410959e-05, "logits/chosen": 0.4561493992805481, "logits/rejected": 0.42395809292793274, "logps/chosen": -97.692626953125, "logps/rejected": -269.0616149902344, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0600621700286865, "rewards/margins": 10.464313507080078, "rewards/rejected": -11.524375915527344, "step": 97 }, { "epoch": 1.2768729641693812, "grad_norm": 0.171875, "learning_rate": 4.534246575342466e-05, "logits/chosen": 0.5301443934440613, "logits/rejected": 0.5689199566841125, "logps/chosen": -82.25302124023438, "logps/rejected": -299.8308410644531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.6450921297073364, "rewards/margins": 12.625539779663086, "rewards/rejected": -13.270631790161133, "step": 98 }, { "epoch": 1.2899022801302933, "grad_norm": 0.193359375, "learning_rate": 4.5273972602739725e-05, "logits/chosen": 0.5289660096168518, "logits/rejected": 0.5680521726608276, "logps/chosen": -114.97647094726562, "logps/rejected": -289.7352294921875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.0461821556091309, "rewards/margins": 11.369194030761719, "rewards/rejected": -12.415376663208008, "step": 99 }, { "epoch": 1.3029315960912053, "grad_norm": 12.875, "learning_rate": 4.520547945205479e-05, "logits/chosen": 0.5327968597412109, "logits/rejected": 0.5609641075134277, "logps/chosen": -102.47958374023438, "logps/rejected": -250.79983520507812, "loss": 0.0433, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2846791744232178, "rewards/margins": 10.277151107788086, "rewards/rejected": -11.561830520629883, "step": 100 }, { "epoch": 1.3029315960912053, "eval_logits/chosen": 0.4522504210472107, "eval_logits/rejected": 0.5126740336418152, "eval_logps/chosen": -105.14033508300781, "eval_logps/rejected": -271.7301330566406, "eval_loss": 0.010936837643384933, "eval_rewards/accuracies": 0.9955357313156128, "eval_rewards/chosen": -0.9686061143875122, "eval_rewards/margins": 11.095270156860352, "eval_rewards/rejected": -12.06387710571289, "eval_runtime": 52.2837, "eval_samples_per_second": 12.834, "eval_steps_per_second": 0.803, "step": 100 }, { "epoch": 1.3159609120521172, "grad_norm": 0.3359375, "learning_rate": 4.5136986301369864e-05, "logits/chosen": 0.4589378833770752, "logits/rejected": 0.5487878918647766, "logps/chosen": -105.76063537597656, "logps/rejected": -304.374755859375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.6952133774757385, "rewards/margins": 11.938570022583008, "rewards/rejected": -12.633784294128418, "step": 101 }, { "epoch": 1.3289902280130292, "grad_norm": 16.875, "learning_rate": 4.506849315068493e-05, "logits/chosen": 0.3769131302833557, "logits/rejected": 0.4298419952392578, "logps/chosen": -90.59988403320312, "logps/rejected": -247.70855712890625, "loss": 0.0366, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8937082886695862, "rewards/margins": 10.360432624816895, "rewards/rejected": -11.254140853881836, "step": 102 }, { "epoch": 1.3420195439739413, "grad_norm": 0.1240234375, "learning_rate": 4.5e-05, "logits/chosen": 0.4195227026939392, "logits/rejected": 0.4982715845108032, "logps/chosen": -108.6019515991211, "logps/rejected": -256.0687255859375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.35523366928100586, "rewards/margins": 10.765069961547852, "rewards/rejected": -11.120304107666016, "step": 103 }, { "epoch": 1.3550488599348534, "grad_norm": 0.734375, "learning_rate": 4.493150684931507e-05, "logits/chosen": 0.512363851070404, "logits/rejected": 0.576703667640686, "logps/chosen": -87.09799194335938, "logps/rejected": -250.88160705566406, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8132816553115845, "rewards/margins": 10.275364875793457, "rewards/rejected": -11.088645935058594, "step": 104 }, { "epoch": 1.3680781758957654, "grad_norm": 0.5390625, "learning_rate": 4.486301369863014e-05, "logits/chosen": 0.5740979909896851, "logits/rejected": 0.6141005158424377, "logps/chosen": -101.0667495727539, "logps/rejected": -270.2124328613281, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.4534332752227783, "rewards/margins": 11.378608703613281, "rewards/rejected": -11.832042694091797, "step": 105 }, { "epoch": 1.3811074918566775, "grad_norm": 0.1689453125, "learning_rate": 4.479452054794521e-05, "logits/chosen": 0.4920623004436493, "logits/rejected": 0.5869815945625305, "logps/chosen": -78.95692443847656, "logps/rejected": -261.3721923828125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.5587632656097412, "rewards/margins": 11.233173370361328, "rewards/rejected": -11.791936874389648, "step": 106 }, { "epoch": 1.3941368078175895, "grad_norm": 0.251953125, "learning_rate": 4.472602739726028e-05, "logits/chosen": 0.570668637752533, "logits/rejected": 0.6403558850288391, "logps/chosen": -100.12591552734375, "logps/rejected": -284.8184814453125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.974056601524353, "rewards/margins": 11.53510570526123, "rewards/rejected": -12.509162902832031, "step": 107 }, { "epoch": 1.4071661237785016, "grad_norm": 0.486328125, "learning_rate": 4.465753424657535e-05, "logits/chosen": 0.5420557260513306, "logits/rejected": 0.5884326100349426, "logps/chosen": -88.60862731933594, "logps/rejected": -289.9623718261719, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.7016857862472534, "rewards/margins": 12.609970092773438, "rewards/rejected": -13.31165599822998, "step": 108 }, { "epoch": 1.4201954397394136, "grad_norm": 0.16796875, "learning_rate": 4.458904109589041e-05, "logits/chosen": 0.4910571575164795, "logits/rejected": 0.5071029663085938, "logps/chosen": -126.79181671142578, "logps/rejected": -296.6622314453125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5642073154449463, "rewards/margins": 11.54067325592041, "rewards/rejected": -13.104881286621094, "step": 109 }, { "epoch": 1.4332247557003257, "grad_norm": 0.09423828125, "learning_rate": 4.452054794520548e-05, "logits/chosen": 0.5247446298599243, "logits/rejected": 0.47774773836135864, "logps/chosen": -100.17961883544922, "logps/rejected": -256.7818908691406, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.5451500415802002, "rewards/margins": 10.933671951293945, "rewards/rejected": -11.478821754455566, "step": 110 }, { "epoch": 1.4462540716612378, "grad_norm": 0.0615234375, "learning_rate": 4.4452054794520545e-05, "logits/chosen": 0.6131365299224854, "logits/rejected": 0.615870475769043, "logps/chosen": -91.60357666015625, "logps/rejected": -277.2375793457031, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.4648512601852417, "rewards/margins": 11.733713150024414, "rewards/rejected": -12.198564529418945, "step": 111 }, { "epoch": 1.4592833876221498, "grad_norm": 0.6015625, "learning_rate": 4.438356164383562e-05, "logits/chosen": 0.7266855239868164, "logits/rejected": 0.633425235748291, "logps/chosen": -83.83377075195312, "logps/rejected": -264.64501953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.8645696640014648, "rewards/margins": 11.404350280761719, "rewards/rejected": -12.268918991088867, "step": 112 }, { "epoch": 1.4723127035830619, "grad_norm": 39.5, "learning_rate": 4.4315068493150684e-05, "logits/chosen": 0.6473186016082764, "logits/rejected": 0.6468358635902405, "logps/chosen": -145.5900115966797, "logps/rejected": -300.77301025390625, "loss": 0.0413, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2610886096954346, "rewards/margins": 12.301548957824707, "rewards/rejected": -13.562638282775879, "step": 113 }, { "epoch": 1.485342019543974, "grad_norm": 0.400390625, "learning_rate": 4.424657534246576e-05, "logits/chosen": 0.4430687427520752, "logits/rejected": 0.5213119983673096, "logps/chosen": -133.21205139160156, "logps/rejected": -270.613525390625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.0954465866088867, "rewards/margins": 11.330828666687012, "rewards/rejected": -12.426275253295898, "step": 114 }, { "epoch": 1.498371335504886, "grad_norm": 0.94140625, "learning_rate": 4.417808219178082e-05, "logits/chosen": 0.5086010694503784, "logits/rejected": 0.5820840001106262, "logps/chosen": -123.90394592285156, "logps/rejected": -257.7217712402344, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -1.11330246925354, "rewards/margins": 10.829158782958984, "rewards/rejected": -11.942461013793945, "step": 115 }, { "epoch": 1.511400651465798, "grad_norm": 0.1435546875, "learning_rate": 4.4109589041095896e-05, "logits/chosen": 0.6593326330184937, "logits/rejected": 0.6211075186729431, "logps/chosen": -75.89244842529297, "logps/rejected": -266.60357666015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.7435863614082336, "rewards/margins": 11.823095321655273, "rewards/rejected": -12.566681861877441, "step": 116 }, { "epoch": 1.52442996742671, "grad_norm": 0.3125, "learning_rate": 4.404109589041096e-05, "logits/chosen": 0.44883668422698975, "logits/rejected": 0.5639724135398865, "logps/chosen": -93.89613342285156, "logps/rejected": -286.56451416015625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.0818122625350952, "rewards/margins": 12.308505058288574, "rewards/rejected": -13.390316009521484, "step": 117 }, { "epoch": 1.5374592833876222, "grad_norm": 0.294921875, "learning_rate": 4.3972602739726035e-05, "logits/chosen": 0.5254025459289551, "logits/rejected": 0.5744770765304565, "logps/chosen": -120.49933624267578, "logps/rejected": -313.8304443359375, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.2532737255096436, "rewards/margins": 13.612017631530762, "rewards/rejected": -14.8652925491333, "step": 118 }, { "epoch": 1.5504885993485342, "grad_norm": 0.06005859375, "learning_rate": 4.39041095890411e-05, "logits/chosen": 0.4704741835594177, "logits/rejected": 0.5933064222335815, "logps/chosen": -101.07899475097656, "logps/rejected": -312.5476989746094, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.0991394519805908, "rewards/margins": 13.127508163452148, "rewards/rejected": -14.226646423339844, "step": 119 }, { "epoch": 1.5635179153094463, "grad_norm": 0.7265625, "learning_rate": 4.383561643835617e-05, "logits/chosen": 0.5662128329277039, "logits/rejected": 0.5538490414619446, "logps/chosen": -106.43547058105469, "logps/rejected": -237.27182006835938, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.7144113779067993, "rewards/margins": 9.923102378845215, "rewards/rejected": -10.637513160705566, "step": 120 }, { "epoch": 1.5765472312703583, "grad_norm": 0.2734375, "learning_rate": 4.376712328767123e-05, "logits/chosen": 0.6062589883804321, "logits/rejected": 0.6001408100128174, "logps/chosen": -85.78362274169922, "logps/rejected": -230.78456115722656, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.44176292419433594, "rewards/margins": 10.009519577026367, "rewards/rejected": -10.451282501220703, "step": 121 }, { "epoch": 1.5895765472312704, "grad_norm": 0.890625, "learning_rate": 4.36986301369863e-05, "logits/chosen": 0.4221673607826233, "logits/rejected": 0.5758030414581299, "logps/chosen": -105.4853744506836, "logps/rejected": -291.33416748046875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.8794921636581421, "rewards/margins": 12.619811058044434, "rewards/rejected": -13.499303817749023, "step": 122 }, { "epoch": 1.6026058631921825, "grad_norm": 11.3125, "learning_rate": 4.363013698630137e-05, "logits/chosen": 0.5420705676078796, "logits/rejected": 0.6151952147483826, "logps/chosen": -100.22688293457031, "logps/rejected": -243.79376220703125, "loss": 0.0141, "rewards/accuracies": 1.0, "rewards/chosen": -1.1169291734695435, "rewards/margins": 10.309640884399414, "rewards/rejected": -11.426569938659668, "step": 123 }, { "epoch": 1.6156351791530945, "grad_norm": 0.1025390625, "learning_rate": 4.356164383561644e-05, "logits/chosen": 0.5193166136741638, "logits/rejected": 0.6056085228919983, "logps/chosen": -82.8109359741211, "logps/rejected": -290.5059814453125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -0.6234797239303589, "rewards/margins": 13.424489974975586, "rewards/rejected": -14.047967910766602, "step": 124 }, { "epoch": 1.6286644951140063, "grad_norm": 0.515625, "learning_rate": 4.349315068493151e-05, "logits/chosen": 0.524208664894104, "logits/rejected": 0.4996390640735626, "logps/chosen": -99.54425811767578, "logps/rejected": -269.98858642578125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.75706547498703, "rewards/margins": 11.190351486206055, "rewards/rejected": -11.947418212890625, "step": 125 }, { "epoch": 1.6416938110749184, "grad_norm": 0.263671875, "learning_rate": 4.342465753424658e-05, "logits/chosen": 0.6168690323829651, "logits/rejected": 0.6482622027397156, "logps/chosen": -85.97930908203125, "logps/rejected": -270.2721862792969, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.8299384117126465, "rewards/margins": 11.914796829223633, "rewards/rejected": -12.744734764099121, "step": 126 }, { "epoch": 1.6547231270358305, "grad_norm": 0.5546875, "learning_rate": 4.335616438356165e-05, "logits/chosen": 0.4758910536766052, "logits/rejected": 0.6165511012077332, "logps/chosen": -120.85889434814453, "logps/rejected": -330.94580078125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.3049366474151611, "rewards/margins": 13.541584014892578, "rewards/rejected": -14.846521377563477, "step": 127 }, { "epoch": 1.6677524429967425, "grad_norm": 6.8125, "learning_rate": 4.3287671232876716e-05, "logits/chosen": 0.4912353754043579, "logits/rejected": 0.5630989074707031, "logps/chosen": -99.70421600341797, "logps/rejected": -262.81793212890625, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -1.1004682779312134, "rewards/margins": 11.206673622131348, "rewards/rejected": -12.307140350341797, "step": 128 }, { "epoch": 1.6807817589576546, "grad_norm": 0.16015625, "learning_rate": 4.321917808219178e-05, "logits/chosen": 0.4782199263572693, "logits/rejected": 0.525773823261261, "logps/chosen": -104.79579162597656, "logps/rejected": -289.299560546875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.9410255551338196, "rewards/margins": 13.11217212677002, "rewards/rejected": -14.05319595336914, "step": 129 }, { "epoch": 1.6938110749185666, "grad_norm": 0.21875, "learning_rate": 4.3150684931506855e-05, "logits/chosen": 0.4822072684764862, "logits/rejected": 0.4817202687263489, "logps/chosen": -86.81942749023438, "logps/rejected": -299.3095703125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.1278772354125977, "rewards/margins": 12.366036415100098, "rewards/rejected": -13.493914604187012, "step": 130 }, { "epoch": 1.7068403908794787, "grad_norm": 0.08642578125, "learning_rate": 4.308219178082192e-05, "logits/chosen": 0.5804314613342285, "logits/rejected": 0.6889848709106445, "logps/chosen": -91.85730743408203, "logps/rejected": -298.603515625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.0585956573486328, "rewards/margins": 12.788677215576172, "rewards/rejected": -13.847272872924805, "step": 131 }, { "epoch": 1.7198697068403908, "grad_norm": 0.072265625, "learning_rate": 4.301369863013699e-05, "logits/chosen": 0.4860071837902069, "logits/rejected": 0.6394906044006348, "logps/chosen": -122.80025482177734, "logps/rejected": -303.95257568359375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.156942367553711, "rewards/margins": 13.32013988494873, "rewards/rejected": -14.477082252502441, "step": 132 }, { "epoch": 1.7328990228013028, "grad_norm": 0.15234375, "learning_rate": 4.294520547945205e-05, "logits/chosen": 0.4813675880432129, "logits/rejected": 0.6056811213493347, "logps/chosen": -89.08052062988281, "logps/rejected": -268.1934814453125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.8572717308998108, "rewards/margins": 12.159040451049805, "rewards/rejected": -13.016312599182129, "step": 133 }, { "epoch": 1.7459283387622149, "grad_norm": 0.05615234375, "learning_rate": 4.2876712328767126e-05, "logits/chosen": 0.43135523796081543, "logits/rejected": 0.5367728471755981, "logps/chosen": -104.37152099609375, "logps/rejected": -309.7494201660156, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.9332711100578308, "rewards/margins": 13.174678802490234, "rewards/rejected": -14.107950210571289, "step": 134 }, { "epoch": 1.758957654723127, "grad_norm": 0.1123046875, "learning_rate": 4.280821917808219e-05, "logits/chosen": 0.46707215905189514, "logits/rejected": 0.545040488243103, "logps/chosen": -141.20016479492188, "logps/rejected": -337.659423828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.5533472299575806, "rewards/margins": 13.37056827545166, "rewards/rejected": -14.923914909362793, "step": 135 }, { "epoch": 1.771986970684039, "grad_norm": 0.142578125, "learning_rate": 4.2739726027397265e-05, "logits/chosen": 0.45749402046203613, "logits/rejected": 0.5103408098220825, "logps/chosen": -97.52786254882812, "logps/rejected": -218.84869384765625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.6832572221755981, "rewards/margins": 9.930760383605957, "rewards/rejected": -10.614017486572266, "step": 136 }, { "epoch": 1.785016286644951, "grad_norm": 0.04638671875, "learning_rate": 4.267123287671233e-05, "logits/chosen": 0.6288174986839294, "logits/rejected": 0.6228695511817932, "logps/chosen": -118.99038696289062, "logps/rejected": -292.7908020019531, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.2574949264526367, "rewards/margins": 12.354877471923828, "rewards/rejected": -13.612371444702148, "step": 137 }, { "epoch": 1.798045602605863, "grad_norm": 0.330078125, "learning_rate": 4.2602739726027404e-05, "logits/chosen": 0.4609254002571106, "logits/rejected": 0.480663537979126, "logps/chosen": -87.55207824707031, "logps/rejected": -289.66162109375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.7838760614395142, "rewards/margins": 12.662479400634766, "rewards/rejected": -13.446355819702148, "step": 138 }, { "epoch": 1.8110749185667752, "grad_norm": 0.091796875, "learning_rate": 4.253424657534247e-05, "logits/chosen": 0.5592811703681946, "logits/rejected": 0.6325635313987732, "logps/chosen": -113.62852478027344, "logps/rejected": -291.84967041015625, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.9283789992332458, "rewards/margins": 12.165189743041992, "rewards/rejected": -13.093568801879883, "step": 139 }, { "epoch": 1.8241042345276872, "grad_norm": 0.1376953125, "learning_rate": 4.2465753424657536e-05, "logits/chosen": 0.5351129174232483, "logits/rejected": 0.5127934813499451, "logps/chosen": -173.83511352539062, "logps/rejected": -313.468994140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.008704662322998, "rewards/margins": 12.894453048706055, "rewards/rejected": -14.903158187866211, "step": 140 }, { "epoch": 1.8371335504885993, "grad_norm": 25.25, "learning_rate": 4.23972602739726e-05, "logits/chosen": 0.5461170673370361, "logits/rejected": 0.5241893529891968, "logps/chosen": -90.9225082397461, "logps/rejected": -266.9288635253906, "loss": 0.0711, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2498632669448853, "rewards/margins": 11.258042335510254, "rewards/rejected": -12.507905960083008, "step": 141 }, { "epoch": 1.8501628664495113, "grad_norm": 0.15625, "learning_rate": 4.232876712328767e-05, "logits/chosen": 0.4733356535434723, "logits/rejected": 0.5178252458572388, "logps/chosen": -120.46127319335938, "logps/rejected": -303.619384765625, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.1871830224990845, "rewards/margins": 12.890132904052734, "rewards/rejected": -14.077316284179688, "step": 142 }, { "epoch": 1.8631921824104234, "grad_norm": 0.2578125, "learning_rate": 4.226027397260274e-05, "logits/chosen": 0.48812466859817505, "logits/rejected": 0.6284564733505249, "logps/chosen": -94.5536880493164, "logps/rejected": -292.870849609375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.950133204460144, "rewards/margins": 13.727540969848633, "rewards/rejected": -14.677675247192383, "step": 143 }, { "epoch": 1.8762214983713354, "grad_norm": 0.0859375, "learning_rate": 4.219178082191781e-05, "logits/chosen": 0.6320376992225647, "logits/rejected": 0.6237307786941528, "logps/chosen": -152.7342529296875, "logps/rejected": -295.3027648925781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.494692325592041, "rewards/margins": 12.356241226196289, "rewards/rejected": -13.850934028625488, "step": 144 }, { "epoch": 1.8892508143322475, "grad_norm": 0.2265625, "learning_rate": 4.212328767123288e-05, "logits/chosen": 0.7280508279800415, "logits/rejected": 0.6899917125701904, "logps/chosen": -79.49422454833984, "logps/rejected": -290.1501770019531, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.7727007865905762, "rewards/margins": 12.538410186767578, "rewards/rejected": -13.311111450195312, "step": 145 }, { "epoch": 1.9022801302931596, "grad_norm": 0.875, "learning_rate": 4.2054794520547946e-05, "logits/chosen": 0.4206058382987976, "logits/rejected": 0.5227707624435425, "logps/chosen": -101.57917785644531, "logps/rejected": -279.0715637207031, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.9858956336975098, "rewards/margins": 12.150425910949707, "rewards/rejected": -13.136322021484375, "step": 146 }, { "epoch": 1.9153094462540716, "grad_norm": 1.4609375, "learning_rate": 4.198630136986302e-05, "logits/chosen": 0.5245968699455261, "logits/rejected": 0.6121017932891846, "logps/chosen": -116.4501953125, "logps/rejected": -281.0984802246094, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.5875823497772217, "rewards/margins": 12.053236961364746, "rewards/rejected": -13.640819549560547, "step": 147 }, { "epoch": 1.9283387622149837, "grad_norm": 0.431640625, "learning_rate": 4.1917808219178085e-05, "logits/chosen": 0.45093053579330444, "logits/rejected": 0.587200403213501, "logps/chosen": -104.39015197753906, "logps/rejected": -293.44232177734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.8772008419036865, "rewards/margins": 12.074963569641113, "rewards/rejected": -12.952163696289062, "step": 148 }, { "epoch": 1.9413680781758957, "grad_norm": 3.4375, "learning_rate": 4.184931506849315e-05, "logits/chosen": 0.48234254121780396, "logits/rejected": 0.5706640481948853, "logps/chosen": -147.8875732421875, "logps/rejected": -278.24407958984375, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.0798665285110474, "rewards/margins": 11.722947120666504, "rewards/rejected": -12.802812576293945, "step": 149 }, { "epoch": 1.9543973941368078, "grad_norm": 0.60546875, "learning_rate": 4.1780821917808224e-05, "logits/chosen": 0.5278698205947876, "logits/rejected": 0.635560154914856, "logps/chosen": -99.79202270507812, "logps/rejected": -271.11785888671875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.8048182725906372, "rewards/margins": 12.346576690673828, "rewards/rejected": -13.151394844055176, "step": 150 }, { "epoch": 1.9543973941368078, "eval_logits/chosen": 0.4638054668903351, "eval_logits/rejected": 0.5228009223937988, "eval_logps/chosen": -108.31918334960938, "eval_logps/rejected": -286.8623046875, "eval_loss": 0.007638773415237665, "eval_rewards/accuracies": 0.9955357313156128, "eval_rewards/chosen": -1.2864917516708374, "eval_rewards/margins": 12.290605545043945, "eval_rewards/rejected": -13.57709789276123, "eval_runtime": 52.2778, "eval_samples_per_second": 12.835, "eval_steps_per_second": 0.803, "step": 150 }, { "epoch": 1.9674267100977199, "grad_norm": 1.53125, "learning_rate": 4.171232876712329e-05, "logits/chosen": 0.5083509683609009, "logits/rejected": 0.6153576374053955, "logps/chosen": -86.2269515991211, "logps/rejected": -281.91888427734375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.8234192132949829, "rewards/margins": 12.487937927246094, "rewards/rejected": -13.311358451843262, "step": 151 }, { "epoch": 1.980456026058632, "grad_norm": 0.08740234375, "learning_rate": 4.1643835616438356e-05, "logits/chosen": 0.4471871554851532, "logits/rejected": 0.5222618579864502, "logps/chosen": -77.19293212890625, "logps/rejected": -279.3829040527344, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.7928668856620789, "rewards/margins": 12.986977577209473, "rewards/rejected": -13.779845237731934, "step": 152 }, { "epoch": 1.993485342019544, "grad_norm": 0.38671875, "learning_rate": 4.157534246575342e-05, "logits/chosen": 0.5125950574874878, "logits/rejected": 0.531832218170166, "logps/chosen": -89.48603057861328, "logps/rejected": -292.6934509277344, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.9772664308547974, "rewards/margins": 12.81619644165039, "rewards/rejected": -13.793462753295898, "step": 153 }, { "epoch": 2.006514657980456, "grad_norm": 0.09326171875, "learning_rate": 4.1506849315068495e-05, "logits/chosen": 0.5642431378364563, "logits/rejected": 0.6921492218971252, "logps/chosen": -109.61473083496094, "logps/rejected": -336.2562255859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.3835595846176147, "rewards/margins": 15.08292007446289, "rewards/rejected": -16.46647834777832, "step": 154 }, { "epoch": 2.019543973941368, "grad_norm": 0.455078125, "learning_rate": 4.143835616438356e-05, "logits/chosen": 0.4728472828865051, "logits/rejected": 0.5778607726097107, "logps/chosen": -113.82855987548828, "logps/rejected": -300.3656005859375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.0702670812606812, "rewards/margins": 13.495317459106445, "rewards/rejected": -14.565585136413574, "step": 155 }, { "epoch": 2.03257328990228, "grad_norm": 0.1611328125, "learning_rate": 4.1369863013698634e-05, "logits/chosen": 0.558509886264801, "logits/rejected": 0.5765538215637207, "logps/chosen": -96.08161163330078, "logps/rejected": -311.4420471191406, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.7950121760368347, "rewards/margins": 14.138938903808594, "rewards/rejected": -14.933950424194336, "step": 156 }, { "epoch": 2.045602605863192, "grad_norm": 0.23828125, "learning_rate": 4.13013698630137e-05, "logits/chosen": 0.5611923933029175, "logits/rejected": 0.5538697242736816, "logps/chosen": -118.36637878417969, "logps/rejected": -269.89837646484375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.5846986770629883, "rewards/margins": 11.414068222045898, "rewards/rejected": -12.998766899108887, "step": 157 }, { "epoch": 2.0586319218241043, "grad_norm": 0.240234375, "learning_rate": 4.123287671232877e-05, "logits/chosen": 0.5009916424751282, "logits/rejected": 0.5371646881103516, "logps/chosen": -100.47499084472656, "logps/rejected": -283.9187316894531, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.0149474143981934, "rewards/margins": 11.762290000915527, "rewards/rejected": -12.777236938476562, "step": 158 }, { "epoch": 2.0716612377850163, "grad_norm": 0.1591796875, "learning_rate": 4.116438356164384e-05, "logits/chosen": 0.6033108830451965, "logits/rejected": 0.6458787322044373, "logps/chosen": -118.35772705078125, "logps/rejected": -342.5250244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.5086966753005981, "rewards/margins": 15.292023658752441, "rewards/rejected": -16.80072021484375, "step": 159 }, { "epoch": 2.0846905537459284, "grad_norm": 0.29296875, "learning_rate": 4.1095890410958905e-05, "logits/chosen": 0.5724061131477356, "logits/rejected": 0.6467206478118896, "logps/chosen": -95.32568359375, "logps/rejected": -270.0829772949219, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.0148924589157104, "rewards/margins": 12.349864959716797, "rewards/rejected": -13.364758491516113, "step": 160 }, { "epoch": 2.0977198697068404, "grad_norm": 0.1328125, "learning_rate": 4.102739726027398e-05, "logits/chosen": 0.36649227142333984, "logits/rejected": 0.4759945273399353, "logps/chosen": -79.16898345947266, "logps/rejected": -256.05426025390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.5944907069206238, "rewards/margins": 11.262916564941406, "rewards/rejected": -11.85740852355957, "step": 161 }, { "epoch": 2.1107491856677525, "grad_norm": 0.115234375, "learning_rate": 4.0958904109589044e-05, "logits/chosen": 0.4255332350730896, "logits/rejected": 0.5424034595489502, "logps/chosen": -146.3050079345703, "logps/rejected": -328.6482849121094, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.6175340414047241, "rewards/margins": 14.242683410644531, "rewards/rejected": -15.860215187072754, "step": 162 }, { "epoch": 2.1237785016286646, "grad_norm": 6.78125, "learning_rate": 4.089041095890411e-05, "logits/chosen": 0.5109447836875916, "logits/rejected": 0.5712834596633911, "logps/chosen": -125.36318969726562, "logps/rejected": -287.7838134765625, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -1.4990017414093018, "rewards/margins": 12.238770484924316, "rewards/rejected": -13.737771987915039, "step": 163 }, { "epoch": 2.1368078175895766, "grad_norm": 0.054931640625, "learning_rate": 4.0821917808219176e-05, "logits/chosen": 0.5000830888748169, "logits/rejected": 0.5245240926742554, "logps/chosen": -97.7026596069336, "logps/rejected": -304.09375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.1258344650268555, "rewards/margins": 12.807943344116211, "rewards/rejected": -13.9337797164917, "step": 164 }, { "epoch": 2.1498371335504887, "grad_norm": 0.1376953125, "learning_rate": 4.075342465753425e-05, "logits/chosen": 0.4336688816547394, "logits/rejected": 0.6021983623504639, "logps/chosen": -106.12345123291016, "logps/rejected": -288.62469482421875, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.3127267360687256, "rewards/margins": 12.904582977294922, "rewards/rejected": -14.217309951782227, "step": 165 }, { "epoch": 2.1628664495114007, "grad_norm": 0.036865234375, "learning_rate": 4.0684931506849315e-05, "logits/chosen": 0.4477992355823517, "logits/rejected": 0.5709498524665833, "logps/chosen": -108.85577392578125, "logps/rejected": -285.3506164550781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.2774969339370728, "rewards/margins": 11.801679611206055, "rewards/rejected": -13.07917594909668, "step": 166 }, { "epoch": 2.175895765472313, "grad_norm": 0.037841796875, "learning_rate": 4.061643835616439e-05, "logits/chosen": 0.452865868806839, "logits/rejected": 0.5479907989501953, "logps/chosen": -110.41411590576172, "logps/rejected": -293.05035400390625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.1979793310165405, "rewards/margins": 13.181974411010742, "rewards/rejected": -14.37995433807373, "step": 167 }, { "epoch": 2.188925081433225, "grad_norm": 0.040771484375, "learning_rate": 4.0547945205479454e-05, "logits/chosen": 0.4804653823375702, "logits/rejected": 0.5071645379066467, "logps/chosen": -93.72543334960938, "logps/rejected": -326.3215637207031, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.0895832777023315, "rewards/margins": 15.001323699951172, "rewards/rejected": -16.090906143188477, "step": 168 }, { "epoch": 2.201954397394137, "grad_norm": 0.1396484375, "learning_rate": 4.047945205479452e-05, "logits/chosen": 0.3976234197616577, "logits/rejected": 0.5127770304679871, "logps/chosen": -86.84957122802734, "logps/rejected": -272.968505859375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.7831870913505554, "rewards/margins": 12.802996635437012, "rewards/rejected": -13.58618450164795, "step": 169 }, { "epoch": 2.214983713355049, "grad_norm": 0.1044921875, "learning_rate": 4.041095890410959e-05, "logits/chosen": 0.4045504927635193, "logits/rejected": 0.45465028285980225, "logps/chosen": -105.28460693359375, "logps/rejected": -309.6754150390625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.0679643154144287, "rewards/margins": 14.03729248046875, "rewards/rejected": -15.105257987976074, "step": 170 }, { "epoch": 2.228013029315961, "grad_norm": 0.03759765625, "learning_rate": 4.034246575342466e-05, "logits/chosen": 0.4175838530063629, "logits/rejected": 0.5390201210975647, "logps/chosen": -95.82322692871094, "logps/rejected": -332.502685546875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.0852179527282715, "rewards/margins": 14.905830383300781, "rewards/rejected": -15.991046905517578, "step": 171 }, { "epoch": 2.241042345276873, "grad_norm": 0.06396484375, "learning_rate": 4.027397260273973e-05, "logits/chosen": 0.48719215393066406, "logits/rejected": 0.5657703876495361, "logps/chosen": -88.64961242675781, "logps/rejected": -275.7567138671875, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.9338966012001038, "rewards/margins": 13.001708030700684, "rewards/rejected": -13.935606002807617, "step": 172 }, { "epoch": 2.254071661237785, "grad_norm": 0.0966796875, "learning_rate": 4.02054794520548e-05, "logits/chosen": 0.5867688655853271, "logits/rejected": 0.6384550333023071, "logps/chosen": -110.77032470703125, "logps/rejected": -328.6289367675781, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.4720832109451294, "rewards/margins": 14.248076438903809, "rewards/rejected": -15.720159530639648, "step": 173 }, { "epoch": 2.267100977198697, "grad_norm": 0.039794921875, "learning_rate": 4.0136986301369864e-05, "logits/chosen": 0.4327799677848816, "logits/rejected": 0.4705524742603302, "logps/chosen": -105.45439147949219, "logps/rejected": -319.2513122558594, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8988175392150879, "rewards/margins": 14.017317771911621, "rewards/rejected": -14.916135787963867, "step": 174 }, { "epoch": 2.2801302931596092, "grad_norm": 0.09619140625, "learning_rate": 4.006849315068493e-05, "logits/chosen": 0.5131232738494873, "logits/rejected": 0.5097309947013855, "logps/chosen": -120.355712890625, "logps/rejected": -296.6656494140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.3266019821166992, "rewards/margins": 12.798480987548828, "rewards/rejected": -14.125082969665527, "step": 175 }, { "epoch": 2.2931596091205213, "grad_norm": 0.2255859375, "learning_rate": 4e-05, "logits/chosen": 0.4963986575603485, "logits/rejected": 0.5654538869857788, "logps/chosen": -119.40376281738281, "logps/rejected": -269.1568908691406, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.111502766609192, "rewards/margins": 11.538202285766602, "rewards/rejected": -12.64970588684082, "step": 176 }, { "epoch": 2.3061889250814334, "grad_norm": 0.201171875, "learning_rate": 3.993150684931507e-05, "logits/chosen": 0.5080669522285461, "logits/rejected": 0.4891076385974884, "logps/chosen": -112.92520141601562, "logps/rejected": -291.1544189453125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.3876513242721558, "rewards/margins": 11.71539306640625, "rewards/rejected": -13.103044509887695, "step": 177 }, { "epoch": 2.3192182410423454, "grad_norm": 0.2138671875, "learning_rate": 3.9863013698630135e-05, "logits/chosen": 0.4692964553833008, "logits/rejected": 0.5622753500938416, "logps/chosen": -92.26762390136719, "logps/rejected": -267.98675537109375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.8580554723739624, "rewards/margins": 12.327470779418945, "rewards/rejected": -13.185525894165039, "step": 178 }, { "epoch": 2.3322475570032575, "grad_norm": 9.25, "learning_rate": 3.979452054794521e-05, "logits/chosen": 0.5638495683670044, "logits/rejected": 0.5911377668380737, "logps/chosen": -117.00182342529297, "logps/rejected": -285.2914123535156, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": -1.443804144859314, "rewards/margins": 11.933603286743164, "rewards/rejected": -13.377408981323242, "step": 179 }, { "epoch": 2.3452768729641695, "grad_norm": 0.043212890625, "learning_rate": 3.9726027397260274e-05, "logits/chosen": 0.4331457316875458, "logits/rejected": 0.5054813623428345, "logps/chosen": -114.8367919921875, "logps/rejected": -263.3021240234375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.0023431777954102, "rewards/margins": 11.675691604614258, "rewards/rejected": -12.678034782409668, "step": 180 }, { "epoch": 2.3583061889250816, "grad_norm": 0.2177734375, "learning_rate": 3.965753424657535e-05, "logits/chosen": 0.614739716053009, "logits/rejected": 0.6245816946029663, "logps/chosen": -94.85420989990234, "logps/rejected": -277.0835266113281, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.3436460494995117, "rewards/margins": 11.414693832397461, "rewards/rejected": -12.758339881896973, "step": 181 }, { "epoch": 2.3713355048859937, "grad_norm": 0.134765625, "learning_rate": 3.958904109589041e-05, "logits/chosen": 0.5919771790504456, "logits/rejected": 0.61507648229599, "logps/chosen": -69.8411865234375, "logps/rejected": -272.3177795410156, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -0.573222279548645, "rewards/margins": 12.539608001708984, "rewards/rejected": -13.112829208374023, "step": 182 }, { "epoch": 2.3843648208469057, "grad_norm": 0.0966796875, "learning_rate": 3.952054794520548e-05, "logits/chosen": 0.48881152272224426, "logits/rejected": 0.5776315927505493, "logps/chosen": -89.60847473144531, "logps/rejected": -293.9697265625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.0633786916732788, "rewards/margins": 13.587398529052734, "rewards/rejected": -14.650779724121094, "step": 183 }, { "epoch": 2.3973941368078178, "grad_norm": 0.07470703125, "learning_rate": 3.9452054794520546e-05, "logits/chosen": 0.6034122705459595, "logits/rejected": 0.5341907739639282, "logps/chosen": -82.32555389404297, "logps/rejected": -266.908203125, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -0.8900930881500244, "rewards/margins": 12.200946807861328, "rewards/rejected": -13.091039657592773, "step": 184 }, { "epoch": 2.41042345276873, "grad_norm": 0.039306640625, "learning_rate": 3.938356164383562e-05, "logits/chosen": 0.5521727800369263, "logits/rejected": 0.6301867365837097, "logps/chosen": -98.17955017089844, "logps/rejected": -288.569580078125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.1262962818145752, "rewards/margins": 11.977638244628906, "rewards/rejected": -13.103934288024902, "step": 185 }, { "epoch": 2.423452768729642, "grad_norm": 0.0166015625, "learning_rate": 3.9315068493150684e-05, "logits/chosen": 0.5002225041389465, "logits/rejected": 0.595288097858429, "logps/chosen": -96.44597625732422, "logps/rejected": -270.15771484375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.1506391763687134, "rewards/margins": 12.394613265991211, "rewards/rejected": -13.545251846313477, "step": 186 }, { "epoch": 2.436482084690554, "grad_norm": 0.1865234375, "learning_rate": 3.924657534246576e-05, "logits/chosen": 0.5442834496498108, "logits/rejected": 0.5952669978141785, "logps/chosen": -104.47047424316406, "logps/rejected": -306.7992248535156, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.0476319789886475, "rewards/margins": 13.874088287353516, "rewards/rejected": -14.921720504760742, "step": 187 }, { "epoch": 2.449511400651466, "grad_norm": 0.138671875, "learning_rate": 3.9178082191780823e-05, "logits/chosen": 0.38490670919418335, "logits/rejected": 0.6002693176269531, "logps/chosen": -87.23043823242188, "logps/rejected": -338.7787170410156, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -0.8123894929885864, "rewards/margins": 16.088157653808594, "rewards/rejected": -16.90054702758789, "step": 188 }, { "epoch": 2.462540716612378, "grad_norm": 0.267578125, "learning_rate": 3.910958904109589e-05, "logits/chosen": 0.4915946125984192, "logits/rejected": 0.5476264953613281, "logps/chosen": -82.17195892333984, "logps/rejected": -279.19854736328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.7035698890686035, "rewards/margins": 12.01541519165039, "rewards/rejected": -12.718984603881836, "step": 189 }, { "epoch": 2.47557003257329, "grad_norm": 0.078125, "learning_rate": 3.904109589041096e-05, "logits/chosen": 0.5442248582839966, "logits/rejected": 0.5692495107650757, "logps/chosen": -118.85929870605469, "logps/rejected": -289.40765380859375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.2475743293762207, "rewards/margins": 12.606383323669434, "rewards/rejected": -13.853957176208496, "step": 190 }, { "epoch": 2.488599348534202, "grad_norm": 0.3046875, "learning_rate": 3.897260273972603e-05, "logits/chosen": 0.5258509516716003, "logits/rejected": 0.6596174240112305, "logps/chosen": -131.38265991210938, "logps/rejected": -283.6547546386719, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.4643476009368896, "rewards/margins": 11.995124816894531, "rewards/rejected": -13.45947265625, "step": 191 }, { "epoch": 2.5016286644951142, "grad_norm": 0.0108642578125, "learning_rate": 3.89041095890411e-05, "logits/chosen": 0.4301671087741852, "logits/rejected": 0.5925787091255188, "logps/chosen": -98.11710357666016, "logps/rejected": -325.28521728515625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.8909515738487244, "rewards/margins": 15.353211402893066, "rewards/rejected": -16.24416160583496, "step": 192 }, { "epoch": 2.5146579804560263, "grad_norm": 0.2021484375, "learning_rate": 3.883561643835617e-05, "logits/chosen": 0.5148497819900513, "logits/rejected": 0.5551873445510864, "logps/chosen": -97.75564575195312, "logps/rejected": -261.85284423828125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.9065383672714233, "rewards/margins": 12.213420867919922, "rewards/rejected": -13.119958877563477, "step": 193 }, { "epoch": 2.5276872964169383, "grad_norm": 0.036865234375, "learning_rate": 3.8767123287671233e-05, "logits/chosen": 0.49658170342445374, "logits/rejected": 0.5841426849365234, "logps/chosen": -129.8172149658203, "logps/rejected": -320.80657958984375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.4361586570739746, "rewards/margins": 13.985431671142578, "rewards/rejected": -15.421590805053711, "step": 194 }, { "epoch": 2.5407166123778504, "grad_norm": 0.032470703125, "learning_rate": 3.86986301369863e-05, "logits/chosen": 0.40110084414482117, "logits/rejected": 0.4429419934749603, "logps/chosen": -110.05766296386719, "logps/rejected": -279.5133056640625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.49040687084198, "rewards/margins": 12.111815452575684, "rewards/rejected": -13.602222442626953, "step": 195 }, { "epoch": 2.5537459283387625, "grad_norm": 0.047607421875, "learning_rate": 3.863013698630137e-05, "logits/chosen": 0.3707536458969116, "logits/rejected": 0.4637380838394165, "logps/chosen": -111.06605529785156, "logps/rejected": -331.0019836425781, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.4405083656311035, "rewards/margins": 14.104761123657227, "rewards/rejected": -15.545269966125488, "step": 196 }, { "epoch": 2.5667752442996745, "grad_norm": 0.1787109375, "learning_rate": 3.856164383561644e-05, "logits/chosen": 0.4742357134819031, "logits/rejected": 0.5186038613319397, "logps/chosen": -102.65884399414062, "logps/rejected": -286.2248229980469, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.2666797637939453, "rewards/margins": 12.227950096130371, "rewards/rejected": -13.494630813598633, "step": 197 }, { "epoch": 2.5798045602605866, "grad_norm": 3.21875, "learning_rate": 3.8493150684931505e-05, "logits/chosen": 0.5423088073730469, "logits/rejected": 0.5629587173461914, "logps/chosen": -116.73429870605469, "logps/rejected": -314.3695068359375, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -1.7822774648666382, "rewards/margins": 12.672046661376953, "rewards/rejected": -14.454323768615723, "step": 198 }, { "epoch": 2.5928338762214986, "grad_norm": 0.032470703125, "learning_rate": 3.842465753424658e-05, "logits/chosen": 0.3940759599208832, "logits/rejected": 0.5198019742965698, "logps/chosen": -129.81735229492188, "logps/rejected": -329.01812744140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.2780612707138062, "rewards/margins": 14.856027603149414, "rewards/rejected": -16.13408851623535, "step": 199 }, { "epoch": 2.6058631921824107, "grad_norm": 0.0235595703125, "learning_rate": 3.8356164383561644e-05, "logits/chosen": 0.4287755489349365, "logits/rejected": 0.49127259850502014, "logps/chosen": -88.73255920410156, "logps/rejected": -294.54254150390625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -0.845811128616333, "rewards/margins": 13.853937149047852, "rewards/rejected": -14.699748992919922, "step": 200 }, { "epoch": 2.6058631921824107, "eval_logits/chosen": 0.41032981872558594, "eval_logits/rejected": 0.4839063882827759, "eval_logps/chosen": -110.02198791503906, "eval_logps/rejected": -293.51873779296875, "eval_loss": 0.00859944336116314, "eval_rewards/accuracies": 0.9955357313156128, "eval_rewards/chosen": -1.456769585609436, "eval_rewards/margins": 12.785966873168945, "eval_rewards/rejected": -14.24273681640625, "eval_runtime": 52.2735, "eval_samples_per_second": 12.836, "eval_steps_per_second": 0.803, "step": 200 } ], "logging_steps": 1.0, "max_steps": 760, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 200, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }