diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13221 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 14.915254237288135, + "eval_steps": 1, + "global_step": 880, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01694915254237288, + "grad_norm": 40.30728845506363, + "learning_rate": 4.2372881355932205e-09, + "logits/chosen": 12.842013359069824, + "logits/rejected": 13.082613945007324, + "logps/chosen": -18.68050193786621, + "logps/rejected": -30.006702423095703, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.03389830508474576, + "grad_norm": 38.041564508830845, + "learning_rate": 8.474576271186441e-09, + "logits/chosen": 10.079428672790527, + "logits/rejected": 10.317561149597168, + "logps/chosen": -20.233402252197266, + "logps/rejected": -21.939817428588867, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.05084745762711865, + "grad_norm": 40.36008813830181, + "learning_rate": 1.2711864406779661e-08, + "logits/chosen": 14.052921295166016, + "logits/rejected": 15.504006385803223, + "logps/chosen": -16.064619064331055, + "logps/rejected": -29.048044204711914, + "loss": 0.6998, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.026119917631149292, + "rewards/margins": -0.0660426914691925, + "rewards/rejected": 0.03992277383804321, + "step": 3 + }, + { + "epoch": 0.06779661016949153, + "grad_norm": 36.63114570098774, + "learning_rate": 1.6949152542372882e-08, + "logits/chosen": 11.676168441772461, + "logits/rejected": 12.226595878601074, + "logps/chosen": -15.098368644714355, + "logps/rejected": -23.02960205078125, + "loss": 0.7011, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0063827671110630035, + "rewards/margins": 0.014015134423971176, + "rewards/rejected": -0.02039790153503418, + "step": 4 + }, + { + "epoch": 0.0847457627118644, + "grad_norm": 42.846639257495035, + "learning_rate": 2.11864406779661e-08, + "logits/chosen": 13.691095352172852, + "logits/rejected": 13.234848976135254, + "logps/chosen": -18.44330596923828, + "logps/rejected": -15.939766883850098, + "loss": 0.7118, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05990014597773552, + "rewards/margins": 0.0980239287018776, + "rewards/rejected": -0.038123778998851776, + "step": 5 + }, + { + "epoch": 0.1016949152542373, + "grad_norm": 41.95119341285687, + "learning_rate": 2.5423728813559323e-08, + "logits/chosen": 9.710973739624023, + "logits/rejected": 9.734879493713379, + "logps/chosen": -19.200092315673828, + "logps/rejected": -23.185443878173828, + "loss": 0.707, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.014187633991241455, + "rewards/margins": 0.03218716382980347, + "rewards/rejected": -0.01799952983856201, + "step": 6 + }, + { + "epoch": 0.11864406779661017, + "grad_norm": 42.934384992123036, + "learning_rate": 2.966101694915254e-08, + "logits/chosen": 12.107307434082031, + "logits/rejected": 12.784415245056152, + "logps/chosen": -16.704126358032227, + "logps/rejected": -20.03870391845703, + "loss": 0.7033, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06281542778015137, + "rewards/margins": 0.17079591751098633, + "rewards/rejected": -0.10798049718141556, + "step": 7 + }, + { + "epoch": 0.13559322033898305, + "grad_norm": 37.640968099147095, + "learning_rate": 3.3898305084745764e-08, + "logits/chosen": 12.114370346069336, + "logits/rejected": 12.843629837036133, + "logps/chosen": -14.356505393981934, + "logps/rejected": -23.001556396484375, + "loss": 0.7016, + "rewards/accuracies": 0.1875, + "rewards/chosen": -0.04248759523034096, + "rewards/margins": -0.09302316606044769, + "rewards/rejected": 0.050535574555397034, + "step": 8 + }, + { + "epoch": 0.15254237288135594, + "grad_norm": 48.59143625607344, + "learning_rate": 3.813559322033898e-08, + "logits/chosen": 13.338932991027832, + "logits/rejected": 13.014775276184082, + "logps/chosen": -20.295446395874023, + "logps/rejected": -17.377910614013672, + "loss": 0.7236, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.012785300612449646, + "rewards/margins": 0.025210216641426086, + "rewards/rejected": -0.01242491602897644, + "step": 9 + }, + { + "epoch": 0.1694915254237288, + "grad_norm": 42.21481624311643, + "learning_rate": 4.23728813559322e-08, + "logits/chosen": 10.676657676696777, + "logits/rejected": 10.866190910339355, + "logps/chosen": -22.40692138671875, + "logps/rejected": -17.18635368347168, + "loss": 0.7366, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.0834585577249527, + "rewards/margins": -0.16166189312934875, + "rewards/rejected": 0.07820333540439606, + "step": 10 + }, + { + "epoch": 0.1864406779661017, + "grad_norm": 44.24683815055113, + "learning_rate": 4.661016949152542e-08, + "logits/chosen": 11.689413070678711, + "logits/rejected": 12.616247177124023, + "logps/chosen": -16.787853240966797, + "logps/rejected": -25.714006423950195, + "loss": 0.7008, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.051930248737335205, + "rewards/margins": -0.04377424716949463, + "rewards/rejected": 0.09570449590682983, + "step": 11 + }, + { + "epoch": 0.2033898305084746, + "grad_norm": 36.42082486581824, + "learning_rate": 5.0847457627118645e-08, + "logits/chosen": 10.625275611877441, + "logits/rejected": 10.755789756774902, + "logps/chosen": -13.972813606262207, + "logps/rejected": -18.619009017944336, + "loss": 0.6921, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.0011137649416923523, + "rewards/margins": -0.010056130588054657, + "rewards/rejected": 0.008942365646362305, + "step": 12 + }, + { + "epoch": 0.22033898305084745, + "grad_norm": 39.783625967921616, + "learning_rate": 5.508474576271186e-08, + "logits/chosen": 11.280006408691406, + "logits/rejected": 12.170235633850098, + "logps/chosen": -15.202749252319336, + "logps/rejected": -19.611961364746094, + "loss": 0.7027, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.04294466972351074, + "rewards/margins": -0.056273579597473145, + "rewards/rejected": 0.013328909873962402, + "step": 13 + }, + { + "epoch": 0.23728813559322035, + "grad_norm": 43.360099635584334, + "learning_rate": 5.932203389830508e-08, + "logits/chosen": 7.622957706451416, + "logits/rejected": 9.428025245666504, + "logps/chosen": -18.667062759399414, + "logps/rejected": -33.28583908081055, + "loss": 0.6832, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.021295249462127686, + "rewards/margins": 0.09253796935081482, + "rewards/rejected": -0.07124271988868713, + "step": 14 + }, + { + "epoch": 0.2542372881355932, + "grad_norm": 39.17087374081227, + "learning_rate": 6.35593220338983e-08, + "logits/chosen": 15.24155044555664, + "logits/rejected": 16.208242416381836, + "logps/chosen": -13.249297142028809, + "logps/rejected": -23.505441665649414, + "loss": 0.7163, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.02330087125301361, + "rewards/margins": -0.07975521683692932, + "rewards/rejected": 0.05645434558391571, + "step": 15 + }, + { + "epoch": 0.2711864406779661, + "grad_norm": 37.50466933990464, + "learning_rate": 6.779661016949153e-08, + "logits/chosen": 14.139371871948242, + "logits/rejected": 14.819007873535156, + "logps/chosen": -16.831674575805664, + "logps/rejected": -22.343116760253906, + "loss": 0.6819, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.03721078485250473, + "rewards/margins": -0.026429735124111176, + "rewards/rejected": -0.010781049728393555, + "step": 16 + }, + { + "epoch": 0.288135593220339, + "grad_norm": 39.00712547599067, + "learning_rate": 7.203389830508475e-08, + "logits/chosen": 14.457000732421875, + "logits/rejected": 14.57475757598877, + "logps/chosen": -13.015029907226562, + "logps/rejected": -21.924034118652344, + "loss": 0.6667, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08990475535392761, + "rewards/margins": 0.14169014990329742, + "rewards/rejected": -0.05178540199995041, + "step": 17 + }, + { + "epoch": 0.3050847457627119, + "grad_norm": 44.410867327593046, + "learning_rate": 7.627118644067796e-08, + "logits/chosen": 12.978434562683105, + "logits/rejected": 13.062633514404297, + "logps/chosen": -12.442479133605957, + "logps/rejected": -17.96292495727539, + "loss": 0.7077, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.06292188912630081, + "rewards/margins": -0.007752574980258942, + "rewards/rejected": 0.07067446410655975, + "step": 18 + }, + { + "epoch": 0.3220338983050847, + "grad_norm": 38.86515880597673, + "learning_rate": 8.050847457627117e-08, + "logits/chosen": 10.975082397460938, + "logits/rejected": 11.75900936126709, + "logps/chosen": -11.997967720031738, + "logps/rejected": -24.670589447021484, + "loss": 0.7101, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.018813543021678925, + "rewards/margins": -0.012066647410392761, + "rewards/rejected": -0.006746895611286163, + "step": 19 + }, + { + "epoch": 0.3389830508474576, + "grad_norm": 37.07719478540159, + "learning_rate": 8.47457627118644e-08, + "logits/chosen": 13.550518989562988, + "logits/rejected": 13.391486167907715, + "logps/chosen": -17.554624557495117, + "logps/rejected": -20.00838279724121, + "loss": 0.693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.066666379570961, + "rewards/margins": 0.024734124541282654, + "rewards/rejected": 0.041932255029678345, + "step": 20 + }, + { + "epoch": 0.3559322033898305, + "grad_norm": 38.29971669543194, + "learning_rate": 8.898305084745762e-08, + "logits/chosen": 13.924101829528809, + "logits/rejected": 14.325352668762207, + "logps/chosen": -14.366829872131348, + "logps/rejected": -21.499202728271484, + "loss": 0.7111, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.044451721012592316, + "rewards/margins": -0.024806134402751923, + "rewards/rejected": 0.06925785541534424, + "step": 21 + }, + { + "epoch": 0.3728813559322034, + "grad_norm": 41.353983577755464, + "learning_rate": 9.322033898305084e-08, + "logits/chosen": 9.174437522888184, + "logits/rejected": 9.49307918548584, + "logps/chosen": -13.705348014831543, + "logps/rejected": -17.275392532348633, + "loss": 0.7085, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.0074497610330581665, + "rewards/margins": -0.07652243971824646, + "rewards/rejected": 0.0690726786851883, + "step": 22 + }, + { + "epoch": 0.3898305084745763, + "grad_norm": 43.114227330572284, + "learning_rate": 9.745762711864407e-08, + "logits/chosen": 12.173064231872559, + "logits/rejected": 12.370023727416992, + "logps/chosen": -16.37883758544922, + "logps/rejected": -20.038183212280273, + "loss": 0.723, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01693063974380493, + "rewards/margins": 0.010861068964004517, + "rewards/rejected": -0.027791708707809448, + "step": 23 + }, + { + "epoch": 0.4067796610169492, + "grad_norm": 38.83032139356952, + "learning_rate": 1.0169491525423729e-07, + "logits/chosen": 15.312846183776855, + "logits/rejected": 15.3582181930542, + "logps/chosen": -20.627126693725586, + "logps/rejected": -27.445995330810547, + "loss": 0.6751, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07566741108894348, + "rewards/margins": 0.06957697868347168, + "rewards/rejected": 0.006090432405471802, + "step": 24 + }, + { + "epoch": 0.423728813559322, + "grad_norm": 43.67807182666561, + "learning_rate": 1.059322033898305e-07, + "logits/chosen": 11.813891410827637, + "logits/rejected": 11.837639808654785, + "logps/chosen": -21.68389892578125, + "logps/rejected": -25.150630950927734, + "loss": 0.6891, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.08590184897184372, + "rewards/margins": 0.07415612787008286, + "rewards/rejected": 0.011745721101760864, + "step": 25 + }, + { + "epoch": 0.4406779661016949, + "grad_norm": 41.746172744282724, + "learning_rate": 1.1016949152542372e-07, + "logits/chosen": 10.105422973632812, + "logits/rejected": 10.882453918457031, + "logps/chosen": -14.849566459655762, + "logps/rejected": -21.765117645263672, + "loss": 0.7372, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004259124398231506, + "rewards/margins": 0.01350797712802887, + "rewards/rejected": -0.009248852729797363, + "step": 26 + }, + { + "epoch": 0.4576271186440678, + "grad_norm": 42.56394623827792, + "learning_rate": 1.1440677966101695e-07, + "logits/chosen": 13.506670951843262, + "logits/rejected": 13.843586921691895, + "logps/chosen": -19.2850399017334, + "logps/rejected": -22.344873428344727, + "loss": 0.7004, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.05585460364818573, + "rewards/margins": -0.10725802183151245, + "rewards/rejected": 0.05140341818332672, + "step": 27 + }, + { + "epoch": 0.4745762711864407, + "grad_norm": 36.639123230654995, + "learning_rate": 1.1864406779661017e-07, + "logits/chosen": 14.286264419555664, + "logits/rejected": 14.345661163330078, + "logps/chosen": -15.126670837402344, + "logps/rejected": -21.23917007446289, + "loss": 0.6815, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.012283587828278542, + "rewards/margins": 0.06251242011785507, + "rewards/rejected": -0.05022883415222168, + "step": 28 + }, + { + "epoch": 0.4915254237288136, + "grad_norm": 43.447579941943744, + "learning_rate": 1.228813559322034e-07, + "logits/chosen": 12.693879127502441, + "logits/rejected": 12.80766773223877, + "logps/chosen": -16.297504425048828, + "logps/rejected": -23.56643295288086, + "loss": 0.6999, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.020490556955337524, + "rewards/margins": 0.07709893584251404, + "rewards/rejected": -0.09758949279785156, + "step": 29 + }, + { + "epoch": 0.5084745762711864, + "grad_norm": 41.412620150940505, + "learning_rate": 1.271186440677966e-07, + "logits/chosen": 10.882184982299805, + "logits/rejected": 11.457561492919922, + "logps/chosen": -18.117883682250977, + "logps/rejected": -24.392854690551758, + "loss": 0.6949, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.05693977326154709, + "rewards/margins": -0.06227093189954758, + "rewards/rejected": 0.005331158638000488, + "step": 30 + }, + { + "epoch": 0.5254237288135594, + "grad_norm": 41.04485757895064, + "learning_rate": 1.3135593220338984e-07, + "logits/chosen": 12.045116424560547, + "logits/rejected": 12.250951766967773, + "logps/chosen": -18.459415435791016, + "logps/rejected": -24.026748657226562, + "loss": 0.7062, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.022956043481826782, + "rewards/margins": -0.023535877466201782, + "rewards/rejected": 0.046491920948028564, + "step": 31 + }, + { + "epoch": 0.5423728813559322, + "grad_norm": 76.97626958402756, + "learning_rate": 1.3559322033898305e-07, + "logits/chosen": 12.790939331054688, + "logits/rejected": 12.999430656433105, + "logps/chosen": -18.866657257080078, + "logps/rejected": -24.532800674438477, + "loss": 0.7016, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.008551269769668579, + "rewards/margins": -0.07439977675676346, + "rewards/rejected": 0.06584850698709488, + "step": 32 + }, + { + "epoch": 0.559322033898305, + "grad_norm": 41.233124075932956, + "learning_rate": 1.3983050847457625e-07, + "logits/chosen": 11.019987106323242, + "logits/rejected": 11.895480155944824, + "logps/chosen": -17.390188217163086, + "logps/rejected": -27.735076904296875, + "loss": 0.7024, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.07621224224567413, + "rewards/margins": -0.00863146036863327, + "rewards/rejected": 0.0848436951637268, + "step": 33 + }, + { + "epoch": 0.576271186440678, + "grad_norm": 41.51281273691135, + "learning_rate": 1.440677966101695e-07, + "logits/chosen": 10.961234092712402, + "logits/rejected": 11.117724418640137, + "logps/chosen": -15.324450492858887, + "logps/rejected": -22.6516056060791, + "loss": 0.7064, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.03333364427089691, + "rewards/margins": -0.046573787927627563, + "rewards/rejected": 0.013240143656730652, + "step": 34 + }, + { + "epoch": 0.5932203389830508, + "grad_norm": 38.46980857022322, + "learning_rate": 1.483050847457627e-07, + "logits/chosen": 11.720096588134766, + "logits/rejected": 13.81384563446045, + "logps/chosen": -11.844643592834473, + "logps/rejected": -26.574565887451172, + "loss": 0.7136, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.01212693378329277, + "rewards/margins": -0.1099301129579544, + "rewards/rejected": 0.09780317544937134, + "step": 35 + }, + { + "epoch": 0.6101694915254238, + "grad_norm": 36.14892188798069, + "learning_rate": 1.5254237288135593e-07, + "logits/chosen": 13.399576187133789, + "logits/rejected": 12.95138168334961, + "logps/chosen": -13.442176818847656, + "logps/rejected": -18.09130859375, + "loss": 0.6871, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.01939527690410614, + "rewards/margins": -0.010730020701885223, + "rewards/rejected": 0.030125297605991364, + "step": 36 + }, + { + "epoch": 0.6271186440677966, + "grad_norm": 40.851396852891206, + "learning_rate": 1.5677966101694915e-07, + "logits/chosen": 14.05972671508789, + "logits/rejected": 14.02302360534668, + "logps/chosen": -19.48711395263672, + "logps/rejected": -20.447059631347656, + "loss": 0.703, + "rewards/accuracies": 0.3125, + "rewards/chosen": -0.016420789062976837, + "rewards/margins": -0.0325637087225914, + "rewards/rejected": 0.016142919659614563, + "step": 37 + }, + { + "epoch": 0.6440677966101694, + "grad_norm": 37.67420037057592, + "learning_rate": 1.6101694915254234e-07, + "logits/chosen": 14.11151123046875, + "logits/rejected": 15.1259765625, + "logps/chosen": -19.008934020996094, + "logps/rejected": -34.84490966796875, + "loss": 0.6951, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.008656233549118042, + "rewards/margins": -0.0032239556312561035, + "rewards/rejected": 0.011880189180374146, + "step": 38 + }, + { + "epoch": 0.6610169491525424, + "grad_norm": 37.70147259149854, + "learning_rate": 1.6525423728813559e-07, + "logits/chosen": 11.573564529418945, + "logits/rejected": 12.407613754272461, + "logps/chosen": -25.639665603637695, + "logps/rejected": -27.497907638549805, + "loss": 0.7096, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.07243013381958008, + "rewards/margins": 0.015660464763641357, + "rewards/rejected": -0.08809059858322144, + "step": 39 + }, + { + "epoch": 0.6779661016949152, + "grad_norm": 37.72600351271015, + "learning_rate": 1.694915254237288e-07, + "logits/chosen": 12.521224021911621, + "logits/rejected": 13.355399131774902, + "logps/chosen": -15.367606163024902, + "logps/rejected": -24.378122329711914, + "loss": 0.7133, + "rewards/accuracies": 0.25, + "rewards/chosen": 0.029328536242246628, + "rewards/margins": -0.07064150273799896, + "rewards/rejected": 0.09997004270553589, + "step": 40 + }, + { + "epoch": 0.6949152542372882, + "grad_norm": 37.200999329078535, + "learning_rate": 1.7372881355932202e-07, + "logits/chosen": 9.892135620117188, + "logits/rejected": 11.508831977844238, + "logps/chosen": -16.929044723510742, + "logps/rejected": -24.869295120239258, + "loss": 0.6806, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.057470276951789856, + "rewards/margins": 0.0028195232152938843, + "rewards/rejected": -0.06028980016708374, + "step": 41 + }, + { + "epoch": 0.711864406779661, + "grad_norm": 41.45400661954368, + "learning_rate": 1.7796610169491524e-07, + "logits/chosen": 12.402385711669922, + "logits/rejected": 12.750102996826172, + "logps/chosen": -14.469609260559082, + "logps/rejected": -22.454177856445312, + "loss": 0.7085, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06580530107021332, + "rewards/margins": 0.16378699243068695, + "rewards/rejected": -0.09798169136047363, + "step": 42 + }, + { + "epoch": 0.7288135593220338, + "grad_norm": 41.67437074393324, + "learning_rate": 1.8220338983050846e-07, + "logits/chosen": 14.246957778930664, + "logits/rejected": 14.030410766601562, + "logps/chosen": -27.281932830810547, + "logps/rejected": -22.738405227661133, + "loss": 0.6912, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.020992666482925415, + "rewards/margins": 0.07494118809700012, + "rewards/rejected": -0.09593385457992554, + "step": 43 + }, + { + "epoch": 0.7457627118644068, + "grad_norm": 36.371842747457514, + "learning_rate": 1.8644067796610168e-07, + "logits/chosen": 12.301523208618164, + "logits/rejected": 12.70305347442627, + "logps/chosen": -20.010610580444336, + "logps/rejected": -29.714580535888672, + "loss": 0.6922, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12373033165931702, + "rewards/margins": 0.15592673420906067, + "rewards/rejected": -0.03219640254974365, + "step": 44 + }, + { + "epoch": 0.7627118644067796, + "grad_norm": 41.8781105096038, + "learning_rate": 1.906779661016949e-07, + "logits/chosen": 10.72525405883789, + "logits/rejected": 12.510992050170898, + "logps/chosen": -14.862523078918457, + "logps/rejected": -27.508800506591797, + "loss": 0.6838, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.029734574258327484, + "rewards/margins": 0.07138457149267197, + "rewards/rejected": -0.04164999723434448, + "step": 45 + }, + { + "epoch": 0.7796610169491526, + "grad_norm": 41.90756110852291, + "learning_rate": 1.9491525423728814e-07, + "logits/chosen": 11.77270221710205, + "logits/rejected": 11.579182624816895, + "logps/chosen": -18.880056381225586, + "logps/rejected": -22.22597885131836, + "loss": 0.7062, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0413464680314064, + "rewards/margins": 0.0769132599234581, + "rewards/rejected": -0.0355667918920517, + "step": 46 + }, + { + "epoch": 0.7966101694915254, + "grad_norm": 62.77035926211305, + "learning_rate": 1.9915254237288134e-07, + "logits/chosen": 12.305220603942871, + "logits/rejected": 12.062664031982422, + "logps/chosen": -18.956796646118164, + "logps/rejected": -20.161949157714844, + "loss": 0.7016, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.03725249692797661, + "rewards/margins": 0.011303797364234924, + "rewards/rejected": 0.025948703289031982, + "step": 47 + }, + { + "epoch": 0.8135593220338984, + "grad_norm": 37.86640870800892, + "learning_rate": 2.0338983050847458e-07, + "logits/chosen": 13.177642822265625, + "logits/rejected": 13.93821907043457, + "logps/chosen": -16.465944290161133, + "logps/rejected": -26.467010498046875, + "loss": 0.6881, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.035428911447525024, + "rewards/margins": -0.01987355947494507, + "rewards/rejected": 0.05530247092247009, + "step": 48 + }, + { + "epoch": 0.8305084745762712, + "grad_norm": 37.25491848473269, + "learning_rate": 2.076271186440678e-07, + "logits/chosen": 14.861462593078613, + "logits/rejected": 14.734480857849121, + "logps/chosen": -15.433595657348633, + "logps/rejected": -17.87074089050293, + "loss": 0.7049, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.03526681661605835, + "rewards/margins": -0.05743015184998512, + "rewards/rejected": 0.022163331508636475, + "step": 49 + }, + { + "epoch": 0.847457627118644, + "grad_norm": 41.88806594120029, + "learning_rate": 2.11864406779661e-07, + "logits/chosen": 10.575276374816895, + "logits/rejected": 11.21845817565918, + "logps/chosen": -14.93527889251709, + "logps/rejected": -25.54239273071289, + "loss": 0.703, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.06612725555896759, + "rewards/margins": -0.07982112467288971, + "rewards/rejected": 0.01369386911392212, + "step": 50 + }, + { + "epoch": 0.864406779661017, + "grad_norm": 40.46403881382111, + "learning_rate": 2.1610169491525424e-07, + "logits/chosen": 12.37446117401123, + "logits/rejected": 13.336200714111328, + "logps/chosen": -17.579078674316406, + "logps/rejected": -32.97596740722656, + "loss": 0.6947, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.008079767227172852, + "rewards/margins": 0.00566767156124115, + "rewards/rejected": 0.0024120956659317017, + "step": 51 + }, + { + "epoch": 0.8813559322033898, + "grad_norm": 39.18280033903859, + "learning_rate": 2.2033898305084743e-07, + "logits/chosen": 13.351238250732422, + "logits/rejected": 14.134299278259277, + "logps/chosen": -16.299585342407227, + "logps/rejected": -24.282123565673828, + "loss": 0.6835, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08001573383808136, + "rewards/margins": 0.03988678753376007, + "rewards/rejected": 0.04012894630432129, + "step": 52 + }, + { + "epoch": 0.8983050847457628, + "grad_norm": 39.754504822625954, + "learning_rate": 2.2457627118644068e-07, + "logits/chosen": 13.946189880371094, + "logits/rejected": 14.154667854309082, + "logps/chosen": -15.623710632324219, + "logps/rejected": -18.68361473083496, + "loss": 0.6901, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.02733871340751648, + "rewards/margins": 0.13955903053283691, + "rewards/rejected": -0.1668977439403534, + "step": 53 + }, + { + "epoch": 0.9152542372881356, + "grad_norm": 37.66072122883095, + "learning_rate": 2.288135593220339e-07, + "logits/chosen": 15.305002212524414, + "logits/rejected": 15.318461418151855, + "logps/chosen": -17.736726760864258, + "logps/rejected": -18.221403121948242, + "loss": 0.6973, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.10565029084682465, + "rewards/margins": -0.06389039754867554, + "rewards/rejected": -0.04175989329814911, + "step": 54 + }, + { + "epoch": 0.9322033898305084, + "grad_norm": 35.612257753723746, + "learning_rate": 2.330508474576271e-07, + "logits/chosen": 15.089719772338867, + "logits/rejected": 15.363324165344238, + "logps/chosen": -17.88437843322754, + "logps/rejected": -21.706897735595703, + "loss": 0.6827, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.053910866379737854, + "rewards/margins": 0.10098430514335632, + "rewards/rejected": -0.04707343876361847, + "step": 55 + }, + { + "epoch": 0.9491525423728814, + "grad_norm": 39.18257877542564, + "learning_rate": 2.3728813559322033e-07, + "logits/chosen": 13.049365997314453, + "logits/rejected": 13.170905113220215, + "logps/chosen": -15.466392517089844, + "logps/rejected": -19.350955963134766, + "loss": 0.7121, + "rewards/accuracies": 0.375, + "rewards/chosen": -0.08173052966594696, + "rewards/margins": -0.09363162517547607, + "rewards/rejected": 0.011901095509529114, + "step": 56 + }, + { + "epoch": 0.9661016949152542, + "grad_norm": 40.96891813392811, + "learning_rate": 2.4152542372881355e-07, + "logits/chosen": 13.86726188659668, + "logits/rejected": 14.41377067565918, + "logps/chosen": -17.831151962280273, + "logps/rejected": -28.358530044555664, + "loss": 0.7051, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.056873977184295654, + "rewards/margins": 0.04540741443634033, + "rewards/rejected": 0.011466562747955322, + "step": 57 + }, + { + "epoch": 0.9830508474576272, + "grad_norm": 40.64462852147455, + "learning_rate": 2.457627118644068e-07, + "logits/chosen": 13.420578956604004, + "logits/rejected": 13.936662673950195, + "logps/chosen": -22.40713119506836, + "logps/rejected": -22.56110191345215, + "loss": 0.7096, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.023797959089279175, + "rewards/margins": -0.01631149649620056, + "rewards/rejected": 0.040109455585479736, + "step": 58 + }, + { + "epoch": 1.0, + "grad_norm": 42.59747961209585, + "learning_rate": 2.5e-07, + "logits/chosen": 12.881525039672852, + "logits/rejected": 13.721171379089355, + "logps/chosen": -16.626148223876953, + "logps/rejected": -25.509788513183594, + "loss": 0.694, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.028950288891792297, + "rewards/margins": 0.006730042397975922, + "rewards/rejected": -0.03568033128976822, + "step": 59 + }, + { + "epoch": 1.0169491525423728, + "grad_norm": 39.65797571647544, + "learning_rate": 2.542372881355932e-07, + "logits/chosen": 14.147932052612305, + "logits/rejected": 15.005398750305176, + "logps/chosen": -12.525471687316895, + "logps/rejected": -23.025493621826172, + "loss": 0.6843, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.01412774994969368, + "rewards/margins": 0.024437706917524338, + "rewards/rejected": -0.03856545686721802, + "step": 60 + }, + { + "epoch": 1.0338983050847457, + "grad_norm": 36.46992678492697, + "learning_rate": 2.584745762711864e-07, + "logits/chosen": 12.175691604614258, + "logits/rejected": 12.264796257019043, + "logps/chosen": -17.057971954345703, + "logps/rejected": -21.540843963623047, + "loss": 0.6934, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.048048973083496094, + "rewards/margins": 0.03411996364593506, + "rewards/rejected": -0.08216893672943115, + "step": 61 + }, + { + "epoch": 1.0508474576271187, + "grad_norm": 38.82225863120416, + "learning_rate": 2.6271186440677967e-07, + "logits/chosen": 10.932394027709961, + "logits/rejected": 11.038849830627441, + "logps/chosen": -16.846797943115234, + "logps/rejected": -23.29388999938965, + "loss": 0.6939, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03787383437156677, + "rewards/margins": 0.04288873076438904, + "rewards/rejected": -0.005014896392822266, + "step": 62 + }, + { + "epoch": 1.0677966101694916, + "grad_norm": 39.49407859666274, + "learning_rate": 2.6694915254237286e-07, + "logits/chosen": 10.65343952178955, + "logits/rejected": 11.759461402893066, + "logps/chosen": -16.778003692626953, + "logps/rejected": -26.33823585510254, + "loss": 0.6776, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.09588789939880371, + "rewards/margins": 0.12244513630867004, + "rewards/rejected": -0.026557236909866333, + "step": 63 + }, + { + "epoch": 1.0847457627118644, + "grad_norm": 39.740758663676, + "learning_rate": 2.711864406779661e-07, + "logits/chosen": 12.711525917053223, + "logits/rejected": 13.348038673400879, + "logps/chosen": -17.374736785888672, + "logps/rejected": -24.012853622436523, + "loss": 0.6737, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.009046778082847595, + "rewards/margins": 0.059755921363830566, + "rewards/rejected": -0.06880269944667816, + "step": 64 + }, + { + "epoch": 1.1016949152542372, + "grad_norm": 43.94207160987874, + "learning_rate": 2.754237288135593e-07, + "logits/chosen": 11.751956939697266, + "logits/rejected": 13.209794044494629, + "logps/chosen": -17.514083862304688, + "logps/rejected": -27.080425262451172, + "loss": 0.6797, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.12722863256931305, + "rewards/margins": -0.018149808049201965, + "rewards/rejected": -0.10907882452011108, + "step": 65 + }, + { + "epoch": 1.11864406779661, + "grad_norm": 35.43215229240541, + "learning_rate": 2.796610169491525e-07, + "logits/chosen": 12.129363059997559, + "logits/rejected": 12.394501686096191, + "logps/chosen": -18.410751342773438, + "logps/rejected": -22.589263916015625, + "loss": 0.6714, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.015718191862106323, + "rewards/margins": 0.04346385598182678, + "rewards/rejected": -0.02774566411972046, + "step": 66 + }, + { + "epoch": 1.1355932203389831, + "grad_norm": 37.13327137852814, + "learning_rate": 2.838983050847458e-07, + "logits/chosen": 10.463054656982422, + "logits/rejected": 10.192320823669434, + "logps/chosen": -14.659592628479004, + "logps/rejected": -16.757896423339844, + "loss": 0.669, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.00866326130926609, + "rewards/margins": 0.055536217987537384, + "rewards/rejected": -0.06419947743415833, + "step": 67 + }, + { + "epoch": 1.152542372881356, + "grad_norm": 40.430671120170146, + "learning_rate": 2.88135593220339e-07, + "logits/chosen": 14.019149780273438, + "logits/rejected": 14.417764663696289, + "logps/chosen": -16.717283248901367, + "logps/rejected": -24.413545608520508, + "loss": 0.6584, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12998425960540771, + "rewards/margins": 0.2277044951915741, + "rewards/rejected": -0.09772025048732758, + "step": 68 + }, + { + "epoch": 1.1694915254237288, + "grad_norm": 40.58226171654033, + "learning_rate": 2.923728813559322e-07, + "logits/chosen": 12.504399299621582, + "logits/rejected": 13.594255447387695, + "logps/chosen": -14.431526184082031, + "logps/rejected": -22.050336837768555, + "loss": 0.6733, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.043449416756629944, + "rewards/margins": 0.060832902789115906, + "rewards/rejected": -0.10428231954574585, + "step": 69 + }, + { + "epoch": 1.1864406779661016, + "grad_norm": 39.39764264695123, + "learning_rate": 2.966101694915254e-07, + "logits/chosen": 11.229328155517578, + "logits/rejected": 11.79920768737793, + "logps/chosen": -15.364906311035156, + "logps/rejected": -24.188587188720703, + "loss": 0.6667, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16962946951389313, + "rewards/margins": 0.2086838036775589, + "rewards/rejected": -0.03905433416366577, + "step": 70 + }, + { + "epoch": 1.2033898305084745, + "grad_norm": 40.31701080664346, + "learning_rate": 3.008474576271186e-07, + "logits/chosen": 11.156949043273926, + "logits/rejected": 11.680891036987305, + "logps/chosen": -16.79814910888672, + "logps/rejected": -25.06736946105957, + "loss": 0.6683, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05000852048397064, + "rewards/margins": 0.0933413952589035, + "rewards/rejected": -0.04333287477493286, + "step": 71 + }, + { + "epoch": 1.2203389830508475, + "grad_norm": 41.58246062784223, + "learning_rate": 3.0508474576271186e-07, + "logits/chosen": 8.323310852050781, + "logits/rejected": 9.706480026245117, + "logps/chosen": -18.133089065551758, + "logps/rejected": -26.048744201660156, + "loss": 0.6549, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04978783428668976, + "rewards/margins": 0.22536294162273407, + "rewards/rejected": -0.1755751073360443, + "step": 72 + }, + { + "epoch": 1.2372881355932204, + "grad_norm": 37.559888806481496, + "learning_rate": 3.093220338983051e-07, + "logits/chosen": 13.401594161987305, + "logits/rejected": 13.73277759552002, + "logps/chosen": -14.402023315429688, + "logps/rejected": -19.22127914428711, + "loss": 0.6612, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16372399032115936, + "rewards/margins": 0.29817506670951843, + "rewards/rejected": -0.13445107638835907, + "step": 73 + }, + { + "epoch": 1.2542372881355932, + "grad_norm": 40.882004689136416, + "learning_rate": 3.135593220338983e-07, + "logits/chosen": 13.300498008728027, + "logits/rejected": 13.367609977722168, + "logps/chosen": -16.360227584838867, + "logps/rejected": -22.51313018798828, + "loss": 0.6598, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06130232661962509, + "rewards/margins": 0.21569877862930298, + "rewards/rejected": -0.15439645946025848, + "step": 74 + }, + { + "epoch": 1.271186440677966, + "grad_norm": 37.29775386481398, + "learning_rate": 3.177966101694915e-07, + "logits/chosen": 9.930148124694824, + "logits/rejected": 9.965421676635742, + "logps/chosen": -13.893769264221191, + "logps/rejected": -17.500465393066406, + "loss": 0.6518, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06109648942947388, + "rewards/margins": 0.09164294600486755, + "rewards/rejected": -0.030546456575393677, + "step": 75 + }, + { + "epoch": 1.288135593220339, + "grad_norm": 35.354986913884105, + "learning_rate": 3.220338983050847e-07, + "logits/chosen": 13.425969123840332, + "logits/rejected": 14.32559585571289, + "logps/chosen": -15.175914764404297, + "logps/rejected": -24.598207473754883, + "loss": 0.6717, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.0012704432010650635, + "rewards/margins": 0.02735494077205658, + "rewards/rejected": -0.028625383973121643, + "step": 76 + }, + { + "epoch": 1.305084745762712, + "grad_norm": 36.98175660192275, + "learning_rate": 3.26271186440678e-07, + "logits/chosen": 9.431652069091797, + "logits/rejected": 9.38131332397461, + "logps/chosen": -21.996917724609375, + "logps/rejected": -30.023685455322266, + "loss": 0.6569, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13959163427352905, + "rewards/margins": 0.22159650921821594, + "rewards/rejected": -0.08200487494468689, + "step": 77 + }, + { + "epoch": 1.3220338983050848, + "grad_norm": 36.359735380010825, + "learning_rate": 3.3050847457627117e-07, + "logits/chosen": 14.359407424926758, + "logits/rejected": 15.139140129089355, + "logps/chosen": -15.036028861999512, + "logps/rejected": -24.142406463623047, + "loss": 0.6529, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.014202922582626343, + "rewards/margins": 0.19232788681983948, + "rewards/rejected": -0.17812496423721313, + "step": 78 + }, + { + "epoch": 1.3389830508474576, + "grad_norm": 36.50503808890011, + "learning_rate": 3.3474576271186436e-07, + "logits/chosen": 14.310426712036133, + "logits/rejected": 14.55799674987793, + "logps/chosen": -12.61948013305664, + "logps/rejected": -20.105886459350586, + "loss": 0.6574, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005430340766906738, + "rewards/margins": 0.12721706926822662, + "rewards/rejected": -0.12178672850131989, + "step": 79 + }, + { + "epoch": 1.3559322033898304, + "grad_norm": 35.78916311248173, + "learning_rate": 3.389830508474576e-07, + "logits/chosen": 12.969365119934082, + "logits/rejected": 13.552573204040527, + "logps/chosen": -15.17340087890625, + "logps/rejected": -27.757129669189453, + "loss": 0.6573, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05384726822376251, + "rewards/margins": 0.14679737389087677, + "rewards/rejected": -0.09295010566711426, + "step": 80 + }, + { + "epoch": 1.3728813559322033, + "grad_norm": 38.99240126520527, + "learning_rate": 3.432203389830508e-07, + "logits/chosen": 14.033243179321289, + "logits/rejected": 14.331340789794922, + "logps/chosen": -14.390337944030762, + "logps/rejected": -22.8853702545166, + "loss": 0.6488, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07335115224123001, + "rewards/margins": 0.18050891160964966, + "rewards/rejected": -0.10715775191783905, + "step": 81 + }, + { + "epoch": 1.3898305084745763, + "grad_norm": 34.14892965471411, + "learning_rate": 3.4745762711864405e-07, + "logits/chosen": 11.835970878601074, + "logits/rejected": 11.749543190002441, + "logps/chosen": -11.856498718261719, + "logps/rejected": -14.143364906311035, + "loss": 0.6574, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.04997224733233452, + "rewards/margins": 0.04305719956755638, + "rewards/rejected": 0.006915047764778137, + "step": 82 + }, + { + "epoch": 1.4067796610169492, + "grad_norm": 39.274641594335634, + "learning_rate": 3.516949152542373e-07, + "logits/chosen": 10.360257148742676, + "logits/rejected": 10.089805603027344, + "logps/chosen": -18.05191421508789, + "logps/rejected": -20.790409088134766, + "loss": 0.6483, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03167504072189331, + "rewards/margins": 0.13742826879024506, + "rewards/rejected": -0.10575322806835175, + "step": 83 + }, + { + "epoch": 1.423728813559322, + "grad_norm": 36.85996674163347, + "learning_rate": 3.559322033898305e-07, + "logits/chosen": 10.419514656066895, + "logits/rejected": 10.655095100402832, + "logps/chosen": -17.017175674438477, + "logps/rejected": -19.419795989990234, + "loss": 0.6639, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04058393836021423, + "rewards/margins": 0.0065583735704422, + "rewards/rejected": -0.04714231193065643, + "step": 84 + }, + { + "epoch": 1.4406779661016949, + "grad_norm": 35.17113062069321, + "learning_rate": 3.601694915254237e-07, + "logits/chosen": 13.433987617492676, + "logits/rejected": 13.448290824890137, + "logps/chosen": -15.061188697814941, + "logps/rejected": -16.228702545166016, + "loss": 0.6367, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.028515294194221497, + "rewards/margins": 0.07839739322662354, + "rewards/rejected": -0.04988209158182144, + "step": 85 + }, + { + "epoch": 1.457627118644068, + "grad_norm": 37.41104165649846, + "learning_rate": 3.644067796610169e-07, + "logits/chosen": 11.649856567382812, + "logits/rejected": 12.419816970825195, + "logps/chosen": -15.746871948242188, + "logps/rejected": -28.98259162902832, + "loss": 0.6567, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.020991250872612, + "rewards/margins": 0.15705639123916626, + "rewards/rejected": -0.13606514036655426, + "step": 86 + }, + { + "epoch": 1.4745762711864407, + "grad_norm": 35.42707515007882, + "learning_rate": 3.6864406779661017e-07, + "logits/chosen": 10.078230857849121, + "logits/rejected": 10.944649696350098, + "logps/chosen": -15.349961280822754, + "logps/rejected": -23.0633487701416, + "loss": 0.6476, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.008510768413543701, + "rewards/margins": 0.13419035077095032, + "rewards/rejected": -0.12567958235740662, + "step": 87 + }, + { + "epoch": 1.4915254237288136, + "grad_norm": 35.46644556499606, + "learning_rate": 3.7288135593220336e-07, + "logits/chosen": 11.770813941955566, + "logits/rejected": 12.585672378540039, + "logps/chosen": -18.352783203125, + "logps/rejected": -22.152597427368164, + "loss": 0.6159, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1304049789905548, + "rewards/margins": 0.36608466506004333, + "rewards/rejected": -0.23567967116832733, + "step": 88 + }, + { + "epoch": 1.5084745762711864, + "grad_norm": 37.29455949159622, + "learning_rate": 3.771186440677966e-07, + "logits/chosen": 13.56704330444336, + "logits/rejected": 13.756218910217285, + "logps/chosen": -18.772235870361328, + "logps/rejected": -23.98781967163086, + "loss": 0.6373, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0235111266374588, + "rewards/margins": 0.19346098601818085, + "rewards/rejected": -0.16994985938072205, + "step": 89 + }, + { + "epoch": 1.5254237288135593, + "grad_norm": 37.279643088142166, + "learning_rate": 3.813559322033898e-07, + "logits/chosen": 10.409061431884766, + "logits/rejected": 11.294801712036133, + "logps/chosen": -20.020545959472656, + "logps/rejected": -27.11080551147461, + "loss": 0.6548, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03669746220111847, + "rewards/margins": 0.003694683313369751, + "rewards/rejected": -0.04039214551448822, + "step": 90 + }, + { + "epoch": 1.542372881355932, + "grad_norm": 34.92132206920193, + "learning_rate": 3.8559322033898304e-07, + "logits/chosen": 10.68431282043457, + "logits/rejected": 10.871668815612793, + "logps/chosen": -18.425750732421875, + "logps/rejected": -19.080305099487305, + "loss": 0.6373, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.04941455274820328, + "rewards/margins": 0.01522158831357956, + "rewards/rejected": -0.06463614106178284, + "step": 91 + }, + { + "epoch": 1.559322033898305, + "grad_norm": 35.67266173202364, + "learning_rate": 3.898305084745763e-07, + "logits/chosen": 11.35604476928711, + "logits/rejected": 11.843755722045898, + "logps/chosen": -17.879486083984375, + "logps/rejected": -24.5284423828125, + "loss": 0.612, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03402914106845856, + "rewards/margins": 0.12276534736156464, + "rewards/rejected": -0.08873620629310608, + "step": 92 + }, + { + "epoch": 1.576271186440678, + "grad_norm": 37.54947919804327, + "learning_rate": 3.940677966101695e-07, + "logits/chosen": 11.453618049621582, + "logits/rejected": 12.393528938293457, + "logps/chosen": -14.59844970703125, + "logps/rejected": -22.819143295288086, + "loss": 0.6599, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.05744030699133873, + "rewards/margins": 0.21809090673923492, + "rewards/rejected": -0.1606505960226059, + "step": 93 + }, + { + "epoch": 1.5932203389830508, + "grad_norm": 34.33945131326431, + "learning_rate": 3.9830508474576267e-07, + "logits/chosen": 9.380054473876953, + "logits/rejected": 9.780099868774414, + "logps/chosen": -13.068021774291992, + "logps/rejected": -24.602806091308594, + "loss": 0.6245, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.02326585352420807, + "rewards/margins": 0.2321978360414505, + "rewards/rejected": -0.20893198251724243, + "step": 94 + }, + { + "epoch": 1.6101694915254239, + "grad_norm": 35.81592033651009, + "learning_rate": 4.025423728813559e-07, + "logits/chosen": 13.495811462402344, + "logits/rejected": 13.81532096862793, + "logps/chosen": -13.080314636230469, + "logps/rejected": -18.98967170715332, + "loss": 0.6407, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.07077131420373917, + "rewards/margins": 0.2651638686656952, + "rewards/rejected": -0.19439256191253662, + "step": 95 + }, + { + "epoch": 1.6271186440677967, + "grad_norm": 34.53874606260594, + "learning_rate": 4.0677966101694916e-07, + "logits/chosen": 12.902689933776855, + "logits/rejected": 13.325467109680176, + "logps/chosen": -18.66386604309082, + "logps/rejected": -23.48880958557129, + "loss": 0.6075, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.015904970467090607, + "rewards/margins": 0.06729435175657272, + "rewards/rejected": -0.05138938128948212, + "step": 96 + }, + { + "epoch": 1.6440677966101696, + "grad_norm": 35.02426892448023, + "learning_rate": 4.1101694915254236e-07, + "logits/chosen": 9.74704360961914, + "logits/rejected": 10.310001373291016, + "logps/chosen": -12.350079536437988, + "logps/rejected": -20.870689392089844, + "loss": 0.6058, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03719270974397659, + "rewards/margins": 0.30710333585739136, + "rewards/rejected": -0.26991063356399536, + "step": 97 + }, + { + "epoch": 1.6610169491525424, + "grad_norm": 62.01619203898106, + "learning_rate": 4.152542372881356e-07, + "logits/chosen": 10.409793853759766, + "logits/rejected": 11.300240516662598, + "logps/chosen": -17.81023406982422, + "logps/rejected": -24.685707092285156, + "loss": 0.591, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.07331323623657227, + "rewards/margins": 0.28021639585494995, + "rewards/rejected": -0.20690315961837769, + "step": 98 + }, + { + "epoch": 1.6779661016949152, + "grad_norm": 33.31574704614062, + "learning_rate": 4.194915254237288e-07, + "logits/chosen": 16.979007720947266, + "logits/rejected": 16.530698776245117, + "logps/chosen": -17.609045028686523, + "logps/rejected": -22.1740665435791, + "loss": 0.618, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09865903854370117, + "rewards/margins": 0.20120608806610107, + "rewards/rejected": -0.1025470495223999, + "step": 99 + }, + { + "epoch": 1.694915254237288, + "grad_norm": 37.008794239474405, + "learning_rate": 4.23728813559322e-07, + "logits/chosen": 12.573817253112793, + "logits/rejected": 12.280284881591797, + "logps/chosen": -22.748950958251953, + "logps/rejected": -24.286212921142578, + "loss": 0.6158, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.005324997007846832, + "rewards/margins": 0.3028494119644165, + "rewards/rejected": -0.30817440152168274, + "step": 100 + }, + { + "epoch": 1.711864406779661, + "grad_norm": 34.22685156094786, + "learning_rate": 4.279661016949153e-07, + "logits/chosen": 8.986577033996582, + "logits/rejected": 9.49609661102295, + "logps/chosen": -16.046968460083008, + "logps/rejected": -25.25179672241211, + "loss": 0.5904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.020370006561279297, + "rewards/margins": 0.3425019681453705, + "rewards/rejected": -0.3628719747066498, + "step": 101 + }, + { + "epoch": 1.7288135593220337, + "grad_norm": 34.625932958965485, + "learning_rate": 4.322033898305085e-07, + "logits/chosen": 11.384140968322754, + "logits/rejected": 11.360179901123047, + "logps/chosen": -17.27088165283203, + "logps/rejected": -22.397926330566406, + "loss": 0.6012, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.01983724534511566, + "rewards/margins": 0.28960052132606506, + "rewards/rejected": -0.2697632908821106, + "step": 102 + }, + { + "epoch": 1.7457627118644068, + "grad_norm": 33.33748508633769, + "learning_rate": 4.3644067796610167e-07, + "logits/chosen": 14.142353057861328, + "logits/rejected": 14.965060234069824, + "logps/chosen": -14.294002532958984, + "logps/rejected": -25.333721160888672, + "loss": 0.5844, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02870616316795349, + "rewards/margins": 0.4857754111289978, + "rewards/rejected": -0.4570692479610443, + "step": 103 + }, + { + "epoch": 1.7627118644067796, + "grad_norm": 39.42019607658545, + "learning_rate": 4.4067796610169486e-07, + "logits/chosen": 12.695120811462402, + "logits/rejected": 12.690081596374512, + "logps/chosen": -13.946575164794922, + "logps/rejected": -14.654245376586914, + "loss": 0.6185, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.004654925316572189, + "rewards/margins": 0.08387665450572968, + "rewards/rejected": -0.08853158354759216, + "step": 104 + }, + { + "epoch": 1.7796610169491527, + "grad_norm": 35.38737658039196, + "learning_rate": 4.449152542372881e-07, + "logits/chosen": 13.033133506774902, + "logits/rejected": 13.914467811584473, + "logps/chosen": -18.670257568359375, + "logps/rejected": -28.337072372436523, + "loss": 0.5595, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.012542501091957092, + "rewards/margins": 0.43251490592956543, + "rewards/rejected": -0.41997238993644714, + "step": 105 + }, + { + "epoch": 1.7966101694915255, + "grad_norm": 31.47996004507577, + "learning_rate": 4.4915254237288135e-07, + "logits/chosen": 10.95296859741211, + "logits/rejected": 11.121237754821777, + "logps/chosen": -16.975109100341797, + "logps/rejected": -25.316640853881836, + "loss": 0.5541, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.012887578457593918, + "rewards/margins": 0.3660765588283539, + "rewards/rejected": -0.35318896174430847, + "step": 106 + }, + { + "epoch": 1.8135593220338984, + "grad_norm": 36.40410354328316, + "learning_rate": 4.5338983050847454e-07, + "logits/chosen": 10.065115928649902, + "logits/rejected": 10.710315704345703, + "logps/chosen": -15.07365608215332, + "logps/rejected": -26.562572479248047, + "loss": 0.6034, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.011696398258209229, + "rewards/margins": 0.1393248438835144, + "rewards/rejected": -0.12762844562530518, + "step": 107 + }, + { + "epoch": 1.8305084745762712, + "grad_norm": 40.6534803962061, + "learning_rate": 4.576271186440678e-07, + "logits/chosen": 11.191512107849121, + "logits/rejected": 12.057967185974121, + "logps/chosen": -14.929410934448242, + "logps/rejected": -24.907493591308594, + "loss": 0.5917, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.07028765976428986, + "rewards/margins": 0.2717167139053345, + "rewards/rejected": -0.34200435876846313, + "step": 108 + }, + { + "epoch": 1.847457627118644, + "grad_norm": 35.71218397639331, + "learning_rate": 4.61864406779661e-07, + "logits/chosen": 12.564529418945312, + "logits/rejected": 13.02096176147461, + "logps/chosen": -10.957157135009766, + "logps/rejected": -21.63454818725586, + "loss": 0.5709, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1165485829114914, + "rewards/margins": 0.4494689106941223, + "rewards/rejected": -0.3329203128814697, + "step": 109 + }, + { + "epoch": 1.8644067796610169, + "grad_norm": 33.929367751303374, + "learning_rate": 4.661016949152542e-07, + "logits/chosen": 8.212858200073242, + "logits/rejected": 8.061684608459473, + "logps/chosen": -26.250083923339844, + "logps/rejected": -25.81487274169922, + "loss": 0.5885, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.013301417231559753, + "rewards/margins": 0.05666793882846832, + "rewards/rejected": -0.04336652159690857, + "step": 110 + }, + { + "epoch": 1.8813559322033897, + "grad_norm": 32.019942469686896, + "learning_rate": 4.7033898305084747e-07, + "logits/chosen": 12.412970542907715, + "logits/rejected": 13.044310569763184, + "logps/chosen": -14.468311309814453, + "logps/rejected": -22.58787727355957, + "loss": 0.5516, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0577264279127121, + "rewards/margins": 0.42321252822875977, + "rewards/rejected": -0.3654860854148865, + "step": 111 + }, + { + "epoch": 1.8983050847457628, + "grad_norm": 35.55786831096516, + "learning_rate": 4.7457627118644066e-07, + "logits/chosen": 8.835311889648438, + "logits/rejected": 9.934072494506836, + "logps/chosen": -15.43493938446045, + "logps/rejected": -21.87274742126465, + "loss": 0.6047, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.010160937905311584, + "rewards/margins": 0.2345259189605713, + "rewards/rejected": -0.2243649810552597, + "step": 112 + }, + { + "epoch": 1.9152542372881356, + "grad_norm": 34.291753623304935, + "learning_rate": 4.788135593220339e-07, + "logits/chosen": 10.39708137512207, + "logits/rejected": 10.226729393005371, + "logps/chosen": -17.666790008544922, + "logps/rejected": -19.35904312133789, + "loss": 0.5398, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.05581867694854736, + "rewards/margins": 0.19152891635894775, + "rewards/rejected": -0.24734759330749512, + "step": 113 + }, + { + "epoch": 1.9322033898305084, + "grad_norm": 33.6944592067948, + "learning_rate": 4.830508474576271e-07, + "logits/chosen": 13.301194190979004, + "logits/rejected": 13.128783226013184, + "logps/chosen": -21.99412727355957, + "logps/rejected": -30.18555450439453, + "loss": 0.5636, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03493595868349075, + "rewards/margins": 0.6104806661605835, + "rewards/rejected": -0.5755447149276733, + "step": 114 + }, + { + "epoch": 1.9491525423728815, + "grad_norm": 31.824406268281404, + "learning_rate": 4.872881355932203e-07, + "logits/chosen": 13.88214111328125, + "logits/rejected": 13.710270881652832, + "logps/chosen": -21.656864166259766, + "logps/rejected": -28.074872970581055, + "loss": 0.5808, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.06213666498661041, + "rewards/margins": 0.45388251543045044, + "rewards/rejected": -0.5160191655158997, + "step": 115 + }, + { + "epoch": 1.9661016949152543, + "grad_norm": 36.84167639719694, + "learning_rate": 4.915254237288136e-07, + "logits/chosen": 9.90043830871582, + "logits/rejected": 10.525725364685059, + "logps/chosen": -16.76810073852539, + "logps/rejected": -24.763050079345703, + "loss": 0.5318, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12131154537200928, + "rewards/margins": 0.5713745355606079, + "rewards/rejected": -0.450063019990921, + "step": 116 + }, + { + "epoch": 1.9830508474576272, + "grad_norm": 43.015222191808384, + "learning_rate": 4.957627118644068e-07, + "logits/chosen": 10.298654556274414, + "logits/rejected": 10.836167335510254, + "logps/chosen": -19.98606300354004, + "logps/rejected": -29.31884765625, + "loss": 0.5295, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06146101653575897, + "rewards/margins": 0.4207366108894348, + "rewards/rejected": -0.35927557945251465, + "step": 117 + }, + { + "epoch": 2.0, + "grad_norm": 33.830726998774026, + "learning_rate": 5e-07, + "logits/chosen": 10.221606254577637, + "logits/rejected": 10.766395568847656, + "logps/chosen": -17.46257209777832, + "logps/rejected": -23.293987274169922, + "loss": 0.5567, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.017913732677698135, + "rewards/margins": 0.30477598309516907, + "rewards/rejected": -0.28686225414276123, + "step": 118 + }, + { + "epoch": 2.016949152542373, + "grad_norm": 31.598979188153844, + "learning_rate": 4.99998906143358e-07, + "logits/chosen": 7.365565776824951, + "logits/rejected": 8.237077713012695, + "logps/chosen": -13.421714782714844, + "logps/rejected": -24.525100708007812, + "loss": 0.5111, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09821398556232452, + "rewards/margins": 0.6104830503463745, + "rewards/rejected": -0.5122690796852112, + "step": 119 + }, + { + "epoch": 2.0338983050847457, + "grad_norm": 28.9174685908656, + "learning_rate": 4.999956245830044e-07, + "logits/chosen": 8.895161628723145, + "logits/rejected": 9.429058074951172, + "logps/chosen": -16.168237686157227, + "logps/rejected": -23.326154708862305, + "loss": 0.5228, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0778268426656723, + "rewards/margins": 0.5315461158752441, + "rewards/rejected": -0.45371925830841064, + "step": 120 + }, + { + "epoch": 2.0508474576271185, + "grad_norm": 32.2647349268431, + "learning_rate": 4.999901553476555e-07, + "logits/chosen": 11.467917442321777, + "logits/rejected": 10.708782196044922, + "logps/chosen": -22.234786987304688, + "logps/rejected": -18.131317138671875, + "loss": 0.5482, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0419849194586277, + "rewards/margins": 0.25038909912109375, + "rewards/rejected": -0.29237401485443115, + "step": 121 + }, + { + "epoch": 2.0677966101694913, + "grad_norm": 34.7212502994755, + "learning_rate": 4.999824984851718e-07, + "logits/chosen": 8.989114761352539, + "logits/rejected": 9.167284965515137, + "logps/chosen": -21.553335189819336, + "logps/rejected": -29.138168334960938, + "loss": 0.5211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06739462912082672, + "rewards/margins": 0.38520726561546326, + "rewards/rejected": -0.31781265139579773, + "step": 122 + }, + { + "epoch": 2.084745762711864, + "grad_norm": 31.945568508747726, + "learning_rate": 4.999726540625574e-07, + "logits/chosen": 9.047769546508789, + "logits/rejected": 9.909135818481445, + "logps/chosen": -15.000102043151855, + "logps/rejected": -26.59886932373047, + "loss": 0.5061, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.098088338971138, + "rewards/margins": 0.3822152018547058, + "rewards/rejected": -0.2841268479824066, + "step": 123 + }, + { + "epoch": 2.1016949152542375, + "grad_norm": 27.47436739327594, + "learning_rate": 4.999606221659594e-07, + "logits/chosen": 8.615792274475098, + "logits/rejected": 8.573972702026367, + "logps/chosen": -21.705352783203125, + "logps/rejected": -26.503461837768555, + "loss": 0.519, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11717559397220612, + "rewards/margins": 0.4941335618495941, + "rewards/rejected": -0.3769579529762268, + "step": 124 + }, + { + "epoch": 2.1186440677966103, + "grad_norm": 28.57759693816656, + "learning_rate": 4.999464029006672e-07, + "logits/chosen": 10.453192710876465, + "logits/rejected": 10.818021774291992, + "logps/chosen": -15.647000312805176, + "logps/rejected": -22.32992935180664, + "loss": 0.5042, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1543671041727066, + "rewards/margins": 0.44498467445373535, + "rewards/rejected": -0.29061758518218994, + "step": 125 + }, + { + "epoch": 2.135593220338983, + "grad_norm": 30.8599676767818, + "learning_rate": 4.999299963911115e-07, + "logits/chosen": 9.664230346679688, + "logits/rejected": 9.966415405273438, + "logps/chosen": -15.471524238586426, + "logps/rejected": -18.966232299804688, + "loss": 0.5189, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1293969601392746, + "rewards/margins": 0.45817211270332336, + "rewards/rejected": -0.3287751376628876, + "step": 126 + }, + { + "epoch": 2.152542372881356, + "grad_norm": 28.153225851680524, + "learning_rate": 4.999114027808631e-07, + "logits/chosen": 10.365848541259766, + "logits/rejected": 10.544573783874512, + "logps/chosen": -18.896303176879883, + "logps/rejected": -26.077632904052734, + "loss": 0.4968, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06244945526123047, + "rewards/margins": 0.8896008133888245, + "rewards/rejected": -0.8271512985229492, + "step": 127 + }, + { + "epoch": 2.169491525423729, + "grad_norm": 29.077836625260247, + "learning_rate": 4.998906222326321e-07, + "logits/chosen": 11.856369972229004, + "logits/rejected": 12.338065147399902, + "logps/chosen": -19.74888801574707, + "logps/rejected": -26.943958282470703, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18331018090248108, + "rewards/margins": 0.8261761665344238, + "rewards/rejected": -0.6428660154342651, + "step": 128 + }, + { + "epoch": 2.1864406779661016, + "grad_norm": 30.101971609588308, + "learning_rate": 4.99867654928266e-07, + "logits/chosen": 9.577162742614746, + "logits/rejected": 9.2794771194458, + "logps/chosen": -19.739227294921875, + "logps/rejected": -26.39651107788086, + "loss": 0.4757, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.08600222319364548, + "rewards/margins": 0.5597529411315918, + "rewards/rejected": -0.4737507402896881, + "step": 129 + }, + { + "epoch": 2.2033898305084745, + "grad_norm": 27.97003250483979, + "learning_rate": 4.998425010687483e-07, + "logits/chosen": 9.935083389282227, + "logits/rejected": 10.445450782775879, + "logps/chosen": -18.21001434326172, + "logps/rejected": -24.57656478881836, + "loss": 0.5066, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.17998579144477844, + "rewards/margins": 0.8225247859954834, + "rewards/rejected": -0.6425389051437378, + "step": 130 + }, + { + "epoch": 2.2203389830508473, + "grad_norm": 28.70117722604828, + "learning_rate": 4.998151608741969e-07, + "logits/chosen": 8.155485153198242, + "logits/rejected": 8.829066276550293, + "logps/chosen": -19.724557876586914, + "logps/rejected": -32.21281433105469, + "loss": 0.4693, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10095170885324478, + "rewards/margins": 0.9277392625808716, + "rewards/rejected": -0.8267876505851746, + "step": 131 + }, + { + "epoch": 2.23728813559322, + "grad_norm": 30.67778967336796, + "learning_rate": 4.997856345838614e-07, + "logits/chosen": 7.782173156738281, + "logits/rejected": 8.034370422363281, + "logps/chosen": -15.949851036071777, + "logps/rejected": -20.475975036621094, + "loss": 0.5104, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03952084109187126, + "rewards/margins": 0.616716206073761, + "rewards/rejected": -0.577195405960083, + "step": 132 + }, + { + "epoch": 2.2542372881355934, + "grad_norm": 30.80137068187178, + "learning_rate": 4.997539224561225e-07, + "logits/chosen": 11.369648933410645, + "logits/rejected": 11.718732833862305, + "logps/chosen": -13.169281005859375, + "logps/rejected": -17.166488647460938, + "loss": 0.4788, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1415635049343109, + "rewards/margins": 0.5353420972824097, + "rewards/rejected": -0.39377859234809875, + "step": 133 + }, + { + "epoch": 2.2711864406779663, + "grad_norm": 31.80509416960202, + "learning_rate": 4.99720024768488e-07, + "logits/chosen": 8.444438934326172, + "logits/rejected": 8.495518684387207, + "logps/chosen": -21.16027069091797, + "logps/rejected": -23.736560821533203, + "loss": 0.4245, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12956289947032928, + "rewards/margins": 0.5659047365188599, + "rewards/rejected": -0.4363418519496918, + "step": 134 + }, + { + "epoch": 2.288135593220339, + "grad_norm": 32.67234861176432, + "learning_rate": 4.996839418175918e-07, + "logits/chosen": 10.731569290161133, + "logits/rejected": 12.128774642944336, + "logps/chosen": -16.912437438964844, + "logps/rejected": -32.4586296081543, + "loss": 0.4672, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.042983584105968475, + "rewards/margins": 0.8427398800849915, + "rewards/rejected": -0.8857234716415405, + "step": 135 + }, + { + "epoch": 2.305084745762712, + "grad_norm": 27.925436246552373, + "learning_rate": 4.996456739191904e-07, + "logits/chosen": 11.008123397827148, + "logits/rejected": 12.00696086883545, + "logps/chosen": -12.97161865234375, + "logps/rejected": -22.966861724853516, + "loss": 0.456, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14307719469070435, + "rewards/margins": 0.8332260847091675, + "rewards/rejected": -0.6901488900184631, + "step": 136 + }, + { + "epoch": 2.3220338983050848, + "grad_norm": 27.57837493625319, + "learning_rate": 4.996052214081608e-07, + "logits/chosen": 11.080045700073242, + "logits/rejected": 10.920424461364746, + "logps/chosen": -16.392833709716797, + "logps/rejected": -25.646282196044922, + "loss": 0.4793, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03493800759315491, + "rewards/margins": 0.7717962861061096, + "rewards/rejected": -0.7368583083152771, + "step": 137 + }, + { + "epoch": 2.3389830508474576, + "grad_norm": 30.84970177656098, + "learning_rate": 4.995625846384966e-07, + "logits/chosen": 12.923776626586914, + "logits/rejected": 13.309206008911133, + "logps/chosen": -18.304767608642578, + "logps/rejected": -26.07758903503418, + "loss": 0.4991, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.07253877818584442, + "rewards/margins": 0.41423463821411133, + "rewards/rejected": -0.3416959047317505, + "step": 138 + }, + { + "epoch": 2.3559322033898304, + "grad_norm": 28.87895085350674, + "learning_rate": 4.995177639833061e-07, + "logits/chosen": 10.714326858520508, + "logits/rejected": 11.218597412109375, + "logps/chosen": -12.854103088378906, + "logps/rejected": -20.783885955810547, + "loss": 0.477, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16060581803321838, + "rewards/margins": 0.5592792630195618, + "rewards/rejected": -0.398673415184021, + "step": 139 + }, + { + "epoch": 2.3728813559322033, + "grad_norm": 30.345207712006474, + "learning_rate": 4.994707598348084e-07, + "logits/chosen": 9.185007095336914, + "logits/rejected": 10.09119701385498, + "logps/chosen": -16.681596755981445, + "logps/rejected": -28.037395477294922, + "loss": 0.4592, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.07563964277505875, + "rewards/margins": 0.6644760966300964, + "rewards/rejected": -0.5888364315032959, + "step": 140 + }, + { + "epoch": 2.389830508474576, + "grad_norm": 29.900980845912017, + "learning_rate": 4.994215726043297e-07, + "logits/chosen": 11.37492561340332, + "logits/rejected": 12.306166648864746, + "logps/chosen": -18.86722183227539, + "logps/rejected": -27.48049545288086, + "loss": 0.4599, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.03138112276792526, + "rewards/margins": 0.8968455791473389, + "rewards/rejected": -0.865464448928833, + "step": 141 + }, + { + "epoch": 2.406779661016949, + "grad_norm": 28.184632132634288, + "learning_rate": 4.993702027223003e-07, + "logits/chosen": 10.223265647888184, + "logits/rejected": 11.089990615844727, + "logps/chosen": -16.124814987182617, + "logps/rejected": -24.316085815429688, + "loss": 0.4277, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1649247407913208, + "rewards/margins": 1.0841801166534424, + "rewards/rejected": -0.9192553758621216, + "step": 142 + }, + { + "epoch": 2.423728813559322, + "grad_norm": 33.65023165087564, + "learning_rate": 4.993166506382505e-07, + "logits/chosen": 12.015296936035156, + "logits/rejected": 12.360418319702148, + "logps/chosen": -11.458662033081055, + "logps/rejected": -24.14784812927246, + "loss": 0.4601, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12457229942083359, + "rewards/margins": 0.8391572833061218, + "rewards/rejected": -0.7145849466323853, + "step": 143 + }, + { + "epoch": 2.440677966101695, + "grad_norm": 27.026839644788712, + "learning_rate": 4.992609168208068e-07, + "logits/chosen": 10.368545532226562, + "logits/rejected": 10.083795547485352, + "logps/chosen": -23.548145294189453, + "logps/rejected": -22.070770263671875, + "loss": 0.4604, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2950201630592346, + "rewards/margins": 0.7546678781509399, + "rewards/rejected": -0.4596477150917053, + "step": 144 + }, + { + "epoch": 2.457627118644068, + "grad_norm": 26.060817977279918, + "learning_rate": 4.992030017576875e-07, + "logits/chosen": 10.199317932128906, + "logits/rejected": 10.922442436218262, + "logps/chosen": -21.961441040039062, + "logps/rejected": -35.883018493652344, + "loss": 0.4145, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1837531328201294, + "rewards/margins": 1.1913630962371826, + "rewards/rejected": -1.0076100826263428, + "step": 145 + }, + { + "epoch": 2.4745762711864407, + "grad_norm": 30.504477408509327, + "learning_rate": 4.991429059556989e-07, + "logits/chosen": 9.347443580627441, + "logits/rejected": 9.603809356689453, + "logps/chosen": -18.638919830322266, + "logps/rejected": -22.002826690673828, + "loss": 0.4603, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0920301228761673, + "rewards/margins": 0.9097036123275757, + "rewards/rejected": -0.8176735043525696, + "step": 146 + }, + { + "epoch": 2.4915254237288136, + "grad_norm": 26.196734416575048, + "learning_rate": 4.990806299407305e-07, + "logits/chosen": 8.564815521240234, + "logits/rejected": 9.3782377243042, + "logps/chosen": -14.529531478881836, + "logps/rejected": -19.932464599609375, + "loss": 0.4439, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14292752742767334, + "rewards/margins": 0.5559223890304565, + "rewards/rejected": -0.4129948318004608, + "step": 147 + }, + { + "epoch": 2.5084745762711864, + "grad_norm": 28.9947309009371, + "learning_rate": 4.990161742577506e-07, + "logits/chosen": 9.976179122924805, + "logits/rejected": 10.230119705200195, + "logps/chosen": -14.522598266601562, + "logps/rejected": -24.451900482177734, + "loss": 0.4331, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.01258106529712677, + "rewards/margins": 0.8219106197357178, + "rewards/rejected": -0.809329628944397, + "step": 148 + }, + { + "epoch": 2.5254237288135593, + "grad_norm": 28.301260499378476, + "learning_rate": 4.989495394708015e-07, + "logits/chosen": 8.76701831817627, + "logits/rejected": 9.332403182983398, + "logps/chosen": -24.048688888549805, + "logps/rejected": -22.38104820251465, + "loss": 0.4394, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04988780617713928, + "rewards/margins": 0.7242330312728882, + "rewards/rejected": -0.6743452548980713, + "step": 149 + }, + { + "epoch": 2.542372881355932, + "grad_norm": 24.87074559473272, + "learning_rate": 4.988807261629942e-07, + "logits/chosen": 11.337067604064941, + "logits/rejected": 11.21312427520752, + "logps/chosen": -18.84625244140625, + "logps/rejected": -23.458078384399414, + "loss": 0.4181, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.005762174725532532, + "rewards/margins": 1.0975971221923828, + "rewards/rejected": -1.0918350219726562, + "step": 150 + }, + { + "epoch": 2.559322033898305, + "grad_norm": 25.943700529763216, + "learning_rate": 4.988097349365039e-07, + "logits/chosen": 8.649271965026855, + "logits/rejected": 10.232512474060059, + "logps/chosen": -19.395671844482422, + "logps/rejected": -26.689014434814453, + "loss": 0.4141, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16895315051078796, + "rewards/margins": 0.9023088216781616, + "rewards/rejected": -0.7333556413650513, + "step": 151 + }, + { + "epoch": 2.576271186440678, + "grad_norm": 26.560543547978344, + "learning_rate": 4.987365664125646e-07, + "logits/chosen": 11.920557975769043, + "logits/rejected": 12.487735748291016, + "logps/chosen": -16.76218032836914, + "logps/rejected": -19.075769424438477, + "loss": 0.4424, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09761041402816772, + "rewards/margins": 0.5322891473770142, + "rewards/rejected": -0.43467870354652405, + "step": 152 + }, + { + "epoch": 2.593220338983051, + "grad_norm": 30.875491021836197, + "learning_rate": 4.986612212314632e-07, + "logits/chosen": 8.095218658447266, + "logits/rejected": 8.630035400390625, + "logps/chosen": -16.40205192565918, + "logps/rejected": -29.01576805114746, + "loss": 0.5034, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0655764490365982, + "rewards/margins": 1.2725319862365723, + "rewards/rejected": -1.2069554328918457, + "step": 153 + }, + { + "epoch": 2.610169491525424, + "grad_norm": 29.106013409554382, + "learning_rate": 4.985837000525343e-07, + "logits/chosen": 11.021102905273438, + "logits/rejected": 10.358692169189453, + "logps/chosen": -15.820294380187988, + "logps/rejected": -19.943557739257812, + "loss": 0.4508, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.012390628457069397, + "rewards/margins": 0.5218388438224792, + "rewards/rejected": -0.5342295169830322, + "step": 154 + }, + { + "epoch": 2.6271186440677967, + "grad_norm": 25.043691622956985, + "learning_rate": 4.985040035541542e-07, + "logits/chosen": 9.689821243286133, + "logits/rejected": 11.35682201385498, + "logps/chosen": -16.085561752319336, + "logps/rejected": -28.019329071044922, + "loss": 0.3964, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18151240050792694, + "rewards/margins": 1.1647062301635742, + "rewards/rejected": -0.9831939935684204, + "step": 155 + }, + { + "epoch": 2.6440677966101696, + "grad_norm": 26.21644326627851, + "learning_rate": 4.984221324337356e-07, + "logits/chosen": 9.743395805358887, + "logits/rejected": 11.021498680114746, + "logps/chosen": -14.556093215942383, + "logps/rejected": -26.45406723022461, + "loss": 0.4057, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07891668379306793, + "rewards/margins": 0.9213628172874451, + "rewards/rejected": -1.0002795457839966, + "step": 156 + }, + { + "epoch": 2.6610169491525424, + "grad_norm": 29.80638488183626, + "learning_rate": 4.983380874077204e-07, + "logits/chosen": 8.874907493591309, + "logits/rejected": 8.924434661865234, + "logps/chosen": -13.57056999206543, + "logps/rejected": -17.829633712768555, + "loss": 0.4866, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1548992097377777, + "rewards/margins": 0.7773313522338867, + "rewards/rejected": -0.6224321126937866, + "step": 157 + }, + { + "epoch": 2.6779661016949152, + "grad_norm": 26.481655223143644, + "learning_rate": 4.982518692115743e-07, + "logits/chosen": 9.572946548461914, + "logits/rejected": 10.248663902282715, + "logps/chosen": -14.427574157714844, + "logps/rejected": -22.97412109375, + "loss": 0.4233, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3066959083080292, + "rewards/margins": 1.1530815362930298, + "rewards/rejected": -0.8463855981826782, + "step": 158 + }, + { + "epoch": 2.694915254237288, + "grad_norm": 30.392105942262955, + "learning_rate": 4.981634785997801e-07, + "logits/chosen": 11.983063697814941, + "logits/rejected": 12.59067440032959, + "logps/chosen": -17.3765869140625, + "logps/rejected": -25.998476028442383, + "loss": 0.4306, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.10719990730285645, + "rewards/margins": 0.8148340582847595, + "rewards/rejected": -0.922033965587616, + "step": 159 + }, + { + "epoch": 2.711864406779661, + "grad_norm": 27.76495608776217, + "learning_rate": 4.980729163458311e-07, + "logits/chosen": 8.870572090148926, + "logits/rejected": 9.155264854431152, + "logps/chosen": -20.706480026245117, + "logps/rejected": -22.279741287231445, + "loss": 0.4485, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08153115212917328, + "rewards/margins": 0.7684078216552734, + "rewards/rejected": -0.686876654624939, + "step": 160 + }, + { + "epoch": 2.7288135593220337, + "grad_norm": 28.299451351438137, + "learning_rate": 4.979801832422243e-07, + "logits/chosen": 9.896560668945312, + "logits/rejected": 10.647369384765625, + "logps/chosen": -14.198293685913086, + "logps/rejected": -23.360374450683594, + "loss": 0.4107, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.050837576389312744, + "rewards/margins": 0.5405330657958984, + "rewards/rejected": -0.5913706421852112, + "step": 161 + }, + { + "epoch": 2.7457627118644066, + "grad_norm": 26.33726622019465, + "learning_rate": 4.978852801004533e-07, + "logits/chosen": 9.247544288635254, + "logits/rejected": 9.513944625854492, + "logps/chosen": -14.692225456237793, + "logps/rejected": -21.092857360839844, + "loss": 0.4495, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.126656174659729, + "rewards/margins": 0.5739098787307739, + "rewards/rejected": -0.44725367426872253, + "step": 162 + }, + { + "epoch": 2.7627118644067794, + "grad_norm": 27.340813928397274, + "learning_rate": 4.977882077510018e-07, + "logits/chosen": 9.752120971679688, + "logits/rejected": 11.007214546203613, + "logps/chosen": -13.380044937133789, + "logps/rejected": -25.285293579101562, + "loss": 0.4236, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09852191060781479, + "rewards/margins": 1.5012712478637695, + "rewards/rejected": -1.4027493000030518, + "step": 163 + }, + { + "epoch": 2.7796610169491527, + "grad_norm": 25.850905007942515, + "learning_rate": 4.976889670433355e-07, + "logits/chosen": 8.906452178955078, + "logits/rejected": 8.845396995544434, + "logps/chosen": -23.392776489257812, + "logps/rejected": -22.667905807495117, + "loss": 0.3735, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09475763142108917, + "rewards/margins": 1.1101325750350952, + "rewards/rejected": -1.0153748989105225, + "step": 164 + }, + { + "epoch": 2.7966101694915255, + "grad_norm": 25.80939076026458, + "learning_rate": 4.975875588458953e-07, + "logits/chosen": 11.2208833694458, + "logits/rejected": 11.42241382598877, + "logps/chosen": -18.877132415771484, + "logps/rejected": -20.48996353149414, + "loss": 0.4493, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.040170177817344666, + "rewards/margins": 0.5747877359390259, + "rewards/rejected": -0.6149579286575317, + "step": 165 + }, + { + "epoch": 2.8135593220338984, + "grad_norm": 26.338313528073403, + "learning_rate": 4.974839840460894e-07, + "logits/chosen": 10.968280792236328, + "logits/rejected": 11.683061599731445, + "logps/chosen": -8.583036422729492, + "logps/rejected": -19.372709274291992, + "loss": 0.3715, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21730396151542664, + "rewards/margins": 1.3301661014556885, + "rewards/rejected": -1.1128621101379395, + "step": 166 + }, + { + "epoch": 2.830508474576271, + "grad_norm": 26.738734324592283, + "learning_rate": 4.973782435502858e-07, + "logits/chosen": 8.941149711608887, + "logits/rejected": 10.029899597167969, + "logps/chosen": -18.587181091308594, + "logps/rejected": -29.493885040283203, + "loss": 0.4358, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.08800230920314789, + "rewards/margins": 1.2188138961791992, + "rewards/rejected": -1.3068161010742188, + "step": 167 + }, + { + "epoch": 2.847457627118644, + "grad_norm": 25.571243928734713, + "learning_rate": 4.97270338283804e-07, + "logits/chosen": 8.122725486755371, + "logits/rejected": 8.439363479614258, + "logps/chosen": -12.261382102966309, + "logps/rejected": -17.34757423400879, + "loss": 0.4122, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1635618358850479, + "rewards/margins": 0.41292399168014526, + "rewards/rejected": -0.24936217069625854, + "step": 168 + }, + { + "epoch": 2.864406779661017, + "grad_norm": 26.732405795819066, + "learning_rate": 4.97160269190907e-07, + "logits/chosen": 11.752440452575684, + "logits/rejected": 12.06600284576416, + "logps/chosen": -14.426056861877441, + "logps/rejected": -20.560089111328125, + "loss": 0.4442, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10870569944381714, + "rewards/margins": 0.657209038734436, + "rewards/rejected": -0.5485032796859741, + "step": 169 + }, + { + "epoch": 2.8813559322033897, + "grad_norm": 37.82985353920003, + "learning_rate": 4.970480372347933e-07, + "logits/chosen": 10.382537841796875, + "logits/rejected": 10.585658073425293, + "logps/chosen": -13.967865943908691, + "logps/rejected": -22.570960998535156, + "loss": 0.4372, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.07470311224460602, + "rewards/margins": 0.851300835609436, + "rewards/rejected": -0.7765977382659912, + "step": 170 + }, + { + "epoch": 2.898305084745763, + "grad_norm": 28.963422726648655, + "learning_rate": 4.969336433975886e-07, + "logits/chosen": 11.298312187194824, + "logits/rejected": 12.328322410583496, + "logps/chosen": -14.665711402893066, + "logps/rejected": -29.004274368286133, + "loss": 0.418, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.05677422136068344, + "rewards/margins": 1.126150131225586, + "rewards/rejected": -1.0693758726119995, + "step": 171 + }, + { + "epoch": 2.915254237288136, + "grad_norm": 25.04801638294687, + "learning_rate": 4.968170886803361e-07, + "logits/chosen": 8.335662841796875, + "logits/rejected": 9.045802116394043, + "logps/chosen": -15.8262300491333, + "logps/rejected": -25.64813804626465, + "loss": 0.3991, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.28148353099823, + "rewards/margins": 0.9742909073829651, + "rewards/rejected": -0.6928073167800903, + "step": 172 + }, + { + "epoch": 2.9322033898305087, + "grad_norm": 27.65516270956161, + "learning_rate": 4.966983741029893e-07, + "logits/chosen": 7.764376163482666, + "logits/rejected": 7.787277698516846, + "logps/chosen": -17.45782470703125, + "logps/rejected": -28.34349250793457, + "loss": 0.3965, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.016701295971870422, + "rewards/margins": 0.8749082684516907, + "rewards/rejected": -0.8582069277763367, + "step": 173 + }, + { + "epoch": 2.9491525423728815, + "grad_norm": 27.335474568003058, + "learning_rate": 4.965775007044019e-07, + "logits/chosen": 9.209973335266113, + "logits/rejected": 10.886381149291992, + "logps/chosen": -17.583267211914062, + "logps/rejected": -26.150978088378906, + "loss": 0.3574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3135643005371094, + "rewards/margins": 1.6306042671203613, + "rewards/rejected": -1.3170397281646729, + "step": 174 + }, + { + "epoch": 2.9661016949152543, + "grad_norm": 27.369409853348216, + "learning_rate": 4.964544695423193e-07, + "logits/chosen": 9.178210258483887, + "logits/rejected": 9.29438304901123, + "logps/chosen": -13.898372650146484, + "logps/rejected": -18.40739631652832, + "loss": 0.4116, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16570331156253815, + "rewards/margins": 0.9794048070907593, + "rewards/rejected": -0.8137016296386719, + "step": 175 + }, + { + "epoch": 2.983050847457627, + "grad_norm": 27.08632667503853, + "learning_rate": 4.963292816933691e-07, + "logits/chosen": 7.745831489562988, + "logits/rejected": 8.334796905517578, + "logps/chosen": -19.82839012145996, + "logps/rejected": -26.07370376586914, + "loss": 0.425, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10253079980611801, + "rewards/margins": 0.9082773923873901, + "rewards/rejected": -0.8057465553283691, + "step": 176 + }, + { + "epoch": 3.0, + "grad_norm": 29.11312831933624, + "learning_rate": 4.96201938253052e-07, + "logits/chosen": 8.67463207244873, + "logits/rejected": 8.769617080688477, + "logps/chosen": -19.712432861328125, + "logps/rejected": -29.56153106689453, + "loss": 0.4492, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.009731769561767578, + "rewards/margins": 1.0045030117034912, + "rewards/rejected": -1.0142347812652588, + "step": 177 + }, + { + "epoch": 3.016949152542373, + "grad_norm": 22.70341027925011, + "learning_rate": 4.960724403357314e-07, + "logits/chosen": 9.667706489562988, + "logits/rejected": 9.530271530151367, + "logps/chosen": -14.369274139404297, + "logps/rejected": -23.56820297241211, + "loss": 0.3558, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.0965251475572586, + "rewards/margins": 1.0754098892211914, + "rewards/rejected": -0.9788846969604492, + "step": 178 + }, + { + "epoch": 3.0338983050847457, + "grad_norm": 22.012454849538084, + "learning_rate": 4.959407890746248e-07, + "logits/chosen": 8.7022123336792, + "logits/rejected": 8.974809646606445, + "logps/chosen": -14.963942527770996, + "logps/rejected": -24.484926223754883, + "loss": 0.3825, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15482452511787415, + "rewards/margins": 1.3257172107696533, + "rewards/rejected": -1.1708927154541016, + "step": 179 + }, + { + "epoch": 3.0508474576271185, + "grad_norm": 21.871293164666387, + "learning_rate": 4.958069856217929e-07, + "logits/chosen": 10.28845500946045, + "logits/rejected": 9.987916946411133, + "logps/chosen": -14.2538480758667, + "logps/rejected": -19.900346755981445, + "loss": 0.3447, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1215810775756836, + "rewards/margins": 1.0722662210464478, + "rewards/rejected": -0.9506851434707642, + "step": 180 + }, + { + "epoch": 3.0677966101694913, + "grad_norm": 24.133472988977683, + "learning_rate": 4.956710311481302e-07, + "logits/chosen": 7.17443323135376, + "logits/rejected": 7.678508281707764, + "logps/chosen": -14.993780136108398, + "logps/rejected": -25.741018295288086, + "loss": 0.372, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.11443566530942917, + "rewards/margins": 1.444988489151001, + "rewards/rejected": -1.3305529356002808, + "step": 181 + }, + { + "epoch": 3.084745762711864, + "grad_norm": 24.14418324234125, + "learning_rate": 4.955329268433542e-07, + "logits/chosen": 6.731716156005859, + "logits/rejected": 7.250002861022949, + "logps/chosen": -18.923925399780273, + "logps/rejected": -23.595857620239258, + "loss": 0.3804, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.052950143814086914, + "rewards/margins": 1.2683604955673218, + "rewards/rejected": -1.3213107585906982, + "step": 182 + }, + { + "epoch": 3.1016949152542375, + "grad_norm": 23.24825847333165, + "learning_rate": 4.953926739159956e-07, + "logits/chosen": 10.804813385009766, + "logits/rejected": 11.33828067779541, + "logps/chosen": -18.222864151000977, + "logps/rejected": -26.791757583618164, + "loss": 0.3568, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.029467307031154633, + "rewards/margins": 1.3857711553573608, + "rewards/rejected": -1.3563038110733032, + "step": 183 + }, + { + "epoch": 3.1186440677966103, + "grad_norm": 23.624561156451218, + "learning_rate": 4.952502735933869e-07, + "logits/chosen": 7.371200084686279, + "logits/rejected": 8.635663986206055, + "logps/chosen": -16.824323654174805, + "logps/rejected": -30.131465911865234, + "loss": 0.3962, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.26038092374801636, + "rewards/margins": 1.377197027206421, + "rewards/rejected": -1.1168160438537598, + "step": 184 + }, + { + "epoch": 3.135593220338983, + "grad_norm": 23.0617388330513, + "learning_rate": 4.951057271216525e-07, + "logits/chosen": 7.263133525848389, + "logits/rejected": 8.648643493652344, + "logps/chosen": -14.994542121887207, + "logps/rejected": -27.363725662231445, + "loss": 0.3559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.04362674802541733, + "rewards/margins": 0.8833621144294739, + "rewards/rejected": -0.8397355079650879, + "step": 185 + }, + { + "epoch": 3.152542372881356, + "grad_norm": 24.060190276119712, + "learning_rate": 4.949590357656974e-07, + "logits/chosen": 10.067876815795898, + "logits/rejected": 10.368553161621094, + "logps/chosen": -18.041959762573242, + "logps/rejected": -30.241945266723633, + "loss": 0.3332, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.26056158542633057, + "rewards/margins": 1.1221503019332886, + "rewards/rejected": -0.8615886569023132, + "step": 186 + }, + { + "epoch": 3.169491525423729, + "grad_norm": 21.596576629334546, + "learning_rate": 4.948102008091962e-07, + "logits/chosen": 8.81076431274414, + "logits/rejected": 9.11528205871582, + "logps/chosen": -16.617202758789062, + "logps/rejected": -25.771406173706055, + "loss": 0.3458, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04481735825538635, + "rewards/margins": 1.1783089637756348, + "rewards/rejected": -1.1334917545318604, + "step": 187 + }, + { + "epoch": 3.1864406779661016, + "grad_norm": 23.319342589454678, + "learning_rate": 4.946592235545815e-07, + "logits/chosen": 6.276343822479248, + "logits/rejected": 7.280974388122559, + "logps/chosen": -25.35882568359375, + "logps/rejected": -31.435470581054688, + "loss": 0.3524, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.13174700736999512, + "rewards/margins": 1.1706089973449707, + "rewards/rejected": -1.0388619899749756, + "step": 188 + }, + { + "epoch": 3.2033898305084745, + "grad_norm": 24.334730171525816, + "learning_rate": 4.945061053230333e-07, + "logits/chosen": 6.7825822830200195, + "logits/rejected": 7.950262069702148, + "logps/chosen": -19.224178314208984, + "logps/rejected": -34.59772872924805, + "loss": 0.3604, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13449785113334656, + "rewards/margins": 1.6669206619262695, + "rewards/rejected": -1.5324227809906006, + "step": 189 + }, + { + "epoch": 3.2203389830508473, + "grad_norm": 21.339498583686392, + "learning_rate": 4.943508474544666e-07, + "logits/chosen": 7.709277629852295, + "logits/rejected": 8.046525001525879, + "logps/chosen": -11.371763229370117, + "logps/rejected": -21.591466903686523, + "loss": 0.3242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2736116051673889, + "rewards/margins": 1.5210593938827515, + "rewards/rejected": -1.2474478483200073, + "step": 190 + }, + { + "epoch": 3.23728813559322, + "grad_norm": 21.53073528694387, + "learning_rate": 4.941934513075204e-07, + "logits/chosen": 6.447499752044678, + "logits/rejected": 7.401057720184326, + "logps/chosen": -24.705169677734375, + "logps/rejected": -27.498699188232422, + "loss": 0.325, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24285036325454712, + "rewards/margins": 1.7285178899765015, + "rewards/rejected": -1.4856674671173096, + "step": 191 + }, + { + "epoch": 3.2542372881355934, + "grad_norm": 20.761722419548054, + "learning_rate": 4.94033918259545e-07, + "logits/chosen": 11.910299301147461, + "logits/rejected": 11.728422164916992, + "logps/chosen": -14.798544883728027, + "logps/rejected": -23.423885345458984, + "loss": 0.3595, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3081997036933899, + "rewards/margins": 1.2061128616333008, + "rewards/rejected": -0.8979132175445557, + "step": 192 + }, + { + "epoch": 3.2711864406779663, + "grad_norm": 22.02765764075306, + "learning_rate": 4.938722497065909e-07, + "logits/chosen": 9.125901222229004, + "logits/rejected": 9.558362007141113, + "logps/chosen": -16.233884811401367, + "logps/rejected": -20.573768615722656, + "loss": 0.3395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1345757693052292, + "rewards/margins": 1.391578197479248, + "rewards/rejected": -1.257002592086792, + "step": 193 + }, + { + "epoch": 3.288135593220339, + "grad_norm": 23.161966059680974, + "learning_rate": 4.937084470633958e-07, + "logits/chosen": 5.163336753845215, + "logits/rejected": 5.996748447418213, + "logps/chosen": -19.100372314453125, + "logps/rejected": -24.20922088623047, + "loss": 0.3378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20368611812591553, + "rewards/margins": 1.46111261844635, + "rewards/rejected": -1.2574265003204346, + "step": 194 + }, + { + "epoch": 3.305084745762712, + "grad_norm": 29.992204991018934, + "learning_rate": 4.935425117633726e-07, + "logits/chosen": 9.052857398986816, + "logits/rejected": 8.928924560546875, + "logps/chosen": -15.425498962402344, + "logps/rejected": -21.158906936645508, + "loss": 0.3513, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.12333399057388306, + "rewards/margins": 0.8104948401451111, + "rewards/rejected": -0.687160849571228, + "step": 195 + }, + { + "epoch": 3.3220338983050848, + "grad_norm": 22.847542397883526, + "learning_rate": 4.933744452585966e-07, + "logits/chosen": 7.322593688964844, + "logits/rejected": 7.687209129333496, + "logps/chosen": -14.15726375579834, + "logps/rejected": -21.306930541992188, + "loss": 0.371, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2583553194999695, + "rewards/margins": 1.3991026878356934, + "rewards/rejected": -1.1407474279403687, + "step": 196 + }, + { + "epoch": 3.3389830508474576, + "grad_norm": 22.873574991222572, + "learning_rate": 4.932042490197933e-07, + "logits/chosen": 7.417664051055908, + "logits/rejected": 7.723471641540527, + "logps/chosen": -24.805866241455078, + "logps/rejected": -26.682741165161133, + "loss": 0.3619, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21670067310333252, + "rewards/margins": 1.4350523948669434, + "rewards/rejected": -1.2183517217636108, + "step": 197 + }, + { + "epoch": 3.3559322033898304, + "grad_norm": 24.56769186369395, + "learning_rate": 4.930319245363248e-07, + "logits/chosen": 9.720744132995605, + "logits/rejected": 10.568902015686035, + "logps/chosen": -10.5819091796875, + "logps/rejected": -24.263601303100586, + "loss": 0.3567, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09235741198062897, + "rewards/margins": 1.4750256538391113, + "rewards/rejected": -1.3826682567596436, + "step": 198 + }, + { + "epoch": 3.3728813559322033, + "grad_norm": 23.233797057466784, + "learning_rate": 4.928574733161775e-07, + "logits/chosen": 11.545450210571289, + "logits/rejected": 12.52687931060791, + "logps/chosen": -16.03731918334961, + "logps/rejected": -26.412817001342773, + "loss": 0.3599, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.010396137833595276, + "rewards/margins": 1.0698754787445068, + "rewards/rejected": -1.0594793558120728, + "step": 199 + }, + { + "epoch": 3.389830508474576, + "grad_norm": 20.139690046673707, + "learning_rate": 4.926808968859483e-07, + "logits/chosen": 5.908224105834961, + "logits/rejected": 6.287415981292725, + "logps/chosen": -15.103047370910645, + "logps/rejected": -20.081340789794922, + "loss": 0.2854, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27523258328437805, + "rewards/margins": 1.3817555904388428, + "rewards/rejected": -1.1065233945846558, + "step": 200 + }, + { + "epoch": 3.406779661016949, + "grad_norm": 23.94893902926556, + "learning_rate": 4.925021967908316e-07, + "logits/chosen": 9.42378044128418, + "logits/rejected": 10.111106872558594, + "logps/chosen": -12.101956367492676, + "logps/rejected": -23.762901306152344, + "loss": 0.3612, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.05877889692783356, + "rewards/margins": 0.8194180727005005, + "rewards/rejected": -0.8781968951225281, + "step": 201 + }, + { + "epoch": 3.423728813559322, + "grad_norm": 20.9327603969309, + "learning_rate": 4.923213745946059e-07, + "logits/chosen": 8.8514986038208, + "logits/rejected": 10.125353813171387, + "logps/chosen": -12.044666290283203, + "logps/rejected": -28.45659637451172, + "loss": 0.3327, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20615214109420776, + "rewards/margins": 1.9639978408813477, + "rewards/rejected": -1.7578457593917847, + "step": 202 + }, + { + "epoch": 3.440677966101695, + "grad_norm": 23.527829904491835, + "learning_rate": 4.921384318796193e-07, + "logits/chosen": 7.121918678283691, + "logits/rejected": 8.05240249633789, + "logps/chosen": -19.336584091186523, + "logps/rejected": -23.552419662475586, + "loss": 0.3402, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11570586264133453, + "rewards/margins": 1.2959182262420654, + "rewards/rejected": -1.180212378501892, + "step": 203 + }, + { + "epoch": 3.457627118644068, + "grad_norm": 19.749222830827275, + "learning_rate": 4.919533702467771e-07, + "logits/chosen": 6.370368480682373, + "logits/rejected": 7.441535472869873, + "logps/chosen": -16.786794662475586, + "logps/rejected": -27.888469696044922, + "loss": 0.2868, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22812122106552124, + "rewards/margins": 2.133049488067627, + "rewards/rejected": -1.9049283266067505, + "step": 204 + }, + { + "epoch": 3.4745762711864407, + "grad_norm": 22.127296567931705, + "learning_rate": 4.91766191315526e-07, + "logits/chosen": 8.205850601196289, + "logits/rejected": 8.747001647949219, + "logps/chosen": -21.583003997802734, + "logps/rejected": -30.38113021850586, + "loss": 0.319, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11991026997566223, + "rewards/margins": 1.666430950164795, + "rewards/rejected": -1.546520709991455, + "step": 205 + }, + { + "epoch": 3.4915254237288136, + "grad_norm": 27.870162386057064, + "learning_rate": 4.915768967238417e-07, + "logits/chosen": 10.85152816772461, + "logits/rejected": 11.19784164428711, + "logps/chosen": -16.432479858398438, + "logps/rejected": -20.672014236450195, + "loss": 0.3583, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.08660467714071274, + "rewards/margins": 0.7723907828330994, + "rewards/rejected": -0.6857861280441284, + "step": 206 + }, + { + "epoch": 3.5084745762711864, + "grad_norm": 21.105675455485688, + "learning_rate": 4.913854881282131e-07, + "logits/chosen": 9.626623153686523, + "logits/rejected": 10.561471939086914, + "logps/chosen": -13.190305709838867, + "logps/rejected": -27.12738800048828, + "loss": 0.3139, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.09586920589208603, + "rewards/margins": 1.6659499406814575, + "rewards/rejected": -1.5700807571411133, + "step": 207 + }, + { + "epoch": 3.5254237288135593, + "grad_norm": 21.947799986422574, + "learning_rate": 4.91191967203629e-07, + "logits/chosen": 8.077613830566406, + "logits/rejected": 8.295317649841309, + "logps/chosen": -15.712980270385742, + "logps/rejected": -27.68941879272461, + "loss": 0.3188, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1660003513097763, + "rewards/margins": 1.33102548122406, + "rewards/rejected": -1.165025234222412, + "step": 208 + }, + { + "epoch": 3.542372881355932, + "grad_norm": 21.54421619455441, + "learning_rate": 4.909963356435624e-07, + "logits/chosen": 7.969567775726318, + "logits/rejected": 8.85141658782959, + "logps/chosen": -11.287786483764648, + "logps/rejected": -26.90688133239746, + "loss": 0.3329, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15023496747016907, + "rewards/margins": 2.192362070083618, + "rewards/rejected": -2.0421271324157715, + "step": 209 + }, + { + "epoch": 3.559322033898305, + "grad_norm": 23.389960730472435, + "learning_rate": 4.907985951599563e-07, + "logits/chosen": 9.967187881469727, + "logits/rejected": 10.385942459106445, + "logps/chosen": -15.627347946166992, + "logps/rejected": -25.37283706665039, + "loss": 0.3403, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.147053062915802, + "rewards/margins": 1.3340773582458496, + "rewards/rejected": -1.481130599975586, + "step": 210 + }, + { + "epoch": 3.576271186440678, + "grad_norm": 20.386193598695463, + "learning_rate": 4.905987474832087e-07, + "logits/chosen": 6.471014499664307, + "logits/rejected": 8.414436340332031, + "logps/chosen": -19.985212326049805, + "logps/rejected": -30.850446701049805, + "loss": 0.3001, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.35499030351638794, + "rewards/margins": 1.4340555667877197, + "rewards/rejected": -1.079065203666687, + "step": 211 + }, + { + "epoch": 3.593220338983051, + "grad_norm": 19.86096413206925, + "learning_rate": 4.903967943621573e-07, + "logits/chosen": 7.104785919189453, + "logits/rejected": 6.808341979980469, + "logps/chosen": -21.855426788330078, + "logps/rejected": -26.186508178710938, + "loss": 0.2714, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.16494721174240112, + "rewards/margins": 1.7304604053497314, + "rewards/rejected": -1.5655131340026855, + "step": 212 + }, + { + "epoch": 3.610169491525424, + "grad_norm": 19.972878939684275, + "learning_rate": 4.901927375640642e-07, + "logits/chosen": 7.536691188812256, + "logits/rejected": 8.184225082397461, + "logps/chosen": -17.092975616455078, + "logps/rejected": -27.26826286315918, + "loss": 0.3071, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1841067671775818, + "rewards/margins": 1.6666710376739502, + "rewards/rejected": -1.4825642108917236, + "step": 213 + }, + { + "epoch": 3.6271186440677967, + "grad_norm": 20.907070751662914, + "learning_rate": 4.899865788746005e-07, + "logits/chosen": 6.0391011238098145, + "logits/rejected": 6.892394065856934, + "logps/chosen": -19.163902282714844, + "logps/rejected": -22.660507202148438, + "loss": 0.3244, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2965112328529358, + "rewards/margins": 2.102067470550537, + "rewards/rejected": -1.8055561780929565, + "step": 214 + }, + { + "epoch": 3.6440677966101696, + "grad_norm": 18.794923971592116, + "learning_rate": 4.897783200978305e-07, + "logits/chosen": 6.708567142486572, + "logits/rejected": 8.018110275268555, + "logps/chosen": -17.882518768310547, + "logps/rejected": -25.35689353942871, + "loss": 0.2939, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.06995850056409836, + "rewards/margins": 1.7058115005493164, + "rewards/rejected": -1.6358528137207031, + "step": 215 + }, + { + "epoch": 3.6610169491525424, + "grad_norm": 22.113554379144286, + "learning_rate": 4.895679630561963e-07, + "logits/chosen": 8.934822082519531, + "logits/rejected": 9.462247848510742, + "logps/chosen": -13.299782752990723, + "logps/rejected": -22.16495704650879, + "loss": 0.3266, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2283388376235962, + "rewards/margins": 1.3375617265701294, + "rewards/rejected": -1.1092228889465332, + "step": 216 + }, + { + "epoch": 3.6779661016949152, + "grad_norm": 22.146172855918522, + "learning_rate": 4.893555095905013e-07, + "logits/chosen": 4.558166027069092, + "logits/rejected": 5.124869346618652, + "logps/chosen": -23.284637451171875, + "logps/rejected": -31.29628562927246, + "loss": 0.3132, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2708703279495239, + "rewards/margins": 1.6527708768844604, + "rewards/rejected": -1.3819005489349365, + "step": 217 + }, + { + "epoch": 3.694915254237288, + "grad_norm": 20.522636710839876, + "learning_rate": 4.891409615598949e-07, + "logits/chosen": 7.233565807342529, + "logits/rejected": 7.809101581573486, + "logps/chosen": -17.282150268554688, + "logps/rejected": -21.183683395385742, + "loss": 0.3113, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11144751310348511, + "rewards/margins": 1.5106884241104126, + "rewards/rejected": -1.3992410898208618, + "step": 218 + }, + { + "epoch": 3.711864406779661, + "grad_norm": 19.979629024830988, + "learning_rate": 4.889243208418549e-07, + "logits/chosen": 7.521520614624023, + "logits/rejected": 7.7953314781188965, + "logps/chosen": -14.246883392333984, + "logps/rejected": -23.500028610229492, + "loss": 0.3296, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.27966731786727905, + "rewards/margins": 1.612604022026062, + "rewards/rejected": -1.3329367637634277, + "step": 219 + }, + { + "epoch": 3.7288135593220337, + "grad_norm": 21.796523646749833, + "learning_rate": 4.88705589332173e-07, + "logits/chosen": 9.764825820922852, + "logits/rejected": 9.947137832641602, + "logps/chosen": -13.865436553955078, + "logps/rejected": -18.25220489501953, + "loss": 0.3321, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22284789383411407, + "rewards/margins": 0.9826896786689758, + "rewards/rejected": -0.7598418593406677, + "step": 220 + }, + { + "epoch": 3.7457627118644066, + "grad_norm": 24.185750088279356, + "learning_rate": 4.884847689449361e-07, + "logits/chosen": 9.337504386901855, + "logits/rejected": 10.483132362365723, + "logps/chosen": -19.560300827026367, + "logps/rejected": -31.120304107666016, + "loss": 0.3651, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04826126992702484, + "rewards/margins": 1.525299310684204, + "rewards/rejected": -1.477038025856018, + "step": 221 + }, + { + "epoch": 3.7627118644067794, + "grad_norm": 20.57645440620922, + "learning_rate": 4.88261861612511e-07, + "logits/chosen": 7.008860111236572, + "logits/rejected": 7.396633625030518, + "logps/chosen": -19.443256378173828, + "logps/rejected": -21.94890594482422, + "loss": 0.2738, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2511497139930725, + "rewards/margins": 1.7133121490478516, + "rewards/rejected": -1.4621624946594238, + "step": 222 + }, + { + "epoch": 3.7796610169491527, + "grad_norm": 42.36323422895596, + "learning_rate": 4.880368692855273e-07, + "logits/chosen": 5.257360935211182, + "logits/rejected": 6.08449649810791, + "logps/chosen": -24.111492156982422, + "logps/rejected": -28.312646865844727, + "loss": 0.3026, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22682160139083862, + "rewards/margins": 1.9936718940734863, + "rewards/rejected": -1.766850233078003, + "step": 223 + }, + { + "epoch": 3.7966101694915255, + "grad_norm": 21.57536476218224, + "learning_rate": 4.878097939328596e-07, + "logits/chosen": 10.081559181213379, + "logits/rejected": 9.927122116088867, + "logps/chosen": -17.91533088684082, + "logps/rejected": -24.432580947875977, + "loss": 0.3221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07144062221050262, + "rewards/margins": 1.6415483951568604, + "rewards/rejected": -1.712989091873169, + "step": 224 + }, + { + "epoch": 3.8135593220338984, + "grad_norm": 22.918857955226564, + "learning_rate": 4.875806375416109e-07, + "logits/chosen": 9.115406036376953, + "logits/rejected": 9.300793647766113, + "logps/chosen": -22.0445499420166, + "logps/rejected": -24.975872039794922, + "loss": 0.3145, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.03266003727912903, + "rewards/margins": 1.8592348098754883, + "rewards/rejected": -1.8265748023986816, + "step": 225 + }, + { + "epoch": 3.830508474576271, + "grad_norm": 22.493511216167683, + "learning_rate": 4.873494021170954e-07, + "logits/chosen": 7.798854351043701, + "logits/rejected": 8.126043319702148, + "logps/chosen": -15.902989387512207, + "logps/rejected": -21.789674758911133, + "loss": 0.3353, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.045627400279045105, + "rewards/margins": 1.554894208908081, + "rewards/rejected": -1.5092668533325195, + "step": 226 + }, + { + "epoch": 3.847457627118644, + "grad_norm": 20.36579881861922, + "learning_rate": 4.871160896828199e-07, + "logits/chosen": 6.323782444000244, + "logits/rejected": 7.2763142585754395, + "logps/chosen": -20.429576873779297, + "logps/rejected": -27.979249954223633, + "loss": 0.3174, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.04189814627170563, + "rewards/margins": 1.8435866832733154, + "rewards/rejected": -1.801688551902771, + "step": 227 + }, + { + "epoch": 3.864406779661017, + "grad_norm": 24.774639537029763, + "learning_rate": 4.868807022804678e-07, + "logits/chosen": 9.863037109375, + "logits/rejected": 11.282732963562012, + "logps/chosen": -18.87766456604004, + "logps/rejected": -35.58513259887695, + "loss": 0.3485, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13600066304206848, + "rewards/margins": 2.0307259559631348, + "rewards/rejected": -2.16672682762146, + "step": 228 + }, + { + "epoch": 3.8813559322033897, + "grad_norm": 22.966813457902724, + "learning_rate": 4.866432419698792e-07, + "logits/chosen": 5.345111846923828, + "logits/rejected": 5.739473342895508, + "logps/chosen": -14.497096061706543, + "logps/rejected": -19.52205467224121, + "loss": 0.3202, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11341910064220428, + "rewards/margins": 1.1143789291381836, + "rewards/rejected": -1.000959873199463, + "step": 229 + }, + { + "epoch": 3.898305084745763, + "grad_norm": 20.655199996849774, + "learning_rate": 4.864037108290347e-07, + "logits/chosen": 7.576197624206543, + "logits/rejected": 8.149455070495605, + "logps/chosen": -13.794957160949707, + "logps/rejected": -35.497589111328125, + "loss": 0.3118, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09785757213830948, + "rewards/margins": 2.4956488609313965, + "rewards/rejected": -2.3977913856506348, + "step": 230 + }, + { + "epoch": 3.915254237288136, + "grad_norm": 22.031674230554742, + "learning_rate": 4.86162110954036e-07, + "logits/chosen": 5.174468040466309, + "logits/rejected": 5.88414192199707, + "logps/chosen": -14.31143569946289, + "logps/rejected": -18.397750854492188, + "loss": 0.3161, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.34032052755355835, + "rewards/margins": 0.8846842646598816, + "rewards/rejected": -0.5443637371063232, + "step": 231 + }, + { + "epoch": 3.9322033898305087, + "grad_norm": 21.411141660381265, + "learning_rate": 4.859184444590881e-07, + "logits/chosen": 9.678922653198242, + "logits/rejected": 10.129996299743652, + "logps/chosen": -15.969707489013672, + "logps/rejected": -21.065452575683594, + "loss": 0.2914, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13621875643730164, + "rewards/margins": 0.9995884895324707, + "rewards/rejected": -0.8633697032928467, + "step": 232 + }, + { + "epoch": 3.9491525423728815, + "grad_norm": 23.327719096582616, + "learning_rate": 4.856727134764809e-07, + "logits/chosen": 6.851404190063477, + "logits/rejected": 7.505338668823242, + "logps/chosen": -13.464282035827637, + "logps/rejected": -23.67028045654297, + "loss": 0.3166, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.0010511577129364014, + "rewards/margins": 1.9928821325302124, + "rewards/rejected": -1.9939334392547607, + "step": 233 + }, + { + "epoch": 3.9661016949152543, + "grad_norm": 22.34328312174226, + "learning_rate": 4.8542492015657e-07, + "logits/chosen": 11.095281600952148, + "logits/rejected": 11.888551712036133, + "logps/chosen": -18.534252166748047, + "logps/rejected": -27.535181045532227, + "loss": 0.3389, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1114778071641922, + "rewards/margins": 2.0294349193573, + "rewards/rejected": -1.9179573059082031, + "step": 234 + }, + { + "epoch": 3.983050847457627, + "grad_norm": 22.460277454690832, + "learning_rate": 4.851750666677583e-07, + "logits/chosen": 7.924036026000977, + "logits/rejected": 8.230151176452637, + "logps/chosen": -15.009639739990234, + "logps/rejected": -24.392051696777344, + "loss": 0.3365, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17272941768169403, + "rewards/margins": 1.2254295349121094, + "rewards/rejected": -1.0527000427246094, + "step": 235 + }, + { + "epoch": 4.0, + "grad_norm": 22.886063239469433, + "learning_rate": 4.849231551964771e-07, + "logits/chosen": 5.907500267028809, + "logits/rejected": 6.998715877532959, + "logps/chosen": -20.924211502075195, + "logps/rejected": -26.191280364990234, + "loss": 0.2911, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.018931731581687927, + "rewards/margins": 2.093313217163086, + "rewards/rejected": -2.1122450828552246, + "step": 236 + }, + { + "epoch": 4.016949152542373, + "grad_norm": 16.11228313336153, + "learning_rate": 4.846691879471666e-07, + "logits/chosen": 6.13948917388916, + "logits/rejected": 7.0086822509765625, + "logps/chosen": -21.371328353881836, + "logps/rejected": -26.916461944580078, + "loss": 0.218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2001882642507553, + "rewards/margins": 2.128901481628418, + "rewards/rejected": -1.928713321685791, + "step": 237 + }, + { + "epoch": 4.033898305084746, + "grad_norm": 19.987499022105034, + "learning_rate": 4.844131671422569e-07, + "logits/chosen": 6.478044033050537, + "logits/rejected": 6.766986846923828, + "logps/chosen": -16.220653533935547, + "logps/rejected": -19.514848709106445, + "loss": 0.3162, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3637089133262634, + "rewards/margins": 1.3370646238327026, + "rewards/rejected": -0.9733555912971497, + "step": 238 + }, + { + "epoch": 4.0508474576271185, + "grad_norm": 18.688925017295187, + "learning_rate": 4.841550950221485e-07, + "logits/chosen": 7.050149440765381, + "logits/rejected": 7.562372207641602, + "logps/chosen": -20.11605453491211, + "logps/rejected": -26.2134952545166, + "loss": 0.2711, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16213461756706238, + "rewards/margins": 1.712576985359192, + "rewards/rejected": -1.5504424571990967, + "step": 239 + }, + { + "epoch": 4.067796610169491, + "grad_norm": 18.92762993148661, + "learning_rate": 4.838949738451928e-07, + "logits/chosen": 7.53150749206543, + "logits/rejected": 7.950225830078125, + "logps/chosen": -19.729782104492188, + "logps/rejected": -30.533475875854492, + "loss": 0.2874, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2349463552236557, + "rewards/margins": 2.648629665374756, + "rewards/rejected": -2.4136834144592285, + "step": 240 + }, + { + "epoch": 4.084745762711864, + "grad_norm": 18.885623463835536, + "learning_rate": 4.836328058876717e-07, + "logits/chosen": 7.093875885009766, + "logits/rejected": 8.201071739196777, + "logps/chosen": -17.072683334350586, + "logps/rejected": -24.49260902404785, + "loss": 0.2928, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.017010435461997986, + "rewards/margins": 1.231659173965454, + "rewards/rejected": -1.214648723602295, + "step": 241 + }, + { + "epoch": 4.101694915254237, + "grad_norm": 18.79125519072356, + "learning_rate": 4.833685934437787e-07, + "logits/chosen": 6.813934803009033, + "logits/rejected": 7.2115678787231445, + "logps/chosen": -17.01702117919922, + "logps/rejected": -29.53050422668457, + "loss": 0.2546, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4310588240623474, + "rewards/margins": 1.8695611953735352, + "rewards/rejected": -1.438502311706543, + "step": 242 + }, + { + "epoch": 4.11864406779661, + "grad_norm": 16.887866681775407, + "learning_rate": 4.831023388255979e-07, + "logits/chosen": 5.699078559875488, + "logits/rejected": 6.792886734008789, + "logps/chosen": -18.427654266357422, + "logps/rejected": -24.72570037841797, + "loss": 0.2358, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.15313753485679626, + "rewards/margins": 2.126829147338867, + "rewards/rejected": -1.973691701889038, + "step": 243 + }, + { + "epoch": 4.135593220338983, + "grad_norm": 17.68436179219868, + "learning_rate": 4.828340443630846e-07, + "logits/chosen": 6.9619526863098145, + "logits/rejected": 7.495880126953125, + "logps/chosen": -16.34537124633789, + "logps/rejected": -22.424455642700195, + "loss": 0.3041, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.48435744643211365, + "rewards/margins": 1.6667134761810303, + "rewards/rejected": -1.1823559999465942, + "step": 244 + }, + { + "epoch": 4.1525423728813555, + "grad_norm": 19.718183477610275, + "learning_rate": 4.825637124040441e-07, + "logits/chosen": 9.444064140319824, + "logits/rejected": 9.68178653717041, + "logps/chosen": -17.95220947265625, + "logps/rejected": -29.515003204345703, + "loss": 0.2785, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34623271226882935, + "rewards/margins": 1.8744040727615356, + "rewards/rejected": -1.5281713008880615, + "step": 245 + }, + { + "epoch": 4.169491525423728, + "grad_norm": 19.424536492921106, + "learning_rate": 4.822913453141117e-07, + "logits/chosen": 7.507587432861328, + "logits/rejected": 7.846378326416016, + "logps/chosen": -15.093152046203613, + "logps/rejected": -24.109743118286133, + "loss": 0.2887, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.22552460432052612, + "rewards/margins": 1.9814910888671875, + "rewards/rejected": -1.7559665441513062, + "step": 246 + }, + { + "epoch": 4.186440677966102, + "grad_norm": 19.10137459530579, + "learning_rate": 4.820169454767318e-07, + "logits/chosen": 8.067806243896484, + "logits/rejected": 9.25924301147461, + "logps/chosen": -16.840450286865234, + "logps/rejected": -28.04216957092285, + "loss": 0.2616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35099655389785767, + "rewards/margins": 2.54416561126709, + "rewards/rejected": -2.193168878555298, + "step": 247 + }, + { + "epoch": 4.203389830508475, + "grad_norm": 18.565041100888553, + "learning_rate": 4.81740515293137e-07, + "logits/chosen": 8.024762153625488, + "logits/rejected": 8.494616508483887, + "logps/chosen": -17.160768508911133, + "logps/rejected": -26.250389099121094, + "loss": 0.2831, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26058316230773926, + "rewards/margins": 1.5284281969070435, + "rewards/rejected": -1.2678451538085938, + "step": 248 + }, + { + "epoch": 4.220338983050848, + "grad_norm": 17.696717834540067, + "learning_rate": 4.814620571823274e-07, + "logits/chosen": 5.288139343261719, + "logits/rejected": 5.49987268447876, + "logps/chosen": -21.20608901977539, + "logps/rejected": -30.10845947265625, + "loss": 0.284, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29643139243125916, + "rewards/margins": 2.4248037338256836, + "rewards/rejected": -2.1283726692199707, + "step": 249 + }, + { + "epoch": 4.237288135593221, + "grad_norm": 16.08433980916575, + "learning_rate": 4.811815735810489e-07, + "logits/chosen": 6.320197105407715, + "logits/rejected": 7.34105110168457, + "logps/chosen": -18.912696838378906, + "logps/rejected": -30.96662139892578, + "loss": 0.2491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30656683444976807, + "rewards/margins": 2.7686104774475098, + "rewards/rejected": -2.462043285369873, + "step": 250 + }, + { + "epoch": 4.254237288135593, + "grad_norm": 16.436104483894702, + "learning_rate": 4.808990669437724e-07, + "logits/chosen": 3.874683380126953, + "logits/rejected": 4.041537761688232, + "logps/chosen": -13.296972274780273, + "logps/rejected": -26.234445571899414, + "loss": 0.2308, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.20190800726413727, + "rewards/margins": 2.277385711669922, + "rewards/rejected": -2.0754778385162354, + "step": 251 + }, + { + "epoch": 4.271186440677966, + "grad_norm": 19.96329758104426, + "learning_rate": 4.806145397426719e-07, + "logits/chosen": 8.5095853805542, + "logits/rejected": 8.628890991210938, + "logps/chosen": -11.131439208984375, + "logps/rejected": -20.39238166809082, + "loss": 0.277, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16035281121730804, + "rewards/margins": 2.0513343811035156, + "rewards/rejected": -1.890981674194336, + "step": 252 + }, + { + "epoch": 4.288135593220339, + "grad_norm": 18.10784435727316, + "learning_rate": 4.803279944676032e-07, + "logits/chosen": 7.8867902755737305, + "logits/rejected": 7.66014289855957, + "logps/chosen": -11.152649879455566, + "logps/rejected": -24.569089889526367, + "loss": 0.2508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1465606689453125, + "rewards/margins": 1.89396071434021, + "rewards/rejected": -1.747400164604187, + "step": 253 + }, + { + "epoch": 4.305084745762712, + "grad_norm": 16.848798055022613, + "learning_rate": 4.800394336260819e-07, + "logits/chosen": 3.822847366333008, + "logits/rejected": 5.139825820922852, + "logps/chosen": -14.96919059753418, + "logps/rejected": -27.31922149658203, + "loss": 0.2692, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.23743754625320435, + "rewards/margins": 1.99288010597229, + "rewards/rejected": -1.755442500114441, + "step": 254 + }, + { + "epoch": 4.322033898305085, + "grad_norm": 17.12136208102501, + "learning_rate": 4.797488597432616e-07, + "logits/chosen": 4.065765380859375, + "logits/rejected": 5.0273637771606445, + "logps/chosen": -24.291616439819336, + "logps/rejected": -23.00619888305664, + "loss": 0.2292, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.25025373697280884, + "rewards/margins": 1.9274500608444214, + "rewards/rejected": -1.6771961450576782, + "step": 255 + }, + { + "epoch": 4.338983050847458, + "grad_norm": 17.427062960507172, + "learning_rate": 4.794562753619117e-07, + "logits/chosen": 6.049924373626709, + "logits/rejected": 5.929304122924805, + "logps/chosen": -18.5858211517334, + "logps/rejected": -23.01677703857422, + "loss": 0.2702, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.24605974555015564, + "rewards/margins": 1.5662342309951782, + "rewards/rejected": -1.3201744556427002, + "step": 256 + }, + { + "epoch": 4.3559322033898304, + "grad_norm": 15.920672408874744, + "learning_rate": 4.791616830423949e-07, + "logits/chosen": 7.087856292724609, + "logits/rejected": 7.3572678565979, + "logps/chosen": -17.611812591552734, + "logps/rejected": -23.177900314331055, + "loss": 0.2335, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2877931594848633, + "rewards/margins": 1.9493603706359863, + "rewards/rejected": -1.6615670919418335, + "step": 257 + }, + { + "epoch": 4.372881355932203, + "grad_norm": 16.27906455725465, + "learning_rate": 4.788650853626456e-07, + "logits/chosen": 4.602998733520508, + "logits/rejected": 5.435512542724609, + "logps/chosen": -18.157915115356445, + "logps/rejected": -23.088634490966797, + "loss": 0.2488, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42691361904144287, + "rewards/margins": 2.0973167419433594, + "rewards/rejected": -1.6704033613204956, + "step": 258 + }, + { + "epoch": 4.389830508474576, + "grad_norm": 15.402435870137548, + "learning_rate": 4.785664849181465e-07, + "logits/chosen": 8.592238426208496, + "logits/rejected": 9.519624710083008, + "logps/chosen": -14.045475959777832, + "logps/rejected": -23.337627410888672, + "loss": 0.2581, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.038690753281116486, + "rewards/margins": 1.2892788648605347, + "rewards/rejected": -1.250588297843933, + "step": 259 + }, + { + "epoch": 4.406779661016949, + "grad_norm": 16.299035928417254, + "learning_rate": 4.78265884321906e-07, + "logits/chosen": 5.048234939575195, + "logits/rejected": 5.757306098937988, + "logps/chosen": -14.810338973999023, + "logps/rejected": -26.654769897460938, + "loss": 0.264, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16818922758102417, + "rewards/margins": 1.9285526275634766, + "rewards/rejected": -1.7603633403778076, + "step": 260 + }, + { + "epoch": 4.423728813559322, + "grad_norm": 17.024561674564683, + "learning_rate": 4.779632862044361e-07, + "logits/chosen": 5.772840976715088, + "logits/rejected": 6.783274173736572, + "logps/chosen": -17.9024715423584, + "logps/rejected": -30.046398162841797, + "loss": 0.2566, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1782139241695404, + "rewards/margins": 2.617436408996582, + "rewards/rejected": -2.439222574234009, + "step": 261 + }, + { + "epoch": 4.440677966101695, + "grad_norm": 17.896887917576542, + "learning_rate": 4.776586932137283e-07, + "logits/chosen": 8.3196382522583, + "logits/rejected": 8.486194610595703, + "logps/chosen": -18.671096801757812, + "logps/rejected": -23.115036010742188, + "loss": 0.262, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.37524986267089844, + "rewards/margins": 1.8184455633163452, + "rewards/rejected": -1.4431955814361572, + "step": 262 + }, + { + "epoch": 4.4576271186440675, + "grad_norm": 18.02282203660666, + "learning_rate": 4.773521080152311e-07, + "logits/chosen": 5.8606696128845215, + "logits/rejected": 6.721665382385254, + "logps/chosen": -19.343944549560547, + "logps/rejected": -32.12931442260742, + "loss": 0.2426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45170512795448303, + "rewards/margins": 2.2819502353668213, + "rewards/rejected": -1.830245018005371, + "step": 263 + }, + { + "epoch": 4.47457627118644, + "grad_norm": 16.54309012883112, + "learning_rate": 4.770435332918267e-07, + "logits/chosen": 6.439443588256836, + "logits/rejected": 7.319515228271484, + "logps/chosen": -18.117504119873047, + "logps/rejected": -26.829072952270508, + "loss": 0.2231, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21523486077785492, + "rewards/margins": 2.130596876144409, + "rewards/rejected": -1.915362000465393, + "step": 264 + }, + { + "epoch": 4.491525423728813, + "grad_norm": 17.419485402954614, + "learning_rate": 4.76732971743807e-07, + "logits/chosen": 7.095752716064453, + "logits/rejected": 8.249744415283203, + "logps/chosen": -15.102618217468262, + "logps/rejected": -24.421607971191406, + "loss": 0.2272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3142382502555847, + "rewards/margins": 2.8125152587890625, + "rewards/rejected": -2.498276710510254, + "step": 265 + }, + { + "epoch": 4.508474576271187, + "grad_norm": 18.562995753543007, + "learning_rate": 4.7642042608885056e-07, + "logits/chosen": 7.712843894958496, + "logits/rejected": 8.154708862304688, + "logps/chosen": -21.448400497436523, + "logps/rejected": -29.982311248779297, + "loss": 0.2839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.047302231192588806, + "rewards/margins": 2.0715363025665283, + "rewards/rejected": -2.0242340564727783, + "step": 266 + }, + { + "epoch": 4.52542372881356, + "grad_norm": 17.86786243504552, + "learning_rate": 4.761058990619986e-07, + "logits/chosen": 5.361347675323486, + "logits/rejected": 5.248828411102295, + "logps/chosen": -18.133102416992188, + "logps/rejected": -25.32653045654297, + "loss": 0.2412, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06450856477022171, + "rewards/margins": 2.018827438354492, + "rewards/rejected": -1.954318881034851, + "step": 267 + }, + { + "epoch": 4.5423728813559325, + "grad_norm": 16.209189141288928, + "learning_rate": 4.757893934156309e-07, + "logits/chosen": 7.7050018310546875, + "logits/rejected": 8.658272743225098, + "logps/chosen": -15.19764232635498, + "logps/rejected": -30.116540908813477, + "loss": 0.2379, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.209306001663208, + "rewards/margins": 2.709834575653076, + "rewards/rejected": -2.500528573989868, + "step": 268 + }, + { + "epoch": 4.559322033898305, + "grad_norm": 17.963829609972713, + "learning_rate": 4.754709119194418e-07, + "logits/chosen": 3.150035858154297, + "logits/rejected": 3.4155702590942383, + "logps/chosen": -16.21509552001953, + "logps/rejected": -31.73999786376953, + "loss": 0.2436, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1859074831008911, + "rewards/margins": 2.52579402923584, + "rewards/rejected": -2.339886426925659, + "step": 269 + }, + { + "epoch": 4.576271186440678, + "grad_norm": 18.147760096125833, + "learning_rate": 4.7515045736041615e-07, + "logits/chosen": 7.926346778869629, + "logits/rejected": 9.348649024963379, + "logps/chosen": -12.561721801757812, + "logps/rejected": -29.205713272094727, + "loss": 0.2694, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32173627614974976, + "rewards/margins": 2.6866581439971924, + "rewards/rejected": -2.364922046661377, + "step": 270 + }, + { + "epoch": 4.593220338983051, + "grad_norm": 16.51138027595473, + "learning_rate": 4.748280325428048e-07, + "logits/chosen": 10.498759269714355, + "logits/rejected": 10.979084014892578, + "logps/chosen": -17.349008560180664, + "logps/rejected": -32.83536911010742, + "loss": 0.2308, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.198812797665596, + "rewards/margins": 2.9069724082946777, + "rewards/rejected": -2.7081594467163086, + "step": 271 + }, + { + "epoch": 4.610169491525424, + "grad_norm": 17.18774465901935, + "learning_rate": 4.745036402880999e-07, + "logits/chosen": 3.7919700145721436, + "logits/rejected": 4.364020347595215, + "logps/chosen": -13.981499671936035, + "logps/rejected": -24.345117568969727, + "loss": 0.2668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21096213161945343, + "rewards/margins": 2.6246230602264404, + "rewards/rejected": -2.413661003112793, + "step": 272 + }, + { + "epoch": 4.627118644067797, + "grad_norm": 18.245124366119207, + "learning_rate": 4.741772834350104e-07, + "logits/chosen": 6.9369378089904785, + "logits/rejected": 7.000739574432373, + "logps/chosen": -18.30843734741211, + "logps/rejected": -26.522764205932617, + "loss": 0.269, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11697474122047424, + "rewards/margins": 1.2881109714508057, + "rewards/rejected": -1.1711363792419434, + "step": 273 + }, + { + "epoch": 4.6440677966101696, + "grad_norm": 18.848939003698714, + "learning_rate": 4.7384896483943726e-07, + "logits/chosen": 4.844106197357178, + "logits/rejected": 5.648234844207764, + "logps/chosen": -11.51663589477539, + "logps/rejected": -27.013900756835938, + "loss": 0.2596, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3477116823196411, + "rewards/margins": 2.468505859375, + "rewards/rejected": -2.1207942962646484, + "step": 274 + }, + { + "epoch": 4.661016949152542, + "grad_norm": 21.485542587702177, + "learning_rate": 4.7351868737444825e-07, + "logits/chosen": 9.079914093017578, + "logits/rejected": 9.508895874023438, + "logps/chosen": -14.13361930847168, + "logps/rejected": -27.242555618286133, + "loss": 0.3004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.02514958381652832, + "rewards/margins": 1.8603540658950806, + "rewards/rejected": -1.8855036497116089, + "step": 275 + }, + { + "epoch": 4.677966101694915, + "grad_norm": 19.670219619103424, + "learning_rate": 4.7318645393025305e-07, + "logits/chosen": 8.693038940429688, + "logits/rejected": 9.391683578491211, + "logps/chosen": -16.673078536987305, + "logps/rejected": -21.615997314453125, + "loss": 0.2778, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.019453592598438263, + "rewards/margins": 1.658519983291626, + "rewards/rejected": -1.6390663385391235, + "step": 276 + }, + { + "epoch": 4.694915254237288, + "grad_norm": 16.137053552384206, + "learning_rate": 4.7285226741417753e-07, + "logits/chosen": 4.7442708015441895, + "logits/rejected": 5.013760566711426, + "logps/chosen": -19.061439514160156, + "logps/rejected": -25.69558334350586, + "loss": 0.2495, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4043675363063812, + "rewards/margins": 2.4568257331848145, + "rewards/rejected": -2.0524580478668213, + "step": 277 + }, + { + "epoch": 4.711864406779661, + "grad_norm": 17.739725289410767, + "learning_rate": 4.7251613075063905e-07, + "logits/chosen": 5.071934700012207, + "logits/rejected": 6.222379684448242, + "logps/chosen": -14.962982177734375, + "logps/rejected": -25.778844833374023, + "loss": 0.2445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3471611440181732, + "rewards/margins": 2.654113531112671, + "rewards/rejected": -2.306952476501465, + "step": 278 + }, + { + "epoch": 4.728813559322034, + "grad_norm": 16.298662722486412, + "learning_rate": 4.721780468811201e-07, + "logits/chosen": 5.887434482574463, + "logits/rejected": 6.384895324707031, + "logps/chosen": -19.674272537231445, + "logps/rejected": -23.1964111328125, + "loss": 0.2212, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23275047540664673, + "rewards/margins": 1.6070315837860107, + "rewards/rejected": -1.3742811679840088, + "step": 279 + }, + { + "epoch": 4.745762711864407, + "grad_norm": 15.906115484751137, + "learning_rate": 4.7183801876414286e-07, + "logits/chosen": 7.360743045806885, + "logits/rejected": 8.02312183380127, + "logps/chosen": -15.797630310058594, + "logps/rejected": -25.812213897705078, + "loss": 0.2118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.017660900950431824, + "rewards/margins": 2.3710832595825195, + "rewards/rejected": -2.3534226417541504, + "step": 280 + }, + { + "epoch": 4.762711864406779, + "grad_norm": 15.677627452398355, + "learning_rate": 4.7149604937524356e-07, + "logits/chosen": 3.6860146522521973, + "logits/rejected": 4.597846031188965, + "logps/chosen": -23.743751525878906, + "logps/rejected": -34.87071228027344, + "loss": 0.2401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40262383222579956, + "rewards/margins": 2.2429513931274414, + "rewards/rejected": -1.8403273820877075, + "step": 281 + }, + { + "epoch": 4.779661016949152, + "grad_norm": 14.517687160323698, + "learning_rate": 4.7115214170694616e-07, + "logits/chosen": 5.888266563415527, + "logits/rejected": 6.816344261169434, + "logps/chosen": -15.760833740234375, + "logps/rejected": -31.780643463134766, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.04766558110713959, + "rewards/margins": 2.9352927207946777, + "rewards/rejected": -2.887627124786377, + "step": 282 + }, + { + "epoch": 4.796610169491525, + "grad_norm": 18.000705159355824, + "learning_rate": 4.70806298768736e-07, + "logits/chosen": 7.001628398895264, + "logits/rejected": 6.544361114501953, + "logps/chosen": -16.813600540161133, + "logps/rejected": -20.177221298217773, + "loss": 0.2368, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17308537662029266, + "rewards/margins": 2.580078125, + "rewards/rejected": -2.4069926738739014, + "step": 283 + }, + { + "epoch": 4.813559322033898, + "grad_norm": 17.74785330269053, + "learning_rate": 4.70458523587034e-07, + "logits/chosen": 4.586512565612793, + "logits/rejected": 5.216480255126953, + "logps/chosen": -21.734012603759766, + "logps/rejected": -38.459041595458984, + "loss": 0.2345, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13362450897693634, + "rewards/margins": 2.968825578689575, + "rewards/rejected": -2.8352010250091553, + "step": 284 + }, + { + "epoch": 4.830508474576272, + "grad_norm": 17.69313020536604, + "learning_rate": 4.701088192051695e-07, + "logits/chosen": 4.594993591308594, + "logits/rejected": 5.305399417877197, + "logps/chosen": -19.230627059936523, + "logps/rejected": -29.320083618164062, + "loss": 0.2494, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4186445474624634, + "rewards/margins": 2.536355972290039, + "rewards/rejected": -2.117711305618286, + "step": 285 + }, + { + "epoch": 4.847457627118644, + "grad_norm": 15.219068766771, + "learning_rate": 4.697571886833543e-07, + "logits/chosen": 6.026544094085693, + "logits/rejected": 6.7140607833862305, + "logps/chosen": -15.21136474609375, + "logps/rejected": -27.875221252441406, + "loss": 0.215, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03304705023765564, + "rewards/margins": 2.2568628787994385, + "rewards/rejected": -2.22381591796875, + "step": 286 + }, + { + "epoch": 4.864406779661017, + "grad_norm": 15.548775123675558, + "learning_rate": 4.6940363509865553e-07, + "logits/chosen": 8.8172607421875, + "logits/rejected": 9.029597282409668, + "logps/chosen": -17.15589714050293, + "logps/rejected": -25.47644805908203, + "loss": 0.2311, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38684672117233276, + "rewards/margins": 2.146179676055908, + "rewards/rejected": -1.7593328952789307, + "step": 287 + }, + { + "epoch": 4.88135593220339, + "grad_norm": 16.621647617560352, + "learning_rate": 4.6904816154496854e-07, + "logits/chosen": 7.06538200378418, + "logits/rejected": 8.518985748291016, + "logps/chosen": -16.530170440673828, + "logps/rejected": -25.592021942138672, + "loss": 0.2418, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1437515765428543, + "rewards/margins": 2.5299487113952637, + "rewards/rejected": -2.386197328567505, + "step": 288 + }, + { + "epoch": 4.898305084745763, + "grad_norm": 18.20879716146644, + "learning_rate": 4.6869077113299025e-07, + "logits/chosen": 4.580350399017334, + "logits/rejected": 5.193227767944336, + "logps/chosen": -16.725788116455078, + "logps/rejected": -27.487756729125977, + "loss": 0.2752, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.07869633287191391, + "rewards/margins": 1.970123052597046, + "rewards/rejected": -2.0488195419311523, + "step": 289 + }, + { + "epoch": 4.915254237288136, + "grad_norm": 17.56550072326665, + "learning_rate": 4.6833146699019177e-07, + "logits/chosen": 7.187812328338623, + "logits/rejected": 7.607518196105957, + "logps/chosen": -15.682883262634277, + "logps/rejected": -30.07357406616211, + "loss": 0.272, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13676121830940247, + "rewards/margins": 2.355900287628174, + "rewards/rejected": -2.2191390991210938, + "step": 290 + }, + { + "epoch": 4.932203389830509, + "grad_norm": 17.319715186835847, + "learning_rate": 4.6797025226079074e-07, + "logits/chosen": 8.21790885925293, + "logits/rejected": 8.874201774597168, + "logps/chosen": -16.58701515197754, + "logps/rejected": -28.231704711914062, + "loss": 0.2398, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.043866753578186035, + "rewards/margins": 2.017036199569702, + "rewards/rejected": -2.0609028339385986, + "step": 291 + }, + { + "epoch": 4.9491525423728815, + "grad_norm": 17.60150177581467, + "learning_rate": 4.676071301057243e-07, + "logits/chosen": 4.695080757141113, + "logits/rejected": 5.318776607513428, + "logps/chosen": -15.804220199584961, + "logps/rejected": -26.266937255859375, + "loss": 0.262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24425703287124634, + "rewards/margins": 1.4589682817459106, + "rewards/rejected": -1.214711308479309, + "step": 292 + }, + { + "epoch": 4.966101694915254, + "grad_norm": 18.261705360854126, + "learning_rate": 4.67242103702621e-07, + "logits/chosen": 5.390717506408691, + "logits/rejected": 6.542966842651367, + "logps/chosen": -16.86713981628418, + "logps/rejected": -26.163740158081055, + "loss": 0.2747, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0493212565779686, + "rewards/margins": 1.8711323738098145, + "rewards/rejected": -1.8218111991882324, + "step": 293 + }, + { + "epoch": 4.983050847457627, + "grad_norm": 16.247074569406255, + "learning_rate": 4.668751762457733e-07, + "logits/chosen": 6.2724609375, + "logits/rejected": 6.9650468826293945, + "logps/chosen": -14.587836265563965, + "logps/rejected": -28.363473892211914, + "loss": 0.2201, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.029139414429664612, + "rewards/margins": 2.09466814994812, + "rewards/rejected": -2.0655288696289062, + "step": 294 + }, + { + "epoch": 5.0, + "grad_norm": 15.163845862231213, + "learning_rate": 4.6650635094610966e-07, + "logits/chosen": 6.010763168334961, + "logits/rejected": 7.175766468048096, + "logps/chosen": -15.043150901794434, + "logps/rejected": -26.033666610717773, + "loss": 0.2364, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40571334958076477, + "rewards/margins": 2.3270514011383057, + "rewards/rejected": -1.9213382005691528, + "step": 295 + }, + { + "epoch": 5.016949152542373, + "grad_norm": 12.995315182536677, + "learning_rate": 4.661356310311659e-07, + "logits/chosen": 6.765958786010742, + "logits/rejected": 7.221011161804199, + "logps/chosen": -17.7678165435791, + "logps/rejected": -30.081592559814453, + "loss": 0.208, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16127540171146393, + "rewards/margins": 2.3699498176574707, + "rewards/rejected": -2.5312252044677734, + "step": 296 + }, + { + "epoch": 5.033898305084746, + "grad_norm": 16.281282112966135, + "learning_rate": 4.657630197450576e-07, + "logits/chosen": 4.995595455169678, + "logits/rejected": 6.14824104309082, + "logps/chosen": -27.272743225097656, + "logps/rejected": -37.27703857421875, + "loss": 0.2147, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.37089285254478455, + "rewards/margins": 3.0824501514434814, + "rewards/rejected": -2.711557149887085, + "step": 297 + }, + { + "epoch": 5.0508474576271185, + "grad_norm": 15.038788736271039, + "learning_rate": 4.653885203484515e-07, + "logits/chosen": 4.113032817840576, + "logits/rejected": 4.685057163238525, + "logps/chosen": -13.144742965698242, + "logps/rejected": -19.484567642211914, + "loss": 0.2289, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3587087392807007, + "rewards/margins": 1.6604740619659424, + "rewards/rejected": -1.3017653226852417, + "step": 298 + }, + { + "epoch": 5.067796610169491, + "grad_norm": 13.341197053744226, + "learning_rate": 4.6501213611853673e-07, + "logits/chosen": 3.0600595474243164, + "logits/rejected": 3.6969590187072754, + "logps/chosen": -14.055743217468262, + "logps/rejected": -30.396642684936523, + "loss": 0.1703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2175249606370926, + "rewards/margins": 2.968029022216797, + "rewards/rejected": -2.750504493713379, + "step": 299 + }, + { + "epoch": 5.084745762711864, + "grad_norm": 15.64944020606352, + "learning_rate": 4.6463387034899643e-07, + "logits/chosen": 7.053459167480469, + "logits/rejected": 6.929786682128906, + "logps/chosen": -17.727378845214844, + "logps/rejected": -26.59695816040039, + "loss": 0.2295, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2067881077528, + "rewards/margins": 2.4637019634246826, + "rewards/rejected": -2.256913661956787, + "step": 300 + }, + { + "epoch": 5.101694915254237, + "grad_norm": 13.927587181216817, + "learning_rate": 4.642537263499788e-07, + "logits/chosen": 5.323790550231934, + "logits/rejected": 5.755880832672119, + "logps/chosen": -12.35572338104248, + "logps/rejected": -24.77527618408203, + "loss": 0.2085, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3975515365600586, + "rewards/margins": 2.341681957244873, + "rewards/rejected": -1.9441306591033936, + "step": 301 + }, + { + "epoch": 5.11864406779661, + "grad_norm": 13.003919286175359, + "learning_rate": 4.6387170744806813e-07, + "logits/chosen": 4.886084079742432, + "logits/rejected": 6.434139728546143, + "logps/chosen": -17.88004493713379, + "logps/rejected": -33.52429962158203, + "loss": 0.2086, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4249918758869171, + "rewards/margins": 3.037893772125244, + "rewards/rejected": -2.6129019260406494, + "step": 302 + }, + { + "epoch": 5.135593220338983, + "grad_norm": 13.709462601856833, + "learning_rate": 4.634878169862557e-07, + "logits/chosen": 6.236474514007568, + "logits/rejected": 6.070339202880859, + "logps/chosen": -18.325437545776367, + "logps/rejected": -27.6246337890625, + "loss": 0.2171, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.09506276994943619, + "rewards/margins": 1.819183588027954, + "rewards/rejected": -1.7241206169128418, + "step": 303 + }, + { + "epoch": 5.1525423728813555, + "grad_norm": 13.530622153939468, + "learning_rate": 4.6310205832391065e-07, + "logits/chosen": 4.541669845581055, + "logits/rejected": 5.271422863006592, + "logps/chosen": -18.641277313232422, + "logps/rejected": -29.24665069580078, + "loss": 0.1922, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12702220678329468, + "rewards/margins": 3.098491668701172, + "rewards/rejected": -2.9714694023132324, + "step": 304 + }, + { + "epoch": 5.169491525423728, + "grad_norm": 16.78342955862146, + "learning_rate": 4.6271443483675027e-07, + "logits/chosen": 5.560043811798096, + "logits/rejected": 6.474727630615234, + "logps/chosen": -15.253793716430664, + "logps/rejected": -22.487646102905273, + "loss": 0.2217, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.13450895249843597, + "rewards/margins": 1.9386663436889648, + "rewards/rejected": -1.8041574954986572, + "step": 305 + }, + { + "epoch": 5.186440677966102, + "grad_norm": 13.925990324124848, + "learning_rate": 4.6232494991681087e-07, + "logits/chosen": 4.304049015045166, + "logits/rejected": 5.471685886383057, + "logps/chosen": -16.470518112182617, + "logps/rejected": -29.901363372802734, + "loss": 0.1896, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.024986281991004944, + "rewards/margins": 2.9503417015075684, + "rewards/rejected": -2.9253554344177246, + "step": 306 + }, + { + "epoch": 5.203389830508475, + "grad_norm": 12.898539468992048, + "learning_rate": 4.6193360697241766e-07, + "logits/chosen": 6.212376594543457, + "logits/rejected": 6.92727518081665, + "logps/chosen": -19.12429428100586, + "logps/rejected": -32.73984146118164, + "loss": 0.1965, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17453734576702118, + "rewards/margins": 3.106781482696533, + "rewards/rejected": -2.932243585586548, + "step": 307 + }, + { + "epoch": 5.220338983050848, + "grad_norm": 14.083485689677397, + "learning_rate": 4.615404094281554e-07, + "logits/chosen": 5.328009605407715, + "logits/rejected": 6.5234055519104, + "logps/chosen": -16.867034912109375, + "logps/rejected": -28.303224563598633, + "loss": 0.1882, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20804297924041748, + "rewards/margins": 2.509683847427368, + "rewards/rejected": -2.301640748977661, + "step": 308 + }, + { + "epoch": 5.237288135593221, + "grad_norm": 13.457404135592146, + "learning_rate": 4.611453607248381e-07, + "logits/chosen": 4.071807384490967, + "logits/rejected": 5.283750534057617, + "logps/chosen": -17.891639709472656, + "logps/rejected": -26.934532165527344, + "loss": 0.1876, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4134616255760193, + "rewards/margins": 3.161255121231079, + "rewards/rejected": -2.747793674468994, + "step": 309 + }, + { + "epoch": 5.254237288135593, + "grad_norm": 14.350604886951919, + "learning_rate": 4.607484643194788e-07, + "logits/chosen": 5.258482456207275, + "logits/rejected": 5.556510925292969, + "logps/chosen": -15.743051528930664, + "logps/rejected": -24.031320571899414, + "loss": 0.2009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37243759632110596, + "rewards/margins": 2.6536245346069336, + "rewards/rejected": -2.281187057495117, + "step": 310 + }, + { + "epoch": 5.271186440677966, + "grad_norm": 14.91827512482227, + "learning_rate": 4.6034972368525957e-07, + "logits/chosen": 5.340587139129639, + "logits/rejected": 6.861990451812744, + "logps/chosen": -15.216270446777344, + "logps/rejected": -30.178709030151367, + "loss": 0.2054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19152489304542542, + "rewards/margins": 2.885601282119751, + "rewards/rejected": -3.0771260261535645, + "step": 311 + }, + { + "epoch": 5.288135593220339, + "grad_norm": 15.236909598130472, + "learning_rate": 4.599491423115014e-07, + "logits/chosen": 5.011383056640625, + "logits/rejected": 5.633141040802002, + "logps/chosen": -16.395797729492188, + "logps/rejected": -29.717737197875977, + "loss": 0.206, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19525401294231415, + "rewards/margins": 2.0301461219787598, + "rewards/rejected": -1.8348920345306396, + "step": 312 + }, + { + "epoch": 5.305084745762712, + "grad_norm": 13.545653395969534, + "learning_rate": 4.595467237036329e-07, + "logits/chosen": 5.080275535583496, + "logits/rejected": 5.903102397918701, + "logps/chosen": -14.001216888427734, + "logps/rejected": -21.97933006286621, + "loss": 0.2029, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3123638927936554, + "rewards/margins": 1.9239617586135864, + "rewards/rejected": -1.6115977764129639, + "step": 313 + }, + { + "epoch": 5.322033898305085, + "grad_norm": 13.661349245141958, + "learning_rate": 4.591424713831602e-07, + "logits/chosen": 4.491490840911865, + "logits/rejected": 5.605785369873047, + "logps/chosen": -14.745203018188477, + "logps/rejected": -32.56706619262695, + "loss": 0.2112, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3098144829273224, + "rewards/margins": 3.300020456314087, + "rewards/rejected": -2.990206003189087, + "step": 314 + }, + { + "epoch": 5.338983050847458, + "grad_norm": 15.004385831738075, + "learning_rate": 4.587363888876361e-07, + "logits/chosen": 6.41065788269043, + "logits/rejected": 7.018966197967529, + "logps/chosen": -13.388933181762695, + "logps/rejected": -28.454553604125977, + "loss": 0.2148, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11646291613578796, + "rewards/margins": 2.6084635257720947, + "rewards/rejected": -2.4920005798339844, + "step": 315 + }, + { + "epoch": 5.3559322033898304, + "grad_norm": 16.490268828216397, + "learning_rate": 4.583284797706287e-07, + "logits/chosen": 5.329119682312012, + "logits/rejected": 5.552703380584717, + "logps/chosen": -12.61679744720459, + "logps/rejected": -22.200180053710938, + "loss": 0.2401, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.10427120327949524, + "rewards/margins": 1.3166574239730835, + "rewards/rejected": -1.212386131286621, + "step": 316 + }, + { + "epoch": 5.372881355932203, + "grad_norm": 13.010137117952876, + "learning_rate": 4.5791874760169093e-07, + "logits/chosen": 3.9311819076538086, + "logits/rejected": 4.017928123474121, + "logps/chosen": -14.682848930358887, + "logps/rejected": -20.934898376464844, + "loss": 0.1723, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12462379038333893, + "rewards/margins": 2.234661817550659, + "rewards/rejected": -2.1100382804870605, + "step": 317 + }, + { + "epoch": 5.389830508474576, + "grad_norm": 13.709606261421065, + "learning_rate": 4.575071959663288e-07, + "logits/chosen": 5.797114372253418, + "logits/rejected": 6.789894104003906, + "logps/chosen": -19.55613899230957, + "logps/rejected": -34.477935791015625, + "loss": 0.1851, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.051314182579517365, + "rewards/margins": 2.7669546604156494, + "rewards/rejected": -2.7156405448913574, + "step": 318 + }, + { + "epoch": 5.406779661016949, + "grad_norm": 14.989659075325534, + "learning_rate": 4.570938284659702e-07, + "logits/chosen": 7.181881904602051, + "logits/rejected": 7.72728967666626, + "logps/chosen": -16.837162017822266, + "logps/rejected": -26.756256103515625, + "loss": 0.2261, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11718781292438507, + "rewards/margins": 2.574601888656616, + "rewards/rejected": -2.457413911819458, + "step": 319 + }, + { + "epoch": 5.423728813559322, + "grad_norm": 14.352574380790168, + "learning_rate": 4.566786487179334e-07, + "logits/chosen": 4.944997310638428, + "logits/rejected": 6.036535739898682, + "logps/chosen": -17.46659278869629, + "logps/rejected": -25.725868225097656, + "loss": 0.2242, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4766841530799866, + "rewards/margins": 3.0206315517425537, + "rewards/rejected": -2.543947458267212, + "step": 320 + }, + { + "epoch": 5.440677966101695, + "grad_norm": 12.37098062441416, + "learning_rate": 4.5626166035539535e-07, + "logits/chosen": 7.607375144958496, + "logits/rejected": 9.128290176391602, + "logps/chosen": -14.674689292907715, + "logps/rejected": -27.710792541503906, + "loss": 0.1668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13436803221702576, + "rewards/margins": 2.9709839820861816, + "rewards/rejected": -2.836616039276123, + "step": 321 + }, + { + "epoch": 5.4576271186440675, + "grad_norm": 13.91317628066266, + "learning_rate": 4.5584286702736007e-07, + "logits/chosen": 4.578854084014893, + "logits/rejected": 5.086130142211914, + "logps/chosen": -14.649238586425781, + "logps/rejected": -25.154186248779297, + "loss": 0.1746, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16391971707344055, + "rewards/margins": 1.9267592430114746, + "rewards/rejected": -1.762839436531067, + "step": 322 + }, + { + "epoch": 5.47457627118644, + "grad_norm": 15.211982042512723, + "learning_rate": 4.5542227239862654e-07, + "logits/chosen": 3.5927846431732178, + "logits/rejected": 4.380569934844971, + "logps/chosen": -17.435205459594727, + "logps/rejected": -32.917701721191406, + "loss": 0.2114, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1431255340576172, + "rewards/margins": 3.1278703212738037, + "rewards/rejected": -3.270995616912842, + "step": 323 + }, + { + "epoch": 5.491525423728813, + "grad_norm": 14.815140645438243, + "learning_rate": 4.5499988014975635e-07, + "logits/chosen": 4.980439186096191, + "logits/rejected": 5.031620979309082, + "logps/chosen": -21.051170349121094, + "logps/rejected": -29.4047794342041, + "loss": 0.2156, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2153717428445816, + "rewards/margins": 2.1704635620117188, + "rewards/rejected": -1.9550918340682983, + "step": 324 + }, + { + "epoch": 5.508474576271187, + "grad_norm": 20.500615522038547, + "learning_rate": 4.545756939770422e-07, + "logits/chosen": 6.855619430541992, + "logits/rejected": 7.975409030914307, + "logps/chosen": -12.433855056762695, + "logps/rejected": -29.087566375732422, + "loss": 0.1834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16718144714832306, + "rewards/margins": 3.911283016204834, + "rewards/rejected": -3.7441017627716064, + "step": 325 + }, + { + "epoch": 5.52542372881356, + "grad_norm": 14.24067627416158, + "learning_rate": 4.54149717592475e-07, + "logits/chosen": 6.717032432556152, + "logits/rejected": 7.574321269989014, + "logps/chosen": -17.490558624267578, + "logps/rejected": -24.359642028808594, + "loss": 0.2031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06353382766246796, + "rewards/margins": 2.110564708709717, + "rewards/rejected": -2.174098491668701, + "step": 326 + }, + { + "epoch": 5.5423728813559325, + "grad_norm": 13.71169586115688, + "learning_rate": 4.537219547237114e-07, + "logits/chosen": 5.765188694000244, + "logits/rejected": 6.628974437713623, + "logps/chosen": -14.898784637451172, + "logps/rejected": -38.453922271728516, + "loss": 0.203, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0718056857585907, + "rewards/margins": 3.7458438873291016, + "rewards/rejected": -3.6740381717681885, + "step": 327 + }, + { + "epoch": 5.559322033898305, + "grad_norm": 13.228697365678313, + "learning_rate": 4.5329240911404167e-07, + "logits/chosen": 6.111119747161865, + "logits/rejected": 6.550088882446289, + "logps/chosen": -12.508288383483887, + "logps/rejected": -21.71690559387207, + "loss": 0.1939, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08406098186969757, + "rewards/margins": 2.3071329593658447, + "rewards/rejected": -2.223072052001953, + "step": 328 + }, + { + "epoch": 5.576271186440678, + "grad_norm": 14.841949798286183, + "learning_rate": 4.528610845223562e-07, + "logits/chosen": 3.667128324508667, + "logits/rejected": 4.614826679229736, + "logps/chosen": -17.83123779296875, + "logps/rejected": -38.682918548583984, + "loss": 0.189, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46043580770492554, + "rewards/margins": 3.5695717334747314, + "rewards/rejected": -3.109135866165161, + "step": 329 + }, + { + "epoch": 5.593220338983051, + "grad_norm": 13.699477662259353, + "learning_rate": 4.5242798472311306e-07, + "logits/chosen": 4.154301166534424, + "logits/rejected": 4.492623329162598, + "logps/chosen": -14.746369361877441, + "logps/rejected": -20.418010711669922, + "loss": 0.1711, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09611117839813232, + "rewards/margins": 1.9738531112670898, + "rewards/rejected": -1.877742052078247, + "step": 330 + }, + { + "epoch": 5.610169491525424, + "grad_norm": 12.374676658865045, + "learning_rate": 4.519931135063051e-07, + "logits/chosen": 4.899678707122803, + "logits/rejected": 5.104933738708496, + "logps/chosen": -16.690101623535156, + "logps/rejected": -30.977108001708984, + "loss": 0.17, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09213536977767944, + "rewards/margins": 3.308014392852783, + "rewards/rejected": -3.215879201889038, + "step": 331 + }, + { + "epoch": 5.627118644067797, + "grad_norm": 15.061400244189306, + "learning_rate": 4.515564746774265e-07, + "logits/chosen": 3.064007520675659, + "logits/rejected": 3.6515920162200928, + "logps/chosen": -15.63426399230957, + "logps/rejected": -24.327058792114258, + "loss": 0.2141, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3263694643974304, + "rewards/margins": 2.3982205390930176, + "rewards/rejected": -2.0718512535095215, + "step": 332 + }, + { + "epoch": 5.6440677966101696, + "grad_norm": 13.261396058716384, + "learning_rate": 4.5111807205743945e-07, + "logits/chosen": 3.667691946029663, + "logits/rejected": 5.319759845733643, + "logps/chosen": -18.902265548706055, + "logps/rejected": -36.717247009277344, + "loss": 0.1695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5261211395263672, + "rewards/margins": 3.4888546466827393, + "rewards/rejected": -2.962733745574951, + "step": 333 + }, + { + "epoch": 5.661016949152542, + "grad_norm": 14.645385511954315, + "learning_rate": 4.5067790948274085e-07, + "logits/chosen": 4.4554548263549805, + "logits/rejected": 5.304936408996582, + "logps/chosen": -15.04642391204834, + "logps/rejected": -23.59757423400879, + "loss": 0.191, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16090461611747742, + "rewards/margins": 2.001107692718506, + "rewards/rejected": -1.840202808380127, + "step": 334 + }, + { + "epoch": 5.677966101694915, + "grad_norm": 13.666596346564294, + "learning_rate": 4.5023599080512896e-07, + "logits/chosen": 5.841002941131592, + "logits/rejected": 6.072424411773682, + "logps/chosen": -19.874303817749023, + "logps/rejected": -26.235692977905273, + "loss": 0.1684, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2315959334373474, + "rewards/margins": 2.4640674591064453, + "rewards/rejected": -2.2324717044830322, + "step": 335 + }, + { + "epoch": 5.694915254237288, + "grad_norm": 13.787686175151595, + "learning_rate": 4.4979231989176905e-07, + "logits/chosen": 4.080766201019287, + "logits/rejected": 4.8679890632629395, + "logps/chosen": -13.054632186889648, + "logps/rejected": -22.956096649169922, + "loss": 0.1858, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.0869242399930954, + "rewards/margins": 2.300351858139038, + "rewards/rejected": -2.2134275436401367, + "step": 336 + }, + { + "epoch": 5.711864406779661, + "grad_norm": 13.551160822081176, + "learning_rate": 4.493469006251601e-07, + "logits/chosen": 5.657144069671631, + "logits/rejected": 7.342559814453125, + "logps/chosen": -17.066558837890625, + "logps/rejected": -30.203340530395508, + "loss": 0.1861, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.165096715092659, + "rewards/margins": 3.3928604125976562, + "rewards/rejected": -3.2277636528015137, + "step": 337 + }, + { + "epoch": 5.728813559322034, + "grad_norm": 14.419160323493225, + "learning_rate": 4.488997369031008e-07, + "logits/chosen": 4.785531997680664, + "logits/rejected": 4.794522762298584, + "logps/chosen": -14.324603080749512, + "logps/rejected": -24.154769897460938, + "loss": 0.1699, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2423357367515564, + "rewards/margins": 2.1883811950683594, + "rewards/rejected": -1.9460456371307373, + "step": 338 + }, + { + "epoch": 5.745762711864407, + "grad_norm": 11.664671797317196, + "learning_rate": 4.4845083263865514e-07, + "logits/chosen": 2.3409574031829834, + "logits/rejected": 3.2811951637268066, + "logps/chosen": -17.960046768188477, + "logps/rejected": -22.232858657836914, + "loss": 0.1891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36390525102615356, + "rewards/margins": 2.5179195404052734, + "rewards/rejected": -2.1540141105651855, + "step": 339 + }, + { + "epoch": 5.762711864406779, + "grad_norm": 13.733632789469274, + "learning_rate": 4.4800019176011847e-07, + "logits/chosen": 4.645487308502197, + "logits/rejected": 4.405810832977295, + "logps/chosen": -14.33531379699707, + "logps/rejected": -26.283172607421875, + "loss": 0.217, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1105184406042099, + "rewards/margins": 2.3442327976226807, + "rewards/rejected": -2.2337143421173096, + "step": 340 + }, + { + "epoch": 5.779661016949152, + "grad_norm": 13.029185639792166, + "learning_rate": 4.4754781821098286e-07, + "logits/chosen": 6.101343631744385, + "logits/rejected": 5.980262756347656, + "logps/chosen": -19.913057327270508, + "logps/rejected": -28.51534652709961, + "loss": 0.167, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.290585994720459, + "rewards/margins": 3.1626698970794678, + "rewards/rejected": -2.872083902359009, + "step": 341 + }, + { + "epoch": 5.796610169491525, + "grad_norm": 13.259351062753181, + "learning_rate": 4.470937159499028e-07, + "logits/chosen": 6.815540313720703, + "logits/rejected": 7.2496442794799805, + "logps/chosen": -12.720376968383789, + "logps/rejected": -24.852739334106445, + "loss": 0.1981, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.05745570361614227, + "rewards/margins": 2.748012065887451, + "rewards/rejected": -2.690556287765503, + "step": 342 + }, + { + "epoch": 5.813559322033898, + "grad_norm": 14.211807222272416, + "learning_rate": 4.4663788895066065e-07, + "logits/chosen": 5.461134433746338, + "logits/rejected": 5.95833158493042, + "logps/chosen": -16.62125015258789, + "logps/rejected": -25.2918701171875, + "loss": 0.1881, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31963789463043213, + "rewards/margins": 2.098417282104492, + "rewards/rejected": -1.7787795066833496, + "step": 343 + }, + { + "epoch": 5.830508474576272, + "grad_norm": 13.633790061419106, + "learning_rate": 4.4618034120213135e-07, + "logits/chosen": 5.231446266174316, + "logits/rejected": 6.182756423950195, + "logps/chosen": -16.792110443115234, + "logps/rejected": -36.05128479003906, + "loss": 0.1739, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2307071089744568, + "rewards/margins": 3.797922134399414, + "rewards/rejected": -3.5672149658203125, + "step": 344 + }, + { + "epoch": 5.847457627118644, + "grad_norm": 12.438805612920406, + "learning_rate": 4.4572107670824806e-07, + "logits/chosen": 2.8178811073303223, + "logits/rejected": 3.8991308212280273, + "logps/chosen": -13.256142616271973, + "logps/rejected": -28.83277702331543, + "loss": 0.1554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3214130103588104, + "rewards/margins": 3.2701797485351562, + "rewards/rejected": -2.9487667083740234, + "step": 345 + }, + { + "epoch": 5.864406779661017, + "grad_norm": 13.708565551444401, + "learning_rate": 4.45260099487967e-07, + "logits/chosen": 3.6462607383728027, + "logits/rejected": 3.852987289428711, + "logps/chosen": -24.624631881713867, + "logps/rejected": -24.646089553833008, + "loss": 0.209, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.16818492114543915, + "rewards/margins": 2.566938638687134, + "rewards/rejected": -2.3987538814544678, + "step": 346 + }, + { + "epoch": 5.88135593220339, + "grad_norm": 13.340203940885758, + "learning_rate": 4.4479741357523204e-07, + "logits/chosen": 6.4154052734375, + "logits/rejected": 7.19559907913208, + "logps/chosen": -16.736295700073242, + "logps/rejected": -28.568880081176758, + "loss": 0.174, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3518092632293701, + "rewards/margins": 2.901329517364502, + "rewards/rejected": -2.549520254135132, + "step": 347 + }, + { + "epoch": 5.898305084745763, + "grad_norm": 14.529901903868208, + "learning_rate": 4.4433302301893983e-07, + "logits/chosen": 3.5100386142730713, + "logits/rejected": 3.799931049346924, + "logps/chosen": -13.322935104370117, + "logps/rejected": -29.25261116027832, + "loss": 0.1959, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.017160028219223022, + "rewards/margins": 2.818272829055786, + "rewards/rejected": -2.835433006286621, + "step": 348 + }, + { + "epoch": 5.915254237288136, + "grad_norm": 14.146444228683375, + "learning_rate": 4.438669318829037e-07, + "logits/chosen": 0.44511693716049194, + "logits/rejected": 1.87983238697052, + "logps/chosen": -17.3454532623291, + "logps/rejected": -27.43291473388672, + "loss": 0.1855, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7081802487373352, + "rewards/margins": 2.4732515811920166, + "rewards/rejected": -1.7650713920593262, + "step": 349 + }, + { + "epoch": 5.932203389830509, + "grad_norm": 13.385150325169393, + "learning_rate": 4.433991442458188e-07, + "logits/chosen": 2.5039730072021484, + "logits/rejected": 4.11275577545166, + "logps/chosen": -23.479930877685547, + "logps/rejected": -25.535865783691406, + "loss": 0.1886, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.12501907348632812, + "rewards/margins": 2.16085147857666, + "rewards/rejected": -2.035832405090332, + "step": 350 + }, + { + "epoch": 5.9491525423728815, + "grad_norm": 13.909362766221472, + "learning_rate": 4.4292966420122613e-07, + "logits/chosen": 7.141190528869629, + "logits/rejected": 7.579991340637207, + "logps/chosen": -15.560004234313965, + "logps/rejected": -25.857866287231445, + "loss": 0.1725, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.055767402052879333, + "rewards/margins": 2.5317444801330566, + "rewards/rejected": -2.4759769439697266, + "step": 351 + }, + { + "epoch": 5.966101694915254, + "grad_norm": 14.588943173031627, + "learning_rate": 4.4245849585747655e-07, + "logits/chosen": 2.5870280265808105, + "logits/rejected": 2.972602367401123, + "logps/chosen": -17.201574325561523, + "logps/rejected": -27.69642448425293, + "loss": 0.2357, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09064903110265732, + "rewards/margins": 2.292398691177368, + "rewards/rejected": -2.383047580718994, + "step": 352 + }, + { + "epoch": 5.983050847457627, + "grad_norm": 12.836870682632282, + "learning_rate": 4.41985643337695e-07, + "logits/chosen": 7.1439924240112305, + "logits/rejected": 7.3029303550720215, + "logps/chosen": -16.506511688232422, + "logps/rejected": -31.438173294067383, + "loss": 0.1732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15994346141815186, + "rewards/margins": 3.0539016723632812, + "rewards/rejected": -2.89395809173584, + "step": 353 + }, + { + "epoch": 6.0, + "grad_norm": 13.14834477516658, + "learning_rate": 4.415111107797445e-07, + "logits/chosen": 5.239683628082275, + "logits/rejected": 6.064949035644531, + "logps/chosen": -11.112751960754395, + "logps/rejected": -22.786832809448242, + "loss": 0.168, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2163887917995453, + "rewards/margins": 3.0100321769714355, + "rewards/rejected": -2.7936432361602783, + "step": 354 + }, + { + "epoch": 6.016949152542373, + "grad_norm": 12.532478313650268, + "learning_rate": 4.410349023361897e-07, + "logits/chosen": 5.389923095703125, + "logits/rejected": 5.660534381866455, + "logps/chosen": -23.009815216064453, + "logps/rejected": -30.078847885131836, + "loss": 0.186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05637688934803009, + "rewards/margins": 2.8357455730438232, + "rewards/rejected": -2.892122507095337, + "step": 355 + }, + { + "epoch": 6.033898305084746, + "grad_norm": 10.813866763874628, + "learning_rate": 4.4055702217426085e-07, + "logits/chosen": 3.7662200927734375, + "logits/rejected": 4.253184795379639, + "logps/chosen": -14.30909538269043, + "logps/rejected": -25.599939346313477, + "loss": 0.1599, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2995351552963257, + "rewards/margins": 3.287034273147583, + "rewards/rejected": -2.987499475479126, + "step": 356 + }, + { + "epoch": 6.0508474576271185, + "grad_norm": 12.085541722736634, + "learning_rate": 4.40077474475817e-07, + "logits/chosen": 2.5573201179504395, + "logits/rejected": 3.847440004348755, + "logps/chosen": -18.95703887939453, + "logps/rejected": -28.729902267456055, + "loss": 0.1712, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.37842535972595215, + "rewards/margins": 2.1750941276550293, + "rewards/rejected": -1.7966687679290771, + "step": 357 + }, + { + "epoch": 6.067796610169491, + "grad_norm": 10.470231348613757, + "learning_rate": 4.395962634373096e-07, + "logits/chosen": 6.930731773376465, + "logits/rejected": 7.621533393859863, + "logps/chosen": -15.80143928527832, + "logps/rejected": -27.89528465270996, + "loss": 0.1435, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10261240601539612, + "rewards/margins": 2.8732903003692627, + "rewards/rejected": -2.975903034210205, + "step": 358 + }, + { + "epoch": 6.084745762711864, + "grad_norm": 10.943734781770678, + "learning_rate": 4.3911339326974584e-07, + "logits/chosen": 5.029911518096924, + "logits/rejected": 5.953924179077148, + "logps/chosen": -11.060075759887695, + "logps/rejected": -31.347972869873047, + "loss": 0.1391, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3969661295413971, + "rewards/margins": 4.131930351257324, + "rewards/rejected": -3.734964370727539, + "step": 359 + }, + { + "epoch": 6.101694915254237, + "grad_norm": 11.465145430024815, + "learning_rate": 4.386288681986516e-07, + "logits/chosen": 5.777263164520264, + "logits/rejected": 6.994723320007324, + "logps/chosen": -19.753942489624023, + "logps/rejected": -29.311805725097656, + "loss": 0.1415, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.22933685779571533, + "rewards/margins": 3.1403188705444336, + "rewards/rejected": -3.3696556091308594, + "step": 360 + }, + { + "epoch": 6.11864406779661, + "grad_norm": 11.328941727318716, + "learning_rate": 4.3814269246403456e-07, + "logits/chosen": 4.634091377258301, + "logits/rejected": 5.644282817840576, + "logps/chosen": -16.082172393798828, + "logps/rejected": -26.386442184448242, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08987744152545929, + "rewards/margins": 2.8858983516693115, + "rewards/rejected": -2.796020984649658, + "step": 361 + }, + { + "epoch": 6.135593220338983, + "grad_norm": 11.969751512905047, + "learning_rate": 4.3765487032034737e-07, + "logits/chosen": 1.9608010053634644, + "logits/rejected": 2.5253000259399414, + "logps/chosen": -21.299962997436523, + "logps/rejected": -33.22754669189453, + "loss": 0.1546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23107457160949707, + "rewards/margins": 3.7274532318115234, + "rewards/rejected": -3.4963784217834473, + "step": 362 + }, + { + "epoch": 6.1525423728813555, + "grad_norm": 11.229599928113995, + "learning_rate": 4.371654060364498e-07, + "logits/chosen": 3.036740779876709, + "logits/rejected": 3.398423194885254, + "logps/chosen": -14.125732421875, + "logps/rejected": -20.322280883789062, + "loss": 0.17, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22011126577854156, + "rewards/margins": 2.515415668487549, + "rewards/rejected": -2.295304536819458, + "step": 363 + }, + { + "epoch": 6.169491525423728, + "grad_norm": 11.749415025290373, + "learning_rate": 4.366743038955719e-07, + "logits/chosen": 3.778263568878174, + "logits/rejected": 4.308474540710449, + "logps/chosen": -18.887042999267578, + "logps/rejected": -26.429588317871094, + "loss": 0.1574, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3668030798435211, + "rewards/margins": 2.436537742614746, + "rewards/rejected": -2.069734811782837, + "step": 364 + }, + { + "epoch": 6.186440677966102, + "grad_norm": 11.80077626000249, + "learning_rate": 4.361815681952765e-07, + "logits/chosen": 1.918591856956482, + "logits/rejected": 2.6716151237487793, + "logps/chosen": -19.709033966064453, + "logps/rejected": -21.16021156311035, + "loss": 0.17, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25224724411964417, + "rewards/margins": 2.316915273666382, + "rewards/rejected": -2.0646679401397705, + "step": 365 + }, + { + "epoch": 6.203389830508475, + "grad_norm": 11.838957746477757, + "learning_rate": 4.3568720324742126e-07, + "logits/chosen": 5.8752264976501465, + "logits/rejected": 7.20805549621582, + "logps/chosen": -17.11652183532715, + "logps/rejected": -31.32016372680664, + "loss": 0.1459, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32180485129356384, + "rewards/margins": 2.999842405319214, + "rewards/rejected": -2.678037643432617, + "step": 366 + }, + { + "epoch": 6.220338983050848, + "grad_norm": 11.944090270848385, + "learning_rate": 4.351912133781212e-07, + "logits/chosen": 6.407975196838379, + "logits/rejected": 6.661712169647217, + "logps/chosen": -15.283435821533203, + "logps/rejected": -19.59105682373047, + "loss": 0.1742, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47592228651046753, + "rewards/margins": 1.68387770652771, + "rewards/rejected": -1.2079553604125977, + "step": 367 + }, + { + "epoch": 6.237288135593221, + "grad_norm": 11.440576248397297, + "learning_rate": 4.3469360292771096e-07, + "logits/chosen": 4.260448455810547, + "logits/rejected": 4.510221481323242, + "logps/chosen": -14.869780540466309, + "logps/rejected": -22.963233947753906, + "loss": 0.1535, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5458751320838928, + "rewards/margins": 2.5354907512664795, + "rewards/rejected": -1.989615559577942, + "step": 368 + }, + { + "epoch": 6.254237288135593, + "grad_norm": 13.011393149825132, + "learning_rate": 4.3419437625070634e-07, + "logits/chosen": 4.716248035430908, + "logits/rejected": 5.315852165222168, + "logps/chosen": -14.17831802368164, + "logps/rejected": -23.364871978759766, + "loss": 0.1877, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23576220870018005, + "rewards/margins": 2.1676623821258545, + "rewards/rejected": -1.9319000244140625, + "step": 369 + }, + { + "epoch": 6.271186440677966, + "grad_norm": 12.22060362782235, + "learning_rate": 4.336935377157668e-07, + "logits/chosen": 1.9044804573059082, + "logits/rejected": 2.334730386734009, + "logps/chosen": -16.843324661254883, + "logps/rejected": -28.972496032714844, + "loss": 0.1899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5382827520370483, + "rewards/margins": 3.484276294708252, + "rewards/rejected": -2.945993661880493, + "step": 370 + }, + { + "epoch": 6.288135593220339, + "grad_norm": 11.110250794754856, + "learning_rate": 4.3319109170565676e-07, + "logits/chosen": 4.34942102432251, + "logits/rejected": 5.465624809265137, + "logps/chosen": -12.080187797546387, + "logps/rejected": -29.275917053222656, + "loss": 0.1556, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.535807192325592, + "rewards/margins": 3.084014654159546, + "rewards/rejected": -2.5482072830200195, + "step": 371 + }, + { + "epoch": 6.305084745762712, + "grad_norm": 10.689860258387656, + "learning_rate": 4.3268704261720745e-07, + "logits/chosen": 2.993368625640869, + "logits/rejected": 3.6167068481445312, + "logps/chosen": -15.360583305358887, + "logps/rejected": -26.135984420776367, + "loss": 0.1471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6923851370811462, + "rewards/margins": 2.7075209617614746, + "rewards/rejected": -2.0151357650756836, + "step": 372 + }, + { + "epoch": 6.322033898305085, + "grad_norm": 11.813171861529911, + "learning_rate": 4.321813948612785e-07, + "logits/chosen": 5.465794563293457, + "logits/rejected": 5.721805572509766, + "logps/chosen": -17.526994705200195, + "logps/rejected": -23.695585250854492, + "loss": 0.1569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14771461486816406, + "rewards/margins": 2.829829692840576, + "rewards/rejected": -2.9775443077087402, + "step": 373 + }, + { + "epoch": 6.338983050847458, + "grad_norm": 10.966391063899888, + "learning_rate": 4.31674152862719e-07, + "logits/chosen": 2.381113290786743, + "logits/rejected": 2.6802804470062256, + "logps/chosen": -11.867440223693848, + "logps/rejected": -24.956613540649414, + "loss": 0.158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08301214873790741, + "rewards/margins": 2.746257781982422, + "rewards/rejected": -2.663245677947998, + "step": 374 + }, + { + "epoch": 6.3559322033898304, + "grad_norm": 11.645900837905495, + "learning_rate": 4.311653210603293e-07, + "logits/chosen": 3.390183448791504, + "logits/rejected": 5.339263439178467, + "logps/chosen": -22.072269439697266, + "logps/rejected": -30.078414916992188, + "loss": 0.1575, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2785249948501587, + "rewards/margins": 3.537834644317627, + "rewards/rejected": -3.2593092918395996, + "step": 375 + }, + { + "epoch": 6.372881355932203, + "grad_norm": 11.44172677194351, + "learning_rate": 4.306549039068218e-07, + "logits/chosen": 2.786032199859619, + "logits/rejected": 3.0518574714660645, + "logps/chosen": -17.41475486755371, + "logps/rejected": -22.37212371826172, + "loss": 0.14, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.38636505603790283, + "rewards/margins": 2.6983985900878906, + "rewards/rejected": -2.3120336532592773, + "step": 376 + }, + { + "epoch": 6.389830508474576, + "grad_norm": 13.337553165557853, + "learning_rate": 4.301429058687819e-07, + "logits/chosen": 1.9814975261688232, + "logits/rejected": 3.0942461490631104, + "logps/chosen": -15.964947700500488, + "logps/rejected": -35.37507247924805, + "loss": 0.1701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8118382692337036, + "rewards/margins": 4.838287830352783, + "rewards/rejected": -4.026450157165527, + "step": 377 + }, + { + "epoch": 6.406779661016949, + "grad_norm": 9.255376884922308, + "learning_rate": 4.296293314266294e-07, + "logits/chosen": 2.894780158996582, + "logits/rejected": 3.551926851272583, + "logps/chosen": -13.228153228759766, + "logps/rejected": -26.36261558532715, + "loss": 0.1186, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35001492500305176, + "rewards/margins": 3.228752851486206, + "rewards/rejected": -2.8787379264831543, + "step": 378 + }, + { + "epoch": 6.423728813559322, + "grad_norm": 12.86108986778706, + "learning_rate": 4.2911418507457876e-07, + "logits/chosen": 1.326431155204773, + "logits/rejected": 1.1022425889968872, + "logps/chosen": -18.70794105529785, + "logps/rejected": -25.21274185180664, + "loss": 0.1771, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.20193463563919067, + "rewards/margins": 2.162876844406128, + "rewards/rejected": -1.960942268371582, + "step": 379 + }, + { + "epoch": 6.440677966101695, + "grad_norm": 10.842665215954792, + "learning_rate": 4.285974713206e-07, + "logits/chosen": 4.0209760665893555, + "logits/rejected": 5.607435703277588, + "logps/chosen": -18.19493865966797, + "logps/rejected": -29.27100372314453, + "loss": 0.1646, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25257328152656555, + "rewards/margins": 3.3824639320373535, + "rewards/rejected": -3.1298906803131104, + "step": 380 + }, + { + "epoch": 6.4576271186440675, + "grad_norm": 11.725063198263442, + "learning_rate": 4.280791946863794e-07, + "logits/chosen": 1.2813572883605957, + "logits/rejected": 2.023667097091675, + "logps/chosen": -14.165773391723633, + "logps/rejected": -25.905025482177734, + "loss": 0.1548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.08021500706672668, + "rewards/margins": 2.9218506813049316, + "rewards/rejected": -2.8416357040405273, + "step": 381 + }, + { + "epoch": 6.47457627118644, + "grad_norm": 11.46721147012392, + "learning_rate": 4.275593597072795e-07, + "logits/chosen": 1.4821670055389404, + "logits/rejected": 1.7210092544555664, + "logps/chosen": -18.427885055541992, + "logps/rejected": -25.48371696472168, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41084200143814087, + "rewards/margins": 2.797346353530884, + "rewards/rejected": -2.3865044116973877, + "step": 382 + }, + { + "epoch": 6.491525423728813, + "grad_norm": 10.51651979591079, + "learning_rate": 4.270379709323001e-07, + "logits/chosen": 4.080705642700195, + "logits/rejected": 4.421995162963867, + "logps/chosen": -17.705881118774414, + "logps/rejected": -31.33135986328125, + "loss": 0.1394, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04687368869781494, + "rewards/margins": 3.1977744102478027, + "rewards/rejected": -3.2446484565734863, + "step": 383 + }, + { + "epoch": 6.508474576271187, + "grad_norm": 11.273261378268472, + "learning_rate": 4.265150329240376e-07, + "logits/chosen": 3.1031620502471924, + "logits/rejected": 4.636520862579346, + "logps/chosen": -14.841632843017578, + "logps/rejected": -26.25077247619629, + "loss": 0.1285, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06430919468402863, + "rewards/margins": 2.5861589908599854, + "rewards/rejected": -2.5218496322631836, + "step": 384 + }, + { + "epoch": 6.52542372881356, + "grad_norm": 11.992095342201655, + "learning_rate": 4.259905502586457e-07, + "logits/chosen": 2.4647104740142822, + "logits/rejected": 3.4929392337799072, + "logps/chosen": -15.373156547546387, + "logps/rejected": -26.317827224731445, + "loss": 0.1608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12471124529838562, + "rewards/margins": 2.660918951034546, + "rewards/rejected": -2.536207675933838, + "step": 385 + }, + { + "epoch": 6.5423728813559325, + "grad_norm": 12.945212906753929, + "learning_rate": 4.254645275257953e-07, + "logits/chosen": 3.468475580215454, + "logits/rejected": 4.383484840393066, + "logps/chosen": -13.625848770141602, + "logps/rejected": -28.03760528564453, + "loss": 0.1689, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24450263381004333, + "rewards/margins": 2.8928561210632324, + "rewards/rejected": -2.6483535766601562, + "step": 386 + }, + { + "epoch": 6.559322033898305, + "grad_norm": 11.089672955876997, + "learning_rate": 4.24936969328634e-07, + "logits/chosen": 4.420220375061035, + "logits/rejected": 5.376060485839844, + "logps/chosen": -11.676502227783203, + "logps/rejected": -24.155860900878906, + "loss": 0.152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08071908354759216, + "rewards/margins": 2.936311721801758, + "rewards/rejected": -2.855592727661133, + "step": 387 + }, + { + "epoch": 6.576271186440678, + "grad_norm": 13.695617001861233, + "learning_rate": 4.244078802837462e-07, + "logits/chosen": 5.1619062423706055, + "logits/rejected": 5.240504264831543, + "logps/chosen": -18.182863235473633, + "logps/rejected": -21.219486236572266, + "loss": 0.1946, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22650668025016785, + "rewards/margins": 2.3760101795196533, + "rewards/rejected": -2.149503469467163, + "step": 388 + }, + { + "epoch": 6.593220338983051, + "grad_norm": 11.788406533092438, + "learning_rate": 4.238772650211123e-07, + "logits/chosen": 1.1291437149047852, + "logits/rejected": 2.072202682495117, + "logps/chosen": -14.250567436218262, + "logps/rejected": -30.102697372436523, + "loss": 0.1543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.043179064989089966, + "rewards/margins": 3.3207321166992188, + "rewards/rejected": -3.277553081512451, + "step": 389 + }, + { + "epoch": 6.610169491525424, + "grad_norm": 11.274977736495742, + "learning_rate": 4.233451281840685e-07, + "logits/chosen": 3.7870407104492188, + "logits/rejected": 3.9119009971618652, + "logps/chosen": -15.645469665527344, + "logps/rejected": -24.250320434570312, + "loss": 0.1671, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23693856596946716, + "rewards/margins": 1.8370524644851685, + "rewards/rejected": -1.6001137495040894, + "step": 390 + }, + { + "epoch": 6.627118644067797, + "grad_norm": 11.026889873931545, + "learning_rate": 4.2281147442926636e-07, + "logits/chosen": 2.935783624649048, + "logits/rejected": 3.035520553588867, + "logps/chosen": -12.330340385437012, + "logps/rejected": -23.562206268310547, + "loss": 0.1625, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31259769201278687, + "rewards/margins": 2.575361490249634, + "rewards/rejected": -2.262763500213623, + "step": 391 + }, + { + "epoch": 6.6440677966101696, + "grad_norm": 11.464351312206663, + "learning_rate": 4.222763084266313e-07, + "logits/chosen": 2.7037510871887207, + "logits/rejected": 4.515506267547607, + "logps/chosen": -14.069086074829102, + "logps/rejected": -28.03109359741211, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20625919103622437, + "rewards/margins": 3.8207006454467773, + "rewards/rejected": -3.6144416332244873, + "step": 392 + }, + { + "epoch": 6.661016949152542, + "grad_norm": 9.640842179436438, + "learning_rate": 4.217396348593224e-07, + "logits/chosen": 3.874035120010376, + "logits/rejected": 4.602170944213867, + "logps/chosen": -22.469409942626953, + "logps/rejected": -33.009944915771484, + "loss": 0.1544, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03429122269153595, + "rewards/margins": 3.2556371688842773, + "rewards/rejected": -3.2899281978607178, + "step": 393 + }, + { + "epoch": 6.677966101694915, + "grad_norm": 13.652483617561197, + "learning_rate": 4.2120145842369137e-07, + "logits/chosen": 3.716986656188965, + "logits/rejected": 4.291690826416016, + "logps/chosen": -14.710618019104004, + "logps/rejected": -27.50786590576172, + "loss": 0.1801, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3418799936771393, + "rewards/margins": 3.645103931427002, + "rewards/rejected": -3.3032238483428955, + "step": 394 + }, + { + "epoch": 6.694915254237288, + "grad_norm": 11.48146565873232, + "learning_rate": 4.206617838292411e-07, + "logits/chosen": 4.914379119873047, + "logits/rejected": 5.654470443725586, + "logps/chosen": -14.613869667053223, + "logps/rejected": -29.499710083007812, + "loss": 0.1565, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.09016422927379608, + "rewards/margins": 3.4852559566497803, + "rewards/rejected": -3.575420379638672, + "step": 395 + }, + { + "epoch": 6.711864406779661, + "grad_norm": 9.939140025420576, + "learning_rate": 4.201206157985846e-07, + "logits/chosen": 5.169272422790527, + "logits/rejected": 6.066037654876709, + "logps/chosen": -13.552163124084473, + "logps/rejected": -25.132427215576172, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26199135184288025, + "rewards/margins": 3.1676011085510254, + "rewards/rejected": -2.905609607696533, + "step": 396 + }, + { + "epoch": 6.728813559322034, + "grad_norm": 11.987189290505833, + "learning_rate": 4.1957795906740403e-07, + "logits/chosen": 2.286560535430908, + "logits/rejected": 2.5394349098205566, + "logps/chosen": -12.481584548950195, + "logps/rejected": -23.126205444335938, + "loss": 0.1557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4645286798477173, + "rewards/margins": 2.306704044342041, + "rewards/rejected": -1.8421754837036133, + "step": 397 + }, + { + "epoch": 6.745762711864407, + "grad_norm": 10.168711244583603, + "learning_rate": 4.1903381838440853e-07, + "logits/chosen": 4.190589427947998, + "logits/rejected": 4.447041034698486, + "logps/chosen": -17.075122833251953, + "logps/rejected": -26.705982208251953, + "loss": 0.1375, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1718921661376953, + "rewards/margins": 2.3209805488586426, + "rewards/rejected": -2.1490883827209473, + "step": 398 + }, + { + "epoch": 6.762711864406779, + "grad_norm": 10.139455694407227, + "learning_rate": 4.1848819851129345e-07, + "logits/chosen": 2.6103100776672363, + "logits/rejected": 2.7593235969543457, + "logps/chosen": -23.044191360473633, + "logps/rejected": -33.22798156738281, + "loss": 0.1222, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31734511256217957, + "rewards/margins": 3.8872694969177246, + "rewards/rejected": -3.5699243545532227, + "step": 399 + }, + { + "epoch": 6.779661016949152, + "grad_norm": 11.787533074792071, + "learning_rate": 4.179411042226982e-07, + "logits/chosen": 3.197756052017212, + "logits/rejected": 3.3959622383117676, + "logps/chosen": -21.1182918548584, + "logps/rejected": -29.089521408081055, + "loss": 0.1524, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14221636950969696, + "rewards/margins": 3.5914371013641357, + "rewards/rejected": -3.7336530685424805, + "step": 400 + }, + { + "epoch": 6.796610169491525, + "grad_norm": 10.509355548093078, + "learning_rate": 4.173925403061644e-07, + "logits/chosen": 0.36680668592453003, + "logits/rejected": 1.1943421363830566, + "logps/chosen": -21.096643447875977, + "logps/rejected": -44.89948272705078, + "loss": 0.1227, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27233120799064636, + "rewards/margins": 4.326108455657959, + "rewards/rejected": -4.05377721786499, + "step": 401 + }, + { + "epoch": 6.813559322033898, + "grad_norm": 11.105525805087115, + "learning_rate": 4.1684251156209437e-07, + "logits/chosen": 3.423051118850708, + "logits/rejected": 4.709883689880371, + "logps/chosen": -12.655998229980469, + "logps/rejected": -32.92218780517578, + "loss": 0.1419, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6730118989944458, + "rewards/margins": 4.144976615905762, + "rewards/rejected": -3.4719643592834473, + "step": 402 + }, + { + "epoch": 6.830508474576272, + "grad_norm": 11.881691689692572, + "learning_rate": 4.16291022803709e-07, + "logits/chosen": 3.5171265602111816, + "logits/rejected": 3.0639753341674805, + "logps/chosen": -18.158029556274414, + "logps/rejected": -22.001617431640625, + "loss": 0.1534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4230116307735443, + "rewards/margins": 2.6759896278381348, + "rewards/rejected": -2.2529778480529785, + "step": 403 + }, + { + "epoch": 6.847457627118644, + "grad_norm": 11.334720471938402, + "learning_rate": 4.1573807885700523e-07, + "logits/chosen": 3.2304704189300537, + "logits/rejected": 3.686030387878418, + "logps/chosen": -17.79078483581543, + "logps/rejected": -36.99302291870117, + "loss": 0.1354, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3478419780731201, + "rewards/margins": 4.222134113311768, + "rewards/rejected": -3.8742923736572266, + "step": 404 + }, + { + "epoch": 6.864406779661017, + "grad_norm": 10.228794064023427, + "learning_rate": 4.151836845607144e-07, + "logits/chosen": 2.565765142440796, + "logits/rejected": 3.0117526054382324, + "logps/chosen": -19.339305877685547, + "logps/rejected": -25.283498764038086, + "loss": 0.1395, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.0219439268112183, + "rewards/margins": 3.060976266860962, + "rewards/rejected": -2.0390326976776123, + "step": 405 + }, + { + "epoch": 6.88135593220339, + "grad_norm": 12.09741072980745, + "learning_rate": 4.146278447662597e-07, + "logits/chosen": 6.464285850524902, + "logits/rejected": 6.6425957679748535, + "logps/chosen": -12.595333099365234, + "logps/rejected": -25.08333396911621, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24255235493183136, + "rewards/margins": 2.9764745235443115, + "rewards/rejected": -2.733922004699707, + "step": 406 + }, + { + "epoch": 6.898305084745763, + "grad_norm": 9.900890386099787, + "learning_rate": 4.1407056433771324e-07, + "logits/chosen": 5.483569145202637, + "logits/rejected": 6.747907638549805, + "logps/chosen": -16.0140438079834, + "logps/rejected": -32.348541259765625, + "loss": 0.1148, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2642427384853363, + "rewards/margins": 3.627537727355957, + "rewards/rejected": -3.891780376434326, + "step": 407 + }, + { + "epoch": 6.915254237288136, + "grad_norm": 18.674013367884058, + "learning_rate": 4.1351184815175456e-07, + "logits/chosen": 2.835052251815796, + "logits/rejected": 4.391816139221191, + "logps/chosen": -20.78997230529785, + "logps/rejected": -29.96456527709961, + "loss": 0.1488, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27503877878189087, + "rewards/margins": 2.9371590614318848, + "rewards/rejected": -2.6621201038360596, + "step": 408 + }, + { + "epoch": 6.932203389830509, + "grad_norm": 11.088887852350569, + "learning_rate": 4.1295170109762677e-07, + "logits/chosen": 2.4660263061523438, + "logits/rejected": 2.9183759689331055, + "logps/chosen": -17.321752548217773, + "logps/rejected": -28.743206024169922, + "loss": 0.1468, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0913095474243164, + "rewards/margins": 3.131824016571045, + "rewards/rejected": -3.0405147075653076, + "step": 409 + }, + { + "epoch": 6.9491525423728815, + "grad_norm": 10.620335803144801, + "learning_rate": 4.1239012807709444e-07, + "logits/chosen": 0.7266221046447754, + "logits/rejected": 1.808227777481079, + "logps/chosen": -16.597368240356445, + "logps/rejected": -35.07196044921875, + "loss": 0.1351, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12688055634498596, + "rewards/margins": 3.9789581298828125, + "rewards/rejected": -4.105838775634766, + "step": 410 + }, + { + "epoch": 6.966101694915254, + "grad_norm": 10.627874755495654, + "learning_rate": 4.1182713400440074e-07, + "logits/chosen": 3.519763708114624, + "logits/rejected": 4.384012699127197, + "logps/chosen": -21.77133560180664, + "logps/rejected": -29.07388687133789, + "loss": 0.1378, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16306172311306, + "rewards/margins": 3.332913875579834, + "rewards/rejected": -3.1698520183563232, + "step": 411 + }, + { + "epoch": 6.983050847457627, + "grad_norm": 10.90365370777316, + "learning_rate": 4.112627238062238e-07, + "logits/chosen": 3.6548125743865967, + "logits/rejected": 4.417596340179443, + "logps/chosen": -13.329391479492188, + "logps/rejected": -23.57076644897461, + "loss": 0.1444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.530329704284668, + "rewards/margins": 2.5617852210998535, + "rewards/rejected": -2.0314555168151855, + "step": 412 + }, + { + "epoch": 7.0, + "grad_norm": 13.2953949469969, + "learning_rate": 4.106969024216348e-07, + "logits/chosen": 3.9974312782287598, + "logits/rejected": 4.03011417388916, + "logps/chosen": -15.118673324584961, + "logps/rejected": -29.512248992919922, + "loss": 0.1894, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01425081491470337, + "rewards/margins": 3.2692325115203857, + "rewards/rejected": -3.2834835052490234, + "step": 413 + }, + { + "epoch": 7.016949152542373, + "grad_norm": 9.87165410115901, + "learning_rate": 4.101296748020533e-07, + "logits/chosen": -0.07437923550605774, + "logits/rejected": 0.5111943483352661, + "logps/chosen": -14.69372844696045, + "logps/rejected": -25.89691162109375, + "loss": 0.1389, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21298867464065552, + "rewards/margins": 3.466539144515991, + "rewards/rejected": -3.2535505294799805, + "step": 414 + }, + { + "epoch": 7.033898305084746, + "grad_norm": 10.23895017926183, + "learning_rate": 4.09561045911205e-07, + "logits/chosen": 5.7406206130981445, + "logits/rejected": 6.087039947509766, + "logps/chosen": -16.156099319458008, + "logps/rejected": -25.66880226135254, + "loss": 0.0953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3853185176849365, + "rewards/margins": 3.1551337242126465, + "rewards/rejected": -2.769815444946289, + "step": 415 + }, + { + "epoch": 7.0508474576271185, + "grad_norm": 8.915034579880647, + "learning_rate": 4.0899102072507773e-07, + "logits/chosen": 3.7955563068389893, + "logits/rejected": 4.054969310760498, + "logps/chosen": -14.184603691101074, + "logps/rejected": -23.060760498046875, + "loss": 0.1179, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09845928102731705, + "rewards/margins": 2.538086414337158, + "rewards/rejected": -2.439626932144165, + "step": 416 + }, + { + "epoch": 7.067796610169491, + "grad_norm": 9.239050917639268, + "learning_rate": 4.084196042318783e-07, + "logits/chosen": 0.5394778847694397, + "logits/rejected": 0.6815662384033203, + "logps/chosen": -18.747802734375, + "logps/rejected": -30.199262619018555, + "loss": 0.1287, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.28906530141830444, + "rewards/margins": 2.6135823726654053, + "rewards/rejected": -2.324517011642456, + "step": 417 + }, + { + "epoch": 7.084745762711864, + "grad_norm": 9.872275462051524, + "learning_rate": 4.0784680143198837e-07, + "logits/chosen": 4.96520471572876, + "logits/rejected": 6.231711387634277, + "logps/chosen": -13.00781536102295, + "logps/rejected": -29.61225128173828, + "loss": 0.1289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14047710597515106, + "rewards/margins": 4.104274749755859, + "rewards/rejected": -3.9637980461120605, + "step": 418 + }, + { + "epoch": 7.101694915254237, + "grad_norm": 8.491591923304918, + "learning_rate": 4.0727261733792124e-07, + "logits/chosen": 4.542697429656982, + "logits/rejected": 4.926889419555664, + "logps/chosen": -15.30163288116455, + "logps/rejected": -26.00771713256836, + "loss": 0.1146, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32629522681236267, + "rewards/margins": 2.9046356678009033, + "rewards/rejected": -2.5783402919769287, + "step": 419 + }, + { + "epoch": 7.11864406779661, + "grad_norm": 14.655705370533742, + "learning_rate": 4.0669705697427754e-07, + "logits/chosen": 0.6694058179855347, + "logits/rejected": 1.186065912246704, + "logps/chosen": -21.470455169677734, + "logps/rejected": -32.79707717895508, + "loss": 0.1201, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5823428630828857, + "rewards/margins": 3.7213852405548096, + "rewards/rejected": -3.1390419006347656, + "step": 420 + }, + { + "epoch": 7.135593220338983, + "grad_norm": 9.811884566086711, + "learning_rate": 4.061201253777015e-07, + "logits/chosen": 3.503796100616455, + "logits/rejected": 3.606149435043335, + "logps/chosen": -18.951021194458008, + "logps/rejected": -23.3331241607666, + "loss": 0.1272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27215754985809326, + "rewards/margins": 2.7002599239349365, + "rewards/rejected": -2.428102493286133, + "step": 421 + }, + { + "epoch": 7.1525423728813555, + "grad_norm": 8.9341598495611, + "learning_rate": 4.0554182759683675e-07, + "logits/chosen": 2.680142641067505, + "logits/rejected": 3.6626601219177246, + "logps/chosen": -10.236148834228516, + "logps/rejected": -23.045013427734375, + "loss": 0.1144, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.27169984579086304, + "rewards/margins": 2.928940773010254, + "rewards/rejected": -2.657240867614746, + "step": 422 + }, + { + "epoch": 7.169491525423728, + "grad_norm": 11.341699653332553, + "learning_rate": 4.049621686922823e-07, + "logits/chosen": 4.442047119140625, + "logits/rejected": 4.583933353424072, + "logps/chosen": -20.456544876098633, + "logps/rejected": -30.686235427856445, + "loss": 0.1428, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31697165966033936, + "rewards/margins": 2.6978225708007812, + "rewards/rejected": -3.01479434967041, + "step": 423 + }, + { + "epoch": 7.186440677966102, + "grad_norm": 9.639665090084435, + "learning_rate": 4.0438115373654795e-07, + "logits/chosen": 1.4850196838378906, + "logits/rejected": 2.471409559249878, + "logps/chosen": -19.277435302734375, + "logps/rejected": -31.83528709411621, + "loss": 0.1367, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31030380725860596, + "rewards/margins": 3.9561548233032227, + "rewards/rejected": -3.6458511352539062, + "step": 424 + }, + { + "epoch": 7.203389830508475, + "grad_norm": 10.000078800507277, + "learning_rate": 4.0379878781401046e-07, + "logits/chosen": 0.7568904757499695, + "logits/rejected": 1.3338820934295654, + "logps/chosen": -15.746091842651367, + "logps/rejected": -29.87813949584961, + "loss": 0.1318, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3971250057220459, + "rewards/margins": 3.3505163192749023, + "rewards/rejected": -2.9533913135528564, + "step": 425 + }, + { + "epoch": 7.220338983050848, + "grad_norm": 10.381904195353888, + "learning_rate": 4.0321507602086836e-07, + "logits/chosen": -0.01644599437713623, + "logits/rejected": 1.1252076625823975, + "logps/chosen": -17.216590881347656, + "logps/rejected": -30.554401397705078, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3007734417915344, + "rewards/margins": 3.6961429119110107, + "rewards/rejected": -3.395369529724121, + "step": 426 + }, + { + "epoch": 7.237288135593221, + "grad_norm": 9.150425635235424, + "learning_rate": 4.026300234650979e-07, + "logits/chosen": 2.1568071842193604, + "logits/rejected": 2.3625097274780273, + "logps/chosen": -19.713909149169922, + "logps/rejected": -32.234867095947266, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5371749401092529, + "rewards/margins": 3.1868789196014404, + "rewards/rejected": -2.6497037410736084, + "step": 427 + }, + { + "epoch": 7.254237288135593, + "grad_norm": 8.714918171505428, + "learning_rate": 4.020436352664079e-07, + "logits/chosen": 1.7266515493392944, + "logits/rejected": 2.308364152908325, + "logps/chosen": -15.875692367553711, + "logps/rejected": -25.839502334594727, + "loss": 0.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.39192503690719604, + "rewards/margins": 3.4913225173950195, + "rewards/rejected": -3.099397659301758, + "step": 428 + }, + { + "epoch": 7.271186440677966, + "grad_norm": 9.08065333589319, + "learning_rate": 4.014559165561956e-07, + "logits/chosen": 3.0672965049743652, + "logits/rejected": 4.741214752197266, + "logps/chosen": -16.432716369628906, + "logps/rejected": -32.22150421142578, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.271750807762146, + "rewards/margins": 5.287177562713623, + "rewards/rejected": -5.0154266357421875, + "step": 429 + }, + { + "epoch": 7.288135593220339, + "grad_norm": 8.930294809336747, + "learning_rate": 4.0086687247750095e-07, + "logits/chosen": 3.319852113723755, + "logits/rejected": 3.4241392612457275, + "logps/chosen": -14.741172790527344, + "logps/rejected": -23.185720443725586, + "loss": 0.1161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5052129030227661, + "rewards/margins": 2.2517757415771484, + "rewards/rejected": -1.7465627193450928, + "step": 430 + }, + { + "epoch": 7.305084745762712, + "grad_norm": 9.14004965474861, + "learning_rate": 4.0027650818496226e-07, + "logits/chosen": 4.515099048614502, + "logits/rejected": 5.159193515777588, + "logps/chosen": -15.581079483032227, + "logps/rejected": -35.971229553222656, + "loss": 0.1288, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.013070344924926758, + "rewards/margins": 4.25930643081665, + "rewards/rejected": -4.272377014160156, + "step": 431 + }, + { + "epoch": 7.322033898305085, + "grad_norm": 9.813549653689446, + "learning_rate": 3.996848288447707e-07, + "logits/chosen": 2.0078516006469727, + "logits/rejected": 2.6934330463409424, + "logps/chosen": -12.148004531860352, + "logps/rejected": -26.57069206237793, + "loss": 0.1269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11509425938129425, + "rewards/margins": 3.352147102355957, + "rewards/rejected": -3.2370529174804688, + "step": 432 + }, + { + "epoch": 7.338983050847458, + "grad_norm": 8.984701012628799, + "learning_rate": 3.9909183963462536e-07, + "logits/chosen": 0.2953256368637085, + "logits/rejected": 2.0255162715911865, + "logps/chosen": -22.085187911987305, + "logps/rejected": -33.014617919921875, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4210081696510315, + "rewards/margins": 3.541736364364624, + "rewards/rejected": -3.120728015899658, + "step": 433 + }, + { + "epoch": 7.3559322033898304, + "grad_norm": 10.288893478868536, + "learning_rate": 3.984975457436876e-07, + "logits/chosen": 3.767810821533203, + "logits/rejected": 4.347950458526611, + "logps/chosen": -13.075026512145996, + "logps/rejected": -26.276962280273438, + "loss": 0.1332, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17277248203754425, + "rewards/margins": 3.197721481323242, + "rewards/rejected": -3.024949073791504, + "step": 434 + }, + { + "epoch": 7.372881355932203, + "grad_norm": 9.51515469393944, + "learning_rate": 3.979019523725361e-07, + "logits/chosen": 4.625918388366699, + "logits/rejected": 4.527889728546143, + "logps/chosen": -18.1661376953125, + "logps/rejected": -20.289915084838867, + "loss": 0.1262, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22275793552398682, + "rewards/margins": 2.77077054977417, + "rewards/rejected": -2.5480129718780518, + "step": 435 + }, + { + "epoch": 7.389830508474576, + "grad_norm": 10.055043599285973, + "learning_rate": 3.973050647331209e-07, + "logits/chosen": 3.001349925994873, + "logits/rejected": 3.237732410430908, + "logps/chosen": -19.620223999023438, + "logps/rejected": -32.204917907714844, + "loss": 0.1159, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4234805703163147, + "rewards/margins": 3.593975782394409, + "rewards/rejected": -3.17049503326416, + "step": 436 + }, + { + "epoch": 7.406779661016949, + "grad_norm": 9.794022274217578, + "learning_rate": 3.967068880487181e-07, + "logits/chosen": 2.001206159591675, + "logits/rejected": 2.360400915145874, + "logps/chosen": -16.585494995117188, + "logps/rejected": -31.013713836669922, + "loss": 0.1444, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5124250054359436, + "rewards/margins": 4.170086860656738, + "rewards/rejected": -3.6576623916625977, + "step": 437 + }, + { + "epoch": 7.423728813559322, + "grad_norm": 11.01890506925563, + "learning_rate": 3.9610742755388406e-07, + "logits/chosen": 4.098840713500977, + "logits/rejected": 4.901972770690918, + "logps/chosen": -14.838088989257812, + "logps/rejected": -20.27056884765625, + "loss": 0.1509, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4280650019645691, + "rewards/margins": 2.6378211975097656, + "rewards/rejected": -2.209756374359131, + "step": 438 + }, + { + "epoch": 7.440677966101695, + "grad_norm": 9.451372970532205, + "learning_rate": 3.955066884944094e-07, + "logits/chosen": 0.9893157482147217, + "logits/rejected": 1.3610438108444214, + "logps/chosen": -20.96686363220215, + "logps/rejected": -29.387542724609375, + "loss": 0.1093, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38434547185897827, + "rewards/margins": 3.0744457244873047, + "rewards/rejected": -2.6900997161865234, + "step": 439 + }, + { + "epoch": 7.4576271186440675, + "grad_norm": 8.812937276501128, + "learning_rate": 3.949046761272735e-07, + "logits/chosen": 4.211581230163574, + "logits/rejected": 4.303144454956055, + "logps/chosen": -11.633298873901367, + "logps/rejected": -22.921463012695312, + "loss": 0.1111, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6645989418029785, + "rewards/margins": 2.8330225944519043, + "rewards/rejected": -2.168423891067505, + "step": 440 + }, + { + "epoch": 7.47457627118644, + "grad_norm": 8.39612269762287, + "learning_rate": 3.9430139572059815e-07, + "logits/chosen": 3.9839651584625244, + "logits/rejected": 4.943660736083984, + "logps/chosen": -20.955799102783203, + "logps/rejected": -38.92570495605469, + "loss": 0.1154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20799720287322998, + "rewards/margins": 4.353493690490723, + "rewards/rejected": -4.145496845245361, + "step": 441 + }, + { + "epoch": 7.491525423728813, + "grad_norm": 9.062673269123717, + "learning_rate": 3.9369685255360173e-07, + "logits/chosen": 2.831712007522583, + "logits/rejected": 3.3848414421081543, + "logps/chosen": -17.006778717041016, + "logps/rejected": -27.789405822753906, + "loss": 0.0997, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08365049958229065, + "rewards/margins": 3.540269374847412, + "rewards/rejected": -3.4566190242767334, + "step": 442 + }, + { + "epoch": 7.508474576271187, + "grad_norm": 10.944641060023693, + "learning_rate": 3.9309105191655247e-07, + "logits/chosen": 1.4619724750518799, + "logits/rejected": 1.7838622331619263, + "logps/chosen": -14.890405654907227, + "logps/rejected": -29.264156341552734, + "loss": 0.1429, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27657827734947205, + "rewards/margins": 3.4900012016296387, + "rewards/rejected": -3.2134225368499756, + "step": 443 + }, + { + "epoch": 7.52542372881356, + "grad_norm": 8.665383852874081, + "learning_rate": 3.924839991107229e-07, + "logits/chosen": 2.904837131500244, + "logits/rejected": 3.712893486022949, + "logps/chosen": -20.52821922302246, + "logps/rejected": -42.29707717895508, + "loss": 0.1018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07331550121307373, + "rewards/margins": 4.935427665710449, + "rewards/rejected": -4.862112045288086, + "step": 444 + }, + { + "epoch": 7.5423728813559325, + "grad_norm": 9.29215580691363, + "learning_rate": 3.918756994483429e-07, + "logits/chosen": 5.155490398406982, + "logits/rejected": 6.437995910644531, + "logps/chosen": -12.761405944824219, + "logps/rejected": -30.25531005859375, + "loss": 0.1124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3791295289993286, + "rewards/margins": 3.675171136856079, + "rewards/rejected": -3.296041488647461, + "step": 445 + }, + { + "epoch": 7.559322033898305, + "grad_norm": 8.788116788401233, + "learning_rate": 3.912661582525536e-07, + "logits/chosen": 2.623152256011963, + "logits/rejected": 3.0128703117370605, + "logps/chosen": -18.31006622314453, + "logps/rejected": -27.341602325439453, + "loss": 0.126, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6639620065689087, + "rewards/margins": 2.841630697250366, + "rewards/rejected": -2.177668571472168, + "step": 446 + }, + { + "epoch": 7.576271186440678, + "grad_norm": 8.59243714374398, + "learning_rate": 3.906553808573604e-07, + "logits/chosen": 2.9111697673797607, + "logits/rejected": 3.34836745262146, + "logps/chosen": -16.26513671875, + "logps/rejected": -26.296367645263672, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3604896068572998, + "rewards/margins": 3.5548095703125, + "rewards/rejected": -3.1943202018737793, + "step": 447 + }, + { + "epoch": 7.593220338983051, + "grad_norm": 9.388091105308293, + "learning_rate": 3.9004337260758644e-07, + "logits/chosen": 3.1576616764068604, + "logits/rejected": 3.7552833557128906, + "logps/chosen": -15.407849311828613, + "logps/rejected": -30.661590576171875, + "loss": 0.1299, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30992385745048523, + "rewards/margins": 3.318376064300537, + "rewards/rejected": -3.0084524154663086, + "step": 448 + }, + { + "epoch": 7.610169491525424, + "grad_norm": 11.069966787411271, + "learning_rate": 3.894301388588264e-07, + "logits/chosen": 1.9091204404830933, + "logits/rejected": 2.0430729389190674, + "logps/chosen": -19.71815299987793, + "logps/rejected": -27.437362670898438, + "loss": 0.1598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05926653742790222, + "rewards/margins": 3.001842498779297, + "rewards/rejected": -2.942575693130493, + "step": 449 + }, + { + "epoch": 7.627118644067797, + "grad_norm": 8.54855711232238, + "learning_rate": 3.888156849773985e-07, + "logits/chosen": 3.887667179107666, + "logits/rejected": 5.632800102233887, + "logps/chosen": -16.554094314575195, + "logps/rejected": -27.82715606689453, + "loss": 0.1129, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0867987871170044, + "rewards/margins": 3.4089887142181396, + "rewards/rejected": -3.4957876205444336, + "step": 450 + }, + { + "epoch": 7.6440677966101696, + "grad_norm": 9.519539340294198, + "learning_rate": 3.882000163402983e-07, + "logits/chosen": 5.304140567779541, + "logits/rejected": 5.604523658752441, + "logps/chosen": -20.477975845336914, + "logps/rejected": -31.88489532470703, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03972455859184265, + "rewards/margins": 3.0314011573791504, + "rewards/rejected": -3.0711255073547363, + "step": 451 + }, + { + "epoch": 7.661016949152542, + "grad_norm": 10.369425267842848, + "learning_rate": 3.8758313833515186e-07, + "logits/chosen": 3.0293540954589844, + "logits/rejected": 3.7426505088806152, + "logps/chosen": -16.383888244628906, + "logps/rejected": -28.657669067382812, + "loss": 0.1347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2297368347644806, + "rewards/margins": 4.051611423492432, + "rewards/rejected": -4.28134822845459, + "step": 452 + }, + { + "epoch": 7.677966101694915, + "grad_norm": 10.511612229614789, + "learning_rate": 3.86965056360168e-07, + "logits/chosen": 1.9586790800094604, + "logits/rejected": 2.216707229614258, + "logps/chosen": -14.328598022460938, + "logps/rejected": -25.2967586517334, + "loss": 0.1355, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4405815005302429, + "rewards/margins": 2.9190452098846436, + "rewards/rejected": -2.478464126586914, + "step": 453 + }, + { + "epoch": 7.694915254237288, + "grad_norm": 10.275712619690644, + "learning_rate": 3.8634577582409115e-07, + "logits/chosen": 3.4585013389587402, + "logits/rejected": 3.590578317642212, + "logps/chosen": -8.868934631347656, + "logps/rejected": -29.173030853271484, + "loss": 0.1482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42794957756996155, + "rewards/margins": 3.5148518085479736, + "rewards/rejected": -3.086902141571045, + "step": 454 + }, + { + "epoch": 7.711864406779661, + "grad_norm": 8.535758374035677, + "learning_rate": 3.857253021461545e-07, + "logits/chosen": 1.1870063543319702, + "logits/rejected": 1.9431824684143066, + "logps/chosen": -17.088212966918945, + "logps/rejected": -25.219560623168945, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2807660698890686, + "rewards/margins": 2.8102126121520996, + "rewards/rejected": -2.529446601867676, + "step": 455 + }, + { + "epoch": 7.728813559322034, + "grad_norm": 8.239820038925641, + "learning_rate": 3.8510364075603185e-07, + "logits/chosen": 2.6802639961242676, + "logits/rejected": 3.867419958114624, + "logps/chosen": -15.612676620483398, + "logps/rejected": -36.14496612548828, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.16128285229206085, + "rewards/margins": 5.361781120300293, + "rewards/rejected": -5.523064613342285, + "step": 456 + }, + { + "epoch": 7.745762711864407, + "grad_norm": 9.780405360243819, + "learning_rate": 3.84480797093791e-07, + "logits/chosen": 2.6172919273376465, + "logits/rejected": 3.19209623336792, + "logps/chosen": -11.52426528930664, + "logps/rejected": -22.12085723876953, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30407530069351196, + "rewards/margins": 3.035153865814209, + "rewards/rejected": -2.731078624725342, + "step": 457 + }, + { + "epoch": 7.762711864406779, + "grad_norm": 8.55870058153661, + "learning_rate": 3.8385677660984514e-07, + "logits/chosen": 4.2267279624938965, + "logits/rejected": 5.226436614990234, + "logps/chosen": -16.732118606567383, + "logps/rejected": -35.05083465576172, + "loss": 0.1035, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3358514904975891, + "rewards/margins": 4.643409252166748, + "rewards/rejected": -4.307557582855225, + "step": 458 + }, + { + "epoch": 7.779661016949152, + "grad_norm": 9.816530221791213, + "learning_rate": 3.83231584764906e-07, + "logits/chosen": -1.649599313735962, + "logits/rejected": 0.7722344398498535, + "logps/chosen": -19.57205581665039, + "logps/rejected": -30.491806030273438, + "loss": 0.1411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12148743867874146, + "rewards/margins": 3.641042470932007, + "rewards/rejected": -3.51955509185791, + "step": 459 + }, + { + "epoch": 7.796610169491525, + "grad_norm": 9.580671233027974, + "learning_rate": 3.826052270299356e-07, + "logits/chosen": 3.5561270713806152, + "logits/rejected": 3.568795680999756, + "logps/chosen": -17.304832458496094, + "logps/rejected": -27.24372673034668, + "loss": 0.123, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1926884800195694, + "rewards/margins": 3.580801486968994, + "rewards/rejected": -3.388113260269165, + "step": 460 + }, + { + "epoch": 7.813559322033898, + "grad_norm": 9.146891343129145, + "learning_rate": 3.8197770888609846e-07, + "logits/chosen": 2.516333818435669, + "logits/rejected": 3.34853458404541, + "logps/chosen": -15.37035083770752, + "logps/rejected": -26.412580490112305, + "loss": 0.1266, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.251945436000824, + "rewards/margins": 3.64552903175354, + "rewards/rejected": -3.3935835361480713, + "step": 461 + }, + { + "epoch": 7.830508474576272, + "grad_norm": 9.710956700917844, + "learning_rate": 3.813490358247137e-07, + "logits/chosen": 1.6899590492248535, + "logits/rejected": 2.5557150840759277, + "logps/chosen": -15.23322582244873, + "logps/rejected": -33.32341003417969, + "loss": 0.1115, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.556183934211731, + "rewards/margins": 3.5709474086761475, + "rewards/rejected": -3.014763355255127, + "step": 462 + }, + { + "epoch": 7.847457627118644, + "grad_norm": 8.690500202952354, + "learning_rate": 3.807192133472069e-07, + "logits/chosen": 4.0055131912231445, + "logits/rejected": 4.724710941314697, + "logps/chosen": -13.636266708374023, + "logps/rejected": -28.918319702148438, + "loss": 0.0946, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31850284337997437, + "rewards/margins": 3.9023942947387695, + "rewards/rejected": -3.5838913917541504, + "step": 463 + }, + { + "epoch": 7.864406779661017, + "grad_norm": 8.955582537191807, + "learning_rate": 3.80088246965062e-07, + "logits/chosen": 2.3977673053741455, + "logits/rejected": 3.1791818141937256, + "logps/chosen": -11.766952514648438, + "logps/rejected": -30.629945755004883, + "loss": 0.1064, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18112745881080627, + "rewards/margins": 4.072152614593506, + "rewards/rejected": -3.8910253047943115, + "step": 464 + }, + { + "epoch": 7.88135593220339, + "grad_norm": 9.28701234024736, + "learning_rate": 3.794561421997734e-07, + "logits/chosen": -0.8194286823272705, + "logits/rejected": 0.3698238730430603, + "logps/chosen": -16.36358642578125, + "logps/rejected": -26.18773078918457, + "loss": 0.1126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6522509455680847, + "rewards/margins": 2.9714879989624023, + "rewards/rejected": -2.319237232208252, + "step": 465 + }, + { + "epoch": 7.898305084745763, + "grad_norm": 8.823571317960623, + "learning_rate": 3.78822904582797e-07, + "logits/chosen": 0.573715090751648, + "logits/rejected": 2.2726151943206787, + "logps/chosen": -15.998534202575684, + "logps/rejected": -27.229896545410156, + "loss": 0.1291, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6307635307312012, + "rewards/margins": 3.3138785362243652, + "rewards/rejected": -2.683115243911743, + "step": 466 + }, + { + "epoch": 7.915254237288136, + "grad_norm": 11.217955453701311, + "learning_rate": 3.781885396555019e-07, + "logits/chosen": 3.541079044342041, + "logits/rejected": 4.791148662567139, + "logps/chosen": -14.431351661682129, + "logps/rejected": -32.49176788330078, + "loss": 0.143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1487632840871811, + "rewards/margins": 3.8755033016204834, + "rewards/rejected": -3.7267403602600098, + "step": 467 + }, + { + "epoch": 7.932203389830509, + "grad_norm": 8.59517424315105, + "learning_rate": 3.775530529691227e-07, + "logits/chosen": 3.7382278442382812, + "logits/rejected": 4.195878028869629, + "logps/chosen": -11.119057655334473, + "logps/rejected": -25.03734588623047, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38498881459236145, + "rewards/margins": 3.7704718112945557, + "rewards/rejected": -3.3854832649230957, + "step": 468 + }, + { + "epoch": 7.9491525423728815, + "grad_norm": 8.901209729681504, + "learning_rate": 3.7691645008470997e-07, + "logits/chosen": -0.129108726978302, + "logits/rejected": 0.6862486600875854, + "logps/chosen": -14.666604995727539, + "logps/rejected": -38.28477096557617, + "loss": 0.126, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20774927735328674, + "rewards/margins": 4.353483200073242, + "rewards/rejected": -4.145733833312988, + "step": 469 + }, + { + "epoch": 7.966101694915254, + "grad_norm": 8.636425954622347, + "learning_rate": 3.7627873657308206e-07, + "logits/chosen": 1.7377265691757202, + "logits/rejected": 2.822047710418701, + "logps/chosen": -14.229039192199707, + "logps/rejected": -29.386869430541992, + "loss": 0.1252, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3835397958755493, + "rewards/margins": 4.004900932312012, + "rewards/rejected": -3.62136173248291, + "step": 470 + }, + { + "epoch": 7.983050847457627, + "grad_norm": 8.838726675318133, + "learning_rate": 3.7563991801477624e-07, + "logits/chosen": -0.580298662185669, + "logits/rejected": 0.00358683243393898, + "logps/chosen": -20.286121368408203, + "logps/rejected": -26.494842529296875, + "loss": 0.1069, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030002586543560028, + "rewards/margins": 3.3837900161743164, + "rewards/rejected": -3.353787660598755, + "step": 471 + }, + { + "epoch": 8.0, + "grad_norm": 10.328093966742186, + "learning_rate": 3.75e-07, + "logits/chosen": 0.8947811722755432, + "logits/rejected": 2.392275094985962, + "logps/chosen": -17.493534088134766, + "logps/rejected": -23.69567108154297, + "loss": 0.1471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6442322731018066, + "rewards/margins": 3.405353546142578, + "rewards/rejected": -2.7611215114593506, + "step": 472 + }, + { + "epoch": 8.016949152542374, + "grad_norm": 7.750172905554329, + "learning_rate": 3.743589881285818e-07, + "logits/chosen": 3.432040214538574, + "logits/rejected": 3.4083125591278076, + "logps/chosen": -19.61204719543457, + "logps/rejected": -26.032976150512695, + "loss": 0.1337, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05513594299554825, + "rewards/margins": 2.830720901489258, + "rewards/rejected": -2.8858566284179688, + "step": 473 + }, + { + "epoch": 8.033898305084746, + "grad_norm": 9.092042783490344, + "learning_rate": 3.737168880099223e-07, + "logits/chosen": 4.506225109100342, + "logits/rejected": 5.203121662139893, + "logps/chosen": -20.774394989013672, + "logps/rejected": -28.78338623046875, + "loss": 0.1235, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7455430626869202, + "rewards/margins": 3.0526225566864014, + "rewards/rejected": -2.307079792022705, + "step": 474 + }, + { + "epoch": 8.05084745762712, + "grad_norm": 9.314515722517227, + "learning_rate": 3.7307370526294553e-07, + "logits/chosen": 2.7973568439483643, + "logits/rejected": 3.653848648071289, + "logps/chosen": -21.094676971435547, + "logps/rejected": -27.839862823486328, + "loss": 0.1412, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3912617266178131, + "rewards/margins": 3.751455307006836, + "rewards/rejected": -3.3601937294006348, + "step": 475 + }, + { + "epoch": 8.067796610169491, + "grad_norm": 8.64064476803114, + "learning_rate": 3.724294455160491e-07, + "logits/chosen": 2.164475440979004, + "logits/rejected": 2.9439971446990967, + "logps/chosen": -16.0778865814209, + "logps/rejected": -32.343509674072266, + "loss": 0.0957, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5423865914344788, + "rewards/margins": 3.9885687828063965, + "rewards/rejected": -3.4461820125579834, + "step": 476 + }, + { + "epoch": 8.084745762711865, + "grad_norm": 8.258600230037871, + "learning_rate": 3.7178411440705556e-07, + "logits/chosen": 3.7295310497283936, + "logits/rejected": 4.5462646484375, + "logps/chosen": -14.436502456665039, + "logps/rejected": -28.71784019470215, + "loss": 0.124, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3619258999824524, + "rewards/margins": 3.4388718605041504, + "rewards/rejected": -3.076946496963501, + "step": 477 + }, + { + "epoch": 8.101694915254237, + "grad_norm": 8.448006933926113, + "learning_rate": 3.7113771758316255e-07, + "logits/chosen": 4.252035617828369, + "logits/rejected": 4.446700096130371, + "logps/chosen": -19.428070068359375, + "logps/rejected": -23.08380889892578, + "loss": 0.122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7620154023170471, + "rewards/margins": 2.585181951522827, + "rewards/rejected": -1.8231666088104248, + "step": 478 + }, + { + "epoch": 8.11864406779661, + "grad_norm": 6.910166322428136, + "learning_rate": 3.704902607008938e-07, + "logits/chosen": 0.2076493501663208, + "logits/rejected": 1.2938125133514404, + "logps/chosen": -21.162704467773438, + "logps/rejected": -30.260038375854492, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25260335206985474, + "rewards/margins": 3.3553013801574707, + "rewards/rejected": -3.1026980876922607, + "step": 479 + }, + { + "epoch": 8.135593220338983, + "grad_norm": 8.235876606138394, + "learning_rate": 3.698417494260494e-07, + "logits/chosen": 1.5194463729858398, + "logits/rejected": 3.1246232986450195, + "logps/chosen": -19.262439727783203, + "logps/rejected": -31.70151710510254, + "loss": 0.0994, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3763061761856079, + "rewards/margins": 4.7519121170043945, + "rewards/rejected": -4.375606536865234, + "step": 480 + }, + { + "epoch": 8.152542372881356, + "grad_norm": 8.235302236969599, + "learning_rate": 3.691921894336563e-07, + "logits/chosen": 0.1965102255344391, + "logits/rejected": 1.2409251928329468, + "logps/chosen": -13.936543464660645, + "logps/rejected": -26.925045013427734, + "loss": 0.1128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19482684135437012, + "rewards/margins": 4.248325347900391, + "rewards/rejected": -4.053498268127441, + "step": 481 + }, + { + "epoch": 8.169491525423728, + "grad_norm": 7.368546080599964, + "learning_rate": 3.685415864079185e-07, + "logits/chosen": 2.3343710899353027, + "logits/rejected": 3.2269225120544434, + "logps/chosen": -19.352066040039062, + "logps/rejected": -33.908836364746094, + "loss": 0.0824, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4641505777835846, + "rewards/margins": 4.385346412658691, + "rewards/rejected": -3.9211952686309814, + "step": 482 + }, + { + "epoch": 8.186440677966102, + "grad_norm": 9.063756247108762, + "learning_rate": 3.6788994604216764e-07, + "logits/chosen": 0.6629161834716797, + "logits/rejected": 1.4587095975875854, + "logps/chosen": -11.309172630310059, + "logps/rejected": -32.34671401977539, + "loss": 0.1014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24906420707702637, + "rewards/margins": 3.948303461074829, + "rewards/rejected": -3.6992392539978027, + "step": 483 + }, + { + "epoch": 8.203389830508474, + "grad_norm": 7.189037130415892, + "learning_rate": 3.6723727403881275e-07, + "logits/chosen": 1.5907362699508667, + "logits/rejected": 3.4336838722229004, + "logps/chosen": -18.35771942138672, + "logps/rejected": -27.41173553466797, + "loss": 0.0976, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.8187582492828369, + "rewards/margins": 3.567868232727051, + "rewards/rejected": -2.749110221862793, + "step": 484 + }, + { + "epoch": 8.220338983050848, + "grad_norm": 7.784083778398003, + "learning_rate": 3.665835761092908e-07, + "logits/chosen": -1.079679012298584, + "logits/rejected": -0.2996291220188141, + "logps/chosen": -15.97018814086914, + "logps/rejected": -25.22320556640625, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6464424133300781, + "rewards/margins": 3.0428719520568848, + "rewards/rejected": -2.3964295387268066, + "step": 485 + }, + { + "epoch": 8.23728813559322, + "grad_norm": 7.10522523168738, + "learning_rate": 3.659288579740163e-07, + "logits/chosen": 3.566469669342041, + "logits/rejected": 4.723283767700195, + "logps/chosen": -25.532649993896484, + "logps/rejected": -30.565183639526367, + "loss": 0.0953, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.48834341764450073, + "rewards/margins": 3.6361196041107178, + "rewards/rejected": -3.147775888442993, + "step": 486 + }, + { + "epoch": 8.254237288135593, + "grad_norm": 6.861689922111202, + "learning_rate": 3.6527312536233147e-07, + "logits/chosen": 2.6697049140930176, + "logits/rejected": 3.045163154602051, + "logps/chosen": -15.023626327514648, + "logps/rejected": -29.254968643188477, + "loss": 0.0858, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3370720446109772, + "rewards/margins": 3.555783987045288, + "rewards/rejected": -3.2187116146087646, + "step": 487 + }, + { + "epoch": 8.271186440677965, + "grad_norm": 7.755950966789172, + "learning_rate": 3.646163840124561e-07, + "logits/chosen": 1.4673826694488525, + "logits/rejected": 2.106332540512085, + "logps/chosen": -17.002397537231445, + "logps/rejected": -26.75397300720215, + "loss": 0.0894, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4947773218154907, + "rewards/margins": 3.349858522415161, + "rewards/rejected": -2.855081081390381, + "step": 488 + }, + { + "epoch": 8.288135593220339, + "grad_norm": 8.848971273791523, + "learning_rate": 3.639586396714374e-07, + "logits/chosen": 2.7336244583129883, + "logits/rejected": 2.796771764755249, + "logps/chosen": -12.618149757385254, + "logps/rejected": -23.861251831054688, + "loss": 0.1141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30856549739837646, + "rewards/margins": 3.3756675720214844, + "rewards/rejected": -3.0671024322509766, + "step": 489 + }, + { + "epoch": 8.305084745762711, + "grad_norm": 9.532540271403574, + "learning_rate": 3.6329989809509933e-07, + "logits/chosen": 0.5570180416107178, + "logits/rejected": 1.2071716785430908, + "logps/chosen": -15.186332702636719, + "logps/rejected": -30.408462524414062, + "loss": 0.1157, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5822116136550903, + "rewards/margins": 4.023227214813232, + "rewards/rejected": -3.4410154819488525, + "step": 490 + }, + { + "epoch": 8.322033898305085, + "grad_norm": 8.215897245877398, + "learning_rate": 3.626401650479927e-07, + "logits/chosen": 2.7207868099212646, + "logits/rejected": 3.3667051792144775, + "logps/chosen": -14.001188278198242, + "logps/rejected": -25.76093292236328, + "loss": 0.1102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2106812298297882, + "rewards/margins": 3.618776798248291, + "rewards/rejected": -3.408095359802246, + "step": 491 + }, + { + "epoch": 8.338983050847457, + "grad_norm": 9.754620312298025, + "learning_rate": 3.6197944630334465e-07, + "logits/chosen": -0.6451621055603027, + "logits/rejected": -0.036910831928253174, + "logps/chosen": -15.894433975219727, + "logps/rejected": -28.29058837890625, + "loss": 0.1122, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5367656946182251, + "rewards/margins": 3.88037109375, + "rewards/rejected": -3.3436052799224854, + "step": 492 + }, + { + "epoch": 8.35593220338983, + "grad_norm": 8.705719115692244, + "learning_rate": 3.6131774764300785e-07, + "logits/chosen": 3.9698734283447266, + "logits/rejected": 3.9669480323791504, + "logps/chosen": -15.973098754882812, + "logps/rejected": -21.522592544555664, + "loss": 0.1106, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.44191816449165344, + "rewards/margins": 2.4768028259277344, + "rewards/rejected": -2.0348846912384033, + "step": 493 + }, + { + "epoch": 8.372881355932204, + "grad_norm": 6.277591531069204, + "learning_rate": 3.6065507485741e-07, + "logits/chosen": 2.5741474628448486, + "logits/rejected": 3.6729209423065186, + "logps/chosen": -15.507135391235352, + "logps/rejected": -27.956684112548828, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36563146114349365, + "rewards/margins": 4.5596771240234375, + "rewards/rejected": -4.194045066833496, + "step": 494 + }, + { + "epoch": 8.389830508474576, + "grad_norm": 7.923605032689377, + "learning_rate": 3.5999143374550334e-07, + "logits/chosen": -0.3899872601032257, + "logits/rejected": 0.7094336748123169, + "logps/chosen": -20.5274715423584, + "logps/rejected": -33.68539810180664, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26818332076072693, + "rewards/margins": 3.903979778289795, + "rewards/rejected": -3.635796308517456, + "step": 495 + }, + { + "epoch": 8.40677966101695, + "grad_norm": 8.438194296029398, + "learning_rate": 3.593268301147139e-07, + "logits/chosen": 3.5322861671447754, + "logits/rejected": 4.039117336273193, + "logps/chosen": -15.595931053161621, + "logps/rejected": -25.944929122924805, + "loss": 0.1163, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5988155603408813, + "rewards/margins": 3.9967434406280518, + "rewards/rejected": -3.397927761077881, + "step": 496 + }, + { + "epoch": 8.423728813559322, + "grad_norm": 8.599338534257203, + "learning_rate": 3.586612697808902e-07, + "logits/chosen": 2.3289849758148193, + "logits/rejected": 3.1125564575195312, + "logps/chosen": -16.034414291381836, + "logps/rejected": -26.47426986694336, + "loss": 0.1103, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3328191041946411, + "rewards/margins": 3.7066524028778076, + "rewards/rejected": -3.373833417892456, + "step": 497 + }, + { + "epoch": 8.440677966101696, + "grad_norm": 9.964386548019352, + "learning_rate": 3.579947585682532e-07, + "logits/chosen": 0.27496790885925293, + "logits/rejected": 1.4074618816375732, + "logps/chosen": -16.960861206054688, + "logps/rejected": -39.56070327758789, + "loss": 0.0938, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18327996134757996, + "rewards/margins": 4.648859024047852, + "rewards/rejected": -4.465579032897949, + "step": 498 + }, + { + "epoch": 8.457627118644067, + "grad_norm": 7.602211724729448, + "learning_rate": 3.573273023093446e-07, + "logits/chosen": 3.052069664001465, + "logits/rejected": 4.1386637687683105, + "logps/chosen": -23.612491607666016, + "logps/rejected": -38.5206298828125, + "loss": 0.0975, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14806516468524933, + "rewards/margins": 4.848412990570068, + "rewards/rejected": -4.700348377227783, + "step": 499 + }, + { + "epoch": 8.474576271186441, + "grad_norm": 7.622477576386271, + "learning_rate": 3.5665890684497605e-07, + "logits/chosen": 2.440854072570801, + "logits/rejected": 2.863269329071045, + "logps/chosen": -16.440420150756836, + "logps/rejected": -33.62314224243164, + "loss": 0.0925, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2879524827003479, + "rewards/margins": 4.277005672454834, + "rewards/rejected": -4.564958095550537, + "step": 500 + }, + { + "epoch": 8.491525423728813, + "grad_norm": 8.484995585922134, + "learning_rate": 3.559895780241781e-07, + "logits/chosen": 2.012855291366577, + "logits/rejected": 3.6795272827148438, + "logps/chosen": -21.541399002075195, + "logps/rejected": -24.667354583740234, + "loss": 0.1068, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.7486720085144043, + "rewards/margins": 3.1486990451812744, + "rewards/rejected": -2.400026798248291, + "step": 501 + }, + { + "epoch": 8.508474576271187, + "grad_norm": 7.646847524397191, + "learning_rate": 3.553193217041489e-07, + "logits/chosen": 1.8552759885787964, + "logits/rejected": 2.502427577972412, + "logps/chosen": -15.881732940673828, + "logps/rejected": -27.009502410888672, + "loss": 0.0913, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28931599855422974, + "rewards/margins": 3.4781432151794434, + "rewards/rejected": -3.1888270378112793, + "step": 502 + }, + { + "epoch": 8.525423728813559, + "grad_norm": 7.1749270448200635, + "learning_rate": 3.546481437502032e-07, + "logits/chosen": 0.3308050036430359, + "logits/rejected": 1.2291865348815918, + "logps/chosen": -16.362823486328125, + "logps/rejected": -29.266414642333984, + "loss": 0.1001, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.471590518951416, + "rewards/margins": 3.7012906074523926, + "rewards/rejected": -3.2296998500823975, + "step": 503 + }, + { + "epoch": 8.542372881355933, + "grad_norm": 6.72550755962128, + "learning_rate": 3.539760500357206e-07, + "logits/chosen": 1.2928094863891602, + "logits/rejected": 2.6595168113708496, + "logps/chosen": -19.021821975708008, + "logps/rejected": -27.033843994140625, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34017112851142883, + "rewards/margins": 3.2771692276000977, + "rewards/rejected": -2.936998128890991, + "step": 504 + }, + { + "epoch": 8.559322033898304, + "grad_norm": 6.378108042023536, + "learning_rate": 3.533030464420945e-07, + "logits/chosen": 1.789313793182373, + "logits/rejected": 2.7768216133117676, + "logps/chosen": -19.944744110107422, + "logps/rejected": -34.9632682800293, + "loss": 0.0764, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15939801931381226, + "rewards/margins": 4.254008769989014, + "rewards/rejected": -4.413406848907471, + "step": 505 + }, + { + "epoch": 8.576271186440678, + "grad_norm": 8.242800714593855, + "learning_rate": 3.526291388586806e-07, + "logits/chosen": 0.0323818176984787, + "logits/rejected": 0.5169703364372253, + "logps/chosen": -13.888110160827637, + "logps/rejected": -29.98931884765625, + "loss": 0.1098, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.1510196030139923, + "rewards/margins": 2.6823315620422363, + "rewards/rejected": -2.531311511993408, + "step": 506 + }, + { + "epoch": 8.59322033898305, + "grad_norm": 8.403972897386142, + "learning_rate": 3.5195433318274515e-07, + "logits/chosen": 4.257566452026367, + "logits/rejected": 5.3338093757629395, + "logps/chosen": -20.00336456298828, + "logps/rejected": -34.83162307739258, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1355169117450714, + "rewards/margins": 4.166006565093994, + "rewards/rejected": -4.301523685455322, + "step": 507 + }, + { + "epoch": 8.610169491525424, + "grad_norm": 7.373643813935829, + "learning_rate": 3.5127863531941335e-07, + "logits/chosen": 1.3813129663467407, + "logits/rejected": 1.631213665008545, + "logps/chosen": -17.48348617553711, + "logps/rejected": -37.02750015258789, + "loss": 0.0825, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10136617720127106, + "rewards/margins": 4.755850791931152, + "rewards/rejected": -4.857217311859131, + "step": 508 + }, + { + "epoch": 8.627118644067796, + "grad_norm": 8.315993274864201, + "learning_rate": 3.5060205118161816e-07, + "logits/chosen": 1.8312853574752808, + "logits/rejected": 2.7615444660186768, + "logps/chosen": -22.70886993408203, + "logps/rejected": -27.375215530395508, + "loss": 0.1118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6258946657180786, + "rewards/margins": 4.108339786529541, + "rewards/rejected": -3.4824447631835938, + "step": 509 + }, + { + "epoch": 8.64406779661017, + "grad_norm": 8.304627193892093, + "learning_rate": 3.49924586690048e-07, + "logits/chosen": 0.5513447523117065, + "logits/rejected": 1.1990902423858643, + "logps/chosen": -18.463212966918945, + "logps/rejected": -23.737966537475586, + "loss": 0.1082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0850096940994263, + "rewards/margins": 3.221740484237671, + "rewards/rejected": -2.136730670928955, + "step": 510 + }, + { + "epoch": 8.661016949152543, + "grad_norm": 7.118882711880802, + "learning_rate": 3.4924624777309504e-07, + "logits/chosen": 0.539596676826477, + "logits/rejected": 1.4375852346420288, + "logps/chosen": -15.951204299926758, + "logps/rejected": -37.94011688232422, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2643980383872986, + "rewards/margins": 4.801435470581055, + "rewards/rejected": -5.065833568572998, + "step": 511 + }, + { + "epoch": 8.677966101694915, + "grad_norm": 7.605042914781135, + "learning_rate": 3.4856704036680355e-07, + "logits/chosen": 0.71272873878479, + "logits/rejected": 1.4662222862243652, + "logps/chosen": -14.760433197021484, + "logps/rejected": -30.09404945373535, + "loss": 0.0864, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23031166195869446, + "rewards/margins": 3.1281630992889404, + "rewards/rejected": -2.8978514671325684, + "step": 512 + }, + { + "epoch": 8.694915254237289, + "grad_norm": 8.335339415371848, + "learning_rate": 3.4788697041481786e-07, + "logits/chosen": 0.4760729670524597, + "logits/rejected": 2.0185904502868652, + "logps/chosen": -14.30174446105957, + "logps/rejected": -34.21358871459961, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6585089564323425, + "rewards/margins": 4.3875041007995605, + "rewards/rejected": -3.728994846343994, + "step": 513 + }, + { + "epoch": 8.711864406779661, + "grad_norm": 7.834839249551774, + "learning_rate": 3.472060438683302e-07, + "logits/chosen": 0.7659852504730225, + "logits/rejected": 2.2133705615997314, + "logps/chosen": -21.14748191833496, + "logps/rejected": -33.81575393676758, + "loss": 0.1055, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6409199833869934, + "rewards/margins": 4.414564609527588, + "rewards/rejected": -3.7736446857452393, + "step": 514 + }, + { + "epoch": 8.728813559322035, + "grad_norm": 8.345623124734605, + "learning_rate": 3.4652426668602863e-07, + "logits/chosen": 1.0636900663375854, + "logits/rejected": 2.09614634513855, + "logps/chosen": -13.39885139465332, + "logps/rejected": -24.176618576049805, + "loss": 0.1031, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15705546736717224, + "rewards/margins": 3.6203300952911377, + "rewards/rejected": -3.7773852348327637, + "step": 515 + }, + { + "epoch": 8.745762711864407, + "grad_norm": 8.080741957241715, + "learning_rate": 3.4584164483404535e-07, + "logits/chosen": 1.1965031623840332, + "logits/rejected": 2.299811363220215, + "logps/chosen": -13.63241195678711, + "logps/rejected": -23.092220306396484, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31253504753112793, + "rewards/margins": 3.104691982269287, + "rewards/rejected": -2.79215669631958, + "step": 516 + }, + { + "epoch": 8.76271186440678, + "grad_norm": 8.47589990486935, + "learning_rate": 3.4515818428590393e-07, + "logits/chosen": 1.4136298894882202, + "logits/rejected": 2.353076696395874, + "logps/chosen": -16.381393432617188, + "logps/rejected": -27.299104690551758, + "loss": 0.091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3608657121658325, + "rewards/margins": 3.4944076538085938, + "rewards/rejected": -3.1335418224334717, + "step": 517 + }, + { + "epoch": 8.779661016949152, + "grad_norm": 7.546298321983651, + "learning_rate": 3.444738910224671e-07, + "logits/chosen": 0.9574793577194214, + "logits/rejected": 1.109427809715271, + "logps/chosen": -16.541025161743164, + "logps/rejected": -25.279096603393555, + "loss": 0.106, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5484856367111206, + "rewards/margins": 2.9023871421813965, + "rewards/rejected": -2.3539016246795654, + "step": 518 + }, + { + "epoch": 8.796610169491526, + "grad_norm": 7.868010821435501, + "learning_rate": 3.437887710318848e-07, + "logits/chosen": 0.7616167068481445, + "logits/rejected": 1.7057359218597412, + "logps/chosen": -16.0111026763916, + "logps/rejected": -28.748083114624023, + "loss": 0.1059, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7589426636695862, + "rewards/margins": 3.7685017585754395, + "rewards/rejected": -3.009559154510498, + "step": 519 + }, + { + "epoch": 8.813559322033898, + "grad_norm": 7.77017102372031, + "learning_rate": 3.4310283030954146e-07, + "logits/chosen": -3.4271130561828613, + "logits/rejected": -2.246941328048706, + "logps/chosen": -20.427770614624023, + "logps/rejected": -27.396621704101562, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33634325861930847, + "rewards/margins": 4.049783706665039, + "rewards/rejected": -3.71343994140625, + "step": 520 + }, + { + "epoch": 8.830508474576272, + "grad_norm": 8.993381798716076, + "learning_rate": 3.4241607485800363e-07, + "logits/chosen": 4.1821794509887695, + "logits/rejected": 5.369024753570557, + "logps/chosen": -11.20211124420166, + "logps/rejected": -31.801847457885742, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26158803701400757, + "rewards/margins": 4.944797039031982, + "rewards/rejected": -4.68320894241333, + "step": 521 + }, + { + "epoch": 8.847457627118644, + "grad_norm": 8.92274871440318, + "learning_rate": 3.417285106869673e-07, + "logits/chosen": 1.3400788307189941, + "logits/rejected": 2.010988235473633, + "logps/chosen": -19.691082000732422, + "logps/rejected": -32.518096923828125, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15838858485221863, + "rewards/margins": 4.223845481872559, + "rewards/rejected": -4.065457344055176, + "step": 522 + }, + { + "epoch": 8.864406779661017, + "grad_norm": 7.789159880434814, + "learning_rate": 3.4104014381320555e-07, + "logits/chosen": 3.6833863258361816, + "logits/rejected": 3.613530158996582, + "logps/chosen": -13.66424560546875, + "logps/rejected": -29.675308227539062, + "loss": 0.1182, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25313207507133484, + "rewards/margins": 4.033567905426025, + "rewards/rejected": -3.780435562133789, + "step": 523 + }, + { + "epoch": 8.88135593220339, + "grad_norm": 7.386491120135085, + "learning_rate": 3.403509802605159e-07, + "logits/chosen": 1.8282943964004517, + "logits/rejected": 2.981198310852051, + "logps/chosen": -12.948617935180664, + "logps/rejected": -29.092636108398438, + "loss": 0.085, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14645610749721527, + "rewards/margins": 4.348095893859863, + "rewards/rejected": -4.201639652252197, + "step": 524 + }, + { + "epoch": 8.898305084745763, + "grad_norm": 8.599775984807337, + "learning_rate": 3.396610260596673e-07, + "logits/chosen": 1.1022146940231323, + "logits/rejected": 2.0041048526763916, + "logps/chosen": -17.718761444091797, + "logps/rejected": -31.86960220336914, + "loss": 0.1063, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.21742774546146393, + "rewards/margins": 3.964740753173828, + "rewards/rejected": -3.7473130226135254, + "step": 525 + }, + { + "epoch": 8.915254237288135, + "grad_norm": 9.74148232877656, + "learning_rate": 3.389702872483477e-07, + "logits/chosen": -1.2812941074371338, + "logits/rejected": -0.24090474843978882, + "logps/chosen": -15.892394065856934, + "logps/rejected": -24.498292922973633, + "loss": 0.1197, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4295375943183899, + "rewards/margins": 2.9514575004577637, + "rewards/rejected": -2.5219202041625977, + "step": 526 + }, + { + "epoch": 8.932203389830509, + "grad_norm": 8.085275421779402, + "learning_rate": 3.38278769871111e-07, + "logits/chosen": 0.18231505155563354, + "logits/rejected": 1.6734726428985596, + "logps/chosen": -14.41200065612793, + "logps/rejected": -24.13207244873047, + "loss": 0.1144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45978984236717224, + "rewards/margins": 3.5683913230895996, + "rewards/rejected": -3.1086015701293945, + "step": 527 + }, + { + "epoch": 8.94915254237288, + "grad_norm": 7.581029960559348, + "learning_rate": 3.375864799793242e-07, + "logits/chosen": 0.2579716444015503, + "logits/rejected": 0.8376175761222839, + "logps/chosen": -15.547198295593262, + "logps/rejected": -22.652027130126953, + "loss": 0.0996, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4138343930244446, + "rewards/margins": 3.735337495803833, + "rewards/rejected": -3.321503162384033, + "step": 528 + }, + { + "epoch": 8.966101694915254, + "grad_norm": 8.98481843263997, + "learning_rate": 3.368934236311143e-07, + "logits/chosen": 0.5636337995529175, + "logits/rejected": 0.6224699020385742, + "logps/chosen": -19.514921188354492, + "logps/rejected": -30.25006675720215, + "loss": 0.1024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2242983728647232, + "rewards/margins": 3.2491581439971924, + "rewards/rejected": -3.02485990524292, + "step": 529 + }, + { + "epoch": 8.983050847457626, + "grad_norm": 8.623348675400907, + "learning_rate": 3.361996068913159e-07, + "logits/chosen": -0.3652976155281067, + "logits/rejected": 1.0697021484375, + "logps/chosen": -16.017398834228516, + "logps/rejected": -34.6381950378418, + "loss": 0.1104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0027234703302383423, + "rewards/margins": 4.782834529876709, + "rewards/rejected": -4.785558223724365, + "step": 530 + }, + { + "epoch": 9.0, + "grad_norm": 6.966863469240621, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": 0.13183480501174927, + "logits/rejected": 1.832362174987793, + "logps/chosen": -15.880701065063477, + "logps/rejected": -31.734975814819336, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.018413469195365906, + "rewards/margins": 4.415182113647461, + "rewards/rejected": -4.433596134185791, + "step": 531 + }, + { + "epoch": 9.016949152542374, + "grad_norm": 7.934422577021392, + "learning_rate": 3.348097165295075e-07, + "logits/chosen": 1.4493695497512817, + "logits/rejected": 1.6005449295043945, + "logps/chosen": -16.320154190063477, + "logps/rejected": -30.15085792541504, + "loss": 0.103, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.31645795702934265, + "rewards/margins": 4.485091209411621, + "rewards/rejected": -4.168633460998535, + "step": 532 + }, + { + "epoch": 9.033898305084746, + "grad_norm": 7.861889123608613, + "learning_rate": 3.341136550702241e-07, + "logits/chosen": 3.3205459117889404, + "logits/rejected": 3.6725425720214844, + "logps/chosen": -18.245128631591797, + "logps/rejected": -32.34790802001953, + "loss": 0.0856, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05831918120384216, + "rewards/margins": 4.102582931518555, + "rewards/rejected": -4.044262886047363, + "step": 533 + }, + { + "epoch": 9.05084745762712, + "grad_norm": 7.675292482188158, + "learning_rate": 3.334168575446985e-07, + "logits/chosen": -0.5090641379356384, + "logits/rejected": 0.2972201108932495, + "logps/chosen": -17.58763313293457, + "logps/rejected": -30.431671142578125, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.13420388102531433, + "rewards/margins": 3.9743454456329346, + "rewards/rejected": -4.108549118041992, + "step": 534 + }, + { + "epoch": 9.067796610169491, + "grad_norm": 6.42736068110364, + "learning_rate": 3.327193300505035e-07, + "logits/chosen": 0.4148287773132324, + "logits/rejected": 0.887850821018219, + "logps/chosen": -16.828723907470703, + "logps/rejected": -36.98302459716797, + "loss": 0.0818, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18271449208259583, + "rewards/margins": 4.25698709487915, + "rewards/rejected": -4.074272155761719, + "step": 535 + }, + { + "epoch": 9.084745762711865, + "grad_norm": 6.860146645240362, + "learning_rate": 3.3202107869159967e-07, + "logits/chosen": -1.478266716003418, + "logits/rejected": -1.039851188659668, + "logps/chosen": -20.26593017578125, + "logps/rejected": -30.06897735595703, + "loss": 0.0667, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8925938606262207, + "rewards/margins": 4.3639235496521, + "rewards/rejected": -3.4713294506073, + "step": 536 + }, + { + "epoch": 9.101694915254237, + "grad_norm": 7.308444071525899, + "learning_rate": 3.313221095782822e-07, + "logits/chosen": 0.33444347977638245, + "logits/rejected": 1.7593356370925903, + "logps/chosen": -17.03250503540039, + "logps/rejected": -31.375883102416992, + "loss": 0.0858, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5907961130142212, + "rewards/margins": 4.174586772918701, + "rewards/rejected": -3.5837903022766113, + "step": 537 + }, + { + "epoch": 9.11864406779661, + "grad_norm": 7.017699161634787, + "learning_rate": 3.306224288271272e-07, + "logits/chosen": 4.382320404052734, + "logits/rejected": 5.0999627113342285, + "logps/chosen": -12.660648345947266, + "logps/rejected": -29.118486404418945, + "loss": 0.0845, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3442675471305847, + "rewards/margins": 3.5753989219665527, + "rewards/rejected": -3.2311315536499023, + "step": 538 + }, + { + "epoch": 9.135593220338983, + "grad_norm": 7.4522176589996585, + "learning_rate": 3.2992204256093807e-07, + "logits/chosen": -2.729804277420044, + "logits/rejected": -2.560558319091797, + "logps/chosen": -20.1125545501709, + "logps/rejected": -32.09090042114258, + "loss": 0.1015, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.01902557909488678, + "rewards/margins": 3.7781434059143066, + "rewards/rejected": -3.75911808013916, + "step": 539 + }, + { + "epoch": 9.152542372881356, + "grad_norm": 7.283164358559854, + "learning_rate": 3.2922095690869224e-07, + "logits/chosen": 0.20417343080043793, + "logits/rejected": 1.5824004411697388, + "logps/chosen": -12.8890962600708, + "logps/rejected": -31.73255729675293, + "loss": 0.0836, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18634510040283203, + "rewards/margins": 5.074504375457764, + "rewards/rejected": -4.88815975189209, + "step": 540 + }, + { + "epoch": 9.169491525423728, + "grad_norm": 6.519601729714241, + "learning_rate": 3.2851917800548725e-07, + "logits/chosen": -0.5393965840339661, + "logits/rejected": -0.480104923248291, + "logps/chosen": -21.48811149597168, + "logps/rejected": -34.56924819946289, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3659580945968628, + "rewards/margins": 4.822329044342041, + "rewards/rejected": -4.456370830535889, + "step": 541 + }, + { + "epoch": 9.186440677966102, + "grad_norm": 8.893575269669139, + "learning_rate": 3.278167119924871e-07, + "logits/chosen": 2.511479377746582, + "logits/rejected": 3.0335445404052734, + "logps/chosen": -21.226470947265625, + "logps/rejected": -31.758705139160156, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.021360307931900024, + "rewards/margins": 4.572617053985596, + "rewards/rejected": -4.593977451324463, + "step": 542 + }, + { + "epoch": 9.203389830508474, + "grad_norm": 7.151569424146644, + "learning_rate": 3.2711356501686886e-07, + "logits/chosen": 0.11793151497840881, + "logits/rejected": 2.1041746139526367, + "logps/chosen": -15.034038543701172, + "logps/rejected": -31.883384704589844, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4007861316204071, + "rewards/margins": 4.863787651062012, + "rewards/rejected": -4.463001728057861, + "step": 543 + }, + { + "epoch": 9.220338983050848, + "grad_norm": 5.900554985023887, + "learning_rate": 3.2640974323176843e-07, + "logits/chosen": -1.388903260231018, + "logits/rejected": -0.5128521919250488, + "logps/chosen": -11.40340518951416, + "logps/rejected": -26.74514389038086, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.631538987159729, + "rewards/margins": 3.9903998374938965, + "rewards/rejected": -3.358860969543457, + "step": 544 + }, + { + "epoch": 9.23728813559322, + "grad_norm": 8.099563527529172, + "learning_rate": 3.257052527962269e-07, + "logits/chosen": -3.805596113204956, + "logits/rejected": -2.9346771240234375, + "logps/chosen": -15.774307250976562, + "logps/rejected": -27.184717178344727, + "loss": 0.096, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3437478840351105, + "rewards/margins": 3.4009368419647217, + "rewards/rejected": -3.0571885108947754, + "step": 545 + }, + { + "epoch": 9.254237288135593, + "grad_norm": 8.00979281359722, + "learning_rate": 3.250000998751365e-07, + "logits/chosen": 0.29453498125076294, + "logits/rejected": 1.2413694858551025, + "logps/chosen": -17.441308975219727, + "logps/rejected": -32.38224792480469, + "loss": 0.0931, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14581690728664398, + "rewards/margins": 3.907651662826538, + "rewards/rejected": -4.053468227386475, + "step": 546 + }, + { + "epoch": 9.271186440677965, + "grad_norm": 6.256499605525865, + "learning_rate": 3.2429429063918694e-07, + "logits/chosen": 1.1686536073684692, + "logits/rejected": 1.2653439044952393, + "logps/chosen": -14.776887893676758, + "logps/rejected": -27.53533363342285, + "loss": 0.0807, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.022234037518501282, + "rewards/margins": 3.4280288219451904, + "rewards/rejected": -3.450263023376465, + "step": 547 + }, + { + "epoch": 9.288135593220339, + "grad_norm": 7.36750818731658, + "learning_rate": 3.235878312648112e-07, + "logits/chosen": 0.6835775375366211, + "logits/rejected": 1.4044272899627686, + "logps/chosen": -11.718465805053711, + "logps/rejected": -29.979129791259766, + "loss": 0.1092, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4759228229522705, + "rewards/margins": 4.728598594665527, + "rewards/rejected": -4.252676486968994, + "step": 548 + }, + { + "epoch": 9.305084745762711, + "grad_norm": 7.267461085469627, + "learning_rate": 3.2288072793413147e-07, + "logits/chosen": 0.21600386500358582, + "logits/rejected": 0.11410781741142273, + "logps/chosen": -17.949951171875, + "logps/rejected": -23.075180053710938, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7247830629348755, + "rewards/margins": 3.1238279342651367, + "rewards/rejected": -2.3990447521209717, + "step": 549 + }, + { + "epoch": 9.322033898305085, + "grad_norm": 20.44947669682873, + "learning_rate": 3.2217298683490525e-07, + "logits/chosen": 1.887885332107544, + "logits/rejected": 2.2117202281951904, + "logps/chosen": -14.434246063232422, + "logps/rejected": -26.846405029296875, + "loss": 0.0903, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7232978940010071, + "rewards/margins": 4.097745895385742, + "rewards/rejected": -3.374448299407959, + "step": 550 + }, + { + "epoch": 9.338983050847457, + "grad_norm": 6.959074038410189, + "learning_rate": 3.214646141604709e-07, + "logits/chosen": 0.24728596210479736, + "logits/rejected": 1.4561734199523926, + "logps/chosen": -26.268035888671875, + "logps/rejected": -29.52174186706543, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5730979442596436, + "rewards/margins": 3.468724250793457, + "rewards/rejected": -2.8956265449523926, + "step": 551 + }, + { + "epoch": 9.35593220338983, + "grad_norm": 6.313077671551259, + "learning_rate": 3.2075561610969347e-07, + "logits/chosen": 1.3113412857055664, + "logits/rejected": 2.5229415893554688, + "logps/chosen": -19.23296546936035, + "logps/rejected": -33.98227310180664, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0007563233375549316, + "rewards/margins": 4.5488176345825195, + "rewards/rejected": -4.549574375152588, + "step": 552 + }, + { + "epoch": 9.372881355932204, + "grad_norm": 6.765231684504796, + "learning_rate": 3.200459988869111e-07, + "logits/chosen": 2.283677101135254, + "logits/rejected": 3.3818447589874268, + "logps/chosen": -17.183874130249023, + "logps/rejected": -27.654109954833984, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10292646288871765, + "rewards/margins": 3.9736781120300293, + "rewards/rejected": -4.07660436630249, + "step": 553 + }, + { + "epoch": 9.389830508474576, + "grad_norm": 7.653842264043208, + "learning_rate": 3.193357687018797e-07, + "logits/chosen": 3.6839277744293213, + "logits/rejected": 3.4737772941589355, + "logps/chosen": -15.96898078918457, + "logps/rejected": -32.69147872924805, + "loss": 0.0996, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.23338308930397034, + "rewards/margins": 5.1718363761901855, + "rewards/rejected": -4.938453674316406, + "step": 554 + }, + { + "epoch": 9.40677966101695, + "grad_norm": 7.773525249469395, + "learning_rate": 3.186249317697194e-07, + "logits/chosen": 1.9703714847564697, + "logits/rejected": 2.7468535900115967, + "logps/chosen": -23.632442474365234, + "logps/rejected": -31.607494354248047, + "loss": 0.0793, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16064058244228363, + "rewards/margins": 4.493778228759766, + "rewards/rejected": -4.333136558532715, + "step": 555 + }, + { + "epoch": 9.423728813559322, + "grad_norm": 6.042060539510561, + "learning_rate": 3.1791349431085965e-07, + "logits/chosen": 1.4269167184829712, + "logits/rejected": 2.471031427383423, + "logps/chosen": -15.8950834274292, + "logps/rejected": -30.275249481201172, + "loss": 0.0718, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.443370521068573, + "rewards/margins": 4.576178550720215, + "rewards/rejected": -4.132808208465576, + "step": 556 + }, + { + "epoch": 9.440677966101696, + "grad_norm": 7.04877899953803, + "learning_rate": 3.1720146255098537e-07, + "logits/chosen": -3.6721673011779785, + "logits/rejected": -1.3306491374969482, + "logps/chosen": -15.854928970336914, + "logps/rejected": -34.22749328613281, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24515631794929504, + "rewards/margins": 5.318935394287109, + "rewards/rejected": -5.073779106140137, + "step": 557 + }, + { + "epoch": 9.457627118644067, + "grad_norm": 6.415901347555323, + "learning_rate": 3.1648884272098177e-07, + "logits/chosen": -1.1916613578796387, + "logits/rejected": -0.4890459477901459, + "logps/chosen": -12.988748550415039, + "logps/rejected": -18.562076568603516, + "loss": 0.0831, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3931862711906433, + "rewards/margins": 2.553929328918457, + "rewards/rejected": -2.160742998123169, + "step": 558 + }, + { + "epoch": 9.474576271186441, + "grad_norm": 8.236180250931438, + "learning_rate": 3.157756410568803e-07, + "logits/chosen": -0.5213596820831299, + "logits/rejected": 0.4198833107948303, + "logps/chosen": -16.794715881347656, + "logps/rejected": -24.685680389404297, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47782301902770996, + "rewards/margins": 2.9782776832580566, + "rewards/rejected": -2.5004544258117676, + "step": 559 + }, + { + "epoch": 9.491525423728813, + "grad_norm": 6.682073953644928, + "learning_rate": 3.150618637998041e-07, + "logits/chosen": -1.3392764329910278, + "logits/rejected": -0.44644010066986084, + "logps/chosen": -14.473580360412598, + "logps/rejected": -28.390796661376953, + "loss": 0.1011, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40848979353904724, + "rewards/margins": 4.742112636566162, + "rewards/rejected": -4.333622455596924, + "step": 560 + }, + { + "epoch": 9.508474576271187, + "grad_norm": 7.9822120644527175, + "learning_rate": 3.1434751719591305e-07, + "logits/chosen": -2.3160512447357178, + "logits/rejected": -2.051666736602783, + "logps/chosen": -18.745946884155273, + "logps/rejected": -34.98544692993164, + "loss": 0.0725, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22431953251361847, + "rewards/margins": 3.5555598735809326, + "rewards/rejected": -3.7798798084259033, + "step": 561 + }, + { + "epoch": 9.525423728813559, + "grad_norm": 6.991538298987854, + "learning_rate": 3.136326074963494e-07, + "logits/chosen": 1.2757248878479004, + "logits/rejected": 1.5806338787078857, + "logps/chosen": -16.574125289916992, + "logps/rejected": -25.393587112426758, + "loss": 0.0897, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2125283181667328, + "rewards/margins": 3.118790864944458, + "rewards/rejected": -3.3313193321228027, + "step": 562 + }, + { + "epoch": 9.542372881355933, + "grad_norm": 7.369679468767769, + "learning_rate": 3.1291714095718294e-07, + "logits/chosen": 2.265183448791504, + "logits/rejected": 2.9056434631347656, + "logps/chosen": -11.20726203918457, + "logps/rejected": -30.12190818786621, + "loss": 0.0899, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08223026990890503, + "rewards/margins": 4.531458854675293, + "rewards/rejected": -4.449228763580322, + "step": 563 + }, + { + "epoch": 9.559322033898304, + "grad_norm": 6.569787571821808, + "learning_rate": 3.122011238393562e-07, + "logits/chosen": -0.13098454475402832, + "logits/rejected": 0.32105502486228943, + "logps/chosen": -11.81690788269043, + "logps/rejected": -22.512004852294922, + "loss": 0.0747, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28879374265670776, + "rewards/margins": 3.5874359607696533, + "rewards/rejected": -3.298642158508301, + "step": 564 + }, + { + "epoch": 9.576271186440678, + "grad_norm": 6.887546375786722, + "learning_rate": 3.1148456240862993e-07, + "logits/chosen": 1.671111822128296, + "logits/rejected": 2.590162992477417, + "logps/chosen": -18.62039566040039, + "logps/rejected": -37.18769073486328, + "loss": 0.0721, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12747378647327423, + "rewards/margins": 5.317519187927246, + "rewards/rejected": -5.444993019104004, + "step": 565 + }, + { + "epoch": 9.59322033898305, + "grad_norm": 6.474331733578539, + "learning_rate": 3.1076746293552785e-07, + "logits/chosen": -0.6289358735084534, + "logits/rejected": 0.30806756019592285, + "logps/chosen": -13.156976699829102, + "logps/rejected": -35.53199005126953, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07099157571792603, + "rewards/margins": 5.9290876388549805, + "rewards/rejected": -5.858095645904541, + "step": 566 + }, + { + "epoch": 9.610169491525424, + "grad_norm": 6.754288182278723, + "learning_rate": 3.1004983169528225e-07, + "logits/chosen": 1.0062041282653809, + "logits/rejected": 1.5405733585357666, + "logps/chosen": -14.011225700378418, + "logps/rejected": -28.679346084594727, + "loss": 0.0821, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.006055355072021484, + "rewards/margins": 4.468363285064697, + "rewards/rejected": -4.462307929992676, + "step": 567 + }, + { + "epoch": 9.627118644067796, + "grad_norm": 6.7447175824089785, + "learning_rate": 3.0933167496777873e-07, + "logits/chosen": 2.5769147872924805, + "logits/rejected": 3.43656587600708, + "logps/chosen": -14.334616661071777, + "logps/rejected": -23.00216293334961, + "loss": 0.0792, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5674694776535034, + "rewards/margins": 3.888597249984741, + "rewards/rejected": -3.3211278915405273, + "step": 568 + }, + { + "epoch": 9.64406779661017, + "grad_norm": 7.580248860572857, + "learning_rate": 3.0861299903750115e-07, + "logits/chosen": -2.0739083290100098, + "logits/rejected": -1.222943902015686, + "logps/chosen": -18.079673767089844, + "logps/rejected": -33.84402847290039, + "loss": 0.103, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08615212887525558, + "rewards/margins": 5.459482192993164, + "rewards/rejected": -5.5456342697143555, + "step": 569 + }, + { + "epoch": 9.661016949152543, + "grad_norm": 6.992408377021299, + "learning_rate": 3.0789381019347724e-07, + "logits/chosen": 1.04190993309021, + "logits/rejected": 1.8616278171539307, + "logps/chosen": -11.107009887695312, + "logps/rejected": -24.896486282348633, + "loss": 0.0826, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5456734895706177, + "rewards/margins": 3.182859182357788, + "rewards/rejected": -2.63718581199646, + "step": 570 + }, + { + "epoch": 9.677966101694915, + "grad_norm": 6.873279551179824, + "learning_rate": 3.071741147292229e-07, + "logits/chosen": 1.097784161567688, + "logits/rejected": 1.5534999370574951, + "logps/chosen": -19.162097930908203, + "logps/rejected": -30.126440048217773, + "loss": 0.0849, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3364775478839874, + "rewards/margins": 4.0454020500183105, + "rewards/rejected": -3.7089245319366455, + "step": 571 + }, + { + "epoch": 9.694915254237289, + "grad_norm": 7.3910530095138816, + "learning_rate": 3.0645391894268734e-07, + "logits/chosen": 0.8061460256576538, + "logits/rejected": 0.8003032803535461, + "logps/chosen": -18.292184829711914, + "logps/rejected": -37.91918182373047, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048791974782943726, + "rewards/margins": 4.811036586761475, + "rewards/rejected": -4.859828472137451, + "step": 572 + }, + { + "epoch": 9.711864406779661, + "grad_norm": 5.996192080077326, + "learning_rate": 3.057332291361983e-07, + "logits/chosen": -0.12891456484794617, + "logits/rejected": 1.2491331100463867, + "logps/chosen": -17.618732452392578, + "logps/rejected": -30.310043334960938, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26622286438941956, + "rewards/margins": 4.7754058837890625, + "rewards/rejected": -4.509182929992676, + "step": 573 + }, + { + "epoch": 9.728813559322035, + "grad_norm": 6.171874008126151, + "learning_rate": 3.050120516164062e-07, + "logits/chosen": -0.018170345574617386, + "logits/rejected": 1.102428913116455, + "logps/chosen": -17.116703033447266, + "logps/rejected": -36.32782745361328, + "loss": 0.0823, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4552394449710846, + "rewards/margins": 5.033170700073242, + "rewards/rejected": -4.577930927276611, + "step": 574 + }, + { + "epoch": 9.745762711864407, + "grad_norm": 7.451739032357378, + "learning_rate": 3.042903926942297e-07, + "logits/chosen": -1.0211288928985596, + "logits/rejected": 0.3760063350200653, + "logps/chosen": -20.83000373840332, + "logps/rejected": -33.21617889404297, + "loss": 0.0854, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09166756272315979, + "rewards/margins": 4.8829121589660645, + "rewards/rejected": -4.791244983673096, + "step": 575 + }, + { + "epoch": 9.76271186440678, + "grad_norm": 6.577485413860199, + "learning_rate": 3.0356825868480014e-07, + "logits/chosen": 1.2451457977294922, + "logits/rejected": 1.2695012092590332, + "logps/chosen": -14.58353328704834, + "logps/rejected": -26.606840133666992, + "loss": 0.0827, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3149658143520355, + "rewards/margins": 3.5247819423675537, + "rewards/rejected": -3.2098162174224854, + "step": 576 + }, + { + "epoch": 9.779661016949152, + "grad_norm": 7.124665800971439, + "learning_rate": 3.0284565590740607e-07, + "logits/chosen": 0.004668239504098892, + "logits/rejected": 1.3000279664993286, + "logps/chosen": -15.21019172668457, + "logps/rejected": -32.77819061279297, + "loss": 0.0944, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3152053952217102, + "rewards/margins": 4.309925079345703, + "rewards/rejected": -3.9947195053100586, + "step": 577 + }, + { + "epoch": 9.796610169491526, + "grad_norm": 7.258911609377368, + "learning_rate": 3.021225906854383e-07, + "logits/chosen": 0.9472925066947937, + "logits/rejected": 0.8469686508178711, + "logps/chosen": -16.569828033447266, + "logps/rejected": -26.66617202758789, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1706191599369049, + "rewards/margins": 3.989084243774414, + "rewards/rejected": -3.818464756011963, + "step": 578 + }, + { + "epoch": 9.813559322033898, + "grad_norm": 6.194678399120452, + "learning_rate": 3.013990693463344e-07, + "logits/chosen": 2.516047954559326, + "logits/rejected": 3.760178565979004, + "logps/chosen": -19.39842987060547, + "logps/rejected": -27.496429443359375, + "loss": 0.0709, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6658935546875, + "rewards/margins": 3.9277522563934326, + "rewards/rejected": -3.2618587017059326, + "step": 579 + }, + { + "epoch": 9.830508474576272, + "grad_norm": 6.308757033182929, + "learning_rate": 3.006750982215234e-07, + "logits/chosen": -0.16934671998023987, + "logits/rejected": 0.16713052988052368, + "logps/chosen": -20.933340072631836, + "logps/rejected": -27.842639923095703, + "loss": 0.0902, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43451377749443054, + "rewards/margins": 3.2033681869506836, + "rewards/rejected": -2.7688541412353516, + "step": 580 + }, + { + "epoch": 9.847457627118644, + "grad_norm": 7.185771510108942, + "learning_rate": 2.9995068364637023e-07, + "logits/chosen": 0.970811665058136, + "logits/rejected": 1.682969331741333, + "logps/chosen": -11.327841758728027, + "logps/rejected": -30.293479919433594, + "loss": 0.0889, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.22116993367671967, + "rewards/margins": 4.630741596221924, + "rewards/rejected": -4.409571647644043, + "step": 581 + }, + { + "epoch": 9.864406779661017, + "grad_norm": 5.920235267116203, + "learning_rate": 2.9922583196012035e-07, + "logits/chosen": 2.227673053741455, + "logits/rejected": 3.2266247272491455, + "logps/chosen": -14.300827980041504, + "logps/rejected": -26.003128051757812, + "loss": 0.0834, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.379828542470932, + "rewards/margins": 3.9268646240234375, + "rewards/rejected": -3.5470361709594727, + "step": 582 + }, + { + "epoch": 9.88135593220339, + "grad_norm": 8.120498709040012, + "learning_rate": 2.985005495058446e-07, + "logits/chosen": 2.3699660301208496, + "logits/rejected": 3.4204599857330322, + "logps/chosen": -13.776253700256348, + "logps/rejected": -27.62920570373535, + "loss": 0.0801, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11627595126628876, + "rewards/margins": 4.1572794914245605, + "rewards/rejected": -4.041003227233887, + "step": 583 + }, + { + "epoch": 9.898305084745763, + "grad_norm": 6.7905769224670385, + "learning_rate": 2.9777484263038303e-07, + "logits/chosen": -0.20370978116989136, + "logits/rejected": -0.03232604265213013, + "logps/chosen": -19.46759605407715, + "logps/rejected": -32.68392562866211, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04020601511001587, + "rewards/margins": 4.452649116516113, + "rewards/rejected": -4.492855072021484, + "step": 584 + }, + { + "epoch": 9.915254237288135, + "grad_norm": 8.057292474544546, + "learning_rate": 2.9704871768429016e-07, + "logits/chosen": 0.16171008348464966, + "logits/rejected": 0.7341707348823547, + "logps/chosen": -19.58940315246582, + "logps/rejected": -27.96889877319336, + "loss": 0.099, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6214279532432556, + "rewards/margins": 3.7912871837615967, + "rewards/rejected": -3.1698591709136963, + "step": 585 + }, + { + "epoch": 9.932203389830509, + "grad_norm": 7.937563375803782, + "learning_rate": 2.9632218102177856e-07, + "logits/chosen": 1.5007938146591187, + "logits/rejected": 2.5846619606018066, + "logps/chosen": -15.489872932434082, + "logps/rejected": -25.30343246459961, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.508277416229248, + "rewards/margins": 3.338674306869507, + "rewards/rejected": -2.830397129058838, + "step": 586 + }, + { + "epoch": 9.94915254237288, + "grad_norm": 6.201025361723098, + "learning_rate": 2.9559523900066393e-07, + "logits/chosen": 3.0162744522094727, + "logits/rejected": 4.763891696929932, + "logps/chosen": -15.360055923461914, + "logps/rejected": -23.94509506225586, + "loss": 0.0815, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10628265142440796, + "rewards/margins": 3.4138388633728027, + "rewards/rejected": -3.30755615234375, + "step": 587 + }, + { + "epoch": 9.966101694915254, + "grad_norm": 6.718672190898394, + "learning_rate": 2.948678979823092e-07, + "logits/chosen": -1.6246399879455566, + "logits/rejected": -1.125629186630249, + "logps/chosen": -21.073156356811523, + "logps/rejected": -27.51262092590332, + "loss": 0.0851, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7466506361961365, + "rewards/margins": 4.355165004730225, + "rewards/rejected": -3.6085143089294434, + "step": 588 + }, + { + "epoch": 9.983050847457626, + "grad_norm": 6.881964686221444, + "learning_rate": 2.941401643315686e-07, + "logits/chosen": -0.25319910049438477, + "logits/rejected": 0.6310909986495972, + "logps/chosen": -13.684246063232422, + "logps/rejected": -31.185253143310547, + "loss": 0.0839, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46263331174850464, + "rewards/margins": 3.4962663650512695, + "rewards/rejected": -3.033633232116699, + "step": 589 + }, + { + "epoch": 10.0, + "grad_norm": 7.234238898566095, + "learning_rate": 2.934120444167326e-07, + "logits/chosen": -0.9664945602416992, + "logits/rejected": 0.9399136304855347, + "logps/chosen": -13.277244567871094, + "logps/rejected": -25.155290603637695, + "loss": 0.0974, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3529563844203949, + "rewards/margins": 3.6515965461730957, + "rewards/rejected": -3.298640251159668, + "step": 590 + }, + { + "epoch": 10.016949152542374, + "grad_norm": 5.605599826609898, + "learning_rate": 2.926835446094716e-07, + "logits/chosen": -0.22446855902671814, + "logits/rejected": 0.5719990730285645, + "logps/chosen": -18.55261993408203, + "logps/rejected": -34.263275146484375, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.035877808928489685, + "rewards/margins": 4.546453952789307, + "rewards/rejected": -4.582332134246826, + "step": 591 + }, + { + "epoch": 10.033898305084746, + "grad_norm": 7.0637140819187545, + "learning_rate": 2.919546712847804e-07, + "logits/chosen": 1.9319177865982056, + "logits/rejected": 4.413154125213623, + "logps/chosen": -16.405902862548828, + "logps/rejected": -38.4708251953125, + "loss": 0.0868, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28276926279067993, + "rewards/margins": 5.0292229652404785, + "rewards/rejected": -4.746453762054443, + "step": 592 + }, + { + "epoch": 10.05084745762712, + "grad_norm": 5.921559005856268, + "learning_rate": 2.9122543082092246e-07, + "logits/chosen": 0.27007830142974854, + "logits/rejected": 0.5482916831970215, + "logps/chosen": -19.874710083007812, + "logps/rejected": -34.47208786010742, + "loss": 0.0747, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15760372579097748, + "rewards/margins": 4.97886323928833, + "rewards/rejected": -4.82125997543335, + "step": 593 + }, + { + "epoch": 10.067796610169491, + "grad_norm": 5.603806953490683, + "learning_rate": 2.9049582959937393e-07, + "logits/chosen": -1.8163609504699707, + "logits/rejected": -0.8079499006271362, + "logps/chosen": -19.71383285522461, + "logps/rejected": -27.676366806030273, + "loss": 0.0728, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5124025344848633, + "rewards/margins": 3.4622397422790527, + "rewards/rejected": -2.9498372077941895, + "step": 594 + }, + { + "epoch": 10.084745762711865, + "grad_norm": 8.323328684692287, + "learning_rate": 2.89765874004768e-07, + "logits/chosen": 0.9996928572654724, + "logits/rejected": 2.200232982635498, + "logps/chosen": -14.065080642700195, + "logps/rejected": -31.71418571472168, + "loss": 0.0682, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40716665983200073, + "rewards/margins": 4.4655961990356445, + "rewards/rejected": -4.05842924118042, + "step": 595 + }, + { + "epoch": 10.101694915254237, + "grad_norm": 6.075575616422893, + "learning_rate": 2.890355704248388e-07, + "logits/chosen": -0.06882792711257935, + "logits/rejected": 0.32778534293174744, + "logps/chosen": -14.473672866821289, + "logps/rejected": -27.489944458007812, + "loss": 0.0843, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15015511214733124, + "rewards/margins": 4.016153335571289, + "rewards/rejected": -3.8659985065460205, + "step": 596 + }, + { + "epoch": 10.11864406779661, + "grad_norm": 6.4149085949318385, + "learning_rate": 2.8830492525036587e-07, + "logits/chosen": -1.3330459594726562, + "logits/rejected": -0.7347086668014526, + "logps/chosen": -16.720176696777344, + "logps/rejected": -30.86295509338379, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8011523485183716, + "rewards/margins": 4.588528633117676, + "rewards/rejected": -3.7873764038085938, + "step": 597 + }, + { + "epoch": 10.135593220338983, + "grad_norm": 6.036481300660005, + "learning_rate": 2.875739448751176e-07, + "logits/chosen": -0.32826659083366394, + "logits/rejected": 0.4339306354522705, + "logps/chosen": -14.774092674255371, + "logps/rejected": -31.687725067138672, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1728964000940323, + "rewards/margins": 4.463289260864258, + "rewards/rejected": -4.290392875671387, + "step": 598 + }, + { + "epoch": 10.152542372881356, + "grad_norm": 5.535179077856142, + "learning_rate": 2.8684263569579603e-07, + "logits/chosen": -0.16325974464416504, + "logits/rejected": 1.0072623491287231, + "logps/chosen": -15.923517227172852, + "logps/rejected": -30.345041275024414, + "loss": 0.0735, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2372168004512787, + "rewards/margins": 4.257624626159668, + "rewards/rejected": -4.020407676696777, + "step": 599 + }, + { + "epoch": 10.169491525423728, + "grad_norm": 5.163419851208816, + "learning_rate": 2.8611100411198035e-07, + "logits/chosen": 2.3917036056518555, + "logits/rejected": 3.164809226989746, + "logps/chosen": -13.81911849975586, + "logps/rejected": -27.369752883911133, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1035224199295044, + "rewards/margins": 3.278686761856079, + "rewards/rejected": -3.1751644611358643, + "step": 600 + }, + { + "epoch": 10.186440677966102, + "grad_norm": 5.282168937091676, + "learning_rate": 2.853790565260712e-07, + "logits/chosen": 2.342893123626709, + "logits/rejected": 3.2603580951690674, + "logps/chosen": -10.237663269042969, + "logps/rejected": -28.54045867919922, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11436425149440765, + "rewards/margins": 4.062482833862305, + "rewards/rejected": -4.176846981048584, + "step": 601 + }, + { + "epoch": 10.203389830508474, + "grad_norm": 6.559991217868432, + "learning_rate": 2.846467993432342e-07, + "logits/chosen": -0.0991126000881195, + "logits/rejected": 1.038847804069519, + "logps/chosen": -18.47357940673828, + "logps/rejected": -32.717811584472656, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10679806023836136, + "rewards/margins": 4.322455406188965, + "rewards/rejected": -4.429253101348877, + "step": 602 + }, + { + "epoch": 10.220338983050848, + "grad_norm": 6.029869722239407, + "learning_rate": 2.8391423897134454e-07, + "logits/chosen": 2.3927204608917236, + "logits/rejected": 3.382364273071289, + "logps/chosen": -22.96356964111328, + "logps/rejected": -40.342674255371094, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9394710063934326, + "rewards/margins": 5.052562713623047, + "rewards/rejected": -5.9920334815979, + "step": 603 + }, + { + "epoch": 10.23728813559322, + "grad_norm": 6.053046265634852, + "learning_rate": 2.8318138182093047e-07, + "logits/chosen": 3.377546787261963, + "logits/rejected": 3.5565762519836426, + "logps/chosen": -14.09969711303711, + "logps/rejected": -37.41474914550781, + "loss": 0.0856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06845837831497192, + "rewards/margins": 5.450356483459473, + "rewards/rejected": -5.518815040588379, + "step": 604 + }, + { + "epoch": 10.254237288135593, + "grad_norm": 6.757171886215197, + "learning_rate": 2.8244823430511725e-07, + "logits/chosen": -0.2594801187515259, + "logits/rejected": 0.3401494026184082, + "logps/chosen": -20.410661697387695, + "logps/rejected": -33.572357177734375, + "loss": 0.0739, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09414413571357727, + "rewards/margins": 4.429196357727051, + "rewards/rejected": -4.335052013397217, + "step": 605 + }, + { + "epoch": 10.271186440677965, + "grad_norm": 7.487518504831589, + "learning_rate": 2.8171480283957117e-07, + "logits/chosen": -1.103548288345337, + "logits/rejected": -0.3884028196334839, + "logps/chosen": -14.363391876220703, + "logps/rejected": -26.286643981933594, + "loss": 0.0859, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20181015133857727, + "rewards/margins": 3.964519500732422, + "rewards/rejected": -3.762709140777588, + "step": 606 + }, + { + "epoch": 10.288135593220339, + "grad_norm": 9.61749318701916, + "learning_rate": 2.8098109384244315e-07, + "logits/chosen": -0.7662684917449951, + "logits/rejected": 0.3284900188446045, + "logps/chosen": -18.53818702697754, + "logps/rejected": -30.907567977905273, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3579058051109314, + "rewards/margins": 4.812725067138672, + "rewards/rejected": -4.4548187255859375, + "step": 607 + }, + { + "epoch": 10.305084745762711, + "grad_norm": 13.352854572328381, + "learning_rate": 2.8024711373431297e-07, + "logits/chosen": 3.6991214752197266, + "logits/rejected": 5.248588562011719, + "logps/chosen": -19.04616928100586, + "logps/rejected": -34.87682342529297, + "loss": 0.0856, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.015332631766796112, + "rewards/margins": 4.818325519561768, + "rewards/rejected": -4.833658695220947, + "step": 608 + }, + { + "epoch": 10.322033898305085, + "grad_norm": 6.606910858411598, + "learning_rate": 2.795128689381327e-07, + "logits/chosen": -0.6707379817962646, + "logits/rejected": 0.10683369636535645, + "logps/chosen": -14.721006393432617, + "logps/rejected": -32.36350631713867, + "loss": 0.0832, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08566325902938843, + "rewards/margins": 4.577155113220215, + "rewards/rejected": -4.491491794586182, + "step": 609 + }, + { + "epoch": 10.338983050847457, + "grad_norm": 6.849176075211304, + "learning_rate": 2.787783658791707e-07, + "logits/chosen": 0.8173718452453613, + "logits/rejected": 1.94963800907135, + "logps/chosen": -17.403362274169922, + "logps/rejected": -34.591819763183594, + "loss": 0.0773, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31257036328315735, + "rewards/margins": 5.061819553375244, + "rewards/rejected": -4.749249458312988, + "step": 610 + }, + { + "epoch": 10.35593220338983, + "grad_norm": 5.6378028599700905, + "learning_rate": 2.7804361098495547e-07, + "logits/chosen": 1.038071632385254, + "logits/rejected": 2.492664337158203, + "logps/chosen": -21.735687255859375, + "logps/rejected": -39.11198425292969, + "loss": 0.0712, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03703010082244873, + "rewards/margins": 5.7275238037109375, + "rewards/rejected": -5.764554023742676, + "step": 611 + }, + { + "epoch": 10.372881355932204, + "grad_norm": 6.252348135448011, + "learning_rate": 2.7730861068521913e-07, + "logits/chosen": -0.26412123441696167, + "logits/rejected": -0.7102423906326294, + "logps/chosen": -15.097103118896484, + "logps/rejected": -25.90133285522461, + "loss": 0.0717, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3093462884426117, + "rewards/margins": 3.6670660972595215, + "rewards/rejected": -3.357720136642456, + "step": 612 + }, + { + "epoch": 10.389830508474576, + "grad_norm": 7.728236696916781, + "learning_rate": 2.7657337141184134e-07, + "logits/chosen": -1.9688349962234497, + "logits/rejected": -0.6171210408210754, + "logps/chosen": -16.57401466369629, + "logps/rejected": -30.085830688476562, + "loss": 0.0964, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.272685170173645, + "rewards/margins": 4.039053440093994, + "rewards/rejected": -3.7663679122924805, + "step": 613 + }, + { + "epoch": 10.40677966101695, + "grad_norm": 5.485716187816843, + "learning_rate": 2.75837899598793e-07, + "logits/chosen": -3.0717618465423584, + "logits/rejected": -2.5173027515411377, + "logps/chosen": -11.400280952453613, + "logps/rejected": -26.211803436279297, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6044121384620667, + "rewards/margins": 4.291702747344971, + "rewards/rejected": -3.6872904300689697, + "step": 614 + }, + { + "epoch": 10.423728813559322, + "grad_norm": 6.25330768538192, + "learning_rate": 2.7510220168207996e-07, + "logits/chosen": -0.6666077375411987, + "logits/rejected": 0.4080359935760498, + "logps/chosen": -15.857671737670898, + "logps/rejected": -31.8955078125, + "loss": 0.0748, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42522817850112915, + "rewards/margins": 4.800365447998047, + "rewards/rejected": -4.3751373291015625, + "step": 615 + }, + { + "epoch": 10.440677966101696, + "grad_norm": 6.5295459646116125, + "learning_rate": 2.743662840996866e-07, + "logits/chosen": -0.040004000067710876, + "logits/rejected": 1.337239146232605, + "logps/chosen": -25.981477737426758, + "logps/rejected": -37.080894470214844, + "loss": 0.0922, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7214669585227966, + "rewards/margins": 3.7918219566345215, + "rewards/rejected": -3.070355176925659, + "step": 616 + }, + { + "epoch": 10.457627118644067, + "grad_norm": 6.723707358215044, + "learning_rate": 2.736301532915196e-07, + "logits/chosen": 2.539464235305786, + "logits/rejected": 3.4116129875183105, + "logps/chosen": -13.20813274383545, + "logps/rejected": -23.4318904876709, + "loss": 0.0881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3384879231452942, + "rewards/margins": 3.296609401702881, + "rewards/rejected": -2.9581210613250732, + "step": 617 + }, + { + "epoch": 10.474576271186441, + "grad_norm": 5.972258436867128, + "learning_rate": 2.7289381569935167e-07, + "logits/chosen": 1.7145967483520508, + "logits/rejected": 2.2452683448791504, + "logps/chosen": -20.402782440185547, + "logps/rejected": -31.13981819152832, + "loss": 0.0698, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.17722374200820923, + "rewards/margins": 4.375895977020264, + "rewards/rejected": -4.553119659423828, + "step": 618 + }, + { + "epoch": 10.491525423728813, + "grad_norm": 5.552103251591103, + "learning_rate": 2.7215727776676476e-07, + "logits/chosen": -0.3053174316883087, + "logits/rejected": -0.7472686767578125, + "logps/chosen": -13.040075302124023, + "logps/rejected": -28.802827835083008, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5394836068153381, + "rewards/margins": 4.001735687255859, + "rewards/rejected": -3.462251901626587, + "step": 619 + }, + { + "epoch": 10.508474576271187, + "grad_norm": 6.23435993213956, + "learning_rate": 2.714205459390942e-07, + "logits/chosen": -2.3834853172302246, + "logits/rejected": -0.9893413186073303, + "logps/chosen": -20.2705020904541, + "logps/rejected": -35.93256759643555, + "loss": 0.0675, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2076442539691925, + "rewards/margins": 4.942375659942627, + "rewards/rejected": -4.734731197357178, + "step": 620 + }, + { + "epoch": 10.525423728813559, + "grad_norm": 5.497320238058098, + "learning_rate": 2.7068362666337213e-07, + "logits/chosen": 0.960399866104126, + "logits/rejected": 1.0540908575057983, + "logps/chosen": -16.861644744873047, + "logps/rejected": -31.652591705322266, + "loss": 0.0587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34831947088241577, + "rewards/margins": 3.5641627311706543, + "rewards/rejected": -3.912482261657715, + "step": 621 + }, + { + "epoch": 10.542372881355933, + "grad_norm": 6.039868051539346, + "learning_rate": 2.6994652638827075e-07, + "logits/chosen": -1.5167878866195679, + "logits/rejected": -0.33636558055877686, + "logps/chosen": -15.524983406066895, + "logps/rejected": -28.981571197509766, + "loss": 0.0657, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14473609626293182, + "rewards/margins": 4.660205841064453, + "rewards/rejected": -4.804942607879639, + "step": 622 + }, + { + "epoch": 10.559322033898304, + "grad_norm": 7.569134750815346, + "learning_rate": 2.6920925156404644e-07, + "logits/chosen": -0.6254299283027649, + "logits/rejected": 0.8782142996788025, + "logps/chosen": -22.75749969482422, + "logps/rejected": -31.623991012573242, + "loss": 0.0881, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.046930328011512756, + "rewards/margins": 3.9689981937408447, + "rewards/rejected": -3.922067642211914, + "step": 623 + }, + { + "epoch": 10.576271186440678, + "grad_norm": 6.555450045599343, + "learning_rate": 2.684718086424828e-07, + "logits/chosen": -1.0054882764816284, + "logits/rejected": -1.0527081489562988, + "logps/chosen": -13.448890686035156, + "logps/rejected": -29.792495727539062, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32683414220809937, + "rewards/margins": 4.527410507202148, + "rewards/rejected": -4.200576305389404, + "step": 624 + }, + { + "epoch": 10.59322033898305, + "grad_norm": 6.4497992529556, + "learning_rate": 2.677342040768346e-07, + "logits/chosen": -0.724130392074585, + "logits/rejected": -0.5873971581459045, + "logps/chosen": -13.653425216674805, + "logps/rejected": -21.41414451599121, + "loss": 0.0984, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3240708112716675, + "rewards/margins": 2.993685007095337, + "rewards/rejected": -2.669614315032959, + "step": 625 + }, + { + "epoch": 10.610169491525424, + "grad_norm": 6.034047003671632, + "learning_rate": 2.669964443217711e-07, + "logits/chosen": 0.8687635064125061, + "logits/rejected": 1.7205008268356323, + "logps/chosen": -13.059608459472656, + "logps/rejected": -24.26249122619629, + "loss": 0.0703, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6247247457504272, + "rewards/margins": 4.4865922927856445, + "rewards/rejected": -3.8618674278259277, + "step": 626 + }, + { + "epoch": 10.627118644067796, + "grad_norm": 5.783511488843652, + "learning_rate": 2.662585358333194e-07, + "logits/chosen": 1.799331784248352, + "logits/rejected": 2.220020294189453, + "logps/chosen": -14.510954856872559, + "logps/rejected": -28.626510620117188, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4560384154319763, + "rewards/margins": 4.656362533569336, + "rewards/rejected": -4.200323581695557, + "step": 627 + }, + { + "epoch": 10.64406779661017, + "grad_norm": 6.878208069873474, + "learning_rate": 2.655204850688085e-07, + "logits/chosen": -4.561980724334717, + "logits/rejected": -4.086709022521973, + "logps/chosen": -21.963542938232422, + "logps/rejected": -29.26177978515625, + "loss": 0.0985, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.047145962715148926, + "rewards/margins": 3.6454405784606934, + "rewards/rejected": -3.598294734954834, + "step": 628 + }, + { + "epoch": 10.661016949152543, + "grad_norm": 6.441910708272441, + "learning_rate": 2.6478229848681217e-07, + "logits/chosen": -0.45938020944595337, + "logits/rejected": 0.5661107897758484, + "logps/chosen": -24.593807220458984, + "logps/rejected": -43.87635040283203, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4468405842781067, + "rewards/margins": 4.661881446838379, + "rewards/rejected": -4.215041160583496, + "step": 629 + }, + { + "epoch": 10.677966101694915, + "grad_norm": 6.21379201456098, + "learning_rate": 2.6404398254709283e-07, + "logits/chosen": 0.44006413221359253, + "logits/rejected": 2.213808298110962, + "logps/chosen": -18.386600494384766, + "logps/rejected": -24.819305419921875, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34933701157569885, + "rewards/margins": 3.0405497550964355, + "rewards/rejected": -2.6912126541137695, + "step": 630 + }, + { + "epoch": 10.694915254237289, + "grad_norm": 6.025721748593164, + "learning_rate": 2.633055437105446e-07, + "logits/chosen": -2.20528507232666, + "logits/rejected": -0.7411336898803711, + "logps/chosen": -14.320496559143066, + "logps/rejected": -29.599105834960938, + "loss": 0.0787, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7818676233291626, + "rewards/margins": 4.8507161140441895, + "rewards/rejected": -4.068848609924316, + "step": 631 + }, + { + "epoch": 10.711864406779661, + "grad_norm": 5.45264581101388, + "learning_rate": 2.6256698843913765e-07, + "logits/chosen": -0.10968533158302307, + "logits/rejected": 0.07270720601081848, + "logps/chosen": -14.70391845703125, + "logps/rejected": -33.96483612060547, + "loss": 0.0603, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03261980414390564, + "rewards/margins": 5.4097900390625, + "rewards/rejected": -5.377170562744141, + "step": 632 + }, + { + "epoch": 10.728813559322035, + "grad_norm": 6.881559988949641, + "learning_rate": 2.6182832319586045e-07, + "logits/chosen": -1.6360795497894287, + "logits/rejected": -0.09993427991867065, + "logps/chosen": -25.044832229614258, + "logps/rejected": -31.365337371826172, + "loss": 0.0888, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6498722434043884, + "rewards/margins": 3.7184948921203613, + "rewards/rejected": -3.068622589111328, + "step": 633 + }, + { + "epoch": 10.745762711864407, + "grad_norm": 4.9186439647542315, + "learning_rate": 2.6108955444466407e-07, + "logits/chosen": -2.3466663360595703, + "logits/rejected": -1.737764835357666, + "logps/chosen": -14.39980697631836, + "logps/rejected": -28.963144302368164, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.045732468366622925, + "rewards/margins": 4.375434875488281, + "rewards/rejected": -4.329702377319336, + "step": 634 + }, + { + "epoch": 10.76271186440678, + "grad_norm": 5.7723666132358575, + "learning_rate": 2.6035068865040556e-07, + "logits/chosen": 1.6776143312454224, + "logits/rejected": 2.0671205520629883, + "logps/chosen": -16.550514221191406, + "logps/rejected": -36.80196762084961, + "loss": 0.0597, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3467442989349365, + "rewards/margins": 4.6361002922058105, + "rewards/rejected": -4.289356231689453, + "step": 635 + }, + { + "epoch": 10.779661016949152, + "grad_norm": 5.557597931065527, + "learning_rate": 2.596117322787907e-07, + "logits/chosen": -1.9914857149124146, + "logits/rejected": -1.6738017797470093, + "logps/chosen": -12.358434677124023, + "logps/rejected": -27.78692626953125, + "loss": 0.073, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.19304338097572327, + "rewards/margins": 3.466974973678589, + "rewards/rejected": -3.2739317417144775, + "step": 636 + }, + { + "epoch": 10.796610169491526, + "grad_norm": 6.364962907087287, + "learning_rate": 2.588726917963183e-07, + "logits/chosen": -0.4724288284778595, + "logits/rejected": 0.24811546504497528, + "logps/chosen": -20.698482513427734, + "logps/rejected": -30.643646240234375, + "loss": 0.0722, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10548533499240875, + "rewards/margins": 3.7175071239471436, + "rewards/rejected": -3.8229920864105225, + "step": 637 + }, + { + "epoch": 10.813559322033898, + "grad_norm": 5.984849335536948, + "learning_rate": 2.58133573670223e-07, + "logits/chosen": -1.071981430053711, + "logits/rejected": 0.18668809533119202, + "logps/chosen": -19.217470169067383, + "logps/rejected": -40.57974624633789, + "loss": 0.0714, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3740503191947937, + "rewards/margins": 5.143554210662842, + "rewards/rejected": -4.769504070281982, + "step": 638 + }, + { + "epoch": 10.830508474576272, + "grad_norm": 5.392339021607484, + "learning_rate": 2.5739438436841923e-07, + "logits/chosen": -0.7297754287719727, + "logits/rejected": -0.20466329157352448, + "logps/chosen": -12.028447151184082, + "logps/rejected": -24.535282135009766, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4793805778026581, + "rewards/margins": 4.4265217781066895, + "rewards/rejected": -3.947141170501709, + "step": 639 + }, + { + "epoch": 10.847457627118644, + "grad_norm": 5.987376643789218, + "learning_rate": 2.566551303594437e-07, + "logits/chosen": -2.031426429748535, + "logits/rejected": -1.2302422523498535, + "logps/chosen": -15.288844108581543, + "logps/rejected": -25.842559814453125, + "loss": 0.0801, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4474770426750183, + "rewards/margins": 4.266576290130615, + "rewards/rejected": -3.819099187850952, + "step": 640 + }, + { + "epoch": 10.864406779661017, + "grad_norm": 6.364295591288036, + "learning_rate": 2.559158181123998e-07, + "logits/chosen": -2.1851398944854736, + "logits/rejected": -0.4198753237724304, + "logps/chosen": -17.15755844116211, + "logps/rejected": -37.938419342041016, + "loss": 0.0788, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.057618916034698486, + "rewards/margins": 5.671139717102051, + "rewards/rejected": -5.613520622253418, + "step": 641 + }, + { + "epoch": 10.88135593220339, + "grad_norm": 6.4165584079093385, + "learning_rate": 2.5517645409690045e-07, + "logits/chosen": -1.9795904159545898, + "logits/rejected": 0.3546423316001892, + "logps/chosen": -14.565407752990723, + "logps/rejected": -32.81105422973633, + "loss": 0.0759, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.44077977538108826, + "rewards/margins": 5.220073223114014, + "rewards/rejected": -4.779293060302734, + "step": 642 + }, + { + "epoch": 10.898305084745763, + "grad_norm": 5.958138967478258, + "learning_rate": 2.544370447830115e-07, + "logits/chosen": -1.0830657482147217, + "logits/rejected": -0.3092998266220093, + "logps/chosen": -11.562661170959473, + "logps/rejected": -32.89630889892578, + "loss": 0.0846, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.37156158685684204, + "rewards/margins": 4.80405855178833, + "rewards/rejected": -4.432497024536133, + "step": 643 + }, + { + "epoch": 10.915254237288135, + "grad_norm": 6.0750333626885755, + "learning_rate": 2.5369759664119533e-07, + "logits/chosen": -1.6788640022277832, + "logits/rejected": -1.4880775213241577, + "logps/chosen": -10.794754028320312, + "logps/rejected": -29.423439025878906, + "loss": 0.0879, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6279995441436768, + "rewards/margins": 4.083406925201416, + "rewards/rejected": -3.4554076194763184, + "step": 644 + }, + { + "epoch": 10.932203389830509, + "grad_norm": 6.463093646854645, + "learning_rate": 2.52958116142254e-07, + "logits/chosen": 0.09284260869026184, + "logits/rejected": 1.2297515869140625, + "logps/chosen": -19.610464096069336, + "logps/rejected": -35.90717315673828, + "loss": 0.0848, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3308804929256439, + "rewards/margins": 5.242147445678711, + "rewards/rejected": -4.911267280578613, + "step": 645 + }, + { + "epoch": 10.94915254237288, + "grad_norm": 6.204542792920929, + "learning_rate": 2.522186097572727e-07, + "logits/chosen": -0.35911503434181213, + "logits/rejected": -0.4865139126777649, + "logps/chosen": -14.578887939453125, + "logps/rejected": -30.42376708984375, + "loss": 0.0755, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18817180395126343, + "rewards/margins": 3.996063232421875, + "rewards/rejected": -3.807891368865967, + "step": 646 + }, + { + "epoch": 10.966101694915254, + "grad_norm": 5.916857518131793, + "learning_rate": 2.514790839575634e-07, + "logits/chosen": -0.07091692090034485, + "logits/rejected": 1.1772609949111938, + "logps/chosen": -16.612977981567383, + "logps/rejected": -38.160552978515625, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07712310552597046, + "rewards/margins": 5.184111595153809, + "rewards/rejected": -5.106988430023193, + "step": 647 + }, + { + "epoch": 10.983050847457626, + "grad_norm": 6.407191345132673, + "learning_rate": 2.507395452146074e-07, + "logits/chosen": -0.2633797228336334, + "logits/rejected": 0.21091461181640625, + "logps/chosen": -17.18181037902832, + "logps/rejected": -28.922496795654297, + "loss": 0.074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47414809465408325, + "rewards/margins": 4.16361665725708, + "rewards/rejected": -3.6894688606262207, + "step": 648 + }, + { + "epoch": 11.0, + "grad_norm": 7.332363565320445, + "learning_rate": 2.5e-07, + "logits/chosen": 1.4961445331573486, + "logits/rejected": 2.3838436603546143, + "logps/chosen": -17.417078018188477, + "logps/rejected": -31.51581573486328, + "loss": 0.0915, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28374284505844116, + "rewards/margins": 5.035284996032715, + "rewards/rejected": -5.319027900695801, + "step": 649 + }, + { + "epoch": 11.016949152542374, + "grad_norm": 6.710449322941788, + "learning_rate": 2.4926045478539256e-07, + "logits/chosen": -0.41290348768234253, + "logits/rejected": -0.33926910161972046, + "logps/chosen": -13.81264591217041, + "logps/rejected": -32.02968215942383, + "loss": 0.0819, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03655707836151123, + "rewards/margins": 4.345344066619873, + "rewards/rejected": -4.381901741027832, + "step": 650 + }, + { + "epoch": 11.033898305084746, + "grad_norm": 5.564446129176738, + "learning_rate": 2.485209160424366e-07, + "logits/chosen": -0.4851105213165283, + "logits/rejected": 0.16047148406505585, + "logps/chosen": -16.25713348388672, + "logps/rejected": -25.432125091552734, + "loss": 0.0668, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.536626935005188, + "rewards/margins": 3.623508930206299, + "rewards/rejected": -3.0868821144104004, + "step": 651 + }, + { + "epoch": 11.05084745762712, + "grad_norm": 5.5774736986307, + "learning_rate": 2.477813902427272e-07, + "logits/chosen": -0.30681484937667847, + "logits/rejected": 0.5228248238563538, + "logps/chosen": -19.037277221679688, + "logps/rejected": -33.582115173339844, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4117254316806793, + "rewards/margins": 5.038257598876953, + "rewards/rejected": -4.626532077789307, + "step": 652 + }, + { + "epoch": 11.067796610169491, + "grad_norm": 5.723595464164498, + "learning_rate": 2.47041883857746e-07, + "logits/chosen": -2.9797439575195312, + "logits/rejected": -0.5674165487289429, + "logps/chosen": -18.158016204833984, + "logps/rejected": -38.53350830078125, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48181551694869995, + "rewards/margins": 6.316951274871826, + "rewards/rejected": -5.835135459899902, + "step": 653 + }, + { + "epoch": 11.084745762711865, + "grad_norm": 5.650928750845028, + "learning_rate": 2.463024033588046e-07, + "logits/chosen": -2.9346420764923096, + "logits/rejected": -1.6567249298095703, + "logps/chosen": -18.69190216064453, + "logps/rejected": -30.705974578857422, + "loss": 0.0602, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2089875191450119, + "rewards/margins": 4.7780303955078125, + "rewards/rejected": -4.569043159484863, + "step": 654 + }, + { + "epoch": 11.101694915254237, + "grad_norm": 5.834575281966656, + "learning_rate": 2.455629552169885e-07, + "logits/chosen": -1.604995846748352, + "logits/rejected": -1.15968656539917, + "logps/chosen": -14.991609573364258, + "logps/rejected": -36.816070556640625, + "loss": 0.0704, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32100000977516174, + "rewards/margins": 4.978287220001221, + "rewards/rejected": -4.657287120819092, + "step": 655 + }, + { + "epoch": 11.11864406779661, + "grad_norm": 4.389904402387597, + "learning_rate": 2.448235459030996e-07, + "logits/chosen": -1.4382152557373047, + "logits/rejected": -0.1963300108909607, + "logps/chosen": -14.805438995361328, + "logps/rejected": -25.915740966796875, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7601976990699768, + "rewards/margins": 3.805788516998291, + "rewards/rejected": -3.045591115951538, + "step": 656 + }, + { + "epoch": 11.135593220338983, + "grad_norm": 4.8932214139129435, + "learning_rate": 2.4408418188760024e-07, + "logits/chosen": 0.6055293083190918, + "logits/rejected": 1.1443455219268799, + "logps/chosen": -14.102373123168945, + "logps/rejected": -31.51885223388672, + "loss": 0.0644, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22653046250343323, + "rewards/margins": 4.710800647735596, + "rewards/rejected": -4.484270095825195, + "step": 657 + }, + { + "epoch": 11.152542372881356, + "grad_norm": 5.086376596339275, + "learning_rate": 2.433448696405563e-07, + "logits/chosen": 0.898298442363739, + "logits/rejected": 1.4417123794555664, + "logps/chosen": -14.588051795959473, + "logps/rejected": -32.29475784301758, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19099584221839905, + "rewards/margins": 3.8837523460388184, + "rewards/rejected": -4.074748516082764, + "step": 658 + }, + { + "epoch": 11.169491525423728, + "grad_norm": 5.917003120913046, + "learning_rate": 2.426056156315808e-07, + "logits/chosen": -0.6991736888885498, + "logits/rejected": 0.2075153887271881, + "logps/chosen": -15.327507972717285, + "logps/rejected": -32.986915588378906, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6373805999755859, + "rewards/margins": 4.569518089294434, + "rewards/rejected": -3.9321374893188477, + "step": 659 + }, + { + "epoch": 11.186440677966102, + "grad_norm": 5.884139034140242, + "learning_rate": 2.4186642632977697e-07, + "logits/chosen": -1.9707210063934326, + "logits/rejected": -2.106113910675049, + "logps/chosen": -18.28716278076172, + "logps/rejected": -47.34130096435547, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2313433587551117, + "rewards/margins": 5.2318034172058105, + "rewards/rejected": -5.000459671020508, + "step": 660 + }, + { + "epoch": 11.203389830508474, + "grad_norm": 5.643822454346463, + "learning_rate": 2.4112730820368174e-07, + "logits/chosen": -1.8582695722579956, + "logits/rejected": -0.8738647699356079, + "logps/chosen": -13.801697731018066, + "logps/rejected": -25.913692474365234, + "loss": 0.0664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005020655691623688, + "rewards/margins": 3.6902852058410645, + "rewards/rejected": -3.6953060626983643, + "step": 661 + }, + { + "epoch": 11.220338983050848, + "grad_norm": 4.680962274881366, + "learning_rate": 2.403882677212093e-07, + "logits/chosen": 0.6877023577690125, + "logits/rejected": 2.0853915214538574, + "logps/chosen": -18.771940231323242, + "logps/rejected": -28.819293975830078, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35152608156204224, + "rewards/margins": 4.176992893218994, + "rewards/rejected": -3.825467109680176, + "step": 662 + }, + { + "epoch": 11.23728813559322, + "grad_norm": 5.544585862782336, + "learning_rate": 2.3964931134959447e-07, + "logits/chosen": -1.1638025045394897, + "logits/rejected": -1.0309693813323975, + "logps/chosen": -18.670225143432617, + "logps/rejected": -28.90134048461914, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8741496801376343, + "rewards/margins": 4.214159965515137, + "rewards/rejected": -3.340010404586792, + "step": 663 + }, + { + "epoch": 11.254237288135593, + "grad_norm": 5.503541176838127, + "learning_rate": 2.3891044555533586e-07, + "logits/chosen": 0.034407466650009155, + "logits/rejected": 0.9282214641571045, + "logps/chosen": -15.212516784667969, + "logps/rejected": -28.66778564453125, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3786475956439972, + "rewards/margins": 4.232975959777832, + "rewards/rejected": -3.8543283939361572, + "step": 664 + }, + { + "epoch": 11.271186440677965, + "grad_norm": 5.416764804202228, + "learning_rate": 2.381716768041395e-07, + "logits/chosen": 1.908860683441162, + "logits/rejected": 2.344414234161377, + "logps/chosen": -21.801124572753906, + "logps/rejected": -48.7527961730957, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009497061371803284, + "rewards/margins": 5.7271881103515625, + "rewards/rejected": -5.717690467834473, + "step": 665 + }, + { + "epoch": 11.288135593220339, + "grad_norm": 6.367781532097555, + "learning_rate": 2.374330115608624e-07, + "logits/chosen": -0.11974642425775528, + "logits/rejected": 0.6266567707061768, + "logps/chosen": -19.88003921508789, + "logps/rejected": -35.740413665771484, + "loss": 0.078, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.530373215675354, + "rewards/margins": 5.431632041931152, + "rewards/rejected": -5.962004661560059, + "step": 666 + }, + { + "epoch": 11.305084745762711, + "grad_norm": 5.626477352589517, + "learning_rate": 2.3669445628945538e-07, + "logits/chosen": 3.3318705558776855, + "logits/rejected": 4.2224955558776855, + "logps/chosen": -19.341773986816406, + "logps/rejected": -34.1142463684082, + "loss": 0.0662, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06154298782348633, + "rewards/margins": 4.670680046081543, + "rewards/rejected": -4.732223033905029, + "step": 667 + }, + { + "epoch": 11.322033898305085, + "grad_norm": 5.994475408383821, + "learning_rate": 2.3595601745290725e-07, + "logits/chosen": -0.7659940719604492, + "logits/rejected": -0.017023414373397827, + "logps/chosen": -13.725777626037598, + "logps/rejected": -26.52062225341797, + "loss": 0.0733, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.18746277689933777, + "rewards/margins": 3.57584810256958, + "rewards/rejected": -3.38838529586792, + "step": 668 + }, + { + "epoch": 11.338983050847457, + "grad_norm": 6.014148031758321, + "learning_rate": 2.3521770151318784e-07, + "logits/chosen": -1.5654340982437134, + "logits/rejected": -0.7088303565979004, + "logps/chosen": -19.64613914489746, + "logps/rejected": -35.96198272705078, + "loss": 0.0746, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24194762110710144, + "rewards/margins": 5.484867572784424, + "rewards/rejected": -5.242919921875, + "step": 669 + }, + { + "epoch": 11.35593220338983, + "grad_norm": 5.904026735747155, + "learning_rate": 2.344795149311915e-07, + "logits/chosen": -1.6562185287475586, + "logits/rejected": -0.89968341588974, + "logps/chosen": -16.9832820892334, + "logps/rejected": -32.758567810058594, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13500508666038513, + "rewards/margins": 4.955135822296143, + "rewards/rejected": -4.820130825042725, + "step": 670 + }, + { + "epoch": 11.372881355932204, + "grad_norm": 4.493982386752136, + "learning_rate": 2.3374146416668062e-07, + "logits/chosen": -1.7589337825775146, + "logits/rejected": -1.0952833890914917, + "logps/chosen": -18.65007209777832, + "logps/rejected": -34.211727142333984, + "loss": 0.0534, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2679644823074341, + "rewards/margins": 5.072956562042236, + "rewards/rejected": -4.804991722106934, + "step": 671 + }, + { + "epoch": 11.389830508474576, + "grad_norm": 5.9376340717952, + "learning_rate": 2.3300355567822893e-07, + "logits/chosen": -1.799917221069336, + "logits/rejected": -1.431991457939148, + "logps/chosen": -12.619460105895996, + "logps/rejected": -33.20170974731445, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10192511230707169, + "rewards/margins": 5.032021522521973, + "rewards/rejected": -4.930096626281738, + "step": 672 + }, + { + "epoch": 11.40677966101695, + "grad_norm": 5.493300214575323, + "learning_rate": 2.3226579592316537e-07, + "logits/chosen": -4.258989334106445, + "logits/rejected": -3.360172748565674, + "logps/chosen": -16.969221115112305, + "logps/rejected": -30.5546932220459, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7313569188117981, + "rewards/margins": 4.744319915771484, + "rewards/rejected": -4.01296329498291, + "step": 673 + }, + { + "epoch": 11.423728813559322, + "grad_norm": 5.344025354735871, + "learning_rate": 2.315281913575172e-07, + "logits/chosen": -0.1827794909477234, + "logits/rejected": 0.7984082698822021, + "logps/chosen": -16.188215255737305, + "logps/rejected": -26.802419662475586, + "loss": 0.0669, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4958565831184387, + "rewards/margins": 4.04383659362793, + "rewards/rejected": -3.5479798316955566, + "step": 674 + }, + { + "epoch": 11.440677966101696, + "grad_norm": 5.640670191172759, + "learning_rate": 2.3079074843595354e-07, + "logits/chosen": -0.5284520983695984, + "logits/rejected": 1.0243207216262817, + "logps/chosen": -15.523112297058105, + "logps/rejected": -33.79278564453125, + "loss": 0.0789, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.5843504071235657, + "rewards/margins": 4.946844577789307, + "rewards/rejected": -4.362493515014648, + "step": 675 + }, + { + "epoch": 11.457627118644067, + "grad_norm": 6.132077188997056, + "learning_rate": 2.300534736117292e-07, + "logits/chosen": -4.605233192443848, + "logits/rejected": -3.059077739715576, + "logps/chosen": -18.289066314697266, + "logps/rejected": -29.978668212890625, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5631596446037292, + "rewards/margins": 4.844709873199463, + "rewards/rejected": -4.281550407409668, + "step": 676 + }, + { + "epoch": 11.474576271186441, + "grad_norm": 6.728620411744036, + "learning_rate": 2.2931637333662785e-07, + "logits/chosen": -2.1704790592193604, + "logits/rejected": -1.2523642778396606, + "logps/chosen": -17.027385711669922, + "logps/rejected": -23.327802658081055, + "loss": 0.0772, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.022564664483070374, + "rewards/margins": 3.2541422843933105, + "rewards/rejected": -3.276707172393799, + "step": 677 + }, + { + "epoch": 11.491525423728813, + "grad_norm": 4.822714725679061, + "learning_rate": 2.2857945406090578e-07, + "logits/chosen": 1.1990220546722412, + "logits/rejected": 1.711303472518921, + "logps/chosen": -10.6589994430542, + "logps/rejected": -30.195152282714844, + "loss": 0.056, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8227249383926392, + "rewards/margins": 4.05718994140625, + "rewards/rejected": -3.2344648838043213, + "step": 678 + }, + { + "epoch": 11.508474576271187, + "grad_norm": 5.502535282470463, + "learning_rate": 2.2784272223323527e-07, + "logits/chosen": 0.7849825620651245, + "logits/rejected": 0.8696894645690918, + "logps/chosen": -12.942652702331543, + "logps/rejected": -22.10407066345215, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7114682197570801, + "rewards/margins": 3.7902207374572754, + "rewards/rejected": -3.078752279281616, + "step": 679 + }, + { + "epoch": 11.525423728813559, + "grad_norm": 5.489019094233486, + "learning_rate": 2.271061843006484e-07, + "logits/chosen": 1.2034146785736084, + "logits/rejected": 2.1347923278808594, + "logps/chosen": -15.247549057006836, + "logps/rejected": -34.84923553466797, + "loss": 0.0726, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38953667879104614, + "rewards/margins": 4.773509979248047, + "rewards/rejected": -4.383973598480225, + "step": 680 + }, + { + "epoch": 11.542372881355933, + "grad_norm": 5.059363361327104, + "learning_rate": 2.263698467084804e-07, + "logits/chosen": -2.265956163406372, + "logits/rejected": -2.1130669116973877, + "logps/chosen": -13.221713066101074, + "logps/rejected": -44.21703338623047, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46408432722091675, + "rewards/margins": 6.4323248863220215, + "rewards/rejected": -5.968240261077881, + "step": 681 + }, + { + "epoch": 11.559322033898304, + "grad_norm": 5.682481282090831, + "learning_rate": 2.2563371590031338e-07, + "logits/chosen": -0.5086088180541992, + "logits/rejected": -0.18234482407569885, + "logps/chosen": -13.774551391601562, + "logps/rejected": -29.623085021972656, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3060076832771301, + "rewards/margins": 4.601235866546631, + "rewards/rejected": -4.295228004455566, + "step": 682 + }, + { + "epoch": 11.576271186440678, + "grad_norm": 5.448076076241578, + "learning_rate": 2.2489779831792004e-07, + "logits/chosen": -1.4444375038146973, + "logits/rejected": -1.0325533151626587, + "logps/chosen": -17.940210342407227, + "logps/rejected": -25.362424850463867, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.061567530035972595, + "rewards/margins": 3.1408650875091553, + "rewards/rejected": -3.0792975425720215, + "step": 683 + }, + { + "epoch": 11.59322033898305, + "grad_norm": 5.012773976084183, + "learning_rate": 2.2416210040120701e-07, + "logits/chosen": -2.4592440128326416, + "logits/rejected": -0.24068701267242432, + "logps/chosen": -17.526630401611328, + "logps/rejected": -28.232179641723633, + "loss": 0.0681, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.41350769996643066, + "rewards/margins": 4.1663594245910645, + "rewards/rejected": -3.752851724624634, + "step": 684 + }, + { + "epoch": 11.610169491525424, + "grad_norm": 5.374681219785216, + "learning_rate": 2.2342662858815867e-07, + "logits/chosen": -1.426383137702942, + "logits/rejected": -0.5009871125221252, + "logps/chosen": -19.61524200439453, + "logps/rejected": -36.145263671875, + "loss": 0.0634, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7944812774658203, + "rewards/margins": 5.898135662078857, + "rewards/rejected": -5.103654861450195, + "step": 685 + }, + { + "epoch": 11.627118644067796, + "grad_norm": 6.447716303096716, + "learning_rate": 2.2269138931478082e-07, + "logits/chosen": -1.0536390542984009, + "logits/rejected": -0.29386839270591736, + "logps/chosen": -17.011661529541016, + "logps/rejected": -34.54016876220703, + "loss": 0.0801, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05360276997089386, + "rewards/margins": 4.743268013000488, + "rewards/rejected": -4.796871185302734, + "step": 686 + }, + { + "epoch": 11.64406779661017, + "grad_norm": 5.838531737146053, + "learning_rate": 2.2195638901504448e-07, + "logits/chosen": -3.675412654876709, + "logits/rejected": -1.9526886940002441, + "logps/chosen": -13.154958724975586, + "logps/rejected": -25.237438201904297, + "loss": 0.0613, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6144644618034363, + "rewards/margins": 4.4610209465026855, + "rewards/rejected": -3.8465561866760254, + "step": 687 + }, + { + "epoch": 11.661016949152543, + "grad_norm": 5.5640251268706615, + "learning_rate": 2.2122163412082927e-07, + "logits/chosen": -1.8933367729187012, + "logits/rejected": -1.5240275859832764, + "logps/chosen": -17.554466247558594, + "logps/rejected": -29.81175422668457, + "loss": 0.0639, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.03798585385084152, + "rewards/margins": 3.6812267303466797, + "rewards/rejected": -3.719212532043457, + "step": 688 + }, + { + "epoch": 11.677966101694915, + "grad_norm": 5.298577590978091, + "learning_rate": 2.2048713106186737e-07, + "logits/chosen": 3.980241060256958, + "logits/rejected": 4.915340900421143, + "logps/chosen": -10.66274642944336, + "logps/rejected": -35.31324768066406, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.28576070070266724, + "rewards/margins": 5.346850395202637, + "rewards/rejected": -5.061089515686035, + "step": 689 + }, + { + "epoch": 11.694915254237289, + "grad_norm": 5.33692092549021, + "learning_rate": 2.197528862656871e-07, + "logits/chosen": 0.6891968846321106, + "logits/rejected": 0.9424477815628052, + "logps/chosen": -15.107596397399902, + "logps/rejected": -28.289329528808594, + "loss": 0.0695, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3898676633834839, + "rewards/margins": 4.073996067047119, + "rewards/rejected": -3.6841282844543457, + "step": 690 + }, + { + "epoch": 11.711864406779661, + "grad_norm": 6.204488409189156, + "learning_rate": 2.190189061575569e-07, + "logits/chosen": -2.2730422019958496, + "logits/rejected": -2.2472658157348633, + "logps/chosen": -13.751178741455078, + "logps/rejected": -32.95414352416992, + "loss": 0.0688, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2908335030078888, + "rewards/margins": 4.785499095916748, + "rewards/rejected": -5.0763325691223145, + "step": 691 + }, + { + "epoch": 11.728813559322035, + "grad_norm": 5.3301717255826375, + "learning_rate": 2.1828519716042886e-07, + "logits/chosen": -0.497864305973053, + "logits/rejected": 0.37698572874069214, + "logps/chosen": -16.339096069335938, + "logps/rejected": -34.11829376220703, + "loss": 0.0625, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3000969886779785, + "rewards/margins": 4.58716344833374, + "rewards/rejected": -4.287066459655762, + "step": 692 + }, + { + "epoch": 11.745762711864407, + "grad_norm": 5.19057795345044, + "learning_rate": 2.1755176569488273e-07, + "logits/chosen": 1.1150970458984375, + "logits/rejected": 2.441777467727661, + "logps/chosen": -16.924339294433594, + "logps/rejected": -30.032011032104492, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12007679045200348, + "rewards/margins": 5.060428142547607, + "rewards/rejected": -5.180505275726318, + "step": 693 + }, + { + "epoch": 11.76271186440678, + "grad_norm": 6.742926871062795, + "learning_rate": 2.168186181790695e-07, + "logits/chosen": 0.5850726366043091, + "logits/rejected": 1.4656791687011719, + "logps/chosen": -17.600032806396484, + "logps/rejected": -37.253028869628906, + "loss": 0.0758, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2762156128883362, + "rewards/margins": 5.659514904022217, + "rewards/rejected": -5.383298873901367, + "step": 694 + }, + { + "epoch": 11.779661016949152, + "grad_norm": 6.11340248272783, + "learning_rate": 2.1608576102865547e-07, + "logits/chosen": -1.120666742324829, + "logits/rejected": 0.017741888761520386, + "logps/chosen": -15.677812576293945, + "logps/rejected": -25.590530395507812, + "loss": 0.0806, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49915313720703125, + "rewards/margins": 3.54221248626709, + "rewards/rejected": -3.0430593490600586, + "step": 695 + }, + { + "epoch": 11.796610169491526, + "grad_norm": 4.952631946369327, + "learning_rate": 2.1535320065676578e-07, + "logits/chosen": -0.8462392687797546, + "logits/rejected": 0.24551531672477722, + "logps/chosen": -14.667747497558594, + "logps/rejected": -32.78245544433594, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34924012422561646, + "rewards/margins": 5.127192974090576, + "rewards/rejected": -4.777953147888184, + "step": 696 + }, + { + "epoch": 11.813559322033898, + "grad_norm": 5.705449211384567, + "learning_rate": 2.1462094347392884e-07, + "logits/chosen": -0.5555151700973511, + "logits/rejected": 0.14884862303733826, + "logps/chosen": -17.458023071289062, + "logps/rejected": -35.274070739746094, + "loss": 0.0719, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2345159649848938, + "rewards/margins": 5.021158218383789, + "rewards/rejected": -4.786642551422119, + "step": 697 + }, + { + "epoch": 11.830508474576272, + "grad_norm": 5.606218673543748, + "learning_rate": 2.1388899588801963e-07, + "logits/chosen": -1.732840657234192, + "logits/rejected": -1.3636012077331543, + "logps/chosen": -16.198732376098633, + "logps/rejected": -29.274105072021484, + "loss": 0.0581, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05347772687673569, + "rewards/margins": 4.097093105316162, + "rewards/rejected": -4.150570392608643, + "step": 698 + }, + { + "epoch": 11.847457627118644, + "grad_norm": 5.689400746949895, + "learning_rate": 2.131573643042039e-07, + "logits/chosen": -1.4225223064422607, + "logits/rejected": -1.7895958423614502, + "logps/chosen": -15.910755157470703, + "logps/rejected": -34.615966796875, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4202052652835846, + "rewards/margins": 5.09837007522583, + "rewards/rejected": -4.678165435791016, + "step": 699 + }, + { + "epoch": 11.864406779661017, + "grad_norm": 5.323397390244622, + "learning_rate": 2.1242605512488245e-07, + "logits/chosen": -1.2466119527816772, + "logits/rejected": -0.6074275970458984, + "logps/chosen": -15.015626907348633, + "logps/rejected": -28.64008903503418, + "loss": 0.0685, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0258762836456299, + "rewards/margins": 4.653193950653076, + "rewards/rejected": -3.627316951751709, + "step": 700 + }, + { + "epoch": 11.88135593220339, + "grad_norm": 5.664653229298019, + "learning_rate": 2.116950747496342e-07, + "logits/chosen": -2.161691188812256, + "logits/rejected": -1.698690414428711, + "logps/chosen": -18.23191261291504, + "logps/rejected": -33.978004455566406, + "loss": 0.0784, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4678615927696228, + "rewards/margins": 4.916198253631592, + "rewards/rejected": -4.448336601257324, + "step": 701 + }, + { + "epoch": 11.898305084745763, + "grad_norm": 5.550000502617084, + "learning_rate": 2.1096442957516116e-07, + "logits/chosen": -0.8441931009292603, + "logits/rejected": -0.6889966726303101, + "logps/chosen": -15.197530746459961, + "logps/rejected": -31.47784423828125, + "loss": 0.0777, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.033519446849823, + "rewards/margins": 4.367273807525635, + "rewards/rejected": -4.400793552398682, + "step": 702 + }, + { + "epoch": 11.915254237288135, + "grad_norm": 6.49263254472757, + "learning_rate": 2.10234125995232e-07, + "logits/chosen": -0.9990702867507935, + "logits/rejected": -0.8798142671585083, + "logps/chosen": -10.551839828491211, + "logps/rejected": -26.121063232421875, + "loss": 0.0993, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38751670718193054, + "rewards/margins": 4.126040458679199, + "rewards/rejected": -3.7385239601135254, + "step": 703 + }, + { + "epoch": 11.932203389830509, + "grad_norm": 5.247819487380474, + "learning_rate": 2.0950417040062607e-07, + "logits/chosen": 2.57755446434021, + "logits/rejected": 2.9328744411468506, + "logps/chosen": -12.72929573059082, + "logps/rejected": -27.83072280883789, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030468828976154327, + "rewards/margins": 4.585507392883301, + "rewards/rejected": -4.5550384521484375, + "step": 704 + }, + { + "epoch": 11.94915254237288, + "grad_norm": 5.151602852165204, + "learning_rate": 2.0877456917907757e-07, + "logits/chosen": -0.7559548616409302, + "logits/rejected": -1.2953628301620483, + "logps/chosen": -20.540603637695312, + "logps/rejected": -31.206186294555664, + "loss": 0.0546, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11103224754333496, + "rewards/margins": 4.5761895179748535, + "rewards/rejected": -4.465157985687256, + "step": 705 + }, + { + "epoch": 11.966101694915254, + "grad_norm": 5.688687348755467, + "learning_rate": 2.0804532871521957e-07, + "logits/chosen": -2.665072441101074, + "logits/rejected": -1.002528429031372, + "logps/chosen": -13.152605056762695, + "logps/rejected": -29.87421226501465, + "loss": 0.0696, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5528115630149841, + "rewards/margins": 4.340553283691406, + "rewards/rejected": -3.7877418994903564, + "step": 706 + }, + { + "epoch": 11.983050847457626, + "grad_norm": 6.496033619273407, + "learning_rate": 2.0731645539052842e-07, + "logits/chosen": -2.125807523727417, + "logits/rejected": -1.361229658126831, + "logps/chosen": -11.900805473327637, + "logps/rejected": -33.30543899536133, + "loss": 0.0694, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30786752700805664, + "rewards/margins": 5.710712909698486, + "rewards/rejected": -5.40284538269043, + "step": 707 + }, + { + "epoch": 12.0, + "grad_norm": 6.060562935251418, + "learning_rate": 2.065879555832674e-07, + "logits/chosen": -4.1413726806640625, + "logits/rejected": -3.282559394836426, + "logps/chosen": -16.134963989257812, + "logps/rejected": -33.68052673339844, + "loss": 0.0642, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.011490941047668457, + "rewards/margins": 5.368725776672363, + "rewards/rejected": -5.357234954833984, + "step": 708 + }, + { + "epoch": 12.016949152542374, + "grad_norm": 4.532552644301971, + "learning_rate": 2.0585983566843142e-07, + "logits/chosen": -5.349707126617432, + "logits/rejected": -4.256799697875977, + "logps/chosen": -15.861433982849121, + "logps/rejected": -31.989398956298828, + "loss": 0.0484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5362915992736816, + "rewards/margins": 5.06649923324585, + "rewards/rejected": -4.530208110809326, + "step": 709 + }, + { + "epoch": 12.033898305084746, + "grad_norm": 5.532947699353363, + "learning_rate": 2.0513210201769083e-07, + "logits/chosen": -1.8150622844696045, + "logits/rejected": -0.2966909408569336, + "logps/chosen": -14.874473571777344, + "logps/rejected": -24.714536666870117, + "loss": 0.0701, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8530625700950623, + "rewards/margins": 4.482250213623047, + "rewards/rejected": -3.62918758392334, + "step": 710 + }, + { + "epoch": 12.05084745762712, + "grad_norm": 5.5718381236496475, + "learning_rate": 2.0440476099933602e-07, + "logits/chosen": -4.311933994293213, + "logits/rejected": -2.9934637546539307, + "logps/chosen": -18.516633987426758, + "logps/rejected": -29.81332015991211, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17470814287662506, + "rewards/margins": 4.888881683349609, + "rewards/rejected": -4.714173316955566, + "step": 711 + }, + { + "epoch": 12.067796610169491, + "grad_norm": 6.360617007701148, + "learning_rate": 2.0367781897822144e-07, + "logits/chosen": 0.7523773908615112, + "logits/rejected": 1.1841909885406494, + "logps/chosen": -17.700777053833008, + "logps/rejected": -29.529111862182617, + "loss": 0.0862, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30104872584342957, + "rewards/margins": 5.281501770019531, + "rewards/rejected": -5.582550525665283, + "step": 712 + }, + { + "epoch": 12.084745762711865, + "grad_norm": 4.70334660438877, + "learning_rate": 2.0295128231570984e-07, + "logits/chosen": 0.837024986743927, + "logits/rejected": 0.9363638162612915, + "logps/chosen": -13.225208282470703, + "logps/rejected": -35.414329528808594, + "loss": 0.0491, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.149062380194664, + "rewards/margins": 5.915937900543213, + "rewards/rejected": -5.766875743865967, + "step": 713 + }, + { + "epoch": 12.101694915254237, + "grad_norm": 5.4006474930748105, + "learning_rate": 2.0222515736961692e-07, + "logits/chosen": -1.7546391487121582, + "logits/rejected": 0.6276485919952393, + "logps/chosen": -19.25953483581543, + "logps/rejected": -44.87244415283203, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08649277687072754, + "rewards/margins": 7.5871148109436035, + "rewards/rejected": -7.67360782623291, + "step": 714 + }, + { + "epoch": 12.11864406779661, + "grad_norm": 5.368433493971488, + "learning_rate": 2.0149945049415546e-07, + "logits/chosen": -1.8162565231323242, + "logits/rejected": -0.9864399433135986, + "logps/chosen": -12.635334014892578, + "logps/rejected": -26.762569427490234, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3255395293235779, + "rewards/margins": 4.295178413391113, + "rewards/rejected": -3.9696390628814697, + "step": 715 + }, + { + "epoch": 12.135593220338983, + "grad_norm": 5.192183933114531, + "learning_rate": 2.0077416803987963e-07, + "logits/chosen": -0.8176920413970947, + "logits/rejected": -0.31932979822158813, + "logps/chosen": -18.625732421875, + "logps/rejected": -31.451997756958008, + "loss": 0.0567, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17950454354286194, + "rewards/margins": 5.262685298919678, + "rewards/rejected": -5.083180904388428, + "step": 716 + }, + { + "epoch": 12.152542372881356, + "grad_norm": 5.093631383500938, + "learning_rate": 2.0004931635362982e-07, + "logits/chosen": -1.2001303434371948, + "logits/rejected": -0.3195866346359253, + "logps/chosen": -13.616204261779785, + "logps/rejected": -24.842506408691406, + "loss": 0.0522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5055986642837524, + "rewards/margins": 4.0246782302856445, + "rewards/rejected": -3.5190796852111816, + "step": 717 + }, + { + "epoch": 12.169491525423728, + "grad_norm": 5.49085446169452, + "learning_rate": 1.993249017784766e-07, + "logits/chosen": 0.5020394325256348, + "logits/rejected": 0.33351314067840576, + "logps/chosen": -14.211220741271973, + "logps/rejected": -35.0599365234375, + "loss": 0.0755, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.25228646397590637, + "rewards/margins": 5.2668375968933105, + "rewards/rejected": -5.014551639556885, + "step": 718 + }, + { + "epoch": 12.186440677966102, + "grad_norm": 5.629520362419779, + "learning_rate": 1.9860093065366557e-07, + "logits/chosen": -2.5751099586486816, + "logits/rejected": -2.715381145477295, + "logps/chosen": -15.755634307861328, + "logps/rejected": -25.378055572509766, + "loss": 0.0731, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10519762337207794, + "rewards/margins": 3.9933674335479736, + "rewards/rejected": -3.888169765472412, + "step": 719 + }, + { + "epoch": 12.203389830508474, + "grad_norm": 5.952068955136311, + "learning_rate": 1.9787740931456164e-07, + "logits/chosen": -0.4816681444644928, + "logits/rejected": 1.571818232536316, + "logps/chosen": -11.133060455322266, + "logps/rejected": -36.0597038269043, + "loss": 0.0708, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23930492997169495, + "rewards/margins": 5.7355451583862305, + "rewards/rejected": -5.496240615844727, + "step": 720 + }, + { + "epoch": 12.220338983050848, + "grad_norm": 4.290693041598025, + "learning_rate": 1.971543440925939e-07, + "logits/chosen": -1.3183561563491821, + "logits/rejected": 0.07553315162658691, + "logps/chosen": -13.733359336853027, + "logps/rejected": -31.047632217407227, + "loss": 0.0355, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36901021003723145, + "rewards/margins": 5.332430839538574, + "rewards/rejected": -4.963420867919922, + "step": 721 + }, + { + "epoch": 12.23728813559322, + "grad_norm": 5.184170424363948, + "learning_rate": 1.9643174131519984e-07, + "logits/chosen": 3.3170368671417236, + "logits/rejected": 3.977391004562378, + "logps/chosen": -10.85775375366211, + "logps/rejected": -29.93194580078125, + "loss": 0.0583, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.11747324466705322, + "rewards/margins": 4.625977993011475, + "rewards/rejected": -4.508504867553711, + "step": 722 + }, + { + "epoch": 12.254237288135593, + "grad_norm": 4.889541509223092, + "learning_rate": 1.9570960730577032e-07, + "logits/chosen": -0.7656147480010986, + "logits/rejected": -0.24224507808685303, + "logps/chosen": -19.524702072143555, + "logps/rejected": -30.796566009521484, + "loss": 0.0574, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3538164794445038, + "rewards/margins": 5.012910842895508, + "rewards/rejected": -4.659094333648682, + "step": 723 + }, + { + "epoch": 12.271186440677965, + "grad_norm": 5.10453945431393, + "learning_rate": 1.949879483835939e-07, + "logits/chosen": -2.638871431350708, + "logits/rejected": -1.0773653984069824, + "logps/chosen": -12.800580978393555, + "logps/rejected": -27.307966232299805, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5115848779678345, + "rewards/margins": 4.255484104156494, + "rewards/rejected": -3.74389910697937, + "step": 724 + }, + { + "epoch": 12.288135593220339, + "grad_norm": 5.719698536850818, + "learning_rate": 1.9426677086380183e-07, + "logits/chosen": -0.1832807958126068, + "logits/rejected": 0.017368942499160767, + "logps/chosen": -14.976875305175781, + "logps/rejected": -29.548471450805664, + "loss": 0.0829, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.375164270401001, + "rewards/margins": 4.5754265785217285, + "rewards/rejected": -4.200262546539307, + "step": 725 + }, + { + "epoch": 12.305084745762711, + "grad_norm": 5.774954444673895, + "learning_rate": 1.9354608105731267e-07, + "logits/chosen": -0.24937498569488525, + "logits/rejected": 0.15181505680084229, + "logps/chosen": -16.608240127563477, + "logps/rejected": -40.357643127441406, + "loss": 0.0611, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4569090008735657, + "rewards/margins": 6.628655433654785, + "rewards/rejected": -7.085563659667969, + "step": 726 + }, + { + "epoch": 12.322033898305085, + "grad_norm": 4.581448773761563, + "learning_rate": 1.9282588527077713e-07, + "logits/chosen": 0.02391517162322998, + "logits/rejected": 0.6816602349281311, + "logps/chosen": -16.32445526123047, + "logps/rejected": -29.69344711303711, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46570780873298645, + "rewards/margins": 4.278436183929443, + "rewards/rejected": -3.8127284049987793, + "step": 727 + }, + { + "epoch": 12.338983050847457, + "grad_norm": 5.603070097043823, + "learning_rate": 1.9210618980652273e-07, + "logits/chosen": 1.931321620941162, + "logits/rejected": 2.3644118309020996, + "logps/chosen": -13.214820861816406, + "logps/rejected": -30.852802276611328, + "loss": 0.0666, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0568082332611084, + "rewards/margins": 5.3618268966674805, + "rewards/rejected": -5.41863489151001, + "step": 728 + }, + { + "epoch": 12.35593220338983, + "grad_norm": 5.080980560752212, + "learning_rate": 1.9138700096249883e-07, + "logits/chosen": -3.121365785598755, + "logits/rejected": -1.288278341293335, + "logps/chosen": -21.657861709594727, + "logps/rejected": -37.93989181518555, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0911291092634201, + "rewards/margins": 5.4343109130859375, + "rewards/rejected": -5.525440692901611, + "step": 729 + }, + { + "epoch": 12.372881355932204, + "grad_norm": 5.2142904258182385, + "learning_rate": 1.9066832503222128e-07, + "logits/chosen": -1.7135505676269531, + "logits/rejected": 0.22507372498512268, + "logps/chosen": -20.781967163085938, + "logps/rejected": -38.08007049560547, + "loss": 0.0596, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6541242003440857, + "rewards/margins": 5.690436840057373, + "rewards/rejected": -6.344560623168945, + "step": 730 + }, + { + "epoch": 12.389830508474576, + "grad_norm": 4.191891312611567, + "learning_rate": 1.899501683047177e-07, + "logits/chosen": -2.5515406131744385, + "logits/rejected": -2.6740291118621826, + "logps/chosen": -17.506187438964844, + "logps/rejected": -37.260231018066406, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08348126709461212, + "rewards/margins": 5.0080647468566895, + "rewards/rejected": -5.091546058654785, + "step": 731 + }, + { + "epoch": 12.40677966101695, + "grad_norm": 5.779154114222945, + "learning_rate": 1.892325370644721e-07, + "logits/chosen": -0.9004234671592712, + "logits/rejected": -0.19273042678833008, + "logps/chosen": -18.21908187866211, + "logps/rejected": -27.488815307617188, + "loss": 0.0786, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6875766515731812, + "rewards/margins": 4.210147380828857, + "rewards/rejected": -3.5225706100463867, + "step": 732 + }, + { + "epoch": 12.423728813559322, + "grad_norm": 4.8198265842340575, + "learning_rate": 1.8851543759137007e-07, + "logits/chosen": -1.2583699226379395, + "logits/rejected": -0.1383885145187378, + "logps/chosen": -11.82453727722168, + "logps/rejected": -26.698476791381836, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7940881252288818, + "rewards/margins": 5.455242156982422, + "rewards/rejected": -4.661154270172119, + "step": 733 + }, + { + "epoch": 12.440677966101696, + "grad_norm": 4.338898328943316, + "learning_rate": 1.8779887616064382e-07, + "logits/chosen": -3.5943920612335205, + "logits/rejected": -2.3568880558013916, + "logps/chosen": -15.277399063110352, + "logps/rejected": -25.739362716674805, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31375235319137573, + "rewards/margins": 4.360467433929443, + "rewards/rejected": -4.046714782714844, + "step": 734 + }, + { + "epoch": 12.457627118644067, + "grad_norm": 5.221531148515628, + "learning_rate": 1.8708285904281712e-07, + "logits/chosen": 0.39441272616386414, + "logits/rejected": 0.6481245756149292, + "logps/chosen": -13.238401412963867, + "logps/rejected": -25.95081329345703, + "loss": 0.0672, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.049318715929985046, + "rewards/margins": 4.6796040534973145, + "rewards/rejected": -4.630285263061523, + "step": 735 + }, + { + "epoch": 12.474576271186441, + "grad_norm": 4.662194142659164, + "learning_rate": 1.8636739250365056e-07, + "logits/chosen": -0.6668483018875122, + "logits/rejected": 0.672187089920044, + "logps/chosen": -17.061891555786133, + "logps/rejected": -25.8005428314209, + "loss": 0.0554, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8260502219200134, + "rewards/margins": 4.933106422424316, + "rewards/rejected": -4.107056617736816, + "step": 736 + }, + { + "epoch": 12.491525423728813, + "grad_norm": 5.547045768800254, + "learning_rate": 1.8565248280408698e-07, + "logits/chosen": -1.3748953342437744, + "logits/rejected": -0.43877971172332764, + "logps/chosen": -12.486265182495117, + "logps/rejected": -35.297569274902344, + "loss": 0.0732, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32557570934295654, + "rewards/margins": 6.038592338562012, + "rewards/rejected": -5.713016510009766, + "step": 737 + }, + { + "epoch": 12.508474576271187, + "grad_norm": 4.938011936686151, + "learning_rate": 1.8493813620019595e-07, + "logits/chosen": 0.07011851668357849, + "logits/rejected": 0.6697176694869995, + "logps/chosen": -18.494783401489258, + "logps/rejected": -41.817710876464844, + "loss": 0.0592, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09022299945354462, + "rewards/margins": 5.310074329376221, + "rewards/rejected": -5.400297164916992, + "step": 738 + }, + { + "epoch": 12.525423728813559, + "grad_norm": 4.975592954809822, + "learning_rate": 1.8422435894311973e-07, + "logits/chosen": -4.636005878448486, + "logits/rejected": -3.7690577507019043, + "logps/chosen": -13.333321571350098, + "logps/rejected": -29.106115341186523, + "loss": 0.0578, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.269630491733551, + "rewards/margins": 4.382090091705322, + "rewards/rejected": -4.112459182739258, + "step": 739 + }, + { + "epoch": 12.542372881355933, + "grad_norm": 4.701494943737591, + "learning_rate": 1.8351115727901829e-07, + "logits/chosen": -0.8022490739822388, + "logits/rejected": -0.29640018939971924, + "logps/chosen": -16.64236831665039, + "logps/rejected": -34.56496810913086, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09874613583087921, + "rewards/margins": 5.410571098327637, + "rewards/rejected": -5.311825752258301, + "step": 740 + }, + { + "epoch": 12.559322033898304, + "grad_norm": 4.983518760341374, + "learning_rate": 1.8279853744901464e-07, + "logits/chosen": -1.764894723892212, + "logits/rejected": -1.817033052444458, + "logps/chosen": -15.258569717407227, + "logps/rejected": -26.04188346862793, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4003324806690216, + "rewards/margins": 3.7917897701263428, + "rewards/rejected": -3.3914575576782227, + "step": 741 + }, + { + "epoch": 12.576271186440678, + "grad_norm": 4.476989646138773, + "learning_rate": 1.8208650568914033e-07, + "logits/chosen": -1.363398790359497, + "logits/rejected": -0.9498892426490784, + "logps/chosen": -19.603914260864258, + "logps/rejected": -30.36231231689453, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.011793076992034912, + "rewards/margins": 3.9837088584899902, + "rewards/rejected": -3.995501756668091, + "step": 742 + }, + { + "epoch": 12.59322033898305, + "grad_norm": 5.053665090290864, + "learning_rate": 1.8137506823028065e-07, + "logits/chosen": -1.609299898147583, + "logits/rejected": -0.7718572616577148, + "logps/chosen": -22.111347198486328, + "logps/rejected": -28.185571670532227, + "loss": 0.0561, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35352978110313416, + "rewards/margins": 3.5998425483703613, + "rewards/rejected": -3.2463128566741943, + "step": 743 + }, + { + "epoch": 12.610169491525424, + "grad_norm": 5.304619777865993, + "learning_rate": 1.8066423129812026e-07, + "logits/chosen": 1.7559425830841064, + "logits/rejected": 2.9329328536987305, + "logps/chosen": -20.055051803588867, + "logps/rejected": -36.75667190551758, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09106519818305969, + "rewards/margins": 4.863574028015137, + "rewards/rejected": -4.954638957977295, + "step": 744 + }, + { + "epoch": 12.627118644067796, + "grad_norm": 4.667357132937044, + "learning_rate": 1.7995400111308883e-07, + "logits/chosen": -0.961107611656189, + "logits/rejected": -0.7769833207130432, + "logps/chosen": -18.276569366455078, + "logps/rejected": -30.646440505981445, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07002657651901245, + "rewards/margins": 4.622382640838623, + "rewards/rejected": -4.552356243133545, + "step": 745 + }, + { + "epoch": 12.64406779661017, + "grad_norm": 5.408680811716402, + "learning_rate": 1.7924438389030648e-07, + "logits/chosen": -1.4892151355743408, + "logits/rejected": -0.24569085240364075, + "logps/chosen": -20.448904037475586, + "logps/rejected": -33.296146392822266, + "loss": 0.0661, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.29597920179367065, + "rewards/margins": 5.750075340270996, + "rewards/rejected": -5.454095840454102, + "step": 746 + }, + { + "epoch": 12.661016949152543, + "grad_norm": 5.247658476313327, + "learning_rate": 1.785353858395292e-07, + "logits/chosen": -1.1832478046417236, + "logits/rejected": -0.4320974051952362, + "logps/chosen": -16.966569900512695, + "logps/rejected": -30.618276596069336, + "loss": 0.0561, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.308743953704834, + "rewards/margins": 4.0481038093566895, + "rewards/rejected": -3.7393596172332764, + "step": 747 + }, + { + "epoch": 12.677966101694915, + "grad_norm": 5.386940866382028, + "learning_rate": 1.7782701316509478e-07, + "logits/chosen": -0.8887053728103638, + "logits/rejected": 0.26204565167427063, + "logps/chosen": -15.196284294128418, + "logps/rejected": -32.22180938720703, + "loss": 0.0569, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023950517177581787, + "rewards/margins": 5.039477348327637, + "rewards/rejected": -5.01552677154541, + "step": 748 + }, + { + "epoch": 12.694915254237289, + "grad_norm": 6.448249897658643, + "learning_rate": 1.7711927206586853e-07, + "logits/chosen": -3.0813937187194824, + "logits/rejected": -1.0136470794677734, + "logps/chosen": -15.665678024291992, + "logps/rejected": -31.437288284301758, + "loss": 0.0766, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6306146383285522, + "rewards/margins": 4.768771171569824, + "rewards/rejected": -4.138156414031982, + "step": 749 + }, + { + "epoch": 12.711864406779661, + "grad_norm": 5.1415627551289855, + "learning_rate": 1.7641216873518876e-07, + "logits/chosen": -0.4980939030647278, + "logits/rejected": 0.9896915555000305, + "logps/chosen": -13.606678009033203, + "logps/rejected": -36.296627044677734, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3556250333786011, + "rewards/margins": 5.726956844329834, + "rewards/rejected": -5.371331691741943, + "step": 750 + }, + { + "epoch": 12.728813559322035, + "grad_norm": 5.538397960309667, + "learning_rate": 1.7570570936081306e-07, + "logits/chosen": -5.0588178634643555, + "logits/rejected": -4.427032947540283, + "logps/chosen": -15.996036529541016, + "logps/rejected": -29.211898803710938, + "loss": 0.0711, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.036909669637680054, + "rewards/margins": 4.613151550292969, + "rewards/rejected": -4.576241970062256, + "step": 751 + }, + { + "epoch": 12.745762711864407, + "grad_norm": 5.401159987947702, + "learning_rate": 1.7499990012486348e-07, + "logits/chosen": 0.5816026329994202, + "logits/rejected": 2.0243024826049805, + "logps/chosen": -14.950392723083496, + "logps/rejected": -44.65636444091797, + "loss": 0.0744, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08444496989250183, + "rewards/margins": 5.976288795471191, + "rewards/rejected": -6.060733318328857, + "step": 752 + }, + { + "epoch": 12.76271186440678, + "grad_norm": 5.642148679884942, + "learning_rate": 1.7429474720377312e-07, + "logits/chosen": -4.293545722961426, + "logits/rejected": -3.6384382247924805, + "logps/chosen": -15.824508666992188, + "logps/rejected": -21.923446655273438, + "loss": 0.0616, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6682969331741333, + "rewards/margins": 3.7970216274261475, + "rewards/rejected": -3.1287245750427246, + "step": 753 + }, + { + "epoch": 12.779661016949152, + "grad_norm": 5.456868524343284, + "learning_rate": 1.735902567682315e-07, + "logits/chosen": -0.1643550992012024, + "logits/rejected": 1.1555442810058594, + "logps/chosen": -18.49430274963379, + "logps/rejected": -28.05358123779297, + "loss": 0.0698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23009660840034485, + "rewards/margins": 4.420815467834473, + "rewards/rejected": -4.190719127655029, + "step": 754 + }, + { + "epoch": 12.796610169491526, + "grad_norm": 5.5899581786249675, + "learning_rate": 1.7288643498313104e-07, + "logits/chosen": -1.605297565460205, + "logits/rejected": -1.2698827981948853, + "logps/chosen": -17.66697120666504, + "logps/rejected": -28.075071334838867, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.554067850112915, + "rewards/margins": 4.1530561447143555, + "rewards/rejected": -3.5989880561828613, + "step": 755 + }, + { + "epoch": 12.813559322033898, + "grad_norm": 4.97167654515329, + "learning_rate": 1.7218328800751285e-07, + "logits/chosen": -1.9998224973678589, + "logits/rejected": -1.9531162977218628, + "logps/chosen": -16.823482513427734, + "logps/rejected": -38.705322265625, + "loss": 0.0501, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05464259535074234, + "rewards/margins": 5.106647491455078, + "rewards/rejected": -5.052005290985107, + "step": 756 + }, + { + "epoch": 12.830508474576272, + "grad_norm": 4.636697957998646, + "learning_rate": 1.7148082199451286e-07, + "logits/chosen": 2.3911752700805664, + "logits/rejected": 2.8658528327941895, + "logps/chosen": -14.898900985717773, + "logps/rejected": -32.14864730834961, + "loss": 0.0509, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25076889991760254, + "rewards/margins": 5.665815830230713, + "rewards/rejected": -5.4150471687316895, + "step": 757 + }, + { + "epoch": 12.847457627118644, + "grad_norm": 4.43156755971984, + "learning_rate": 1.7077904309130782e-07, + "logits/chosen": 0.07385343313217163, + "logits/rejected": 1.4616187810897827, + "logps/chosen": -16.277408599853516, + "logps/rejected": -33.832191467285156, + "loss": 0.0383, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02317967265844345, + "rewards/margins": 4.975264549255371, + "rewards/rejected": -4.952084541320801, + "step": 758 + }, + { + "epoch": 12.864406779661017, + "grad_norm": 5.1087161747778955, + "learning_rate": 1.7007795743906194e-07, + "logits/chosen": -0.01966071128845215, + "logits/rejected": 1.2537782192230225, + "logps/chosen": -12.349102973937988, + "logps/rejected": -29.57040023803711, + "loss": 0.0624, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.719579815864563, + "rewards/margins": 4.931670665740967, + "rewards/rejected": -4.212090969085693, + "step": 759 + }, + { + "epoch": 12.88135593220339, + "grad_norm": 4.575423088701401, + "learning_rate": 1.6937757117287276e-07, + "logits/chosen": -2.572610378265381, + "logits/rejected": -2.4339561462402344, + "logps/chosen": -20.87567138671875, + "logps/rejected": -26.362003326416016, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.36575549840927124, + "rewards/margins": 3.4433891773223877, + "rewards/rejected": -3.0776336193084717, + "step": 760 + }, + { + "epoch": 12.898305084745763, + "grad_norm": 4.792758308336707, + "learning_rate": 1.6867789042171777e-07, + "logits/chosen": 0.42236053943634033, + "logits/rejected": 1.349460244178772, + "logps/chosen": -18.220428466796875, + "logps/rejected": -40.099430084228516, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5892751216888428, + "rewards/margins": 6.123309135437012, + "rewards/rejected": -6.712584495544434, + "step": 761 + }, + { + "epoch": 12.915254237288135, + "grad_norm": 5.182633483984975, + "learning_rate": 1.6797892130840036e-07, + "logits/chosen": -4.92634916305542, + "logits/rejected": -3.316908359527588, + "logps/chosen": -22.279062271118164, + "logps/rejected": -40.13741683959961, + "loss": 0.0524, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5362528562545776, + "rewards/margins": 5.827565670013428, + "rewards/rejected": -5.291313171386719, + "step": 762 + }, + { + "epoch": 12.932203389830509, + "grad_norm": 6.2174069067649596, + "learning_rate": 1.6728066994949658e-07, + "logits/chosen": 0.3422883450984955, + "logits/rejected": 1.34884512424469, + "logps/chosen": -16.529205322265625, + "logps/rejected": -31.202659606933594, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2604439854621887, + "rewards/margins": 5.107015609741211, + "rewards/rejected": -4.846571445465088, + "step": 763 + }, + { + "epoch": 12.94915254237288, + "grad_norm": 5.015581327963073, + "learning_rate": 1.6658314245530148e-07, + "logits/chosen": -1.485102653503418, + "logits/rejected": 0.03505659103393555, + "logps/chosen": -15.350225448608398, + "logps/rejected": -32.654762268066406, + "loss": 0.0628, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5115726590156555, + "rewards/margins": 5.575878143310547, + "rewards/rejected": -5.064305305480957, + "step": 764 + }, + { + "epoch": 12.966101694915254, + "grad_norm": 5.203411876922801, + "learning_rate": 1.6588634492977582e-07, + "logits/chosen": -2.822071075439453, + "logits/rejected": -1.4864444732666016, + "logps/chosen": -20.292339324951172, + "logps/rejected": -41.56735610961914, + "loss": 0.0754, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02777126431465149, + "rewards/margins": 5.915192604064941, + "rewards/rejected": -5.887421607971191, + "step": 765 + }, + { + "epoch": 12.983050847457626, + "grad_norm": 5.575372006782511, + "learning_rate": 1.651902834704924e-07, + "logits/chosen": -0.6836428642272949, + "logits/rejected": -0.2021288275718689, + "logps/chosen": -15.80721664428711, + "logps/rejected": -22.794828414916992, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13072407245635986, + "rewards/margins": 3.470419406890869, + "rewards/rejected": -3.3396952152252197, + "step": 766 + }, + { + "epoch": 13.0, + "grad_norm": 4.979310125788461, + "learning_rate": 1.6449496416858282e-07, + "logits/chosen": -1.8837170600891113, + "logits/rejected": -0.8573411703109741, + "logps/chosen": -16.546485900878906, + "logps/rejected": -28.7147216796875, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3212031126022339, + "rewards/margins": 4.140195846557617, + "rewards/rejected": -3.8189926147460938, + "step": 767 + }, + { + "epoch": 13.016949152542374, + "grad_norm": 4.661905901633895, + "learning_rate": 1.6380039310868414e-07, + "logits/chosen": -0.571354329586029, + "logits/rejected": -0.13909736275672913, + "logps/chosen": -17.805564880371094, + "logps/rejected": -35.160640716552734, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1881602704524994, + "rewards/margins": 4.904372692108154, + "rewards/rejected": -5.092532634735107, + "step": 768 + }, + { + "epoch": 13.033898305084746, + "grad_norm": 5.250431718898731, + "learning_rate": 1.631065763688857e-07, + "logits/chosen": -1.4799143075942993, + "logits/rejected": -0.8538658618927002, + "logps/chosen": -12.869924545288086, + "logps/rejected": -30.551315307617188, + "loss": 0.0671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15870481729507446, + "rewards/margins": 4.908497333526611, + "rewards/rejected": -4.749792575836182, + "step": 769 + }, + { + "epoch": 13.05084745762712, + "grad_norm": 4.746347720305799, + "learning_rate": 1.6241352002067588e-07, + "logits/chosen": -0.8848968148231506, + "logits/rejected": -0.33918485045433044, + "logps/chosen": -17.034452438354492, + "logps/rejected": -36.654781341552734, + "loss": 0.0474, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5132811069488525, + "rewards/margins": 6.099752426147461, + "rewards/rejected": -6.613033771514893, + "step": 770 + }, + { + "epoch": 13.067796610169491, + "grad_norm": 4.650926687696971, + "learning_rate": 1.61721230128889e-07, + "logits/chosen": 1.7074960470199585, + "logits/rejected": 2.339669704437256, + "logps/chosen": -14.039243698120117, + "logps/rejected": -34.935001373291016, + "loss": 0.0497, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.006116881966590881, + "rewards/margins": 5.922494411468506, + "rewards/rejected": -5.928612232208252, + "step": 771 + }, + { + "epoch": 13.084745762711865, + "grad_norm": 4.992257153089093, + "learning_rate": 1.6102971275165227e-07, + "logits/chosen": -4.5710248947143555, + "logits/rejected": -3.5121634006500244, + "logps/chosen": -19.965408325195312, + "logps/rejected": -39.98967361450195, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06310770660638809, + "rewards/margins": 6.08799934387207, + "rewards/rejected": -6.1511077880859375, + "step": 772 + }, + { + "epoch": 13.101694915254237, + "grad_norm": 5.116376837204931, + "learning_rate": 1.603389739403327e-07, + "logits/chosen": -3.5352561473846436, + "logits/rejected": -2.2301392555236816, + "logps/chosen": -16.83971405029297, + "logps/rejected": -26.139163970947266, + "loss": 0.062, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8963336944580078, + "rewards/margins": 4.319397926330566, + "rewards/rejected": -3.4230642318725586, + "step": 773 + }, + { + "epoch": 13.11864406779661, + "grad_norm": 4.984010048813125, + "learning_rate": 1.5964901973948408e-07, + "logits/chosen": 0.6005043983459473, + "logits/rejected": 1.7892330884933472, + "logps/chosen": -13.829337120056152, + "logps/rejected": -29.835859298706055, + "loss": 0.0658, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5679302215576172, + "rewards/margins": 4.191529750823975, + "rewards/rejected": -3.6235995292663574, + "step": 774 + }, + { + "epoch": 13.135593220338983, + "grad_norm": 4.060001356412656, + "learning_rate": 1.5895985618679445e-07, + "logits/chosen": -1.580199956893921, + "logits/rejected": -0.7634269595146179, + "logps/chosen": -18.07457733154297, + "logps/rejected": -39.27557373046875, + "loss": 0.0372, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12154760956764221, + "rewards/margins": 5.554102897644043, + "rewards/rejected": -5.432555675506592, + "step": 775 + }, + { + "epoch": 13.152542372881356, + "grad_norm": 4.8504946205719595, + "learning_rate": 1.5827148931303275e-07, + "logits/chosen": -0.6837934255599976, + "logits/rejected": 0.3661801517009735, + "logps/chosen": -15.66816520690918, + "logps/rejected": -41.75244140625, + "loss": 0.0449, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35962551832199097, + "rewards/margins": 6.23133659362793, + "rewards/rejected": -6.5909624099731445, + "step": 776 + }, + { + "epoch": 13.169491525423728, + "grad_norm": 5.519362182135464, + "learning_rate": 1.5758392514199643e-07, + "logits/chosen": -3.332242488861084, + "logits/rejected": -3.9557909965515137, + "logps/chosen": -11.690735816955566, + "logps/rejected": -27.75088119506836, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2635051906108856, + "rewards/margins": 4.264172554016113, + "rewards/rejected": -4.527677536010742, + "step": 777 + }, + { + "epoch": 13.186440677966102, + "grad_norm": 4.935439407056802, + "learning_rate": 1.5689716969045847e-07, + "logits/chosen": -0.9040203094482422, + "logits/rejected": 0.7142425179481506, + "logps/chosen": -20.44605255126953, + "logps/rejected": -41.98016357421875, + "loss": 0.0506, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10384351015090942, + "rewards/margins": 6.846504211425781, + "rewards/rejected": -6.742660045623779, + "step": 778 + }, + { + "epoch": 13.203389830508474, + "grad_norm": 4.6398107320600674, + "learning_rate": 1.5621122896811522e-07, + "logits/chosen": -0.5301164388656616, + "logits/rejected": 0.8865979909896851, + "logps/chosen": -14.487466812133789, + "logps/rejected": -32.661075592041016, + "loss": 0.0653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13690048456192017, + "rewards/margins": 5.08042049407959, + "rewards/rejected": -4.943520545959473, + "step": 779 + }, + { + "epoch": 13.220338983050848, + "grad_norm": 5.035524688198735, + "learning_rate": 1.555261089775329e-07, + "logits/chosen": -1.5996699333190918, + "logits/rejected": -0.5739651918411255, + "logps/chosen": -14.261418342590332, + "logps/rejected": -33.98247146606445, + "loss": 0.0593, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08373254537582397, + "rewards/margins": 5.217193126678467, + "rewards/rejected": -5.3009257316589355, + "step": 780 + }, + { + "epoch": 13.23728813559322, + "grad_norm": 4.270205266332028, + "learning_rate": 1.548418157140961e-07, + "logits/chosen": -2.086413860321045, + "logits/rejected": -2.292921781539917, + "logps/chosen": -22.09109878540039, + "logps/rejected": -38.62481689453125, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9677011370658875, + "rewards/margins": 4.687134265899658, + "rewards/rejected": -5.6548357009887695, + "step": 781 + }, + { + "epoch": 13.254237288135593, + "grad_norm": 4.219215065631512, + "learning_rate": 1.5415835516595463e-07, + "logits/chosen": 0.14554214477539062, + "logits/rejected": 0.22008490562438965, + "logps/chosen": -16.21091651916504, + "logps/rejected": -27.570281982421875, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2888966202735901, + "rewards/margins": 4.920593738555908, + "rewards/rejected": -4.631697177886963, + "step": 782 + }, + { + "epoch": 13.271186440677965, + "grad_norm": 4.339597407909707, + "learning_rate": 1.5347573331397135e-07, + "logits/chosen": -1.9881035089492798, + "logits/rejected": -0.008711844682693481, + "logps/chosen": -21.09736442565918, + "logps/rejected": -39.05769729614258, + "loss": 0.0426, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.46814027428627014, + "rewards/margins": 7.1204423904418945, + "rewards/rejected": -6.6523027420043945, + "step": 783 + }, + { + "epoch": 13.288135593220339, + "grad_norm": 5.367134909878089, + "learning_rate": 1.5279395613166985e-07, + "logits/chosen": -2.6423563957214355, + "logits/rejected": -1.4679323434829712, + "logps/chosen": -17.295196533203125, + "logps/rejected": -26.447715759277344, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2713179886341095, + "rewards/margins": 4.201862812042236, + "rewards/rejected": -3.9305450916290283, + "step": 784 + }, + { + "epoch": 13.305084745762711, + "grad_norm": 4.300679654482064, + "learning_rate": 1.5211302958518214e-07, + "logits/chosen": -0.8169465065002441, + "logits/rejected": 0.09549763798713684, + "logps/chosen": -18.21856689453125, + "logps/rejected": -30.12501335144043, + "loss": 0.0496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32225629687309265, + "rewards/margins": 5.08899450302124, + "rewards/rejected": -4.766737461090088, + "step": 785 + }, + { + "epoch": 13.322033898305085, + "grad_norm": 4.29162168254634, + "learning_rate": 1.5143295963319642e-07, + "logits/chosen": -0.8983233571052551, + "logits/rejected": 0.23173213005065918, + "logps/chosen": -15.133694648742676, + "logps/rejected": -30.526933670043945, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10836651176214218, + "rewards/margins": 4.605082988739014, + "rewards/rejected": -4.496715545654297, + "step": 786 + }, + { + "epoch": 13.338983050847457, + "grad_norm": 4.876409740979101, + "learning_rate": 1.5075375222690496e-07, + "logits/chosen": -0.9811916351318359, + "logits/rejected": 0.19567179679870605, + "logps/chosen": -19.57292366027832, + "logps/rejected": -36.624717712402344, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.12690672278404236, + "rewards/margins": 6.475020408630371, + "rewards/rejected": -6.601926803588867, + "step": 787 + }, + { + "epoch": 13.35593220338983, + "grad_norm": 4.607150296948369, + "learning_rate": 1.5007541330995198e-07, + "logits/chosen": -2.696889638900757, + "logits/rejected": -1.390625, + "logps/chosen": -20.865976333618164, + "logps/rejected": -32.45897674560547, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.054636940360069275, + "rewards/margins": 4.085897922515869, + "rewards/rejected": -4.031260967254639, + "step": 788 + }, + { + "epoch": 13.372881355932204, + "grad_norm": 6.11812735648075, + "learning_rate": 1.4939794881838176e-07, + "logits/chosen": -1.5392122268676758, + "logits/rejected": 0.14951567351818085, + "logps/chosen": -20.76491928100586, + "logps/rejected": -30.57718276977539, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19175273180007935, + "rewards/margins": 4.187775611877441, + "rewards/rejected": -3.9960227012634277, + "step": 789 + }, + { + "epoch": 13.389830508474576, + "grad_norm": 3.8459575352861775, + "learning_rate": 1.487213646805866e-07, + "logits/chosen": -2.8805551528930664, + "logits/rejected": -2.2380049228668213, + "logps/chosen": -16.49655532836914, + "logps/rejected": -20.970523834228516, + "loss": 0.0409, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7969187498092651, + "rewards/margins": 3.531719446182251, + "rewards/rejected": -2.7348005771636963, + "step": 790 + }, + { + "epoch": 13.40677966101695, + "grad_norm": 5.154988116162254, + "learning_rate": 1.4804566681725496e-07, + "logits/chosen": 0.7958973050117493, + "logits/rejected": 1.2581777572631836, + "logps/chosen": -13.75499439239502, + "logps/rejected": -27.726245880126953, + "loss": 0.0647, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08662920445203781, + "rewards/margins": 4.289105415344238, + "rewards/rejected": -4.202475547790527, + "step": 791 + }, + { + "epoch": 13.423728813559322, + "grad_norm": 4.401669016604261, + "learning_rate": 1.473708611413194e-07, + "logits/chosen": -0.515765368938446, + "logits/rejected": 0.03481185436248779, + "logps/chosen": -16.60962677001953, + "logps/rejected": -32.53062438964844, + "loss": 0.0433, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2858050763607025, + "rewards/margins": 5.730351448059082, + "rewards/rejected": -6.01615571975708, + "step": 792 + }, + { + "epoch": 13.440677966101696, + "grad_norm": 4.443887778355534, + "learning_rate": 1.4669695355790552e-07, + "logits/chosen": -2.8968968391418457, + "logits/rejected": -2.2223594188690186, + "logps/chosen": -18.33234405517578, + "logps/rejected": -34.28547668457031, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19874697923660278, + "rewards/margins": 4.827791213989258, + "rewards/rejected": -4.629044532775879, + "step": 793 + }, + { + "epoch": 13.457627118644067, + "grad_norm": 3.971025721995942, + "learning_rate": 1.4602394996427942e-07, + "logits/chosen": -0.9764919281005859, + "logits/rejected": -0.13439202308654785, + "logps/chosen": -15.439664840698242, + "logps/rejected": -26.79970359802246, + "loss": 0.0461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06810703873634338, + "rewards/margins": 3.983307361602783, + "rewards/rejected": -3.9151999950408936, + "step": 794 + }, + { + "epoch": 13.474576271186441, + "grad_norm": 4.907220694727352, + "learning_rate": 1.4535185624979687e-07, + "logits/chosen": -2.4083213806152344, + "logits/rejected": -1.2281450033187866, + "logps/chosen": -19.956815719604492, + "logps/rejected": -38.46086883544922, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3511507511138916, + "rewards/margins": 5.372152328491211, + "rewards/rejected": -5.723303318023682, + "step": 795 + }, + { + "epoch": 13.491525423728813, + "grad_norm": 7.315327297358435, + "learning_rate": 1.4468067829585108e-07, + "logits/chosen": -2.2506372928619385, + "logits/rejected": -2.7237861156463623, + "logps/chosen": -17.557580947875977, + "logps/rejected": -35.88199234008789, + "loss": 0.0562, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5984256267547607, + "rewards/margins": 5.224565505981445, + "rewards/rejected": -4.6261396408081055, + "step": 796 + }, + { + "epoch": 13.508474576271187, + "grad_norm": 4.920641143929202, + "learning_rate": 1.4401042197582192e-07, + "logits/chosen": 0.2772759795188904, + "logits/rejected": 0.9791284203529358, + "logps/chosen": -14.289785385131836, + "logps/rejected": -34.84285354614258, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13411599397659302, + "rewards/margins": 5.060387134552002, + "rewards/rejected": -4.926271438598633, + "step": 797 + }, + { + "epoch": 13.525423728813559, + "grad_norm": 4.521477877513176, + "learning_rate": 1.4334109315502392e-07, + "logits/chosen": -0.9948372840881348, + "logits/rejected": -0.7503252625465393, + "logps/chosen": -16.5922908782959, + "logps/rejected": -34.337181091308594, + "loss": 0.0493, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3663084805011749, + "rewards/margins": 4.992905616760254, + "rewards/rejected": -4.6265974044799805, + "step": 798 + }, + { + "epoch": 13.542372881355933, + "grad_norm": 5.520637708481951, + "learning_rate": 1.4267269769065537e-07, + "logits/chosen": -2.080489158630371, + "logits/rejected": -1.0827324390411377, + "logps/chosen": -17.34023094177246, + "logps/rejected": -28.902450561523438, + "loss": 0.0583, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2214941382408142, + "rewards/margins": 5.067776203155518, + "rewards/rejected": -4.846282482147217, + "step": 799 + }, + { + "epoch": 13.559322033898304, + "grad_norm": 3.7597183239261796, + "learning_rate": 1.4200524143174676e-07, + "logits/chosen": -2.283979892730713, + "logits/rejected": -0.6506827473640442, + "logps/chosen": -15.541634559631348, + "logps/rejected": -32.611759185791016, + "loss": 0.0412, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1795252561569214, + "rewards/margins": 5.449156761169434, + "rewards/rejected": -5.628682613372803, + "step": 800 + }, + { + "epoch": 13.576271186440678, + "grad_norm": 4.103020829191366, + "learning_rate": 1.4133873021910976e-07, + "logits/chosen": 0.5742231011390686, + "logits/rejected": 1.5821070671081543, + "logps/chosen": -13.288924217224121, + "logps/rejected": -29.91840171813965, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6092392802238464, + "rewards/margins": 4.6803364753723145, + "rewards/rejected": -4.0710978507995605, + "step": 801 + }, + { + "epoch": 13.59322033898305, + "grad_norm": 5.313735220398172, + "learning_rate": 1.4067316988528616e-07, + "logits/chosen": -2.240776538848877, + "logits/rejected": -1.4768493175506592, + "logps/chosen": -21.16518783569336, + "logps/rejected": -31.8751220703125, + "loss": 0.0686, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8960158824920654, + "rewards/margins": 5.112182140350342, + "rewards/rejected": -4.2161664962768555, + "step": 802 + }, + { + "epoch": 13.610169491525424, + "grad_norm": 5.028648319330279, + "learning_rate": 1.4000856625449664e-07, + "logits/chosen": -3.578434467315674, + "logits/rejected": -1.6968674659729004, + "logps/chosen": -21.81180763244629, + "logps/rejected": -36.5727653503418, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.005623072385787964, + "rewards/margins": 5.015188217163086, + "rewards/rejected": -5.020811080932617, + "step": 803 + }, + { + "epoch": 13.627118644067796, + "grad_norm": 4.40829356214919, + "learning_rate": 1.3934492514259003e-07, + "logits/chosen": -1.2967630624771118, + "logits/rejected": -0.4483084976673126, + "logps/chosen": -14.010137557983398, + "logps/rejected": -31.958181381225586, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6740959882736206, + "rewards/margins": 5.422782897949219, + "rewards/rejected": -4.74868631362915, + "step": 804 + }, + { + "epoch": 13.64406779661017, + "grad_norm": 5.745264644229589, + "learning_rate": 1.3868225235699216e-07, + "logits/chosen": -1.2424854040145874, + "logits/rejected": -0.6555378437042236, + "logps/chosen": -18.450178146362305, + "logps/rejected": -35.39335632324219, + "loss": 0.0487, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5278493165969849, + "rewards/margins": 5.411442279815674, + "rewards/rejected": -5.939291477203369, + "step": 805 + }, + { + "epoch": 13.661016949152543, + "grad_norm": 5.36790675366442, + "learning_rate": 1.3802055369665533e-07, + "logits/chosen": -0.2212720513343811, + "logits/rejected": 1.6100802421569824, + "logps/chosen": -14.109825134277344, + "logps/rejected": -33.012298583984375, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.709932804107666, + "rewards/margins": 5.652429103851318, + "rewards/rejected": -4.9424967765808105, + "step": 806 + }, + { + "epoch": 13.677966101694915, + "grad_norm": 14.088164156627155, + "learning_rate": 1.373598349520073e-07, + "logits/chosen": -2.5952467918395996, + "logits/rejected": -0.777185320854187, + "logps/chosen": -11.400592803955078, + "logps/rejected": -29.009681701660156, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41961854696273804, + "rewards/margins": 5.154366970062256, + "rewards/rejected": -4.734748840332031, + "step": 807 + }, + { + "epoch": 13.694915254237289, + "grad_norm": 5.194429705443838, + "learning_rate": 1.3670010190490073e-07, + "logits/chosen": -2.7721073627471924, + "logits/rejected": -0.28102076053619385, + "logps/chosen": -19.503450393676758, + "logps/rejected": -38.648590087890625, + "loss": 0.0502, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23439528048038483, + "rewards/margins": 5.5849690437316895, + "rewards/rejected": -5.350574016571045, + "step": 808 + }, + { + "epoch": 13.711864406779661, + "grad_norm": 3.4681710195473396, + "learning_rate": 1.3604136032856268e-07, + "logits/chosen": -4.101337909698486, + "logits/rejected": -3.245271682739258, + "logps/chosen": -18.42070198059082, + "logps/rejected": -41.46506881713867, + "loss": 0.0319, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3121660649776459, + "rewards/margins": 7.131579399108887, + "rewards/rejected": -6.819413185119629, + "step": 809 + }, + { + "epoch": 13.728813559322035, + "grad_norm": 5.1618133217577, + "learning_rate": 1.3538361598754382e-07, + "logits/chosen": -2.2615439891815186, + "logits/rejected": -0.9000074863433838, + "logps/chosen": -18.307828903198242, + "logps/rejected": -36.45680618286133, + "loss": 0.0615, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.048638343811035156, + "rewards/margins": 6.239575386047363, + "rewards/rejected": -6.190937042236328, + "step": 810 + }, + { + "epoch": 13.745762711864407, + "grad_norm": 5.144615388580858, + "learning_rate": 1.3472687463766848e-07, + "logits/chosen": 0.011030316352844238, + "logits/rejected": 0.2687840461730957, + "logps/chosen": -15.965656280517578, + "logps/rejected": -29.35083770751953, + "loss": 0.0606, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2761033773422241, + "rewards/margins": 4.888636589050293, + "rewards/rejected": -4.6125335693359375, + "step": 811 + }, + { + "epoch": 13.76271186440678, + "grad_norm": 5.434824546786577, + "learning_rate": 1.3407114202598368e-07, + "logits/chosen": -2.206752061843872, + "logits/rejected": -1.648052453994751, + "logps/chosen": -12.60938835144043, + "logps/rejected": -23.64356231689453, + "loss": 0.0754, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21786317229270935, + "rewards/margins": 3.707275390625, + "rewards/rejected": -3.9251387119293213, + "step": 812 + }, + { + "epoch": 13.779661016949152, + "grad_norm": 4.815034905940384, + "learning_rate": 1.3341642389070926e-07, + "logits/chosen": -0.7637806534767151, + "logits/rejected": -1.1668951511383057, + "logps/chosen": -16.947185516357422, + "logps/rejected": -35.942691802978516, + "loss": 0.0591, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3600042462348938, + "rewards/margins": 5.317866325378418, + "rewards/rejected": -4.95786190032959, + "step": 813 + }, + { + "epoch": 13.796610169491526, + "grad_norm": 4.948954844647574, + "learning_rate": 1.3276272596118728e-07, + "logits/chosen": -2.535177707672119, + "logits/rejected": -2.6263809204101562, + "logps/chosen": -22.20641326904297, + "logps/rejected": -35.22035598754883, + "loss": 0.0614, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07845325767993927, + "rewards/margins": 5.658707618713379, + "rewards/rejected": -5.580254554748535, + "step": 814 + }, + { + "epoch": 13.813559322033898, + "grad_norm": 5.081803609403105, + "learning_rate": 1.3211005395783244e-07, + "logits/chosen": -3.0509417057037354, + "logits/rejected": -1.9049749374389648, + "logps/chosen": -17.269140243530273, + "logps/rejected": -35.48225402832031, + "loss": 0.0652, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.17116232216358185, + "rewards/margins": 5.267909049987793, + "rewards/rejected": -5.096746921539307, + "step": 815 + }, + { + "epoch": 13.830508474576272, + "grad_norm": 4.31483209294794, + "learning_rate": 1.3145841359208148e-07, + "logits/chosen": -0.06046187877655029, + "logits/rejected": 1.4396018981933594, + "logps/chosen": -17.44058609008789, + "logps/rejected": -37.39258575439453, + "loss": 0.0422, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2737378180027008, + "rewards/margins": 5.390666484832764, + "rewards/rejected": -5.116928577423096, + "step": 816 + }, + { + "epoch": 13.847457627118644, + "grad_norm": 4.568010812319063, + "learning_rate": 1.308078105663437e-07, + "logits/chosen": -2.7601492404937744, + "logits/rejected": -2.4725546836853027, + "logps/chosen": -19.867713928222656, + "logps/rejected": -28.408180236816406, + "loss": 0.0605, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.049654990434646606, + "rewards/margins": 4.494874477386475, + "rewards/rejected": -4.544529914855957, + "step": 817 + }, + { + "epoch": 13.864406779661017, + "grad_norm": 4.69192569572312, + "learning_rate": 1.3015825057395058e-07, + "logits/chosen": -0.5765193700790405, + "logits/rejected": 0.6422019004821777, + "logps/chosen": -13.671000480651855, + "logps/rejected": -26.479190826416016, + "loss": 0.0508, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.473724365234375, + "rewards/margins": 4.31511116027832, + "rewards/rejected": -3.8413872718811035, + "step": 818 + }, + { + "epoch": 13.88135593220339, + "grad_norm": 5.415611690257974, + "learning_rate": 1.2950973929910619e-07, + "logits/chosen": -3.1382057666778564, + "logits/rejected": -0.9989707469940186, + "logps/chosen": -17.182632446289062, + "logps/rejected": -29.84030532836914, + "loss": 0.073, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07108063995838165, + "rewards/margins": 4.959874629974365, + "rewards/rejected": -4.8887939453125, + "step": 819 + }, + { + "epoch": 13.898305084745763, + "grad_norm": 4.253278835211674, + "learning_rate": 1.2886228241683748e-07, + "logits/chosen": -2.7567286491394043, + "logits/rejected": -1.1469461917877197, + "logps/chosen": -20.566917419433594, + "logps/rejected": -36.77910614013672, + "loss": 0.047, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2701362371444702, + "rewards/margins": 6.22340202331543, + "rewards/rejected": -5.95326566696167, + "step": 820 + }, + { + "epoch": 13.915254237288135, + "grad_norm": 6.0534564497811, + "learning_rate": 1.282158855929445e-07, + "logits/chosen": -4.0090460777282715, + "logits/rejected": -2.398519515991211, + "logps/chosen": -17.34862518310547, + "logps/rejected": -31.26947021484375, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.38930952548980713, + "rewards/margins": 4.36777925491333, + "rewards/rejected": -3.9784698486328125, + "step": 821 + }, + { + "epoch": 13.932203389830509, + "grad_norm": 4.712692482905532, + "learning_rate": 1.275705544839509e-07, + "logits/chosen": -2.769106864929199, + "logits/rejected": -1.0413520336151123, + "logps/chosen": -22.0438232421875, + "logps/rejected": -37.239341735839844, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0389874204993248, + "rewards/margins": 5.474620819091797, + "rewards/rejected": -5.513607978820801, + "step": 822 + }, + { + "epoch": 13.94915254237288, + "grad_norm": 5.289659327917796, + "learning_rate": 1.2692629473705452e-07, + "logits/chosen": 0.8868709802627563, + "logits/rejected": 1.507910966873169, + "logps/chosen": -16.889572143554688, + "logps/rejected": -29.149890899658203, + "loss": 0.0543, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1100674495100975, + "rewards/margins": 4.167652130126953, + "rewards/rejected": -4.277719497680664, + "step": 823 + }, + { + "epoch": 13.966101694915254, + "grad_norm": 5.966734982532794, + "learning_rate": 1.2628311199007762e-07, + "logits/chosen": -2.237454652786255, + "logits/rejected": -1.3493525981903076, + "logps/chosen": -15.533367156982422, + "logps/rejected": -27.488908767700195, + "loss": 0.0649, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02705906331539154, + "rewards/margins": 4.565030574798584, + "rewards/rejected": -4.5379719734191895, + "step": 824 + }, + { + "epoch": 13.983050847457626, + "grad_norm": 4.76829853180543, + "learning_rate": 1.2564101187141828e-07, + "logits/chosen": -3.445924758911133, + "logits/rejected": -2.606234550476074, + "logps/chosen": -12.318361282348633, + "logps/rejected": -33.33863830566406, + "loss": 0.054, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.674911618232727, + "rewards/margins": 6.01348876953125, + "rewards/rejected": -5.338577747344971, + "step": 825 + }, + { + "epoch": 14.0, + "grad_norm": 4.855237519617607, + "learning_rate": 1.2500000000000005e-07, + "logits/chosen": -2.449713706970215, + "logits/rejected": -1.7510132789611816, + "logps/chosen": -21.224750518798828, + "logps/rejected": -34.99382400512695, + "loss": 0.0601, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32820674777030945, + "rewards/margins": 5.6509318351745605, + "rewards/rejected": -5.9791388511657715, + "step": 826 + }, + { + "epoch": 14.016949152542374, + "grad_norm": 4.54608207656459, + "learning_rate": 1.2436008198522374e-07, + "logits/chosen": -0.290882408618927, + "logits/rejected": 0.5373449325561523, + "logps/chosen": -18.350481033325195, + "logps/rejected": -35.856468200683594, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.007371596992015839, + "rewards/margins": 5.433517932891846, + "rewards/rejected": -5.426146507263184, + "step": 827 + }, + { + "epoch": 14.033898305084746, + "grad_norm": 4.908312293515908, + "learning_rate": 1.2372126342691797e-07, + "logits/chosen": -4.190606594085693, + "logits/rejected": -3.0944652557373047, + "logps/chosen": -19.034318923950195, + "logps/rejected": -34.272186279296875, + "loss": 0.0547, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.2908100485801697, + "rewards/margins": 4.515417575836182, + "rewards/rejected": -4.224607467651367, + "step": 828 + }, + { + "epoch": 14.05084745762712, + "grad_norm": 5.045804367551582, + "learning_rate": 1.2308354991529006e-07, + "logits/chosen": -0.760540246963501, + "logits/rejected": -1.555609107017517, + "logps/chosen": -14.472261428833008, + "logps/rejected": -36.33820724487305, + "loss": 0.0653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.45639586448669434, + "rewards/margins": 4.686809062957764, + "rewards/rejected": -4.23041296005249, + "step": 829 + }, + { + "epoch": 14.067796610169491, + "grad_norm": 4.199306543227042, + "learning_rate": 1.2244694703087727e-07, + "logits/chosen": -3.3680789470672607, + "logits/rejected": -1.9466626644134521, + "logps/chosen": -26.21337127685547, + "logps/rejected": -34.70789337158203, + "loss": 0.0512, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30467984080314636, + "rewards/margins": 4.528075695037842, + "rewards/rejected": -4.223395824432373, + "step": 830 + }, + { + "epoch": 14.084745762711865, + "grad_norm": 4.672521820477395, + "learning_rate": 1.2181146034449807e-07, + "logits/chosen": -2.1504950523376465, + "logits/rejected": -1.46886146068573, + "logps/chosen": -15.693143844604492, + "logps/rejected": -26.54515838623047, + "loss": 0.0565, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22424635291099548, + "rewards/margins": 3.5846357345581055, + "rewards/rejected": -3.3603897094726562, + "step": 831 + }, + { + "epoch": 14.101694915254237, + "grad_norm": 5.058340149965996, + "learning_rate": 1.2117709541720306e-07, + "logits/chosen": -2.309311866760254, + "logits/rejected": -2.2456936836242676, + "logps/chosen": -20.746421813964844, + "logps/rejected": -34.04157638549805, + "loss": 0.0621, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012225247919559479, + "rewards/margins": 4.281233787536621, + "rewards/rejected": -4.269008636474609, + "step": 832 + }, + { + "epoch": 14.11864406779661, + "grad_norm": 4.80031946203614, + "learning_rate": 1.2054385780022655e-07, + "logits/chosen": 0.12395321577787399, + "logits/rejected": 0.8067972660064697, + "logps/chosen": -16.532777786254883, + "logps/rejected": -38.36420440673828, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06285473704338074, + "rewards/margins": 5.238184928894043, + "rewards/rejected": -5.175330638885498, + "step": 833 + }, + { + "epoch": 14.135593220338983, + "grad_norm": 4.784487064199788, + "learning_rate": 1.199117530349379e-07, + "logits/chosen": -3.471977472305298, + "logits/rejected": -3.575911521911621, + "logps/chosen": -17.44545555114746, + "logps/rejected": -32.76516342163086, + "loss": 0.0494, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09463706612586975, + "rewards/margins": 5.1845011711120605, + "rewards/rejected": -5.0898637771606445, + "step": 834 + }, + { + "epoch": 14.152542372881356, + "grad_norm": 4.424987876484658, + "learning_rate": 1.192807866527931e-07, + "logits/chosen": -1.5066072940826416, + "logits/rejected": -0.8759188055992126, + "logps/chosen": -18.205663681030273, + "logps/rejected": -31.757957458496094, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.26423490047454834, + "rewards/margins": 4.786740303039551, + "rewards/rejected": -4.522505760192871, + "step": 835 + }, + { + "epoch": 14.169491525423728, + "grad_norm": 4.955141012463688, + "learning_rate": 1.1865096417528633e-07, + "logits/chosen": -0.48518604040145874, + "logits/rejected": 0.1290414035320282, + "logps/chosen": -13.834370613098145, + "logps/rejected": -27.40927505493164, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2843541204929352, + "rewards/margins": 4.416263103485107, + "rewards/rejected": -4.131909370422363, + "step": 836 + }, + { + "epoch": 14.186440677966102, + "grad_norm": 5.441205501869942, + "learning_rate": 1.1802229111390155e-07, + "logits/chosen": -1.3898885250091553, + "logits/rejected": 0.1110721230506897, + "logps/chosen": -15.802949905395508, + "logps/rejected": -42.55592346191406, + "loss": 0.0637, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11498227715492249, + "rewards/margins": 7.029240131378174, + "rewards/rejected": -6.914258003234863, + "step": 837 + }, + { + "epoch": 14.203389830508474, + "grad_norm": 4.2930763676154315, + "learning_rate": 1.173947729700644e-07, + "logits/chosen": -1.319080114364624, + "logits/rejected": -0.31338560581207275, + "logps/chosen": -20.780853271484375, + "logps/rejected": -37.78578186035156, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.14082486927509308, + "rewards/margins": 5.361012935638428, + "rewards/rejected": -5.501838684082031, + "step": 838 + }, + { + "epoch": 14.220338983050848, + "grad_norm": 4.6566564145391975, + "learning_rate": 1.1676841523509398e-07, + "logits/chosen": -2.1716244220733643, + "logits/rejected": -1.0681712627410889, + "logps/chosen": -21.671016693115234, + "logps/rejected": -32.90324401855469, + "loss": 0.0541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4317570924758911, + "rewards/margins": 5.18146276473999, + "rewards/rejected": -4.749706268310547, + "step": 839 + }, + { + "epoch": 14.23728813559322, + "grad_norm": 4.056864733239223, + "learning_rate": 1.1614322339015484e-07, + "logits/chosen": -1.7106143236160278, + "logits/rejected": -0.9364138245582581, + "logps/chosen": -19.77383804321289, + "logps/rejected": -35.89718246459961, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.35204410552978516, + "rewards/margins": 6.058632850646973, + "rewards/rejected": -5.7065887451171875, + "step": 840 + }, + { + "epoch": 14.254237288135593, + "grad_norm": 4.983849879172364, + "learning_rate": 1.1551920290620903e-07, + "logits/chosen": -2.1713905334472656, + "logits/rejected": -1.5407037734985352, + "logps/chosen": -12.266080856323242, + "logps/rejected": -29.389978408813477, + "loss": 0.0455, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4713742733001709, + "rewards/margins": 4.8038177490234375, + "rewards/rejected": -4.3324432373046875, + "step": 841 + }, + { + "epoch": 14.271186440677965, + "grad_norm": 5.360406348411582, + "learning_rate": 1.1489635924396815e-07, + "logits/chosen": -1.4929659366607666, + "logits/rejected": -1.120671033859253, + "logps/chosen": -19.657901763916016, + "logps/rejected": -41.733489990234375, + "loss": 0.065, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41843128204345703, + "rewards/margins": 5.889748573303223, + "rewards/rejected": -6.30817985534668, + "step": 842 + }, + { + "epoch": 14.288135593220339, + "grad_norm": 4.853679039074438, + "learning_rate": 1.1427469785384558e-07, + "logits/chosen": -1.4412063360214233, + "logits/rejected": -0.877423882484436, + "logps/chosen": -16.15688133239746, + "logps/rejected": -35.47041320800781, + "loss": 0.0489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.431803822517395, + "rewards/margins": 5.776800632476807, + "rewards/rejected": -5.344996929168701, + "step": 843 + }, + { + "epoch": 14.305084745762711, + "grad_norm": 4.74221982053372, + "learning_rate": 1.1365422417590878e-07, + "logits/chosen": 0.059105001389980316, + "logits/rejected": 2.012216329574585, + "logps/chosen": -20.8731689453125, + "logps/rejected": -38.89690399169922, + "loss": 0.058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.27555304765701294, + "rewards/margins": 6.34234619140625, + "rewards/rejected": -6.066792964935303, + "step": 844 + }, + { + "epoch": 14.322033898305085, + "grad_norm": 4.7538090273563025, + "learning_rate": 1.1303494363983196e-07, + "logits/chosen": -1.425571322441101, + "logits/rejected": -0.497536838054657, + "logps/chosen": -15.412924766540527, + "logps/rejected": -29.72442626953125, + "loss": 0.0542, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09866355359554291, + "rewards/margins": 4.266383647918701, + "rewards/rejected": -4.167720317840576, + "step": 845 + }, + { + "epoch": 14.338983050847457, + "grad_norm": 4.844067597917288, + "learning_rate": 1.1241686166484804e-07, + "logits/chosen": -1.9793237447738647, + "logits/rejected": -0.8182030320167542, + "logps/chosen": -15.300029754638672, + "logps/rejected": -31.1224308013916, + "loss": 0.0475, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10771667957305908, + "rewards/margins": 4.908604145050049, + "rewards/rejected": -4.800887584686279, + "step": 846 + }, + { + "epoch": 14.35593220338983, + "grad_norm": 4.878898863334605, + "learning_rate": 1.1179998365970172e-07, + "logits/chosen": -0.17034506797790527, + "logits/rejected": 0.5397195219993591, + "logps/chosen": -17.27823257446289, + "logps/rejected": -29.693744659423828, + "loss": 0.067, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.09199562668800354, + "rewards/margins": 4.912941932678223, + "rewards/rejected": -4.820946216583252, + "step": 847 + }, + { + "epoch": 14.372881355932204, + "grad_norm": 4.559682445793414, + "learning_rate": 1.1118431502260162e-07, + "logits/chosen": -0.6307776570320129, + "logits/rejected": 0.6813377737998962, + "logps/chosen": -11.645711898803711, + "logps/rejected": -33.321189880371094, + "loss": 0.0539, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3501269221305847, + "rewards/margins": 5.4904913902282715, + "rewards/rejected": -5.140363693237305, + "step": 848 + }, + { + "epoch": 14.389830508474576, + "grad_norm": 4.036758306534431, + "learning_rate": 1.1056986114117367e-07, + "logits/chosen": -1.4370734691619873, + "logits/rejected": -0.3933512270450592, + "logps/chosen": -14.565434455871582, + "logps/rejected": -31.868255615234375, + "loss": 0.0426, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3414157032966614, + "rewards/margins": 5.2931809425354, + "rewards/rejected": -4.951766014099121, + "step": 849 + }, + { + "epoch": 14.40677966101695, + "grad_norm": 4.642153321658131, + "learning_rate": 1.0995662739241346e-07, + "logits/chosen": -3.0956766605377197, + "logits/rejected": -2.5723183155059814, + "logps/chosen": -21.624290466308594, + "logps/rejected": -42.88805389404297, + "loss": 0.0533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2431950867176056, + "rewards/margins": 5.845188140869141, + "rewards/rejected": -5.601992607116699, + "step": 850 + }, + { + "epoch": 14.423728813559322, + "grad_norm": 4.307115009032184, + "learning_rate": 1.0934461914263965e-07, + "logits/chosen": -1.71174955368042, + "logits/rejected": 0.0297551229596138, + "logps/chosen": -16.369564056396484, + "logps/rejected": -32.02233123779297, + "loss": 0.0486, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.33911365270614624, + "rewards/margins": 4.879045486450195, + "rewards/rejected": -4.5399322509765625, + "step": 851 + }, + { + "epoch": 14.440677966101696, + "grad_norm": 4.791655857038553, + "learning_rate": 1.087338417474464e-07, + "logits/chosen": -0.3006948232650757, + "logits/rejected": 0.6420655250549316, + "logps/chosen": -13.013580322265625, + "logps/rejected": -35.55011749267578, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17818471789360046, + "rewards/margins": 5.40357780456543, + "rewards/rejected": -5.581762313842773, + "step": 852 + }, + { + "epoch": 14.457627118644067, + "grad_norm": 5.0370461913788, + "learning_rate": 1.0812430055165709e-07, + "logits/chosen": -3.104762315750122, + "logits/rejected": -2.9962918758392334, + "logps/chosen": -22.40864372253418, + "logps/rejected": -33.83332824707031, + "loss": 0.0545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1751263439655304, + "rewards/margins": 4.681097984313965, + "rewards/rejected": -4.505971431732178, + "step": 853 + }, + { + "epoch": 14.474576271186441, + "grad_norm": 4.114102239483635, + "learning_rate": 1.0751600088927712e-07, + "logits/chosen": -0.8753526210784912, + "logits/rejected": -0.42285048961639404, + "logps/chosen": -17.130706787109375, + "logps/rejected": -36.58259582519531, + "loss": 0.0521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.19100093841552734, + "rewards/margins": 5.029597282409668, + "rewards/rejected": -4.838596343994141, + "step": 854 + }, + { + "epoch": 14.491525423728813, + "grad_norm": 4.674717207809757, + "learning_rate": 1.0690894808344756e-07, + "logits/chosen": -0.887531042098999, + "logits/rejected": 0.030666358768939972, + "logps/chosen": -18.499759674072266, + "logps/rejected": -33.44499206542969, + "loss": 0.048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05299860239028931, + "rewards/margins": 5.534433364868164, + "rewards/rejected": -5.4814348220825195, + "step": 855 + }, + { + "epoch": 14.508474576271187, + "grad_norm": 5.571055008821593, + "learning_rate": 1.0630314744639829e-07, + "logits/chosen": -2.3482229709625244, + "logits/rejected": -1.5293030738830566, + "logps/chosen": -20.04794692993164, + "logps/rejected": -33.5222053527832, + "loss": 0.0713, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.03216637670993805, + "rewards/margins": 5.284518241882324, + "rewards/rejected": -5.316684722900391, + "step": 856 + }, + { + "epoch": 14.525423728813559, + "grad_norm": 4.036017883945882, + "learning_rate": 1.0569860427940178e-07, + "logits/chosen": -4.571822643280029, + "logits/rejected": -3.6904447078704834, + "logps/chosen": -17.9635066986084, + "logps/rejected": -30.057472229003906, + "loss": 0.0416, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03753848373889923, + "rewards/margins": 4.019134521484375, + "rewards/rejected": -3.981595993041992, + "step": 857 + }, + { + "epoch": 14.542372881355933, + "grad_norm": 4.455559455534382, + "learning_rate": 1.050953238727264e-07, + "logits/chosen": -2.462489366531372, + "logits/rejected": -2.7516651153564453, + "logps/chosen": -14.60168743133545, + "logps/rejected": -29.86794662475586, + "loss": 0.0543, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3153627812862396, + "rewards/margins": 4.536343574523926, + "rewards/rejected": -4.220980644226074, + "step": 858 + }, + { + "epoch": 14.559322033898304, + "grad_norm": 4.409069243450616, + "learning_rate": 1.0449331150559063e-07, + "logits/chosen": 0.21322837471961975, + "logits/rejected": 0.560262143611908, + "logps/chosen": -19.38181495666504, + "logps/rejected": -37.284568786621094, + "loss": 0.043, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0371914803981781, + "rewards/margins": 6.507880210876465, + "rewards/rejected": -6.470687389373779, + "step": 859 + }, + { + "epoch": 14.576271186440678, + "grad_norm": 3.8106192326465425, + "learning_rate": 1.0389257244611601e-07, + "logits/chosen": -3.911578893661499, + "logits/rejected": -3.2801082134246826, + "logps/chosen": -18.08879280090332, + "logps/rejected": -32.2973747253418, + "loss": 0.0411, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.41884616017341614, + "rewards/margins": 4.887192726135254, + "rewards/rejected": -4.46834659576416, + "step": 860 + }, + { + "epoch": 14.59322033898305, + "grad_norm": 4.10040976821886, + "learning_rate": 1.0329311195128193e-07, + "logits/chosen": -3.737536907196045, + "logits/rejected": -1.8438628911972046, + "logps/chosen": -13.185300827026367, + "logps/rejected": -31.151159286499023, + "loss": 0.0432, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.189189612865448, + "rewards/margins": 5.370091915130615, + "rewards/rejected": -5.559281826019287, + "step": 861 + }, + { + "epoch": 14.610169491525424, + "grad_norm": 4.689298687726538, + "learning_rate": 1.0269493526687914e-07, + "logits/chosen": -4.051319122314453, + "logits/rejected": -2.245673418045044, + "logps/chosen": -21.487590789794922, + "logps/rejected": -32.1451301574707, + "loss": 0.0595, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4900110960006714, + "rewards/margins": 4.4578471183776855, + "rewards/rejected": -3.967834949493408, + "step": 862 + }, + { + "epoch": 14.627118644067796, + "grad_norm": 4.629593885843378, + "learning_rate": 1.0209804762746396e-07, + "logits/chosen": -1.2015026807785034, + "logits/rejected": 0.05276120454072952, + "logps/chosen": -19.471660614013672, + "logps/rejected": -33.03922653198242, + "loss": 0.0626, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06236635148525238, + "rewards/margins": 5.169223308563232, + "rewards/rejected": -5.106856822967529, + "step": 863 + }, + { + "epoch": 14.64406779661017, + "grad_norm": 5.0290811803860205, + "learning_rate": 1.0150245425631235e-07, + "logits/chosen": -0.8136476278305054, + "logits/rejected": -0.6300673484802246, + "logps/chosen": -14.205711364746094, + "logps/rejected": -29.096141815185547, + "loss": 0.0538, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.31267741322517395, + "rewards/margins": 4.294619560241699, + "rewards/rejected": -3.9819421768188477, + "step": 864 + }, + { + "epoch": 14.661016949152543, + "grad_norm": 4.984998268591449, + "learning_rate": 1.0090816036537461e-07, + "logits/chosen": -4.853797912597656, + "logits/rejected": -4.035728931427002, + "logps/chosen": -13.243602752685547, + "logps/rejected": -32.274715423583984, + "loss": 0.0488, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05507740378379822, + "rewards/margins": 5.407604694366455, + "rewards/rejected": -5.462681293487549, + "step": 865 + }, + { + "epoch": 14.677966101694915, + "grad_norm": 4.935756536829011, + "learning_rate": 1.0031517115522925e-07, + "logits/chosen": -0.9398390054702759, + "logits/rejected": -0.6894736289978027, + "logps/chosen": -15.939778327941895, + "logps/rejected": -29.07857894897461, + "loss": 0.06, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2174006849527359, + "rewards/margins": 4.617957592010498, + "rewards/rejected": -4.400557041168213, + "step": 866 + }, + { + "epoch": 14.694915254237289, + "grad_norm": 5.177827880857834, + "learning_rate": 9.972349181503773e-08, + "logits/chosen": -2.80765962600708, + "logits/rejected": -2.379774570465088, + "logps/chosen": -13.54705810546875, + "logps/rejected": -27.66398811340332, + "loss": 0.0548, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.011886119842529297, + "rewards/margins": 4.244990348815918, + "rewards/rejected": -4.233104228973389, + "step": 867 + }, + { + "epoch": 14.711864406779661, + "grad_norm": 4.651526502576263, + "learning_rate": 9.913312752249903e-08, + "logits/chosen": 0.8737283945083618, + "logits/rejected": 0.9006978273391724, + "logps/chosen": -15.809414863586426, + "logps/rejected": -31.1838436126709, + "loss": 0.0568, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.017194844782352448, + "rewards/margins": 4.261214733123779, + "rewards/rejected": -4.278409481048584, + "step": 868 + }, + { + "epoch": 14.728813559322035, + "grad_norm": 3.8276656992018285, + "learning_rate": 9.85440834438044e-08, + "logits/chosen": -2.807131052017212, + "logits/rejected": -1.8012992143630981, + "logps/chosen": -19.008888244628906, + "logps/rejected": -37.323631286621094, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3896780014038086, + "rewards/margins": 5.351907730102539, + "rewards/rejected": -4.962228775024414, + "step": 869 + }, + { + "epoch": 14.745762711864407, + "grad_norm": 5.226466365324867, + "learning_rate": 9.795636473359207e-08, + "logits/chosen": -1.8736178874969482, + "logits/rejected": -1.2298340797424316, + "logps/chosen": -20.007198333740234, + "logps/rejected": -29.63496208190918, + "loss": 0.0588, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.11613491177558899, + "rewards/margins": 3.2119195461273193, + "rewards/rejected": -3.095784902572632, + "step": 870 + }, + { + "epoch": 14.76271186440678, + "grad_norm": 4.897408411630909, + "learning_rate": 9.736997653490214e-08, + "logits/chosen": 2.0472121238708496, + "logits/rejected": 2.940474033355713, + "logps/chosen": -17.741928100585938, + "logps/rejected": -37.018699645996094, + "loss": 0.064, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27036699652671814, + "rewards/margins": 5.808854103088379, + "rewards/rejected": -6.079221725463867, + "step": 871 + }, + { + "epoch": 14.779661016949152, + "grad_norm": 4.896869366507913, + "learning_rate": 9.678492397913165e-08, + "logits/chosen": -3.1093807220458984, + "logits/rejected": -1.964849829673767, + "logps/chosen": -24.06382179260254, + "logps/rejected": -39.17333984375, + "loss": 0.0504, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24048033356666565, + "rewards/margins": 5.500030040740967, + "rewards/rejected": -5.7405104637146, + "step": 872 + }, + { + "epoch": 14.796610169491526, + "grad_norm": 4.236238856435463, + "learning_rate": 9.620121218598957e-08, + "logits/chosen": -3.5577645301818848, + "logits/rejected": -3.226468563079834, + "logps/chosen": -17.269495010375977, + "logps/rejected": -29.61627960205078, + "loss": 0.0463, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20766395330429077, + "rewards/margins": 5.623841285705566, + "rewards/rejected": -5.416177272796631, + "step": 873 + }, + { + "epoch": 14.813559322033898, + "grad_norm": 4.775367869659583, + "learning_rate": 9.561884626345204e-08, + "logits/chosen": -2.6089420318603516, + "logits/rejected": -3.4899003505706787, + "logps/chosen": -15.538010597229004, + "logps/rejected": -29.64735221862793, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.801546573638916, + "rewards/margins": 4.463309288024902, + "rewards/rejected": -3.6617627143859863, + "step": 874 + }, + { + "epoch": 14.830508474576272, + "grad_norm": 5.311387814948821, + "learning_rate": 9.503783130771778e-08, + "logits/chosen": -2.0723490715026855, + "logits/rejected": -0.9905359148979187, + "logps/chosen": -16.402400970458984, + "logps/rejected": -31.389341354370117, + "loss": 0.0531, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6416011452674866, + "rewards/margins": 4.774829387664795, + "rewards/rejected": -4.133228302001953, + "step": 875 + }, + { + "epoch": 14.847457627118644, + "grad_norm": 4.576464275088217, + "learning_rate": 9.445817240316332e-08, + "logits/chosen": -0.9681025743484497, + "logits/rejected": 1.3066233396530151, + "logps/chosen": -15.319185256958008, + "logps/rejected": -30.807912826538086, + "loss": 0.0604, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10009877383708954, + "rewards/margins": 4.77067756652832, + "rewards/rejected": -4.870776653289795, + "step": 876 + }, + { + "epoch": 14.864406779661017, + "grad_norm": 3.566749084202065, + "learning_rate": 9.387987462229857e-08, + "logits/chosen": -3.0737295150756836, + "logits/rejected": -1.5600917339324951, + "logps/chosen": -13.265515327453613, + "logps/rejected": -35.03205108642578, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16393795609474182, + "rewards/margins": 5.836406707763672, + "rewards/rejected": -5.672469139099121, + "step": 877 + }, + { + "epoch": 14.88135593220339, + "grad_norm": 3.732800979255398, + "learning_rate": 9.330294302572242e-08, + "logits/chosen": -1.8261581659317017, + "logits/rejected": -1.2361934185028076, + "logps/chosen": -15.713083267211914, + "logps/rejected": -28.814865112304688, + "loss": 0.0408, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16630926728248596, + "rewards/margins": 5.071293354034424, + "rewards/rejected": -4.904984474182129, + "step": 878 + }, + { + "epoch": 14.898305084745763, + "grad_norm": 4.831490870008395, + "learning_rate": 9.272738266207871e-08, + "logits/chosen": -2.989412307739258, + "logits/rejected": -2.152963161468506, + "logps/chosen": -17.28832244873047, + "logps/rejected": -35.29121780395508, + "loss": 0.0598, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2409454882144928, + "rewards/margins": 4.79300594329834, + "rewards/rejected": -4.552060127258301, + "step": 879 + }, + { + "epoch": 14.915254237288135, + "grad_norm": 4.787771544995592, + "learning_rate": 9.215319856801157e-08, + "logits/chosen": -2.2084031105041504, + "logits/rejected": -0.7729737758636475, + "logps/chosen": -16.84943199157715, + "logps/rejected": -31.124263763427734, + "loss": 0.0571, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07245789468288422, + "rewards/margins": 4.6041388511657715, + "rewards/rejected": -4.676596641540527, + "step": 880 + } + ], + "logging_steps": 1, + "max_steps": 1180, + "num_input_tokens_seen": 0, + "num_train_epochs": 20, + "save_steps": 80, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}