diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7942 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 527, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018975332068311196, + "grad_norm": 1.0043981075286865, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -2.840092182159424, + "logits/rejected": -2.8336455821990967, + "logps/chosen": -25.032325744628906, + "logps/rejected": -22.43791389465332, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.003795066413662239, + "grad_norm": 1.213585376739502, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": -2.8843088150024414, + "logits/rejected": -2.880852222442627, + "logps/chosen": -24.25356101989746, + "logps/rejected": -22.29548454284668, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0056925996204933585, + "grad_norm": 1.4231489896774292, + "learning_rate": 6e-06, + "logits/chosen": -2.7949483394622803, + "logits/rejected": -2.794952392578125, + "logps/chosen": -23.652122497558594, + "logps/rejected": -22.365129470825195, + "loss": 0.6902, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.0177607424557209, + "rewards/margins": 0.005833745002746582, + "rewards/rejected": -0.023594487458467484, + "step": 3 + }, + { + "epoch": 0.007590132827324478, + "grad_norm": 1.0011111497879028, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": -2.8000831604003906, + "logits/rejected": -2.7997589111328125, + "logps/chosen": -25.592479705810547, + "logps/rejected": -23.90885353088379, + "loss": 0.6911, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0664670318365097, + "rewards/margins": 0.004310930147767067, + "rewards/rejected": -0.07077796757221222, + "step": 4 + }, + { + "epoch": 0.009487666034155597, + "grad_norm": 0.9724621772766113, + "learning_rate": 1e-05, + "logits/chosen": -2.8149185180664062, + "logits/rejected": -2.813040018081665, + "logps/chosen": -23.858686447143555, + "logps/rejected": -22.61505699157715, + "loss": 0.686, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.10101969540119171, + "rewards/margins": 0.014916013926267624, + "rewards/rejected": -0.11593571305274963, + "step": 5 + }, + { + "epoch": 0.011385199240986717, + "grad_norm": 0.9295000433921814, + "learning_rate": 1.2e-05, + "logits/chosen": -2.808767080307007, + "logits/rejected": -2.8061323165893555, + "logps/chosen": -25.594276428222656, + "logps/rejected": -24.02318572998047, + "loss": 0.6875, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.18439313769340515, + "rewards/margins": 0.012782273814082146, + "rewards/rejected": -0.19717541337013245, + "step": 6 + }, + { + "epoch": 0.013282732447817837, + "grad_norm": 0.9410561323165894, + "learning_rate": 1.4000000000000001e-05, + "logits/chosen": -2.794262409210205, + "logits/rejected": -2.792712926864624, + "logps/chosen": -26.27347183227539, + "logps/rejected": -24.40791893005371, + "loss": 0.7022, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.2307298630475998, + "rewards/margins": -0.016173291951417923, + "rewards/rejected": -0.21455657482147217, + "step": 7 + }, + { + "epoch": 0.015180265654648957, + "grad_norm": 0.963943600654602, + "learning_rate": 1.6000000000000003e-05, + "logits/chosen": -2.861567735671997, + "logits/rejected": -2.862137794494629, + "logps/chosen": -26.166967391967773, + "logps/rejected": -24.172080993652344, + "loss": 0.6788, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.28767508268356323, + "rewards/margins": 0.03301116079092026, + "rewards/rejected": -0.3206862211227417, + "step": 8 + }, + { + "epoch": 0.017077798861480076, + "grad_norm": 0.8776592016220093, + "learning_rate": 1.8e-05, + "logits/chosen": -2.8510162830352783, + "logits/rejected": -2.852877616882324, + "logps/chosen": -26.08197784423828, + "logps/rejected": -25.353015899658203, + "loss": 0.6532, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.2958042621612549, + "rewards/margins": 0.08714231848716736, + "rewards/rejected": -0.38294661045074463, + "step": 9 + }, + { + "epoch": 0.018975332068311195, + "grad_norm": 0.923055112361908, + "learning_rate": 2e-05, + "logits/chosen": -2.845109462738037, + "logits/rejected": -2.8453867435455322, + "logps/chosen": -27.779403686523438, + "logps/rejected": -26.860719680786133, + "loss": 0.6573, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.3790830373764038, + "rewards/margins": 0.07841245085000992, + "rewards/rejected": -0.45749545097351074, + "step": 10 + }, + { + "epoch": 0.020872865275142316, + "grad_norm": 1.1252574920654297, + "learning_rate": 2.2000000000000003e-05, + "logits/chosen": -2.831721067428589, + "logits/rejected": -2.82804799079895, + "logps/chosen": -27.735706329345703, + "logps/rejected": -26.28765869140625, + "loss": 0.5991, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3327995538711548, + "rewards/margins": 0.20304027199745178, + "rewards/rejected": -0.5358397960662842, + "step": 11 + }, + { + "epoch": 0.022770398481973434, + "grad_norm": 1.247209072113037, + "learning_rate": 2.4e-05, + "logits/chosen": -2.832109212875366, + "logits/rejected": -2.8269498348236084, + "logps/chosen": -29.082386016845703, + "logps/rejected": -27.44651222229004, + "loss": 0.6788, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5191504955291748, + "rewards/margins": 0.03733288496732712, + "rewards/rejected": -0.5564833879470825, + "step": 12 + }, + { + "epoch": 0.024667931688804556, + "grad_norm": 0.9526516199111938, + "learning_rate": 2.6000000000000002e-05, + "logits/chosen": -2.7532851696014404, + "logits/rejected": -2.751197099685669, + "logps/chosen": -28.381704330444336, + "logps/rejected": -28.021570205688477, + "loss": 0.6313, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46909499168395996, + "rewards/margins": 0.1372968554496765, + "rewards/rejected": -0.6063918471336365, + "step": 13 + }, + { + "epoch": 0.026565464895635674, + "grad_norm": 0.967846155166626, + "learning_rate": 2.8000000000000003e-05, + "logits/chosen": -2.8310461044311523, + "logits/rejected": -2.8237879276275635, + "logps/chosen": -28.878814697265625, + "logps/rejected": -29.368640899658203, + "loss": 0.6105, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5359129309654236, + "rewards/margins": 0.18219587206840515, + "rewards/rejected": -0.7181087732315063, + "step": 14 + }, + { + "epoch": 0.028462998102466792, + "grad_norm": 1.1417112350463867, + "learning_rate": 3e-05, + "logits/chosen": -2.8312528133392334, + "logits/rejected": -2.8290631771087646, + "logps/chosen": -29.81403350830078, + "logps/rejected": -28.913654327392578, + "loss": 0.6425, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.5813980102539062, + "rewards/margins": 0.13141022622585297, + "rewards/rejected": -0.7128081917762756, + "step": 15 + }, + { + "epoch": 0.030360531309297913, + "grad_norm": 0.9886475801467896, + "learning_rate": 3.2000000000000005e-05, + "logits/chosen": -2.854173183441162, + "logits/rejected": -2.8493854999542236, + "logps/chosen": -30.15297508239746, + "logps/rejected": -29.798294067382812, + "loss": 0.6355, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6225000619888306, + "rewards/margins": 0.1385478377342224, + "rewards/rejected": -0.7610478401184082, + "step": 16 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 1.4905331134796143, + "learning_rate": 3.4000000000000007e-05, + "logits/chosen": -2.815732479095459, + "logits/rejected": -2.822748899459839, + "logps/chosen": -29.450511932373047, + "logps/rejected": -30.160736083984375, + "loss": 0.6313, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6014214754104614, + "rewards/margins": 0.15089011192321777, + "rewards/rejected": -0.7523115873336792, + "step": 17 + }, + { + "epoch": 0.03415559772296015, + "grad_norm": 0.9573298692703247, + "learning_rate": 3.6e-05, + "logits/chosen": -2.7691009044647217, + "logits/rejected": -2.768683671951294, + "logps/chosen": -30.41909408569336, + "logps/rejected": -28.978151321411133, + "loss": 0.6377, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.60785311460495, + "rewards/margins": 0.1253933608531952, + "rewards/rejected": -0.7332464456558228, + "step": 18 + }, + { + "epoch": 0.036053130929791274, + "grad_norm": 1.1032794713974, + "learning_rate": 3.8e-05, + "logits/chosen": -2.8177359104156494, + "logits/rejected": -2.8107476234436035, + "logps/chosen": -30.09964942932129, + "logps/rejected": -30.08537483215332, + "loss": 0.6323, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6421741843223572, + "rewards/margins": 0.18355508148670197, + "rewards/rejected": -0.825729250907898, + "step": 19 + }, + { + "epoch": 0.03795066413662239, + "grad_norm": 1.030942678451538, + "learning_rate": 4e-05, + "logits/chosen": -2.7741472721099854, + "logits/rejected": -2.7686080932617188, + "logps/chosen": -30.057147979736328, + "logps/rejected": -30.112163543701172, + "loss": 0.6484, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6412388682365417, + "rewards/margins": 0.10512620210647583, + "rewards/rejected": -0.7463650703430176, + "step": 20 + }, + { + "epoch": 0.03984819734345351, + "grad_norm": 0.9527221918106079, + "learning_rate": 4.2e-05, + "logits/chosen": -2.818779706954956, + "logits/rejected": -2.81341290473938, + "logps/chosen": -29.735050201416016, + "logps/rejected": -29.48119354248047, + "loss": 0.536, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3829268217086792, + "rewards/margins": 0.3811430335044861, + "rewards/rejected": -0.7640698552131653, + "step": 21 + }, + { + "epoch": 0.04174573055028463, + "grad_norm": 1.2015403509140015, + "learning_rate": 4.4000000000000006e-05, + "logits/chosen": -2.8072240352630615, + "logits/rejected": -2.8029563426971436, + "logps/chosen": -30.29505157470703, + "logps/rejected": -30.543851852416992, + "loss": 0.5779, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6176290512084961, + "rewards/margins": 0.26741302013397217, + "rewards/rejected": -0.8850420117378235, + "step": 22 + }, + { + "epoch": 0.04364326375711575, + "grad_norm": 1.0167680978775024, + "learning_rate": 4.600000000000001e-05, + "logits/chosen": -2.8216638565063477, + "logits/rejected": -2.818462610244751, + "logps/chosen": -30.67039680480957, + "logps/rejected": -32.72490692138672, + "loss": 0.5636, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6111881136894226, + "rewards/margins": 0.31430453062057495, + "rewards/rejected": -0.9254926443099976, + "step": 23 + }, + { + "epoch": 0.04554079696394687, + "grad_norm": 1.308831810951233, + "learning_rate": 4.8e-05, + "logits/chosen": -2.8071296215057373, + "logits/rejected": -2.8020644187927246, + "logps/chosen": -30.84124755859375, + "logps/rejected": -32.474220275878906, + "loss": 0.4833, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6006485223770142, + "rewards/margins": 0.5012238025665283, + "rewards/rejected": -1.101872444152832, + "step": 24 + }, + { + "epoch": 0.04743833017077799, + "grad_norm": 1.2391417026519775, + "learning_rate": 5e-05, + "logits/chosen": -2.839644193649292, + "logits/rejected": -2.8386754989624023, + "logps/chosen": -32.8646125793457, + "logps/rejected": -33.53518295288086, + "loss": 0.5978, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9321977496147156, + "rewards/margins": 0.24001502990722656, + "rewards/rejected": -1.172212839126587, + "step": 25 + }, + { + "epoch": 0.04933586337760911, + "grad_norm": 1.4166021347045898, + "learning_rate": 5.2000000000000004e-05, + "logits/chosen": -2.822469472885132, + "logits/rejected": -2.824627637863159, + "logps/chosen": -33.248661041259766, + "logps/rejected": -33.05607223510742, + "loss": 0.6222, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9429985284805298, + "rewards/margins": 0.20479901134967804, + "rewards/rejected": -1.1477975845336914, + "step": 26 + }, + { + "epoch": 0.051233396584440226, + "grad_norm": 1.3769515752792358, + "learning_rate": 5.4000000000000005e-05, + "logits/chosen": -2.8131866455078125, + "logits/rejected": -2.8165550231933594, + "logps/chosen": -32.11775588989258, + "logps/rejected": -33.78160858154297, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7932265996932983, + "rewards/margins": 0.3268255591392517, + "rewards/rejected": -1.1200520992279053, + "step": 27 + }, + { + "epoch": 0.05313092979127135, + "grad_norm": 1.8020470142364502, + "learning_rate": 5.6000000000000006e-05, + "logits/chosen": -2.75449800491333, + "logits/rejected": -2.7445366382598877, + "logps/chosen": -29.581748962402344, + "logps/rejected": -31.868961334228516, + "loss": 0.5396, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.48946696519851685, + "rewards/margins": 0.41189253330230713, + "rewards/rejected": -0.901359498500824, + "step": 28 + }, + { + "epoch": 0.05502846299810247, + "grad_norm": 1.4790736436843872, + "learning_rate": 5.8e-05, + "logits/chosen": -2.7621536254882812, + "logits/rejected": -2.763129472732544, + "logps/chosen": -26.641231536865234, + "logps/rejected": -28.51852035522461, + "loss": 0.5958, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.22555744647979736, + "rewards/margins": 0.36228662729263306, + "rewards/rejected": -0.5878440737724304, + "step": 29 + }, + { + "epoch": 0.056925996204933584, + "grad_norm": 2.4083566665649414, + "learning_rate": 6e-05, + "logits/chosen": -2.7771716117858887, + "logits/rejected": -2.779322862625122, + "logps/chosen": -25.557411193847656, + "logps/rejected": -28.930173873901367, + "loss": 0.4966, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.2201511561870575, + "rewards/margins": 0.5415716767311096, + "rewards/rejected": -0.7617228627204895, + "step": 30 + }, + { + "epoch": 0.058823529411764705, + "grad_norm": 3.190519094467163, + "learning_rate": 6.2e-05, + "logits/chosen": -2.7948427200317383, + "logits/rejected": -2.7934060096740723, + "logps/chosen": -25.57403564453125, + "logps/rejected": -27.072856903076172, + "loss": 0.548, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.15162353217601776, + "rewards/margins": 0.38971802592277527, + "rewards/rejected": -0.5413415431976318, + "step": 31 + }, + { + "epoch": 0.06072106261859583, + "grad_norm": 2.303135633468628, + "learning_rate": 6.400000000000001e-05, + "logits/chosen": -2.7421650886535645, + "logits/rejected": -2.746206521987915, + "logps/chosen": -27.796472549438477, + "logps/rejected": -33.54944610595703, + "loss": 0.4797, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.43570366501808167, + "rewards/margins": 0.6830397844314575, + "rewards/rejected": -1.1187434196472168, + "step": 32 + }, + { + "epoch": 0.06261859582542695, + "grad_norm": 2.5751965045928955, + "learning_rate": 6.6e-05, + "logits/chosen": -2.8050615787506104, + "logits/rejected": -2.8058907985687256, + "logps/chosen": -31.59589195251465, + "logps/rejected": -37.699378967285156, + "loss": 0.4424, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7283644676208496, + "rewards/margins": 0.8916000127792358, + "rewards/rejected": -1.619964599609375, + "step": 33 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 2.540719985961914, + "learning_rate": 6.800000000000001e-05, + "logits/chosen": -2.7393884658813477, + "logits/rejected": -2.745088815689087, + "logps/chosen": -33.36492156982422, + "logps/rejected": -41.315711975097656, + "loss": 0.3926, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9094129204750061, + "rewards/margins": 0.948678731918335, + "rewards/rejected": -1.8580915927886963, + "step": 34 + }, + { + "epoch": 0.06641366223908918, + "grad_norm": 4.434343338012695, + "learning_rate": 7e-05, + "logits/chosen": -2.7703030109405518, + "logits/rejected": -2.7763912677764893, + "logps/chosen": -38.98546600341797, + "logps/rejected": -45.82567596435547, + "loss": 0.5193, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5312814712524414, + "rewards/margins": 0.8732771277427673, + "rewards/rejected": -2.4045586585998535, + "step": 35 + }, + { + "epoch": 0.0683111954459203, + "grad_norm": 4.4201507568359375, + "learning_rate": 7.2e-05, + "logits/chosen": -2.7684011459350586, + "logits/rejected": -2.7685604095458984, + "logps/chosen": -40.817020416259766, + "logps/rejected": -46.39413833618164, + "loss": 0.5283, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7174456119537354, + "rewards/margins": 0.7200771570205688, + "rewards/rejected": -2.4375228881835938, + "step": 36 + }, + { + "epoch": 0.07020872865275142, + "grad_norm": 3.611697196960449, + "learning_rate": 7.4e-05, + "logits/chosen": -2.773036241531372, + "logits/rejected": -2.7685177326202393, + "logps/chosen": -42.66209411621094, + "logps/rejected": -47.02984619140625, + "loss": 0.5431, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.8478937149047852, + "rewards/margins": 0.6286407709121704, + "rewards/rejected": -2.476534605026245, + "step": 37 + }, + { + "epoch": 0.07210626185958255, + "grad_norm": 5.300126075744629, + "learning_rate": 7.6e-05, + "logits/chosen": -2.8151535987854004, + "logits/rejected": -2.8189120292663574, + "logps/chosen": -44.82855224609375, + "logps/rejected": -48.87239074707031, + "loss": 0.4847, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0770606994628906, + "rewards/margins": 0.6569112539291382, + "rewards/rejected": -2.7339720726013184, + "step": 38 + }, + { + "epoch": 0.07400379506641366, + "grad_norm": 9.036521911621094, + "learning_rate": 7.800000000000001e-05, + "logits/chosen": -2.735901117324829, + "logits/rejected": -2.746289014816284, + "logps/chosen": -36.09475326538086, + "logps/rejected": -48.74174499511719, + "loss": 0.3724, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2715508937835693, + "rewards/margins": 1.4835450649261475, + "rewards/rejected": -2.755095958709717, + "step": 39 + }, + { + "epoch": 0.07590132827324478, + "grad_norm": 7.483906269073486, + "learning_rate": 8e-05, + "logits/chosen": -2.762651205062866, + "logits/rejected": -2.760044813156128, + "logps/chosen": -36.31230926513672, + "logps/rejected": -39.352806091308594, + "loss": 0.6028, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1989113092422485, + "rewards/margins": 0.44402891397476196, + "rewards/rejected": -1.6429402828216553, + "step": 40 + }, + { + "epoch": 0.0777988614800759, + "grad_norm": 3.6882236003875732, + "learning_rate": 8.2e-05, + "logits/chosen": -2.7478814125061035, + "logits/rejected": -2.7577743530273438, + "logps/chosen": -36.97419738769531, + "logps/rejected": -41.79148864746094, + "loss": 0.5367, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.360243320465088, + "rewards/margins": 0.6748560667037964, + "rewards/rejected": -2.0350992679595947, + "step": 41 + }, + { + "epoch": 0.07969639468690702, + "grad_norm": 5.39766263961792, + "learning_rate": 8.4e-05, + "logits/chosen": -2.777827262878418, + "logits/rejected": -2.779998302459717, + "logps/chosen": -34.955528259277344, + "logps/rejected": -40.095359802246094, + "loss": 0.4864, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0647568702697754, + "rewards/margins": 0.7299851179122925, + "rewards/rejected": -1.7947419881820679, + "step": 42 + }, + { + "epoch": 0.08159392789373814, + "grad_norm": 3.68463397026062, + "learning_rate": 8.6e-05, + "logits/chosen": -2.7425332069396973, + "logits/rejected": -2.749401807785034, + "logps/chosen": -33.41716766357422, + "logps/rejected": -44.460105895996094, + "loss": 0.3641, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.0076907873153687, + "rewards/margins": 1.2475147247314453, + "rewards/rejected": -2.2552056312561035, + "step": 43 + }, + { + "epoch": 0.08349146110056926, + "grad_norm": 4.242417335510254, + "learning_rate": 8.800000000000001e-05, + "logits/chosen": -2.7301769256591797, + "logits/rejected": -2.739335298538208, + "logps/chosen": -40.64069747924805, + "logps/rejected": -53.96464538574219, + "loss": 0.3206, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6848864555358887, + "rewards/margins": 1.3943724632263184, + "rewards/rejected": -3.079258918762207, + "step": 44 + }, + { + "epoch": 0.08538899430740038, + "grad_norm": 7.282301902770996, + "learning_rate": 9e-05, + "logits/chosen": -2.766569137573242, + "logits/rejected": -2.7720367908477783, + "logps/chosen": -43.393699645996094, + "logps/rejected": -67.36265563964844, + "loss": 0.3599, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.043391466140747, + "rewards/margins": 2.490874767303467, + "rewards/rejected": -4.534266471862793, + "step": 45 + }, + { + "epoch": 0.0872865275142315, + "grad_norm": 6.472597122192383, + "learning_rate": 9.200000000000001e-05, + "logits/chosen": -2.8010120391845703, + "logits/rejected": -2.7943105697631836, + "logps/chosen": -54.13050079345703, + "logps/rejected": -66.39878845214844, + "loss": 0.692, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.856748342514038, + "rewards/margins": 1.4960048198699951, + "rewards/rejected": -4.352753162384033, + "step": 46 + }, + { + "epoch": 0.08918406072106262, + "grad_norm": 6.507133483886719, + "learning_rate": 9.4e-05, + "logits/chosen": -2.702542543411255, + "logits/rejected": -2.7060205936431885, + "logps/chosen": -58.316917419433594, + "logps/rejected": -62.27170944213867, + "loss": 0.7671, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.5426511764526367, + "rewards/margins": 0.42596814036369324, + "rewards/rejected": -3.9686193466186523, + "step": 47 + }, + { + "epoch": 0.09108159392789374, + "grad_norm": 1.7756693363189697, + "learning_rate": 9.6e-05, + "logits/chosen": -2.7501490116119385, + "logits/rejected": -2.746398687362671, + "logps/chosen": -44.69342803955078, + "logps/rejected": -59.879600524902344, + "loss": 0.2497, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9775663614273071, + "rewards/margins": 1.8432695865631104, + "rewards/rejected": -3.820835590362549, + "step": 48 + }, + { + "epoch": 0.09297912713472485, + "grad_norm": 2.5171730518341064, + "learning_rate": 9.8e-05, + "logits/chosen": -2.7280032634735107, + "logits/rejected": -2.728956699371338, + "logps/chosen": -49.98912048339844, + "logps/rejected": -59.551475524902344, + "loss": 0.549, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6230831146240234, + "rewards/margins": 1.1891345977783203, + "rewards/rejected": -3.8122177124023438, + "step": 49 + }, + { + "epoch": 0.09487666034155598, + "grad_norm": 2.106658935546875, + "learning_rate": 0.0001, + "logits/chosen": -2.7503480911254883, + "logits/rejected": -2.7512266635894775, + "logps/chosen": -42.37887954711914, + "logps/rejected": -49.372154235839844, + "loss": 0.4477, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.683307409286499, + "rewards/margins": 1.000291347503662, + "rewards/rejected": -2.683598518371582, + "step": 50 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 7.144794940948486, + "learning_rate": 0.00010200000000000001, + "logits/chosen": -2.7536375522613525, + "logits/rejected": -2.75874662399292, + "logps/chosen": -40.97515106201172, + "logps/rejected": -44.2424201965332, + "loss": 0.7101, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.8148384094238281, + "rewards/margins": 0.42236870527267456, + "rewards/rejected": -2.2372071743011475, + "step": 51 + }, + { + "epoch": 0.09867172675521822, + "grad_norm": 2.209843158721924, + "learning_rate": 0.00010400000000000001, + "logits/chosen": -2.6992690563201904, + "logits/rejected": -2.6988301277160645, + "logps/chosen": -37.43678283691406, + "logps/rejected": -45.80652618408203, + "loss": 0.3782, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3452003002166748, + "rewards/margins": 1.0832030773162842, + "rewards/rejected": -2.428403615951538, + "step": 52 + }, + { + "epoch": 0.10056925996204934, + "grad_norm": 1.8978558778762817, + "learning_rate": 0.00010600000000000002, + "logits/chosen": -2.763897657394409, + "logits/rejected": -2.768207311630249, + "logps/chosen": -41.63645553588867, + "logps/rejected": -52.406524658203125, + "loss": 0.3655, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7027217149734497, + "rewards/margins": 1.1549346446990967, + "rewards/rejected": -2.857656240463257, + "step": 53 + }, + { + "epoch": 0.10246679316888045, + "grad_norm": 2.3911263942718506, + "learning_rate": 0.00010800000000000001, + "logits/chosen": -2.7382662296295166, + "logits/rejected": -2.7411863803863525, + "logps/chosen": -45.131248474121094, + "logps/rejected": -59.659149169921875, + "loss": 0.3506, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.070969343185425, + "rewards/margins": 1.574415683746338, + "rewards/rejected": -3.6453850269317627, + "step": 54 + }, + { + "epoch": 0.10436432637571158, + "grad_norm": 3.873135566711426, + "learning_rate": 0.00011000000000000002, + "logits/chosen": -2.7719104290008545, + "logits/rejected": -2.7709810733795166, + "logps/chosen": -48.20206069946289, + "logps/rejected": -58.50641632080078, + "loss": 0.4576, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.500065803527832, + "rewards/margins": 1.092839002609253, + "rewards/rejected": -3.592904806137085, + "step": 55 + }, + { + "epoch": 0.1062618595825427, + "grad_norm": 4.552927017211914, + "learning_rate": 0.00011200000000000001, + "logits/chosen": -2.738482713699341, + "logits/rejected": -2.7385125160217285, + "logps/chosen": -38.80826950073242, + "logps/rejected": -52.960838317871094, + "loss": 0.4095, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4881004095077515, + "rewards/margins": 1.5598132610321045, + "rewards/rejected": -3.0479137897491455, + "step": 56 + }, + { + "epoch": 0.10815939278937381, + "grad_norm": 2.9376704692840576, + "learning_rate": 0.00011399999999999999, + "logits/chosen": -2.767287254333496, + "logits/rejected": -2.7738823890686035, + "logps/chosen": -37.63935470581055, + "logps/rejected": -51.30040740966797, + "loss": 0.3822, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.387617826461792, + "rewards/margins": 1.600461483001709, + "rewards/rejected": -2.988079309463501, + "step": 57 + }, + { + "epoch": 0.11005692599620494, + "grad_norm": 4.015516757965088, + "learning_rate": 0.000116, + "logits/chosen": -2.772315502166748, + "logits/rejected": -2.7690601348876953, + "logps/chosen": -47.2697868347168, + "logps/rejected": -54.677337646484375, + "loss": 0.5154, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.417880058288574, + "rewards/margins": 0.9048663377761841, + "rewards/rejected": -3.322746515274048, + "step": 58 + }, + { + "epoch": 0.11195445920303605, + "grad_norm": 4.015993118286133, + "learning_rate": 0.000118, + "logits/chosen": -2.7056713104248047, + "logits/rejected": -2.7002859115600586, + "logps/chosen": -51.202369689941406, + "logps/rejected": -59.07714080810547, + "loss": 0.5931, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.7120983600616455, + "rewards/margins": 0.9703352451324463, + "rewards/rejected": -3.682433605194092, + "step": 59 + }, + { + "epoch": 0.11385199240986717, + "grad_norm": 2.3085007667541504, + "learning_rate": 0.00012, + "logits/chosen": -2.7429823875427246, + "logits/rejected": -2.7343454360961914, + "logps/chosen": -55.68383026123047, + "logps/rejected": -64.50788879394531, + "loss": 0.4269, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2317302227020264, + "rewards/margins": 0.981264054775238, + "rewards/rejected": -4.21299409866333, + "step": 60 + }, + { + "epoch": 0.1157495256166983, + "grad_norm": 2.4124083518981934, + "learning_rate": 0.000122, + "logits/chosen": -2.7513816356658936, + "logits/rejected": -2.7479352951049805, + "logps/chosen": -54.174095153808594, + "logps/rejected": -65.53788757324219, + "loss": 0.3048, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.013425588607788, + "rewards/margins": 1.3886945247650146, + "rewards/rejected": -4.4021196365356445, + "step": 61 + }, + { + "epoch": 0.11764705882352941, + "grad_norm": 2.433696746826172, + "learning_rate": 0.000124, + "logits/chosen": -2.783195734024048, + "logits/rejected": -2.778404474258423, + "logps/chosen": -51.3481559753418, + "logps/rejected": -70.99629211425781, + "loss": 0.319, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.688727378845215, + "rewards/margins": 2.164720058441162, + "rewards/rejected": -4.853447914123535, + "step": 62 + }, + { + "epoch": 0.11954459203036052, + "grad_norm": 2.1883819103240967, + "learning_rate": 0.000126, + "logits/chosen": -2.746483564376831, + "logits/rejected": -2.736555337905884, + "logps/chosen": -48.178592681884766, + "logps/rejected": -70.20449829101562, + "loss": 0.1814, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.385026693344116, + "rewards/margins": 2.3813328742980957, + "rewards/rejected": -4.766359329223633, + "step": 63 + }, + { + "epoch": 0.12144212523719165, + "grad_norm": 2.765345811843872, + "learning_rate": 0.00012800000000000002, + "logits/chosen": -2.810784339904785, + "logits/rejected": -2.810107946395874, + "logps/chosen": -55.56414794921875, + "logps/rejected": -79.71456909179688, + "loss": 0.2055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.27866530418396, + "rewards/margins": 2.5508813858032227, + "rewards/rejected": -5.829546928405762, + "step": 64 + }, + { + "epoch": 0.12333965844402277, + "grad_norm": 6.344622611999512, + "learning_rate": 0.00013000000000000002, + "logits/chosen": -2.814685583114624, + "logits/rejected": -2.805901527404785, + "logps/chosen": -44.5737419128418, + "logps/rejected": -73.86512756347656, + "loss": 0.1899, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0165200233459473, + "rewards/margins": 3.251584053039551, + "rewards/rejected": -5.268104553222656, + "step": 65 + }, + { + "epoch": 0.1252371916508539, + "grad_norm": 3.388427972793579, + "learning_rate": 0.000132, + "logits/chosen": -2.798591136932373, + "logits/rejected": -2.8005106449127197, + "logps/chosen": -36.04314422607422, + "logps/rejected": -50.72333526611328, + "loss": 0.3848, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1693958044052124, + "rewards/margins": 1.6317634582519531, + "rewards/rejected": -2.801159381866455, + "step": 66 + }, + { + "epoch": 0.127134724857685, + "grad_norm": 2.023927688598633, + "learning_rate": 0.000134, + "logits/chosen": -2.81923508644104, + "logits/rejected": -2.815878391265869, + "logps/chosen": -28.859542846679688, + "logps/rejected": -41.972694396972656, + "loss": 0.2323, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40116792917251587, + "rewards/margins": 1.6016311645507812, + "rewards/rejected": -2.0027990341186523, + "step": 67 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 3.041720151901245, + "learning_rate": 0.00013600000000000003, + "logits/chosen": -2.782683849334717, + "logits/rejected": -2.7856640815734863, + "logps/chosen": -32.9669189453125, + "logps/rejected": -49.34394836425781, + "loss": 0.3746, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.0322561264038086, + "rewards/margins": 1.6814837455749512, + "rewards/rejected": -2.7137398719787598, + "step": 68 + }, + { + "epoch": 0.13092979127134724, + "grad_norm": 7.477070331573486, + "learning_rate": 0.000138, + "logits/chosen": -2.747135639190674, + "logits/rejected": -2.747574806213379, + "logps/chosen": -36.88587188720703, + "logps/rejected": -66.01129150390625, + "loss": 0.2206, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.325242519378662, + "rewards/margins": 2.977128744125366, + "rewards/rejected": -4.302371025085449, + "step": 69 + }, + { + "epoch": 0.13282732447817835, + "grad_norm": 6.996569633483887, + "learning_rate": 0.00014, + "logits/chosen": -2.7544913291931152, + "logits/rejected": -2.7490170001983643, + "logps/chosen": -38.83042907714844, + "logps/rejected": -65.96528625488281, + "loss": 0.3555, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2973248958587646, + "rewards/margins": 3.1091742515563965, + "rewards/rejected": -4.406498908996582, + "step": 70 + }, + { + "epoch": 0.1347248576850095, + "grad_norm": 6.556032180786133, + "learning_rate": 0.000142, + "logits/chosen": -2.7899088859558105, + "logits/rejected": -2.7880430221557617, + "logps/chosen": -43.366214752197266, + "logps/rejected": -61.33940124511719, + "loss": 0.2978, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9949687719345093, + "rewards/margins": 1.9916281700134277, + "rewards/rejected": -3.9865970611572266, + "step": 71 + }, + { + "epoch": 0.1366223908918406, + "grad_norm": 2.458948850631714, + "learning_rate": 0.000144, + "logits/chosen": -2.792599678039551, + "logits/rejected": -2.7890465259552, + "logps/chosen": -43.916316986083984, + "logps/rejected": -55.91321563720703, + "loss": 0.341, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9487779140472412, + "rewards/margins": 1.4445126056671143, + "rewards/rejected": -3.3932905197143555, + "step": 72 + }, + { + "epoch": 0.13851992409867173, + "grad_norm": 4.216429710388184, + "learning_rate": 0.000146, + "logits/chosen": -2.8028111457824707, + "logits/rejected": -2.805846929550171, + "logps/chosen": -45.717369079589844, + "logps/rejected": -54.00206756591797, + "loss": 0.5921, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.3208117485046387, + "rewards/margins": 0.9625445008277893, + "rewards/rejected": -3.283356189727783, + "step": 73 + }, + { + "epoch": 0.14041745730550284, + "grad_norm": 1.7848654985427856, + "learning_rate": 0.000148, + "logits/chosen": -2.7844631671905518, + "logits/rejected": -2.7775328159332275, + "logps/chosen": -45.630828857421875, + "logps/rejected": -57.7205696105957, + "loss": 0.3173, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1562933921813965, + "rewards/margins": 1.4049997329711914, + "rewards/rejected": -3.561293125152588, + "step": 74 + }, + { + "epoch": 0.14231499051233396, + "grad_norm": 4.6799750328063965, + "learning_rate": 0.00015000000000000001, + "logits/chosen": -2.7869224548339844, + "logits/rejected": -2.7798612117767334, + "logps/chosen": -57.45626449584961, + "logps/rejected": -61.46935272216797, + "loss": 0.6529, + "rewards/accuracies": 0.5625, + "rewards/chosen": -3.3889098167419434, + "rewards/margins": 0.6126527190208435, + "rewards/rejected": -4.001562118530273, + "step": 75 + }, + { + "epoch": 0.1442125237191651, + "grad_norm": 2.800229787826538, + "learning_rate": 0.000152, + "logits/chosen": -2.832784652709961, + "logits/rejected": -2.828519582748413, + "logps/chosen": -52.276004791259766, + "logps/rejected": -63.41943359375, + "loss": 0.4297, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.8967790603637695, + "rewards/margins": 1.1908848285675049, + "rewards/rejected": -4.087663650512695, + "step": 76 + }, + { + "epoch": 0.1461100569259962, + "grad_norm": 3.5293054580688477, + "learning_rate": 0.000154, + "logits/chosen": -2.791073799133301, + "logits/rejected": -2.789182662963867, + "logps/chosen": -52.498924255371094, + "logps/rejected": -68.02275848388672, + "loss": 0.3526, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7176871299743652, + "rewards/margins": 1.8619203567504883, + "rewards/rejected": -4.579607009887695, + "step": 77 + }, + { + "epoch": 0.14800759013282733, + "grad_norm": 5.139841556549072, + "learning_rate": 0.00015600000000000002, + "logits/chosen": -2.801218271255493, + "logits/rejected": -2.798741102218628, + "logps/chosen": -39.000831604003906, + "logps/rejected": -56.16624450683594, + "loss": 0.5178, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.4603410959243774, + "rewards/margins": 1.9403096437454224, + "rewards/rejected": -3.400650978088379, + "step": 78 + }, + { + "epoch": 0.14990512333965844, + "grad_norm": 3.5403361320495605, + "learning_rate": 0.00015800000000000002, + "logits/chosen": -2.8410701751708984, + "logits/rejected": -2.8384857177734375, + "logps/chosen": -36.62928009033203, + "logps/rejected": -45.52888488769531, + "loss": 0.5752, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.2457019090652466, + "rewards/margins": 1.257145643234253, + "rewards/rejected": -2.502847671508789, + "step": 79 + }, + { + "epoch": 0.15180265654648956, + "grad_norm": 2.7179343700408936, + "learning_rate": 0.00016, + "logits/chosen": -2.806529998779297, + "logits/rejected": -2.806776762008667, + "logps/chosen": -36.37416076660156, + "logps/rejected": -49.32950973510742, + "loss": 0.3537, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3248584270477295, + "rewards/margins": 1.3763021230697632, + "rewards/rejected": -2.701160430908203, + "step": 80 + }, + { + "epoch": 0.15370018975332067, + "grad_norm": 3.5153768062591553, + "learning_rate": 0.000162, + "logits/chosen": -2.786527633666992, + "logits/rejected": -2.793896198272705, + "logps/chosen": -31.23716926574707, + "logps/rejected": -42.66054153442383, + "loss": 0.3014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7100868821144104, + "rewards/margins": 1.4225895404815674, + "rewards/rejected": -2.132676601409912, + "step": 81 + }, + { + "epoch": 0.1555977229601518, + "grad_norm": 4.196669101715088, + "learning_rate": 0.000164, + "logits/chosen": -2.8289337158203125, + "logits/rejected": -2.824960470199585, + "logps/chosen": -30.018644332885742, + "logps/rejected": -35.070648193359375, + "loss": 0.6769, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7271355986595154, + "rewards/margins": 0.5995543599128723, + "rewards/rejected": -1.3266899585723877, + "step": 82 + }, + { + "epoch": 0.15749525616698293, + "grad_norm": 4.570925235748291, + "learning_rate": 0.000166, + "logits/chosen": -2.8203651905059814, + "logits/rejected": -2.8185455799102783, + "logps/chosen": -44.2990608215332, + "logps/rejected": -52.1357536315918, + "loss": 0.6825, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.085677146911621, + "rewards/margins": 0.8163881897926331, + "rewards/rejected": -2.9020657539367676, + "step": 83 + }, + { + "epoch": 0.15939278937381404, + "grad_norm": 1.6578748226165771, + "learning_rate": 0.000168, + "logits/chosen": -2.781203031539917, + "logits/rejected": -2.7794113159179688, + "logps/chosen": -45.45800018310547, + "logps/rejected": -62.20787811279297, + "loss": 0.2852, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.2425527572631836, + "rewards/margins": 1.843325138092041, + "rewards/rejected": -4.085877895355225, + "step": 84 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 4.050334453582764, + "learning_rate": 0.00017, + "logits/chosen": -2.8100481033325195, + "logits/rejected": -2.8053321838378906, + "logps/chosen": -47.41972351074219, + "logps/rejected": -58.743255615234375, + "loss": 0.5081, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.2062716484069824, + "rewards/margins": 1.4793806076049805, + "rewards/rejected": -3.685652256011963, + "step": 85 + }, + { + "epoch": 0.16318785578747627, + "grad_norm": 3.0159974098205566, + "learning_rate": 0.000172, + "logits/chosen": -2.758812427520752, + "logits/rejected": -2.7559688091278076, + "logps/chosen": -55.07322311401367, + "logps/rejected": -65.33331298828125, + "loss": 0.3226, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1052637100219727, + "rewards/margins": 1.3766382932662964, + "rewards/rejected": -4.481902122497559, + "step": 86 + }, + { + "epoch": 0.1650853889943074, + "grad_norm": 4.908008098602295, + "learning_rate": 0.000174, + "logits/chosen": -2.80372953414917, + "logits/rejected": -2.793184280395508, + "logps/chosen": -59.20977020263672, + "logps/rejected": -65.16618347167969, + "loss": 0.6049, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.495293140411377, + "rewards/margins": 0.8276402950286865, + "rewards/rejected": -4.322933197021484, + "step": 87 + }, + { + "epoch": 0.16698292220113853, + "grad_norm": 2.5410759449005127, + "learning_rate": 0.00017600000000000002, + "logits/chosen": -2.777139186859131, + "logits/rejected": -2.7644989490509033, + "logps/chosen": -56.76138687133789, + "logps/rejected": -73.66291809082031, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2906441688537598, + "rewards/margins": 1.8566827774047852, + "rewards/rejected": -5.147326946258545, + "step": 88 + }, + { + "epoch": 0.16888045540796964, + "grad_norm": 3.6899871826171875, + "learning_rate": 0.00017800000000000002, + "logits/chosen": -2.778470277786255, + "logits/rejected": -2.774235725402832, + "logps/chosen": -54.31599426269531, + "logps/rejected": -70.16642761230469, + "loss": 0.3418, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.986976385116577, + "rewards/margins": 1.7881888151168823, + "rewards/rejected": -4.775165557861328, + "step": 89 + }, + { + "epoch": 0.17077798861480076, + "grad_norm": 4.643296241760254, + "learning_rate": 0.00018, + "logits/chosen": -2.7810251712799072, + "logits/rejected": -2.7724738121032715, + "logps/chosen": -60.76677322387695, + "logps/rejected": -76.55674743652344, + "loss": 0.6284, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.654345989227295, + "rewards/margins": 1.6196839809417725, + "rewards/rejected": -5.274029731750488, + "step": 90 + }, + { + "epoch": 0.17267552182163187, + "grad_norm": 8.180418968200684, + "learning_rate": 0.000182, + "logits/chosen": -2.819366931915283, + "logits/rejected": -2.807485342025757, + "logps/chosen": -51.31687927246094, + "logps/rejected": -69.36310577392578, + "loss": 0.4503, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7893166542053223, + "rewards/margins": 2.0452799797058105, + "rewards/rejected": -4.834596633911133, + "step": 91 + }, + { + "epoch": 0.174573055028463, + "grad_norm": 4.3448686599731445, + "learning_rate": 0.00018400000000000003, + "logits/chosen": -2.824305772781372, + "logits/rejected": -2.8248698711395264, + "logps/chosen": -45.58390808105469, + "logps/rejected": -57.605201721191406, + "loss": 0.5903, + "rewards/accuracies": 0.625, + "rewards/chosen": -2.0943570137023926, + "rewards/margins": 1.3772242069244385, + "rewards/rejected": -3.471581220626831, + "step": 92 + }, + { + "epoch": 0.17647058823529413, + "grad_norm": 2.931424856185913, + "learning_rate": 0.00018600000000000002, + "logits/chosen": -2.8630051612854004, + "logits/rejected": -2.8607943058013916, + "logps/chosen": -41.100894927978516, + "logps/rejected": -49.50748825073242, + "loss": 0.463, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.815004825592041, + "rewards/margins": 0.9866312146186829, + "rewards/rejected": -2.801636219024658, + "step": 93 + }, + { + "epoch": 0.17836812144212524, + "grad_norm": 2.702983856201172, + "learning_rate": 0.000188, + "logits/chosen": -2.8779056072235107, + "logits/rejected": -2.8776400089263916, + "logps/chosen": -42.185447692871094, + "logps/rejected": -50.3919792175293, + "loss": 0.4895, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.823110818862915, + "rewards/margins": 1.0870286226272583, + "rewards/rejected": -2.910139560699463, + "step": 94 + }, + { + "epoch": 0.18026565464895636, + "grad_norm": 4.391569137573242, + "learning_rate": 0.00019, + "logits/chosen": -2.777017116546631, + "logits/rejected": -2.7851855754852295, + "logps/chosen": -29.167160034179688, + "logps/rejected": -43.34316635131836, + "loss": 0.2765, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.45135945081710815, + "rewards/margins": 1.7064999341964722, + "rewards/rejected": -2.1578593254089355, + "step": 95 + }, + { + "epoch": 0.18216318785578747, + "grad_norm": 4.305187702178955, + "learning_rate": 0.000192, + "logits/chosen": -2.8385465145111084, + "logits/rejected": -2.83952260017395, + "logps/chosen": -38.421730041503906, + "logps/rejected": -50.840003967285156, + "loss": 0.3135, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.460430383682251, + "rewards/margins": 1.4080810546875, + "rewards/rejected": -2.868511199951172, + "step": 96 + }, + { + "epoch": 0.1840607210626186, + "grad_norm": 12.295331001281738, + "learning_rate": 0.000194, + "logits/chosen": -2.830328941345215, + "logits/rejected": -2.8286261558532715, + "logps/chosen": -36.80898666381836, + "logps/rejected": -48.225921630859375, + "loss": 0.6485, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2993323802947998, + "rewards/margins": 1.328029751777649, + "rewards/rejected": -2.627362012863159, + "step": 97 + }, + { + "epoch": 0.1859582542694497, + "grad_norm": 6.877435207366943, + "learning_rate": 0.000196, + "logits/chosen": -2.8228061199188232, + "logits/rejected": -2.826436996459961, + "logps/chosen": -37.87078857421875, + "logps/rejected": -51.07941436767578, + "loss": 0.6151, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.410650372505188, + "rewards/margins": 1.5915892124176025, + "rewards/rejected": -3.00223970413208, + "step": 98 + }, + { + "epoch": 0.18785578747628084, + "grad_norm": 4.5217509269714355, + "learning_rate": 0.00019800000000000002, + "logits/chosen": -2.8376476764678955, + "logits/rejected": -2.836334466934204, + "logps/chosen": -33.52427673339844, + "logps/rejected": -45.65314483642578, + "loss": 0.3943, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0066884756088257, + "rewards/margins": 1.3794984817504883, + "rewards/rejected": -2.3861870765686035, + "step": 99 + }, + { + "epoch": 0.18975332068311196, + "grad_norm": 1.6153665781021118, + "learning_rate": 0.0002, + "logits/chosen": -2.809156656265259, + "logits/rejected": -2.817115068435669, + "logps/chosen": -30.36741065979004, + "logps/rejected": -32.7757453918457, + "loss": 0.5425, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.783231258392334, + "rewards/margins": 0.42554494738578796, + "rewards/rejected": -1.2087762355804443, + "step": 100 + }, + { + "epoch": 0.19165085388994307, + "grad_norm": 1.3271561861038208, + "learning_rate": 0.00019999729347501484, + "logits/chosen": -2.8080809116363525, + "logits/rejected": -2.8086583614349365, + "logps/chosen": -28.36756706237793, + "logps/rejected": -33.18540954589844, + "loss": 0.4887, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4251585602760315, + "rewards/margins": 0.5602906942367554, + "rewards/rejected": -0.9854491949081421, + "step": 101 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 2.151851177215576, + "learning_rate": 0.00019998917404656487, + "logits/chosen": -2.776926040649414, + "logits/rejected": -2.7729344367980957, + "logps/chosen": -35.40061569213867, + "logps/rejected": -35.295692443847656, + "loss": 0.5817, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9625377655029297, + "rewards/margins": 0.3408290147781372, + "rewards/rejected": -1.3033668994903564, + "step": 102 + }, + { + "epoch": 0.1954459203036053, + "grad_norm": 1.497015357017517, + "learning_rate": 0.00019997564215415884, + "logits/chosen": -2.8229072093963623, + "logits/rejected": -2.8220646381378174, + "logps/chosen": -34.33465576171875, + "logps/rejected": -38.946231842041016, + "loss": 0.4599, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.002916693687439, + "rewards/margins": 0.6433842182159424, + "rewards/rejected": -1.6463007926940918, + "step": 103 + }, + { + "epoch": 0.19734345351043645, + "grad_norm": 2.398622989654541, + "learning_rate": 0.00019995669853028485, + "logits/chosen": -2.835087299346924, + "logits/rejected": -2.8351354598999023, + "logps/chosen": -38.583656311035156, + "logps/rejected": -42.82282257080078, + "loss": 0.4885, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4459500312805176, + "rewards/margins": 0.5609026551246643, + "rewards/rejected": -2.006852626800537, + "step": 104 + }, + { + "epoch": 0.19924098671726756, + "grad_norm": 4.335348606109619, + "learning_rate": 0.00019993234420037073, + "logits/chosen": -2.898766279220581, + "logits/rejected": -2.8957412242889404, + "logps/chosen": -38.778602600097656, + "logps/rejected": -40.92530822753906, + "loss": 0.5875, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5738805532455444, + "rewards/margins": 0.4367479085922241, + "rewards/rejected": -2.0106287002563477, + "step": 105 + }, + { + "epoch": 0.20113851992409867, + "grad_norm": 4.584505081176758, + "learning_rate": 0.0001999025804827285, + "logits/chosen": -2.8751659393310547, + "logits/rejected": -2.875046968460083, + "logps/chosen": -39.396610260009766, + "logps/rejected": -39.287452697753906, + "loss": 0.7515, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.5874710083007812, + "rewards/margins": 0.11016285419464111, + "rewards/rejected": -1.6976337432861328, + "step": 106 + }, + { + "epoch": 0.2030360531309298, + "grad_norm": 3.5621519088745117, + "learning_rate": 0.00019986740898848306, + "logits/chosen": -2.8922340869903564, + "logits/rejected": -2.8920042514801025, + "logps/chosen": -43.401451110839844, + "logps/rejected": -48.335975646972656, + "loss": 0.6008, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9959304332733154, + "rewards/margins": 0.6039863228797913, + "rewards/rejected": -2.599916696548462, + "step": 107 + }, + { + "epoch": 0.2049335863377609, + "grad_norm": 2.4933717250823975, + "learning_rate": 0.000199826831621485, + "logits/chosen": -2.91758394241333, + "logits/rejected": -2.9207208156585693, + "logps/chosen": -47.70870590209961, + "logps/rejected": -52.232574462890625, + "loss": 0.5479, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.437047004699707, + "rewards/margins": 0.527195155620575, + "rewards/rejected": -2.9642422199249268, + "step": 108 + }, + { + "epoch": 0.20683111954459202, + "grad_norm": 3.197727918624878, + "learning_rate": 0.0001997808505782075, + "logits/chosen": -2.9473459720611572, + "logits/rejected": -2.949979305267334, + "logps/chosen": -45.37776184082031, + "logps/rejected": -47.56669616699219, + "loss": 0.5926, + "rewards/accuracies": 0.5625, + "rewards/chosen": -2.0948638916015625, + "rewards/margins": 0.5222463607788086, + "rewards/rejected": -2.617110013961792, + "step": 109 + }, + { + "epoch": 0.20872865275142316, + "grad_norm": 2.204061985015869, + "learning_rate": 0.0001997294683476273, + "logits/chosen": -2.9493114948272705, + "logits/rejected": -2.9442033767700195, + "logps/chosen": -45.5977668762207, + "logps/rejected": -52.712066650390625, + "loss": 0.3879, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1548056602478027, + "rewards/margins": 0.9166700839996338, + "rewards/rejected": -3.0714759826660156, + "step": 110 + }, + { + "epoch": 0.21062618595825428, + "grad_norm": 4.2725300788879395, + "learning_rate": 0.00019967268771109035, + "logits/chosen": -2.9485433101654053, + "logits/rejected": -2.9486706256866455, + "logps/chosen": -41.39335250854492, + "logps/rejected": -51.77354431152344, + "loss": 0.3888, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7500886917114258, + "rewards/margins": 1.1263270378112793, + "rewards/rejected": -2.876415729522705, + "step": 111 + }, + { + "epoch": 0.2125237191650854, + "grad_norm": 2.504917860031128, + "learning_rate": 0.00019961051174216082, + "logits/chosen": -2.9456284046173096, + "logits/rejected": -2.949559211730957, + "logps/chosen": -43.10253143310547, + "logps/rejected": -58.495235443115234, + "loss": 0.3659, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8653165102005005, + "rewards/margins": 1.712073802947998, + "rewards/rejected": -3.577390193939209, + "step": 112 + }, + { + "epoch": 0.2144212523719165, + "grad_norm": 3.238914728164673, + "learning_rate": 0.00019954294380645498, + "logits/chosen": -2.925440549850464, + "logits/rejected": -2.9304261207580566, + "logps/chosen": -43.154396057128906, + "logps/rejected": -60.708831787109375, + "loss": 0.421, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8937389850616455, + "rewards/margins": 1.8209748268127441, + "rewards/rejected": -3.7147140502929688, + "step": 113 + }, + { + "epoch": 0.21631878557874762, + "grad_norm": 6.110694408416748, + "learning_rate": 0.0001994699875614589, + "logits/chosen": -2.9304544925689697, + "logits/rejected": -2.9413235187530518, + "logps/chosen": -45.33203887939453, + "logps/rejected": -58.13191223144531, + "loss": 0.4857, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.097524642944336, + "rewards/margins": 1.4301856756210327, + "rewards/rejected": -3.527710437774658, + "step": 114 + }, + { + "epoch": 0.21821631878557876, + "grad_norm": 3.7045321464538574, + "learning_rate": 0.00019939164695633067, + "logits/chosen": -2.9360766410827637, + "logits/rejected": -2.936542272567749, + "logps/chosen": -51.745140075683594, + "logps/rejected": -66.68146514892578, + "loss": 0.5156, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.90883731842041, + "rewards/margins": 1.5629081726074219, + "rewards/rejected": -4.471745491027832, + "step": 115 + }, + { + "epoch": 0.22011385199240988, + "grad_norm": 4.1607136726379395, + "learning_rate": 0.00019930792623168637, + "logits/chosen": -2.929075002670288, + "logits/rejected": -2.9353854656219482, + "logps/chosen": -59.28025436401367, + "logps/rejected": -75.60289001464844, + "loss": 0.2769, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.515871047973633, + "rewards/margins": 1.8124377727508545, + "rewards/rejected": -5.328309059143066, + "step": 116 + }, + { + "epoch": 0.222011385199241, + "grad_norm": 3.711951494216919, + "learning_rate": 0.0001992188299193706, + "logits/chosen": -2.948878049850464, + "logits/rejected": -2.9539246559143066, + "logps/chosen": -52.6070442199707, + "logps/rejected": -68.63351440429688, + "loss": 0.3944, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7233872413635254, + "rewards/margins": 1.939584493637085, + "rewards/rejected": -4.662971496582031, + "step": 117 + }, + { + "epoch": 0.2239089184060721, + "grad_norm": 4.125283718109131, + "learning_rate": 0.00019912436284221134, + "logits/chosen": -2.8983044624328613, + "logits/rejected": -2.9071972370147705, + "logps/chosen": -59.11683654785156, + "logps/rejected": -71.92567443847656, + "loss": 0.3953, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.552089214324951, + "rewards/margins": 1.6068460941314697, + "rewards/rejected": -5.158935546875, + "step": 118 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 4.579989433288574, + "learning_rate": 0.00019902453011375865, + "logits/chosen": -2.9441299438476562, + "logits/rejected": -2.9400062561035156, + "logps/chosen": -54.54322052001953, + "logps/rejected": -62.37023162841797, + "loss": 0.4215, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.9585518836975098, + "rewards/margins": 1.0481977462768555, + "rewards/rejected": -4.006749629974365, + "step": 119 + }, + { + "epoch": 0.22770398481973433, + "grad_norm": 2.091538667678833, + "learning_rate": 0.00019891933713800798, + "logits/chosen": -2.8959543704986572, + "logits/rejected": -2.907708168029785, + "logps/chosen": -52.33782958984375, + "logps/rejected": -80.5691909790039, + "loss": 0.1493, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.778528928756714, + "rewards/margins": 3.066986560821533, + "rewards/rejected": -5.845515251159668, + "step": 120 + }, + { + "epoch": 0.22960151802656548, + "grad_norm": 1.603954553604126, + "learning_rate": 0.00019880878960910772, + "logits/chosen": -2.887864828109741, + "logits/rejected": -2.8939895629882812, + "logps/chosen": -44.32196044921875, + "logps/rejected": -83.90393829345703, + "loss": 0.1169, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9681143760681152, + "rewards/margins": 4.234763145446777, + "rewards/rejected": -6.202877998352051, + "step": 121 + }, + { + "epoch": 0.2314990512333966, + "grad_norm": 17.788175582885742, + "learning_rate": 0.00019869289351105086, + "logits/chosen": -2.88763427734375, + "logits/rejected": -2.8800721168518066, + "logps/chosen": -63.583621978759766, + "logps/rejected": -76.61012268066406, + "loss": 0.9297, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.007019519805908, + "rewards/margins": 1.262047290802002, + "rewards/rejected": -5.26906681060791, + "step": 122 + }, + { + "epoch": 0.2333965844402277, + "grad_norm": 4.482125282287598, + "learning_rate": 0.00019857165511735103, + "logits/chosen": -2.876539707183838, + "logits/rejected": -2.8815243244171143, + "logps/chosen": -70.19147491455078, + "logps/rejected": -86.98196411132812, + "loss": 0.6258, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.761369705200195, + "rewards/margins": 1.8523237705230713, + "rewards/rejected": -6.6136932373046875, + "step": 123 + }, + { + "epoch": 0.23529411764705882, + "grad_norm": 4.31642484664917, + "learning_rate": 0.0001984450809907031, + "logits/chosen": -2.8928816318511963, + "logits/rejected": -2.894294023513794, + "logps/chosen": -52.248863220214844, + "logps/rejected": -76.62648010253906, + "loss": 0.1486, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7789430618286133, + "rewards/margins": 2.767739772796631, + "rewards/rejected": -5.546682834625244, + "step": 124 + }, + { + "epoch": 0.23719165085388993, + "grad_norm": 3.312211275100708, + "learning_rate": 0.00019831317798262786, + "logits/chosen": -2.8706130981445312, + "logits/rejected": -2.8669066429138184, + "logps/chosen": -46.89989471435547, + "logps/rejected": -65.01393127441406, + "loss": 0.3462, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1963088512420654, + "rewards/margins": 2.007949113845825, + "rewards/rejected": -4.204257965087891, + "step": 125 + }, + { + "epoch": 0.23908918406072105, + "grad_norm": 3.9472641944885254, + "learning_rate": 0.00019817595323310097, + "logits/chosen": -2.880319356918335, + "logits/rejected": -2.884037494659424, + "logps/chosen": -43.69145202636719, + "logps/rejected": -59.259368896484375, + "loss": 0.4902, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.0041704177856445, + "rewards/margins": 1.6014294624328613, + "rewards/rejected": -3.605599880218506, + "step": 126 + }, + { + "epoch": 0.2409867172675522, + "grad_norm": 3.1087348461151123, + "learning_rate": 0.0001980334141701667, + "logits/chosen": -2.848803997039795, + "logits/rejected": -2.841339588165283, + "logps/chosen": -42.932640075683594, + "logps/rejected": -45.115997314453125, + "loss": 0.6577, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.0474021434783936, + "rewards/margins": 0.36965787410736084, + "rewards/rejected": -2.417059898376465, + "step": 127 + }, + { + "epoch": 0.2428842504743833, + "grad_norm": 1.647839069366455, + "learning_rate": 0.0001978855685095358, + "logits/chosen": -2.8491506576538086, + "logits/rejected": -2.840852975845337, + "logps/chosen": -39.779205322265625, + "logps/rejected": -48.12335968017578, + "loss": 0.383, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.630105972290039, + "rewards/margins": 1.0952038764953613, + "rewards/rejected": -2.7253098487854004, + "step": 128 + }, + { + "epoch": 0.24478178368121442, + "grad_norm": 1.2736908197402954, + "learning_rate": 0.00019773242425416768, + "logits/chosen": -2.934404134750366, + "logits/rejected": -2.9313926696777344, + "logps/chosen": -42.447059631347656, + "logps/rejected": -54.27853775024414, + "loss": 0.3232, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8265783786773682, + "rewards/margins": 1.4815216064453125, + "rewards/rejected": -3.3081002235412598, + "step": 129 + }, + { + "epoch": 0.24667931688804554, + "grad_norm": 1.8245573043823242, + "learning_rate": 0.0001975739896938375, + "logits/chosen": -2.9123284816741943, + "logits/rejected": -2.909496545791626, + "logps/chosen": -36.656558990478516, + "logps/rejected": -40.601043701171875, + "loss": 0.5459, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.2777769565582275, + "rewards/margins": 0.5270283222198486, + "rewards/rejected": -1.8048052787780762, + "step": 130 + }, + { + "epoch": 0.24857685009487665, + "grad_norm": 1.4160490036010742, + "learning_rate": 0.00019741027340468715, + "logits/chosen": -2.9194860458374023, + "logits/rejected": -2.9193787574768066, + "logps/chosen": -31.82858657836914, + "logps/rejected": -51.616615295410156, + "loss": 0.2742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.807111382484436, + "rewards/margins": 2.136796712875366, + "rewards/rejected": -2.943908214569092, + "step": 131 + }, + { + "epoch": 0.2504743833017078, + "grad_norm": 2.3211324214935303, + "learning_rate": 0.00019724128424876116, + "logits/chosen": -2.9083187580108643, + "logits/rejected": -2.9106099605560303, + "logps/chosen": -32.994529724121094, + "logps/rejected": -48.345680236816406, + "loss": 0.4002, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8857762813568115, + "rewards/margins": 1.6735560894012451, + "rewards/rejected": -2.5593323707580566, + "step": 132 + }, + { + "epoch": 0.2523719165085389, + "grad_norm": 2.762873888015747, + "learning_rate": 0.00019706703137352695, + "logits/chosen": -2.9064345359802246, + "logits/rejected": -2.9009690284729004, + "logps/chosen": -35.646888732910156, + "logps/rejected": -56.55396270751953, + "loss": 0.1766, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.0316578149795532, + "rewards/margins": 2.3745384216308594, + "rewards/rejected": -3.406196355819702, + "step": 133 + }, + { + "epoch": 0.25426944971537, + "grad_norm": 3.811307907104492, + "learning_rate": 0.0001968875242113798, + "logits/chosen": -2.9285547733306885, + "logits/rejected": -2.928537130355835, + "logps/chosen": -39.112098693847656, + "logps/rejected": -56.93737030029297, + "loss": 0.3068, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5674068927764893, + "rewards/margins": 1.9389293193817139, + "rewards/rejected": -3.506336212158203, + "step": 134 + }, + { + "epoch": 0.25616698292220114, + "grad_norm": 3.130340337753296, + "learning_rate": 0.00019670277247913205, + "logits/chosen": -2.915271520614624, + "logits/rejected": -2.9155397415161133, + "logps/chosen": -41.27661895751953, + "logps/rejected": -58.242916107177734, + "loss": 0.2538, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7179136276245117, + "rewards/margins": 1.8386231660842896, + "rewards/rejected": -3.556536912918091, + "step": 135 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.0148847103118896, + "learning_rate": 0.0001965127861774873, + "logits/chosen": -2.9431655406951904, + "logits/rejected": -2.9385225772857666, + "logps/chosen": -53.51545715332031, + "logps/rejected": -75.12637329101562, + "loss": 0.1931, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.833062171936035, + "rewards/margins": 2.448612689971924, + "rewards/rejected": -5.281674861907959, + "step": 136 + }, + { + "epoch": 0.25996204933586337, + "grad_norm": 3.7195470333099365, + "learning_rate": 0.00019631757559049898, + "logits/chosen": -2.9738211631774902, + "logits/rejected": -2.9678642749786377, + "logps/chosen": -56.815650939941406, + "logps/rejected": -70.7271728515625, + "loss": 0.4445, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.2865800857543945, + "rewards/margins": 1.6364598274230957, + "rewards/rejected": -4.923039436340332, + "step": 137 + }, + { + "epoch": 0.2618595825426945, + "grad_norm": 1.2523024082183838, + "learning_rate": 0.00019611715128501378, + "logits/chosen": -2.9443469047546387, + "logits/rejected": -2.9399166107177734, + "logps/chosen": -65.968505859375, + "logps/rejected": -84.37772369384766, + "loss": 0.2675, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.088393688201904, + "rewards/margins": 2.0679855346679688, + "rewards/rejected": -6.156379222869873, + "step": 138 + }, + { + "epoch": 0.2637571157495256, + "grad_norm": 1.1542834043502808, + "learning_rate": 0.0001959115241100994, + "logits/chosen": -2.975813865661621, + "logits/rejected": -2.970766544342041, + "logps/chosen": -64.42146301269531, + "logps/rejected": -84.94852447509766, + "loss": 0.2146, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.040821552276611, + "rewards/margins": 2.274022102355957, + "rewards/rejected": -6.314843654632568, + "step": 139 + }, + { + "epoch": 0.2656546489563567, + "grad_norm": 1.7230674028396606, + "learning_rate": 0.00019570070519645767, + "logits/chosen": -2.9944310188293457, + "logits/rejected": -2.9947562217712402, + "logps/chosen": -58.38348388671875, + "logps/rejected": -79.51581573486328, + "loss": 0.2081, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.408212661743164, + "rewards/margins": 2.3429977893829346, + "rewards/rejected": -5.751210689544678, + "step": 140 + }, + { + "epoch": 0.2675521821631879, + "grad_norm": 1.7467715740203857, + "learning_rate": 0.00019548470595582166, + "logits/chosen": -2.96416974067688, + "logits/rejected": -2.9619786739349365, + "logps/chosen": -66.57093048095703, + "logps/rejected": -83.50373077392578, + "loss": 0.3606, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.171794414520264, + "rewards/margins": 1.9003512859344482, + "rewards/rejected": -6.072145462036133, + "step": 141 + }, + { + "epoch": 0.269449715370019, + "grad_norm": 4.687629222869873, + "learning_rate": 0.00019526353808033825, + "logits/chosen": -3.0060713291168213, + "logits/rejected": -3.0069186687469482, + "logps/chosen": -49.43899917602539, + "logps/rejected": -76.19532775878906, + "loss": 0.3784, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.521219253540039, + "rewards/margins": 2.774571180343628, + "rewards/rejected": -5.295790195465088, + "step": 142 + }, + { + "epoch": 0.2713472485768501, + "grad_norm": 6.6354451179504395, + "learning_rate": 0.00019503721354193504, + "logits/chosen": -2.9884586334228516, + "logits/rejected": -2.9861557483673096, + "logps/chosen": -44.920143127441406, + "logps/rejected": -75.18492126464844, + "loss": 0.2519, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0825746059417725, + "rewards/margins": 3.216859817504883, + "rewards/rejected": -5.299434661865234, + "step": 143 + }, + { + "epoch": 0.2732447817836812, + "grad_norm": 6.729991912841797, + "learning_rate": 0.0001948057445916724, + "logits/chosen": -2.983428955078125, + "logits/rejected": -2.977630138397217, + "logps/chosen": -40.8714599609375, + "logps/rejected": -72.71563720703125, + "loss": 0.308, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.621235728263855, + "rewards/margins": 3.496899366378784, + "rewards/rejected": -5.118135452270508, + "step": 144 + }, + { + "epoch": 0.27514231499051234, + "grad_norm": 1.3384770154953003, + "learning_rate": 0.00019456914375908023, + "logits/chosen": -2.9973654747009277, + "logits/rejected": -2.9885263442993164, + "logps/chosen": -37.383270263671875, + "logps/rejected": -62.177364349365234, + "loss": 0.1718, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3070964813232422, + "rewards/margins": 2.7271294593811035, + "rewards/rejected": -4.034225940704346, + "step": 145 + }, + { + "epoch": 0.27703984819734345, + "grad_norm": 3.572751522064209, + "learning_rate": 0.00019432742385147987, + "logits/chosen": -3.0002806186676025, + "logits/rejected": -2.992485523223877, + "logps/chosen": -40.88140106201172, + "logps/rejected": -76.8145980834961, + "loss": 0.2338, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6452064514160156, + "rewards/margins": 3.8751437664031982, + "rewards/rejected": -5.520350456237793, + "step": 146 + }, + { + "epoch": 0.27893738140417457, + "grad_norm": 3.8624587059020996, + "learning_rate": 0.0001940805979532907, + "logits/chosen": -3.022134304046631, + "logits/rejected": -3.0201826095581055, + "logps/chosen": -46.764404296875, + "logps/rejected": -76.25241088867188, + "loss": 0.32, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.3360767364501953, + "rewards/margins": 3.035257577896118, + "rewards/rejected": -5.371334075927734, + "step": 147 + }, + { + "epoch": 0.2808349146110057, + "grad_norm": 2.982891321182251, + "learning_rate": 0.00019382867942532194, + "logits/chosen": -3.0175113677978516, + "logits/rejected": -3.006810188293457, + "logps/chosen": -55.96101379394531, + "logps/rejected": -92.93550109863281, + "loss": 0.2791, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.2954790592193604, + "rewards/margins": 3.832991361618042, + "rewards/rejected": -7.128470420837402, + "step": 148 + }, + { + "epoch": 0.2827324478178368, + "grad_norm": 5.67358922958374, + "learning_rate": 0.00019357168190404936, + "logits/chosen": -3.0050764083862305, + "logits/rejected": -2.9992873668670654, + "logps/chosen": -58.6027717590332, + "logps/rejected": -89.6549301147461, + "loss": 0.5549, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.570911407470703, + "rewards/margins": 3.1905393600463867, + "rewards/rejected": -6.76145076751709, + "step": 149 + }, + { + "epoch": 0.2846299810246679, + "grad_norm": 7.6213297843933105, + "learning_rate": 0.00019330961930087725, + "logits/chosen": -3.0374433994293213, + "logits/rejected": -3.035446882247925, + "logps/chosen": -66.83151245117188, + "logps/rejected": -85.86499786376953, + "loss": 0.7267, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.353948593139648, + "rewards/margins": 1.9724806547164917, + "rewards/rejected": -6.3264288902282715, + "step": 150 + }, + { + "epoch": 0.286527514231499, + "grad_norm": 4.909121513366699, + "learning_rate": 0.00019304250580138524, + "logits/chosen": -3.071992874145508, + "logits/rejected": -3.07159423828125, + "logps/chosen": -68.75637817382812, + "logps/rejected": -89.85405731201172, + "loss": 0.5353, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.568320274353027, + "rewards/margins": 2.245361328125, + "rewards/rejected": -6.813681602478027, + "step": 151 + }, + { + "epoch": 0.2884250474383302, + "grad_norm": 6.616629600524902, + "learning_rate": 0.00019277035586456057, + "logits/chosen": -3.0715456008911133, + "logits/rejected": -3.070178270339966, + "logps/chosen": -62.83892059326172, + "logps/rejected": -83.76055145263672, + "loss": 0.5039, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.863878011703491, + "rewards/margins": 2.338576316833496, + "rewards/rejected": -6.202454566955566, + "step": 152 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 2.418257713317871, + "learning_rate": 0.00019249318422201523, + "logits/chosen": -3.0929856300354004, + "logits/rejected": -3.085249185562134, + "logps/chosen": -60.662742614746094, + "logps/rejected": -78.93743896484375, + "loss": 0.401, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6301565170288086, + "rewards/margins": 2.02978253364563, + "rewards/rejected": -5.659938812255859, + "step": 153 + }, + { + "epoch": 0.2922201138519924, + "grad_norm": 1.500931978225708, + "learning_rate": 0.00019221100587718884, + "logits/chosen": -3.0711896419525146, + "logits/rejected": -3.065363645553589, + "logps/chosen": -58.692962646484375, + "logps/rejected": -79.85702514648438, + "loss": 0.2513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.450127124786377, + "rewards/margins": 2.3925392627716064, + "rewards/rejected": -5.8426666259765625, + "step": 154 + }, + { + "epoch": 0.29411764705882354, + "grad_norm": 2.4970502853393555, + "learning_rate": 0.00019192383610453618, + "logits/chosen": -3.10758900642395, + "logits/rejected": -3.1034536361694336, + "logps/chosen": -61.92613220214844, + "logps/rejected": -78.0351333618164, + "loss": 0.3792, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7843873500823975, + "rewards/margins": 1.7335383892059326, + "rewards/rejected": -5.51792573928833, + "step": 155 + }, + { + "epoch": 0.29601518026565465, + "grad_norm": 4.313493251800537, + "learning_rate": 0.0001916316904487005, + "logits/chosen": -3.1217973232269287, + "logits/rejected": -3.1168103218078613, + "logps/chosen": -61.29463577270508, + "logps/rejected": -74.25381469726562, + "loss": 0.6506, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.835869550704956, + "rewards/margins": 1.4246034622192383, + "rewards/rejected": -5.260473251342773, + "step": 156 + }, + { + "epoch": 0.29791271347248577, + "grad_norm": 2.5466320514678955, + "learning_rate": 0.00019133458472367213, + "logits/chosen": -3.1386356353759766, + "logits/rejected": -3.138566255569458, + "logps/chosen": -64.02009582519531, + "logps/rejected": -86.23556518554688, + "loss": 0.2796, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9694676399230957, + "rewards/margins": 2.4586336612701416, + "rewards/rejected": -6.428101539611816, + "step": 157 + }, + { + "epoch": 0.2998102466793169, + "grad_norm": 3.1764183044433594, + "learning_rate": 0.00019103253501193254, + "logits/chosen": -3.181774377822876, + "logits/rejected": -3.176786422729492, + "logps/chosen": -63.130775451660156, + "logps/rejected": -84.73394775390625, + "loss": 0.279, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.033406734466553, + "rewards/margins": 2.265568733215332, + "rewards/rejected": -6.298975467681885, + "step": 158 + }, + { + "epoch": 0.301707779886148, + "grad_norm": 5.094723701477051, + "learning_rate": 0.00019072555766358346, + "logits/chosen": -3.164654016494751, + "logits/rejected": -3.161222457885742, + "logps/chosen": -61.11647033691406, + "logps/rejected": -88.24617004394531, + "loss": 0.4683, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.767437219619751, + "rewards/margins": 2.976083755493164, + "rewards/rejected": -6.743520736694336, + "step": 159 + }, + { + "epoch": 0.3036053130929791, + "grad_norm": 2.2500054836273193, + "learning_rate": 0.00019041366929546219, + "logits/chosen": -3.1624157428741455, + "logits/rejected": -3.1557273864746094, + "logps/chosen": -58.45857238769531, + "logps/rejected": -83.57339477539062, + "loss": 0.2568, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.415253162384033, + "rewards/margins": 2.820373058319092, + "rewards/rejected": -6.235626220703125, + "step": 160 + }, + { + "epoch": 0.3055028462998102, + "grad_norm": 2.157329797744751, + "learning_rate": 0.0001900968867902419, + "logits/chosen": -3.1669349670410156, + "logits/rejected": -3.1593239307403564, + "logps/chosen": -54.65401077270508, + "logps/rejected": -82.71353912353516, + "loss": 0.2324, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.078385829925537, + "rewards/margins": 2.9072060585021973, + "rewards/rejected": -5.985591888427734, + "step": 161 + }, + { + "epoch": 0.30740037950664134, + "grad_norm": 3.35286808013916, + "learning_rate": 0.000189775227295518, + "logits/chosen": -3.158579111099243, + "logits/rejected": -3.155748128890991, + "logps/chosen": -57.20378112792969, + "logps/rejected": -77.08695983886719, + "loss": 0.4451, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.36582350730896, + "rewards/margins": 2.117894411087036, + "rewards/rejected": -5.483717441558838, + "step": 162 + }, + { + "epoch": 0.3092979127134725, + "grad_norm": 1.9471580982208252, + "learning_rate": 0.00018944870822287956, + "logits/chosen": -3.1374692916870117, + "logits/rejected": -3.134248971939087, + "logps/chosen": -49.715667724609375, + "logps/rejected": -73.12477111816406, + "loss": 0.2535, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6359071731567383, + "rewards/margins": 2.407243251800537, + "rewards/rejected": -5.043150901794434, + "step": 163 + }, + { + "epoch": 0.3111954459203036, + "grad_norm": 2.495023012161255, + "learning_rate": 0.00018911734724696722, + "logits/chosen": -3.1576712131500244, + "logits/rejected": -3.1541287899017334, + "logps/chosen": -48.007938385009766, + "logps/rejected": -66.66117095947266, + "loss": 0.4549, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4171323776245117, + "rewards/margins": 2.037095308303833, + "rewards/rejected": -4.454227447509766, + "step": 164 + }, + { + "epoch": 0.31309297912713474, + "grad_norm": 4.0073652267456055, + "learning_rate": 0.00018878116230451613, + "logits/chosen": -3.111081838607788, + "logits/rejected": -3.111217737197876, + "logps/chosen": -44.466983795166016, + "logps/rejected": -60.27463912963867, + "loss": 0.4762, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.118543863296509, + "rewards/margins": 1.7310559749603271, + "rewards/rejected": -3.849599599838257, + "step": 165 + }, + { + "epoch": 0.31499051233396586, + "grad_norm": 3.148956537246704, + "learning_rate": 0.00018844017159338528, + "logits/chosen": -3.118450403213501, + "logits/rejected": -3.123354911804199, + "logps/chosen": -38.37690734863281, + "logps/rejected": -53.64491271972656, + "loss": 0.4602, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5390743017196655, + "rewards/margins": 1.7078005075454712, + "rewards/rejected": -3.2468748092651367, + "step": 166 + }, + { + "epoch": 0.31688804554079697, + "grad_norm": 1.8341965675354004, + "learning_rate": 0.00018809439357157223, + "logits/chosen": -3.0582046508789062, + "logits/rejected": -3.05729079246521, + "logps/chosen": -33.0239372253418, + "logps/rejected": -39.82599639892578, + "loss": 0.5013, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.91768878698349, + "rewards/margins": 0.8253737092018127, + "rewards/rejected": -1.7430624961853027, + "step": 167 + }, + { + "epoch": 0.3187855787476281, + "grad_norm": 2.7328832149505615, + "learning_rate": 0.00018774384695621407, + "logits/chosen": -3.0516273975372314, + "logits/rejected": -3.0475456714630127, + "logps/chosen": -41.876365661621094, + "logps/rejected": -49.84267044067383, + "loss": 0.5872, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.7223644256591797, + "rewards/margins": 0.9419503808021545, + "rewards/rejected": -2.6643147468566895, + "step": 168 + }, + { + "epoch": 0.3206831119544592, + "grad_norm": 1.2971059083938599, + "learning_rate": 0.0001873885507225743, + "logits/chosen": -3.0343210697174072, + "logits/rejected": -3.0288002490997314, + "logps/chosen": -36.68798065185547, + "logps/rejected": -51.96241760253906, + "loss": 0.2352, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3304095268249512, + "rewards/margins": 1.7680621147155762, + "rewards/rejected": -3.0984716415405273, + "step": 169 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 2.298835039138794, + "learning_rate": 0.00018702852410301554, + "logits/chosen": -3.0099384784698486, + "logits/rejected": -3.014251708984375, + "logps/chosen": -48.32796096801758, + "logps/rejected": -69.7240219116211, + "loss": 0.314, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4110379219055176, + "rewards/margins": 2.3697621822357178, + "rewards/rejected": -4.7808003425598145, + "step": 170 + }, + { + "epoch": 0.32447817836812143, + "grad_norm": 2.5293760299682617, + "learning_rate": 0.0001866637865859586, + "logits/chosen": -3.0135481357574463, + "logits/rejected": -3.0115551948547363, + "logps/chosen": -54.45928955078125, + "logps/rejected": -71.06674194335938, + "loss": 0.3232, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.125586986541748, + "rewards/margins": 1.8217525482177734, + "rewards/rejected": -4.94734001159668, + "step": 171 + }, + { + "epoch": 0.32637571157495254, + "grad_norm": 2.6669814586639404, + "learning_rate": 0.00018629435791482765, + "logits/chosen": -2.947815418243408, + "logits/rejected": -2.9488513469696045, + "logps/chosen": -60.36651611328125, + "logps/rejected": -84.83184814453125, + "loss": 0.3001, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.707231283187866, + "rewards/margins": 2.5273969173431396, + "rewards/rejected": -6.234628677368164, + "step": 172 + }, + { + "epoch": 0.32827324478178366, + "grad_norm": 3.21972918510437, + "learning_rate": 0.00018592025808698116, + "logits/chosen": -2.9881162643432617, + "logits/rejected": -2.986424684524536, + "logps/chosen": -63.61874771118164, + "logps/rejected": -97.28710174560547, + "loss": 0.3389, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8928141593933105, + "rewards/margins": 3.6045291423797607, + "rewards/rejected": -7.497343063354492, + "step": 173 + }, + { + "epoch": 0.3301707779886148, + "grad_norm": 2.9337503910064697, + "learning_rate": 0.00018554150735262975, + "logits/chosen": -2.904815912246704, + "logits/rejected": -2.9042913913726807, + "logps/chosen": -65.26492309570312, + "logps/rejected": -88.40277099609375, + "loss": 0.3105, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.115939140319824, + "rewards/margins": 2.4648208618164062, + "rewards/rejected": -6.5807600021362305, + "step": 174 + }, + { + "epoch": 0.33206831119544594, + "grad_norm": 3.591499090194702, + "learning_rate": 0.00018515812621373997, + "logits/chosen": -2.9820444583892822, + "logits/rejected": -2.969967842102051, + "logps/chosen": -67.85137176513672, + "logps/rejected": -87.8746109008789, + "loss": 0.4435, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.4569549560546875, + "rewards/margins": 2.025627851486206, + "rewards/rejected": -6.482582092285156, + "step": 175 + }, + { + "epoch": 0.33396584440227706, + "grad_norm": 3.6195833683013916, + "learning_rate": 0.00018477013542292446, + "logits/chosen": -2.9211320877075195, + "logits/rejected": -2.9202651977539062, + "logps/chosen": -65.85560607910156, + "logps/rejected": -88.60107421875, + "loss": 0.542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.231626510620117, + "rewards/margins": 2.3576202392578125, + "rewards/rejected": -6.589247226715088, + "step": 176 + }, + { + "epoch": 0.33586337760910817, + "grad_norm": 4.6038594245910645, + "learning_rate": 0.00018437755598231856, + "logits/chosen": -2.9478230476379395, + "logits/rejected": -2.9453256130218506, + "logps/chosen": -66.39960479736328, + "logps/rejected": -88.36288452148438, + "loss": 0.3217, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.242918014526367, + "rewards/margins": 2.3832414150238037, + "rewards/rejected": -6.62615966796875, + "step": 177 + }, + { + "epoch": 0.3377609108159393, + "grad_norm": 1.2974350452423096, + "learning_rate": 0.00018398040914244362, + "logits/chosen": -2.901718854904175, + "logits/rejected": -2.8996200561523438, + "logps/chosen": -67.7139892578125, + "logps/rejected": -91.30947875976562, + "loss": 0.1607, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.423122406005859, + "rewards/margins": 2.575498580932617, + "rewards/rejected": -6.998621463775635, + "step": 178 + }, + { + "epoch": 0.3396584440227704, + "grad_norm": 3.1375489234924316, + "learning_rate": 0.00018357871640105645, + "logits/chosen": -2.926860809326172, + "logits/rejected": -2.923616886138916, + "logps/chosen": -68.5616455078125, + "logps/rejected": -92.57727813720703, + "loss": 0.294, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.4224042892456055, + "rewards/margins": 2.60679292678833, + "rewards/rejected": -7.029196739196777, + "step": 179 + }, + { + "epoch": 0.3415559772296015, + "grad_norm": 1.6190105676651, + "learning_rate": 0.00018317249950198597, + "logits/chosen": -2.937143087387085, + "logits/rejected": -2.9350616931915283, + "logps/chosen": -67.23421478271484, + "logps/rejected": -93.80059814453125, + "loss": 0.1848, + "rewards/accuracies": 1.0, + "rewards/chosen": -4.204146385192871, + "rewards/margins": 3.0119266510009766, + "rewards/rejected": -7.216073513031006, + "step": 180 + }, + { + "epoch": 0.34345351043643263, + "grad_norm": 2.069124698638916, + "learning_rate": 0.00018276178043395586, + "logits/chosen": -2.942324638366699, + "logits/rejected": -2.943343162536621, + "logps/chosen": -70.87568664550781, + "logps/rejected": -101.76148223876953, + "loss": 0.1794, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.680966377258301, + "rewards/margins": 3.301745653152466, + "rewards/rejected": -7.9827117919921875, + "step": 181 + }, + { + "epoch": 0.34535104364326374, + "grad_norm": 6.071887493133545, + "learning_rate": 0.00018234658142939454, + "logits/chosen": -2.9245431423187256, + "logits/rejected": -2.9229578971862793, + "logps/chosen": -66.28981018066406, + "logps/rejected": -88.98973083496094, + "loss": 0.5967, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.1609787940979, + "rewards/margins": 2.4637625217437744, + "rewards/rejected": -6.624741077423096, + "step": 182 + }, + { + "epoch": 0.34724857685009486, + "grad_norm": 1.7315760850906372, + "learning_rate": 0.00018192692496323156, + "logits/chosen": -2.9373724460601807, + "logits/rejected": -2.9322011470794678, + "logps/chosen": -61.68766403198242, + "logps/rejected": -97.79752349853516, + "loss": 0.0801, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.600949287414551, + "rewards/margins": 3.957946300506592, + "rewards/rejected": -7.558895587921143, + "step": 183 + }, + { + "epoch": 0.349146110056926, + "grad_norm": 4.817263603210449, + "learning_rate": 0.00018150283375168114, + "logits/chosen": -2.9242465496063232, + "logits/rejected": -2.9281442165374756, + "logps/chosen": -49.455657958984375, + "logps/rejected": -78.6143798828125, + "loss": 0.4009, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.6410908699035645, + "rewards/margins": 3.0010156631469727, + "rewards/rejected": -5.642106533050537, + "step": 184 + }, + { + "epoch": 0.3510436432637571, + "grad_norm": 6.691137790679932, + "learning_rate": 0.00018107433075101252, + "logits/chosen": -2.8935739994049072, + "logits/rejected": -2.8921587467193604, + "logps/chosen": -39.874351501464844, + "logps/rejected": -75.13919830322266, + "loss": 0.6717, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.6036827564239502, + "rewards/margins": 3.638152837753296, + "rewards/rejected": -5.241835594177246, + "step": 185 + }, + { + "epoch": 0.35294117647058826, + "grad_norm": 6.4132914543151855, + "learning_rate": 0.00018064143915630723, + "logits/chosen": -2.8910973072052, + "logits/rejected": -2.8896028995513916, + "logps/chosen": -42.009525299072266, + "logps/rejected": -60.758155822753906, + "loss": 0.7367, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7192809581756592, + "rewards/margins": 2.1440207958221436, + "rewards/rejected": -3.8633017539978027, + "step": 186 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 3.2330329418182373, + "learning_rate": 0.0001802041824002036, + "logits/chosen": -2.9204766750335693, + "logits/rejected": -2.915883779525757, + "logps/chosen": -38.97833251953125, + "logps/rejected": -57.29167175292969, + "loss": 0.3248, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.5657511949539185, + "rewards/margins": 1.8807975053787231, + "rewards/rejected": -3.4465484619140625, + "step": 187 + }, + { + "epoch": 0.3567362428842505, + "grad_norm": 1.4538908004760742, + "learning_rate": 0.00017976258415162833, + "logits/chosen": -2.883896827697754, + "logits/rejected": -2.8819518089294434, + "logps/chosen": -38.06407928466797, + "logps/rejected": -52.192264556884766, + "loss": 0.3424, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4242199659347534, + "rewards/margins": 1.5716015100479126, + "rewards/rejected": -2.995821475982666, + "step": 188 + }, + { + "epoch": 0.3586337760910816, + "grad_norm": 1.8755921125411987, + "learning_rate": 0.00017931666831451536, + "logits/chosen": -2.859116554260254, + "logits/rejected": -2.8647141456604004, + "logps/chosen": -35.76837158203125, + "logps/rejected": -55.3032112121582, + "loss": 0.5151, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.3126046657562256, + "rewards/margins": 2.0264344215393066, + "rewards/rejected": -3.3390393257141113, + "step": 189 + }, + { + "epoch": 0.3605313092979127, + "grad_norm": 2.1501007080078125, + "learning_rate": 0.00017886645902651167, + "logits/chosen": -2.9286084175109863, + "logits/rejected": -2.9369022846221924, + "logps/chosen": -39.66361618041992, + "logps/rejected": -68.78336334228516, + "loss": 0.1306, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5931215286254883, + "rewards/margins": 3.1761364936828613, + "rewards/rejected": -4.76925802230835, + "step": 190 + }, + { + "epoch": 0.36242884250474383, + "grad_norm": 4.919617652893066, + "learning_rate": 0.00017841198065767107, + "logits/chosen": -2.9136152267456055, + "logits/rejected": -2.914355754852295, + "logps/chosen": -54.04287338256836, + "logps/rejected": -71.08902740478516, + "loss": 0.6126, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.111724615097046, + "rewards/margins": 1.875967025756836, + "rewards/rejected": -4.987691402435303, + "step": 191 + }, + { + "epoch": 0.36432637571157495, + "grad_norm": 2.6215672492980957, + "learning_rate": 0.0001779532578091347, + "logits/chosen": -2.9398844242095947, + "logits/rejected": -2.94006609916687, + "logps/chosen": -56.28917694091797, + "logps/rejected": -94.09794616699219, + "loss": 0.1769, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2438466548919678, + "rewards/margins": 4.059857368469238, + "rewards/rejected": -7.303704261779785, + "step": 192 + }, + { + "epoch": 0.36622390891840606, + "grad_norm": 7.178459167480469, + "learning_rate": 0.00017749031531179963, + "logits/chosen": -2.8921046257019043, + "logits/rejected": -2.8897299766540527, + "logps/chosen": -65.22640991210938, + "logps/rejected": -91.53515625, + "loss": 0.6301, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.077861785888672, + "rewards/margins": 2.8920609951019287, + "rewards/rejected": -6.96992301940918, + "step": 193 + }, + { + "epoch": 0.3681214421252372, + "grad_norm": 4.36641263961792, + "learning_rate": 0.00017702317822497455, + "logits/chosen": -2.8808705806732178, + "logits/rejected": -2.8756415843963623, + "logps/chosen": -72.72103118896484, + "logps/rejected": -96.75923919677734, + "loss": 0.3474, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.825438499450684, + "rewards/margins": 2.646296977996826, + "rewards/rejected": -7.47173547744751, + "step": 194 + }, + { + "epoch": 0.3700189753320683, + "grad_norm": 4.489946365356445, + "learning_rate": 0.00017655187183502344, + "logits/chosen": -2.899508237838745, + "logits/rejected": -2.89617919921875, + "logps/chosen": -77.47444152832031, + "logps/rejected": -94.92417907714844, + "loss": 0.7369, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.465056419372559, + "rewards/margins": 1.7898337841033936, + "rewards/rejected": -7.254889488220215, + "step": 195 + }, + { + "epoch": 0.3719165085388994, + "grad_norm": 3.543882369995117, + "learning_rate": 0.00017607642165399666, + "logits/chosen": -2.8700618743896484, + "logits/rejected": -2.865544319152832, + "logps/chosen": -71.58328247070312, + "logps/rejected": -85.74073028564453, + "loss": 0.4353, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.7718706130981445, + "rewards/margins": 1.615250587463379, + "rewards/rejected": -6.387121200561523, + "step": 196 + }, + { + "epoch": 0.3738140417457306, + "grad_norm": 1.7851449251174927, + "learning_rate": 0.0001755968534182501, + "logits/chosen": -2.8684256076812744, + "logits/rejected": -2.8665754795074463, + "logps/chosen": -68.68287658691406, + "logps/rejected": -88.38851928710938, + "loss": 0.2769, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.466452598571777, + "rewards/margins": 2.200467824935913, + "rewards/rejected": -6.6669206619262695, + "step": 197 + }, + { + "epoch": 0.3757115749525617, + "grad_norm": 3.8078126907348633, + "learning_rate": 0.00017511319308705198, + "logits/chosen": -2.8291757106781006, + "logits/rejected": -2.8295886516571045, + "logps/chosen": -74.86436462402344, + "logps/rejected": -83.49817657470703, + "loss": 0.5876, + "rewards/accuracies": 0.6875, + "rewards/chosen": -5.133200168609619, + "rewards/margins": 0.8893264532089233, + "rewards/rejected": -6.022526741027832, + "step": 198 + }, + { + "epoch": 0.3776091081593928, + "grad_norm": 2.537524938583374, + "learning_rate": 0.0001746254668411778, + "logits/chosen": -2.826998472213745, + "logits/rejected": -2.8250083923339844, + "logps/chosen": -70.27324676513672, + "logps/rejected": -84.13557434082031, + "loss": 0.3498, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.654443740844727, + "rewards/margins": 1.5413017272949219, + "rewards/rejected": -6.195745468139648, + "step": 199 + }, + { + "epoch": 0.3795066413662239, + "grad_norm": 0.7963341474533081, + "learning_rate": 0.00017413370108149286, + "logits/chosen": -2.8284592628479004, + "logits/rejected": -2.8273520469665527, + "logps/chosen": -59.756290435791016, + "logps/rejected": -88.60334777832031, + "loss": 0.1007, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.604917049407959, + "rewards/margins": 3.0917110443115234, + "rewards/rejected": -6.696628570556641, + "step": 200 + }, + { + "epoch": 0.38140417457305503, + "grad_norm": 2.5711867809295654, + "learning_rate": 0.00017363792242752353, + "logits/chosen": -2.8135452270507812, + "logits/rejected": -2.8090662956237793, + "logps/chosen": -60.564918518066406, + "logps/rejected": -83.0078125, + "loss": 0.3432, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7318811416625977, + "rewards/margins": 2.3645801544189453, + "rewards/rejected": -6.096461296081543, + "step": 201 + }, + { + "epoch": 0.38330170777988615, + "grad_norm": 1.702682614326477, + "learning_rate": 0.0001731381577160161, + "logits/chosen": -2.8074567317962646, + "logits/rejected": -2.8068923950195312, + "logps/chosen": -58.00551986694336, + "logps/rejected": -77.38255310058594, + "loss": 0.2787, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.473022937774658, + "rewards/margins": 2.0791990756988525, + "rewards/rejected": -5.55222225189209, + "step": 202 + }, + { + "epoch": 0.38519924098671726, + "grad_norm": 3.307158946990967, + "learning_rate": 0.0001726344339994841, + "logits/chosen": -2.837968349456787, + "logits/rejected": -2.8358547687530518, + "logps/chosen": -68.95384979248047, + "logps/rejected": -79.25270080566406, + "loss": 0.5837, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.315940856933594, + "rewards/margins": 1.430248498916626, + "rewards/rejected": -5.746189594268799, + "step": 203 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 2.0259664058685303, + "learning_rate": 0.000172126778544744, + "logits/chosen": -2.8287785053253174, + "logits/rejected": -2.8304224014282227, + "logps/chosen": -67.12183380126953, + "logps/rejected": -80.09559631347656, + "loss": 0.3632, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.335432529449463, + "rewards/margins": 1.4739618301391602, + "rewards/rejected": -5.809394836425781, + "step": 204 + }, + { + "epoch": 0.3889943074003795, + "grad_norm": 2.4748473167419434, + "learning_rate": 0.00017161521883143934, + "logits/chosen": -2.862060308456421, + "logits/rejected": -2.861398458480835, + "logps/chosen": -68.42402648925781, + "logps/rejected": -91.68376159667969, + "loss": 0.2329, + "rewards/accuracies": 0.9375, + "rewards/chosen": -4.550724983215332, + "rewards/margins": 2.5067808628082275, + "rewards/rejected": -7.057506084442139, + "step": 205 + }, + { + "epoch": 0.3908918406072106, + "grad_norm": 1.0783460140228271, + "learning_rate": 0.00017109978255055295, + "logits/chosen": -2.7947001457214355, + "logits/rejected": -2.8020119667053223, + "logps/chosen": -55.356285095214844, + "logps/rejected": -88.07723236083984, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.234992027282715, + "rewards/margins": 3.352567672729492, + "rewards/rejected": -6.587559700012207, + "step": 206 + }, + { + "epoch": 0.3927893738140417, + "grad_norm": 3.741246223449707, + "learning_rate": 0.0001705804976029083, + "logits/chosen": -2.7769172191619873, + "logits/rejected": -2.7746269702911377, + "logps/chosen": -67.84706115722656, + "logps/rejected": -86.53413391113281, + "loss": 0.3825, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.322835922241211, + "rewards/margins": 2.0191259384155273, + "rewards/rejected": -6.341961860656738, + "step": 207 + }, + { + "epoch": 0.3946869070208729, + "grad_norm": 3.0758566856384277, + "learning_rate": 0.00017005739209765904, + "logits/chosen": -2.798175096511841, + "logits/rejected": -2.7865893840789795, + "logps/chosen": -63.870933532714844, + "logps/rejected": -85.13277435302734, + "loss": 0.3158, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9742186069488525, + "rewards/margins": 2.2601842880249023, + "rewards/rejected": -6.234403133392334, + "step": 208 + }, + { + "epoch": 0.396584440227704, + "grad_norm": 4.3031907081604, + "learning_rate": 0.0001695304943507677, + "logits/chosen": -2.786068916320801, + "logits/rejected": -2.7925620079040527, + "logps/chosen": -70.43618774414062, + "logps/rejected": -92.2144775390625, + "loss": 0.6388, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.723659038543701, + "rewards/margins": 2.3320984840393066, + "rewards/rejected": -7.055757522583008, + "step": 209 + }, + { + "epoch": 0.3984819734345351, + "grad_norm": 4.984089374542236, + "learning_rate": 0.00016899983288347248, + "logits/chosen": -2.802447557449341, + "logits/rejected": -2.8011577129364014, + "logps/chosen": -75.90496063232422, + "logps/rejected": -96.45457458496094, + "loss": 0.717, + "rewards/accuracies": 0.75, + "rewards/chosen": -5.078993320465088, + "rewards/margins": 2.2575745582580566, + "rewards/rejected": -7.3365678787231445, + "step": 210 + }, + { + "epoch": 0.40037950664136623, + "grad_norm": 2.23420786857605, + "learning_rate": 0.0001684654364207438, + "logits/chosen": -2.7645010948181152, + "logits/rejected": -2.7675206661224365, + "logps/chosen": -55.797607421875, + "logps/rejected": -92.49737548828125, + "loss": 0.1039, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.1702518463134766, + "rewards/margins": 3.868875503540039, + "rewards/rejected": -7.039127349853516, + "step": 211 + }, + { + "epoch": 0.40227703984819735, + "grad_norm": 1.9920927286148071, + "learning_rate": 0.00016792733388972932, + "logits/chosen": -2.7536532878875732, + "logits/rejected": -2.755016326904297, + "logps/chosen": -58.96205139160156, + "logps/rejected": -94.6013412475586, + "loss": 0.105, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3591365814208984, + "rewards/margins": 3.903737783432007, + "rewards/rejected": -7.262874126434326, + "step": 212 + }, + { + "epoch": 0.40417457305502846, + "grad_norm": 3.800337791442871, + "learning_rate": 0.00016738555441818783, + "logits/chosen": -2.8170573711395264, + "logits/rejected": -2.8192145824432373, + "logps/chosen": -57.503204345703125, + "logps/rejected": -100.11039733886719, + "loss": 0.187, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.294625759124756, + "rewards/margins": 4.43009614944458, + "rewards/rejected": -7.724721908569336, + "step": 213 + }, + { + "epoch": 0.4060721062618596, + "grad_norm": 6.148056983947754, + "learning_rate": 0.0001668401273329129, + "logits/chosen": -2.773486614227295, + "logits/rejected": -2.782440423965454, + "logps/chosen": -49.874900817871094, + "logps/rejected": -73.721435546875, + "loss": 0.3677, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.5481603145599365, + "rewards/margins": 2.5078892707824707, + "rewards/rejected": -5.056049346923828, + "step": 214 + }, + { + "epoch": 0.4079696394686907, + "grad_norm": 4.637800216674805, + "learning_rate": 0.00016629108215814525, + "logits/chosen": -2.7890918254852295, + "logits/rejected": -2.78002667427063, + "logps/chosen": -50.07588577270508, + "logps/rejected": -76.56361389160156, + "loss": 0.2625, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4453320503234863, + "rewards/margins": 2.9321773052215576, + "rewards/rejected": -5.377509117126465, + "step": 215 + }, + { + "epoch": 0.4098671726755218, + "grad_norm": 3.3369038105010986, + "learning_rate": 0.00016573844861397444, + "logits/chosen": -2.7674639225006104, + "logits/rejected": -2.777357339859009, + "logps/chosen": -47.326290130615234, + "logps/rejected": -73.55824279785156, + "loss": 0.5078, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.330988883972168, + "rewards/margins": 2.8556952476501465, + "rewards/rejected": -5.186683654785156, + "step": 216 + }, + { + "epoch": 0.4117647058823529, + "grad_norm": 3.6975343227386475, + "learning_rate": 0.00016518225661473043, + "logits/chosen": -2.7834789752960205, + "logits/rejected": -2.77740216255188, + "logps/chosen": -49.831703186035156, + "logps/rejected": -76.83187866210938, + "loss": 0.2911, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4276180267333984, + "rewards/margins": 3.013000011444092, + "rewards/rejected": -5.440618515014648, + "step": 217 + }, + { + "epoch": 0.41366223908918404, + "grad_norm": 3.3639872074127197, + "learning_rate": 0.00016462253626736413, + "logits/chosen": -2.6748290061950684, + "logits/rejected": -2.6851727962493896, + "logps/chosen": -41.36797332763672, + "logps/rejected": -60.205989837646484, + "loss": 0.4055, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.751816749572754, + "rewards/margins": 2.143329381942749, + "rewards/rejected": -3.895146131515503, + "step": 218 + }, + { + "epoch": 0.4155597722960152, + "grad_norm": 3.045083522796631, + "learning_rate": 0.00016405931786981755, + "logits/chosen": -2.7627456188201904, + "logits/rejected": -2.7599875926971436, + "logps/chosen": -42.954856872558594, + "logps/rejected": -59.64561462402344, + "loss": 0.3513, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9114494323730469, + "rewards/margins": 1.9816038608551025, + "rewards/rejected": -3.8930535316467285, + "step": 219 + }, + { + "epoch": 0.4174573055028463, + "grad_norm": 2.8113279342651367, + "learning_rate": 0.000163492631909384, + "logits/chosen": -2.745089530944824, + "logits/rejected": -2.7422215938568115, + "logps/chosen": -42.2723503112793, + "logps/rejected": -57.04561233520508, + "loss": 0.4329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7828037738800049, + "rewards/margins": 1.7417727708816528, + "rewards/rejected": -3.5245766639709473, + "step": 220 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 1.3215032815933228, + "learning_rate": 0.0001629225090610577, + "logits/chosen": -2.764284372329712, + "logits/rejected": -2.7611119747161865, + "logps/chosen": -36.366241455078125, + "logps/rejected": -59.05260467529297, + "loss": 0.1957, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2659788131713867, + "rewards/margins": 2.423325777053833, + "rewards/rejected": -3.6893045902252197, + "step": 221 + }, + { + "epoch": 0.42125237191650855, + "grad_norm": 1.7855051755905151, + "learning_rate": 0.00016234898018587337, + "logits/chosen": -2.8013715744018555, + "logits/rejected": -2.8014392852783203, + "logps/chosen": -38.1886100769043, + "logps/rejected": -57.5101318359375, + "loss": 0.2574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.3807275295257568, + "rewards/margins": 2.1531457901000977, + "rewards/rejected": -3.5338735580444336, + "step": 222 + }, + { + "epoch": 0.42314990512333966, + "grad_norm": 4.893378257751465, + "learning_rate": 0.00016177207632923557, + "logits/chosen": -2.793851137161255, + "logits/rejected": -2.791360378265381, + "logps/chosen": -44.361148834228516, + "logps/rejected": -56.266658782958984, + "loss": 0.3702, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9667861461639404, + "rewards/margins": 1.407994270324707, + "rewards/rejected": -3.3747801780700684, + "step": 223 + }, + { + "epoch": 0.4250474383301708, + "grad_norm": 1.959364891052246, + "learning_rate": 0.00016119182871923834, + "logits/chosen": -2.7673821449279785, + "logits/rejected": -2.7583274841308594, + "logps/chosen": -47.93505859375, + "logps/rejected": -65.74525451660156, + "loss": 0.25, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4285130500793457, + "rewards/margins": 1.9772919416427612, + "rewards/rejected": -4.4058051109313965, + "step": 224 + }, + { + "epoch": 0.4269449715370019, + "grad_norm": 2.6299164295196533, + "learning_rate": 0.00016060826876497478, + "logits/chosen": -2.7214760780334473, + "logits/rejected": -2.7112197875976562, + "logps/chosen": -42.125022888183594, + "logps/rejected": -62.053749084472656, + "loss": 0.2648, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.8338714838027954, + "rewards/margins": 2.1915903091430664, + "rewards/rejected": -4.025461673736572, + "step": 225 + }, + { + "epoch": 0.428842504743833, + "grad_norm": 3.1354124546051025, + "learning_rate": 0.00016002142805483685, + "logits/chosen": -2.7817063331604004, + "logits/rejected": -2.7779695987701416, + "logps/chosen": -46.67332077026367, + "logps/rejected": -69.51974487304688, + "loss": 0.3174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3287980556488037, + "rewards/margins": 2.3333349227905273, + "rewards/rejected": -4.662132740020752, + "step": 226 + }, + { + "epoch": 0.4307400379506641, + "grad_norm": 1.7215931415557861, + "learning_rate": 0.00015943133835480535, + "logits/chosen": -2.8229281902313232, + "logits/rejected": -2.815373659133911, + "logps/chosen": -40.25981140136719, + "logps/rejected": -73.13153076171875, + "loss": 0.1534, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6590008735656738, + "rewards/margins": 3.475335121154785, + "rewards/rejected": -5.134335994720459, + "step": 227 + }, + { + "epoch": 0.43263757115749524, + "grad_norm": 4.5978593826293945, + "learning_rate": 0.0001588380316067307, + "logits/chosen": -2.824258804321289, + "logits/rejected": -2.8195881843566895, + "logps/chosen": -40.724361419677734, + "logps/rejected": -67.01683044433594, + "loss": 0.1866, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.497710108757019, + "rewards/margins": 2.8423562049865723, + "rewards/rejected": -4.340066432952881, + "step": 228 + }, + { + "epoch": 0.43453510436432635, + "grad_norm": 6.522673606872559, + "learning_rate": 0.0001582415399266036, + "logits/chosen": -2.7951138019561768, + "logits/rejected": -2.7876198291778564, + "logps/chosen": -47.27073287963867, + "logps/rejected": -76.75838470458984, + "loss": 0.2995, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.335322380065918, + "rewards/margins": 3.229301691055298, + "rewards/rejected": -5.564623832702637, + "step": 229 + }, + { + "epoch": 0.4364326375711575, + "grad_norm": 4.254772186279297, + "learning_rate": 0.00015764189560281677, + "logits/chosen": -2.768986701965332, + "logits/rejected": -2.7672719955444336, + "logps/chosen": -58.088409423828125, + "logps/rejected": -78.85195922851562, + "loss": 0.555, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2533457279205322, + "rewards/margins": 2.4660933017730713, + "rewards/rejected": -5.7194390296936035, + "step": 230 + }, + { + "epoch": 0.43833017077798864, + "grad_norm": 6.352909088134766, + "learning_rate": 0.00015703913109441713, + "logits/chosen": -2.79345965385437, + "logits/rejected": -2.786287546157837, + "logps/chosen": -49.76398468017578, + "logps/rejected": -80.52045440673828, + "loss": 0.1719, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.432440996170044, + "rewards/margins": 3.3548390865325928, + "rewards/rejected": -5.787280082702637, + "step": 231 + }, + { + "epoch": 0.44022770398481975, + "grad_norm": 10.744542121887207, + "learning_rate": 0.00015643327902934868, + "logits/chosen": -2.75455379486084, + "logits/rejected": -2.741607666015625, + "logps/chosen": -58.280277252197266, + "logps/rejected": -90.40587615966797, + "loss": 0.4328, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.340040683746338, + "rewards/margins": 3.4735236167907715, + "rewards/rejected": -6.813564300537109, + "step": 232 + }, + { + "epoch": 0.44212523719165087, + "grad_norm": 5.020275115966797, + "learning_rate": 0.00015582437220268647, + "logits/chosen": -2.758476972579956, + "logits/rejected": -2.7559456825256348, + "logps/chosen": -61.09891128540039, + "logps/rejected": -85.42538452148438, + "loss": 0.4021, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7310962677001953, + "rewards/margins": 2.584928512573242, + "rewards/rejected": -6.3160247802734375, + "step": 233 + }, + { + "epoch": 0.444022770398482, + "grad_norm": 3.41190767288208, + "learning_rate": 0.00015521244357486133, + "logits/chosen": -2.7153968811035156, + "logits/rejected": -2.7004103660583496, + "logps/chosen": -77.57876586914062, + "logps/rejected": -97.70706939697266, + "loss": 0.2604, + "rewards/accuracies": 0.9375, + "rewards/chosen": -5.392991065979004, + "rewards/margins": 2.164095640182495, + "rewards/rejected": -7.557086944580078, + "step": 234 + }, + { + "epoch": 0.4459203036053131, + "grad_norm": 2.357773542404175, + "learning_rate": 0.00015459752626987563, + "logits/chosen": -2.8138904571533203, + "logits/rejected": -2.8117315769195557, + "logps/chosen": -70.28298950195312, + "logps/rejected": -88.47047424316406, + "loss": 0.4502, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.694599151611328, + "rewards/margins": 1.9797905683517456, + "rewards/rejected": -6.674389839172363, + "step": 235 + }, + { + "epoch": 0.4478178368121442, + "grad_norm": 3.5848381519317627, + "learning_rate": 0.00015397965357351033, + "logits/chosen": -2.788076162338257, + "logits/rejected": -2.7764651775360107, + "logps/chosen": -63.50676727294922, + "logps/rejected": -96.62000274658203, + "loss": 0.2589, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.9061508178710938, + "rewards/margins": 3.6903600692749023, + "rewards/rejected": -7.596510887145996, + "step": 236 + }, + { + "epoch": 0.4497153700189753, + "grad_norm": 4.058860778808594, + "learning_rate": 0.00015335885893152335, + "logits/chosen": -2.765608549118042, + "logits/rejected": -2.752915382385254, + "logps/chosen": -59.92298126220703, + "logps/rejected": -83.04705810546875, + "loss": 0.3769, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5093774795532227, + "rewards/margins": 2.6241321563720703, + "rewards/rejected": -6.133509635925293, + "step": 237 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 3.230358600616455, + "learning_rate": 0.00015273517594783877, + "logits/chosen": -2.7464921474456787, + "logits/rejected": -2.73105788230896, + "logps/chosen": -57.03343200683594, + "logps/rejected": -75.98129272460938, + "loss": 0.2463, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1314737796783447, + "rewards/margins": 2.3742733001708984, + "rewards/rejected": -5.505747318267822, + "step": 238 + }, + { + "epoch": 0.45351043643263755, + "grad_norm": 3.2784218788146973, + "learning_rate": 0.0001521086383827282, + "logits/chosen": -2.815624237060547, + "logits/rejected": -2.818958044052124, + "logps/chosen": -48.82466506958008, + "logps/rejected": -58.87793731689453, + "loss": 0.4348, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.4546396732330322, + "rewards/margins": 1.227008581161499, + "rewards/rejected": -3.6816484928131104, + "step": 239 + }, + { + "epoch": 0.45540796963946867, + "grad_norm": 3.05546236038208, + "learning_rate": 0.0001514792801509831, + "logits/chosen": -2.8245487213134766, + "logits/rejected": -2.8188109397888184, + "logps/chosen": -51.791107177734375, + "logps/rejected": -59.150779724121094, + "loss": 0.46, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5664262771606445, + "rewards/margins": 1.1049449443817139, + "rewards/rejected": -3.6713712215423584, + "step": 240 + }, + { + "epoch": 0.4573055028462998, + "grad_norm": 2.409597873687744, + "learning_rate": 0.00015084713532007905, + "logits/chosen": -2.7943313121795654, + "logits/rejected": -2.7907536029815674, + "logps/chosen": -43.191131591796875, + "logps/rejected": -53.549072265625, + "loss": 0.3468, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.9051916599273682, + "rewards/margins": 1.3212178945541382, + "rewards/rejected": -3.226409912109375, + "step": 241 + }, + { + "epoch": 0.45920303605313095, + "grad_norm": 1.9227346181869507, + "learning_rate": 0.00015021223810833165, + "logits/chosen": -2.87888503074646, + "logits/rejected": -2.8758420944213867, + "logps/chosen": -47.109779357910156, + "logps/rejected": -61.53156661987305, + "loss": 0.3432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3327531814575195, + "rewards/margins": 1.5949348211288452, + "rewards/rejected": -3.9276881217956543, + "step": 242 + }, + { + "epoch": 0.46110056925996207, + "grad_norm": 2.2493233680725098, + "learning_rate": 0.0001495746228830442, + "logits/chosen": -2.8305468559265137, + "logits/rejected": -2.8325793743133545, + "logps/chosen": -47.47154998779297, + "logps/rejected": -62.330421447753906, + "loss": 0.2987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.40806245803833, + "rewards/margins": 1.6747158765792847, + "rewards/rejected": -4.082778453826904, + "step": 243 + }, + { + "epoch": 0.4629981024667932, + "grad_norm": 2.9621243476867676, + "learning_rate": 0.0001489343241586475, + "logits/chosen": -2.873603343963623, + "logits/rejected": -2.8682057857513428, + "logps/chosen": -52.045467376708984, + "logps/rejected": -62.28657150268555, + "loss": 0.4042, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7810521125793457, + "rewards/margins": 1.326963186264038, + "rewards/rejected": -4.108015060424805, + "step": 244 + }, + { + "epoch": 0.4648956356736243, + "grad_norm": 4.03017520904541, + "learning_rate": 0.00014829137659483143, + "logits/chosen": -2.836946964263916, + "logits/rejected": -2.834871530532837, + "logps/chosen": -56.07475662231445, + "logps/rejected": -69.99559020996094, + "loss": 0.4504, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.2221789360046387, + "rewards/margins": 1.5012485980987549, + "rewards/rejected": -4.723427772521973, + "step": 245 + }, + { + "epoch": 0.4667931688804554, + "grad_norm": 2.259509563446045, + "learning_rate": 0.00014764581499466893, + "logits/chosen": -2.8168697357177734, + "logits/rejected": -2.8176605701446533, + "logps/chosen": -60.62321853637695, + "logps/rejected": -72.13325500488281, + "loss": 0.4589, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.7241644859313965, + "rewards/margins": 1.2307034730911255, + "rewards/rejected": -4.954867839813232, + "step": 246 + }, + { + "epoch": 0.4686907020872865, + "grad_norm": 1.4829596281051636, + "learning_rate": 0.000146997674302732, + "logits/chosen": -2.8266377449035645, + "logits/rejected": -2.8239901065826416, + "logps/chosen": -53.81561279296875, + "logps/rejected": -81.11199188232422, + "loss": 0.1423, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.9962782859802246, + "rewards/margins": 2.8328514099121094, + "rewards/rejected": -5.829129695892334, + "step": 247 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 3.550029993057251, + "learning_rate": 0.00014634698960320016, + "logits/chosen": -2.8737902641296387, + "logits/rejected": -2.8744564056396484, + "logps/chosen": -70.34271240234375, + "logps/rejected": -85.00938415527344, + "loss": 0.4912, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.698043346405029, + "rewards/margins": 1.4900033473968506, + "rewards/rejected": -6.188046455383301, + "step": 248 + }, + { + "epoch": 0.47248576850094876, + "grad_norm": 2.4781672954559326, + "learning_rate": 0.00014569379611796137, + "logits/chosen": -2.831427812576294, + "logits/rejected": -2.830643892288208, + "logps/chosen": -46.51897430419922, + "logps/rejected": -78.5478286743164, + "loss": 0.1867, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3443069458007812, + "rewards/margins": 3.368368148803711, + "rewards/rejected": -5.712675094604492, + "step": 249 + }, + { + "epoch": 0.47438330170777987, + "grad_norm": 3.924811601638794, + "learning_rate": 0.00014503812920470534, + "logits/chosen": -2.848410129547119, + "logits/rejected": -2.8469910621643066, + "logps/chosen": -51.54380798339844, + "logps/rejected": -79.81298828125, + "loss": 0.3428, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.817504405975342, + "rewards/margins": 3.0145010948181152, + "rewards/rejected": -5.832005500793457, + "step": 250 + }, + { + "epoch": 0.476280834914611, + "grad_norm": 3.6853103637695312, + "learning_rate": 0.00014438002435500979, + "logits/chosen": -2.8478622436523438, + "logits/rejected": -2.8459577560424805, + "logps/chosen": -49.33186340332031, + "logps/rejected": -90.88196563720703, + "loss": 0.23, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5675415992736816, + "rewards/margins": 4.337457656860352, + "rewards/rejected": -6.904999732971191, + "step": 251 + }, + { + "epoch": 0.4781783681214421, + "grad_norm": 3.788881301879883, + "learning_rate": 0.00014371951719241904, + "logits/chosen": -2.839564085006714, + "logits/rejected": -2.825866222381592, + "logps/chosen": -52.40620422363281, + "logps/rejected": -89.26689910888672, + "loss": 0.2075, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6375033855438232, + "rewards/margins": 4.117871284484863, + "rewards/rejected": -6.755374908447266, + "step": 252 + }, + { + "epoch": 0.48007590132827327, + "grad_norm": 3.9719743728637695, + "learning_rate": 0.00014305664347051585, + "logits/chosen": -2.8288607597351074, + "logits/rejected": -2.824831008911133, + "logps/chosen": -58.445960998535156, + "logps/rejected": -84.42477416992188, + "loss": 0.3711, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4238996505737305, + "rewards/margins": 2.746028423309326, + "rewards/rejected": -6.169928073883057, + "step": 253 + }, + { + "epoch": 0.4819734345351044, + "grad_norm": 3.1389803886413574, + "learning_rate": 0.0001423914390709861, + "logits/chosen": -2.8727009296417236, + "logits/rejected": -2.8723371028900146, + "logps/chosen": -49.1451416015625, + "logps/rejected": -77.82601928710938, + "loss": 0.36, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6173038482666016, + "rewards/margins": 2.9812803268432617, + "rewards/rejected": -5.598584175109863, + "step": 254 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 4.244487762451172, + "learning_rate": 0.00014172394000167623, + "logits/chosen": -2.8132166862487793, + "logits/rejected": -2.804203748703003, + "logps/chosen": -59.2404899597168, + "logps/rejected": -68.9154281616211, + "loss": 0.5202, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4824461936950684, + "rewards/margins": 1.2314536571502686, + "rewards/rejected": -4.713899612426758, + "step": 255 + }, + { + "epoch": 0.4857685009487666, + "grad_norm": 1.7867659330368042, + "learning_rate": 0.00014105418239464452, + "logits/chosen": -2.8577332496643066, + "logits/rejected": -2.8458566665649414, + "logps/chosen": -55.45354080200195, + "logps/rejected": -80.99148559570312, + "loss": 0.2365, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.028547763824463, + "rewards/margins": 2.8536529541015625, + "rewards/rejected": -5.882200717926025, + "step": 256 + }, + { + "epoch": 0.4876660341555977, + "grad_norm": 2.2104556560516357, + "learning_rate": 0.00014038220250420485, + "logits/chosen": -2.829854726791382, + "logits/rejected": -2.8256146907806396, + "logps/chosen": -60.85689926147461, + "logps/rejected": -90.96002197265625, + "loss": 0.2461, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.654787063598633, + "rewards/margins": 3.1404576301574707, + "rewards/rejected": -6.7952446937561035, + "step": 257 + }, + { + "epoch": 0.48956356736242884, + "grad_norm": 3.49383807182312, + "learning_rate": 0.00013970803670496453, + "logits/chosen": -2.859201192855835, + "logits/rejected": -2.8544297218322754, + "logps/chosen": -58.228614807128906, + "logps/rejected": -79.75567626953125, + "loss": 0.4078, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4922361373901367, + "rewards/margins": 2.3522963523864746, + "rewards/rejected": -5.8445329666137695, + "step": 258 + }, + { + "epoch": 0.49146110056925996, + "grad_norm": 1.2643572092056274, + "learning_rate": 0.0001390317214898551, + "logits/chosen": -2.8534157276153564, + "logits/rejected": -2.8505430221557617, + "logps/chosen": -58.288631439208984, + "logps/rejected": -83.08782196044922, + "loss": 0.144, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.3658194541931152, + "rewards/margins": 2.705644130706787, + "rewards/rejected": -6.071463584899902, + "step": 259 + }, + { + "epoch": 0.49335863377609107, + "grad_norm": 2.9561476707458496, + "learning_rate": 0.00013835329346815716, + "logits/chosen": -2.8778738975524902, + "logits/rejected": -2.8710873126983643, + "logps/chosen": -62.86555480957031, + "logps/rejected": -96.86504364013672, + "loss": 0.2063, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.8083794116973877, + "rewards/margins": 3.612440586090088, + "rewards/rejected": -7.420820236206055, + "step": 260 + }, + { + "epoch": 0.4952561669829222, + "grad_norm": 4.405760288238525, + "learning_rate": 0.00013767278936351854, + "logits/chosen": -2.8736281394958496, + "logits/rejected": -2.8688836097717285, + "logps/chosen": -61.89301300048828, + "logps/rejected": -96.45506286621094, + "loss": 0.3393, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.816981077194214, + "rewards/margins": 3.6093504428863525, + "rewards/rejected": -7.426331520080566, + "step": 261 + }, + { + "epoch": 0.4971537001897533, + "grad_norm": 2.922870397567749, + "learning_rate": 0.00013699024601196641, + "logits/chosen": -2.8758511543273926, + "logits/rejected": -2.869248151779175, + "logps/chosen": -59.3803825378418, + "logps/rejected": -99.51313018798828, + "loss": 0.267, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.5901854038238525, + "rewards/margins": 4.170801639556885, + "rewards/rejected": -7.760987281799316, + "step": 262 + }, + { + "epoch": 0.4990512333965844, + "grad_norm": 7.673828601837158, + "learning_rate": 0.0001363057003599135, + "logits/chosen": -2.869354724884033, + "logits/rejected": -2.8672537803649902, + "logps/chosen": -63.243125915527344, + "logps/rejected": -94.80345153808594, + "loss": 0.4937, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.047074317932129, + "rewards/margins": 3.288224220275879, + "rewards/rejected": -7.335298538208008, + "step": 263 + }, + { + "epoch": 0.5009487666034156, + "grad_norm": 4.203124046325684, + "learning_rate": 0.00013561918946215806, + "logits/chosen": -2.8657474517822266, + "logits/rejected": -2.859548807144165, + "logps/chosen": -67.86384582519531, + "logps/rejected": -100.13394927978516, + "loss": 0.4981, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.426224231719971, + "rewards/margins": 3.4542787075042725, + "rewards/rejected": -7.880502700805664, + "step": 264 + }, + { + "epoch": 0.5028462998102466, + "grad_norm": 6.988152027130127, + "learning_rate": 0.000134930750479878, + "logits/chosen": -2.8737897872924805, + "logits/rejected": -2.869394540786743, + "logps/chosen": -65.10237121582031, + "logps/rejected": -95.148681640625, + "loss": 0.3511, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.182986259460449, + "rewards/margins": 3.097425937652588, + "rewards/rejected": -7.280411720275879, + "step": 265 + }, + { + "epoch": 0.5047438330170778, + "grad_norm": 5.459814071655273, + "learning_rate": 0.00013424042067861945, + "logits/chosen": -2.8713884353637695, + "logits/rejected": -2.86498761177063, + "logps/chosen": -72.272216796875, + "logps/rejected": -97.40777587890625, + "loss": 0.7328, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.793173313140869, + "rewards/margins": 2.8042550086975098, + "rewards/rejected": -7.597428798675537, + "step": 266 + }, + { + "epoch": 0.5066413662239089, + "grad_norm": 2.6633589267730713, + "learning_rate": 0.0001335482374262795, + "logits/chosen": -2.8936328887939453, + "logits/rejected": -2.8898427486419678, + "logps/chosen": -59.96571731567383, + "logps/rejected": -97.0484619140625, + "loss": 0.2084, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.6741292476654053, + "rewards/margins": 3.8013479709625244, + "rewards/rejected": -7.47547721862793, + "step": 267 + }, + { + "epoch": 0.50853889943074, + "grad_norm": 4.516148090362549, + "learning_rate": 0.0001328542381910835, + "logits/chosen": -2.8802101612091064, + "logits/rejected": -2.8792312145233154, + "logps/chosen": -67.06829071044922, + "logps/rejected": -91.89190673828125, + "loss": 0.4533, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.338167667388916, + "rewards/margins": 2.6398956775665283, + "rewards/rejected": -6.978063583374023, + "step": 268 + }, + { + "epoch": 0.5104364326375711, + "grad_norm": 4.260200023651123, + "learning_rate": 0.00013215846053955683, + "logits/chosen": -2.871657371520996, + "logits/rejected": -2.8631982803344727, + "logps/chosen": -58.0806770324707, + "logps/rejected": -93.51595306396484, + "loss": 0.2301, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.373506784439087, + "rewards/margins": 3.758054256439209, + "rewards/rejected": -7.131560802459717, + "step": 269 + }, + { + "epoch": 0.5123339658444023, + "grad_norm": 8.104193687438965, + "learning_rate": 0.00013146094213449148, + "logits/chosen": -2.834609270095825, + "logits/rejected": -2.835766077041626, + "logps/chosen": -66.10044860839844, + "logps/rejected": -86.46508026123047, + "loss": 0.7338, + "rewards/accuracies": 0.75, + "rewards/chosen": -4.160252571105957, + "rewards/margins": 2.152247667312622, + "rewards/rejected": -6.312500476837158, + "step": 270 + }, + { + "epoch": 0.5142314990512334, + "grad_norm": 2.269233465194702, + "learning_rate": 0.00013076172073290724, + "logits/chosen": -2.8664426803588867, + "logits/rejected": -2.866476058959961, + "logps/chosen": -56.94659423828125, + "logps/rejected": -77.3311996459961, + "loss": 0.6377, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.369450569152832, + "rewards/margins": 2.010820150375366, + "rewards/rejected": -5.380270957946777, + "step": 271 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 3.103928804397583, + "learning_rate": 0.000130060834184008, + "logits/chosen": -2.8317670822143555, + "logits/rejected": -2.8320889472961426, + "logps/chosen": -62.027313232421875, + "logps/rejected": -74.0067367553711, + "loss": 0.6377, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.687196731567383, + "rewards/margins": 1.4686436653137207, + "rewards/rejected": -5.1558403968811035, + "step": 272 + }, + { + "epoch": 0.5180265654648957, + "grad_norm": 1.593082308769226, + "learning_rate": 0.00012935832042713287, + "logits/chosen": -2.828850269317627, + "logits/rejected": -2.8194522857666016, + "logps/chosen": -53.06214904785156, + "logps/rejected": -78.66574096679688, + "loss": 0.1704, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8542308807373047, + "rewards/margins": 2.848317861557007, + "rewards/rejected": -5.702548980712891, + "step": 273 + }, + { + "epoch": 0.5199240986717267, + "grad_norm": 1.320363998413086, + "learning_rate": 0.00012865421748970256, + "logits/chosen": -2.8424181938171387, + "logits/rejected": -2.835724353790283, + "logps/chosen": -55.70930480957031, + "logps/rejected": -81.26738739013672, + "loss": 0.1867, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0446360111236572, + "rewards/margins": 2.9781501293182373, + "rewards/rejected": -6.0227861404418945, + "step": 274 + }, + { + "epoch": 0.5218216318785579, + "grad_norm": 1.8908852338790894, + "learning_rate": 0.00012794856348516095, + "logits/chosen": -2.847842216491699, + "logits/rejected": -2.841383457183838, + "logps/chosen": -54.49677276611328, + "logps/rejected": -80.69400024414062, + "loss": 0.2009, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.114948034286499, + "rewards/margins": 2.7647268772125244, + "rewards/rejected": -5.879674911499023, + "step": 275 + }, + { + "epoch": 0.523719165085389, + "grad_norm": 2.000441312789917, + "learning_rate": 0.0001272413966109119, + "logits/chosen": -2.854240894317627, + "logits/rejected": -2.8539373874664307, + "logps/chosen": -57.158050537109375, + "logps/rejected": -77.82567596435547, + "loss": 0.387, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4087672233581543, + "rewards/margins": 2.2918338775634766, + "rewards/rejected": -5.700601100921631, + "step": 276 + }, + { + "epoch": 0.5256166982922201, + "grad_norm": 0.9269229769706726, + "learning_rate": 0.00012653275514625166, + "logits/chosen": -2.8237087726593018, + "logits/rejected": -2.8187015056610107, + "logps/chosen": -52.4594841003418, + "logps/rejected": -81.11265563964844, + "loss": 0.1577, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.902538299560547, + "rewards/margins": 3.004708766937256, + "rewards/rejected": -5.907247543334961, + "step": 277 + }, + { + "epoch": 0.5275142314990512, + "grad_norm": 4.269275188446045, + "learning_rate": 0.00012582267745029686, + "logits/chosen": -2.83097505569458, + "logits/rejected": -2.824533700942993, + "logps/chosen": -59.98640823364258, + "logps/rejected": -74.92745971679688, + "loss": 0.5263, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.483750104904175, + "rewards/margins": 1.778311014175415, + "rewards/rejected": -5.26206111907959, + "step": 278 + }, + { + "epoch": 0.5294117647058824, + "grad_norm": 2.934359073638916, + "learning_rate": 0.000125111201959908, + "logits/chosen": -2.8329038619995117, + "logits/rejected": -2.829845666885376, + "logps/chosen": -60.867156982421875, + "logps/rejected": -89.72724914550781, + "loss": 0.1871, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.616037368774414, + "rewards/margins": 3.0759949684143066, + "rewards/rejected": -6.692032337188721, + "step": 279 + }, + { + "epoch": 0.5313092979127134, + "grad_norm": 1.4452064037322998, + "learning_rate": 0.00012439836718760886, + "logits/chosen": -2.8472280502319336, + "logits/rejected": -2.8384790420532227, + "logps/chosen": -56.88945007324219, + "logps/rejected": -86.16199493408203, + "loss": 0.223, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.25555682182312, + "rewards/margins": 3.1891889572143555, + "rewards/rejected": -6.444746017456055, + "step": 280 + }, + { + "epoch": 0.5332068311195446, + "grad_norm": 2.5307774543762207, + "learning_rate": 0.00012368421171950192, + "logits/chosen": -2.8560056686401367, + "logits/rejected": -2.8471438884735107, + "logps/chosen": -55.13880157470703, + "logps/rejected": -85.99432373046875, + "loss": 0.1456, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0400350093841553, + "rewards/margins": 3.3702197074890137, + "rewards/rejected": -6.410254955291748, + "step": 281 + }, + { + "epoch": 0.5351043643263758, + "grad_norm": 9.487899780273438, + "learning_rate": 0.0001229687742131796, + "logits/chosen": -2.865816116333008, + "logits/rejected": -2.8594892024993896, + "logps/chosen": -50.71941375732422, + "logps/rejected": -81.01962280273438, + "loss": 0.2773, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.763157367706299, + "rewards/margins": 3.12109375, + "rewards/rejected": -5.884251594543457, + "step": 282 + }, + { + "epoch": 0.5370018975332068, + "grad_norm": 7.071329116821289, + "learning_rate": 0.00012225209339563145, + "logits/chosen": -2.8695383071899414, + "logits/rejected": -2.862952470779419, + "logps/chosen": -46.901954650878906, + "logps/rejected": -78.72514343261719, + "loss": 0.3699, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.247375965118408, + "rewards/margins": 3.466681480407715, + "rewards/rejected": -5.714057445526123, + "step": 283 + }, + { + "epoch": 0.538899430740038, + "grad_norm": 1.0974787473678589, + "learning_rate": 0.0001215342080611484, + "logits/chosen": -2.8294849395751953, + "logits/rejected": -2.818751096725464, + "logps/chosen": -42.89143371582031, + "logps/rejected": -80.403564453125, + "loss": 0.1185, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8664650917053223, + "rewards/margins": 4.029164791107178, + "rewards/rejected": -5.8956298828125, + "step": 284 + }, + { + "epoch": 0.540796963946869, + "grad_norm": 1.9439111948013306, + "learning_rate": 0.00012081515706922227, + "logits/chosen": -2.886253833770752, + "logits/rejected": -2.8804054260253906, + "logps/chosen": -33.328758239746094, + "logps/rejected": -74.43219757080078, + "loss": 0.1276, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8928669691085815, + "rewards/margins": 4.253573417663574, + "rewards/rejected": -5.146440029144287, + "step": 285 + }, + { + "epoch": 0.5426944971537002, + "grad_norm": 5.50389289855957, + "learning_rate": 0.00012009497934244256, + "logits/chosen": -2.860034704208374, + "logits/rejected": -2.8562252521514893, + "logps/chosen": -44.1033821105957, + "logps/rejected": -78.41339111328125, + "loss": 0.4644, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.0543887615203857, + "rewards/margins": 3.5441386699676514, + "rewards/rejected": -5.598527908325195, + "step": 286 + }, + { + "epoch": 0.5445920303605313, + "grad_norm": 9.102168083190918, + "learning_rate": 0.00011937371386438954, + "logits/chosen": -2.8378746509552, + "logits/rejected": -2.8391685485839844, + "logps/chosen": -43.23701477050781, + "logps/rejected": -75.10458374023438, + "loss": 0.3444, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.033146858215332, + "rewards/margins": 3.336325168609619, + "rewards/rejected": -5.369471549987793, + "step": 287 + }, + { + "epoch": 0.5464895635673624, + "grad_norm": 1.0387825965881348, + "learning_rate": 0.0001186513996775239, + "logits/chosen": -2.8853936195373535, + "logits/rejected": -2.880082607269287, + "logps/chosen": -28.779258728027344, + "logps/rejected": -83.81454467773438, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3752191662788391, + "rewards/margins": 5.667832851409912, + "rewards/rejected": -6.0430521965026855, + "step": 288 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 2.832765579223633, + "learning_rate": 0.00011792807588107357, + "logits/chosen": -2.773785352706909, + "logits/rejected": -2.771526575088501, + "logps/chosen": -44.170867919921875, + "logps/rejected": -81.07795715332031, + "loss": 0.2608, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.024778366088867, + "rewards/margins": 3.906771183013916, + "rewards/rejected": -5.931549072265625, + "step": 289 + }, + { + "epoch": 0.5502846299810247, + "grad_norm": 2.276353597640991, + "learning_rate": 0.00011720378162891708, + "logits/chosen": -2.895059823989868, + "logits/rejected": -2.89176869392395, + "logps/chosen": -38.851749420166016, + "logps/rejected": -79.41373443603516, + "loss": 0.2234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7230916023254395, + "rewards/margins": 4.087526798248291, + "rewards/rejected": -5.8106184005737305, + "step": 290 + }, + { + "epoch": 0.5521821631878557, + "grad_norm": 1.7804261445999146, + "learning_rate": 0.00011647855612746423, + "logits/chosen": -2.876720428466797, + "logits/rejected": -2.872744083404541, + "logps/chosen": -49.478515625, + "logps/rejected": -79.2706069946289, + "loss": 0.234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6331980228424072, + "rewards/margins": 3.1652159690856934, + "rewards/rejected": -5.79841423034668, + "step": 291 + }, + { + "epoch": 0.5540796963946869, + "grad_norm": 1.9441782236099243, + "learning_rate": 0.00011575243863353382, + "logits/chosen": -2.8950231075286865, + "logits/rejected": -2.8902437686920166, + "logps/chosen": -43.75312042236328, + "logps/rejected": -80.70056915283203, + "loss": 0.0955, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0715866088867188, + "rewards/margins": 3.8401412963867188, + "rewards/rejected": -5.9117279052734375, + "step": 292 + }, + { + "epoch": 0.5559772296015181, + "grad_norm": 1.1018978357315063, + "learning_rate": 0.00011502546845222859, + "logits/chosen": -2.8955650329589844, + "logits/rejected": -2.888160467147827, + "logps/chosen": -43.491554260253906, + "logps/rejected": -88.80857849121094, + "loss": 0.1789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9829752445220947, + "rewards/margins": 4.644049644470215, + "rewards/rejected": -6.6270246505737305, + "step": 293 + }, + { + "epoch": 0.5578747628083491, + "grad_norm": 7.49698543548584, + "learning_rate": 0.0001142976849348078, + "logits/chosen": -2.8840601444244385, + "logits/rejected": -2.8859639167785645, + "logps/chosen": -54.523216247558594, + "logps/rejected": -71.47078704833984, + "loss": 1.1101, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.054238796234131, + "rewards/margins": 1.8495543003082275, + "rewards/rejected": -4.9037933349609375, + "step": 294 + }, + { + "epoch": 0.5597722960151803, + "grad_norm": 2.938047409057617, + "learning_rate": 0.00011356912747655685, + "logits/chosen": -2.8811421394348145, + "logits/rejected": -2.8825528621673584, + "logps/chosen": -46.03756332397461, + "logps/rejected": -70.37115478515625, + "loss": 0.2148, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1824710369110107, + "rewards/margins": 2.7723135948181152, + "rewards/rejected": -4.954784393310547, + "step": 295 + }, + { + "epoch": 0.5616698292220114, + "grad_norm": 4.470379829406738, + "learning_rate": 0.00011283983551465511, + "logits/chosen": -2.863375425338745, + "logits/rejected": -2.8606436252593994, + "logps/chosen": -39.22755432128906, + "logps/rejected": -77.66708374023438, + "loss": 0.1821, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.5369148254394531, + "rewards/margins": 4.046816349029541, + "rewards/rejected": -5.583731174468994, + "step": 296 + }, + { + "epoch": 0.5635673624288425, + "grad_norm": 7.931248664855957, + "learning_rate": 0.00011210984852604083, + "logits/chosen": -2.8965516090393066, + "logits/rejected": -2.893240213394165, + "logps/chosen": -50.47080993652344, + "logps/rejected": -87.6466064453125, + "loss": 0.4181, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.552130699157715, + "rewards/margins": 3.9827961921691895, + "rewards/rejected": -6.5349273681640625, + "step": 297 + }, + { + "epoch": 0.5654648956356736, + "grad_norm": 4.881028652191162, + "learning_rate": 0.00011137920602527447, + "logits/chosen": -2.916684150695801, + "logits/rejected": -2.910217046737671, + "logps/chosen": -49.143104553222656, + "logps/rejected": -82.70768737792969, + "loss": 0.3173, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.5159926414489746, + "rewards/margins": 3.645129442214966, + "rewards/rejected": -6.1611223220825195, + "step": 298 + }, + { + "epoch": 0.5673624288425048, + "grad_norm": 2.043503522872925, + "learning_rate": 0.00011064794756239977, + "logits/chosen": -2.878192901611328, + "logits/rejected": -2.8828775882720947, + "logps/chosen": -53.674171447753906, + "logps/rejected": -79.92569732666016, + "loss": 0.5443, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.003199577331543, + "rewards/margins": 2.7453689575195312, + "rewards/rejected": -5.748568534851074, + "step": 299 + }, + { + "epoch": 0.5692599620493358, + "grad_norm": 2.94694447517395, + "learning_rate": 0.00010991611272080269, + "logits/chosen": -2.9256865978240967, + "logits/rejected": -2.925057888031006, + "logps/chosen": -59.540016174316406, + "logps/rejected": -82.14313507080078, + "loss": 0.2938, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.6033997535705566, + "rewards/margins": 2.2802443504333496, + "rewards/rejected": -5.883644104003906, + "step": 300 + }, + { + "epoch": 0.571157495256167, + "grad_norm": 1.191422700881958, + "learning_rate": 0.00010918374111506893, + "logits/chosen": -2.925339937210083, + "logits/rejected": -2.9227840900421143, + "logps/chosen": -46.54875946044922, + "logps/rejected": -78.50569915771484, + "loss": 0.1321, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1884849071502686, + "rewards/margins": 3.4915459156036377, + "rewards/rejected": -5.680030822753906, + "step": 301 + }, + { + "epoch": 0.573055028462998, + "grad_norm": 4.066537857055664, + "learning_rate": 0.00010845087238883944, + "logits/chosen": -2.938689947128296, + "logits/rejected": -2.937714099884033, + "logps/chosen": -57.65900802612305, + "logps/rejected": -82.79156494140625, + "loss": 0.3726, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.370972156524658, + "rewards/margins": 2.620013475418091, + "rewards/rejected": -5.990985870361328, + "step": 302 + }, + { + "epoch": 0.5749525616698292, + "grad_norm": 3.895630359649658, + "learning_rate": 0.00010771754621266466, + "logits/chosen": -2.9182395935058594, + "logits/rejected": -2.9164552688598633, + "logps/chosen": -66.43140411376953, + "logps/rejected": -84.01023864746094, + "loss": 0.3176, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.403564453125, + "rewards/margins": 1.890740156173706, + "rewards/rejected": -6.294304370880127, + "step": 303 + }, + { + "epoch": 0.5768500948766604, + "grad_norm": 2.5287392139434814, + "learning_rate": 0.00010698380228185685, + "logits/chosen": -2.902897596359253, + "logits/rejected": -2.902431011199951, + "logps/chosen": -66.64987182617188, + "logps/rejected": -85.07323455810547, + "loss": 0.3927, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.36166524887085, + "rewards/margins": 2.0132718086242676, + "rewards/rejected": -6.374937534332275, + "step": 304 + }, + { + "epoch": 0.5787476280834914, + "grad_norm": 2.4261245727539062, + "learning_rate": 0.00010624968031434173, + "logits/chosen": -2.9023351669311523, + "logits/rejected": -2.9055070877075195, + "logps/chosen": -58.39203643798828, + "logps/rejected": -87.78874206542969, + "loss": 0.3885, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.485225200653076, + "rewards/margins": 2.986713409423828, + "rewards/rejected": -6.471938610076904, + "step": 305 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 2.699694871902466, + "learning_rate": 0.0001055152200485082, + "logits/chosen": -2.8693368434906006, + "logits/rejected": -2.8701136112213135, + "logps/chosen": -61.098785400390625, + "logps/rejected": -79.8037109375, + "loss": 0.4626, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.692890167236328, + "rewards/margins": 2.097386360168457, + "rewards/rejected": -5.790276527404785, + "step": 306 + }, + { + "epoch": 0.5825426944971537, + "grad_norm": 1.7026814222335815, + "learning_rate": 0.00010478046124105744, + "logits/chosen": -2.896878719329834, + "logits/rejected": -2.8954782485961914, + "logps/chosen": -60.92935562133789, + "logps/rejected": -87.54103088378906, + "loss": 0.2256, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.614488124847412, + "rewards/margins": 2.906501531600952, + "rewards/rejected": -6.520989894866943, + "step": 307 + }, + { + "epoch": 0.5844402277039848, + "grad_norm": 4.583454608917236, + "learning_rate": 0.00010404544366485094, + "logits/chosen": -2.91058611869812, + "logits/rejected": -2.9112749099731445, + "logps/chosen": -62.88933181762695, + "logps/rejected": -79.40133666992188, + "loss": 0.58, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.9962563514709473, + "rewards/margins": 1.7496761083602905, + "rewards/rejected": -5.745932579040527, + "step": 308 + }, + { + "epoch": 0.5863377609108159, + "grad_norm": 1.0582410097122192, + "learning_rate": 0.00010331020710675729, + "logits/chosen": -2.9183032512664795, + "logits/rejected": -2.919496536254883, + "logps/chosen": -60.99847412109375, + "logps/rejected": -79.97003936767578, + "loss": 0.2519, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.8266940116882324, + "rewards/margins": 2.1178812980651855, + "rewards/rejected": -5.944575309753418, + "step": 309 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 2.27483868598938, + "learning_rate": 0.00010257479136549889, + "logits/chosen": -2.925682544708252, + "logits/rejected": -2.9215149879455566, + "logps/chosen": -62.194374084472656, + "logps/rejected": -80.60386657714844, + "loss": 0.3425, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.68037748336792, + "rewards/margins": 2.081040143966675, + "rewards/rejected": -5.761418342590332, + "step": 310 + }, + { + "epoch": 0.5901328273244781, + "grad_norm": 1.3051766157150269, + "learning_rate": 0.0001018392362494972, + "logits/chosen": -2.9560303688049316, + "logits/rejected": -2.949531316757202, + "logps/chosen": -57.7611083984375, + "logps/rejected": -78.2846450805664, + "loss": 0.251, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4193286895751953, + "rewards/margins": 2.350963592529297, + "rewards/rejected": -5.77029275894165, + "step": 311 + }, + { + "epoch": 0.5920303605313093, + "grad_norm": 2.4817426204681396, + "learning_rate": 0.00010110358157471824, + "logits/chosen": -2.9510304927825928, + "logits/rejected": -2.953981876373291, + "logps/chosen": -65.96543884277344, + "logps/rejected": -83.39318084716797, + "loss": 0.4345, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.075649261474609, + "rewards/margins": 2.000249147415161, + "rewards/rejected": -6.075898170471191, + "step": 312 + }, + { + "epoch": 0.5939278937381404, + "grad_norm": 1.3821583986282349, + "learning_rate": 0.0001003678671625172, + "logits/chosen": -2.952937602996826, + "logits/rejected": -2.950251817703247, + "logps/chosen": -61.165550231933594, + "logps/rejected": -85.19444274902344, + "loss": 0.2662, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.7030019760131836, + "rewards/margins": 2.63686466217041, + "rewards/rejected": -6.339866638183594, + "step": 313 + }, + { + "epoch": 0.5958254269449715, + "grad_norm": 1.6280112266540527, + "learning_rate": 9.963213283748282e-05, + "logits/chosen": -2.933931589126587, + "logits/rejected": -2.926939010620117, + "logps/chosen": -61.775917053222656, + "logps/rejected": -81.53590393066406, + "loss": 0.3233, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7116825580596924, + "rewards/margins": 2.275146484375, + "rewards/rejected": -5.986828804016113, + "step": 314 + }, + { + "epoch": 0.5977229601518027, + "grad_norm": 1.9062341451644897, + "learning_rate": 9.889641842528178e-05, + "logits/chosen": -2.9100828170776367, + "logits/rejected": -2.9047842025756836, + "logps/chosen": -61.198387145996094, + "logps/rejected": -81.3210220336914, + "loss": 0.3868, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6690847873687744, + "rewards/margins": 2.366109848022461, + "rewards/rejected": -6.0351948738098145, + "step": 315 + }, + { + "epoch": 0.5996204933586338, + "grad_norm": 1.1220036745071411, + "learning_rate": 9.816076375050283e-05, + "logits/chosen": -2.9533026218414307, + "logits/rejected": -2.949734926223755, + "logps/chosen": -53.777095794677734, + "logps/rejected": -82.09625244140625, + "loss": 0.1049, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7948765754699707, + "rewards/margins": 3.177762508392334, + "rewards/rejected": -5.972639083862305, + "step": 316 + }, + { + "epoch": 0.6015180265654649, + "grad_norm": 1.763089895248413, + "learning_rate": 9.742520863450115e-05, + "logits/chosen": -2.9159345626831055, + "logits/rejected": -2.912830114364624, + "logps/chosen": -56.66600799560547, + "logps/rejected": -82.20394897460938, + "loss": 0.2744, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2924387454986572, + "rewards/margins": 2.709559917449951, + "rewards/rejected": -6.0019989013671875, + "step": 317 + }, + { + "epoch": 0.603415559772296, + "grad_norm": 1.6462388038635254, + "learning_rate": 9.668979289324273e-05, + "logits/chosen": -2.9503092765808105, + "logits/rejected": -2.947619676589966, + "logps/chosen": -52.42357635498047, + "logps/rejected": -84.0963134765625, + "loss": 0.2405, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.807110548019409, + "rewards/margins": 3.3853917121887207, + "rewards/rejected": -6.192502021789551, + "step": 318 + }, + { + "epoch": 0.6053130929791272, + "grad_norm": 1.4591569900512695, + "learning_rate": 9.595455633514909e-05, + "logits/chosen": -2.9135751724243164, + "logits/rejected": -2.9162490367889404, + "logps/chosen": -51.95201110839844, + "logps/rejected": -83.46641540527344, + "loss": 0.1972, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7722127437591553, + "rewards/margins": 3.348604679107666, + "rewards/rejected": -6.120817184448242, + "step": 319 + }, + { + "epoch": 0.6072106261859582, + "grad_norm": 2.3641560077667236, + "learning_rate": 9.521953875894257e-05, + "logits/chosen": -2.9282002449035645, + "logits/rejected": -2.926896095275879, + "logps/chosen": -46.65143585205078, + "logps/rejected": -81.9978256225586, + "loss": 0.1471, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.24959397315979, + "rewards/margins": 3.780817985534668, + "rewards/rejected": -6.030411720275879, + "step": 320 + }, + { + "epoch": 0.6091081593927894, + "grad_norm": 2.0623621940612793, + "learning_rate": 9.448477995149182e-05, + "logits/chosen": -2.9305570125579834, + "logits/rejected": -2.929703950881958, + "logps/chosen": -44.84100341796875, + "logps/rejected": -89.29674530029297, + "loss": 0.1482, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.057992935180664, + "rewards/margins": 4.761512279510498, + "rewards/rejected": -6.81950569152832, + "step": 321 + }, + { + "epoch": 0.6110056925996205, + "grad_norm": 2.0448296070098877, + "learning_rate": 9.375031968565829e-05, + "logits/chosen": -2.891784429550171, + "logits/rejected": -2.891885280609131, + "logps/chosen": -48.85424041748047, + "logps/rejected": -73.28569030761719, + "loss": 0.7177, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.64813232421875, + "rewards/margins": 2.567401647567749, + "rewards/rejected": -5.215534210205078, + "step": 322 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 9.832599639892578, + "learning_rate": 9.301619771814316e-05, + "logits/chosen": -2.8943068981170654, + "logits/rejected": -2.8931775093078613, + "logps/chosen": -51.37837219238281, + "logps/rejected": -77.63446044921875, + "loss": 0.4391, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.7383222579956055, + "rewards/margins": 2.8650565147399902, + "rewards/rejected": -5.603378772735596, + "step": 323 + }, + { + "epoch": 0.6148007590132827, + "grad_norm": 2.727288246154785, + "learning_rate": 9.228245378733537e-05, + "logits/chosen": -2.9204916954040527, + "logits/rejected": -2.9224843978881836, + "logps/chosen": -54.46859359741211, + "logps/rejected": -78.30081939697266, + "loss": 0.4835, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.144411325454712, + "rewards/margins": 2.3725523948669434, + "rewards/rejected": -5.516963958740234, + "step": 324 + }, + { + "epoch": 0.6166982922201139, + "grad_norm": 5.74871301651001, + "learning_rate": 9.154912761116056e-05, + "logits/chosen": -2.9131014347076416, + "logits/rejected": -2.9133331775665283, + "logps/chosen": -44.03924560546875, + "logps/rejected": -75.57574462890625, + "loss": 0.2908, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.139979839324951, + "rewards/margins": 3.2752749919891357, + "rewards/rejected": -5.415254592895508, + "step": 325 + }, + { + "epoch": 0.618595825426945, + "grad_norm": 1.533111810684204, + "learning_rate": 9.081625888493108e-05, + "logits/chosen": -2.909083843231201, + "logits/rejected": -2.911268472671509, + "logps/chosen": -34.44724655151367, + "logps/rejected": -74.28544616699219, + "loss": 0.0937, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0769883394241333, + "rewards/margins": 4.155948162078857, + "rewards/rejected": -5.232936859130859, + "step": 326 + }, + { + "epoch": 0.6204933586337761, + "grad_norm": 1.9290465116500854, + "learning_rate": 9.008388727919731e-05, + "logits/chosen": -2.898500442504883, + "logits/rejected": -2.89833664894104, + "logps/chosen": -42.38737487792969, + "logps/rejected": -70.19406127929688, + "loss": 0.2004, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.788802981376648, + "rewards/margins": 3.008125066757202, + "rewards/rejected": -4.7969279289245605, + "step": 327 + }, + { + "epoch": 0.6223908918406073, + "grad_norm": 3.8443222045898438, + "learning_rate": 8.935205243760022e-05, + "logits/chosen": -2.907944679260254, + "logits/rejected": -2.910562038421631, + "logps/chosen": -37.31573486328125, + "logps/rejected": -69.13600158691406, + "loss": 0.2375, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.220193862915039, + "rewards/margins": 3.487980842590332, + "rewards/rejected": -4.708174705505371, + "step": 328 + }, + { + "epoch": 0.6242884250474383, + "grad_norm": 6.546236515045166, + "learning_rate": 8.862079397472553e-05, + "logits/chosen": -2.9090211391448975, + "logits/rejected": -2.910724401473999, + "logps/chosen": -39.852840423583984, + "logps/rejected": -66.06358337402344, + "loss": 0.2685, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7172813415527344, + "rewards/margins": 2.741621494293213, + "rewards/rejected": -4.458902835845947, + "step": 329 + }, + { + "epoch": 0.6261859582542695, + "grad_norm": 5.839223861694336, + "learning_rate": 8.789015147395919e-05, + "logits/chosen": -2.889848470687866, + "logits/rejected": -2.890821933746338, + "logps/chosen": -60.09476852416992, + "logps/rejected": -75.91924285888672, + "loss": 0.8722, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.577435255050659, + "rewards/margins": 1.8433260917663574, + "rewards/rejected": -5.4207611083984375, + "step": 330 + }, + { + "epoch": 0.6280834914611005, + "grad_norm": 2.1105823516845703, + "learning_rate": 8.71601644853449e-05, + "logits/chosen": -2.8981029987335205, + "logits/rejected": -2.899024486541748, + "logps/chosen": -45.887939453125, + "logps/rejected": -82.8519287109375, + "loss": 0.1054, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.219303607940674, + "rewards/margins": 3.9345149993896484, + "rewards/rejected": -6.1538190841674805, + "step": 331 + }, + { + "epoch": 0.6299810246679317, + "grad_norm": 1.5297664403915405, + "learning_rate": 8.643087252344313e-05, + "logits/chosen": -2.9043052196502686, + "logits/rejected": -2.9025826454162598, + "logps/chosen": -52.71283721923828, + "logps/rejected": -80.79203033447266, + "loss": 0.1583, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.007112979888916, + "rewards/margins": 2.9045331478118896, + "rewards/rejected": -5.911646366119385, + "step": 332 + }, + { + "epoch": 0.6318785578747628, + "grad_norm": 3.1997110843658447, + "learning_rate": 8.57023150651922e-05, + "logits/chosen": -2.9024877548217773, + "logits/rejected": -2.9013073444366455, + "logps/chosen": -42.579097747802734, + "logps/rejected": -79.65797424316406, + "loss": 0.1922, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8045451641082764, + "rewards/margins": 3.9230599403381348, + "rewards/rejected": -5.72760534286499, + "step": 333 + }, + { + "epoch": 0.6337760910815939, + "grad_norm": 2.93386173248291, + "learning_rate": 8.49745315477714e-05, + "logits/chosen": -2.8821892738342285, + "logits/rejected": -2.88553524017334, + "logps/chosen": -40.950721740722656, + "logps/rejected": -74.56183624267578, + "loss": 0.3928, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8069758415222168, + "rewards/margins": 3.571430206298828, + "rewards/rejected": -5.378406524658203, + "step": 334 + }, + { + "epoch": 0.635673624288425, + "grad_norm": 4.076981544494629, + "learning_rate": 8.424756136646623e-05, + "logits/chosen": -2.888922929763794, + "logits/rejected": -2.8909573554992676, + "logps/chosen": -44.45381164550781, + "logps/rejected": -74.74778747558594, + "loss": 0.6127, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.156569480895996, + "rewards/margins": 3.1150104999542236, + "rewards/rejected": -5.271580219268799, + "step": 335 + }, + { + "epoch": 0.6375711574952562, + "grad_norm": 1.4378771781921387, + "learning_rate": 8.352144387253582e-05, + "logits/chosen": -2.8876922130584717, + "logits/rejected": -2.8896241188049316, + "logps/chosen": -37.443275451660156, + "logps/rejected": -82.51123046875, + "loss": 0.1054, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2844719886779785, + "rewards/margins": 4.881505012512207, + "rewards/rejected": -6.1659770011901855, + "step": 336 + }, + { + "epoch": 0.6394686907020873, + "grad_norm": 4.512608051300049, + "learning_rate": 8.279621837108295e-05, + "logits/chosen": -2.9067864418029785, + "logits/rejected": -2.9087655544281006, + "logps/chosen": -52.943763732910156, + "logps/rejected": -80.59246826171875, + "loss": 0.3255, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.979990005493164, + "rewards/margins": 2.8639986515045166, + "rewards/rejected": -5.843988418579102, + "step": 337 + }, + { + "epoch": 0.6413662239089184, + "grad_norm": 4.300062656402588, + "learning_rate": 8.207192411892646e-05, + "logits/chosen": -2.8789937496185303, + "logits/rejected": -2.877152442932129, + "logps/chosen": -54.1341552734375, + "logps/rejected": -85.54117584228516, + "loss": 0.2004, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.914337396621704, + "rewards/margins": 3.30031681060791, + "rewards/rejected": -6.214654445648193, + "step": 338 + }, + { + "epoch": 0.6432637571157496, + "grad_norm": 2.4098892211914062, + "learning_rate": 8.134860032247613e-05, + "logits/chosen": -2.870198965072632, + "logits/rejected": -2.867429733276367, + "logps/chosen": -44.648841857910156, + "logps/rejected": -85.62705993652344, + "loss": 0.1193, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0035176277160645, + "rewards/margins": 4.332259178161621, + "rewards/rejected": -6.335777282714844, + "step": 339 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 4.86448335647583, + "learning_rate": 8.062628613561051e-05, + "logits/chosen": -2.8677215576171875, + "logits/rejected": -2.865633010864258, + "logps/chosen": -45.240501403808594, + "logps/rejected": -84.56080627441406, + "loss": 0.1312, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1111769676208496, + "rewards/margins": 4.117494583129883, + "rewards/rejected": -6.228672027587891, + "step": 340 + }, + { + "epoch": 0.6470588235294118, + "grad_norm": 3.7126381397247314, + "learning_rate": 7.990502065755748e-05, + "logits/chosen": -2.858647346496582, + "logits/rejected": -2.857041835784912, + "logps/chosen": -55.83709716796875, + "logps/rejected": -82.04129028320312, + "loss": 0.3661, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.063718795776367, + "rewards/margins": 2.8775720596313477, + "rewards/rejected": -5.941291332244873, + "step": 341 + }, + { + "epoch": 0.6489563567362429, + "grad_norm": 2.671086072921753, + "learning_rate": 7.918484293077777e-05, + "logits/chosen": -2.884795665740967, + "logits/rejected": -2.8853371143341064, + "logps/chosen": -51.566864013671875, + "logps/rejected": -78.62126922607422, + "loss": 0.2656, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.805079936981201, + "rewards/margins": 2.820594072341919, + "rewards/rejected": -5.625674247741699, + "step": 342 + }, + { + "epoch": 0.650853889943074, + "grad_norm": 2.7431766986846924, + "learning_rate": 7.846579193885166e-05, + "logits/chosen": -2.9095654487609863, + "logits/rejected": -2.908602476119995, + "logps/chosen": -48.2768440246582, + "logps/rejected": -75.55670166015625, + "loss": 0.3786, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.4378509521484375, + "rewards/margins": 2.9142463207244873, + "rewards/rejected": -5.352097511291504, + "step": 343 + }, + { + "epoch": 0.6527514231499051, + "grad_norm": 2.9285616874694824, + "learning_rate": 7.774790660436858e-05, + "logits/chosen": -2.8709418773651123, + "logits/rejected": -2.8702428340911865, + "logps/chosen": -46.38733673095703, + "logps/rejected": -82.48179626464844, + "loss": 0.3003, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.308894395828247, + "rewards/margins": 3.722503662109375, + "rewards/rejected": -6.031398296356201, + "step": 344 + }, + { + "epoch": 0.6546489563567363, + "grad_norm": 0.9489471912384033, + "learning_rate": 7.703122578682046e-05, + "logits/chosen": -2.8957481384277344, + "logits/rejected": -2.8958137035369873, + "logps/chosen": -36.95110321044922, + "logps/rejected": -86.87677764892578, + "loss": 0.0761, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2242169380187988, + "rewards/margins": 5.352141380310059, + "rewards/rejected": -6.576358318328857, + "step": 345 + }, + { + "epoch": 0.6565464895635673, + "grad_norm": 0.7262978553771973, + "learning_rate": 7.631578828049809e-05, + "logits/chosen": -2.902186870574951, + "logits/rejected": -2.902596950531006, + "logps/chosen": -42.105674743652344, + "logps/rejected": -81.15824890136719, + "loss": 0.137, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8559753894805908, + "rewards/margins": 4.022046089172363, + "rewards/rejected": -5.878021717071533, + "step": 346 + }, + { + "epoch": 0.6584440227703985, + "grad_norm": 0.5167392492294312, + "learning_rate": 7.560163281239115e-05, + "logits/chosen": -2.893174409866333, + "logits/rejected": -2.890427350997925, + "logps/chosen": -34.30916976928711, + "logps/rejected": -81.82603454589844, + "loss": 0.0651, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9803083539009094, + "rewards/margins": 5.106498718261719, + "rewards/rejected": -6.0868072509765625, + "step": 347 + }, + { + "epoch": 0.6603415559772297, + "grad_norm": 3.071392059326172, + "learning_rate": 7.488879804009205e-05, + "logits/chosen": -2.8696646690368652, + "logits/rejected": -2.868384838104248, + "logps/chosen": -52.08047866821289, + "logps/rejected": -89.13264465332031, + "loss": 0.3081, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.913431406021118, + "rewards/margins": 3.715190887451172, + "rewards/rejected": -6.628622055053711, + "step": 348 + }, + { + "epoch": 0.6622390891840607, + "grad_norm": 2.76004958152771, + "learning_rate": 7.417732254970317e-05, + "logits/chosen": -2.8904638290405273, + "logits/rejected": -2.8905301094055176, + "logps/chosen": -43.39665222167969, + "logps/rejected": -80.96947479248047, + "loss": 0.283, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0628576278686523, + "rewards/margins": 3.969433546066284, + "rewards/rejected": -6.032290935516357, + "step": 349 + }, + { + "epoch": 0.6641366223908919, + "grad_norm": 1.2935935258865356, + "learning_rate": 7.346724485374837e-05, + "logits/chosen": -2.8328428268432617, + "logits/rejected": -2.8332414627075195, + "logps/chosen": -42.56072235107422, + "logps/rejected": -72.73566436767578, + "loss": 0.4495, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9386003017425537, + "rewards/margins": 3.1780686378479004, + "rewards/rejected": -5.116669178009033, + "step": 350 + }, + { + "epoch": 0.6660341555977229, + "grad_norm": 0.8221670389175415, + "learning_rate": 7.275860338908815e-05, + "logits/chosen": -2.840083122253418, + "logits/rejected": -2.8375134468078613, + "logps/chosen": -31.67504119873047, + "logps/rejected": -73.15141296386719, + "loss": 0.0362, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7467401027679443, + "rewards/margins": 4.425130844116211, + "rewards/rejected": -5.171871185302734, + "step": 351 + }, + { + "epoch": 0.6679316888045541, + "grad_norm": 7.032353401184082, + "learning_rate": 7.205143651483906e-05, + "logits/chosen": -2.8678221702575684, + "logits/rejected": -2.8668932914733887, + "logps/chosen": -40.8023796081543, + "logps/rejected": -70.9334487915039, + "loss": 0.2645, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7451889514923096, + "rewards/margins": 3.13863468170166, + "rewards/rejected": -4.883823871612549, + "step": 352 + }, + { + "epoch": 0.6698292220113852, + "grad_norm": 3.907095193862915, + "learning_rate": 7.134578251029745e-05, + "logits/chosen": -2.835967779159546, + "logits/rejected": -2.8349101543426514, + "logps/chosen": -37.25190734863281, + "logps/rejected": -75.33383178710938, + "loss": 0.2817, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.264039397239685, + "rewards/margins": 4.003348350524902, + "rewards/rejected": -5.267387866973877, + "step": 353 + }, + { + "epoch": 0.6717267552182163, + "grad_norm": 3.347811698913574, + "learning_rate": 7.064167957286714e-05, + "logits/chosen": -2.8626599311828613, + "logits/rejected": -2.8652517795562744, + "logps/chosen": -43.0107307434082, + "logps/rejected": -77.27116394042969, + "loss": 0.3695, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.9182496070861816, + "rewards/margins": 3.5998854637145996, + "rewards/rejected": -5.518135070800781, + "step": 354 + }, + { + "epoch": 0.6736242884250474, + "grad_norm": 1.5447057485580444, + "learning_rate": 6.993916581599202e-05, + "logits/chosen": -2.8407070636749268, + "logits/rejected": -2.8412859439849854, + "logps/chosen": -39.80915451049805, + "logps/rejected": -74.49897003173828, + "loss": 0.2434, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6093679666519165, + "rewards/margins": 3.66628360748291, + "rewards/rejected": -5.275651931762695, + "step": 355 + }, + { + "epoch": 0.6755218216318786, + "grad_norm": 3.911895275115967, + "learning_rate": 6.923827926709277e-05, + "logits/chosen": -2.8489303588867188, + "logits/rejected": -2.8513529300689697, + "logps/chosen": -40.58164978027344, + "logps/rejected": -76.51760864257812, + "loss": 0.1489, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7140048742294312, + "rewards/margins": 3.8055503368377686, + "rewards/rejected": -5.51955509185791, + "step": 356 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 1.1900240182876587, + "learning_rate": 6.853905786550854e-05, + "logits/chosen": -2.82595157623291, + "logits/rejected": -2.8293063640594482, + "logps/chosen": -34.55420684814453, + "logps/rejected": -82.0986557006836, + "loss": 0.0661, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2518024444580078, + "rewards/margins": 4.8424973487854, + "rewards/rejected": -6.094300270080566, + "step": 357 + }, + { + "epoch": 0.6793168880455408, + "grad_norm": 1.571761131286621, + "learning_rate": 6.78415394604432e-05, + "logits/chosen": -2.826899766921997, + "logits/rejected": -2.826556921005249, + "logps/chosen": -41.274803161621094, + "logps/rejected": -72.56224060058594, + "loss": 0.1361, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6804169416427612, + "rewards/margins": 3.465762138366699, + "rewards/rejected": -5.14617919921875, + "step": 358 + }, + { + "epoch": 0.681214421252372, + "grad_norm": 3.052306890487671, + "learning_rate": 6.714576180891654e-05, + "logits/chosen": -2.8384101390838623, + "logits/rejected": -2.840078353881836, + "logps/chosen": -39.517417907714844, + "logps/rejected": -69.88583374023438, + "loss": 0.1488, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5221190452575684, + "rewards/margins": 3.2681870460510254, + "rewards/rejected": -4.790306091308594, + "step": 359 + }, + { + "epoch": 0.683111954459203, + "grad_norm": 1.8139554262161255, + "learning_rate": 6.645176257372055e-05, + "logits/chosen": -2.818631172180176, + "logits/rejected": -2.8222427368164062, + "logps/chosen": -35.95072937011719, + "logps/rejected": -64.10530090332031, + "loss": 0.1746, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1438565254211426, + "rewards/margins": 3.059673309326172, + "rewards/rejected": -4.203530311584473, + "step": 360 + }, + { + "epoch": 0.6850094876660342, + "grad_norm": 2.069314479827881, + "learning_rate": 6.575957932138057e-05, + "logits/chosen": -2.8182709217071533, + "logits/rejected": -2.8195230960845947, + "logps/chosen": -46.92900848388672, + "logps/rejected": -74.21569061279297, + "loss": 0.3961, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1707496643066406, + "rewards/margins": 2.961195230484009, + "rewards/rejected": -5.1319451332092285, + "step": 361 + }, + { + "epoch": 0.6869070208728653, + "grad_norm": 1.8511279821395874, + "learning_rate": 6.506924952012202e-05, + "logits/chosen": -2.8286561965942383, + "logits/rejected": -2.823415994644165, + "logps/chosen": -44.049896240234375, + "logps/rejected": -78.49517822265625, + "loss": 0.2609, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9191491603851318, + "rewards/margins": 3.8288094997406006, + "rewards/rejected": -5.747958660125732, + "step": 362 + }, + { + "epoch": 0.6888045540796964, + "grad_norm": 2.228764533996582, + "learning_rate": 6.438081053784197e-05, + "logits/chosen": -2.8500823974609375, + "logits/rejected": -2.8482038974761963, + "logps/chosen": -44.39284896850586, + "logps/rejected": -69.33513641357422, + "loss": 0.2864, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1398777961730957, + "rewards/margins": 2.614034652709961, + "rewards/rejected": -4.753911972045898, + "step": 363 + }, + { + "epoch": 0.6907020872865275, + "grad_norm": 2.184192180633545, + "learning_rate": 6.36942996400865e-05, + "logits/chosen": -2.819075584411621, + "logits/rejected": -2.820847749710083, + "logps/chosen": -43.02545166015625, + "logps/rejected": -67.06218719482422, + "loss": 0.217, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8877732753753662, + "rewards/margins": 2.566277027130127, + "rewards/rejected": -4.454050064086914, + "step": 364 + }, + { + "epoch": 0.6925996204933587, + "grad_norm": 1.0147178173065186, + "learning_rate": 6.300975398803362e-05, + "logits/chosen": -2.817196846008301, + "logits/rejected": -2.819939613342285, + "logps/chosen": -43.927947998046875, + "logps/rejected": -77.98281860351562, + "loss": 0.0992, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.025996208190918, + "rewards/margins": 3.6155223846435547, + "rewards/rejected": -5.641518592834473, + "step": 365 + }, + { + "epoch": 0.6944971537001897, + "grad_norm": 2.812535285949707, + "learning_rate": 6.232721063648148e-05, + "logits/chosen": -2.8124892711639404, + "logits/rejected": -2.813704252243042, + "logps/chosen": -39.741539001464844, + "logps/rejected": -72.29251098632812, + "loss": 0.3044, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6100232601165771, + "rewards/margins": 3.4391074180603027, + "rewards/rejected": -5.049130439758301, + "step": 366 + }, + { + "epoch": 0.6963946869070209, + "grad_norm": 4.147728443145752, + "learning_rate": 6.164670653184285e-05, + "logits/chosen": -2.8021512031555176, + "logits/rejected": -2.8073835372924805, + "logps/chosen": -49.98514175415039, + "logps/rejected": -68.45268249511719, + "loss": 0.7597, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.6653554439544678, + "rewards/margins": 1.931579351425171, + "rewards/rejected": -4.5969343185424805, + "step": 367 + }, + { + "epoch": 0.698292220113852, + "grad_norm": 4.9097676277160645, + "learning_rate": 6.09682785101449e-05, + "logits/chosen": -2.754014253616333, + "logits/rejected": -2.753483295440674, + "logps/chosen": -50.1568603515625, + "logps/rejected": -72.04705047607422, + "loss": 0.343, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7088911533355713, + "rewards/margins": 2.3041090965270996, + "rewards/rejected": -5.01300048828125, + "step": 368 + }, + { + "epoch": 0.7001897533206831, + "grad_norm": 1.8070886135101318, + "learning_rate": 6.0291963295035484e-05, + "logits/chosen": -2.7732787132263184, + "logits/rejected": -2.773905038833618, + "logps/chosen": -36.48072052001953, + "logps/rejected": -64.48783874511719, + "loss": 0.1676, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3253145217895508, + "rewards/margins": 2.909857749938965, + "rewards/rejected": -4.235172271728516, + "step": 369 + }, + { + "epoch": 0.7020872865275142, + "grad_norm": 1.1058545112609863, + "learning_rate": 5.961779749579516e-05, + "logits/chosen": -2.8352999687194824, + "logits/rejected": -2.837979793548584, + "logps/chosen": -40.074058532714844, + "logps/rejected": -78.63117980957031, + "loss": 0.1221, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5429928302764893, + "rewards/margins": 4.072991847991943, + "rewards/rejected": -5.615984916687012, + "step": 370 + }, + { + "epoch": 0.7039848197343453, + "grad_norm": 1.185791015625, + "learning_rate": 5.894581760535549e-05, + "logits/chosen": -2.7890102863311768, + "logits/rejected": -2.798060417175293, + "logps/chosen": -39.764137268066406, + "logps/rejected": -71.1139144897461, + "loss": 0.1825, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6215232610702515, + "rewards/margins": 3.141541004180908, + "rewards/rejected": -4.763064384460449, + "step": 371 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 2.253542423248291, + "learning_rate": 5.827605999832375e-05, + "logits/chosen": -2.8037962913513184, + "logits/rejected": -2.810068368911743, + "logps/chosen": -40.40297317504883, + "logps/rejected": -69.3699951171875, + "loss": 0.356, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.7513362169265747, + "rewards/margins": 2.997833728790283, + "rewards/rejected": -4.749169826507568, + "step": 372 + }, + { + "epoch": 0.7077798861480076, + "grad_norm": 4.954280376434326, + "learning_rate": 5.7608560929013946e-05, + "logits/chosen": -2.830198049545288, + "logits/rejected": -2.8298377990722656, + "logps/chosen": -38.188899993896484, + "logps/rejected": -60.74665832519531, + "loss": 0.3994, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.466092586517334, + "rewards/margins": 2.4330639839172363, + "rewards/rejected": -3.8991568088531494, + "step": 373 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 2.809379816055298, + "learning_rate": 5.694335652948415e-05, + "logits/chosen": -2.7694361209869385, + "logits/rejected": -2.7702527046203613, + "logps/chosen": -39.33440017700195, + "logps/rejected": -61.418212890625, + "loss": 0.453, + "rewards/accuracies": 0.6875, + "rewards/chosen": -1.5252565145492554, + "rewards/margins": 2.4085230827331543, + "rewards/rejected": -3.933779716491699, + "step": 374 + }, + { + "epoch": 0.7115749525616698, + "grad_norm": 4.30332612991333, + "learning_rate": 5.628048280758096e-05, + "logits/chosen": -2.835770845413208, + "logits/rejected": -2.8393208980560303, + "logps/chosen": -40.133426666259766, + "logps/rejected": -70.28953552246094, + "loss": 0.2347, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6678059101104736, + "rewards/margins": 3.160830497741699, + "rewards/rejected": -4.82863712310791, + "step": 375 + }, + { + "epoch": 0.713472485768501, + "grad_norm": 9.771546363830566, + "learning_rate": 5.5619975644990244e-05, + "logits/chosen": -2.8247642517089844, + "logits/rejected": -2.828068971633911, + "logps/chosen": -42.2925910949707, + "logps/rejected": -60.983116149902344, + "loss": 0.6838, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.81632661819458, + "rewards/margins": 2.0536108016967773, + "rewards/rejected": -3.8699374198913574, + "step": 376 + }, + { + "epoch": 0.715370018975332, + "grad_norm": 8.389379501342773, + "learning_rate": 5.496187079529465e-05, + "logits/chosen": -2.8321757316589355, + "logits/rejected": -2.8335793018341064, + "logps/chosen": -39.37532043457031, + "logps/rejected": -62.030555725097656, + "loss": 0.4633, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.6666603088378906, + "rewards/margins": 2.3203835487365723, + "rewards/rejected": -3.987044095993042, + "step": 377 + }, + { + "epoch": 0.7172675521821632, + "grad_norm": 0.9523990154266357, + "learning_rate": 5.4306203882038664e-05, + "logits/chosen": -2.8029139041900635, + "logits/rejected": -2.8072001934051514, + "logps/chosen": -38.88301467895508, + "logps/rejected": -63.761016845703125, + "loss": 0.155, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5240936279296875, + "rewards/margins": 2.7224197387695312, + "rewards/rejected": -4.246513366699219, + "step": 378 + }, + { + "epoch": 0.7191650853889943, + "grad_norm": 1.4619261026382446, + "learning_rate": 5.365301039679984e-05, + "logits/chosen": -2.7594830989837646, + "logits/rejected": -2.759218692779541, + "logps/chosen": -43.129417419433594, + "logps/rejected": -69.13524627685547, + "loss": 0.1813, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7728075981140137, + "rewards/margins": 2.8003039360046387, + "rewards/rejected": -4.573111534118652, + "step": 379 + }, + { + "epoch": 0.7210626185958254, + "grad_norm": 0.7891879677772522, + "learning_rate": 5.300232569726804e-05, + "logits/chosen": -2.8181209564208984, + "logits/rejected": -2.818765878677368, + "logps/chosen": -37.475074768066406, + "logps/rejected": -67.34873962402344, + "loss": 0.1258, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2424066066741943, + "rewards/margins": 3.349724769592285, + "rewards/rejected": -4.592131614685059, + "step": 380 + }, + { + "epoch": 0.7229601518026565, + "grad_norm": 3.099241018295288, + "learning_rate": 5.235418500533109e-05, + "logits/chosen": -2.8263823986053467, + "logits/rejected": -2.826491355895996, + "logps/chosen": -34.44828796386719, + "logps/rejected": -60.10238265991211, + "loss": 0.1577, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0063425302505493, + "rewards/margins": 2.7296359539031982, + "rewards/rejected": -3.735978603363037, + "step": 381 + }, + { + "epoch": 0.7248576850094877, + "grad_norm": 2.173335313796997, + "learning_rate": 5.170862340516858e-05, + "logits/chosen": -2.8429558277130127, + "logits/rejected": -2.845327377319336, + "logps/chosen": -37.72207260131836, + "logps/rejected": -69.11197662353516, + "loss": 0.1317, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.4073717594146729, + "rewards/margins": 3.334414005279541, + "rewards/rejected": -4.741786003112793, + "step": 382 + }, + { + "epoch": 0.7267552182163188, + "grad_norm": 1.9054666757583618, + "learning_rate": 5.1065675841352514e-05, + "logits/chosen": -2.799262523651123, + "logits/rejected": -2.7995381355285645, + "logps/chosen": -42.19648742675781, + "logps/rejected": -73.97077941894531, + "loss": 0.1394, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8171238899230957, + "rewards/margins": 3.4282195568084717, + "rewards/rejected": -5.2453436851501465, + "step": 383 + }, + { + "epoch": 0.7286527514231499, + "grad_norm": 2.1896724700927734, + "learning_rate": 5.042537711695584e-05, + "logits/chosen": -2.839466094970703, + "logits/rejected": -2.839400291442871, + "logps/chosen": -46.19039535522461, + "logps/rejected": -78.2408447265625, + "loss": 0.1499, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3111000061035156, + "rewards/margins": 3.2166218757629395, + "rewards/rejected": -5.527722358703613, + "step": 384 + }, + { + "epoch": 0.7305502846299811, + "grad_norm": 3.224628210067749, + "learning_rate": 4.9787761891668397e-05, + "logits/chosen": -2.8577723503112793, + "logits/rejected": -2.8555190563201904, + "logps/chosen": -45.490821838378906, + "logps/rejected": -75.40174865722656, + "loss": 0.4481, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.101290702819824, + "rewards/margins": 3.2228689193725586, + "rewards/rejected": -5.324159622192383, + "step": 385 + }, + { + "epoch": 0.7324478178368121, + "grad_norm": 3.9173166751861572, + "learning_rate": 4.915286467992097e-05, + "logits/chosen": -2.853044033050537, + "logits/rejected": -2.8523402214050293, + "logps/chosen": -45.38521957397461, + "logps/rejected": -70.02400207519531, + "loss": 0.2852, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.1188886165618896, + "rewards/margins": 2.6662843227386475, + "rewards/rejected": -4.785172939300537, + "step": 386 + }, + { + "epoch": 0.7343453510436433, + "grad_norm": 2.7123172283172607, + "learning_rate": 4.852071984901696e-05, + "logits/chosen": -2.7952325344085693, + "logits/rejected": -2.7948856353759766, + "logps/chosen": -47.74720764160156, + "logps/rejected": -74.95579528808594, + "loss": 0.3017, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.2861366271972656, + "rewards/margins": 2.9639570713043213, + "rewards/rejected": -5.250093460083008, + "step": 387 + }, + { + "epoch": 0.7362428842504743, + "grad_norm": 2.0673792362213135, + "learning_rate": 4.7891361617271845e-05, + "logits/chosen": -2.8399112224578857, + "logits/rejected": -2.8416287899017334, + "logps/chosen": -49.615631103515625, + "logps/rejected": -81.5777587890625, + "loss": 0.2256, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6042308807373047, + "rewards/margins": 3.308650255203247, + "rewards/rejected": -5.912881374359131, + "step": 388 + }, + { + "epoch": 0.7381404174573055, + "grad_norm": 1.5954556465148926, + "learning_rate": 4.726482405216125e-05, + "logits/chosen": -2.8556995391845703, + "logits/rejected": -2.855954885482788, + "logps/chosen": -42.215030670166016, + "logps/rejected": -80.80831909179688, + "loss": 0.126, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.8362414836883545, + "rewards/margins": 3.931171417236328, + "rewards/rejected": -5.767412185668945, + "step": 389 + }, + { + "epoch": 0.7400379506641366, + "grad_norm": 2.600886583328247, + "learning_rate": 4.6641141068476666e-05, + "logits/chosen": -2.856656074523926, + "logits/rejected": -2.855344295501709, + "logps/chosen": -46.701438903808594, + "logps/rejected": -83.43122863769531, + "loss": 0.202, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.251277208328247, + "rewards/margins": 4.019396781921387, + "rewards/rejected": -6.270673751831055, + "step": 390 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.3920484483242035, + "learning_rate": 4.602034642648968e-05, + "logits/chosen": -2.8546879291534424, + "logits/rejected": -2.8558170795440674, + "logps/chosen": -38.9944953918457, + "logps/rejected": -84.74649047851562, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.391613483428955, + "rewards/margins": 4.989771842956543, + "rewards/rejected": -6.381385803222656, + "step": 391 + }, + { + "epoch": 0.7438330170777988, + "grad_norm": 1.4415783882141113, + "learning_rate": 4.540247373012439e-05, + "logits/chosen": -2.8511734008789062, + "logits/rejected": -2.848379611968994, + "logps/chosen": -49.37206268310547, + "logps/rejected": -83.17179107666016, + "loss": 0.0969, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.363197088241577, + "rewards/margins": 3.6192896366119385, + "rewards/rejected": -5.982486724853516, + "step": 392 + }, + { + "epoch": 0.74573055028463, + "grad_norm": 2.126145362854004, + "learning_rate": 4.4787556425138675e-05, + "logits/chosen": -2.8572943210601807, + "logits/rejected": -2.8603005409240723, + "logps/chosen": -55.69056701660156, + "logps/rejected": -87.90967559814453, + "loss": 0.1894, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.2229087352752686, + "rewards/margins": 3.3716800212860107, + "rewards/rejected": -6.5945892333984375, + "step": 393 + }, + { + "epoch": 0.7476280834914611, + "grad_norm": 1.2590640783309937, + "learning_rate": 4.417562779731355e-05, + "logits/chosen": -2.843212842941284, + "logits/rejected": -2.843592882156372, + "logps/chosen": -36.59722137451172, + "logps/rejected": -79.27423095703125, + "loss": 0.1432, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2509305477142334, + "rewards/margins": 4.489564895629883, + "rewards/rejected": -5.740495681762695, + "step": 394 + }, + { + "epoch": 0.7495256166982922, + "grad_norm": 1.5835777521133423, + "learning_rate": 4.356672097065134e-05, + "logits/chosen": -2.8567137718200684, + "logits/rejected": -2.8576886653900146, + "logps/chosen": -42.656951904296875, + "logps/rejected": -78.82894897460938, + "loss": 0.1267, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9459456205368042, + "rewards/margins": 3.693349599838257, + "rewards/rejected": -5.6392951011657715, + "step": 395 + }, + { + "epoch": 0.7514231499051234, + "grad_norm": 6.007066249847412, + "learning_rate": 4.29608689055829e-05, + "logits/chosen": -2.8007290363311768, + "logits/rejected": -2.799259901046753, + "logps/chosen": -42.47441864013672, + "logps/rejected": -78.50823974609375, + "loss": 0.2904, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.7924405336380005, + "rewards/margins": 3.7978663444519043, + "rewards/rejected": -5.590307235717773, + "step": 396 + }, + { + "epoch": 0.7533206831119544, + "grad_norm": 1.4406176805496216, + "learning_rate": 4.2358104397183264e-05, + "logits/chosen": -2.8301925659179688, + "logits/rejected": -2.836293935775757, + "logps/chosen": -44.866912841796875, + "logps/rejected": -86.76194763183594, + "loss": 0.1087, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.156167507171631, + "rewards/margins": 4.227807998657227, + "rewards/rejected": -6.383975982666016, + "step": 397 + }, + { + "epoch": 0.7552182163187856, + "grad_norm": 3.5366456508636475, + "learning_rate": 4.1758460073396436e-05, + "logits/chosen": -2.815624237060547, + "logits/rejected": -2.813138723373413, + "logps/chosen": -58.1055908203125, + "logps/rejected": -91.65219116210938, + "loss": 0.235, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.413323402404785, + "rewards/margins": 3.6791296005249023, + "rewards/rejected": -7.0924530029296875, + "step": 398 + }, + { + "epoch": 0.7571157495256167, + "grad_norm": 4.2073516845703125, + "learning_rate": 4.116196839326932e-05, + "logits/chosen": -2.8639235496520996, + "logits/rejected": -2.8655333518981934, + "logps/chosen": -55.49702072143555, + "logps/rejected": -83.14749145507812, + "loss": 0.329, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.128420352935791, + "rewards/margins": 2.9055938720703125, + "rewards/rejected": -6.034013748168945, + "step": 399 + }, + { + "epoch": 0.7590132827324478, + "grad_norm": 2.927692413330078, + "learning_rate": 4.056866164519465e-05, + "logits/chosen": -2.8432540893554688, + "logits/rejected": -2.8449456691741943, + "logps/chosen": -49.41889190673828, + "logps/rejected": -92.33918762207031, + "loss": 0.2371, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5948264598846436, + "rewards/margins": 4.3953857421875, + "rewards/rejected": -6.9902119636535645, + "step": 400 + }, + { + "epoch": 0.7609108159392789, + "grad_norm": 1.7601803541183472, + "learning_rate": 3.997857194516319e-05, + "logits/chosen": -2.879696846008301, + "logits/rejected": -2.876967191696167, + "logps/chosen": -56.179996490478516, + "logps/rejected": -98.41171264648438, + "loss": 0.1464, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0214571952819824, + "rewards/margins": 4.520035743713379, + "rewards/rejected": -7.5414934158325195, + "step": 401 + }, + { + "epoch": 0.7628083491461101, + "grad_norm": 2.3982231616973877, + "learning_rate": 3.939173123502523e-05, + "logits/chosen": -2.852635622024536, + "logits/rejected": -2.8533170223236084, + "logps/chosen": -55.57673263549805, + "logps/rejected": -82.56744384765625, + "loss": 0.5012, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.271477699279785, + "rewards/margins": 2.8199305534362793, + "rewards/rejected": -6.0914082527160645, + "step": 402 + }, + { + "epoch": 0.7647058823529411, + "grad_norm": 1.9928884506225586, + "learning_rate": 3.880817128076166e-05, + "logits/chosen": -2.8672335147857666, + "logits/rejected": -2.868398666381836, + "logps/chosen": -59.27280044555664, + "logps/rejected": -86.81177520751953, + "loss": 0.255, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.336042881011963, + "rewards/margins": 3.0970020294189453, + "rewards/rejected": -6.433044910430908, + "step": 403 + }, + { + "epoch": 0.7666034155597723, + "grad_norm": 1.0021593570709229, + "learning_rate": 3.8227923670764466e-05, + "logits/chosen": -2.8443446159362793, + "logits/rejected": -2.8455092906951904, + "logps/chosen": -47.19717025756836, + "logps/rejected": -91.81137084960938, + "loss": 0.1083, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1716463565826416, + "rewards/margins": 4.843894004821777, + "rewards/rejected": -7.01554012298584, + "step": 404 + }, + { + "epoch": 0.7685009487666035, + "grad_norm": 4.413395404815674, + "learning_rate": 3.7651019814126654e-05, + "logits/chosen": -2.8711767196655273, + "logits/rejected": -2.8668553829193115, + "logps/chosen": -53.58222198486328, + "logps/rejected": -78.13496398925781, + "loss": 0.6096, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.8774609565734863, + "rewards/margins": 2.763082981109619, + "rewards/rejected": -5.6405439376831055, + "step": 405 + }, + { + "epoch": 0.7703984819734345, + "grad_norm": 3.5480332374572754, + "learning_rate": 3.707749093894231e-05, + "logits/chosen": -2.8677728176116943, + "logits/rejected": -2.868199348449707, + "logps/chosen": -55.50325012207031, + "logps/rejected": -71.5915756225586, + "loss": 0.5146, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0266475677490234, + "rewards/margins": 1.9313273429870605, + "rewards/rejected": -4.957975387573242, + "step": 406 + }, + { + "epoch": 0.7722960151802657, + "grad_norm": 1.3300139904022217, + "learning_rate": 3.650736809061601e-05, + "logits/chosen": -2.823054075241089, + "logits/rejected": -2.8239519596099854, + "logps/chosen": -46.36268997192383, + "logps/rejected": -86.57362365722656, + "loss": 0.1313, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1757278442382812, + "rewards/margins": 4.097400665283203, + "rewards/rejected": -6.273128986358643, + "step": 407 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 1.6520681381225586, + "learning_rate": 3.594068213018249e-05, + "logits/chosen": -2.8503434658050537, + "logits/rejected": -2.8497822284698486, + "logps/chosen": -51.699127197265625, + "logps/rejected": -85.61526489257812, + "loss": 0.1361, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.76126766204834, + "rewards/margins": 3.510023355484009, + "rewards/rejected": -6.271291255950928, + "step": 408 + }, + { + "epoch": 0.7760910815939279, + "grad_norm": 4.191739559173584, + "learning_rate": 3.537746373263589e-05, + "logits/chosen": -2.8701586723327637, + "logits/rejected": -2.8664653301239014, + "logps/chosen": -58.59525680541992, + "logps/rejected": -86.79007720947266, + "loss": 0.4554, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.43748140335083, + "rewards/margins": 3.084642171859741, + "rewards/rejected": -6.522123336791992, + "step": 409 + }, + { + "epoch": 0.777988614800759, + "grad_norm": 2.7124240398406982, + "learning_rate": 3.481774338526954e-05, + "logits/chosen": -2.8723881244659424, + "logits/rejected": -2.8679914474487305, + "logps/chosen": -57.07926940917969, + "logps/rejected": -91.86167907714844, + "loss": 0.5128, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.1731064319610596, + "rewards/margins": 3.747316837310791, + "rewards/rejected": -6.92042350769043, + "step": 410 + }, + { + "epoch": 0.7798861480075902, + "grad_norm": 2.129563331604004, + "learning_rate": 3.426155138602558e-05, + "logits/chosen": -2.853167772293091, + "logits/rejected": -2.853346824645996, + "logps/chosen": -59.06671142578125, + "logps/rejected": -96.77989959716797, + "loss": 0.4821, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.553715944290161, + "rewards/margins": 3.9553308486938477, + "rewards/rejected": -7.50904655456543, + "step": 411 + }, + { + "epoch": 0.7817836812144212, + "grad_norm": 1.0496379137039185, + "learning_rate": 3.370891784185478e-05, + "logits/chosen": -2.851594924926758, + "logits/rejected": -2.850982666015625, + "logps/chosen": -45.39728546142578, + "logps/rejected": -83.35406494140625, + "loss": 0.0511, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.002638101577759, + "rewards/margins": 4.146675109863281, + "rewards/rejected": -6.149312973022461, + "step": 412 + }, + { + "epoch": 0.7836812144212524, + "grad_norm": 3.537407875061035, + "learning_rate": 3.315987266708708e-05, + "logits/chosen": -2.8707752227783203, + "logits/rejected": -2.8682148456573486, + "logps/chosen": -65.88800048828125, + "logps/rejected": -93.33538055419922, + "loss": 0.5083, + "rewards/accuracies": 0.8125, + "rewards/chosen": -4.260937690734863, + "rewards/margins": 2.699514627456665, + "rewards/rejected": -6.960453033447266, + "step": 413 + }, + { + "epoch": 0.7855787476280834, + "grad_norm": 2.036504030227661, + "learning_rate": 3.261444558181218e-05, + "logits/chosen": -2.822535753250122, + "logits/rejected": -2.8231797218322754, + "logps/chosen": -53.427268981933594, + "logps/rejected": -87.4761962890625, + "loss": 0.2484, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.955658197402954, + "rewards/margins": 3.5562376976013184, + "rewards/rejected": -6.511895656585693, + "step": 414 + }, + { + "epoch": 0.7874762808349146, + "grad_norm": 6.943586349487305, + "learning_rate": 3.207266611027069e-05, + "logits/chosen": -2.845099449157715, + "logits/rejected": -2.845888137817383, + "logps/chosen": -62.221195220947266, + "logps/rejected": -81.78451538085938, + "loss": 0.8399, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.9935247898101807, + "rewards/margins": 1.944313406944275, + "rewards/rejected": -5.937838554382324, + "step": 415 + }, + { + "epoch": 0.7893738140417458, + "grad_norm": 0.8565374612808228, + "learning_rate": 3.153456357925617e-05, + "logits/chosen": -2.877192497253418, + "logits/rejected": -2.8789360523223877, + "logps/chosen": -50.52465057373047, + "logps/rejected": -88.89420318603516, + "loss": 0.0881, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.6657750606536865, + "rewards/margins": 4.038374900817871, + "rewards/rejected": -6.7041497230529785, + "step": 416 + }, + { + "epoch": 0.7912713472485768, + "grad_norm": 2.0553338527679443, + "learning_rate": 3.100016711652752e-05, + "logits/chosen": -2.8713204860687256, + "logits/rejected": -2.8678972721099854, + "logps/chosen": -51.89461898803711, + "logps/rejected": -85.39608764648438, + "loss": 0.3038, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7338039875030518, + "rewards/margins": 3.546187400817871, + "rewards/rejected": -6.279991149902344, + "step": 417 + }, + { + "epoch": 0.793168880455408, + "grad_norm": 2.6920197010040283, + "learning_rate": 3.0469505649232333e-05, + "logits/chosen": -2.842066764831543, + "logits/rejected": -2.8389217853546143, + "logps/chosen": -62.65966796875, + "logps/rejected": -88.504638671875, + "loss": 0.4567, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.93137526512146, + "rewards/margins": 2.59926176071167, + "rewards/rejected": -6.530636787414551, + "step": 418 + }, + { + "epoch": 0.7950664136622391, + "grad_norm": 3.4587042331695557, + "learning_rate": 2.9942607902340945e-05, + "logits/chosen": -2.8469345569610596, + "logits/rejected": -2.844067335128784, + "logps/chosen": -59.01725769042969, + "logps/rejected": -73.55389404296875, + "loss": 0.7181, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.381943702697754, + "rewards/margins": 1.5324156284332275, + "rewards/rejected": -4.914359092712402, + "step": 419 + }, + { + "epoch": 0.7969639468690702, + "grad_norm": 1.682063102722168, + "learning_rate": 2.9419502397091713e-05, + "logits/chosen": -2.816669225692749, + "logits/rejected": -2.8211100101470947, + "logps/chosen": -59.50233459472656, + "logps/rejected": -79.79951477050781, + "loss": 0.2215, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7046406269073486, + "rewards/margins": 2.0102458000183105, + "rewards/rejected": -5.714886665344238, + "step": 420 + }, + { + "epoch": 0.7988614800759013, + "grad_norm": 2.2250099182128906, + "learning_rate": 2.8900217449447074e-05, + "logits/chosen": -2.8525190353393555, + "logits/rejected": -2.8517796993255615, + "logps/chosen": -49.88181686401367, + "logps/rejected": -85.27168273925781, + "loss": 0.2254, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.631014347076416, + "rewards/margins": 3.6132094860076904, + "rewards/rejected": -6.244223594665527, + "step": 421 + }, + { + "epoch": 0.8007590132827325, + "grad_norm": 3.0041167736053467, + "learning_rate": 2.8384781168560693e-05, + "logits/chosen": -2.9001643657684326, + "logits/rejected": -2.898693323135376, + "logps/chosen": -58.808204650878906, + "logps/rejected": -81.7398681640625, + "loss": 0.3757, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.4261245727539062, + "rewards/margins": 2.559650182723999, + "rewards/rejected": -5.985774993896484, + "step": 422 + }, + { + "epoch": 0.8026565464895635, + "grad_norm": 0.5320444107055664, + "learning_rate": 2.7873221455256004e-05, + "logits/chosen": -2.875701427459717, + "logits/rejected": -2.8719263076782227, + "logps/chosen": -47.414459228515625, + "logps/rejected": -90.77688598632812, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.345010757446289, + "rewards/margins": 4.371488571166992, + "rewards/rejected": -6.716499328613281, + "step": 423 + }, + { + "epoch": 0.8045540796963947, + "grad_norm": 2.517533540725708, + "learning_rate": 2.736556600051593e-05, + "logits/chosen": -2.861175298690796, + "logits/rejected": -2.861558675765991, + "logps/chosen": -58.571083068847656, + "logps/rejected": -79.73915100097656, + "loss": 0.4111, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.626460075378418, + "rewards/margins": 2.2237601280212402, + "rewards/rejected": -5.8502197265625, + "step": 424 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 1.6777634620666504, + "learning_rate": 2.6861842283983953e-05, + "logits/chosen": -2.8752527236938477, + "logits/rejected": -2.8697383403778076, + "logps/chosen": -51.51059341430664, + "logps/rejected": -83.09491729736328, + "loss": 0.1459, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8168253898620605, + "rewards/margins": 3.3736629486083984, + "rewards/rejected": -6.190488338470459, + "step": 425 + }, + { + "epoch": 0.8083491461100569, + "grad_norm": 1.8491694927215576, + "learning_rate": 2.6362077572476494e-05, + "logits/chosen": -2.8406431674957275, + "logits/rejected": -2.8375585079193115, + "logps/chosen": -48.828189849853516, + "logps/rejected": -79.91885375976562, + "loss": 0.2327, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5947234630584717, + "rewards/margins": 3.1402082443237305, + "rewards/rejected": -5.734931945800781, + "step": 426 + }, + { + "epoch": 0.8102466793168881, + "grad_norm": 3.158411979675293, + "learning_rate": 2.586629891850716e-05, + "logits/chosen": -2.8103625774383545, + "logits/rejected": -2.81091570854187, + "logps/chosen": -46.74591064453125, + "logps/rejected": -72.90126037597656, + "loss": 0.4351, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.266634702682495, + "rewards/margins": 2.8894031047821045, + "rewards/rejected": -5.1560378074646, + "step": 427 + }, + { + "epoch": 0.8121442125237192, + "grad_norm": 2.506629467010498, + "learning_rate": 2.537453315882222e-05, + "logits/chosen": -2.887596368789673, + "logits/rejected": -2.88572359085083, + "logps/chosen": -51.39200973510742, + "logps/rejected": -80.5135498046875, + "loss": 0.3368, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7697482109069824, + "rewards/margins": 3.133552074432373, + "rewards/rejected": -5.9033002853393555, + "step": 428 + }, + { + "epoch": 0.8140417457305503, + "grad_norm": 1.5517326593399048, + "learning_rate": 2.4886806912948035e-05, + "logits/chosen": -2.8428924083709717, + "logits/rejected": -2.8404133319854736, + "logps/chosen": -50.483375549316406, + "logps/rejected": -78.89204406738281, + "loss": 0.1657, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.781102180480957, + "rewards/margins": 2.961219072341919, + "rewards/rejected": -5.742321014404297, + "step": 429 + }, + { + "epoch": 0.8159392789373814, + "grad_norm": 1.744879961013794, + "learning_rate": 2.4403146581749925e-05, + "logits/chosen": -2.8795626163482666, + "logits/rejected": -2.8794806003570557, + "logps/chosen": -54.82472229003906, + "logps/rejected": -84.26485443115234, + "loss": 0.2315, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0898611545562744, + "rewards/margins": 3.029693365097046, + "rewards/rejected": -6.11955451965332, + "step": 430 + }, + { + "epoch": 0.8178368121442126, + "grad_norm": 0.5849810242652893, + "learning_rate": 2.3923578346003363e-05, + "logits/chosen": -2.888624906539917, + "logits/rejected": -2.882172107696533, + "logps/chosen": -43.547874450683594, + "logps/rejected": -82.17684173583984, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.0178041458129883, + "rewards/margins": 4.022542953491211, + "rewards/rejected": -6.040347099304199, + "step": 431 + }, + { + "epoch": 0.8197343453510436, + "grad_norm": 1.854899525642395, + "learning_rate": 2.344812816497659e-05, + "logits/chosen": -2.8812174797058105, + "logits/rejected": -2.8820064067840576, + "logps/chosen": -48.730194091796875, + "logps/rejected": -85.76399230957031, + "loss": 0.2789, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.402095079421997, + "rewards/margins": 3.9352121353149414, + "rewards/rejected": -6.337306976318359, + "step": 432 + }, + { + "epoch": 0.8216318785578748, + "grad_norm": 1.663291335105896, + "learning_rate": 2.2976821775025457e-05, + "logits/chosen": -2.8493640422821045, + "logits/rejected": -2.8438591957092285, + "logps/chosen": -55.23139572143555, + "logps/rejected": -83.07640075683594, + "loss": 0.2587, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1326611042022705, + "rewards/margins": 2.9811367988586426, + "rewards/rejected": -6.113798141479492, + "step": 433 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 2.9149162769317627, + "learning_rate": 2.2509684688200384e-05, + "logits/chosen": -2.8712880611419678, + "logits/rejected": -2.8701071739196777, + "logps/chosen": -65.67572021484375, + "logps/rejected": -84.64015197753906, + "loss": 0.4566, + "rewards/accuracies": 0.875, + "rewards/chosen": -4.112061500549316, + "rewards/margins": 2.053865671157837, + "rewards/rejected": -6.165927886962891, + "step": 434 + }, + { + "epoch": 0.825426944971537, + "grad_norm": 1.1507574319839478, + "learning_rate": 2.204674219086531e-05, + "logits/chosen": -2.8618247509002686, + "logits/rejected": -2.865861415863037, + "logps/chosen": -58.50060272216797, + "logps/rejected": -87.97689819335938, + "loss": 0.0886, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.454524040222168, + "rewards/margins": 3.154420852661133, + "rewards/rejected": -6.608944892883301, + "step": 435 + }, + { + "epoch": 0.8273244781783681, + "grad_norm": 2.227458953857422, + "learning_rate": 2.1588019342328968e-05, + "logits/chosen": -2.8568058013916016, + "logits/rejected": -2.850839614868164, + "logps/chosen": -52.666900634765625, + "logps/rejected": -85.18878936767578, + "loss": 0.2316, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.931042194366455, + "rewards/margins": 3.4106945991516113, + "rewards/rejected": -6.341736793518066, + "step": 436 + }, + { + "epoch": 0.8292220113851992, + "grad_norm": 2.9324381351470947, + "learning_rate": 2.1133540973488342e-05, + "logits/chosen": -2.854193687438965, + "logits/rejected": -2.8481411933898926, + "logps/chosen": -41.16516876220703, + "logps/rejected": -77.86811828613281, + "loss": 0.2019, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6302728652954102, + "rewards/margins": 3.9812991619110107, + "rewards/rejected": -5.611572265625, + "step": 437 + }, + { + "epoch": 0.8311195445920304, + "grad_norm": 1.595365285873413, + "learning_rate": 2.0683331685484652e-05, + "logits/chosen": -2.834395170211792, + "logits/rejected": -2.8295483589172363, + "logps/chosen": -50.14398956298828, + "logps/rejected": -86.78340148925781, + "loss": 0.1211, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.632577896118164, + "rewards/margins": 3.9267497062683105, + "rewards/rejected": -6.559328079223633, + "step": 438 + }, + { + "epoch": 0.8330170777988615, + "grad_norm": 2.071730852127075, + "learning_rate": 2.0237415848371667e-05, + "logits/chosen": -2.8482611179351807, + "logits/rejected": -2.8530232906341553, + "logps/chosen": -50.99557113647461, + "logps/rejected": -84.28203582763672, + "loss": 0.2539, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6730546951293945, + "rewards/margins": 3.566554307937622, + "rewards/rejected": -6.2396087646484375, + "step": 439 + }, + { + "epoch": 0.8349146110056926, + "grad_norm": 1.4619513750076294, + "learning_rate": 1.9795817599796418e-05, + "logits/chosen": -2.8789496421813965, + "logits/rejected": -2.876634120941162, + "logps/chosen": -46.09120559692383, + "logps/rejected": -80.81315612792969, + "loss": 0.1742, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.269618034362793, + "rewards/margins": 3.6128640174865723, + "rewards/rejected": -5.882482051849365, + "step": 440 + }, + { + "epoch": 0.8368121442125237, + "grad_norm": 2.196380376815796, + "learning_rate": 1.9358560843692787e-05, + "logits/chosen": -2.8895583152770996, + "logits/rejected": -2.890669345855713, + "logps/chosen": -57.63343811035156, + "logps/rejected": -77.12810516357422, + "loss": 0.435, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.4425575733184814, + "rewards/margins": 1.9679261445999146, + "rewards/rejected": -5.4104838371276855, + "step": 441 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.8209553956985474, + "learning_rate": 1.892566924898751e-05, + "logits/chosen": -2.870082139968872, + "logits/rejected": -2.8673083782196045, + "logps/chosen": -58.87828063964844, + "logps/rejected": -95.53388977050781, + "loss": 0.1457, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.519439697265625, + "rewards/margins": 3.827193260192871, + "rewards/rejected": -7.346632957458496, + "step": 442 + }, + { + "epoch": 0.8406072106261859, + "grad_norm": 3.9470529556274414, + "learning_rate": 1.8497166248318876e-05, + "logits/chosen": -2.847348213195801, + "logits/rejected": -2.8416690826416016, + "logps/chosen": -54.76271057128906, + "logps/rejected": -77.33283233642578, + "loss": 0.5914, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.093599796295166, + "rewards/margins": 2.3917601108551025, + "rewards/rejected": -5.485360145568848, + "step": 443 + }, + { + "epoch": 0.8425047438330171, + "grad_norm": 1.719234824180603, + "learning_rate": 1.807307503676846e-05, + "logits/chosen": -2.855872392654419, + "logits/rejected": -2.8510212898254395, + "logps/chosen": -50.66022491455078, + "logps/rejected": -79.40420532226562, + "loss": 0.2117, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6416897773742676, + "rewards/margins": 3.0230698585510254, + "rewards/rejected": -5.664760112762451, + "step": 444 + }, + { + "epoch": 0.8444022770398482, + "grad_norm": 2.0453107357025146, + "learning_rate": 1.7653418570605475e-05, + "logits/chosen": -2.822150468826294, + "logits/rejected": -2.827209711074829, + "logps/chosen": -46.558135986328125, + "logps/rejected": -78.79747009277344, + "loss": 0.1741, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.175271987915039, + "rewards/margins": 3.4841248989105225, + "rewards/rejected": -5.659396648406982, + "step": 445 + }, + { + "epoch": 0.8462998102466793, + "grad_norm": 0.71958988904953, + "learning_rate": 1.7238219566044145e-05, + "logits/chosen": -2.831434726715088, + "logits/rejected": -2.8327438831329346, + "logps/chosen": -48.3876953125, + "logps/rejected": -85.19735717773438, + "loss": 0.0798, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3890655040740967, + "rewards/margins": 3.872114419937134, + "rewards/rejected": -6.2611799240112305, + "step": 446 + }, + { + "epoch": 0.8481973434535104, + "grad_norm": 2.318376302719116, + "learning_rate": 1.6827500498014025e-05, + "logits/chosen": -2.8484344482421875, + "logits/rejected": -2.845217704772949, + "logps/chosen": -48.784584045410156, + "logps/rejected": -87.87923431396484, + "loss": 0.1897, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4254283905029297, + "rewards/margins": 4.209994792938232, + "rewards/rejected": -6.635422706604004, + "step": 447 + }, + { + "epoch": 0.8500948766603416, + "grad_norm": 0.7029361724853516, + "learning_rate": 1.6421283598943528e-05, + "logits/chosen": -2.8784377574920654, + "logits/rejected": -2.876771926879883, + "logps/chosen": -49.851253509521484, + "logps/rejected": -87.93343353271484, + "loss": 0.0479, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.5630671977996826, + "rewards/margins": 4.124255180358887, + "rewards/rejected": -6.687322616577148, + "step": 448 + }, + { + "epoch": 0.8519924098671727, + "grad_norm": 1.5745196342468262, + "learning_rate": 1.601959085755641e-05, + "logits/chosen": -2.8467390537261963, + "logits/rejected": -2.845139265060425, + "logps/chosen": -50.44337844848633, + "logps/rejected": -85.5137710571289, + "loss": 0.1323, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.772374153137207, + "rewards/margins": 3.719602346420288, + "rewards/rejected": -6.491976737976074, + "step": 449 + }, + { + "epoch": 0.8538899430740038, + "grad_norm": 2.028979778289795, + "learning_rate": 1.562244401768144e-05, + "logits/chosen": -2.8904149532318115, + "logits/rejected": -2.884728193283081, + "logps/chosen": -51.37632369995117, + "logps/rejected": -79.03163146972656, + "loss": 0.1726, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.6548404693603516, + "rewards/margins": 3.043051242828369, + "rewards/rejected": -5.697892189025879, + "step": 450 + }, + { + "epoch": 0.855787476280835, + "grad_norm": 3.1051995754241943, + "learning_rate": 1.5229864577075547e-05, + "logits/chosen": -2.870203971862793, + "logits/rejected": -2.868276357650757, + "logps/chosen": -51.94432830810547, + "logps/rejected": -81.9127426147461, + "loss": 0.4108, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.717585802078247, + "rewards/margins": 3.2403957843780518, + "rewards/rejected": -5.957981586456299, + "step": 451 + }, + { + "epoch": 0.857685009487666, + "grad_norm": 0.38512659072875977, + "learning_rate": 1.484187378626002e-05, + "logits/chosen": -2.884380578994751, + "logits/rejected": -2.87831974029541, + "logps/chosen": -49.78013229370117, + "logps/rejected": -95.21629333496094, + "loss": 0.0192, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.50553560256958, + "rewards/margins": 4.855525970458984, + "rewards/rejected": -7.361061096191406, + "step": 452 + }, + { + "epoch": 0.8595825426944972, + "grad_norm": 2.417759656906128, + "learning_rate": 1.4458492647370258e-05, + "logits/chosen": -2.8602285385131836, + "logits/rejected": -2.860398530960083, + "logps/chosen": -54.827938079833984, + "logps/rejected": -87.94296264648438, + "loss": 0.2447, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.203274726867676, + "rewards/margins": 3.4562509059906006, + "rewards/rejected": -6.6595258712768555, + "step": 453 + }, + { + "epoch": 0.8614800759013282, + "grad_norm": 2.0201404094696045, + "learning_rate": 1.4079741913018863e-05, + "logits/chosen": -2.859805107116699, + "logits/rejected": -2.8590550422668457, + "logps/chosen": -49.323951721191406, + "logps/rejected": -84.20152282714844, + "loss": 0.139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.51802396774292, + "rewards/margins": 3.6508617401123047, + "rewards/rejected": -6.168885707855225, + "step": 454 + }, + { + "epoch": 0.8633776091081594, + "grad_norm": 1.3853485584259033, + "learning_rate": 1.3705642085172366e-05, + "logits/chosen": -2.845665454864502, + "logits/rejected": -2.8443827629089355, + "logps/chosen": -41.36186218261719, + "logps/rejected": -85.60066223144531, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6150336265563965, + "rewards/margins": 4.706649303436279, + "rewards/rejected": -6.321683406829834, + "step": 455 + }, + { + "epoch": 0.8652751423149905, + "grad_norm": 0.7386859059333801, + "learning_rate": 1.3336213414041387e-05, + "logits/chosen": -2.8452632427215576, + "logits/rejected": -2.8436524868011475, + "logps/chosen": -35.882301330566406, + "logps/rejected": -82.32295227050781, + "loss": 0.0594, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484332084655762, + "rewards/margins": 4.846029281616211, + "rewards/rejected": -5.994462490081787, + "step": 456 + }, + { + "epoch": 0.8671726755218216, + "grad_norm": 1.8411693572998047, + "learning_rate": 1.2971475896984475e-05, + "logits/chosen": -2.8674824237823486, + "logits/rejected": -2.86271595954895, + "logps/chosen": -55.15509796142578, + "logps/rejected": -81.25284576416016, + "loss": 0.2226, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.0428030490875244, + "rewards/margins": 2.7782084941864014, + "rewards/rejected": -5.821011543273926, + "step": 457 + }, + { + "epoch": 0.8690702087286527, + "grad_norm": 0.8854163885116577, + "learning_rate": 1.2611449277425713e-05, + "logits/chosen": -2.8654544353485107, + "logits/rejected": -2.8657302856445312, + "logps/chosen": -56.486427307128906, + "logps/rejected": -95.68757629394531, + "loss": 0.0783, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.267014980316162, + "rewards/margins": 4.12831449508667, + "rewards/rejected": -7.395329475402832, + "step": 458 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 2.766991376876831, + "learning_rate": 1.2256153043785912e-05, + "logits/chosen": -2.8615031242370605, + "logits/rejected": -2.859715223312378, + "logps/chosen": -59.30867004394531, + "logps/rejected": -87.28678131103516, + "loss": 0.3159, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.432471990585327, + "rewards/margins": 3.0275392532348633, + "rewards/rejected": -6.4600114822387695, + "step": 459 + }, + { + "epoch": 0.872865275142315, + "grad_norm": 1.7893755435943604, + "learning_rate": 1.1905606428427774e-05, + "logits/chosen": -2.881412982940674, + "logits/rejected": -2.8749964237213135, + "logps/chosen": -49.450645446777344, + "logps/rejected": -82.91507720947266, + "loss": 0.2335, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.522357225418091, + "rewards/margins": 3.599726676940918, + "rewards/rejected": -6.122084140777588, + "step": 460 + }, + { + "epoch": 0.8747628083491461, + "grad_norm": 1.8468047380447388, + "learning_rate": 1.1559828406614714e-05, + "logits/chosen": -2.8736824989318848, + "logits/rejected": -2.8692877292633057, + "logps/chosen": -48.415138244628906, + "logps/rejected": -77.85014343261719, + "loss": 0.1343, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3501672744750977, + "rewards/margins": 3.143881320953369, + "rewards/rejected": -5.494048118591309, + "step": 461 + }, + { + "epoch": 0.8766603415559773, + "grad_norm": 3.239903688430786, + "learning_rate": 1.1218837695483853e-05, + "logits/chosen": -2.856627941131592, + "logits/rejected": -2.8530499935150146, + "logps/chosen": -48.137454986572266, + "logps/rejected": -82.09251403808594, + "loss": 0.334, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.357161045074463, + "rewards/margins": 3.6000640392303467, + "rewards/rejected": -5.957225322723389, + "step": 462 + }, + { + "epoch": 0.8785578747628083, + "grad_norm": 2.1839210987091064, + "learning_rate": 1.0882652753032795e-05, + "logits/chosen": -2.872593402862549, + "logits/rejected": -2.8667614459991455, + "logps/chosen": -51.027793884277344, + "logps/rejected": -88.22506713867188, + "loss": 0.1381, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.631429672241211, + "rewards/margins": 3.949152946472168, + "rewards/rejected": -6.580582618713379, + "step": 463 + }, + { + "epoch": 0.8804554079696395, + "grad_norm": 2.2396671772003174, + "learning_rate": 1.0551291777120464e-05, + "logits/chosen": -2.873810052871704, + "logits/rejected": -2.871718168258667, + "logps/chosen": -60.82017517089844, + "logps/rejected": -88.28251647949219, + "loss": 0.2474, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.7728419303894043, + "rewards/margins": 2.73274302482605, + "rewards/rejected": -6.505584716796875, + "step": 464 + }, + { + "epoch": 0.8823529411764706, + "grad_norm": 2.284513473510742, + "learning_rate": 1.0224772704482033e-05, + "logits/chosen": -2.8778035640716553, + "logits/rejected": -2.8775930404663086, + "logps/chosen": -49.48849105834961, + "logps/rejected": -84.93687438964844, + "loss": 0.349, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.605888605117798, + "rewards/margins": 3.584686279296875, + "rewards/rejected": -6.190574645996094, + "step": 465 + }, + { + "epoch": 0.8842504743833017, + "grad_norm": 3.715801477432251, + "learning_rate": 9.903113209758096e-06, + "logits/chosen": -2.8709402084350586, + "logits/rejected": -2.8691956996917725, + "logps/chosen": -52.825950622558594, + "logps/rejected": -80.30274200439453, + "loss": 0.3896, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.904201030731201, + "rewards/margins": 2.963268280029297, + "rewards/rejected": -5.867469787597656, + "step": 466 + }, + { + "epoch": 0.8861480075901328, + "grad_norm": 1.438846230506897, + "learning_rate": 9.586330704537849e-06, + "logits/chosen": -2.8450632095336914, + "logits/rejected": -2.8445072174072266, + "logps/chosen": -53.51820755004883, + "logps/rejected": -85.2142333984375, + "loss": 0.1714, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0311946868896484, + "rewards/margins": 3.2895748615264893, + "rewards/rejected": -6.320769786834717, + "step": 467 + }, + { + "epoch": 0.888045540796964, + "grad_norm": 2.9390602111816406, + "learning_rate": 9.274442336416567e-06, + "logits/chosen": -2.845296621322632, + "logits/rejected": -2.842207431793213, + "logps/chosen": -51.43974304199219, + "logps/rejected": -87.38579559326172, + "loss": 0.1854, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.63873028755188, + "rewards/margins": 3.945521354675293, + "rewards/rejected": -6.584251403808594, + "step": 468 + }, + { + "epoch": 0.889943074003795, + "grad_norm": 2.258697032928467, + "learning_rate": 8.967464988067475e-06, + "logits/chosen": -2.887526512145996, + "logits/rejected": -2.8856074810028076, + "logps/chosen": -46.49995422363281, + "logps/rejected": -79.75428771972656, + "loss": 0.2359, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2005090713500977, + "rewards/margins": 3.588965892791748, + "rewards/rejected": -5.7894744873046875, + "step": 469 + }, + { + "epoch": 0.8918406072106262, + "grad_norm": 0.39533525705337524, + "learning_rate": 8.665415276327871e-06, + "logits/chosen": -2.8955729007720947, + "logits/rejected": -2.888808012008667, + "logps/chosen": -47.26777648925781, + "logps/rejected": -95.47332000732422, + "loss": 0.0254, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.2491884231567383, + "rewards/margins": 5.138714790344238, + "rewards/rejected": -7.387903213500977, + "step": 470 + }, + { + "epoch": 0.8937381404174574, + "grad_norm": 1.2186378240585327, + "learning_rate": 8.368309551299536e-06, + "logits/chosen": -2.856180429458618, + "logits/rejected": -2.849057912826538, + "logps/chosen": -50.02391052246094, + "logps/rejected": -84.98136138916016, + "loss": 0.0835, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.512538433074951, + "rewards/margins": 3.8806662559509277, + "rewards/rejected": -6.393204689025879, + "step": 471 + }, + { + "epoch": 0.8956356736242884, + "grad_norm": 4.68803071975708, + "learning_rate": 8.076163895463861e-06, + "logits/chosen": -2.8694870471954346, + "logits/rejected": -2.8696372509002686, + "logps/chosen": -63.10512924194336, + "logps/rejected": -81.60807037353516, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.999472141265869, + "rewards/margins": 1.9333720207214355, + "rewards/rejected": -5.932844161987305, + "step": 472 + }, + { + "epoch": 0.8975332068311196, + "grad_norm": 1.2908293008804321, + "learning_rate": 7.788994122811178e-06, + "logits/chosen": -2.8755910396575928, + "logits/rejected": -2.874612331390381, + "logps/chosen": -54.27519989013672, + "logps/rejected": -92.8865966796875, + "loss": 0.1045, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9691150188446045, + "rewards/margins": 4.181435585021973, + "rewards/rejected": -7.150550842285156, + "step": 473 + }, + { + "epoch": 0.8994307400379506, + "grad_norm": 1.4010708332061768, + "learning_rate": 7.506815777984788e-06, + "logits/chosen": -2.865668773651123, + "logits/rejected": -2.86503267288208, + "logps/chosen": -51.853553771972656, + "logps/rejected": -90.462158203125, + "loss": 0.0805, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.75411319732666, + "rewards/margins": 4.066606521606445, + "rewards/rejected": -6.8207197189331055, + "step": 474 + }, + { + "epoch": 0.9013282732447818, + "grad_norm": 0.9278813600540161, + "learning_rate": 7.229644135439473e-06, + "logits/chosen": -2.874293804168701, + "logits/rejected": -2.873574733734131, + "logps/chosen": -51.141422271728516, + "logps/rejected": -86.1817855834961, + "loss": 0.0876, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.603024959564209, + "rewards/margins": 3.8318567276000977, + "rewards/rejected": -6.434881687164307, + "step": 475 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 2.820253372192383, + "learning_rate": 6.957494198614778e-06, + "logits/chosen": -2.8808329105377197, + "logits/rejected": -2.8789174556732178, + "logps/chosen": -52.99219512939453, + "logps/rejected": -86.43820190429688, + "loss": 0.1575, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9275622367858887, + "rewards/margins": 3.5494418144226074, + "rewards/rejected": -6.477004051208496, + "step": 476 + }, + { + "epoch": 0.905123339658444, + "grad_norm": 3.0501718521118164, + "learning_rate": 6.690380699122767e-06, + "logits/chosen": -2.8456592559814453, + "logits/rejected": -2.848313093185425, + "logps/chosen": -44.644290924072266, + "logps/rejected": -78.11421966552734, + "loss": 0.1851, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.182406425476074, + "rewards/margins": 3.503129482269287, + "rewards/rejected": -5.685535430908203, + "step": 477 + }, + { + "epoch": 0.9070208728652751, + "grad_norm": 3.485112190246582, + "learning_rate": 6.428318095950647e-06, + "logits/chosen": -2.8692822456359863, + "logits/rejected": -2.8702754974365234, + "logps/chosen": -55.28034973144531, + "logps/rejected": -90.0372314453125, + "loss": 0.2902, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.2888870239257812, + "rewards/margins": 3.50645112991333, + "rewards/rejected": -6.795337677001953, + "step": 478 + }, + { + "epoch": 0.9089184060721063, + "grad_norm": 2.162829637527466, + "learning_rate": 6.171320574678063e-06, + "logits/chosen": -2.8754329681396484, + "logits/rejected": -2.8744900226593018, + "logps/chosen": -46.66167068481445, + "logps/rejected": -88.14268493652344, + "loss": 0.1513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3261728286743164, + "rewards/margins": 4.249751567840576, + "rewards/rejected": -6.575924873352051, + "step": 479 + }, + { + "epoch": 0.9108159392789373, + "grad_norm": 3.0580861568450928, + "learning_rate": 5.919402046709288e-06, + "logits/chosen": -2.8514211177825928, + "logits/rejected": -2.848707675933838, + "logps/chosen": -52.81618118286133, + "logps/rejected": -86.4152603149414, + "loss": 0.2967, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.9318675994873047, + "rewards/margins": 3.5067787170410156, + "rewards/rejected": -6.43864631652832, + "step": 480 + }, + { + "epoch": 0.9127134724857685, + "grad_norm": 3.6979422569274902, + "learning_rate": 5.672576148520137e-06, + "logits/chosen": -2.889620065689087, + "logits/rejected": -2.8883321285247803, + "logps/chosen": -60.14141082763672, + "logps/rejected": -79.537109375, + "loss": 0.7965, + "rewards/accuracies": 0.75, + "rewards/chosen": -3.719564437866211, + "rewards/margins": 2.0429325103759766, + "rewards/rejected": -5.7624969482421875, + "step": 481 + }, + { + "epoch": 0.9146110056925996, + "grad_norm": 3.231797933578491, + "learning_rate": 5.430856240919779e-06, + "logits/chosen": -2.8718159198760986, + "logits/rejected": -2.8671488761901855, + "logps/chosen": -59.959632873535156, + "logps/rejected": -91.78632354736328, + "loss": 0.3767, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.6989855766296387, + "rewards/margins": 3.2586991786956787, + "rewards/rejected": -6.9576849937438965, + "step": 482 + }, + { + "epoch": 0.9165085388994307, + "grad_norm": 1.663153886795044, + "learning_rate": 5.194255408327619e-06, + "logits/chosen": -2.8569464683532715, + "logits/rejected": -2.851780652999878, + "logps/chosen": -54.817230224609375, + "logps/rejected": -77.68112182617188, + "loss": 0.1908, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.1171646118164062, + "rewards/margins": 2.463062286376953, + "rewards/rejected": -5.580226898193359, + "step": 483 + }, + { + "epoch": 0.9184060721062619, + "grad_norm": 4.839911937713623, + "learning_rate": 4.962786458064972e-06, + "logits/chosen": -2.8591115474700928, + "logits/rejected": -2.8583099842071533, + "logps/chosen": -51.154937744140625, + "logps/rejected": -89.58143615722656, + "loss": 0.514, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.68982195854187, + "rewards/margins": 4.086490631103516, + "rewards/rejected": -6.776312828063965, + "step": 484 + }, + { + "epoch": 0.920303605313093, + "grad_norm": 2.3768200874328613, + "learning_rate": 4.7364619196617495e-06, + "logits/chosen": -2.891602039337158, + "logits/rejected": -2.8876867294311523, + "logps/chosen": -48.912391662597656, + "logps/rejected": -88.95805358886719, + "loss": 0.1241, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4300827980041504, + "rewards/margins": 4.336850643157959, + "rewards/rejected": -6.766933441162109, + "step": 485 + }, + { + "epoch": 0.9222011385199241, + "grad_norm": 1.8126496076583862, + "learning_rate": 4.515294044178331e-06, + "logits/chosen": -2.8283488750457764, + "logits/rejected": -2.826066493988037, + "logps/chosen": -58.70185470581055, + "logps/rejected": -91.96338653564453, + "loss": 0.1629, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.5918807983398438, + "rewards/margins": 3.4037351608276367, + "rewards/rejected": -6.9956159591674805, + "step": 486 + }, + { + "epoch": 0.9240986717267552, + "grad_norm": 1.346358060836792, + "learning_rate": 4.299294803542331e-06, + "logits/chosen": -2.8834807872772217, + "logits/rejected": -2.8820314407348633, + "logps/chosen": -47.56989288330078, + "logps/rejected": -80.78783416748047, + "loss": 0.1421, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3317975997924805, + "rewards/margins": 3.4790170192718506, + "rewards/rejected": -5.810814380645752, + "step": 487 + }, + { + "epoch": 0.9259962049335864, + "grad_norm": 3.8116936683654785, + "learning_rate": 4.0884758899006e-06, + "logits/chosen": -2.836764335632324, + "logits/rejected": -2.840043783187866, + "logps/chosen": -50.94938659667969, + "logps/rejected": -78.5450210571289, + "loss": 0.4692, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.733532428741455, + "rewards/margins": 2.9857707023620605, + "rewards/rejected": -5.719302654266357, + "step": 488 + }, + { + "epoch": 0.9278937381404174, + "grad_norm": 3.3547513484954834, + "learning_rate": 3.882848714986243e-06, + "logits/chosen": -2.890246868133545, + "logits/rejected": -2.886960506439209, + "logps/chosen": -57.828041076660156, + "logps/rejected": -84.50794219970703, + "loss": 0.2635, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.4307875633239746, + "rewards/margins": 2.7383012771606445, + "rewards/rejected": -6.169088840484619, + "step": 489 + }, + { + "epoch": 0.9297912713472486, + "grad_norm": 1.3184571266174316, + "learning_rate": 3.6824244095010065e-06, + "logits/chosen": -2.83508563041687, + "logits/rejected": -2.8344218730926514, + "logps/chosen": -46.53615951538086, + "logps/rejected": -92.15784454345703, + "loss": 0.0859, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1809654235839844, + "rewards/margins": 4.734975814819336, + "rewards/rejected": -6.9159417152404785, + "step": 490 + }, + { + "epoch": 0.9316888045540797, + "grad_norm": 2.9140141010284424, + "learning_rate": 3.487213822512714e-06, + "logits/chosen": -2.8875601291656494, + "logits/rejected": -2.8827908039093018, + "logps/chosen": -50.426536560058594, + "logps/rejected": -89.96681213378906, + "loss": 0.3374, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6878647804260254, + "rewards/margins": 4.1089630126953125, + "rewards/rejected": -6.796828269958496, + "step": 491 + }, + { + "epoch": 0.9335863377609108, + "grad_norm": 1.0782984495162964, + "learning_rate": 3.2972275208679625e-06, + "logits/chosen": -2.8685128688812256, + "logits/rejected": -2.8658132553100586, + "logps/chosen": -48.13031768798828, + "logps/rejected": -84.63717651367188, + "loss": 0.0898, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3174753189086914, + "rewards/margins": 3.971602439880371, + "rewards/rejected": -6.2890777587890625, + "step": 492 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 3.8515820503234863, + "learning_rate": 3.112475788620217e-06, + "logits/chosen": -2.8882944583892822, + "logits/rejected": -2.8865597248077393, + "logps/chosen": -49.90972900390625, + "logps/rejected": -77.10386657714844, + "loss": 0.3545, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.617600202560425, + "rewards/margins": 2.900125741958618, + "rewards/rejected": -5.517725944519043, + "step": 493 + }, + { + "epoch": 0.937381404174573, + "grad_norm": 3.0631866455078125, + "learning_rate": 2.932968626473065e-06, + "logits/chosen": -2.8472578525543213, + "logits/rejected": -2.8467025756835938, + "logps/chosen": -48.0924072265625, + "logps/rejected": -73.13133239746094, + "loss": 0.5153, + "rewards/accuracies": 0.75, + "rewards/chosen": -2.564882278442383, + "rewards/margins": 2.5196237564086914, + "rewards/rejected": -5.084506034851074, + "step": 494 + }, + { + "epoch": 0.9392789373814042, + "grad_norm": 2.3246684074401855, + "learning_rate": 2.7587157512388718e-06, + "logits/chosen": -2.8876593112945557, + "logits/rejected": -2.886383295059204, + "logps/chosen": -48.26333999633789, + "logps/rejected": -82.16961669921875, + "loss": 0.2638, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.446296215057373, + "rewards/margins": 3.771624803543091, + "rewards/rejected": -6.217921257019043, + "step": 495 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.9831016063690186, + "learning_rate": 2.589726595312858e-06, + "logits/chosen": -2.8822598457336426, + "logits/rejected": -2.879408121109009, + "logps/chosen": -42.318702697753906, + "logps/rejected": -80.41962432861328, + "loss": 0.0659, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6236395835876465, + "rewards/margins": 4.238935470581055, + "rewards/rejected": -5.862575531005859, + "step": 496 + }, + { + "epoch": 0.9430740037950665, + "grad_norm": 1.4202706813812256, + "learning_rate": 2.426010306162485e-06, + "logits/chosen": -2.83363938331604, + "logits/rejected": -2.8289742469787598, + "logps/chosen": -51.449668884277344, + "logps/rejected": -89.0745849609375, + "loss": 0.1617, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.716837167739868, + "rewards/margins": 4.036784648895264, + "rewards/rejected": -6.753622055053711, + "step": 497 + }, + { + "epoch": 0.9449715370018975, + "grad_norm": 0.9878135323524475, + "learning_rate": 2.2675757458323065e-06, + "logits/chosen": -2.8792364597320557, + "logits/rejected": -2.8786449432373047, + "logps/chosen": -48.229270935058594, + "logps/rejected": -84.9887466430664, + "loss": 0.0762, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.416011095046997, + "rewards/margins": 3.865017890930176, + "rewards/rejected": -6.281028747558594, + "step": 498 + }, + { + "epoch": 0.9468690702087287, + "grad_norm": 1.8558670282363892, + "learning_rate": 2.1144314904642195e-06, + "logits/chosen": -2.8796749114990234, + "logits/rejected": -2.8758296966552734, + "logps/chosen": -43.505531311035156, + "logps/rejected": -75.1484146118164, + "loss": 0.2833, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.8704358339309692, + "rewards/margins": 3.288053274154663, + "rewards/rejected": -5.158489227294922, + "step": 499 + }, + { + "epoch": 0.9487666034155597, + "grad_norm": 1.5600035190582275, + "learning_rate": 1.9665858298333005e-06, + "logits/chosen": -2.855041027069092, + "logits/rejected": -2.856189727783203, + "logps/chosen": -45.85496139526367, + "logps/rejected": -85.69245147705078, + "loss": 0.1211, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0445146560668945, + "rewards/margins": 4.26096248626709, + "rewards/rejected": -6.305477142333984, + "step": 500 + }, + { + "epoch": 0.9506641366223909, + "grad_norm": 2.5038869380950928, + "learning_rate": 1.8240467668990457e-06, + "logits/chosen": -2.883979082107544, + "logits/rejected": -2.8837075233459473, + "logps/chosen": -55.38407516479492, + "logps/rejected": -86.02963256835938, + "loss": 0.2542, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.258172035217285, + "rewards/margins": 3.1368069648742676, + "rewards/rejected": -6.394979000091553, + "step": 501 + }, + { + "epoch": 0.952561669829222, + "grad_norm": 2.0583744049072266, + "learning_rate": 1.6868220173721471e-06, + "logits/chosen": -2.8597681522369385, + "logits/rejected": -2.8624424934387207, + "logps/chosen": -51.48234939575195, + "logps/rejected": -80.75935363769531, + "loss": 0.3349, + "rewards/accuracies": 0.8125, + "rewards/chosen": -2.759897232055664, + "rewards/margins": 3.021780014038086, + "rewards/rejected": -5.78167724609375, + "step": 502 + }, + { + "epoch": 0.9544592030360531, + "grad_norm": 2.2353053092956543, + "learning_rate": 1.5549190092968736e-06, + "logits/chosen": -2.8665668964385986, + "logits/rejected": -2.8670010566711426, + "logps/chosen": -50.2732048034668, + "logps/rejected": -88.10029602050781, + "loss": 0.386, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.653740406036377, + "rewards/margins": 4.059865951538086, + "rewards/rejected": -6.713606834411621, + "step": 503 + }, + { + "epoch": 0.9563567362428842, + "grad_norm": 2.396918773651123, + "learning_rate": 1.4283448826489798e-06, + "logits/chosen": -2.8291938304901123, + "logits/rejected": -2.8266398906707764, + "logps/chosen": -46.44121551513672, + "logps/rejected": -79.84049987792969, + "loss": 0.2428, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.244828701019287, + "rewards/margins": 3.5079965591430664, + "rewards/rejected": -5.7528252601623535, + "step": 504 + }, + { + "epoch": 0.9582542694497154, + "grad_norm": 1.3848168849945068, + "learning_rate": 1.3071064889491724e-06, + "logits/chosen": -2.8477964401245117, + "logits/rejected": -2.8448750972747803, + "logps/chosen": -41.835227966308594, + "logps/rejected": -74.19090270996094, + "loss": 0.1726, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6162993907928467, + "rewards/margins": 3.534118890762329, + "rewards/rejected": -5.150418281555176, + "step": 505 + }, + { + "epoch": 0.9601518026565465, + "grad_norm": 1.7604390382766724, + "learning_rate": 1.1912103908922945e-06, + "logits/chosen": -2.871805429458618, + "logits/rejected": -2.8669626712799072, + "logps/chosen": -49.46722412109375, + "logps/rejected": -83.61421203613281, + "loss": 0.2727, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5857064723968506, + "rewards/margins": 3.5844779014587402, + "rewards/rejected": -6.170184135437012, + "step": 506 + }, + { + "epoch": 0.9620493358633776, + "grad_norm": 2.633267641067505, + "learning_rate": 1.0806628619920322e-06, + "logits/chosen": -2.8769755363464355, + "logits/rejected": -2.8758394718170166, + "logps/chosen": -51.57659149169922, + "logps/rejected": -75.70911407470703, + "loss": 0.3124, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.7957983016967773, + "rewards/margins": 2.4409711360931396, + "rewards/rejected": -5.236769676208496, + "step": 507 + }, + { + "epoch": 0.9639468690702088, + "grad_norm": 0.38449132442474365, + "learning_rate": 9.754698862413759e-07, + "logits/chosen": -2.8793885707855225, + "logits/rejected": -2.876129388809204, + "logps/chosen": -53.12155532836914, + "logps/rejected": -92.95342254638672, + "loss": 0.0331, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.927220106124878, + "rewards/margins": 4.145195007324219, + "rewards/rejected": -7.072414875030518, + "step": 508 + }, + { + "epoch": 0.9658444022770398, + "grad_norm": 1.3234167098999023, + "learning_rate": 8.75637157788689e-07, + "logits/chosen": -2.8668880462646484, + "logits/rejected": -2.8626136779785156, + "logps/chosen": -46.708091735839844, + "logps/rejected": -91.33255767822266, + "loss": 0.192, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.2253031730651855, + "rewards/margins": 4.6860833168029785, + "rewards/rejected": -6.911386013031006, + "step": 509 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 1.2830538749694824, + "learning_rate": 7.81170080629412e-07, + "logits/chosen": -2.8674538135528564, + "logits/rejected": -2.8622677326202393, + "logps/chosen": -50.81001281738281, + "logps/rejected": -82.64586639404297, + "loss": 0.1122, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.573124408721924, + "rewards/margins": 3.5462069511413574, + "rewards/rejected": -6.1193318367004395, + "step": 510 + }, + { + "epoch": 0.969639468690702, + "grad_norm": 2.888144016265869, + "learning_rate": 6.920737683136613e-07, + "logits/chosen": -2.8974032402038574, + "logits/rejected": -2.891505479812622, + "logps/chosen": -53.14949035644531, + "logps/rejected": -89.72847747802734, + "loss": 0.2069, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.929083824157715, + "rewards/margins": 3.8716461658477783, + "rewards/rejected": -6.800730228424072, + "step": 511 + }, + { + "epoch": 0.9715370018975332, + "grad_norm": 1.7620785236358643, + "learning_rate": 6.083530436693408e-07, + "logits/chosen": -2.8567254543304443, + "logits/rejected": -2.863250970840454, + "logps/chosen": -49.66737365722656, + "logps/rejected": -83.99765014648438, + "loss": 0.2175, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.5828912258148193, + "rewards/margins": 3.6682820320129395, + "rewards/rejected": -6.25117301940918, + "step": 512 + }, + { + "epoch": 0.9734345351043643, + "grad_norm": 1.5268384218215942, + "learning_rate": 5.300124385410943e-07, + "logits/chosen": -2.8635151386260986, + "logits/rejected": -2.86435866355896, + "logps/chosen": -55.493438720703125, + "logps/rejected": -85.88677978515625, + "loss": 0.1664, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0996596813201904, + "rewards/margins": 3.1925599575042725, + "rewards/rejected": -6.292219161987305, + "step": 513 + }, + { + "epoch": 0.9753320683111955, + "grad_norm": 2.3480827808380127, + "learning_rate": 4.570561935450468e-07, + "logits/chosen": -2.8450422286987305, + "logits/rejected": -2.8451755046844482, + "logps/chosen": -58.186397552490234, + "logps/rejected": -92.19591522216797, + "loss": 0.4327, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.466181755065918, + "rewards/margins": 3.5265114307403564, + "rewards/rejected": -6.992692947387695, + "step": 514 + }, + { + "epoch": 0.9772296015180265, + "grad_norm": 2.6257131099700928, + "learning_rate": 3.8948825783918784e-07, + "logits/chosen": -2.8356828689575195, + "logits/rejected": -2.834594964981079, + "logps/chosen": -51.776058197021484, + "logps/rejected": -81.43892669677734, + "loss": 0.1723, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8794238567352295, + "rewards/margins": 3.0788588523864746, + "rewards/rejected": -5.958282947540283, + "step": 515 + }, + { + "epoch": 0.9791271347248577, + "grad_norm": 3.8994312286376953, + "learning_rate": 3.273122889096536e-07, + "logits/chosen": -2.866316556930542, + "logits/rejected": -2.8655130863189697, + "logps/chosen": -57.50481414794922, + "logps/rejected": -87.61444854736328, + "loss": 0.2925, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.3754281997680664, + "rewards/margins": 3.1350836753845215, + "rewards/rejected": -6.510511875152588, + "step": 516 + }, + { + "epoch": 0.9810246679316889, + "grad_norm": 1.6719425916671753, + "learning_rate": 2.7053165237268527e-07, + "logits/chosen": -2.875308036804199, + "logits/rejected": -2.874027967453003, + "logps/chosen": -60.042579650878906, + "logps/rejected": -89.38323974609375, + "loss": 0.2483, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.7195136547088623, + "rewards/margins": 3.0455737113952637, + "rewards/rejected": -6.765087127685547, + "step": 517 + }, + { + "epoch": 0.9829222011385199, + "grad_norm": 2.5794994831085205, + "learning_rate": 2.191494217925305e-07, + "logits/chosen": -2.8739867210388184, + "logits/rejected": -2.876070022583008, + "logps/chosen": -47.57518768310547, + "logps/rejected": -83.28882598876953, + "loss": 0.2543, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.296959638595581, + "rewards/margins": 3.761139392852783, + "rewards/rejected": -6.058098793029785, + "step": 518 + }, + { + "epoch": 0.9848197343453511, + "grad_norm": 1.4231163263320923, + "learning_rate": 1.7316837851499844e-07, + "logits/chosen": -2.8610801696777344, + "logits/rejected": -2.858586072921753, + "logps/chosen": -49.79618453979492, + "logps/rejected": -86.7958755493164, + "loss": 0.1576, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.5396759510040283, + "rewards/margins": 3.977015972137451, + "rewards/rejected": -6.516692161560059, + "step": 519 + }, + { + "epoch": 0.9867172675521821, + "grad_norm": 3.5399532318115234, + "learning_rate": 1.3259101151694708e-07, + "logits/chosen": -2.888908624649048, + "logits/rejected": -2.8908607959747314, + "logps/chosen": -57.93926239013672, + "logps/rejected": -87.50239562988281, + "loss": 0.348, + "rewards/accuracies": 0.8125, + "rewards/chosen": -3.480964183807373, + "rewards/margins": 3.0687217712402344, + "rewards/rejected": -6.549685478210449, + "step": 520 + }, + { + "epoch": 0.9886148007590133, + "grad_norm": 3.352266788482666, + "learning_rate": 9.741951727152421e-08, + "logits/chosen": -2.894845485687256, + "logits/rejected": -2.8936767578125, + "logps/chosen": -56.55711364746094, + "logps/rejected": -79.75418853759766, + "loss": 0.3825, + "rewards/accuracies": 0.6875, + "rewards/chosen": -3.3504998683929443, + "rewards/margins": 2.4052634239196777, + "rewards/rejected": -5.755763053894043, + "step": 521 + }, + { + "epoch": 0.9905123339658444, + "grad_norm": 1.2697792053222656, + "learning_rate": 6.765579962928482e-08, + "logits/chosen": -2.8767342567443848, + "logits/rejected": -2.880286931991577, + "logps/chosen": -52.612449645996094, + "logps/rejected": -82.76181030273438, + "loss": 0.0968, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.910796642303467, + "rewards/margins": 3.1595427989959717, + "rewards/rejected": -6.070339679718018, + "step": 522 + }, + { + "epoch": 0.9924098671726755, + "grad_norm": 1.018975853919983, + "learning_rate": 4.330146971515125e-08, + "logits/chosen": -2.852424144744873, + "logits/rejected": -2.8456950187683105, + "logps/chosen": -51.23678207397461, + "logps/rejected": -86.56326293945312, + "loss": 0.1052, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8189241886138916, + "rewards/margins": 3.7502474784851074, + "rewards/rejected": -6.56917142868042, + "step": 523 + }, + { + "epoch": 0.9943074003795066, + "grad_norm": 0.7454613447189331, + "learning_rate": 2.435784584114975e-08, + "logits/chosen": -2.8692071437835693, + "logits/rejected": -2.8686625957489014, + "logps/chosen": -52.17524719238281, + "logps/rejected": -87.58660888671875, + "loss": 0.1174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.771207332611084, + "rewards/margins": 3.7996647357940674, + "rewards/rejected": -6.5708723068237305, + "step": 524 + }, + { + "epoch": 0.9962049335863378, + "grad_norm": 3.760188102722168, + "learning_rate": 1.0825953435122938e-08, + "logits/chosen": -2.8647942543029785, + "logits/rejected": -2.8657193183898926, + "logps/chosen": -49.69865417480469, + "logps/rejected": -85.46101379394531, + "loss": 0.3014, + "rewards/accuracies": 0.875, + "rewards/chosen": -2.602159261703491, + "rewards/margins": 3.695681571960449, + "rewards/rejected": -6.297840595245361, + "step": 525 + }, + { + "epoch": 0.9981024667931688, + "grad_norm": 1.6153333187103271, + "learning_rate": 2.7065249851743193e-09, + "logits/chosen": -2.8594605922698975, + "logits/rejected": -2.861586332321167, + "logps/chosen": -52.35853576660156, + "logps/rejected": -86.91337585449219, + "loss": 0.0971, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.7869677543640137, + "rewards/margins": 3.7072904109954834, + "rewards/rejected": -6.494257926940918, + "step": 526 + }, + { + "epoch": 1.0, + "grad_norm": 3.749000072479248, + "learning_rate": 0.0, + "logits/chosen": -2.897618055343628, + "logits/rejected": -2.891883373260498, + "logps/chosen": -56.75947189331055, + "logps/rejected": -83.30915069580078, + "loss": 0.3765, + "rewards/accuracies": 0.8333333730697632, + "rewards/chosen": -3.231624126434326, + "rewards/margins": 2.8773982524871826, + "rewards/rejected": -6.10902214050293, + "step": 527 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -2.868335485458374, + "eval_logits/rejected": -2.866347312927246, + "eval_logps/chosen": -49.55250930786133, + "eval_logps/rejected": -86.6895523071289, + "eval_loss": 0.14940239489078522, + "eval_rewards/accuracies": 0.945090115070343, + "eval_rewards/chosen": -2.55690598487854, + "eval_rewards/margins": 3.9059858322143555, + "eval_rewards/rejected": -6.462891578674316, + "eval_runtime": 5440.1428, + "eval_samples_per_second": 1.548, + "eval_steps_per_second": 0.097, + "step": 527 + } + ], + "logging_steps": 1, + "max_steps": 527, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}