{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 527, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018975332068311196, "grad_norm": 1.0043981075286865, "learning_rate": 2.0000000000000003e-06, "logits/chosen": -2.840092182159424, "logits/rejected": -2.8336455821990967, "logps/chosen": -25.032325744628906, "logps/rejected": -22.43791389465332, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.003795066413662239, "grad_norm": 1.213585376739502, "learning_rate": 4.000000000000001e-06, "logits/chosen": -2.8843088150024414, "logits/rejected": -2.880852222442627, "logps/chosen": -24.25356101989746, "logps/rejected": -22.29548454284668, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0056925996204933585, "grad_norm": 1.4231489896774292, "learning_rate": 6e-06, "logits/chosen": -2.7949483394622803, "logits/rejected": -2.794952392578125, "logps/chosen": -23.652122497558594, "logps/rejected": -22.365129470825195, "loss": 0.6902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0177607424557209, "rewards/margins": 0.005833745002746582, "rewards/rejected": -0.023594487458467484, "step": 3 }, { "epoch": 0.007590132827324478, "grad_norm": 1.0011111497879028, "learning_rate": 8.000000000000001e-06, "logits/chosen": -2.8000831604003906, "logits/rejected": -2.7997589111328125, "logps/chosen": -25.592479705810547, "logps/rejected": -23.90885353088379, "loss": 0.6911, "rewards/accuracies": 0.5, "rewards/chosen": -0.0664670318365097, "rewards/margins": 0.004310930147767067, "rewards/rejected": -0.07077796757221222, "step": 4 }, { "epoch": 0.009487666034155597, "grad_norm": 0.9724621772766113, "learning_rate": 1e-05, "logits/chosen": -2.8149185180664062, "logits/rejected": -2.813040018081665, "logps/chosen": -23.858686447143555, "logps/rejected": -22.61505699157715, "loss": 0.686, "rewards/accuracies": 0.625, "rewards/chosen": -0.10101969540119171, "rewards/margins": 0.014916013926267624, "rewards/rejected": -0.11593571305274963, "step": 5 }, { "epoch": 0.011385199240986717, "grad_norm": 0.9295000433921814, "learning_rate": 1.2e-05, "logits/chosen": -2.808767080307007, "logits/rejected": -2.8061323165893555, "logps/chosen": -25.594276428222656, "logps/rejected": -24.02318572998047, "loss": 0.6875, "rewards/accuracies": 0.5, "rewards/chosen": -0.18439313769340515, "rewards/margins": 0.012782273814082146, "rewards/rejected": -0.19717541337013245, "step": 6 }, { "epoch": 0.013282732447817837, "grad_norm": 0.9410561323165894, "learning_rate": 1.4000000000000001e-05, "logits/chosen": -2.794262409210205, "logits/rejected": -2.792712926864624, "logps/chosen": -26.27347183227539, "logps/rejected": -24.40791893005371, "loss": 0.7022, "rewards/accuracies": 0.4375, "rewards/chosen": -0.2307298630475998, "rewards/margins": -0.016173291951417923, "rewards/rejected": -0.21455657482147217, "step": 7 }, { "epoch": 0.015180265654648957, "grad_norm": 0.963943600654602, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -2.861567735671997, "logits/rejected": -2.862137794494629, "logps/chosen": -26.166967391967773, "logps/rejected": -24.172080993652344, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": -0.28767508268356323, "rewards/margins": 0.03301116079092026, "rewards/rejected": -0.3206862211227417, "step": 8 }, { "epoch": 0.017077798861480076, "grad_norm": 0.8776592016220093, "learning_rate": 1.8e-05, "logits/chosen": -2.8510162830352783, "logits/rejected": -2.852877616882324, "logps/chosen": -26.08197784423828, "logps/rejected": -25.353015899658203, "loss": 0.6532, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2958042621612549, "rewards/margins": 0.08714231848716736, "rewards/rejected": -0.38294661045074463, "step": 9 }, { "epoch": 0.018975332068311195, "grad_norm": 0.923055112361908, "learning_rate": 2e-05, "logits/chosen": -2.845109462738037, "logits/rejected": -2.8453867435455322, "logps/chosen": -27.779403686523438, "logps/rejected": -26.860719680786133, "loss": 0.6573, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3790830373764038, "rewards/margins": 0.07841245085000992, "rewards/rejected": -0.45749545097351074, "step": 10 }, { "epoch": 0.020872865275142316, "grad_norm": 1.1252574920654297, "learning_rate": 2.2000000000000003e-05, "logits/chosen": -2.831721067428589, "logits/rejected": -2.82804799079895, "logps/chosen": -27.735706329345703, "logps/rejected": -26.28765869140625, "loss": 0.5991, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3327995538711548, "rewards/margins": 0.20304027199745178, "rewards/rejected": -0.5358397960662842, "step": 11 }, { "epoch": 0.022770398481973434, "grad_norm": 1.247209072113037, "learning_rate": 2.4e-05, "logits/chosen": -2.832109212875366, "logits/rejected": -2.8269498348236084, "logps/chosen": -29.082386016845703, "logps/rejected": -27.44651222229004, "loss": 0.6788, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5191504955291748, "rewards/margins": 0.03733288496732712, "rewards/rejected": -0.5564833879470825, "step": 12 }, { "epoch": 0.024667931688804556, "grad_norm": 0.9526516199111938, "learning_rate": 2.6000000000000002e-05, "logits/chosen": -2.7532851696014404, "logits/rejected": -2.751197099685669, "logps/chosen": -28.381704330444336, "logps/rejected": -28.021570205688477, "loss": 0.6313, "rewards/accuracies": 0.6875, "rewards/chosen": -0.46909499168395996, "rewards/margins": 0.1372968554496765, "rewards/rejected": -0.6063918471336365, "step": 13 }, { "epoch": 0.026565464895635674, "grad_norm": 0.967846155166626, "learning_rate": 2.8000000000000003e-05, "logits/chosen": -2.8310461044311523, "logits/rejected": -2.8237879276275635, "logps/chosen": -28.878814697265625, "logps/rejected": -29.368640899658203, "loss": 0.6105, "rewards/accuracies": 0.75, "rewards/chosen": -0.5359129309654236, "rewards/margins": 0.18219587206840515, "rewards/rejected": -0.7181087732315063, "step": 14 }, { "epoch": 0.028462998102466792, "grad_norm": 1.1417112350463867, "learning_rate": 3e-05, "logits/chosen": -2.8312528133392334, "logits/rejected": -2.8290631771087646, "logps/chosen": -29.81403350830078, "logps/rejected": -28.913654327392578, "loss": 0.6425, "rewards/accuracies": 0.5, "rewards/chosen": -0.5813980102539062, "rewards/margins": 0.13141022622585297, "rewards/rejected": -0.7128081917762756, "step": 15 }, { "epoch": 0.030360531309297913, "grad_norm": 0.9886475801467896, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -2.854173183441162, "logits/rejected": -2.8493854999542236, "logps/chosen": -30.15297508239746, "logps/rejected": -29.798294067382812, "loss": 0.6355, "rewards/accuracies": 0.625, "rewards/chosen": -0.6225000619888306, "rewards/margins": 0.1385478377342224, "rewards/rejected": -0.7610478401184082, "step": 16 }, { "epoch": 0.03225806451612903, "grad_norm": 1.4905331134796143, "learning_rate": 3.4000000000000007e-05, "logits/chosen": -2.815732479095459, "logits/rejected": -2.822748899459839, "logps/chosen": -29.450511932373047, "logps/rejected": -30.160736083984375, "loss": 0.6313, "rewards/accuracies": 0.75, "rewards/chosen": -0.6014214754104614, "rewards/margins": 0.15089011192321777, "rewards/rejected": -0.7523115873336792, "step": 17 }, { "epoch": 0.03415559772296015, "grad_norm": 0.9573298692703247, "learning_rate": 3.6e-05, "logits/chosen": -2.7691009044647217, "logits/rejected": -2.768683671951294, "logps/chosen": -30.41909408569336, "logps/rejected": -28.978151321411133, "loss": 0.6377, "rewards/accuracies": 0.6875, "rewards/chosen": -0.60785311460495, "rewards/margins": 0.1253933608531952, "rewards/rejected": -0.7332464456558228, "step": 18 }, { "epoch": 0.036053130929791274, "grad_norm": 1.1032794713974, "learning_rate": 3.8e-05, "logits/chosen": -2.8177359104156494, "logits/rejected": -2.8107476234436035, "logps/chosen": -30.09964942932129, "logps/rejected": -30.08537483215332, "loss": 0.6323, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6421741843223572, "rewards/margins": 0.18355508148670197, "rewards/rejected": -0.825729250907898, "step": 19 }, { "epoch": 0.03795066413662239, "grad_norm": 1.030942678451538, "learning_rate": 4e-05, "logits/chosen": -2.7741472721099854, "logits/rejected": -2.7686080932617188, "logps/chosen": -30.057147979736328, "logps/rejected": -30.112163543701172, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -0.6412388682365417, "rewards/margins": 0.10512620210647583, "rewards/rejected": -0.7463650703430176, "step": 20 }, { "epoch": 0.03984819734345351, "grad_norm": 0.9527221918106079, "learning_rate": 4.2e-05, "logits/chosen": -2.818779706954956, "logits/rejected": -2.81341290473938, "logps/chosen": -29.735050201416016, "logps/rejected": -29.48119354248047, "loss": 0.536, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3829268217086792, "rewards/margins": 0.3811430335044861, "rewards/rejected": -0.7640698552131653, "step": 21 }, { "epoch": 0.04174573055028463, "grad_norm": 1.2015403509140015, "learning_rate": 4.4000000000000006e-05, "logits/chosen": -2.8072240352630615, "logits/rejected": -2.8029563426971436, "logps/chosen": -30.29505157470703, "logps/rejected": -30.543851852416992, "loss": 0.5779, "rewards/accuracies": 0.875, "rewards/chosen": -0.6176290512084961, "rewards/margins": 0.26741302013397217, "rewards/rejected": -0.8850420117378235, "step": 22 }, { "epoch": 0.04364326375711575, "grad_norm": 1.0167680978775024, "learning_rate": 4.600000000000001e-05, "logits/chosen": -2.8216638565063477, "logits/rejected": -2.818462610244751, "logps/chosen": -30.67039680480957, "logps/rejected": -32.72490692138672, "loss": 0.5636, "rewards/accuracies": 0.75, "rewards/chosen": -0.6111881136894226, "rewards/margins": 0.31430453062057495, "rewards/rejected": -0.9254926443099976, "step": 23 }, { "epoch": 0.04554079696394687, "grad_norm": 1.308831810951233, "learning_rate": 4.8e-05, "logits/chosen": -2.8071296215057373, "logits/rejected": -2.8020644187927246, "logps/chosen": -30.84124755859375, "logps/rejected": -32.474220275878906, "loss": 0.4833, "rewards/accuracies": 1.0, "rewards/chosen": -0.6006485223770142, "rewards/margins": 0.5012238025665283, "rewards/rejected": -1.101872444152832, "step": 24 }, { "epoch": 0.04743833017077799, "grad_norm": 1.2391417026519775, "learning_rate": 5e-05, "logits/chosen": -2.839644193649292, "logits/rejected": -2.8386754989624023, "logps/chosen": -32.8646125793457, "logps/rejected": -33.53518295288086, "loss": 0.5978, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9321977496147156, "rewards/margins": 0.24001502990722656, "rewards/rejected": -1.172212839126587, "step": 25 }, { "epoch": 0.04933586337760911, "grad_norm": 1.4166021347045898, "learning_rate": 5.2000000000000004e-05, "logits/chosen": -2.822469472885132, "logits/rejected": -2.824627637863159, "logps/chosen": -33.248661041259766, "logps/rejected": -33.05607223510742, "loss": 0.6222, "rewards/accuracies": 0.625, "rewards/chosen": -0.9429985284805298, "rewards/margins": 0.20479901134967804, "rewards/rejected": -1.1477975845336914, "step": 26 }, { "epoch": 0.051233396584440226, "grad_norm": 1.3769515752792358, "learning_rate": 5.4000000000000005e-05, "logits/chosen": -2.8131866455078125, "logits/rejected": -2.8165550231933594, "logps/chosen": -32.11775588989258, "logps/rejected": -33.78160858154297, "loss": 0.558, "rewards/accuracies": 0.75, "rewards/chosen": -0.7932265996932983, "rewards/margins": 0.3268255591392517, "rewards/rejected": -1.1200520992279053, "step": 27 }, { "epoch": 0.05313092979127135, "grad_norm": 1.8020470142364502, "learning_rate": 5.6000000000000006e-05, "logits/chosen": -2.75449800491333, "logits/rejected": -2.7445366382598877, "logps/chosen": -29.581748962402344, "logps/rejected": -31.868961334228516, "loss": 0.5396, "rewards/accuracies": 0.875, "rewards/chosen": -0.48946696519851685, "rewards/margins": 0.41189253330230713, "rewards/rejected": -0.901359498500824, "step": 28 }, { "epoch": 0.05502846299810247, "grad_norm": 1.4790736436843872, "learning_rate": 5.8e-05, "logits/chosen": -2.7621536254882812, "logits/rejected": -2.763129472732544, "logps/chosen": -26.641231536865234, "logps/rejected": -28.51852035522461, "loss": 0.5958, "rewards/accuracies": 0.8125, "rewards/chosen": -0.22555744647979736, "rewards/margins": 0.36228662729263306, "rewards/rejected": -0.5878440737724304, "step": 29 }, { "epoch": 0.056925996204933584, "grad_norm": 2.4083566665649414, "learning_rate": 6e-05, "logits/chosen": -2.7771716117858887, "logits/rejected": -2.779322862625122, "logps/chosen": -25.557411193847656, "logps/rejected": -28.930173873901367, "loss": 0.4966, "rewards/accuracies": 0.875, "rewards/chosen": -0.2201511561870575, "rewards/margins": 0.5415716767311096, "rewards/rejected": -0.7617228627204895, "step": 30 }, { "epoch": 0.058823529411764705, "grad_norm": 3.190519094467163, "learning_rate": 6.2e-05, "logits/chosen": -2.7948427200317383, "logits/rejected": -2.7934060096740723, "logps/chosen": -25.57403564453125, "logps/rejected": -27.072856903076172, "loss": 0.548, "rewards/accuracies": 0.75, "rewards/chosen": -0.15162353217601776, "rewards/margins": 0.38971802592277527, "rewards/rejected": -0.5413415431976318, "step": 31 }, { "epoch": 0.06072106261859583, "grad_norm": 2.303135633468628, "learning_rate": 6.400000000000001e-05, "logits/chosen": -2.7421650886535645, "logits/rejected": -2.746206521987915, "logps/chosen": -27.796472549438477, "logps/rejected": -33.54944610595703, "loss": 0.4797, "rewards/accuracies": 0.8125, "rewards/chosen": -0.43570366501808167, "rewards/margins": 0.6830397844314575, "rewards/rejected": -1.1187434196472168, "step": 32 }, { "epoch": 0.06261859582542695, "grad_norm": 2.5751965045928955, "learning_rate": 6.6e-05, "logits/chosen": -2.8050615787506104, "logits/rejected": -2.8058907985687256, "logps/chosen": -31.59589195251465, "logps/rejected": -37.699378967285156, "loss": 0.4424, "rewards/accuracies": 0.75, "rewards/chosen": -0.7283644676208496, "rewards/margins": 0.8916000127792358, "rewards/rejected": -1.619964599609375, "step": 33 }, { "epoch": 0.06451612903225806, "grad_norm": 2.540719985961914, "learning_rate": 6.800000000000001e-05, "logits/chosen": -2.7393884658813477, "logits/rejected": -2.745088815689087, "logps/chosen": -33.36492156982422, "logps/rejected": -41.315711975097656, "loss": 0.3926, "rewards/accuracies": 0.875, "rewards/chosen": -0.9094129204750061, "rewards/margins": 0.948678731918335, "rewards/rejected": -1.8580915927886963, "step": 34 }, { "epoch": 0.06641366223908918, "grad_norm": 4.434343338012695, "learning_rate": 7e-05, "logits/chosen": -2.7703030109405518, "logits/rejected": -2.7763912677764893, "logps/chosen": -38.98546600341797, "logps/rejected": -45.82567596435547, "loss": 0.5193, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5312814712524414, "rewards/margins": 0.8732771277427673, "rewards/rejected": -2.4045586585998535, "step": 35 }, { "epoch": 0.0683111954459203, "grad_norm": 4.4201507568359375, "learning_rate": 7.2e-05, "logits/chosen": -2.7684011459350586, "logits/rejected": -2.7685604095458984, "logps/chosen": -40.817020416259766, "logps/rejected": -46.39413833618164, "loss": 0.5283, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7174456119537354, "rewards/margins": 0.7200771570205688, "rewards/rejected": -2.4375228881835938, "step": 36 }, { "epoch": 0.07020872865275142, "grad_norm": 3.611697196960449, "learning_rate": 7.4e-05, "logits/chosen": -2.773036241531372, "logits/rejected": -2.7685177326202393, "logps/chosen": -42.66209411621094, "logps/rejected": -47.02984619140625, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -1.8478937149047852, "rewards/margins": 0.6286407709121704, "rewards/rejected": -2.476534605026245, "step": 37 }, { "epoch": 0.07210626185958255, "grad_norm": 5.300126075744629, "learning_rate": 7.6e-05, "logits/chosen": -2.8151535987854004, "logits/rejected": -2.8189120292663574, "logps/chosen": -44.82855224609375, "logps/rejected": -48.87239074707031, "loss": 0.4847, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0770606994628906, "rewards/margins": 0.6569112539291382, "rewards/rejected": -2.7339720726013184, "step": 38 }, { "epoch": 0.07400379506641366, "grad_norm": 9.036521911621094, "learning_rate": 7.800000000000001e-05, "logits/chosen": -2.735901117324829, "logits/rejected": -2.746289014816284, "logps/chosen": -36.09475326538086, "logps/rejected": -48.74174499511719, "loss": 0.3724, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2715508937835693, "rewards/margins": 1.4835450649261475, "rewards/rejected": -2.755095958709717, "step": 39 }, { "epoch": 0.07590132827324478, "grad_norm": 7.483906269073486, "learning_rate": 8e-05, "logits/chosen": -2.762651205062866, "logits/rejected": -2.760044813156128, "logps/chosen": -36.31230926513672, "logps/rejected": -39.352806091308594, "loss": 0.6028, "rewards/accuracies": 0.625, "rewards/chosen": -1.1989113092422485, "rewards/margins": 0.44402891397476196, "rewards/rejected": -1.6429402828216553, "step": 40 }, { "epoch": 0.0777988614800759, "grad_norm": 3.6882236003875732, "learning_rate": 8.2e-05, "logits/chosen": -2.7478814125061035, "logits/rejected": -2.7577743530273438, "logps/chosen": -36.97419738769531, "logps/rejected": -41.79148864746094, "loss": 0.5367, "rewards/accuracies": 0.6875, "rewards/chosen": -1.360243320465088, "rewards/margins": 0.6748560667037964, "rewards/rejected": -2.0350992679595947, "step": 41 }, { "epoch": 0.07969639468690702, "grad_norm": 5.39766263961792, "learning_rate": 8.4e-05, "logits/chosen": -2.777827262878418, "logits/rejected": -2.779998302459717, "logps/chosen": -34.955528259277344, "logps/rejected": -40.095359802246094, "loss": 0.4864, "rewards/accuracies": 0.75, "rewards/chosen": -1.0647568702697754, "rewards/margins": 0.7299851179122925, "rewards/rejected": -1.7947419881820679, "step": 42 }, { "epoch": 0.08159392789373814, "grad_norm": 3.68463397026062, "learning_rate": 8.6e-05, "logits/chosen": -2.7425332069396973, "logits/rejected": -2.749401807785034, "logps/chosen": -33.41716766357422, "logps/rejected": -44.460105895996094, "loss": 0.3641, "rewards/accuracies": 0.875, "rewards/chosen": -1.0076907873153687, "rewards/margins": 1.2475147247314453, "rewards/rejected": -2.2552056312561035, "step": 43 }, { "epoch": 0.08349146110056926, "grad_norm": 4.242417335510254, "learning_rate": 8.800000000000001e-05, "logits/chosen": -2.7301769256591797, "logits/rejected": -2.739335298538208, "logps/chosen": -40.64069747924805, "logps/rejected": -53.96464538574219, "loss": 0.3206, "rewards/accuracies": 0.875, "rewards/chosen": -1.6848864555358887, "rewards/margins": 1.3943724632263184, "rewards/rejected": -3.079258918762207, "step": 44 }, { "epoch": 0.08538899430740038, "grad_norm": 7.282301902770996, "learning_rate": 9e-05, "logits/chosen": -2.766569137573242, "logits/rejected": -2.7720367908477783, "logps/chosen": -43.393699645996094, "logps/rejected": -67.36265563964844, "loss": 0.3599, "rewards/accuracies": 0.875, "rewards/chosen": -2.043391466140747, "rewards/margins": 2.490874767303467, "rewards/rejected": -4.534266471862793, "step": 45 }, { "epoch": 0.0872865275142315, "grad_norm": 6.472597122192383, "learning_rate": 9.200000000000001e-05, "logits/chosen": -2.8010120391845703, "logits/rejected": -2.7943105697631836, "logps/chosen": -54.13050079345703, "logps/rejected": -66.39878845214844, "loss": 0.692, "rewards/accuracies": 0.6875, "rewards/chosen": -2.856748342514038, "rewards/margins": 1.4960048198699951, "rewards/rejected": -4.352753162384033, "step": 46 }, { "epoch": 0.08918406072106262, "grad_norm": 6.507133483886719, "learning_rate": 9.4e-05, "logits/chosen": -2.702542543411255, "logits/rejected": -2.7060205936431885, "logps/chosen": -58.316917419433594, "logps/rejected": -62.27170944213867, "loss": 0.7671, "rewards/accuracies": 0.6875, "rewards/chosen": -3.5426511764526367, "rewards/margins": 0.42596814036369324, "rewards/rejected": -3.9686193466186523, "step": 47 }, { "epoch": 0.09108159392789374, "grad_norm": 1.7756693363189697, "learning_rate": 9.6e-05, "logits/chosen": -2.7501490116119385, "logits/rejected": -2.746398687362671, "logps/chosen": -44.69342803955078, "logps/rejected": -59.879600524902344, "loss": 0.2497, "rewards/accuracies": 0.875, "rewards/chosen": -1.9775663614273071, "rewards/margins": 1.8432695865631104, "rewards/rejected": -3.820835590362549, "step": 48 }, { "epoch": 0.09297912713472485, "grad_norm": 2.5171730518341064, "learning_rate": 9.8e-05, "logits/chosen": -2.7280032634735107, "logits/rejected": -2.728956699371338, "logps/chosen": -49.98912048339844, "logps/rejected": -59.551475524902344, "loss": 0.549, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6230831146240234, "rewards/margins": 1.1891345977783203, "rewards/rejected": -3.8122177124023438, "step": 49 }, { "epoch": 0.09487666034155598, "grad_norm": 2.106658935546875, "learning_rate": 0.0001, "logits/chosen": -2.7503480911254883, "logits/rejected": -2.7512266635894775, "logps/chosen": -42.37887954711914, "logps/rejected": -49.372154235839844, "loss": 0.4477, "rewards/accuracies": 0.875, "rewards/chosen": -1.683307409286499, "rewards/margins": 1.000291347503662, "rewards/rejected": -2.683598518371582, "step": 50 }, { "epoch": 0.0967741935483871, "grad_norm": 7.144794940948486, "learning_rate": 0.00010200000000000001, "logits/chosen": -2.7536375522613525, "logits/rejected": -2.75874662399292, "logps/chosen": -40.97515106201172, "logps/rejected": -44.2424201965332, "loss": 0.7101, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8148384094238281, "rewards/margins": 0.42236870527267456, "rewards/rejected": -2.2372071743011475, "step": 51 }, { "epoch": 0.09867172675521822, "grad_norm": 2.209843158721924, "learning_rate": 0.00010400000000000001, "logits/chosen": -2.6992690563201904, "logits/rejected": -2.6988301277160645, "logps/chosen": -37.43678283691406, "logps/rejected": -45.80652618408203, "loss": 0.3782, "rewards/accuracies": 0.875, "rewards/chosen": -1.3452003002166748, "rewards/margins": 1.0832030773162842, "rewards/rejected": -2.428403615951538, "step": 52 }, { "epoch": 0.10056925996204934, "grad_norm": 1.8978558778762817, "learning_rate": 0.00010600000000000002, "logits/chosen": -2.763897657394409, "logits/rejected": -2.768207311630249, "logps/chosen": -41.63645553588867, "logps/rejected": -52.406524658203125, "loss": 0.3655, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7027217149734497, "rewards/margins": 1.1549346446990967, "rewards/rejected": -2.857656240463257, "step": 53 }, { "epoch": 0.10246679316888045, "grad_norm": 2.3911263942718506, "learning_rate": 0.00010800000000000001, "logits/chosen": -2.7382662296295166, "logits/rejected": -2.7411863803863525, "logps/chosen": -45.131248474121094, "logps/rejected": -59.659149169921875, "loss": 0.3506, "rewards/accuracies": 0.8125, "rewards/chosen": -2.070969343185425, "rewards/margins": 1.574415683746338, "rewards/rejected": -3.6453850269317627, "step": 54 }, { "epoch": 0.10436432637571158, "grad_norm": 3.873135566711426, "learning_rate": 0.00011000000000000002, "logits/chosen": -2.7719104290008545, "logits/rejected": -2.7709810733795166, "logps/chosen": -48.20206069946289, "logps/rejected": -58.50641632080078, "loss": 0.4576, "rewards/accuracies": 0.8125, "rewards/chosen": -2.500065803527832, "rewards/margins": 1.092839002609253, "rewards/rejected": -3.592904806137085, "step": 55 }, { "epoch": 0.1062618595825427, "grad_norm": 4.552927017211914, "learning_rate": 0.00011200000000000001, "logits/chosen": -2.738482713699341, "logits/rejected": -2.7385125160217285, "logps/chosen": -38.80826950073242, "logps/rejected": -52.960838317871094, "loss": 0.4095, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4881004095077515, "rewards/margins": 1.5598132610321045, "rewards/rejected": -3.0479137897491455, "step": 56 }, { "epoch": 0.10815939278937381, "grad_norm": 2.9376704692840576, "learning_rate": 0.00011399999999999999, "logits/chosen": -2.767287254333496, "logits/rejected": -2.7738823890686035, "logps/chosen": -37.63935470581055, "logps/rejected": -51.30040740966797, "loss": 0.3822, "rewards/accuracies": 0.75, "rewards/chosen": -1.387617826461792, "rewards/margins": 1.600461483001709, "rewards/rejected": -2.988079309463501, "step": 57 }, { "epoch": 0.11005692599620494, "grad_norm": 4.015516757965088, "learning_rate": 0.000116, "logits/chosen": -2.772315502166748, "logits/rejected": -2.7690601348876953, "logps/chosen": -47.2697868347168, "logps/rejected": -54.677337646484375, "loss": 0.5154, "rewards/accuracies": 0.75, "rewards/chosen": -2.417880058288574, "rewards/margins": 0.9048663377761841, "rewards/rejected": -3.322746515274048, "step": 58 }, { "epoch": 0.11195445920303605, "grad_norm": 4.015993118286133, "learning_rate": 0.000118, "logits/chosen": -2.7056713104248047, "logits/rejected": -2.7002859115600586, "logps/chosen": -51.202369689941406, "logps/rejected": -59.07714080810547, "loss": 0.5931, "rewards/accuracies": 0.5625, "rewards/chosen": -2.7120983600616455, "rewards/margins": 0.9703352451324463, "rewards/rejected": -3.682433605194092, "step": 59 }, { "epoch": 0.11385199240986717, "grad_norm": 2.3085007667541504, "learning_rate": 0.00012, "logits/chosen": -2.7429823875427246, "logits/rejected": -2.7343454360961914, "logps/chosen": -55.68383026123047, "logps/rejected": -64.50788879394531, "loss": 0.4269, "rewards/accuracies": 0.875, "rewards/chosen": -3.2317302227020264, "rewards/margins": 0.981264054775238, "rewards/rejected": -4.21299409866333, "step": 60 }, { "epoch": 0.1157495256166983, "grad_norm": 2.4124083518981934, "learning_rate": 0.000122, "logits/chosen": -2.7513816356658936, "logits/rejected": -2.7479352951049805, "logps/chosen": -54.174095153808594, "logps/rejected": -65.53788757324219, "loss": 0.3048, "rewards/accuracies": 1.0, "rewards/chosen": -3.013425588607788, "rewards/margins": 1.3886945247650146, "rewards/rejected": -4.4021196365356445, "step": 61 }, { "epoch": 0.11764705882352941, "grad_norm": 2.433696746826172, "learning_rate": 0.000124, "logits/chosen": -2.783195734024048, "logits/rejected": -2.778404474258423, "logps/chosen": -51.3481559753418, "logps/rejected": -70.99629211425781, "loss": 0.319, "rewards/accuracies": 0.875, "rewards/chosen": -2.688727378845215, "rewards/margins": 2.164720058441162, "rewards/rejected": -4.853447914123535, "step": 62 }, { "epoch": 0.11954459203036052, "grad_norm": 2.1883819103240967, "learning_rate": 0.000126, "logits/chosen": -2.746483564376831, "logits/rejected": -2.736555337905884, "logps/chosen": -48.178592681884766, "logps/rejected": -70.20449829101562, "loss": 0.1814, "rewards/accuracies": 0.9375, "rewards/chosen": -2.385026693344116, "rewards/margins": 2.3813328742980957, "rewards/rejected": -4.766359329223633, "step": 63 }, { "epoch": 0.12144212523719165, "grad_norm": 2.765345811843872, "learning_rate": 0.00012800000000000002, "logits/chosen": -2.810784339904785, "logits/rejected": -2.810107946395874, "logps/chosen": -55.56414794921875, "logps/rejected": -79.71456909179688, "loss": 0.2055, "rewards/accuracies": 0.9375, "rewards/chosen": -3.27866530418396, "rewards/margins": 2.5508813858032227, "rewards/rejected": -5.829546928405762, "step": 64 }, { "epoch": 0.12333965844402277, "grad_norm": 6.344622611999512, "learning_rate": 0.00013000000000000002, "logits/chosen": -2.814685583114624, "logits/rejected": -2.805901527404785, "logps/chosen": -44.5737419128418, "logps/rejected": -73.86512756347656, "loss": 0.1899, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0165200233459473, "rewards/margins": 3.251584053039551, "rewards/rejected": -5.268104553222656, "step": 65 }, { "epoch": 0.1252371916508539, "grad_norm": 3.388427972793579, "learning_rate": 0.000132, "logits/chosen": -2.798591136932373, "logits/rejected": -2.8005106449127197, "logps/chosen": -36.04314422607422, "logps/rejected": -50.72333526611328, "loss": 0.3848, "rewards/accuracies": 0.875, "rewards/chosen": -1.1693958044052124, "rewards/margins": 1.6317634582519531, "rewards/rejected": -2.801159381866455, "step": 66 }, { "epoch": 0.127134724857685, "grad_norm": 2.023927688598633, "learning_rate": 0.000134, "logits/chosen": -2.81923508644104, "logits/rejected": -2.815878391265869, "logps/chosen": -28.859542846679688, "logps/rejected": -41.972694396972656, "loss": 0.2323, "rewards/accuracies": 1.0, "rewards/chosen": -0.40116792917251587, "rewards/margins": 1.6016311645507812, "rewards/rejected": -2.0027990341186523, "step": 67 }, { "epoch": 0.12903225806451613, "grad_norm": 3.041720151901245, "learning_rate": 0.00013600000000000003, "logits/chosen": -2.782683849334717, "logits/rejected": -2.7856640815734863, "logps/chosen": -32.9669189453125, "logps/rejected": -49.34394836425781, "loss": 0.3746, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0322561264038086, "rewards/margins": 1.6814837455749512, "rewards/rejected": -2.7137398719787598, "step": 68 }, { "epoch": 0.13092979127134724, "grad_norm": 7.477070331573486, "learning_rate": 0.000138, "logits/chosen": -2.747135639190674, "logits/rejected": -2.747574806213379, "logps/chosen": -36.88587188720703, "logps/rejected": -66.01129150390625, "loss": 0.2206, "rewards/accuracies": 0.9375, "rewards/chosen": -1.325242519378662, "rewards/margins": 2.977128744125366, "rewards/rejected": -4.302371025085449, "step": 69 }, { "epoch": 0.13282732447817835, "grad_norm": 6.996569633483887, "learning_rate": 0.00014, "logits/chosen": -2.7544913291931152, "logits/rejected": -2.7490170001983643, "logps/chosen": -38.83042907714844, "logps/rejected": -65.96528625488281, "loss": 0.3555, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2973248958587646, "rewards/margins": 3.1091742515563965, "rewards/rejected": -4.406498908996582, "step": 70 }, { "epoch": 0.1347248576850095, "grad_norm": 6.556032180786133, "learning_rate": 0.000142, "logits/chosen": -2.7899088859558105, "logits/rejected": -2.7880430221557617, "logps/chosen": -43.366214752197266, "logps/rejected": -61.33940124511719, "loss": 0.2978, "rewards/accuracies": 0.875, "rewards/chosen": -1.9949687719345093, "rewards/margins": 1.9916281700134277, "rewards/rejected": -3.9865970611572266, "step": 71 }, { "epoch": 0.1366223908918406, "grad_norm": 2.458948850631714, "learning_rate": 0.000144, "logits/chosen": -2.792599678039551, "logits/rejected": -2.7890465259552, "logps/chosen": -43.916316986083984, "logps/rejected": -55.91321563720703, "loss": 0.341, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9487779140472412, "rewards/margins": 1.4445126056671143, "rewards/rejected": -3.3932905197143555, "step": 72 }, { "epoch": 0.13851992409867173, "grad_norm": 4.216429710388184, "learning_rate": 0.000146, "logits/chosen": -2.8028111457824707, "logits/rejected": -2.805846929550171, "logps/chosen": -45.717369079589844, "logps/rejected": -54.00206756591797, "loss": 0.5921, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3208117485046387, "rewards/margins": 0.9625445008277893, "rewards/rejected": -3.283356189727783, "step": 73 }, { "epoch": 0.14041745730550284, "grad_norm": 1.7848654985427856, "learning_rate": 0.000148, "logits/chosen": -2.7844631671905518, "logits/rejected": -2.7775328159332275, "logps/chosen": -45.630828857421875, "logps/rejected": -57.7205696105957, "loss": 0.3173, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1562933921813965, "rewards/margins": 1.4049997329711914, "rewards/rejected": -3.561293125152588, "step": 74 }, { "epoch": 0.14231499051233396, "grad_norm": 4.6799750328063965, "learning_rate": 0.00015000000000000001, "logits/chosen": -2.7869224548339844, "logits/rejected": -2.7798612117767334, "logps/chosen": -57.45626449584961, "logps/rejected": -61.46935272216797, "loss": 0.6529, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3889098167419434, "rewards/margins": 0.6126527190208435, "rewards/rejected": -4.001562118530273, "step": 75 }, { "epoch": 0.1442125237191651, "grad_norm": 2.800229787826538, "learning_rate": 0.000152, "logits/chosen": -2.832784652709961, "logits/rejected": -2.828519582748413, "logps/chosen": -52.276004791259766, "logps/rejected": -63.41943359375, "loss": 0.4297, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8967790603637695, "rewards/margins": 1.1908848285675049, "rewards/rejected": -4.087663650512695, "step": 76 }, { "epoch": 0.1461100569259962, "grad_norm": 3.5293054580688477, "learning_rate": 0.000154, "logits/chosen": -2.791073799133301, "logits/rejected": -2.789182662963867, "logps/chosen": -52.498924255371094, "logps/rejected": -68.02275848388672, "loss": 0.3526, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7176871299743652, "rewards/margins": 1.8619203567504883, "rewards/rejected": -4.579607009887695, "step": 77 }, { "epoch": 0.14800759013282733, "grad_norm": 5.139841556549072, "learning_rate": 0.00015600000000000002, "logits/chosen": -2.801218271255493, "logits/rejected": -2.798741102218628, "logps/chosen": -39.000831604003906, "logps/rejected": -56.16624450683594, "loss": 0.5178, "rewards/accuracies": 0.8125, "rewards/chosen": -1.4603410959243774, "rewards/margins": 1.9403096437454224, "rewards/rejected": -3.400650978088379, "step": 78 }, { "epoch": 0.14990512333965844, "grad_norm": 3.5403361320495605, "learning_rate": 0.00015800000000000002, "logits/chosen": -2.8410701751708984, "logits/rejected": -2.8384857177734375, "logps/chosen": -36.62928009033203, "logps/rejected": -45.52888488769531, "loss": 0.5752, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2457019090652466, "rewards/margins": 1.257145643234253, "rewards/rejected": -2.502847671508789, "step": 79 }, { "epoch": 0.15180265654648956, "grad_norm": 2.7179343700408936, "learning_rate": 0.00016, "logits/chosen": -2.806529998779297, "logits/rejected": -2.806776762008667, "logps/chosen": -36.37416076660156, "logps/rejected": -49.32950973510742, "loss": 0.3537, "rewards/accuracies": 0.875, "rewards/chosen": -1.3248584270477295, "rewards/margins": 1.3763021230697632, "rewards/rejected": -2.701160430908203, "step": 80 }, { "epoch": 0.15370018975332067, "grad_norm": 3.5153768062591553, "learning_rate": 0.000162, "logits/chosen": -2.786527633666992, "logits/rejected": -2.793896198272705, "logps/chosen": -31.23716926574707, "logps/rejected": -42.66054153442383, "loss": 0.3014, "rewards/accuracies": 0.9375, "rewards/chosen": -0.7100868821144104, "rewards/margins": 1.4225895404815674, "rewards/rejected": -2.132676601409912, "step": 81 }, { "epoch": 0.1555977229601518, "grad_norm": 4.196669101715088, "learning_rate": 0.000164, "logits/chosen": -2.8289337158203125, "logits/rejected": -2.824960470199585, "logps/chosen": -30.018644332885742, "logps/rejected": -35.070648193359375, "loss": 0.6769, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7271355986595154, "rewards/margins": 0.5995543599128723, "rewards/rejected": -1.3266899585723877, "step": 82 }, { "epoch": 0.15749525616698293, "grad_norm": 4.570925235748291, "learning_rate": 0.000166, "logits/chosen": -2.8203651905059814, "logits/rejected": -2.8185455799102783, "logps/chosen": -44.2990608215332, "logps/rejected": -52.1357536315918, "loss": 0.6825, "rewards/accuracies": 0.75, "rewards/chosen": -2.085677146911621, "rewards/margins": 0.8163881897926331, "rewards/rejected": -2.9020657539367676, "step": 83 }, { "epoch": 0.15939278937381404, "grad_norm": 1.6578748226165771, "learning_rate": 0.000168, "logits/chosen": -2.781203031539917, "logits/rejected": -2.7794113159179688, "logps/chosen": -45.45800018310547, "logps/rejected": -62.20787811279297, "loss": 0.2852, "rewards/accuracies": 0.9375, "rewards/chosen": -2.2425527572631836, "rewards/margins": 1.843325138092041, "rewards/rejected": -4.085877895355225, "step": 84 }, { "epoch": 0.16129032258064516, "grad_norm": 4.050334453582764, "learning_rate": 0.00017, "logits/chosen": -2.8100481033325195, "logits/rejected": -2.8053321838378906, "logps/chosen": -47.41972351074219, "logps/rejected": -58.743255615234375, "loss": 0.5081, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2062716484069824, "rewards/margins": 1.4793806076049805, "rewards/rejected": -3.685652256011963, "step": 85 }, { "epoch": 0.16318785578747627, "grad_norm": 3.0159974098205566, "learning_rate": 0.000172, "logits/chosen": -2.758812427520752, "logits/rejected": -2.7559688091278076, "logps/chosen": -55.07322311401367, "logps/rejected": -65.33331298828125, "loss": 0.3226, "rewards/accuracies": 1.0, "rewards/chosen": -3.1052637100219727, "rewards/margins": 1.3766382932662964, "rewards/rejected": -4.481902122497559, "step": 86 }, { "epoch": 0.1650853889943074, "grad_norm": 4.908008098602295, "learning_rate": 0.000174, "logits/chosen": -2.80372953414917, "logits/rejected": -2.793184280395508, "logps/chosen": -59.20977020263672, "logps/rejected": -65.16618347167969, "loss": 0.6049, "rewards/accuracies": 0.625, "rewards/chosen": -3.495293140411377, "rewards/margins": 0.8276402950286865, "rewards/rejected": -4.322933197021484, "step": 87 }, { "epoch": 0.16698292220113853, "grad_norm": 2.5410759449005127, "learning_rate": 0.00017600000000000002, "logits/chosen": -2.777139186859131, "logits/rejected": -2.7644989490509033, "logps/chosen": -56.76138687133789, "logps/rejected": -73.66291809082031, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -3.2906441688537598, "rewards/margins": 1.8566827774047852, "rewards/rejected": -5.147326946258545, "step": 88 }, { "epoch": 0.16888045540796964, "grad_norm": 3.6899871826171875, "learning_rate": 0.00017800000000000002, "logits/chosen": -2.778470277786255, "logits/rejected": -2.774235725402832, "logps/chosen": -54.31599426269531, "logps/rejected": -70.16642761230469, "loss": 0.3418, "rewards/accuracies": 0.8125, "rewards/chosen": -2.986976385116577, "rewards/margins": 1.7881888151168823, "rewards/rejected": -4.775165557861328, "step": 89 }, { "epoch": 0.17077798861480076, "grad_norm": 4.643296241760254, "learning_rate": 0.00018, "logits/chosen": -2.7810251712799072, "logits/rejected": -2.7724738121032715, "logps/chosen": -60.76677322387695, "logps/rejected": -76.55674743652344, "loss": 0.6284, "rewards/accuracies": 0.75, "rewards/chosen": -3.654345989227295, "rewards/margins": 1.6196839809417725, "rewards/rejected": -5.274029731750488, "step": 90 }, { "epoch": 0.17267552182163187, "grad_norm": 8.180418968200684, "learning_rate": 0.000182, "logits/chosen": -2.819366931915283, "logits/rejected": -2.807485342025757, "logps/chosen": -51.31687927246094, "logps/rejected": -69.36310577392578, "loss": 0.4503, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7893166542053223, "rewards/margins": 2.0452799797058105, "rewards/rejected": -4.834596633911133, "step": 91 }, { "epoch": 0.174573055028463, "grad_norm": 4.3448686599731445, "learning_rate": 0.00018400000000000003, "logits/chosen": -2.824305772781372, "logits/rejected": -2.8248698711395264, "logps/chosen": -45.58390808105469, "logps/rejected": -57.605201721191406, "loss": 0.5903, "rewards/accuracies": 0.625, "rewards/chosen": -2.0943570137023926, "rewards/margins": 1.3772242069244385, "rewards/rejected": -3.471581220626831, "step": 92 }, { "epoch": 0.17647058823529413, "grad_norm": 2.931424856185913, "learning_rate": 0.00018600000000000002, "logits/chosen": -2.8630051612854004, "logits/rejected": -2.8607943058013916, "logps/chosen": -41.100894927978516, "logps/rejected": -49.50748825073242, "loss": 0.463, "rewards/accuracies": 0.8125, "rewards/chosen": -1.815004825592041, "rewards/margins": 0.9866312146186829, "rewards/rejected": -2.801636219024658, "step": 93 }, { "epoch": 0.17836812144212524, "grad_norm": 2.702983856201172, "learning_rate": 0.000188, "logits/chosen": -2.8779056072235107, "logits/rejected": -2.8776400089263916, "logps/chosen": -42.185447692871094, "logps/rejected": -50.3919792175293, "loss": 0.4895, "rewards/accuracies": 0.75, "rewards/chosen": -1.823110818862915, "rewards/margins": 1.0870286226272583, "rewards/rejected": -2.910139560699463, "step": 94 }, { "epoch": 0.18026565464895636, "grad_norm": 4.391569137573242, "learning_rate": 0.00019, "logits/chosen": -2.777017116546631, "logits/rejected": -2.7851855754852295, "logps/chosen": -29.167160034179688, "logps/rejected": -43.34316635131836, "loss": 0.2765, "rewards/accuracies": 0.9375, "rewards/chosen": -0.45135945081710815, "rewards/margins": 1.7064999341964722, "rewards/rejected": -2.1578593254089355, "step": 95 }, { "epoch": 0.18216318785578747, "grad_norm": 4.305187702178955, "learning_rate": 0.000192, "logits/chosen": -2.8385465145111084, "logits/rejected": -2.83952260017395, "logps/chosen": -38.421730041503906, "logps/rejected": -50.840003967285156, "loss": 0.3135, "rewards/accuracies": 0.9375, "rewards/chosen": -1.460430383682251, "rewards/margins": 1.4080810546875, "rewards/rejected": -2.868511199951172, "step": 96 }, { "epoch": 0.1840607210626186, "grad_norm": 12.295331001281738, "learning_rate": 0.000194, "logits/chosen": -2.830328941345215, "logits/rejected": -2.8286261558532715, "logps/chosen": -36.80898666381836, "logps/rejected": -48.225921630859375, "loss": 0.6485, "rewards/accuracies": 0.75, "rewards/chosen": -1.2993323802947998, "rewards/margins": 1.328029751777649, "rewards/rejected": -2.627362012863159, "step": 97 }, { "epoch": 0.1859582542694497, "grad_norm": 6.877435207366943, "learning_rate": 0.000196, "logits/chosen": -2.8228061199188232, "logits/rejected": -2.826436996459961, "logps/chosen": -37.87078857421875, "logps/rejected": -51.07941436767578, "loss": 0.6151, "rewards/accuracies": 0.6875, "rewards/chosen": -1.410650372505188, "rewards/margins": 1.5915892124176025, "rewards/rejected": -3.00223970413208, "step": 98 }, { "epoch": 0.18785578747628084, "grad_norm": 4.5217509269714355, "learning_rate": 0.00019800000000000002, "logits/chosen": -2.8376476764678955, "logits/rejected": -2.836334466934204, "logps/chosen": -33.52427673339844, "logps/rejected": -45.65314483642578, "loss": 0.3943, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0066884756088257, "rewards/margins": 1.3794984817504883, "rewards/rejected": -2.3861870765686035, "step": 99 }, { "epoch": 0.18975332068311196, "grad_norm": 1.6153665781021118, "learning_rate": 0.0002, "logits/chosen": -2.809156656265259, "logits/rejected": -2.817115068435669, "logps/chosen": -30.36741065979004, "logps/rejected": -32.7757453918457, "loss": 0.5425, "rewards/accuracies": 0.8125, "rewards/chosen": -0.783231258392334, "rewards/margins": 0.42554494738578796, "rewards/rejected": -1.2087762355804443, "step": 100 }, { "epoch": 0.19165085388994307, "grad_norm": 1.3271561861038208, "learning_rate": 0.00019999729347501484, "logits/chosen": -2.8080809116363525, "logits/rejected": -2.8086583614349365, "logps/chosen": -28.36756706237793, "logps/rejected": -33.18540954589844, "loss": 0.4887, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4251585602760315, "rewards/margins": 0.5602906942367554, "rewards/rejected": -0.9854491949081421, "step": 101 }, { "epoch": 0.1935483870967742, "grad_norm": 2.151851177215576, "learning_rate": 0.00019998917404656487, "logits/chosen": -2.776926040649414, "logits/rejected": -2.7729344367980957, "logps/chosen": -35.40061569213867, "logps/rejected": -35.295692443847656, "loss": 0.5817, "rewards/accuracies": 0.625, "rewards/chosen": -0.9625377655029297, "rewards/margins": 0.3408290147781372, "rewards/rejected": -1.3033668994903564, "step": 102 }, { "epoch": 0.1954459203036053, "grad_norm": 1.497015357017517, "learning_rate": 0.00019997564215415884, "logits/chosen": -2.8229072093963623, "logits/rejected": -2.8220646381378174, "logps/chosen": -34.33465576171875, "logps/rejected": -38.946231842041016, "loss": 0.4599, "rewards/accuracies": 0.875, "rewards/chosen": -1.002916693687439, "rewards/margins": 0.6433842182159424, "rewards/rejected": -1.6463007926940918, "step": 103 }, { "epoch": 0.19734345351043645, "grad_norm": 2.398622989654541, "learning_rate": 0.00019995669853028485, "logits/chosen": -2.835087299346924, "logits/rejected": -2.8351354598999023, "logps/chosen": -38.583656311035156, "logps/rejected": -42.82282257080078, "loss": 0.4885, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4459500312805176, "rewards/margins": 0.5609026551246643, "rewards/rejected": -2.006852626800537, "step": 104 }, { "epoch": 0.19924098671726756, "grad_norm": 4.335348606109619, "learning_rate": 0.00019993234420037073, "logits/chosen": -2.898766279220581, "logits/rejected": -2.8957412242889404, "logps/chosen": -38.778602600097656, "logps/rejected": -40.92530822753906, "loss": 0.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5738805532455444, "rewards/margins": 0.4367479085922241, "rewards/rejected": -2.0106287002563477, "step": 105 }, { "epoch": 0.20113851992409867, "grad_norm": 4.584505081176758, "learning_rate": 0.0001999025804827285, "logits/chosen": -2.8751659393310547, "logits/rejected": -2.875046968460083, "logps/chosen": -39.396610260009766, "logps/rejected": -39.287452697753906, "loss": 0.7515, "rewards/accuracies": 0.5, "rewards/chosen": -1.5874710083007812, "rewards/margins": 0.11016285419464111, "rewards/rejected": -1.6976337432861328, "step": 106 }, { "epoch": 0.2030360531309298, "grad_norm": 3.5621519088745117, "learning_rate": 0.00019986740898848306, "logits/chosen": -2.8922340869903564, "logits/rejected": -2.8920042514801025, "logps/chosen": -43.401451110839844, "logps/rejected": -48.335975646972656, "loss": 0.6008, "rewards/accuracies": 0.75, "rewards/chosen": -1.9959304332733154, "rewards/margins": 0.6039863228797913, "rewards/rejected": -2.599916696548462, "step": 107 }, { "epoch": 0.2049335863377609, "grad_norm": 2.4933717250823975, "learning_rate": 0.000199826831621485, "logits/chosen": -2.91758394241333, "logits/rejected": -2.9207208156585693, "logps/chosen": -47.70870590209961, "logps/rejected": -52.232574462890625, "loss": 0.5479, "rewards/accuracies": 0.75, "rewards/chosen": -2.437047004699707, "rewards/margins": 0.527195155620575, "rewards/rejected": -2.9642422199249268, "step": 108 }, { "epoch": 0.20683111954459202, "grad_norm": 3.197727918624878, "learning_rate": 0.0001997808505782075, "logits/chosen": -2.9473459720611572, "logits/rejected": -2.949979305267334, "logps/chosen": -45.37776184082031, "logps/rejected": -47.56669616699219, "loss": 0.5926, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0948638916015625, "rewards/margins": 0.5222463607788086, "rewards/rejected": -2.617110013961792, "step": 109 }, { "epoch": 0.20872865275142316, "grad_norm": 2.204061985015869, "learning_rate": 0.0001997294683476273, "logits/chosen": -2.9493114948272705, "logits/rejected": -2.9442033767700195, "logps/chosen": -45.5977668762207, "logps/rejected": -52.712066650390625, "loss": 0.3879, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1548056602478027, "rewards/margins": 0.9166700839996338, "rewards/rejected": -3.0714759826660156, "step": 110 }, { "epoch": 0.21062618595825428, "grad_norm": 4.2725300788879395, "learning_rate": 0.00019967268771109035, "logits/chosen": -2.9485433101654053, "logits/rejected": -2.9486706256866455, "logps/chosen": -41.39335250854492, "logps/rejected": -51.77354431152344, "loss": 0.3888, "rewards/accuracies": 0.875, "rewards/chosen": -1.7500886917114258, "rewards/margins": 1.1263270378112793, "rewards/rejected": -2.876415729522705, "step": 111 }, { "epoch": 0.2125237191650854, "grad_norm": 2.504917860031128, "learning_rate": 0.00019961051174216082, "logits/chosen": -2.9456284046173096, "logits/rejected": -2.949559211730957, "logps/chosen": -43.10253143310547, "logps/rejected": -58.495235443115234, "loss": 0.3659, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8653165102005005, "rewards/margins": 1.712073802947998, "rewards/rejected": -3.577390193939209, "step": 112 }, { "epoch": 0.2144212523719165, "grad_norm": 3.238914728164673, "learning_rate": 0.00019954294380645498, "logits/chosen": -2.925440549850464, "logits/rejected": -2.9304261207580566, "logps/chosen": -43.154396057128906, "logps/rejected": -60.708831787109375, "loss": 0.421, "rewards/accuracies": 0.875, "rewards/chosen": -1.8937389850616455, "rewards/margins": 1.8209748268127441, "rewards/rejected": -3.7147140502929688, "step": 113 }, { "epoch": 0.21631878557874762, "grad_norm": 6.110694408416748, "learning_rate": 0.0001994699875614589, "logits/chosen": -2.9304544925689697, "logits/rejected": -2.9413235187530518, "logps/chosen": -45.33203887939453, "logps/rejected": -58.13191223144531, "loss": 0.4857, "rewards/accuracies": 0.75, "rewards/chosen": -2.097524642944336, "rewards/margins": 1.4301856756210327, "rewards/rejected": -3.527710437774658, "step": 114 }, { "epoch": 0.21821631878557876, "grad_norm": 3.7045321464538574, "learning_rate": 0.00019939164695633067, "logits/chosen": -2.9360766410827637, "logits/rejected": -2.936542272567749, "logps/chosen": -51.745140075683594, "logps/rejected": -66.68146514892578, "loss": 0.5156, "rewards/accuracies": 0.8125, "rewards/chosen": -2.90883731842041, "rewards/margins": 1.5629081726074219, "rewards/rejected": -4.471745491027832, "step": 115 }, { "epoch": 0.22011385199240988, "grad_norm": 4.1607136726379395, "learning_rate": 0.00019930792623168637, "logits/chosen": -2.929075002670288, "logits/rejected": -2.9353854656219482, "logps/chosen": -59.28025436401367, "logps/rejected": -75.60289001464844, "loss": 0.2769, "rewards/accuracies": 0.9375, "rewards/chosen": -3.515871047973633, "rewards/margins": 1.8124377727508545, "rewards/rejected": -5.328309059143066, "step": 116 }, { "epoch": 0.222011385199241, "grad_norm": 3.711951494216919, "learning_rate": 0.0001992188299193706, "logits/chosen": -2.948878049850464, "logits/rejected": -2.9539246559143066, "logps/chosen": -52.6070442199707, "logps/rejected": -68.63351440429688, "loss": 0.3944, "rewards/accuracies": 0.875, "rewards/chosen": -2.7233872413635254, "rewards/margins": 1.939584493637085, "rewards/rejected": -4.662971496582031, "step": 117 }, { "epoch": 0.2239089184060721, "grad_norm": 4.125283718109131, "learning_rate": 0.00019912436284221134, "logits/chosen": -2.8983044624328613, "logits/rejected": -2.9071972370147705, "logps/chosen": -59.11683654785156, "logps/rejected": -71.92567443847656, "loss": 0.3953, "rewards/accuracies": 0.75, "rewards/chosen": -3.552089214324951, "rewards/margins": 1.6068460941314697, "rewards/rejected": -5.158935546875, "step": 118 }, { "epoch": 0.22580645161290322, "grad_norm": 4.579989433288574, "learning_rate": 0.00019902453011375865, "logits/chosen": -2.9441299438476562, "logits/rejected": -2.9400062561035156, "logps/chosen": -54.54322052001953, "logps/rejected": -62.37023162841797, "loss": 0.4215, "rewards/accuracies": 0.875, "rewards/chosen": -2.9585518836975098, "rewards/margins": 1.0481977462768555, "rewards/rejected": -4.006749629974365, "step": 119 }, { "epoch": 0.22770398481973433, "grad_norm": 2.091538667678833, "learning_rate": 0.00019891933713800798, "logits/chosen": -2.8959543704986572, "logits/rejected": -2.907708168029785, "logps/chosen": -52.33782958984375, "logps/rejected": -80.5691909790039, "loss": 0.1493, "rewards/accuracies": 1.0, "rewards/chosen": -2.778528928756714, "rewards/margins": 3.066986560821533, "rewards/rejected": -5.845515251159668, "step": 120 }, { "epoch": 0.22960151802656548, "grad_norm": 1.603954553604126, "learning_rate": 0.00019880878960910772, "logits/chosen": -2.887864828109741, "logits/rejected": -2.8939895629882812, "logps/chosen": -44.32196044921875, "logps/rejected": -83.90393829345703, "loss": 0.1169, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9681143760681152, "rewards/margins": 4.234763145446777, "rewards/rejected": -6.202877998352051, "step": 121 }, { "epoch": 0.2314990512333966, "grad_norm": 17.788175582885742, "learning_rate": 0.00019869289351105086, "logits/chosen": -2.88763427734375, "logits/rejected": -2.8800721168518066, "logps/chosen": -63.583621978759766, "logps/rejected": -76.61012268066406, "loss": 0.9297, "rewards/accuracies": 0.75, "rewards/chosen": -4.007019519805908, "rewards/margins": 1.262047290802002, "rewards/rejected": -5.26906681060791, "step": 122 }, { "epoch": 0.2333965844402277, "grad_norm": 4.482125282287598, "learning_rate": 0.00019857165511735103, "logits/chosen": -2.876539707183838, "logits/rejected": -2.8815243244171143, "logps/chosen": -70.19147491455078, "logps/rejected": -86.98196411132812, "loss": 0.6258, "rewards/accuracies": 0.8125, "rewards/chosen": -4.761369705200195, "rewards/margins": 1.8523237705230713, "rewards/rejected": -6.6136932373046875, "step": 123 }, { "epoch": 0.23529411764705882, "grad_norm": 4.31642484664917, "learning_rate": 0.0001984450809907031, "logits/chosen": -2.8928816318511963, "logits/rejected": -2.894294023513794, "logps/chosen": -52.248863220214844, "logps/rejected": -76.62648010253906, "loss": 0.1486, "rewards/accuracies": 1.0, "rewards/chosen": -2.7789430618286133, "rewards/margins": 2.767739772796631, "rewards/rejected": -5.546682834625244, "step": 124 }, { "epoch": 0.23719165085388993, "grad_norm": 3.312211275100708, "learning_rate": 0.00019831317798262786, "logits/chosen": -2.8706130981445312, "logits/rejected": -2.8669066429138184, "logps/chosen": -46.89989471435547, "logps/rejected": -65.01393127441406, "loss": 0.3462, "rewards/accuracies": 0.875, "rewards/chosen": -2.1963088512420654, "rewards/margins": 2.007949113845825, "rewards/rejected": -4.204257965087891, "step": 125 }, { "epoch": 0.23908918406072105, "grad_norm": 3.9472641944885254, "learning_rate": 0.00019817595323310097, "logits/chosen": -2.880319356918335, "logits/rejected": -2.884037494659424, "logps/chosen": -43.69145202636719, "logps/rejected": -59.259368896484375, "loss": 0.4902, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0041704177856445, "rewards/margins": 1.6014294624328613, "rewards/rejected": -3.605599880218506, "step": 126 }, { "epoch": 0.2409867172675522, "grad_norm": 3.1087348461151123, "learning_rate": 0.0001980334141701667, "logits/chosen": -2.848803997039795, "logits/rejected": -2.841339588165283, "logps/chosen": -42.932640075683594, "logps/rejected": -45.115997314453125, "loss": 0.6577, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0474021434783936, "rewards/margins": 0.36965787410736084, "rewards/rejected": -2.417059898376465, "step": 127 }, { "epoch": 0.2428842504743833, "grad_norm": 1.647839069366455, "learning_rate": 0.0001978855685095358, "logits/chosen": -2.8491506576538086, "logits/rejected": -2.840852975845337, "logps/chosen": -39.779205322265625, "logps/rejected": -48.12335968017578, "loss": 0.383, "rewards/accuracies": 0.875, "rewards/chosen": -1.630105972290039, "rewards/margins": 1.0952038764953613, "rewards/rejected": -2.7253098487854004, "step": 128 }, { "epoch": 0.24478178368121442, "grad_norm": 1.2736908197402954, "learning_rate": 0.00019773242425416768, "logits/chosen": -2.934404134750366, "logits/rejected": -2.9313926696777344, "logps/chosen": -42.447059631347656, "logps/rejected": -54.27853775024414, "loss": 0.3232, "rewards/accuracies": 0.875, "rewards/chosen": -1.8265783786773682, "rewards/margins": 1.4815216064453125, "rewards/rejected": -3.3081002235412598, "step": 129 }, { "epoch": 0.24667931688804554, "grad_norm": 1.8245573043823242, "learning_rate": 0.0001975739896938375, "logits/chosen": -2.9123284816741943, "logits/rejected": -2.909496545791626, "logps/chosen": -36.656558990478516, "logps/rejected": -40.601043701171875, "loss": 0.5459, "rewards/accuracies": 0.75, "rewards/chosen": -1.2777769565582275, "rewards/margins": 0.5270283222198486, "rewards/rejected": -1.8048052787780762, "step": 130 }, { "epoch": 0.24857685009487665, "grad_norm": 1.4160490036010742, "learning_rate": 0.00019741027340468715, "logits/chosen": -2.9194860458374023, "logits/rejected": -2.9193787574768066, "logps/chosen": -31.82858657836914, "logps/rejected": -51.616615295410156, "loss": 0.2742, "rewards/accuracies": 0.9375, "rewards/chosen": -0.807111382484436, "rewards/margins": 2.136796712875366, "rewards/rejected": -2.943908214569092, "step": 131 }, { "epoch": 0.2504743833017078, "grad_norm": 2.3211324214935303, "learning_rate": 0.00019724128424876116, "logits/chosen": -2.9083187580108643, "logits/rejected": -2.9106099605560303, "logps/chosen": -32.994529724121094, "logps/rejected": -48.345680236816406, "loss": 0.4002, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8857762813568115, "rewards/margins": 1.6735560894012451, "rewards/rejected": -2.5593323707580566, "step": 132 }, { "epoch": 0.2523719165085389, "grad_norm": 2.762873888015747, "learning_rate": 0.00019706703137352695, "logits/chosen": -2.9064345359802246, "logits/rejected": -2.9009690284729004, "logps/chosen": -35.646888732910156, "logps/rejected": -56.55396270751953, "loss": 0.1766, "rewards/accuracies": 0.9375, "rewards/chosen": -1.0316578149795532, "rewards/margins": 2.3745384216308594, "rewards/rejected": -3.406196355819702, "step": 133 }, { "epoch": 0.25426944971537, "grad_norm": 3.811307907104492, "learning_rate": 0.0001968875242113798, "logits/chosen": -2.9285547733306885, "logits/rejected": -2.928537130355835, "logps/chosen": -39.112098693847656, "logps/rejected": -56.93737030029297, "loss": 0.3068, "rewards/accuracies": 0.875, "rewards/chosen": -1.5674068927764893, "rewards/margins": 1.9389293193817139, "rewards/rejected": -3.506336212158203, "step": 134 }, { "epoch": 0.25616698292220114, "grad_norm": 3.130340337753296, "learning_rate": 0.00019670277247913205, "logits/chosen": -2.915271520614624, "logits/rejected": -2.9155397415161133, "logps/chosen": -41.27661895751953, "logps/rejected": -58.242916107177734, "loss": 0.2538, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7179136276245117, "rewards/margins": 1.8386231660842896, "rewards/rejected": -3.556536912918091, "step": 135 }, { "epoch": 0.25806451612903225, "grad_norm": 1.0148847103118896, "learning_rate": 0.0001965127861774873, "logits/chosen": -2.9431655406951904, "logits/rejected": -2.9385225772857666, "logps/chosen": -53.51545715332031, "logps/rejected": -75.12637329101562, "loss": 0.1931, "rewards/accuracies": 0.9375, "rewards/chosen": -2.833062171936035, "rewards/margins": 2.448612689971924, "rewards/rejected": -5.281674861907959, "step": 136 }, { "epoch": 0.25996204933586337, "grad_norm": 3.7195470333099365, "learning_rate": 0.00019631757559049898, "logits/chosen": -2.9738211631774902, "logits/rejected": -2.9678642749786377, "logps/chosen": -56.815650939941406, "logps/rejected": -70.7271728515625, "loss": 0.4445, "rewards/accuracies": 0.6875, "rewards/chosen": -3.2865800857543945, "rewards/margins": 1.6364598274230957, "rewards/rejected": -4.923039436340332, "step": 137 }, { "epoch": 0.2618595825426945, "grad_norm": 1.2523024082183838, "learning_rate": 0.00019611715128501378, "logits/chosen": -2.9443469047546387, "logits/rejected": -2.9399166107177734, "logps/chosen": -65.968505859375, "logps/rejected": -84.37772369384766, "loss": 0.2675, "rewards/accuracies": 0.9375, "rewards/chosen": -4.088393688201904, "rewards/margins": 2.0679855346679688, "rewards/rejected": -6.156379222869873, "step": 138 }, { "epoch": 0.2637571157495256, "grad_norm": 1.1542834043502808, "learning_rate": 0.0001959115241100994, "logits/chosen": -2.975813865661621, "logits/rejected": -2.970766544342041, "logps/chosen": -64.42146301269531, "logps/rejected": -84.94852447509766, "loss": 0.2146, "rewards/accuracies": 0.9375, "rewards/chosen": -4.040821552276611, "rewards/margins": 2.274022102355957, "rewards/rejected": -6.314843654632568, "step": 139 }, { "epoch": 0.2656546489563567, "grad_norm": 1.7230674028396606, "learning_rate": 0.00019570070519645767, "logits/chosen": -2.9944310188293457, "logits/rejected": -2.9947562217712402, "logps/chosen": -58.38348388671875, "logps/rejected": -79.51581573486328, "loss": 0.2081, "rewards/accuracies": 0.9375, "rewards/chosen": -3.408212661743164, "rewards/margins": 2.3429977893829346, "rewards/rejected": -5.751210689544678, "step": 140 }, { "epoch": 0.2675521821631879, "grad_norm": 1.7467715740203857, "learning_rate": 0.00019548470595582166, "logits/chosen": -2.96416974067688, "logits/rejected": -2.9619786739349365, "logps/chosen": -66.57093048095703, "logps/rejected": -83.50373077392578, "loss": 0.3606, "rewards/accuracies": 0.75, "rewards/chosen": -4.171794414520264, "rewards/margins": 1.9003512859344482, "rewards/rejected": -6.072145462036133, "step": 141 }, { "epoch": 0.269449715370019, "grad_norm": 4.687629222869873, "learning_rate": 0.00019526353808033825, "logits/chosen": -3.0060713291168213, "logits/rejected": -3.0069186687469482, "logps/chosen": -49.43899917602539, "logps/rejected": -76.19532775878906, "loss": 0.3784, "rewards/accuracies": 0.75, "rewards/chosen": -2.521219253540039, "rewards/margins": 2.774571180343628, "rewards/rejected": -5.295790195465088, "step": 142 }, { "epoch": 0.2713472485768501, "grad_norm": 6.6354451179504395, "learning_rate": 0.00019503721354193504, "logits/chosen": -2.9884586334228516, "logits/rejected": -2.9861557483673096, "logps/chosen": -44.920143127441406, "logps/rejected": -75.18492126464844, "loss": 0.2519, "rewards/accuracies": 0.875, "rewards/chosen": -2.0825746059417725, "rewards/margins": 3.216859817504883, "rewards/rejected": -5.299434661865234, "step": 143 }, { "epoch": 0.2732447817836812, "grad_norm": 6.729991912841797, "learning_rate": 0.0001948057445916724, "logits/chosen": -2.983428955078125, "logits/rejected": -2.977630138397217, "logps/chosen": -40.8714599609375, "logps/rejected": -72.71563720703125, "loss": 0.308, "rewards/accuracies": 0.8125, "rewards/chosen": -1.621235728263855, "rewards/margins": 3.496899366378784, "rewards/rejected": -5.118135452270508, "step": 144 }, { "epoch": 0.27514231499051234, "grad_norm": 1.3384770154953003, "learning_rate": 0.00019456914375908023, "logits/chosen": -2.9973654747009277, "logits/rejected": -2.9885263442993164, "logps/chosen": -37.383270263671875, "logps/rejected": -62.177364349365234, "loss": 0.1718, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3070964813232422, "rewards/margins": 2.7271294593811035, "rewards/rejected": -4.034225940704346, "step": 145 }, { "epoch": 0.27703984819734345, "grad_norm": 3.572751522064209, "learning_rate": 0.00019432742385147987, "logits/chosen": -3.0002806186676025, "logits/rejected": -2.992485523223877, "logps/chosen": -40.88140106201172, "logps/rejected": -76.8145980834961, "loss": 0.2338, "rewards/accuracies": 0.875, "rewards/chosen": -1.6452064514160156, "rewards/margins": 3.8751437664031982, "rewards/rejected": -5.520350456237793, "step": 146 }, { "epoch": 0.27893738140417457, "grad_norm": 3.8624587059020996, "learning_rate": 0.0001940805979532907, "logits/chosen": -3.022134304046631, "logits/rejected": -3.0201826095581055, "logps/chosen": -46.764404296875, "logps/rejected": -76.25241088867188, "loss": 0.32, "rewards/accuracies": 0.875, "rewards/chosen": -2.3360767364501953, "rewards/margins": 3.035257577896118, "rewards/rejected": -5.371334075927734, "step": 147 }, { "epoch": 0.2808349146110057, "grad_norm": 2.982891321182251, "learning_rate": 0.00019382867942532194, "logits/chosen": -3.0175113677978516, "logits/rejected": -3.006810188293457, "logps/chosen": -55.96101379394531, "logps/rejected": -92.93550109863281, "loss": 0.2791, "rewards/accuracies": 0.9375, "rewards/chosen": -3.2954790592193604, "rewards/margins": 3.832991361618042, "rewards/rejected": -7.128470420837402, "step": 148 }, { "epoch": 0.2827324478178368, "grad_norm": 5.67358922958374, "learning_rate": 0.00019357168190404936, "logits/chosen": -3.0050764083862305, "logits/rejected": -2.9992873668670654, "logps/chosen": -58.6027717590332, "logps/rejected": -89.6549301147461, "loss": 0.5549, "rewards/accuracies": 0.8125, "rewards/chosen": -3.570911407470703, "rewards/margins": 3.1905393600463867, "rewards/rejected": -6.76145076751709, "step": 149 }, { "epoch": 0.2846299810246679, "grad_norm": 7.6213297843933105, "learning_rate": 0.00019330961930087725, "logits/chosen": -3.0374433994293213, "logits/rejected": -3.035446882247925, "logps/chosen": -66.83151245117188, "logps/rejected": -85.86499786376953, "loss": 0.7267, "rewards/accuracies": 0.75, "rewards/chosen": -4.353948593139648, "rewards/margins": 1.9724806547164917, "rewards/rejected": -6.3264288902282715, "step": 150 }, { "epoch": 0.286527514231499, "grad_norm": 4.909121513366699, "learning_rate": 0.00019304250580138524, "logits/chosen": -3.071992874145508, "logits/rejected": -3.07159423828125, "logps/chosen": -68.75637817382812, "logps/rejected": -89.85405731201172, "loss": 0.5353, "rewards/accuracies": 0.75, "rewards/chosen": -4.568320274353027, "rewards/margins": 2.245361328125, "rewards/rejected": -6.813681602478027, "step": 151 }, { "epoch": 0.2884250474383302, "grad_norm": 6.616629600524902, "learning_rate": 0.00019277035586456057, "logits/chosen": -3.0715456008911133, "logits/rejected": -3.070178270339966, "logps/chosen": -62.83892059326172, "logps/rejected": -83.76055145263672, "loss": 0.5039, "rewards/accuracies": 0.6875, "rewards/chosen": -3.863878011703491, "rewards/margins": 2.338576316833496, "rewards/rejected": -6.202454566955566, "step": 152 }, { "epoch": 0.2903225806451613, "grad_norm": 2.418257713317871, "learning_rate": 0.00019249318422201523, "logits/chosen": -3.0929856300354004, "logits/rejected": -3.085249185562134, "logps/chosen": -60.662742614746094, "logps/rejected": -78.93743896484375, "loss": 0.401, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6301565170288086, "rewards/margins": 2.02978253364563, "rewards/rejected": -5.659938812255859, "step": 153 }, { "epoch": 0.2922201138519924, "grad_norm": 1.500931978225708, "learning_rate": 0.00019221100587718884, "logits/chosen": -3.0711896419525146, "logits/rejected": -3.065363645553589, "logps/chosen": -58.692962646484375, "logps/rejected": -79.85702514648438, "loss": 0.2513, "rewards/accuracies": 0.9375, "rewards/chosen": -3.450127124786377, "rewards/margins": 2.3925392627716064, "rewards/rejected": -5.8426666259765625, "step": 154 }, { "epoch": 0.29411764705882354, "grad_norm": 2.4970502853393555, "learning_rate": 0.00019192383610453618, "logits/chosen": -3.10758900642395, "logits/rejected": -3.1034536361694336, "logps/chosen": -61.92613220214844, "logps/rejected": -78.0351333618164, "loss": 0.3792, "rewards/accuracies": 0.875, "rewards/chosen": -3.7843873500823975, "rewards/margins": 1.7335383892059326, "rewards/rejected": -5.51792573928833, "step": 155 }, { "epoch": 0.29601518026565465, "grad_norm": 4.313493251800537, "learning_rate": 0.0001916316904487005, "logits/chosen": -3.1217973232269287, "logits/rejected": -3.1168103218078613, "logps/chosen": -61.29463577270508, "logps/rejected": -74.25381469726562, "loss": 0.6506, "rewards/accuracies": 0.6875, "rewards/chosen": -3.835869550704956, "rewards/margins": 1.4246034622192383, "rewards/rejected": -5.260473251342773, "step": 156 }, { "epoch": 0.29791271347248577, "grad_norm": 2.5466320514678955, "learning_rate": 0.00019133458472367213, "logits/chosen": -3.1386356353759766, "logits/rejected": -3.138566255569458, "logps/chosen": -64.02009582519531, "logps/rejected": -86.23556518554688, "loss": 0.2796, "rewards/accuracies": 0.875, "rewards/chosen": -3.9694676399230957, "rewards/margins": 2.4586336612701416, "rewards/rejected": -6.428101539611816, "step": 157 }, { "epoch": 0.2998102466793169, "grad_norm": 3.1764183044433594, "learning_rate": 0.00019103253501193254, "logits/chosen": -3.181774377822876, "logits/rejected": -3.176786422729492, "logps/chosen": -63.130775451660156, "logps/rejected": -84.73394775390625, "loss": 0.279, "rewards/accuracies": 0.8125, "rewards/chosen": -4.033406734466553, "rewards/margins": 2.265568733215332, "rewards/rejected": -6.298975467681885, "step": 158 }, { "epoch": 0.301707779886148, "grad_norm": 5.094723701477051, "learning_rate": 0.00019072555766358346, "logits/chosen": -3.164654016494751, "logits/rejected": -3.161222457885742, "logps/chosen": -61.11647033691406, "logps/rejected": -88.24617004394531, "loss": 0.4683, "rewards/accuracies": 0.75, "rewards/chosen": -3.767437219619751, "rewards/margins": 2.976083755493164, "rewards/rejected": -6.743520736694336, "step": 159 }, { "epoch": 0.3036053130929791, "grad_norm": 2.2500054836273193, "learning_rate": 0.00019041366929546219, "logits/chosen": -3.1624157428741455, "logits/rejected": -3.1557273864746094, "logps/chosen": -58.45857238769531, "logps/rejected": -83.57339477539062, "loss": 0.2568, "rewards/accuracies": 0.8125, "rewards/chosen": -3.415253162384033, "rewards/margins": 2.820373058319092, "rewards/rejected": -6.235626220703125, "step": 160 }, { "epoch": 0.3055028462998102, "grad_norm": 2.157329797744751, "learning_rate": 0.0001900968867902419, "logits/chosen": -3.1669349670410156, "logits/rejected": -3.1593239307403564, "logps/chosen": -54.65401077270508, "logps/rejected": -82.71353912353516, "loss": 0.2324, "rewards/accuracies": 0.875, "rewards/chosen": -3.078385829925537, "rewards/margins": 2.9072060585021973, "rewards/rejected": -5.985591888427734, "step": 161 }, { "epoch": 0.30740037950664134, "grad_norm": 3.35286808013916, "learning_rate": 0.000189775227295518, "logits/chosen": -3.158579111099243, "logits/rejected": -3.155748128890991, "logps/chosen": -57.20378112792969, "logps/rejected": -77.08695983886719, "loss": 0.4451, "rewards/accuracies": 0.8125, "rewards/chosen": -3.36582350730896, "rewards/margins": 2.117894411087036, "rewards/rejected": -5.483717441558838, "step": 162 }, { "epoch": 0.3092979127134725, "grad_norm": 1.9471580982208252, "learning_rate": 0.00018944870822287956, "logits/chosen": -3.1374692916870117, "logits/rejected": -3.134248971939087, "logps/chosen": -49.715667724609375, "logps/rejected": -73.12477111816406, "loss": 0.2535, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6359071731567383, "rewards/margins": 2.407243251800537, "rewards/rejected": -5.043150901794434, "step": 163 }, { "epoch": 0.3111954459203036, "grad_norm": 2.495023012161255, "learning_rate": 0.00018911734724696722, "logits/chosen": -3.1576712131500244, "logits/rejected": -3.1541287899017334, "logps/chosen": -48.007938385009766, "logps/rejected": -66.66117095947266, "loss": 0.4549, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4171323776245117, "rewards/margins": 2.037095308303833, "rewards/rejected": -4.454227447509766, "step": 164 }, { "epoch": 0.31309297912713474, "grad_norm": 4.0073652267456055, "learning_rate": 0.00018878116230451613, "logits/chosen": -3.111081838607788, "logits/rejected": -3.111217737197876, "logps/chosen": -44.466983795166016, "logps/rejected": -60.27463912963867, "loss": 0.4762, "rewards/accuracies": 0.75, "rewards/chosen": -2.118543863296509, "rewards/margins": 1.7310559749603271, "rewards/rejected": -3.849599599838257, "step": 165 }, { "epoch": 0.31499051233396586, "grad_norm": 3.148956537246704, "learning_rate": 0.00018844017159338528, "logits/chosen": -3.118450403213501, "logits/rejected": -3.123354911804199, "logps/chosen": -38.37690734863281, "logps/rejected": -53.64491271972656, "loss": 0.4602, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5390743017196655, "rewards/margins": 1.7078005075454712, "rewards/rejected": -3.2468748092651367, "step": 166 }, { "epoch": 0.31688804554079697, "grad_norm": 1.8341965675354004, "learning_rate": 0.00018809439357157223, "logits/chosen": -3.0582046508789062, "logits/rejected": -3.05729079246521, "logps/chosen": -33.0239372253418, "logps/rejected": -39.82599639892578, "loss": 0.5013, "rewards/accuracies": 0.625, "rewards/chosen": -0.91768878698349, "rewards/margins": 0.8253737092018127, "rewards/rejected": -1.7430624961853027, "step": 167 }, { "epoch": 0.3187855787476281, "grad_norm": 2.7328832149505615, "learning_rate": 0.00018774384695621407, "logits/chosen": -3.0516273975372314, "logits/rejected": -3.0475456714630127, "logps/chosen": -41.876365661621094, "logps/rejected": -49.84267044067383, "loss": 0.5872, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7223644256591797, "rewards/margins": 0.9419503808021545, "rewards/rejected": -2.6643147468566895, "step": 168 }, { "epoch": 0.3206831119544592, "grad_norm": 1.2971059083938599, "learning_rate": 0.0001873885507225743, "logits/chosen": -3.0343210697174072, "logits/rejected": -3.0288002490997314, "logps/chosen": -36.68798065185547, "logps/rejected": -51.96241760253906, "loss": 0.2352, "rewards/accuracies": 1.0, "rewards/chosen": -1.3304095268249512, "rewards/margins": 1.7680621147155762, "rewards/rejected": -3.0984716415405273, "step": 169 }, { "epoch": 0.3225806451612903, "grad_norm": 2.298835039138794, "learning_rate": 0.00018702852410301554, "logits/chosen": -3.0099384784698486, "logits/rejected": -3.014251708984375, "logps/chosen": -48.32796096801758, "logps/rejected": -69.7240219116211, "loss": 0.314, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4110379219055176, "rewards/margins": 2.3697621822357178, "rewards/rejected": -4.7808003425598145, "step": 170 }, { "epoch": 0.32447817836812143, "grad_norm": 2.5293760299682617, "learning_rate": 0.0001866637865859586, "logits/chosen": -3.0135481357574463, "logits/rejected": -3.0115551948547363, "logps/chosen": -54.45928955078125, "logps/rejected": -71.06674194335938, "loss": 0.3232, "rewards/accuracies": 0.8125, "rewards/chosen": -3.125586986541748, "rewards/margins": 1.8217525482177734, "rewards/rejected": -4.94734001159668, "step": 171 }, { "epoch": 0.32637571157495254, "grad_norm": 2.6669814586639404, "learning_rate": 0.00018629435791482765, "logits/chosen": -2.947815418243408, "logits/rejected": -2.9488513469696045, "logps/chosen": -60.36651611328125, "logps/rejected": -84.83184814453125, "loss": 0.3001, "rewards/accuracies": 0.8125, "rewards/chosen": -3.707231283187866, "rewards/margins": 2.5273969173431396, "rewards/rejected": -6.234628677368164, "step": 172 }, { "epoch": 0.32827324478178366, "grad_norm": 3.21972918510437, "learning_rate": 0.00018592025808698116, "logits/chosen": -2.9881162643432617, "logits/rejected": -2.986424684524536, "logps/chosen": -63.61874771118164, "logps/rejected": -97.28710174560547, "loss": 0.3389, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8928141593933105, "rewards/margins": 3.6045291423797607, "rewards/rejected": -7.497343063354492, "step": 173 }, { "epoch": 0.3301707779886148, "grad_norm": 2.9337503910064697, "learning_rate": 0.00018554150735262975, "logits/chosen": -2.904815912246704, "logits/rejected": -2.9042913913726807, "logps/chosen": -65.26492309570312, "logps/rejected": -88.40277099609375, "loss": 0.3105, "rewards/accuracies": 0.875, "rewards/chosen": -4.115939140319824, "rewards/margins": 2.4648208618164062, "rewards/rejected": -6.5807600021362305, "step": 174 }, { "epoch": 0.33206831119544594, "grad_norm": 3.591499090194702, "learning_rate": 0.00018515812621373997, "logits/chosen": -2.9820444583892822, "logits/rejected": -2.969967842102051, "logps/chosen": -67.85137176513672, "logps/rejected": -87.8746109008789, "loss": 0.4435, "rewards/accuracies": 0.75, "rewards/chosen": -4.4569549560546875, "rewards/margins": 2.025627851486206, "rewards/rejected": -6.482582092285156, "step": 175 }, { "epoch": 0.33396584440227706, "grad_norm": 3.6195833683013916, "learning_rate": 0.00018477013542292446, "logits/chosen": -2.9211320877075195, "logits/rejected": -2.9202651977539062, "logps/chosen": -65.85560607910156, "logps/rejected": -88.60107421875, "loss": 0.542, "rewards/accuracies": 0.8125, "rewards/chosen": -4.231626510620117, "rewards/margins": 2.3576202392578125, "rewards/rejected": -6.589247226715088, "step": 176 }, { "epoch": 0.33586337760910817, "grad_norm": 4.6038594245910645, "learning_rate": 0.00018437755598231856, "logits/chosen": -2.9478230476379395, "logits/rejected": -2.9453256130218506, "logps/chosen": -66.39960479736328, "logps/rejected": -88.36288452148438, "loss": 0.3217, "rewards/accuracies": 0.8125, "rewards/chosen": -4.242918014526367, "rewards/margins": 2.3832414150238037, "rewards/rejected": -6.62615966796875, "step": 177 }, { "epoch": 0.3377609108159393, "grad_norm": 1.2974350452423096, "learning_rate": 0.00018398040914244362, "logits/chosen": -2.901718854904175, "logits/rejected": -2.8996200561523438, "logps/chosen": -67.7139892578125, "logps/rejected": -91.30947875976562, "loss": 0.1607, "rewards/accuracies": 1.0, "rewards/chosen": -4.423122406005859, "rewards/margins": 2.575498580932617, "rewards/rejected": -6.998621463775635, "step": 178 }, { "epoch": 0.3396584440227704, "grad_norm": 3.1375489234924316, "learning_rate": 0.00018357871640105645, "logits/chosen": -2.926860809326172, "logits/rejected": -2.923616886138916, "logps/chosen": -68.5616455078125, "logps/rejected": -92.57727813720703, "loss": 0.294, "rewards/accuracies": 0.8125, "rewards/chosen": -4.4224042892456055, "rewards/margins": 2.60679292678833, "rewards/rejected": -7.029196739196777, "step": 179 }, { "epoch": 0.3415559772296015, "grad_norm": 1.6190105676651, "learning_rate": 0.00018317249950198597, "logits/chosen": -2.937143087387085, "logits/rejected": -2.9350616931915283, "logps/chosen": -67.23421478271484, "logps/rejected": -93.80059814453125, "loss": 0.1848, "rewards/accuracies": 1.0, "rewards/chosen": -4.204146385192871, "rewards/margins": 3.0119266510009766, "rewards/rejected": -7.216073513031006, "step": 180 }, { "epoch": 0.34345351043643263, "grad_norm": 2.069124698638916, "learning_rate": 0.00018276178043395586, "logits/chosen": -2.942324638366699, "logits/rejected": -2.943343162536621, "logps/chosen": -70.87568664550781, "logps/rejected": -101.76148223876953, "loss": 0.1794, "rewards/accuracies": 0.9375, "rewards/chosen": -4.680966377258301, "rewards/margins": 3.301745653152466, "rewards/rejected": -7.9827117919921875, "step": 181 }, { "epoch": 0.34535104364326374, "grad_norm": 6.071887493133545, "learning_rate": 0.00018234658142939454, "logits/chosen": -2.9245431423187256, "logits/rejected": -2.9229578971862793, "logps/chosen": -66.28981018066406, "logps/rejected": -88.98973083496094, "loss": 0.5967, "rewards/accuracies": 0.75, "rewards/chosen": -4.1609787940979, "rewards/margins": 2.4637625217437744, "rewards/rejected": -6.624741077423096, "step": 182 }, { "epoch": 0.34724857685009486, "grad_norm": 1.7315760850906372, "learning_rate": 0.00018192692496323156, "logits/chosen": -2.9373724460601807, "logits/rejected": -2.9322011470794678, "logps/chosen": -61.68766403198242, "logps/rejected": -97.79752349853516, "loss": 0.0801, "rewards/accuracies": 1.0, "rewards/chosen": -3.600949287414551, "rewards/margins": 3.957946300506592, "rewards/rejected": -7.558895587921143, "step": 183 }, { "epoch": 0.349146110056926, "grad_norm": 4.817263603210449, "learning_rate": 0.00018150283375168114, "logits/chosen": -2.9242465496063232, "logits/rejected": -2.9281442165374756, "logps/chosen": -49.455657958984375, "logps/rejected": -78.6143798828125, "loss": 0.4009, "rewards/accuracies": 0.75, "rewards/chosen": -2.6410908699035645, "rewards/margins": 3.0010156631469727, "rewards/rejected": -5.642106533050537, "step": 184 }, { "epoch": 0.3510436432637571, "grad_norm": 6.691137790679932, "learning_rate": 0.00018107433075101252, "logits/chosen": -2.8935739994049072, "logits/rejected": -2.8921587467193604, "logps/chosen": -39.874351501464844, "logps/rejected": -75.13919830322266, "loss": 0.6717, "rewards/accuracies": 0.8125, "rewards/chosen": -1.6036827564239502, "rewards/margins": 3.638152837753296, "rewards/rejected": -5.241835594177246, "step": 185 }, { "epoch": 0.35294117647058826, "grad_norm": 6.4132914543151855, "learning_rate": 0.00018064143915630723, "logits/chosen": -2.8910973072052, "logits/rejected": -2.8896028995513916, "logps/chosen": -42.009525299072266, "logps/rejected": -60.758155822753906, "loss": 0.7367, "rewards/accuracies": 0.875, "rewards/chosen": -1.7192809581756592, "rewards/margins": 2.1440207958221436, "rewards/rejected": -3.8633017539978027, "step": 186 }, { "epoch": 0.3548387096774194, "grad_norm": 3.2330329418182373, "learning_rate": 0.0001802041824002036, "logits/chosen": -2.9204766750335693, "logits/rejected": -2.915883779525757, "logps/chosen": -38.97833251953125, "logps/rejected": -57.29167175292969, "loss": 0.3248, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5657511949539185, "rewards/margins": 1.8807975053787231, "rewards/rejected": -3.4465484619140625, "step": 187 }, { "epoch": 0.3567362428842505, "grad_norm": 1.4538908004760742, "learning_rate": 0.00017976258415162833, "logits/chosen": -2.883896827697754, "logits/rejected": -2.8819518089294434, "logps/chosen": -38.06407928466797, "logps/rejected": -52.192264556884766, "loss": 0.3424, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4242199659347534, "rewards/margins": 1.5716015100479126, "rewards/rejected": -2.995821475982666, "step": 188 }, { "epoch": 0.3586337760910816, "grad_norm": 1.8755921125411987, "learning_rate": 0.00017931666831451536, "logits/chosen": -2.859116554260254, "logits/rejected": -2.8647141456604004, "logps/chosen": -35.76837158203125, "logps/rejected": -55.3032112121582, "loss": 0.5151, "rewards/accuracies": 0.875, "rewards/chosen": -1.3126046657562256, "rewards/margins": 2.0264344215393066, "rewards/rejected": -3.3390393257141113, "step": 189 }, { "epoch": 0.3605313092979127, "grad_norm": 2.1501007080078125, "learning_rate": 0.00017886645902651167, "logits/chosen": -2.9286084175109863, "logits/rejected": -2.9369022846221924, "logps/chosen": -39.66361618041992, "logps/rejected": -68.78336334228516, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/chosen": -1.5931215286254883, "rewards/margins": 3.1761364936828613, "rewards/rejected": -4.76925802230835, "step": 190 }, { "epoch": 0.36242884250474383, "grad_norm": 4.919617652893066, "learning_rate": 0.00017841198065767107, "logits/chosen": -2.9136152267456055, "logits/rejected": -2.914355754852295, "logps/chosen": -54.04287338256836, "logps/rejected": -71.08902740478516, "loss": 0.6126, "rewards/accuracies": 0.6875, "rewards/chosen": -3.111724615097046, "rewards/margins": 1.875967025756836, "rewards/rejected": -4.987691402435303, "step": 191 }, { "epoch": 0.36432637571157495, "grad_norm": 2.6215672492980957, "learning_rate": 0.0001779532578091347, "logits/chosen": -2.9398844242095947, "logits/rejected": -2.94006609916687, "logps/chosen": -56.28917694091797, "logps/rejected": -94.09794616699219, "loss": 0.1769, "rewards/accuracies": 0.875, "rewards/chosen": -3.2438466548919678, "rewards/margins": 4.059857368469238, "rewards/rejected": -7.303704261779785, "step": 192 }, { "epoch": 0.36622390891840606, "grad_norm": 7.178459167480469, "learning_rate": 0.00017749031531179963, "logits/chosen": -2.8921046257019043, "logits/rejected": -2.8897299766540527, "logps/chosen": -65.22640991210938, "logps/rejected": -91.53515625, "loss": 0.6301, "rewards/accuracies": 0.8125, "rewards/chosen": -4.077861785888672, "rewards/margins": 2.8920609951019287, "rewards/rejected": -6.96992301940918, "step": 193 }, { "epoch": 0.3681214421252372, "grad_norm": 4.36641263961792, "learning_rate": 0.00017702317822497455, "logits/chosen": -2.8808705806732178, "logits/rejected": -2.8756415843963623, "logps/chosen": -72.72103118896484, "logps/rejected": -96.75923919677734, "loss": 0.3474, "rewards/accuracies": 0.875, "rewards/chosen": -4.825438499450684, "rewards/margins": 2.646296977996826, "rewards/rejected": -7.47173547744751, "step": 194 }, { "epoch": 0.3700189753320683, "grad_norm": 4.489946365356445, "learning_rate": 0.00017655187183502344, "logits/chosen": -2.899508237838745, "logits/rejected": -2.89617919921875, "logps/chosen": -77.47444152832031, "logps/rejected": -94.92417907714844, "loss": 0.7369, "rewards/accuracies": 0.75, "rewards/chosen": -5.465056419372559, "rewards/margins": 1.7898337841033936, "rewards/rejected": -7.254889488220215, "step": 195 }, { "epoch": 0.3719165085388994, "grad_norm": 3.543882369995117, "learning_rate": 0.00017607642165399666, "logits/chosen": -2.8700618743896484, "logits/rejected": -2.865544319152832, "logps/chosen": -71.58328247070312, "logps/rejected": -85.74073028564453, "loss": 0.4353, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7718706130981445, "rewards/margins": 1.615250587463379, "rewards/rejected": -6.387121200561523, "step": 196 }, { "epoch": 0.3738140417457306, "grad_norm": 1.7851449251174927, "learning_rate": 0.0001755968534182501, "logits/chosen": -2.8684256076812744, "logits/rejected": -2.8665754795074463, "logps/chosen": -68.68287658691406, "logps/rejected": -88.38851928710938, "loss": 0.2769, "rewards/accuracies": 0.875, "rewards/chosen": -4.466452598571777, "rewards/margins": 2.200467824935913, "rewards/rejected": -6.6669206619262695, "step": 197 }, { "epoch": 0.3757115749525617, "grad_norm": 3.8078126907348633, "learning_rate": 0.00017511319308705198, "logits/chosen": -2.8291757106781006, "logits/rejected": -2.8295886516571045, "logps/chosen": -74.86436462402344, "logps/rejected": -83.49817657470703, "loss": 0.5876, "rewards/accuracies": 0.6875, "rewards/chosen": -5.133200168609619, "rewards/margins": 0.8893264532089233, "rewards/rejected": -6.022526741027832, "step": 198 }, { "epoch": 0.3776091081593928, "grad_norm": 2.537524938583374, "learning_rate": 0.0001746254668411778, "logits/chosen": -2.826998472213745, "logits/rejected": -2.8250083923339844, "logps/chosen": -70.27324676513672, "logps/rejected": -84.13557434082031, "loss": 0.3498, "rewards/accuracies": 0.9375, "rewards/chosen": -4.654443740844727, "rewards/margins": 1.5413017272949219, "rewards/rejected": -6.195745468139648, "step": 199 }, { "epoch": 0.3795066413662239, "grad_norm": 0.7963341474533081, "learning_rate": 0.00017413370108149286, "logits/chosen": -2.8284592628479004, "logits/rejected": -2.8273520469665527, "logps/chosen": -59.756290435791016, "logps/rejected": -88.60334777832031, "loss": 0.1007, "rewards/accuracies": 1.0, "rewards/chosen": -3.604917049407959, "rewards/margins": 3.0917110443115234, "rewards/rejected": -6.696628570556641, "step": 200 }, { "epoch": 0.38140417457305503, "grad_norm": 2.5711867809295654, "learning_rate": 0.00017363792242752353, "logits/chosen": -2.8135452270507812, "logits/rejected": -2.8090662956237793, "logps/chosen": -60.564918518066406, "logps/rejected": -83.0078125, "loss": 0.3432, "rewards/accuracies": 0.875, "rewards/chosen": -3.7318811416625977, "rewards/margins": 2.3645801544189453, "rewards/rejected": -6.096461296081543, "step": 201 }, { "epoch": 0.38330170777988615, "grad_norm": 1.702682614326477, "learning_rate": 0.0001731381577160161, "logits/chosen": -2.8074567317962646, "logits/rejected": -2.8068923950195312, "logps/chosen": -58.00551986694336, "logps/rejected": -77.38255310058594, "loss": 0.2787, "rewards/accuracies": 0.875, "rewards/chosen": -3.473022937774658, "rewards/margins": 2.0791990756988525, "rewards/rejected": -5.55222225189209, "step": 202 }, { "epoch": 0.38519924098671726, "grad_norm": 3.307158946990967, "learning_rate": 0.0001726344339994841, "logits/chosen": -2.837968349456787, "logits/rejected": -2.8358547687530518, "logps/chosen": -68.95384979248047, "logps/rejected": -79.25270080566406, "loss": 0.5837, "rewards/accuracies": 0.6875, "rewards/chosen": -4.315940856933594, "rewards/margins": 1.430248498916626, "rewards/rejected": -5.746189594268799, "step": 203 }, { "epoch": 0.3870967741935484, "grad_norm": 2.0259664058685303, "learning_rate": 0.000172126778544744, "logits/chosen": -2.8287785053253174, "logits/rejected": -2.8304224014282227, "logps/chosen": -67.12183380126953, "logps/rejected": -80.09559631347656, "loss": 0.3632, "rewards/accuracies": 0.8125, "rewards/chosen": -4.335432529449463, "rewards/margins": 1.4739618301391602, "rewards/rejected": -5.809394836425781, "step": 204 }, { "epoch": 0.3889943074003795, "grad_norm": 2.4748473167419434, "learning_rate": 0.00017161521883143934, "logits/chosen": -2.862060308456421, "logits/rejected": -2.861398458480835, "logps/chosen": -68.42402648925781, "logps/rejected": -91.68376159667969, "loss": 0.2329, "rewards/accuracies": 0.9375, "rewards/chosen": -4.550724983215332, "rewards/margins": 2.5067808628082275, "rewards/rejected": -7.057506084442139, "step": 205 }, { "epoch": 0.3908918406072106, "grad_norm": 1.0783460140228271, "learning_rate": 0.00017109978255055295, "logits/chosen": -2.7947001457214355, "logits/rejected": -2.8020119667053223, "logps/chosen": -55.356285095214844, "logps/rejected": -88.07723236083984, "loss": 0.0966, "rewards/accuracies": 1.0, "rewards/chosen": -3.234992027282715, "rewards/margins": 3.352567672729492, "rewards/rejected": -6.587559700012207, "step": 206 }, { "epoch": 0.3927893738140417, "grad_norm": 3.741246223449707, "learning_rate": 0.0001705804976029083, "logits/chosen": -2.7769172191619873, "logits/rejected": -2.7746269702911377, "logps/chosen": -67.84706115722656, "logps/rejected": -86.53413391113281, "loss": 0.3825, "rewards/accuracies": 0.8125, "rewards/chosen": -4.322835922241211, "rewards/margins": 2.0191259384155273, "rewards/rejected": -6.341961860656738, "step": 207 }, { "epoch": 0.3946869070208729, "grad_norm": 3.0758566856384277, "learning_rate": 0.00017005739209765904, "logits/chosen": -2.798175096511841, "logits/rejected": -2.7865893840789795, "logps/chosen": -63.870933532714844, "logps/rejected": -85.13277435302734, "loss": 0.3158, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9742186069488525, "rewards/margins": 2.2601842880249023, "rewards/rejected": -6.234403133392334, "step": 208 }, { "epoch": 0.396584440227704, "grad_norm": 4.3031907081604, "learning_rate": 0.0001695304943507677, "logits/chosen": -2.786068916320801, "logits/rejected": -2.7925620079040527, "logps/chosen": -70.43618774414062, "logps/rejected": -92.2144775390625, "loss": 0.6388, "rewards/accuracies": 0.8125, "rewards/chosen": -4.723659038543701, "rewards/margins": 2.3320984840393066, "rewards/rejected": -7.055757522583008, "step": 209 }, { "epoch": 0.3984819734345351, "grad_norm": 4.984089374542236, "learning_rate": 0.00016899983288347248, "logits/chosen": -2.802447557449341, "logits/rejected": -2.8011577129364014, "logps/chosen": -75.90496063232422, "logps/rejected": -96.45457458496094, "loss": 0.717, "rewards/accuracies": 0.75, "rewards/chosen": -5.078993320465088, "rewards/margins": 2.2575745582580566, "rewards/rejected": -7.3365678787231445, "step": 210 }, { "epoch": 0.40037950664136623, "grad_norm": 2.23420786857605, "learning_rate": 0.0001684654364207438, "logits/chosen": -2.7645010948181152, "logits/rejected": -2.7675206661224365, "logps/chosen": -55.797607421875, "logps/rejected": -92.49737548828125, "loss": 0.1039, "rewards/accuracies": 1.0, "rewards/chosen": -3.1702518463134766, "rewards/margins": 3.868875503540039, "rewards/rejected": -7.039127349853516, "step": 211 }, { "epoch": 0.40227703984819735, "grad_norm": 1.9920927286148071, "learning_rate": 0.00016792733388972932, "logits/chosen": -2.7536532878875732, "logits/rejected": -2.755016326904297, "logps/chosen": -58.96205139160156, "logps/rejected": -94.6013412475586, "loss": 0.105, "rewards/accuracies": 1.0, "rewards/chosen": -3.3591365814208984, "rewards/margins": 3.903737783432007, "rewards/rejected": -7.262874126434326, "step": 212 }, { "epoch": 0.40417457305502846, "grad_norm": 3.800337791442871, "learning_rate": 0.00016738555441818783, "logits/chosen": -2.8170573711395264, "logits/rejected": -2.8192145824432373, "logps/chosen": -57.503204345703125, "logps/rejected": -100.11039733886719, "loss": 0.187, "rewards/accuracies": 0.9375, "rewards/chosen": -3.294625759124756, "rewards/margins": 4.43009614944458, "rewards/rejected": -7.724721908569336, "step": 213 }, { "epoch": 0.4060721062618596, "grad_norm": 6.148056983947754, "learning_rate": 0.0001668401273329129, "logits/chosen": -2.773486614227295, "logits/rejected": -2.782440423965454, "logps/chosen": -49.874900817871094, "logps/rejected": -73.721435546875, "loss": 0.3677, "rewards/accuracies": 0.75, "rewards/chosen": -2.5481603145599365, "rewards/margins": 2.5078892707824707, "rewards/rejected": -5.056049346923828, "step": 214 }, { "epoch": 0.4079696394686907, "grad_norm": 4.637800216674805, "learning_rate": 0.00016629108215814525, "logits/chosen": -2.7890918254852295, "logits/rejected": -2.78002667427063, "logps/chosen": -50.07588577270508, "logps/rejected": -76.56361389160156, "loss": 0.2625, "rewards/accuracies": 0.875, "rewards/chosen": -2.4453320503234863, "rewards/margins": 2.9321773052215576, "rewards/rejected": -5.377509117126465, "step": 215 }, { "epoch": 0.4098671726755218, "grad_norm": 3.3369038105010986, "learning_rate": 0.00016573844861397444, "logits/chosen": -2.7674639225006104, "logits/rejected": -2.777357339859009, "logps/chosen": -47.326290130615234, "logps/rejected": -73.55824279785156, "loss": 0.5078, "rewards/accuracies": 0.875, "rewards/chosen": -2.330988883972168, "rewards/margins": 2.8556952476501465, "rewards/rejected": -5.186683654785156, "step": 216 }, { "epoch": 0.4117647058823529, "grad_norm": 3.6975343227386475, "learning_rate": 0.00016518225661473043, "logits/chosen": -2.7834789752960205, "logits/rejected": -2.77740216255188, "logps/chosen": -49.831703186035156, "logps/rejected": -76.83187866210938, "loss": 0.2911, "rewards/accuracies": 0.875, "rewards/chosen": -2.4276180267333984, "rewards/margins": 3.013000011444092, "rewards/rejected": -5.440618515014648, "step": 217 }, { "epoch": 0.41366223908918404, "grad_norm": 3.3639872074127197, "learning_rate": 0.00016462253626736413, "logits/chosen": -2.6748290061950684, "logits/rejected": -2.6851727962493896, "logps/chosen": -41.36797332763672, "logps/rejected": -60.205989837646484, "loss": 0.4055, "rewards/accuracies": 0.875, "rewards/chosen": -1.751816749572754, "rewards/margins": 2.143329381942749, "rewards/rejected": -3.895146131515503, "step": 218 }, { "epoch": 0.4155597722960152, "grad_norm": 3.045083522796631, "learning_rate": 0.00016405931786981755, "logits/chosen": -2.7627456188201904, "logits/rejected": -2.7599875926971436, "logps/chosen": -42.954856872558594, "logps/rejected": -59.64561462402344, "loss": 0.3513, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9114494323730469, "rewards/margins": 1.9816038608551025, "rewards/rejected": -3.8930535316467285, "step": 219 }, { "epoch": 0.4174573055028463, "grad_norm": 2.8113279342651367, "learning_rate": 0.000163492631909384, "logits/chosen": -2.745089530944824, "logits/rejected": -2.7422215938568115, "logps/chosen": -42.2723503112793, "logps/rejected": -57.04561233520508, "loss": 0.4329, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7828037738800049, "rewards/margins": 1.7417727708816528, "rewards/rejected": -3.5245766639709473, "step": 220 }, { "epoch": 0.41935483870967744, "grad_norm": 1.3215032815933228, "learning_rate": 0.0001629225090610577, "logits/chosen": -2.764284372329712, "logits/rejected": -2.7611119747161865, "logps/chosen": -36.366241455078125, "logps/rejected": -59.05260467529297, "loss": 0.1957, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2659788131713867, "rewards/margins": 2.423325777053833, "rewards/rejected": -3.6893045902252197, "step": 221 }, { "epoch": 0.42125237191650855, "grad_norm": 1.7855051755905151, "learning_rate": 0.00016234898018587337, "logits/chosen": -2.8013715744018555, "logits/rejected": -2.8014392852783203, "logps/chosen": -38.1886100769043, "logps/rejected": -57.5101318359375, "loss": 0.2574, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3807275295257568, "rewards/margins": 2.1531457901000977, "rewards/rejected": -3.5338735580444336, "step": 222 }, { "epoch": 0.42314990512333966, "grad_norm": 4.893378257751465, "learning_rate": 0.00016177207632923557, "logits/chosen": -2.793851137161255, "logits/rejected": -2.791360378265381, "logps/chosen": -44.361148834228516, "logps/rejected": -56.266658782958984, "loss": 0.3702, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9667861461639404, "rewards/margins": 1.407994270324707, "rewards/rejected": -3.3747801780700684, "step": 223 }, { "epoch": 0.4250474383301708, "grad_norm": 1.959364891052246, "learning_rate": 0.00016119182871923834, "logits/chosen": -2.7673821449279785, "logits/rejected": -2.7583274841308594, "logps/chosen": -47.93505859375, "logps/rejected": -65.74525451660156, "loss": 0.25, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4285130500793457, "rewards/margins": 1.9772919416427612, "rewards/rejected": -4.4058051109313965, "step": 224 }, { "epoch": 0.4269449715370019, "grad_norm": 2.6299164295196533, "learning_rate": 0.00016060826876497478, "logits/chosen": -2.7214760780334473, "logits/rejected": -2.7112197875976562, "logps/chosen": -42.125022888183594, "logps/rejected": -62.053749084472656, "loss": 0.2648, "rewards/accuracies": 0.875, "rewards/chosen": -1.8338714838027954, "rewards/margins": 2.1915903091430664, "rewards/rejected": -4.025461673736572, "step": 225 }, { "epoch": 0.428842504743833, "grad_norm": 3.1354124546051025, "learning_rate": 0.00016002142805483685, "logits/chosen": -2.7817063331604004, "logits/rejected": -2.7779695987701416, "logps/chosen": -46.67332077026367, "logps/rejected": -69.51974487304688, "loss": 0.3174, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3287980556488037, "rewards/margins": 2.3333349227905273, "rewards/rejected": -4.662132740020752, "step": 226 }, { "epoch": 0.4307400379506641, "grad_norm": 1.7215931415557861, "learning_rate": 0.00015943133835480535, "logits/chosen": -2.8229281902313232, "logits/rejected": -2.815373659133911, "logps/chosen": -40.25981140136719, "logps/rejected": -73.13153076171875, "loss": 0.1534, "rewards/accuracies": 1.0, "rewards/chosen": -1.6590008735656738, "rewards/margins": 3.475335121154785, "rewards/rejected": -5.134335994720459, "step": 227 }, { "epoch": 0.43263757115749524, "grad_norm": 4.5978593826293945, "learning_rate": 0.0001588380316067307, "logits/chosen": -2.824258804321289, "logits/rejected": -2.8195881843566895, "logps/chosen": -40.724361419677734, "logps/rejected": -67.01683044433594, "loss": 0.1866, "rewards/accuracies": 0.9375, "rewards/chosen": -1.497710108757019, "rewards/margins": 2.8423562049865723, "rewards/rejected": -4.340066432952881, "step": 228 }, { "epoch": 0.43453510436432635, "grad_norm": 6.522673606872559, "learning_rate": 0.0001582415399266036, "logits/chosen": -2.7951138019561768, "logits/rejected": -2.7876198291778564, "logps/chosen": -47.27073287963867, "logps/rejected": -76.75838470458984, "loss": 0.2995, "rewards/accuracies": 0.875, "rewards/chosen": -2.335322380065918, "rewards/margins": 3.229301691055298, "rewards/rejected": -5.564623832702637, "step": 229 }, { "epoch": 0.4364326375711575, "grad_norm": 4.254772186279297, "learning_rate": 0.00015764189560281677, "logits/chosen": -2.768986701965332, "logits/rejected": -2.7672719955444336, "logps/chosen": -58.088409423828125, "logps/rejected": -78.85195922851562, "loss": 0.555, "rewards/accuracies": 0.75, "rewards/chosen": -3.2533457279205322, "rewards/margins": 2.4660933017730713, "rewards/rejected": -5.7194390296936035, "step": 230 }, { "epoch": 0.43833017077798864, "grad_norm": 6.352909088134766, "learning_rate": 0.00015703913109441713, "logits/chosen": -2.79345965385437, "logits/rejected": -2.786287546157837, "logps/chosen": -49.76398468017578, "logps/rejected": -80.52045440673828, "loss": 0.1719, "rewards/accuracies": 0.9375, "rewards/chosen": -2.432440996170044, "rewards/margins": 3.3548390865325928, "rewards/rejected": -5.787280082702637, "step": 231 }, { "epoch": 0.44022770398481975, "grad_norm": 10.744542121887207, "learning_rate": 0.00015643327902934868, "logits/chosen": -2.75455379486084, "logits/rejected": -2.741607666015625, "logps/chosen": -58.280277252197266, "logps/rejected": -90.40587615966797, "loss": 0.4328, "rewards/accuracies": 0.875, "rewards/chosen": -3.340040683746338, "rewards/margins": 3.4735236167907715, "rewards/rejected": -6.813564300537109, "step": 232 }, { "epoch": 0.44212523719165087, "grad_norm": 5.020275115966797, "learning_rate": 0.00015582437220268647, "logits/chosen": -2.758476972579956, "logits/rejected": -2.7559456825256348, "logps/chosen": -61.09891128540039, "logps/rejected": -85.42538452148438, "loss": 0.4021, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7310962677001953, "rewards/margins": 2.584928512573242, "rewards/rejected": -6.3160247802734375, "step": 233 }, { "epoch": 0.444022770398482, "grad_norm": 3.41190767288208, "learning_rate": 0.00015521244357486133, "logits/chosen": -2.7153968811035156, "logits/rejected": -2.7004103660583496, "logps/chosen": -77.57876586914062, "logps/rejected": -97.70706939697266, "loss": 0.2604, "rewards/accuracies": 0.9375, "rewards/chosen": -5.392991065979004, "rewards/margins": 2.164095640182495, "rewards/rejected": -7.557086944580078, "step": 234 }, { "epoch": 0.4459203036053131, "grad_norm": 2.357773542404175, "learning_rate": 0.00015459752626987563, "logits/chosen": -2.8138904571533203, "logits/rejected": -2.8117315769195557, "logps/chosen": -70.28298950195312, "logps/rejected": -88.47047424316406, "loss": 0.4502, "rewards/accuracies": 0.8125, "rewards/chosen": -4.694599151611328, "rewards/margins": 1.9797905683517456, "rewards/rejected": -6.674389839172363, "step": 235 }, { "epoch": 0.4478178368121442, "grad_norm": 3.5848381519317627, "learning_rate": 0.00015397965357351033, "logits/chosen": -2.788076162338257, "logits/rejected": -2.7764651775360107, "logps/chosen": -63.50676727294922, "logps/rejected": -96.62000274658203, "loss": 0.2589, "rewards/accuracies": 0.875, "rewards/chosen": -3.9061508178710938, "rewards/margins": 3.6903600692749023, "rewards/rejected": -7.596510887145996, "step": 236 }, { "epoch": 0.4497153700189753, "grad_norm": 4.058860778808594, "learning_rate": 0.00015335885893152335, "logits/chosen": -2.765608549118042, "logits/rejected": -2.752915382385254, "logps/chosen": -59.92298126220703, "logps/rejected": -83.04705810546875, "loss": 0.3769, "rewards/accuracies": 0.875, "rewards/chosen": -3.5093774795532227, "rewards/margins": 2.6241321563720703, "rewards/rejected": -6.133509635925293, "step": 237 }, { "epoch": 0.45161290322580644, "grad_norm": 3.230358600616455, "learning_rate": 0.00015273517594783877, "logits/chosen": -2.7464921474456787, "logits/rejected": -2.73105788230896, "logps/chosen": -57.03343200683594, "logps/rejected": -75.98129272460938, "loss": 0.2463, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1314737796783447, "rewards/margins": 2.3742733001708984, "rewards/rejected": -5.505747318267822, "step": 238 }, { "epoch": 0.45351043643263755, "grad_norm": 3.2784218788146973, "learning_rate": 0.0001521086383827282, "logits/chosen": -2.815624237060547, "logits/rejected": -2.818958044052124, "logps/chosen": -48.82466506958008, "logps/rejected": -58.87793731689453, "loss": 0.4348, "rewards/accuracies": 0.875, "rewards/chosen": -2.4546396732330322, "rewards/margins": 1.227008581161499, "rewards/rejected": -3.6816484928131104, "step": 239 }, { "epoch": 0.45540796963946867, "grad_norm": 3.05546236038208, "learning_rate": 0.0001514792801509831, "logits/chosen": -2.8245487213134766, "logits/rejected": -2.8188109397888184, "logps/chosen": -51.791107177734375, "logps/rejected": -59.150779724121094, "loss": 0.46, "rewards/accuracies": 0.875, "rewards/chosen": -2.5664262771606445, "rewards/margins": 1.1049449443817139, "rewards/rejected": -3.6713712215423584, "step": 240 }, { "epoch": 0.4573055028462998, "grad_norm": 2.409597873687744, "learning_rate": 0.00015084713532007905, "logits/chosen": -2.7943313121795654, "logits/rejected": -2.7907536029815674, "logps/chosen": -43.191131591796875, "logps/rejected": -53.549072265625, "loss": 0.3468, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9051916599273682, "rewards/margins": 1.3212178945541382, "rewards/rejected": -3.226409912109375, "step": 241 }, { "epoch": 0.45920303605313095, "grad_norm": 1.9227346181869507, "learning_rate": 0.00015021223810833165, "logits/chosen": -2.87888503074646, "logits/rejected": -2.8758420944213867, "logps/chosen": -47.109779357910156, "logps/rejected": -61.53156661987305, "loss": 0.3432, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3327531814575195, "rewards/margins": 1.5949348211288452, "rewards/rejected": -3.9276881217956543, "step": 242 }, { "epoch": 0.46110056925996207, "grad_norm": 2.2493233680725098, "learning_rate": 0.0001495746228830442, "logits/chosen": -2.8305468559265137, "logits/rejected": -2.8325793743133545, "logps/chosen": -47.47154998779297, "logps/rejected": -62.330421447753906, "loss": 0.2987, "rewards/accuracies": 0.9375, "rewards/chosen": -2.40806245803833, "rewards/margins": 1.6747158765792847, "rewards/rejected": -4.082778453826904, "step": 243 }, { "epoch": 0.4629981024667932, "grad_norm": 2.9621243476867676, "learning_rate": 0.0001489343241586475, "logits/chosen": -2.873603343963623, "logits/rejected": -2.8682057857513428, "logps/chosen": -52.045467376708984, "logps/rejected": -62.28657150268555, "loss": 0.4042, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7810521125793457, "rewards/margins": 1.326963186264038, "rewards/rejected": -4.108015060424805, "step": 244 }, { "epoch": 0.4648956356736243, "grad_norm": 4.03017520904541, "learning_rate": 0.00014829137659483143, "logits/chosen": -2.836946964263916, "logits/rejected": -2.834871530532837, "logps/chosen": -56.07475662231445, "logps/rejected": -69.99559020996094, "loss": 0.4504, "rewards/accuracies": 0.75, "rewards/chosen": -3.2221789360046387, "rewards/margins": 1.5012485980987549, "rewards/rejected": -4.723427772521973, "step": 245 }, { "epoch": 0.4667931688804554, "grad_norm": 2.259509563446045, "learning_rate": 0.00014764581499466893, "logits/chosen": -2.8168697357177734, "logits/rejected": -2.8176605701446533, "logps/chosen": -60.62321853637695, "logps/rejected": -72.13325500488281, "loss": 0.4589, "rewards/accuracies": 0.625, "rewards/chosen": -3.7241644859313965, "rewards/margins": 1.2307034730911255, "rewards/rejected": -4.954867839813232, "step": 246 }, { "epoch": 0.4686907020872865, "grad_norm": 1.4829596281051636, "learning_rate": 0.000146997674302732, "logits/chosen": -2.8266377449035645, "logits/rejected": -2.8239901065826416, "logps/chosen": -53.81561279296875, "logps/rejected": -81.11199188232422, "loss": 0.1423, "rewards/accuracies": 1.0, "rewards/chosen": -2.9962782859802246, "rewards/margins": 2.8328514099121094, "rewards/rejected": -5.829129695892334, "step": 247 }, { "epoch": 0.47058823529411764, "grad_norm": 3.550029993057251, "learning_rate": 0.00014634698960320016, "logits/chosen": -2.8737902641296387, "logits/rejected": -2.8744564056396484, "logps/chosen": -70.34271240234375, "logps/rejected": -85.00938415527344, "loss": 0.4912, "rewards/accuracies": 0.75, "rewards/chosen": -4.698043346405029, "rewards/margins": 1.4900033473968506, "rewards/rejected": -6.188046455383301, "step": 248 }, { "epoch": 0.47248576850094876, "grad_norm": 2.4781672954559326, "learning_rate": 0.00014569379611796137, "logits/chosen": -2.831427812576294, "logits/rejected": -2.830643892288208, "logps/chosen": -46.51897430419922, "logps/rejected": -78.5478286743164, "loss": 0.1867, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3443069458007812, "rewards/margins": 3.368368148803711, "rewards/rejected": -5.712675094604492, "step": 249 }, { "epoch": 0.47438330170777987, "grad_norm": 3.924811601638794, "learning_rate": 0.00014503812920470534, "logits/chosen": -2.848410129547119, "logits/rejected": -2.8469910621643066, "logps/chosen": -51.54380798339844, "logps/rejected": -79.81298828125, "loss": 0.3428, "rewards/accuracies": 0.9375, "rewards/chosen": -2.817504405975342, "rewards/margins": 3.0145010948181152, "rewards/rejected": -5.832005500793457, "step": 250 }, { "epoch": 0.476280834914611, "grad_norm": 3.6853103637695312, "learning_rate": 0.00014438002435500979, "logits/chosen": -2.8478622436523438, "logits/rejected": -2.8459577560424805, "logps/chosen": -49.33186340332031, "logps/rejected": -90.88196563720703, "loss": 0.23, "rewards/accuracies": 0.875, "rewards/chosen": -2.5675415992736816, "rewards/margins": 4.337457656860352, "rewards/rejected": -6.904999732971191, "step": 251 }, { "epoch": 0.4781783681214421, "grad_norm": 3.788881301879883, "learning_rate": 0.00014371951719241904, "logits/chosen": -2.839564085006714, "logits/rejected": -2.825866222381592, "logps/chosen": -52.40620422363281, "logps/rejected": -89.26689910888672, "loss": 0.2075, "rewards/accuracies": 0.875, "rewards/chosen": -2.6375033855438232, "rewards/margins": 4.117871284484863, "rewards/rejected": -6.755374908447266, "step": 252 }, { "epoch": 0.48007590132827327, "grad_norm": 3.9719743728637695, "learning_rate": 0.00014305664347051585, "logits/chosen": -2.8288607597351074, "logits/rejected": -2.824831008911133, "logps/chosen": -58.445960998535156, "logps/rejected": -84.42477416992188, "loss": 0.3711, "rewards/accuracies": 0.75, "rewards/chosen": -3.4238996505737305, "rewards/margins": 2.746028423309326, "rewards/rejected": -6.169928073883057, "step": 253 }, { "epoch": 0.4819734345351044, "grad_norm": 3.1389803886413574, "learning_rate": 0.0001423914390709861, "logits/chosen": -2.8727009296417236, "logits/rejected": -2.8723371028900146, "logps/chosen": -49.1451416015625, "logps/rejected": -77.82601928710938, "loss": 0.36, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6173038482666016, "rewards/margins": 2.9812803268432617, "rewards/rejected": -5.598584175109863, "step": 254 }, { "epoch": 0.4838709677419355, "grad_norm": 4.244487762451172, "learning_rate": 0.00014172394000167623, "logits/chosen": -2.8132166862487793, "logits/rejected": -2.804203748703003, "logps/chosen": -59.2404899597168, "logps/rejected": -68.9154281616211, "loss": 0.5202, "rewards/accuracies": 0.875, "rewards/chosen": -3.4824461936950684, "rewards/margins": 1.2314536571502686, "rewards/rejected": -4.713899612426758, "step": 255 }, { "epoch": 0.4857685009487666, "grad_norm": 1.7867659330368042, "learning_rate": 0.00014105418239464452, "logits/chosen": -2.8577332496643066, "logits/rejected": -2.8458566665649414, "logps/chosen": -55.45354080200195, "logps/rejected": -80.99148559570312, "loss": 0.2365, "rewards/accuracies": 0.875, "rewards/chosen": -3.028547763824463, "rewards/margins": 2.8536529541015625, "rewards/rejected": -5.882200717926025, "step": 256 }, { "epoch": 0.4876660341555977, "grad_norm": 2.2104556560516357, "learning_rate": 0.00014038220250420485, "logits/chosen": -2.829854726791382, "logits/rejected": -2.8256146907806396, "logps/chosen": -60.85689926147461, "logps/rejected": -90.96002197265625, "loss": 0.2461, "rewards/accuracies": 0.9375, "rewards/chosen": -3.654787063598633, "rewards/margins": 3.1404576301574707, "rewards/rejected": -6.7952446937561035, "step": 257 }, { "epoch": 0.48956356736242884, "grad_norm": 3.49383807182312, "learning_rate": 0.00013970803670496453, "logits/chosen": -2.859201192855835, "logits/rejected": -2.8544297218322754, "logps/chosen": -58.228614807128906, "logps/rejected": -79.75567626953125, "loss": 0.4078, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4922361373901367, "rewards/margins": 2.3522963523864746, "rewards/rejected": -5.8445329666137695, "step": 258 }, { "epoch": 0.49146110056925996, "grad_norm": 1.2643572092056274, "learning_rate": 0.0001390317214898551, "logits/chosen": -2.8534157276153564, "logits/rejected": -2.8505430221557617, "logps/chosen": -58.288631439208984, "logps/rejected": -83.08782196044922, "loss": 0.144, "rewards/accuracies": 1.0, "rewards/chosen": -3.3658194541931152, "rewards/margins": 2.705644130706787, "rewards/rejected": -6.071463584899902, "step": 259 }, { "epoch": 0.49335863377609107, "grad_norm": 2.9561476707458496, "learning_rate": 0.00013835329346815716, "logits/chosen": -2.8778738975524902, "logits/rejected": -2.8710873126983643, "logps/chosen": -62.86555480957031, "logps/rejected": -96.86504364013672, "loss": 0.2063, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8083794116973877, "rewards/margins": 3.612440586090088, "rewards/rejected": -7.420820236206055, "step": 260 }, { "epoch": 0.4952561669829222, "grad_norm": 4.405760288238525, "learning_rate": 0.00013767278936351854, "logits/chosen": -2.8736281394958496, "logits/rejected": -2.8688836097717285, "logps/chosen": -61.89301300048828, "logps/rejected": -96.45506286621094, "loss": 0.3393, "rewards/accuracies": 0.875, "rewards/chosen": -3.816981077194214, "rewards/margins": 3.6093504428863525, "rewards/rejected": -7.426331520080566, "step": 261 }, { "epoch": 0.4971537001897533, "grad_norm": 2.922870397567749, "learning_rate": 0.00013699024601196641, "logits/chosen": -2.8758511543273926, "logits/rejected": -2.869248151779175, "logps/chosen": -59.3803825378418, "logps/rejected": -99.51313018798828, "loss": 0.267, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5901854038238525, "rewards/margins": 4.170801639556885, "rewards/rejected": -7.760987281799316, "step": 262 }, { "epoch": 0.4990512333965844, "grad_norm": 7.673828601837158, "learning_rate": 0.0001363057003599135, "logits/chosen": -2.869354724884033, "logits/rejected": -2.8672537803649902, "logps/chosen": -63.243125915527344, "logps/rejected": -94.80345153808594, "loss": 0.4937, "rewards/accuracies": 0.8125, "rewards/chosen": -4.047074317932129, "rewards/margins": 3.288224220275879, "rewards/rejected": -7.335298538208008, "step": 263 }, { "epoch": 0.5009487666034156, "grad_norm": 4.203124046325684, "learning_rate": 0.00013561918946215806, "logits/chosen": -2.8657474517822266, "logits/rejected": -2.859548807144165, "logps/chosen": -67.86384582519531, "logps/rejected": -100.13394927978516, "loss": 0.4981, "rewards/accuracies": 0.875, "rewards/chosen": -4.426224231719971, "rewards/margins": 3.4542787075042725, "rewards/rejected": -7.880502700805664, "step": 264 }, { "epoch": 0.5028462998102466, "grad_norm": 6.988152027130127, "learning_rate": 0.000134930750479878, "logits/chosen": -2.8737897872924805, "logits/rejected": -2.869394540786743, "logps/chosen": -65.10237121582031, "logps/rejected": -95.148681640625, "loss": 0.3511, "rewards/accuracies": 0.8125, "rewards/chosen": -4.182986259460449, "rewards/margins": 3.097425937652588, "rewards/rejected": -7.280411720275879, "step": 265 }, { "epoch": 0.5047438330170778, "grad_norm": 5.459814071655273, "learning_rate": 0.00013424042067861945, "logits/chosen": -2.8713884353637695, "logits/rejected": -2.86498761177063, "logps/chosen": -72.272216796875, "logps/rejected": -97.40777587890625, "loss": 0.7328, "rewards/accuracies": 0.75, "rewards/chosen": -4.793173313140869, "rewards/margins": 2.8042550086975098, "rewards/rejected": -7.597428798675537, "step": 266 }, { "epoch": 0.5066413662239089, "grad_norm": 2.6633589267730713, "learning_rate": 0.0001335482374262795, "logits/chosen": -2.8936328887939453, "logits/rejected": -2.8898427486419678, "logps/chosen": -59.96571731567383, "logps/rejected": -97.0484619140625, "loss": 0.2084, "rewards/accuracies": 0.9375, "rewards/chosen": -3.6741292476654053, "rewards/margins": 3.8013479709625244, "rewards/rejected": -7.47547721862793, "step": 267 }, { "epoch": 0.50853889943074, "grad_norm": 4.516148090362549, "learning_rate": 0.0001328542381910835, "logits/chosen": -2.8802101612091064, "logits/rejected": -2.8792312145233154, "logps/chosen": -67.06829071044922, "logps/rejected": -91.89190673828125, "loss": 0.4533, "rewards/accuracies": 0.875, "rewards/chosen": -4.338167667388916, "rewards/margins": 2.6398956775665283, "rewards/rejected": -6.978063583374023, "step": 268 }, { "epoch": 0.5104364326375711, "grad_norm": 4.260200023651123, "learning_rate": 0.00013215846053955683, "logits/chosen": -2.871657371520996, "logits/rejected": -2.8631982803344727, "logps/chosen": -58.0806770324707, "logps/rejected": -93.51595306396484, "loss": 0.2301, "rewards/accuracies": 0.9375, "rewards/chosen": -3.373506784439087, "rewards/margins": 3.758054256439209, "rewards/rejected": -7.131560802459717, "step": 269 }, { "epoch": 0.5123339658444023, "grad_norm": 8.104193687438965, "learning_rate": 0.00013146094213449148, "logits/chosen": -2.834609270095825, "logits/rejected": -2.835766077041626, "logps/chosen": -66.10044860839844, "logps/rejected": -86.46508026123047, "loss": 0.7338, "rewards/accuracies": 0.75, "rewards/chosen": -4.160252571105957, "rewards/margins": 2.152247667312622, "rewards/rejected": -6.312500476837158, "step": 270 }, { "epoch": 0.5142314990512334, "grad_norm": 2.269233465194702, "learning_rate": 0.00013076172073290724, "logits/chosen": -2.8664426803588867, "logits/rejected": -2.866476058959961, "logps/chosen": -56.94659423828125, "logps/rejected": -77.3311996459961, "loss": 0.6377, "rewards/accuracies": 0.8125, "rewards/chosen": -3.369450569152832, "rewards/margins": 2.010820150375366, "rewards/rejected": -5.380270957946777, "step": 271 }, { "epoch": 0.5161290322580645, "grad_norm": 3.103928804397583, "learning_rate": 0.000130060834184008, "logits/chosen": -2.8317670822143555, "logits/rejected": -2.8320889472961426, "logps/chosen": -62.027313232421875, "logps/rejected": -74.0067367553711, "loss": 0.6377, "rewards/accuracies": 0.8125, "rewards/chosen": -3.687196731567383, "rewards/margins": 1.4686436653137207, "rewards/rejected": -5.1558403968811035, "step": 272 }, { "epoch": 0.5180265654648957, "grad_norm": 1.593082308769226, "learning_rate": 0.00012935832042713287, "logits/chosen": -2.828850269317627, "logits/rejected": -2.8194522857666016, "logps/chosen": -53.06214904785156, "logps/rejected": -78.66574096679688, "loss": 0.1704, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8542308807373047, "rewards/margins": 2.848317861557007, "rewards/rejected": -5.702548980712891, "step": 273 }, { "epoch": 0.5199240986717267, "grad_norm": 1.320363998413086, "learning_rate": 0.00012865421748970256, "logits/chosen": -2.8424181938171387, "logits/rejected": -2.835724353790283, "logps/chosen": -55.70930480957031, "logps/rejected": -81.26738739013672, "loss": 0.1867, "rewards/accuracies": 0.875, "rewards/chosen": -3.0446360111236572, "rewards/margins": 2.9781501293182373, "rewards/rejected": -6.0227861404418945, "step": 274 }, { "epoch": 0.5218216318785579, "grad_norm": 1.8908852338790894, "learning_rate": 0.00012794856348516095, "logits/chosen": -2.847842216491699, "logits/rejected": -2.841383457183838, "logps/chosen": -54.49677276611328, "logps/rejected": -80.69400024414062, "loss": 0.2009, "rewards/accuracies": 0.9375, "rewards/chosen": -3.114948034286499, "rewards/margins": 2.7647268772125244, "rewards/rejected": -5.879674911499023, "step": 275 }, { "epoch": 0.523719165085389, "grad_norm": 2.000441312789917, "learning_rate": 0.0001272413966109119, "logits/chosen": -2.854240894317627, "logits/rejected": -2.8539373874664307, "logps/chosen": -57.158050537109375, "logps/rejected": -77.82567596435547, "loss": 0.387, "rewards/accuracies": 0.75, "rewards/chosen": -3.4087672233581543, "rewards/margins": 2.2918338775634766, "rewards/rejected": -5.700601100921631, "step": 276 }, { "epoch": 0.5256166982922201, "grad_norm": 0.9269229769706726, "learning_rate": 0.00012653275514625166, "logits/chosen": -2.8237087726593018, "logits/rejected": -2.8187015056610107, "logps/chosen": -52.4594841003418, "logps/rejected": -81.11265563964844, "loss": 0.1577, "rewards/accuracies": 0.875, "rewards/chosen": -2.902538299560547, "rewards/margins": 3.004708766937256, "rewards/rejected": -5.907247543334961, "step": 277 }, { "epoch": 0.5275142314990512, "grad_norm": 4.269275188446045, "learning_rate": 0.00012582267745029686, "logits/chosen": -2.83097505569458, "logits/rejected": -2.824533700942993, "logps/chosen": -59.98640823364258, "logps/rejected": -74.92745971679688, "loss": 0.5263, "rewards/accuracies": 0.75, "rewards/chosen": -3.483750104904175, "rewards/margins": 1.778311014175415, "rewards/rejected": -5.26206111907959, "step": 278 }, { "epoch": 0.5294117647058824, "grad_norm": 2.934359073638916, "learning_rate": 0.000125111201959908, "logits/chosen": -2.8329038619995117, "logits/rejected": -2.829845666885376, "logps/chosen": -60.867156982421875, "logps/rejected": -89.72724914550781, "loss": 0.1871, "rewards/accuracies": 0.9375, "rewards/chosen": -3.616037368774414, "rewards/margins": 3.0759949684143066, "rewards/rejected": -6.692032337188721, "step": 279 }, { "epoch": 0.5313092979127134, "grad_norm": 1.4452064037322998, "learning_rate": 0.00012439836718760886, "logits/chosen": -2.8472280502319336, "logits/rejected": -2.8384790420532227, "logps/chosen": -56.88945007324219, "logps/rejected": -86.16199493408203, "loss": 0.223, "rewards/accuracies": 0.9375, "rewards/chosen": -3.25555682182312, "rewards/margins": 3.1891889572143555, "rewards/rejected": -6.444746017456055, "step": 280 }, { "epoch": 0.5332068311195446, "grad_norm": 2.5307774543762207, "learning_rate": 0.00012368421171950192, "logits/chosen": -2.8560056686401367, "logits/rejected": -2.8471438884735107, "logps/chosen": -55.13880157470703, "logps/rejected": -85.99432373046875, "loss": 0.1456, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0400350093841553, "rewards/margins": 3.3702197074890137, "rewards/rejected": -6.410254955291748, "step": 281 }, { "epoch": 0.5351043643263758, "grad_norm": 9.487899780273438, "learning_rate": 0.0001229687742131796, "logits/chosen": -2.865816116333008, "logits/rejected": -2.8594892024993896, "logps/chosen": -50.71941375732422, "logps/rejected": -81.01962280273438, "loss": 0.2773, "rewards/accuracies": 0.8125, "rewards/chosen": -2.763157367706299, "rewards/margins": 3.12109375, "rewards/rejected": -5.884251594543457, "step": 282 }, { "epoch": 0.5370018975332068, "grad_norm": 7.071329116821289, "learning_rate": 0.00012225209339563145, "logits/chosen": -2.8695383071899414, "logits/rejected": -2.862952470779419, "logps/chosen": -46.901954650878906, "logps/rejected": -78.72514343261719, "loss": 0.3699, "rewards/accuracies": 0.9375, "rewards/chosen": -2.247375965118408, "rewards/margins": 3.466681480407715, "rewards/rejected": -5.714057445526123, "step": 283 }, { "epoch": 0.538899430740038, "grad_norm": 1.0974787473678589, "learning_rate": 0.0001215342080611484, "logits/chosen": -2.8294849395751953, "logits/rejected": -2.818751096725464, "logps/chosen": -42.89143371582031, "logps/rejected": -80.403564453125, "loss": 0.1185, "rewards/accuracies": 1.0, "rewards/chosen": -1.8664650917053223, "rewards/margins": 4.029164791107178, "rewards/rejected": -5.8956298828125, "step": 284 }, { "epoch": 0.540796963946869, "grad_norm": 1.9439111948013306, "learning_rate": 0.00012081515706922227, "logits/chosen": -2.886253833770752, "logits/rejected": -2.8804054260253906, "logps/chosen": -33.328758239746094, "logps/rejected": -74.43219757080078, "loss": 0.1276, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8928669691085815, "rewards/margins": 4.253573417663574, "rewards/rejected": -5.146440029144287, "step": 285 }, { "epoch": 0.5426944971537002, "grad_norm": 5.50389289855957, "learning_rate": 0.00012009497934244256, "logits/chosen": -2.860034704208374, "logits/rejected": -2.8562252521514893, "logps/chosen": -44.1033821105957, "logps/rejected": -78.41339111328125, "loss": 0.4644, "rewards/accuracies": 0.875, "rewards/chosen": -2.0543887615203857, "rewards/margins": 3.5441386699676514, "rewards/rejected": -5.598527908325195, "step": 286 }, { "epoch": 0.5445920303605313, "grad_norm": 9.102168083190918, "learning_rate": 0.00011937371386438954, "logits/chosen": -2.8378746509552, "logits/rejected": -2.8391685485839844, "logps/chosen": -43.23701477050781, "logps/rejected": -75.10458374023438, "loss": 0.3444, "rewards/accuracies": 0.8125, "rewards/chosen": -2.033146858215332, "rewards/margins": 3.336325168609619, "rewards/rejected": -5.369471549987793, "step": 287 }, { "epoch": 0.5464895635673624, "grad_norm": 1.0387825965881348, "learning_rate": 0.0001186513996775239, "logits/chosen": -2.8853936195373535, "logits/rejected": -2.880082607269287, "logps/chosen": -28.779258728027344, "logps/rejected": -83.81454467773438, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": -0.3752191662788391, "rewards/margins": 5.667832851409912, "rewards/rejected": -6.0430521965026855, "step": 288 }, { "epoch": 0.5483870967741935, "grad_norm": 2.832765579223633, "learning_rate": 0.00011792807588107357, "logits/chosen": -2.773785352706909, "logits/rejected": -2.771526575088501, "logps/chosen": -44.170867919921875, "logps/rejected": -81.07795715332031, "loss": 0.2608, "rewards/accuracies": 0.8125, "rewards/chosen": -2.024778366088867, "rewards/margins": 3.906771183013916, "rewards/rejected": -5.931549072265625, "step": 289 }, { "epoch": 0.5502846299810247, "grad_norm": 2.276353597640991, "learning_rate": 0.00011720378162891708, "logits/chosen": -2.895059823989868, "logits/rejected": -2.89176869392395, "logps/chosen": -38.851749420166016, "logps/rejected": -79.41373443603516, "loss": 0.2234, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7230916023254395, "rewards/margins": 4.087526798248291, "rewards/rejected": -5.8106184005737305, "step": 290 }, { "epoch": 0.5521821631878557, "grad_norm": 1.7804261445999146, "learning_rate": 0.00011647855612746423, "logits/chosen": -2.876720428466797, "logits/rejected": -2.872744083404541, "logps/chosen": -49.478515625, "logps/rejected": -79.2706069946289, "loss": 0.234, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6331980228424072, "rewards/margins": 3.1652159690856934, "rewards/rejected": -5.79841423034668, "step": 291 }, { "epoch": 0.5540796963946869, "grad_norm": 1.9441782236099243, "learning_rate": 0.00011575243863353382, "logits/chosen": -2.8950231075286865, "logits/rejected": -2.8902437686920166, "logps/chosen": -43.75312042236328, "logps/rejected": -80.70056915283203, "loss": 0.0955, "rewards/accuracies": 1.0, "rewards/chosen": -2.0715866088867188, "rewards/margins": 3.8401412963867188, "rewards/rejected": -5.9117279052734375, "step": 292 }, { "epoch": 0.5559772296015181, "grad_norm": 1.1018978357315063, "learning_rate": 0.00011502546845222859, "logits/chosen": -2.8955650329589844, "logits/rejected": -2.888160467147827, "logps/chosen": -43.491554260253906, "logps/rejected": -88.80857849121094, "loss": 0.1789, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9829752445220947, "rewards/margins": 4.644049644470215, "rewards/rejected": -6.6270246505737305, "step": 293 }, { "epoch": 0.5578747628083491, "grad_norm": 7.49698543548584, "learning_rate": 0.0001142976849348078, "logits/chosen": -2.8840601444244385, "logits/rejected": -2.8859639167785645, "logps/chosen": -54.523216247558594, "logps/rejected": -71.47078704833984, "loss": 1.1101, "rewards/accuracies": 0.625, "rewards/chosen": -3.054238796234131, "rewards/margins": 1.8495543003082275, "rewards/rejected": -4.9037933349609375, "step": 294 }, { "epoch": 0.5597722960151803, "grad_norm": 2.938047409057617, "learning_rate": 0.00011356912747655685, "logits/chosen": -2.8811421394348145, "logits/rejected": -2.8825528621673584, "logps/chosen": -46.03756332397461, "logps/rejected": -70.37115478515625, "loss": 0.2148, "rewards/accuracies": 0.875, "rewards/chosen": -2.1824710369110107, "rewards/margins": 2.7723135948181152, "rewards/rejected": -4.954784393310547, "step": 295 }, { "epoch": 0.5616698292220114, "grad_norm": 4.470379829406738, "learning_rate": 0.00011283983551465511, "logits/chosen": -2.863375425338745, "logits/rejected": -2.8606436252593994, "logps/chosen": -39.22755432128906, "logps/rejected": -77.66708374023438, "loss": 0.1821, "rewards/accuracies": 0.875, "rewards/chosen": -1.5369148254394531, "rewards/margins": 4.046816349029541, "rewards/rejected": -5.583731174468994, "step": 296 }, { "epoch": 0.5635673624288425, "grad_norm": 7.931248664855957, "learning_rate": 0.00011210984852604083, "logits/chosen": -2.8965516090393066, "logits/rejected": -2.893240213394165, "logps/chosen": -50.47080993652344, "logps/rejected": -87.6466064453125, "loss": 0.4181, "rewards/accuracies": 0.8125, "rewards/chosen": -2.552130699157715, "rewards/margins": 3.9827961921691895, "rewards/rejected": -6.5349273681640625, "step": 297 }, { "epoch": 0.5654648956356736, "grad_norm": 4.881028652191162, "learning_rate": 0.00011137920602527447, "logits/chosen": -2.916684150695801, "logits/rejected": -2.910217046737671, "logps/chosen": -49.143104553222656, "logps/rejected": -82.70768737792969, "loss": 0.3173, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5159926414489746, "rewards/margins": 3.645129442214966, "rewards/rejected": -6.1611223220825195, "step": 298 }, { "epoch": 0.5673624288425048, "grad_norm": 2.043503522872925, "learning_rate": 0.00011064794756239977, "logits/chosen": -2.878192901611328, "logits/rejected": -2.8828775882720947, "logps/chosen": -53.674171447753906, "logps/rejected": -79.92569732666016, "loss": 0.5443, "rewards/accuracies": 0.8125, "rewards/chosen": -3.003199577331543, "rewards/margins": 2.7453689575195312, "rewards/rejected": -5.748568534851074, "step": 299 }, { "epoch": 0.5692599620493358, "grad_norm": 2.94694447517395, "learning_rate": 0.00010991611272080269, "logits/chosen": -2.9256865978240967, "logits/rejected": -2.925057888031006, "logps/chosen": -59.540016174316406, "logps/rejected": -82.14313507080078, "loss": 0.2938, "rewards/accuracies": 0.875, "rewards/chosen": -3.6033997535705566, "rewards/margins": 2.2802443504333496, "rewards/rejected": -5.883644104003906, "step": 300 }, { "epoch": 0.571157495256167, "grad_norm": 1.191422700881958, "learning_rate": 0.00010918374111506893, "logits/chosen": -2.925339937210083, "logits/rejected": -2.9227840900421143, "logps/chosen": -46.54875946044922, "logps/rejected": -78.50569915771484, "loss": 0.1321, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1884849071502686, "rewards/margins": 3.4915459156036377, "rewards/rejected": -5.680030822753906, "step": 301 }, { "epoch": 0.573055028462998, "grad_norm": 4.066537857055664, "learning_rate": 0.00010845087238883944, "logits/chosen": -2.938689947128296, "logits/rejected": -2.937714099884033, "logps/chosen": -57.65900802612305, "logps/rejected": -82.79156494140625, "loss": 0.3726, "rewards/accuracies": 0.8125, "rewards/chosen": -3.370972156524658, "rewards/margins": 2.620013475418091, "rewards/rejected": -5.990985870361328, "step": 302 }, { "epoch": 0.5749525616698292, "grad_norm": 3.895630359649658, "learning_rate": 0.00010771754621266466, "logits/chosen": -2.9182395935058594, "logits/rejected": -2.9164552688598633, "logps/chosen": -66.43140411376953, "logps/rejected": -84.01023864746094, "loss": 0.3176, "rewards/accuracies": 0.8125, "rewards/chosen": -4.403564453125, "rewards/margins": 1.890740156173706, "rewards/rejected": -6.294304370880127, "step": 303 }, { "epoch": 0.5768500948766604, "grad_norm": 2.5287392139434814, "learning_rate": 0.00010698380228185685, "logits/chosen": -2.902897596359253, "logits/rejected": -2.902431011199951, "logps/chosen": -66.64987182617188, "logps/rejected": -85.07323455810547, "loss": 0.3927, "rewards/accuracies": 0.8125, "rewards/chosen": -4.36166524887085, "rewards/margins": 2.0132718086242676, "rewards/rejected": -6.374937534332275, "step": 304 }, { "epoch": 0.5787476280834914, "grad_norm": 2.4261245727539062, "learning_rate": 0.00010624968031434173, "logits/chosen": -2.9023351669311523, "logits/rejected": -2.9055070877075195, "logps/chosen": -58.39203643798828, "logps/rejected": -87.78874206542969, "loss": 0.3885, "rewards/accuracies": 0.875, "rewards/chosen": -3.485225200653076, "rewards/margins": 2.986713409423828, "rewards/rejected": -6.471938610076904, "step": 305 }, { "epoch": 0.5806451612903226, "grad_norm": 2.699694871902466, "learning_rate": 0.0001055152200485082, "logits/chosen": -2.8693368434906006, "logits/rejected": -2.8701136112213135, "logps/chosen": -61.098785400390625, "logps/rejected": -79.8037109375, "loss": 0.4626, "rewards/accuracies": 0.6875, "rewards/chosen": -3.692890167236328, "rewards/margins": 2.097386360168457, "rewards/rejected": -5.790276527404785, "step": 306 }, { "epoch": 0.5825426944971537, "grad_norm": 1.7026814222335815, "learning_rate": 0.00010478046124105744, "logits/chosen": -2.896878719329834, "logits/rejected": -2.8954782485961914, "logps/chosen": -60.92935562133789, "logps/rejected": -87.54103088378906, "loss": 0.2256, "rewards/accuracies": 0.9375, "rewards/chosen": -3.614488124847412, "rewards/margins": 2.906501531600952, "rewards/rejected": -6.520989894866943, "step": 307 }, { "epoch": 0.5844402277039848, "grad_norm": 4.583454608917236, "learning_rate": 0.00010404544366485094, "logits/chosen": -2.91058611869812, "logits/rejected": -2.9112749099731445, "logps/chosen": -62.88933181762695, "logps/rejected": -79.40133666992188, "loss": 0.58, "rewards/accuracies": 0.75, "rewards/chosen": -3.9962563514709473, "rewards/margins": 1.7496761083602905, "rewards/rejected": -5.745932579040527, "step": 308 }, { "epoch": 0.5863377609108159, "grad_norm": 1.0582410097122192, "learning_rate": 0.00010331020710675729, "logits/chosen": -2.9183032512664795, "logits/rejected": -2.919496536254883, "logps/chosen": -60.99847412109375, "logps/rejected": -79.97003936767578, "loss": 0.2519, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8266940116882324, "rewards/margins": 2.1178812980651855, "rewards/rejected": -5.944575309753418, "step": 309 }, { "epoch": 0.5882352941176471, "grad_norm": 2.27483868598938, "learning_rate": 0.00010257479136549889, "logits/chosen": -2.925682544708252, "logits/rejected": -2.9215149879455566, "logps/chosen": -62.194374084472656, "logps/rejected": -80.60386657714844, "loss": 0.3425, "rewards/accuracies": 0.8125, "rewards/chosen": -3.68037748336792, "rewards/margins": 2.081040143966675, "rewards/rejected": -5.761418342590332, "step": 310 }, { "epoch": 0.5901328273244781, "grad_norm": 1.3051766157150269, "learning_rate": 0.0001018392362494972, "logits/chosen": -2.9560303688049316, "logits/rejected": -2.949531316757202, "logps/chosen": -57.7611083984375, "logps/rejected": -78.2846450805664, "loss": 0.251, "rewards/accuracies": 0.875, "rewards/chosen": -3.4193286895751953, "rewards/margins": 2.350963592529297, "rewards/rejected": -5.77029275894165, "step": 311 }, { "epoch": 0.5920303605313093, "grad_norm": 2.4817426204681396, "learning_rate": 0.00010110358157471824, "logits/chosen": -2.9510304927825928, "logits/rejected": -2.953981876373291, "logps/chosen": -65.96543884277344, "logps/rejected": -83.39318084716797, "loss": 0.4345, "rewards/accuracies": 0.8125, "rewards/chosen": -4.075649261474609, "rewards/margins": 2.000249147415161, "rewards/rejected": -6.075898170471191, "step": 312 }, { "epoch": 0.5939278937381404, "grad_norm": 1.3821583986282349, "learning_rate": 0.0001003678671625172, "logits/chosen": -2.952937602996826, "logits/rejected": -2.950251817703247, "logps/chosen": -61.165550231933594, "logps/rejected": -85.19444274902344, "loss": 0.2662, "rewards/accuracies": 0.875, "rewards/chosen": -3.7030019760131836, "rewards/margins": 2.63686466217041, "rewards/rejected": -6.339866638183594, "step": 313 }, { "epoch": 0.5958254269449715, "grad_norm": 1.6280112266540527, "learning_rate": 9.963213283748282e-05, "logits/chosen": -2.933931589126587, "logits/rejected": -2.926939010620117, "logps/chosen": -61.775917053222656, "logps/rejected": -81.53590393066406, "loss": 0.3233, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7116825580596924, "rewards/margins": 2.275146484375, "rewards/rejected": -5.986828804016113, "step": 314 }, { "epoch": 0.5977229601518027, "grad_norm": 1.9062341451644897, "learning_rate": 9.889641842528178e-05, "logits/chosen": -2.9100828170776367, "logits/rejected": -2.9047842025756836, "logps/chosen": -61.198387145996094, "logps/rejected": -81.3210220336914, "loss": 0.3868, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6690847873687744, "rewards/margins": 2.366109848022461, "rewards/rejected": -6.0351948738098145, "step": 315 }, { "epoch": 0.5996204933586338, "grad_norm": 1.1220036745071411, "learning_rate": 9.816076375050283e-05, "logits/chosen": -2.9533026218414307, "logits/rejected": -2.949734926223755, "logps/chosen": -53.777095794677734, "logps/rejected": -82.09625244140625, "loss": 0.1049, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7948765754699707, "rewards/margins": 3.177762508392334, "rewards/rejected": -5.972639083862305, "step": 316 }, { "epoch": 0.6015180265654649, "grad_norm": 1.763089895248413, "learning_rate": 9.742520863450115e-05, "logits/chosen": -2.9159345626831055, "logits/rejected": -2.912830114364624, "logps/chosen": -56.66600799560547, "logps/rejected": -82.20394897460938, "loss": 0.2744, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2924387454986572, "rewards/margins": 2.709559917449951, "rewards/rejected": -6.0019989013671875, "step": 317 }, { "epoch": 0.603415559772296, "grad_norm": 1.6462388038635254, "learning_rate": 9.668979289324273e-05, "logits/chosen": -2.9503092765808105, "logits/rejected": -2.947619676589966, "logps/chosen": -52.42357635498047, "logps/rejected": -84.0963134765625, "loss": 0.2405, "rewards/accuracies": 0.8125, "rewards/chosen": -2.807110548019409, "rewards/margins": 3.3853917121887207, "rewards/rejected": -6.192502021789551, "step": 318 }, { "epoch": 0.6053130929791272, "grad_norm": 1.4591569900512695, "learning_rate": 9.595455633514909e-05, "logits/chosen": -2.9135751724243164, "logits/rejected": -2.9162490367889404, "logps/chosen": -51.95201110839844, "logps/rejected": -83.46641540527344, "loss": 0.1972, "rewards/accuracies": 0.875, "rewards/chosen": -2.7722127437591553, "rewards/margins": 3.348604679107666, "rewards/rejected": -6.120817184448242, "step": 319 }, { "epoch": 0.6072106261859582, "grad_norm": 2.3641560077667236, "learning_rate": 9.521953875894257e-05, "logits/chosen": -2.9282002449035645, "logits/rejected": -2.926896095275879, "logps/chosen": -46.65143585205078, "logps/rejected": -81.9978256225586, "loss": 0.1471, "rewards/accuracies": 0.9375, "rewards/chosen": -2.24959397315979, "rewards/margins": 3.780817985534668, "rewards/rejected": -6.030411720275879, "step": 320 }, { "epoch": 0.6091081593927894, "grad_norm": 2.0623621940612793, "learning_rate": 9.448477995149182e-05, "logits/chosen": -2.9305570125579834, "logits/rejected": -2.929703950881958, "logps/chosen": -44.84100341796875, "logps/rejected": -89.29674530029297, "loss": 0.1482, "rewards/accuracies": 0.9375, "rewards/chosen": -2.057992935180664, "rewards/margins": 4.761512279510498, "rewards/rejected": -6.81950569152832, "step": 321 }, { "epoch": 0.6110056925996205, "grad_norm": 2.0448296070098877, "learning_rate": 9.375031968565829e-05, "logits/chosen": -2.891784429550171, "logits/rejected": -2.891885280609131, "logps/chosen": -48.85424041748047, "logps/rejected": -73.28569030761719, "loss": 0.7177, "rewards/accuracies": 0.9375, "rewards/chosen": -2.64813232421875, "rewards/margins": 2.567401647567749, "rewards/rejected": -5.215534210205078, "step": 322 }, { "epoch": 0.6129032258064516, "grad_norm": 9.832599639892578, "learning_rate": 9.301619771814316e-05, "logits/chosen": -2.8943068981170654, "logits/rejected": -2.8931775093078613, "logps/chosen": -51.37837219238281, "logps/rejected": -77.63446044921875, "loss": 0.4391, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7383222579956055, "rewards/margins": 2.8650565147399902, "rewards/rejected": -5.603378772735596, "step": 323 }, { "epoch": 0.6148007590132827, "grad_norm": 2.727288246154785, "learning_rate": 9.228245378733537e-05, "logits/chosen": -2.9204916954040527, "logits/rejected": -2.9224843978881836, "logps/chosen": -54.46859359741211, "logps/rejected": -78.30081939697266, "loss": 0.4835, "rewards/accuracies": 0.6875, "rewards/chosen": -3.144411325454712, "rewards/margins": 2.3725523948669434, "rewards/rejected": -5.516963958740234, "step": 324 }, { "epoch": 0.6166982922201139, "grad_norm": 5.74871301651001, "learning_rate": 9.154912761116056e-05, "logits/chosen": -2.9131014347076416, "logits/rejected": -2.9133331775665283, "logps/chosen": -44.03924560546875, "logps/rejected": -75.57574462890625, "loss": 0.2908, "rewards/accuracies": 0.8125, "rewards/chosen": -2.139979839324951, "rewards/margins": 3.2752749919891357, "rewards/rejected": -5.415254592895508, "step": 325 }, { "epoch": 0.618595825426945, "grad_norm": 1.533111810684204, "learning_rate": 9.081625888493108e-05, "logits/chosen": -2.909083843231201, "logits/rejected": -2.911268472671509, "logps/chosen": -34.44724655151367, "logps/rejected": -74.28544616699219, "loss": 0.0937, "rewards/accuracies": 1.0, "rewards/chosen": -1.0769883394241333, "rewards/margins": 4.155948162078857, "rewards/rejected": -5.232936859130859, "step": 326 }, { "epoch": 0.6204933586337761, "grad_norm": 1.9290465116500854, "learning_rate": 9.008388727919731e-05, "logits/chosen": -2.898500442504883, "logits/rejected": -2.89833664894104, "logps/chosen": -42.38737487792969, "logps/rejected": -70.19406127929688, "loss": 0.2004, "rewards/accuracies": 0.9375, "rewards/chosen": -1.788802981376648, "rewards/margins": 3.008125066757202, "rewards/rejected": -4.7969279289245605, "step": 327 }, { "epoch": 0.6223908918406073, "grad_norm": 3.8443222045898438, "learning_rate": 8.935205243760022e-05, "logits/chosen": -2.907944679260254, "logits/rejected": -2.910562038421631, "logps/chosen": -37.31573486328125, "logps/rejected": -69.13600158691406, "loss": 0.2375, "rewards/accuracies": 0.9375, "rewards/chosen": -1.220193862915039, "rewards/margins": 3.487980842590332, "rewards/rejected": -4.708174705505371, "step": 328 }, { "epoch": 0.6242884250474383, "grad_norm": 6.546236515045166, "learning_rate": 8.862079397472553e-05, "logits/chosen": -2.9090211391448975, "logits/rejected": -2.910724401473999, "logps/chosen": -39.852840423583984, "logps/rejected": -66.06358337402344, "loss": 0.2685, "rewards/accuracies": 0.875, "rewards/chosen": -1.7172813415527344, "rewards/margins": 2.741621494293213, "rewards/rejected": -4.458902835845947, "step": 329 }, { "epoch": 0.6261859582542695, "grad_norm": 5.839223861694336, "learning_rate": 8.789015147395919e-05, "logits/chosen": -2.889848470687866, "logits/rejected": -2.890821933746338, "logps/chosen": -60.09476852416992, "logps/rejected": -75.91924285888672, "loss": 0.8722, "rewards/accuracies": 0.6875, "rewards/chosen": -3.577435255050659, "rewards/margins": 1.8433260917663574, "rewards/rejected": -5.4207611083984375, "step": 330 }, { "epoch": 0.6280834914611005, "grad_norm": 2.1105823516845703, "learning_rate": 8.71601644853449e-05, "logits/chosen": -2.8981029987335205, "logits/rejected": -2.899024486541748, "logps/chosen": -45.887939453125, "logps/rejected": -82.8519287109375, "loss": 0.1054, "rewards/accuracies": 1.0, "rewards/chosen": -2.219303607940674, "rewards/margins": 3.9345149993896484, "rewards/rejected": -6.1538190841674805, "step": 331 }, { "epoch": 0.6299810246679317, "grad_norm": 1.5297664403915405, "learning_rate": 8.643087252344313e-05, "logits/chosen": -2.9043052196502686, "logits/rejected": -2.9025826454162598, "logps/chosen": -52.71283721923828, "logps/rejected": -80.79203033447266, "loss": 0.1583, "rewards/accuracies": 0.9375, "rewards/chosen": -3.007112979888916, "rewards/margins": 2.9045331478118896, "rewards/rejected": -5.911646366119385, "step": 332 }, { "epoch": 0.6318785578747628, "grad_norm": 3.1997110843658447, "learning_rate": 8.57023150651922e-05, "logits/chosen": -2.9024877548217773, "logits/rejected": -2.9013073444366455, "logps/chosen": -42.579097747802734, "logps/rejected": -79.65797424316406, "loss": 0.1922, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8045451641082764, "rewards/margins": 3.9230599403381348, "rewards/rejected": -5.72760534286499, "step": 333 }, { "epoch": 0.6337760910815939, "grad_norm": 2.93386173248291, "learning_rate": 8.49745315477714e-05, "logits/chosen": -2.8821892738342285, "logits/rejected": -2.88553524017334, "logps/chosen": -40.950721740722656, "logps/rejected": -74.56183624267578, "loss": 0.3928, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8069758415222168, "rewards/margins": 3.571430206298828, "rewards/rejected": -5.378406524658203, "step": 334 }, { "epoch": 0.635673624288425, "grad_norm": 4.076981544494629, "learning_rate": 8.424756136646623e-05, "logits/chosen": -2.888922929763794, "logits/rejected": -2.8909573554992676, "logps/chosen": -44.45381164550781, "logps/rejected": -74.74778747558594, "loss": 0.6127, "rewards/accuracies": 0.8125, "rewards/chosen": -2.156569480895996, "rewards/margins": 3.1150104999542236, "rewards/rejected": -5.271580219268799, "step": 335 }, { "epoch": 0.6375711574952562, "grad_norm": 1.4378771781921387, "learning_rate": 8.352144387253582e-05, "logits/chosen": -2.8876922130584717, "logits/rejected": -2.8896241188049316, "logps/chosen": -37.443275451660156, "logps/rejected": -82.51123046875, "loss": 0.1054, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2844719886779785, "rewards/margins": 4.881505012512207, "rewards/rejected": -6.1659770011901855, "step": 336 }, { "epoch": 0.6394686907020873, "grad_norm": 4.512608051300049, "learning_rate": 8.279621837108295e-05, "logits/chosen": -2.9067864418029785, "logits/rejected": -2.9087655544281006, "logps/chosen": -52.943763732910156, "logps/rejected": -80.59246826171875, "loss": 0.3255, "rewards/accuracies": 0.875, "rewards/chosen": -2.979990005493164, "rewards/margins": 2.8639986515045166, "rewards/rejected": -5.843988418579102, "step": 337 }, { "epoch": 0.6413662239089184, "grad_norm": 4.300062656402588, "learning_rate": 8.207192411892646e-05, "logits/chosen": -2.8789937496185303, "logits/rejected": -2.877152442932129, "logps/chosen": -54.1341552734375, "logps/rejected": -85.54117584228516, "loss": 0.2004, "rewards/accuracies": 0.9375, "rewards/chosen": -2.914337396621704, "rewards/margins": 3.30031681060791, "rewards/rejected": -6.214654445648193, "step": 338 }, { "epoch": 0.6432637571157496, "grad_norm": 2.4098892211914062, "learning_rate": 8.134860032247613e-05, "logits/chosen": -2.870198965072632, "logits/rejected": -2.867429733276367, "logps/chosen": -44.648841857910156, "logps/rejected": -85.62705993652344, "loss": 0.1193, "rewards/accuracies": 1.0, "rewards/chosen": -2.0035176277160645, "rewards/margins": 4.332259178161621, "rewards/rejected": -6.335777282714844, "step": 339 }, { "epoch": 0.6451612903225806, "grad_norm": 4.86448335647583, "learning_rate": 8.062628613561051e-05, "logits/chosen": -2.8677215576171875, "logits/rejected": -2.865633010864258, "logps/chosen": -45.240501403808594, "logps/rejected": -84.56080627441406, "loss": 0.1312, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1111769676208496, "rewards/margins": 4.117494583129883, "rewards/rejected": -6.228672027587891, "step": 340 }, { "epoch": 0.6470588235294118, "grad_norm": 3.7126381397247314, "learning_rate": 7.990502065755748e-05, "logits/chosen": -2.858647346496582, "logits/rejected": -2.857041835784912, "logps/chosen": -55.83709716796875, "logps/rejected": -82.04129028320312, "loss": 0.3661, "rewards/accuracies": 0.6875, "rewards/chosen": -3.063718795776367, "rewards/margins": 2.8775720596313477, "rewards/rejected": -5.941291332244873, "step": 341 }, { "epoch": 0.6489563567362429, "grad_norm": 2.671086072921753, "learning_rate": 7.918484293077777e-05, "logits/chosen": -2.884795665740967, "logits/rejected": -2.8853371143341064, "logps/chosen": -51.566864013671875, "logps/rejected": -78.62126922607422, "loss": 0.2656, "rewards/accuracies": 0.8125, "rewards/chosen": -2.805079936981201, "rewards/margins": 2.820594072341919, "rewards/rejected": -5.625674247741699, "step": 342 }, { "epoch": 0.650853889943074, "grad_norm": 2.7431766986846924, "learning_rate": 7.846579193885166e-05, "logits/chosen": -2.9095654487609863, "logits/rejected": -2.908602476119995, "logps/chosen": -48.2768440246582, "logps/rejected": -75.55670166015625, "loss": 0.3786, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4378509521484375, "rewards/margins": 2.9142463207244873, "rewards/rejected": -5.352097511291504, "step": 343 }, { "epoch": 0.6527514231499051, "grad_norm": 2.9285616874694824, "learning_rate": 7.774790660436858e-05, "logits/chosen": -2.8709418773651123, "logits/rejected": -2.8702428340911865, "logps/chosen": -46.38733673095703, "logps/rejected": -82.48179626464844, "loss": 0.3003, "rewards/accuracies": 0.875, "rewards/chosen": -2.308894395828247, "rewards/margins": 3.722503662109375, "rewards/rejected": -6.031398296356201, "step": 344 }, { "epoch": 0.6546489563567363, "grad_norm": 0.9489471912384033, "learning_rate": 7.703122578682046e-05, "logits/chosen": -2.8957481384277344, "logits/rejected": -2.8958137035369873, "logps/chosen": -36.95110321044922, "logps/rejected": -86.87677764892578, "loss": 0.0761, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2242169380187988, "rewards/margins": 5.352141380310059, "rewards/rejected": -6.576358318328857, "step": 345 }, { "epoch": 0.6565464895635673, "grad_norm": 0.7262978553771973, "learning_rate": 7.631578828049809e-05, "logits/chosen": -2.902186870574951, "logits/rejected": -2.902596950531006, "logps/chosen": -42.105674743652344, "logps/rejected": -81.15824890136719, "loss": 0.137, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8559753894805908, "rewards/margins": 4.022046089172363, "rewards/rejected": -5.878021717071533, "step": 346 }, { "epoch": 0.6584440227703985, "grad_norm": 0.5167392492294312, "learning_rate": 7.560163281239115e-05, "logits/chosen": -2.893174409866333, "logits/rejected": -2.890427350997925, "logps/chosen": -34.30916976928711, "logps/rejected": -81.82603454589844, "loss": 0.0651, "rewards/accuracies": 0.9375, "rewards/chosen": -0.9803083539009094, "rewards/margins": 5.106498718261719, "rewards/rejected": -6.0868072509765625, "step": 347 }, { "epoch": 0.6603415559772297, "grad_norm": 3.071392059326172, "learning_rate": 7.488879804009205e-05, "logits/chosen": -2.8696646690368652, "logits/rejected": -2.868384838104248, "logps/chosen": -52.08047866821289, "logps/rejected": -89.13264465332031, "loss": 0.3081, "rewards/accuracies": 0.8125, "rewards/chosen": -2.913431406021118, "rewards/margins": 3.715190887451172, "rewards/rejected": -6.628622055053711, "step": 348 }, { "epoch": 0.6622390891840607, "grad_norm": 2.76004958152771, "learning_rate": 7.417732254970317e-05, "logits/chosen": -2.8904638290405273, "logits/rejected": -2.8905301094055176, "logps/chosen": -43.39665222167969, "logps/rejected": -80.96947479248047, "loss": 0.283, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0628576278686523, "rewards/margins": 3.969433546066284, "rewards/rejected": -6.032290935516357, "step": 349 }, { "epoch": 0.6641366223908919, "grad_norm": 1.2935935258865356, "learning_rate": 7.346724485374837e-05, "logits/chosen": -2.8328428268432617, "logits/rejected": -2.8332414627075195, "logps/chosen": -42.56072235107422, "logps/rejected": -72.73566436767578, "loss": 0.4495, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9386003017425537, "rewards/margins": 3.1780686378479004, "rewards/rejected": -5.116669178009033, "step": 350 }, { "epoch": 0.6660341555977229, "grad_norm": 0.8221670389175415, "learning_rate": 7.275860338908815e-05, "logits/chosen": -2.840083122253418, "logits/rejected": -2.8375134468078613, "logps/chosen": -31.67504119873047, "logps/rejected": -73.15141296386719, "loss": 0.0362, "rewards/accuracies": 1.0, "rewards/chosen": -0.7467401027679443, "rewards/margins": 4.425130844116211, "rewards/rejected": -5.171871185302734, "step": 351 }, { "epoch": 0.6679316888045541, "grad_norm": 7.032353401184082, "learning_rate": 7.205143651483906e-05, "logits/chosen": -2.8678221702575684, "logits/rejected": -2.8668932914733887, "logps/chosen": -40.8023796081543, "logps/rejected": -70.9334487915039, "loss": 0.2645, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7451889514923096, "rewards/margins": 3.13863468170166, "rewards/rejected": -4.883823871612549, "step": 352 }, { "epoch": 0.6698292220113852, "grad_norm": 3.907095193862915, "learning_rate": 7.134578251029745e-05, "logits/chosen": -2.835967779159546, "logits/rejected": -2.8349101543426514, "logps/chosen": -37.25190734863281, "logps/rejected": -75.33383178710938, "loss": 0.2817, "rewards/accuracies": 0.8125, "rewards/chosen": -1.264039397239685, "rewards/margins": 4.003348350524902, "rewards/rejected": -5.267387866973877, "step": 353 }, { "epoch": 0.6717267552182163, "grad_norm": 3.347811698913574, "learning_rate": 7.064167957286714e-05, "logits/chosen": -2.8626599311828613, "logits/rejected": -2.8652517795562744, "logps/chosen": -43.0107307434082, "logps/rejected": -77.27116394042969, "loss": 0.3695, "rewards/accuracies": 0.875, "rewards/chosen": -1.9182496070861816, "rewards/margins": 3.5998854637145996, "rewards/rejected": -5.518135070800781, "step": 354 }, { "epoch": 0.6736242884250474, "grad_norm": 1.5447057485580444, "learning_rate": 6.993916581599202e-05, "logits/chosen": -2.8407070636749268, "logits/rejected": -2.8412859439849854, "logps/chosen": -39.80915451049805, "logps/rejected": -74.49897003173828, "loss": 0.2434, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6093679666519165, "rewards/margins": 3.66628360748291, "rewards/rejected": -5.275651931762695, "step": 355 }, { "epoch": 0.6755218216318786, "grad_norm": 3.911895275115967, "learning_rate": 6.923827926709277e-05, "logits/chosen": -2.8489303588867188, "logits/rejected": -2.8513529300689697, "logps/chosen": -40.58164978027344, "logps/rejected": -76.51760864257812, "loss": 0.1489, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7140048742294312, "rewards/margins": 3.8055503368377686, "rewards/rejected": -5.51955509185791, "step": 356 }, { "epoch": 0.6774193548387096, "grad_norm": 1.1900240182876587, "learning_rate": 6.853905786550854e-05, "logits/chosen": -2.82595157623291, "logits/rejected": -2.8293063640594482, "logps/chosen": -34.55420684814453, "logps/rejected": -82.0986557006836, "loss": 0.0661, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2518024444580078, "rewards/margins": 4.8424973487854, "rewards/rejected": -6.094300270080566, "step": 357 }, { "epoch": 0.6793168880455408, "grad_norm": 1.571761131286621, "learning_rate": 6.78415394604432e-05, "logits/chosen": -2.826899766921997, "logits/rejected": -2.826556921005249, "logps/chosen": -41.274803161621094, "logps/rejected": -72.56224060058594, "loss": 0.1361, "rewards/accuracies": 1.0, "rewards/chosen": -1.6804169416427612, "rewards/margins": 3.465762138366699, "rewards/rejected": -5.14617919921875, "step": 358 }, { "epoch": 0.681214421252372, "grad_norm": 3.052306890487671, "learning_rate": 6.714576180891654e-05, "logits/chosen": -2.8384101390838623, "logits/rejected": -2.840078353881836, "logps/chosen": -39.517417907714844, "logps/rejected": -69.88583374023438, "loss": 0.1488, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5221190452575684, "rewards/margins": 3.2681870460510254, "rewards/rejected": -4.790306091308594, "step": 359 }, { "epoch": 0.683111954459203, "grad_norm": 1.8139554262161255, "learning_rate": 6.645176257372055e-05, "logits/chosen": -2.818631172180176, "logits/rejected": -2.8222427368164062, "logps/chosen": -35.95072937011719, "logps/rejected": -64.10530090332031, "loss": 0.1746, "rewards/accuracies": 1.0, "rewards/chosen": -1.1438565254211426, "rewards/margins": 3.059673309326172, "rewards/rejected": -4.203530311584473, "step": 360 }, { "epoch": 0.6850094876660342, "grad_norm": 2.069314479827881, "learning_rate": 6.575957932138057e-05, "logits/chosen": -2.8182709217071533, "logits/rejected": -2.8195230960845947, "logps/chosen": -46.92900848388672, "logps/rejected": -74.21569061279297, "loss": 0.3961, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1707496643066406, "rewards/margins": 2.961195230484009, "rewards/rejected": -5.1319451332092285, "step": 361 }, { "epoch": 0.6869070208728653, "grad_norm": 1.8511279821395874, "learning_rate": 6.506924952012202e-05, "logits/chosen": -2.8286561965942383, "logits/rejected": -2.823415994644165, "logps/chosen": -44.049896240234375, "logps/rejected": -78.49517822265625, "loss": 0.2609, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9191491603851318, "rewards/margins": 3.8288094997406006, "rewards/rejected": -5.747958660125732, "step": 362 }, { "epoch": 0.6888045540796964, "grad_norm": 2.228764533996582, "learning_rate": 6.438081053784197e-05, "logits/chosen": -2.8500823974609375, "logits/rejected": -2.8482038974761963, "logps/chosen": -44.39284896850586, "logps/rejected": -69.33513641357422, "loss": 0.2864, "rewards/accuracies": 0.875, "rewards/chosen": -2.1398777961730957, "rewards/margins": 2.614034652709961, "rewards/rejected": -4.753911972045898, "step": 363 }, { "epoch": 0.6907020872865275, "grad_norm": 2.184192180633545, "learning_rate": 6.36942996400865e-05, "logits/chosen": -2.819075584411621, "logits/rejected": -2.820847749710083, "logps/chosen": -43.02545166015625, "logps/rejected": -67.06218719482422, "loss": 0.217, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8877732753753662, "rewards/margins": 2.566277027130127, "rewards/rejected": -4.454050064086914, "step": 364 }, { "epoch": 0.6925996204933587, "grad_norm": 1.0147178173065186, "learning_rate": 6.300975398803362e-05, "logits/chosen": -2.817196846008301, "logits/rejected": -2.819939613342285, "logps/chosen": -43.927947998046875, "logps/rejected": -77.98281860351562, "loss": 0.0992, "rewards/accuracies": 1.0, "rewards/chosen": -2.025996208190918, "rewards/margins": 3.6155223846435547, "rewards/rejected": -5.641518592834473, "step": 365 }, { "epoch": 0.6944971537001897, "grad_norm": 2.812535285949707, "learning_rate": 6.232721063648148e-05, "logits/chosen": -2.8124892711639404, "logits/rejected": -2.813704252243042, "logps/chosen": -39.741539001464844, "logps/rejected": -72.29251098632812, "loss": 0.3044, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6100232601165771, "rewards/margins": 3.4391074180603027, "rewards/rejected": -5.049130439758301, "step": 366 }, { "epoch": 0.6963946869070209, "grad_norm": 4.147728443145752, "learning_rate": 6.164670653184285e-05, "logits/chosen": -2.8021512031555176, "logits/rejected": -2.8073835372924805, "logps/chosen": -49.98514175415039, "logps/rejected": -68.45268249511719, "loss": 0.7597, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6653554439544678, "rewards/margins": 1.931579351425171, "rewards/rejected": -4.5969343185424805, "step": 367 }, { "epoch": 0.698292220113852, "grad_norm": 4.9097676277160645, "learning_rate": 6.09682785101449e-05, "logits/chosen": -2.754014253616333, "logits/rejected": -2.753483295440674, "logps/chosen": -50.1568603515625, "logps/rejected": -72.04705047607422, "loss": 0.343, "rewards/accuracies": 0.9375, "rewards/chosen": -2.7088911533355713, "rewards/margins": 2.3041090965270996, "rewards/rejected": -5.01300048828125, "step": 368 }, { "epoch": 0.7001897533206831, "grad_norm": 1.8070886135101318, "learning_rate": 6.0291963295035484e-05, "logits/chosen": -2.7732787132263184, "logits/rejected": -2.773905038833618, "logps/chosen": -36.48072052001953, "logps/rejected": -64.48783874511719, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": -1.3253145217895508, "rewards/margins": 2.909857749938965, "rewards/rejected": -4.235172271728516, "step": 369 }, { "epoch": 0.7020872865275142, "grad_norm": 1.1058545112609863, "learning_rate": 5.961779749579516e-05, "logits/chosen": -2.8352999687194824, "logits/rejected": -2.837979793548584, "logps/chosen": -40.074058532714844, "logps/rejected": -78.63117980957031, "loss": 0.1221, "rewards/accuracies": 0.9375, "rewards/chosen": -1.5429928302764893, "rewards/margins": 4.072991847991943, "rewards/rejected": -5.615984916687012, "step": 370 }, { "epoch": 0.7039848197343453, "grad_norm": 1.185791015625, "learning_rate": 5.894581760535549e-05, "logits/chosen": -2.7890102863311768, "logits/rejected": -2.798060417175293, "logps/chosen": -39.764137268066406, "logps/rejected": -71.1139144897461, "loss": 0.1825, "rewards/accuracies": 0.9375, "rewards/chosen": -1.6215232610702515, "rewards/margins": 3.141541004180908, "rewards/rejected": -4.763064384460449, "step": 371 }, { "epoch": 0.7058823529411765, "grad_norm": 2.253542423248291, "learning_rate": 5.827605999832375e-05, "logits/chosen": -2.8037962913513184, "logits/rejected": -2.810068368911743, "logps/chosen": -40.40297317504883, "logps/rejected": -69.3699951171875, "loss": 0.356, "rewards/accuracies": 0.875, "rewards/chosen": -1.7513362169265747, "rewards/margins": 2.997833728790283, "rewards/rejected": -4.749169826507568, "step": 372 }, { "epoch": 0.7077798861480076, "grad_norm": 4.954280376434326, "learning_rate": 5.7608560929013946e-05, "logits/chosen": -2.830198049545288, "logits/rejected": -2.8298377990722656, "logps/chosen": -38.188899993896484, "logps/rejected": -60.74665832519531, "loss": 0.3994, "rewards/accuracies": 0.75, "rewards/chosen": -1.466092586517334, "rewards/margins": 2.4330639839172363, "rewards/rejected": -3.8991568088531494, "step": 373 }, { "epoch": 0.7096774193548387, "grad_norm": 2.809379816055298, "learning_rate": 5.694335652948415e-05, "logits/chosen": -2.7694361209869385, "logits/rejected": -2.7702527046203613, "logps/chosen": -39.33440017700195, "logps/rejected": -61.418212890625, "loss": 0.453, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5252565145492554, "rewards/margins": 2.4085230827331543, "rewards/rejected": -3.933779716491699, "step": 374 }, { "epoch": 0.7115749525616698, "grad_norm": 4.30332612991333, "learning_rate": 5.628048280758096e-05, "logits/chosen": -2.835770845413208, "logits/rejected": -2.8393208980560303, "logps/chosen": -40.133426666259766, "logps/rejected": -70.28953552246094, "loss": 0.2347, "rewards/accuracies": 0.875, "rewards/chosen": -1.6678059101104736, "rewards/margins": 3.160830497741699, "rewards/rejected": -4.82863712310791, "step": 375 }, { "epoch": 0.713472485768501, "grad_norm": 9.771546363830566, "learning_rate": 5.5619975644990244e-05, "logits/chosen": -2.8247642517089844, "logits/rejected": -2.828068971633911, "logps/chosen": -42.2925910949707, "logps/rejected": -60.983116149902344, "loss": 0.6838, "rewards/accuracies": 0.75, "rewards/chosen": -1.81632661819458, "rewards/margins": 2.0536108016967773, "rewards/rejected": -3.8699374198913574, "step": 376 }, { "epoch": 0.715370018975332, "grad_norm": 8.389379501342773, "learning_rate": 5.496187079529465e-05, "logits/chosen": -2.8321757316589355, "logits/rejected": -2.8335793018341064, "logps/chosen": -39.37532043457031, "logps/rejected": -62.030555725097656, "loss": 0.4633, "rewards/accuracies": 0.75, "rewards/chosen": -1.6666603088378906, "rewards/margins": 2.3203835487365723, "rewards/rejected": -3.987044095993042, "step": 377 }, { "epoch": 0.7172675521821632, "grad_norm": 0.9523990154266357, "learning_rate": 5.4306203882038664e-05, "logits/chosen": -2.8029139041900635, "logits/rejected": -2.8072001934051514, "logps/chosen": -38.88301467895508, "logps/rejected": -63.761016845703125, "loss": 0.155, "rewards/accuracies": 1.0, "rewards/chosen": -1.5240936279296875, "rewards/margins": 2.7224197387695312, "rewards/rejected": -4.246513366699219, "step": 378 }, { "epoch": 0.7191650853889943, "grad_norm": 1.4619261026382446, "learning_rate": 5.365301039679984e-05, "logits/chosen": -2.7594830989837646, "logits/rejected": -2.759218692779541, "logps/chosen": -43.129417419433594, "logps/rejected": -69.13524627685547, "loss": 0.1813, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7728075981140137, "rewards/margins": 2.8003039360046387, "rewards/rejected": -4.573111534118652, "step": 379 }, { "epoch": 0.7210626185958254, "grad_norm": 0.7891879677772522, "learning_rate": 5.300232569726804e-05, "logits/chosen": -2.8181209564208984, "logits/rejected": -2.818765878677368, "logps/chosen": -37.475074768066406, "logps/rejected": -67.34873962402344, "loss": 0.1258, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2424066066741943, "rewards/margins": 3.349724769592285, "rewards/rejected": -4.592131614685059, "step": 380 }, { "epoch": 0.7229601518026565, "grad_norm": 3.099241018295288, "learning_rate": 5.235418500533109e-05, "logits/chosen": -2.8263823986053467, "logits/rejected": -2.826491355895996, "logps/chosen": -34.44828796386719, "logps/rejected": -60.10238265991211, "loss": 0.1577, "rewards/accuracies": 1.0, "rewards/chosen": -1.0063425302505493, "rewards/margins": 2.7296359539031982, "rewards/rejected": -3.735978603363037, "step": 381 }, { "epoch": 0.7248576850094877, "grad_norm": 2.173335313796997, "learning_rate": 5.170862340516858e-05, "logits/chosen": -2.8429558277130127, "logits/rejected": -2.845327377319336, "logps/chosen": -37.72207260131836, "logps/rejected": -69.11197662353516, "loss": 0.1317, "rewards/accuracies": 0.9375, "rewards/chosen": -1.4073717594146729, "rewards/margins": 3.334414005279541, "rewards/rejected": -4.741786003112793, "step": 382 }, { "epoch": 0.7267552182163188, "grad_norm": 1.9054666757583618, "learning_rate": 5.1065675841352514e-05, "logits/chosen": -2.799262523651123, "logits/rejected": -2.7995381355285645, "logps/chosen": -42.19648742675781, "logps/rejected": -73.97077941894531, "loss": 0.1394, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8171238899230957, "rewards/margins": 3.4282195568084717, "rewards/rejected": -5.2453436851501465, "step": 383 }, { "epoch": 0.7286527514231499, "grad_norm": 2.1896724700927734, "learning_rate": 5.042537711695584e-05, "logits/chosen": -2.839466094970703, "logits/rejected": -2.839400291442871, "logps/chosen": -46.19039535522461, "logps/rejected": -78.2408447265625, "loss": 0.1499, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3111000061035156, "rewards/margins": 3.2166218757629395, "rewards/rejected": -5.527722358703613, "step": 384 }, { "epoch": 0.7305502846299811, "grad_norm": 3.224628210067749, "learning_rate": 4.9787761891668397e-05, "logits/chosen": -2.8577723503112793, "logits/rejected": -2.8555190563201904, "logps/chosen": -45.490821838378906, "logps/rejected": -75.40174865722656, "loss": 0.4481, "rewards/accuracies": 0.875, "rewards/chosen": -2.101290702819824, "rewards/margins": 3.2228689193725586, "rewards/rejected": -5.324159622192383, "step": 385 }, { "epoch": 0.7324478178368121, "grad_norm": 3.9173166751861572, "learning_rate": 4.915286467992097e-05, "logits/chosen": -2.853044033050537, "logits/rejected": -2.8523402214050293, "logps/chosen": -45.38521957397461, "logps/rejected": -70.02400207519531, "loss": 0.2852, "rewards/accuracies": 0.875, "rewards/chosen": -2.1188886165618896, "rewards/margins": 2.6662843227386475, "rewards/rejected": -4.785172939300537, "step": 386 }, { "epoch": 0.7343453510436433, "grad_norm": 2.7123172283172607, "learning_rate": 4.852071984901696e-05, "logits/chosen": -2.7952325344085693, "logits/rejected": -2.7948856353759766, "logps/chosen": -47.74720764160156, "logps/rejected": -74.95579528808594, "loss": 0.3017, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2861366271972656, "rewards/margins": 2.9639570713043213, "rewards/rejected": -5.250093460083008, "step": 387 }, { "epoch": 0.7362428842504743, "grad_norm": 2.0673792362213135, "learning_rate": 4.7891361617271845e-05, "logits/chosen": -2.8399112224578857, "logits/rejected": -2.8416287899017334, "logps/chosen": -49.615631103515625, "logps/rejected": -81.5777587890625, "loss": 0.2256, "rewards/accuracies": 0.875, "rewards/chosen": -2.6042308807373047, "rewards/margins": 3.308650255203247, "rewards/rejected": -5.912881374359131, "step": 388 }, { "epoch": 0.7381404174573055, "grad_norm": 1.5954556465148926, "learning_rate": 4.726482405216125e-05, "logits/chosen": -2.8556995391845703, "logits/rejected": -2.855954885482788, "logps/chosen": -42.215030670166016, "logps/rejected": -80.80831909179688, "loss": 0.126, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8362414836883545, "rewards/margins": 3.931171417236328, "rewards/rejected": -5.767412185668945, "step": 389 }, { "epoch": 0.7400379506641366, "grad_norm": 2.600886583328247, "learning_rate": 4.6641141068476666e-05, "logits/chosen": -2.856656074523926, "logits/rejected": -2.855344295501709, "logps/chosen": -46.701438903808594, "logps/rejected": -83.43122863769531, "loss": 0.202, "rewards/accuracies": 0.9375, "rewards/chosen": -2.251277208328247, "rewards/margins": 4.019396781921387, "rewards/rejected": -6.270673751831055, "step": 390 }, { "epoch": 0.7419354838709677, "grad_norm": 0.3920484483242035, "learning_rate": 4.602034642648968e-05, "logits/chosen": -2.8546879291534424, "logits/rejected": -2.8558170795440674, "logps/chosen": -38.9944953918457, "logps/rejected": -84.74649047851562, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -1.391613483428955, "rewards/margins": 4.989771842956543, "rewards/rejected": -6.381385803222656, "step": 391 }, { "epoch": 0.7438330170777988, "grad_norm": 1.4415783882141113, "learning_rate": 4.540247373012439e-05, "logits/chosen": -2.8511734008789062, "logits/rejected": -2.848379611968994, "logps/chosen": -49.37206268310547, "logps/rejected": -83.17179107666016, "loss": 0.0969, "rewards/accuracies": 1.0, "rewards/chosen": -2.363197088241577, "rewards/margins": 3.6192896366119385, "rewards/rejected": -5.982486724853516, "step": 392 }, { "epoch": 0.74573055028463, "grad_norm": 2.126145362854004, "learning_rate": 4.4787556425138675e-05, "logits/chosen": -2.8572943210601807, "logits/rejected": -2.8603005409240723, "logps/chosen": -55.69056701660156, "logps/rejected": -87.90967559814453, "loss": 0.1894, "rewards/accuracies": 0.875, "rewards/chosen": -3.2229087352752686, "rewards/margins": 3.3716800212860107, "rewards/rejected": -6.5945892333984375, "step": 393 }, { "epoch": 0.7476280834914611, "grad_norm": 1.2590640783309937, "learning_rate": 4.417562779731355e-05, "logits/chosen": -2.843212842941284, "logits/rejected": -2.843592882156372, "logps/chosen": -36.59722137451172, "logps/rejected": -79.27423095703125, "loss": 0.1432, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2509305477142334, "rewards/margins": 4.489564895629883, "rewards/rejected": -5.740495681762695, "step": 394 }, { "epoch": 0.7495256166982922, "grad_norm": 1.5835777521133423, "learning_rate": 4.356672097065134e-05, "logits/chosen": -2.8567137718200684, "logits/rejected": -2.8576886653900146, "logps/chosen": -42.656951904296875, "logps/rejected": -78.82894897460938, "loss": 0.1267, "rewards/accuracies": 0.9375, "rewards/chosen": -1.9459456205368042, "rewards/margins": 3.693349599838257, "rewards/rejected": -5.6392951011657715, "step": 395 }, { "epoch": 0.7514231499051234, "grad_norm": 6.007066249847412, "learning_rate": 4.29608689055829e-05, "logits/chosen": -2.8007290363311768, "logits/rejected": -2.799259901046753, "logps/chosen": -42.47441864013672, "logps/rejected": -78.50823974609375, "loss": 0.2904, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7924405336380005, "rewards/margins": 3.7978663444519043, "rewards/rejected": -5.590307235717773, "step": 396 }, { "epoch": 0.7533206831119544, "grad_norm": 1.4406176805496216, "learning_rate": 4.2358104397183264e-05, "logits/chosen": -2.8301925659179688, "logits/rejected": -2.836293935775757, "logps/chosen": -44.866912841796875, "logps/rejected": -86.76194763183594, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/chosen": -2.156167507171631, "rewards/margins": 4.227807998657227, "rewards/rejected": -6.383975982666016, "step": 397 }, { "epoch": 0.7552182163187856, "grad_norm": 3.5366456508636475, "learning_rate": 4.1758460073396436e-05, "logits/chosen": -2.815624237060547, "logits/rejected": -2.813138723373413, "logps/chosen": -58.1055908203125, "logps/rejected": -91.65219116210938, "loss": 0.235, "rewards/accuracies": 0.875, "rewards/chosen": -3.413323402404785, "rewards/margins": 3.6791296005249023, "rewards/rejected": -7.0924530029296875, "step": 398 }, { "epoch": 0.7571157495256167, "grad_norm": 4.2073516845703125, "learning_rate": 4.116196839326932e-05, "logits/chosen": -2.8639235496520996, "logits/rejected": -2.8655333518981934, "logps/chosen": -55.49702072143555, "logps/rejected": -83.14749145507812, "loss": 0.329, "rewards/accuracies": 0.8125, "rewards/chosen": -3.128420352935791, "rewards/margins": 2.9055938720703125, "rewards/rejected": -6.034013748168945, "step": 399 }, { "epoch": 0.7590132827324478, "grad_norm": 2.927692413330078, "learning_rate": 4.056866164519465e-05, "logits/chosen": -2.8432540893554688, "logits/rejected": -2.8449456691741943, "logps/chosen": -49.41889190673828, "logps/rejected": -92.33918762207031, "loss": 0.2371, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5948264598846436, "rewards/margins": 4.3953857421875, "rewards/rejected": -6.9902119636535645, "step": 400 }, { "epoch": 0.7609108159392789, "grad_norm": 1.7601803541183472, "learning_rate": 3.997857194516319e-05, "logits/chosen": -2.879696846008301, "logits/rejected": -2.876967191696167, "logps/chosen": -56.179996490478516, "logps/rejected": -98.41171264648438, "loss": 0.1464, "rewards/accuracies": 0.875, "rewards/chosen": -3.0214571952819824, "rewards/margins": 4.520035743713379, "rewards/rejected": -7.5414934158325195, "step": 401 }, { "epoch": 0.7628083491461101, "grad_norm": 2.3982231616973877, "learning_rate": 3.939173123502523e-05, "logits/chosen": -2.852635622024536, "logits/rejected": -2.8533170223236084, "logps/chosen": -55.57673263549805, "logps/rejected": -82.56744384765625, "loss": 0.5012, "rewards/accuracies": 0.9375, "rewards/chosen": -3.271477699279785, "rewards/margins": 2.8199305534362793, "rewards/rejected": -6.0914082527160645, "step": 402 }, { "epoch": 0.7647058823529411, "grad_norm": 1.9928884506225586, "learning_rate": 3.880817128076166e-05, "logits/chosen": -2.8672335147857666, "logits/rejected": -2.868398666381836, "logps/chosen": -59.27280044555664, "logps/rejected": -86.81177520751953, "loss": 0.255, "rewards/accuracies": 0.875, "rewards/chosen": -3.336042881011963, "rewards/margins": 3.0970020294189453, "rewards/rejected": -6.433044910430908, "step": 403 }, { "epoch": 0.7666034155597723, "grad_norm": 1.0021593570709229, "learning_rate": 3.8227923670764466e-05, "logits/chosen": -2.8443446159362793, "logits/rejected": -2.8455092906951904, "logps/chosen": -47.19717025756836, "logps/rejected": -91.81137084960938, "loss": 0.1083, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1716463565826416, "rewards/margins": 4.843894004821777, "rewards/rejected": -7.01554012298584, "step": 404 }, { "epoch": 0.7685009487666035, "grad_norm": 4.413395404815674, "learning_rate": 3.7651019814126654e-05, "logits/chosen": -2.8711767196655273, "logits/rejected": -2.8668553829193115, "logps/chosen": -53.58222198486328, "logps/rejected": -78.13496398925781, "loss": 0.6096, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8774609565734863, "rewards/margins": 2.763082981109619, "rewards/rejected": -5.6405439376831055, "step": 405 }, { "epoch": 0.7703984819734345, "grad_norm": 3.5480332374572754, "learning_rate": 3.707749093894231e-05, "logits/chosen": -2.8677728176116943, "logits/rejected": -2.868199348449707, "logps/chosen": -55.50325012207031, "logps/rejected": -71.5915756225586, "loss": 0.5146, "rewards/accuracies": 0.875, "rewards/chosen": -3.0266475677490234, "rewards/margins": 1.9313273429870605, "rewards/rejected": -4.957975387573242, "step": 406 }, { "epoch": 0.7722960151802657, "grad_norm": 1.3300139904022217, "learning_rate": 3.650736809061601e-05, "logits/chosen": -2.823054075241089, "logits/rejected": -2.8239519596099854, "logps/chosen": -46.36268997192383, "logps/rejected": -86.57362365722656, "loss": 0.1313, "rewards/accuracies": 1.0, "rewards/chosen": -2.1757278442382812, "rewards/margins": 4.097400665283203, "rewards/rejected": -6.273128986358643, "step": 407 }, { "epoch": 0.7741935483870968, "grad_norm": 1.6520681381225586, "learning_rate": 3.594068213018249e-05, "logits/chosen": -2.8503434658050537, "logits/rejected": -2.8497822284698486, "logps/chosen": -51.699127197265625, "logps/rejected": -85.61526489257812, "loss": 0.1361, "rewards/accuracies": 0.9375, "rewards/chosen": -2.76126766204834, "rewards/margins": 3.510023355484009, "rewards/rejected": -6.271291255950928, "step": 408 }, { "epoch": 0.7760910815939279, "grad_norm": 4.191739559173584, "learning_rate": 3.537746373263589e-05, "logits/chosen": -2.8701586723327637, "logits/rejected": -2.8664653301239014, "logps/chosen": -58.59525680541992, "logps/rejected": -86.79007720947266, "loss": 0.4554, "rewards/accuracies": 0.75, "rewards/chosen": -3.43748140335083, "rewards/margins": 3.084642171859741, "rewards/rejected": -6.522123336791992, "step": 409 }, { "epoch": 0.777988614800759, "grad_norm": 2.7124240398406982, "learning_rate": 3.481774338526954e-05, "logits/chosen": -2.8723881244659424, "logits/rejected": -2.8679914474487305, "logps/chosen": -57.07926940917969, "logps/rejected": -91.86167907714844, "loss": 0.5128, "rewards/accuracies": 0.875, "rewards/chosen": -3.1731064319610596, "rewards/margins": 3.747316837310791, "rewards/rejected": -6.92042350769043, "step": 410 }, { "epoch": 0.7798861480075902, "grad_norm": 2.129563331604004, "learning_rate": 3.426155138602558e-05, "logits/chosen": -2.853167772293091, "logits/rejected": -2.853346824645996, "logps/chosen": -59.06671142578125, "logps/rejected": -96.77989959716797, "loss": 0.4821, "rewards/accuracies": 0.8125, "rewards/chosen": -3.553715944290161, "rewards/margins": 3.9553308486938477, "rewards/rejected": -7.50904655456543, "step": 411 }, { "epoch": 0.7817836812144212, "grad_norm": 1.0496379137039185, "learning_rate": 3.370891784185478e-05, "logits/chosen": -2.851594924926758, "logits/rejected": -2.850982666015625, "logps/chosen": -45.39728546142578, "logps/rejected": -83.35406494140625, "loss": 0.0511, "rewards/accuracies": 1.0, "rewards/chosen": -2.002638101577759, "rewards/margins": 4.146675109863281, "rewards/rejected": -6.149312973022461, "step": 412 }, { "epoch": 0.7836812144212524, "grad_norm": 3.537407875061035, "learning_rate": 3.315987266708708e-05, "logits/chosen": -2.8707752227783203, "logits/rejected": -2.8682148456573486, "logps/chosen": -65.88800048828125, "logps/rejected": -93.33538055419922, "loss": 0.5083, "rewards/accuracies": 0.8125, "rewards/chosen": -4.260937690734863, "rewards/margins": 2.699514627456665, "rewards/rejected": -6.960453033447266, "step": 413 }, { "epoch": 0.7855787476280834, "grad_norm": 2.036504030227661, "learning_rate": 3.261444558181218e-05, "logits/chosen": -2.822535753250122, "logits/rejected": -2.8231797218322754, "logps/chosen": -53.427268981933594, "logps/rejected": -87.4761962890625, "loss": 0.2484, "rewards/accuracies": 0.8125, "rewards/chosen": -2.955658197402954, "rewards/margins": 3.5562376976013184, "rewards/rejected": -6.511895656585693, "step": 414 }, { "epoch": 0.7874762808349146, "grad_norm": 6.943586349487305, "learning_rate": 3.207266611027069e-05, "logits/chosen": -2.845099449157715, "logits/rejected": -2.845888137817383, "logps/chosen": -62.221195220947266, "logps/rejected": -81.78451538085938, "loss": 0.8399, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9935247898101807, "rewards/margins": 1.944313406944275, "rewards/rejected": -5.937838554382324, "step": 415 }, { "epoch": 0.7893738140417458, "grad_norm": 0.8565374612808228, "learning_rate": 3.153456357925617e-05, "logits/chosen": -2.877192497253418, "logits/rejected": -2.8789360523223877, "logps/chosen": -50.52465057373047, "logps/rejected": -88.89420318603516, "loss": 0.0881, "rewards/accuracies": 1.0, "rewards/chosen": -2.6657750606536865, "rewards/margins": 4.038374900817871, "rewards/rejected": -6.7041497230529785, "step": 416 }, { "epoch": 0.7912713472485768, "grad_norm": 2.0553338527679443, "learning_rate": 3.100016711652752e-05, "logits/chosen": -2.8713204860687256, "logits/rejected": -2.8678972721099854, "logps/chosen": -51.89461898803711, "logps/rejected": -85.39608764648438, "loss": 0.3038, "rewards/accuracies": 0.875, "rewards/chosen": -2.7338039875030518, "rewards/margins": 3.546187400817871, "rewards/rejected": -6.279991149902344, "step": 417 }, { "epoch": 0.793168880455408, "grad_norm": 2.6920197010040283, "learning_rate": 3.0469505649232333e-05, "logits/chosen": -2.842066764831543, "logits/rejected": -2.8389217853546143, "logps/chosen": -62.65966796875, "logps/rejected": -88.504638671875, "loss": 0.4567, "rewards/accuracies": 0.8125, "rewards/chosen": -3.93137526512146, "rewards/margins": 2.59926176071167, "rewards/rejected": -6.530636787414551, "step": 418 }, { "epoch": 0.7950664136622391, "grad_norm": 3.4587042331695557, "learning_rate": 2.9942607902340945e-05, "logits/chosen": -2.8469345569610596, "logits/rejected": -2.844067335128784, "logps/chosen": -59.01725769042969, "logps/rejected": -73.55389404296875, "loss": 0.7181, "rewards/accuracies": 0.75, "rewards/chosen": -3.381943702697754, "rewards/margins": 1.5324156284332275, "rewards/rejected": -4.914359092712402, "step": 419 }, { "epoch": 0.7969639468690702, "grad_norm": 1.682063102722168, "learning_rate": 2.9419502397091713e-05, "logits/chosen": -2.816669225692749, "logits/rejected": -2.8211100101470947, "logps/chosen": -59.50233459472656, "logps/rejected": -79.79951477050781, "loss": 0.2215, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7046406269073486, "rewards/margins": 2.0102458000183105, "rewards/rejected": -5.714886665344238, "step": 420 }, { "epoch": 0.7988614800759013, "grad_norm": 2.2250099182128906, "learning_rate": 2.8900217449447074e-05, "logits/chosen": -2.8525190353393555, "logits/rejected": -2.8517796993255615, "logps/chosen": -49.88181686401367, "logps/rejected": -85.27168273925781, "loss": 0.2254, "rewards/accuracies": 0.8125, "rewards/chosen": -2.631014347076416, "rewards/margins": 3.6132094860076904, "rewards/rejected": -6.244223594665527, "step": 421 }, { "epoch": 0.8007590132827325, "grad_norm": 3.0041167736053467, "learning_rate": 2.8384781168560693e-05, "logits/chosen": -2.9001643657684326, "logits/rejected": -2.898693323135376, "logps/chosen": -58.808204650878906, "logps/rejected": -81.7398681640625, "loss": 0.3757, "rewards/accuracies": 0.75, "rewards/chosen": -3.4261245727539062, "rewards/margins": 2.559650182723999, "rewards/rejected": -5.985774993896484, "step": 422 }, { "epoch": 0.8026565464895635, "grad_norm": 0.5320444107055664, "learning_rate": 2.7873221455256004e-05, "logits/chosen": -2.875701427459717, "logits/rejected": -2.8719263076782227, "logps/chosen": -47.414459228515625, "logps/rejected": -90.77688598632812, "loss": 0.071, "rewards/accuracies": 1.0, "rewards/chosen": -2.345010757446289, "rewards/margins": 4.371488571166992, "rewards/rejected": -6.716499328613281, "step": 423 }, { "epoch": 0.8045540796963947, "grad_norm": 2.517533540725708, "learning_rate": 2.736556600051593e-05, "logits/chosen": -2.861175298690796, "logits/rejected": -2.861558675765991, "logps/chosen": -58.571083068847656, "logps/rejected": -79.73915100097656, "loss": 0.4111, "rewards/accuracies": 0.8125, "rewards/chosen": -3.626460075378418, "rewards/margins": 2.2237601280212402, "rewards/rejected": -5.8502197265625, "step": 424 }, { "epoch": 0.8064516129032258, "grad_norm": 1.6777634620666504, "learning_rate": 2.6861842283983953e-05, "logits/chosen": -2.8752527236938477, "logits/rejected": -2.8697383403778076, "logps/chosen": -51.51059341430664, "logps/rejected": -83.09491729736328, "loss": 0.1459, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8168253898620605, "rewards/margins": 3.3736629486083984, "rewards/rejected": -6.190488338470459, "step": 425 }, { "epoch": 0.8083491461100569, "grad_norm": 1.8491694927215576, "learning_rate": 2.6362077572476494e-05, "logits/chosen": -2.8406431674957275, "logits/rejected": -2.8375585079193115, "logps/chosen": -48.828189849853516, "logps/rejected": -79.91885375976562, "loss": 0.2327, "rewards/accuracies": 0.875, "rewards/chosen": -2.5947234630584717, "rewards/margins": 3.1402082443237305, "rewards/rejected": -5.734931945800781, "step": 426 }, { "epoch": 0.8102466793168881, "grad_norm": 3.158411979675293, "learning_rate": 2.586629891850716e-05, "logits/chosen": -2.8103625774383545, "logits/rejected": -2.81091570854187, "logps/chosen": -46.74591064453125, "logps/rejected": -72.90126037597656, "loss": 0.4351, "rewards/accuracies": 0.875, "rewards/chosen": -2.266634702682495, "rewards/margins": 2.8894031047821045, "rewards/rejected": -5.1560378074646, "step": 427 }, { "epoch": 0.8121442125237192, "grad_norm": 2.506629467010498, "learning_rate": 2.537453315882222e-05, "logits/chosen": -2.887596368789673, "logits/rejected": -2.88572359085083, "logps/chosen": -51.39200973510742, "logps/rejected": -80.5135498046875, "loss": 0.3368, "rewards/accuracies": 0.875, "rewards/chosen": -2.7697482109069824, "rewards/margins": 3.133552074432373, "rewards/rejected": -5.9033002853393555, "step": 428 }, { "epoch": 0.8140417457305503, "grad_norm": 1.5517326593399048, "learning_rate": 2.4886806912948035e-05, "logits/chosen": -2.8428924083709717, "logits/rejected": -2.8404133319854736, "logps/chosen": -50.483375549316406, "logps/rejected": -78.89204406738281, "loss": 0.1657, "rewards/accuracies": 0.9375, "rewards/chosen": -2.781102180480957, "rewards/margins": 2.961219072341919, "rewards/rejected": -5.742321014404297, "step": 429 }, { "epoch": 0.8159392789373814, "grad_norm": 1.744879961013794, "learning_rate": 2.4403146581749925e-05, "logits/chosen": -2.8795626163482666, "logits/rejected": -2.8794806003570557, "logps/chosen": -54.82472229003906, "logps/rejected": -84.26485443115234, "loss": 0.2315, "rewards/accuracies": 0.875, "rewards/chosen": -3.0898611545562744, "rewards/margins": 3.029693365097046, "rewards/rejected": -6.11955451965332, "step": 430 }, { "epoch": 0.8178368121442126, "grad_norm": 0.5849810242652893, "learning_rate": 2.3923578346003363e-05, "logits/chosen": -2.888624906539917, "logits/rejected": -2.882172107696533, "logps/chosen": -43.547874450683594, "logps/rejected": -82.17684173583984, "loss": 0.0473, "rewards/accuracies": 1.0, "rewards/chosen": -2.0178041458129883, "rewards/margins": 4.022542953491211, "rewards/rejected": -6.040347099304199, "step": 431 }, { "epoch": 0.8197343453510436, "grad_norm": 1.854899525642395, "learning_rate": 2.344812816497659e-05, "logits/chosen": -2.8812174797058105, "logits/rejected": -2.8820064067840576, "logps/chosen": -48.730194091796875, "logps/rejected": -85.76399230957031, "loss": 0.2789, "rewards/accuracies": 0.9375, "rewards/chosen": -2.402095079421997, "rewards/margins": 3.9352121353149414, "rewards/rejected": -6.337306976318359, "step": 432 }, { "epoch": 0.8216318785578748, "grad_norm": 1.663291335105896, "learning_rate": 2.2976821775025457e-05, "logits/chosen": -2.8493640422821045, "logits/rejected": -2.8438591957092285, "logps/chosen": -55.23139572143555, "logps/rejected": -83.07640075683594, "loss": 0.2587, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1326611042022705, "rewards/margins": 2.9811367988586426, "rewards/rejected": -6.113798141479492, "step": 433 }, { "epoch": 0.8235294117647058, "grad_norm": 2.9149162769317627, "learning_rate": 2.2509684688200384e-05, "logits/chosen": -2.8712880611419678, "logits/rejected": -2.8701071739196777, "logps/chosen": -65.67572021484375, "logps/rejected": -84.64015197753906, "loss": 0.4566, "rewards/accuracies": 0.875, "rewards/chosen": -4.112061500549316, "rewards/margins": 2.053865671157837, "rewards/rejected": -6.165927886962891, "step": 434 }, { "epoch": 0.825426944971537, "grad_norm": 1.1507574319839478, "learning_rate": 2.204674219086531e-05, "logits/chosen": -2.8618247509002686, "logits/rejected": -2.865861415863037, "logps/chosen": -58.50060272216797, "logps/rejected": -87.97689819335938, "loss": 0.0886, "rewards/accuracies": 1.0, "rewards/chosen": -3.454524040222168, "rewards/margins": 3.154420852661133, "rewards/rejected": -6.608944892883301, "step": 435 }, { "epoch": 0.8273244781783681, "grad_norm": 2.227458953857422, "learning_rate": 2.1588019342328968e-05, "logits/chosen": -2.8568058013916016, "logits/rejected": -2.850839614868164, "logps/chosen": -52.666900634765625, "logps/rejected": -85.18878936767578, "loss": 0.2316, "rewards/accuracies": 0.9375, "rewards/chosen": -2.931042194366455, "rewards/margins": 3.4106945991516113, "rewards/rejected": -6.341736793518066, "step": 436 }, { "epoch": 0.8292220113851992, "grad_norm": 2.9324381351470947, "learning_rate": 2.1133540973488342e-05, "logits/chosen": -2.854193687438965, "logits/rejected": -2.8481411933898926, "logps/chosen": -41.16516876220703, "logps/rejected": -77.86811828613281, "loss": 0.2019, "rewards/accuracies": 0.875, "rewards/chosen": -1.6302728652954102, "rewards/margins": 3.9812991619110107, "rewards/rejected": -5.611572265625, "step": 437 }, { "epoch": 0.8311195445920304, "grad_norm": 1.595365285873413, "learning_rate": 2.0683331685484652e-05, "logits/chosen": -2.834395170211792, "logits/rejected": -2.8295483589172363, "logps/chosen": -50.14398956298828, "logps/rejected": -86.78340148925781, "loss": 0.1211, "rewards/accuracies": 0.9375, "rewards/chosen": -2.632577896118164, "rewards/margins": 3.9267497062683105, "rewards/rejected": -6.559328079223633, "step": 438 }, { "epoch": 0.8330170777988615, "grad_norm": 2.071730852127075, "learning_rate": 2.0237415848371667e-05, "logits/chosen": -2.8482611179351807, "logits/rejected": -2.8530232906341553, "logps/chosen": -50.99557113647461, "logps/rejected": -84.28203582763672, "loss": 0.2539, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6730546951293945, "rewards/margins": 3.566554307937622, "rewards/rejected": -6.2396087646484375, "step": 439 }, { "epoch": 0.8349146110056926, "grad_norm": 1.4619513750076294, "learning_rate": 1.9795817599796418e-05, "logits/chosen": -2.8789496421813965, "logits/rejected": -2.876634120941162, "logps/chosen": -46.09120559692383, "logps/rejected": -80.81315612792969, "loss": 0.1742, "rewards/accuracies": 0.9375, "rewards/chosen": -2.269618034362793, "rewards/margins": 3.6128640174865723, "rewards/rejected": -5.882482051849365, "step": 440 }, { "epoch": 0.8368121442125237, "grad_norm": 2.196380376815796, "learning_rate": 1.9358560843692787e-05, "logits/chosen": -2.8895583152770996, "logits/rejected": -2.890669345855713, "logps/chosen": -57.63343811035156, "logps/rejected": -77.12810516357422, "loss": 0.435, "rewards/accuracies": 0.875, "rewards/chosen": -3.4425575733184814, "rewards/margins": 1.9679261445999146, "rewards/rejected": -5.4104838371276855, "step": 441 }, { "epoch": 0.8387096774193549, "grad_norm": 0.8209553956985474, "learning_rate": 1.892566924898751e-05, "logits/chosen": -2.870082139968872, "logits/rejected": -2.8673083782196045, "logps/chosen": -58.87828063964844, "logps/rejected": -95.53388977050781, "loss": 0.1457, "rewards/accuracies": 0.9375, "rewards/chosen": -3.519439697265625, "rewards/margins": 3.827193260192871, "rewards/rejected": -7.346632957458496, "step": 442 }, { "epoch": 0.8406072106261859, "grad_norm": 3.9470529556274414, "learning_rate": 1.8497166248318876e-05, "logits/chosen": -2.847348213195801, "logits/rejected": -2.8416690826416016, "logps/chosen": -54.76271057128906, "logps/rejected": -77.33283233642578, "loss": 0.5914, "rewards/accuracies": 0.75, "rewards/chosen": -3.093599796295166, "rewards/margins": 2.3917601108551025, "rewards/rejected": -5.485360145568848, "step": 443 }, { "epoch": 0.8425047438330171, "grad_norm": 1.719234824180603, "learning_rate": 1.807307503676846e-05, "logits/chosen": -2.855872392654419, "logits/rejected": -2.8510212898254395, "logps/chosen": -50.66022491455078, "logps/rejected": -79.40420532226562, "loss": 0.2117, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6416897773742676, "rewards/margins": 3.0230698585510254, "rewards/rejected": -5.664760112762451, "step": 444 }, { "epoch": 0.8444022770398482, "grad_norm": 2.0453107357025146, "learning_rate": 1.7653418570605475e-05, "logits/chosen": -2.822150468826294, "logits/rejected": -2.827209711074829, "logps/chosen": -46.558135986328125, "logps/rejected": -78.79747009277344, "loss": 0.1741, "rewards/accuracies": 0.9375, "rewards/chosen": -2.175271987915039, "rewards/margins": 3.4841248989105225, "rewards/rejected": -5.659396648406982, "step": 445 }, { "epoch": 0.8462998102466793, "grad_norm": 0.71958988904953, "learning_rate": 1.7238219566044145e-05, "logits/chosen": -2.831434726715088, "logits/rejected": -2.8327438831329346, "logps/chosen": -48.3876953125, "logps/rejected": -85.19735717773438, "loss": 0.0798, "rewards/accuracies": 1.0, "rewards/chosen": -2.3890655040740967, "rewards/margins": 3.872114419937134, "rewards/rejected": -6.2611799240112305, "step": 446 }, { "epoch": 0.8481973434535104, "grad_norm": 2.318376302719116, "learning_rate": 1.6827500498014025e-05, "logits/chosen": -2.8484344482421875, "logits/rejected": -2.845217704772949, "logps/chosen": -48.784584045410156, "logps/rejected": -87.87923431396484, "loss": 0.1897, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4254283905029297, "rewards/margins": 4.209994792938232, "rewards/rejected": -6.635422706604004, "step": 447 }, { "epoch": 0.8500948766603416, "grad_norm": 0.7029361724853516, "learning_rate": 1.6421283598943528e-05, "logits/chosen": -2.8784377574920654, "logits/rejected": -2.876771926879883, "logps/chosen": -49.851253509521484, "logps/rejected": -87.93343353271484, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": -2.5630671977996826, "rewards/margins": 4.124255180358887, "rewards/rejected": -6.687322616577148, "step": 448 }, { "epoch": 0.8519924098671727, "grad_norm": 1.5745196342468262, "learning_rate": 1.601959085755641e-05, "logits/chosen": -2.8467390537261963, "logits/rejected": -2.845139265060425, "logps/chosen": -50.44337844848633, "logps/rejected": -85.5137710571289, "loss": 0.1323, "rewards/accuracies": 0.9375, "rewards/chosen": -2.772374153137207, "rewards/margins": 3.719602346420288, "rewards/rejected": -6.491976737976074, "step": 449 }, { "epoch": 0.8538899430740038, "grad_norm": 2.028979778289795, "learning_rate": 1.562244401768144e-05, "logits/chosen": -2.8904149532318115, "logits/rejected": -2.884728193283081, "logps/chosen": -51.37632369995117, "logps/rejected": -79.03163146972656, "loss": 0.1726, "rewards/accuracies": 0.875, "rewards/chosen": -2.6548404693603516, "rewards/margins": 3.043051242828369, "rewards/rejected": -5.697892189025879, "step": 450 }, { "epoch": 0.855787476280835, "grad_norm": 3.1051995754241943, "learning_rate": 1.5229864577075547e-05, "logits/chosen": -2.870203971862793, "logits/rejected": -2.868276357650757, "logps/chosen": -51.94432830810547, "logps/rejected": -81.9127426147461, "loss": 0.4108, "rewards/accuracies": 0.875, "rewards/chosen": -2.717585802078247, "rewards/margins": 3.2403957843780518, "rewards/rejected": -5.957981586456299, "step": 451 }, { "epoch": 0.857685009487666, "grad_norm": 0.38512659072875977, "learning_rate": 1.484187378626002e-05, "logits/chosen": -2.884380578994751, "logits/rejected": -2.87831974029541, "logps/chosen": -49.78013229370117, "logps/rejected": -95.21629333496094, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -2.50553560256958, "rewards/margins": 4.855525970458984, "rewards/rejected": -7.361061096191406, "step": 452 }, { "epoch": 0.8595825426944972, "grad_norm": 2.417759656906128, "learning_rate": 1.4458492647370258e-05, "logits/chosen": -2.8602285385131836, "logits/rejected": -2.860398530960083, "logps/chosen": -54.827938079833984, "logps/rejected": -87.94296264648438, "loss": 0.2447, "rewards/accuracies": 0.875, "rewards/chosen": -3.203274726867676, "rewards/margins": 3.4562509059906006, "rewards/rejected": -6.6595258712768555, "step": 453 }, { "epoch": 0.8614800759013282, "grad_norm": 2.0201404094696045, "learning_rate": 1.4079741913018863e-05, "logits/chosen": -2.859805107116699, "logits/rejected": -2.8590550422668457, "logps/chosen": -49.323951721191406, "logps/rejected": -84.20152282714844, "loss": 0.139, "rewards/accuracies": 0.9375, "rewards/chosen": -2.51802396774292, "rewards/margins": 3.6508617401123047, "rewards/rejected": -6.168885707855225, "step": 454 }, { "epoch": 0.8633776091081594, "grad_norm": 1.3853485584259033, "learning_rate": 1.3705642085172366e-05, "logits/chosen": -2.845665454864502, "logits/rejected": -2.8443827629089355, "logps/chosen": -41.36186218261719, "logps/rejected": -85.60066223144531, "loss": 0.0883, "rewards/accuracies": 1.0, "rewards/chosen": -1.6150336265563965, "rewards/margins": 4.706649303436279, "rewards/rejected": -6.321683406829834, "step": 455 }, { "epoch": 0.8652751423149905, "grad_norm": 0.7386859059333801, "learning_rate": 1.3336213414041387e-05, "logits/chosen": -2.8452632427215576, "logits/rejected": -2.8436524868011475, "logps/chosen": -35.882301330566406, "logps/rejected": -82.32295227050781, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": -1.1484332084655762, "rewards/margins": 4.846029281616211, "rewards/rejected": -5.994462490081787, "step": 456 }, { "epoch": 0.8671726755218216, "grad_norm": 1.8411693572998047, "learning_rate": 1.2971475896984475e-05, "logits/chosen": -2.8674824237823486, "logits/rejected": -2.86271595954895, "logps/chosen": -55.15509796142578, "logps/rejected": -81.25284576416016, "loss": 0.2226, "rewards/accuracies": 0.875, "rewards/chosen": -3.0428030490875244, "rewards/margins": 2.7782084941864014, "rewards/rejected": -5.821011543273926, "step": 457 }, { "epoch": 0.8690702087286527, "grad_norm": 0.8854163885116577, "learning_rate": 1.2611449277425713e-05, "logits/chosen": -2.8654544353485107, "logits/rejected": -2.8657302856445312, "logps/chosen": -56.486427307128906, "logps/rejected": -95.68757629394531, "loss": 0.0783, "rewards/accuracies": 1.0, "rewards/chosen": -3.267014980316162, "rewards/margins": 4.12831449508667, "rewards/rejected": -7.395329475402832, "step": 458 }, { "epoch": 0.8709677419354839, "grad_norm": 2.766991376876831, "learning_rate": 1.2256153043785912e-05, "logits/chosen": -2.8615031242370605, "logits/rejected": -2.859715223312378, "logps/chosen": -59.30867004394531, "logps/rejected": -87.28678131103516, "loss": 0.3159, "rewards/accuracies": 0.8125, "rewards/chosen": -3.432471990585327, "rewards/margins": 3.0275392532348633, "rewards/rejected": -6.4600114822387695, "step": 459 }, { "epoch": 0.872865275142315, "grad_norm": 1.7893755435943604, "learning_rate": 1.1905606428427774e-05, "logits/chosen": -2.881412982940674, "logits/rejected": -2.8749964237213135, "logps/chosen": -49.450645446777344, "logps/rejected": -82.91507720947266, "loss": 0.2335, "rewards/accuracies": 0.875, "rewards/chosen": -2.522357225418091, "rewards/margins": 3.599726676940918, "rewards/rejected": -6.122084140777588, "step": 460 }, { "epoch": 0.8747628083491461, "grad_norm": 1.8468047380447388, "learning_rate": 1.1559828406614714e-05, "logits/chosen": -2.8736824989318848, "logits/rejected": -2.8692877292633057, "logps/chosen": -48.415138244628906, "logps/rejected": -77.85014343261719, "loss": 0.1343, "rewards/accuracies": 1.0, "rewards/chosen": -2.3501672744750977, "rewards/margins": 3.143881320953369, "rewards/rejected": -5.494048118591309, "step": 461 }, { "epoch": 0.8766603415559773, "grad_norm": 3.239903688430786, "learning_rate": 1.1218837695483853e-05, "logits/chosen": -2.856627941131592, "logits/rejected": -2.8530499935150146, "logps/chosen": -48.137454986572266, "logps/rejected": -82.09251403808594, "loss": 0.334, "rewards/accuracies": 0.9375, "rewards/chosen": -2.357161045074463, "rewards/margins": 3.6000640392303467, "rewards/rejected": -5.957225322723389, "step": 462 }, { "epoch": 0.8785578747628083, "grad_norm": 2.1839210987091064, "learning_rate": 1.0882652753032795e-05, "logits/chosen": -2.872593402862549, "logits/rejected": -2.8667614459991455, "logps/chosen": -51.027793884277344, "logps/rejected": -88.22506713867188, "loss": 0.1381, "rewards/accuracies": 0.875, "rewards/chosen": -2.631429672241211, "rewards/margins": 3.949152946472168, "rewards/rejected": -6.580582618713379, "step": 463 }, { "epoch": 0.8804554079696395, "grad_norm": 2.2396671772003174, "learning_rate": 1.0551291777120464e-05, "logits/chosen": -2.873810052871704, "logits/rejected": -2.871718168258667, "logps/chosen": -60.82017517089844, "logps/rejected": -88.28251647949219, "loss": 0.2474, "rewards/accuracies": 0.9375, "rewards/chosen": -3.7728419303894043, "rewards/margins": 2.73274302482605, "rewards/rejected": -6.505584716796875, "step": 464 }, { "epoch": 0.8823529411764706, "grad_norm": 2.284513473510742, "learning_rate": 1.0224772704482033e-05, "logits/chosen": -2.8778035640716553, "logits/rejected": -2.8775930404663086, "logps/chosen": -49.48849105834961, "logps/rejected": -84.93687438964844, "loss": 0.349, "rewards/accuracies": 0.9375, "rewards/chosen": -2.605888605117798, "rewards/margins": 3.584686279296875, "rewards/rejected": -6.190574645996094, "step": 465 }, { "epoch": 0.8842504743833017, "grad_norm": 3.715801477432251, "learning_rate": 9.903113209758096e-06, "logits/chosen": -2.8709402084350586, "logits/rejected": -2.8691956996917725, "logps/chosen": -52.825950622558594, "logps/rejected": -80.30274200439453, "loss": 0.3896, "rewards/accuracies": 0.875, "rewards/chosen": -2.904201030731201, "rewards/margins": 2.963268280029297, "rewards/rejected": -5.867469787597656, "step": 466 }, { "epoch": 0.8861480075901328, "grad_norm": 1.438846230506897, "learning_rate": 9.586330704537849e-06, "logits/chosen": -2.8450632095336914, "logits/rejected": -2.8445072174072266, "logps/chosen": -53.51820755004883, "logps/rejected": -85.2142333984375, "loss": 0.1714, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0311946868896484, "rewards/margins": 3.2895748615264893, "rewards/rejected": -6.320769786834717, "step": 467 }, { "epoch": 0.888045540796964, "grad_norm": 2.9390602111816406, "learning_rate": 9.274442336416567e-06, "logits/chosen": -2.845296621322632, "logits/rejected": -2.842207431793213, "logps/chosen": -51.43974304199219, "logps/rejected": -87.38579559326172, "loss": 0.1854, "rewards/accuracies": 0.875, "rewards/chosen": -2.63873028755188, "rewards/margins": 3.945521354675293, "rewards/rejected": -6.584251403808594, "step": 468 }, { "epoch": 0.889943074003795, "grad_norm": 2.258697032928467, "learning_rate": 8.967464988067475e-06, "logits/chosen": -2.887526512145996, "logits/rejected": -2.8856074810028076, "logps/chosen": -46.49995422363281, "logps/rejected": -79.75428771972656, "loss": 0.2359, "rewards/accuracies": 0.875, "rewards/chosen": -2.2005090713500977, "rewards/margins": 3.588965892791748, "rewards/rejected": -5.7894744873046875, "step": 469 }, { "epoch": 0.8918406072106262, "grad_norm": 0.39533525705337524, "learning_rate": 8.665415276327871e-06, "logits/chosen": -2.8955729007720947, "logits/rejected": -2.888808012008667, "logps/chosen": -47.26777648925781, "logps/rejected": -95.47332000732422, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": -2.2491884231567383, "rewards/margins": 5.138714790344238, "rewards/rejected": -7.387903213500977, "step": 470 }, { "epoch": 0.8937381404174574, "grad_norm": 1.2186378240585327, "learning_rate": 8.368309551299536e-06, "logits/chosen": -2.856180429458618, "logits/rejected": -2.849057912826538, "logps/chosen": -50.02391052246094, "logps/rejected": -84.98136138916016, "loss": 0.0835, "rewards/accuracies": 1.0, "rewards/chosen": -2.512538433074951, "rewards/margins": 3.8806662559509277, "rewards/rejected": -6.393204689025879, "step": 471 }, { "epoch": 0.8956356736242884, "grad_norm": 4.68803071975708, "learning_rate": 8.076163895463861e-06, "logits/chosen": -2.8694870471954346, "logits/rejected": -2.8696372509002686, "logps/chosen": -63.10512924194336, "logps/rejected": -81.60807037353516, "loss": 0.5112, "rewards/accuracies": 0.75, "rewards/chosen": -3.999472141265869, "rewards/margins": 1.9333720207214355, "rewards/rejected": -5.932844161987305, "step": 472 }, { "epoch": 0.8975332068311196, "grad_norm": 1.2908293008804321, "learning_rate": 7.788994122811178e-06, "logits/chosen": -2.8755910396575928, "logits/rejected": -2.874612331390381, "logps/chosen": -54.27519989013672, "logps/rejected": -92.8865966796875, "loss": 0.1045, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9691150188446045, "rewards/margins": 4.181435585021973, "rewards/rejected": -7.150550842285156, "step": 473 }, { "epoch": 0.8994307400379506, "grad_norm": 1.4010708332061768, "learning_rate": 7.506815777984788e-06, "logits/chosen": -2.865668773651123, "logits/rejected": -2.86503267288208, "logps/chosen": -51.853553771972656, "logps/rejected": -90.462158203125, "loss": 0.0805, "rewards/accuracies": 1.0, "rewards/chosen": -2.75411319732666, "rewards/margins": 4.066606521606445, "rewards/rejected": -6.8207197189331055, "step": 474 }, { "epoch": 0.9013282732447818, "grad_norm": 0.9278813600540161, "learning_rate": 7.229644135439473e-06, "logits/chosen": -2.874293804168701, "logits/rejected": -2.873574733734131, "logps/chosen": -51.141422271728516, "logps/rejected": -86.1817855834961, "loss": 0.0876, "rewards/accuracies": 1.0, "rewards/chosen": -2.603024959564209, "rewards/margins": 3.8318567276000977, "rewards/rejected": -6.434881687164307, "step": 475 }, { "epoch": 0.9032258064516129, "grad_norm": 2.820253372192383, "learning_rate": 6.957494198614778e-06, "logits/chosen": -2.8808329105377197, "logits/rejected": -2.8789174556732178, "logps/chosen": -52.99219512939453, "logps/rejected": -86.43820190429688, "loss": 0.1575, "rewards/accuracies": 0.9375, "rewards/chosen": -2.9275622367858887, "rewards/margins": 3.5494418144226074, "rewards/rejected": -6.477004051208496, "step": 476 }, { "epoch": 0.905123339658444, "grad_norm": 3.0501718521118164, "learning_rate": 6.690380699122767e-06, "logits/chosen": -2.8456592559814453, "logits/rejected": -2.848313093185425, "logps/chosen": -44.644290924072266, "logps/rejected": -78.11421966552734, "loss": 0.1851, "rewards/accuracies": 0.875, "rewards/chosen": -2.182406425476074, "rewards/margins": 3.503129482269287, "rewards/rejected": -5.685535430908203, "step": 477 }, { "epoch": 0.9070208728652751, "grad_norm": 3.485112190246582, "learning_rate": 6.428318095950647e-06, "logits/chosen": -2.8692822456359863, "logits/rejected": -2.8702754974365234, "logps/chosen": -55.28034973144531, "logps/rejected": -90.0372314453125, "loss": 0.2902, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2888870239257812, "rewards/margins": 3.50645112991333, "rewards/rejected": -6.795337677001953, "step": 478 }, { "epoch": 0.9089184060721063, "grad_norm": 2.162829637527466, "learning_rate": 6.171320574678063e-06, "logits/chosen": -2.8754329681396484, "logits/rejected": -2.8744900226593018, "logps/chosen": -46.66167068481445, "logps/rejected": -88.14268493652344, "loss": 0.1513, "rewards/accuracies": 0.9375, "rewards/chosen": -2.3261728286743164, "rewards/margins": 4.249751567840576, "rewards/rejected": -6.575924873352051, "step": 479 }, { "epoch": 0.9108159392789373, "grad_norm": 3.0580861568450928, "learning_rate": 5.919402046709288e-06, "logits/chosen": -2.8514211177825928, "logits/rejected": -2.848707675933838, "logps/chosen": -52.81618118286133, "logps/rejected": -86.4152603149414, "loss": 0.2967, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9318675994873047, "rewards/margins": 3.5067787170410156, "rewards/rejected": -6.43864631652832, "step": 480 }, { "epoch": 0.9127134724857685, "grad_norm": 3.6979422569274902, "learning_rate": 5.672576148520137e-06, "logits/chosen": -2.889620065689087, "logits/rejected": -2.8883321285247803, "logps/chosen": -60.14141082763672, "logps/rejected": -79.537109375, "loss": 0.7965, "rewards/accuracies": 0.75, "rewards/chosen": -3.719564437866211, "rewards/margins": 2.0429325103759766, "rewards/rejected": -5.7624969482421875, "step": 481 }, { "epoch": 0.9146110056925996, "grad_norm": 3.231797933578491, "learning_rate": 5.430856240919779e-06, "logits/chosen": -2.8718159198760986, "logits/rejected": -2.8671488761901855, "logps/chosen": -59.959632873535156, "logps/rejected": -91.78632354736328, "loss": 0.3767, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6989855766296387, "rewards/margins": 3.2586991786956787, "rewards/rejected": -6.9576849937438965, "step": 482 }, { "epoch": 0.9165085388994307, "grad_norm": 1.663153886795044, "learning_rate": 5.194255408327619e-06, "logits/chosen": -2.8569464683532715, "logits/rejected": -2.851780652999878, "logps/chosen": -54.817230224609375, "logps/rejected": -77.68112182617188, "loss": 0.1908, "rewards/accuracies": 0.9375, "rewards/chosen": -3.1171646118164062, "rewards/margins": 2.463062286376953, "rewards/rejected": -5.580226898193359, "step": 483 }, { "epoch": 0.9184060721062619, "grad_norm": 4.839911937713623, "learning_rate": 4.962786458064972e-06, "logits/chosen": -2.8591115474700928, "logits/rejected": -2.8583099842071533, "logps/chosen": -51.154937744140625, "logps/rejected": -89.58143615722656, "loss": 0.514, "rewards/accuracies": 0.8125, "rewards/chosen": -2.68982195854187, "rewards/margins": 4.086490631103516, "rewards/rejected": -6.776312828063965, "step": 484 }, { "epoch": 0.920303605313093, "grad_norm": 2.3768200874328613, "learning_rate": 4.7364619196617495e-06, "logits/chosen": -2.891602039337158, "logits/rejected": -2.8876867294311523, "logps/chosen": -48.912391662597656, "logps/rejected": -88.95805358886719, "loss": 0.1241, "rewards/accuracies": 0.9375, "rewards/chosen": -2.4300827980041504, "rewards/margins": 4.336850643157959, "rewards/rejected": -6.766933441162109, "step": 485 }, { "epoch": 0.9222011385199241, "grad_norm": 1.8126496076583862, "learning_rate": 4.515294044178331e-06, "logits/chosen": -2.8283488750457764, "logits/rejected": -2.826066493988037, "logps/chosen": -58.70185470581055, "logps/rejected": -91.96338653564453, "loss": 0.1629, "rewards/accuracies": 0.875, "rewards/chosen": -3.5918807983398438, "rewards/margins": 3.4037351608276367, "rewards/rejected": -6.9956159591674805, "step": 486 }, { "epoch": 0.9240986717267552, "grad_norm": 1.346358060836792, "learning_rate": 4.299294803542331e-06, "logits/chosen": -2.8834807872772217, "logits/rejected": -2.8820314407348633, "logps/chosen": -47.56989288330078, "logps/rejected": -80.78783416748047, "loss": 0.1421, "rewards/accuracies": 1.0, "rewards/chosen": -2.3317975997924805, "rewards/margins": 3.4790170192718506, "rewards/rejected": -5.810814380645752, "step": 487 }, { "epoch": 0.9259962049335864, "grad_norm": 3.8116936683654785, "learning_rate": 4.0884758899006e-06, "logits/chosen": -2.836764335632324, "logits/rejected": -2.840043783187866, "logps/chosen": -50.94938659667969, "logps/rejected": -78.5450210571289, "loss": 0.4692, "rewards/accuracies": 0.8125, "rewards/chosen": -2.733532428741455, "rewards/margins": 2.9857707023620605, "rewards/rejected": -5.719302654266357, "step": 488 }, { "epoch": 0.9278937381404174, "grad_norm": 3.3547513484954834, "learning_rate": 3.882848714986243e-06, "logits/chosen": -2.890246868133545, "logits/rejected": -2.886960506439209, "logps/chosen": -57.828041076660156, "logps/rejected": -84.50794219970703, "loss": 0.2635, "rewards/accuracies": 0.8125, "rewards/chosen": -3.4307875633239746, "rewards/margins": 2.7383012771606445, "rewards/rejected": -6.169088840484619, "step": 489 }, { "epoch": 0.9297912713472486, "grad_norm": 1.3184571266174316, "learning_rate": 3.6824244095010065e-06, "logits/chosen": -2.83508563041687, "logits/rejected": -2.8344218730926514, "logps/chosen": -46.53615951538086, "logps/rejected": -92.15784454345703, "loss": 0.0859, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1809654235839844, "rewards/margins": 4.734975814819336, "rewards/rejected": -6.9159417152404785, "step": 490 }, { "epoch": 0.9316888045540797, "grad_norm": 2.9140141010284424, "learning_rate": 3.487213822512714e-06, "logits/chosen": -2.8875601291656494, "logits/rejected": -2.8827908039093018, "logps/chosen": -50.426536560058594, "logps/rejected": -89.96681213378906, "loss": 0.3374, "rewards/accuracies": 0.9375, "rewards/chosen": -2.6878647804260254, "rewards/margins": 4.1089630126953125, "rewards/rejected": -6.796828269958496, "step": 491 }, { "epoch": 0.9335863377609108, "grad_norm": 1.0782984495162964, "learning_rate": 3.2972275208679625e-06, "logits/chosen": -2.8685128688812256, "logits/rejected": -2.8658132553100586, "logps/chosen": -48.13031768798828, "logps/rejected": -84.63717651367188, "loss": 0.0898, "rewards/accuracies": 1.0, "rewards/chosen": -2.3174753189086914, "rewards/margins": 3.971602439880371, "rewards/rejected": -6.2890777587890625, "step": 492 }, { "epoch": 0.9354838709677419, "grad_norm": 3.8515820503234863, "learning_rate": 3.112475788620217e-06, "logits/chosen": -2.8882944583892822, "logits/rejected": -2.8865597248077393, "logps/chosen": -49.90972900390625, "logps/rejected": -77.10386657714844, "loss": 0.3545, "rewards/accuracies": 0.875, "rewards/chosen": -2.617600202560425, "rewards/margins": 2.900125741958618, "rewards/rejected": -5.517725944519043, "step": 493 }, { "epoch": 0.937381404174573, "grad_norm": 3.0631866455078125, "learning_rate": 2.932968626473065e-06, "logits/chosen": -2.8472578525543213, "logits/rejected": -2.8467025756835938, "logps/chosen": -48.0924072265625, "logps/rejected": -73.13133239746094, "loss": 0.5153, "rewards/accuracies": 0.75, "rewards/chosen": -2.564882278442383, "rewards/margins": 2.5196237564086914, "rewards/rejected": -5.084506034851074, "step": 494 }, { "epoch": 0.9392789373814042, "grad_norm": 2.3246684074401855, "learning_rate": 2.7587157512388718e-06, "logits/chosen": -2.8876593112945557, "logits/rejected": -2.886383295059204, "logps/chosen": -48.26333999633789, "logps/rejected": -82.16961669921875, "loss": 0.2638, "rewards/accuracies": 0.875, "rewards/chosen": -2.446296215057373, "rewards/margins": 3.771624803543091, "rewards/rejected": -6.217921257019043, "step": 495 }, { "epoch": 0.9411764705882353, "grad_norm": 0.9831016063690186, "learning_rate": 2.589726595312858e-06, "logits/chosen": -2.8822598457336426, "logits/rejected": -2.879408121109009, "logps/chosen": -42.318702697753906, "logps/rejected": -80.41962432861328, "loss": 0.0659, "rewards/accuracies": 1.0, "rewards/chosen": -1.6236395835876465, "rewards/margins": 4.238935470581055, "rewards/rejected": -5.862575531005859, "step": 496 }, { "epoch": 0.9430740037950665, "grad_norm": 1.4202706813812256, "learning_rate": 2.426010306162485e-06, "logits/chosen": -2.83363938331604, "logits/rejected": -2.8289742469787598, "logps/chosen": -51.449668884277344, "logps/rejected": -89.0745849609375, "loss": 0.1617, "rewards/accuracies": 0.9375, "rewards/chosen": -2.716837167739868, "rewards/margins": 4.036784648895264, "rewards/rejected": -6.753622055053711, "step": 497 }, { "epoch": 0.9449715370018975, "grad_norm": 0.9878135323524475, "learning_rate": 2.2675757458323065e-06, "logits/chosen": -2.8792364597320557, "logits/rejected": -2.8786449432373047, "logps/chosen": -48.229270935058594, "logps/rejected": -84.9887466430664, "loss": 0.0762, "rewards/accuracies": 1.0, "rewards/chosen": -2.416011095046997, "rewards/margins": 3.865017890930176, "rewards/rejected": -6.281028747558594, "step": 498 }, { "epoch": 0.9468690702087287, "grad_norm": 1.8558670282363892, "learning_rate": 2.1144314904642195e-06, "logits/chosen": -2.8796749114990234, "logits/rejected": -2.8758296966552734, "logps/chosen": -43.505531311035156, "logps/rejected": -75.1484146118164, "loss": 0.2833, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8704358339309692, "rewards/margins": 3.288053274154663, "rewards/rejected": -5.158489227294922, "step": 499 }, { "epoch": 0.9487666034155597, "grad_norm": 1.5600035190582275, "learning_rate": 1.9665858298333005e-06, "logits/chosen": -2.855041027069092, "logits/rejected": -2.856189727783203, "logps/chosen": -45.85496139526367, "logps/rejected": -85.69245147705078, "loss": 0.1211, "rewards/accuracies": 0.9375, "rewards/chosen": -2.0445146560668945, "rewards/margins": 4.26096248626709, "rewards/rejected": -6.305477142333984, "step": 500 }, { "epoch": 0.9506641366223909, "grad_norm": 2.5038869380950928, "learning_rate": 1.8240467668990457e-06, "logits/chosen": -2.883979082107544, "logits/rejected": -2.8837075233459473, "logps/chosen": -55.38407516479492, "logps/rejected": -86.02963256835938, "loss": 0.2542, "rewards/accuracies": 0.8125, "rewards/chosen": -3.258172035217285, "rewards/margins": 3.1368069648742676, "rewards/rejected": -6.394979000091553, "step": 501 }, { "epoch": 0.952561669829222, "grad_norm": 2.0583744049072266, "learning_rate": 1.6868220173721471e-06, "logits/chosen": -2.8597681522369385, "logits/rejected": -2.8624424934387207, "logps/chosen": -51.48234939575195, "logps/rejected": -80.75935363769531, "loss": 0.3349, "rewards/accuracies": 0.8125, "rewards/chosen": -2.759897232055664, "rewards/margins": 3.021780014038086, "rewards/rejected": -5.78167724609375, "step": 502 }, { "epoch": 0.9544592030360531, "grad_norm": 2.2353053092956543, "learning_rate": 1.5549190092968736e-06, "logits/chosen": -2.8665668964385986, "logits/rejected": -2.8670010566711426, "logps/chosen": -50.2732048034668, "logps/rejected": -88.10029602050781, "loss": 0.386, "rewards/accuracies": 0.9375, "rewards/chosen": -2.653740406036377, "rewards/margins": 4.059865951538086, "rewards/rejected": -6.713606834411621, "step": 503 }, { "epoch": 0.9563567362428842, "grad_norm": 2.396918773651123, "learning_rate": 1.4283448826489798e-06, "logits/chosen": -2.8291938304901123, "logits/rejected": -2.8266398906707764, "logps/chosen": -46.44121551513672, "logps/rejected": -79.84049987792969, "loss": 0.2428, "rewards/accuracies": 0.875, "rewards/chosen": -2.244828701019287, "rewards/margins": 3.5079965591430664, "rewards/rejected": -5.7528252601623535, "step": 504 }, { "epoch": 0.9582542694497154, "grad_norm": 1.3848168849945068, "learning_rate": 1.3071064889491724e-06, "logits/chosen": -2.8477964401245117, "logits/rejected": -2.8448750972747803, "logps/chosen": -41.835227966308594, "logps/rejected": -74.19090270996094, "loss": 0.1726, "rewards/accuracies": 0.875, "rewards/chosen": -1.6162993907928467, "rewards/margins": 3.534118890762329, "rewards/rejected": -5.150418281555176, "step": 505 }, { "epoch": 0.9601518026565465, "grad_norm": 1.7604390382766724, "learning_rate": 1.1912103908922945e-06, "logits/chosen": -2.871805429458618, "logits/rejected": -2.8669626712799072, "logps/chosen": -49.46722412109375, "logps/rejected": -83.61421203613281, "loss": 0.2727, "rewards/accuracies": 0.875, "rewards/chosen": -2.5857064723968506, "rewards/margins": 3.5844779014587402, "rewards/rejected": -6.170184135437012, "step": 506 }, { "epoch": 0.9620493358633776, "grad_norm": 2.633267641067505, "learning_rate": 1.0806628619920322e-06, "logits/chosen": -2.8769755363464355, "logits/rejected": -2.8758394718170166, "logps/chosen": -51.57659149169922, "logps/rejected": -75.70911407470703, "loss": 0.3124, "rewards/accuracies": 0.875, "rewards/chosen": -2.7957983016967773, "rewards/margins": 2.4409711360931396, "rewards/rejected": -5.236769676208496, "step": 507 }, { "epoch": 0.9639468690702088, "grad_norm": 0.38449132442474365, "learning_rate": 9.754698862413759e-07, "logits/chosen": -2.8793885707855225, "logits/rejected": -2.876129388809204, "logps/chosen": -53.12155532836914, "logps/rejected": -92.95342254638672, "loss": 0.0331, "rewards/accuracies": 1.0, "rewards/chosen": -2.927220106124878, "rewards/margins": 4.145195007324219, "rewards/rejected": -7.072414875030518, "step": 508 }, { "epoch": 0.9658444022770398, "grad_norm": 1.3234167098999023, "learning_rate": 8.75637157788689e-07, "logits/chosen": -2.8668880462646484, "logits/rejected": -2.8626136779785156, "logps/chosen": -46.708091735839844, "logps/rejected": -91.33255767822266, "loss": 0.192, "rewards/accuracies": 0.875, "rewards/chosen": -2.2253031730651855, "rewards/margins": 4.6860833168029785, "rewards/rejected": -6.911386013031006, "step": 509 }, { "epoch": 0.967741935483871, "grad_norm": 1.2830538749694824, "learning_rate": 7.81170080629412e-07, "logits/chosen": -2.8674538135528564, "logits/rejected": -2.8622677326202393, "logps/chosen": -50.81001281738281, "logps/rejected": -82.64586639404297, "loss": 0.1122, "rewards/accuracies": 0.9375, "rewards/chosen": -2.573124408721924, "rewards/margins": 3.5462069511413574, "rewards/rejected": -6.1193318367004395, "step": 510 }, { "epoch": 0.969639468690702, "grad_norm": 2.888144016265869, "learning_rate": 6.920737683136613e-07, "logits/chosen": -2.8974032402038574, "logits/rejected": -2.891505479812622, "logps/chosen": -53.14949035644531, "logps/rejected": -89.72847747802734, "loss": 0.2069, "rewards/accuracies": 0.9375, "rewards/chosen": -2.929083824157715, "rewards/margins": 3.8716461658477783, "rewards/rejected": -6.800730228424072, "step": 511 }, { "epoch": 0.9715370018975332, "grad_norm": 1.7620785236358643, "learning_rate": 6.083530436693408e-07, "logits/chosen": -2.8567254543304443, "logits/rejected": -2.863250970840454, "logps/chosen": -49.66737365722656, "logps/rejected": -83.99765014648438, "loss": 0.2175, "rewards/accuracies": 0.9375, "rewards/chosen": -2.5828912258148193, "rewards/margins": 3.6682820320129395, "rewards/rejected": -6.25117301940918, "step": 512 }, { "epoch": 0.9734345351043643, "grad_norm": 1.5268384218215942, "learning_rate": 5.300124385410943e-07, "logits/chosen": -2.8635151386260986, "logits/rejected": -2.86435866355896, "logps/chosen": -55.493438720703125, "logps/rejected": -85.88677978515625, "loss": 0.1664, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0996596813201904, "rewards/margins": 3.1925599575042725, "rewards/rejected": -6.292219161987305, "step": 513 }, { "epoch": 0.9753320683111955, "grad_norm": 2.3480827808380127, "learning_rate": 4.570561935450468e-07, "logits/chosen": -2.8450422286987305, "logits/rejected": -2.8451755046844482, "logps/chosen": -58.186397552490234, "logps/rejected": -92.19591522216797, "loss": 0.4327, "rewards/accuracies": 0.8125, "rewards/chosen": -3.466181755065918, "rewards/margins": 3.5265114307403564, "rewards/rejected": -6.992692947387695, "step": 514 }, { "epoch": 0.9772296015180265, "grad_norm": 2.6257131099700928, "learning_rate": 3.8948825783918784e-07, "logits/chosen": -2.8356828689575195, "logits/rejected": -2.834594964981079, "logps/chosen": -51.776058197021484, "logps/rejected": -81.43892669677734, "loss": 0.1723, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8794238567352295, "rewards/margins": 3.0788588523864746, "rewards/rejected": -5.958282947540283, "step": 515 }, { "epoch": 0.9791271347248577, "grad_norm": 3.8994312286376953, "learning_rate": 3.273122889096536e-07, "logits/chosen": -2.866316556930542, "logits/rejected": -2.8655130863189697, "logps/chosen": -57.50481414794922, "logps/rejected": -87.61444854736328, "loss": 0.2925, "rewards/accuracies": 0.875, "rewards/chosen": -3.3754281997680664, "rewards/margins": 3.1350836753845215, "rewards/rejected": -6.510511875152588, "step": 516 }, { "epoch": 0.9810246679316889, "grad_norm": 1.6719425916671753, "learning_rate": 2.7053165237268527e-07, "logits/chosen": -2.875308036804199, "logits/rejected": -2.874027967453003, "logps/chosen": -60.042579650878906, "logps/rejected": -89.38323974609375, "loss": 0.2483, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7195136547088623, "rewards/margins": 3.0455737113952637, "rewards/rejected": -6.765087127685547, "step": 517 }, { "epoch": 0.9829222011385199, "grad_norm": 2.5794994831085205, "learning_rate": 2.191494217925305e-07, "logits/chosen": -2.8739867210388184, "logits/rejected": -2.876070022583008, "logps/chosen": -47.57518768310547, "logps/rejected": -83.28882598876953, "loss": 0.2543, "rewards/accuracies": 0.875, "rewards/chosen": -2.296959638595581, "rewards/margins": 3.761139392852783, "rewards/rejected": -6.058098793029785, "step": 518 }, { "epoch": 0.9848197343453511, "grad_norm": 1.4231163263320923, "learning_rate": 1.7316837851499844e-07, "logits/chosen": -2.8610801696777344, "logits/rejected": -2.858586072921753, "logps/chosen": -49.79618453979492, "logps/rejected": -86.7958755493164, "loss": 0.1576, "rewards/accuracies": 0.875, "rewards/chosen": -2.5396759510040283, "rewards/margins": 3.977015972137451, "rewards/rejected": -6.516692161560059, "step": 519 }, { "epoch": 0.9867172675521821, "grad_norm": 3.5399532318115234, "learning_rate": 1.3259101151694708e-07, "logits/chosen": -2.888908624649048, "logits/rejected": -2.8908607959747314, "logps/chosen": -57.93926239013672, "logps/rejected": -87.50239562988281, "loss": 0.348, "rewards/accuracies": 0.8125, "rewards/chosen": -3.480964183807373, "rewards/margins": 3.0687217712402344, "rewards/rejected": -6.549685478210449, "step": 520 }, { "epoch": 0.9886148007590133, "grad_norm": 3.352266788482666, "learning_rate": 9.741951727152421e-08, "logits/chosen": -2.894845485687256, "logits/rejected": -2.8936767578125, "logps/chosen": -56.55711364746094, "logps/rejected": -79.75418853759766, "loss": 0.3825, "rewards/accuracies": 0.6875, "rewards/chosen": -3.3504998683929443, "rewards/margins": 2.4052634239196777, "rewards/rejected": -5.755763053894043, "step": 521 }, { "epoch": 0.9905123339658444, "grad_norm": 1.2697792053222656, "learning_rate": 6.765579962928482e-08, "logits/chosen": -2.8767342567443848, "logits/rejected": -2.880286931991577, "logps/chosen": -52.612449645996094, "logps/rejected": -82.76181030273438, "loss": 0.0968, "rewards/accuracies": 1.0, "rewards/chosen": -2.910796642303467, "rewards/margins": 3.1595427989959717, "rewards/rejected": -6.070339679718018, "step": 522 }, { "epoch": 0.9924098671726755, "grad_norm": 1.018975853919983, "learning_rate": 4.330146971515125e-08, "logits/chosen": -2.852424144744873, "logits/rejected": -2.8456950187683105, "logps/chosen": -51.23678207397461, "logps/rejected": -86.56326293945312, "loss": 0.1052, "rewards/accuracies": 0.9375, "rewards/chosen": -2.8189241886138916, "rewards/margins": 3.7502474784851074, "rewards/rejected": -6.56917142868042, "step": 523 }, { "epoch": 0.9943074003795066, "grad_norm": 0.7454613447189331, "learning_rate": 2.435784584114975e-08, "logits/chosen": -2.8692071437835693, "logits/rejected": -2.8686625957489014, "logps/chosen": -52.17524719238281, "logps/rejected": -87.58660888671875, "loss": 0.1174, "rewards/accuracies": 0.9375, "rewards/chosen": -2.771207332611084, "rewards/margins": 3.7996647357940674, "rewards/rejected": -6.5708723068237305, "step": 524 }, { "epoch": 0.9962049335863378, "grad_norm": 3.760188102722168, "learning_rate": 1.0825953435122938e-08, "logits/chosen": -2.8647942543029785, "logits/rejected": -2.8657193183898926, "logps/chosen": -49.69865417480469, "logps/rejected": -85.46101379394531, "loss": 0.3014, "rewards/accuracies": 0.875, "rewards/chosen": -2.602159261703491, "rewards/margins": 3.695681571960449, "rewards/rejected": -6.297840595245361, "step": 525 }, { "epoch": 0.9981024667931688, "grad_norm": 1.6153333187103271, "learning_rate": 2.7065249851743193e-09, "logits/chosen": -2.8594605922698975, "logits/rejected": -2.861586332321167, "logps/chosen": -52.35853576660156, "logps/rejected": -86.91337585449219, "loss": 0.0971, "rewards/accuracies": 1.0, "rewards/chosen": -2.7869677543640137, "rewards/margins": 3.7072904109954834, "rewards/rejected": -6.494257926940918, "step": 526 }, { "epoch": 1.0, "grad_norm": 3.749000072479248, "learning_rate": 0.0, "logits/chosen": -2.897618055343628, "logits/rejected": -2.891883373260498, "logps/chosen": -56.75947189331055, "logps/rejected": -83.30915069580078, "loss": 0.3765, "rewards/accuracies": 0.8333333730697632, "rewards/chosen": -3.231624126434326, "rewards/margins": 2.8773982524871826, "rewards/rejected": -6.10902214050293, "step": 527 }, { "epoch": 1.0, "eval_logits/chosen": -2.868335485458374, "eval_logits/rejected": -2.866347312927246, "eval_logps/chosen": -49.55250930786133, "eval_logps/rejected": -86.6895523071289, "eval_loss": 0.14940239489078522, "eval_rewards/accuracies": 0.945090115070343, "eval_rewards/chosen": -2.55690598487854, "eval_rewards/margins": 3.9059858322143555, "eval_rewards/rejected": -6.462891578674316, "eval_runtime": 5440.1428, "eval_samples_per_second": 1.548, "eval_steps_per_second": 0.097, "step": 527 } ], "logging_steps": 1, "max_steps": 527, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }