{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00333889816360601, "grad_norm": 0.8906780481338501, "learning_rate": 0.0, "logits/chosen": -2.6484375, "logits/rejected": -2.7734375, "logps/chosen": -182.0, "logps/rejected": -244.5, "loss": 1.5239, "rewards/accuracies": 0.421875, "rewards/chosen": 0.02294921875, "rewards/margins": -0.33056640625, "rewards/rejected": 0.3515625, "step": 1 }, { "epoch": 0.00667779632721202, "grad_norm": 0.8854408860206604, "learning_rate": 1.6000000000000003e-05, "logits/chosen": -3.2109375, "logits/rejected": -2.6640625, "logps/chosen": -133.5, "logps/rejected": -258.5, "loss": 1.103, "rewards/accuracies": 0.546875, "rewards/chosen": -0.0283203125, "rewards/margins": 0.0703125, "rewards/rejected": -0.09814453125, "step": 2 }, { "epoch": 0.01001669449081803, "grad_norm": 1.1921889781951904, "learning_rate": 3.2000000000000005e-05, "logits/chosen": -3.0, "logits/rejected": -2.6640625, "logps/chosen": -130.0, "logps/rejected": -226.0, "loss": 1.5938, "rewards/accuracies": 0.28125, "rewards/chosen": -0.44921875, "rewards/margins": -0.763671875, "rewards/rejected": 0.3125, "step": 3 }, { "epoch": 0.01335559265442404, "grad_norm": 0.9537683725357056, "learning_rate": 4.8e-05, "logits/chosen": -2.890625, "logits/rejected": -2.8125, "logps/chosen": -147.5, "logps/rejected": -281.0, "loss": 1.0503, "rewards/accuracies": 0.578125, "rewards/chosen": 0.02099609375, "rewards/margins": 0.28369140625, "rewards/rejected": -0.26171875, "step": 4 }, { "epoch": 0.01669449081803005, "grad_norm": 1.2217826843261719, "learning_rate": 6.400000000000001e-05, "logits/chosen": -2.90625, "logits/rejected": -2.7734375, "logps/chosen": -135.75, "logps/rejected": -232.5, "loss": 1.3145, "rewards/accuracies": 0.46875, "rewards/chosen": 0.1611328125, "rewards/margins": 0.0048828125, "rewards/rejected": 0.15606689453125, "step": 5 }, { "epoch": 0.02003338898163606, "grad_norm": 1.0575661659240723, "learning_rate": 8e-05, "logits/chosen": -3.0, "logits/rejected": -2.640625, "logps/chosen": -142.5, "logps/rejected": -260.5, "loss": 1.1968, "rewards/accuracies": 0.46875, "rewards/chosen": 0.104736328125, "rewards/margins": -0.081298828125, "rewards/rejected": 0.185546875, "step": 6 }, { "epoch": 0.02337228714524207, "grad_norm": 1.3832917213439941, "learning_rate": 9.6e-05, "logits/chosen": -3.140625, "logits/rejected": -2.9375, "logps/chosen": -119.25, "logps/rejected": -241.0, "loss": 1.1948, "rewards/accuracies": 0.390625, "rewards/chosen": 0.16455078125, "rewards/margins": -0.1988525390625, "rewards/rejected": 0.36328125, "step": 7 }, { "epoch": 0.02671118530884808, "grad_norm": 0.9630353450775146, "learning_rate": 0.000112, "logits/chosen": -2.7734375, "logits/rejected": -2.625, "logps/chosen": -159.5, "logps/rejected": -256.0, "loss": 0.8555, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0166015625, "rewards/margins": 1.04296875, "rewards/rejected": -1.02734375, "step": 8 }, { "epoch": 0.03005008347245409, "grad_norm": 1.3834831714630127, "learning_rate": 0.00012800000000000002, "logits/chosen": -2.921875, "logits/rejected": -2.703125, "logps/chosen": -143.5, "logps/rejected": -224.5, "loss": 1.1067, "rewards/accuracies": 0.546875, "rewards/chosen": 0.2060546875, "rewards/margins": 0.4296875, "rewards/rejected": -0.222686767578125, "step": 9 }, { "epoch": 0.0333889816360601, "grad_norm": 1.5977782011032104, "learning_rate": 0.000144, "logits/chosen": -2.7890625, "logits/rejected": -2.7890625, "logps/chosen": -160.0, "logps/rejected": -271.0, "loss": 0.896, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0777587890625, "rewards/margins": 0.669921875, "rewards/rejected": -0.5927734375, "step": 10 }, { "epoch": 0.03672787979966611, "grad_norm": 0.9674336910247803, "learning_rate": 0.00016, "logits/chosen": -2.484375, "logits/rejected": -2.734375, "logps/chosen": -182.0, "logps/rejected": -217.0, "loss": 0.4347, "rewards/accuracies": 0.75, "rewards/chosen": 0.208984375, "rewards/margins": 1.55078125, "rewards/rejected": -1.34375, "step": 11 }, { "epoch": 0.04006677796327212, "grad_norm": 1.0390831232070923, "learning_rate": 0.00015999950159857832, "logits/chosen": -2.953125, "logits/rejected": -2.78125, "logps/chosen": -149.0, "logps/rejected": -259.5, "loss": 0.3765, "rewards/accuracies": 0.84375, "rewards/chosen": 0.3333740234375, "rewards/margins": 2.6015625, "rewards/rejected": -2.265625, "step": 12 }, { "epoch": 0.04340567612687813, "grad_norm": 0.7539263963699341, "learning_rate": 0.00015999800640052332, "logits/chosen": -2.9609375, "logits/rejected": -2.734375, "logps/chosen": -164.5, "logps/rejected": -295.0, "loss": 0.2193, "rewards/accuracies": 0.875, "rewards/chosen": 0.7236328125, "rewards/margins": 4.15625, "rewards/rejected": -3.4296875, "step": 13 }, { "epoch": 0.04674457429048414, "grad_norm": 0.8638622760772705, "learning_rate": 0.00015999551442446528, "logits/chosen": -3.2890625, "logits/rejected": -2.515625, "logps/chosen": -144.0, "logps/rejected": -289.0, "loss": 0.2724, "rewards/accuracies": 0.921875, "rewards/chosen": 0.4716796875, "rewards/margins": 4.484375, "rewards/rejected": -4.015625, "step": 14 }, { "epoch": 0.05008347245409015, "grad_norm": 0.5347347855567932, "learning_rate": 0.00015999202570145425, "logits/chosen": -3.2734375, "logits/rejected": -2.5546875, "logps/chosen": -113.0, "logps/rejected": -288.0, "loss": 0.1353, "rewards/accuracies": 0.9375, "rewards/chosen": 0.955078125, "rewards/margins": 5.78125, "rewards/rejected": -4.828125, "step": 15 }, { "epoch": 0.05342237061769616, "grad_norm": 0.5463722348213196, "learning_rate": 0.0001599875402749599, "logits/chosen": -3.359375, "logits/rejected": -2.7734375, "logps/chosen": -138.0, "logps/rejected": -241.0, "loss": 0.1262, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4609375, "rewards/margins": 6.0625, "rewards/rejected": -4.609375, "step": 16 }, { "epoch": 0.05676126878130217, "grad_norm": 0.6224486231803894, "learning_rate": 0.00015998205820087077, "logits/chosen": -3.625, "logits/rejected": -2.78125, "logps/chosen": -105.0, "logps/rejected": -271.0, "loss": 0.1513, "rewards/accuracies": 0.9375, "rewards/chosen": 1.890625, "rewards/margins": 7.015625, "rewards/rejected": -5.125, "step": 17 }, { "epoch": 0.06010016694490818, "grad_norm": 0.31991323828697205, "learning_rate": 0.00015997557954749368, "logits/chosen": -3.71875, "logits/rejected": -2.6171875, "logps/chosen": -113.5, "logps/rejected": -292.0, "loss": 0.0388, "rewards/accuracies": 1.0, "rewards/chosen": 2.6953125, "rewards/margins": 8.3125, "rewards/rejected": -5.625, "step": 18 }, { "epoch": 0.06343906510851419, "grad_norm": 0.1611785888671875, "learning_rate": 0.00015996810439555294, "logits/chosen": -3.7578125, "logits/rejected": -2.9140625, "logps/chosen": -121.25, "logps/rejected": -330.0, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": 2.328125, "rewards/margins": 8.875, "rewards/rejected": -6.515625, "step": 19 }, { "epoch": 0.0667779632721202, "grad_norm": 0.2885970175266266, "learning_rate": 0.00015995963283818918, "logits/chosen": -4.03125, "logits/rejected": -2.6953125, "logps/chosen": -117.75, "logps/rejected": -312.0, "loss": 0.0381, "rewards/accuracies": 0.984375, "rewards/chosen": 2.296875, "rewards/margins": 8.28125, "rewards/rejected": -6.0, "step": 20 }, { "epoch": 0.07011686143572621, "grad_norm": 0.1798153966665268, "learning_rate": 0.00015995016498095827, "logits/chosen": -3.6953125, "logits/rejected": -2.9453125, "logps/chosen": -124.0, "logps/rejected": -300.0, "loss": 0.0241, "rewards/accuracies": 0.984375, "rewards/chosen": 2.921875, "rewards/margins": 9.875, "rewards/rejected": -6.9375, "step": 21 }, { "epoch": 0.07345575959933222, "grad_norm": 0.04417094215750694, "learning_rate": 0.0001599397009418301, "logits/chosen": -3.7421875, "logits/rejected": -2.703125, "logps/chosen": -129.5, "logps/rejected": -347.0, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 2.4921875, "rewards/margins": 10.15625, "rewards/rejected": -7.671875, "step": 22 }, { "epoch": 0.07679465776293823, "grad_norm": 0.0934915617108345, "learning_rate": 0.00015992824085118694, "logits/chosen": -3.421875, "logits/rejected": -2.9765625, "logps/chosen": -108.0, "logps/rejected": -303.0, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 2.8359375, "rewards/margins": 10.65625, "rewards/rejected": -7.8125, "step": 23 }, { "epoch": 0.08013355592654424, "grad_norm": 0.03649423271417618, "learning_rate": 0.00015991578485182194, "logits/chosen": -3.96875, "logits/rejected": -2.7890625, "logps/chosen": -100.25, "logps/rejected": -345.0, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 3.4296875, "rewards/margins": 12.0, "rewards/rejected": -8.59375, "step": 24 }, { "epoch": 0.08347245409015025, "grad_norm": 0.29851219058036804, "learning_rate": 0.00015990233309893726, "logits/chosen": -3.8203125, "logits/rejected": -3.1484375, "logps/chosen": -110.25, "logps/rejected": -313.0, "loss": 0.0403, "rewards/accuracies": 0.984375, "rewards/chosen": 3.0703125, "rewards/margins": 10.59375, "rewards/rejected": -7.5, "step": 25 }, { "epoch": 0.08681135225375626, "grad_norm": 0.09218787401914597, "learning_rate": 0.00015988788576014228, "logits/chosen": -4.1796875, "logits/rejected": -3.015625, "logps/chosen": -113.25, "logps/rejected": -331.0, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 3.2578125, "rewards/margins": 12.0, "rewards/rejected": -8.75, "step": 26 }, { "epoch": 0.09015025041736227, "grad_norm": 0.010930394753813744, "learning_rate": 0.0001598724430154513, "logits/chosen": -4.625, "logits/rejected": -3.0625, "logps/chosen": -75.0, "logps/rejected": -328.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 3.5, "rewards/margins": 12.875, "rewards/rejected": -9.375, "step": 27 }, { "epoch": 0.09348914858096828, "grad_norm": 0.025569891557097435, "learning_rate": 0.00015985600505728152, "logits/chosen": -4.5625, "logits/rejected": -3.1015625, "logps/chosen": -81.75, "logps/rejected": -301.0, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 2.8828125, "rewards/margins": 11.96875, "rewards/rejected": -9.09375, "step": 28 }, { "epoch": 0.09682804674457429, "grad_norm": 0.00938540231436491, "learning_rate": 0.00015983857209045046, "logits/chosen": -3.8984375, "logits/rejected": -3.03125, "logps/chosen": -154.0, "logps/rejected": -353.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 3.40625, "rewards/margins": 12.96875, "rewards/rejected": -9.5625, "step": 29 }, { "epoch": 0.1001669449081803, "grad_norm": 0.00449951458722353, "learning_rate": 0.00015982014433217346, "logits/chosen": -4.359375, "logits/rejected": -3.15625, "logps/chosen": -110.25, "logps/rejected": -349.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 3.6875, "rewards/margins": 14.40625, "rewards/rejected": -10.6875, "step": 30 }, { "epoch": 0.10350584307178631, "grad_norm": 0.1915261447429657, "learning_rate": 0.0001598007220120611, "logits/chosen": -4.546875, "logits/rejected": -3.09375, "logps/chosen": -108.25, "logps/rejected": -386.0, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 3.6953125, "rewards/margins": 15.75, "rewards/rejected": -12.0625, "step": 31 }, { "epoch": 0.10684474123539232, "grad_norm": 0.015207415446639061, "learning_rate": 0.0001597803053721162, "logits/chosen": -4.921875, "logits/rejected": -3.2734375, "logps/chosen": -102.25, "logps/rejected": -355.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 3.0859375, "rewards/margins": 13.65625, "rewards/rejected": -10.5625, "step": 32 }, { "epoch": 0.11018363939899833, "grad_norm": 0.052224867045879364, "learning_rate": 0.00015975889466673073, "logits/chosen": -4.84375, "logits/rejected": -3.2265625, "logps/chosen": -107.75, "logps/rejected": -387.0, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 3.1796875, "rewards/margins": 15.375, "rewards/rejected": -12.21875, "step": 33 }, { "epoch": 0.11352253756260434, "grad_norm": 0.018112409859895706, "learning_rate": 0.0001597364901626829, "logits/chosen": -4.640625, "logits/rejected": -3.3828125, "logps/chosen": -121.5, "logps/rejected": -367.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 3.53125, "rewards/margins": 15.65625, "rewards/rejected": -12.125, "step": 34 }, { "epoch": 0.11686143572621036, "grad_norm": 0.03322592005133629, "learning_rate": 0.00015971309213913366, "logits/chosen": -4.984375, "logits/rejected": -3.4296875, "logps/chosen": -89.75, "logps/rejected": -365.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 3.5078125, "rewards/margins": 15.53125, "rewards/rejected": -12.0625, "step": 35 }, { "epoch": 0.12020033388981637, "grad_norm": 0.051106907427310944, "learning_rate": 0.00015968870088762315, "logits/chosen": -4.609375, "logits/rejected": -3.28125, "logps/chosen": -98.75, "logps/rejected": -359.0, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 3.296875, "rewards/margins": 16.15625, "rewards/rejected": -12.90625, "step": 36 }, { "epoch": 0.12353923205342238, "grad_norm": 0.040621671825647354, "learning_rate": 0.00015966331671206724, "logits/chosen": -4.4375, "logits/rejected": -3.2421875, "logps/chosen": -120.0, "logps/rejected": -406.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 3.8671875, "rewards/margins": 17.4375, "rewards/rejected": -13.625, "step": 37 }, { "epoch": 0.12687813021702837, "grad_norm": 0.053807105869054794, "learning_rate": 0.00015963693992875367, "logits/chosen": -4.546875, "logits/rejected": -3.3203125, "logps/chosen": -117.0, "logps/rejected": -378.0, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 3.3984375, "rewards/margins": 17.0625, "rewards/rejected": -13.6875, "step": 38 }, { "epoch": 0.1302170283806344, "grad_norm": 0.008678439073264599, "learning_rate": 0.00015960957086633812, "logits/chosen": -4.6875, "logits/rejected": -3.5078125, "logps/chosen": -108.0, "logps/rejected": -368.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.8515625, "rewards/margins": 18.6875, "rewards/rejected": -14.84375, "step": 39 }, { "epoch": 0.1335559265442404, "grad_norm": 0.05349210277199745, "learning_rate": 0.00015958120986584007, "logits/chosen": -4.5625, "logits/rejected": -3.3203125, "logps/chosen": -121.0, "logps/rejected": -348.0, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 3.1953125, "rewards/margins": 16.15625, "rewards/rejected": -13.0, "step": 40 }, { "epoch": 0.13689482470784642, "grad_norm": 0.01919226534664631, "learning_rate": 0.00015955185728063859, "logits/chosen": -4.671875, "logits/rejected": -3.65625, "logps/chosen": -111.75, "logps/rejected": -348.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 3.3515625, "rewards/margins": 16.0, "rewards/rejected": -12.625, "step": 41 }, { "epoch": 0.14023372287145242, "grad_norm": 0.003901825286448002, "learning_rate": 0.0001595215134764679, "logits/chosen": -4.890625, "logits/rejected": -3.484375, "logps/chosen": -96.75, "logps/rejected": -401.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.0390625, "rewards/margins": 17.4375, "rewards/rejected": -14.40625, "step": 42 }, { "epoch": 0.14357262103505844, "grad_norm": 1.0987324714660645, "learning_rate": 0.00015949017883141293, "logits/chosen": -4.4375, "logits/rejected": -3.5625, "logps/chosen": -132.25, "logps/rejected": -354.0, "loss": 1.1098, "rewards/accuracies": 0.984375, "rewards/chosen": 2.125, "rewards/margins": 16.625, "rewards/rejected": -14.4375, "step": 43 }, { "epoch": 0.14691151919866444, "grad_norm": 0.0008750148699618876, "learning_rate": 0.00015945785373590446, "logits/chosen": -4.78125, "logits/rejected": -3.578125, "logps/chosen": -112.75, "logps/rejected": -404.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.40625, "rewards/margins": 20.1875, "rewards/rejected": -16.8125, "step": 44 }, { "epoch": 0.15025041736227046, "grad_norm": 0.00039893100620247424, "learning_rate": 0.0001594245385927143, "logits/chosen": -4.796875, "logits/rejected": -3.6875, "logps/chosen": -109.0, "logps/rejected": -436.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.90625, "rewards/margins": 21.0625, "rewards/rejected": -17.25, "step": 45 }, { "epoch": 0.15358931552587646, "grad_norm": 3.564608414308168e-05, "learning_rate": 0.00015939023381695034, "logits/chosen": -5.015625, "logits/rejected": -3.5546875, "logps/chosen": -122.0, "logps/rejected": -404.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.9453125, "rewards/margins": 20.5, "rewards/rejected": -16.5625, "step": 46 }, { "epoch": 0.15692821368948248, "grad_norm": 0.15678206086158752, "learning_rate": 0.0001593549398360513, "logits/chosen": -5.25, "logits/rejected": -3.546875, "logps/chosen": -103.75, "logps/rejected": -430.0, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 3.765625, "rewards/margins": 21.25, "rewards/rejected": -17.5, "step": 47 }, { "epoch": 0.16026711185308848, "grad_norm": 0.003343533491715789, "learning_rate": 0.00015931865708978144, "logits/chosen": -4.828125, "logits/rejected": -3.75, "logps/chosen": -115.25, "logps/rejected": -455.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 3.46875, "rewards/margins": 22.1875, "rewards/rejected": -18.75, "step": 48 }, { "epoch": 0.1636060100166945, "grad_norm": 0.0013559595681726933, "learning_rate": 0.0001592813860302251, "logits/chosen": -4.78125, "logits/rejected": -3.515625, "logps/chosen": -98.0, "logps/rejected": -460.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.1484375, "rewards/margins": 22.1875, "rewards/rejected": -19.0, "step": 49 }, { "epoch": 0.1669449081803005, "grad_norm": 0.010648728348314762, "learning_rate": 0.00015924312712178095, "logits/chosen": -5.34375, "logits/rejected": -3.6953125, "logps/chosen": -101.5, "logps/rejected": -440.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 2.828125, "rewards/margins": 21.875, "rewards/rejected": -19.0625, "step": 50 }, { "epoch": 0.17028380634390652, "grad_norm": 0.0414469912648201, "learning_rate": 0.00015920388084115635, "logits/chosen": -4.78125, "logits/rejected": -3.6484375, "logps/chosen": -111.0, "logps/rejected": -429.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 3.6484375, "rewards/margins": 22.875, "rewards/rejected": -19.25, "step": 51 }, { "epoch": 0.17362270450751252, "grad_norm": 0.0018979490268975496, "learning_rate": 0.00015916364767736143, "logits/chosen": -5.078125, "logits/rejected": -3.546875, "logps/chosen": -132.25, "logps/rejected": -401.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.0, "rewards/margins": 19.875, "rewards/rejected": -16.875, "step": 52 }, { "epoch": 0.17696160267111852, "grad_norm": 1.743229768180754e-05, "learning_rate": 0.00015912242813170274, "logits/chosen": -4.984375, "logits/rejected": -3.7265625, "logps/chosen": -125.25, "logps/rejected": -457.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.4140625, "rewards/margins": 23.25, "rewards/rejected": -19.875, "step": 53 }, { "epoch": 0.18030050083472454, "grad_norm": 0.13024184107780457, "learning_rate": 0.00015908022271777733, "logits/chosen": -5.515625, "logits/rejected": -3.796875, "logps/chosen": -83.0, "logps/rejected": -467.0, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 3.3828125, "rewards/margins": 24.0, "rewards/rejected": -20.625, "step": 54 }, { "epoch": 0.18363939899833054, "grad_norm": 0.0001471200812375173, "learning_rate": 0.0001590370319614662, "logits/chosen": -4.859375, "logits/rejected": -3.703125, "logps/chosen": -111.5, "logps/rejected": -470.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.546875, "rewards/margins": 24.5625, "rewards/rejected": -21.0, "step": 55 }, { "epoch": 0.18697829716193656, "grad_norm": 0.0001591207692399621, "learning_rate": 0.00015899285640092763, "logits/chosen": -5.046875, "logits/rejected": -3.8046875, "logps/chosen": -115.75, "logps/rejected": -455.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.703125, "rewards/margins": 23.625, "rewards/rejected": -19.875, "step": 56 }, { "epoch": 0.19031719532554256, "grad_norm": 0.0009253205498680472, "learning_rate": 0.00015894769658659073, "logits/chosen": -4.65625, "logits/rejected": -3.65625, "logps/chosen": -122.25, "logps/rejected": -446.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.40625, "rewards/margins": 22.0625, "rewards/rejected": -18.5625, "step": 57 }, { "epoch": 0.19365609348914858, "grad_norm": 4.7780202294234186e-05, "learning_rate": 0.00015890155308114837, "logits/chosen": -5.234375, "logits/rejected": -3.4765625, "logps/chosen": -113.75, "logps/rejected": -467.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6640625, "rewards/margins": 24.1875, "rewards/rejected": -20.5, "step": 58 }, { "epoch": 0.19699499165275458, "grad_norm": 3.8026719266781583e-05, "learning_rate": 0.00015885442645955026, "logits/chosen": -5.0625, "logits/rejected": -3.890625, "logps/chosen": -119.75, "logps/rejected": -399.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.171875, "rewards/margins": 22.9375, "rewards/rejected": -18.75, "step": 59 }, { "epoch": 0.2003338898163606, "grad_norm": 0.00020460848463699222, "learning_rate": 0.00015880631730899578, "logits/chosen": -4.5859375, "logits/rejected": -3.53125, "logps/chosen": -99.0, "logps/rejected": -449.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6328125, "rewards/margins": 22.875, "rewards/rejected": -19.25, "step": 60 }, { "epoch": 0.2036727879799666, "grad_norm": 0.00048449577298015356, "learning_rate": 0.0001587572262289267, "logits/chosen": -5.546875, "logits/rejected": -3.9453125, "logps/chosen": -87.5, "logps/rejected": -464.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.96875, "rewards/margins": 24.4375, "rewards/rejected": -21.4375, "step": 61 }, { "epoch": 0.20701168614357263, "grad_norm": 0.14330030977725983, "learning_rate": 0.00015870715383101955, "logits/chosen": -5.671875, "logits/rejected": -3.7265625, "logps/chosen": -80.25, "logps/rejected": -457.0, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 3.5234375, "rewards/margins": 23.875, "rewards/rejected": -20.375, "step": 62 }, { "epoch": 0.21035058430717862, "grad_norm": 5.157471969141625e-05, "learning_rate": 0.00015865610073917825, "logits/chosen": -5.65625, "logits/rejected": -3.875, "logps/chosen": -96.5, "logps/rejected": -399.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.0703125, "rewards/margins": 23.125, "rewards/rejected": -19.0625, "step": 63 }, { "epoch": 0.21368948247078465, "grad_norm": 0.001272167544811964, "learning_rate": 0.0001586040675895261, "logits/chosen": -5.03125, "logits/rejected": -3.828125, "logps/chosen": -126.75, "logps/rejected": -453.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.1796875, "rewards/margins": 23.4375, "rewards/rejected": -20.3125, "step": 64 }, { "epoch": 0.21702838063439064, "grad_norm": 0.10344758629798889, "learning_rate": 0.00015855105503039804, "logits/chosen": -5.0, "logits/rejected": -3.65625, "logps/chosen": -93.75, "logps/rejected": -463.0, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 3.921875, "rewards/margins": 24.0, "rewards/rejected": -20.0625, "step": 65 }, { "epoch": 0.22036727879799667, "grad_norm": 0.000509591365698725, "learning_rate": 0.00015849706372233238, "logits/chosen": -5.390625, "logits/rejected": -3.7109375, "logps/chosen": -99.5, "logps/rejected": -421.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.34375, "rewards/margins": 22.625, "rewards/rejected": -19.3125, "step": 66 }, { "epoch": 0.22370617696160267, "grad_norm": 0.0010495522292330861, "learning_rate": 0.0001584420943380628, "logits/chosen": -5.265625, "logits/rejected": -3.9375, "logps/chosen": -109.25, "logps/rejected": -458.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.3203125, "rewards/margins": 23.1875, "rewards/rejected": -19.875, "step": 67 }, { "epoch": 0.2270450751252087, "grad_norm": 0.00022398516011890024, "learning_rate": 0.0001583861475625097, "logits/chosen": -5.296875, "logits/rejected": -3.6796875, "logps/chosen": -100.0, "logps/rejected": -489.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6796875, "rewards/margins": 24.6875, "rewards/rejected": -21.0, "step": 68 }, { "epoch": 0.2303839732888147, "grad_norm": 1.072521808964666e-05, "learning_rate": 0.00015832922409277198, "logits/chosen": -4.859375, "logits/rejected": -3.9765625, "logps/chosen": -124.75, "logps/rejected": -395.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.640625, "rewards/margins": 22.625, "rewards/rejected": -18.9375, "step": 69 }, { "epoch": 0.2337228714524207, "grad_norm": 0.00017957530508283526, "learning_rate": 0.00015827132463811804, "logits/chosen": -5.09375, "logits/rejected": -3.828125, "logps/chosen": -106.5, "logps/rejected": -407.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.609375, "rewards/margins": 22.5, "rewards/rejected": -18.875, "step": 70 }, { "epoch": 0.2370617696160267, "grad_norm": 0.0003923263284377754, "learning_rate": 0.00015821244991997717, "logits/chosen": -4.671875, "logits/rejected": -3.53125, "logps/chosen": -102.25, "logps/rejected": -459.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.9453125, "rewards/margins": 23.875, "rewards/rejected": -20.0, "step": 71 }, { "epoch": 0.24040066777963273, "grad_norm": 1.9827170035568997e-05, "learning_rate": 0.0001581526006719304, "logits/chosen": -5.53125, "logits/rejected": -3.84375, "logps/chosen": -89.75, "logps/rejected": -465.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.28125, "rewards/margins": 24.75, "rewards/rejected": -20.4375, "step": 72 }, { "epoch": 0.24373956594323873, "grad_norm": 0.2003210037946701, "learning_rate": 0.0001580917776397016, "logits/chosen": -4.875, "logits/rejected": -3.6875, "logps/chosen": -123.0, "logps/rejected": -445.0, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 4.0546875, "rewards/margins": 23.4375, "rewards/rejected": -19.3125, "step": 73 }, { "epoch": 0.24707846410684475, "grad_norm": 4.490726860240102e-05, "learning_rate": 0.0001580299815811478, "logits/chosen": -5.046875, "logits/rejected": -3.6328125, "logps/chosen": -96.25, "logps/rejected": -463.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6015625, "rewards/margins": 24.4375, "rewards/rejected": -20.8125, "step": 74 }, { "epoch": 0.25041736227045075, "grad_norm": 0.0011267291847616434, "learning_rate": 0.00015796721326625013, "logits/chosen": -5.078125, "logits/rejected": -3.5625, "logps/chosen": -99.25, "logps/rejected": -478.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8046875, "rewards/margins": 25.25, "rewards/rejected": -21.4375, "step": 75 }, { "epoch": 0.25375626043405675, "grad_norm": 2.087617986035184e-06, "learning_rate": 0.00015790347347710405, "logits/chosen": -5.046875, "logits/rejected": -3.53125, "logps/chosen": -118.0, "logps/rejected": -464.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.9296875, "rewards/margins": 24.4375, "rewards/rejected": -20.5, "step": 76 }, { "epoch": 0.2570951585976628, "grad_norm": 0.00048489755135960877, "learning_rate": 0.00015783876300790956, "logits/chosen": -5.078125, "logits/rejected": -3.6015625, "logps/chosen": -102.25, "logps/rejected": -466.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.3359375, "rewards/margins": 23.375, "rewards/rejected": -20.0, "step": 77 }, { "epoch": 0.2604340567612688, "grad_norm": 5.250661706668325e-05, "learning_rate": 0.0001577730826649614, "logits/chosen": -4.984375, "logits/rejected": -3.484375, "logps/chosen": -123.25, "logps/rejected": -448.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.234375, "rewards/margins": 23.6875, "rewards/rejected": -19.5, "step": 78 }, { "epoch": 0.2637729549248748, "grad_norm": 2.072748429782223e-05, "learning_rate": 0.00015770643326663898, "logits/chosen": -5.25, "logits/rejected": -3.359375, "logps/chosen": -80.25, "logps/rejected": -486.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.46875, "rewards/margins": 23.625, "rewards/rejected": -20.1875, "step": 79 }, { "epoch": 0.2671118530884808, "grad_norm": 0.002645147731527686, "learning_rate": 0.0001576388156433962, "logits/chosen": -5.15625, "logits/rejected": -3.515625, "logps/chosen": -119.0, "logps/rejected": -456.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.765625, "rewards/margins": 24.0625, "rewards/rejected": -20.3125, "step": 80 }, { "epoch": 0.2704507512520868, "grad_norm": 1.2537796465039719e-05, "learning_rate": 0.00015757023063775106, "logits/chosen": -4.71875, "logits/rejected": -3.5078125, "logps/chosen": -119.25, "logps/rejected": -425.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.2421875, "rewards/margins": 23.25, "rewards/rejected": -19.0, "step": 81 }, { "epoch": 0.27378964941569284, "grad_norm": 0.0001700354478089139, "learning_rate": 0.00015750067910427513, "logits/chosen": -5.125, "logits/rejected": -3.59375, "logps/chosen": -108.5, "logps/rejected": -414.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.7421875, "rewards/margins": 22.0625, "rewards/rejected": -18.3125, "step": 82 }, { "epoch": 0.27712854757929883, "grad_norm": 5.510517439688556e-05, "learning_rate": 0.000157430161909583, "logits/chosen": -5.0, "logits/rejected": -3.40625, "logps/chosen": -102.0, "logps/rejected": -452.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8828125, "rewards/margins": 22.875, "rewards/rejected": -19.0, "step": 83 }, { "epoch": 0.28046744574290483, "grad_norm": 0.0010590353049337864, "learning_rate": 0.00015735867993232143, "logits/chosen": -5.078125, "logits/rejected": -3.5703125, "logps/chosen": -105.25, "logps/rejected": -399.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.2265625, "rewards/margins": 21.9375, "rewards/rejected": -17.6875, "step": 84 }, { "epoch": 0.2838063439065108, "grad_norm": 9.394536027684808e-05, "learning_rate": 0.0001572862340631584, "logits/chosen": -4.984375, "logits/rejected": -3.6875, "logps/chosen": -90.75, "logps/rejected": -433.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.3125, "rewards/margins": 22.875, "rewards/rejected": -18.5625, "step": 85 }, { "epoch": 0.2871452420701169, "grad_norm": 0.0001689967029960826, "learning_rate": 0.00015721282520477197, "logits/chosen": -4.828125, "logits/rejected": -3.65625, "logps/chosen": -95.5, "logps/rejected": -429.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.9375, "rewards/margins": 22.5, "rewards/rejected": -18.5625, "step": 86 }, { "epoch": 0.2904841402337229, "grad_norm": 9.815259545575827e-06, "learning_rate": 0.00015713845427183922, "logits/chosen": -5.09375, "logits/rejected": -3.4140625, "logps/chosen": -87.5, "logps/rejected": -453.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.53125, "rewards/margins": 24.1875, "rewards/rejected": -19.625, "step": 87 }, { "epoch": 0.2938230383973289, "grad_norm": 2.0043949916725978e-05, "learning_rate": 0.0001570631221910245, "logits/chosen": -4.703125, "logits/rejected": -3.5546875, "logps/chosen": -127.0, "logps/rejected": -440.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.265625, "rewards/margins": 23.9375, "rewards/rejected": -19.6875, "step": 88 }, { "epoch": 0.29716193656093487, "grad_norm": 0.000917528523132205, "learning_rate": 0.00015698682990096828, "logits/chosen": -4.90625, "logits/rejected": -3.5625, "logps/chosen": -111.25, "logps/rejected": -377.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.546875, "rewards/margins": 22.25, "rewards/rejected": -17.6875, "step": 89 }, { "epoch": 0.3005008347245409, "grad_norm": 8.072228229139e-05, "learning_rate": 0.00015690957835227522, "logits/chosen": -5.28125, "logits/rejected": -3.4765625, "logps/chosen": -79.25, "logps/rejected": -458.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.96875, "rewards/margins": 22.8125, "rewards/rejected": -18.8125, "step": 90 }, { "epoch": 0.3038397328881469, "grad_norm": 4.4778818846680224e-05, "learning_rate": 0.00015683136850750236, "logits/chosen": -4.453125, "logits/rejected": -3.28125, "logps/chosen": -104.25, "logps/rejected": -458.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.4765625, "rewards/margins": 24.0625, "rewards/rejected": -20.5625, "step": 91 }, { "epoch": 0.3071786310517529, "grad_norm": 5.669149322784506e-05, "learning_rate": 0.00015675220134114712, "logits/chosen": -4.796875, "logits/rejected": -3.4765625, "logps/chosen": -94.25, "logps/rejected": -428.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.1796875, "rewards/margins": 23.1875, "rewards/rejected": -19.0625, "step": 92 }, { "epoch": 0.3105175292153589, "grad_norm": 7.771019227220677e-06, "learning_rate": 0.00015667207783963516, "logits/chosen": -5.09375, "logits/rejected": -3.359375, "logps/chosen": -102.25, "logps/rejected": -465.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.90625, "rewards/margins": 24.0625, "rewards/rejected": -20.125, "step": 93 }, { "epoch": 0.31385642737896496, "grad_norm": 0.0015844438457861543, "learning_rate": 0.00015659099900130826, "logits/chosen": -4.9375, "logits/rejected": -3.265625, "logps/chosen": -97.25, "logps/rejected": -508.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.8046875, "rewards/margins": 25.125, "rewards/rejected": -21.375, "step": 94 }, { "epoch": 0.31719532554257096, "grad_norm": 0.0008710987749509513, "learning_rate": 0.00015650896583641158, "logits/chosen": -5.140625, "logits/rejected": -3.3046875, "logps/chosen": -102.5, "logps/rejected": -449.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6171875, "rewards/margins": 23.875, "rewards/rejected": -20.25, "step": 95 }, { "epoch": 0.32053422370617696, "grad_norm": 2.450255806252244e-06, "learning_rate": 0.00015642597936708127, "logits/chosen": -4.6875, "logits/rejected": -3.3515625, "logps/chosen": -124.5, "logps/rejected": -473.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.375, "rewards/margins": 25.0, "rewards/rejected": -20.625, "step": 96 }, { "epoch": 0.32387312186978295, "grad_norm": 4.694379822467454e-05, "learning_rate": 0.00015634204062733167, "logits/chosen": -5.015625, "logits/rejected": -3.59375, "logps/chosen": -103.75, "logps/rejected": -385.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.6875, "rewards/margins": 23.0, "rewards/rejected": -18.3125, "step": 97 }, { "epoch": 0.327212020033389, "grad_norm": 0.0004240713897161186, "learning_rate": 0.00015625715066304246, "logits/chosen": -4.9375, "logits/rejected": -3.671875, "logps/chosen": -103.75, "logps/rejected": -385.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.875, "rewards/margins": 22.125, "rewards/rejected": -18.3125, "step": 98 }, { "epoch": 0.330550918196995, "grad_norm": 0.864057183265686, "learning_rate": 0.00015617131053194565, "logits/chosen": -4.609375, "logits/rejected": -3.4453125, "logps/chosen": -128.0, "logps/rejected": -453.0, "loss": 0.0811, "rewards/accuracies": 0.984375, "rewards/chosen": 3.671875, "rewards/margins": 22.625, "rewards/rejected": -18.9375, "step": 99 }, { "epoch": 0.333889816360601, "grad_norm": 0.0037979809567332268, "learning_rate": 0.0001560845213036123, "logits/chosen": -4.65625, "logits/rejected": -3.65625, "logps/chosen": -113.5, "logps/rejected": -393.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.890625, "rewards/margins": 24.0625, "rewards/rejected": -20.25, "step": 100 }, { "epoch": 0.337228714524207, "grad_norm": 0.0002999906719196588, "learning_rate": 0.00015599678405943927, "logits/chosen": -5.171875, "logits/rejected": -3.7890625, "logps/chosen": -104.75, "logps/rejected": -471.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.7890625, "rewards/margins": 27.0625, "rewards/rejected": -23.25, "step": 101 }, { "epoch": 0.34056761268781305, "grad_norm": 0.38996586203575134, "learning_rate": 0.00015590809989263576, "logits/chosen": -5.265625, "logits/rejected": -3.3359375, "logps/chosen": -95.25, "logps/rejected": -461.0, "loss": 0.0173, "rewards/accuracies": 0.984375, "rewards/chosen": 3.4140625, "rewards/margins": 24.8125, "rewards/rejected": -21.4375, "step": 102 }, { "epoch": 0.34390651085141904, "grad_norm": 0.0001715045509627089, "learning_rate": 0.00015581846990820965, "logits/chosen": -5.375, "logits/rejected": -3.953125, "logps/chosen": -119.75, "logps/rejected": -469.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8359375, "rewards/margins": 27.1875, "rewards/rejected": -23.3125, "step": 103 }, { "epoch": 0.34724540901502504, "grad_norm": 0.0018466322217136621, "learning_rate": 0.00015572789522295372, "logits/chosen": -5.21875, "logits/rejected": -4.0390625, "logps/chosen": -117.25, "logps/rejected": -507.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.6015625, "rewards/margins": 29.3125, "rewards/rejected": -25.625, "step": 104 }, { "epoch": 0.35058430717863104, "grad_norm": 1.7793492588680238e-05, "learning_rate": 0.00015563637696543173, "logits/chosen": -5.375, "logits/rejected": -3.765625, "logps/chosen": -122.5, "logps/rejected": -574.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.1953125, "rewards/margins": 32.0, "rewards/rejected": -28.75, "step": 105 }, { "epoch": 0.35392320534223703, "grad_norm": 0.0019883729983121157, "learning_rate": 0.00015554391627596446, "logits/chosen": -5.75, "logits/rejected": -3.84375, "logps/chosen": -74.75, "logps/rejected": -558.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.4296875, "rewards/margins": 30.75, "rewards/rejected": -27.25, "step": 106 }, { "epoch": 0.3572621035058431, "grad_norm": 0.0002132615481968969, "learning_rate": 0.0001554505143066154, "logits/chosen": -5.515625, "logits/rejected": -3.8984375, "logps/chosen": -118.5, "logps/rejected": -496.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.265625, "rewards/margins": 30.0, "rewards/rejected": -25.75, "step": 107 }, { "epoch": 0.3606010016694491, "grad_norm": 0.018147334456443787, "learning_rate": 0.0001553561722211764, "logits/chosen": -5.5625, "logits/rejected": -3.9375, "logps/chosen": -98.75, "logps/rejected": -535.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 3.8828125, "rewards/margins": 32.5, "rewards/rejected": -28.5625, "step": 108 }, { "epoch": 0.3639398998330551, "grad_norm": 0.00014658304280601442, "learning_rate": 0.00015526089119515316, "logits/chosen": -5.796875, "logits/rejected": -3.875, "logps/chosen": -107.5, "logps/rejected": -476.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.1171875, "rewards/margins": 30.375, "rewards/rejected": -26.25, "step": 109 }, { "epoch": 0.3672787979966611, "grad_norm": 0.5631054043769836, "learning_rate": 0.00015516467241575066, "logits/chosen": -5.46875, "logits/rejected": -3.9453125, "logps/chosen": -91.0, "logps/rejected": -548.0, "loss": 0.063, "rewards/accuracies": 0.984375, "rewards/chosen": 4.0703125, "rewards/margins": 32.875, "rewards/rejected": -28.75, "step": 110 }, { "epoch": 0.37061769616026713, "grad_norm": 0.0015704554971307516, "learning_rate": 0.00015506751708185837, "logits/chosen": -5.359375, "logits/rejected": -3.9609375, "logps/chosen": -97.0, "logps/rejected": -463.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.515625, "rewards/margins": 29.625, "rewards/rejected": -25.125, "step": 111 }, { "epoch": 0.3739565943238731, "grad_norm": 0.03998275473713875, "learning_rate": 0.00015496942640403515, "logits/chosen": -5.5625, "logits/rejected": -3.6484375, "logps/chosen": -90.0, "logps/rejected": -502.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 3.375, "rewards/margins": 30.125, "rewards/rejected": -26.6875, "step": 112 }, { "epoch": 0.3772954924874791, "grad_norm": 1.5272264136001468e-05, "learning_rate": 0.00015487040160449433, "logits/chosen": -5.21875, "logits/rejected": -3.859375, "logps/chosen": -108.0, "logps/rejected": -489.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.109375, "rewards/margins": 29.0625, "rewards/rejected": -24.9375, "step": 113 }, { "epoch": 0.3806343906510851, "grad_norm": 2.4422410206170753e-05, "learning_rate": 0.00015477044391708848, "logits/chosen": -5.28125, "logits/rejected": -3.7890625, "logps/chosen": -119.0, "logps/rejected": -501.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.375, "rewards/margins": 30.1875, "rewards/rejected": -25.8125, "step": 114 }, { "epoch": 0.38397328881469117, "grad_norm": 0.009292054921388626, "learning_rate": 0.00015466955458729386, "logits/chosen": -5.640625, "logits/rejected": -3.6484375, "logps/chosen": -84.75, "logps/rejected": -505.0, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": 4.1171875, "rewards/margins": 29.5, "rewards/rejected": -25.3125, "step": 115 }, { "epoch": 0.38731218697829717, "grad_norm": 0.016696617007255554, "learning_rate": 0.00015456773487219517, "logits/chosen": -5.109375, "logits/rejected": -3.640625, "logps/chosen": -92.5, "logps/rejected": -484.0, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": 4.28125, "rewards/margins": 29.375, "rewards/rejected": -25.0625, "step": 116 }, { "epoch": 0.39065108514190316, "grad_norm": 3.30902221321594e-05, "learning_rate": 0.00015446498604046967, "logits/chosen": -4.9375, "logits/rejected": -3.3359375, "logps/chosen": -110.25, "logps/rejected": -536.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.453125, "rewards/margins": 29.875, "rewards/rejected": -25.5, "step": 117 }, { "epoch": 0.39398998330550916, "grad_norm": 0.007862205617129803, "learning_rate": 0.00015436130937237144, "logits/chosen": -4.28125, "logits/rejected": -3.265625, "logps/chosen": -102.5, "logps/rejected": -476.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 4.703125, "rewards/margins": 29.0, "rewards/rejected": -24.25, "step": 118 }, { "epoch": 0.3973288814691152, "grad_norm": 0.00031752747599966824, "learning_rate": 0.00015425670615971544, "logits/chosen": -4.84375, "logits/rejected": -3.0859375, "logps/chosen": -105.75, "logps/rejected": -473.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.78125, "rewards/margins": 29.6875, "rewards/rejected": -24.9375, "step": 119 }, { "epoch": 0.4006677796327212, "grad_norm": 3.322392512927763e-05, "learning_rate": 0.00015415117770586144, "logits/chosen": -5.234375, "logits/rejected": -3.1875, "logps/chosen": -75.25, "logps/rejected": -491.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.9375, "rewards/margins": 29.8125, "rewards/rejected": -24.8125, "step": 120 }, { "epoch": 0.4040066777963272, "grad_norm": 0.00014395274047274143, "learning_rate": 0.00015404472532569771, "logits/chosen": -5.078125, "logits/rejected": -3.109375, "logps/chosen": -107.0, "logps/rejected": -468.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.90625, "rewards/margins": 28.4375, "rewards/rejected": -23.5625, "step": 121 }, { "epoch": 0.4073455759599332, "grad_norm": 3.165722773701418e-06, "learning_rate": 0.0001539373503456247, "logits/chosen": -4.765625, "logits/rejected": -3.125, "logps/chosen": -110.25, "logps/rejected": -497.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.875, "rewards/margins": 28.125, "rewards/rejected": -23.1875, "step": 122 }, { "epoch": 0.41068447412353926, "grad_norm": 0.05887475982308388, "learning_rate": 0.00015382905410353846, "logits/chosen": -4.84375, "logits/rejected": -3.375, "logps/chosen": -121.0, "logps/rejected": -435.0, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 4.03125, "rewards/margins": 24.9375, "rewards/rejected": -20.9375, "step": 123 }, { "epoch": 0.41402337228714525, "grad_norm": 0.0702575147151947, "learning_rate": 0.00015371983794881404, "logits/chosen": -4.890625, "logits/rejected": -3.0546875, "logps/chosen": -69.25, "logps/rejected": -460.0, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 3.84375, "rewards/margins": 26.8125, "rewards/rejected": -23.0, "step": 124 }, { "epoch": 0.41736227045075125, "grad_norm": 0.01235484890639782, "learning_rate": 0.0001536097032422886, "logits/chosen": -4.9375, "logits/rejected": -2.921875, "logps/chosen": -102.5, "logps/rejected": -492.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 4.59375, "rewards/margins": 27.1875, "rewards/rejected": -22.625, "step": 125 }, { "epoch": 0.42070116861435725, "grad_norm": 0.0033392056357115507, "learning_rate": 0.00015349865135624448, "logits/chosen": -5.078125, "logits/rejected": -2.8125, "logps/chosen": -93.5, "logps/rejected": -439.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.34375, "rewards/margins": 25.375, "rewards/rejected": -21.0, "step": 126 }, { "epoch": 0.4240400667779633, "grad_norm": 0.0017836507176980376, "learning_rate": 0.0001533866836743922, "logits/chosen": -5.21875, "logits/rejected": -3.15625, "logps/chosen": -95.25, "logps/rejected": -440.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.171875, "rewards/margins": 27.5, "rewards/rejected": -22.3125, "step": 127 }, { "epoch": 0.4273789649415693, "grad_norm": 1.02886324384599e-05, "learning_rate": 0.00015327380159185295, "logits/chosen": -4.8125, "logits/rejected": -3.0, "logps/chosen": -98.0, "logps/rejected": -496.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.90625, "rewards/margins": 28.75, "rewards/rejected": -23.875, "step": 128 }, { "epoch": 0.4307178631051753, "grad_norm": 0.0003149933472741395, "learning_rate": 0.00015316000651514157, "logits/chosen": -5.140625, "logits/rejected": -2.9765625, "logps/chosen": -82.25, "logps/rejected": -471.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.78125, "rewards/margins": 27.5625, "rewards/rejected": -22.75, "step": 129 }, { "epoch": 0.4340567612687813, "grad_norm": 5.283300197334029e-05, "learning_rate": 0.0001530452998621487, "logits/chosen": -4.8125, "logits/rejected": -3.0390625, "logps/chosen": -112.0, "logps/rejected": -428.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.859375, "rewards/margins": 26.0, "rewards/rejected": -21.125, "step": 130 }, { "epoch": 0.4373956594323873, "grad_norm": 7.664081931579858e-05, "learning_rate": 0.00015292968306212336, "logits/chosen": -4.453125, "logits/rejected": -2.9375, "logps/chosen": -101.5, "logps/rejected": -450.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.203125, "rewards/margins": 26.875, "rewards/rejected": -21.6875, "step": 131 }, { "epoch": 0.44073455759599334, "grad_norm": 0.00020385748939588666, "learning_rate": 0.00015281315755565498, "logits/chosen": -4.765625, "logits/rejected": -3.1015625, "logps/chosen": -95.25, "logps/rejected": -448.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.859375, "rewards/margins": 27.125, "rewards/rejected": -22.25, "step": 132 }, { "epoch": 0.44407345575959933, "grad_norm": 0.000807323376648128, "learning_rate": 0.0001526957247946555, "logits/chosen": -5.078125, "logits/rejected": -2.75, "logps/chosen": -98.75, "logps/rejected": -494.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.765625, "rewards/margins": 28.4375, "rewards/rejected": -23.6875, "step": 133 }, { "epoch": 0.44741235392320533, "grad_norm": 0.0005043560522608459, "learning_rate": 0.0001525773862423413, "logits/chosen": -4.984375, "logits/rejected": -2.921875, "logps/chosen": -100.0, "logps/rejected": -462.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.578125, "rewards/margins": 27.375, "rewards/rejected": -22.8125, "step": 134 }, { "epoch": 0.4507512520868113, "grad_norm": 0.0002074016520055011, "learning_rate": 0.00015245814337321492, "logits/chosen": -4.8125, "logits/rejected": -2.71875, "logps/chosen": -106.0, "logps/rejected": -532.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.640625, "rewards/margins": 29.4375, "rewards/rejected": -24.8125, "step": 135 }, { "epoch": 0.4540901502504174, "grad_norm": 8.115387754514813e-05, "learning_rate": 0.0001523379976730468, "logits/chosen": -4.90625, "logits/rejected": -2.90625, "logps/chosen": -88.5, "logps/rejected": -502.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.8125, "rewards/margins": 28.5, "rewards/rejected": -23.6875, "step": 136 }, { "epoch": 0.4574290484140234, "grad_norm": 3.2275711419060826e-06, "learning_rate": 0.00015221695063885664, "logits/chosen": -4.75, "logits/rejected": -2.8046875, "logps/chosen": -85.0, "logps/rejected": -494.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.03125, "rewards/margins": 29.1875, "rewards/rejected": -24.1875, "step": 137 }, { "epoch": 0.4607679465776294, "grad_norm": 0.0006476023118011653, "learning_rate": 0.00015209500377889472, "logits/chosen": -4.6875, "logits/rejected": -2.8359375, "logps/chosen": -107.75, "logps/rejected": -498.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.671875, "rewards/margins": 28.5, "rewards/rejected": -23.8125, "step": 138 }, { "epoch": 0.46410684474123537, "grad_norm": 4.4746982894139364e-05, "learning_rate": 0.00015197215861262325, "logits/chosen": -4.796875, "logits/rejected": -2.65625, "logps/chosen": -102.0, "logps/rejected": -566.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.140625, "rewards/margins": 30.875, "rewards/rejected": -25.75, "step": 139 }, { "epoch": 0.4674457429048414, "grad_norm": 0.05361659452319145, "learning_rate": 0.00015184841667069748, "logits/chosen": -4.796875, "logits/rejected": -2.9609375, "logps/chosen": -98.75, "logps/rejected": -447.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 4.78125, "rewards/margins": 28.0625, "rewards/rejected": -23.25, "step": 140 }, { "epoch": 0.4707846410684474, "grad_norm": 0.00023327009694185108, "learning_rate": 0.0001517237794949463, "logits/chosen": -4.609375, "logits/rejected": -3.03125, "logps/chosen": -103.5, "logps/rejected": -476.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.4453125, "rewards/margins": 28.5625, "rewards/rejected": -24.125, "step": 141 }, { "epoch": 0.4741235392320534, "grad_norm": 0.2315979301929474, "learning_rate": 0.00015159824863835336, "logits/chosen": -5.484375, "logits/rejected": -2.578125, "logps/chosen": -76.5, "logps/rejected": -536.0, "loss": 0.0126, "rewards/accuracies": 0.984375, "rewards/chosen": 3.59375, "rewards/margins": 29.0, "rewards/rejected": -25.4375, "step": 142 }, { "epoch": 0.4774624373956594, "grad_norm": 0.1798364669084549, "learning_rate": 0.00015147182566503764, "logits/chosen": -5.078125, "logits/rejected": -2.9296875, "logps/chosen": -101.25, "logps/rejected": -478.0, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 5.078125, "rewards/margins": 28.4375, "rewards/rejected": -23.4375, "step": 143 }, { "epoch": 0.48080133555926546, "grad_norm": 2.0006180420750752e-05, "learning_rate": 0.00015134451215023385, "logits/chosen": -4.34375, "logits/rejected": -2.96875, "logps/chosen": -103.0, "logps/rejected": -493.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.109375, "rewards/margins": 29.0, "rewards/rejected": -23.875, "step": 144 }, { "epoch": 0.48414023372287146, "grad_norm": 0.0023838214110583067, "learning_rate": 0.0001512163096802729, "logits/chosen": -4.96875, "logits/rejected": -2.828125, "logps/chosen": -107.25, "logps/rejected": -492.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.078125, "rewards/margins": 28.0625, "rewards/rejected": -23.9375, "step": 145 }, { "epoch": 0.48747913188647746, "grad_norm": 0.006409293040633202, "learning_rate": 0.00015108721985256215, "logits/chosen": -4.765625, "logits/rejected": -2.734375, "logps/chosen": -79.0, "logps/rejected": -548.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 4.625, "rewards/margins": 31.6875, "rewards/rejected": -27.0, "step": 146 }, { "epoch": 0.49081803005008345, "grad_norm": 0.00015124543278943747, "learning_rate": 0.00015095724427556544, "logits/chosen": -4.84375, "logits/rejected": -2.8515625, "logps/chosen": -102.25, "logps/rejected": -460.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.859375, "rewards/margins": 27.0625, "rewards/rejected": -22.1875, "step": 147 }, { "epoch": 0.4941569282136895, "grad_norm": 0.00012339219392742962, "learning_rate": 0.00015082638456878308, "logits/chosen": -4.75, "logits/rejected": -2.9921875, "logps/chosen": -89.0, "logps/rejected": -481.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.2265625, "rewards/margins": 28.9375, "rewards/rejected": -24.6875, "step": 148 }, { "epoch": 0.4974958263772955, "grad_norm": 1.230505313287722e-05, "learning_rate": 0.0001506946423627316, "logits/chosen": -5.0, "logits/rejected": -3.1171875, "logps/chosen": -82.5, "logps/rejected": -489.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.59375, "rewards/margins": 29.125, "rewards/rejected": -24.5, "step": 149 }, { "epoch": 0.5008347245409015, "grad_norm": 2.1004785594413988e-05, "learning_rate": 0.00015056201929892368, "logits/chosen": -4.25, "logits/rejected": -2.9765625, "logps/chosen": -112.25, "logps/rejected": -434.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.171875, "rewards/margins": 27.0625, "rewards/rejected": -22.875, "step": 150 }, { "epoch": 0.5041736227045075, "grad_norm": 0.00019315003009978682, "learning_rate": 0.00015042851702984732, "logits/chosen": -4.84375, "logits/rejected": -2.875, "logps/chosen": -108.75, "logps/rejected": -448.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.625, "rewards/margins": 27.5625, "rewards/rejected": -22.9375, "step": 151 }, { "epoch": 0.5075125208681135, "grad_norm": 0.003308866871520877, "learning_rate": 0.00015029413721894558, "logits/chosen": -4.75, "logits/rejected": -2.984375, "logps/chosen": -122.5, "logps/rejected": -488.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.609375, "rewards/margins": 28.625, "rewards/rejected": -24.0625, "step": 152 }, { "epoch": 0.5108514190317195, "grad_norm": 5.397659606387606e-06, "learning_rate": 0.00015015888154059568, "logits/chosen": -4.53125, "logits/rejected": -3.1640625, "logps/chosen": -101.5, "logps/rejected": -505.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.75, "rewards/margins": 29.3125, "rewards/rejected": -24.5625, "step": 153 }, { "epoch": 0.5141903171953256, "grad_norm": 0.10684552043676376, "learning_rate": 0.00015002275168008816, "logits/chosen": -5.46875, "logits/rejected": -3.015625, "logps/chosen": -71.75, "logps/rejected": -500.0, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 4.2578125, "rewards/margins": 29.4375, "rewards/rejected": -25.125, "step": 154 }, { "epoch": 0.5175292153589316, "grad_norm": 3.8425196180469356e-07, "learning_rate": 0.00014988574933360593, "logits/chosen": -4.71875, "logits/rejected": -2.9375, "logps/chosen": -101.25, "logps/rejected": -507.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.765625, "rewards/margins": 29.75, "rewards/rejected": -25.0, "step": 155 }, { "epoch": 0.5208681135225376, "grad_norm": 7.517022822867148e-06, "learning_rate": 0.0001497478762082031, "logits/chosen": -5.125, "logits/rejected": -2.609375, "logps/chosen": -96.75, "logps/rejected": -532.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.515625, "rewards/margins": 32.0, "rewards/rejected": -27.4375, "step": 156 }, { "epoch": 0.5242070116861436, "grad_norm": 0.00013846807996742427, "learning_rate": 0.00014960913402178373, "logits/chosen": -5.328125, "logits/rejected": -2.953125, "logps/chosen": -86.25, "logps/rejected": -520.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.015625, "rewards/margins": 30.5625, "rewards/rejected": -25.5625, "step": 157 }, { "epoch": 0.5275459098497496, "grad_norm": 3.7317280657589436e-05, "learning_rate": 0.00014946952450308035, "logits/chosen": -5.03125, "logits/rejected": -3.0859375, "logps/chosen": -83.5, "logps/rejected": -505.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.546875, "rewards/margins": 30.625, "rewards/rejected": -26.0625, "step": 158 }, { "epoch": 0.5308848080133556, "grad_norm": 1.3853728887625039e-05, "learning_rate": 0.00014932904939163257, "logits/chosen": -5.125, "logits/rejected": -2.8359375, "logps/chosen": -76.5, "logps/rejected": -490.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.859375, "rewards/margins": 29.8125, "rewards/rejected": -25.0625, "step": 159 }, { "epoch": 0.5342237061769616, "grad_norm": 6.569054676219821e-06, "learning_rate": 0.00014918771043776524, "logits/chosen": -4.734375, "logits/rejected": -3.0859375, "logps/chosen": -116.75, "logps/rejected": -504.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.453125, "rewards/margins": 29.75, "rewards/rejected": -25.25, "step": 160 }, { "epoch": 0.5375626043405676, "grad_norm": 0.018823888152837753, "learning_rate": 0.00014904550940256675, "logits/chosen": -4.6875, "logits/rejected": -2.8203125, "logps/chosen": -103.75, "logps/rejected": -521.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 4.953125, "rewards/margins": 30.125, "rewards/rejected": -25.1875, "step": 161 }, { "epoch": 0.5409015025041736, "grad_norm": 0.01165434904396534, "learning_rate": 0.00014890244805786706, "logits/chosen": -5.03125, "logits/rejected": -2.9921875, "logps/chosen": -100.25, "logps/rejected": -465.0, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": 4.46875, "rewards/margins": 29.375, "rewards/rejected": -24.875, "step": 162 }, { "epoch": 0.5442404006677797, "grad_norm": 2.0660480004153214e-05, "learning_rate": 0.00014875852818621563, "logits/chosen": -4.6875, "logits/rejected": -2.8671875, "logps/chosen": -112.0, "logps/rejected": -467.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.6875, "rewards/margins": 29.5625, "rewards/rejected": -24.8125, "step": 163 }, { "epoch": 0.5475792988313857, "grad_norm": 0.002862096531316638, "learning_rate": 0.00014861375158085915, "logits/chosen": -4.46875, "logits/rejected": -2.9921875, "logps/chosen": -110.75, "logps/rejected": -487.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.703125, "rewards/margins": 29.5625, "rewards/rejected": -24.8125, "step": 164 }, { "epoch": 0.5509181969949917, "grad_norm": 0.0027916012331843376, "learning_rate": 0.00014846812004571928, "logits/chosen": -4.953125, "logits/rejected": -3.1015625, "logps/chosen": -92.25, "logps/rejected": -482.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 4.234375, "rewards/margins": 29.375, "rewards/rejected": -25.125, "step": 165 }, { "epoch": 0.5542570951585977, "grad_norm": 0.00024799967650324106, "learning_rate": 0.0001483216353953701, "logits/chosen": -4.71875, "logits/rejected": -2.9296875, "logps/chosen": -108.0, "logps/rejected": -451.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.6875, "rewards/margins": 28.5625, "rewards/rejected": -23.9375, "step": 166 }, { "epoch": 0.5575959933222037, "grad_norm": 0.0007697382825426757, "learning_rate": 0.00014817429945501563, "logits/chosen": -4.578125, "logits/rejected": -2.875, "logps/chosen": -109.5, "logps/rejected": -505.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.0859375, "rewards/margins": 29.0, "rewards/rejected": -24.875, "step": 167 }, { "epoch": 0.5609348914858097, "grad_norm": 3.113675120403059e-05, "learning_rate": 0.00014802611406046685, "logits/chosen": -4.953125, "logits/rejected": -3.140625, "logps/chosen": -89.5, "logps/rejected": -470.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.8125, "rewards/margins": 29.25, "rewards/rejected": -24.5, "step": 168 }, { "epoch": 0.5642737896494157, "grad_norm": 0.0006815637461841106, "learning_rate": 0.00014787708105811905, "logits/chosen": -4.984375, "logits/rejected": -2.984375, "logps/chosen": -88.375, "logps/rejected": -471.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.859375, "rewards/margins": 29.375, "rewards/rejected": -24.5, "step": 169 }, { "epoch": 0.5676126878130217, "grad_norm": 0.00014252612891141325, "learning_rate": 0.00014772720230492878, "logits/chosen": -5.15625, "logits/rejected": -2.8203125, "logps/chosen": -103.25, "logps/rejected": -525.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.703125, "rewards/margins": 30.3125, "rewards/rejected": -25.625, "step": 170 }, { "epoch": 0.5709515859766278, "grad_norm": 2.218190456915181e-05, "learning_rate": 0.00014757647966839058, "logits/chosen": -5.0, "logits/rejected": -2.8359375, "logps/chosen": -81.875, "logps/rejected": -514.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.15625, "rewards/margins": 30.375, "rewards/rejected": -26.25, "step": 171 }, { "epoch": 0.5742904841402338, "grad_norm": 0.00024572337861172855, "learning_rate": 0.0001474249150265139, "logits/chosen": -4.921875, "logits/rejected": -2.921875, "logps/chosen": -77.125, "logps/rejected": -491.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.734375, "rewards/margins": 29.8125, "rewards/rejected": -25.0625, "step": 172 }, { "epoch": 0.5776293823038398, "grad_norm": 0.062237586826086044, "learning_rate": 0.00014727251026779953, "logits/chosen": -4.921875, "logits/rejected": -3.0625, "logps/chosen": -104.25, "logps/rejected": -480.0, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 3.84375, "rewards/margins": 28.3125, "rewards/rejected": -24.5, "step": 173 }, { "epoch": 0.5809682804674458, "grad_norm": 0.010632738471031189, "learning_rate": 0.0001471192672912162, "logits/chosen": -5.0, "logits/rejected": -2.859375, "logps/chosen": -91.0, "logps/rejected": -540.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 4.53125, "rewards/margins": 31.75, "rewards/rejected": -27.25, "step": 174 }, { "epoch": 0.5843071786310517, "grad_norm": 0.004395844414830208, "learning_rate": 0.00014696518800617686, "logits/chosen": -4.796875, "logits/rejected": -3.109375, "logps/chosen": -100.25, "logps/rejected": -480.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 3.703125, "rewards/margins": 27.8125, "rewards/rejected": -24.0625, "step": 175 }, { "epoch": 0.5876460767946577, "grad_norm": 0.0008318977779708803, "learning_rate": 0.00014681027433251486, "logits/chosen": -4.546875, "logits/rejected": -3.078125, "logps/chosen": -108.25, "logps/rejected": -473.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.765625, "rewards/margins": 29.4375, "rewards/rejected": -24.625, "step": 176 }, { "epoch": 0.5909849749582637, "grad_norm": 0.47385621070861816, "learning_rate": 0.00014665452820046006, "logits/chosen": -4.71875, "logits/rejected": -3.015625, "logps/chosen": -92.25, "logps/rejected": -467.0, "loss": 0.0391, "rewards/accuracies": 0.984375, "rewards/chosen": 4.125, "rewards/margins": 27.625, "rewards/rejected": -23.5625, "step": 177 }, { "epoch": 0.5943238731218697, "grad_norm": 2.916028734034626e-07, "learning_rate": 0.00014649795155061485, "logits/chosen": -4.921875, "logits/rejected": -2.828125, "logps/chosen": -86.25, "logps/rejected": -518.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.65625, "rewards/margins": 30.625, "rewards/rejected": -26.0, "step": 178 }, { "epoch": 0.5976627712854758, "grad_norm": 0.06330767273902893, "learning_rate": 0.00014634054633392982, "logits/chosen": -4.5, "logits/rejected": -2.75, "logps/chosen": -98.25, "logps/rejected": -497.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 4.875, "rewards/margins": 29.6875, "rewards/rejected": -24.875, "step": 179 }, { "epoch": 0.6010016694490818, "grad_norm": 0.0008164051687344909, "learning_rate": 0.00014618231451167955, "logits/chosen": -4.640625, "logits/rejected": -2.65625, "logps/chosen": -94.75, "logps/rejected": -432.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.46875, "rewards/margins": 26.5625, "rewards/rejected": -21.0625, "step": 180 }, { "epoch": 0.6043405676126878, "grad_norm": 6.780491821700707e-05, "learning_rate": 0.00014602325805543822, "logits/chosen": -4.265625, "logits/rejected": -2.84375, "logps/chosen": -97.25, "logps/rejected": -453.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.875, "rewards/margins": 27.5, "rewards/rejected": -22.5625, "step": 181 }, { "epoch": 0.6076794657762938, "grad_norm": 4.8602585593471304e-05, "learning_rate": 0.00014586337894705487, "logits/chosen": -4.1875, "logits/rejected": -2.78125, "logps/chosen": -94.0, "logps/rejected": -409.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.140625, "rewards/margins": 26.125, "rewards/rejected": -21.0, "step": 182 }, { "epoch": 0.6110183639398998, "grad_norm": 0.0002642914478201419, "learning_rate": 0.00014570267917862891, "logits/chosen": -4.421875, "logits/rejected": -2.703125, "logps/chosen": -99.25, "logps/rejected": -434.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.171875, "rewards/margins": 24.4375, "rewards/rejected": -19.25, "step": 183 }, { "epoch": 0.6143572621035058, "grad_norm": 0.0028488298412412405, "learning_rate": 0.00014554116075248514, "logits/chosen": -4.171875, "logits/rejected": -2.6171875, "logps/chosen": -100.25, "logps/rejected": -429.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.015625, "rewards/margins": 25.125, "rewards/rejected": -20.125, "step": 184 }, { "epoch": 0.6176961602671118, "grad_norm": 0.00043352670036256313, "learning_rate": 0.0001453788256811489, "logits/chosen": -4.078125, "logits/rejected": -2.7578125, "logps/chosen": -100.25, "logps/rejected": -406.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.03125, "rewards/margins": 24.6875, "rewards/rejected": -18.6875, "step": 185 }, { "epoch": 0.6210350584307178, "grad_norm": 0.0006578704342246056, "learning_rate": 0.00014521567598732097, "logits/chosen": -4.2421875, "logits/rejected": -2.515625, "logps/chosen": -83.0, "logps/rejected": -400.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.6875, "rewards/margins": 23.375, "rewards/rejected": -17.6875, "step": 186 }, { "epoch": 0.6243739565943238, "grad_norm": 0.0001582528348080814, "learning_rate": 0.00014505171370385233, "logits/chosen": -4.2265625, "logits/rejected": -2.5546875, "logps/chosen": -87.25, "logps/rejected": -462.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.125, "rewards/margins": 26.0625, "rewards/rejected": -19.9375, "step": 187 }, { "epoch": 0.6277128547579299, "grad_norm": 2.958109871542547e-05, "learning_rate": 0.00014488694087371883, "logits/chosen": -4.234375, "logits/rejected": -2.1796875, "logps/chosen": -85.75, "logps/rejected": -447.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.078125, "rewards/margins": 25.5625, "rewards/rejected": -19.5, "step": 188 }, { "epoch": 0.6310517529215359, "grad_norm": 0.0010354547994211316, "learning_rate": 0.00014472135954999581, "logits/chosen": -4.0, "logits/rejected": -2.5546875, "logps/chosen": -90.75, "logps/rejected": -452.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.515625, "rewards/margins": 25.4375, "rewards/rejected": -19.9375, "step": 189 }, { "epoch": 0.6343906510851419, "grad_norm": 0.0017567307222634554, "learning_rate": 0.00014455497179583244, "logits/chosen": -4.3125, "logits/rejected": -2.5546875, "logps/chosen": -79.0, "logps/rejected": -453.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.09375, "rewards/margins": 23.875, "rewards/rejected": -18.8125, "step": 190 }, { "epoch": 0.6377295492487479, "grad_norm": 1.4471517715719528e-05, "learning_rate": 0.00014438777968442607, "logits/chosen": -3.8671875, "logits/rejected": -2.5390625, "logps/chosen": -99.75, "logps/rejected": -490.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.640625, "rewards/margins": 25.8125, "rewards/rejected": -20.1875, "step": 191 }, { "epoch": 0.6410684474123539, "grad_norm": 1.1293011993984692e-05, "learning_rate": 0.00014421978529899633, "logits/chosen": -3.8046875, "logits/rejected": -2.4296875, "logps/chosen": -70.375, "logps/rejected": -495.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.09375, "rewards/margins": 26.25, "rewards/rejected": -21.1875, "step": 192 }, { "epoch": 0.6444073455759599, "grad_norm": 0.031924691051244736, "learning_rate": 0.00014405099073275924, "logits/chosen": -3.953125, "logits/rejected": -2.3359375, "logps/chosen": -111.0, "logps/rejected": -445.0, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 5.71875, "rewards/margins": 24.75, "rewards/rejected": -19.0625, "step": 193 }, { "epoch": 0.6477462437395659, "grad_norm": 8.30372482596431e-06, "learning_rate": 0.00014388139808890112, "logits/chosen": -3.796875, "logits/rejected": -2.421875, "logps/chosen": -108.75, "logps/rejected": -434.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.234375, "rewards/margins": 24.5625, "rewards/rejected": -18.375, "step": 194 }, { "epoch": 0.6510851419031719, "grad_norm": 0.016556670889258385, "learning_rate": 0.00014371100948055226, "logits/chosen": -4.296875, "logits/rejected": -2.4921875, "logps/chosen": -88.5, "logps/rejected": -417.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 5.015625, "rewards/margins": 23.25, "rewards/rejected": -18.25, "step": 195 }, { "epoch": 0.654424040066778, "grad_norm": 0.0003383951261639595, "learning_rate": 0.0001435398270307609, "logits/chosen": -3.8203125, "logits/rejected": -2.5625, "logps/chosen": -96.25, "logps/rejected": -386.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.03125, "rewards/margins": 23.625, "rewards/rejected": -17.625, "step": 196 }, { "epoch": 0.657762938230384, "grad_norm": 0.00017920513346325606, "learning_rate": 0.00014336785287246632, "logits/chosen": -4.0625, "logits/rejected": -2.4140625, "logps/chosen": -78.75, "logps/rejected": -421.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.890625, "rewards/margins": 24.0, "rewards/rejected": -18.125, "step": 197 }, { "epoch": 0.66110183639399, "grad_norm": 0.0014842700911685824, "learning_rate": 0.00014319508914847274, "logits/chosen": -3.640625, "logits/rejected": -2.4765625, "logps/chosen": -107.5, "logps/rejected": -433.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.578125, "rewards/margins": 23.0625, "rewards/rejected": -17.4375, "step": 198 }, { "epoch": 0.664440734557596, "grad_norm": 0.0002902350970543921, "learning_rate": 0.00014302153801142226, "logits/chosen": -3.671875, "logits/rejected": -2.359375, "logps/chosen": -105.25, "logps/rejected": -386.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.234375, "rewards/margins": 23.0, "rewards/rejected": -16.6875, "step": 199 }, { "epoch": 0.667779632721202, "grad_norm": 0.000622183782979846, "learning_rate": 0.00014284720162376823, "logits/chosen": -4.0546875, "logits/rejected": -2.265625, "logps/chosen": -93.0, "logps/rejected": -482.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.34375, "rewards/margins": 24.6875, "rewards/rejected": -19.3125, "step": 200 }, { "epoch": 0.671118530884808, "grad_norm": 0.032146863639354706, "learning_rate": 0.0001426720821577482, "logits/chosen": -4.1171875, "logits/rejected": -2.390625, "logps/chosen": -70.0, "logps/rejected": -463.0, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 5.25, "rewards/margins": 24.3125, "rewards/rejected": -19.0625, "step": 201 }, { "epoch": 0.674457429048414, "grad_norm": 0.0003144640941172838, "learning_rate": 0.000142496181795357, "logits/chosen": -4.125, "logits/rejected": -2.4296875, "logps/chosen": -91.0, "logps/rejected": -429.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.671875, "rewards/margins": 23.3125, "rewards/rejected": -17.625, "step": 202 }, { "epoch": 0.67779632721202, "grad_norm": 0.00027810977189801633, "learning_rate": 0.00014231950272831936, "logits/chosen": -3.9375, "logits/rejected": -2.4453125, "logps/chosen": -92.5, "logps/rejected": -425.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.21875, "rewards/margins": 23.8125, "rewards/rejected": -18.6875, "step": 203 }, { "epoch": 0.6811352253756261, "grad_norm": 0.12220592051744461, "learning_rate": 0.00014214204715806271, "logits/chosen": -3.8828125, "logits/rejected": -2.5234375, "logps/chosen": -100.5, "logps/rejected": -428.0, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 5.890625, "rewards/margins": 24.1875, "rewards/rejected": -18.25, "step": 204 }, { "epoch": 0.6844741235392321, "grad_norm": 1.1589469067985192e-05, "learning_rate": 0.00014196381729568983, "logits/chosen": -3.6640625, "logits/rejected": -2.3828125, "logps/chosen": -126.5, "logps/rejected": -467.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.015625, "rewards/margins": 25.3125, "rewards/rejected": -19.25, "step": 205 }, { "epoch": 0.6878130217028381, "grad_norm": 2.4499566279700957e-05, "learning_rate": 0.00014178481536195113, "logits/chosen": -4.203125, "logits/rejected": -2.6015625, "logps/chosen": -71.25, "logps/rejected": -402.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.75, "rewards/margins": 23.5, "rewards/rejected": -17.75, "step": 206 }, { "epoch": 0.6911519198664441, "grad_norm": 0.00026768725365400314, "learning_rate": 0.000141605043587217, "logits/chosen": -4.0546875, "logits/rejected": -2.296875, "logps/chosen": -73.75, "logps/rejected": -426.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.3125, "rewards/margins": 23.4375, "rewards/rejected": -18.0625, "step": 207 }, { "epoch": 0.6944908180300501, "grad_norm": 4.719466232927516e-05, "learning_rate": 0.0001414245042114502, "logits/chosen": -4.0, "logits/rejected": -2.4453125, "logps/chosen": -62.75, "logps/rejected": -405.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.734375, "rewards/margins": 23.4375, "rewards/rejected": -17.75, "step": 208 }, { "epoch": 0.6978297161936561, "grad_norm": 0.0002969894267152995, "learning_rate": 0.00014124319948417773, "logits/chosen": -3.84375, "logits/rejected": -2.2734375, "logps/chosen": -88.5, "logps/rejected": -420.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.359375, "rewards/margins": 23.1875, "rewards/rejected": -17.875, "step": 209 }, { "epoch": 0.7011686143572621, "grad_norm": 0.0002236636937595904, "learning_rate": 0.000141061131664463, "logits/chosen": -4.0625, "logits/rejected": -2.4609375, "logps/chosen": -71.25, "logps/rejected": -454.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.5, "rewards/margins": 25.0, "rewards/rejected": -19.5, "step": 210 }, { "epoch": 0.7045075125208681, "grad_norm": 1.3881902305001859e-05, "learning_rate": 0.00014087830302087742, "logits/chosen": -3.875, "logits/rejected": -2.359375, "logps/chosen": -76.0, "logps/rejected": -468.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.515625, "rewards/margins": 25.0625, "rewards/rejected": -19.5625, "step": 211 }, { "epoch": 0.7078464106844741, "grad_norm": 0.018235376104712486, "learning_rate": 0.00014069471583147249, "logits/chosen": -3.71875, "logits/rejected": -2.484375, "logps/chosen": -114.5, "logps/rejected": -384.0, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 5.15625, "rewards/margins": 21.75, "rewards/rejected": -16.625, "step": 212 }, { "epoch": 0.7111853088480802, "grad_norm": 0.00014337942411657423, "learning_rate": 0.00014051037238375103, "logits/chosen": -3.7421875, "logits/rejected": -2.5234375, "logps/chosen": -102.0, "logps/rejected": -421.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.4375, "rewards/margins": 23.1875, "rewards/rejected": -17.75, "step": 213 }, { "epoch": 0.7145242070116862, "grad_norm": 3.2657169413141673e-06, "learning_rate": 0.00014032527497463901, "logits/chosen": -3.5234375, "logits/rejected": -2.2578125, "logps/chosen": -108.0, "logps/rejected": -398.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.796875, "rewards/margins": 23.0, "rewards/rejected": -17.1875, "step": 214 }, { "epoch": 0.7178631051752922, "grad_norm": 0.0009526070207357407, "learning_rate": 0.00014013942591045668, "logits/chosen": -3.671875, "logits/rejected": -2.140625, "logps/chosen": -107.75, "logps/rejected": -423.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.84375, "rewards/margins": 22.75, "rewards/rejected": -16.9375, "step": 215 }, { "epoch": 0.7212020033388982, "grad_norm": 0.0013299849815666676, "learning_rate": 0.00013995282750689001, "logits/chosen": -3.6171875, "logits/rejected": -2.2578125, "logps/chosen": -91.625, "logps/rejected": -461.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.65625, "rewards/margins": 24.4375, "rewards/rejected": -18.75, "step": 216 }, { "epoch": 0.7245409015025042, "grad_norm": 0.0005564424791373312, "learning_rate": 0.00013976548208896177, "logits/chosen": -3.3125, "logits/rejected": -2.3515625, "logps/chosen": -120.5, "logps/rejected": -394.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.203125, "rewards/margins": 23.5, "rewards/rejected": -17.3125, "step": 217 }, { "epoch": 0.7278797996661102, "grad_norm": 0.0002660456520970911, "learning_rate": 0.00013957739199100248, "logits/chosen": -3.734375, "logits/rejected": -2.3203125, "logps/chosen": -87.5, "logps/rejected": -453.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.390625, "rewards/margins": 24.125, "rewards/rejected": -18.6875, "step": 218 }, { "epoch": 0.7312186978297162, "grad_norm": 8.14365193946287e-05, "learning_rate": 0.00013938855955662142, "logits/chosen": -3.140625, "logits/rejected": -2.3515625, "logps/chosen": -108.75, "logps/rejected": -404.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.921875, "rewards/margins": 23.125, "rewards/rejected": -17.125, "step": 219 }, { "epoch": 0.7345575959933222, "grad_norm": 3.825878229690716e-05, "learning_rate": 0.00013919898713867754, "logits/chosen": -3.8359375, "logits/rejected": -2.4296875, "logps/chosen": -91.75, "logps/rejected": -420.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.1875, "rewards/margins": 23.8125, "rewards/rejected": -17.59375, "step": 220 }, { "epoch": 0.7378964941569283, "grad_norm": 0.0003690333687700331, "learning_rate": 0.00013900867709924978, "logits/chosen": -3.90625, "logits/rejected": -2.2734375, "logps/chosen": -94.25, "logps/rejected": -448.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.390625, "rewards/margins": 24.1875, "rewards/rejected": -18.8125, "step": 221 }, { "epoch": 0.7412353923205343, "grad_norm": 0.0014959557447582483, "learning_rate": 0.00013881763180960809, "logits/chosen": -3.7890625, "logits/rejected": -2.4609375, "logps/chosen": -119.75, "logps/rejected": -417.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.734375, "rewards/margins": 23.8125, "rewards/rejected": -18.125, "step": 222 }, { "epoch": 0.7445742904841403, "grad_norm": 0.0005811589653603733, "learning_rate": 0.00013862585365018352, "logits/chosen": -4.0, "logits/rejected": -2.390625, "logps/chosen": -109.5, "logps/rejected": -429.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.8125, "rewards/margins": 23.6875, "rewards/rejected": -17.875, "step": 223 }, { "epoch": 0.7479131886477463, "grad_norm": 0.00026676716515794396, "learning_rate": 0.00013843334501053878, "logits/chosen": -3.921875, "logits/rejected": -2.390625, "logps/chosen": -88.75, "logps/rejected": -430.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.46875, "rewards/margins": 24.0, "rewards/rejected": -18.5, "step": 224 }, { "epoch": 0.7512520868113522, "grad_norm": 0.00046272281906567514, "learning_rate": 0.00013824010828933833, "logits/chosen": -3.96875, "logits/rejected": -2.3515625, "logps/chosen": -70.25, "logps/rejected": -453.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.53125, "rewards/margins": 24.8125, "rewards/rejected": -19.1875, "step": 225 }, { "epoch": 0.7545909849749582, "grad_norm": 0.0005486281588673592, "learning_rate": 0.0001380461458943186, "logits/chosen": -3.8046875, "logits/rejected": -2.34375, "logps/chosen": -90.75, "logps/rejected": -458.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.453125, "rewards/margins": 25.1875, "rewards/rejected": -19.75, "step": 226 }, { "epoch": 0.7579298831385642, "grad_norm": 0.005520923994481564, "learning_rate": 0.00013785146024225797, "logits/chosen": -4.359375, "logits/rejected": -2.46875, "logps/chosen": -94.0, "logps/rejected": -392.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.59375, "rewards/margins": 22.75, "rewards/rejected": -17.125, "step": 227 }, { "epoch": 0.7612687813021702, "grad_norm": 8.077368693193421e-05, "learning_rate": 0.0001376560537589465, "logits/chosen": -3.5625, "logits/rejected": -2.2734375, "logps/chosen": -82.0, "logps/rejected": -415.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.796875, "rewards/margins": 23.125, "rewards/rejected": -18.375, "step": 228 }, { "epoch": 0.7646076794657763, "grad_norm": 0.00024681369541212916, "learning_rate": 0.000137459928879156, "logits/chosen": -4.21875, "logits/rejected": -2.15625, "logps/chosen": -76.25, "logps/rejected": -468.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.03125, "rewards/margins": 24.75, "rewards/rejected": -19.75, "step": 229 }, { "epoch": 0.7679465776293823, "grad_norm": 2.9520870157284662e-06, "learning_rate": 0.00013726308804660938, "logits/chosen": -3.796875, "logits/rejected": -2.40625, "logps/chosen": -99.5, "logps/rejected": -455.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.296875, "rewards/margins": 25.3125, "rewards/rejected": -20.0, "step": 230 }, { "epoch": 0.7712854757929883, "grad_norm": 0.4641003906726837, "learning_rate": 0.00013706553371395044, "logits/chosen": -3.8046875, "logits/rejected": -2.375, "logps/chosen": -94.75, "logps/rejected": -438.0, "loss": 0.0183, "rewards/accuracies": 0.984375, "rewards/chosen": 5.0625, "rewards/margins": 23.5, "rewards/rejected": -18.375, "step": 231 }, { "epoch": 0.7746243739565943, "grad_norm": 0.0002731184067670256, "learning_rate": 0.00013686726834271316, "logits/chosen": -3.71875, "logits/rejected": -2.3203125, "logps/chosen": -89.25, "logps/rejected": -367.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.734375, "rewards/margins": 21.8125, "rewards/rejected": -16.03125, "step": 232 }, { "epoch": 0.7779632721202003, "grad_norm": 0.02955230325460434, "learning_rate": 0.00013666829440329113, "logits/chosen": -4.1875, "logits/rejected": -2.203125, "logps/chosen": -59.125, "logps/rejected": -431.0, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 5.296875, "rewards/margins": 23.625, "rewards/rejected": -18.375, "step": 233 }, { "epoch": 0.7813021702838063, "grad_norm": 3.0029235858819447e-05, "learning_rate": 0.00013646861437490674, "logits/chosen": -3.7265625, "logits/rejected": -2.2265625, "logps/chosen": -98.25, "logps/rejected": -416.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.625, "rewards/margins": 23.375, "rewards/rejected": -17.75, "step": 234 }, { "epoch": 0.7846410684474123, "grad_norm": 0.018931280821561813, "learning_rate": 0.00013626823074558019, "logits/chosen": -3.890625, "logits/rejected": -2.203125, "logps/chosen": -89.0, "logps/rejected": -415.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 5.03125, "rewards/margins": 22.75, "rewards/rejected": -17.6875, "step": 235 }, { "epoch": 0.7879799666110183, "grad_norm": 3.846998879453167e-05, "learning_rate": 0.00013606714601209865, "logits/chosen": -3.5234375, "logits/rejected": -2.1953125, "logps/chosen": -92.0, "logps/rejected": -404.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.515625, "rewards/margins": 22.625, "rewards/rejected": -17.125, "step": 236 }, { "epoch": 0.7913188647746243, "grad_norm": 0.00047268884372897446, "learning_rate": 0.00013586536267998504, "logits/chosen": -3.96875, "logits/rejected": -2.1171875, "logps/chosen": -90.25, "logps/rejected": -482.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.109375, "rewards/margins": 25.9375, "rewards/rejected": -19.8125, "step": 237 }, { "epoch": 0.7946577629382304, "grad_norm": 0.00010255785309709609, "learning_rate": 0.00013566288326346683, "logits/chosen": -3.9765625, "logits/rejected": -2.140625, "logps/chosen": -76.0, "logps/rejected": -383.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.5, "rewards/margins": 22.9375, "rewards/rejected": -17.4375, "step": 238 }, { "epoch": 0.7979966611018364, "grad_norm": 0.00026641954900696874, "learning_rate": 0.0001354597102854448, "logits/chosen": -3.71875, "logits/rejected": -2.21875, "logps/chosen": -88.25, "logps/rejected": -368.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.875, "rewards/margins": 22.1875, "rewards/rejected": -16.3125, "step": 239 }, { "epoch": 0.8013355592654424, "grad_norm": 0.0015892143128439784, "learning_rate": 0.00013525584627746142, "logits/chosen": -3.6328125, "logits/rejected": -2.03125, "logps/chosen": -98.5, "logps/rejected": -400.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.515625, "rewards/margins": 22.375, "rewards/rejected": -16.90625, "step": 240 }, { "epoch": 0.8046744574290484, "grad_norm": 0.0008768016705289483, "learning_rate": 0.0001350512937796695, "logits/chosen": -4.09375, "logits/rejected": -2.12890625, "logps/chosen": -61.875, "logps/rejected": -414.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.453125, "rewards/margins": 22.75, "rewards/rejected": -17.375, "step": 241 }, { "epoch": 0.8080133555926544, "grad_norm": 0.0008365919347852468, "learning_rate": 0.00013484605534080045, "logits/chosen": -3.484375, "logits/rejected": -2.02734375, "logps/chosen": -90.25, "logps/rejected": -382.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.65625, "rewards/margins": 21.625, "rewards/rejected": -15.96875, "step": 242 }, { "epoch": 0.8113522537562604, "grad_norm": 0.0011707853991538286, "learning_rate": 0.00013464013351813248, "logits/chosen": -3.8125, "logits/rejected": -2.1953125, "logps/chosen": -75.5, "logps/rejected": -402.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.734375, "rewards/margins": 22.5, "rewards/rejected": -16.75, "step": 243 }, { "epoch": 0.8146911519198664, "grad_norm": 0.05999299883842468, "learning_rate": 0.00013443353087745885, "logits/chosen": -3.546875, "logits/rejected": -2.0390625, "logps/chosen": -119.75, "logps/rejected": -389.0, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 6.296875, "rewards/margins": 21.375, "rewards/rejected": -15.09375, "step": 244 }, { "epoch": 0.8180300500834724, "grad_norm": 1.330207032879116e-05, "learning_rate": 0.00013422624999305578, "logits/chosen": -3.25, "logits/rejected": -2.0859375, "logps/chosen": -101.0, "logps/rejected": -434.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.625, "rewards/margins": 23.3125, "rewards/rejected": -17.625, "step": 245 }, { "epoch": 0.8213689482470785, "grad_norm": 0.001336806220933795, "learning_rate": 0.00013401829344765045, "logits/chosen": -3.65625, "logits/rejected": -2.078125, "logps/chosen": -93.75, "logps/rejected": -427.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.109375, "rewards/margins": 22.0625, "rewards/rejected": -17.0, "step": 246 }, { "epoch": 0.8247078464106845, "grad_norm": 1.3096532711642794e-05, "learning_rate": 0.00013380966383238883, "logits/chosen": -3.5625, "logits/rejected": -2.125, "logps/chosen": -95.25, "logps/rejected": -477.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.21875, "rewards/margins": 24.9375, "rewards/rejected": -19.625, "step": 247 }, { "epoch": 0.8280467445742905, "grad_norm": 0.0002120399149134755, "learning_rate": 0.00013360036374680334, "logits/chosen": -3.5625, "logits/rejected": -1.796875, "logps/chosen": -82.75, "logps/rejected": -436.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.5, "rewards/margins": 23.5625, "rewards/rejected": -18.0625, "step": 248 }, { "epoch": 0.8313856427378965, "grad_norm": 0.0001271862565772608, "learning_rate": 0.0001333903957987805, "logits/chosen": -3.46875, "logits/rejected": -2.15625, "logps/chosen": -80.25, "logps/rejected": -415.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.859375, "rewards/margins": 24.25, "rewards/rejected": -18.4375, "step": 249 }, { "epoch": 0.8347245409015025, "grad_norm": 1.912344669108279e-05, "learning_rate": 0.00013317976260452836, "logits/chosen": -3.4765625, "logits/rejected": -1.74609375, "logps/chosen": -108.75, "logps/rejected": -422.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 6.1875, "rewards/margins": 22.9375, "rewards/rejected": -16.78125, "step": 250 }, { "epoch": 0.8380634390651085, "grad_norm": 0.0015317240031436086, "learning_rate": 0.00013296846678854406, "logits/chosen": -3.7265625, "logits/rejected": -2.109375, "logps/chosen": -84.0, "logps/rejected": -411.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.59375, "rewards/margins": 23.625, "rewards/rejected": -18.0, "step": 251 }, { "epoch": 0.8414023372287145, "grad_norm": 0.013954302296042442, "learning_rate": 0.0001327565109835809, "logits/chosen": -3.578125, "logits/rejected": -2.078125, "logps/chosen": -119.5, "logps/rejected": -471.0, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 5.25, "rewards/margins": 24.0, "rewards/rejected": -18.75, "step": 252 }, { "epoch": 0.8447412353923205, "grad_norm": 0.0019699318800121546, "learning_rate": 0.00013254389783061584, "logits/chosen": -3.6953125, "logits/rejected": -1.88671875, "logps/chosen": -105.75, "logps/rejected": -401.0, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": 5.484375, "rewards/margins": 22.625, "rewards/rejected": -17.15625, "step": 253 }, { "epoch": 0.8480801335559266, "grad_norm": 0.03861398622393608, "learning_rate": 0.00013233062997881627, "logits/chosen": -3.453125, "logits/rejected": -2.0859375, "logps/chosen": -104.75, "logps/rejected": -402.0, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 5.5625, "rewards/margins": 22.25, "rewards/rejected": -16.6875, "step": 254 }, { "epoch": 0.8514190317195326, "grad_norm": 8.354683086508885e-05, "learning_rate": 0.00013211671008550718, "logits/chosen": -3.7265625, "logits/rejected": -1.89453125, "logps/chosen": -89.25, "logps/rejected": -417.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.46875, "rewards/margins": 23.0625, "rewards/rejected": -17.625, "step": 255 }, { "epoch": 0.8547579298831386, "grad_norm": 0.015042081475257874, "learning_rate": 0.0001319021408161381, "logits/chosen": -3.671875, "logits/rejected": -2.078125, "logps/chosen": -89.0, "logps/rejected": -421.0, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": 5.78125, "rewards/margins": 24.375, "rewards/rejected": -18.5625, "step": 256 }, { "epoch": 0.8580968280467446, "grad_norm": 0.0004783151962328702, "learning_rate": 0.0001316869248442497, "logits/chosen": -3.5078125, "logits/rejected": -2.15625, "logps/chosen": -100.25, "logps/rejected": -445.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.21875, "rewards/margins": 24.3125, "rewards/rejected": -19.0625, "step": 257 }, { "epoch": 0.8614357262103506, "grad_norm": 0.11549096554517746, "learning_rate": 0.00013147106485144068, "logits/chosen": -3.5703125, "logits/rejected": -1.9765625, "logps/chosen": -98.0, "logps/rejected": -449.0, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 4.953125, "rewards/margins": 23.625, "rewards/rejected": -18.625, "step": 258 }, { "epoch": 0.8647746243739566, "grad_norm": 0.005697562824934721, "learning_rate": 0.00013125456352733423, "logits/chosen": -3.5859375, "logits/rejected": -2.0859375, "logps/chosen": -104.5, "logps/rejected": -425.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 5.3125, "rewards/margins": 23.375, "rewards/rejected": -18.0625, "step": 259 }, { "epoch": 0.8681135225375626, "grad_norm": 0.07381512969732285, "learning_rate": 0.0001310374235695445, "logits/chosen": -3.328125, "logits/rejected": -1.92578125, "logps/chosen": -98.25, "logps/rejected": -465.0, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 5.296875, "rewards/margins": 25.9375, "rewards/rejected": -20.625, "step": 260 }, { "epoch": 0.8714524207011686, "grad_norm": 0.004491521045565605, "learning_rate": 0.00013081964768364308, "logits/chosen": -3.546875, "logits/rejected": -2.140625, "logps/chosen": -95.5, "logps/rejected": -450.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 5.28125, "rewards/margins": 24.875, "rewards/rejected": -19.625, "step": 261 }, { "epoch": 0.8747913188647746, "grad_norm": 0.00013118820788804442, "learning_rate": 0.0001306012385831253, "logits/chosen": -3.765625, "logits/rejected": -2.109375, "logps/chosen": -106.25, "logps/rejected": -419.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.734375, "rewards/margins": 23.4375, "rewards/rejected": -18.75, "step": 262 }, { "epoch": 0.8781302170283807, "grad_norm": 0.00017179777205456048, "learning_rate": 0.00013038219898937629, "logits/chosen": -3.875, "logits/rejected": -2.1171875, "logps/chosen": -63.125, "logps/rejected": -422.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.171875, "rewards/margins": 23.75, "rewards/rejected": -18.5625, "step": 263 }, { "epoch": 0.8814691151919867, "grad_norm": 0.00010286461474606767, "learning_rate": 0.00013016253163163714, "logits/chosen": -3.4765625, "logits/rejected": -2.03125, "logps/chosen": -100.25, "logps/rejected": -409.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.046875, "rewards/margins": 22.875, "rewards/rejected": -17.8125, "step": 264 }, { "epoch": 0.8848080133555927, "grad_norm": 0.0005210632225498557, "learning_rate": 0.000129942239246971, "logits/chosen": -3.765625, "logits/rejected": -2.1015625, "logps/chosen": -90.75, "logps/rejected": -473.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.671875, "rewards/margins": 26.4375, "rewards/rejected": -20.6875, "step": 265 }, { "epoch": 0.8881469115191987, "grad_norm": 0.0008880659588612616, "learning_rate": 0.00012972132458022878, "logits/chosen": -3.53125, "logits/rejected": -1.99609375, "logps/chosen": -104.5, "logps/rejected": -401.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.890625, "rewards/margins": 22.1875, "rewards/rejected": -17.1875, "step": 266 }, { "epoch": 0.8914858096828047, "grad_norm": 7.1963854679779615e-06, "learning_rate": 0.00012949979038401503, "logits/chosen": -3.2734375, "logits/rejected": -2.125, "logps/chosen": -110.0, "logps/rejected": -424.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.5, "rewards/margins": 24.125, "rewards/rejected": -18.625, "step": 267 }, { "epoch": 0.8948247078464107, "grad_norm": 0.0002867156290449202, "learning_rate": 0.00012927763941865378, "logits/chosen": -4.1875, "logits/rejected": -1.87109375, "logps/chosen": -86.75, "logps/rejected": -478.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.21875, "rewards/margins": 26.6875, "rewards/rejected": -21.4375, "step": 268 }, { "epoch": 0.8981636060100167, "grad_norm": 0.7056188583374023, "learning_rate": 0.00012905487445215394, "logits/chosen": -4.0703125, "logits/rejected": -2.078125, "logps/chosen": -92.75, "logps/rejected": -399.0, "loss": 0.2617, "rewards/accuracies": 0.984375, "rewards/chosen": 5.546875, "rewards/margins": 22.75, "rewards/rejected": -17.25, "step": 269 }, { "epoch": 0.9015025041736227, "grad_norm": 3.4810282159014605e-06, "learning_rate": 0.0001288314982601749, "logits/chosen": -4.1171875, "logits/rejected": -2.2109375, "logps/chosen": -105.25, "logps/rejected": -473.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.5625, "rewards/margins": 27.75, "rewards/rejected": -22.1875, "step": 270 }, { "epoch": 0.9048414023372288, "grad_norm": 2.5784022000152618e-05, "learning_rate": 0.00012860751362599193, "logits/chosen": -4.0078125, "logits/rejected": -2.296875, "logps/chosen": -79.125, "logps/rejected": -472.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.34375, "rewards/margins": 28.875, "rewards/rejected": -23.5625, "step": 271 }, { "epoch": 0.9081803005008348, "grad_norm": 5.8086599892703816e-05, "learning_rate": 0.00012838292334046156, "logits/chosen": -4.59375, "logits/rejected": -2.25, "logps/chosen": -84.0, "logps/rejected": -514.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.390625, "rewards/margins": 30.25, "rewards/rejected": -25.875, "step": 272 }, { "epoch": 0.9115191986644408, "grad_norm": 2.144264362868853e-05, "learning_rate": 0.00012815773020198674, "logits/chosen": -4.703125, "logits/rejected": -2.265625, "logps/chosen": -87.75, "logps/rejected": -630.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8046875, "rewards/margins": 35.375, "rewards/rejected": -31.625, "step": 273 }, { "epoch": 0.9148580968280468, "grad_norm": 0.010493806563317776, "learning_rate": 0.00012793193701648195, "logits/chosen": -4.046875, "logits/rejected": -2.328125, "logps/chosen": -98.0, "logps/rejected": -568.0, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": 4.09375, "rewards/margins": 35.625, "rewards/rejected": -31.5625, "step": 274 }, { "epoch": 0.9181969949916527, "grad_norm": 0.0006047628121450543, "learning_rate": 0.0001277055465973383, "logits/chosen": -4.796875, "logits/rejected": -2.640625, "logps/chosen": -93.0, "logps/rejected": -584.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.6484375, "rewards/margins": 37.375, "rewards/rejected": -33.8125, "step": 275 }, { "epoch": 0.9215358931552587, "grad_norm": 0.0015730452723801136, "learning_rate": 0.0001274785617653885, "logits/chosen": -5.0, "logits/rejected": -2.8203125, "logps/chosen": -99.25, "logps/rejected": -501.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.8671875, "rewards/margins": 33.8125, "rewards/rejected": -29.9375, "step": 276 }, { "epoch": 0.9248747913188647, "grad_norm": 2.2766906113247387e-05, "learning_rate": 0.00012725098534887162, "logits/chosen": -4.59375, "logits/rejected": -2.7265625, "logps/chosen": -106.75, "logps/rejected": -640.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.3203125, "rewards/margins": 41.875, "rewards/rejected": -38.5, "step": 277 }, { "epoch": 0.9282136894824707, "grad_norm": 4.425516090122983e-05, "learning_rate": 0.00012702282018339786, "logits/chosen": -5.125, "logits/rejected": -2.9453125, "logps/chosen": -107.75, "logps/rejected": -592.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.0625, "rewards/margins": 40.75, "rewards/rejected": -37.625, "step": 278 }, { "epoch": 0.9315525876460768, "grad_norm": 0.002112521091476083, "learning_rate": 0.00012679406911191333, "logits/chosen": -4.90625, "logits/rejected": -2.921875, "logps/chosen": -108.75, "logps/rejected": -564.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.59375, "rewards/margins": 38.75, "rewards/rejected": -35.25, "step": 279 }, { "epoch": 0.9348914858096828, "grad_norm": 6.724369995936286e-06, "learning_rate": 0.00012656473498466446, "logits/chosen": -5.1875, "logits/rejected": -2.9921875, "logps/chosen": -115.75, "logps/rejected": -666.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.9921875, "rewards/margins": 45.5, "rewards/rejected": -42.625, "step": 280 }, { "epoch": 0.9382303839732888, "grad_norm": 9.46976160776103e-06, "learning_rate": 0.00012633482065916267, "logits/chosen": -5.375, "logits/rejected": -2.875, "logps/chosen": -127.75, "logps/rejected": -700.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.328125, "rewards/margins": 46.375, "rewards/rejected": -44.0, "step": 281 }, { "epoch": 0.9415692821368948, "grad_norm": 0.0018350208410993218, "learning_rate": 0.00012610432900014864, "logits/chosen": -5.140625, "logits/rejected": -3.078125, "logps/chosen": -132.5, "logps/rejected": -660.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.2890625, "rewards/margins": 46.25, "rewards/rejected": -43.875, "step": 282 }, { "epoch": 0.9449081803005008, "grad_norm": 0.5556798577308655, "learning_rate": 0.0001258732628795566, "logits/chosen": -5.234375, "logits/rejected": -3.1875, "logps/chosen": -106.5, "logps/rejected": -670.0, "loss": 0.023, "rewards/accuracies": 0.984375, "rewards/chosen": 2.984375, "rewards/margins": 47.125, "rewards/rejected": -44.25, "step": 283 }, { "epoch": 0.9482470784641068, "grad_norm": 5.686655276804231e-05, "learning_rate": 0.00012564162517647863, "logits/chosen": -5.375, "logits/rejected": -3.03125, "logps/chosen": -91.0, "logps/rejected": -756.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 2.84375, "rewards/margins": 51.375, "rewards/rejected": -48.5, "step": 284 }, { "epoch": 0.9515859766277128, "grad_norm": 1.8947954231407493e-05, "learning_rate": 0.00012540941877712877, "logits/chosen": -4.875, "logits/rejected": -2.9921875, "logps/chosen": -112.5, "logps/rejected": -714.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.5703125, "rewards/margins": 50.625, "rewards/rejected": -47.0, "step": 285 }, { "epoch": 0.9549248747913188, "grad_norm": 2.8031481633661315e-05, "learning_rate": 0.00012517664657480694, "logits/chosen": -4.765625, "logits/rejected": -2.890625, "logps/chosen": -124.25, "logps/rejected": -710.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.1015625, "rewards/margins": 48.375, "rewards/rejected": -45.375, "step": 286 }, { "epoch": 0.9582637729549248, "grad_norm": 1.9544756924005924e-06, "learning_rate": 0.00012494331146986314, "logits/chosen": -5.109375, "logits/rejected": -2.875, "logps/chosen": -100.0, "logps/rejected": -704.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.59375, "rewards/margins": 48.75, "rewards/rejected": -45.125, "step": 287 }, { "epoch": 0.9616026711185309, "grad_norm": 1.1154845103167332e-10, "learning_rate": 0.00012470941636966103, "logits/chosen": -5.15625, "logits/rejected": -2.8828125, "logps/chosen": -94.0, "logps/rejected": -734.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.9609375, "rewards/margins": 51.375, "rewards/rejected": -47.5, "step": 288 }, { "epoch": 0.9649415692821369, "grad_norm": 1.1422841453168076e-05, "learning_rate": 0.00012447496418854188, "logits/chosen": -5.1875, "logits/rejected": -2.921875, "logps/chosen": -122.25, "logps/rejected": -720.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.640625, "rewards/margins": 50.25, "rewards/rejected": -46.625, "step": 289 }, { "epoch": 0.9682804674457429, "grad_norm": 1.922987102886964e-08, "learning_rate": 0.00012423995784778817, "logits/chosen": -5.0625, "logits/rejected": -3.0, "logps/chosen": -109.0, "logps/rejected": -722.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 3.578125, "rewards/margins": 50.125, "rewards/rejected": -46.625, "step": 290 }, { "epoch": 0.9716193656093489, "grad_norm": 0.7204757332801819, "learning_rate": 0.00012400440027558732, "logits/chosen": -5.09375, "logits/rejected": -2.859375, "logps/chosen": -120.75, "logps/rejected": -694.0, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 2.546875, "rewards/margins": 45.875, "rewards/rejected": -43.25, "step": 291 }, { "epoch": 0.9749582637729549, "grad_norm": 7.915846822470485e-08, "learning_rate": 0.000123768294406995, "logits/chosen": -5.03125, "logits/rejected": -3.1484375, "logps/chosen": -118.75, "logps/rejected": -678.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.5625, "rewards/margins": 49.125, "rewards/rejected": -44.625, "step": 292 }, { "epoch": 0.9782971619365609, "grad_norm": 3.635519169620238e-05, "learning_rate": 0.00012353164318389874, "logits/chosen": -4.96875, "logits/rejected": -3.109375, "logps/chosen": -99.25, "logps/rejected": -630.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.265625, "rewards/margins": 45.25, "rewards/rejected": -41.0, "step": 293 }, { "epoch": 0.9816360601001669, "grad_norm": 2.1648361325787846e-06, "learning_rate": 0.00012329444955498116, "logits/chosen": -4.9375, "logits/rejected": -2.734375, "logps/chosen": -107.75, "logps/rejected": -664.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.734375, "rewards/margins": 47.625, "rewards/rejected": -42.875, "step": 294 }, { "epoch": 0.9849749582637729, "grad_norm": 4.366306711744983e-07, "learning_rate": 0.00012305671647568338, "logits/chosen": -4.890625, "logits/rejected": -2.9375, "logps/chosen": -116.75, "logps/rejected": -668.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.921875, "rewards/margins": 46.375, "rewards/rejected": -41.5, "step": 295 }, { "epoch": 0.988313856427379, "grad_norm": 9.796775884751696e-06, "learning_rate": 0.00012281844690816793, "logits/chosen": -4.3125, "logits/rejected": -2.65625, "logps/chosen": -115.75, "logps/rejected": -678.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.609375, "rewards/margins": 46.375, "rewards/rejected": -41.75, "step": 296 }, { "epoch": 0.991652754590985, "grad_norm": 1.267866196030809e-06, "learning_rate": 0.0001225796438212822, "logits/chosen": -4.484375, "logits/rejected": -2.828125, "logps/chosen": -81.0, "logps/rejected": -646.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.265625, "rewards/margins": 46.75, "rewards/rejected": -41.375, "step": 297 }, { "epoch": 0.994991652754591, "grad_norm": 7.400533519330565e-10, "learning_rate": 0.00012234031019052103, "logits/chosen": -5.25, "logits/rejected": -2.8125, "logps/chosen": -72.75, "logps/rejected": -664.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.203125, "rewards/margins": 46.125, "rewards/rejected": -41.0, "step": 298 }, { "epoch": 0.998330550918197, "grad_norm": 1.766308876938183e-08, "learning_rate": 0.00012210044899799003, "logits/chosen": -4.171875, "logits/rejected": -2.765625, "logps/chosen": -90.5, "logps/rejected": -684.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 5.875, "rewards/margins": 48.0, "rewards/rejected": -42.125, "step": 299 }, { "epoch": 1.0, "grad_norm": 3.3598018944758223e-06, "learning_rate": 0.00012186006323236816, "logits/chosen": -4.0625, "logits/rejected": -2.6875, "logps/chosen": -91.0, "logps/rejected": -680.0, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": 4.40625, "rewards/margins": 48.5, "rewards/rejected": -44.0, "step": 300 } ], "logging_steps": 1, "max_steps": 900, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }