{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 3884, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0012873326467559218, "grad_norm": 114.5, "learning_rate": 4.99356333676622e-07, "logits/chosen": -0.27128905057907104, "logits/rejected": -0.23935547471046448, "logps/chosen": -282.0, "logps/rejected": -243.39999389648438, "loss": 0.6953, "rewards/accuracies": 0.284761905670166, "rewards/chosen": 0.004421996884047985, "rewards/margins": 0.0062500000931322575, "rewards/rejected": -0.0018127441871911287, "step": 5 }, { "epoch": 0.0025746652935118436, "grad_norm": 119.5, "learning_rate": 4.987126673532441e-07, "logits/chosen": 0.0113525390625, "logits/rejected": -0.384521484375, "logps/chosen": -247.10000610351562, "logps/rejected": -201.3000030517578, "loss": 0.6852, "rewards/accuracies": 0.31775975227355957, "rewards/chosen": 0.02788543701171875, "rewards/margins": 0.01920166052877903, "rewards/rejected": 0.00865936279296875, "step": 10 }, { "epoch": 0.0038619979402677654, "grad_norm": 133.0, "learning_rate": 4.980690010298661e-07, "logits/chosen": -0.19875487685203552, "logits/rejected": -0.1733245849609375, "logps/chosen": -305.6000061035156, "logps/rejected": -293.3999938964844, "loss": 0.6906, "rewards/accuracies": 0.39083331823349, "rewards/chosen": 0.01595458947122097, "rewards/margins": 0.01104125939309597, "rewards/rejected": 0.004895019344985485, "step": 15 }, { "epoch": 0.005149330587023687, "grad_norm": 113.5, "learning_rate": 4.974253347064881e-07, "logits/chosen": -0.24980469048023224, "logits/rejected": -0.1827392578125, "logps/chosen": -315.20001220703125, "logps/rejected": -269.20001220703125, "loss": 0.6852, "rewards/accuracies": 0.42321428656578064, "rewards/chosen": 0.01884765550494194, "rewards/margins": 0.01981201209127903, "rewards/rejected": -0.0010253905784338713, "step": 20 }, { "epoch": 0.006436663233779609, "grad_norm": 112.5, "learning_rate": 4.967816683831102e-07, "logits/chosen": -0.23886719346046448, "logits/rejected": -0.22606201469898224, "logps/chosen": -313.0, "logps/rejected": -285.6000061035156, "loss": 0.6937, "rewards/accuracies": 0.2996428608894348, "rewards/chosen": 0.02387237548828125, "rewards/margins": 0.00753097515553236, "rewards/rejected": 0.016323089599609375, "step": 25 }, { "epoch": 0.007723995880535531, "grad_norm": 121.5, "learning_rate": 4.961380020597322e-07, "logits/chosen": -0.28593748807907104, "logits/rejected": -0.208984375, "logps/chosen": -282.0, "logps/rejected": -265.6000061035156, "loss": 0.6859, "rewards/accuracies": 0.3175000250339508, "rewards/chosen": 0.02805175818502903, "rewards/margins": 0.01729736290872097, "rewards/rejected": 0.01072082482278347, "step": 30 }, { "epoch": 0.009011328527291453, "grad_norm": 224.0, "learning_rate": 4.954943357363543e-07, "logits/chosen": -0.17050781846046448, "logits/rejected": -0.11972656100988388, "logps/chosen": -252.39999389648438, "logps/rejected": -241.8000030517578, "loss": 0.693, "rewards/accuracies": 0.3400000035762787, "rewards/chosen": 0.02674560621380806, "rewards/margins": 0.007098388858139515, "rewards/rejected": 0.01962890662252903, "step": 35 }, { "epoch": 0.010298661174047374, "grad_norm": 137.0, "learning_rate": 4.948506694129763e-07, "logits/chosen": -0.3345703184604645, "logits/rejected": -0.3319335877895355, "logps/chosen": -314.6000061035156, "logps/rejected": -287.79998779296875, "loss": 0.6914, "rewards/accuracies": 0.3612121343612671, "rewards/chosen": 0.03081054612994194, "rewards/margins": 0.0054260254837572575, "rewards/rejected": 0.02540283277630806, "step": 40 }, { "epoch": 0.011585993820803296, "grad_norm": 157.0, "learning_rate": 4.942070030895984e-07, "logits/chosen": -0.26494139432907104, "logits/rejected": -0.18378905951976776, "logps/chosen": -324.6000061035156, "logps/rejected": -302.79998779296875, "loss": 0.6742, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": 0.03303222730755806, "rewards/margins": 0.04052734375, "rewards/rejected": -0.00750732421875, "step": 45 }, { "epoch": 0.012873326467559218, "grad_norm": 137.0, "learning_rate": 4.935633367662204e-07, "logits/chosen": -0.19140625, "logits/rejected": -0.164306640625, "logps/chosen": -299.79998779296875, "logps/rejected": -262.0, "loss": 0.6883, "rewards/accuracies": 0.3680194914340973, "rewards/chosen": 0.04104004055261612, "rewards/margins": 0.010144042782485485, "rewards/rejected": 0.03090820275247097, "step": 50 }, { "epoch": 0.01416065911431514, "grad_norm": 188.0, "learning_rate": 4.929196704428423e-07, "logits/chosen": -0.31953126192092896, "logits/rejected": -0.3017578125, "logps/chosen": -257.20001220703125, "logps/rejected": -233.0, "loss": 0.675, "rewards/accuracies": 0.3452380895614624, "rewards/chosen": 0.07456054538488388, "rewards/margins": 0.04254760593175888, "rewards/rejected": 0.0318603515625, "step": 55 }, { "epoch": 0.015447991761071062, "grad_norm": 131.0, "learning_rate": 4.922760041194645e-07, "logits/chosen": -0.32353514432907104, "logits/rejected": -0.22797851264476776, "logps/chosen": -253.39999389648438, "logps/rejected": -242.1999969482422, "loss": 0.6844, "rewards/accuracies": 0.3891666829586029, "rewards/chosen": 0.03336792066693306, "rewards/margins": 0.0133056640625, "rewards/rejected": 0.02004089392721653, "step": 60 }, { "epoch": 0.016735324407826983, "grad_norm": 130.0, "learning_rate": 4.916323377960865e-07, "logits/chosen": -0.19794921576976776, "logits/rejected": -0.21699218451976776, "logps/chosen": -291.3999938964844, "logps/rejected": -272.0, "loss": 0.6984, "rewards/accuracies": 0.3466666638851166, "rewards/chosen": 0.04517211765050888, "rewards/margins": -0.0038085938431322575, "rewards/rejected": 0.04903564602136612, "step": 65 }, { "epoch": 0.018022657054582905, "grad_norm": 169.0, "learning_rate": 4.909886714727085e-07, "logits/chosen": -0.26611328125, "logits/rejected": -0.12578125298023224, "logps/chosen": -332.6000061035156, "logps/rejected": -276.79998779296875, "loss": 0.6641, "rewards/accuracies": 0.4099999964237213, "rewards/chosen": 0.09858398139476776, "rewards/margins": 0.0767822265625, "rewards/rejected": 0.021728515625, "step": 70 }, { "epoch": 0.019309989701338827, "grad_norm": 147.0, "learning_rate": 4.903450051493306e-07, "logits/chosen": -0.25800782442092896, "logits/rejected": -0.1435546875, "logps/chosen": -359.20001220703125, "logps/rejected": -296.79998779296875, "loss": 0.6828, "rewards/accuracies": 0.3816666603088379, "rewards/chosen": 0.06953124701976776, "rewards/margins": 0.02211914025247097, "rewards/rejected": 0.04746093600988388, "step": 75 }, { "epoch": 0.02059732234809475, "grad_norm": 147.0, "learning_rate": 4.897013388259526e-07, "logits/chosen": -0.24189452826976776, "logits/rejected": -0.16386719048023224, "logps/chosen": -383.6000061035156, "logps/rejected": -360.0, "loss": 0.6711, "rewards/accuracies": 0.47857141494750977, "rewards/chosen": 0.09990234673023224, "rewards/margins": 0.05498046800494194, "rewards/rejected": 0.04497070237994194, "step": 80 }, { "epoch": 0.02188465499485067, "grad_norm": 125.0, "learning_rate": 4.890576725025746e-07, "logits/chosen": -0.31621092557907104, "logits/rejected": -0.2562499940395355, "logps/chosen": -296.79998779296875, "logps/rejected": -316.79998779296875, "loss": 0.6961, "rewards/accuracies": 0.3149999976158142, "rewards/chosen": 0.06352539360523224, "rewards/margins": -0.0023193359375, "rewards/rejected": 0.06599120795726776, "step": 85 }, { "epoch": 0.023171987641606592, "grad_norm": 130.0, "learning_rate": 4.884140061791967e-07, "logits/chosen": -0.36328125, "logits/rejected": -0.22895507514476776, "logps/chosen": -305.0, "logps/rejected": -273.20001220703125, "loss": 0.682, "rewards/accuracies": 0.4425000250339508, "rewards/chosen": 0.06186523288488388, "rewards/margins": 0.029936527833342552, "rewards/rejected": 0.03187255933880806, "step": 90 }, { "epoch": 0.024459320288362514, "grad_norm": 140.0, "learning_rate": 4.877703398558187e-07, "logits/chosen": -0.26933592557907104, "logits/rejected": -0.29121094942092896, "logps/chosen": -336.79998779296875, "logps/rejected": -279.20001220703125, "loss": 0.6875, "rewards/accuracies": 0.3774999976158142, "rewards/chosen": 0.07382812350988388, "rewards/margins": 0.02683105506002903, "rewards/rejected": 0.04715576022863388, "step": 95 }, { "epoch": 0.025746652935118436, "grad_norm": 114.0, "learning_rate": 4.871266735324407e-07, "logits/chosen": -0.35595703125, "logits/rejected": -0.1771240234375, "logps/chosen": -335.6000061035156, "logps/rejected": -303.3999938964844, "loss": 0.6758, "rewards/accuracies": 0.3319047689437866, "rewards/chosen": 0.07749023288488388, "rewards/margins": 0.03952636569738388, "rewards/rejected": 0.03795165941119194, "step": 100 }, { "epoch": 0.027033985581874358, "grad_norm": 130.0, "learning_rate": 4.864830072090629e-07, "logits/chosen": -0.24355468153953552, "logits/rejected": -0.28339844942092896, "logps/chosen": -318.3999938964844, "logps/rejected": -284.3999938964844, "loss": 0.6672, "rewards/accuracies": 0.4841667115688324, "rewards/chosen": 0.08295898139476776, "rewards/margins": 0.05992431566119194, "rewards/rejected": 0.02309570275247097, "step": 105 }, { "epoch": 0.02832131822863028, "grad_norm": 130.0, "learning_rate": 4.858393408856848e-07, "logits/chosen": -0.2540039122104645, "logits/rejected": -0.26445311307907104, "logps/chosen": -309.20001220703125, "logps/rejected": -302.20001220703125, "loss": 0.6977, "rewards/accuracies": 0.41214290261268616, "rewards/chosen": 0.07436523586511612, "rewards/margins": 0.014146232977509499, "rewards/rejected": 0.06025390699505806, "step": 110 }, { "epoch": 0.0296086508753862, "grad_norm": 131.0, "learning_rate": 4.851956745623069e-07, "logits/chosen": -0.35478514432907104, "logits/rejected": -0.22612304985523224, "logps/chosen": -318.79998779296875, "logps/rejected": -309.6000061035156, "loss": 0.7023, "rewards/accuracies": 0.3078787922859192, "rewards/chosen": 0.07236327975988388, "rewards/margins": -0.008862304501235485, "rewards/rejected": 0.08115234225988388, "step": 115 }, { "epoch": 0.030895983522142123, "grad_norm": 141.0, "learning_rate": 4.845520082389289e-07, "logits/chosen": -0.46074217557907104, "logits/rejected": -0.37187498807907104, "logps/chosen": -343.3999938964844, "logps/rejected": -297.3999938964844, "loss": 0.6898, "rewards/accuracies": 0.39916667342185974, "rewards/chosen": 0.09521484375, "rewards/margins": 0.02117919921875, "rewards/rejected": 0.073974609375, "step": 120 }, { "epoch": 0.032183316168898045, "grad_norm": 134.0, "learning_rate": 4.839083419155509e-07, "logits/chosen": -0.2909179627895355, "logits/rejected": -0.13120117783546448, "logps/chosen": -308.3999938964844, "logps/rejected": -270.0, "loss": 0.6664, "rewards/accuracies": 0.45650792121887207, "rewards/chosen": 0.1083984375, "rewards/margins": 0.06306152045726776, "rewards/rejected": 0.04550781100988388, "step": 125 }, { "epoch": 0.03347064881565397, "grad_norm": 132.0, "learning_rate": 4.83264675592173e-07, "logits/chosen": -0.2548049986362457, "logits/rejected": -0.220703125, "logps/chosen": -278.79998779296875, "logps/rejected": -268.79998779296875, "loss": 0.6727, "rewards/accuracies": 0.45666664838790894, "rewards/chosen": 0.08295898139476776, "rewards/margins": 0.04765624925494194, "rewards/rejected": 0.03542480617761612, "step": 130 }, { "epoch": 0.03475798146240989, "grad_norm": 129.0, "learning_rate": 4.82621009268795e-07, "logits/chosen": -0.3733383119106293, "logits/rejected": -0.17514649033546448, "logps/chosen": -335.6000061035156, "logps/rejected": -250.1999969482422, "loss": 0.6438, "rewards/accuracies": 0.5590018033981323, "rewards/chosen": 0.1416015625, "rewards/margins": 0.1064453125, "rewards/rejected": 0.03525543212890625, "step": 135 }, { "epoch": 0.03604531410916581, "grad_norm": 134.0, "learning_rate": 4.819773429454171e-07, "logits/chosen": -0.263671875, "logits/rejected": -0.23662109673023224, "logps/chosen": -313.0, "logps/rejected": -246.8000030517578, "loss": 0.6867, "rewards/accuracies": 0.48666661977767944, "rewards/chosen": 0.07548828423023224, "rewards/margins": 0.024318695068359375, "rewards/rejected": 0.05118408054113388, "step": 140 }, { "epoch": 0.03733264675592173, "grad_norm": 284.0, "learning_rate": 4.813336766220391e-07, "logits/chosen": -0.3550781309604645, "logits/rejected": -0.333984375, "logps/chosen": -302.20001220703125, "logps/rejected": -239.5, "loss": 0.6664, "rewards/accuracies": 0.4892857074737549, "rewards/chosen": 0.14274902641773224, "rewards/margins": 0.08186034858226776, "rewards/rejected": 0.060699462890625, "step": 145 }, { "epoch": 0.038619979402677654, "grad_norm": 129.0, "learning_rate": 4.806900102986612e-07, "logits/chosen": -0.31816405057907104, "logits/rejected": -0.2972656190395355, "logps/chosen": -325.20001220703125, "logps/rejected": -275.6000061035156, "loss": 0.6617, "rewards/accuracies": 0.5113553404808044, "rewards/chosen": 0.123046875, "rewards/margins": 0.06791992485523224, "rewards/rejected": 0.05527343600988388, "step": 150 }, { "epoch": 0.039907312049433576, "grad_norm": 150.0, "learning_rate": 4.800463439752832e-07, "logits/chosen": -0.16226807236671448, "logits/rejected": -0.30058592557907104, "logps/chosen": -191.60000610351562, "logps/rejected": -171.3000030517578, "loss": 0.6805, "rewards/accuracies": 0.4246428608894348, "rewards/chosen": 0.07666015625, "rewards/margins": 0.02585449256002903, "rewards/rejected": 0.05087890475988388, "step": 155 }, { "epoch": 0.0411946446961895, "grad_norm": 131.0, "learning_rate": 4.794026776519052e-07, "logits/chosen": -0.17207030951976776, "logits/rejected": -0.33867186307907104, "logps/chosen": -254.1999969482422, "logps/rejected": -243.8000030517578, "loss": 0.6727, "rewards/accuracies": 0.5442399382591248, "rewards/chosen": 0.11767578125, "rewards/margins": 0.05272216722369194, "rewards/rejected": 0.06501464545726776, "step": 160 }, { "epoch": 0.04248197734294542, "grad_norm": 132.0, "learning_rate": 4.787590113285273e-07, "logits/chosen": -0.24570313096046448, "logits/rejected": -0.22910156846046448, "logps/chosen": -278.0, "logps/rejected": -278.79998779296875, "loss": 0.668, "rewards/accuracies": 0.4844047427177429, "rewards/chosen": 0.10390625149011612, "rewards/margins": 0.05412597581744194, "rewards/rejected": 0.04974060133099556, "step": 165 }, { "epoch": 0.04376930998970134, "grad_norm": 132.0, "learning_rate": 4.781153450051493e-07, "logits/chosen": -0.27910155057907104, "logits/rejected": -0.2945312559604645, "logps/chosen": -319.20001220703125, "logps/rejected": -318.0, "loss": 0.6898, "rewards/accuracies": 0.35448721051216125, "rewards/chosen": 0.12241210788488388, "rewards/margins": 0.0228424072265625, "rewards/rejected": 0.099609375, "step": 170 }, { "epoch": 0.04505664263645726, "grad_norm": 136.0, "learning_rate": 4.774716786817714e-07, "logits/chosen": -0.15109863877296448, "logits/rejected": -0.04311828687787056, "logps/chosen": -277.20001220703125, "logps/rejected": -289.20001220703125, "loss": 0.6852, "rewards/accuracies": 0.465238094329834, "rewards/chosen": 0.08798827975988388, "rewards/margins": 0.02510986290872097, "rewards/rejected": 0.062744140625, "step": 175 }, { "epoch": 0.046343975283213185, "grad_norm": 121.0, "learning_rate": 4.7682801235839336e-07, "logits/chosen": -0.2354656159877777, "logits/rejected": -0.22178955376148224, "logps/chosen": -292.20001220703125, "logps/rejected": -288.20001220703125, "loss": 0.6695, "rewards/accuracies": 0.47809529304504395, "rewards/chosen": 0.13427734375, "rewards/margins": 0.05891113355755806, "rewards/rejected": 0.07553710788488388, "step": 180 }, { "epoch": 0.047631307929969106, "grad_norm": 162.0, "learning_rate": 4.7618434603501545e-07, "logits/chosen": -0.306640625, "logits/rejected": -0.185546875, "logps/chosen": -327.0, "logps/rejected": -309.6000061035156, "loss": 0.6711, "rewards/accuracies": 0.4816666543483734, "rewards/chosen": 0.13408203423023224, "rewards/margins": 0.05561218410730362, "rewards/rejected": 0.07856445014476776, "step": 185 }, { "epoch": 0.04891864057672503, "grad_norm": 140.0, "learning_rate": 4.755406797116375e-07, "logits/chosen": -0.44921875, "logits/rejected": -0.24418945610523224, "logps/chosen": -260.6000061035156, "logps/rejected": -296.0, "loss": 0.6773, "rewards/accuracies": 0.39725273847579956, "rewards/chosen": 0.09746094048023224, "rewards/margins": 0.03985595703125, "rewards/rejected": 0.05779419094324112, "step": 190 }, { "epoch": 0.05020597322348095, "grad_norm": 119.5, "learning_rate": 4.748970133882595e-07, "logits/chosen": -0.2955078184604645, "logits/rejected": -0.239990234375, "logps/chosen": -274.20001220703125, "logps/rejected": -247.1999969482422, "loss": 0.6711, "rewards/accuracies": 0.5275000333786011, "rewards/chosen": 0.12460937350988388, "rewards/margins": 0.04873047024011612, "rewards/rejected": 0.07583007961511612, "step": 195 }, { "epoch": 0.05149330587023687, "grad_norm": 136.0, "learning_rate": 4.742533470648816e-07, "logits/chosen": -0.29707032442092896, "logits/rejected": -0.37812501192092896, "logps/chosen": -271.3999938964844, "logps/rejected": -286.79998779296875, "loss": 0.7023, "rewards/accuracies": 0.4214285910129547, "rewards/chosen": 0.07823486626148224, "rewards/margins": -0.010101318359375, "rewards/rejected": 0.08833007514476776, "step": 200 }, { "epoch": 0.052780638516992794, "grad_norm": 124.0, "learning_rate": 4.7360968074150357e-07, "logits/chosen": -0.42402344942092896, "logits/rejected": -0.30253905057907104, "logps/chosen": -289.6000061035156, "logps/rejected": -303.79998779296875, "loss": 0.6844, "rewards/accuracies": 0.5017856955528259, "rewards/chosen": 0.12587890028953552, "rewards/margins": 0.02407226525247097, "rewards/rejected": 0.10170898586511612, "step": 205 }, { "epoch": 0.054067971163748715, "grad_norm": 136.0, "learning_rate": 4.729660144181256e-07, "logits/chosen": -0.36152344942092896, "logits/rejected": -0.33417969942092896, "logps/chosen": -348.0, "logps/rejected": -313.79998779296875, "loss": 0.6742, "rewards/accuracies": 0.45192307233810425, "rewards/chosen": 0.13808593153953552, "rewards/margins": 0.05313720554113388, "rewards/rejected": 0.08477783203125, "step": 210 }, { "epoch": 0.05535530381050464, "grad_norm": 123.0, "learning_rate": 4.7232234809474765e-07, "logits/chosen": -0.25712889432907104, "logits/rejected": -0.2515625059604645, "logps/chosen": -320.3999938964844, "logps/rejected": -268.79998779296875, "loss": 0.6734, "rewards/accuracies": 0.5074999928474426, "rewards/chosen": 0.14570312201976776, "rewards/margins": 0.0543212890625, "rewards/rejected": 0.091552734375, "step": 215 }, { "epoch": 0.05664263645726056, "grad_norm": 142.0, "learning_rate": 4.716786817713697e-07, "logits/chosen": -0.4742187559604645, "logits/rejected": -0.3480468690395355, "logps/chosen": -337.8999938964844, "logps/rejected": -285.79998779296875, "loss": 0.6734, "rewards/accuracies": 0.4488889276981354, "rewards/chosen": 0.13583984971046448, "rewards/margins": 0.05962524563074112, "rewards/rejected": 0.07624511420726776, "step": 220 }, { "epoch": 0.05792996910401648, "grad_norm": 146.0, "learning_rate": 4.7103501544799174e-07, "logits/chosen": -0.3023437559604645, "logits/rejected": -0.28437501192092896, "logps/chosen": -329.0, "logps/rejected": -315.6000061035156, "loss": 0.6937, "rewards/accuracies": 0.4004395604133606, "rewards/chosen": 0.13203124701976776, "rewards/margins": 0.02143554762005806, "rewards/rejected": 0.11069335788488388, "step": 225 }, { "epoch": 0.0592173017507724, "grad_norm": 132.0, "learning_rate": 4.703913491246138e-07, "logits/chosen": -0.35859376192092896, "logits/rejected": -0.34648436307907104, "logps/chosen": -348.0, "logps/rejected": -327.0, "loss": 0.6609, "rewards/accuracies": 0.4816666543483734, "rewards/chosen": 0.169677734375, "rewards/margins": 0.07862548530101776, "rewards/rejected": 0.09101562201976776, "step": 230 }, { "epoch": 0.060504634397528324, "grad_norm": 132.0, "learning_rate": 4.697476828012358e-07, "logits/chosen": -0.39472657442092896, "logits/rejected": -0.20766600966453552, "logps/chosen": -373.20001220703125, "logps/rejected": -344.79998779296875, "loss": 0.6766, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": 0.1611328125, "rewards/margins": 0.049072265625, "rewards/rejected": 0.11196289211511612, "step": 235 }, { "epoch": 0.061791967044284246, "grad_norm": 136.0, "learning_rate": 4.6910401647785787e-07, "logits/chosen": -0.31367188692092896, "logits/rejected": -0.3333984315395355, "logps/chosen": -301.0, "logps/rejected": -290.20001220703125, "loss": 0.6453, "rewards/accuracies": 0.55057692527771, "rewards/chosen": 0.18828125298023224, "rewards/margins": 0.11381836235523224, "rewards/rejected": 0.0740509033203125, "step": 240 }, { "epoch": 0.06307929969104016, "grad_norm": 129.0, "learning_rate": 4.6846035015447986e-07, "logits/chosen": -0.216796875, "logits/rejected": -0.19218750298023224, "logps/chosen": -286.0, "logps/rejected": -296.0, "loss": 0.6766, "rewards/accuracies": 0.5058333277702332, "rewards/chosen": 0.14189453423023224, "rewards/margins": 0.04348144680261612, "rewards/rejected": 0.098541259765625, "step": 245 }, { "epoch": 0.06436663233779609, "grad_norm": 127.0, "learning_rate": 4.6781668383110195e-07, "logits/chosen": -0.17656250298023224, "logits/rejected": -0.15854492783546448, "logps/chosen": -276.20001220703125, "logps/rejected": -317.3999938964844, "loss": 0.6805, "rewards/accuracies": 0.4375, "rewards/chosen": 0.08989258110523224, "rewards/margins": 0.04062499850988388, "rewards/rejected": 0.04951171949505806, "step": 250 }, { "epoch": 0.065653964984552, "grad_norm": 186.0, "learning_rate": 4.67173017507724e-07, "logits/chosen": -0.19047851860523224, "logits/rejected": -0.15229491889476776, "logps/chosen": -308.0, "logps/rejected": -311.20001220703125, "loss": 0.657, "rewards/accuracies": 0.5141667127609253, "rewards/chosen": 0.15620116889476776, "rewards/margins": 0.08925781399011612, "rewards/rejected": 0.06684570014476776, "step": 255 }, { "epoch": 0.06694129763130793, "grad_norm": 128.0, "learning_rate": 4.66529351184346e-07, "logits/chosen": -0.22646483778953552, "logits/rejected": -0.21484375, "logps/chosen": -302.3999938964844, "logps/rejected": -295.6000061035156, "loss": 0.6687, "rewards/accuracies": 0.5358333587646484, "rewards/chosen": 0.12197265774011612, "rewards/margins": 0.0576171875, "rewards/rejected": 0.06428222358226776, "step": 260 }, { "epoch": 0.06822863027806385, "grad_norm": 126.5, "learning_rate": 4.658856848609681e-07, "logits/chosen": -0.27421873807907104, "logits/rejected": -0.16127929091453552, "logps/chosen": -310.3999938964844, "logps/rejected": -256.0, "loss": 0.6586, "rewards/accuracies": 0.5633333325386047, "rewards/chosen": 0.16123047471046448, "rewards/margins": 0.09423828125, "rewards/rejected": 0.06710509955883026, "step": 265 }, { "epoch": 0.06951596292481978, "grad_norm": 139.0, "learning_rate": 4.652420185375901e-07, "logits/chosen": -0.24160155653953552, "logits/rejected": -0.140625, "logps/chosen": -353.3999938964844, "logps/rejected": -305.20001220703125, "loss": 0.6672, "rewards/accuracies": 0.5516666173934937, "rewards/chosen": 0.15625, "rewards/margins": 0.06242675706744194, "rewards/rejected": 0.09394530951976776, "step": 270 }, { "epoch": 0.07080329557157569, "grad_norm": 134.0, "learning_rate": 4.645983522142121e-07, "logits/chosen": -0.47773438692092896, "logits/rejected": -0.4214843809604645, "logps/chosen": -317.3999938964844, "logps/rejected": -269.6000061035156, "loss": 0.6633, "rewards/accuracies": 0.6306411027908325, "rewards/chosen": 0.19570311903953552, "rewards/margins": 0.0968017578125, "rewards/rejected": 0.09880981594324112, "step": 275 }, { "epoch": 0.07209062821833162, "grad_norm": 135.0, "learning_rate": 4.639546858908342e-07, "logits/chosen": -0.2601562440395355, "logits/rejected": -0.19687500596046448, "logps/chosen": -337.6000061035156, "logps/rejected": -266.20001220703125, "loss": 0.6477, "rewards/accuracies": 0.5841666460037231, "rewards/chosen": 0.13720703125, "rewards/margins": 0.10708007961511612, "rewards/rejected": 0.03012695349752903, "step": 280 }, { "epoch": 0.07337796086508754, "grad_norm": 140.0, "learning_rate": 4.633110195674562e-07, "logits/chosen": -0.17416992783546448, "logits/rejected": -0.14794921875, "logps/chosen": -285.20001220703125, "logps/rejected": -257.0, "loss": 0.6687, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.12241210788488388, "rewards/margins": 0.07406006008386612, "rewards/rejected": 0.04838867112994194, "step": 285 }, { "epoch": 0.07466529351184346, "grad_norm": 138.0, "learning_rate": 4.6266735324407824e-07, "logits/chosen": -0.28691405057907104, "logits/rejected": -0.25214844942092896, "logps/chosen": -308.79998779296875, "logps/rejected": -310.0, "loss": 0.6672, "rewards/accuracies": 0.5101648569107056, "rewards/chosen": 0.13925781846046448, "rewards/margins": 0.0703125, "rewards/rejected": 0.06906738132238388, "step": 290 }, { "epoch": 0.07595262615859938, "grad_norm": 163.0, "learning_rate": 4.620236869207003e-07, "logits/chosen": -0.3472656309604645, "logits/rejected": -0.2983642518520355, "logps/chosen": -348.6000061035156, "logps/rejected": -339.6000061035156, "loss": 0.6516, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": 0.19716796278953552, "rewards/margins": 0.11539001762866974, "rewards/rejected": 0.08188476413488388, "step": 295 }, { "epoch": 0.07723995880535531, "grad_norm": 112.0, "learning_rate": 4.613800205973223e-07, "logits/chosen": -0.3154296875, "logits/rejected": -0.23349609971046448, "logps/chosen": -306.79998779296875, "logps/rejected": -270.20001220703125, "loss": 0.6672, "rewards/accuracies": 0.43464288115501404, "rewards/chosen": 0.17387695610523224, "rewards/margins": 0.0648905411362648, "rewards/rejected": 0.10922851413488388, "step": 300 }, { "epoch": 0.07852729145211122, "grad_norm": 133.0, "learning_rate": 4.6073635427394437e-07, "logits/chosen": -0.18173828721046448, "logits/rejected": -0.164306640625, "logps/chosen": -312.79998779296875, "logps/rejected": -279.79998779296875, "loss": 0.6641, "rewards/accuracies": 0.46345239877700806, "rewards/chosen": 0.12785644829273224, "rewards/margins": 0.07490234076976776, "rewards/rejected": 0.05292968824505806, "step": 305 }, { "epoch": 0.07981462409886715, "grad_norm": 131.0, "learning_rate": 4.6009268795056636e-07, "logits/chosen": -0.22646483778953552, "logits/rejected": -0.22268065810203552, "logps/chosen": -301.79998779296875, "logps/rejected": -279.3999938964844, "loss": 0.6523, "rewards/accuracies": 0.5433333516120911, "rewards/chosen": 0.15996094048023224, "rewards/margins": 0.09870605170726776, "rewards/rejected": 0.06142578274011612, "step": 310 }, { "epoch": 0.08110195674562307, "grad_norm": 106.0, "learning_rate": 4.5944902162718845e-07, "logits/chosen": -0.38749998807907104, "logits/rejected": -0.373046875, "logps/chosen": -265.20001220703125, "logps/rejected": -287.20001220703125, "loss": 0.6859, "rewards/accuracies": 0.5248077511787415, "rewards/chosen": 0.13544921576976776, "rewards/margins": 0.04482422024011612, "rewards/rejected": 0.09062500298023224, "step": 315 }, { "epoch": 0.082389289392379, "grad_norm": 137.0, "learning_rate": 4.588053553038105e-07, "logits/chosen": -0.2705078125, "logits/rejected": -0.20344237983226776, "logps/chosen": -302.0, "logps/rejected": -255.10000610351562, "loss": 0.6672, "rewards/accuracies": 0.5113095045089722, "rewards/chosen": 0.1572265625, "rewards/margins": 0.06533203274011612, "rewards/rejected": 0.09195556491613388, "step": 320 }, { "epoch": 0.08367662203913491, "grad_norm": 138.0, "learning_rate": 4.581616889804325e-07, "logits/chosen": -0.4496093690395355, "logits/rejected": -0.46367186307907104, "logps/chosen": -340.0, "logps/rejected": -307.20001220703125, "loss": 0.6641, "rewards/accuracies": 0.5108333826065063, "rewards/chosen": 0.20585937798023224, "rewards/margins": 0.08552245795726776, "rewards/rejected": 0.12041015923023224, "step": 325 }, { "epoch": 0.08496395468589084, "grad_norm": 116.0, "learning_rate": 4.575180226570546e-07, "logits/chosen": -0.25957030057907104, "logits/rejected": -0.24609375, "logps/chosen": -266.6000061035156, "logps/rejected": -260.3999938964844, "loss": 0.6641, "rewards/accuracies": 0.5023809671401978, "rewards/chosen": 0.12734374403953552, "rewards/margins": 0.06943359225988388, "rewards/rejected": 0.05791015550494194, "step": 330 }, { "epoch": 0.08625128733264675, "grad_norm": 137.0, "learning_rate": 4.568743563336766e-07, "logits/chosen": -0.322265625, "logits/rejected": -0.22280272841453552, "logps/chosen": -259.0, "logps/rejected": -196.6999969482422, "loss": 0.6633, "rewards/accuracies": 0.5404869914054871, "rewards/chosen": 0.12617187201976776, "rewards/margins": 0.07353515923023224, "rewards/rejected": 0.05245818942785263, "step": 335 }, { "epoch": 0.08753861997940268, "grad_norm": 146.0, "learning_rate": 4.562306900102986e-07, "logits/chosen": -0.3169921934604645, "logits/rejected": -0.37324219942092896, "logps/chosen": -266.1000061035156, "logps/rejected": -258.20001220703125, "loss": 0.6609, "rewards/accuracies": 0.5236905217170715, "rewards/chosen": 0.16230468451976776, "rewards/margins": 0.0828857421875, "rewards/rejected": 0.07935790717601776, "step": 340 }, { "epoch": 0.0888259526261586, "grad_norm": 140.0, "learning_rate": 4.555870236869207e-07, "logits/chosen": -0.45136719942092896, "logits/rejected": -0.18623046576976776, "logps/chosen": -193.3000030517578, "logps/rejected": -216.60000610351562, "loss": 0.6742, "rewards/accuracies": 0.4840908944606781, "rewards/chosen": 0.13730469346046448, "rewards/margins": 0.06208496168255806, "rewards/rejected": 0.07521972805261612, "step": 345 }, { "epoch": 0.09011328527291453, "grad_norm": 125.0, "learning_rate": 4.549433573635427e-07, "logits/chosen": -0.2948242127895355, "logits/rejected": -0.2691406309604645, "logps/chosen": -330.79998779296875, "logps/rejected": -271.79998779296875, "loss": 0.6547, "rewards/accuracies": 0.535641074180603, "rewards/chosen": 0.17597655951976776, "rewards/margins": 0.08984375, "rewards/rejected": 0.08627929538488388, "step": 350 }, { "epoch": 0.09140061791967044, "grad_norm": 172.0, "learning_rate": 4.5429969104016474e-07, "logits/chosen": -0.20737305283546448, "logits/rejected": -0.18388672173023224, "logps/chosen": -299.0, "logps/rejected": -258.3999938964844, "loss": 0.6508, "rewards/accuracies": 0.5383332967758179, "rewards/chosen": 0.17333984375, "rewards/margins": 0.10517577826976776, "rewards/rejected": 0.06787719577550888, "step": 355 }, { "epoch": 0.09268795056642637, "grad_norm": 110.0, "learning_rate": 4.5365602471678684e-07, "logits/chosen": -0.447265625, "logits/rejected": -0.3005737364292145, "logps/chosen": -336.3999938964844, "logps/rejected": -313.79998779296875, "loss": 0.6586, "rewards/accuracies": 0.5334523916244507, "rewards/chosen": 0.18808594346046448, "rewards/margins": 0.0863037109375, "rewards/rejected": 0.101806640625, "step": 360 }, { "epoch": 0.09397528321318228, "grad_norm": 144.0, "learning_rate": 4.5301235839340883e-07, "logits/chosen": -0.41230469942092896, "logits/rejected": -0.40300291776657104, "logps/chosen": -368.79998779296875, "logps/rejected": -299.20001220703125, "loss": 0.6617, "rewards/accuracies": 0.46416670083999634, "rewards/chosen": 0.16132812201976776, "rewards/margins": 0.08078613132238388, "rewards/rejected": 0.08046875149011612, "step": 365 }, { "epoch": 0.09526261585993821, "grad_norm": 126.5, "learning_rate": 4.5236869207003087e-07, "logits/chosen": -0.26630860567092896, "logits/rejected": -0.17941895127296448, "logps/chosen": -292.6000061035156, "logps/rejected": -269.6000061035156, "loss": 0.6516, "rewards/accuracies": 0.5725000500679016, "rewards/chosen": 0.14077147841453552, "rewards/margins": 0.09445953369140625, "rewards/rejected": 0.04622802883386612, "step": 370 }, { "epoch": 0.09654994850669413, "grad_norm": 121.0, "learning_rate": 4.517250257466529e-07, "logits/chosen": -0.30156248807907104, "logits/rejected": -0.22031250596046448, "logps/chosen": -317.0, "logps/rejected": -295.79998779296875, "loss": 0.632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23574218153953552, "rewards/margins": 0.15581054985523224, "rewards/rejected": 0.08037109673023224, "step": 375 }, { "epoch": 0.09783728115345006, "grad_norm": 132.0, "learning_rate": 4.5108135942327496e-07, "logits/chosen": -0.41425782442092896, "logits/rejected": -0.2860351502895355, "logps/chosen": -309.0, "logps/rejected": -232.10000610351562, "loss": 0.6445, "rewards/accuracies": 0.6383333802223206, "rewards/chosen": 0.19355468451976776, "rewards/margins": 0.11928711086511612, "rewards/rejected": 0.07414551079273224, "step": 380 }, { "epoch": 0.09912461380020597, "grad_norm": 120.5, "learning_rate": 4.50437693099897e-07, "logits/chosen": -0.3140625059604645, "logits/rejected": -0.30244141817092896, "logps/chosen": -332.0, "logps/rejected": -268.3999938964844, "loss": 0.643, "rewards/accuracies": 0.5107142925262451, "rewards/chosen": 0.22402343153953552, "rewards/margins": 0.12294922024011612, "rewards/rejected": 0.10048828274011612, "step": 385 }, { "epoch": 0.1004119464469619, "grad_norm": 110.0, "learning_rate": 4.49794026776519e-07, "logits/chosen": -0.2567382752895355, "logits/rejected": -0.232421875, "logps/chosen": -323.20001220703125, "logps/rejected": -258.79998779296875, "loss": 0.6375, "rewards/accuracies": 0.6125000715255737, "rewards/chosen": 0.20869140326976776, "rewards/margins": 0.14423827826976776, "rewards/rejected": 0.06437988579273224, "step": 390 }, { "epoch": 0.10169927909371781, "grad_norm": 213.0, "learning_rate": 4.491503604531411e-07, "logits/chosen": -0.3031249940395355, "logits/rejected": -0.22003173828125, "logps/chosen": -318.3999938964844, "logps/rejected": -229.39999389648438, "loss": 0.6445, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.21689453721046448, "rewards/margins": 0.12705078721046448, "rewards/rejected": 0.089599609375, "step": 395 }, { "epoch": 0.10298661174047374, "grad_norm": 165.0, "learning_rate": 4.4850669412976313e-07, "logits/chosen": -0.1279296875, "logits/rejected": -0.11099395900964737, "logps/chosen": -272.79998779296875, "logps/rejected": -275.79998779296875, "loss": 0.6937, "rewards/accuracies": 0.5209524035453796, "rewards/chosen": 0.15566405653953552, "rewards/margins": 0.026123046875, "rewards/rejected": 0.12949219346046448, "step": 400 }, { "epoch": 0.10427394438722966, "grad_norm": 134.0, "learning_rate": 4.478630278063851e-07, "logits/chosen": -0.31083983182907104, "logits/rejected": -0.2655273377895355, "logps/chosen": -307.3999938964844, "logps/rejected": -298.3999938964844, "loss": 0.6586, "rewards/accuracies": 0.59333336353302, "rewards/chosen": 0.24179688096046448, "rewards/margins": 0.1076202392578125, "rewards/rejected": 0.1341552734375, "step": 405 }, { "epoch": 0.10556127703398559, "grad_norm": 114.5, "learning_rate": 4.472193614830072e-07, "logits/chosen": -0.2607421875, "logits/rejected": -0.27021485567092896, "logps/chosen": -302.6000061035156, "logps/rejected": -261.20001220703125, "loss": 0.657, "rewards/accuracies": 0.4791666865348816, "rewards/chosen": 0.150390625, "rewards/margins": 0.08657226711511612, "rewards/rejected": 0.06356201320886612, "step": 410 }, { "epoch": 0.1068486096807415, "grad_norm": 150.0, "learning_rate": 4.4657569515962926e-07, "logits/chosen": -0.22905273735523224, "logits/rejected": -0.29277342557907104, "logps/chosen": -292.20001220703125, "logps/rejected": -276.3999938964844, "loss": 0.6375, "rewards/accuracies": 0.5440934300422668, "rewards/chosen": 0.20136718451976776, "rewards/margins": 0.13632813096046448, "rewards/rejected": 0.0650634765625, "step": 415 }, { "epoch": 0.10813594232749743, "grad_norm": 132.0, "learning_rate": 4.4593202883625124e-07, "logits/chosen": -0.19296875596046448, "logits/rejected": -0.20527343451976776, "logps/chosen": -288.3999938964844, "logps/rejected": -270.20001220703125, "loss": 0.668, "rewards/accuracies": 0.53083336353302, "rewards/chosen": 0.13759765028953552, "rewards/margins": 0.07159423828125, "rewards/rejected": 0.06615600734949112, "step": 420 }, { "epoch": 0.10942327497425335, "grad_norm": 148.0, "learning_rate": 4.4528836251287334e-07, "logits/chosen": -0.3128906190395355, "logits/rejected": -0.36030274629592896, "logps/chosen": -293.6000061035156, "logps/rejected": -308.79998779296875, "loss": 0.6734, "rewards/accuracies": 0.4766666889190674, "rewards/chosen": 0.15146484971046448, "rewards/margins": 0.05170898512005806, "rewards/rejected": 0.10001220554113388, "step": 425 }, { "epoch": 0.11071060762100927, "grad_norm": 126.0, "learning_rate": 4.4464469618949533e-07, "logits/chosen": -0.20818480849266052, "logits/rejected": -0.17268066108226776, "logps/chosen": -260.3999938964844, "logps/rejected": -272.3999938964844, "loss": 0.6773, "rewards/accuracies": 0.5232142806053162, "rewards/chosen": 0.13007812201976776, "rewards/margins": 0.055419921875, "rewards/rejected": 0.07471923530101776, "step": 430 }, { "epoch": 0.11199794026776519, "grad_norm": 136.0, "learning_rate": 4.4400102986611737e-07, "logits/chosen": -0.3052734434604645, "logits/rejected": -0.2783203125, "logps/chosen": -324.3999938964844, "logps/rejected": -307.6000061035156, "loss": 0.6797, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.17822265625, "rewards/margins": 0.06101074069738388, "rewards/rejected": 0.11748047173023224, "step": 435 }, { "epoch": 0.11328527291452112, "grad_norm": 136.0, "learning_rate": 4.4335736354273947e-07, "logits/chosen": -0.2699218690395355, "logits/rejected": -0.24677734076976776, "logps/chosen": -329.20001220703125, "logps/rejected": -302.3999938964844, "loss": 0.6828, "rewards/accuracies": 0.4425000250339508, "rewards/chosen": 0.19130858778953552, "rewards/margins": 0.05561523512005806, "rewards/rejected": 0.13603515923023224, "step": 440 }, { "epoch": 0.11457260556127703, "grad_norm": 104.5, "learning_rate": 4.4271369721936146e-07, "logits/chosen": -0.24139404296875, "logits/rejected": -0.19287109375, "logps/chosen": -272.0, "logps/rejected": -250.39999389648438, "loss": 0.65, "rewards/accuracies": 0.6230769157409668, "rewards/chosen": 0.1884765625, "rewards/margins": 0.09650878608226776, "rewards/rejected": 0.09213867038488388, "step": 445 }, { "epoch": 0.11585993820803296, "grad_norm": 115.5, "learning_rate": 4.420700308959835e-07, "logits/chosen": -0.2740722596645355, "logits/rejected": -0.31328123807907104, "logps/chosen": -301.20001220703125, "logps/rejected": -325.6000061035156, "loss": 0.6813, "rewards/accuracies": 0.511025607585907, "rewards/chosen": 0.14902344346046448, "rewards/margins": 0.07402344048023224, "rewards/rejected": 0.07492675632238388, "step": 450 }, { "epoch": 0.11714727085478888, "grad_norm": 125.5, "learning_rate": 4.414263645726055e-07, "logits/chosen": -0.2845703065395355, "logits/rejected": -0.2627929747104645, "logps/chosen": -288.79998779296875, "logps/rejected": -281.6000061035156, "loss": 0.6578, "rewards/accuracies": 0.6133333444595337, "rewards/chosen": 0.13017578423023224, "rewards/margins": 0.08662109076976776, "rewards/rejected": 0.043611906468868256, "step": 455 }, { "epoch": 0.1184346035015448, "grad_norm": 135.0, "learning_rate": 4.407826982492276e-07, "logits/chosen": -0.3082031309604645, "logits/rejected": -0.3052734434604645, "logps/chosen": -287.3999938964844, "logps/rejected": -280.0, "loss": 0.6898, "rewards/accuracies": 0.42750000953674316, "rewards/chosen": 0.126708984375, "rewards/margins": 0.025634765625, "rewards/rejected": 0.10136719048023224, "step": 460 }, { "epoch": 0.11972193614830072, "grad_norm": 126.0, "learning_rate": 4.4013903192584963e-07, "logits/chosen": -0.2197265625, "logits/rejected": -0.130126953125, "logps/chosen": -267.0, "logps/rejected": -269.79998779296875, "loss": 0.65, "rewards/accuracies": 0.5803030729293823, "rewards/chosen": 0.193359375, "rewards/margins": 0.10043945163488388, "rewards/rejected": 0.09291992336511612, "step": 465 }, { "epoch": 0.12100926879505665, "grad_norm": 158.0, "learning_rate": 4.394953656024716e-07, "logits/chosen": -0.3115234375, "logits/rejected": -0.25078123807907104, "logps/chosen": -315.0, "logps/rejected": -275.20001220703125, "loss": 0.6484, "rewards/accuracies": 0.5606410503387451, "rewards/chosen": 0.19902344048023224, "rewards/margins": 0.10244140774011612, "rewards/rejected": 0.09677734225988388, "step": 470 }, { "epoch": 0.12229660144181256, "grad_norm": 114.0, "learning_rate": 4.388516992790937e-07, "logits/chosen": -0.21347656846046448, "logits/rejected": -0.18437500298023224, "logps/chosen": -257.20001220703125, "logps/rejected": -288.6000061035156, "loss": 0.6531, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.12236328423023224, "rewards/margins": 0.09687499701976776, "rewards/rejected": 0.02541198767721653, "step": 475 }, { "epoch": 0.12358393408856849, "grad_norm": 180.0, "learning_rate": 4.3820803295571576e-07, "logits/chosen": -0.21269531548023224, "logits/rejected": -0.24375610053539276, "logps/chosen": -316.79998779296875, "logps/rejected": -275.1000061035156, "loss": 0.6336, "rewards/accuracies": 0.710357129573822, "rewards/chosen": 0.23964843153953552, "rewards/margins": 0.16708984971046448, "rewards/rejected": 0.07272949069738388, "step": 480 }, { "epoch": 0.12487126673532441, "grad_norm": 127.0, "learning_rate": 4.3756436663233775e-07, "logits/chosen": -0.32734376192092896, "logits/rejected": -0.27998048067092896, "logps/chosen": -306.6000061035156, "logps/rejected": -326.20001220703125, "loss": 0.6727, "rewards/accuracies": 0.5188636183738708, "rewards/chosen": 0.20927734673023224, "rewards/margins": 0.06523437798023224, "rewards/rejected": 0.14433594048023224, "step": 485 }, { "epoch": 0.12615859938208032, "grad_norm": 116.0, "learning_rate": 4.3692070030895984e-07, "logits/chosen": -0.18757323920726776, "logits/rejected": -0.2518554627895355, "logps/chosen": -246.60000610351562, "logps/rejected": -309.79998779296875, "loss": 0.6664, "rewards/accuracies": 0.515576958656311, "rewards/chosen": 0.119140625, "rewards/margins": 0.06613769382238388, "rewards/rejected": 0.05268554762005806, "step": 490 }, { "epoch": 0.12744593202883625, "grad_norm": 142.0, "learning_rate": 4.3627703398558183e-07, "logits/chosen": -0.2652343809604645, "logits/rejected": -0.195556640625, "logps/chosen": -260.79998779296875, "logps/rejected": -309.20001220703125, "loss": 0.6477, "rewards/accuracies": 0.5491666793823242, "rewards/chosen": 0.21308593451976776, "rewards/margins": 0.11894531548023224, "rewards/rejected": 0.09455566108226776, "step": 495 }, { "epoch": 0.12873326467559218, "grad_norm": 117.5, "learning_rate": 4.356333676622039e-07, "logits/chosen": -0.2216796875, "logits/rejected": -0.05421142652630806, "logps/chosen": -274.3999938964844, "logps/rejected": -256.6000061035156, "loss": 0.6398, "rewards/accuracies": 0.6353572010993958, "rewards/chosen": 0.19345703721046448, "rewards/margins": 0.13037109375, "rewards/rejected": 0.06308593600988388, "step": 500 }, { "epoch": 0.1300205973223481, "grad_norm": 116.0, "learning_rate": 4.3498970133882597e-07, "logits/chosen": -0.34687501192092896, "logits/rejected": -0.32011717557907104, "logps/chosen": -293.79998779296875, "logps/rejected": -259.0, "loss": 0.6477, "rewards/accuracies": 0.5697802305221558, "rewards/chosen": 0.22343750298023224, "rewards/margins": 0.12272949516773224, "rewards/rejected": 0.1005859375, "step": 505 }, { "epoch": 0.131307929969104, "grad_norm": 120.0, "learning_rate": 4.3434603501544796e-07, "logits/chosen": -0.3275390565395355, "logits/rejected": -0.29082030057907104, "logps/chosen": -285.6000061035156, "logps/rejected": -260.6000061035156, "loss": 0.6648, "rewards/accuracies": 0.5275000333786011, "rewards/chosen": 0.16425780951976776, "rewards/margins": 0.09200439602136612, "rewards/rejected": 0.07244873046875, "step": 510 }, { "epoch": 0.13259526261585994, "grad_norm": 112.0, "learning_rate": 4.3370236869207e-07, "logits/chosen": -0.35859376192092896, "logits/rejected": -0.26347655057907104, "logps/chosen": -277.0, "logps/rejected": -253.60000610351562, "loss": 0.6156, "rewards/accuracies": 0.6905086636543274, "rewards/chosen": 0.224609375, "rewards/margins": 0.18354491889476776, "rewards/rejected": 0.04160156100988388, "step": 515 }, { "epoch": 0.13388259526261587, "grad_norm": 118.0, "learning_rate": 4.330587023686921e-07, "logits/chosen": -0.17543944716453552, "logits/rejected": -0.2835937440395355, "logps/chosen": -292.6000061035156, "logps/rejected": -313.0, "loss": 0.6578, "rewards/accuracies": 0.534166693687439, "rewards/chosen": 0.17753906548023224, "rewards/margins": 0.09375, "rewards/rejected": 0.08371581882238388, "step": 520 }, { "epoch": 0.1351699279093718, "grad_norm": 165.0, "learning_rate": 4.324150360453141e-07, "logits/chosen": -0.38679200410842896, "logits/rejected": -0.28369140625, "logps/chosen": -346.79998779296875, "logps/rejected": -279.0, "loss": 0.6516, "rewards/accuracies": 0.6016666293144226, "rewards/chosen": 0.19921875, "rewards/margins": 0.11733398586511612, "rewards/rejected": 0.08188476413488388, "step": 525 }, { "epoch": 0.1364572605561277, "grad_norm": 260.0, "learning_rate": 4.3177136972193613e-07, "logits/chosen": -0.4075683653354645, "logits/rejected": -0.25468748807907104, "logps/chosen": -287.79998779296875, "logps/rejected": -277.20001220703125, "loss": 0.6594, "rewards/accuracies": 0.4650000035762787, "rewards/chosen": 0.22734375298023224, "rewards/margins": 0.09101562201976776, "rewards/rejected": 0.13623046875, "step": 530 }, { "epoch": 0.13774459320288363, "grad_norm": 148.0, "learning_rate": 4.311277033985581e-07, "logits/chosen": -0.33720701932907104, "logits/rejected": -0.2982421815395355, "logps/chosen": -297.6000061035156, "logps/rejected": -317.20001220703125, "loss": 0.6883, "rewards/accuracies": 0.4550732672214508, "rewards/chosen": 0.19023437798023224, "rewards/margins": 0.0400390625, "rewards/rejected": 0.15029296278953552, "step": 535 }, { "epoch": 0.13903192584963955, "grad_norm": 132.0, "learning_rate": 4.304840370751802e-07, "logits/chosen": -0.12543945014476776, "logits/rejected": 0.04807128757238388, "logps/chosen": -263.75, "logps/rejected": -286.1000061035156, "loss": 0.6625, "rewards/accuracies": 0.534166693687439, "rewards/chosen": 0.2003173828125, "rewards/margins": 0.09206543117761612, "rewards/rejected": 0.10834960639476776, "step": 540 }, { "epoch": 0.14031925849639545, "grad_norm": 129.0, "learning_rate": 4.2984037075180226e-07, "logits/chosen": -0.20380859076976776, "logits/rejected": -0.15693359076976776, "logps/chosen": -284.6000061035156, "logps/rejected": -270.20001220703125, "loss": 0.6477, "rewards/accuracies": 0.5584524273872375, "rewards/chosen": 0.16660156846046448, "rewards/margins": 0.11865234375, "rewards/rejected": 0.04779052734375, "step": 545 }, { "epoch": 0.14160659114315138, "grad_norm": 118.5, "learning_rate": 4.2919670442842425e-07, "logits/chosen": -0.40800780057907104, "logits/rejected": -0.3677734434604645, "logps/chosen": -312.20001220703125, "logps/rejected": -267.6000061035156, "loss": 0.6594, "rewards/accuracies": 0.49916666746139526, "rewards/chosen": 0.21308593451976776, "rewards/margins": 0.089630126953125, "rewards/rejected": 0.12333984673023224, "step": 550 }, { "epoch": 0.1428939237899073, "grad_norm": 144.0, "learning_rate": 4.2855303810504634e-07, "logits/chosen": -0.3073486387729645, "logits/rejected": -0.32203370332717896, "logps/chosen": -262.3999938964844, "logps/rejected": -257.0, "loss": 0.6453, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": 0.20751953125, "rewards/margins": 0.12636718153953552, "rewards/rejected": 0.08082275092601776, "step": 555 }, { "epoch": 0.14418125643666324, "grad_norm": 128.0, "learning_rate": 4.2790937178166833e-07, "logits/chosen": -0.31181639432907104, "logits/rejected": -0.30126953125, "logps/chosen": -275.79998779296875, "logps/rejected": -257.6000061035156, "loss": 0.6937, "rewards/accuracies": 0.4191666543483734, "rewards/chosen": 0.19648437201976776, "rewards/margins": 0.02781372144818306, "rewards/rejected": 0.16855469346046448, "step": 560 }, { "epoch": 0.14546858908341914, "grad_norm": 117.5, "learning_rate": 4.272657054582904e-07, "logits/chosen": -0.31523436307907104, "logits/rejected": -0.23974609375, "logps/chosen": -313.3999938964844, "logps/rejected": -267.79998779296875, "loss": 0.6273, "rewards/accuracies": 0.6324999928474426, "rewards/chosen": 0.20371094346046448, "rewards/margins": 0.16074219346046448, "rewards/rejected": 0.04288329929113388, "step": 565 }, { "epoch": 0.14675592173017507, "grad_norm": 106.5, "learning_rate": 4.2662203913491247e-07, "logits/chosen": -0.16997070610523224, "logits/rejected": 0.004162597469985485, "logps/chosen": -271.20001220703125, "logps/rejected": -250.8000030517578, "loss": 0.6859, "rewards/accuracies": 0.5451923608779907, "rewards/chosen": 0.10227050632238388, "rewards/margins": 0.05811767652630806, "rewards/rejected": 0.04408111423254013, "step": 570 }, { "epoch": 0.148043254376931, "grad_norm": 139.0, "learning_rate": 4.2597837281153446e-07, "logits/chosen": -0.23818358778953552, "logits/rejected": -0.24111327528953552, "logps/chosen": -263.0, "logps/rejected": -315.3999938964844, "loss": 0.682, "rewards/accuracies": 0.5608333349227905, "rewards/chosen": 0.15556640923023224, "rewards/margins": 0.05327148362994194, "rewards/rejected": 0.10214843600988388, "step": 575 }, { "epoch": 0.14933058702368693, "grad_norm": 112.5, "learning_rate": 4.253347064881565e-07, "logits/chosen": -0.4419921934604645, "logits/rejected": -0.3089843690395355, "logps/chosen": -329.20001220703125, "logps/rejected": -318.0, "loss": 0.6328, "rewards/accuracies": 0.5861905217170715, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.14829102158546448, "rewards/rejected": 0.09733887016773224, "step": 580 }, { "epoch": 0.15061791967044283, "grad_norm": 125.0, "learning_rate": 4.246910401647786e-07, "logits/chosen": -0.29296875, "logits/rejected": -0.20571288466453552, "logps/chosen": -340.79998779296875, "logps/rejected": -317.20001220703125, "loss": 0.6203, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20039062201976776, "rewards/margins": 0.18769530951976776, "rewards/rejected": 0.013079834170639515, "step": 585 }, { "epoch": 0.15190525231719876, "grad_norm": 133.0, "learning_rate": 4.240473738414006e-07, "logits/chosen": -0.3423828184604645, "logits/rejected": -0.3177734315395355, "logps/chosen": -348.79998779296875, "logps/rejected": -305.0, "loss": 0.6461, "rewards/accuracies": 0.490119069814682, "rewards/chosen": 0.24648436903953552, "rewards/margins": 0.12675781548023224, "rewards/rejected": 0.11979980766773224, "step": 590 }, { "epoch": 0.1531925849639547, "grad_norm": 134.0, "learning_rate": 4.2340370751802263e-07, "logits/chosen": -0.3602539002895355, "logits/rejected": -0.3282226622104645, "logps/chosen": -334.0, "logps/rejected": -288.0, "loss": 0.6203, "rewards/accuracies": 0.554807722568512, "rewards/chosen": 0.24238280951976776, "rewards/margins": 0.17402343451976776, "rewards/rejected": 0.06873778998851776, "step": 595 }, { "epoch": 0.15447991761071062, "grad_norm": 101.0, "learning_rate": 4.227600411946447e-07, "logits/chosen": -0.4273437559604645, "logits/rejected": -0.3843750059604645, "logps/chosen": -288.0, "logps/rejected": -224.6999969482422, "loss": 0.6258, "rewards/accuracies": 0.6024999618530273, "rewards/chosen": 0.236328125, "rewards/margins": 0.15610352158546448, "rewards/rejected": 0.08002929389476776, "step": 600 }, { "epoch": 0.15576725025746652, "grad_norm": 139.0, "learning_rate": 4.221163748712667e-07, "logits/chosen": -0.3431640565395355, "logits/rejected": -0.2943359315395355, "logps/chosen": -375.6000061035156, "logps/rejected": -318.3999938964844, "loss": 0.6531, "rewards/accuracies": 0.5366666913032532, "rewards/chosen": 0.23671874403953552, "rewards/margins": 0.12138672173023224, "rewards/rejected": 0.11552734673023224, "step": 605 }, { "epoch": 0.15705458290422245, "grad_norm": 174.0, "learning_rate": 4.2147270854788876e-07, "logits/chosen": -0.20731811225414276, "logits/rejected": -0.14223632216453552, "logps/chosen": -267.3999938964844, "logps/rejected": -247.1999969482422, "loss": 0.6148, "rewards/accuracies": 0.6649999618530273, "rewards/chosen": 0.19511719048023224, "rewards/margins": 0.18671874701976776, "rewards/rejected": 0.00846252404153347, "step": 610 }, { "epoch": 0.15834191555097837, "grad_norm": 129.0, "learning_rate": 4.208290422245108e-07, "logits/chosen": -0.38359373807907104, "logits/rejected": -0.2581543028354645, "logps/chosen": -313.20001220703125, "logps/rejected": -290.79998779296875, "loss": 0.6523, "rewards/accuracies": 0.6008332967758179, "rewards/chosen": 0.21982422471046448, "rewards/margins": 0.12448730319738388, "rewards/rejected": 0.09530029445886612, "step": 615 }, { "epoch": 0.1596292481977343, "grad_norm": 131.0, "learning_rate": 4.2018537590113285e-07, "logits/chosen": -0.18037109076976776, "logits/rejected": -0.29374998807907104, "logps/chosen": -246.60000610351562, "logps/rejected": -314.79998779296875, "loss": 0.6469, "rewards/accuracies": 0.6207143068313599, "rewards/chosen": 0.15371093153953552, "rewards/margins": 0.107666015625, "rewards/rejected": 0.04611816257238388, "step": 620 }, { "epoch": 0.1609165808444902, "grad_norm": 235.0, "learning_rate": 4.195417095777549e-07, "logits/chosen": -0.3179687559604645, "logits/rejected": -0.2589111328125, "logps/chosen": -303.6000061035156, "logps/rejected": -269.0, "loss": 0.6125, "rewards/accuracies": 0.6692307591438293, "rewards/chosen": 0.24453124403953552, "rewards/margins": 0.20004883408546448, "rewards/rejected": 0.04472656175494194, "step": 625 }, { "epoch": 0.16220391349124613, "grad_norm": 112.5, "learning_rate": 4.188980432543769e-07, "logits/chosen": -0.2998046875, "logits/rejected": -0.180419921875, "logps/chosen": -263.0, "logps/rejected": -212.39999389648438, "loss": 0.6406, "rewards/accuracies": 0.6029545664787292, "rewards/chosen": 0.19843749701976776, "rewards/margins": 0.14462891221046448, "rewards/rejected": 0.053497314453125, "step": 630 }, { "epoch": 0.16349124613800206, "grad_norm": 130.0, "learning_rate": 4.18254376930999e-07, "logits/chosen": -0.23720702528953552, "logits/rejected": -0.18540039658546448, "logps/chosen": -314.6000061035156, "logps/rejected": -307.0, "loss": 0.6594, "rewards/accuracies": 0.5766667127609253, "rewards/chosen": 0.21699218451976776, "rewards/margins": 0.1092529296875, "rewards/rejected": 0.10820312798023224, "step": 635 }, { "epoch": 0.164778578784758, "grad_norm": 154.0, "learning_rate": 4.1761071060762096e-07, "logits/chosen": -0.3628906309604645, "logits/rejected": -0.28984373807907104, "logps/chosen": -288.79998779296875, "logps/rejected": -272.20001220703125, "loss": 0.6898, "rewards/accuracies": 0.4348718225955963, "rewards/chosen": 0.14306640625, "rewards/margins": 0.02601318433880806, "rewards/rejected": 0.11713866889476776, "step": 640 }, { "epoch": 0.1660659114315139, "grad_norm": 143.0, "learning_rate": 4.16967044284243e-07, "logits/chosen": -0.23066405951976776, "logits/rejected": -0.18076172471046448, "logps/chosen": -307.6000061035156, "logps/rejected": -293.6000061035156, "loss": 0.6719, "rewards/accuracies": 0.4972619116306305, "rewards/chosen": 0.16494140028953552, "rewards/margins": 0.05735473707318306, "rewards/rejected": 0.10764160007238388, "step": 645 }, { "epoch": 0.16735324407826982, "grad_norm": 153.0, "learning_rate": 4.163233779608651e-07, "logits/chosen": -0.28081053495407104, "logits/rejected": -0.3490234315395355, "logps/chosen": -318.6000061035156, "logps/rejected": -306.79998779296875, "loss": 0.6414, "rewards/accuracies": 0.625, "rewards/chosen": 0.25117188692092896, "rewards/margins": 0.13417968153953552, "rewards/rejected": 0.11679687350988388, "step": 650 }, { "epoch": 0.16864057672502575, "grad_norm": 110.0, "learning_rate": 4.156797116374871e-07, "logits/chosen": -0.32207030057907104, "logits/rejected": -0.2759765684604645, "logps/chosen": -229.1999969482422, "logps/rejected": -229.39999389648438, "loss": 0.6586, "rewards/accuracies": 0.5423077344894409, "rewards/chosen": 0.15341797471046448, "rewards/margins": 0.08717956393957138, "rewards/rejected": 0.06611327826976776, "step": 655 }, { "epoch": 0.16992790937178168, "grad_norm": 114.5, "learning_rate": 4.1503604531410913e-07, "logits/chosen": -0.40898436307907104, "logits/rejected": -0.32792967557907104, "logps/chosen": -354.20001220703125, "logps/rejected": -293.3999938964844, "loss": 0.6172, "rewards/accuracies": 0.5691666603088379, "rewards/chosen": 0.2451171875, "rewards/margins": 0.18559570610523224, "rewards/rejected": 0.05979614332318306, "step": 660 }, { "epoch": 0.17121524201853758, "grad_norm": 137.0, "learning_rate": 4.1439237899073123e-07, "logits/chosen": -0.10302734375, "logits/rejected": -0.13535156846046448, "logps/chosen": -279.3999938964844, "logps/rejected": -321.6000061035156, "loss": 0.6469, "rewards/accuracies": 0.5733333826065063, "rewards/chosen": 0.20454101264476776, "rewards/margins": 0.12111816555261612, "rewards/rejected": 0.08370666205883026, "step": 665 }, { "epoch": 0.1725025746652935, "grad_norm": 105.0, "learning_rate": 4.137487126673532e-07, "logits/chosen": -0.34199219942092896, "logits/rejected": -0.3046875, "logps/chosen": -299.6000061035156, "logps/rejected": -290.79998779296875, "loss": 0.6164, "rewards/accuracies": 0.6553571820259094, "rewards/chosen": 0.23720702528953552, "rewards/margins": 0.17939452826976776, "rewards/rejected": 0.057769775390625, "step": 670 }, { "epoch": 0.17378990731204944, "grad_norm": 145.0, "learning_rate": 4.1310504634397526e-07, "logits/chosen": -0.32515257596969604, "logits/rejected": -0.3157958984375, "logps/chosen": -348.0, "logps/rejected": -265.0, "loss": 0.6414, "rewards/accuracies": 0.6119047999382019, "rewards/chosen": 0.2001953125, "rewards/margins": 0.12539061903953552, "rewards/rejected": 0.07485351711511612, "step": 675 }, { "epoch": 0.17507723995880536, "grad_norm": 118.5, "learning_rate": 4.124613800205973e-07, "logits/chosen": -0.4046874940395355, "logits/rejected": -0.26708984375, "logps/chosen": -328.6000061035156, "logps/rejected": -360.3999938964844, "loss": 0.6617, "rewards/accuracies": 0.5683333277702332, "rewards/chosen": 0.21494141221046448, "rewards/margins": 0.11171875149011612, "rewards/rejected": 0.10317382961511612, "step": 680 }, { "epoch": 0.17636457260556127, "grad_norm": 114.5, "learning_rate": 4.1181771369721935e-07, "logits/chosen": -0.22441406548023224, "logits/rejected": -0.1478271484375, "logps/chosen": -331.6000061035156, "logps/rejected": -270.20001220703125, "loss": 0.643, "rewards/accuracies": 0.5808333158493042, "rewards/chosen": 0.23154297471046448, "rewards/margins": 0.13771972060203552, "rewards/rejected": 0.09345702826976776, "step": 685 }, { "epoch": 0.1776519052523172, "grad_norm": 124.0, "learning_rate": 4.111740473738414e-07, "logits/chosen": -0.36347657442092896, "logits/rejected": -0.30546873807907104, "logps/chosen": -314.79998779296875, "logps/rejected": -338.3999938964844, "loss": 0.6758, "rewards/accuracies": 0.5400000810623169, "rewards/chosen": 0.15888671576976776, "rewards/margins": 0.06025390699505806, "rewards/rejected": 0.09892578423023224, "step": 690 }, { "epoch": 0.17893923789907312, "grad_norm": 164.0, "learning_rate": 4.1053038105046343e-07, "logits/chosen": -0.3160156309604645, "logits/rejected": -0.27519530057907104, "logps/chosen": -368.79998779296875, "logps/rejected": -318.0, "loss": 0.6359, "rewards/accuracies": 0.5800000429153442, "rewards/chosen": 0.22539062798023224, "rewards/margins": 0.14560547471046448, "rewards/rejected": 0.07971344143152237, "step": 695 }, { "epoch": 0.18022657054582905, "grad_norm": 207.0, "learning_rate": 4.098867147270855e-07, "logits/chosen": -0.31425780057907104, "logits/rejected": -0.416015625, "logps/chosen": -335.79998779296875, "logps/rejected": -304.0, "loss": 0.6625, "rewards/accuracies": 0.5182143449783325, "rewards/chosen": 0.24863281846046448, "rewards/margins": 0.09174804389476776, "rewards/rejected": 0.15676574409008026, "step": 700 }, { "epoch": 0.18151390319258495, "grad_norm": 115.0, "learning_rate": 4.0924304840370747e-07, "logits/chosen": -0.2811523377895355, "logits/rejected": -0.2685546875, "logps/chosen": -313.20001220703125, "logps/rejected": -308.3999938964844, "loss": 0.6219, "rewards/accuracies": 0.6351190805435181, "rewards/chosen": 0.2470703125, "rewards/margins": 0.17578125, "rewards/rejected": 0.07119140774011612, "step": 705 }, { "epoch": 0.18280123583934088, "grad_norm": 137.0, "learning_rate": 4.085993820803295e-07, "logits/chosen": -0.3584960997104645, "logits/rejected": -0.3773437440395355, "logps/chosen": -322.3999938964844, "logps/rejected": -302.79998779296875, "loss": 0.6672, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.18730469048023224, "rewards/margins": 0.09260253608226776, "rewards/rejected": 0.09453125298023224, "step": 710 }, { "epoch": 0.1840885684860968, "grad_norm": 137.0, "learning_rate": 4.079557157569516e-07, "logits/chosen": -0.3470703065395355, "logits/rejected": -0.49687498807907104, "logps/chosen": -238.4499969482422, "logps/rejected": -247.60000610351562, "loss": 0.6391, "rewards/accuracies": 0.5017857551574707, "rewards/chosen": 0.21059569716453552, "rewards/margins": 0.15239258110523224, "rewards/rejected": 0.05859375, "step": 715 }, { "epoch": 0.18537590113285274, "grad_norm": 120.0, "learning_rate": 4.073120494335736e-07, "logits/chosen": -0.45781248807907104, "logits/rejected": -0.42167967557907104, "logps/chosen": -320.6000061035156, "logps/rejected": -288.20001220703125, "loss": 0.6562, "rewards/accuracies": 0.5333333611488342, "rewards/chosen": 0.22158202528953552, "rewards/margins": 0.09648437798023224, "rewards/rejected": 0.12517090141773224, "step": 720 }, { "epoch": 0.18666323377960864, "grad_norm": 106.0, "learning_rate": 4.0666838311019564e-07, "logits/chosen": -0.19287109375, "logits/rejected": -0.0897216796875, "logps/chosen": -302.3999938964844, "logps/rejected": -262.3999938964844, "loss": 0.6305, "rewards/accuracies": 0.6448076963424683, "rewards/chosen": 0.21962890028953552, "rewards/margins": 0.16650390625, "rewards/rejected": 0.05291748046875, "step": 725 }, { "epoch": 0.18795056642636457, "grad_norm": 139.0, "learning_rate": 4.0602471678681773e-07, "logits/chosen": -0.20634765923023224, "logits/rejected": -0.107421875, "logps/chosen": -306.6000061035156, "logps/rejected": -308.79998779296875, "loss": 0.6656, "rewards/accuracies": 0.4464285969734192, "rewards/chosen": 0.19228515028953552, "rewards/margins": 0.087158203125, "rewards/rejected": 0.10537109524011612, "step": 730 }, { "epoch": 0.1892378990731205, "grad_norm": 130.0, "learning_rate": 4.053810504634397e-07, "logits/chosen": -0.2790283262729645, "logits/rejected": -0.36445313692092896, "logps/chosen": -339.79998779296875, "logps/rejected": -344.0, "loss": 0.6719, "rewards/accuracies": 0.47161421179771423, "rewards/chosen": 0.24443359673023224, "rewards/margins": 0.07243652641773224, "rewards/rejected": 0.17221680283546448, "step": 735 }, { "epoch": 0.19052523171987643, "grad_norm": 155.0, "learning_rate": 4.0473738414006176e-07, "logits/chosen": -0.28935545682907104, "logits/rejected": -0.3360351622104645, "logps/chosen": -291.0, "logps/rejected": -284.0, "loss": 0.6531, "rewards/accuracies": 0.5183333158493042, "rewards/chosen": 0.21171875298023224, "rewards/margins": 0.09981689602136612, "rewards/rejected": 0.11210937798023224, "step": 740 }, { "epoch": 0.19181256436663233, "grad_norm": 118.5, "learning_rate": 4.040937178166838e-07, "logits/chosen": -0.40751951932907104, "logits/rejected": -0.22634276747703552, "logps/chosen": -340.20001220703125, "logps/rejected": -271.3999938964844, "loss": 0.6109, "rewards/accuracies": 0.6076191067695618, "rewards/chosen": 0.31660157442092896, "rewards/margins": 0.20869140326976776, "rewards/rejected": 0.10791015625, "step": 745 }, { "epoch": 0.19309989701338826, "grad_norm": 135.0, "learning_rate": 4.0345005149330585e-07, "logits/chosen": -0.24062499403953552, "logits/rejected": -0.2933593690395355, "logps/chosen": -287.3999938964844, "logps/rejected": -266.0, "loss": 0.6445, "rewards/accuracies": 0.560952365398407, "rewards/chosen": 0.21875, "rewards/margins": 0.12744140625, "rewards/rejected": 0.09123535454273224, "step": 750 }, { "epoch": 0.19438722966014418, "grad_norm": 153.0, "learning_rate": 4.028063851699279e-07, "logits/chosen": -0.3680664002895355, "logits/rejected": -0.2857421934604645, "logps/chosen": -327.0, "logps/rejected": -312.79998779296875, "loss": 0.6797, "rewards/accuracies": 0.49632781744003296, "rewards/chosen": 0.21367187798023224, "rewards/margins": 0.09194336086511612, "rewards/rejected": 0.12147130817174911, "step": 755 }, { "epoch": 0.1956745623069001, "grad_norm": 130.0, "learning_rate": 4.0216271884654994e-07, "logits/chosen": -0.20424804091453552, "logits/rejected": -0.22900390625, "logps/chosen": -279.3999938964844, "logps/rejected": -284.0, "loss": 0.6633, "rewards/accuracies": 0.5027381181716919, "rewards/chosen": 0.19716796278953552, "rewards/margins": 0.09023437649011612, "rewards/rejected": 0.10688476264476776, "step": 760 }, { "epoch": 0.196961894953656, "grad_norm": 127.5, "learning_rate": 4.01519052523172e-07, "logits/chosen": -0.2577148377895355, "logits/rejected": -0.21113280951976776, "logps/chosen": -278.0, "logps/rejected": -238.1999969482422, "loss": 0.6344, "rewards/accuracies": 0.6159848570823669, "rewards/chosen": 0.19853515923023224, "rewards/margins": 0.138671875, "rewards/rejected": 0.0599365234375, "step": 765 }, { "epoch": 0.19824922760041194, "grad_norm": 132.0, "learning_rate": 4.00875386199794e-07, "logits/chosen": -0.31816405057907104, "logits/rejected": -0.36689454317092896, "logps/chosen": -328.20001220703125, "logps/rejected": -286.3999938964844, "loss": 0.6328, "rewards/accuracies": 0.621666669845581, "rewards/chosen": 0.26191407442092896, "rewards/margins": 0.17200927436351776, "rewards/rejected": 0.08999023586511612, "step": 770 }, { "epoch": 0.19953656024716787, "grad_norm": 121.0, "learning_rate": 4.0023171987641606e-07, "logits/chosen": -0.2935546934604645, "logits/rejected": -0.2896484434604645, "logps/chosen": -302.6000061035156, "logps/rejected": -277.20001220703125, "loss": 0.6445, "rewards/accuracies": 0.5857142806053162, "rewards/chosen": 0.18916015326976776, "rewards/margins": 0.13341674208641052, "rewards/rejected": 0.05574340745806694, "step": 775 }, { "epoch": 0.2008238928939238, "grad_norm": 107.5, "learning_rate": 3.995880535530381e-07, "logits/chosen": -0.3455856442451477, "logits/rejected": -0.4048828184604645, "logps/chosen": -301.0, "logps/rejected": -295.79998779296875, "loss": 0.6406, "rewards/accuracies": 0.5994445085525513, "rewards/chosen": 0.23046875, "rewards/margins": 0.1328125, "rewards/rejected": 0.09775390475988388, "step": 780 }, { "epoch": 0.2021112255406797, "grad_norm": 125.5, "learning_rate": 3.989443872296601e-07, "logits/chosen": -0.2757812440395355, "logits/rejected": -0.21533203125, "logps/chosen": -271.3999938964844, "logps/rejected": -244.39999389648438, "loss": 0.6523, "rewards/accuracies": 0.5539394021034241, "rewards/chosen": 0.22187499701976776, "rewards/margins": 0.11470947414636612, "rewards/rejected": 0.10730590671300888, "step": 785 }, { "epoch": 0.20339855818743563, "grad_norm": 136.0, "learning_rate": 3.9830072090628214e-07, "logits/chosen": -0.29902344942092896, "logits/rejected": -0.392578125, "logps/chosen": -278.79998779296875, "logps/rejected": -246.0, "loss": 0.6477, "rewards/accuracies": 0.5900000333786011, "rewards/chosen": 0.18813475966453552, "rewards/margins": 0.12852783501148224, "rewards/rejected": 0.0595703125, "step": 790 }, { "epoch": 0.20468589083419156, "grad_norm": 129.0, "learning_rate": 3.9765705458290423e-07, "logits/chosen": -0.3394531309604645, "logits/rejected": -0.3111328184604645, "logps/chosen": -328.0, "logps/rejected": -282.20001220703125, "loss": 0.632, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.25, "rewards/margins": 0.1572265625, "rewards/rejected": 0.0927734375, "step": 795 }, { "epoch": 0.2059732234809475, "grad_norm": 110.0, "learning_rate": 3.970133882595262e-07, "logits/chosen": -0.2991699278354645, "logits/rejected": -0.16586914658546448, "logps/chosen": -291.20001220703125, "logps/rejected": -283.3999938964844, "loss": 0.6516, "rewards/accuracies": 0.5894047617912292, "rewards/chosen": 0.15936279296875, "rewards/margins": 0.12119140475988388, "rewards/rejected": 0.03803711012005806, "step": 800 }, { "epoch": 0.2072605561277034, "grad_norm": 106.0, "learning_rate": 3.9636972193614827e-07, "logits/chosen": -0.27363282442092896, "logits/rejected": -0.28144532442092896, "logps/chosen": -327.3999938964844, "logps/rejected": -288.20001220703125, "loss": 0.6711, "rewards/accuracies": 0.5824999809265137, "rewards/chosen": 0.19150391221046448, "rewards/margins": 0.08940429985523224, "rewards/rejected": 0.10202636569738388, "step": 805 }, { "epoch": 0.20854788877445932, "grad_norm": 126.0, "learning_rate": 3.9572605561277036e-07, "logits/chosen": -0.2853027284145355, "logits/rejected": -0.21054688096046448, "logps/chosen": -281.6000061035156, "logps/rejected": -276.79998779296875, "loss": 0.6438, "rewards/accuracies": 0.59333336353302, "rewards/chosen": 0.20187988877296448, "rewards/margins": 0.13388672471046448, "rewards/rejected": 0.06791992485523224, "step": 810 }, { "epoch": 0.20983522142121525, "grad_norm": 150.0, "learning_rate": 3.9508238928939235e-07, "logits/chosen": -0.505078136920929, "logits/rejected": -0.25175780057907104, "logps/chosen": -306.0, "logps/rejected": -277.20001220703125, "loss": 0.6516, "rewards/accuracies": 0.572692334651947, "rewards/chosen": 0.22119140625, "rewards/margins": 0.11210937798023224, "rewards/rejected": 0.10908202826976776, "step": 815 }, { "epoch": 0.21112255406797117, "grad_norm": 128.0, "learning_rate": 3.944387229660144e-07, "logits/chosen": -0.40546876192092896, "logits/rejected": -0.3157958984375, "logps/chosen": -254.60000610351562, "logps/rejected": -210.39999389648438, "loss": 0.6547, "rewards/accuracies": 0.5543254017829895, "rewards/chosen": 0.18730469048023224, "rewards/margins": 0.11113281548023224, "rewards/rejected": 0.07634277641773224, "step": 820 }, { "epoch": 0.21240988671472708, "grad_norm": 156.0, "learning_rate": 3.9379505664263644e-07, "logits/chosen": -0.38330078125, "logits/rejected": -0.4039062559604645, "logps/chosen": -347.6000061035156, "logps/rejected": -324.3999938964844, "loss": 0.6562, "rewards/accuracies": 0.5850000381469727, "rewards/chosen": 0.2523437440395355, "rewards/margins": 0.11666564643383026, "rewards/rejected": 0.13583984971046448, "step": 825 }, { "epoch": 0.213697219361483, "grad_norm": 144.0, "learning_rate": 3.931513903192585e-07, "logits/chosen": -0.3912109434604645, "logits/rejected": -0.24921874701976776, "logps/chosen": -336.0, "logps/rejected": -302.20001220703125, "loss": 0.6367, "rewards/accuracies": 0.529212474822998, "rewards/chosen": 0.30195313692092896, "rewards/margins": 0.16542968153953552, "rewards/rejected": 0.136474609375, "step": 830 }, { "epoch": 0.21498455200823893, "grad_norm": 148.0, "learning_rate": 3.925077239958805e-07, "logits/chosen": -0.38847655057907104, "logits/rejected": -0.4253906309604645, "logps/chosen": -315.79998779296875, "logps/rejected": -283.79998779296875, "loss": 0.6367, "rewards/accuracies": 0.5368939638137817, "rewards/chosen": 0.2669921815395355, "rewards/margins": 0.15351562201976776, "rewards/rejected": 0.11414794623851776, "step": 835 }, { "epoch": 0.21627188465499486, "grad_norm": 138.0, "learning_rate": 3.9186405767250257e-07, "logits/chosen": -0.2689453065395355, "logits/rejected": -0.17530517280101776, "logps/chosen": -333.0, "logps/rejected": -264.79998779296875, "loss": 0.6742, "rewards/accuracies": 0.5254761576652527, "rewards/chosen": 0.16357421875, "rewards/margins": 0.09479980170726776, "rewards/rejected": 0.06855468451976776, "step": 840 }, { "epoch": 0.21755921730175076, "grad_norm": 118.0, "learning_rate": 3.912203913491246e-07, "logits/chosen": -0.30488282442092896, "logits/rejected": -0.24985352158546448, "logps/chosen": -259.6000061035156, "logps/rejected": -247.1999969482422, "loss": 0.6477, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": 0.15717773139476776, "rewards/margins": 0.11972656100988388, "rewards/rejected": 0.03773193433880806, "step": 845 }, { "epoch": 0.2188465499485067, "grad_norm": 116.0, "learning_rate": 3.905767250257466e-07, "logits/chosen": -0.3006225526332855, "logits/rejected": -0.2666015625, "logps/chosen": -286.0, "logps/rejected": -258.6000061035156, "loss": 0.6305, "rewards/accuracies": 0.6250758171081543, "rewards/chosen": 0.23935547471046448, "rewards/margins": 0.18046875298023224, "rewards/rejected": 0.05893554538488388, "step": 850 }, { "epoch": 0.22013388259526262, "grad_norm": 104.5, "learning_rate": 3.899330587023687e-07, "logits/chosen": -0.21992187201976776, "logits/rejected": -0.208984375, "logps/chosen": -323.20001220703125, "logps/rejected": -284.6000061035156, "loss": 0.6055, "rewards/accuracies": 0.6283333897590637, "rewards/chosen": 0.296875, "rewards/margins": 0.20253905653953552, "rewards/rejected": 0.09470520168542862, "step": 855 }, { "epoch": 0.22142121524201855, "grad_norm": 109.0, "learning_rate": 3.8928939237899074e-07, "logits/chosen": -0.3184570372104645, "logits/rejected": -0.25517576932907104, "logps/chosen": -345.79998779296875, "logps/rejected": -286.79998779296875, "loss": 0.6641, "rewards/accuracies": 0.5358333587646484, "rewards/chosen": 0.21816405653953552, "rewards/margins": 0.11304931342601776, "rewards/rejected": 0.10528258979320526, "step": 860 }, { "epoch": 0.22270854788877445, "grad_norm": 107.5, "learning_rate": 3.886457260556127e-07, "logits/chosen": -0.3091796934604645, "logits/rejected": -0.260986328125, "logps/chosen": -263.79998779296875, "logps/rejected": -235.60000610351562, "loss": 0.6195, "rewards/accuracies": 0.5564285516738892, "rewards/chosen": 0.21396484971046448, "rewards/margins": 0.19218750298023224, "rewards/rejected": 0.02135314978659153, "step": 865 }, { "epoch": 0.22399588053553038, "grad_norm": 187.0, "learning_rate": 3.8800205973223477e-07, "logits/chosen": -0.3109374940395355, "logits/rejected": -0.17256470024585724, "logps/chosen": -256.79998779296875, "logps/rejected": -301.6000061035156, "loss": 0.6789, "rewards/accuracies": 0.5936688184738159, "rewards/chosen": 0.2060546875, "rewards/margins": 0.05534667894244194, "rewards/rejected": 0.150390625, "step": 870 }, { "epoch": 0.2252832131822863, "grad_norm": 99.5, "learning_rate": 3.8735839340885686e-07, "logits/chosen": -0.31328123807907104, "logits/rejected": -0.2837890684604645, "logps/chosen": -284.79998779296875, "logps/rejected": -295.20001220703125, "loss": 0.5938, "rewards/accuracies": 0.6808333396911621, "rewards/chosen": 0.265625, "rewards/margins": 0.23583984375, "rewards/rejected": 0.02988281287252903, "step": 875 }, { "epoch": 0.22657054582904224, "grad_norm": 106.0, "learning_rate": 3.8671472708547885e-07, "logits/chosen": -0.3064819276332855, "logits/rejected": -0.21660156548023224, "logps/chosen": -329.3999938964844, "logps/rejected": -272.20001220703125, "loss": 0.6547, "rewards/accuracies": 0.5066667199134827, "rewards/chosen": 0.19121094048023224, "rewards/margins": 0.11606445163488388, "rewards/rejected": 0.07534179836511612, "step": 880 }, { "epoch": 0.22785787847579814, "grad_norm": 145.0, "learning_rate": 3.860710607621009e-07, "logits/chosen": -0.23256835341453552, "logits/rejected": -0.11562500149011612, "logps/chosen": -306.79998779296875, "logps/rejected": -298.3999938964844, "loss": 0.6391, "rewards/accuracies": 0.6266666650772095, "rewards/chosen": 0.2197265625, "rewards/margins": 0.14992675185203552, "rewards/rejected": 0.06996307522058487, "step": 885 }, { "epoch": 0.22914521112255407, "grad_norm": 172.0, "learning_rate": 3.8542739443872294e-07, "logits/chosen": -0.20795898139476776, "logits/rejected": -0.20341797173023224, "logps/chosen": -315.20001220703125, "logps/rejected": -268.20001220703125, "loss": 0.6508, "rewards/accuracies": 0.5927777886390686, "rewards/chosen": 0.26640623807907104, "rewards/margins": 0.13413086533546448, "rewards/rejected": 0.13222655653953552, "step": 890 }, { "epoch": 0.23043254376931, "grad_norm": 107.5, "learning_rate": 3.84783728115345e-07, "logits/chosen": -0.23183593153953552, "logits/rejected": -0.23867186903953552, "logps/chosen": -265.20001220703125, "logps/rejected": -257.3999938964844, "loss": 0.6539, "rewards/accuracies": 0.5566666722297668, "rewards/chosen": 0.18027344346046448, "rewards/margins": 0.11812744289636612, "rewards/rejected": 0.06196289137005806, "step": 895 }, { "epoch": 0.23171987641606592, "grad_norm": 102.0, "learning_rate": 3.84140061791967e-07, "logits/chosen": -0.22421875596046448, "logits/rejected": -0.11005859076976776, "logps/chosen": -304.3999938964844, "logps/rejected": -244.60000610351562, "loss": 0.6109, "rewards/accuracies": 0.6336904764175415, "rewards/chosen": 0.2544921934604645, "rewards/margins": 0.20937499403953552, "rewards/rejected": 0.04482422024011612, "step": 900 }, { "epoch": 0.23300720906282182, "grad_norm": 166.0, "learning_rate": 3.8349639546858907e-07, "logits/chosen": -0.3525390625, "logits/rejected": -0.3037109375, "logps/chosen": -327.6000061035156, "logps/rejected": -239.1999969482422, "loss": 0.632, "rewards/accuracies": 0.5972727537155151, "rewards/chosen": 0.17617186903953552, "rewards/margins": 0.14091797173023224, "rewards/rejected": 0.03510131686925888, "step": 905 }, { "epoch": 0.23429454170957775, "grad_norm": 122.0, "learning_rate": 3.828527291452111e-07, "logits/chosen": -0.40996092557907104, "logits/rejected": -0.4593749940395355, "logps/chosen": -368.0, "logps/rejected": -376.3999938964844, "loss": 0.6562, "rewards/accuracies": 0.4725000262260437, "rewards/chosen": 0.29277342557907104, "rewards/margins": 0.09850616753101349, "rewards/rejected": 0.19443359971046448, "step": 910 }, { "epoch": 0.23558187435633368, "grad_norm": 155.0, "learning_rate": 3.8220906282183315e-07, "logits/chosen": -0.17778320610523224, "logits/rejected": -0.14296874403953552, "logps/chosen": -228.0, "logps/rejected": -295.79998779296875, "loss": 0.6766, "rewards/accuracies": 0.5301923155784607, "rewards/chosen": 0.18417969346046448, "rewards/margins": 0.069091796875, "rewards/rejected": 0.11518554389476776, "step": 915 }, { "epoch": 0.2368692070030896, "grad_norm": 120.0, "learning_rate": 3.815653964984552e-07, "logits/chosen": -0.32001954317092896, "logits/rejected": -0.27045899629592896, "logps/chosen": -278.79998779296875, "logps/rejected": -230.39999389648438, "loss": 0.6555, "rewards/accuracies": 0.47238096594810486, "rewards/chosen": 0.16357421875, "rewards/margins": 0.0970458984375, "rewards/rejected": 0.06650390475988388, "step": 920 }, { "epoch": 0.2381565396498455, "grad_norm": 111.5, "learning_rate": 3.8092173017507724e-07, "logits/chosen": -0.37226563692092896, "logits/rejected": -0.32587891817092896, "logps/chosen": -272.0, "logps/rejected": -277.79998779296875, "loss": 0.6617, "rewards/accuracies": 0.5008333325386047, "rewards/chosen": 0.21767577528953552, "rewards/margins": 0.11337890475988388, "rewards/rejected": 0.10439147800207138, "step": 925 }, { "epoch": 0.23944387229660144, "grad_norm": 125.0, "learning_rate": 3.8027806385169923e-07, "logits/chosen": -0.3466796875, "logits/rejected": -0.2886718809604645, "logps/chosen": -263.3999938964844, "logps/rejected": -243.89999389648438, "loss": 0.6484, "rewards/accuracies": 0.6378663182258606, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.13427734375, "rewards/rejected": 0.11137695610523224, "step": 930 }, { "epoch": 0.24073120494335737, "grad_norm": 119.0, "learning_rate": 3.796343975283213e-07, "logits/chosen": -0.2899414002895355, "logits/rejected": -0.34355467557907104, "logps/chosen": -267.0, "logps/rejected": -276.0, "loss": 0.6594, "rewards/accuracies": 0.6008332967758179, "rewards/chosen": 0.15263672173023224, "rewards/margins": 0.10625000298023224, "rewards/rejected": 0.04647216945886612, "step": 935 }, { "epoch": 0.2420185375901133, "grad_norm": 136.0, "learning_rate": 3.7899073120494337e-07, "logits/chosen": -0.17695312201976776, "logits/rejected": -0.21210937201976776, "logps/chosen": -287.3999938964844, "logps/rejected": -286.3999938964844, "loss": 0.6586, "rewards/accuracies": 0.5937121510505676, "rewards/chosen": 0.24257811903953552, "rewards/margins": 0.11298827826976776, "rewards/rejected": 0.12946777045726776, "step": 940 }, { "epoch": 0.2433058702368692, "grad_norm": 137.0, "learning_rate": 3.7834706488156536e-07, "logits/chosen": -0.4527343809604645, "logits/rejected": -0.39042967557907104, "logps/chosen": -331.20001220703125, "logps/rejected": -298.3999938964844, "loss": 0.6562, "rewards/accuracies": 0.5297619104385376, "rewards/chosen": 0.23867186903953552, "rewards/margins": 0.110595703125, "rewards/rejected": 0.12830810248851776, "step": 945 }, { "epoch": 0.24459320288362513, "grad_norm": 130.0, "learning_rate": 3.777033985581874e-07, "logits/chosen": -0.40351563692092896, "logits/rejected": -0.3746093809604645, "logps/chosen": -279.6000061035156, "logps/rejected": -279.3999938964844, "loss": 0.6523, "rewards/accuracies": 0.5708791613578796, "rewards/chosen": 0.248046875, "rewards/margins": 0.11542968451976776, "rewards/rejected": 0.13327637314796448, "step": 950 }, { "epoch": 0.24588053553038106, "grad_norm": 112.5, "learning_rate": 3.770597322348095e-07, "logits/chosen": -0.22785015404224396, "logits/rejected": -0.1572265625, "logps/chosen": -256.79998779296875, "logps/rejected": -251.0, "loss": 0.6484, "rewards/accuracies": 0.6241666674613953, "rewards/chosen": 0.20341797173023224, "rewards/margins": 0.13603515923023224, "rewards/rejected": 0.06730957329273224, "step": 955 }, { "epoch": 0.24716786817713698, "grad_norm": 134.0, "learning_rate": 3.764160659114315e-07, "logits/chosen": -0.2798828184604645, "logits/rejected": -0.1639404296875, "logps/chosen": -368.0, "logps/rejected": -316.0, "loss": 0.6141, "rewards/accuracies": 0.6316666603088379, "rewards/chosen": 0.3109374940395355, "rewards/margins": 0.22285155951976776, "rewards/rejected": 0.08793945610523224, "step": 960 }, { "epoch": 0.24845520082389289, "grad_norm": 100.5, "learning_rate": 3.7577239958805353e-07, "logits/chosen": -0.2666015625, "logits/rejected": -0.240234375, "logps/chosen": -308.3999938964844, "logps/rejected": -284.20001220703125, "loss": 0.6148, "rewards/accuracies": 0.6308333277702332, "rewards/chosen": 0.22275391221046448, "rewards/margins": 0.18789061903953552, "rewards/rejected": 0.03530273586511612, "step": 965 }, { "epoch": 0.24974253347064881, "grad_norm": 98.0, "learning_rate": 3.7512873326467557e-07, "logits/chosen": -0.30517578125, "logits/rejected": -0.31621092557907104, "logps/chosen": -334.6000061035156, "logps/rejected": -296.3999938964844, "loss": 0.6469, "rewards/accuracies": 0.5985714197158813, "rewards/chosen": 0.23203125596046448, "rewards/margins": 0.14316406846046448, "rewards/rejected": 0.08919067680835724, "step": 970 }, { "epoch": 0.25102986611740474, "grad_norm": 117.5, "learning_rate": 3.744850669412976e-07, "logits/chosen": -0.4361328184604645, "logits/rejected": -0.3243164122104645, "logps/chosen": -351.6000061035156, "logps/rejected": -280.0, "loss": 0.6234, "rewards/accuracies": 0.6266666650772095, "rewards/chosen": 0.25468748807907104, "rewards/margins": 0.18300780653953552, "rewards/rejected": 0.07152404636144638, "step": 975 }, { "epoch": 0.25231719876416064, "grad_norm": 144.0, "learning_rate": 3.7384140061791965e-07, "logits/chosen": -0.24814453721046448, "logits/rejected": -0.22099609673023224, "logps/chosen": -314.79998779296875, "logps/rejected": -271.20001220703125, "loss": 0.6398, "rewards/accuracies": 0.595897376537323, "rewards/chosen": 0.24472656846046448, "rewards/margins": 0.14931640028953552, "rewards/rejected": 0.09550781548023224, "step": 980 }, { "epoch": 0.2536045314109166, "grad_norm": 132.0, "learning_rate": 3.731977342945417e-07, "logits/chosen": -0.36250001192092896, "logits/rejected": -0.30683594942092896, "logps/chosen": -329.20001220703125, "logps/rejected": -318.0, "loss": 0.65, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": 0.20371094346046448, "rewards/margins": 0.11530761420726776, "rewards/rejected": 0.08840332180261612, "step": 985 }, { "epoch": 0.2548918640576725, "grad_norm": 113.5, "learning_rate": 3.7255406797116374e-07, "logits/chosen": -0.22119140625, "logits/rejected": -0.20839843153953552, "logps/chosen": -304.6000061035156, "logps/rejected": -285.3999938964844, "loss": 0.6281, "rewards/accuracies": 0.5649999976158142, "rewards/chosen": 0.23593750596046448, "rewards/margins": 0.18173828721046448, "rewards/rejected": 0.054290771484375, "step": 990 }, { "epoch": 0.2561791967044284, "grad_norm": 122.5, "learning_rate": 3.7191040164778573e-07, "logits/chosen": -0.23808594048023224, "logits/rejected": -0.18603515625, "logps/chosen": -310.20001220703125, "logps/rejected": -257.0, "loss": 0.6727, "rewards/accuracies": 0.49416667222976685, "rewards/chosen": 0.13505859673023224, "rewards/margins": 0.07310791313648224, "rewards/rejected": 0.061767578125, "step": 995 }, { "epoch": 0.25746652935118436, "grad_norm": 97.0, "learning_rate": 3.712667353244078e-07, "logits/chosen": -0.3472656309604645, "logits/rejected": -0.28203123807907104, "logps/chosen": -279.3999938964844, "logps/rejected": -271.79998779296875, "loss": 0.6594, "rewards/accuracies": 0.5208333730697632, "rewards/chosen": 0.20624999701976776, "rewards/margins": 0.10161133110523224, "rewards/rejected": 0.10480956733226776, "step": 1000 }, { "epoch": 0.25875386199794026, "grad_norm": 120.5, "learning_rate": 3.7062306900102987e-07, "logits/chosen": -0.45585936307907104, "logits/rejected": -0.38652342557907104, "logps/chosen": -333.20001220703125, "logps/rejected": -274.20001220703125, "loss": 0.632, "rewards/accuracies": 0.5894047617912292, "rewards/chosen": 0.25751954317092896, "rewards/margins": 0.15288086235523224, "rewards/rejected": 0.104736328125, "step": 1005 }, { "epoch": 0.2600411946446962, "grad_norm": 137.0, "learning_rate": 3.6997940267765186e-07, "logits/chosen": -0.3531250059604645, "logits/rejected": -0.2943359315395355, "logps/chosen": -348.0, "logps/rejected": -301.6000061035156, "loss": 0.6531, "rewards/accuracies": 0.5866667032241821, "rewards/chosen": 0.23984375596046448, "rewards/margins": 0.12924805283546448, "rewards/rejected": 0.11038818210363388, "step": 1010 }, { "epoch": 0.2613285272914521, "grad_norm": 121.5, "learning_rate": 3.6933573635427395e-07, "logits/chosen": -0.37519532442092896, "logits/rejected": -0.31074219942092896, "logps/chosen": -360.3999938964844, "logps/rejected": -289.3999938964844, "loss": 0.6594, "rewards/accuracies": 0.5234090685844421, "rewards/chosen": 0.29316407442092896, "rewards/margins": 0.127197265625, "rewards/rejected": 0.16582031548023224, "step": 1015 }, { "epoch": 0.262615859938208, "grad_norm": 131.0, "learning_rate": 3.68692070030896e-07, "logits/chosen": -0.24199219048023224, "logits/rejected": -0.24882812798023224, "logps/chosen": -287.0, "logps/rejected": -297.3999938964844, "loss": 0.6391, "rewards/accuracies": 0.5842857360839844, "rewards/chosen": 0.23281249403953552, "rewards/margins": 0.15761718153953552, "rewards/rejected": 0.0750732421875, "step": 1020 }, { "epoch": 0.263903192584964, "grad_norm": 202.0, "learning_rate": 3.68048403707518e-07, "logits/chosen": -0.29511719942092896, "logits/rejected": -0.27460938692092896, "logps/chosen": -299.3999938964844, "logps/rejected": -234.8000030517578, "loss": 0.6375, "rewards/accuracies": 0.5619444847106934, "rewards/chosen": 0.24257811903953552, "rewards/margins": 0.130615234375, "rewards/rejected": 0.11245117336511612, "step": 1025 }, { "epoch": 0.2651905252317199, "grad_norm": 132.0, "learning_rate": 3.6740473738414003e-07, "logits/chosen": -0.3662109375, "logits/rejected": -0.28095704317092896, "logps/chosen": -319.3999938964844, "logps/rejected": -284.79998779296875, "loss": 0.6422, "rewards/accuracies": 0.6050000190734863, "rewards/chosen": 0.24355468153953552, "rewards/margins": 0.14658203721046448, "rewards/rejected": 0.09711913764476776, "step": 1030 }, { "epoch": 0.2664778578784758, "grad_norm": 133.0, "learning_rate": 3.6676107106076207e-07, "logits/chosen": -0.26640623807907104, "logits/rejected": -0.26777344942092896, "logps/chosen": -291.79998779296875, "logps/rejected": -289.79998779296875, "loss": 0.6609, "rewards/accuracies": 0.5115384459495544, "rewards/chosen": 0.24570313096046448, "rewards/margins": 0.09743652492761612, "rewards/rejected": 0.14814452826976776, "step": 1035 }, { "epoch": 0.26776519052523173, "grad_norm": 119.5, "learning_rate": 3.661174047373841e-07, "logits/chosen": -0.26093751192092896, "logits/rejected": -0.17216797173023224, "logps/chosen": -292.79998779296875, "logps/rejected": -275.0, "loss": 0.6305, "rewards/accuracies": 0.6191666722297668, "rewards/chosen": 0.2623046934604645, "rewards/margins": 0.19233398139476776, "rewards/rejected": 0.07058105617761612, "step": 1040 }, { "epoch": 0.26905252317198763, "grad_norm": 133.0, "learning_rate": 3.6547373841400616e-07, "logits/chosen": -0.38398438692092896, "logits/rejected": -0.2894531190395355, "logps/chosen": -388.0, "logps/rejected": -303.20001220703125, "loss": 0.593, "rewards/accuracies": 0.6467857360839844, "rewards/chosen": 0.3394531309604645, "rewards/margins": 0.24716797471046448, "rewards/rejected": 0.09208984673023224, "step": 1045 }, { "epoch": 0.2703398558187436, "grad_norm": 104.5, "learning_rate": 3.648300720906282e-07, "logits/chosen": -0.22227783501148224, "logits/rejected": -0.17100830376148224, "logps/chosen": -246.0, "logps/rejected": -248.0, "loss": 0.6234, "rewards/accuracies": 0.65666663646698, "rewards/chosen": 0.2782226502895355, "rewards/margins": 0.19863280653953552, "rewards/rejected": 0.07993163913488388, "step": 1050 }, { "epoch": 0.2716271884654995, "grad_norm": 136.0, "learning_rate": 3.6418640576725024e-07, "logits/chosen": -0.357421875, "logits/rejected": -0.3542724549770355, "logps/chosen": -289.6000061035156, "logps/rejected": -319.6000061035156, "loss": 0.6859, "rewards/accuracies": 0.5208333730697632, "rewards/chosen": 0.21650390326976776, "rewards/margins": 0.0501708984375, "rewards/rejected": 0.16652831435203552, "step": 1055 }, { "epoch": 0.2729145211122554, "grad_norm": 143.0, "learning_rate": 3.635427394438723e-07, "logits/chosen": -0.44023436307907104, "logits/rejected": -0.36054688692092896, "logps/chosen": -304.0, "logps/rejected": -294.79998779296875, "loss": 0.6172, "rewards/accuracies": 0.6214393377304077, "rewards/chosen": 0.24101562798023224, "rewards/margins": 0.20703125, "rewards/rejected": 0.03398437425494194, "step": 1060 }, { "epoch": 0.27420185375901135, "grad_norm": 144.0, "learning_rate": 3.6289907312049433e-07, "logits/chosen": -0.4429687559604645, "logits/rejected": -0.35332030057907104, "logps/chosen": -308.0, "logps/rejected": -310.6000061035156, "loss": 0.6727, "rewards/accuracies": 0.6303571462631226, "rewards/chosen": 0.22041015326976776, "rewards/margins": 0.08045653998851776, "rewards/rejected": 0.13974609971046448, "step": 1065 }, { "epoch": 0.27548918640576725, "grad_norm": 140.0, "learning_rate": 3.6225540679711637e-07, "logits/chosen": -0.3589843809604645, "logits/rejected": -0.25, "logps/chosen": -275.6000061035156, "logps/rejected": -243.39999389648438, "loss": 0.6594, "rewards/accuracies": 0.5766667127609253, "rewards/chosen": 0.22714844346046448, "rewards/margins": 0.10097656399011612, "rewards/rejected": 0.12607422471046448, "step": 1070 }, { "epoch": 0.27677651905252315, "grad_norm": 132.0, "learning_rate": 3.6161174047373836e-07, "logits/chosen": -0.3208984434604645, "logits/rejected": -0.23671874403953552, "logps/chosen": -310.3999938964844, "logps/rejected": -298.20001220703125, "loss": 0.6219, "rewards/accuracies": 0.6480952501296997, "rewards/chosen": 0.25224608182907104, "rewards/margins": 0.18632812798023224, "rewards/rejected": 0.06562499701976776, "step": 1075 }, { "epoch": 0.2780638516992791, "grad_norm": 117.0, "learning_rate": 3.6096807415036046e-07, "logits/chosen": -0.2652831971645355, "logits/rejected": -0.10992431640625, "logps/chosen": -263.20001220703125, "logps/rejected": -248.0, "loss": 0.6422, "rewards/accuracies": 0.6110897660255432, "rewards/chosen": 0.27294921875, "rewards/margins": 0.16347655653953552, "rewards/rejected": 0.10860595852136612, "step": 1080 }, { "epoch": 0.279351184346035, "grad_norm": 127.5, "learning_rate": 3.603244078269825e-07, "logits/chosen": -0.33320313692092896, "logits/rejected": -0.30146485567092896, "logps/chosen": -395.20001220703125, "logps/rejected": -305.20001220703125, "loss": 0.6195, "rewards/accuracies": 0.6891666650772095, "rewards/chosen": 0.3193359375, "rewards/margins": 0.18779297173023224, "rewards/rejected": 0.13100585341453552, "step": 1085 }, { "epoch": 0.2806385169927909, "grad_norm": 109.0, "learning_rate": 3.596807415036045e-07, "logits/chosen": -0.3843750059604645, "logits/rejected": -0.3880859315395355, "logps/chosen": -352.79998779296875, "logps/rejected": -319.0, "loss": 0.6344, "rewards/accuracies": 0.5528846979141235, "rewards/chosen": 0.2962890565395355, "rewards/margins": 0.15175780653953552, "rewards/rejected": 0.14494629204273224, "step": 1090 }, { "epoch": 0.28192584963954687, "grad_norm": 155.0, "learning_rate": 3.590370751802266e-07, "logits/chosen": -0.3402343690395355, "logits/rejected": -0.29150390625, "logps/chosen": -286.3999938964844, "logps/rejected": -256.6000061035156, "loss": 0.65, "rewards/accuracies": 0.510606050491333, "rewards/chosen": 0.2138671875, "rewards/margins": 0.13217774033546448, "rewards/rejected": 0.08120117336511612, "step": 1095 }, { "epoch": 0.28321318228630277, "grad_norm": 127.0, "learning_rate": 3.583934088568486e-07, "logits/chosen": -0.3046875, "logits/rejected": -0.25361329317092896, "logps/chosen": -335.3999938964844, "logps/rejected": -291.6000061035156, "loss": 0.6383, "rewards/accuracies": 0.5816666483879089, "rewards/chosen": 0.24570313096046448, "rewards/margins": 0.12636718153953552, "rewards/rejected": 0.11953125149011612, "step": 1100 }, { "epoch": 0.2845005149330587, "grad_norm": 117.5, "learning_rate": 3.577497425334706e-07, "logits/chosen": -0.19633789360523224, "logits/rejected": -0.15680694580078125, "logps/chosen": -280.0, "logps/rejected": -265.3999938964844, "loss": 0.6789, "rewards/accuracies": 0.5563095808029175, "rewards/chosen": 0.17021484673023224, "rewards/margins": 0.07050780951976776, "rewards/rejected": 0.09946288913488388, "step": 1105 }, { "epoch": 0.2857878475798146, "grad_norm": 128.0, "learning_rate": 3.571060762100927e-07, "logits/chosen": -0.4085937440395355, "logits/rejected": -0.40556639432907104, "logps/chosen": -307.6000061035156, "logps/rejected": -235.1999969482422, "loss": 0.6367, "rewards/accuracies": 0.6466666460037231, "rewards/chosen": 0.2909179627895355, "rewards/margins": 0.17067870497703552, "rewards/rejected": 0.11992187798023224, "step": 1110 }, { "epoch": 0.2870751802265705, "grad_norm": 154.0, "learning_rate": 3.564624098867147e-07, "logits/chosen": -0.23173828423023224, "logits/rejected": -0.2787109315395355, "logps/chosen": -308.79998779296875, "logps/rejected": -286.3999938964844, "loss": 0.6492, "rewards/accuracies": 0.48952382802963257, "rewards/chosen": 0.2021484375, "rewards/margins": 0.126220703125, "rewards/rejected": 0.07607422024011612, "step": 1115 }, { "epoch": 0.2883625128733265, "grad_norm": 148.0, "learning_rate": 3.5581874356333674e-07, "logits/chosen": -0.3423828184604645, "logits/rejected": -0.2593750059604645, "logps/chosen": -301.6000061035156, "logps/rejected": -291.0, "loss": 0.6242, "rewards/accuracies": 0.6735714673995972, "rewards/chosen": 0.26640623807907104, "rewards/margins": 0.17338867485523224, "rewards/rejected": 0.0928955078125, "step": 1120 }, { "epoch": 0.2896498455200824, "grad_norm": 124.5, "learning_rate": 3.551750772399588e-07, "logits/chosen": -0.2587890625, "logits/rejected": -0.35624998807907104, "logps/chosen": -291.3999938964844, "logps/rejected": -252.60000610351562, "loss": 0.6727, "rewards/accuracies": 0.5707167983055115, "rewards/chosen": 0.23652343451976776, "rewards/margins": 0.08464355766773224, "rewards/rejected": 0.15166015923023224, "step": 1125 }, { "epoch": 0.2909371781668383, "grad_norm": 128.0, "learning_rate": 3.5453141091658083e-07, "logits/chosen": -0.27324217557907104, "logits/rejected": -0.26738280057907104, "logps/chosen": -307.6000061035156, "logps/rejected": -262.3999938964844, "loss": 0.6656, "rewards/accuracies": 0.5558333992958069, "rewards/chosen": 0.17138671875, "rewards/margins": 0.11513672024011612, "rewards/rejected": 0.05646972730755806, "step": 1130 }, { "epoch": 0.29222451081359424, "grad_norm": 122.5, "learning_rate": 3.5388774459320287e-07, "logits/chosen": -0.2793945372104645, "logits/rejected": -0.189453125, "logps/chosen": -282.0, "logps/rejected": -305.20001220703125, "loss": 0.6094, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 0.21992187201976776, "rewards/margins": 0.20234374701976776, "rewards/rejected": 0.01750488206744194, "step": 1135 }, { "epoch": 0.29351184346035014, "grad_norm": 178.0, "learning_rate": 3.5324407826982486e-07, "logits/chosen": -0.18271484971046448, "logits/rejected": -0.15910644829273224, "logps/chosen": -308.0, "logps/rejected": -262.20001220703125, "loss": 0.618, "rewards/accuracies": 0.6185714602470398, "rewards/chosen": 0.29960936307907104, "rewards/margins": 0.189453125, "rewards/rejected": 0.10976562649011612, "step": 1140 }, { "epoch": 0.2947991761071061, "grad_norm": 132.0, "learning_rate": 3.5260041194644696e-07, "logits/chosen": -0.3193359375, "logits/rejected": -0.25800782442092896, "logps/chosen": -296.0, "logps/rejected": -294.79998779296875, "loss": 0.6656, "rewards/accuracies": 0.5575000047683716, "rewards/chosen": 0.16318359971046448, "rewards/margins": 0.1141510009765625, "rewards/rejected": 0.04897461086511612, "step": 1145 }, { "epoch": 0.296086508753862, "grad_norm": 100.5, "learning_rate": 3.51956745623069e-07, "logits/chosen": -0.1669921875, "logits/rejected": -0.009238624945282936, "logps/chosen": -327.79998779296875, "logps/rejected": -279.20001220703125, "loss": 0.6594, "rewards/accuracies": 0.5683333277702332, "rewards/chosen": 0.15800781548023224, "rewards/margins": 0.11203613132238388, "rewards/rejected": 0.0458984375, "step": 1150 }, { "epoch": 0.2973738414006179, "grad_norm": 110.5, "learning_rate": 3.51313079299691e-07, "logits/chosen": -0.31806641817092896, "logits/rejected": -0.2679687440395355, "logps/chosen": -297.79998779296875, "logps/rejected": -275.79998779296875, "loss": 0.6633, "rewards/accuracies": 0.5227380990982056, "rewards/chosen": 0.14594726264476776, "rewards/margins": 0.08833007514476776, "rewards/rejected": 0.05776367336511612, "step": 1155 }, { "epoch": 0.29866117404737386, "grad_norm": 115.0, "learning_rate": 3.506694129763131e-07, "logits/chosen": -0.22089843451976776, "logits/rejected": -0.15726318955421448, "logps/chosen": -305.20001220703125, "logps/rejected": -253.8000030517578, "loss": 0.6312, "rewards/accuracies": 0.6035714149475098, "rewards/chosen": 0.22871093451976776, "rewards/margins": 0.16669921576976776, "rewards/rejected": 0.0620880126953125, "step": 1160 }, { "epoch": 0.29994850669412976, "grad_norm": 106.0, "learning_rate": 3.5002574665293513e-07, "logits/chosen": -0.3753906190395355, "logits/rejected": -0.37055665254592896, "logps/chosen": -355.20001220703125, "logps/rejected": -284.6000061035156, "loss": 0.6164, "rewards/accuracies": 0.6525000333786011, "rewards/chosen": 0.30585938692092896, "rewards/margins": 0.20244140923023224, "rewards/rejected": 0.10358886420726776, "step": 1165 }, { "epoch": 0.30123583934088566, "grad_norm": 136.0, "learning_rate": 3.493820803295571e-07, "logits/chosen": -0.2562011778354645, "logits/rejected": -0.17685547471046448, "logps/chosen": -327.0, "logps/rejected": -339.20001220703125, "loss": 0.6672, "rewards/accuracies": 0.4791666865348816, "rewards/chosen": 0.20722655951976776, "rewards/margins": 0.10605774074792862, "rewards/rejected": 0.10117187350988388, "step": 1170 }, { "epoch": 0.3025231719876416, "grad_norm": 155.0, "learning_rate": 3.487384140061792e-07, "logits/chosen": -0.22443847358226776, "logits/rejected": -0.15385742485523224, "logps/chosen": -317.3999938964844, "logps/rejected": -305.3999938964844, "loss": 0.6617, "rewards/accuracies": 0.5212180018424988, "rewards/chosen": 0.23945312201976776, "rewards/margins": 0.09714355319738388, "rewards/rejected": 0.14241942763328552, "step": 1175 }, { "epoch": 0.3038105046343975, "grad_norm": 123.0, "learning_rate": 3.480947476828012e-07, "logits/chosen": -0.2925781309604645, "logits/rejected": -0.3021484315395355, "logps/chosen": -329.79998779296875, "logps/rejected": -258.79998779296875, "loss": 0.6305, "rewards/accuracies": 0.5548809766769409, "rewards/chosen": 0.22265625, "rewards/margins": 0.18603515625, "rewards/rejected": 0.03647460788488388, "step": 1180 }, { "epoch": 0.30509783728115347, "grad_norm": 123.5, "learning_rate": 3.4745108135942325e-07, "logits/chosen": -0.22768554091453552, "logits/rejected": -0.21715088188648224, "logps/chosen": -364.79998779296875, "logps/rejected": -297.3999938964844, "loss": 0.6234, "rewards/accuracies": 0.654358983039856, "rewards/chosen": 0.27265626192092896, "rewards/margins": 0.18378905951976776, "rewards/rejected": 0.08941040188074112, "step": 1185 }, { "epoch": 0.3063851699279094, "grad_norm": 105.5, "learning_rate": 3.4680741503604534e-07, "logits/chosen": -0.34003907442092896, "logits/rejected": -0.26423341035842896, "logps/chosen": -334.3999938964844, "logps/rejected": -325.6000061035156, "loss": 0.6633, "rewards/accuracies": 0.4630952477455139, "rewards/chosen": 0.23974609375, "rewards/margins": 0.12716063857078552, "rewards/rejected": 0.11223449558019638, "step": 1190 }, { "epoch": 0.3076725025746653, "grad_norm": 143.0, "learning_rate": 3.4616374871266733e-07, "logits/chosen": -0.3628906309604645, "logits/rejected": -0.2652343809604645, "logps/chosen": -282.0, "logps/rejected": -271.3999938964844, "loss": 0.6352, "rewards/accuracies": 0.557509183883667, "rewards/chosen": 0.2562499940395355, "rewards/margins": 0.15273436903953552, "rewards/rejected": 0.10336913913488388, "step": 1195 }, { "epoch": 0.30895983522142123, "grad_norm": 144.0, "learning_rate": 3.455200823892894e-07, "logits/chosen": -0.40312498807907104, "logits/rejected": -0.16351929306983948, "logps/chosen": -345.0, "logps/rejected": -270.20001220703125, "loss": 0.6297, "rewards/accuracies": 0.6009524464607239, "rewards/chosen": 0.3150390684604645, "rewards/margins": 0.188720703125, "rewards/rejected": 0.12636718153953552, "step": 1200 }, { "epoch": 0.31024716786817713, "grad_norm": 142.0, "learning_rate": 3.448764160659114e-07, "logits/chosen": -0.35566407442092896, "logits/rejected": -0.3221435546875, "logps/chosen": -317.6000061035156, "logps/rejected": -322.79998779296875, "loss": 0.6523, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.24570313096046448, "rewards/margins": 0.13516846299171448, "rewards/rejected": 0.1102294921875, "step": 1205 }, { "epoch": 0.31153450051493303, "grad_norm": 228.0, "learning_rate": 3.4423274974253346e-07, "logits/chosen": -0.41240233182907104, "logits/rejected": -0.3428710997104645, "logps/chosen": -366.0, "logps/rejected": -304.79998779296875, "loss": 0.6383, "rewards/accuracies": 0.6360714435577393, "rewards/chosen": 0.357421875, "rewards/margins": 0.21210937201976776, "rewards/rejected": 0.14582519233226776, "step": 1210 }, { "epoch": 0.312821833161689, "grad_norm": 102.0, "learning_rate": 3.435890834191555e-07, "logits/chosen": -0.25957030057907104, "logits/rejected": -0.150390625, "logps/chosen": -294.20001220703125, "logps/rejected": -259.6000061035156, "loss": 0.6398, "rewards/accuracies": 0.6610714197158813, "rewards/chosen": 0.21601562201976776, "rewards/margins": 0.16274413466453552, "rewards/rejected": 0.05327148362994194, "step": 1215 }, { "epoch": 0.3141091658084449, "grad_norm": 129.0, "learning_rate": 3.429454170957775e-07, "logits/chosen": -0.21357421576976776, "logits/rejected": -0.19876709580421448, "logps/chosen": -306.20001220703125, "logps/rejected": -303.20001220703125, "loss": 0.65, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.18583984673023224, "rewards/margins": 0.13037109375, "rewards/rejected": 0.05561523512005806, "step": 1220 }, { "epoch": 0.31539649845520085, "grad_norm": 124.0, "learning_rate": 3.423017507723996e-07, "logits/chosen": -0.21474608778953552, "logits/rejected": -0.2808593809604645, "logps/chosen": -264.0, "logps/rejected": -280.6000061035156, "loss": 0.6391, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21171875298023224, "rewards/margins": 0.14707031846046448, "rewards/rejected": 0.06484375149011612, "step": 1225 }, { "epoch": 0.31668383110195675, "grad_norm": 107.5, "learning_rate": 3.4165808444902163e-07, "logits/chosen": -0.30517578125, "logits/rejected": -0.19941405951976776, "logps/chosen": -305.79998779296875, "logps/rejected": -281.6000061035156, "loss": 0.593, "rewards/accuracies": 0.6908974647521973, "rewards/chosen": 0.2822265625, "rewards/margins": 0.25996094942092896, "rewards/rejected": 0.02220458909869194, "step": 1230 }, { "epoch": 0.31797116374871265, "grad_norm": 98.5, "learning_rate": 3.410144181256436e-07, "logits/chosen": -0.26103514432907104, "logits/rejected": -0.2753448486328125, "logps/chosen": -339.6000061035156, "logps/rejected": -289.79998779296875, "loss": 0.6617, "rewards/accuracies": 0.5900000333786011, "rewards/chosen": 0.25273436307907104, "rewards/margins": 0.10273437201976776, "rewards/rejected": 0.15000000596046448, "step": 1235 }, { "epoch": 0.3192584963954686, "grad_norm": 174.0, "learning_rate": 3.403707518022657e-07, "logits/chosen": -0.2783203125, "logits/rejected": -0.3690429627895355, "logps/chosen": -309.79998779296875, "logps/rejected": -283.0, "loss": 0.6289, "rewards/accuracies": 0.5241667032241821, "rewards/chosen": 0.3033203184604645, "rewards/margins": 0.17236328125, "rewards/rejected": 0.13076171278953552, "step": 1240 }, { "epoch": 0.3205458290422245, "grad_norm": 126.0, "learning_rate": 3.3972708547888776e-07, "logits/chosen": -0.31171876192092896, "logits/rejected": -0.3199218809604645, "logps/chosen": -297.79998779296875, "logps/rejected": -250.39999389648438, "loss": 0.6453, "rewards/accuracies": 0.5666667222976685, "rewards/chosen": 0.2353515625, "rewards/margins": 0.13242188096046448, "rewards/rejected": 0.10319213569164276, "step": 1245 }, { "epoch": 0.3218331616889804, "grad_norm": 123.5, "learning_rate": 3.3908341915550975e-07, "logits/chosen": -0.2508300840854645, "logits/rejected": -0.12099609524011612, "logps/chosen": -278.3999938964844, "logps/rejected": -281.0, "loss": 0.6422, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": 0.22089843451976776, "rewards/margins": 0.140625, "rewards/rejected": 0.08035888522863388, "step": 1250 }, { "epoch": 0.32312049433573636, "grad_norm": 147.0, "learning_rate": 3.3843975283213184e-07, "logits/chosen": -0.33320313692092896, "logits/rejected": -0.17929688096046448, "logps/chosen": -315.0, "logps/rejected": -339.20001220703125, "loss": 0.6641, "rewards/accuracies": 0.6083333492279053, "rewards/chosen": 0.31914061307907104, "rewards/margins": 0.13691405951976776, "rewards/rejected": 0.18161621689796448, "step": 1255 }, { "epoch": 0.32440782698249226, "grad_norm": 143.0, "learning_rate": 3.3779608650875383e-07, "logits/chosen": -0.4251953065395355, "logits/rejected": -0.3818359375, "logps/chosen": -301.6000061035156, "logps/rejected": -251.1999969482422, "loss": 0.6172, "rewards/accuracies": 0.5783241987228394, "rewards/chosen": 0.2607421875, "rewards/margins": 0.190673828125, "rewards/rejected": 0.07009277492761612, "step": 1260 }, { "epoch": 0.3256951596292482, "grad_norm": 113.0, "learning_rate": 3.371524201853759e-07, "logits/chosen": -0.5160156488418579, "logits/rejected": -0.28593748807907104, "logps/chosen": -270.0, "logps/rejected": -241.1999969482422, "loss": 0.6031, "rewards/accuracies": 0.628928542137146, "rewards/chosen": 0.26484376192092896, "rewards/margins": 0.22226563096046448, "rewards/rejected": 0.04265136644244194, "step": 1265 }, { "epoch": 0.3269824922760041, "grad_norm": 148.0, "learning_rate": 3.3650875386199797e-07, "logits/chosen": -0.3050293028354645, "logits/rejected": -0.29389649629592896, "logps/chosen": -291.3999938964844, "logps/rejected": -288.0, "loss": 0.6461, "rewards/accuracies": 0.5814744234085083, "rewards/chosen": 0.22441406548023224, "rewards/margins": 0.13846436142921448, "rewards/rejected": 0.0859375, "step": 1270 }, { "epoch": 0.32826982492276, "grad_norm": 121.5, "learning_rate": 3.3586508753861996e-07, "logits/chosen": -0.41386717557907104, "logits/rejected": -0.38359373807907104, "logps/chosen": -341.3999938964844, "logps/rejected": -315.6000061035156, "loss": 0.668, "rewards/accuracies": 0.5398077368736267, "rewards/chosen": 0.25312501192092896, "rewards/margins": 0.09785155951976776, "rewards/rejected": 0.15541991591453552, "step": 1275 }, { "epoch": 0.329557157569516, "grad_norm": 110.0, "learning_rate": 3.35221421215242e-07, "logits/chosen": -0.2916015684604645, "logits/rejected": -0.28437501192092896, "logps/chosen": -318.20001220703125, "logps/rejected": -325.20001220703125, "loss": 0.6234, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22675780951976776, "rewards/margins": 0.17617186903953552, "rewards/rejected": 0.05063476413488388, "step": 1280 }, { "epoch": 0.3308444902162719, "grad_norm": 158.0, "learning_rate": 3.34577754891864e-07, "logits/chosen": -0.35888671875, "logits/rejected": -0.48682862520217896, "logps/chosen": -323.0, "logps/rejected": -238.60000610351562, "loss": 0.6125, "rewards/accuracies": 0.5235714316368103, "rewards/chosen": 0.3046875, "rewards/margins": 0.20322266221046448, "rewards/rejected": 0.10148926079273224, "step": 1285 }, { "epoch": 0.3321318228630278, "grad_norm": 102.5, "learning_rate": 3.339340885684861e-07, "logits/chosen": -0.2662597596645355, "logits/rejected": -0.2857421934604645, "logps/chosen": -288.0, "logps/rejected": -271.20001220703125, "loss": 0.6516, "rewards/accuracies": 0.567307710647583, "rewards/chosen": 0.21591797471046448, "rewards/margins": 0.12949219346046448, "rewards/rejected": 0.08657226711511612, "step": 1290 }, { "epoch": 0.33341915550978374, "grad_norm": 97.0, "learning_rate": 3.3329042224510813e-07, "logits/chosen": -0.30097657442092896, "logits/rejected": -0.21357421576976776, "logps/chosen": -293.20001220703125, "logps/rejected": -303.0, "loss": 0.6562, "rewards/accuracies": 0.6050000190734863, "rewards/chosen": 0.23320312798023224, "rewards/margins": 0.11176757514476776, "rewards/rejected": 0.1214599609375, "step": 1295 }, { "epoch": 0.33470648815653964, "grad_norm": 138.0, "learning_rate": 3.326467559217301e-07, "logits/chosen": -0.3070312440395355, "logits/rejected": -0.26689451932907104, "logps/chosen": -378.0, "logps/rejected": -351.6000061035156, "loss": 0.6203, "rewards/accuracies": 0.5691667199134827, "rewards/chosen": 0.3173828125, "rewards/margins": 0.18618163466453552, "rewards/rejected": 0.13105468451976776, "step": 1300 }, { "epoch": 0.3359938208032956, "grad_norm": 103.5, "learning_rate": 3.320030895983522e-07, "logits/chosen": -0.2978515625, "logits/rejected": -0.30742186307907104, "logps/chosen": -273.6000061035156, "logps/rejected": -259.0, "loss": 0.6422, "rewards/accuracies": 0.6066666841506958, "rewards/chosen": 0.20332030951976776, "rewards/margins": 0.13981933891773224, "rewards/rejected": 0.06352539360523224, "step": 1305 }, { "epoch": 0.3372811534500515, "grad_norm": 205.0, "learning_rate": 3.3135942327497426e-07, "logits/chosen": -0.25407713651657104, "logits/rejected": -0.181640625, "logps/chosen": -265.3999938964844, "logps/rejected": -284.20001220703125, "loss": 0.6703, "rewards/accuracies": 0.5217424631118774, "rewards/chosen": 0.18007811903953552, "rewards/margins": 0.1048583984375, "rewards/rejected": 0.0751953125, "step": 1310 }, { "epoch": 0.3385684860968074, "grad_norm": 130.0, "learning_rate": 3.3071575695159625e-07, "logits/chosen": -0.1962890625, "logits/rejected": -0.14238281548023224, "logps/chosen": -319.20001220703125, "logps/rejected": -286.0, "loss": 0.6328, "rewards/accuracies": 0.6541666984558105, "rewards/chosen": 0.2513671815395355, "rewards/margins": 0.19667968153953552, "rewards/rejected": 0.05439148098230362, "step": 1315 }, { "epoch": 0.33985581874356335, "grad_norm": 142.0, "learning_rate": 3.3007209062821835e-07, "logits/chosen": -0.33818358182907104, "logits/rejected": -0.5386108160018921, "logps/chosen": -297.20001220703125, "logps/rejected": -355.6000061035156, "loss": 0.6562, "rewards/accuracies": 0.5394231081008911, "rewards/chosen": 0.2783203125, "rewards/margins": 0.10698242485523224, "rewards/rejected": 0.17094726860523224, "step": 1320 }, { "epoch": 0.34114315139031925, "grad_norm": 119.5, "learning_rate": 3.2942842430484033e-07, "logits/chosen": -0.28496092557907104, "logits/rejected": -0.243316650390625, "logps/chosen": -332.0, "logps/rejected": -258.0, "loss": 0.6195, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.27519530057907104, "rewards/margins": 0.20332030951976776, "rewards/rejected": 0.072021484375, "step": 1325 }, { "epoch": 0.34243048403707516, "grad_norm": 125.0, "learning_rate": 3.287847579814624e-07, "logits/chosen": -0.05937499925494194, "logits/rejected": -0.30644530057907104, "logps/chosen": -310.20001220703125, "logps/rejected": -281.3999938964844, "loss": 0.6203, "rewards/accuracies": 0.6803571581840515, "rewards/chosen": 0.28925782442092896, "rewards/margins": 0.20498046278953552, "rewards/rejected": 0.08444824069738388, "step": 1330 }, { "epoch": 0.3437178166838311, "grad_norm": 104.0, "learning_rate": 3.2814109165808447e-07, "logits/chosen": -0.24155274033546448, "logits/rejected": -0.2881103456020355, "logps/chosen": -324.6000061035156, "logps/rejected": -317.20001220703125, "loss": 0.6609, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.21923828125, "rewards/margins": 0.09238281100988388, "rewards/rejected": 0.126953125, "step": 1335 }, { "epoch": 0.345005149330587, "grad_norm": 186.0, "learning_rate": 3.2749742533470646e-07, "logits/chosen": -0.48662108182907104, "logits/rejected": -0.3490234315395355, "logps/chosen": -337.6000061035156, "logps/rejected": -303.0, "loss": 0.6562, "rewards/accuracies": 0.493644654750824, "rewards/chosen": 0.3294921815395355, "rewards/margins": 0.1382904052734375, "rewards/rejected": 0.19189453125, "step": 1340 }, { "epoch": 0.34629248197734297, "grad_norm": 123.0, "learning_rate": 3.268537590113285e-07, "logits/chosen": -0.16311034560203552, "logits/rejected": -0.14033202826976776, "logps/chosen": -272.3999938964844, "logps/rejected": -278.0, "loss": 0.6586, "rewards/accuracies": 0.5734090805053711, "rewards/chosen": 0.16972656548023224, "rewards/margins": 0.119873046875, "rewards/rejected": 0.05007324367761612, "step": 1345 }, { "epoch": 0.34757981462409887, "grad_norm": 161.0, "learning_rate": 3.262100926879506e-07, "logits/chosen": -0.20729979872703552, "logits/rejected": 0.08317260444164276, "logps/chosen": -292.20001220703125, "logps/rejected": -300.8999938964844, "loss": 0.6703, "rewards/accuracies": 0.5183333158493042, "rewards/chosen": 0.17919921875, "rewards/margins": 0.09648437798023224, "rewards/rejected": 0.08251953125, "step": 1350 }, { "epoch": 0.34886714727085477, "grad_norm": 111.5, "learning_rate": 3.255664263645726e-07, "logits/chosen": -0.28935545682907104, "logits/rejected": -0.275390625, "logps/chosen": -268.6000061035156, "logps/rejected": -265.79998779296875, "loss": 0.6391, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.232421875, "rewards/margins": 0.14736327528953552, "rewards/rejected": 0.08535156399011612, "step": 1355 }, { "epoch": 0.35015447991761073, "grad_norm": 108.5, "learning_rate": 3.2492276004119463e-07, "logits/chosen": -0.2544921934604645, "logits/rejected": -0.26142579317092896, "logps/chosen": -258.79998779296875, "logps/rejected": -250.60000610351562, "loss": 0.6672, "rewards/accuracies": 0.5191666483879089, "rewards/chosen": 0.19814452528953552, "rewards/margins": 0.09201660007238388, "rewards/rejected": 0.10588379204273224, "step": 1360 }, { "epoch": 0.35144181256436663, "grad_norm": 119.0, "learning_rate": 3.242790937178166e-07, "logits/chosen": -0.4371093809604645, "logits/rejected": -0.34687501192092896, "logps/chosen": -295.3999938964844, "logps/rejected": -329.0, "loss": 0.6828, "rewards/accuracies": 0.5625, "rewards/chosen": 0.23847655951976776, "rewards/margins": 0.06718750298023224, "rewards/rejected": 0.17072753608226776, "step": 1365 }, { "epoch": 0.35272914521112253, "grad_norm": 122.0, "learning_rate": 3.236354273944387e-07, "logits/chosen": -0.30058592557907104, "logits/rejected": -0.2731262147426605, "logps/chosen": -282.3999938964844, "logps/rejected": -238.1999969482422, "loss": 0.6844, "rewards/accuracies": 0.5646428465843201, "rewards/chosen": 0.19160155951976776, "rewards/margins": 0.04798584058880806, "rewards/rejected": 0.14365234971046448, "step": 1370 }, { "epoch": 0.3540164778578785, "grad_norm": 163.0, "learning_rate": 3.2299176107106076e-07, "logits/chosen": -0.30029296875, "logits/rejected": -0.3408203125, "logps/chosen": -315.3999938964844, "logps/rejected": -359.6000061035156, "loss": 0.6695, "rewards/accuracies": 0.49988096952438354, "rewards/chosen": 0.2158203125, "rewards/margins": 0.0833740234375, "rewards/rejected": 0.13286133110523224, "step": 1375 }, { "epoch": 0.3553038105046344, "grad_norm": 122.0, "learning_rate": 3.2234809474768275e-07, "logits/chosen": -0.34257811307907104, "logits/rejected": -0.264404296875, "logps/chosen": -348.3999938964844, "logps/rejected": -286.79998779296875, "loss": 0.6445, "rewards/accuracies": 0.4804762303829193, "rewards/chosen": 0.24050292372703552, "rewards/margins": 0.14584961533546448, "rewards/rejected": 0.09456177055835724, "step": 1380 }, { "epoch": 0.35659114315139034, "grad_norm": 99.0, "learning_rate": 3.2170442842430485e-07, "logits/chosen": -0.36503905057907104, "logits/rejected": -0.36279296875, "logps/chosen": -265.3999938964844, "logps/rejected": -256.3999938964844, "loss": 0.6406, "rewards/accuracies": 0.5486904978752136, "rewards/chosen": 0.27734375, "rewards/margins": 0.17087402939796448, "rewards/rejected": 0.10590820014476776, "step": 1385 }, { "epoch": 0.35787847579814624, "grad_norm": 134.0, "learning_rate": 3.2106076210092684e-07, "logits/chosen": -0.3291992247104645, "logits/rejected": -0.32451170682907104, "logps/chosen": -344.0, "logps/rejected": -312.79998779296875, "loss": 0.6859, "rewards/accuracies": 0.5102381110191345, "rewards/chosen": 0.17617186903953552, "rewards/margins": 0.04499511793255806, "rewards/rejected": 0.13124999403953552, "step": 1390 }, { "epoch": 0.35916580844490215, "grad_norm": 123.0, "learning_rate": 3.204170957775489e-07, "logits/chosen": -0.34003907442092896, "logits/rejected": -0.15859374403953552, "logps/chosen": -259.0, "logps/rejected": -247.0, "loss": 0.6391, "rewards/accuracies": 0.6012637615203857, "rewards/chosen": 0.20292969048023224, "rewards/margins": 0.15644530951976776, "rewards/rejected": 0.04624328762292862, "step": 1395 }, { "epoch": 0.3604531410916581, "grad_norm": 115.0, "learning_rate": 3.19773429454171e-07, "logits/chosen": -0.26984864473342896, "logits/rejected": -0.2757812440395355, "logps/chosen": -322.0, "logps/rejected": -275.0, "loss": 0.6328, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.23740234971046448, "rewards/margins": 0.15938720107078552, "rewards/rejected": 0.07785644382238388, "step": 1400 }, { "epoch": 0.361740473738414, "grad_norm": 124.0, "learning_rate": 3.1912976313079296e-07, "logits/chosen": -0.2979980409145355, "logits/rejected": -0.28046876192092896, "logps/chosen": -343.6000061035156, "logps/rejected": -299.20001220703125, "loss": 0.6227, "rewards/accuracies": 0.5724999904632568, "rewards/chosen": 0.3623046875, "rewards/margins": 0.2119140625, "rewards/rejected": 0.150390625, "step": 1405 }, { "epoch": 0.3630278063851699, "grad_norm": 145.0, "learning_rate": 3.18486096807415e-07, "logits/chosen": -0.16416016221046448, "logits/rejected": -0.17141112685203552, "logps/chosen": -315.0, "logps/rejected": -348.79998779296875, "loss": 0.6703, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2523437440395355, "rewards/margins": 0.0838623046875, "rewards/rejected": 0.168701171875, "step": 1410 }, { "epoch": 0.36431513903192586, "grad_norm": 124.5, "learning_rate": 3.178424304840371e-07, "logits/chosen": -0.33857423067092896, "logits/rejected": -0.27104490995407104, "logps/chosen": -301.79998779296875, "logps/rejected": -274.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.5583333373069763, "rewards/chosen": 0.19853515923023224, "rewards/margins": 0.16816405951976776, "rewards/rejected": 0.030517578125, "step": 1415 }, { "epoch": 0.36560247167868176, "grad_norm": 122.5, "learning_rate": 3.171987641606591e-07, "logits/chosen": -0.15771484375, "logits/rejected": -0.22270508110523224, "logps/chosen": -286.3999938964844, "logps/rejected": -282.3999938964844, "loss": 0.6586, "rewards/accuracies": 0.5534090995788574, "rewards/chosen": 0.22568359971046448, "rewards/margins": 0.10354004055261612, "rewards/rejected": 0.12248535454273224, "step": 1420 }, { "epoch": 0.3668898043254377, "grad_norm": 119.0, "learning_rate": 3.1655509783728114e-07, "logits/chosen": -0.3492187559604645, "logits/rejected": -0.38457030057907104, "logps/chosen": -340.20001220703125, "logps/rejected": -348.79998779296875, "loss": 0.625, "rewards/accuracies": 0.6835898160934448, "rewards/chosen": 0.3045898377895355, "rewards/margins": 0.20644530653953552, "rewards/rejected": 0.098388671875, "step": 1425 }, { "epoch": 0.3681771369721936, "grad_norm": 187.0, "learning_rate": 3.159114315139032e-07, "logits/chosen": -0.4839843809604645, "logits/rejected": -0.544726550579071, "logps/chosen": -300.0, "logps/rejected": -226.0, "loss": 0.6359, "rewards/accuracies": 0.6600000262260437, "rewards/chosen": 0.24667969346046448, "rewards/margins": 0.1416015625, "rewards/rejected": 0.10517577826976776, "step": 1430 }, { "epoch": 0.3694644696189495, "grad_norm": 111.0, "learning_rate": 3.152677651905252e-07, "logits/chosen": -0.1581573486328125, "logits/rejected": -0.21322020888328552, "logps/chosen": -242.0, "logps/rejected": -268.3999938964844, "loss": 0.6398, "rewards/accuracies": 0.604358971118927, "rewards/chosen": 0.15500488877296448, "rewards/margins": 0.15292969346046448, "rewards/rejected": 0.0020019530784338713, "step": 1435 }, { "epoch": 0.3707518022657055, "grad_norm": 145.0, "learning_rate": 3.1462409886714726e-07, "logits/chosen": -0.2798828184604645, "logits/rejected": -0.240234375, "logps/chosen": -312.20001220703125, "logps/rejected": -265.6000061035156, "loss": 0.6516, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.16660156846046448, "rewards/margins": 0.10585937649011612, "rewards/rejected": 0.06071777269244194, "step": 1440 }, { "epoch": 0.3720391349124614, "grad_norm": 165.0, "learning_rate": 3.1398043254376925e-07, "logits/chosen": -0.17690429091453552, "logits/rejected": -0.17724609375, "logps/chosen": -305.20001220703125, "logps/rejected": -272.0, "loss": 0.6617, "rewards/accuracies": 0.5383332967758179, "rewards/chosen": 0.232421875, "rewards/margins": 0.13658447563648224, "rewards/rejected": 0.09586181491613388, "step": 1445 }, { "epoch": 0.3733264675592173, "grad_norm": 113.5, "learning_rate": 3.1333676622039135e-07, "logits/chosen": -0.4175781309604645, "logits/rejected": -0.43925780057907104, "logps/chosen": -384.3999938964844, "logps/rejected": -312.79998779296875, "loss": 0.6219, "rewards/accuracies": 0.6396428942680359, "rewards/chosen": 0.31621092557907104, "rewards/margins": 0.18398436903953552, "rewards/rejected": 0.13242188096046448, "step": 1450 }, { "epoch": 0.37461380020597324, "grad_norm": 392.0, "learning_rate": 3.126930998970134e-07, "logits/chosen": -0.20942382514476776, "logits/rejected": -0.171142578125, "logps/chosen": -269.0, "logps/rejected": -253.0, "loss": 0.6539, "rewards/accuracies": 0.5459523797035217, "rewards/chosen": 0.17880859971046448, "rewards/margins": 0.11552734673023224, "rewards/rejected": 0.06306152045726776, "step": 1455 }, { "epoch": 0.37590113285272914, "grad_norm": 120.0, "learning_rate": 3.120494335736354e-07, "logits/chosen": -0.21703490614891052, "logits/rejected": -0.197509765625, "logps/chosen": -348.79998779296875, "logps/rejected": -326.0, "loss": 0.6344, "rewards/accuracies": 0.5441666841506958, "rewards/chosen": 0.2632812559604645, "rewards/margins": 0.17792968451976776, "rewards/rejected": 0.0849609375, "step": 1460 }, { "epoch": 0.3771884654994851, "grad_norm": 173.0, "learning_rate": 3.114057672502575e-07, "logits/chosen": -0.306640625, "logits/rejected": -0.19492188096046448, "logps/chosen": -315.0, "logps/rejected": -288.20001220703125, "loss": 0.6336, "rewards/accuracies": 0.6385714411735535, "rewards/chosen": 0.23281249403953552, "rewards/margins": 0.16342774033546448, "rewards/rejected": 0.06953124701976776, "step": 1465 }, { "epoch": 0.378475798146241, "grad_norm": 151.0, "learning_rate": 3.1076210092687947e-07, "logits/chosen": -0.4453125, "logits/rejected": -0.390625, "logps/chosen": -333.6000061035156, "logps/rejected": -267.6000061035156, "loss": 0.6508, "rewards/accuracies": 0.5743131637573242, "rewards/chosen": 0.2613281309604645, "rewards/margins": 0.12834472954273224, "rewards/rejected": 0.13291016221046448, "step": 1470 }, { "epoch": 0.3797631307929969, "grad_norm": 130.0, "learning_rate": 3.101184346035015e-07, "logits/chosen": -0.22456054389476776, "logits/rejected": -0.3013671934604645, "logps/chosen": -327.79998779296875, "logps/rejected": -334.20001220703125, "loss": 0.6734, "rewards/accuracies": 0.5808333158493042, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.08154296875, "rewards/rejected": 0.16464844346046448, "step": 1475 }, { "epoch": 0.38105046343975285, "grad_norm": 231.0, "learning_rate": 3.094747682801236e-07, "logits/chosen": -0.3272460997104645, "logits/rejected": 0.14936523139476776, "logps/chosen": -216.8000030517578, "logps/rejected": -252.39999389648438, "loss": 0.6633, "rewards/accuracies": 0.5272727012634277, "rewards/chosen": 0.18984374403953552, "rewards/margins": 0.10073242336511612, "rewards/rejected": 0.08913574367761612, "step": 1480 }, { "epoch": 0.38233779608650875, "grad_norm": 106.5, "learning_rate": 3.088311019567456e-07, "logits/chosen": -0.3285156190395355, "logits/rejected": -0.26591795682907104, "logps/chosen": -328.79998779296875, "logps/rejected": -284.3999938964844, "loss": 0.6352, "rewards/accuracies": 0.6449999809265137, "rewards/chosen": 0.2880859375, "rewards/margins": 0.14873047173023224, "rewards/rejected": 0.13935546576976776, "step": 1485 }, { "epoch": 0.38362512873326465, "grad_norm": 121.0, "learning_rate": 3.0818743563336764e-07, "logits/chosen": -0.3714843690395355, "logits/rejected": -0.27497559785842896, "logps/chosen": -335.3999938964844, "logps/rejected": -281.0, "loss": 0.6242, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2601562440395355, "rewards/margins": 0.16562500596046448, "rewards/rejected": 0.09445800632238388, "step": 1490 }, { "epoch": 0.3849124613800206, "grad_norm": 106.5, "learning_rate": 3.0754376930998973e-07, "logits/chosen": -0.421875, "logits/rejected": -0.3431640565395355, "logps/chosen": -305.20001220703125, "logps/rejected": -279.0, "loss": 0.6562, "rewards/accuracies": 0.5719230771064758, "rewards/chosen": 0.26152342557907104, "rewards/margins": 0.11552734673023224, "rewards/rejected": 0.1453857421875, "step": 1495 }, { "epoch": 0.3861997940267765, "grad_norm": 104.0, "learning_rate": 3.069001029866117e-07, "logits/chosen": -0.3314453065395355, "logits/rejected": -0.27978515625, "logps/chosen": -285.79998779296875, "logps/rejected": -272.0, "loss": 0.6172, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.28515625, "rewards/margins": 0.19234618544578552, "rewards/rejected": 0.09257812798023224, "step": 1500 }, { "epoch": 0.38748712667353247, "grad_norm": 108.5, "learning_rate": 3.0625643666323377e-07, "logits/chosen": -0.2835937440395355, "logits/rejected": -0.24716797471046448, "logps/chosen": -320.79998779296875, "logps/rejected": -312.0, "loss": 0.6227, "rewards/accuracies": 0.5526922941207886, "rewards/chosen": 0.2607421875, "rewards/margins": 0.19003906846046448, "rewards/rejected": 0.07072754204273224, "step": 1505 }, { "epoch": 0.38877445932028837, "grad_norm": 142.0, "learning_rate": 3.056127703398558e-07, "logits/chosen": -0.3399414122104645, "logits/rejected": -0.3782714903354645, "logps/chosen": -339.79998779296875, "logps/rejected": -280.20001220703125, "loss": 0.6266, "rewards/accuracies": 0.550595223903656, "rewards/chosen": 0.28886717557907104, "rewards/margins": 0.17429199814796448, "rewards/rejected": 0.11475829780101776, "step": 1510 }, { "epoch": 0.39006179196704427, "grad_norm": 105.5, "learning_rate": 3.0496910401647785e-07, "logits/chosen": -0.4488281309604645, "logits/rejected": -0.324462890625, "logps/chosen": -354.3999938964844, "logps/rejected": -304.3999938964844, "loss": 0.6117, "rewards/accuracies": 0.6833974123001099, "rewards/chosen": 0.3333984315395355, "rewards/margins": 0.20644530653953552, "rewards/rejected": 0.12666015326976776, "step": 1515 }, { "epoch": 0.3913491246138002, "grad_norm": 120.5, "learning_rate": 3.043254376930999e-07, "logits/chosen": -0.40507811307907104, "logits/rejected": -0.2964843809604645, "logps/chosen": -299.0, "logps/rejected": -306.6000061035156, "loss": 0.6344, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": 0.2564453184604645, "rewards/margins": 0.16612549126148224, "rewards/rejected": 0.09047851711511612, "step": 1520 }, { "epoch": 0.3926364572605561, "grad_norm": 113.5, "learning_rate": 3.036817713697219e-07, "logits/chosen": -0.3511718809604645, "logits/rejected": -0.3416992127895355, "logps/chosen": -326.3999938964844, "logps/rejected": -290.0, "loss": 0.6305, "rewards/accuracies": 0.6183333396911621, "rewards/chosen": 0.22841796278953552, "rewards/margins": 0.16968993842601776, "rewards/rejected": 0.05910644680261612, "step": 1525 }, { "epoch": 0.393923789907312, "grad_norm": 187.0, "learning_rate": 3.03038105046344e-07, "logits/chosen": -0.24238280951976776, "logits/rejected": -0.28801268339157104, "logps/chosen": -273.0, "logps/rejected": -281.0, "loss": 0.675, "rewards/accuracies": 0.5891666412353516, "rewards/chosen": 0.20722655951976776, "rewards/margins": 0.09375, "rewards/rejected": 0.11320801079273224, "step": 1530 }, { "epoch": 0.395211122554068, "grad_norm": 160.0, "learning_rate": 3.0239443872296597e-07, "logits/chosen": -0.15751953423023224, "logits/rejected": -0.16708984971046448, "logps/chosen": -265.70001220703125, "logps/rejected": -253.0, "loss": 0.6445, "rewards/accuracies": 0.6059091687202454, "rewards/chosen": 0.16044922173023224, "rewards/margins": 0.1275634765625, "rewards/rejected": 0.03298339992761612, "step": 1535 }, { "epoch": 0.3964984552008239, "grad_norm": 174.0, "learning_rate": 3.01750772399588e-07, "logits/chosen": -0.24355468153953552, "logits/rejected": -0.26875001192092896, "logps/chosen": -312.3999938964844, "logps/rejected": -370.79998779296875, "loss": 0.6625, "rewards/accuracies": 0.5733333826065063, "rewards/chosen": 0.22490234673023224, "rewards/margins": 0.12666015326976776, "rewards/rejected": 0.09799804538488388, "step": 1540 }, { "epoch": 0.39778578784757984, "grad_norm": 142.0, "learning_rate": 3.011071060762101e-07, "logits/chosen": -0.2286376953125, "logits/rejected": -0.24462890625, "logps/chosen": -274.79998779296875, "logps/rejected": -287.79998779296875, "loss": 0.625, "rewards/accuracies": 0.6433441042900085, "rewards/chosen": 0.2728515565395355, "rewards/margins": 0.17559814453125, "rewards/rejected": 0.09736327826976776, "step": 1545 }, { "epoch": 0.39907312049433574, "grad_norm": 136.0, "learning_rate": 3.004634397528321e-07, "logits/chosen": -0.23247070610523224, "logits/rejected": -0.30859375, "logps/chosen": -303.0, "logps/rejected": -257.3999938964844, "loss": 0.6648, "rewards/accuracies": 0.620604395866394, "rewards/chosen": 0.22343750298023224, "rewards/margins": 0.12092284858226776, "rewards/rejected": 0.10244140774011612, "step": 1550 }, { "epoch": 0.40036045314109164, "grad_norm": 119.5, "learning_rate": 2.9981977342945414e-07, "logits/chosen": -0.33906251192092896, "logits/rejected": -0.3238281309604645, "logps/chosen": -315.20001220703125, "logps/rejected": -312.3999938964844, "loss": 0.5961, "rewards/accuracies": 0.6753571629524231, "rewards/chosen": 0.2974609434604645, "rewards/margins": 0.2564453184604645, "rewards/rejected": 0.04072265699505806, "step": 1555 }, { "epoch": 0.4016477857878476, "grad_norm": 118.0, "learning_rate": 2.9917610710607623e-07, "logits/chosen": -0.18828125298023224, "logits/rejected": -0.12680664658546448, "logps/chosen": -339.20001220703125, "logps/rejected": -308.3999938964844, "loss": 0.6109, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.27460938692092896, "rewards/margins": 0.21367187798023224, "rewards/rejected": 0.06113281100988388, "step": 1560 }, { "epoch": 0.4029351184346035, "grad_norm": 125.0, "learning_rate": 2.985324407826982e-07, "logits/chosen": -0.32695311307907104, "logits/rejected": -0.3310546875, "logps/chosen": -301.0, "logps/rejected": -293.79998779296875, "loss": 0.6297, "rewards/accuracies": 0.6822435855865479, "rewards/chosen": 0.265625, "rewards/margins": 0.18027344346046448, "rewards/rejected": 0.08527831733226776, "step": 1565 }, { "epoch": 0.4042224510813594, "grad_norm": 132.0, "learning_rate": 2.9788877445932027e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.15751953423023224, "logps/chosen": -297.3999938964844, "logps/rejected": -280.20001220703125, "loss": 0.6719, "rewards/accuracies": 0.5026190876960754, "rewards/chosen": 0.19326171278953552, "rewards/margins": 0.07148437201976776, "rewards/rejected": 0.12150879204273224, "step": 1570 }, { "epoch": 0.40550978372811536, "grad_norm": 128.0, "learning_rate": 2.972451081359423e-07, "logits/chosen": -0.10957030951976776, "logits/rejected": -0.19370117783546448, "logps/chosen": -270.6000061035156, "logps/rejected": -262.3999938964844, "loss": 0.6391, "rewards/accuracies": 0.591785728931427, "rewards/chosen": 0.163330078125, "rewards/margins": 0.15498046576976776, "rewards/rejected": 0.00830078125, "step": 1575 }, { "epoch": 0.40679711637487126, "grad_norm": 107.5, "learning_rate": 2.9660144181256435e-07, "logits/chosen": -0.3681640625, "logits/rejected": -0.375, "logps/chosen": -295.0, "logps/rejected": -237.0, "loss": 0.6305, "rewards/accuracies": 0.5525000095367432, "rewards/chosen": 0.2916503846645355, "rewards/margins": 0.17341308295726776, "rewards/rejected": 0.11835937201976776, "step": 1580 }, { "epoch": 0.4080844490216272, "grad_norm": 168.0, "learning_rate": 2.959577754891864e-07, "logits/chosen": -0.36083984375, "logits/rejected": -0.36567384004592896, "logps/chosen": -356.3999938964844, "logps/rejected": -305.0, "loss": 0.6664, "rewards/accuracies": 0.5545238256454468, "rewards/chosen": 0.25605469942092896, "rewards/margins": 0.10356445610523224, "rewards/rejected": 0.15255126357078552, "step": 1585 }, { "epoch": 0.4093717816683831, "grad_norm": 115.5, "learning_rate": 2.9531410916580844e-07, "logits/chosen": -0.38691407442092896, "logits/rejected": -0.3017578125, "logps/chosen": -284.20001220703125, "logps/rejected": -278.3999938964844, "loss": 0.6203, "rewards/accuracies": 0.6470237970352173, "rewards/chosen": 0.2705078125, "rewards/margins": 0.19746093451976776, "rewards/rejected": 0.07292480766773224, "step": 1590 }, { "epoch": 0.410659114315139, "grad_norm": 147.0, "learning_rate": 2.946704428424305e-07, "logits/chosen": -0.2955078184604645, "logits/rejected": -0.2621093690395355, "logps/chosen": -329.20001220703125, "logps/rejected": -298.0, "loss": 0.6266, "rewards/accuracies": 0.5985714197158813, "rewards/chosen": 0.23066405951976776, "rewards/margins": 0.17978516221046448, "rewards/rejected": 0.05054321140050888, "step": 1595 }, { "epoch": 0.411946446961895, "grad_norm": 156.0, "learning_rate": 2.940267765190525e-07, "logits/chosen": -0.3192382752895355, "logits/rejected": -0.2855468690395355, "logps/chosen": -324.6000061035156, "logps/rejected": -266.79998779296875, "loss": 0.6406, "rewards/accuracies": 0.5726190805435181, "rewards/chosen": 0.205078125, "rewards/margins": 0.143310546875, "rewards/rejected": 0.06229247897863388, "step": 1600 }, { "epoch": 0.4132337796086509, "grad_norm": 140.0, "learning_rate": 2.933831101956745e-07, "logits/chosen": -0.34160155057907104, "logits/rejected": -0.3402343690395355, "logps/chosen": -319.6000061035156, "logps/rejected": -258.79998779296875, "loss": 0.632, "rewards/accuracies": 0.5680555105209351, "rewards/chosen": 0.25371092557907104, "rewards/margins": 0.16083984076976776, "rewards/rejected": 0.09322509914636612, "step": 1605 }, { "epoch": 0.4145211122554068, "grad_norm": 146.0, "learning_rate": 2.927394438722966e-07, "logits/chosen": -0.32353514432907104, "logits/rejected": -0.20263671875, "logps/chosen": -256.6000061035156, "logps/rejected": -244.39999389648438, "loss": 0.6406, "rewards/accuracies": 0.6532575488090515, "rewards/chosen": 0.23417969048023224, "rewards/margins": 0.15561524033546448, "rewards/rejected": 0.07875976711511612, "step": 1610 }, { "epoch": 0.41580844490216273, "grad_norm": 122.0, "learning_rate": 2.920957775489186e-07, "logits/chosen": -0.23457030951976776, "logits/rejected": -0.25371092557907104, "logps/chosen": -321.3999938964844, "logps/rejected": -310.3999938964844, "loss": 0.6469, "rewards/accuracies": 0.5023717880249023, "rewards/chosen": 0.2554687559604645, "rewards/margins": 0.14020995795726776, "rewards/rejected": 0.11518554389476776, "step": 1615 }, { "epoch": 0.41709577754891863, "grad_norm": 158.0, "learning_rate": 2.9145211122554064e-07, "logits/chosen": -0.46210938692092896, "logits/rejected": -0.36992186307907104, "logps/chosen": -351.79998779296875, "logps/rejected": -322.3999938964844, "loss": 0.6398, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.2630859315395355, "rewards/margins": 0.15156249701976776, "rewards/rejected": 0.11133117973804474, "step": 1620 }, { "epoch": 0.41838311019567453, "grad_norm": 121.5, "learning_rate": 2.9080844490216274e-07, "logits/chosen": -0.3671875, "logits/rejected": -0.27519530057907104, "logps/chosen": -267.79998779296875, "logps/rejected": -243.60000610351562, "loss": 0.6398, "rewards/accuracies": 0.45355314016342163, "rewards/chosen": 0.24062499403953552, "rewards/margins": 0.15142822265625, "rewards/rejected": 0.08964844048023224, "step": 1625 }, { "epoch": 0.4196704428424305, "grad_norm": 144.0, "learning_rate": 2.9016477857878473e-07, "logits/chosen": -0.4310546815395355, "logits/rejected": -0.3958984315395355, "logps/chosen": -321.20001220703125, "logps/rejected": -265.20001220703125, "loss": 0.6367, "rewards/accuracies": 0.6160256862640381, "rewards/chosen": 0.29023438692092896, "rewards/margins": 0.18266601860523224, "rewards/rejected": 0.10744629055261612, "step": 1630 }, { "epoch": 0.4209577754891864, "grad_norm": 131.0, "learning_rate": 2.8952111225540677e-07, "logits/chosen": -0.17724609375, "logits/rejected": -0.029296875, "logps/chosen": -304.3999938964844, "logps/rejected": -281.20001220703125, "loss": 0.6105, "rewards/accuracies": 0.6489285826683044, "rewards/chosen": 0.2894531190395355, "rewards/margins": 0.22541503608226776, "rewards/rejected": 0.064453125, "step": 1635 }, { "epoch": 0.42224510813594235, "grad_norm": 132.0, "learning_rate": 2.8887744593202886e-07, "logits/chosen": -0.31855469942092896, "logits/rejected": -0.15581054985523224, "logps/chosen": -334.0, "logps/rejected": -309.0, "loss": 0.6352, "rewards/accuracies": 0.5641666650772095, "rewards/chosen": 0.25410157442092896, "rewards/margins": 0.15899658203125, "rewards/rejected": 0.094970703125, "step": 1640 }, { "epoch": 0.42353244078269825, "grad_norm": 140.0, "learning_rate": 2.8823377960865085e-07, "logits/chosen": -0.3394531309604645, "logits/rejected": -0.28740233182907104, "logps/chosen": -311.20001220703125, "logps/rejected": -312.3999938964844, "loss": 0.6414, "rewards/accuracies": 0.559166669845581, "rewards/chosen": 0.22480468451976776, "rewards/margins": 0.14145508408546448, "rewards/rejected": 0.08339843899011612, "step": 1645 }, { "epoch": 0.42481977342945415, "grad_norm": 130.0, "learning_rate": 2.875901132852729e-07, "logits/chosen": -0.35468751192092896, "logits/rejected": -0.2994140684604645, "logps/chosen": -259.0, "logps/rejected": -267.3999938964844, "loss": 0.6188, "rewards/accuracies": 0.5866667032241821, "rewards/chosen": 0.23749999701976776, "rewards/margins": 0.20332030951976776, "rewards/rejected": 0.03425293043255806, "step": 1650 }, { "epoch": 0.4261071060762101, "grad_norm": 103.5, "learning_rate": 2.8694644696189494e-07, "logits/chosen": -0.27373045682907104, "logits/rejected": -0.2353515625, "logps/chosen": -268.3999938964844, "logps/rejected": -263.6000061035156, "loss": 0.6531, "rewards/accuracies": 0.5549242496490479, "rewards/chosen": 0.2708984315395355, "rewards/margins": 0.12841796875, "rewards/rejected": 0.14238281548023224, "step": 1655 }, { "epoch": 0.427394438722966, "grad_norm": 127.0, "learning_rate": 2.86302780638517e-07, "logits/chosen": -0.23837891221046448, "logits/rejected": -0.20878906548023224, "logps/chosen": -362.0, "logps/rejected": -336.0, "loss": 0.6266, "rewards/accuracies": 0.5949999690055847, "rewards/chosen": 0.27250975370407104, "rewards/margins": 0.18789061903953552, "rewards/rejected": 0.08502502739429474, "step": 1660 }, { "epoch": 0.4286817713697219, "grad_norm": 124.5, "learning_rate": 2.85659114315139e-07, "logits/chosen": -0.30976563692092896, "logits/rejected": -0.39726561307907104, "logps/chosen": -310.79998779296875, "logps/rejected": -375.6000061035156, "loss": 0.6555, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.22617188096046448, "rewards/margins": 0.12583008408546448, "rewards/rejected": 0.10073242336511612, "step": 1665 }, { "epoch": 0.42996910401647787, "grad_norm": 148.0, "learning_rate": 2.8501544799176107e-07, "logits/chosen": -0.3880371153354645, "logits/rejected": -0.28125, "logps/chosen": -355.20001220703125, "logps/rejected": -312.3999938964844, "loss": 0.6586, "rewards/accuracies": 0.5827381014823914, "rewards/chosen": 0.2557617127895355, "rewards/margins": 0.12998047471046448, "rewards/rejected": 0.1259765625, "step": 1670 }, { "epoch": 0.43125643666323377, "grad_norm": 120.5, "learning_rate": 2.843717816683831e-07, "logits/chosen": -0.3832031190395355, "logits/rejected": -0.4115234315395355, "logps/chosen": -266.3999938964844, "logps/rejected": -223.39999389648438, "loss": 0.6375, "rewards/accuracies": 0.5525000691413879, "rewards/chosen": 0.31367188692092896, "rewards/margins": 0.16269531846046448, "rewards/rejected": 0.151611328125, "step": 1675 }, { "epoch": 0.4325437693099897, "grad_norm": 113.5, "learning_rate": 2.837281153450051e-07, "logits/chosen": -0.3785156309604645, "logits/rejected": -0.46484375, "logps/chosen": -298.0, "logps/rejected": -224.5, "loss": 0.7039, "rewards/accuracies": 0.5289285778999329, "rewards/chosen": 0.15517577528953552, "rewards/margins": 0.03038330003619194, "rewards/rejected": 0.12543945014476776, "step": 1680 }, { "epoch": 0.4338311019567456, "grad_norm": 140.0, "learning_rate": 2.830844490216272e-07, "logits/chosen": -0.44648438692092896, "logits/rejected": -0.4749999940395355, "logps/chosen": -345.6000061035156, "logps/rejected": -297.3999938964844, "loss": 0.6289, "rewards/accuracies": 0.6546429395675659, "rewards/chosen": 0.3597656190395355, "rewards/margins": 0.17890624701976776, "rewards/rejected": 0.18125000596046448, "step": 1685 }, { "epoch": 0.4351184346035015, "grad_norm": 129.0, "learning_rate": 2.8244078269824924e-07, "logits/chosen": -0.30244141817092896, "logits/rejected": -0.27119141817092896, "logps/chosen": -302.0, "logps/rejected": -292.0, "loss": 0.6633, "rewards/accuracies": 0.5824999809265137, "rewards/chosen": 0.20498046278953552, "rewards/margins": 0.1181640625, "rewards/rejected": 0.08685302734375, "step": 1690 }, { "epoch": 0.4364057672502575, "grad_norm": 127.5, "learning_rate": 2.8179711637487123e-07, "logits/chosen": -0.21381835639476776, "logits/rejected": -0.18837890028953552, "logps/chosen": -296.79998779296875, "logps/rejected": -285.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.5791667103767395, "rewards/chosen": 0.22285155951976776, "rewards/margins": 0.17246094346046448, "rewards/rejected": 0.05097656324505806, "step": 1695 }, { "epoch": 0.4376930998970134, "grad_norm": 113.5, "learning_rate": 2.8115345005149327e-07, "logits/chosen": -0.20468750596046448, "logits/rejected": 0.0024169920943677425, "logps/chosen": -305.6000061035156, "logps/rejected": -278.0, "loss": 0.6297, "rewards/accuracies": 0.6425000429153442, "rewards/chosen": 0.23710937798023224, "rewards/margins": 0.16914062201976776, "rewards/rejected": 0.06816406548023224, "step": 1700 }, { "epoch": 0.4389804325437693, "grad_norm": 117.5, "learning_rate": 2.8050978372811537e-07, "logits/chosen": -0.22343750298023224, "logits/rejected": -0.14504393935203552, "logps/chosen": -307.20001220703125, "logps/rejected": -337.20001220703125, "loss": 0.6453, "rewards/accuracies": 0.5607143044471741, "rewards/chosen": 0.22724609076976776, "rewards/margins": 0.15366211533546448, "rewards/rejected": 0.0732421875, "step": 1705 }, { "epoch": 0.44026776519052524, "grad_norm": 123.5, "learning_rate": 2.7986611740473736e-07, "logits/chosen": -0.16220703721046448, "logits/rejected": -0.263671875, "logps/chosen": -324.79998779296875, "logps/rejected": -322.79998779296875, "loss": 0.6602, "rewards/accuracies": 0.5083333253860474, "rewards/chosen": 0.23085936903953552, "rewards/margins": 0.10714111477136612, "rewards/rejected": 0.12333984673023224, "step": 1710 }, { "epoch": 0.44155509783728114, "grad_norm": 128.0, "learning_rate": 2.792224510813594e-07, "logits/chosen": -0.3958984315395355, "logits/rejected": 0.32929688692092896, "logps/chosen": -238.6750030517578, "logps/rejected": -265.79998779296875, "loss": 0.6336, "rewards/accuracies": 0.5965476632118225, "rewards/chosen": 0.22587890923023224, "rewards/margins": 0.18183593451976776, "rewards/rejected": 0.04414062574505806, "step": 1715 }, { "epoch": 0.4428424304840371, "grad_norm": 129.0, "learning_rate": 2.7857878475798144e-07, "logits/chosen": -0.30937498807907104, "logits/rejected": -0.16035155951976776, "logps/chosen": -344.3999938964844, "logps/rejected": -321.3999938964844, "loss": 0.6664, "rewards/accuracies": 0.5116666555404663, "rewards/chosen": 0.21953125298023224, "rewards/margins": 0.11491699516773224, "rewards/rejected": 0.10468749701976776, "step": 1720 }, { "epoch": 0.444129763130793, "grad_norm": 102.5, "learning_rate": 2.779351184346035e-07, "logits/chosen": -0.3172851502895355, "logits/rejected": -0.4065185487270355, "logps/chosen": -297.0, "logps/rejected": -271.3999938964844, "loss": 0.6219, "rewards/accuracies": 0.6221428513526917, "rewards/chosen": 0.22812500596046448, "rewards/margins": 0.18486328423023224, "rewards/rejected": 0.043243408203125, "step": 1725 }, { "epoch": 0.4454170957775489, "grad_norm": 114.5, "learning_rate": 2.7729145211122553e-07, "logits/chosen": -0.38066405057907104, "logits/rejected": -0.3539062440395355, "logps/chosen": -339.20001220703125, "logps/rejected": -319.6000061035156, "loss": 0.6531, "rewards/accuracies": 0.5891667008399963, "rewards/chosen": 0.22841796278953552, "rewards/margins": 0.13095703721046448, "rewards/rejected": 0.09765625, "step": 1730 }, { "epoch": 0.44670442842430486, "grad_norm": 151.0, "learning_rate": 2.7664778578784757e-07, "logits/chosen": -0.28105467557907104, "logits/rejected": -0.237945556640625, "logps/chosen": -306.20001220703125, "logps/rejected": -304.0, "loss": 0.6469, "rewards/accuracies": 0.5439743399620056, "rewards/chosen": 0.24228516221046448, "rewards/margins": 0.13375243544578552, "rewards/rejected": 0.10881347954273224, "step": 1735 }, { "epoch": 0.44799176107106076, "grad_norm": 136.0, "learning_rate": 2.760041194644696e-07, "logits/chosen": -0.28947752714157104, "logits/rejected": -0.09294433891773224, "logps/chosen": -276.79998779296875, "logps/rejected": -227.1999969482422, "loss": 0.6352, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.21523436903953552, "rewards/margins": 0.16621093451976776, "rewards/rejected": 0.049560546875, "step": 1740 }, { "epoch": 0.44927909371781666, "grad_norm": 111.5, "learning_rate": 2.7536045314109166e-07, "logits/chosen": -0.3773437440395355, "logits/rejected": -0.3593505918979645, "logps/chosen": -290.1000061035156, "logps/rejected": -264.79998779296875, "loss": 0.625, "rewards/accuracies": 0.6941666603088379, "rewards/chosen": 0.2691406309604645, "rewards/margins": 0.18349608778953552, "rewards/rejected": 0.08564452826976776, "step": 1745 }, { "epoch": 0.4505664263645726, "grad_norm": 127.0, "learning_rate": 2.747167868177137e-07, "logits/chosen": -0.36406248807907104, "logits/rejected": -0.3255859315395355, "logps/chosen": -320.3999938964844, "logps/rejected": -312.3999938964844, "loss": 0.6602, "rewards/accuracies": 0.5633333921432495, "rewards/chosen": 0.22499999403953552, "rewards/margins": 0.1248779296875, "rewards/rejected": 0.10009765625, "step": 1750 }, { "epoch": 0.4518537590113285, "grad_norm": 123.0, "learning_rate": 2.7407312049433574e-07, "logits/chosen": -0.2906250059604645, "logits/rejected": -0.25898438692092896, "logps/chosen": -351.20001220703125, "logps/rejected": -258.79998779296875, "loss": 0.6234, "rewards/accuracies": 0.6341666579246521, "rewards/chosen": 0.31425780057907104, "rewards/margins": 0.19267578423023224, "rewards/rejected": 0.12138672173023224, "step": 1755 }, { "epoch": 0.45314109165808447, "grad_norm": 143.0, "learning_rate": 2.7342945417095773e-07, "logits/chosen": -0.23085936903953552, "logits/rejected": -0.23569336533546448, "logps/chosen": -270.79998779296875, "logps/rejected": -261.0, "loss": 0.6336, "rewards/accuracies": 0.5991666913032532, "rewards/chosen": 0.23798827826976776, "rewards/margins": 0.17928466200828552, "rewards/rejected": 0.05864257737994194, "step": 1760 }, { "epoch": 0.4544284243048404, "grad_norm": 135.0, "learning_rate": 2.727857878475798e-07, "logits/chosen": -0.407470703125, "logits/rejected": -0.3992675840854645, "logps/chosen": -308.0, "logps/rejected": -366.0, "loss": 0.6609, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.2099609375, "rewards/margins": 0.112548828125, "rewards/rejected": 0.09717407077550888, "step": 1765 }, { "epoch": 0.4557157569515963, "grad_norm": 134.0, "learning_rate": 2.7214212152420187e-07, "logits/chosen": -0.2255859375, "logits/rejected": -0.17680664360523224, "logps/chosen": -347.20001220703125, "logps/rejected": -330.0, "loss": 0.6422, "rewards/accuracies": 0.6108333468437195, "rewards/chosen": 0.31035155057907104, "rewards/margins": 0.15263672173023224, "rewards/rejected": 0.15830078721046448, "step": 1770 }, { "epoch": 0.45700308959835223, "grad_norm": 113.5, "learning_rate": 2.7149845520082386e-07, "logits/chosen": -0.26459962129592896, "logits/rejected": -0.3724609315395355, "logps/chosen": -353.6000061035156, "logps/rejected": -278.20001220703125, "loss": 0.6266, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": 0.3521484434604645, "rewards/margins": 0.17910155653953552, "rewards/rejected": 0.17324218153953552, "step": 1775 }, { "epoch": 0.45829042224510813, "grad_norm": 124.0, "learning_rate": 2.708547888774459e-07, "logits/chosen": -0.2757812440395355, "logits/rejected": -0.29345703125, "logps/chosen": -307.79998779296875, "logps/rejected": -301.20001220703125, "loss": 0.6445, "rewards/accuracies": 0.5107142925262451, "rewards/chosen": 0.21503905951976776, "rewards/margins": 0.13701172173023224, "rewards/rejected": 0.07817383110523224, "step": 1780 }, { "epoch": 0.45957775489186403, "grad_norm": 123.5, "learning_rate": 2.70211122554068e-07, "logits/chosen": -0.3466796875, "logits/rejected": -0.3091796934604645, "logps/chosen": -286.6000061035156, "logps/rejected": -242.39999389648438, "loss": 0.6359, "rewards/accuracies": 0.6238095760345459, "rewards/chosen": 0.21445313096046448, "rewards/margins": 0.14707031846046448, "rewards/rejected": 0.06708984076976776, "step": 1785 }, { "epoch": 0.46086508753862, "grad_norm": 112.0, "learning_rate": 2.6956745623069e-07, "logits/chosen": -0.2818359434604645, "logits/rejected": -0.357421875, "logps/chosen": -282.3999938964844, "logps/rejected": -268.79998779296875, "loss": 0.6672, "rewards/accuracies": 0.5344230532646179, "rewards/chosen": 0.19345703721046448, "rewards/margins": 0.11107788234949112, "rewards/rejected": 0.0823974609375, "step": 1790 }, { "epoch": 0.4621524201853759, "grad_norm": 161.0, "learning_rate": 2.6892378990731203e-07, "logits/chosen": -0.2197265625, "logits/rejected": -0.18648681044578552, "logps/chosen": -296.79998779296875, "logps/rejected": -260.6000061035156, "loss": 0.643, "rewards/accuracies": 0.5608333349227905, "rewards/chosen": 0.21962890028953552, "rewards/margins": 0.14150390028953552, "rewards/rejected": 0.0784912109375, "step": 1795 }, { "epoch": 0.46343975283213185, "grad_norm": 112.0, "learning_rate": 2.6828012358393407e-07, "logits/chosen": -0.2876953184604645, "logits/rejected": -0.20155028998851776, "logps/chosen": -282.79998779296875, "logps/rejected": -312.20001220703125, "loss": 0.5891, "rewards/accuracies": 0.6892856955528259, "rewards/chosen": 0.30000001192092896, "rewards/margins": 0.2611328065395355, "rewards/rejected": 0.03889160230755806, "step": 1800 }, { "epoch": 0.46472708547888775, "grad_norm": 118.5, "learning_rate": 2.676364572605561e-07, "logits/chosen": -0.28413087129592896, "logits/rejected": -0.2967285215854645, "logps/chosen": -242.0, "logps/rejected": -223.0, "loss": 0.6492, "rewards/accuracies": 0.6216667294502258, "rewards/chosen": 0.2655273377895355, "rewards/margins": 0.11076660454273224, "rewards/rejected": 0.15483398735523224, "step": 1805 }, { "epoch": 0.46601441812564365, "grad_norm": 168.0, "learning_rate": 2.6699279093717816e-07, "logits/chosen": -0.0374755859375, "logits/rejected": -0.0654296875, "logps/chosen": -227.0, "logps/rejected": -244.60000610351562, "loss": 0.6844, "rewards/accuracies": 0.5271428823471069, "rewards/chosen": 0.1239013671875, "rewards/margins": 0.05390968173742294, "rewards/rejected": 0.07017822563648224, "step": 1810 }, { "epoch": 0.4673017507723996, "grad_norm": 122.5, "learning_rate": 2.663491246138002e-07, "logits/chosen": -0.12871094048023224, "logits/rejected": -0.12651367485523224, "logps/chosen": -261.79998779296875, "logps/rejected": -271.0, "loss": 0.6289, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.23857422173023224, "rewards/margins": 0.198486328125, "rewards/rejected": 0.03957519680261612, "step": 1815 }, { "epoch": 0.4685890834191555, "grad_norm": 170.0, "learning_rate": 2.6570545829042224e-07, "logits/chosen": -0.18132324516773224, "logits/rejected": -0.25825196504592896, "logps/chosen": -290.6000061035156, "logps/rejected": -314.79998779296875, "loss": 0.6461, "rewards/accuracies": 0.6451923251152039, "rewards/chosen": 0.2314453125, "rewards/margins": 0.14731445908546448, "rewards/rejected": 0.0845947265625, "step": 1820 }, { "epoch": 0.4698764160659114, "grad_norm": 125.0, "learning_rate": 2.6506179196704423e-07, "logits/chosen": -0.3384765684604645, "logits/rejected": -0.31328123807907104, "logps/chosen": -337.20001220703125, "logps/rejected": -305.79998779296875, "loss": 0.6117, "rewards/accuracies": 0.6442307233810425, "rewards/chosen": 0.2777343690395355, "rewards/margins": 0.21445313096046448, "rewards/rejected": 0.06308593600988388, "step": 1825 }, { "epoch": 0.47116374871266736, "grad_norm": 106.5, "learning_rate": 2.6441812564366633e-07, "logits/chosen": -0.361328125, "logits/rejected": -0.31572264432907104, "logps/chosen": -228.0, "logps/rejected": -232.1999969482422, "loss": 0.6164, "rewards/accuracies": 0.6428571939468384, "rewards/chosen": 0.23164062201976776, "rewards/margins": 0.19863280653953552, "rewards/rejected": 0.03289794921875, "step": 1830 }, { "epoch": 0.47245108135942326, "grad_norm": 106.5, "learning_rate": 2.6377445932028837e-07, "logits/chosen": -0.21370849013328552, "logits/rejected": -0.26152342557907104, "logps/chosen": -295.0, "logps/rejected": -300.0, "loss": 0.6422, "rewards/accuracies": 0.635952353477478, "rewards/chosen": 0.18593749403953552, "rewards/margins": 0.14003905653953552, "rewards/rejected": 0.04586181789636612, "step": 1835 }, { "epoch": 0.4737384140061792, "grad_norm": 136.0, "learning_rate": 2.6313079299691036e-07, "logits/chosen": -0.2626953125, "logits/rejected": -0.16403809189796448, "logps/chosen": -312.0, "logps/rejected": -244.60000610351562, "loss": 0.6414, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.24824218451976776, "rewards/margins": 0.18112793564796448, "rewards/rejected": 0.06697998195886612, "step": 1840 }, { "epoch": 0.4750257466529351, "grad_norm": 116.5, "learning_rate": 2.6248712667353246e-07, "logits/chosen": -0.43437498807907104, "logits/rejected": -0.3208984434604645, "logps/chosen": -340.0, "logps/rejected": -282.6000061035156, "loss": 0.5969, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2943359315395355, "rewards/margins": 0.24980469048023224, "rewards/rejected": 0.04436035081744194, "step": 1845 }, { "epoch": 0.476313079299691, "grad_norm": 144.0, "learning_rate": 2.618434603501545e-07, "logits/chosen": -0.30937498807907104, "logits/rejected": -0.23862305283546448, "logps/chosen": -361.20001220703125, "logps/rejected": -343.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.6074999570846558, "rewards/chosen": 0.2779296934604645, "rewards/margins": 0.15605469048023224, "rewards/rejected": 0.12202148139476776, "step": 1850 }, { "epoch": 0.477600411946447, "grad_norm": 141.0, "learning_rate": 2.611997940267765e-07, "logits/chosen": -0.28583985567092896, "logits/rejected": -0.21517333388328552, "logps/chosen": -287.3999938964844, "logps/rejected": -256.0, "loss": 0.6695, "rewards/accuracies": 0.5265909433364868, "rewards/chosen": 0.18012695014476776, "rewards/margins": 0.0908203125, "rewards/rejected": 0.08955077826976776, "step": 1855 }, { "epoch": 0.4788877445932029, "grad_norm": 125.5, "learning_rate": 2.6055612770339853e-07, "logits/chosen": -0.15683594346046448, "logits/rejected": -0.20527343451976776, "logps/chosen": -218.1999969482422, "logps/rejected": -251.0, "loss": 0.6312, "rewards/accuracies": 0.5761905312538147, "rewards/chosen": 0.22412109375, "rewards/margins": 0.16823120415210724, "rewards/rejected": 0.05587158352136612, "step": 1860 }, { "epoch": 0.4801750772399588, "grad_norm": 172.0, "learning_rate": 2.599124613800206e-07, "logits/chosen": -0.32221680879592896, "logits/rejected": -0.3490234315395355, "logps/chosen": -281.3999938964844, "logps/rejected": -269.6000061035156, "loss": 0.6734, "rewards/accuracies": 0.5383333563804626, "rewards/chosen": 0.17919921875, "rewards/margins": 0.07656250149011612, "rewards/rejected": 0.10249023139476776, "step": 1865 }, { "epoch": 0.48146240988671474, "grad_norm": 119.0, "learning_rate": 2.592687950566426e-07, "logits/chosen": -0.24140624701976776, "logits/rejected": -0.21298828721046448, "logps/chosen": -314.79998779296875, "logps/rejected": -282.0, "loss": 0.607, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.27070313692092896, "rewards/margins": 0.212890625, "rewards/rejected": 0.05770263820886612, "step": 1870 }, { "epoch": 0.48274974253347064, "grad_norm": 120.0, "learning_rate": 2.5862512873326466e-07, "logits/chosen": -0.46367186307907104, "logits/rejected": -0.32011717557907104, "logps/chosen": -403.20001220703125, "logps/rejected": -315.79998779296875, "loss": 0.6422, "rewards/accuracies": 0.5136111378669739, "rewards/chosen": 0.3345703184604645, "rewards/margins": 0.16777344048023224, "rewards/rejected": 0.16645507514476776, "step": 1875 }, { "epoch": 0.4840370751802266, "grad_norm": 163.0, "learning_rate": 2.579814624098867e-07, "logits/chosen": -0.23126068711280823, "logits/rejected": -0.15546874701976776, "logps/chosen": -255.60000610351562, "logps/rejected": -233.0, "loss": 0.6164, "rewards/accuracies": 0.6767857074737549, "rewards/chosen": 0.22539062798023224, "rewards/margins": 0.19511719048023224, "rewards/rejected": 0.02990112267434597, "step": 1880 }, { "epoch": 0.4853244078269825, "grad_norm": 157.0, "learning_rate": 2.5733779608650874e-07, "logits/chosen": -0.3686767518520355, "logits/rejected": -0.31425780057907104, "logps/chosen": -394.3999938964844, "logps/rejected": -327.20001220703125, "loss": 0.607, "rewards/accuracies": 0.7050000429153442, "rewards/chosen": 0.3349609375, "rewards/margins": 0.21572265028953552, "rewards/rejected": 0.11949463188648224, "step": 1885 }, { "epoch": 0.4866117404737384, "grad_norm": 143.0, "learning_rate": 2.566941297631308e-07, "logits/chosen": -0.22666016221046448, "logits/rejected": -0.20517578721046448, "logps/chosen": -273.20001220703125, "logps/rejected": -270.79998779296875, "loss": 0.6633, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.19882813096046448, "rewards/margins": 0.10231323540210724, "rewards/rejected": 0.09638671576976776, "step": 1890 }, { "epoch": 0.48789907312049435, "grad_norm": 126.5, "learning_rate": 2.5605046343975283e-07, "logits/chosen": -0.197265625, "logits/rejected": -0.4310546815395355, "logps/chosen": -197.0, "logps/rejected": -191.60000610351562, "loss": 0.6672, "rewards/accuracies": 0.5436965823173523, "rewards/chosen": 0.19228515028953552, "rewards/margins": 0.07490234076976776, "rewards/rejected": 0.11728515475988388, "step": 1895 }, { "epoch": 0.48918640576725025, "grad_norm": 121.0, "learning_rate": 2.5540679711637487e-07, "logits/chosen": -0.388671875, "logits/rejected": -0.3062500059604645, "logps/chosen": -299.20001220703125, "logps/rejected": -259.20001220703125, "loss": 0.6523, "rewards/accuracies": 0.4969230592250824, "rewards/chosen": 0.23642578721046448, "rewards/margins": 0.13455811142921448, "rewards/rejected": 0.10161133110523224, "step": 1900 }, { "epoch": 0.49047373841400616, "grad_norm": 133.0, "learning_rate": 2.5476313079299686e-07, "logits/chosen": -0.09195251762866974, "logits/rejected": -0.14726562798023224, "logps/chosen": -310.0, "logps/rejected": -303.20001220703125, "loss": 0.6594, "rewards/accuracies": 0.621666669845581, "rewards/chosen": 0.19960936903953552, "rewards/margins": 0.116607666015625, "rewards/rejected": 0.0830078125, "step": 1905 }, { "epoch": 0.4917610710607621, "grad_norm": 113.0, "learning_rate": 2.5411946446961896e-07, "logits/chosen": -0.37109375, "logits/rejected": -0.17939452826976776, "logps/chosen": -303.6000061035156, "logps/rejected": -294.0, "loss": 0.6289, "rewards/accuracies": 0.5691667199134827, "rewards/chosen": 0.23808594048023224, "rewards/margins": 0.1729736328125, "rewards/rejected": 0.06511230766773224, "step": 1910 }, { "epoch": 0.493048403707518, "grad_norm": 131.0, "learning_rate": 2.53475798146241e-07, "logits/chosen": -0.21445313096046448, "logits/rejected": -0.25776368379592896, "logps/chosen": -271.6000061035156, "logps/rejected": -281.3999938964844, "loss": 0.6094, "rewards/accuracies": 0.6433333158493042, "rewards/chosen": 0.25312501192092896, "rewards/margins": 0.21640625596046448, "rewards/rejected": 0.03667602688074112, "step": 1915 }, { "epoch": 0.49433573635427397, "grad_norm": 121.0, "learning_rate": 2.52832131822863e-07, "logits/chosen": -0.41093748807907104, "logits/rejected": -0.36699217557907104, "logps/chosen": -341.20001220703125, "logps/rejected": -299.20001220703125, "loss": 0.6234, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2867187559604645, "rewards/margins": 0.19775390625, "rewards/rejected": 0.08896484225988388, "step": 1920 }, { "epoch": 0.49562306900102987, "grad_norm": 130.0, "learning_rate": 2.521884654994851e-07, "logits/chosen": -0.32060545682907104, "logits/rejected": -0.3916992247104645, "logps/chosen": -325.6000061035156, "logps/rejected": -283.3999938964844, "loss": 0.6727, "rewards/accuracies": 0.48269233107566833, "rewards/chosen": 0.216796875, "rewards/margins": 0.09055175632238388, "rewards/rejected": 0.12607422471046448, "step": 1925 }, { "epoch": 0.49691040164778577, "grad_norm": 193.0, "learning_rate": 2.5154479917610713e-07, "logits/chosen": -0.19882813096046448, "logits/rejected": -0.31733399629592896, "logps/chosen": -289.0, "logps/rejected": -272.3999938964844, "loss": 0.6898, "rewards/accuracies": 0.524642825126648, "rewards/chosen": 0.1650390625, "rewards/margins": 0.05976562574505806, "rewards/rejected": 0.10551758110523224, "step": 1930 }, { "epoch": 0.4981977342945417, "grad_norm": 135.0, "learning_rate": 2.509011328527291e-07, "logits/chosen": -0.43212890625, "logits/rejected": -0.24067382514476776, "logps/chosen": -287.79998779296875, "logps/rejected": -218.10000610351562, "loss": 0.6227, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.28398436307907104, "rewards/margins": 0.20107421278953552, "rewards/rejected": 0.08261718600988388, "step": 1935 }, { "epoch": 0.49948506694129763, "grad_norm": 106.5, "learning_rate": 2.5025746652935116e-07, "logits/chosen": -0.25468748807907104, "logits/rejected": -0.22690430283546448, "logps/chosen": -302.0, "logps/rejected": -286.3999938964844, "loss": 0.6156, "rewards/accuracies": 0.683104395866394, "rewards/chosen": 0.3216796815395355, "rewards/margins": 0.2138671875, "rewards/rejected": 0.10751952975988388, "step": 1940 }, { "epoch": 0.5007723995880535, "grad_norm": 117.0, "learning_rate": 2.496138002059732e-07, "logits/chosen": -0.3895507752895355, "logits/rejected": -0.255859375, "logps/chosen": -271.3999938964844, "logps/rejected": -223.6999969482422, "loss": 0.6328, "rewards/accuracies": 0.5348776578903198, "rewards/chosen": 0.27265626192092896, "rewards/margins": 0.1494140625, "rewards/rejected": 0.123046875, "step": 1945 }, { "epoch": 0.5020597322348095, "grad_norm": 125.5, "learning_rate": 2.4897013388259525e-07, "logits/chosen": -0.21855469048023224, "logits/rejected": -0.162109375, "logps/chosen": -337.6000061035156, "logps/rejected": -306.3999938964844, "loss": 0.6305, "rewards/accuracies": 0.5958333611488342, "rewards/chosen": 0.21757812798023224, "rewards/margins": 0.17070312798023224, "rewards/rejected": 0.04704589769244194, "step": 1950 }, { "epoch": 0.5033470648815654, "grad_norm": 239.0, "learning_rate": 2.483264675592173e-07, "logits/chosen": -0.2943359315395355, "logits/rejected": -0.2713867127895355, "logps/chosen": -266.0, "logps/rejected": -252.1999969482422, "loss": 0.632, "rewards/accuracies": 0.5648809671401978, "rewards/chosen": 0.23789063096046448, "rewards/margins": 0.15126952528953552, "rewards/rejected": 0.08719787746667862, "step": 1955 }, { "epoch": 0.5046343975283213, "grad_norm": 134.0, "learning_rate": 2.4768280123583933e-07, "logits/chosen": -0.3326171934604645, "logits/rejected": -0.27900391817092896, "logps/chosen": -314.3999938964844, "logps/rejected": -321.0, "loss": 0.6422, "rewards/accuracies": 0.6048809289932251, "rewards/chosen": 0.2574218809604645, "rewards/margins": 0.17536239326000214, "rewards/rejected": 0.081756591796875, "step": 1960 }, { "epoch": 0.5059217301750772, "grad_norm": 163.0, "learning_rate": 2.470391349124614e-07, "logits/chosen": -0.393310546875, "logits/rejected": -0.3730224668979645, "logps/chosen": -296.0, "logps/rejected": -266.6000061035156, "loss": 0.6945, "rewards/accuracies": 0.511236310005188, "rewards/chosen": 0.20000000298023224, "rewards/margins": 0.03787841647863388, "rewards/rejected": 0.162109375, "step": 1965 }, { "epoch": 0.5072090628218332, "grad_norm": 158.0, "learning_rate": 2.463954685890834e-07, "logits/chosen": -0.3548828065395355, "logits/rejected": -0.3320068418979645, "logps/chosen": -431.6000061035156, "logps/rejected": -375.20001220703125, "loss": 0.6391, "rewards/accuracies": 0.4908333718776703, "rewards/chosen": 0.3462890684604645, "rewards/margins": 0.15507812798023224, "rewards/rejected": 0.19155272841453552, "step": 1970 }, { "epoch": 0.508496395468589, "grad_norm": 136.0, "learning_rate": 2.4575180226570546e-07, "logits/chosen": -0.2581543028354645, "logits/rejected": -0.2526611387729645, "logps/chosen": -303.0, "logps/rejected": -307.6000061035156, "loss": 0.6672, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 0.22539062798023224, "rewards/margins": 0.09335937350988388, "rewards/rejected": 0.13204345107078552, "step": 1975 }, { "epoch": 0.509783728115345, "grad_norm": 131.0, "learning_rate": 2.451081359423275e-07, "logits/chosen": -0.22866210341453552, "logits/rejected": -0.20229491591453552, "logps/chosen": -275.0, "logps/rejected": -321.20001220703125, "loss": 0.6562, "rewards/accuracies": 0.6150000095367432, "rewards/chosen": 0.21962890028953552, "rewards/margins": 0.14238281548023224, "rewards/rejected": 0.07709960639476776, "step": 1980 }, { "epoch": 0.511071060762101, "grad_norm": 116.5, "learning_rate": 2.4446446961894955e-07, "logits/chosen": -0.3199218809604645, "logits/rejected": -0.287109375, "logps/chosen": -279.79998779296875, "logps/rejected": -290.3999938964844, "loss": 0.6445, "rewards/accuracies": 0.5553571581840515, "rewards/chosen": 0.263671875, "rewards/margins": 0.12849120795726776, "rewards/rejected": 0.13515624403953552, "step": 1985 }, { "epoch": 0.5123583934088568, "grad_norm": 114.0, "learning_rate": 2.4382080329557153e-07, "logits/chosen": -0.30683594942092896, "logits/rejected": -0.13056640326976776, "logps/chosen": -308.20001220703125, "logps/rejected": -260.6000061035156, "loss": 0.6125, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.30927735567092896, "rewards/margins": 0.22293701767921448, "rewards/rejected": 0.08632812649011612, "step": 1990 }, { "epoch": 0.5136457260556128, "grad_norm": 132.0, "learning_rate": 2.4317713697219363e-07, "logits/chosen": -0.26445311307907104, "logits/rejected": -0.33203125, "logps/chosen": -269.20001220703125, "logps/rejected": -290.79998779296875, "loss": 0.6719, "rewards/accuracies": 0.5156410932540894, "rewards/chosen": 0.18710938096046448, "rewards/margins": 0.07571105659008026, "rewards/rejected": 0.11162109673023224, "step": 1995 }, { "epoch": 0.5149330587023687, "grad_norm": 161.0, "learning_rate": 2.425334706488156e-07, "logits/chosen": -0.536328136920929, "logits/rejected": -0.3873046934604645, "logps/chosen": -293.0, "logps/rejected": -243.60000610351562, "loss": 0.6234, "rewards/accuracies": 0.6473193764686584, "rewards/chosen": 0.2798828184604645, "rewards/margins": 0.18378905951976776, "rewards/rejected": 0.09653320163488388, "step": 2000 }, { "epoch": 0.5162203913491246, "grad_norm": 106.5, "learning_rate": 2.4188980432543766e-07, "logits/chosen": -0.2982421815395355, "logits/rejected": -0.19462890923023224, "logps/chosen": -287.20001220703125, "logps/rejected": -270.20001220703125, "loss": 0.6492, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.18623046576976776, "rewards/margins": 0.12705078721046448, "rewards/rejected": 0.05908203125, "step": 2005 }, { "epoch": 0.5175077239958805, "grad_norm": 122.5, "learning_rate": 2.412461380020597e-07, "logits/chosen": -0.3333984315395355, "logits/rejected": -0.23061522841453552, "logps/chosen": -344.79998779296875, "logps/rejected": -293.20001220703125, "loss": 0.6102, "rewards/accuracies": 0.6183333992958069, "rewards/chosen": 0.259765625, "rewards/margins": 0.21787109971046448, "rewards/rejected": 0.041834354400634766, "step": 2010 }, { "epoch": 0.5187950566426365, "grad_norm": 127.5, "learning_rate": 2.4060247167868175e-07, "logits/chosen": -0.30937498807907104, "logits/rejected": -0.27558594942092896, "logps/chosen": -265.79998779296875, "logps/rejected": -266.3500061035156, "loss": 0.6383, "rewards/accuracies": 0.6069048047065735, "rewards/chosen": 0.22929687798023224, "rewards/margins": 0.14726562798023224, "rewards/rejected": 0.0823974609375, "step": 2015 }, { "epoch": 0.5200823892893924, "grad_norm": 250.0, "learning_rate": 2.399588053553038e-07, "logits/chosen": -0.19667968153953552, "logits/rejected": -0.14655151963233948, "logps/chosen": -269.3999938964844, "logps/rejected": -237.8000030517578, "loss": 0.6469, "rewards/accuracies": 0.5787302255630493, "rewards/chosen": 0.2757812440395355, "rewards/margins": 0.13839110732078552, "rewards/rejected": 0.13725586235523224, "step": 2020 }, { "epoch": 0.5213697219361483, "grad_norm": 113.5, "learning_rate": 2.3931513903192583e-07, "logits/chosen": -0.33574217557907104, "logits/rejected": -0.14169922471046448, "logps/chosen": -302.3999938964844, "logps/rejected": -262.3999938964844, "loss": 0.6117, "rewards/accuracies": 0.6199175715446472, "rewards/chosen": 0.2728515565395355, "rewards/margins": 0.21950682997703552, "rewards/rejected": 0.05288086086511612, "step": 2025 }, { "epoch": 0.5226570545829042, "grad_norm": 108.0, "learning_rate": 2.386714727085479e-07, "logits/chosen": -0.31132811307907104, "logits/rejected": -0.23300781846046448, "logps/chosen": -343.20001220703125, "logps/rejected": -303.6000061035156, "loss": 0.6492, "rewards/accuracies": 0.4958333373069763, "rewards/chosen": 0.2650390565395355, "rewards/margins": 0.14400634169578552, "rewards/rejected": 0.12167968600988388, "step": 2030 }, { "epoch": 0.5239443872296602, "grad_norm": 102.5, "learning_rate": 2.3802780638516992e-07, "logits/chosen": -0.2958984375, "logits/rejected": -0.19374999403953552, "logps/chosen": -231.8000030517578, "logps/rejected": -206.39999389648438, "loss": 0.6578, "rewards/accuracies": 0.5175000429153442, "rewards/chosen": 0.19140625, "rewards/margins": 0.107421875, "rewards/rejected": 0.08389892429113388, "step": 2035 }, { "epoch": 0.525231719876416, "grad_norm": 146.0, "learning_rate": 2.3738414006179194e-07, "logits/chosen": -0.21162109076976776, "logits/rejected": -0.21210937201976776, "logps/chosen": -280.79998779296875, "logps/rejected": -288.79998779296875, "loss": 0.6492, "rewards/accuracies": 0.5690476298332214, "rewards/chosen": 0.27558594942092896, "rewards/margins": 0.14707031846046448, "rewards/rejected": 0.12861327826976776, "step": 2040 }, { "epoch": 0.526519052523172, "grad_norm": 114.5, "learning_rate": 2.36740473738414e-07, "logits/chosen": -0.3619140684604645, "logits/rejected": -0.28691405057907104, "logps/chosen": -342.3999938964844, "logps/rejected": -280.6000061035156, "loss": 0.6102, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.3265624940395355, "rewards/margins": 0.2353515625, "rewards/rejected": 0.09099121391773224, "step": 2045 }, { "epoch": 0.527806385169928, "grad_norm": 115.0, "learning_rate": 2.3609680741503605e-07, "logits/chosen": -0.36640626192092896, "logits/rejected": -0.3187499940395355, "logps/chosen": -301.3999938964844, "logps/rejected": -276.20001220703125, "loss": 0.6562, "rewards/accuracies": 0.5275000333786011, "rewards/chosen": 0.24687500298023224, "rewards/margins": 0.139454647898674, "rewards/rejected": 0.10756836086511612, "step": 2050 }, { "epoch": 0.5290937178166838, "grad_norm": 118.0, "learning_rate": 2.3545314109165806e-07, "logits/chosen": -0.3726562559604645, "logits/rejected": -0.2890625, "logps/chosen": -286.0, "logps/rejected": -258.3999938964844, "loss": 0.6883, "rewards/accuracies": 0.5208333730697632, "rewards/chosen": 0.15859374403953552, "rewards/margins": 0.044921875, "rewards/rejected": 0.11391601711511612, "step": 2055 }, { "epoch": 0.5303810504634398, "grad_norm": 109.0, "learning_rate": 2.348094747682801e-07, "logits/chosen": -0.23330077528953552, "logits/rejected": -0.26240235567092896, "logps/chosen": -280.79998779296875, "logps/rejected": -263.20001220703125, "loss": 0.6156, "rewards/accuracies": 0.6692307591438293, "rewards/chosen": 0.20488281548023224, "rewards/margins": 0.18984374403953552, "rewards/rejected": 0.01523437537252903, "step": 2060 }, { "epoch": 0.5316683831101957, "grad_norm": 116.5, "learning_rate": 2.3416580844490218e-07, "logits/chosen": -0.23857422173023224, "logits/rejected": -0.2533203065395355, "logps/chosen": -305.3999938964844, "logps/rejected": -321.6000061035156, "loss": 0.6375, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.20039062201976776, "rewards/margins": 0.14814452826976776, "rewards/rejected": 0.05224609375, "step": 2065 }, { "epoch": 0.5329557157569516, "grad_norm": 112.0, "learning_rate": 2.335221421215242e-07, "logits/chosen": -0.2674804627895355, "logits/rejected": -0.24522705376148224, "logps/chosen": -252.1999969482422, "logps/rejected": -255.0, "loss": 0.6328, "rewards/accuracies": 0.5958333611488342, "rewards/chosen": 0.24433593451976776, "rewards/margins": 0.1651763916015625, "rewards/rejected": 0.07905273139476776, "step": 2070 }, { "epoch": 0.5342430484037075, "grad_norm": 144.0, "learning_rate": 2.3287847579814623e-07, "logits/chosen": -0.4146484434604645, "logits/rejected": -0.30078125, "logps/chosen": -271.3999938964844, "logps/rejected": -247.1999969482422, "loss": 0.6687, "rewards/accuracies": 0.5575000047683716, "rewards/chosen": 0.21230468153953552, "rewards/margins": 0.09470214694738388, "rewards/rejected": 0.11757812649011612, "step": 2075 }, { "epoch": 0.5355303810504635, "grad_norm": 138.0, "learning_rate": 2.3223480947476825e-07, "logits/chosen": -0.31464844942092896, "logits/rejected": -0.18359375, "logps/chosen": -351.6000061035156, "logps/rejected": -291.6000061035156, "loss": 0.6523, "rewards/accuracies": 0.5706959962844849, "rewards/chosen": 0.2962890565395355, "rewards/margins": 0.16035155951976776, "rewards/rejected": 0.13632813096046448, "step": 2080 }, { "epoch": 0.5368177136972193, "grad_norm": 162.0, "learning_rate": 2.315911431513903e-07, "logits/chosen": -0.23427733778953552, "logits/rejected": -0.16474609076976776, "logps/chosen": -310.79998779296875, "logps/rejected": -286.79998779296875, "loss": 0.657, "rewards/accuracies": 0.5271428823471069, "rewards/chosen": 0.24179688096046448, "rewards/margins": 0.10549316555261612, "rewards/rejected": 0.13627929985523224, "step": 2085 }, { "epoch": 0.5381050463439753, "grad_norm": 135.0, "learning_rate": 2.3094747682801236e-07, "logits/chosen": -0.26953125, "logits/rejected": -0.35664063692092896, "logps/chosen": -285.79998779296875, "logps/rejected": -265.79998779296875, "loss": 0.6555, "rewards/accuracies": 0.6200000047683716, "rewards/chosen": 0.26191407442092896, "rewards/margins": 0.12648925185203552, "rewards/rejected": 0.13540038466453552, "step": 2090 }, { "epoch": 0.5393923789907312, "grad_norm": 141.0, "learning_rate": 2.3030381050463438e-07, "logits/chosen": -0.2884765565395355, "logits/rejected": -0.2474365234375, "logps/chosen": -270.79998779296875, "logps/rejected": -287.6000061035156, "loss": 0.6305, "rewards/accuracies": 0.6324999928474426, "rewards/chosen": 0.23642578721046448, "rewards/margins": 0.199951171875, "rewards/rejected": 0.0362548828125, "step": 2095 }, { "epoch": 0.5406797116374872, "grad_norm": 163.0, "learning_rate": 2.2966014418125642e-07, "logits/chosen": -0.3233398497104645, "logits/rejected": -0.33281248807907104, "logps/chosen": -303.20001220703125, "logps/rejected": -284.0, "loss": 0.643, "rewards/accuracies": 0.5645346641540527, "rewards/chosen": 0.234375, "rewards/margins": 0.11738280951976776, "rewards/rejected": 0.11669921875, "step": 2100 }, { "epoch": 0.541967044284243, "grad_norm": 312.0, "learning_rate": 2.2901647785787846e-07, "logits/chosen": -0.3441406190395355, "logits/rejected": -0.33671873807907104, "logps/chosen": -341.20001220703125, "logps/rejected": -330.3999938964844, "loss": 0.6414, "rewards/accuracies": 0.5755952596664429, "rewards/chosen": 0.30078125, "rewards/margins": 0.17758789658546448, "rewards/rejected": 0.12333984673023224, "step": 2105 }, { "epoch": 0.543254376930999, "grad_norm": 129.0, "learning_rate": 2.283728115345005e-07, "logits/chosen": -0.23886719346046448, "logits/rejected": -0.29511719942092896, "logps/chosen": -322.20001220703125, "logps/rejected": -267.20001220703125, "loss": 0.6422, "rewards/accuracies": 0.5951923131942749, "rewards/chosen": 0.2412109375, "rewards/margins": 0.16707763075828552, "rewards/rejected": 0.07436523586511612, "step": 2110 }, { "epoch": 0.5445417095777549, "grad_norm": 133.0, "learning_rate": 2.2772914521112255e-07, "logits/chosen": -0.2593750059604645, "logits/rejected": -0.19912108778953552, "logps/chosen": -326.79998779296875, "logps/rejected": -299.3999938964844, "loss": 0.618, "rewards/accuracies": 0.5959615707397461, "rewards/chosen": 0.291015625, "rewards/margins": 0.21728515625, "rewards/rejected": 0.07443659007549286, "step": 2115 }, { "epoch": 0.5458290422245108, "grad_norm": 196.0, "learning_rate": 2.2708547888774457e-07, "logits/chosen": -0.47734373807907104, "logits/rejected": -0.3968749940395355, "logps/chosen": -299.3999938964844, "logps/rejected": -256.0, "loss": 0.6383, "rewards/accuracies": 0.5967857241630554, "rewards/chosen": 0.30097657442092896, "rewards/margins": 0.15773925185203552, "rewards/rejected": 0.14340820908546448, "step": 2120 }, { "epoch": 0.5471163748712667, "grad_norm": 127.5, "learning_rate": 2.264418125643666e-07, "logits/chosen": -0.34199219942092896, "logits/rejected": -0.28522950410842896, "logps/chosen": -335.6000061035156, "logps/rejected": -329.20001220703125, "loss": 0.6117, "rewards/accuracies": 0.6057575941085815, "rewards/chosen": 0.3070312440395355, "rewards/margins": 0.23261718451976776, "rewards/rejected": 0.07421875, "step": 2125 }, { "epoch": 0.5484037075180227, "grad_norm": 177.0, "learning_rate": 2.2579814624098868e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.17851562798023224, "logps/chosen": -355.6000061035156, "logps/rejected": -288.3999938964844, "loss": 0.6219, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": 0.2081298828125, "rewards/margins": 0.19462890923023224, "rewards/rejected": 0.013531493954360485, "step": 2130 }, { "epoch": 0.5496910401647785, "grad_norm": 115.0, "learning_rate": 2.251544799176107e-07, "logits/chosen": -0.1572265625, "logits/rejected": -0.076385498046875, "logps/chosen": -314.0, "logps/rejected": -301.20001220703125, "loss": 0.6828, "rewards/accuracies": 0.49416667222976685, "rewards/chosen": 0.20058593153953552, "rewards/margins": 0.08812256157398224, "rewards/rejected": 0.11269531399011612, "step": 2135 }, { "epoch": 0.5509783728115345, "grad_norm": 146.0, "learning_rate": 2.2451081359423274e-07, "logits/chosen": -0.23906250298023224, "logits/rejected": -0.23173828423023224, "logps/chosen": -276.0, "logps/rejected": -318.79998779296875, "loss": 0.6406, "rewards/accuracies": 0.6108333468437195, "rewards/chosen": 0.20253905653953552, "rewards/margins": 0.14873047173023224, "rewards/rejected": 0.053985595703125, "step": 2140 }, { "epoch": 0.5522657054582905, "grad_norm": 115.0, "learning_rate": 2.2386714727085478e-07, "logits/chosen": -0.3070312440395355, "logits/rejected": -0.23701171576976776, "logps/chosen": -344.79998779296875, "logps/rejected": -266.6000061035156, "loss": 0.5875, "rewards/accuracies": 0.7160714268684387, "rewards/chosen": 0.31523436307907104, "rewards/margins": 0.2730468809604645, "rewards/rejected": 0.04221191257238388, "step": 2145 }, { "epoch": 0.5535530381050463, "grad_norm": 126.0, "learning_rate": 2.2322348094747682e-07, "logits/chosen": -0.2513671815395355, "logits/rejected": -0.20458984375, "logps/chosen": -291.6000061035156, "logps/rejected": -304.79998779296875, "loss": 0.6109, "rewards/accuracies": 0.6425000429153442, "rewards/chosen": 0.2669921815395355, "rewards/margins": 0.228515625, "rewards/rejected": 0.038543701171875, "step": 2150 }, { "epoch": 0.5548403707518023, "grad_norm": 122.0, "learning_rate": 2.2257981462409886e-07, "logits/chosen": -0.22084960341453552, "logits/rejected": -0.18488769233226776, "logps/chosen": -313.3999938964844, "logps/rejected": -277.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.5892424583435059, "rewards/chosen": 0.24355468153953552, "rewards/margins": 0.18476562201976776, "rewards/rejected": 0.05927734449505806, "step": 2155 }, { "epoch": 0.5561277033985582, "grad_norm": 207.0, "learning_rate": 2.219361483007209e-07, "logits/chosen": -0.18637695908546448, "logits/rejected": -0.1163330078125, "logps/chosen": -307.0, "logps/rejected": -312.6000061035156, "loss": 0.6172, "rewards/accuracies": 0.6019047498703003, "rewards/chosen": 0.22929687798023224, "rewards/margins": 0.19580078125, "rewards/rejected": 0.03354492038488388, "step": 2160 }, { "epoch": 0.5574150360453141, "grad_norm": 137.0, "learning_rate": 2.2129248197734292e-07, "logits/chosen": -0.3623046875, "logits/rejected": -0.3589843809604645, "logps/chosen": -366.20001220703125, "logps/rejected": -296.0, "loss": 0.6234, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.2874999940395355, "rewards/margins": 0.20742186903953552, "rewards/rejected": 0.08012695610523224, "step": 2165 }, { "epoch": 0.55870236869207, "grad_norm": 121.5, "learning_rate": 2.20648815653965e-07, "logits/chosen": -0.32441407442092896, "logits/rejected": -0.26884764432907104, "logps/chosen": -287.0, "logps/rejected": -257.20001220703125, "loss": 0.6281, "rewards/accuracies": 0.6452381014823914, "rewards/chosen": 0.2542968690395355, "rewards/margins": 0.16083984076976776, "rewards/rejected": 0.09309081733226776, "step": 2170 }, { "epoch": 0.559989701338826, "grad_norm": 119.0, "learning_rate": 2.20005149330587e-07, "logits/chosen": -0.33232420682907104, "logits/rejected": -0.30683594942092896, "logps/chosen": -272.0, "logps/rejected": -239.60000610351562, "loss": 0.6117, "rewards/accuracies": 0.5816666483879089, "rewards/chosen": 0.23300781846046448, "rewards/margins": 0.21210937201976776, "rewards/rejected": 0.02112426795065403, "step": 2175 }, { "epoch": 0.5612770339855818, "grad_norm": 135.0, "learning_rate": 2.1936148300720905e-07, "logits/chosen": -0.353515625, "logits/rejected": -0.3182128965854645, "logps/chosen": -292.3999938964844, "logps/rejected": -283.20001220703125, "loss": 0.6273, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 0.29462891817092896, "rewards/margins": 0.1865234375, "rewards/rejected": 0.10828246921300888, "step": 2180 }, { "epoch": 0.5625643666323378, "grad_norm": 139.0, "learning_rate": 2.187178166838311e-07, "logits/chosen": -0.32011717557907104, "logits/rejected": -0.37812501192092896, "logps/chosen": -273.79998779296875, "logps/rejected": -315.6000061035156, "loss": 0.6305, "rewards/accuracies": 0.6164102554321289, "rewards/chosen": 0.22324219346046448, "rewards/margins": 0.17343750596046448, "rewards/rejected": 0.04995117336511612, "step": 2185 }, { "epoch": 0.5638516992790937, "grad_norm": 148.0, "learning_rate": 2.1807415036045314e-07, "logits/chosen": -0.26093751192092896, "logits/rejected": -0.14433594048023224, "logps/chosen": -302.3999938964844, "logps/rejected": -297.6000061035156, "loss": 0.6281, "rewards/accuracies": 0.6150000095367432, "rewards/chosen": 0.21894530951976776, "rewards/margins": 0.17138671875, "rewards/rejected": 0.047607421875, "step": 2190 }, { "epoch": 0.5651390319258497, "grad_norm": 114.5, "learning_rate": 2.1743048403707518e-07, "logits/chosen": -0.18720702826976776, "logits/rejected": -0.3387207090854645, "logps/chosen": -276.6000061035156, "logps/rejected": -260.79998779296875, "loss": 0.6562, "rewards/accuracies": 0.6225000619888306, "rewards/chosen": 0.18898925185203552, "rewards/margins": 0.12285156548023224, "rewards/rejected": 0.06581268459558487, "step": 2195 }, { "epoch": 0.5664263645726055, "grad_norm": 138.0, "learning_rate": 2.1678681771369722e-07, "logits/chosen": -0.3555664122104645, "logits/rejected": -0.24980469048023224, "logps/chosen": -297.6000061035156, "logps/rejected": -288.20001220703125, "loss": 0.6164, "rewards/accuracies": 0.6110714673995972, "rewards/chosen": 0.24482421576976776, "rewards/margins": 0.21347656846046448, "rewards/rejected": 0.03115234337747097, "step": 2200 }, { "epoch": 0.5677136972193615, "grad_norm": 133.0, "learning_rate": 2.1614315139031924e-07, "logits/chosen": -0.510937511920929, "logits/rejected": -0.4527343809604645, "logps/chosen": -328.0, "logps/rejected": -329.79998779296875, "loss": 0.6695, "rewards/accuracies": 0.4828571677207947, "rewards/chosen": 0.21660156548023224, "rewards/margins": 0.07354736328125, "rewards/rejected": 0.14277343451976776, "step": 2205 }, { "epoch": 0.5690010298661174, "grad_norm": 128.0, "learning_rate": 2.154994850669413e-07, "logits/chosen": -0.25566405057907104, "logits/rejected": -0.14316406846046448, "logps/chosen": -297.20001220703125, "logps/rejected": -254.60000610351562, "loss": 0.6094, "rewards/accuracies": 0.6467949151992798, "rewards/chosen": 0.2578125, "rewards/margins": 0.22246094048023224, "rewards/rejected": 0.03558349609375, "step": 2210 }, { "epoch": 0.5702883625128733, "grad_norm": 117.0, "learning_rate": 2.1485581874356332e-07, "logits/chosen": -0.2914062440395355, "logits/rejected": -0.21108397841453552, "logps/chosen": -286.20001220703125, "logps/rejected": -284.79998779296875, "loss": 0.6562, "rewards/accuracies": 0.5967857241630554, "rewards/chosen": 0.21738281846046448, "rewards/margins": 0.13034668564796448, "rewards/rejected": 0.08681640774011612, "step": 2215 }, { "epoch": 0.5715756951596292, "grad_norm": 118.0, "learning_rate": 2.1421215242018537e-07, "logits/chosen": -0.15874023735523224, "logits/rejected": -0.201171875, "logps/chosen": -256.6000061035156, "logps/rejected": -248.8000030517578, "loss": 0.6008, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 0.2421875, "rewards/margins": 0.24960938096046448, "rewards/rejected": -0.007702636532485485, "step": 2220 }, { "epoch": 0.5728630278063852, "grad_norm": 114.0, "learning_rate": 2.135684860968074e-07, "logits/chosen": -0.4984374940395355, "logits/rejected": -0.361328125, "logps/chosen": -281.79998779296875, "logps/rejected": -337.0, "loss": 0.6266, "rewards/accuracies": 0.5915934443473816, "rewards/chosen": 0.20527343451976776, "rewards/margins": 0.1611328125, "rewards/rejected": 0.04399413987994194, "step": 2225 }, { "epoch": 0.574150360453141, "grad_norm": 111.5, "learning_rate": 2.1292481977342942e-07, "logits/chosen": -0.21953125298023224, "logits/rejected": -0.1669921875, "logps/chosen": -296.3999938964844, "logps/rejected": -276.20001220703125, "loss": 0.6055, "rewards/accuracies": 0.746666669845581, "rewards/chosen": 0.26191407442092896, "rewards/margins": 0.22499999403953552, "rewards/rejected": 0.03617553785443306, "step": 2230 }, { "epoch": 0.575437693099897, "grad_norm": 128.0, "learning_rate": 2.122811534500515e-07, "logits/chosen": -0.24617919325828552, "logits/rejected": -0.23818358778953552, "logps/chosen": -305.79998779296875, "logps/rejected": -264.0, "loss": 0.6539, "rewards/accuracies": 0.5766667127609253, "rewards/chosen": 0.22407226264476776, "rewards/margins": 0.12832030653953552, "rewards/rejected": 0.09589691460132599, "step": 2235 }, { "epoch": 0.576725025746653, "grad_norm": 142.0, "learning_rate": 2.1163748712667354e-07, "logits/chosen": -0.4625000059604645, "logits/rejected": -0.29121094942092896, "logps/chosen": -314.0, "logps/rejected": -271.3999938964844, "loss": 0.6508, "rewards/accuracies": 0.49897441267967224, "rewards/chosen": 0.22343750298023224, "rewards/margins": 0.12001953274011612, "rewards/rejected": 0.10344238579273224, "step": 2240 }, { "epoch": 0.5780123583934088, "grad_norm": 108.0, "learning_rate": 2.1099382080329555e-07, "logits/chosen": -0.40253907442092896, "logits/rejected": -0.3721679747104645, "logps/chosen": -329.79998779296875, "logps/rejected": -276.0, "loss": 0.6195, "rewards/accuracies": 0.7089285850524902, "rewards/chosen": 0.28828126192092896, "rewards/margins": 0.20224609971046448, "rewards/rejected": 0.08591308444738388, "step": 2245 }, { "epoch": 0.5792996910401648, "grad_norm": 138.0, "learning_rate": 2.103501544799176e-07, "logits/chosen": -0.12812499701976776, "logits/rejected": -0.28144532442092896, "logps/chosen": -266.3999938964844, "logps/rejected": -312.20001220703125, "loss": 0.6797, "rewards/accuracies": 0.5805769562721252, "rewards/chosen": 0.18320313096046448, "rewards/margins": 0.0677490234375, "rewards/rejected": 0.1153564453125, "step": 2250 }, { "epoch": 0.5805870236869207, "grad_norm": 95.0, "learning_rate": 2.0970648815653964e-07, "logits/chosen": -0.35078126192092896, "logits/rejected": -0.18242187798023224, "logps/chosen": -359.3999938964844, "logps/rejected": -282.0, "loss": 0.5898, "rewards/accuracies": 0.7230128049850464, "rewards/chosen": 0.34160155057907104, "rewards/margins": 0.2713867127895355, "rewards/rejected": 0.07014159858226776, "step": 2255 }, { "epoch": 0.5818743563336766, "grad_norm": 123.0, "learning_rate": 2.0906282183316168e-07, "logits/chosen": -0.27460938692092896, "logits/rejected": -0.2674804627895355, "logps/chosen": -301.20001220703125, "logps/rejected": -267.0, "loss": 0.6867, "rewards/accuracies": 0.5175000429153442, "rewards/chosen": 0.2876953184604645, "rewards/margins": 0.06279297173023224, "rewards/rejected": 0.22519531846046448, "step": 2260 }, { "epoch": 0.5831616889804325, "grad_norm": 184.0, "learning_rate": 2.0841915550978372e-07, "logits/chosen": -0.2542968690395355, "logits/rejected": -0.36113280057907104, "logps/chosen": -301.6000061035156, "logps/rejected": -268.20001220703125, "loss": 0.6055, "rewards/accuracies": 0.6441667079925537, "rewards/chosen": 0.28105467557907104, "rewards/margins": 0.22578124701976776, "rewards/rejected": 0.05576171725988388, "step": 2265 }, { "epoch": 0.5844490216271885, "grad_norm": 117.0, "learning_rate": 2.0777548918640574e-07, "logits/chosen": -0.24482421576976776, "logits/rejected": -0.14990234375, "logps/chosen": -272.3999938964844, "logps/rejected": -259.20001220703125, "loss": 0.6391, "rewards/accuracies": 0.6758333444595337, "rewards/chosen": 0.22871093451976776, "rewards/margins": 0.1495361328125, "rewards/rejected": 0.07915039360523224, "step": 2270 }, { "epoch": 0.5857363542739444, "grad_norm": 135.0, "learning_rate": 2.071318228630278e-07, "logits/chosen": -0.3687500059604645, "logits/rejected": -0.32539063692092896, "logps/chosen": -335.20001220703125, "logps/rejected": -323.6000061035156, "loss": 0.632, "rewards/accuracies": 0.5815476179122925, "rewards/chosen": 0.2564453184604645, "rewards/margins": 0.15476074814796448, "rewards/rejected": 0.101806640625, "step": 2275 }, { "epoch": 0.5870236869207003, "grad_norm": 111.0, "learning_rate": 2.0648815653964985e-07, "logits/chosen": -0.11171875149011612, "logits/rejected": -0.16958007216453552, "logps/chosen": -253.39999389648438, "logps/rejected": -245.39999389648438, "loss": 0.6539, "rewards/accuracies": 0.5774999856948853, "rewards/chosen": 0.13906249403953552, "rewards/margins": 0.10507812350988388, "rewards/rejected": 0.03404540941119194, "step": 2280 }, { "epoch": 0.5883110195674562, "grad_norm": 113.0, "learning_rate": 2.0584449021627187e-07, "logits/chosen": -0.38593751192092896, "logits/rejected": -0.26416015625, "logps/chosen": -351.3999938964844, "logps/rejected": -298.79998779296875, "loss": 0.6453, "rewards/accuracies": 0.5622619390487671, "rewards/chosen": 0.27851563692092896, "rewards/margins": 0.13608399033546448, "rewards/rejected": 0.142333984375, "step": 2285 }, { "epoch": 0.5895983522142122, "grad_norm": 107.0, "learning_rate": 2.052008238928939e-07, "logits/chosen": -0.43632811307907104, "logits/rejected": -0.39863282442092896, "logps/chosen": -328.79998779296875, "logps/rejected": -329.20001220703125, "loss": 0.6273, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2841796875, "rewards/margins": 0.1865234375, "rewards/rejected": 0.09804687649011612, "step": 2290 }, { "epoch": 0.590885684860968, "grad_norm": 122.5, "learning_rate": 2.0455715756951595e-07, "logits/chosen": -0.20986327528953552, "logits/rejected": -0.2481689453125, "logps/chosen": -285.79998779296875, "logps/rejected": -300.0, "loss": 0.668, "rewards/accuracies": 0.5903571844100952, "rewards/chosen": 0.190673828125, "rewards/margins": 0.09038086235523224, "rewards/rejected": 0.10019531100988388, "step": 2295 }, { "epoch": 0.592173017507724, "grad_norm": 135.0, "learning_rate": 2.03913491246138e-07, "logits/chosen": -0.2886718809604645, "logits/rejected": -0.36357420682907104, "logps/chosen": -324.0, "logps/rejected": -314.79998779296875, "loss": 0.6266, "rewards/accuracies": 0.603205144405365, "rewards/chosen": 0.35332030057907104, "rewards/margins": 0.20126953721046448, "rewards/rejected": 0.15263672173023224, "step": 2300 }, { "epoch": 0.59346035015448, "grad_norm": 119.5, "learning_rate": 2.0326982492276004e-07, "logits/chosen": -0.25273436307907104, "logits/rejected": -0.24062499403953552, "logps/chosen": -357.20001220703125, "logps/rejected": -360.0, "loss": 0.6234, "rewards/accuracies": 0.6634523868560791, "rewards/chosen": 0.3001953065395355, "rewards/margins": 0.19179686903953552, "rewards/rejected": 0.108551025390625, "step": 2305 }, { "epoch": 0.5947476828012358, "grad_norm": 123.5, "learning_rate": 2.0262615859938205e-07, "logits/chosen": -0.24951171875, "logits/rejected": -0.18603515625, "logps/chosen": -359.3999938964844, "logps/rejected": -339.20001220703125, "loss": 0.6453, "rewards/accuracies": 0.625, "rewards/chosen": 0.26542967557907104, "rewards/margins": 0.15422363579273224, "rewards/rejected": 0.11066894233226776, "step": 2310 }, { "epoch": 0.5960350154479918, "grad_norm": 107.5, "learning_rate": 2.0198249227600412e-07, "logits/chosen": -0.28046876192092896, "logits/rejected": -0.3031249940395355, "logps/chosen": -314.0, "logps/rejected": -263.79998779296875, "loss": 0.6508, "rewards/accuracies": 0.5641666650772095, "rewards/chosen": 0.21035155653953552, "rewards/margins": 0.12236328423023224, "rewards/rejected": 0.08833007514476776, "step": 2315 }, { "epoch": 0.5973223480947477, "grad_norm": 103.5, "learning_rate": 2.0133882595262617e-07, "logits/chosen": -0.3492187559604645, "logits/rejected": -0.3423828184604645, "logps/chosen": -314.3999938964844, "logps/rejected": -282.79998779296875, "loss": 0.6312, "rewards/accuracies": 0.6555768847465515, "rewards/chosen": 0.20888671278953552, "rewards/margins": 0.16484375298023224, "rewards/rejected": 0.0438232421875, "step": 2320 }, { "epoch": 0.5986096807415036, "grad_norm": 123.5, "learning_rate": 2.0069515962924818e-07, "logits/chosen": -0.3412109315395355, "logits/rejected": -0.30488282442092896, "logps/chosen": -259.79998779296875, "logps/rejected": -299.6000061035156, "loss": 0.6375, "rewards/accuracies": 0.591785728931427, "rewards/chosen": 0.23886719346046448, "rewards/margins": 0.14326171576976776, "rewards/rejected": 0.09556885063648224, "step": 2325 }, { "epoch": 0.5998970133882595, "grad_norm": 132.0, "learning_rate": 2.0005149330587023e-07, "logits/chosen": -0.46875, "logits/rejected": -0.39765626192092896, "logps/chosen": -268.0, "logps/rejected": -243.39999389648438, "loss": 0.6336, "rewards/accuracies": 0.5577020049095154, "rewards/chosen": 0.23476561903953552, "rewards/margins": 0.13632813096046448, "rewards/rejected": 0.09877929836511612, "step": 2330 }, { "epoch": 0.6011843460350155, "grad_norm": 139.0, "learning_rate": 1.9940782698249227e-07, "logits/chosen": -0.3701171875, "logits/rejected": -0.208984375, "logps/chosen": -283.20001220703125, "logps/rejected": -318.79998779296875, "loss": 0.6219, "rewards/accuracies": 0.59492427110672, "rewards/chosen": 0.23457030951976776, "rewards/margins": 0.20097656548023224, "rewards/rejected": 0.03359375149011612, "step": 2335 }, { "epoch": 0.6024716786817713, "grad_norm": 172.0, "learning_rate": 1.987641606591143e-07, "logits/chosen": -0.11964721977710724, "logits/rejected": -0.11005859076976776, "logps/chosen": -288.79998779296875, "logps/rejected": -286.79998779296875, "loss": 0.6438, "rewards/accuracies": 0.6058333516120911, "rewards/chosen": 0.23151855170726776, "rewards/margins": 0.13203124701976776, "rewards/rejected": 0.09919433295726776, "step": 2340 }, { "epoch": 0.6037590113285273, "grad_norm": 136.0, "learning_rate": 1.9812049433573635e-07, "logits/chosen": -0.35175782442092896, "logits/rejected": -0.3746093809604645, "logps/chosen": -314.6000061035156, "logps/rejected": -295.0, "loss": 0.643, "rewards/accuracies": 0.5970238447189331, "rewards/chosen": 0.2818359434604645, "rewards/margins": 0.15170899033546448, "rewards/rejected": 0.13036498427391052, "step": 2345 }, { "epoch": 0.6050463439752832, "grad_norm": 114.0, "learning_rate": 1.9747682801235837e-07, "logits/chosen": -0.3929687440395355, "logits/rejected": -0.37187498807907104, "logps/chosen": -310.3999938964844, "logps/rejected": -313.6000061035156, "loss": 0.6336, "rewards/accuracies": 0.5340659022331238, "rewards/chosen": 0.2598632872104645, "rewards/margins": 0.1552734375, "rewards/rejected": 0.10455627739429474, "step": 2350 }, { "epoch": 0.6063336766220392, "grad_norm": 124.0, "learning_rate": 1.9683316168898044e-07, "logits/chosen": -0.24008789658546448, "logits/rejected": -0.22197265923023224, "logps/chosen": -314.3999938964844, "logps/rejected": -289.3999938964844, "loss": 0.5836, "rewards/accuracies": 0.6976190805435181, "rewards/chosen": 0.3115234375, "rewards/margins": 0.28339844942092896, "rewards/rejected": 0.02756347693502903, "step": 2355 }, { "epoch": 0.607621009268795, "grad_norm": 174.0, "learning_rate": 1.9618949536560248e-07, "logits/chosen": -0.251953125, "logits/rejected": -0.24052734673023224, "logps/chosen": -283.6000061035156, "logps/rejected": -301.3999938964844, "loss": 0.6742, "rewards/accuracies": 0.5617856979370117, "rewards/chosen": 0.16337890923023224, "rewards/margins": 0.07666015625, "rewards/rejected": 0.08630981296300888, "step": 2360 }, { "epoch": 0.608908341915551, "grad_norm": 121.0, "learning_rate": 1.955458290422245e-07, "logits/chosen": -0.43535155057907104, "logits/rejected": -0.38398438692092896, "logps/chosen": -273.6000061035156, "logps/rejected": -244.89999389648438, "loss": 0.6031, "rewards/accuracies": 0.683104395866394, "rewards/chosen": 0.2835937440395355, "rewards/margins": 0.22382812201976776, "rewards/rejected": 0.06002197414636612, "step": 2365 }, { "epoch": 0.6101956745623069, "grad_norm": 118.0, "learning_rate": 1.9490216271884654e-07, "logits/chosen": -0.3150390684604645, "logits/rejected": -0.13325195014476776, "logps/chosen": -291.3999938964844, "logps/rejected": -273.6000061035156, "loss": 0.6297, "rewards/accuracies": 0.6469780206680298, "rewards/chosen": 0.2548828125, "rewards/margins": 0.19687500596046448, "rewards/rejected": 0.05744628980755806, "step": 2370 }, { "epoch": 0.6114830072090628, "grad_norm": 114.0, "learning_rate": 1.9425849639546856e-07, "logits/chosen": -0.216064453125, "logits/rejected": -0.18017578125, "logps/chosen": -270.6000061035156, "logps/rejected": -288.6000061035156, "loss": 0.6242, "rewards/accuracies": 0.6301282048225403, "rewards/chosen": 0.26679688692092896, "rewards/margins": 0.18339844048023224, "rewards/rejected": 0.08334960788488388, "step": 2375 }, { "epoch": 0.6127703398558187, "grad_norm": 118.5, "learning_rate": 1.9361483007209063e-07, "logits/chosen": -0.4378906190395355, "logits/rejected": -0.3939453065395355, "logps/chosen": -321.20001220703125, "logps/rejected": -291.0, "loss": 0.6047, "rewards/accuracies": 0.6899999976158142, "rewards/chosen": 0.3363281190395355, "rewards/margins": 0.23291015625, "rewards/rejected": 0.10312499850988388, "step": 2380 }, { "epoch": 0.6140576725025747, "grad_norm": 126.0, "learning_rate": 1.9297116374871267e-07, "logits/chosen": -0.349609375, "logits/rejected": -0.2981933653354645, "logps/chosen": -276.3999938964844, "logps/rejected": -243.1999969482422, "loss": 0.6352, "rewards/accuracies": 0.5416666269302368, "rewards/chosen": 0.20781250298023224, "rewards/margins": 0.1640625, "rewards/rejected": 0.04412994533777237, "step": 2385 }, { "epoch": 0.6153450051493305, "grad_norm": 169.0, "learning_rate": 1.9232749742533468e-07, "logits/chosen": -0.24384765326976776, "logits/rejected": -0.3519531190395355, "logps/chosen": -258.6000061035156, "logps/rejected": -256.79998779296875, "loss": 0.6227, "rewards/accuracies": 0.5800000429153442, "rewards/chosen": 0.2798828184604645, "rewards/margins": 0.18857422471046448, "rewards/rejected": 0.09088134765625, "step": 2390 }, { "epoch": 0.6166323377960865, "grad_norm": 114.5, "learning_rate": 1.9168383110195673e-07, "logits/chosen": -0.16562500596046448, "logits/rejected": -0.1546630859375, "logps/chosen": -331.3999938964844, "logps/rejected": -309.20001220703125, "loss": 0.632, "rewards/accuracies": 0.6833333373069763, "rewards/chosen": 0.22236327826976776, "rewards/margins": 0.18613281846046448, "rewards/rejected": 0.0361328125, "step": 2395 }, { "epoch": 0.6179196704428425, "grad_norm": 158.0, "learning_rate": 1.910401647785788e-07, "logits/chosen": -0.31245118379592896, "logits/rejected": -0.14970703423023224, "logps/chosen": -264.79998779296875, "logps/rejected": -282.6000061035156, "loss": 0.6461, "rewards/accuracies": 0.5732600688934326, "rewards/chosen": 0.26152342557907104, "rewards/margins": 0.15078124403953552, "rewards/rejected": 0.11105956882238388, "step": 2400 }, { "epoch": 0.6192070030895983, "grad_norm": 108.0, "learning_rate": 1.903964984552008e-07, "logits/chosen": -0.2681640684604645, "logits/rejected": -0.19594725966453552, "logps/chosen": -308.20001220703125, "logps/rejected": -253.60000610351562, "loss": 0.6523, "rewards/accuracies": 0.4975000023841858, "rewards/chosen": 0.21230468153953552, "rewards/margins": 0.10371093451976776, "rewards/rejected": 0.1083984375, "step": 2405 }, { "epoch": 0.6204943357363543, "grad_norm": 128.0, "learning_rate": 1.8975283213182286e-07, "logits/chosen": -0.24959716200828552, "logits/rejected": -0.20306396484375, "logps/chosen": -341.6000061035156, "logps/rejected": -281.0, "loss": 0.5828, "rewards/accuracies": 0.7091666460037231, "rewards/chosen": 0.2593750059604645, "rewards/margins": 0.28496092557907104, "rewards/rejected": -0.02534179762005806, "step": 2410 }, { "epoch": 0.6217816683831102, "grad_norm": 119.5, "learning_rate": 1.8910916580844487e-07, "logits/chosen": -0.3779296875, "logits/rejected": -0.28217774629592896, "logps/chosen": -326.3999938964844, "logps/rejected": -285.20001220703125, "loss": 0.6539, "rewards/accuracies": 0.49272727966308594, "rewards/chosen": 0.22148437798023224, "rewards/margins": 0.12604980170726776, "rewards/rejected": 0.09564208984375, "step": 2415 }, { "epoch": 0.6230690010298661, "grad_norm": 127.5, "learning_rate": 1.8846549948506694e-07, "logits/chosen": -0.3687500059604645, "logits/rejected": -0.2603515684604645, "logps/chosen": -292.0, "logps/rejected": -274.0, "loss": 0.6375, "rewards/accuracies": 0.5742856860160828, "rewards/chosen": 0.26249998807907104, "rewards/margins": 0.16384276747703552, "rewards/rejected": 0.09892578423023224, "step": 2420 }, { "epoch": 0.624356333676622, "grad_norm": 112.5, "learning_rate": 1.8782183316168898e-07, "logits/chosen": -0.26347655057907104, "logits/rejected": -0.143157958984375, "logps/chosen": -310.0, "logps/rejected": -297.79998779296875, "loss": 0.6102, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.184326171875, "rewards/margins": 0.20771484076976776, "rewards/rejected": -0.02341308631002903, "step": 2425 }, { "epoch": 0.625643666323378, "grad_norm": 130.0, "learning_rate": 1.87178166838311e-07, "logits/chosen": -0.2835693359375, "logits/rejected": -0.2470703125, "logps/chosen": -311.79998779296875, "logps/rejected": -287.3999938964844, "loss": 0.6891, "rewards/accuracies": 0.4752778112888336, "rewards/chosen": 0.22890624403953552, "rewards/margins": 0.05367431789636612, "rewards/rejected": 0.17512206733226776, "step": 2430 }, { "epoch": 0.6269309989701339, "grad_norm": 170.0, "learning_rate": 1.8653450051493304e-07, "logits/chosen": -0.3558593690395355, "logits/rejected": -0.38203126192092896, "logps/chosen": -316.79998779296875, "logps/rejected": -293.20001220703125, "loss": 0.6508, "rewards/accuracies": 0.608589768409729, "rewards/chosen": 0.25175780057907104, "rewards/margins": 0.13405761122703552, "rewards/rejected": 0.11777343600988388, "step": 2435 }, { "epoch": 0.6282183316168898, "grad_norm": 140.0, "learning_rate": 1.858908341915551e-07, "logits/chosen": -0.32792967557907104, "logits/rejected": -0.29609376192092896, "logps/chosen": -308.3999938964844, "logps/rejected": -322.3999938964844, "loss": 0.6617, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.21367187798023224, "rewards/margins": 0.10598144680261612, "rewards/rejected": 0.10795898735523224, "step": 2440 }, { "epoch": 0.6295056642636457, "grad_norm": 151.0, "learning_rate": 1.8524716786817713e-07, "logits/chosen": -0.18032225966453552, "logits/rejected": -0.1318359375, "logps/chosen": -271.3999938964844, "logps/rejected": -287.79998779296875, "loss": 0.6633, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1767578125, "rewards/margins": 0.10954590141773224, "rewards/rejected": 0.067138671875, "step": 2445 }, { "epoch": 0.6307929969104017, "grad_norm": 148.0, "learning_rate": 1.8460350154479917e-07, "logits/chosen": -0.28388673067092896, "logits/rejected": -0.3207031190395355, "logps/chosen": -302.20001220703125, "logps/rejected": -266.0, "loss": 0.6, "rewards/accuracies": 0.6861904859542847, "rewards/chosen": 0.2789062559604645, "rewards/margins": 0.22763672471046448, "rewards/rejected": 0.05117187649011612, "step": 2450 }, { "epoch": 0.6320803295571575, "grad_norm": 123.5, "learning_rate": 1.8395983522142119e-07, "logits/chosen": -0.19809570908546448, "logits/rejected": -0.22705078125, "logps/chosen": -298.3999938964844, "logps/rejected": -307.0, "loss": 0.6039, "rewards/accuracies": 0.6341666579246521, "rewards/chosen": 0.24941405653953552, "rewards/margins": 0.22519531846046448, "rewards/rejected": 0.02451171912252903, "step": 2455 }, { "epoch": 0.6333676622039135, "grad_norm": 126.5, "learning_rate": 1.8331616889804326e-07, "logits/chosen": -0.29609376192092896, "logits/rejected": -0.17280273139476776, "logps/chosen": -310.79998779296875, "logps/rejected": -316.3999938964844, "loss": 0.6148, "rewards/accuracies": 0.6956044435501099, "rewards/chosen": 0.2627929747104645, "rewards/margins": 0.2158203125, "rewards/rejected": 0.04755859449505806, "step": 2460 }, { "epoch": 0.6346549948506695, "grad_norm": 158.0, "learning_rate": 1.826725025746653e-07, "logits/chosen": -0.41035157442092896, "logits/rejected": -0.3331542909145355, "logps/chosen": -348.3999938964844, "logps/rejected": -356.79998779296875, "loss": 0.6547, "rewards/accuracies": 0.48416668176651, "rewards/chosen": 0.23476561903953552, "rewards/margins": 0.13349609076976776, "rewards/rejected": 0.10088501125574112, "step": 2465 }, { "epoch": 0.6359423274974253, "grad_norm": 116.0, "learning_rate": 1.8202883625128731e-07, "logits/chosen": -0.33564454317092896, "logits/rejected": -0.25849610567092896, "logps/chosen": -294.79998779296875, "logps/rejected": -285.3999938964844, "loss": 0.6242, "rewards/accuracies": 0.6504700779914856, "rewards/chosen": 0.2769531309604645, "rewards/margins": 0.1939697265625, "rewards/rejected": 0.08228759467601776, "step": 2470 }, { "epoch": 0.6372296601441813, "grad_norm": 119.5, "learning_rate": 1.8138516992790936e-07, "logits/chosen": -0.23100586235523224, "logits/rejected": -0.23964843153953552, "logps/chosen": -315.6000061035156, "logps/rejected": -299.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.559166669845581, "rewards/chosen": 0.22832031548023224, "rewards/margins": 0.16127929091453552, "rewards/rejected": 0.06646728515625, "step": 2475 }, { "epoch": 0.6385169927909372, "grad_norm": 122.5, "learning_rate": 1.8074150360453143e-07, "logits/chosen": -0.20756836235523224, "logits/rejected": -0.20175781846046448, "logps/chosen": -293.79998779296875, "logps/rejected": -310.79998779296875, "loss": 0.6148, "rewards/accuracies": 0.6394047737121582, "rewards/chosen": 0.24599608778953552, "rewards/margins": 0.20502929389476776, "rewards/rejected": 0.04074706882238388, "step": 2480 }, { "epoch": 0.6398043254376931, "grad_norm": 133.0, "learning_rate": 1.8009783728115344e-07, "logits/chosen": -0.27763670682907104, "logits/rejected": -0.29267579317092896, "logps/chosen": -306.6000061035156, "logps/rejected": -291.3999938964844, "loss": 0.6453, "rewards/accuracies": 0.51583331823349, "rewards/chosen": 0.22832031548023224, "rewards/margins": 0.13798828423023224, "rewards/rejected": 0.09038086235523224, "step": 2485 }, { "epoch": 0.641091658084449, "grad_norm": 140.0, "learning_rate": 1.7945417095777549e-07, "logits/chosen": -0.26787108182907104, "logits/rejected": -0.21689453721046448, "logps/chosen": -233.39999389648438, "logps/rejected": -256.3999938964844, "loss": 0.6328, "rewards/accuracies": 0.6328571438789368, "rewards/chosen": 0.20937499403953552, "rewards/margins": 0.15712890028953552, "rewards/rejected": 0.05244140699505806, "step": 2490 }, { "epoch": 0.642378990731205, "grad_norm": 131.0, "learning_rate": 1.788105046343975e-07, "logits/chosen": -0.26044923067092896, "logits/rejected": -0.2787109315395355, "logps/chosen": -272.6000061035156, "logps/rejected": -258.0, "loss": 0.6312, "rewards/accuracies": 0.6661363840103149, "rewards/chosen": 0.21953125298023224, "rewards/margins": 0.18105468153953552, "rewards/rejected": 0.03872070461511612, "step": 2495 }, { "epoch": 0.6436663233779608, "grad_norm": 124.0, "learning_rate": 1.7816683831101954e-07, "logits/chosen": -0.2992187440395355, "logits/rejected": -0.2769531309604645, "logps/chosen": -338.6000061035156, "logps/rejected": -306.6000061035156, "loss": 0.6484, "rewards/accuracies": 0.571309506893158, "rewards/chosen": 0.3095703125, "rewards/margins": 0.144287109375, "rewards/rejected": 0.16547851264476776, "step": 2500 }, { "epoch": 0.6449536560247168, "grad_norm": 135.0, "learning_rate": 1.7752317198764161e-07, "logits/chosen": -0.211669921875, "logits/rejected": -0.2611328065395355, "logps/chosen": -309.3999938964844, "logps/rejected": -255.0, "loss": 0.6398, "rewards/accuracies": 0.6275458335876465, "rewards/chosen": 0.22705078125, "rewards/margins": 0.1673603057861328, "rewards/rejected": 0.05999755859375, "step": 2505 }, { "epoch": 0.6462409886714727, "grad_norm": 120.5, "learning_rate": 1.7687950566426363e-07, "logits/chosen": -0.36640626192092896, "logits/rejected": -0.3291015625, "logps/chosen": -308.79998779296875, "logps/rejected": -260.3999938964844, "loss": 0.657, "rewards/accuracies": 0.6071428656578064, "rewards/chosen": 0.3070312440395355, "rewards/margins": 0.13107910752296448, "rewards/rejected": 0.17615966498851776, "step": 2510 }, { "epoch": 0.6475283213182287, "grad_norm": 114.5, "learning_rate": 1.7623583934088567e-07, "logits/chosen": -0.3169921934604645, "logits/rejected": -0.31328123807907104, "logps/chosen": -386.3999938964844, "logps/rejected": -298.6000061035156, "loss": 0.6867, "rewards/accuracies": 0.5786904692649841, "rewards/chosen": 0.2314453125, "rewards/margins": 0.10615234076976776, "rewards/rejected": 0.12553711235523224, "step": 2515 }, { "epoch": 0.6488156539649845, "grad_norm": 120.0, "learning_rate": 1.7559217301750771e-07, "logits/chosen": -0.3636718690395355, "logits/rejected": -0.33281248807907104, "logps/chosen": -320.0, "logps/rejected": -300.6000061035156, "loss": 0.6305, "rewards/accuracies": 0.6175000071525574, "rewards/chosen": 0.2939453125, "rewards/margins": 0.18100586533546448, "rewards/rejected": 0.113525390625, "step": 2520 }, { "epoch": 0.6501029866117405, "grad_norm": 129.0, "learning_rate": 1.7494850669412976e-07, "logits/chosen": -0.23593750596046448, "logits/rejected": -0.22353515028953552, "logps/chosen": -329.79998779296875, "logps/rejected": -295.3999938964844, "loss": 0.6078, "rewards/accuracies": 0.6535714268684387, "rewards/chosen": 0.25146484375, "rewards/margins": 0.23886719346046448, "rewards/rejected": 0.012939453125, "step": 2525 }, { "epoch": 0.6513903192584964, "grad_norm": 171.0, "learning_rate": 1.743048403707518e-07, "logits/chosen": -0.3004394471645355, "logits/rejected": -0.22709961235523224, "logps/chosen": -328.3999938964844, "logps/rejected": -312.0, "loss": 0.5789, "rewards/accuracies": 0.6791666746139526, "rewards/chosen": 0.2630859315395355, "rewards/margins": 0.2757812440395355, "rewards/rejected": -0.01259765587747097, "step": 2530 }, { "epoch": 0.6526776519052523, "grad_norm": 116.0, "learning_rate": 1.7366117404737382e-07, "logits/chosen": -0.28193360567092896, "logits/rejected": -0.30859375, "logps/chosen": -354.20001220703125, "logps/rejected": -249.60000610351562, "loss": 0.6391, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2591796815395355, "rewards/margins": 0.16162109375, "rewards/rejected": 0.0975799560546875, "step": 2535 }, { "epoch": 0.6539649845520082, "grad_norm": 102.5, "learning_rate": 1.7301750772399586e-07, "logits/chosen": -0.33320313692092896, "logits/rejected": -0.24843749403953552, "logps/chosen": -325.6000061035156, "logps/rejected": -283.3999938964844, "loss": 0.6523, "rewards/accuracies": 0.5976190567016602, "rewards/chosen": 0.22429199516773224, "rewards/margins": 0.15625, "rewards/rejected": 0.06790771335363388, "step": 2540 }, { "epoch": 0.6552523171987642, "grad_norm": 127.5, "learning_rate": 1.7237384140061793e-07, "logits/chosen": -0.22480468451976776, "logits/rejected": -0.14948730170726776, "logps/chosen": -228.39999389648438, "logps/rejected": -229.0, "loss": 0.6461, "rewards/accuracies": 0.5199999809265137, "rewards/chosen": 0.13505859673023224, "rewards/margins": 0.12480469048023224, "rewards/rejected": 0.010693359188735485, "step": 2545 }, { "epoch": 0.65653964984552, "grad_norm": 122.5, "learning_rate": 1.7173017507723994e-07, "logits/chosen": -0.17607422173023224, "logits/rejected": -0.16914062201976776, "logps/chosen": -289.79998779296875, "logps/rejected": -262.3999938964844, "loss": 0.6164, "rewards/accuracies": 0.6550000309944153, "rewards/chosen": 0.2630859315395355, "rewards/margins": 0.198974609375, "rewards/rejected": 0.06443633884191513, "step": 2550 }, { "epoch": 0.657826982492276, "grad_norm": 213.0, "learning_rate": 1.71086508753862e-07, "logits/chosen": -0.4820312559604645, "logits/rejected": -0.2734375, "logps/chosen": -310.20001220703125, "logps/rejected": -259.20001220703125, "loss": 0.6188, "rewards/accuracies": 0.5779762268066406, "rewards/chosen": 0.2640624940395355, "rewards/margins": 0.20131835341453552, "rewards/rejected": 0.06269530951976776, "step": 2555 }, { "epoch": 0.659114315139032, "grad_norm": 132.0, "learning_rate": 1.7044284243048403e-07, "logits/chosen": -0.30908203125, "logits/rejected": -0.3310546875, "logps/chosen": -281.6000061035156, "logps/rejected": -299.6000061035156, "loss": 0.6719, "rewards/accuracies": 0.5676923394203186, "rewards/chosen": 0.18359375, "rewards/margins": 0.07900390774011612, "rewards/rejected": 0.104705810546875, "step": 2560 }, { "epoch": 0.6604016477857878, "grad_norm": 111.0, "learning_rate": 1.6979917610710607e-07, "logits/chosen": -0.32744139432907104, "logits/rejected": -0.40410155057907104, "logps/chosen": -271.3999938964844, "logps/rejected": -291.20001220703125, "loss": 0.6352, "rewards/accuracies": 0.5321428179740906, "rewards/chosen": 0.2613281309604645, "rewards/margins": 0.13786621391773224, "rewards/rejected": 0.12363281100988388, "step": 2565 }, { "epoch": 0.6616889804325438, "grad_norm": 133.0, "learning_rate": 1.6915550978372812e-07, "logits/chosen": -0.28642576932907104, "logits/rejected": -0.24995116889476776, "logps/chosen": -370.79998779296875, "logps/rejected": -291.3999938964844, "loss": 0.6164, "rewards/accuracies": 0.6895238161087036, "rewards/chosen": 0.2865234315395355, "rewards/margins": 0.20820312201976776, "rewards/rejected": 0.07807616889476776, "step": 2570 }, { "epoch": 0.6629763130792997, "grad_norm": 204.0, "learning_rate": 1.6851184346035013e-07, "logits/chosen": -0.35566407442092896, "logits/rejected": -0.2518554627895355, "logps/chosen": -338.3999938964844, "logps/rejected": -254.1999969482422, "loss": 0.6297, "rewards/accuracies": 0.6309524178504944, "rewards/chosen": 0.2841796875, "rewards/margins": 0.17539063096046448, "rewards/rejected": 0.108154296875, "step": 2575 }, { "epoch": 0.6642636457260556, "grad_norm": 122.0, "learning_rate": 1.6786817713697217e-07, "logits/chosen": -0.2923828065395355, "logits/rejected": -0.32988280057907104, "logps/chosen": -338.3999938964844, "logps/rejected": -363.20001220703125, "loss": 0.643, "rewards/accuracies": 0.5925000309944153, "rewards/chosen": 0.25566405057907104, "rewards/margins": 0.15507812798023224, "rewards/rejected": 0.10051269829273224, "step": 2580 }, { "epoch": 0.6655509783728115, "grad_norm": 122.5, "learning_rate": 1.6722451081359424e-07, "logits/chosen": -0.3472656309604645, "logits/rejected": -0.3499999940395355, "logps/chosen": -305.6000061035156, "logps/rejected": -321.6000061035156, "loss": 0.6648, "rewards/accuracies": 0.6102272868156433, "rewards/chosen": 0.22805175185203552, "rewards/margins": 0.09896240383386612, "rewards/rejected": 0.12910155951976776, "step": 2585 }, { "epoch": 0.6668383110195675, "grad_norm": 124.0, "learning_rate": 1.6658084449021626e-07, "logits/chosen": -0.22910156846046448, "logits/rejected": -0.22104492783546448, "logps/chosen": -273.6000061035156, "logps/rejected": -235.0, "loss": 0.5984, "rewards/accuracies": 0.5982692837715149, "rewards/chosen": 0.32539063692092896, "rewards/margins": 0.25761717557907104, "rewards/rejected": 0.06784667819738388, "step": 2590 }, { "epoch": 0.6681256436663234, "grad_norm": 140.0, "learning_rate": 1.659371781668383e-07, "logits/chosen": -0.2818603515625, "logits/rejected": -0.25019532442092896, "logps/chosen": -271.6000061035156, "logps/rejected": -266.0, "loss": 0.6203, "rewards/accuracies": 0.6561111211776733, "rewards/chosen": 0.29121094942092896, "rewards/margins": 0.20527343451976776, "rewards/rejected": 0.08564452826976776, "step": 2595 }, { "epoch": 0.6694129763130793, "grad_norm": 104.5, "learning_rate": 1.6529351184346034e-07, "logits/chosen": -0.4984374940395355, "logits/rejected": -0.38017576932907104, "logps/chosen": -351.6000061035156, "logps/rejected": -294.3999938964844, "loss": 0.6281, "rewards/accuracies": 0.6283333897590637, "rewards/chosen": 0.2998046875, "rewards/margins": 0.19775390625, "rewards/rejected": 0.10214843600988388, "step": 2600 }, { "epoch": 0.6707003089598352, "grad_norm": 127.5, "learning_rate": 1.646498455200824e-07, "logits/chosen": -0.27910155057907104, "logits/rejected": -0.3165039122104645, "logps/chosen": -247.8000030517578, "logps/rejected": -263.20001220703125, "loss": 0.6219, "rewards/accuracies": 0.6946794986724854, "rewards/chosen": 0.22187499701976776, "rewards/margins": 0.19082030653953552, "rewards/rejected": 0.03155517578125, "step": 2605 }, { "epoch": 0.6719876416065912, "grad_norm": 175.0, "learning_rate": 1.6400617919670443e-07, "logits/chosen": -0.34716796875, "logits/rejected": -0.35527342557907104, "logps/chosen": -315.3999938964844, "logps/rejected": -254.0, "loss": 0.6492, "rewards/accuracies": 0.6033333539962769, "rewards/chosen": 0.27265626192092896, "rewards/margins": 0.14912109076976776, "rewards/rejected": 0.123291015625, "step": 2610 }, { "epoch": 0.673274974253347, "grad_norm": 104.5, "learning_rate": 1.6336251287332645e-07, "logits/chosen": -0.24140624701976776, "logits/rejected": -0.30595701932907104, "logps/chosen": -320.0, "logps/rejected": -289.6000061035156, "loss": 0.6578, "rewards/accuracies": 0.5108333230018616, "rewards/chosen": 0.23994140326976776, "rewards/margins": 0.111328125, "rewards/rejected": 0.12890625, "step": 2615 }, { "epoch": 0.674562306900103, "grad_norm": 119.5, "learning_rate": 1.627188465499485e-07, "logits/chosen": -0.2593750059604645, "logits/rejected": -0.316162109375, "logps/chosen": -285.6000061035156, "logps/rejected": -332.79998779296875, "loss": 0.6453, "rewards/accuracies": 0.5084523558616638, "rewards/chosen": 0.2591796815395355, "rewards/margins": 0.12690429389476776, "rewards/rejected": 0.13295897841453552, "step": 2620 }, { "epoch": 0.675849639546859, "grad_norm": 166.0, "learning_rate": 1.6207518022657056e-07, "logits/chosen": -0.35078126192092896, "logits/rejected": -0.19257812201976776, "logps/chosen": -275.20001220703125, "logps/rejected": -209.39999389648438, "loss": 0.6352, "rewards/accuracies": 0.5670163035392761, "rewards/chosen": 0.2959960997104645, "rewards/margins": 0.18391934037208557, "rewards/rejected": 0.11166992038488388, "step": 2625 }, { "epoch": 0.6771369721936148, "grad_norm": 104.0, "learning_rate": 1.6143151390319257e-07, "logits/chosen": -0.24863281846046448, "logits/rejected": -0.23271484673023224, "logps/chosen": -272.20001220703125, "logps/rejected": -257.3999938964844, "loss": 0.6461, "rewards/accuracies": 0.6063186526298523, "rewards/chosen": 0.19902344048023224, "rewards/margins": 0.13666991889476776, "rewards/rejected": 0.06235351413488388, "step": 2630 }, { "epoch": 0.6784243048403708, "grad_norm": 122.5, "learning_rate": 1.6078784757981462e-07, "logits/chosen": -0.2597412168979645, "logits/rejected": -0.23602294921875, "logps/chosen": -260.6000061035156, "logps/rejected": -235.1999969482422, "loss": 0.6125, "rewards/accuracies": 0.6183333396911621, "rewards/chosen": 0.2598632872104645, "rewards/margins": 0.21718749403953552, "rewards/rejected": 0.04265594482421875, "step": 2635 }, { "epoch": 0.6797116374871267, "grad_norm": 130.0, "learning_rate": 1.6014418125643666e-07, "logits/chosen": -0.33808594942092896, "logits/rejected": -0.30000001192092896, "logps/chosen": -319.6000061035156, "logps/rejected": -316.6000061035156, "loss": 0.6375, "rewards/accuracies": 0.6426190137863159, "rewards/chosen": 0.2642578184604645, "rewards/margins": 0.15439453721046448, "rewards/rejected": 0.10996093600988388, "step": 2640 }, { "epoch": 0.6809989701338826, "grad_norm": 174.0, "learning_rate": 1.5950051493305868e-07, "logits/chosen": -0.3486328125, "logits/rejected": -0.3271484375, "logps/chosen": -295.79998779296875, "logps/rejected": -251.39999389648438, "loss": 0.5891, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.3734374940395355, "rewards/margins": 0.24404296278953552, "rewards/rejected": 0.12954100966453552, "step": 2645 }, { "epoch": 0.6822863027806385, "grad_norm": 129.0, "learning_rate": 1.5885684860968075e-07, "logits/chosen": -0.3173828125, "logits/rejected": -0.163818359375, "logps/chosen": -292.0, "logps/rejected": -265.0, "loss": 0.643, "rewards/accuracies": 0.625, "rewards/chosen": 0.20351561903953552, "rewards/margins": 0.14707031846046448, "rewards/rejected": 0.0565185546875, "step": 2650 }, { "epoch": 0.6835736354273945, "grad_norm": 131.0, "learning_rate": 1.5821318228630276e-07, "logits/chosen": -0.2734375, "logits/rejected": -0.2041015625, "logps/chosen": -304.0, "logps/rejected": -309.6000061035156, "loss": 0.6461, "rewards/accuracies": 0.6358333826065063, "rewards/chosen": 0.23945312201976776, "rewards/margins": 0.15170899033546448, "rewards/rejected": 0.08774413913488388, "step": 2655 }, { "epoch": 0.6848609680741503, "grad_norm": 110.5, "learning_rate": 1.575695159629248e-07, "logits/chosen": -0.2646484375, "logits/rejected": -0.22517089545726776, "logps/chosen": -341.20001220703125, "logps/rejected": -263.6000061035156, "loss": 0.5797, "rewards/accuracies": 0.6933333277702332, "rewards/chosen": 0.29960936307907104, "rewards/margins": 0.30156248807907104, "rewards/rejected": -0.0012084960471838713, "step": 2660 }, { "epoch": 0.6861483007209063, "grad_norm": 113.0, "learning_rate": 1.5692584963954685e-07, "logits/chosen": -0.3154296875, "logits/rejected": -0.23876953125, "logps/chosen": -273.0, "logps/rejected": -259.0, "loss": 0.6156, "rewards/accuracies": 0.6168939471244812, "rewards/chosen": 0.26484376192092896, "rewards/margins": 0.211669921875, "rewards/rejected": 0.05351562425494194, "step": 2665 }, { "epoch": 0.6874356333676622, "grad_norm": 114.0, "learning_rate": 1.562821833161689e-07, "logits/chosen": -0.2552734315395355, "logits/rejected": -0.2783203125, "logps/chosen": -309.20001220703125, "logps/rejected": -315.20001220703125, "loss": 0.6672, "rewards/accuracies": 0.5433333516120911, "rewards/chosen": 0.2392578125, "rewards/margins": 0.11933593451976776, "rewards/rejected": 0.11972656100988388, "step": 2670 }, { "epoch": 0.6887229660144182, "grad_norm": 145.0, "learning_rate": 1.5563851699279093e-07, "logits/chosen": -0.49609375, "logits/rejected": -0.373046875, "logps/chosen": -287.6000061035156, "logps/rejected": -278.3999938964844, "loss": 0.6227, "rewards/accuracies": 0.6192857027053833, "rewards/chosen": 0.32441407442092896, "rewards/margins": 0.20849609375, "rewards/rejected": 0.11678466945886612, "step": 2675 }, { "epoch": 0.690010298661174, "grad_norm": 137.0, "learning_rate": 1.5499485066941297e-07, "logits/chosen": -0.25439453125, "logits/rejected": -0.23359374701976776, "logps/chosen": -328.0, "logps/rejected": -330.79998779296875, "loss": 0.6594, "rewards/accuracies": 0.4833333492279053, "rewards/chosen": 0.20292969048023224, "rewards/margins": 0.12470702826976776, "rewards/rejected": 0.07792969048023224, "step": 2680 }, { "epoch": 0.69129763130793, "grad_norm": 108.5, "learning_rate": 1.54351184346035e-07, "logits/chosen": -0.3057617247104645, "logits/rejected": -0.20289000868797302, "logps/chosen": -305.3999938964844, "logps/rejected": -304.79998779296875, "loss": 0.6281, "rewards/accuracies": 0.5291666388511658, "rewards/chosen": 0.2562499940395355, "rewards/margins": 0.1635284423828125, "rewards/rejected": 0.09257812798023224, "step": 2685 }, { "epoch": 0.6925849639546859, "grad_norm": 107.5, "learning_rate": 1.5370751802265706e-07, "logits/chosen": -0.23867186903953552, "logits/rejected": -0.12221679836511612, "logps/chosen": -316.3999938964844, "logps/rejected": -283.3999938964844, "loss": 0.6148, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2685546875, "rewards/margins": 0.2294921875, "rewards/rejected": 0.03901367262005806, "step": 2690 }, { "epoch": 0.6938722966014418, "grad_norm": 114.0, "learning_rate": 1.5306385169927908e-07, "logits/chosen": -0.2630371153354645, "logits/rejected": -0.21425780653953552, "logps/chosen": -300.3999938964844, "logps/rejected": -237.0, "loss": 0.6516, "rewards/accuracies": 0.5083333253860474, "rewards/chosen": 0.19052734971046448, "rewards/margins": 0.128692626953125, "rewards/rejected": 0.06191406399011612, "step": 2695 }, { "epoch": 0.6951596292481977, "grad_norm": 138.0, "learning_rate": 1.5242018537590112e-07, "logits/chosen": -0.42158204317092896, "logits/rejected": -0.05292968824505806, "logps/chosen": -304.79998779296875, "logps/rejected": -291.6000061035156, "loss": 0.6469, "rewards/accuracies": 0.5475000143051147, "rewards/chosen": 0.21699218451976776, "rewards/margins": 0.13271483778953552, "rewards/rejected": 0.08417968451976776, "step": 2700 }, { "epoch": 0.6964469618949537, "grad_norm": 138.0, "learning_rate": 1.5177651905252316e-07, "logits/chosen": -0.388671875, "logits/rejected": -0.4156250059604645, "logps/chosen": -346.79998779296875, "logps/rejected": -286.6000061035156, "loss": 0.6074, "rewards/accuracies": 0.6121794581413269, "rewards/chosen": 0.39628905057907104, "rewards/margins": 0.22233887016773224, "rewards/rejected": 0.17319336533546448, "step": 2705 }, { "epoch": 0.6977342945417095, "grad_norm": 114.0, "learning_rate": 1.511328527291452e-07, "logits/chosen": -0.30156248807907104, "logits/rejected": -0.21989746391773224, "logps/chosen": -304.79998779296875, "logps/rejected": -285.79998779296875, "loss": 0.6445, "rewards/accuracies": 0.5600000619888306, "rewards/chosen": 0.212890625, "rewards/margins": 0.17421874403953552, "rewards/rejected": 0.03837890550494194, "step": 2710 }, { "epoch": 0.6990216271884655, "grad_norm": 105.0, "learning_rate": 1.5048918640576725e-07, "logits/chosen": -0.24453124403953552, "logits/rejected": -0.17626953125, "logps/chosen": -289.3999938964844, "logps/rejected": -300.0, "loss": 0.65, "rewards/accuracies": 0.5625, "rewards/chosen": 0.18173828721046448, "rewards/margins": 0.14814452826976776, "rewards/rejected": 0.03330077975988388, "step": 2715 }, { "epoch": 0.7003089598352215, "grad_norm": 111.0, "learning_rate": 1.498455200823893e-07, "logits/chosen": -0.262451171875, "logits/rejected": -0.23548583686351776, "logps/chosen": -350.79998779296875, "logps/rejected": -293.3999938964844, "loss": 0.6164, "rewards/accuracies": 0.6230769157409668, "rewards/chosen": 0.32148438692092896, "rewards/margins": 0.24921874701976776, "rewards/rejected": 0.07172851264476776, "step": 2720 }, { "epoch": 0.7015962924819773, "grad_norm": 112.5, "learning_rate": 1.492018537590113e-07, "logits/chosen": -0.30937498807907104, "logits/rejected": -0.27958983182907104, "logps/chosen": -318.79998779296875, "logps/rejected": -266.20001220703125, "loss": 0.6398, "rewards/accuracies": 0.5916666388511658, "rewards/chosen": 0.31035155057907104, "rewards/margins": 0.18349608778953552, "rewards/rejected": 0.12661132216453552, "step": 2725 }, { "epoch": 0.7028836251287333, "grad_norm": 123.0, "learning_rate": 1.4855818743563338e-07, "logits/chosen": -0.29960936307907104, "logits/rejected": -0.2392578125, "logps/chosen": -319.20001220703125, "logps/rejected": -314.3999938964844, "loss": 0.6516, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2593750059604645, "rewards/margins": 0.14859619736671448, "rewards/rejected": 0.111083984375, "step": 2730 }, { "epoch": 0.7041709577754892, "grad_norm": 134.0, "learning_rate": 1.4791452111225542e-07, "logits/chosen": -0.29443359375, "logits/rejected": -0.2537597715854645, "logps/chosen": -291.0, "logps/rejected": -292.3999938964844, "loss": 0.6242, "rewards/accuracies": 0.5591667294502258, "rewards/chosen": 0.25078123807907104, "rewards/margins": 0.187744140625, "rewards/rejected": 0.06318359076976776, "step": 2735 }, { "epoch": 0.7054582904222451, "grad_norm": 115.5, "learning_rate": 1.4727085478887743e-07, "logits/chosen": -0.3919921815395355, "logits/rejected": -0.2772460877895355, "logps/chosen": -310.3999938964844, "logps/rejected": -279.20001220703125, "loss": 0.6305, "rewards/accuracies": 0.6780953407287598, "rewards/chosen": 0.24370117485523224, "rewards/margins": 0.1848503053188324, "rewards/rejected": 0.058624267578125, "step": 2740 }, { "epoch": 0.706745623069001, "grad_norm": 99.0, "learning_rate": 1.4662718846549948e-07, "logits/chosen": -0.31085205078125, "logits/rejected": -0.2767578065395355, "logps/chosen": -281.0, "logps/rejected": -275.20001220703125, "loss": 0.6687, "rewards/accuracies": 0.513076901435852, "rewards/chosen": 0.20888671278953552, "rewards/margins": 0.10483398288488388, "rewards/rejected": 0.10380859673023224, "step": 2745 }, { "epoch": 0.708032955715757, "grad_norm": 126.0, "learning_rate": 1.4598352214212152e-07, "logits/chosen": -0.3387207090854645, "logits/rejected": -0.26008301973342896, "logps/chosen": -326.79998779296875, "logps/rejected": -315.20001220703125, "loss": 0.6375, "rewards/accuracies": 0.6041666865348816, "rewards/chosen": 0.230224609375, "rewards/margins": 0.158935546875, "rewards/rejected": 0.07097168266773224, "step": 2750 }, { "epoch": 0.7093202883625128, "grad_norm": 98.5, "learning_rate": 1.4533985581874356e-07, "logits/chosen": -0.23183593153953552, "logits/rejected": -0.16586914658546448, "logps/chosen": -276.3999938964844, "logps/rejected": -261.3999938964844, "loss": 0.618, "rewards/accuracies": 0.6120238304138184, "rewards/chosen": 0.23544922471046448, "rewards/margins": 0.2001953125, "rewards/rejected": 0.03554687649011612, "step": 2755 }, { "epoch": 0.7106076210092688, "grad_norm": 102.0, "learning_rate": 1.446961894953656e-07, "logits/chosen": -0.25029295682907104, "logits/rejected": -0.2919921875, "logps/chosen": -270.0, "logps/rejected": -288.0, "loss": 0.6469, "rewards/accuracies": 0.4720238149166107, "rewards/chosen": 0.24814453721046448, "rewards/margins": 0.12143554538488388, "rewards/rejected": 0.12709960341453552, "step": 2760 }, { "epoch": 0.7118949536560247, "grad_norm": 190.0, "learning_rate": 1.4405252317198762e-07, "logits/chosen": -0.29541015625, "logits/rejected": -0.18597412109375, "logps/chosen": -228.10000610351562, "logps/rejected": -254.0, "loss": 0.668, "rewards/accuracies": 0.5433333516120911, "rewards/chosen": 0.18940429389476776, "rewards/margins": 0.09062500298023224, "rewards/rejected": 0.09862060844898224, "step": 2765 }, { "epoch": 0.7131822863027807, "grad_norm": 138.0, "learning_rate": 1.434088568486097e-07, "logits/chosen": -0.18024902045726776, "logits/rejected": -0.24658203125, "logps/chosen": -301.3999938964844, "logps/rejected": -289.3999938964844, "loss": 0.6398, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.22890624403953552, "rewards/margins": 0.15068359673023224, "rewards/rejected": 0.07817383110523224, "step": 2770 }, { "epoch": 0.7144696189495365, "grad_norm": 147.0, "learning_rate": 1.4276519052523173e-07, "logits/chosen": -0.27998048067092896, "logits/rejected": -0.37500762939453125, "logps/chosen": -270.20001220703125, "logps/rejected": -252.8000030517578, "loss": 0.6891, "rewards/accuracies": 0.46000003814697266, "rewards/chosen": 0.16943359375, "rewards/margins": 0.03525390475988388, "rewards/rejected": 0.13408203423023224, "step": 2775 }, { "epoch": 0.7157569515962925, "grad_norm": 121.0, "learning_rate": 1.4212152420185375e-07, "logits/chosen": -0.23300781846046448, "logits/rejected": -0.15576171875, "logps/chosen": -310.0, "logps/rejected": -258.3999938964844, "loss": 0.5852, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.28730469942092896, "rewards/margins": 0.2701171934604645, "rewards/rejected": 0.0179443359375, "step": 2780 }, { "epoch": 0.7170442842430484, "grad_norm": 147.0, "learning_rate": 1.414778578784758e-07, "logits/chosen": -0.3309692442417145, "logits/rejected": -0.2551513612270355, "logps/chosen": -271.3999938964844, "logps/rejected": -295.79998779296875, "loss": 0.6469, "rewards/accuracies": 0.5459523797035217, "rewards/chosen": 0.23828125, "rewards/margins": 0.12451171875, "rewards/rejected": 0.11367187649011612, "step": 2785 }, { "epoch": 0.7183316168898043, "grad_norm": 128.0, "learning_rate": 1.408341915550978e-07, "logits/chosen": -0.2923828065395355, "logits/rejected": -0.22529296576976776, "logps/chosen": -298.0, "logps/rejected": -290.3999938964844, "loss": 0.5867, "rewards/accuracies": 0.7041667103767395, "rewards/chosen": 0.3160156309604645, "rewards/margins": 0.271484375, "rewards/rejected": 0.04489745944738388, "step": 2790 }, { "epoch": 0.7196189495365602, "grad_norm": 113.0, "learning_rate": 1.4019052523171988e-07, "logits/chosen": -0.28266602754592896, "logits/rejected": -0.17326660454273224, "logps/chosen": -289.20001220703125, "logps/rejected": -301.6000061035156, "loss": 0.643, "rewards/accuracies": 0.6317857503890991, "rewards/chosen": 0.25947266817092896, "rewards/margins": 0.1778564453125, "rewards/rejected": 0.08145751804113388, "step": 2795 }, { "epoch": 0.7209062821833162, "grad_norm": 149.0, "learning_rate": 1.3954685890834192e-07, "logits/chosen": -0.3326171934604645, "logits/rejected": -0.25230711698532104, "logps/chosen": -269.3999938964844, "logps/rejected": -304.20001220703125, "loss": 0.6016, "rewards/accuracies": 0.6594871878623962, "rewards/chosen": 0.255859375, "rewards/margins": 0.248046875, "rewards/rejected": 0.00791015662252903, "step": 2800 }, { "epoch": 0.722193614830072, "grad_norm": 130.0, "learning_rate": 1.3890319258496394e-07, "logits/chosen": -0.4300781190395355, "logits/rejected": -0.3792968690395355, "logps/chosen": -323.20001220703125, "logps/rejected": -330.79998779296875, "loss": 0.7023, "rewards/accuracies": 0.4510897696018219, "rewards/chosen": 0.16035155951976776, "rewards/margins": 0.0223846435546875, "rewards/rejected": 0.1376953125, "step": 2805 }, { "epoch": 0.723480947476828, "grad_norm": 191.0, "learning_rate": 1.3825952626158598e-07, "logits/chosen": -0.3857421875, "logits/rejected": -0.3896484375, "logps/chosen": -292.79998779296875, "logps/rejected": -277.3999938964844, "loss": 0.6391, "rewards/accuracies": 0.54666668176651, "rewards/chosen": 0.28398436307907104, "rewards/margins": 0.15810546278953552, "rewards/rejected": 0.1259765625, "step": 2810 }, { "epoch": 0.724768280123584, "grad_norm": 157.0, "learning_rate": 1.3761585993820805e-07, "logits/chosen": -0.46757811307907104, "logits/rejected": -0.42890626192092896, "logps/chosen": -331.6000061035156, "logps/rejected": -319.0, "loss": 0.6469, "rewards/accuracies": 0.5359090566635132, "rewards/chosen": 0.30498045682907104, "rewards/margins": 0.14018554985523224, "rewards/rejected": 0.1650390625, "step": 2815 }, { "epoch": 0.7260556127703398, "grad_norm": 119.0, "learning_rate": 1.3697219361483006e-07, "logits/chosen": -0.20205077528953552, "logits/rejected": -0.209808349609375, "logps/chosen": -270.79998779296875, "logps/rejected": -275.3999938964844, "loss": 0.6555, "rewards/accuracies": 0.5575000047683716, "rewards/chosen": 0.16796875, "rewards/margins": 0.11118163913488388, "rewards/rejected": 0.05688171461224556, "step": 2820 }, { "epoch": 0.7273429454170958, "grad_norm": 121.5, "learning_rate": 1.363285272914521e-07, "logits/chosen": -0.15749511122703552, "logits/rejected": -0.1142578125, "logps/chosen": -294.0, "logps/rejected": -261.3999938964844, "loss": 0.6062, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": 0.24775390326976776, "rewards/margins": 0.23281249403953552, "rewards/rejected": 0.014859008602797985, "step": 2825 }, { "epoch": 0.7286302780638517, "grad_norm": 136.0, "learning_rate": 1.3568486096807412e-07, "logits/chosen": -0.31669920682907104, "logits/rejected": -0.12026367336511612, "logps/chosen": -293.0, "logps/rejected": -248.60000610351562, "loss": 0.6148, "rewards/accuracies": 0.6475790739059448, "rewards/chosen": 0.26093751192092896, "rewards/margins": 0.21870116889476776, "rewards/rejected": 0.04224853590130806, "step": 2830 }, { "epoch": 0.7299176107106076, "grad_norm": 151.0, "learning_rate": 1.350411946446962e-07, "logits/chosen": -0.22714844346046448, "logits/rejected": -0.2567382752895355, "logps/chosen": -293.6000061035156, "logps/rejected": -293.79998779296875, "loss": 0.6391, "rewards/accuracies": 0.5824999809265137, "rewards/chosen": 0.21127930283546448, "rewards/margins": 0.15898437798023224, "rewards/rejected": 0.05209960788488388, "step": 2835 }, { "epoch": 0.7312049433573635, "grad_norm": 123.5, "learning_rate": 1.3439752832131823e-07, "logits/chosen": -0.22055664658546448, "logits/rejected": -0.13046875596046448, "logps/chosen": -358.79998779296875, "logps/rejected": -304.3999938964844, "loss": 0.5883, "rewards/accuracies": 0.7258333563804626, "rewards/chosen": 0.2724609375, "rewards/margins": 0.2660156190395355, "rewards/rejected": 0.006823730655014515, "step": 2840 }, { "epoch": 0.7324922760041195, "grad_norm": 118.0, "learning_rate": 1.3375386199794025e-07, "logits/chosen": -0.24873046576976776, "logits/rejected": -0.262490838766098, "logps/chosen": -254.8000030517578, "logps/rejected": -226.39999389648438, "loss": 0.6641, "rewards/accuracies": 0.5944805145263672, "rewards/chosen": 0.16982421278953552, "rewards/margins": 0.1009674072265625, "rewards/rejected": 0.06870117038488388, "step": 2845 }, { "epoch": 0.7337796086508754, "grad_norm": 118.5, "learning_rate": 1.331101956745623e-07, "logits/chosen": -0.24964599311351776, "logits/rejected": -0.33112794160842896, "logps/chosen": -281.79998779296875, "logps/rejected": -275.6000061035156, "loss": 0.6523, "rewards/accuracies": 0.5683333277702332, "rewards/chosen": 0.21738281846046448, "rewards/margins": 0.12753906846046448, "rewards/rejected": 0.08983764797449112, "step": 2850 }, { "epoch": 0.7350669412976313, "grad_norm": 124.0, "learning_rate": 1.3246652935118436e-07, "logits/chosen": -0.35087889432907104, "logits/rejected": -0.361328125, "logps/chosen": -306.0, "logps/rejected": -291.0, "loss": 0.6352, "rewards/accuracies": 0.6100000143051147, "rewards/chosen": 0.2750000059604645, "rewards/margins": 0.17949219048023224, "rewards/rejected": 0.09565429389476776, "step": 2855 }, { "epoch": 0.7363542739443872, "grad_norm": 154.0, "learning_rate": 1.3182286302780638e-07, "logits/chosen": -0.2894531190395355, "logits/rejected": -0.34711915254592896, "logps/chosen": -268.20001220703125, "logps/rejected": -243.1999969482422, "loss": 0.6648, "rewards/accuracies": 0.5338383913040161, "rewards/chosen": 0.19863280653953552, "rewards/margins": 0.07974853366613388, "rewards/rejected": 0.119140625, "step": 2860 }, { "epoch": 0.7376416065911432, "grad_norm": 146.0, "learning_rate": 1.3117919670442842e-07, "logits/chosen": -0.2655273377895355, "logits/rejected": -0.18525390326976776, "logps/chosen": -243.39999389648438, "logps/rejected": -243.0, "loss": 0.6266, "rewards/accuracies": 0.635952353477478, "rewards/chosen": 0.21328124403953552, "rewards/margins": 0.1767578125, "rewards/rejected": 0.03664550930261612, "step": 2865 }, { "epoch": 0.738928939237899, "grad_norm": 143.0, "learning_rate": 1.3053553038105044e-07, "logits/chosen": -0.4375, "logits/rejected": -0.3951171934604645, "logps/chosen": -331.79998779296875, "logps/rejected": -311.3999938964844, "loss": 0.6547, "rewards/accuracies": 0.5175000429153442, "rewards/chosen": 0.26972657442092896, "rewards/margins": 0.13193359971046448, "rewards/rejected": 0.1376953125, "step": 2870 }, { "epoch": 0.740216271884655, "grad_norm": 123.5, "learning_rate": 1.298918640576725e-07, "logits/chosen": -0.31464844942092896, "logits/rejected": -0.26191407442092896, "logps/chosen": -355.20001220703125, "logps/rejected": -321.3999938964844, "loss": 0.6062, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": 0.29511719942092896, "rewards/margins": 0.23457030951976776, "rewards/rejected": 0.06085205078125, "step": 2875 }, { "epoch": 0.741503604531411, "grad_norm": 104.0, "learning_rate": 1.2924819773429455e-07, "logits/chosen": -0.3193359375, "logits/rejected": -0.3255859315395355, "logps/chosen": -269.79998779296875, "logps/rejected": -239.60000610351562, "loss": 0.6234, "rewards/accuracies": 0.6134615540504456, "rewards/chosen": 0.23554687201976776, "rewards/margins": 0.19960936903953552, "rewards/rejected": 0.03623046725988388, "step": 2880 }, { "epoch": 0.7427909371781668, "grad_norm": 114.0, "learning_rate": 1.2860453141091657e-07, "logits/chosen": -0.21298828721046448, "logits/rejected": -0.16093750298023224, "logps/chosen": -239.39999389648438, "logps/rejected": -254.60000610351562, "loss": 0.6602, "rewards/accuracies": 0.5824999809265137, "rewards/chosen": 0.171875, "rewards/margins": 0.09340820461511612, "rewards/rejected": 0.07834472507238388, "step": 2885 }, { "epoch": 0.7440782698249228, "grad_norm": 116.0, "learning_rate": 1.279608650875386e-07, "logits/chosen": -0.3716796934604645, "logits/rejected": -0.3065429627895355, "logps/chosen": -306.3999938964844, "logps/rejected": -319.20001220703125, "loss": 0.6484, "rewards/accuracies": 0.5227564573287964, "rewards/chosen": 0.18964843451976776, "rewards/margins": 0.12924805283546448, "rewards/rejected": 0.06032714992761612, "step": 2890 }, { "epoch": 0.7453656024716787, "grad_norm": 143.0, "learning_rate": 1.2731719876416068e-07, "logits/chosen": -0.28662109375, "logits/rejected": -0.3020263612270355, "logps/chosen": -384.3999938964844, "logps/rejected": -305.79998779296875, "loss": 0.6066, "rewards/accuracies": 0.5803571939468384, "rewards/chosen": 0.38984376192092896, "rewards/margins": 0.24051514267921448, "rewards/rejected": 0.14921875298023224, "step": 2895 }, { "epoch": 0.7466529351184346, "grad_norm": 159.0, "learning_rate": 1.266735324407827e-07, "logits/chosen": -0.30986326932907104, "logits/rejected": -0.306640625, "logps/chosen": -301.3999938964844, "logps/rejected": -374.79998779296875, "loss": 0.6531, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.17558594048023224, "rewards/margins": 0.13420410454273224, "rewards/rejected": 0.04127807542681694, "step": 2900 }, { "epoch": 0.7479402677651905, "grad_norm": 127.0, "learning_rate": 1.2602986611740474e-07, "logits/chosen": -0.2818359434604645, "logits/rejected": -0.30351561307907104, "logps/chosen": -327.20001220703125, "logps/rejected": -266.3999938964844, "loss": 0.6203, "rewards/accuracies": 0.5841666460037231, "rewards/chosen": 0.2562499940395355, "rewards/margins": 0.19047851860523224, "rewards/rejected": 0.06582031399011612, "step": 2905 }, { "epoch": 0.7492276004119465, "grad_norm": 138.0, "learning_rate": 1.2538619979402675e-07, "logits/chosen": -0.38945311307907104, "logits/rejected": -0.12998047471046448, "logps/chosen": -277.6000061035156, "logps/rejected": -243.1999969482422, "loss": 0.6172, "rewards/accuracies": 0.610313892364502, "rewards/chosen": 0.29707032442092896, "rewards/margins": 0.18974609673023224, "rewards/rejected": 0.10690917819738388, "step": 2910 }, { "epoch": 0.7505149330587023, "grad_norm": 96.0, "learning_rate": 1.247425334706488e-07, "logits/chosen": -0.3714843690395355, "logits/rejected": -0.250244140625, "logps/chosen": -261.3999938964844, "logps/rejected": -265.20001220703125, "loss": 0.657, "rewards/accuracies": 0.591858983039856, "rewards/chosen": 0.21513672173023224, "rewards/margins": 0.110107421875, "rewards/rejected": 0.10472412407398224, "step": 2915 }, { "epoch": 0.7518022657054583, "grad_norm": 105.0, "learning_rate": 1.2409886714727084e-07, "logits/chosen": -0.3330078125, "logits/rejected": -0.22011718153953552, "logps/chosen": -335.3999938964844, "logps/rejected": -308.6000061035156, "loss": 0.6531, "rewards/accuracies": 0.5688095092773438, "rewards/chosen": 0.2925781309604645, "rewards/margins": 0.14052733778953552, "rewards/rejected": 0.15186767280101776, "step": 2920 }, { "epoch": 0.7530895983522142, "grad_norm": 132.0, "learning_rate": 1.2345520082389288e-07, "logits/chosen": -0.12922362983226776, "logits/rejected": -0.17041015625, "logps/chosen": -338.79998779296875, "logps/rejected": -310.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.5625, "rewards/chosen": 0.21699218451976776, "rewards/margins": 0.15927734971046448, "rewards/rejected": 0.05745849758386612, "step": 2925 }, { "epoch": 0.7543769309989702, "grad_norm": 158.0, "learning_rate": 1.2281153450051492e-07, "logits/chosen": -0.39912110567092896, "logits/rejected": -0.2628417909145355, "logps/chosen": -313.79998779296875, "logps/rejected": -288.6000061035156, "loss": 0.6234, "rewards/accuracies": 0.5233333706855774, "rewards/chosen": 0.2808593809604645, "rewards/margins": 0.19648437201976776, "rewards/rejected": 0.08458862453699112, "step": 2930 }, { "epoch": 0.755664263645726, "grad_norm": 125.5, "learning_rate": 1.2216786817713697e-07, "logits/chosen": -0.4737304747104645, "logits/rejected": -0.3832031190395355, "logps/chosen": -310.20001220703125, "logps/rejected": -252.39999389648438, "loss": 0.6234, "rewards/accuracies": 0.6020207405090332, "rewards/chosen": 0.28447264432907104, "rewards/margins": 0.19266358017921448, "rewards/rejected": 0.09149780124425888, "step": 2935 }, { "epoch": 0.756951596292482, "grad_norm": 166.0, "learning_rate": 1.21524201853759e-07, "logits/chosen": -0.37675780057907104, "logits/rejected": -0.32734376192092896, "logps/chosen": -349.6000061035156, "logps/rejected": -307.6000061035156, "loss": 0.6258, "rewards/accuracies": 0.6266666650772095, "rewards/chosen": 0.3392578065395355, "rewards/margins": 0.189453125, "rewards/rejected": 0.15007324516773224, "step": 2940 }, { "epoch": 0.7582389289392379, "grad_norm": 126.5, "learning_rate": 1.2088053553038105e-07, "logits/chosen": -0.28858643770217896, "logits/rejected": -0.26249998807907104, "logps/chosen": -303.79998779296875, "logps/rejected": -277.79998779296875, "loss": 0.6641, "rewards/accuracies": 0.4941233992576599, "rewards/chosen": 0.24091796576976776, "rewards/margins": 0.119140625, "rewards/rejected": 0.12135620415210724, "step": 2945 }, { "epoch": 0.7595262615859938, "grad_norm": 88.5, "learning_rate": 1.202368692070031e-07, "logits/chosen": -0.18242187798023224, "logits/rejected": -0.14658203721046448, "logps/chosen": -267.6000061035156, "logps/rejected": -250.39999389648438, "loss": 0.5938, "rewards/accuracies": 0.6401923298835754, "rewards/chosen": 0.2333984375, "rewards/margins": 0.2655273377895355, "rewards/rejected": -0.03232421725988388, "step": 2950 }, { "epoch": 0.7608135942327497, "grad_norm": 148.0, "learning_rate": 1.195932028836251e-07, "logits/chosen": -0.2884765565395355, "logits/rejected": -0.21726074814796448, "logps/chosen": -314.6000061035156, "logps/rejected": -284.0, "loss": 0.6195, "rewards/accuracies": 0.6002380847930908, "rewards/chosen": 0.27734375, "rewards/margins": 0.20048828423023224, "rewards/rejected": 0.07670898735523224, "step": 2955 }, { "epoch": 0.7621009268795057, "grad_norm": 126.5, "learning_rate": 1.1894953656024715e-07, "logits/chosen": -0.24514159560203552, "logits/rejected": -0.16015625, "logps/chosen": -262.0, "logps/rejected": -280.6000061035156, "loss": 0.6258, "rewards/accuracies": 0.6263095140457153, "rewards/chosen": 0.17822265625, "rewards/margins": 0.1658935546875, "rewards/rejected": 0.01263427734375, "step": 2960 }, { "epoch": 0.7633882595262615, "grad_norm": 147.0, "learning_rate": 1.1830587023686921e-07, "logits/chosen": -0.5152343511581421, "logits/rejected": -0.4730468690395355, "logps/chosen": -309.3999938964844, "logps/rejected": -266.79998779296875, "loss": 0.6242, "rewards/accuracies": 0.6378571391105652, "rewards/chosen": 0.3099609315395355, "rewards/margins": 0.18466797471046448, "rewards/rejected": 0.1251220703125, "step": 2965 }, { "epoch": 0.7646755921730175, "grad_norm": 157.0, "learning_rate": 1.1766220391349124e-07, "logits/chosen": -0.13327637314796448, "logits/rejected": -0.12089844048023224, "logps/chosen": -338.79998779296875, "logps/rejected": -245.0, "loss": 0.5867, "rewards/accuracies": 0.7422436475753784, "rewards/chosen": 0.3140625059604645, "rewards/margins": 0.2562499940395355, "rewards/rejected": 0.05787353590130806, "step": 2970 }, { "epoch": 0.7659629248197735, "grad_norm": 124.0, "learning_rate": 1.1701853759011328e-07, "logits/chosen": -0.23471680283546448, "logits/rejected": -0.20463867485523224, "logps/chosen": -324.20001220703125, "logps/rejected": -294.79998779296875, "loss": 0.6273, "rewards/accuracies": 0.6275641322135925, "rewards/chosen": 0.23955078423023224, "rewards/margins": 0.18942871689796448, "rewards/rejected": 0.04990539699792862, "step": 2975 }, { "epoch": 0.7672502574665293, "grad_norm": 100.0, "learning_rate": 1.1637487126673531e-07, "logits/chosen": -0.31904298067092896, "logits/rejected": -0.30156248807907104, "logps/chosen": -318.20001220703125, "logps/rejected": -266.3999938964844, "loss": 0.6172, "rewards/accuracies": 0.662857174873352, "rewards/chosen": 0.2884765565395355, "rewards/margins": 0.19501952826976776, "rewards/rejected": 0.09360351413488388, "step": 2980 }, { "epoch": 0.7685375901132853, "grad_norm": 133.0, "learning_rate": 1.1573120494335737e-07, "logits/chosen": -0.24624022841453552, "logits/rejected": -0.3802856504917145, "logps/chosen": -299.79998779296875, "logps/rejected": -283.0, "loss": 0.6539, "rewards/accuracies": 0.5625000596046448, "rewards/chosen": 0.2503906190395355, "rewards/margins": 0.115966796875, "rewards/rejected": 0.134765625, "step": 2985 }, { "epoch": 0.7698249227600412, "grad_norm": 158.0, "learning_rate": 1.150875386199794e-07, "logits/chosen": -0.43671876192092896, "logits/rejected": -0.4052734375, "logps/chosen": -378.3999938964844, "logps/rejected": -310.6000061035156, "loss": 0.632, "rewards/accuracies": 0.579880952835083, "rewards/chosen": 0.39335936307907104, "rewards/margins": 0.15864257514476776, "rewards/rejected": 0.23515625298023224, "step": 2990 }, { "epoch": 0.7711122554067971, "grad_norm": 128.0, "learning_rate": 1.1444387229660144e-07, "logits/chosen": -0.3681640625, "logits/rejected": -0.2839111387729645, "logps/chosen": -287.0, "logps/rejected": -282.20001220703125, "loss": 0.643, "rewards/accuracies": 0.5977272987365723, "rewards/chosen": 0.21464844048023224, "rewards/margins": 0.13525390625, "rewards/rejected": 0.07976074516773224, "step": 2995 }, { "epoch": 0.772399588053553, "grad_norm": 151.0, "learning_rate": 1.1380020597322347e-07, "logits/chosen": -0.3511718809604645, "logits/rejected": -0.2967529296875, "logps/chosen": -388.79998779296875, "logps/rejected": -333.3999938964844, "loss": 0.6469, "rewards/accuracies": 0.5479761958122253, "rewards/chosen": 0.32109373807907104, "rewards/margins": 0.13969726860523224, "rewards/rejected": 0.181640625, "step": 3000 }, { "epoch": 0.773686920700309, "grad_norm": 114.5, "learning_rate": 1.1315653964984552e-07, "logits/chosen": -0.42500001192092896, "logits/rejected": -0.3462890684604645, "logps/chosen": -362.0, "logps/rejected": -293.20001220703125, "loss": 0.6102, "rewards/accuracies": 0.6541666984558105, "rewards/chosen": 0.30156248807907104, "rewards/margins": 0.2554687559604645, "rewards/rejected": 0.04630126804113388, "step": 3005 }, { "epoch": 0.7749742533470649, "grad_norm": 246.0, "learning_rate": 1.1251287332646755e-07, "logits/chosen": -0.1611328125, "logits/rejected": -0.0005615234258584678, "logps/chosen": -266.8999938964844, "logps/rejected": -247.39999389648438, "loss": 0.6055, "rewards/accuracies": 0.6559615731239319, "rewards/chosen": 0.2353515625, "rewards/margins": 0.23259277641773224, "rewards/rejected": 0.003143310546875, "step": 3010 }, { "epoch": 0.7762615859938208, "grad_norm": 135.0, "learning_rate": 1.118692070030896e-07, "logits/chosen": -0.46367186307907104, "logits/rejected": -0.33696287870407104, "logps/chosen": -330.3999938964844, "logps/rejected": -302.79998779296875, "loss": 0.6781, "rewards/accuracies": 0.5099999904632568, "rewards/chosen": 0.25312501192092896, "rewards/margins": 0.07363281399011612, "rewards/rejected": 0.17988280951976776, "step": 3015 }, { "epoch": 0.7775489186405767, "grad_norm": 123.0, "learning_rate": 1.1122554067971163e-07, "logits/chosen": -0.46015626192092896, "logits/rejected": -0.34394532442092896, "logps/chosen": -325.79998779296875, "logps/rejected": -239.39999389648438, "loss": 0.6445, "rewards/accuracies": 0.5525000691413879, "rewards/chosen": 0.2899414002895355, "rewards/margins": 0.16000977158546448, "rewards/rejected": 0.13007812201976776, "step": 3020 }, { "epoch": 0.7788362512873327, "grad_norm": 153.0, "learning_rate": 1.1058187435633368e-07, "logits/chosen": -0.349609375, "logits/rejected": -0.30097657442092896, "logps/chosen": -330.79998779296875, "logps/rejected": -279.0, "loss": 0.6141, "rewards/accuracies": 0.6739743947982788, "rewards/chosen": 0.2725585997104645, "rewards/margins": 0.20802612602710724, "rewards/rejected": 0.06410064548254013, "step": 3025 }, { "epoch": 0.7801235839340885, "grad_norm": 130.0, "learning_rate": 1.0993820803295571e-07, "logits/chosen": -0.3199218809604645, "logits/rejected": -0.3306640684604645, "logps/chosen": -303.3999938964844, "logps/rejected": -293.79998779296875, "loss": 0.6391, "rewards/accuracies": 0.6183333396911621, "rewards/chosen": 0.25458985567092896, "rewards/margins": 0.15495605766773224, "rewards/rejected": 0.0994873046875, "step": 3030 }, { "epoch": 0.7814109165808445, "grad_norm": 133.0, "learning_rate": 1.0929454170957775e-07, "logits/chosen": -0.20126953721046448, "logits/rejected": -0.11264648288488388, "logps/chosen": -302.79998779296875, "logps/rejected": -249.8000030517578, "loss": 0.6242, "rewards/accuracies": 0.5241667032241821, "rewards/chosen": 0.26752930879592896, "rewards/margins": 0.20205077528953552, "rewards/rejected": 0.06538085639476776, "step": 3035 }, { "epoch": 0.7826982492276005, "grad_norm": 102.0, "learning_rate": 1.0865087538619978e-07, "logits/chosen": -0.17412109673023224, "logits/rejected": -0.18325194716453552, "logps/chosen": -229.0, "logps/rejected": -261.20001220703125, "loss": 0.6234, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2373046875, "rewards/margins": 0.18232421576976776, "rewards/rejected": 0.05492553859949112, "step": 3040 }, { "epoch": 0.7839855818743563, "grad_norm": 213.0, "learning_rate": 1.0800720906282184e-07, "logits/chosen": -0.3533691465854645, "logits/rejected": -0.35478514432907104, "logps/chosen": -277.20001220703125, "logps/rejected": -302.20001220703125, "loss": 0.6453, "rewards/accuracies": 0.5085317492485046, "rewards/chosen": 0.19814452528953552, "rewards/margins": 0.13847656548023224, "rewards/rejected": 0.05992431566119194, "step": 3045 }, { "epoch": 0.7852729145211123, "grad_norm": 208.0, "learning_rate": 1.0736354273944387e-07, "logits/chosen": -0.2754882872104645, "logits/rejected": -0.14199218153953552, "logps/chosen": -237.39999389648438, "logps/rejected": -239.10000610351562, "loss": 0.6297, "rewards/accuracies": 0.6379870176315308, "rewards/chosen": 0.21601562201976776, "rewards/margins": 0.17119140923023224, "rewards/rejected": 0.04502124711871147, "step": 3050 }, { "epoch": 0.7865602471678682, "grad_norm": 150.0, "learning_rate": 1.067198764160659e-07, "logits/chosen": -0.41191405057907104, "logits/rejected": -0.4251953065395355, "logps/chosen": -348.20001220703125, "logps/rejected": -374.3999938964844, "loss": 0.6594, "rewards/accuracies": 0.5660713911056519, "rewards/chosen": 0.30024415254592896, "rewards/margins": 0.1259765625, "rewards/rejected": 0.17385253310203552, "step": 3055 }, { "epoch": 0.787847579814624, "grad_norm": 137.0, "learning_rate": 1.0607621009268794e-07, "logits/chosen": -0.5355468988418579, "logits/rejected": -0.37773436307907104, "logps/chosen": -370.79998779296875, "logps/rejected": -318.20001220703125, "loss": 0.6023, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": 0.3843750059604645, "rewards/margins": 0.24228516221046448, "rewards/rejected": 0.14208984375, "step": 3060 }, { "epoch": 0.78913491246138, "grad_norm": 100.0, "learning_rate": 1.0543254376930998e-07, "logits/chosen": -0.29902344942092896, "logits/rejected": -0.2833007872104645, "logps/chosen": -271.20001220703125, "logps/rejected": -262.3999938964844, "loss": 0.6352, "rewards/accuracies": 0.5785897970199585, "rewards/chosen": 0.3031249940395355, "rewards/margins": 0.1942138671875, "rewards/rejected": 0.10869140923023224, "step": 3065 }, { "epoch": 0.790422245108136, "grad_norm": 149.0, "learning_rate": 1.0478887744593203e-07, "logits/chosen": -0.42988282442092896, "logits/rejected": -0.3705078065395355, "logps/chosen": -323.3999938964844, "logps/rejected": -268.6000061035156, "loss": 0.6625, "rewards/accuracies": 0.5308424830436707, "rewards/chosen": 0.2533203065395355, "rewards/margins": 0.10399017482995987, "rewards/rejected": 0.14921875298023224, "step": 3070 }, { "epoch": 0.7917095777548918, "grad_norm": 134.0, "learning_rate": 1.0414521112255406e-07, "logits/chosen": -0.27910155057907104, "logits/rejected": -0.19775390625, "logps/chosen": -292.3999938964844, "logps/rejected": -257.0, "loss": 0.6266, "rewards/accuracies": 0.5637471079826355, "rewards/chosen": 0.251953125, "rewards/margins": 0.16816405951976776, "rewards/rejected": 0.08378906548023224, "step": 3075 }, { "epoch": 0.7929969104016478, "grad_norm": 120.0, "learning_rate": 1.035015447991761e-07, "logits/chosen": -0.24062499403953552, "logits/rejected": -0.20546874403953552, "logps/chosen": -253.0, "logps/rejected": -275.0, "loss": 0.6242, "rewards/accuracies": 0.7217857241630554, "rewards/chosen": 0.2001953125, "rewards/margins": 0.17281189560890198, "rewards/rejected": 0.02733154222369194, "step": 3080 }, { "epoch": 0.7942842430484037, "grad_norm": 101.5, "learning_rate": 1.0285787847579814e-07, "logits/chosen": -0.302734375, "logits/rejected": -0.2972656190395355, "logps/chosen": -315.6000061035156, "logps/rejected": -308.3999938964844, "loss": 0.6328, "rewards/accuracies": 0.6066666841506958, "rewards/chosen": 0.32636719942092896, "rewards/margins": 0.17563477158546448, "rewards/rejected": 0.15068359673023224, "step": 3085 }, { "epoch": 0.7955715756951597, "grad_norm": 114.0, "learning_rate": 1.0221421215242018e-07, "logits/chosen": -0.34843748807907104, "logits/rejected": -0.234375, "logps/chosen": -350.79998779296875, "logps/rejected": -309.79998779296875, "loss": 0.5762, "rewards/accuracies": 0.751346230506897, "rewards/chosen": 0.3238281309604645, "rewards/margins": 0.3232421875, "rewards/rejected": 0.0010620116954669356, "step": 3090 }, { "epoch": 0.7968589083419155, "grad_norm": 179.0, "learning_rate": 1.0157054582904221e-07, "logits/chosen": -0.21840819716453552, "logits/rejected": -0.35771483182907104, "logps/chosen": -235.0, "logps/rejected": -235.8000030517578, "loss": 0.618, "rewards/accuracies": 0.5780952572822571, "rewards/chosen": 0.21113280951976776, "rewards/margins": 0.18125000596046448, "rewards/rejected": 0.02988281287252903, "step": 3095 }, { "epoch": 0.7981462409886715, "grad_norm": 112.5, "learning_rate": 1.0092687950566426e-07, "logits/chosen": -0.28984373807907104, "logits/rejected": -0.17592772841453552, "logps/chosen": -272.6000061035156, "logps/rejected": -245.10000610351562, "loss": 0.6219, "rewards/accuracies": 0.5338095426559448, "rewards/chosen": 0.2109375, "rewards/margins": 0.1865234375, "rewards/rejected": 0.02431640587747097, "step": 3100 }, { "epoch": 0.7994335736354274, "grad_norm": 166.0, "learning_rate": 1.002832131822863e-07, "logits/chosen": -0.16865234076976776, "logits/rejected": -0.17128905653953552, "logps/chosen": -274.20001220703125, "logps/rejected": -270.79998779296875, "loss": 0.6523, "rewards/accuracies": 0.5533791780471802, "rewards/chosen": 0.23281249403953552, "rewards/margins": 0.11911620944738388, "rewards/rejected": 0.11367187649011612, "step": 3105 }, { "epoch": 0.8007209062821833, "grad_norm": 122.5, "learning_rate": 9.963954685890834e-08, "logits/chosen": -0.21406249701976776, "logits/rejected": -0.181640625, "logps/chosen": -281.6000061035156, "logps/rejected": -299.20001220703125, "loss": 0.6289, "rewards/accuracies": 0.6469047665596008, "rewards/chosen": 0.22597655653953552, "rewards/margins": 0.18359375, "rewards/rejected": 0.04216308519244194, "step": 3110 }, { "epoch": 0.8020082389289392, "grad_norm": 146.0, "learning_rate": 9.899588053553037e-08, "logits/chosen": -0.15366211533546448, "logits/rejected": -0.10031738132238388, "logps/chosen": -245.0, "logps/rejected": -234.39999389648438, "loss": 0.657, "rewards/accuracies": 0.614448070526123, "rewards/chosen": 0.16640624403953552, "rewards/margins": 0.1014404296875, "rewards/rejected": 0.06523437798023224, "step": 3115 }, { "epoch": 0.8032955715756952, "grad_norm": 121.5, "learning_rate": 9.835221421215241e-08, "logits/chosen": -0.3607421815395355, "logits/rejected": -0.2719970643520355, "logps/chosen": -270.0, "logps/rejected": -263.79998779296875, "loss": 0.625, "rewards/accuracies": 0.57833331823349, "rewards/chosen": 0.21308593451976776, "rewards/margins": 0.16191406548023224, "rewards/rejected": 0.05117187649011612, "step": 3120 }, { "epoch": 0.804582904222451, "grad_norm": 118.5, "learning_rate": 9.770854788877446e-08, "logits/chosen": -0.24335937201976776, "logits/rejected": -0.16396483778953552, "logps/chosen": -285.20001220703125, "logps/rejected": -266.3999938964844, "loss": 0.5953, "rewards/accuracies": 0.684166669845581, "rewards/chosen": 0.24482421576976776, "rewards/margins": 0.2606445252895355, "rewards/rejected": -0.01549682579934597, "step": 3125 }, { "epoch": 0.805870236869207, "grad_norm": 110.5, "learning_rate": 9.70648815653965e-08, "logits/chosen": -0.15256348252296448, "logits/rejected": -0.19741210341453552, "logps/chosen": -348.3999938964844, "logps/rejected": -315.0, "loss": 0.6367, "rewards/accuracies": 0.5908333659172058, "rewards/chosen": 0.27880859375, "rewards/margins": 0.18125000596046448, "rewards/rejected": 0.09772948920726776, "step": 3130 }, { "epoch": 0.807157569515963, "grad_norm": 133.0, "learning_rate": 9.642121524201853e-08, "logits/chosen": -0.28437501192092896, "logits/rejected": -0.21308593451976776, "logps/chosen": -328.0, "logps/rejected": -308.0, "loss": 0.6656, "rewards/accuracies": 0.5325000286102295, "rewards/chosen": 0.20273438096046448, "rewards/margins": 0.10922851413488388, "rewards/rejected": 0.09372711181640625, "step": 3135 }, { "epoch": 0.8084449021627188, "grad_norm": 129.0, "learning_rate": 9.577754891864057e-08, "logits/chosen": -0.36835938692092896, "logits/rejected": -0.2554687559604645, "logps/chosen": -295.6000061035156, "logps/rejected": -287.6000061035156, "loss": 0.6602, "rewards/accuracies": 0.5466667413711548, "rewards/chosen": 0.24589844048023224, "rewards/margins": 0.13427734375, "rewards/rejected": 0.11125488579273224, "step": 3140 }, { "epoch": 0.8097322348094748, "grad_norm": 112.0, "learning_rate": 9.513388259526261e-08, "logits/chosen": -0.101959228515625, "logits/rejected": -0.17646484076976776, "logps/chosen": -242.8000030517578, "logps/rejected": -220.60000610351562, "loss": 0.6734, "rewards/accuracies": 0.48000001907348633, "rewards/chosen": 0.18632812798023224, "rewards/margins": 0.06096191331744194, "rewards/rejected": 0.12602539360523224, "step": 3145 }, { "epoch": 0.8110195674562307, "grad_norm": 133.0, "learning_rate": 9.449021627188466e-08, "logits/chosen": -0.21318359673023224, "logits/rejected": 0.04121093824505806, "logps/chosen": -298.3999938964844, "logps/rejected": -244.0, "loss": 0.6273, "rewards/accuracies": 0.6318590044975281, "rewards/chosen": 0.2621093690395355, "rewards/margins": 0.19438476860523224, "rewards/rejected": 0.06752929836511612, "step": 3150 }, { "epoch": 0.8123069001029866, "grad_norm": 158.0, "learning_rate": 9.384654994850669e-08, "logits/chosen": -0.3438476622104645, "logits/rejected": -0.38916015625, "logps/chosen": -304.0, "logps/rejected": -279.79998779296875, "loss": 0.6117, "rewards/accuracies": 0.5955769419670105, "rewards/chosen": 0.3187499940395355, "rewards/margins": 0.21103516221046448, "rewards/rejected": 0.1080322265625, "step": 3155 }, { "epoch": 0.8135942327497425, "grad_norm": 116.5, "learning_rate": 9.320288362512873e-08, "logits/chosen": -0.39262694120407104, "logits/rejected": -0.4039062559604645, "logps/chosen": -258.6000061035156, "logps/rejected": -303.6000061035156, "loss": 0.6891, "rewards/accuracies": 0.43916669487953186, "rewards/chosen": 0.16396483778953552, "rewards/margins": 0.04667968675494194, "rewards/rejected": 0.11711426079273224, "step": 3160 }, { "epoch": 0.8148815653964985, "grad_norm": 139.0, "learning_rate": 9.255921730175077e-08, "logits/chosen": -0.29913330078125, "logits/rejected": -0.25041502714157104, "logps/chosen": -280.0, "logps/rejected": -273.3999938964844, "loss": 0.6344, "rewards/accuracies": 0.6260714530944824, "rewards/chosen": 0.22578124701976776, "rewards/margins": 0.16665038466453552, "rewards/rejected": 0.05921630933880806, "step": 3165 }, { "epoch": 0.8161688980432544, "grad_norm": 120.5, "learning_rate": 9.191555097837281e-08, "logits/chosen": -0.4859375059604645, "logits/rejected": -0.36796873807907104, "logps/chosen": -313.0, "logps/rejected": -252.8000030517578, "loss": 0.6789, "rewards/accuracies": 0.5633333325386047, "rewards/chosen": 0.24921874701976776, "rewards/margins": 0.07297363132238388, "rewards/rejected": 0.17617186903953552, "step": 3170 }, { "epoch": 0.8174562306900103, "grad_norm": 145.0, "learning_rate": 9.127188465499484e-08, "logits/chosen": -0.3013671934604645, "logits/rejected": -0.177734375, "logps/chosen": -340.0, "logps/rejected": -245.8000030517578, "loss": 0.5969, "rewards/accuracies": 0.6361905336380005, "rewards/chosen": 0.2874999940395355, "rewards/margins": 0.24843749403953552, "rewards/rejected": 0.03897704929113388, "step": 3175 }, { "epoch": 0.8187435633367662, "grad_norm": 140.0, "learning_rate": 9.062821833161689e-08, "logits/chosen": -0.3291015625, "logits/rejected": -0.32890623807907104, "logps/chosen": -306.0, "logps/rejected": -300.20001220703125, "loss": 0.6813, "rewards/accuracies": 0.5214285850524902, "rewards/chosen": 0.21757812798023224, "rewards/margins": 0.08144531399011612, "rewards/rejected": 0.1357421875, "step": 3180 }, { "epoch": 0.8200308959835222, "grad_norm": 248.0, "learning_rate": 8.998455200823893e-08, "logits/chosen": -0.2060546875, "logits/rejected": -0.28178709745407104, "logps/chosen": -281.6000061035156, "logps/rejected": -232.39999389648438, "loss": 0.6289, "rewards/accuracies": 0.6170330047607422, "rewards/chosen": 0.23232421278953552, "rewards/margins": 0.173828125, "rewards/rejected": 0.05859375, "step": 3185 }, { "epoch": 0.821318228630278, "grad_norm": 113.0, "learning_rate": 8.934088568486097e-08, "logits/chosen": -0.32929688692092896, "logits/rejected": -0.2962646484375, "logps/chosen": -311.20001220703125, "logps/rejected": -258.6000061035156, "loss": 0.6336, "rewards/accuracies": 0.5575000047683716, "rewards/chosen": 0.21914061903953552, "rewards/margins": 0.16103515028953552, "rewards/rejected": 0.0582275390625, "step": 3190 }, { "epoch": 0.822605561277034, "grad_norm": 118.0, "learning_rate": 8.8697219361483e-08, "logits/chosen": -0.4208984375, "logits/rejected": -0.43085938692092896, "logps/chosen": -321.20001220703125, "logps/rejected": -277.20001220703125, "loss": 0.618, "rewards/accuracies": 0.5703571438789368, "rewards/chosen": 0.28886717557907104, "rewards/margins": 0.20820312201976776, "rewards/rejected": 0.08046875149011612, "step": 3195 }, { "epoch": 0.82389289392379, "grad_norm": 137.0, "learning_rate": 8.805355303810503e-08, "logits/chosen": -0.20947265625, "logits/rejected": -0.2705078125, "logps/chosen": -298.3999938964844, "logps/rejected": -288.20001220703125, "loss": 0.6266, "rewards/accuracies": 0.6586363911628723, "rewards/chosen": 0.26171875, "rewards/margins": 0.189453125, "rewards/rejected": 0.07158203423023224, "step": 3200 }, { "epoch": 0.8251802265705458, "grad_norm": 156.0, "learning_rate": 8.740988671472709e-08, "logits/chosen": -0.3091796934604645, "logits/rejected": -0.25029295682907104, "logps/chosen": -296.3999938964844, "logps/rejected": -300.0, "loss": 0.6727, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1865234375, "rewards/margins": 0.08686523139476776, "rewards/rejected": 0.09962920844554901, "step": 3205 }, { "epoch": 0.8264675592173018, "grad_norm": 134.0, "learning_rate": 8.676622039134912e-08, "logits/chosen": -0.3375000059604645, "logits/rejected": -0.23325195908546448, "logps/chosen": -349.79998779296875, "logps/rejected": -309.20001220703125, "loss": 0.6273, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.20664063096046448, "rewards/margins": 0.16933593153953552, "rewards/rejected": 0.03705444186925888, "step": 3210 }, { "epoch": 0.8277548918640577, "grad_norm": 142.0, "learning_rate": 8.612255406797116e-08, "logits/chosen": -0.21005859971046448, "logits/rejected": -0.16594238579273224, "logps/chosen": -289.79998779296875, "logps/rejected": -254.0, "loss": 0.6234, "rewards/accuracies": 0.628333330154419, "rewards/chosen": 0.22207030653953552, "rewards/margins": 0.17739257216453552, "rewards/rejected": 0.04484863206744194, "step": 3215 }, { "epoch": 0.8290422245108136, "grad_norm": 130.0, "learning_rate": 8.547888774459319e-08, "logits/chosen": -0.37031251192092896, "logits/rejected": -0.3466796875, "logps/chosen": -267.79998779296875, "logps/rejected": -237.5, "loss": 0.6547, "rewards/accuracies": 0.52934730052948, "rewards/chosen": 0.2525390684604645, "rewards/margins": 0.10761718451976776, "rewards/rejected": 0.14450684189796448, "step": 3220 }, { "epoch": 0.8303295571575695, "grad_norm": 103.0, "learning_rate": 8.483522142121524e-08, "logits/chosen": -0.214599609375, "logits/rejected": -0.14335937798023224, "logps/chosen": -293.0, "logps/rejected": -269.20001220703125, "loss": 0.6227, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": 0.28046876192092896, "rewards/margins": 0.20458984375, "rewards/rejected": 0.07561035454273224, "step": 3225 }, { "epoch": 0.8316168898043255, "grad_norm": 127.5, "learning_rate": 8.419155509783727e-08, "logits/chosen": -0.2749267518520355, "logits/rejected": -0.244140625, "logps/chosen": -316.6000061035156, "logps/rejected": -301.0, "loss": 0.6148, "rewards/accuracies": 0.6708333492279053, "rewards/chosen": 0.2972656190395355, "rewards/margins": 0.21240234375, "rewards/rejected": 0.08505859225988388, "step": 3230 }, { "epoch": 0.8329042224510813, "grad_norm": 124.0, "learning_rate": 8.354788877445932e-08, "logits/chosen": -0.30058592557907104, "logits/rejected": -0.19970703125, "logps/chosen": -291.6000061035156, "logps/rejected": -269.79998779296875, "loss": 0.643, "rewards/accuracies": 0.596666693687439, "rewards/chosen": 0.19179686903953552, "rewards/margins": 0.14494629204273224, "rewards/rejected": 0.04655761644244194, "step": 3235 }, { "epoch": 0.8341915550978373, "grad_norm": 110.5, "learning_rate": 8.290422245108136e-08, "logits/chosen": -0.3876953125, "logits/rejected": -0.3583984375, "logps/chosen": -274.79998779296875, "logps/rejected": -292.79998779296875, "loss": 0.6531, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2632812559604645, "rewards/margins": 0.13093261420726776, "rewards/rejected": 0.13251952826976776, "step": 3240 }, { "epoch": 0.8354788877445932, "grad_norm": 119.0, "learning_rate": 8.22605561277034e-08, "logits/chosen": -0.3109374940395355, "logits/rejected": -0.3166747987270355, "logps/chosen": -289.6000061035156, "logps/rejected": -310.3999938964844, "loss": 0.6273, "rewards/accuracies": 0.5905952453613281, "rewards/chosen": 0.23486328125, "rewards/margins": 0.17753906548023224, "rewards/rejected": 0.05722656100988388, "step": 3245 }, { "epoch": 0.8367662203913491, "grad_norm": 153.0, "learning_rate": 8.161688980432543e-08, "logits/chosen": -0.39179688692092896, "logits/rejected": -0.3709960877895355, "logps/chosen": -337.6000061035156, "logps/rejected": -267.0, "loss": 0.6141, "rewards/accuracies": 0.6185897588729858, "rewards/chosen": 0.2925781309604645, "rewards/margins": 0.20644530653953552, "rewards/rejected": 0.08544921875, "step": 3250 }, { "epoch": 0.838053553038105, "grad_norm": 109.5, "learning_rate": 8.097322348094747e-08, "logits/chosen": NaN, "logits/rejected": 0.1962890625, "logps/chosen": -239.39999389648438, "logps/rejected": -227.8000030517578, "loss": 0.6461, "rewards/accuracies": 0.5266666412353516, "rewards/chosen": 0.14935913681983948, "rewards/margins": 0.130126953125, "rewards/rejected": 0.01938476599752903, "step": 3255 }, { "epoch": 0.839340885684861, "grad_norm": 114.0, "learning_rate": 8.032955715756952e-08, "logits/chosen": -0.2994628846645355, "logits/rejected": -0.33525389432907104, "logps/chosen": -296.6000061035156, "logps/rejected": -350.79998779296875, "loss": 0.618, "rewards/accuracies": 0.6958333253860474, "rewards/chosen": 0.24160155653953552, "rewards/margins": 0.20834961533546448, "rewards/rejected": 0.03339843824505806, "step": 3260 }, { "epoch": 0.8406282183316169, "grad_norm": 117.0, "learning_rate": 7.968589083419156e-08, "logits/chosen": -0.3021484315395355, "logits/rejected": -0.23847655951976776, "logps/chosen": -320.0, "logps/rejected": -280.6000061035156, "loss": 0.6117, "rewards/accuracies": 0.661309540271759, "rewards/chosen": 0.24433593451976776, "rewards/margins": 0.23164062201976776, "rewards/rejected": 0.01226806640625, "step": 3265 }, { "epoch": 0.8419155509783728, "grad_norm": 135.0, "learning_rate": 7.904222451081359e-08, "logits/chosen": -0.3363281190395355, "logits/rejected": -0.42460936307907104, "logps/chosen": -376.0, "logps/rejected": -333.79998779296875, "loss": 0.6633, "rewards/accuracies": 0.528205156326294, "rewards/chosen": 0.33320313692092896, "rewards/margins": 0.13320311903953552, "rewards/rejected": 0.200439453125, "step": 3270 }, { "epoch": 0.8432028836251287, "grad_norm": 124.0, "learning_rate": 7.839855818743563e-08, "logits/chosen": -0.3044189512729645, "logits/rejected": -0.21269531548023224, "logps/chosen": -318.79998779296875, "logps/rejected": -260.6000061035156, "loss": 0.6023, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.29121094942092896, "rewards/margins": 0.236328125, "rewards/rejected": 0.05502929538488388, "step": 3275 }, { "epoch": 0.8444902162718847, "grad_norm": 108.5, "learning_rate": 7.775489186405767e-08, "logits/chosen": -0.3255859315395355, "logits/rejected": -0.21955566108226776, "logps/chosen": -326.3999938964844, "logps/rejected": -279.3999938964844, "loss": 0.6438, "rewards/accuracies": 0.5458333492279053, "rewards/chosen": 0.24882812798023224, "rewards/margins": 0.16729736328125, "rewards/rejected": 0.0814208984375, "step": 3280 }, { "epoch": 0.8457775489186405, "grad_norm": 119.5, "learning_rate": 7.711122554067972e-08, "logits/chosen": -0.41289061307907104, "logits/rejected": -0.3720703125, "logps/chosen": -299.20001220703125, "logps/rejected": -301.79998779296875, "loss": 0.6727, "rewards/accuracies": 0.4633333683013916, "rewards/chosen": 0.22988280653953552, "rewards/margins": 0.087890625, "rewards/rejected": 0.14189453423023224, "step": 3285 }, { "epoch": 0.8470648815653965, "grad_norm": 132.0, "learning_rate": 7.646755921730175e-08, "logits/chosen": -0.42744141817092896, "logits/rejected": -0.4136718809604645, "logps/chosen": -316.0, "logps/rejected": -319.0, "loss": 0.6227, "rewards/accuracies": 0.5833333730697632, "rewards/chosen": 0.32402342557907104, "rewards/margins": 0.20976562798023224, "rewards/rejected": 0.11376953125, "step": 3290 }, { "epoch": 0.8483522142121525, "grad_norm": 108.0, "learning_rate": 7.582389289392379e-08, "logits/chosen": -0.3814453184604645, "logits/rejected": -0.3529296815395355, "logps/chosen": -329.6000061035156, "logps/rejected": -336.79998779296875, "loss": 0.6102, "rewards/accuracies": 0.5816666483879089, "rewards/chosen": 0.2621093690395355, "rewards/margins": 0.22578124701976776, "rewards/rejected": 0.036651611328125, "step": 3295 }, { "epoch": 0.8496395468589083, "grad_norm": 121.0, "learning_rate": 7.518022657054583e-08, "logits/chosen": -0.29218751192092896, "logits/rejected": -0.2007087767124176, "logps/chosen": -294.0, "logps/rejected": -280.3999938964844, "loss": 0.6211, "rewards/accuracies": 0.5616666674613953, "rewards/chosen": 0.25468748807907104, "rewards/margins": 0.19169922173023224, "rewards/rejected": 0.06326904147863388, "step": 3300 }, { "epoch": 0.8509268795056643, "grad_norm": 278.0, "learning_rate": 7.453656024716787e-08, "logits/chosen": -0.5443359613418579, "logits/rejected": -0.4683593809604645, "logps/chosen": -358.0, "logps/rejected": -342.20001220703125, "loss": 0.6758, "rewards/accuracies": 0.4678571820259094, "rewards/chosen": 0.28593748807907104, "rewards/margins": 0.07504882663488388, "rewards/rejected": 0.21103516221046448, "step": 3305 }, { "epoch": 0.8522142121524202, "grad_norm": 137.0, "learning_rate": 7.38928939237899e-08, "logits/chosen": -0.2847656309604645, "logits/rejected": -0.22187499701976776, "logps/chosen": -346.0, "logps/rejected": -295.20001220703125, "loss": 0.618, "rewards/accuracies": 0.6631602048873901, "rewards/chosen": 0.2772460877895355, "rewards/margins": 0.197265625, "rewards/rejected": 0.07968749850988388, "step": 3310 }, { "epoch": 0.8535015447991761, "grad_norm": 106.0, "learning_rate": 7.324922760041195e-08, "logits/chosen": -0.2802734375, "logits/rejected": -0.27753907442092896, "logps/chosen": -293.6000061035156, "logps/rejected": -305.20001220703125, "loss": 0.5875, "rewards/accuracies": 0.7346212267875671, "rewards/chosen": 0.357421875, "rewards/margins": 0.2914062440395355, "rewards/rejected": 0.06621094048023224, "step": 3315 }, { "epoch": 0.854788877445932, "grad_norm": 106.5, "learning_rate": 7.260556127703399e-08, "logits/chosen": -0.31328123807907104, "logits/rejected": -0.18178100883960724, "logps/chosen": -277.29998779296875, "logps/rejected": -267.0, "loss": 0.6414, "rewards/accuracies": 0.5647222399711609, "rewards/chosen": 0.16628417372703552, "rewards/margins": 0.14924316108226776, "rewards/rejected": 0.01707153394818306, "step": 3320 }, { "epoch": 0.856076210092688, "grad_norm": 139.0, "learning_rate": 7.196189495365603e-08, "logits/chosen": -0.3167968690395355, "logits/rejected": -0.2793945372104645, "logps/chosen": -309.6000061035156, "logps/rejected": -298.79998779296875, "loss": 0.6578, "rewards/accuracies": 0.5944048166275024, "rewards/chosen": 0.20253905653953552, "rewards/margins": 0.11113281548023224, "rewards/rejected": 0.09147949516773224, "step": 3325 }, { "epoch": 0.8573635427394438, "grad_norm": 121.0, "learning_rate": 7.131822863027806e-08, "logits/chosen": -0.3707031309604645, "logits/rejected": -0.29863280057907104, "logps/chosen": -354.79998779296875, "logps/rejected": -315.20001220703125, "loss": 0.6492, "rewards/accuracies": 0.5491666793823242, "rewards/chosen": 0.27324217557907104, "rewards/margins": 0.1591796875, "rewards/rejected": 0.11362304538488388, "step": 3330 }, { "epoch": 0.8586508753861998, "grad_norm": 110.0, "learning_rate": 7.067456230690009e-08, "logits/chosen": -0.29804688692092896, "logits/rejected": -0.21992187201976776, "logps/chosen": -245.0, "logps/rejected": -221.8000030517578, "loss": 0.6516, "rewards/accuracies": 0.559166669845581, "rewards/chosen": 0.20097656548023224, "rewards/margins": 0.142578125, "rewards/rejected": 0.05837402492761612, "step": 3335 }, { "epoch": 0.8599382080329557, "grad_norm": 120.5, "learning_rate": 7.003089598352215e-08, "logits/chosen": -0.3306640684604645, "logits/rejected": -0.31718748807907104, "logps/chosen": -327.6000061035156, "logps/rejected": -299.20001220703125, "loss": 0.6172, "rewards/accuracies": 0.5722435712814331, "rewards/chosen": 0.28398436307907104, "rewards/margins": 0.21296386420726776, "rewards/rejected": 0.07094726711511612, "step": 3340 }, { "epoch": 0.8612255406797117, "grad_norm": 139.0, "learning_rate": 6.938722966014417e-08, "logits/chosen": -0.21152344346046448, "logits/rejected": -0.34541016817092896, "logps/chosen": -325.6000061035156, "logps/rejected": -288.0, "loss": 0.6602, "rewards/accuracies": 0.5936905145645142, "rewards/chosen": 0.2802734375, "rewards/margins": 0.11752929538488388, "rewards/rejected": 0.162841796875, "step": 3345 }, { "epoch": 0.8625128733264675, "grad_norm": 117.0, "learning_rate": 6.874356333676622e-08, "logits/chosen": -0.25078123807907104, "logits/rejected": -0.11328125, "logps/chosen": -319.6000061035156, "logps/rejected": -287.0, "loss": 0.6406, "rewards/accuracies": 0.591025710105896, "rewards/chosen": 0.22065429389476776, "rewards/margins": 0.18828125298023224, "rewards/rejected": 0.03218994289636612, "step": 3350 }, { "epoch": 0.8638002059732235, "grad_norm": 124.0, "learning_rate": 6.809989701338825e-08, "logits/chosen": -0.18437500298023224, "logits/rejected": -0.12412109225988388, "logps/chosen": -316.20001220703125, "logps/rejected": -305.0, "loss": 0.6445, "rewards/accuracies": 0.6096428632736206, "rewards/chosen": 0.23906250298023224, "rewards/margins": 0.13754883408546448, "rewards/rejected": 0.101318359375, "step": 3355 }, { "epoch": 0.8650875386199794, "grad_norm": 118.5, "learning_rate": 6.74562306900103e-08, "logits/chosen": -0.314697265625, "logits/rejected": -0.18603515625, "logps/chosen": -296.1000061035156, "logps/rejected": -278.20001220703125, "loss": 0.6414, "rewards/accuracies": 0.6257143020629883, "rewards/chosen": 0.26386719942092896, "rewards/margins": 0.15156249701976776, "rewards/rejected": 0.11268310248851776, "step": 3360 }, { "epoch": 0.8663748712667353, "grad_norm": 121.5, "learning_rate": 6.681256436663233e-08, "logits/chosen": -0.42460936307907104, "logits/rejected": -0.20830078423023224, "logps/chosen": -309.6000061035156, "logps/rejected": -266.79998779296875, "loss": 0.6062, "rewards/accuracies": 0.6019047498703003, "rewards/chosen": 0.30170899629592896, "rewards/margins": 0.23222656548023224, "rewards/rejected": 0.06953124701976776, "step": 3365 }, { "epoch": 0.8676622039134912, "grad_norm": 167.0, "learning_rate": 6.616889804325438e-08, "logits/chosen": -0.3453125059604645, "logits/rejected": -0.45610350370407104, "logps/chosen": -323.3999938964844, "logps/rejected": -248.10000610351562, "loss": 0.6531, "rewards/accuracies": 0.6175000071525574, "rewards/chosen": 0.27958983182907104, "rewards/margins": 0.13298340141773224, "rewards/rejected": 0.14682617783546448, "step": 3370 }, { "epoch": 0.8689495365602472, "grad_norm": 122.5, "learning_rate": 6.55252317198764e-08, "logits/chosen": -0.455078125, "logits/rejected": -0.3486328125, "logps/chosen": -315.0, "logps/rejected": -281.0, "loss": 0.6172, "rewards/accuracies": 0.5845237970352173, "rewards/chosen": 0.28398436307907104, "rewards/margins": 0.21992187201976776, "rewards/rejected": 0.06430663913488388, "step": 3375 }, { "epoch": 0.870236869207003, "grad_norm": 131.0, "learning_rate": 6.488156539649846e-08, "logits/chosen": -0.38701170682907104, "logits/rejected": -0.3374786376953125, "logps/chosen": -346.0, "logps/rejected": -277.3999938964844, "loss": 0.6531, "rewards/accuracies": 0.4858333468437195, "rewards/chosen": 0.25166016817092896, "rewards/margins": 0.13374023139476776, "rewards/rejected": 0.11772461235523224, "step": 3380 }, { "epoch": 0.871524201853759, "grad_norm": 100.5, "learning_rate": 6.423789907312049e-08, "logits/chosen": -0.3072265684604645, "logits/rejected": -0.25273436307907104, "logps/chosen": -283.0, "logps/rejected": -280.20001220703125, "loss": 0.6375, "rewards/accuracies": 0.5819047689437866, "rewards/chosen": 0.22343750298023224, "rewards/margins": 0.16240234673023224, "rewards/rejected": 0.061048220843076706, "step": 3385 }, { "epoch": 0.872811534500515, "grad_norm": 118.0, "learning_rate": 6.359423274974253e-08, "logits/chosen": -0.25849610567092896, "logits/rejected": -0.20258788764476776, "logps/chosen": -309.20001220703125, "logps/rejected": -269.3999938964844, "loss": 0.6211, "rewards/accuracies": 0.635952353477478, "rewards/chosen": 0.2568359375, "rewards/margins": 0.19400635361671448, "rewards/rejected": 0.06290893256664276, "step": 3390 }, { "epoch": 0.8740988671472708, "grad_norm": 154.0, "learning_rate": 6.295056642636456e-08, "logits/chosen": -0.29765623807907104, "logits/rejected": -0.19384765625, "logps/chosen": -363.20001220703125, "logps/rejected": -321.3999938964844, "loss": 0.6648, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.2754882872104645, "rewards/margins": 0.12290038913488388, "rewards/rejected": 0.15273436903953552, "step": 3395 }, { "epoch": 0.8753861997940268, "grad_norm": 107.5, "learning_rate": 6.23069001029866e-08, "logits/chosen": -0.22666016221046448, "logits/rejected": 0.07578124850988388, "logps/chosen": -263.0, "logps/rejected": -241.3000030517578, "loss": 0.6531, "rewards/accuracies": 0.5434523820877075, "rewards/chosen": 0.19697265326976776, "rewards/margins": 0.128173828125, "rewards/rejected": 0.06894531100988388, "step": 3400 }, { "epoch": 0.8766735324407827, "grad_norm": 149.0, "learning_rate": 6.166323377960865e-08, "logits/chosen": -0.2857421934604645, "logits/rejected": -0.2687011659145355, "logps/chosen": -334.3999938964844, "logps/rejected": -317.20001220703125, "loss": 0.632, "rewards/accuracies": 0.5449999570846558, "rewards/chosen": 0.2333984375, "rewards/margins": 0.166259765625, "rewards/rejected": 0.067138671875, "step": 3405 }, { "epoch": 0.8779608650875386, "grad_norm": 102.5, "learning_rate": 6.101956745623069e-08, "logits/chosen": -0.275390625, "logits/rejected": -0.253173828125, "logps/chosen": -314.3999938964844, "logps/rejected": -283.3999938964844, "loss": 0.5984, "rewards/accuracies": 0.6744444370269775, "rewards/chosen": 0.24843749403953552, "rewards/margins": 0.2520996034145355, "rewards/rejected": -0.0038818358443677425, "step": 3410 }, { "epoch": 0.8792481977342945, "grad_norm": 196.0, "learning_rate": 6.037590113285273e-08, "logits/chosen": -0.30078125, "logits/rejected": -0.18359375, "logps/chosen": -267.3999938964844, "logps/rejected": -269.20001220703125, "loss": 0.6383, "rewards/accuracies": 0.6523076891899109, "rewards/chosen": 0.2835937440395355, "rewards/margins": 0.16505737602710724, "rewards/rejected": 0.11874999850988388, "step": 3415 }, { "epoch": 0.8805355303810505, "grad_norm": 114.0, "learning_rate": 5.973223480947476e-08, "logits/chosen": -0.22999267280101776, "logits/rejected": -0.13823242485523224, "logps/chosen": -341.6000061035156, "logps/rejected": -312.79998779296875, "loss": 0.6141, "rewards/accuracies": 0.6017857193946838, "rewards/chosen": 0.24257811903953552, "rewards/margins": 0.19882813096046448, "rewards/rejected": 0.04377441480755806, "step": 3420 }, { "epoch": 0.8818228630278064, "grad_norm": 134.0, "learning_rate": 5.9088568486096805e-08, "logits/chosen": -0.29736328125, "logits/rejected": -0.2601562440395355, "logps/chosen": -346.3999938964844, "logps/rejected": -277.20001220703125, "loss": 0.6586, "rewards/accuracies": 0.5342856645584106, "rewards/chosen": 0.24033203721046448, "rewards/margins": 0.13450928032398224, "rewards/rejected": 0.10595703125, "step": 3425 }, { "epoch": 0.8831101956745623, "grad_norm": 113.0, "learning_rate": 5.844490216271884e-08, "logits/chosen": -0.29218751192092896, "logits/rejected": -0.29277342557907104, "logps/chosen": -344.20001220703125, "logps/rejected": -321.20001220703125, "loss": 0.643, "rewards/accuracies": 0.590833306312561, "rewards/chosen": 0.3045898377895355, "rewards/margins": 0.1748046875, "rewards/rejected": 0.13039550185203552, "step": 3430 }, { "epoch": 0.8843975283213182, "grad_norm": 168.0, "learning_rate": 5.7801235839340884e-08, "logits/chosen": -0.0966796875, "logits/rejected": -0.27324217557907104, "logps/chosen": -267.79998779296875, "logps/rejected": -252.60000610351562, "loss": 0.6359, "rewards/accuracies": 0.5684090852737427, "rewards/chosen": 0.2890625, "rewards/margins": 0.15507812798023224, "rewards/rejected": 0.1337890625, "step": 3435 }, { "epoch": 0.8856848609680742, "grad_norm": 137.0, "learning_rate": 5.715756951596292e-08, "logits/chosen": -0.21096190810203552, "logits/rejected": -0.233154296875, "logps/chosen": -306.79998779296875, "logps/rejected": -338.3999938964844, "loss": 0.6344, "rewards/accuracies": 0.5892857313156128, "rewards/chosen": 0.2900390625, "rewards/margins": 0.17460937798023224, "rewards/rejected": 0.11503906548023224, "step": 3440 }, { "epoch": 0.88697219361483, "grad_norm": 114.0, "learning_rate": 5.651390319258496e-08, "logits/chosen": -0.29609376192092896, "logits/rejected": -0.3316406309604645, "logps/chosen": -307.20001220703125, "logps/rejected": -342.79998779296875, "loss": 0.6406, "rewards/accuracies": 0.5260897874832153, "rewards/chosen": 0.30078125, "rewards/margins": 0.14577636122703552, "rewards/rejected": 0.15468749403953552, "step": 3445 }, { "epoch": 0.888259526261586, "grad_norm": 124.5, "learning_rate": 5.5870236869207e-08, "logits/chosen": -0.22353515028953552, "logits/rejected": -0.25822752714157104, "logps/chosen": -305.79998779296875, "logps/rejected": -244.0, "loss": 0.5922, "rewards/accuracies": 0.71833336353302, "rewards/chosen": 0.3046875, "rewards/margins": 0.25566405057907104, "rewards/rejected": 0.04877319186925888, "step": 3450 }, { "epoch": 0.889546858908342, "grad_norm": 113.5, "learning_rate": 5.522657054582904e-08, "logits/chosen": -0.2716064453125, "logits/rejected": -0.2742675840854645, "logps/chosen": -265.6000061035156, "logps/rejected": -281.0, "loss": 0.6789, "rewards/accuracies": 0.5327380895614624, "rewards/chosen": 0.12563475966453552, "rewards/margins": 0.06841430813074112, "rewards/rejected": 0.0574951171875, "step": 3455 }, { "epoch": 0.8908341915550978, "grad_norm": 113.5, "learning_rate": 5.458290422245108e-08, "logits/chosen": -0.263671875, "logits/rejected": -0.23574218153953552, "logps/chosen": -303.20001220703125, "logps/rejected": -270.0, "loss": 0.6297, "rewards/accuracies": 0.5909523963928223, "rewards/chosen": 0.21855469048023224, "rewards/margins": 0.19365234673023224, "rewards/rejected": 0.02471618726849556, "step": 3460 }, { "epoch": 0.8921215242018538, "grad_norm": 168.0, "learning_rate": 5.393923789907312e-08, "logits/chosen": -0.337890625, "logits/rejected": -0.23935547471046448, "logps/chosen": -253.8000030517578, "logps/rejected": -269.0, "loss": 0.6367, "rewards/accuracies": 0.5703571438789368, "rewards/chosen": 0.22324219346046448, "rewards/margins": 0.13935546576976776, "rewards/rejected": 0.08359374850988388, "step": 3465 }, { "epoch": 0.8934088568486097, "grad_norm": 104.5, "learning_rate": 5.3295571575695156e-08, "logits/chosen": -0.29121094942092896, "logits/rejected": -0.15493163466453552, "logps/chosen": -266.6000061035156, "logps/rejected": -239.60000610351562, "loss": 0.6344, "rewards/accuracies": 0.5727273225784302, "rewards/chosen": 0.21601562201976776, "rewards/margins": 0.15639647841453552, "rewards/rejected": 0.05966796725988388, "step": 3470 }, { "epoch": 0.8946961894953656, "grad_norm": 147.0, "learning_rate": 5.26519052523172e-08, "logits/chosen": -0.353515625, "logits/rejected": -0.2642578184604645, "logps/chosen": -345.6000061035156, "logps/rejected": -321.6000061035156, "loss": 0.6078, "rewards/accuracies": 0.690833330154419, "rewards/chosen": 0.3578124940395355, "rewards/margins": 0.24179688096046448, "rewards/rejected": 0.11582031100988388, "step": 3475 }, { "epoch": 0.8959835221421215, "grad_norm": 137.0, "learning_rate": 5.2008238928939235e-08, "logits/chosen": -0.4105590879917145, "logits/rejected": -0.325186163187027, "logps/chosen": -331.0, "logps/rejected": -328.3999938964844, "loss": 0.6687, "rewards/accuracies": 0.5508333444595337, "rewards/chosen": 0.2880859375, "rewards/margins": 0.10859374701976776, "rewards/rejected": 0.17958983778953552, "step": 3480 }, { "epoch": 0.8972708547888775, "grad_norm": 124.0, "learning_rate": 5.136457260556128e-08, "logits/chosen": -0.16062012314796448, "logits/rejected": -0.12939453125, "logps/chosen": -285.3999938964844, "logps/rejected": -296.6000061035156, "loss": 0.6344, "rewards/accuracies": 0.6742857098579407, "rewards/chosen": 0.19550780951976776, "rewards/margins": 0.18017578125, "rewards/rejected": 0.0155029296875, "step": 3485 }, { "epoch": 0.8985581874356333, "grad_norm": 145.0, "learning_rate": 5.0720906282183313e-08, "logits/chosen": -0.22148437798023224, "logits/rejected": -0.171722412109375, "logps/chosen": -308.3999938964844, "logps/rejected": -294.79998779296875, "loss": 0.6484, "rewards/accuracies": 0.5255952477455139, "rewards/chosen": 0.216796875, "rewards/margins": 0.13293953239917755, "rewards/rejected": 0.08413086086511612, "step": 3490 }, { "epoch": 0.8998455200823893, "grad_norm": 125.0, "learning_rate": 5.0077239958805356e-08, "logits/chosen": -0.4683593809604645, "logits/rejected": -0.46660155057907104, "logps/chosen": -370.0, "logps/rejected": -301.79998779296875, "loss": 0.6164, "rewards/accuracies": 0.5480769276618958, "rewards/chosen": 0.3648437559604645, "rewards/margins": 0.22224120795726776, "rewards/rejected": 0.14261475205421448, "step": 3495 }, { "epoch": 0.9011328527291452, "grad_norm": 153.0, "learning_rate": 4.943357363542739e-08, "logits/chosen": -0.28349608182907104, "logits/rejected": -0.05732421949505806, "logps/chosen": -274.20001220703125, "logps/rejected": -234.1999969482422, "loss": 0.6062, "rewards/accuracies": 0.6841667294502258, "rewards/chosen": 0.24746093153953552, "rewards/margins": 0.23349609971046448, "rewards/rejected": 0.01389160193502903, "step": 3500 }, { "epoch": 0.9024201853759012, "grad_norm": 106.5, "learning_rate": 4.8789907312049435e-08, "logits/chosen": -0.35468751192092896, "logits/rejected": -0.29439085721969604, "logps/chosen": -342.0, "logps/rejected": -309.79998779296875, "loss": 0.6242, "rewards/accuracies": 0.5848718285560608, "rewards/chosen": 0.26972657442092896, "rewards/margins": 0.20297852158546448, "rewards/rejected": 0.06708984076976776, "step": 3505 }, { "epoch": 0.903707518022657, "grad_norm": 124.5, "learning_rate": 4.814624098867147e-08, "logits/chosen": -0.31621092557907104, "logits/rejected": -0.3505859375, "logps/chosen": -326.20001220703125, "logps/rejected": -292.0, "loss": 0.6266, "rewards/accuracies": 0.5948077440261841, "rewards/chosen": 0.29736328125, "rewards/margins": 0.21533203125, "rewards/rejected": 0.08217773586511612, "step": 3510 }, { "epoch": 0.904994850669413, "grad_norm": 154.0, "learning_rate": 4.7502574665293514e-08, "logits/chosen": -0.2928710877895355, "logits/rejected": -0.228363037109375, "logps/chosen": -351.20001220703125, "logps/rejected": -297.79998779296875, "loss": 0.6133, "rewards/accuracies": 0.67166668176651, "rewards/chosen": 0.3021484315395355, "rewards/margins": 0.23613281548023224, "rewards/rejected": 0.06572265923023224, "step": 3515 }, { "epoch": 0.9062821833161689, "grad_norm": 143.0, "learning_rate": 4.685890834191555e-08, "logits/chosen": -0.24140624701976776, "logits/rejected": -0.12744140625, "logps/chosen": -287.6000061035156, "logps/rejected": -283.6000061035156, "loss": 0.6242, "rewards/accuracies": 0.6178571581840515, "rewards/chosen": 0.28691405057907104, "rewards/margins": 0.21601562201976776, "rewards/rejected": 0.07041015475988388, "step": 3520 }, { "epoch": 0.9075695159629248, "grad_norm": 129.0, "learning_rate": 4.621524201853759e-08, "logits/chosen": -0.26093751192092896, "logits/rejected": -0.28369140625, "logps/chosen": -300.20001220703125, "logps/rejected": -318.3999938964844, "loss": 0.6398, "rewards/accuracies": 0.629807710647583, "rewards/chosen": 0.24824218451976776, "rewards/margins": 0.15468749403953552, "rewards/rejected": 0.09389648586511612, "step": 3525 }, { "epoch": 0.9088568486096807, "grad_norm": 133.0, "learning_rate": 4.557157569515963e-08, "logits/chosen": -0.3082031309604645, "logits/rejected": -0.29545897245407104, "logps/chosen": -320.20001220703125, "logps/rejected": -283.6000061035156, "loss": 0.6312, "rewards/accuracies": 0.5983333587646484, "rewards/chosen": 0.2562499940395355, "rewards/margins": 0.15292969346046448, "rewards/rejected": 0.10289917141199112, "step": 3530 }, { "epoch": 0.9101441812564367, "grad_norm": 124.0, "learning_rate": 4.492790937178167e-08, "logits/chosen": -0.17036132514476776, "logits/rejected": -0.14531250298023224, "logps/chosen": -266.6000061035156, "logps/rejected": -270.79998779296875, "loss": 0.65, "rewards/accuracies": 0.5891667008399963, "rewards/chosen": 0.16328124701976776, "rewards/margins": 0.13124999403953552, "rewards/rejected": 0.0318603515625, "step": 3535 }, { "epoch": 0.9114315139031925, "grad_norm": 117.5, "learning_rate": 4.42842430484037e-08, "logits/chosen": -0.2607421875, "logits/rejected": -0.18212890625, "logps/chosen": -302.79998779296875, "logps/rejected": -280.6000061035156, "loss": 0.618, "rewards/accuracies": 0.6609524488449097, "rewards/chosen": 0.29423826932907104, "rewards/margins": 0.215423583984375, "rewards/rejected": 0.07817383110523224, "step": 3540 }, { "epoch": 0.9127188465499485, "grad_norm": 157.0, "learning_rate": 4.364057672502574e-08, "logits/chosen": -0.28193360567092896, "logits/rejected": -0.3349609375, "logps/chosen": -337.20001220703125, "logps/rejected": -337.20001220703125, "loss": 0.6531, "rewards/accuracies": 0.522261917591095, "rewards/chosen": 0.25947266817092896, "rewards/margins": 0.14414063096046448, "rewards/rejected": 0.1151123046875, "step": 3545 }, { "epoch": 0.9140061791967045, "grad_norm": 111.0, "learning_rate": 4.299691040164778e-08, "logits/chosen": -0.3423828184604645, "logits/rejected": -0.32197266817092896, "logps/chosen": -304.0, "logps/rejected": -315.0, "loss": 0.6453, "rewards/accuracies": 0.5691666603088379, "rewards/chosen": 0.20878906548023224, "rewards/margins": 0.12522277235984802, "rewards/rejected": 0.08369140326976776, "step": 3550 }, { "epoch": 0.9152935118434603, "grad_norm": 110.5, "learning_rate": 4.235324407826982e-08, "logits/chosen": -0.27220457792282104, "logits/rejected": -0.2060546875, "logps/chosen": -304.3999938964844, "logps/rejected": -302.3999938964844, "loss": 0.6672, "rewards/accuracies": 0.5699999928474426, "rewards/chosen": 0.22792968153953552, "rewards/margins": 0.08608398586511612, "rewards/rejected": 0.141845703125, "step": 3555 }, { "epoch": 0.9165808444902163, "grad_norm": 137.0, "learning_rate": 4.170957775489186e-08, "logits/chosen": -0.38593751192092896, "logits/rejected": -0.4022460877895355, "logps/chosen": -349.6000061035156, "logps/rejected": -299.3999938964844, "loss": 0.6805, "rewards/accuracies": 0.48500004410743713, "rewards/chosen": 0.20126953721046448, "rewards/margins": 0.06210937350988388, "rewards/rejected": 0.138916015625, "step": 3560 }, { "epoch": 0.9178681771369722, "grad_norm": 194.0, "learning_rate": 4.10659114315139e-08, "logits/chosen": -0.3218750059604645, "logits/rejected": -0.2784179747104645, "logps/chosen": -321.20001220703125, "logps/rejected": -318.6000061035156, "loss": 0.6523, "rewards/accuracies": 0.5816666483879089, "rewards/chosen": 0.20146484673023224, "rewards/margins": 0.14023438096046448, "rewards/rejected": 0.06123046949505806, "step": 3565 }, { "epoch": 0.9191555097837281, "grad_norm": 159.0, "learning_rate": 4.042224510813594e-08, "logits/chosen": -0.2793945372104645, "logits/rejected": -0.1826171875, "logps/chosen": -291.1000061035156, "logps/rejected": -257.8999938964844, "loss": 0.6281, "rewards/accuracies": 0.59333336353302, "rewards/chosen": 0.2933593690395355, "rewards/margins": 0.17919921875, "rewards/rejected": 0.11391601711511612, "step": 3570 }, { "epoch": 0.920442842430484, "grad_norm": 182.0, "learning_rate": 3.977857878475798e-08, "logits/chosen": -0.2715820372104645, "logits/rejected": -0.22187499701976776, "logps/chosen": -307.0, "logps/rejected": -233.60000610351562, "loss": 0.6414, "rewards/accuracies": 0.6338095664978027, "rewards/chosen": 0.2582031190395355, "rewards/margins": 0.16445311903953552, "rewards/rejected": 0.09379883110523224, "step": 3575 }, { "epoch": 0.92173017507724, "grad_norm": 144.0, "learning_rate": 3.9134912461380015e-08, "logits/chosen": -0.3248046934604645, "logits/rejected": -0.3617187440395355, "logps/chosen": -310.20001220703125, "logps/rejected": -314.6000061035156, "loss": 0.6484, "rewards/accuracies": 0.590833306312561, "rewards/chosen": 0.21698608994483948, "rewards/margins": 0.11188964545726776, "rewards/rejected": 0.10512695461511612, "step": 3580 }, { "epoch": 0.9230175077239959, "grad_norm": 116.5, "learning_rate": 3.849124613800206e-08, "logits/chosen": -0.29902344942092896, "logits/rejected": -0.33642578125, "logps/chosen": -333.79998779296875, "logps/rejected": -262.20001220703125, "loss": 0.6305, "rewards/accuracies": 0.6308333277702332, "rewards/chosen": 0.27197265625, "rewards/margins": 0.17607422173023224, "rewards/rejected": 0.09562988579273224, "step": 3585 }, { "epoch": 0.9243048403707518, "grad_norm": 116.5, "learning_rate": 3.7847579814624094e-08, "logits/chosen": -0.5189453363418579, "logits/rejected": -0.20527343451976776, "logps/chosen": -296.20001220703125, "logps/rejected": -271.0, "loss": 0.6164, "rewards/accuracies": 0.5714743733406067, "rewards/chosen": 0.3115234375, "rewards/margins": 0.203125, "rewards/rejected": 0.10854492336511612, "step": 3590 }, { "epoch": 0.9255921730175077, "grad_norm": 126.0, "learning_rate": 3.720391349124614e-08, "logits/chosen": -0.331787109375, "logits/rejected": -0.181884765625, "logps/chosen": -320.3999938964844, "logps/rejected": -310.3999938964844, "loss": 0.6625, "rewards/accuracies": 0.5060606002807617, "rewards/chosen": 0.27617186307907104, "rewards/margins": 0.11630859225988388, "rewards/rejected": 0.16054686903953552, "step": 3595 }, { "epoch": 0.9268795056642637, "grad_norm": 115.0, "learning_rate": 3.656024716786817e-08, "logits/chosen": -0.3509765565395355, "logits/rejected": -0.22382812201976776, "logps/chosen": -272.8999938964844, "logps/rejected": -277.79998779296875, "loss": 0.6148, "rewards/accuracies": 0.5809524059295654, "rewards/chosen": 0.2828125059604645, "rewards/margins": 0.22568359971046448, "rewards/rejected": 0.05755005031824112, "step": 3600 }, { "epoch": 0.9281668383110195, "grad_norm": 152.0, "learning_rate": 3.5916580844490216e-08, "logits/chosen": -0.28779298067092896, "logits/rejected": -0.12056274712085724, "logps/chosen": -262.20001220703125, "logps/rejected": -263.6000061035156, "loss": 0.6508, "rewards/accuracies": 0.5864377617835999, "rewards/chosen": 0.23867186903953552, "rewards/margins": 0.135498046875, "rewards/rejected": 0.10341797024011612, "step": 3605 }, { "epoch": 0.9294541709577755, "grad_norm": 134.0, "learning_rate": 3.527291452111225e-08, "logits/chosen": -0.19916991889476776, "logits/rejected": -0.18837890028953552, "logps/chosen": -305.20001220703125, "logps/rejected": -259.79998779296875, "loss": 0.6461, "rewards/accuracies": 0.6550000309944153, "rewards/chosen": 0.20624999701976776, "rewards/margins": 0.13046875596046448, "rewards/rejected": 0.07585449516773224, "step": 3610 }, { "epoch": 0.9307415036045315, "grad_norm": 104.5, "learning_rate": 3.4629248197734294e-08, "logits/chosen": -0.27177733182907104, "logits/rejected": -0.24208983778953552, "logps/chosen": -338.79998779296875, "logps/rejected": -320.0, "loss": 0.6266, "rewards/accuracies": 0.6283333897590637, "rewards/chosen": 0.32597655057907104, "rewards/margins": 0.23333740234375, "rewards/rejected": 0.09233398735523224, "step": 3615 }, { "epoch": 0.9320288362512873, "grad_norm": 131.0, "learning_rate": 3.398558187435633e-08, "logits/chosen": -0.4320312440395355, "logits/rejected": -0.26123046875, "logps/chosen": -353.6000061035156, "logps/rejected": -286.20001220703125, "loss": 0.618, "rewards/accuracies": 0.5667857527732849, "rewards/chosen": 0.3232421875, "rewards/margins": 0.20693358778953552, "rewards/rejected": 0.11650390923023224, "step": 3620 }, { "epoch": 0.9333161688980433, "grad_norm": 121.5, "learning_rate": 3.334191555097837e-08, "logits/chosen": -0.36328125, "logits/rejected": -0.3340820372104645, "logps/chosen": -306.3999938964844, "logps/rejected": -273.20001220703125, "loss": 0.6391, "rewards/accuracies": 0.5491666793823242, "rewards/chosen": 0.24765625596046448, "rewards/margins": 0.13886718451976776, "rewards/rejected": 0.10867919772863388, "step": 3625 }, { "epoch": 0.9346035015447992, "grad_norm": 114.0, "learning_rate": 3.269824922760041e-08, "logits/chosen": -0.2451171875, "logits/rejected": -0.21533203125, "logps/chosen": -309.79998779296875, "logps/rejected": -321.79998779296875, "loss": 0.6398, "rewards/accuracies": 0.5470238924026489, "rewards/chosen": 0.2548828125, "rewards/margins": 0.16440430283546448, "rewards/rejected": 0.09018554538488388, "step": 3630 }, { "epoch": 0.935890834191555, "grad_norm": 135.0, "learning_rate": 3.205458290422245e-08, "logits/chosen": -0.37651365995407104, "logits/rejected": -0.39238280057907104, "logps/chosen": -308.3999938964844, "logps/rejected": -295.6000061035156, "loss": 0.6891, "rewards/accuracies": 0.5861722230911255, "rewards/chosen": 0.22431640326976776, "rewards/margins": 0.06694336235523224, "rewards/rejected": 0.15732422471046448, "step": 3635 }, { "epoch": 0.937178166838311, "grad_norm": 130.0, "learning_rate": 3.141091658084449e-08, "logits/chosen": -0.48320311307907104, "logits/rejected": -0.4507812559604645, "logps/chosen": -325.6000061035156, "logps/rejected": -284.20001220703125, "loss": 0.618, "rewards/accuracies": 0.6336363554000854, "rewards/chosen": 0.2865234315395355, "rewards/margins": 0.20156249403953552, "rewards/rejected": 0.08525390923023224, "step": 3640 }, { "epoch": 0.938465499485067, "grad_norm": 168.0, "learning_rate": 3.076725025746653e-08, "logits/chosen": -0.39921873807907104, "logits/rejected": -0.3404296934604645, "logps/chosen": -377.0, "logps/rejected": -296.0, "loss": 0.6516, "rewards/accuracies": 0.5141667127609253, "rewards/chosen": 0.2718749940395355, "rewards/margins": 0.13231201469898224, "rewards/rejected": 0.13996581733226776, "step": 3645 }, { "epoch": 0.9397528321318228, "grad_norm": 302.0, "learning_rate": 3.0123583934088567e-08, "logits/chosen": -0.14707031846046448, "logits/rejected": -0.29267579317092896, "logps/chosen": -329.79998779296875, "logps/rejected": -293.79998779296875, "loss": 0.6219, "rewards/accuracies": 0.5672435760498047, "rewards/chosen": 0.3365234434604645, "rewards/margins": 0.20224609971046448, "rewards/rejected": 0.13422851264476776, "step": 3650 }, { "epoch": 0.9410401647785788, "grad_norm": 97.5, "learning_rate": 2.9479917610710606e-08, "logits/chosen": -0.19732666015625, "logits/rejected": -0.1516426056623459, "logps/chosen": -307.20001220703125, "logps/rejected": -295.0, "loss": 0.6484, "rewards/accuracies": 0.6291667222976685, "rewards/chosen": 0.22333984076976776, "rewards/margins": 0.15231934189796448, "rewards/rejected": 0.07080078125, "step": 3655 }, { "epoch": 0.9423274974253347, "grad_norm": 154.0, "learning_rate": 2.8836251287332645e-08, "logits/chosen": -0.30878907442092896, "logits/rejected": -0.144927978515625, "logps/chosen": -320.0, "logps/rejected": -289.79998779296875, "loss": 0.6594, "rewards/accuracies": 0.6585256457328796, "rewards/chosen": 0.2255859375, "rewards/margins": 0.12851563096046448, "rewards/rejected": 0.09730835258960724, "step": 3660 }, { "epoch": 0.9436148300720907, "grad_norm": 116.5, "learning_rate": 2.8192584963954685e-08, "logits/chosen": -0.32011717557907104, "logits/rejected": -0.32927244901657104, "logps/chosen": -312.3999938964844, "logps/rejected": -297.3999938964844, "loss": 0.6305, "rewards/accuracies": 0.57833331823349, "rewards/chosen": 0.2671875059604645, "rewards/margins": 0.19163207709789276, "rewards/rejected": 0.07548828423023224, "step": 3665 }, { "epoch": 0.9449021627188465, "grad_norm": 151.0, "learning_rate": 2.7548918640576724e-08, "logits/chosen": -0.2674804627895355, "logits/rejected": -0.3290039002895355, "logps/chosen": -290.0, "logps/rejected": -227.1999969482422, "loss": 0.6336, "rewards/accuracies": 0.6341667175292969, "rewards/chosen": 0.25078123807907104, "rewards/margins": 0.15546874701976776, "rewards/rejected": 0.09553222358226776, "step": 3670 }, { "epoch": 0.9461894953656025, "grad_norm": 226.0, "learning_rate": 2.6905252317198764e-08, "logits/chosen": -0.37128907442092896, "logits/rejected": -0.361328125, "logps/chosen": -294.0, "logps/rejected": -233.0, "loss": 0.6406, "rewards/accuracies": 0.6008333563804626, "rewards/chosen": 0.32929688692092896, "rewards/margins": 0.14384765923023224, "rewards/rejected": 0.18520507216453552, "step": 3675 }, { "epoch": 0.9474768280123584, "grad_norm": 121.0, "learning_rate": 2.6261585993820803e-08, "logits/chosen": -0.19414062798023224, "logits/rejected": -0.12812499701976776, "logps/chosen": -256.20001220703125, "logps/rejected": -241.1999969482422, "loss": 0.6195, "rewards/accuracies": 0.6530769467353821, "rewards/chosen": 0.23745116591453552, "rewards/margins": 0.1982421875, "rewards/rejected": 0.03925781324505806, "step": 3680 }, { "epoch": 0.9487641606591143, "grad_norm": 158.0, "learning_rate": 2.5617919670442842e-08, "logits/chosen": -0.23886719346046448, "logits/rejected": -0.234375, "logps/chosen": -284.0, "logps/rejected": -302.3999938964844, "loss": 0.6484, "rewards/accuracies": 0.5858333706855774, "rewards/chosen": 0.21220703423023224, "rewards/margins": 0.13920898735523224, "rewards/rejected": 0.07294921576976776, "step": 3685 }, { "epoch": 0.9500514933058702, "grad_norm": 110.0, "learning_rate": 2.497425334706488e-08, "logits/chosen": -0.31794434785842896, "logits/rejected": -0.38203126192092896, "logps/chosen": -317.20001220703125, "logps/rejected": -276.20001220703125, "loss": 0.6414, "rewards/accuracies": 0.5892857313156128, "rewards/chosen": 0.306640625, "rewards/margins": 0.15634766221046448, "rewards/rejected": 0.15019531548023224, "step": 3690 }, { "epoch": 0.9513388259526262, "grad_norm": 266.0, "learning_rate": 2.433058702368692e-08, "logits/chosen": -0.24638672173023224, "logits/rejected": -0.24846191704273224, "logps/chosen": -295.3999938964844, "logps/rejected": -246.39999389648438, "loss": 0.6172, "rewards/accuracies": 0.6440476179122925, "rewards/chosen": 0.20878906548023224, "rewards/margins": 0.21220703423023224, "rewards/rejected": -0.0035827637184411287, "step": 3695 }, { "epoch": 0.952626158599382, "grad_norm": 101.5, "learning_rate": 2.368692070030896e-08, "logits/chosen": -0.39179688692092896, "logits/rejected": -0.28754884004592896, "logps/chosen": -331.20001220703125, "logps/rejected": -301.3999938964844, "loss": 0.5953, "rewards/accuracies": 0.6358333826065063, "rewards/chosen": 0.3525390625, "rewards/margins": 0.27031248807907104, "rewards/rejected": 0.08179931342601776, "step": 3700 }, { "epoch": 0.953913491246138, "grad_norm": 132.0, "learning_rate": 2.3043254376931e-08, "logits/chosen": -0.3994140625, "logits/rejected": -0.3480468690395355, "logps/chosen": -360.0, "logps/rejected": -269.3999938964844, "loss": 0.6281, "rewards/accuracies": 0.6160714030265808, "rewards/chosen": 0.28828126192092896, "rewards/margins": 0.17695312201976776, "rewards/rejected": 0.11086425930261612, "step": 3705 }, { "epoch": 0.955200823892894, "grad_norm": 136.0, "learning_rate": 2.239958805355304e-08, "logits/chosen": -0.2880859375, "logits/rejected": -0.3456054627895355, "logps/chosen": -310.3999938964844, "logps/rejected": -281.6000061035156, "loss": 0.6492, "rewards/accuracies": 0.5639377236366272, "rewards/chosen": 0.25312501192092896, "rewards/margins": 0.12720337510108948, "rewards/rejected": 0.12607422471046448, "step": 3710 }, { "epoch": 0.9564881565396498, "grad_norm": 368.0, "learning_rate": 2.1755921730175075e-08, "logits/chosen": -0.31689453125, "logits/rejected": -0.3046875, "logps/chosen": -301.79998779296875, "logps/rejected": -249.3000030517578, "loss": 0.6484, "rewards/accuracies": 0.628611147403717, "rewards/chosen": 0.22832031548023224, "rewards/margins": 0.13027343153953552, "rewards/rejected": 0.09819336235523224, "step": 3715 }, { "epoch": 0.9577754891864058, "grad_norm": 136.0, "learning_rate": 2.1112255406797115e-08, "logits/chosen": -0.287353515625, "logits/rejected": -0.21357421576976776, "logps/chosen": -340.0, "logps/rejected": -265.0, "loss": 0.6648, "rewards/accuracies": 0.5860256552696228, "rewards/chosen": 0.23115234076976776, "rewards/margins": 0.11083984375, "rewards/rejected": 0.11976318061351776, "step": 3720 }, { "epoch": 0.9590628218331617, "grad_norm": 117.0, "learning_rate": 2.0468589083419154e-08, "logits/chosen": -0.23374024033546448, "logits/rejected": -0.18544921278953552, "logps/chosen": -270.6000061035156, "logps/rejected": -252.1999969482422, "loss": 0.6734, "rewards/accuracies": 0.5398809313774109, "rewards/chosen": 0.16230468451976776, "rewards/margins": 0.07950439304113388, "rewards/rejected": 0.08278808742761612, "step": 3725 }, { "epoch": 0.9603501544799176, "grad_norm": 115.5, "learning_rate": 1.9824922760041193e-08, "logits/chosen": -0.36406248807907104, "logits/rejected": -0.35664063692092896, "logps/chosen": -318.3999938964844, "logps/rejected": -261.79998779296875, "loss": 0.657, "rewards/accuracies": 0.5199999809265137, "rewards/chosen": 0.23359374701976776, "rewards/margins": 0.10202636569738388, "rewards/rejected": 0.13139648735523224, "step": 3730 }, { "epoch": 0.9616374871266735, "grad_norm": 105.5, "learning_rate": 1.9181256436663233e-08, "logits/chosen": -0.4466796815395355, "logits/rejected": -0.34589844942092896, "logps/chosen": -322.20001220703125, "logps/rejected": -295.0, "loss": 0.6531, "rewards/accuracies": 0.5243939161300659, "rewards/chosen": 0.21923828125, "rewards/margins": 0.10861816257238388, "rewards/rejected": 0.11040039360523224, "step": 3735 }, { "epoch": 0.9629248197734295, "grad_norm": 106.5, "learning_rate": 1.8537590113285272e-08, "logits/chosen": -0.13461914658546448, "logits/rejected": -0.13559570908546448, "logps/chosen": -210.0, "logps/rejected": -219.39999389648438, "loss": 0.6078, "rewards/accuracies": 0.6425000429153442, "rewards/chosen": 0.20058593153953552, "rewards/margins": 0.2060546875, "rewards/rejected": -0.005809021182358265, "step": 3740 }, { "epoch": 0.9642121524201854, "grad_norm": 110.0, "learning_rate": 1.789392378990731e-08, "logits/chosen": -0.3172363340854645, "logits/rejected": -0.17685547471046448, "logps/chosen": -298.20001220703125, "logps/rejected": -266.6000061035156, "loss": 0.607, "rewards/accuracies": 0.6235256195068359, "rewards/chosen": 0.3587890565395355, "rewards/margins": 0.24003906548023224, "rewards/rejected": 0.11903686821460724, "step": 3745 }, { "epoch": 0.9654994850669413, "grad_norm": 112.0, "learning_rate": 1.725025746652935e-08, "logits/chosen": -0.4232421815395355, "logits/rejected": -0.38447266817092896, "logps/chosen": -241.8000030517578, "logps/rejected": -258.6000061035156, "loss": 0.6555, "rewards/accuracies": 0.4719444811344147, "rewards/chosen": 0.22785644233226776, "rewards/margins": 0.12346725165843964, "rewards/rejected": 0.10456542670726776, "step": 3750 }, { "epoch": 0.9667868177136972, "grad_norm": 124.5, "learning_rate": 1.660659114315139e-08, "logits/chosen": -0.2890625, "logits/rejected": -0.27900391817092896, "logps/chosen": -277.79998779296875, "logps/rejected": -267.79998779296875, "loss": 0.6336, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.28154295682907104, "rewards/margins": 0.18769225478172302, "rewards/rejected": 0.0941162109375, "step": 3755 }, { "epoch": 0.9680741503604532, "grad_norm": 130.0, "learning_rate": 1.596292481977343e-08, "logits/chosen": -0.21845702826976776, "logits/rejected": -0.11843261867761612, "logps/chosen": -274.0, "logps/rejected": -254.8000030517578, "loss": 0.6531, "rewards/accuracies": 0.6034523844718933, "rewards/chosen": 0.149658203125, "rewards/margins": 0.1146240234375, "rewards/rejected": 0.03508300706744194, "step": 3760 }, { "epoch": 0.969361483007209, "grad_norm": 141.0, "learning_rate": 1.531925849639547e-08, "logits/chosen": -0.33830565214157104, "logits/rejected": -0.25434571504592896, "logps/chosen": -297.79998779296875, "logps/rejected": -264.3999938964844, "loss": 0.6211, "rewards/accuracies": 0.5789743661880493, "rewards/chosen": 0.2525390684604645, "rewards/margins": 0.17851562798023224, "rewards/rejected": 0.07451172173023224, "step": 3765 }, { "epoch": 0.970648815653965, "grad_norm": 120.5, "learning_rate": 1.4675592173017507e-08, "logits/chosen": -0.3427734375, "logits/rejected": 0.012402343563735485, "logps/chosen": -266.8999938964844, "logps/rejected": -253.60000610351562, "loss": 0.6289, "rewards/accuracies": 0.654358983039856, "rewards/chosen": 0.2748046815395355, "rewards/margins": 0.18310546875, "rewards/rejected": 0.09119872748851776, "step": 3770 }, { "epoch": 0.971936148300721, "grad_norm": 163.0, "learning_rate": 1.4031925849639546e-08, "logits/chosen": -0.341796875, "logits/rejected": -0.3960937559604645, "logps/chosen": -324.3999938964844, "logps/rejected": -317.0, "loss": 0.6516, "rewards/accuracies": 0.5952777862548828, "rewards/chosen": 0.32792967557907104, "rewards/margins": 0.16864013671875, "rewards/rejected": 0.15869140625, "step": 3775 }, { "epoch": 0.9732234809474768, "grad_norm": 119.0, "learning_rate": 1.3388259526261585e-08, "logits/chosen": -0.24287109076976776, "logits/rejected": -0.23613281548023224, "logps/chosen": -295.6000061035156, "logps/rejected": -301.6000061035156, "loss": 0.5961, "rewards/accuracies": 0.5844047665596008, "rewards/chosen": 0.25175780057907104, "rewards/margins": 0.24824218451976776, "rewards/rejected": 0.0038085938431322575, "step": 3780 }, { "epoch": 0.9745108135942327, "grad_norm": 129.0, "learning_rate": 1.2744593202883625e-08, "logits/chosen": -0.17534179985523224, "logits/rejected": -0.3487304747104645, "logps/chosen": -335.6000061035156, "logps/rejected": -292.3999938964844, "loss": 0.6188, "rewards/accuracies": 0.6127381324768066, "rewards/chosen": 0.25703126192092896, "rewards/margins": 0.18447265028953552, "rewards/rejected": 0.07277832180261612, "step": 3785 }, { "epoch": 0.9757981462409887, "grad_norm": 125.5, "learning_rate": 1.2100926879505664e-08, "logits/chosen": -0.36005860567092896, "logits/rejected": -0.4683593809604645, "logps/chosen": -275.8999938964844, "logps/rejected": -280.79998779296875, "loss": 0.6, "rewards/accuracies": 0.6730555295944214, "rewards/chosen": 0.3349609375, "rewards/margins": 0.23798827826976776, "rewards/rejected": 0.09709472954273224, "step": 3790 }, { "epoch": 0.9770854788877446, "grad_norm": 118.5, "learning_rate": 1.1457260556127703e-08, "logits/chosen": -0.4212890565395355, "logits/rejected": -0.28765869140625, "logps/chosen": -254.8000030517578, "logps/rejected": -277.79998779296875, "loss": 0.6117, "rewards/accuracies": 0.7029545903205872, "rewards/chosen": 0.24160155653953552, "rewards/margins": 0.21269531548023224, "rewards/rejected": 0.02957763709127903, "step": 3795 }, { "epoch": 0.9783728115345005, "grad_norm": 124.0, "learning_rate": 1.0813594232749741e-08, "logits/chosen": -0.29072266817092896, "logits/rejected": -0.25761717557907104, "logps/chosen": -298.79998779296875, "logps/rejected": -272.79998779296875, "loss": 0.6359, "rewards/accuracies": 0.5674999952316284, "rewards/chosen": 0.23710937798023224, "rewards/margins": 0.17246094346046448, "rewards/rejected": 0.06520996242761612, "step": 3800 }, { "epoch": 0.9796601441812565, "grad_norm": 230.0, "learning_rate": 1.016992790937178e-08, "logits/chosen": -0.30097657442092896, "logits/rejected": -0.21816405653953552, "logps/chosen": -239.39999389648438, "logps/rejected": -256.0, "loss": 0.6805, "rewards/accuracies": 0.5, "rewards/chosen": 0.13322754204273224, "rewards/margins": 0.05538024753332138, "rewards/rejected": 0.07770995795726776, "step": 3805 }, { "epoch": 0.9809474768280123, "grad_norm": 142.0, "learning_rate": 9.52626158599382e-09, "logits/chosen": -0.251953125, "logits/rejected": -0.22412109375, "logps/chosen": -283.20001220703125, "logps/rejected": -257.0, "loss": 0.6406, "rewards/accuracies": 0.6116666793823242, "rewards/chosen": 0.22065429389476776, "rewards/margins": 0.17753906548023224, "rewards/rejected": 0.04331054538488388, "step": 3810 }, { "epoch": 0.9822348094747683, "grad_norm": 120.5, "learning_rate": 8.88259526261586e-09, "logits/chosen": -0.18330077826976776, "logits/rejected": -0.18408203125, "logps/chosen": -244.8000030517578, "logps/rejected": -251.1999969482422, "loss": 0.6539, "rewards/accuracies": 0.5685714483261108, "rewards/chosen": 0.21230468153953552, "rewards/margins": 0.12492676079273224, "rewards/rejected": 0.08709106594324112, "step": 3815 }, { "epoch": 0.9835221421215242, "grad_norm": 119.0, "learning_rate": 8.238928939237899e-09, "logits/chosen": -0.17158202826976776, "logits/rejected": -0.23515625298023224, "logps/chosen": -306.79998779296875, "logps/rejected": -331.6000061035156, "loss": 0.668, "rewards/accuracies": 0.5041667222976685, "rewards/chosen": 0.21601562201976776, "rewards/margins": 0.09406737983226776, "rewards/rejected": 0.1220703125, "step": 3820 }, { "epoch": 0.9848094747682801, "grad_norm": 127.0, "learning_rate": 7.595262615859938e-09, "logits/chosen": -0.12719115614891052, "logits/rejected": -0.12050781399011612, "logps/chosen": -260.6000061035156, "logps/rejected": -296.0, "loss": 0.6484, "rewards/accuracies": 0.59416663646698, "rewards/chosen": 0.17817382514476776, "rewards/margins": 0.12004394829273224, "rewards/rejected": 0.05795898288488388, "step": 3825 }, { "epoch": 0.986096807415036, "grad_norm": 146.0, "learning_rate": 6.951596292481977e-09, "logits/chosen": -0.22417601943016052, "logits/rejected": -0.22900390625, "logps/chosen": -343.6000061035156, "logps/rejected": -307.79998779296875, "loss": 0.6457, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.27421873807907104, "rewards/margins": 0.15410156548023224, "rewards/rejected": 0.11955566704273224, "step": 3830 }, { "epoch": 0.987384140061792, "grad_norm": 112.0, "learning_rate": 6.307929969104016e-09, "logits/chosen": -0.22163085639476776, "logits/rejected": -0.0777587890625, "logps/chosen": -305.3999938964844, "logps/rejected": -302.79998779296875, "loss": 0.6438, "rewards/accuracies": 0.59333336353302, "rewards/chosen": 0.18364258110523224, "rewards/margins": 0.18046875298023224, "rewards/rejected": 0.0033203125931322575, "step": 3835 }, { "epoch": 0.9886714727085479, "grad_norm": 118.5, "learning_rate": 5.664263645726055e-09, "logits/chosen": -0.27241212129592896, "logits/rejected": -0.32499998807907104, "logps/chosen": -361.20001220703125, "logps/rejected": -312.3999938964844, "loss": 0.6344, "rewards/accuracies": 0.6120238304138184, "rewards/chosen": 0.2874999940395355, "rewards/margins": 0.18540039658546448, "rewards/rejected": 0.10224609076976776, "step": 3840 }, { "epoch": 0.9899588053553038, "grad_norm": 109.5, "learning_rate": 5.020597322348095e-09, "logits/chosen": -0.22248534858226776, "logits/rejected": -0.22749023139476776, "logps/chosen": -379.20001220703125, "logps/rejected": -278.79998779296875, "loss": 0.5984, "rewards/accuracies": 0.6541666984558105, "rewards/chosen": 0.3167968690395355, "rewards/margins": 0.25468748807907104, "rewards/rejected": 0.06247558444738388, "step": 3845 }, { "epoch": 0.9912461380020597, "grad_norm": 121.5, "learning_rate": 4.376930998970134e-09, "logits/chosen": -0.34492188692092896, "logits/rejected": -0.2894531190395355, "logps/chosen": -373.20001220703125, "logps/rejected": -313.79998779296875, "loss": 0.6422, "rewards/accuracies": 0.6208333373069763, "rewards/chosen": 0.30097657442092896, "rewards/margins": 0.16640624403953552, "rewards/rejected": 0.13457031548023224, "step": 3850 }, { "epoch": 0.9925334706488157, "grad_norm": 109.5, "learning_rate": 3.733264675592173e-09, "logits/chosen": -0.22670897841453552, "logits/rejected": -0.03676757961511612, "logps/chosen": -274.6000061035156, "logps/rejected": -252.60000610351562, "loss": 0.6242, "rewards/accuracies": 0.6549242734909058, "rewards/chosen": 0.22763672471046448, "rewards/margins": 0.19598388671875, "rewards/rejected": 0.03153076022863388, "step": 3855 }, { "epoch": 0.9938208032955715, "grad_norm": 109.0, "learning_rate": 3.089598352214212e-09, "logits/chosen": -0.21162109076976776, "logits/rejected": -0.23759765923023224, "logps/chosen": -324.20001220703125, "logps/rejected": -294.6000061035156, "loss": 0.6367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20361328125, "rewards/margins": 0.15380859375, "rewards/rejected": 0.0494384765625, "step": 3860 }, { "epoch": 0.9951081359423275, "grad_norm": 139.0, "learning_rate": 2.445932028836251e-09, "logits/chosen": -0.3402343690395355, "logits/rejected": -0.353790283203125, "logps/chosen": -324.79998779296875, "logps/rejected": -320.0, "loss": 0.6656, "rewards/accuracies": 0.4848809838294983, "rewards/chosen": 0.23916015028953552, "rewards/margins": 0.11647949367761612, "rewards/rejected": 0.12236328423023224, "step": 3865 }, { "epoch": 0.9963954685890835, "grad_norm": 152.0, "learning_rate": 1.8022657054582903e-09, "logits/chosen": -0.18865355849266052, "logits/rejected": -0.056884765625, "logps/chosen": -308.0, "logps/rejected": -298.0, "loss": 0.6078, "rewards/accuracies": 0.7566666603088379, "rewards/chosen": 0.2783203125, "rewards/margins": 0.24453124403953552, "rewards/rejected": 0.03457031399011612, "step": 3870 }, { "epoch": 0.9976828012358393, "grad_norm": 153.0, "learning_rate": 1.1585993820803295e-09, "logits/chosen": -0.2802734375, "logits/rejected": -0.2734375, "logps/chosen": -329.0, "logps/rejected": -291.6000061035156, "loss": 0.6734, "rewards/accuracies": 0.5366666913032532, "rewards/chosen": 0.18408203125, "rewards/margins": 0.08190612494945526, "rewards/rejected": 0.10195312649011612, "step": 3875 }, { "epoch": 0.9989701338825953, "grad_norm": 142.0, "learning_rate": 5.149330587023687e-10, "logits/chosen": -0.48046875, "logits/rejected": -0.2916015684604645, "logps/chosen": -315.6000061035156, "logps/rejected": -237.0, "loss": 0.6523, "rewards/accuracies": 0.5397436022758484, "rewards/chosen": 0.2754882872104645, "rewards/margins": 0.12949219346046448, "rewards/rejected": 0.14583340287208557, "step": 3880 } ], "logging_steps": 5, "max_steps": 3884, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }