{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9998424353196986, "eval_steps": 500, "global_step": 11898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025210348848202185, "grad_norm": 121.69796752929688, "learning_rate": 4.201680672268907e-09, "logits/chosen": -1.1642577648162842, "logits/rejected": NaN, "logps/chosen": -167.7218780517578, "logps/rejected": -254.0437469482422, "loss": 0.6969, "rewards/accuracies": 0.2593750059604645, "rewards/chosen": 0.0011812209850177169, "rewards/margins": -0.004633903503417969, "rewards/rejected": 0.005809783935546875, "step": 10 }, { "epoch": 0.005042069769640437, "grad_norm": 134.09786987304688, "learning_rate": 8.403361344537815e-09, "logits/chosen": -1.198974609375, "logits/rejected": -1.295263648033142, "logps/chosen": -159.84375, "logps/rejected": -271.37188720703125, "loss": 0.6992, "rewards/accuracies": 0.359375, "rewards/chosen": -0.000698947929777205, "rewards/margins": -0.006379318423569202, "rewards/rejected": 0.00568466167896986, "step": 20 }, { "epoch": 0.007563104654460656, "grad_norm": 166.2114715576172, "learning_rate": 1.2605042016806723e-08, "logits/chosen": -1.1946289539337158, "logits/rejected": NaN, "logps/chosen": -154.8046875, "logps/rejected": -255.72500610351562, "loss": 0.6915, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.001498317695222795, "rewards/margins": 0.0016607284778729081, "rewards/rejected": -0.0001771926908986643, "step": 30 }, { "epoch": 0.010084139539280874, "grad_norm": 127.86746978759766, "learning_rate": 1.680672268907563e-08, "logits/chosen": -1.2091553211212158, "logits/rejected": -1.341284155845642, "logps/chosen": -161.22811889648438, "logps/rejected": -242.53125, "loss": 0.6949, "rewards/accuracies": 0.3499999940395355, "rewards/chosen": 0.004694461822509766, "rewards/margins": -0.0007511138683184981, "rewards/rejected": 0.005431270692497492, "step": 40 }, { "epoch": 0.012605174424101093, "grad_norm": 153.59178161621094, "learning_rate": 2.1008403361344538e-08, "logits/chosen": -1.193579077720642, "logits/rejected": -1.3173828125, "logps/chosen": -158.5749969482422, "logps/rejected": -257.40155029296875, "loss": 0.6888, "rewards/accuracies": 0.38749998807907104, "rewards/chosen": 0.009693240746855736, "rewards/margins": 0.007991599850356579, "rewards/rejected": 0.0017045974964275956, "step": 50 }, { "epoch": 0.015126209308921312, "grad_norm": 111.13823699951172, "learning_rate": 2.5210084033613446e-08, "logits/chosen": -1.178564429283142, "logits/rejected": -1.309326171875, "logps/chosen": -157.9718780517578, "logps/rejected": -258.59686279296875, "loss": 0.6893, "rewards/accuracies": 0.4156250059604645, "rewards/chosen": 0.016267776489257812, "rewards/margins": 0.012189483270049095, "rewards/rejected": 0.004095935728400946, "step": 60 }, { "epoch": 0.01764724419374153, "grad_norm": 128.80116271972656, "learning_rate": 2.941176470588235e-08, "logits/chosen": -1.2095458507537842, "logits/rejected": NaN, "logps/chosen": -171.74609375, "logps/rejected": -262.9750061035156, "loss": 0.6794, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 0.03838386386632919, "rewards/margins": 0.026642894372344017, "rewards/rejected": 0.011784935370087624, "step": 70 }, { "epoch": 0.020168279078561748, "grad_norm": 124.77375793457031, "learning_rate": 3.361344537815126e-08, "logits/chosen": -1.246191382408142, "logits/rejected": NaN, "logps/chosen": -168.5593719482422, "logps/rejected": -253.6218719482422, "loss": 0.6756, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.06352348625659943, "rewards/margins": 0.038286399096250534, "rewards/rejected": 0.02525787428021431, "step": 80 }, { "epoch": 0.022689313963381967, "grad_norm": 115.48733520507812, "learning_rate": 3.7815126050420164e-08, "logits/chosen": -1.2252197265625, "logits/rejected": NaN, "logps/chosen": -180.11093139648438, "logps/rejected": -258.6875, "loss": 0.6698, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.10401000827550888, "rewards/margins": 0.05587005615234375, "rewards/rejected": 0.04810447618365288, "step": 90 }, { "epoch": 0.025210348848202186, "grad_norm": 122.07361602783203, "learning_rate": 4.2016806722689076e-08, "logits/chosen": -1.1952636241912842, "logits/rejected": NaN, "logps/chosen": -154.83984375, "logps/rejected": -264.9375, "loss": 0.6526, "rewards/accuracies": 0.671875, "rewards/chosen": 0.13868407905101776, "rewards/margins": 0.08979644626379013, "rewards/rejected": 0.04893455654382706, "step": 100 }, { "epoch": 0.027731383733022405, "grad_norm": 106.35610961914062, "learning_rate": 4.621848739495798e-08, "logits/chosen": -1.1889770030975342, "logits/rejected": -1.284423828125, "logps/chosen": -168.7624969482422, "logps/rejected": -262.79376220703125, "loss": 0.6332, "rewards/accuracies": 0.734375, "rewards/chosen": 0.213165283203125, "rewards/margins": 0.13777923583984375, "rewards/rejected": 0.07544021308422089, "step": 110 }, { "epoch": 0.030252418617842624, "grad_norm": 104.47399139404297, "learning_rate": 5.042016806722689e-08, "logits/chosen": -1.2095947265625, "logits/rejected": NaN, "logps/chosen": -159.5625, "logps/rejected": -270.78125, "loss": 0.5939, "rewards/accuracies": 0.840624988079071, "rewards/chosen": 0.3447509706020355, "rewards/margins": 0.2230377197265625, "rewards/rejected": 0.12161636352539062, "step": 120 }, { "epoch": 0.03277345350266284, "grad_norm": 83.41116333007812, "learning_rate": 5.46218487394958e-08, "logits/chosen": -1.237939476966858, "logits/rejected": NaN, "logps/chosen": -152.18594360351562, "logps/rejected": -254.71249389648438, "loss": 0.5733, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.4788574278354645, "rewards/margins": 0.278717041015625, "rewards/rejected": 0.20026549696922302, "step": 130 }, { "epoch": 0.03529448838748306, "grad_norm": 87.54658508300781, "learning_rate": 5.88235294117647e-08, "logits/chosen": -1.179174780845642, "logits/rejected": -1.288598656654358, "logps/chosen": -167.1828155517578, "logps/rejected": -260.2749938964844, "loss": 0.5545, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": 0.6065673828125, "rewards/margins": 0.3455352783203125, "rewards/rejected": 0.260855108499527, "step": 140 }, { "epoch": 0.03781552327230328, "grad_norm": 72.57316589355469, "learning_rate": 6.302521008403361e-08, "logits/chosen": -1.205810546875, "logits/rejected": NaN, "logps/chosen": -145.2687530517578, "logps/rejected": -282.76873779296875, "loss": 0.5308, "rewards/accuracies": 0.809374988079071, "rewards/chosen": 0.777294933795929, "rewards/margins": 0.42306822538375854, "rewards/rejected": 0.3541717529296875, "step": 150 }, { "epoch": 0.040336558157123496, "grad_norm": 67.11067199707031, "learning_rate": 6.722689075630252e-08, "logits/chosen": -1.186181664466858, "logits/rejected": -1.308251976966858, "logps/chosen": -160.29531860351562, "logps/rejected": -247.90625, "loss": 0.482, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.9421631097793579, "rewards/margins": 0.5807861089706421, "rewards/rejected": 0.36149901151657104, "step": 160 }, { "epoch": 0.04285759304194372, "grad_norm": 61.78547668457031, "learning_rate": 7.142857142857142e-08, "logits/chosen": -1.2709472179412842, "logits/rejected": NaN, "logps/chosen": -154.17343139648438, "logps/rejected": -248.17813110351562, "loss": 0.4618, "rewards/accuracies": 0.8343750238418579, "rewards/chosen": 1.078271508216858, "rewards/margins": 0.6813751459121704, "rewards/rejected": 0.396820068359375, "step": 170 }, { "epoch": 0.045378627926763934, "grad_norm": 76.56179809570312, "learning_rate": 7.563025210084033e-08, "logits/chosen": -1.275390625, "logits/rejected": -1.3555176258087158, "logps/chosen": -158.7335968017578, "logps/rejected": -272.3062438964844, "loss": 0.4293, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 1.153076171875, "rewards/margins": 0.8406616449356079, "rewards/rejected": 0.312155157327652, "step": 180 }, { "epoch": 0.04789966281158416, "grad_norm": 60.289886474609375, "learning_rate": 7.983193277310923e-08, "logits/chosen": -1.2170898914337158, "logits/rejected": -1.32373046875, "logps/chosen": -164.5203094482422, "logps/rejected": -272.6875, "loss": 0.3854, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 1.2356688976287842, "rewards/margins": 0.9962402582168579, "rewards/rejected": 0.2404022216796875, "step": 190 }, { "epoch": 0.05042069769640437, "grad_norm": 44.86962127685547, "learning_rate": 8.403361344537815e-08, "logits/chosen": -1.2443358898162842, "logits/rejected": NaN, "logps/chosen": -147.1648406982422, "logps/rejected": -244.75625610351562, "loss": 0.3811, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.233373999595642, "rewards/margins": 1.039404273033142, "rewards/rejected": 0.19380874931812286, "step": 200 }, { "epoch": 0.052941732581224595, "grad_norm": 60.75041198730469, "learning_rate": 8.823529411764706e-08, "logits/chosen": -1.266699194908142, "logits/rejected": NaN, "logps/chosen": -146.3937530517578, "logps/rejected": -235.6062469482422, "loss": 0.3873, "rewards/accuracies": 0.8531249761581421, "rewards/chosen": 1.202417016029358, "rewards/margins": 1.086389183998108, "rewards/rejected": 0.11592254787683487, "step": 210 }, { "epoch": 0.05546276746604481, "grad_norm": 57.16844940185547, "learning_rate": 9.243697478991596e-08, "logits/chosen": -1.3025391101837158, "logits/rejected": -1.3809082508087158, "logps/chosen": -149.3507843017578, "logps/rejected": -253.984375, "loss": 0.3408, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 1.3173949718475342, "rewards/margins": 1.397576928138733, "rewards/rejected": -0.07978057861328125, "step": 220 }, { "epoch": 0.05798380235086503, "grad_norm": 53.39495849609375, "learning_rate": 9.663865546218488e-08, "logits/chosen": -1.2878906726837158, "logits/rejected": NaN, "logps/chosen": -148.5890655517578, "logps/rejected": -242.22500610351562, "loss": 0.3352, "rewards/accuracies": 0.890625, "rewards/chosen": 1.35626220703125, "rewards/margins": 1.4325134754180908, "rewards/rejected": -0.07616577297449112, "step": 230 }, { "epoch": 0.06050483723568525, "grad_norm": 78.91970825195312, "learning_rate": 1.0084033613445378e-07, "logits/chosen": -1.278173804283142, "logits/rejected": NaN, "logps/chosen": -152.08438110351562, "logps/rejected": -266.45001220703125, "loss": 0.3138, "rewards/accuracies": 0.890625, "rewards/chosen": 1.372772216796875, "rewards/margins": 1.775488257408142, "rewards/rejected": -0.4019302427768707, "step": 240 }, { "epoch": 0.06302587212050546, "grad_norm": 133.6031951904297, "learning_rate": 1.0504201680672269e-07, "logits/chosen": -1.272216796875, "logits/rejected": NaN, "logps/chosen": -150.96875, "logps/rejected": -259.2250061035156, "loss": 0.3157, "rewards/accuracies": 0.875, "rewards/chosen": 1.279052734375, "rewards/margins": 1.795751929283142, "rewards/rejected": -0.5172408819198608, "step": 250 }, { "epoch": 0.06554690700532569, "grad_norm": 32.11806869506836, "learning_rate": 1.092436974789916e-07, "logits/chosen": -1.281774878501892, "logits/rejected": -1.434667944908142, "logps/chosen": -144.3625030517578, "logps/rejected": -264.3999938964844, "loss": 0.2958, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 1.348120093345642, "rewards/margins": 1.8904907703399658, "rewards/rejected": -0.5426963567733765, "step": 260 }, { "epoch": 0.06806794189014591, "grad_norm": 49.727752685546875, "learning_rate": 1.134453781512605e-07, "logits/chosen": -1.2805664539337158, "logits/rejected": -1.3869140148162842, "logps/chosen": -136.8878936767578, "logps/rejected": -262.9750061035156, "loss": 0.2593, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.4929077625274658, "rewards/margins": 2.12017822265625, "rewards/rejected": -0.6261566281318665, "step": 270 }, { "epoch": 0.07058897677496612, "grad_norm": 37.77398681640625, "learning_rate": 1.176470588235294e-07, "logits/chosen": -1.3101074695587158, "logits/rejected": NaN, "logps/chosen": -145.609375, "logps/rejected": -285.1312561035156, "loss": 0.2958, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 1.308282494544983, "rewards/margins": 2.092968702316284, "rewards/rejected": -0.7835952639579773, "step": 280 }, { "epoch": 0.07311001165978634, "grad_norm": 71.79366302490234, "learning_rate": 1.2184873949579832e-07, "logits/chosen": -1.3406250476837158, "logits/rejected": -1.4490234851837158, "logps/chosen": -163.9031219482422, "logps/rejected": -278.8062438964844, "loss": 0.3053, "rewards/accuracies": 0.893750011920929, "rewards/chosen": 1.3402588367462158, "rewards/margins": 2.210522413253784, "rewards/rejected": -0.8699371218681335, "step": 290 }, { "epoch": 0.07563104654460656, "grad_norm": 61.28352355957031, "learning_rate": 1.2605042016806723e-07, "logits/chosen": -1.271337866783142, "logits/rejected": NaN, "logps/chosen": -160.94140625, "logps/rejected": -310.0625, "loss": 0.2433, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 1.6212036609649658, "rewards/margins": 2.5866942405700684, "rewards/rejected": -0.96533203125, "step": 300 }, { "epoch": 0.07815208142942678, "grad_norm": 43.60219192504883, "learning_rate": 1.3025210084033613e-07, "logits/chosen": -1.2708008289337158, "logits/rejected": NaN, "logps/chosen": -152.171875, "logps/rejected": -294.1468811035156, "loss": 0.2323, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.7021667957305908, "rewards/margins": 2.7122559547424316, "rewards/rejected": -1.0107910633087158, "step": 310 }, { "epoch": 0.08067311631424699, "grad_norm": 39.72595977783203, "learning_rate": 1.3445378151260504e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -149.42343139648438, "logps/rejected": -270.48748779296875, "loss": 0.3241, "rewards/accuracies": 0.878125011920929, "rewards/chosen": 1.2721344232559204, "rewards/margins": 2.300433397293091, "rewards/rejected": -1.0282318592071533, "step": 320 }, { "epoch": 0.08319415119906722, "grad_norm": 103.66915893554688, "learning_rate": 1.3865546218487394e-07, "logits/chosen": -1.250970482826233, "logits/rejected": -1.393701195716858, "logps/chosen": -141.34530639648438, "logps/rejected": -266.234375, "loss": 0.2552, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 1.4436218738555908, "rewards/margins": 2.680419921875, "rewards/rejected": -1.2370483875274658, "step": 330 }, { "epoch": 0.08571518608388744, "grad_norm": 40.145484924316406, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -1.27587890625, "logits/rejected": -1.457495093345642, "logps/chosen": -166.83203125, "logps/rejected": -284.06561279296875, "loss": 0.2377, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": 1.461798071861267, "rewards/margins": 2.8802733421325684, "rewards/rejected": -1.4191070795059204, "step": 340 }, { "epoch": 0.08823622096870766, "grad_norm": 99.10304260253906, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -1.293066382408142, "logits/rejected": -1.4145019054412842, "logps/chosen": -136.765625, "logps/rejected": -265.3500061035156, "loss": 0.2607, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.313446044921875, "rewards/margins": 2.7121825218200684, "rewards/rejected": -1.3989746570587158, "step": 350 }, { "epoch": 0.09075725585352787, "grad_norm": 31.387958526611328, "learning_rate": 1.5126050420168066e-07, "logits/chosen": -1.272607445716858, "logits/rejected": NaN, "logps/chosen": -144.6640625, "logps/rejected": -267.79376220703125, "loss": 0.2476, "rewards/accuracies": 0.90625, "rewards/chosen": 1.335168480873108, "rewards/margins": 2.7579588890075684, "rewards/rejected": -1.4239623546600342, "step": 360 }, { "epoch": 0.09327829073834809, "grad_norm": 41.44064712524414, "learning_rate": 1.554621848739496e-07, "logits/chosen": -1.295996069908142, "logits/rejected": NaN, "logps/chosen": -140.48202514648438, "logps/rejected": -257.88751220703125, "loss": 0.2147, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.368096947669983, "rewards/margins": 2.859619140625, "rewards/rejected": -1.4910888671875, "step": 370 }, { "epoch": 0.09579932562316831, "grad_norm": 70.35003662109375, "learning_rate": 1.5966386554621847e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -157.47421264648438, "logps/rejected": -268.0062561035156, "loss": 0.2438, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2121245861053467, "rewards/margins": 2.838427782058716, "rewards/rejected": -1.624755859375, "step": 380 }, { "epoch": 0.09832036050798854, "grad_norm": 35.62227249145508, "learning_rate": 1.638655462184874e-07, "logits/chosen": -1.2773926258087158, "logits/rejected": NaN, "logps/chosen": -147.81405639648438, "logps/rejected": -288.0218811035156, "loss": 0.1842, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.428442358970642, "rewards/margins": 3.290478467941284, "rewards/rejected": -1.8629639148712158, "step": 390 }, { "epoch": 0.10084139539280874, "grad_norm": 39.115394592285156, "learning_rate": 1.680672268907563e-07, "logits/chosen": -1.2592041492462158, "logits/rejected": -1.392480492591858, "logps/chosen": -161.6796875, "logps/rejected": -305.9437561035156, "loss": 0.2136, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 1.3365875482559204, "rewards/margins": 3.2051634788513184, "rewards/rejected": -1.869140625, "step": 400 }, { "epoch": 0.10336243027762897, "grad_norm": 41.85042190551758, "learning_rate": 1.722689075630252e-07, "logits/chosen": -1.321313500404358, "logits/rejected": -1.489990234375, "logps/chosen": -162.67031860351562, "logps/rejected": -306.5625, "loss": 0.2153, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 1.407690405845642, "rewards/margins": 3.5083985328674316, "rewards/rejected": -2.099853515625, "step": 410 }, { "epoch": 0.10588346516244919, "grad_norm": 25.772274017333984, "learning_rate": 1.764705882352941e-07, "logits/chosen": -1.291723608970642, "logits/rejected": NaN, "logps/chosen": -154.17733764648438, "logps/rejected": -288.0843811035156, "loss": 0.184, "rewards/accuracies": 0.921875, "rewards/chosen": 1.3763916492462158, "rewards/margins": 3.686718702316284, "rewards/rejected": -2.3080077171325684, "step": 420 }, { "epoch": 0.1084045000472694, "grad_norm": 69.8096923828125, "learning_rate": 1.8067226890756302e-07, "logits/chosen": -1.362451195716858, "logits/rejected": -1.4901854991912842, "logps/chosen": -161.95938110351562, "logps/rejected": -277.1312561035156, "loss": 0.2085, "rewards/accuracies": 0.921875, "rewards/chosen": 1.3309814929962158, "rewards/margins": 3.608105421066284, "rewards/rejected": -2.276416063308716, "step": 430 }, { "epoch": 0.11092553493208962, "grad_norm": 75.18663024902344, "learning_rate": 1.8487394957983192e-07, "logits/chosen": -1.2987792491912842, "logits/rejected": NaN, "logps/chosen": -147.8328094482422, "logps/rejected": -283.45623779296875, "loss": 0.2237, "rewards/accuracies": 0.90625, "rewards/chosen": 1.352569580078125, "rewards/margins": 3.6245360374450684, "rewards/rejected": -2.270458936691284, "step": 440 }, { "epoch": 0.11344656981690984, "grad_norm": 36.783973693847656, "learning_rate": 1.8907563025210083e-07, "logits/chosen": -1.2957763671875, "logits/rejected": NaN, "logps/chosen": -151.41561889648438, "logps/rejected": -298.61248779296875, "loss": 0.1508, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 1.4122588634490967, "rewards/margins": 3.8343749046325684, "rewards/rejected": -2.424023389816284, "step": 450 }, { "epoch": 0.11596760470173006, "grad_norm": 46.090667724609375, "learning_rate": 1.9327731092436976e-07, "logits/chosen": -1.2771728038787842, "logits/rejected": -1.435791015625, "logps/chosen": -176.88906860351562, "logps/rejected": -289.16876220703125, "loss": 0.2106, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.191375732421875, "rewards/margins": 3.565966844558716, "rewards/rejected": -2.3740234375, "step": 460 }, { "epoch": 0.11848863958655027, "grad_norm": 46.5994758605957, "learning_rate": 1.9747899159663864e-07, "logits/chosen": -1.2624022960662842, "logits/rejected": -1.4131348133087158, "logps/chosen": -154.71249389648438, "logps/rejected": -306.47186279296875, "loss": 0.1463, "rewards/accuracies": 0.953125, "rewards/chosen": 1.210272192955017, "rewards/margins": 4.011523246765137, "rewards/rejected": -2.802539110183716, "step": 470 }, { "epoch": 0.1210096744713705, "grad_norm": 71.67864990234375, "learning_rate": 2.0168067226890757e-07, "logits/chosen": -1.226098656654358, "logits/rejected": NaN, "logps/chosen": -159.5124969482422, "logps/rejected": -289.0625, "loss": 0.1829, "rewards/accuracies": 0.921875, "rewards/chosen": 0.769573986530304, "rewards/margins": 3.9100584983825684, "rewards/rejected": -3.139355421066284, "step": 480 }, { "epoch": 0.12353070935619072, "grad_norm": 69.92384338378906, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -1.267919898033142, "logits/rejected": NaN, "logps/chosen": -165.10311889648438, "logps/rejected": -289.73748779296875, "loss": 0.2115, "rewards/accuracies": 0.909375011920929, "rewards/chosen": 1.03289794921875, "rewards/margins": 4.0882568359375, "rewards/rejected": -3.053906202316284, "step": 490 }, { "epoch": 0.12605174424101093, "grad_norm": 23.45460319519043, "learning_rate": 2.1008403361344538e-07, "logits/chosen": -1.2769043445587158, "logits/rejected": NaN, "logps/chosen": -153.1531219482422, "logps/rejected": -279.40936279296875, "loss": 0.156, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 1.158300757408142, "rewards/margins": 4.141308784484863, "rewards/rejected": -2.982714891433716, "step": 500 }, { "epoch": 0.12857277912583115, "grad_norm": 41.021629333496094, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -1.2012939453125, "logits/rejected": -1.44091796875, "logps/chosen": -131.734375, "logps/rejected": -280.1937561035156, "loss": 0.1503, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 1.29656982421875, "rewards/margins": 4.237011909484863, "rewards/rejected": -2.9404296875, "step": 510 }, { "epoch": 0.13109381401065137, "grad_norm": 59.36827087402344, "learning_rate": 2.184873949579832e-07, "logits/chosen": -1.25048828125, "logits/rejected": NaN, "logps/chosen": -157.1437530517578, "logps/rejected": -285.3687438964844, "loss": 0.182, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.143060326576233, "rewards/margins": 4.1776123046875, "rewards/rejected": -3.0338377952575684, "step": 520 }, { "epoch": 0.1336148488954716, "grad_norm": 41.35979080200195, "learning_rate": 2.226890756302521e-07, "logits/chosen": -1.188989281654358, "logits/rejected": NaN, "logps/chosen": -136.6374969482422, "logps/rejected": -278.6875, "loss": 0.1445, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.7148467898368835, "rewards/margins": 4.350293159484863, "rewards/rejected": -3.6333985328674316, "step": 530 }, { "epoch": 0.13613588378029182, "grad_norm": 40.82756805419922, "learning_rate": 2.26890756302521e-07, "logits/chosen": -1.3253905773162842, "logits/rejected": NaN, "logps/chosen": -145.84219360351562, "logps/rejected": -285.73126220703125, "loss": 0.1772, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 1.3921382427215576, "rewards/margins": 4.575390815734863, "rewards/rejected": -3.1845703125, "step": 540 }, { "epoch": 0.13865691866511204, "grad_norm": 45.29145431518555, "learning_rate": 2.3109243697478993e-07, "logits/chosen": -1.251806616783142, "logits/rejected": NaN, "logps/chosen": -159.5421905517578, "logps/rejected": -297.36248779296875, "loss": 0.1858, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.8018738031387329, "rewards/margins": 4.318921089172363, "rewards/rejected": -3.518994092941284, "step": 550 }, { "epoch": 0.14117795354993223, "grad_norm": 29.39824104309082, "learning_rate": 2.352941176470588e-07, "logits/chosen": -1.19482421875, "logits/rejected": -1.3500487804412842, "logps/chosen": -150.7890625, "logps/rejected": -274.125, "loss": 0.1472, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 1.420477271080017, "rewards/margins": 4.666796684265137, "rewards/rejected": -3.247119188308716, "step": 560 }, { "epoch": 0.14369898843475246, "grad_norm": 34.0945930480957, "learning_rate": 2.394957983193277e-07, "logits/chosen": -1.2300293445587158, "logits/rejected": -1.3331787586212158, "logps/chosen": -143.99063110351562, "logps/rejected": -294.33123779296875, "loss": 0.1407, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5211365222930908, "rewards/margins": 4.573046684265137, "rewards/rejected": -3.052441358566284, "step": 570 }, { "epoch": 0.14622002331957268, "grad_norm": 55.69940948486328, "learning_rate": 2.4369747899159664e-07, "logits/chosen": -1.2146728038787842, "logits/rejected": -1.3605225086212158, "logps/chosen": -138.64688110351562, "logps/rejected": -297.73126220703125, "loss": 0.1866, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.1699645519256592, "rewards/margins": 4.471337795257568, "rewards/rejected": -3.3030762672424316, "step": 580 }, { "epoch": 0.1487410582043929, "grad_norm": 47.5145149230957, "learning_rate": 2.478991596638655e-07, "logits/chosen": -1.260498046875, "logits/rejected": -1.371337890625, "logps/chosen": -162.7976531982422, "logps/rejected": -287.54376220703125, "loss": 0.1551, "rewards/accuracies": 0.921875, "rewards/chosen": 0.8300994634628296, "rewards/margins": 4.725683689117432, "rewards/rejected": -3.8941407203674316, "step": 590 }, { "epoch": 0.15126209308921312, "grad_norm": 35.59428405761719, "learning_rate": 2.5210084033613445e-07, "logits/chosen": -1.2677733898162842, "logits/rejected": NaN, "logps/chosen": -173.00936889648438, "logps/rejected": -307.76251220703125, "loss": 0.193, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 1.1744353771209717, "rewards/margins": 4.914355278015137, "rewards/rejected": -3.73876953125, "step": 600 }, { "epoch": 0.15378312797403335, "grad_norm": 35.68425750732422, "learning_rate": 2.5630252100840333e-07, "logits/chosen": -1.2052733898162842, "logits/rejected": NaN, "logps/chosen": -158.33984375, "logps/rejected": -285.79998779296875, "loss": 0.1804, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3694031238555908, "rewards/margins": 4.724877834320068, "rewards/rejected": -3.3541016578674316, "step": 610 }, { "epoch": 0.15630416285885357, "grad_norm": 41.837947845458984, "learning_rate": 2.6050420168067226e-07, "logits/chosen": -1.26318359375, "logits/rejected": NaN, "logps/chosen": -150.40664672851562, "logps/rejected": -287.36248779296875, "loss": 0.137, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.861297607421875, "rewards/margins": 4.630419731140137, "rewards/rejected": -3.7689452171325684, "step": 620 }, { "epoch": 0.1588251977436738, "grad_norm": 49.752777099609375, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -1.2652099132537842, "logits/rejected": -1.3555419445037842, "logps/chosen": -160.28280639648438, "logps/rejected": -310.625, "loss": 0.1518, "rewards/accuracies": 0.9375, "rewards/chosen": 0.940380871295929, "rewards/margins": 4.933447360992432, "rewards/rejected": -3.9922852516174316, "step": 630 }, { "epoch": 0.16134623262849399, "grad_norm": 23.018596649169922, "learning_rate": 2.689075630252101e-07, "logits/chosen": -1.2270996570587158, "logits/rejected": -1.3590576648712158, "logps/chosen": -140.9250030517578, "logps/rejected": -306.125, "loss": 0.1467, "rewards/accuracies": 0.921875, "rewards/chosen": 0.5660644769668579, "rewards/margins": 5.0205078125, "rewards/rejected": -4.452343940734863, "step": 640 }, { "epoch": 0.1638672675133142, "grad_norm": 54.051151275634766, "learning_rate": 2.7310924369747895e-07, "logits/chosen": -1.250341773033142, "logits/rejected": NaN, "logps/chosen": -150.7624969482422, "logps/rejected": -307.9375, "loss": 0.1421, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.0407836437225342, "rewards/margins": 5.029065132141113, "rewards/rejected": -3.9854493141174316, "step": 650 }, { "epoch": 0.16638830239813443, "grad_norm": 46.02471160888672, "learning_rate": 2.773109243697479e-07, "logits/chosen": -1.2473876476287842, "logits/rejected": NaN, "logps/chosen": -155.3156280517578, "logps/rejected": -318.875, "loss": 0.1818, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": 1.2938324213027954, "rewards/margins": 5.305371284484863, "rewards/rejected": -4.010449409484863, "step": 660 }, { "epoch": 0.16890933728295465, "grad_norm": 61.8232536315918, "learning_rate": 2.815126050420168e-07, "logits/chosen": -1.145776391029358, "logits/rejected": -1.349511742591858, "logps/chosen": -159.85311889648438, "logps/rejected": -302.5625, "loss": 0.1548, "rewards/accuracies": 0.9375, "rewards/chosen": 0.942962646484375, "rewards/margins": 5.232079982757568, "rewards/rejected": -4.289453029632568, "step": 670 }, { "epoch": 0.17143037216777487, "grad_norm": 11.542558670043945, "learning_rate": 2.857142857142857e-07, "logits/chosen": -1.193817138671875, "logits/rejected": -1.3623535633087158, "logps/chosen": -148.6593780517578, "logps/rejected": -317.4375, "loss": 0.1261, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 1.0304969549179077, "rewards/margins": 5.677343845367432, "rewards/rejected": -4.647656440734863, "step": 680 }, { "epoch": 0.1739514070525951, "grad_norm": 37.83146286010742, "learning_rate": 2.899159663865546e-07, "logits/chosen": -1.239404320716858, "logits/rejected": NaN, "logps/chosen": -155.0500030517578, "logps/rejected": -306.23748779296875, "loss": 0.1539, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.1901366710662842, "rewards/margins": 5.6220703125, "rewards/rejected": -4.432812690734863, "step": 690 }, { "epoch": 0.17647244193741532, "grad_norm": 28.85287094116211, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.2263915538787842, "logits/rejected": -1.3689453601837158, "logps/chosen": -151.63906860351562, "logps/rejected": -295.54998779296875, "loss": 0.1588, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.8080505132675171, "rewards/margins": 5.195116996765137, "rewards/rejected": -4.386914253234863, "step": 700 }, { "epoch": 0.17899347682223551, "grad_norm": 43.315956115722656, "learning_rate": 2.9831932773109244e-07, "logits/chosen": -1.2091796398162842, "logits/rejected": NaN, "logps/chosen": -151.2890625, "logps/rejected": -315.35626220703125, "loss": 0.1421, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.689617931842804, "rewards/margins": 5.572802543640137, "rewards/rejected": -4.885546684265137, "step": 710 }, { "epoch": 0.18151451170705574, "grad_norm": 38.689735412597656, "learning_rate": 3.025210084033613e-07, "logits/chosen": -1.2744629383087158, "logits/rejected": NaN, "logps/chosen": -163.83438110351562, "logps/rejected": -313.3125, "loss": 0.1396, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.791668713092804, "rewards/margins": 5.314062595367432, "rewards/rejected": -4.5224609375, "step": 720 }, { "epoch": 0.18403554659187596, "grad_norm": 19.689559936523438, "learning_rate": 3.0672268907563024e-07, "logits/chosen": -1.212988257408142, "logits/rejected": NaN, "logps/chosen": -160.0906219482422, "logps/rejected": -332.3374938964844, "loss": 0.1544, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 0.279092401266098, "rewards/margins": 5.348095893859863, "rewards/rejected": -5.068359375, "step": 730 }, { "epoch": 0.18655658147669618, "grad_norm": 27.18768882751465, "learning_rate": 3.109243697478992e-07, "logits/chosen": -1.2724120616912842, "logits/rejected": NaN, "logps/chosen": -155.0656280517578, "logps/rejected": -323.1812438964844, "loss": 0.1175, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.510443091392517, "rewards/margins": 6.041259765625, "rewards/rejected": -4.53173828125, "step": 740 }, { "epoch": 0.1890776163615164, "grad_norm": 67.36737823486328, "learning_rate": 3.1512605042016805e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -142.1171875, "logps/rejected": -303.79376220703125, "loss": 0.1373, "rewards/accuracies": 0.9375, "rewards/chosen": 1.2210693359375, "rewards/margins": 5.70068359375, "rewards/rejected": -4.480078220367432, "step": 750 }, { "epoch": 0.19159865124633663, "grad_norm": 51.65882110595703, "learning_rate": 3.1932773109243693e-07, "logits/chosen": -1.201928734779358, "logits/rejected": NaN, "logps/chosen": -165.6687469482422, "logps/rejected": -307.03125, "loss": 0.1242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5905090570449829, "rewards/margins": 5.729296684265137, "rewards/rejected": -5.141015529632568, "step": 760 }, { "epoch": 0.19411968613115685, "grad_norm": 64.9820785522461, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -1.2545654773712158, "logits/rejected": NaN, "logps/chosen": -145.11874389648438, "logps/rejected": -287.4750061035156, "loss": 0.1279, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 1.3775513172149658, "rewards/margins": 5.882421970367432, "rewards/rejected": -4.504101753234863, "step": 770 }, { "epoch": 0.19664072101597707, "grad_norm": 55.42124557495117, "learning_rate": 3.277310924369748e-07, "logits/chosen": -1.205078125, "logits/rejected": NaN, "logps/chosen": -166.02969360351562, "logps/rejected": -320.08123779296875, "loss": 0.1934, "rewards/accuracies": 0.918749988079071, "rewards/chosen": 0.28471070528030396, "rewards/margins": 5.51171875, "rewards/rejected": -5.224999904632568, "step": 780 }, { "epoch": 0.19916175590079727, "grad_norm": 75.03282165527344, "learning_rate": 3.319327731092437e-07, "logits/chosen": -1.208532691001892, "logits/rejected": NaN, "logps/chosen": -163.3679656982422, "logps/rejected": -328.0874938964844, "loss": 0.1423, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.9740051031112671, "rewards/margins": 6.013671875, "rewards/rejected": -5.036913871765137, "step": 790 }, { "epoch": 0.2016827907856175, "grad_norm": 32.82435989379883, "learning_rate": 3.361344537815126e-07, "logits/chosen": -1.21533203125, "logits/rejected": NaN, "logps/chosen": -156.69686889648438, "logps/rejected": -299.1312561035156, "loss": 0.1544, "rewards/accuracies": 0.9375, "rewards/chosen": 1.0113494396209717, "rewards/margins": 5.468652248382568, "rewards/rejected": -4.456738471984863, "step": 800 }, { "epoch": 0.2042038256704377, "grad_norm": 57.44865798950195, "learning_rate": 3.403361344537815e-07, "logits/chosen": -1.261962890625, "logits/rejected": NaN, "logps/chosen": -152.26718139648438, "logps/rejected": -293.15625, "loss": 0.1443, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 1.4093658924102783, "rewards/margins": 5.573534965515137, "rewards/rejected": -4.165331840515137, "step": 810 }, { "epoch": 0.20672486055525793, "grad_norm": 87.95596313476562, "learning_rate": 3.445378151260504e-07, "logits/chosen": -1.1342895030975342, "logits/rejected": -1.3787841796875, "logps/chosen": -177.00936889648438, "logps/rejected": -323.07501220703125, "loss": 0.1268, "rewards/accuracies": 0.953125, "rewards/chosen": 0.7821716070175171, "rewards/margins": 5.596484184265137, "rewards/rejected": -4.813672065734863, "step": 820 }, { "epoch": 0.20924589544007816, "grad_norm": 61.02523422241211, "learning_rate": 3.487394957983193e-07, "logits/chosen": -1.155603051185608, "logits/rejected": -1.3408081531524658, "logps/chosen": -171.1515655517578, "logps/rejected": -317.9624938964844, "loss": 0.128, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.4427551329135895, "rewards/margins": 5.905566215515137, "rewards/rejected": -5.462890625, "step": 830 }, { "epoch": 0.21176693032489838, "grad_norm": 73.02073669433594, "learning_rate": 3.529411764705882e-07, "logits/chosen": -1.1920166015625, "logits/rejected": NaN, "logps/chosen": -149.6750030517578, "logps/rejected": -311.20001220703125, "loss": 0.1459, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.7467803955078125, "rewards/margins": 6.098730564117432, "rewards/rejected": -5.352734565734863, "step": 840 }, { "epoch": 0.2142879652097186, "grad_norm": 47.99951934814453, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -1.2458007335662842, "logits/rejected": -1.4048340320587158, "logps/chosen": -159.7531280517578, "logps/rejected": -300.28125, "loss": 0.1502, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.31168824434280396, "rewards/margins": 5.913866996765137, "rewards/rejected": -5.600390434265137, "step": 850 }, { "epoch": 0.2168090000945388, "grad_norm": 43.07872772216797, "learning_rate": 3.6134453781512604e-07, "logits/chosen": -1.156652808189392, "logits/rejected": NaN, "logps/chosen": -143.83358764648438, "logps/rejected": -298.625, "loss": 0.1088, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.043756127357483, "rewards/margins": 5.8837890625, "rewards/rejected": -4.84033203125, "step": 860 }, { "epoch": 0.21933003497935902, "grad_norm": 59.290584564208984, "learning_rate": 3.655462184873949e-07, "logits/chosen": -1.1311523914337158, "logits/rejected": -1.2646605968475342, "logps/chosen": -151.3562469482422, "logps/rejected": -308.5249938964844, "loss": 0.1278, "rewards/accuracies": 0.9375, "rewards/chosen": 0.05946655198931694, "rewards/margins": 5.780663967132568, "rewards/rejected": -5.720507621765137, "step": 870 }, { "epoch": 0.22185106986417924, "grad_norm": 27.17367935180664, "learning_rate": 3.6974789915966385e-07, "logits/chosen": -1.2374756336212158, "logits/rejected": NaN, "logps/chosen": -178.3984375, "logps/rejected": -339.8500061035156, "loss": 0.1667, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.02569274976849556, "rewards/margins": 5.862109184265137, "rewards/rejected": -5.839257717132568, "step": 880 }, { "epoch": 0.22437210474899946, "grad_norm": 45.749656677246094, "learning_rate": 3.739495798319328e-07, "logits/chosen": -1.2567138671875, "logits/rejected": NaN, "logps/chosen": -154.6906280517578, "logps/rejected": -283.35626220703125, "loss": 0.1273, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 1.49078369140625, "rewards/margins": 5.433007717132568, "rewards/rejected": -3.9437499046325684, "step": 890 }, { "epoch": 0.22689313963381968, "grad_norm": 33.90043640136719, "learning_rate": 3.7815126050420166e-07, "logits/chosen": -1.27197265625, "logits/rejected": -1.40625, "logps/chosen": -156.88906860351562, "logps/rejected": -287.6499938964844, "loss": 0.1556, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3336273431777954, "rewards/margins": 5.287304878234863, "rewards/rejected": -3.9527344703674316, "step": 900 }, { "epoch": 0.2294141745186399, "grad_norm": 32.42388916015625, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -1.251367211341858, "logits/rejected": -1.395605444908142, "logps/chosen": -184.85781860351562, "logps/rejected": -284.79376220703125, "loss": 0.1216, "rewards/accuracies": 0.9375, "rewards/chosen": 1.3917815685272217, "rewards/margins": 5.544921875, "rewards/rejected": -4.154296875, "step": 910 }, { "epoch": 0.23193520940346013, "grad_norm": 23.97354507446289, "learning_rate": 3.865546218487395e-07, "logits/chosen": -1.1873047351837158, "logits/rejected": -1.3604004383087158, "logps/chosen": -152.41561889648438, "logps/rejected": -303.20623779296875, "loss": 0.1231, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.910632312297821, "rewards/margins": 6.35205078125, "rewards/rejected": -5.443945407867432, "step": 920 }, { "epoch": 0.23445624428828035, "grad_norm": 29.323768615722656, "learning_rate": 3.907563025210084e-07, "logits/chosen": -1.2045166492462158, "logits/rejected": NaN, "logps/chosen": -164.515625, "logps/rejected": -303.84375, "loss": 0.1181, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.7027541995048523, "rewards/margins": 6.204882621765137, "rewards/rejected": -5.499218940734863, "step": 930 }, { "epoch": 0.23697727917310055, "grad_norm": 23.243837356567383, "learning_rate": 3.949579831932773e-07, "logits/chosen": -1.2096679210662842, "logits/rejected": NaN, "logps/chosen": -150.5859375, "logps/rejected": -309.58123779296875, "loss": 0.1214, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 1.093042016029358, "rewards/margins": 6.399804592132568, "rewards/rejected": -5.308203220367432, "step": 940 }, { "epoch": 0.23949831405792077, "grad_norm": 89.10974884033203, "learning_rate": 3.991596638655462e-07, "logits/chosen": -1.1432616710662842, "logits/rejected": NaN, "logps/chosen": -136.5613250732422, "logps/rejected": -302.39373779296875, "loss": 0.1781, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 1.1718323230743408, "rewards/margins": 6.290136814117432, "rewards/rejected": -5.116991996765137, "step": 950 }, { "epoch": 0.242019348942741, "grad_norm": 31.785062789916992, "learning_rate": 4.0336134453781514e-07, "logits/chosen": -1.232519507408142, "logits/rejected": NaN, "logps/chosen": -144.453125, "logps/rejected": -293.7437438964844, "loss": 0.1482, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5431702136993408, "rewards/margins": 5.99267578125, "rewards/rejected": -4.44921875, "step": 960 }, { "epoch": 0.24454038382756121, "grad_norm": 12.84706974029541, "learning_rate": 4.07563025210084e-07, "logits/chosen": -1.2469971179962158, "logits/rejected": -1.399999976158142, "logps/chosen": -160.4093780517578, "logps/rejected": -322.7124938964844, "loss": 0.0942, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.9326843023300171, "rewards/margins": 6.527441501617432, "rewards/rejected": -5.595507621765137, "step": 970 }, { "epoch": 0.24706141871238144, "grad_norm": 58.66096496582031, "learning_rate": 4.117647058823529e-07, "logits/chosen": -1.249536156654358, "logits/rejected": -1.357031226158142, "logps/chosen": -169.88436889648438, "logps/rejected": -320.1499938964844, "loss": 0.1186, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.46808165311813354, "rewards/margins": 6.712500095367432, "rewards/rejected": -6.243750095367432, "step": 980 }, { "epoch": 0.24958245359720166, "grad_norm": 67.7430419921875, "learning_rate": 4.159663865546218e-07, "logits/chosen": -1.168127417564392, "logits/rejected": NaN, "logps/chosen": -152.38125610351562, "logps/rejected": -316.61248779296875, "loss": 0.1304, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.5408126711845398, "rewards/margins": 6.974804878234863, "rewards/rejected": -6.434960842132568, "step": 990 }, { "epoch": 0.25210348848202185, "grad_norm": 47.05867004394531, "learning_rate": 4.2016806722689076e-07, "logits/chosen": -1.167944312095642, "logits/rejected": -1.3955810070037842, "logps/chosen": -175.16561889648438, "logps/rejected": -357.92498779296875, "loss": 0.1186, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04983215406537056, "rewards/margins": 6.864062309265137, "rewards/rejected": -6.816210746765137, "step": 1000 }, { "epoch": 0.2546245233668421, "grad_norm": 48.764732360839844, "learning_rate": 4.2436974789915964e-07, "logits/chosen": -1.1951904296875, "logits/rejected": NaN, "logps/chosen": -167.70156860351562, "logps/rejected": -326.6625061035156, "loss": 0.1578, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 0.5569305419921875, "rewards/margins": 6.462206840515137, "rewards/rejected": -5.9052734375, "step": 1010 }, { "epoch": 0.2571455582516623, "grad_norm": 34.11311721801758, "learning_rate": 4.285714285714285e-07, "logits/chosen": -1.1873290538787842, "logits/rejected": NaN, "logps/chosen": -159.1171875, "logps/rejected": -310.61248779296875, "loss": 0.1341, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.8962036371231079, "rewards/margins": 6.41015625, "rewards/rejected": -5.515234470367432, "step": 1020 }, { "epoch": 0.25966659313648255, "grad_norm": 14.987260818481445, "learning_rate": 4.327731092436975e-07, "logits/chosen": -1.174646019935608, "logits/rejected": -1.297460913658142, "logps/chosen": -156.55624389648438, "logps/rejected": -333.9624938964844, "loss": 0.134, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 1.315209984779358, "rewards/margins": 6.632910251617432, "rewards/rejected": -5.316504001617432, "step": 1030 }, { "epoch": 0.26218762802130274, "grad_norm": 44.27821731567383, "learning_rate": 4.369747899159664e-07, "logits/chosen": NaN, "logits/rejected": -1.2626159191131592, "logps/chosen": -145.54843139648438, "logps/rejected": -310.20001220703125, "loss": 0.116, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.747998058795929, "rewards/margins": 6.880078315734863, "rewards/rejected": -6.130663871765137, "step": 1040 }, { "epoch": 0.26470866290612294, "grad_norm": 72.14604187011719, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -1.132116675376892, "logits/rejected": NaN, "logps/chosen": -155.9140625, "logps/rejected": -313.1499938964844, "loss": 0.1038, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.12890625, "rewards/margins": 7.073046684265137, "rewards/rejected": -7.199023246765137, "step": 1050 }, { "epoch": 0.2672296977909432, "grad_norm": 37.23501205444336, "learning_rate": 4.453781512605042e-07, "logits/chosen": -1.1457703113555908, "logits/rejected": NaN, "logps/chosen": -146.3367156982422, "logps/rejected": -313.375, "loss": 0.1306, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.27369385957717896, "rewards/margins": 6.737500190734863, "rewards/rejected": -6.466406345367432, "step": 1060 }, { "epoch": 0.2697507326757634, "grad_norm": 10.790033340454102, "learning_rate": 4.495798319327731e-07, "logits/chosen": -1.215734839439392, "logits/rejected": NaN, "logps/chosen": -154.7578125, "logps/rejected": -341.5, "loss": 0.0823, "rewards/accuracies": 0.971875011920929, "rewards/chosen": 1.236444115638733, "rewards/margins": 7.986718654632568, "rewards/rejected": -6.749804496765137, "step": 1070 }, { "epoch": 0.27227176756058363, "grad_norm": 41.846031188964844, "learning_rate": 4.53781512605042e-07, "logits/chosen": -1.1467163562774658, "logits/rejected": NaN, "logps/chosen": -174.9562530517578, "logps/rejected": -330.0, "loss": 0.1451, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.079498291015625, "rewards/margins": 7.237890720367432, "rewards/rejected": -7.3173828125, "step": 1080 }, { "epoch": 0.2747928024454038, "grad_norm": 32.15214157104492, "learning_rate": 4.579831932773109e-07, "logits/chosen": -1.241387963294983, "logits/rejected": NaN, "logps/chosen": -150.23593139648438, "logps/rejected": -322.3999938964844, "loss": 0.0903, "rewards/accuracies": 0.953125, "rewards/chosen": 0.2512451112270355, "rewards/margins": 7.097851753234863, "rewards/rejected": -6.844336032867432, "step": 1090 }, { "epoch": 0.2773138373302241, "grad_norm": 15.110795021057129, "learning_rate": 4.6218487394957986e-07, "logits/chosen": -1.186669945716858, "logits/rejected": NaN, "logps/chosen": -165.0281219482422, "logps/rejected": -335.2875061035156, "loss": 0.158, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.951873779296875, "rewards/margins": 7.143359184265137, "rewards/rejected": -6.193359375, "step": 1100 }, { "epoch": 0.2798348722150443, "grad_norm": 56.17876434326172, "learning_rate": 4.6638655462184874e-07, "logits/chosen": -1.219628930091858, "logits/rejected": NaN, "logps/chosen": -155.22030639648438, "logps/rejected": -280.4312438964844, "loss": 0.1609, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 1.274169921875, "rewards/margins": 5.996484279632568, "rewards/rejected": -4.720898628234863, "step": 1110 }, { "epoch": 0.28235590709986447, "grad_norm": 35.18253707885742, "learning_rate": 4.705882352941176e-07, "logits/chosen": -1.1398193836212158, "logits/rejected": -1.354736328125, "logps/chosen": -158.0968780517578, "logps/rejected": -319.73126220703125, "loss": 0.1303, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.24439087510108948, "rewards/margins": 7.136914253234863, "rewards/rejected": -6.894335746765137, "step": 1120 }, { "epoch": 0.2848769419846847, "grad_norm": 22.63697052001953, "learning_rate": 4.747899159663865e-07, "logits/chosen": -1.185949683189392, "logits/rejected": NaN, "logps/chosen": -179.7687530517578, "logps/rejected": -364.86248779296875, "loss": 0.1367, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.25355225801467896, "rewards/margins": 7.541406154632568, "rewards/rejected": -7.793359279632568, "step": 1130 }, { "epoch": 0.2873979768695049, "grad_norm": 33.91263961791992, "learning_rate": 4.789915966386554e-07, "logits/chosen": -1.239556908607483, "logits/rejected": -1.3345215320587158, "logps/chosen": -165.89218139648438, "logps/rejected": -313.11248779296875, "loss": 0.1286, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.8713623285293579, "rewards/margins": 6.8173828125, "rewards/rejected": -5.9453125, "step": 1140 }, { "epoch": 0.28991901175432516, "grad_norm": 33.56672668457031, "learning_rate": 4.831932773109244e-07, "logits/chosen": -1.2441895008087158, "logits/rejected": -1.3756835460662842, "logps/chosen": -144.6984405517578, "logps/rejected": -289.17498779296875, "loss": 0.1144, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.6471923589706421, "rewards/margins": 5.587695121765137, "rewards/rejected": -4.941601753234863, "step": 1150 }, { "epoch": 0.29244004663914536, "grad_norm": 41.389591217041016, "learning_rate": 4.873949579831933e-07, "logits/chosen": -1.202185034751892, "logits/rejected": NaN, "logps/chosen": -202.0812530517578, "logps/rejected": -313.23126220703125, "loss": 0.1504, "rewards/accuracies": 0.9375, "rewards/chosen": 0.9189087152481079, "rewards/margins": 6.798828125, "rewards/rejected": -5.878320217132568, "step": 1160 }, { "epoch": 0.2949610815239656, "grad_norm": 44.791053771972656, "learning_rate": 4.915966386554621e-07, "logits/chosen": -1.0989196300506592, "logits/rejected": NaN, "logps/chosen": -177.97500610351562, "logps/rejected": -320.26251220703125, "loss": 0.1385, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.37928467988967896, "rewards/margins": 6.673632621765137, "rewards/rejected": -6.294531345367432, "step": 1170 }, { "epoch": 0.2974821164087858, "grad_norm": 28.665725708007812, "learning_rate": 4.95798319327731e-07, "logits/chosen": -1.169988989830017, "logits/rejected": NaN, "logps/chosen": -152.1320343017578, "logps/rejected": -323.8374938964844, "loss": 0.1457, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 0.6125122308731079, "rewards/margins": 6.616601467132568, "rewards/rejected": -6.006445407867432, "step": 1180 }, { "epoch": 0.30000315129360605, "grad_norm": 28.431589126586914, "learning_rate": 5e-07, "logits/chosen": -1.222387671470642, "logits/rejected": NaN, "logps/chosen": -160.8937530517578, "logps/rejected": -332.5625, "loss": 0.1107, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.9510528445243835, "rewards/margins": 6.88818359375, "rewards/rejected": -5.939257621765137, "step": 1190 }, { "epoch": 0.30252418617842625, "grad_norm": 37.57841110229492, "learning_rate": 4.999989240484344e-07, "logits/chosen": -1.167700171470642, "logits/rejected": -1.2830321788787842, "logps/chosen": -160.72500610351562, "logps/rejected": -334.5874938964844, "loss": 0.1443, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9903320074081421, "rewards/margins": 6.776659965515137, "rewards/rejected": -5.78515625, "step": 1200 }, { "epoch": 0.30504522106324644, "grad_norm": 28.987812042236328, "learning_rate": 4.999956962029988e-07, "logits/chosen": -1.1586425304412842, "logits/rejected": NaN, "logps/chosen": -165.11563110351562, "logps/rejected": -343.45623779296875, "loss": 0.1353, "rewards/accuracies": 0.953125, "rewards/chosen": 0.4927001893520355, "rewards/margins": 6.9130859375, "rewards/rejected": -6.423437595367432, "step": 1210 }, { "epoch": 0.3075662559480667, "grad_norm": 52.151424407958984, "learning_rate": 4.999903164914773e-07, "logits/chosen": -1.1226806640625, "logits/rejected": NaN, "logps/chosen": -156.0984344482422, "logps/rejected": -309.7749938964844, "loss": 0.1071, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.24541015923023224, "rewards/margins": 7.130273342132568, "rewards/rejected": -6.885546684265137, "step": 1220 }, { "epoch": 0.3100872908328869, "grad_norm": 9.880250930786133, "learning_rate": 4.999827849601764e-07, "logits/chosen": -1.1767578125, "logits/rejected": -1.2824828624725342, "logps/chosen": -161.56405639648438, "logps/rejected": -342.1187438964844, "loss": 0.1251, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 1.1699097156524658, "rewards/margins": 7.360058784484863, "rewards/rejected": -6.192773342132568, "step": 1230 }, { "epoch": 0.31260832571770714, "grad_norm": 21.178600311279297, "learning_rate": 4.999731016739247e-07, "logits/chosen": -1.1359374523162842, "logits/rejected": -1.2759521007537842, "logps/chosen": -143.1531219482422, "logps/rejected": -326.23126220703125, "loss": 0.1067, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.4987731873989105, "rewards/margins": 7.367383003234863, "rewards/rejected": -6.872265815734863, "step": 1240 }, { "epoch": 0.31512936060252733, "grad_norm": 75.81818389892578, "learning_rate": 4.99961266716072e-07, "logits/chosen": -1.1541259288787842, "logits/rejected": NaN, "logps/chosen": -157.2687530517578, "logps/rejected": -298.4937438964844, "loss": 0.1665, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4899536073207855, "rewards/margins": 7.196679592132568, "rewards/rejected": -6.705468654632568, "step": 1250 }, { "epoch": 0.3176503954873476, "grad_norm": 16.892955780029297, "learning_rate": 4.999472801884891e-07, "logits/chosen": -1.2179076671600342, "logits/rejected": -1.3529052734375, "logps/chosen": -141.3328094482422, "logps/rejected": -314.3125, "loss": 0.1066, "rewards/accuracies": 0.953125, "rewards/chosen": 1.08819580078125, "rewards/margins": 7.204297065734863, "rewards/rejected": -6.112890720367432, "step": 1260 }, { "epoch": 0.3201714303721678, "grad_norm": 79.07512664794922, "learning_rate": 4.999311422115667e-07, "logits/chosen": -1.19091796875, "logits/rejected": -1.330297827720642, "logps/chosen": -154.8992156982422, "logps/rejected": -321.56561279296875, "loss": 0.14, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.2592560052871704, "rewards/margins": 7.298364162445068, "rewards/rejected": -6.040820121765137, "step": 1270 }, { "epoch": 0.32269246525698797, "grad_norm": 38.38762283325195, "learning_rate": 4.99912852924214e-07, "logits/chosen": -1.2513916492462158, "logits/rejected": -1.383264183998108, "logps/chosen": -142.32266235351562, "logps/rejected": -295.0249938964844, "loss": 0.1568, "rewards/accuracies": 0.921875, "rewards/chosen": 1.4396483898162842, "rewards/margins": 7.054785251617432, "rewards/rejected": -5.613671779632568, "step": 1280 }, { "epoch": 0.3252135001418082, "grad_norm": 43.1234245300293, "learning_rate": 4.998924124838582e-07, "logits/chosen": -1.1780517101287842, "logits/rejected": -1.2249603271484375, "logps/chosen": -142.52499389648438, "logps/rejected": -328.98748779296875, "loss": 0.14, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.793322741985321, "rewards/margins": 7.04345703125, "rewards/rejected": -6.251367092132568, "step": 1290 }, { "epoch": 0.3277345350266284, "grad_norm": 54.35491943359375, "learning_rate": 4.99869821066443e-07, "logits/chosen": -1.1600128412246704, "logits/rejected": NaN, "logps/chosen": -160.04061889648438, "logps/rejected": -316.1625061035156, "loss": 0.1087, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.9168609380722046, "rewards/margins": 7.022851467132568, "rewards/rejected": -6.104101657867432, "step": 1300 }, { "epoch": 0.33025556991144867, "grad_norm": 44.54366683959961, "learning_rate": 4.998450788664262e-07, "logits/chosen": -1.1287109851837158, "logits/rejected": -1.282958984375, "logps/chosen": -157.6125030517578, "logps/rejected": -343.70001220703125, "loss": 0.0839, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.483154296875, "rewards/margins": 7.580078125, "rewards/rejected": -7.09765625, "step": 1310 }, { "epoch": 0.33277660479626886, "grad_norm": 62.74064636230469, "learning_rate": 4.998181860967792e-07, "logits/chosen": -1.061743140220642, "logits/rejected": NaN, "logps/chosen": -158.06875610351562, "logps/rejected": -319.0625, "loss": 0.0996, "rewards/accuracies": 0.953125, "rewards/chosen": 0.300384521484375, "rewards/margins": 7.689648628234863, "rewards/rejected": -7.390234470367432, "step": 1320 }, { "epoch": 0.3352976396810891, "grad_norm": 48.61635208129883, "learning_rate": 4.997891429889845e-07, "logits/chosen": -1.1568634510040283, "logits/rejected": -1.228662133216858, "logps/chosen": -145.77206420898438, "logps/rejected": -335.45001220703125, "loss": 0.1915, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.546295166015625, "rewards/margins": 7.787499904632568, "rewards/rejected": -7.245898246765137, "step": 1330 }, { "epoch": 0.3378186745659093, "grad_norm": 31.12544059753418, "learning_rate": 4.997579497930341e-07, "logits/chosen": -1.18359375, "logits/rejected": NaN, "logps/chosen": -163.68789672851562, "logps/rejected": -323.73126220703125, "loss": 0.1249, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": 1.112451195716858, "rewards/margins": 7.639355659484863, "rewards/rejected": -6.526562690734863, "step": 1340 }, { "epoch": 0.3403397094507295, "grad_norm": 32.51969528198242, "learning_rate": 4.997246067774266e-07, "logits/chosen": -1.2276611328125, "logits/rejected": NaN, "logps/chosen": -131.0812530517578, "logps/rejected": -309.61248779296875, "loss": 0.0943, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5178039073944092, "rewards/margins": 7.313867092132568, "rewards/rejected": -5.796679496765137, "step": 1350 }, { "epoch": 0.34286074433554975, "grad_norm": 28.024734497070312, "learning_rate": 4.99689114229166e-07, "logits/chosen": -1.191076636314392, "logits/rejected": NaN, "logps/chosen": -148.95938110351562, "logps/rejected": -314.3062438964844, "loss": 0.1013, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.8425842523574829, "rewards/margins": 7.1806640625, "rewards/rejected": -6.334570407867432, "step": 1360 }, { "epoch": 0.34538177922036994, "grad_norm": 9.129551887512207, "learning_rate": 4.996514724537585e-07, "logits/chosen": -1.091040015220642, "logits/rejected": NaN, "logps/chosen": -170.3156280517578, "logps/rejected": -339.45001220703125, "loss": 0.098, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.2812957763671875, "rewards/margins": 8.298242568969727, "rewards/rejected": -8.016210556030273, "step": 1370 }, { "epoch": 0.3479028141051902, "grad_norm": 7.993910312652588, "learning_rate": 4.996116817752096e-07, "logits/chosen": -1.0496337413787842, "logits/rejected": NaN, "logps/chosen": -156.4187469482422, "logps/rejected": -365.9750061035156, "loss": 0.106, "rewards/accuracies": 0.953125, "rewards/chosen": -0.22316893935203552, "rewards/margins": 8.667577743530273, "rewards/rejected": -8.890233993530273, "step": 1380 }, { "epoch": 0.3504238489900104, "grad_norm": 49.35456466674805, "learning_rate": 4.995697425360223e-07, "logits/chosen": -1.205346703529358, "logits/rejected": NaN, "logps/chosen": -160.92343139648438, "logps/rejected": -307.66876220703125, "loss": 0.1836, "rewards/accuracies": 0.909375011920929, "rewards/chosen": 0.7360900640487671, "rewards/margins": 7.489794731140137, "rewards/rejected": -6.752148628234863, "step": 1390 }, { "epoch": 0.35294488387483064, "grad_norm": 22.946325302124023, "learning_rate": 4.995256550971933e-07, "logits/chosen": -1.25, "logits/rejected": NaN, "logps/chosen": -150.88436889648438, "logps/rejected": -329.07501220703125, "loss": 0.0939, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.473474144935608, "rewards/margins": 7.68115234375, "rewards/rejected": -6.206640720367432, "step": 1400 }, { "epoch": 0.35546591875965083, "grad_norm": 69.77560424804688, "learning_rate": 4.9947941983821e-07, "logits/chosen": -1.234130859375, "logits/rejected": NaN, "logps/chosen": -155.6140594482422, "logps/rejected": -327.0562438964844, "loss": 0.1034, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.2058746814727783, "rewards/margins": 7.556542873382568, "rewards/rejected": -6.352929592132568, "step": 1410 }, { "epoch": 0.35798695364447103, "grad_norm": 23.90690803527832, "learning_rate": 4.994310371570477e-07, "logits/chosen": -1.21630859375, "logits/rejected": -1.2993652820587158, "logps/chosen": -139.94375610351562, "logps/rejected": -327.88751220703125, "loss": 0.1111, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 1.0072815418243408, "rewards/margins": 8.621679306030273, "rewards/rejected": -7.616406440734863, "step": 1420 }, { "epoch": 0.3605079885292913, "grad_norm": 88.03437042236328, "learning_rate": 4.993805074701659e-07, "logits/chosen": -1.1068909168243408, "logits/rejected": NaN, "logps/chosen": -177.1046905517578, "logps/rejected": -334.5874938964844, "loss": 0.1784, "rewards/accuracies": 0.921875, "rewards/chosen": 0.731091320514679, "rewards/margins": 7.857617378234863, "rewards/rejected": -7.122265815734863, "step": 1430 }, { "epoch": 0.3630290234141115, "grad_norm": 23.345043182373047, "learning_rate": 4.993278312125045e-07, "logits/chosen": -1.1685364246368408, "logits/rejected": -1.3270263671875, "logps/chosen": -161.7136688232422, "logps/rejected": -321.8187561035156, "loss": 0.1292, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3545394837856293, "rewards/margins": 7.658593654632568, "rewards/rejected": -7.3046875, "step": 1440 }, { "epoch": 0.3655500582989317, "grad_norm": 51.19300079345703, "learning_rate": 4.992730088374802e-07, "logits/chosen": -1.1191895008087158, "logits/rejected": NaN, "logps/chosen": -173.64999389648438, "logps/rejected": -352.6625061035156, "loss": 0.1537, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.3135620057582855, "rewards/margins": 7.834277153015137, "rewards/rejected": -8.145312309265137, "step": 1450 }, { "epoch": 0.3680710931837519, "grad_norm": 18.219587326049805, "learning_rate": 4.992160408169828e-07, "logits/chosen": -1.291015625, "logits/rejected": -1.3077392578125, "logps/chosen": -170.55313110351562, "logps/rejected": -334.1312561035156, "loss": 0.1527, "rewards/accuracies": 0.9375, "rewards/chosen": 0.2528839111328125, "rewards/margins": 7.329687595367432, "rewards/rejected": -7.077538967132568, "step": 1460 }, { "epoch": 0.37059212806857217, "grad_norm": 41.05903244018555, "learning_rate": 4.991569276413711e-07, "logits/chosen": -1.1570556163787842, "logits/rejected": NaN, "logps/chosen": -174.4875030517578, "logps/rejected": -340.4750061035156, "loss": 0.123, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.935192883014679, "rewards/margins": 7.357812404632568, "rewards/rejected": -6.419140815734863, "step": 1470 }, { "epoch": 0.37311316295339236, "grad_norm": 45.42596435546875, "learning_rate": 4.990956698194681e-07, "logits/chosen": -1.232177734375, "logits/rejected": NaN, "logps/chosen": -166.39688110351562, "logps/rejected": -313.95001220703125, "loss": 0.1252, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.9435485601425171, "rewards/margins": 7.081250190734863, "rewards/rejected": -6.138476371765137, "step": 1480 }, { "epoch": 0.3756341978382126, "grad_norm": 40.43252944946289, "learning_rate": 4.990322678785578e-07, "logits/chosen": -1.2293701171875, "logits/rejected": -1.328466773033142, "logps/chosen": -154.5671844482422, "logps/rejected": -342.35626220703125, "loss": 0.1219, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.514965832233429, "rewards/margins": 7.545800685882568, "rewards/rejected": -7.029687404632568, "step": 1490 }, { "epoch": 0.3781552327230328, "grad_norm": 51.85287857055664, "learning_rate": 4.989667223643792e-07, "logits/chosen": -1.1646850109100342, "logits/rejected": -1.282324194908142, "logps/chosen": -159.1671905517578, "logps/rejected": -339.4624938964844, "loss": 0.0935, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.7927093505859375, "rewards/margins": 7.356054782867432, "rewards/rejected": -6.56640625, "step": 1500 }, { "epoch": 0.380676267607853, "grad_norm": 34.19110107421875, "learning_rate": 4.988990338411229e-07, "logits/chosen": -1.1754028797149658, "logits/rejected": -1.322998046875, "logps/chosen": -181.77969360351562, "logps/rejected": -328.1625061035156, "loss": 0.1515, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6220123171806335, "rewards/margins": 7.954297065734863, "rewards/rejected": -7.3310546875, "step": 1510 }, { "epoch": 0.38319730249267325, "grad_norm": 27.37630271911621, "learning_rate": 4.988292028914254e-07, "logits/chosen": -1.240087866783142, "logits/rejected": -1.297582983970642, "logps/chosen": -155.4695281982422, "logps/rejected": -328.7124938964844, "loss": 0.1191, "rewards/accuracies": 0.953125, "rewards/chosen": 0.6837097406387329, "rewards/margins": 7.533593654632568, "rewards/rejected": -6.853125095367432, "step": 1520 }, { "epoch": 0.38571833737749345, "grad_norm": 30.307113647460938, "learning_rate": 4.987572301163644e-07, "logits/chosen": -1.2538940906524658, "logits/rejected": NaN, "logps/chosen": -172.734375, "logps/rejected": -320.78125, "loss": 0.1526, "rewards/accuracies": 0.9375, "rewards/chosen": 0.7056442499160767, "rewards/margins": 7.426953315734863, "rewards/rejected": -6.720898628234863, "step": 1530 }, { "epoch": 0.3882393722623137, "grad_norm": 5.467235088348389, "learning_rate": 4.986831161354537e-07, "logits/chosen": -1.2825438976287842, "logits/rejected": -1.338720679283142, "logps/chosen": -145.56875610351562, "logps/rejected": -333.375, "loss": 0.0823, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.8991119265556335, "rewards/margins": 8.086718559265137, "rewards/rejected": -7.187890529632568, "step": 1540 }, { "epoch": 0.3907604071471339, "grad_norm": 29.91176986694336, "learning_rate": 4.986068615866377e-07, "logits/chosen": -1.1746947765350342, "logits/rejected": -1.2974364757537842, "logps/chosen": -160.15234375, "logps/rejected": -341.3500061035156, "loss": 0.1905, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.19044800102710724, "rewards/margins": 7.838842868804932, "rewards/rejected": -8.029296875, "step": 1550 }, { "epoch": 0.39328144203195414, "grad_norm": 4.688235282897949, "learning_rate": 4.985284671262863e-07, "logits/chosen": -1.137548804283142, "logits/rejected": -1.25787353515625, "logps/chosen": -174.5124969482422, "logps/rejected": -338.5249938964844, "loss": 0.1362, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.3547790050506592, "rewards/margins": 7.465624809265137, "rewards/rejected": -8.815234184265137, "step": 1560 }, { "epoch": 0.39580247691677434, "grad_norm": 14.337002754211426, "learning_rate": 4.984479334291882e-07, "logits/chosen": -1.1846191883087158, "logits/rejected": NaN, "logps/chosen": -184.5437469482422, "logps/rejected": -359.17498779296875, "loss": 0.0991, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1377716064453125, "rewards/margins": 7.668359279632568, "rewards/rejected": -8.808984756469727, "step": 1570 }, { "epoch": 0.39832351180159453, "grad_norm": 18.446247100830078, "learning_rate": 4.983652611885465e-07, "logits/chosen": -1.169103980064392, "logits/rejected": -1.275610327720642, "logps/chosen": -168.9015655517578, "logps/rejected": -354.86248779296875, "loss": 0.0917, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.773510754108429, "rewards/margins": 7.349609375, "rewards/rejected": -8.121484756469727, "step": 1580 }, { "epoch": 0.4008445466864148, "grad_norm": 35.12605285644531, "learning_rate": 4.982804511159718e-07, "logits/chosen": -1.153234839439392, "logits/rejected": -1.237512230873108, "logps/chosen": -168.94686889648438, "logps/rejected": -344.6625061035156, "loss": 0.065, "rewards/accuracies": 0.96875, "rewards/chosen": 0.4343002438545227, "rewards/margins": 7.962109565734863, "rewards/rejected": -7.526171684265137, "step": 1590 }, { "epoch": 0.403365581571235, "grad_norm": 21.124860763549805, "learning_rate": 4.981935039414763e-07, "logits/chosen": -1.1712830066680908, "logits/rejected": NaN, "logps/chosen": -158.65625, "logps/rejected": -308.4312438964844, "loss": 0.0851, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7620803713798523, "rewards/margins": 8.064062118530273, "rewards/rejected": -7.295702934265137, "step": 1600 }, { "epoch": 0.4058866164560552, "grad_norm": 43.04229736328125, "learning_rate": 4.981044204134676e-07, "logits/chosen": -1.16748046875, "logits/rejected": NaN, "logps/chosen": -156.7804718017578, "logps/rejected": -347.8500061035156, "loss": 0.1124, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.7720916867256165, "rewards/margins": 8.72265625, "rewards/rejected": -7.949023246765137, "step": 1610 }, { "epoch": 0.4084076513408754, "grad_norm": 43.80091857910156, "learning_rate": 4.980132012987421e-07, "logits/chosen": -1.165185570716858, "logits/rejected": NaN, "logps/chosen": -155.1906280517578, "logps/rejected": -346.4750061035156, "loss": 0.1244, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.77508544921875, "rewards/margins": 7.932812690734863, "rewards/rejected": -7.1611328125, "step": 1620 }, { "epoch": 0.41092868622569567, "grad_norm": 19.51668930053711, "learning_rate": 4.979198473824788e-07, "logits/chosen": -1.1923949718475342, "logits/rejected": NaN, "logps/chosen": -139.5554656982422, "logps/rejected": -338.36248779296875, "loss": 0.0951, "rewards/accuracies": 0.953125, "rewards/chosen": 1.2476317882537842, "rewards/margins": 7.804296970367432, "rewards/rejected": -6.558984279632568, "step": 1630 }, { "epoch": 0.41344972111051587, "grad_norm": 23.152746200561523, "learning_rate": 4.97824359468232e-07, "logits/chosen": -1.2561767101287842, "logits/rejected": NaN, "logps/chosen": -181.80313110351562, "logps/rejected": -331.5687561035156, "loss": 0.1526, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.5425354242324829, "rewards/margins": 7.602929592132568, "rewards/rejected": -7.060937404632568, "step": 1640 }, { "epoch": 0.41597075599533606, "grad_norm": 23.78422737121582, "learning_rate": 4.977267383779244e-07, "logits/chosen": -1.2080566883087158, "logits/rejected": NaN, "logps/chosen": -165.6125030517578, "logps/rejected": -342.73748779296875, "loss": 0.0764, "rewards/accuracies": 0.971875011920929, "rewards/chosen": 0.690112292766571, "rewards/margins": 8.057812690734863, "rewards/rejected": -7.364843845367432, "step": 1650 }, { "epoch": 0.4184917908801563, "grad_norm": 41.27838134765625, "learning_rate": 4.976269849518408e-07, "logits/chosen": -1.1915404796600342, "logits/rejected": NaN, "logps/chosen": -158.51327514648438, "logps/rejected": -351.61248779296875, "loss": 0.1395, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.12341918796300888, "rewards/margins": 8.069531440734863, "rewards/rejected": -7.945703029632568, "step": 1660 }, { "epoch": 0.4210128257649765, "grad_norm": 49.29518508911133, "learning_rate": 4.9752510004862e-07, "logits/chosen": -1.1208007335662842, "logits/rejected": -1.158715844154358, "logps/chosen": -158.84530639648438, "logps/rejected": -348.7875061035156, "loss": 0.1154, "rewards/accuracies": 0.9375, "rewards/chosen": -0.17221984267234802, "rewards/margins": 7.984375, "rewards/rejected": -8.160937309265137, "step": 1670 }, { "epoch": 0.42353386064979676, "grad_norm": 43.36105728149414, "learning_rate": 4.974210845452476e-07, "logits/chosen": -1.054022192955017, "logits/rejected": -1.246118187904358, "logps/chosen": -161.43124389648438, "logps/rejected": -379.23126220703125, "loss": 0.0955, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.520904541015625, "rewards/margins": 8.060155868530273, "rewards/rejected": -8.584179878234863, "step": 1680 }, { "epoch": 0.42605489553461695, "grad_norm": 25.2663631439209, "learning_rate": 4.97314939337049e-07, "logits/chosen": -1.160375952720642, "logits/rejected": NaN, "logps/chosen": -167.96875, "logps/rejected": -353.7124938964844, "loss": 0.1077, "rewards/accuracies": 0.96875, "rewards/chosen": -0.10071029514074326, "rewards/margins": 8.234179496765137, "rewards/rejected": -8.336328506469727, "step": 1690 }, { "epoch": 0.4285759304194372, "grad_norm": 43.25634765625, "learning_rate": 4.972066653376808e-07, "logits/chosen": -1.0984070301055908, "logits/rejected": -1.223425269126892, "logps/chosen": -151.28750610351562, "logps/rejected": -343.5625, "loss": 0.0584, "rewards/accuracies": 0.96875, "rewards/chosen": 0.129180908203125, "rewards/margins": 9.109766006469727, "rewards/rejected": -8.980077743530273, "step": 1700 }, { "epoch": 0.4310969653042574, "grad_norm": 58.69062423706055, "learning_rate": 4.970962634791238e-07, "logits/chosen": -1.254064917564392, "logits/rejected": -1.2837402820587158, "logps/chosen": -162.85311889648438, "logps/rejected": -339.8062438964844, "loss": 0.1466, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.5502716302871704, "rewards/margins": 9.201367378234863, "rewards/rejected": -8.649999618530273, "step": 1710 }, { "epoch": 0.4336180001890776, "grad_norm": 33.20095443725586, "learning_rate": 4.969837347116744e-07, "logits/chosen": -1.186132788658142, "logits/rejected": -1.3200805187225342, "logps/chosen": -161.31015014648438, "logps/rejected": -360.8999938964844, "loss": 0.1199, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.4854629635810852, "rewards/margins": 8.929101943969727, "rewards/rejected": -8.44140625, "step": 1720 }, { "epoch": 0.43613903507389784, "grad_norm": 18.996963500976562, "learning_rate": 4.968690800039365e-07, "logits/chosen": -1.165679931640625, "logits/rejected": NaN, "logps/chosen": -177.41250610351562, "logps/rejected": -366.51251220703125, "loss": 0.113, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8264404535293579, "rewards/margins": 9.617773056030273, "rewards/rejected": -10.446093559265137, "step": 1730 }, { "epoch": 0.43866006995871804, "grad_norm": 8.821640014648438, "learning_rate": 4.967523003428134e-07, "logits/chosen": -1.1068115234375, "logits/rejected": NaN, "logps/chosen": -189.0812530517578, "logps/rejected": -385.3374938964844, "loss": 0.1556, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -2.0282835960388184, "rewards/margins": 8.932812690734863, "rewards/rejected": -10.959375381469727, "step": 1740 }, { "epoch": 0.4411811048435383, "grad_norm": 11.586294174194336, "learning_rate": 4.966333967334992e-07, "logits/chosen": -1.124719262123108, "logits/rejected": NaN, "logps/chosen": -191.859375, "logps/rejected": -348.82501220703125, "loss": 0.1356, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7725372314453125, "rewards/margins": 8.964550971984863, "rewards/rejected": -9.741406440734863, "step": 1750 }, { "epoch": 0.4437021397283585, "grad_norm": 35.946800231933594, "learning_rate": 4.965123701994703e-07, "logits/chosen": -1.1837158203125, "logits/rejected": NaN, "logps/chosen": -165.8718719482422, "logps/rejected": -337.48748779296875, "loss": 0.1424, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -0.23861083388328552, "rewards/margins": 8.182421684265137, "rewards/rejected": -8.4248046875, "step": 1760 }, { "epoch": 0.44622317461317873, "grad_norm": 18.85517120361328, "learning_rate": 4.963892217824761e-07, "logits/chosen": -1.1935303211212158, "logits/rejected": NaN, "logps/chosen": -131.22500610351562, "logps/rejected": -323.70001220703125, "loss": 0.096, "rewards/accuracies": 0.953125, "rewards/chosen": 0.2587112486362457, "rewards/margins": 8.2470703125, "rewards/rejected": -7.984375, "step": 1770 }, { "epoch": 0.4487442094979989, "grad_norm": 53.279300689697266, "learning_rate": 4.962639525425303e-07, "logits/chosen": -1.2455322742462158, "logits/rejected": -1.3541748523712158, "logps/chosen": -177.0515594482422, "logps/rejected": -349.1625061035156, "loss": 0.1291, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.05239257961511612, "rewards/margins": 8.387304306030273, "rewards/rejected": -8.446484565734863, "step": 1780 }, { "epoch": 0.4512652443828191, "grad_norm": 37.797088623046875, "learning_rate": 4.961365635579021e-07, "logits/chosen": -1.17010498046875, "logits/rejected": -1.287438988685608, "logps/chosen": -164.71875, "logps/rejected": -322.51251220703125, "loss": 0.1153, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.37158203125, "rewards/margins": 8.505859375, "rewards/rejected": -8.877734184265137, "step": 1790 }, { "epoch": 0.45378627926763937, "grad_norm": 40.78840255737305, "learning_rate": 4.960070559251066e-07, "logits/chosen": -1.0511658191680908, "logits/rejected": NaN, "logps/chosen": -176.5749969482422, "logps/rejected": -349.63751220703125, "loss": 0.1531, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.934173583984375, "rewards/margins": 8.639062881469727, "rewards/rejected": -9.570898056030273, "step": 1800 }, { "epoch": 0.45630731415245956, "grad_norm": 2.93536639213562, "learning_rate": 4.958754307588952e-07, "logits/chosen": -1.0366699695587158, "logits/rejected": NaN, "logps/chosen": -167.24063110351562, "logps/rejected": -362.38751220703125, "loss": 0.0943, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.971325695514679, "rewards/margins": 8.864160537719727, "rewards/rejected": -9.840624809265137, "step": 1810 }, { "epoch": 0.4588283490372798, "grad_norm": 35.41891860961914, "learning_rate": 4.957416891922463e-07, "logits/chosen": -1.200769066810608, "logits/rejected": NaN, "logps/chosen": -167.82186889648438, "logps/rejected": -359.79998779296875, "loss": 0.1198, "rewards/accuracies": 0.953125, "rewards/chosen": -0.12543945014476776, "rewards/margins": 8.527734756469727, "rewards/rejected": -8.653124809265137, "step": 1820 }, { "epoch": 0.4613493839221, "grad_norm": 38.26395797729492, "learning_rate": 4.956058323763555e-07, "logits/chosen": -1.18231201171875, "logits/rejected": NaN, "logps/chosen": -162.4578094482422, "logps/rejected": -324.1000061035156, "loss": 0.108, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.564958930015564, "rewards/margins": 8.226953506469727, "rewards/rejected": -7.6630859375, "step": 1830 }, { "epoch": 0.46387041880692026, "grad_norm": 16.926193237304688, "learning_rate": 4.954678614806258e-07, "logits/chosen": -1.147003173828125, "logits/rejected": NaN, "logps/chosen": -173.94140625, "logps/rejected": -353.5249938964844, "loss": 0.1022, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.45454102754592896, "rewards/margins": 8.19921875, "rewards/rejected": -7.745703220367432, "step": 1840 }, { "epoch": 0.46639145369174045, "grad_norm": 37.22435760498047, "learning_rate": 4.953277776926571e-07, "logits/chosen": -1.15411376953125, "logits/rejected": -1.281378149986267, "logps/chosen": -159.8859405517578, "logps/rejected": -329.8812561035156, "loss": 0.1261, "rewards/accuracies": 0.953125, "rewards/chosen": 0.41975098848342896, "rewards/margins": 8.056640625, "rewards/rejected": -7.642382621765137, "step": 1850 }, { "epoch": 0.4689124885765607, "grad_norm": 40.29639434814453, "learning_rate": 4.951855822182363e-07, "logits/chosen": -1.2020142078399658, "logits/rejected": NaN, "logps/chosen": -150.28359985351562, "logps/rejected": -330.8500061035156, "loss": 0.0777, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.294000267982483, "rewards/margins": 7.897265434265137, "rewards/rejected": -6.6044921875, "step": 1860 }, { "epoch": 0.4714335234613809, "grad_norm": 14.668291091918945, "learning_rate": 4.95041276281327e-07, "logits/chosen": -1.144189476966858, "logits/rejected": NaN, "logps/chosen": -131.9093780517578, "logps/rejected": -300.03125, "loss": 0.089, "rewards/accuracies": 0.953125, "rewards/chosen": 0.932537853717804, "rewards/margins": 7.545507907867432, "rewards/rejected": -6.612500190734863, "step": 1870 }, { "epoch": 0.4739545583462011, "grad_norm": 17.22679901123047, "learning_rate": 4.948948611240588e-07, "logits/chosen": -1.1851317882537842, "logits/rejected": -1.324682593345642, "logps/chosen": -149.609375, "logps/rejected": -323.86248779296875, "loss": 0.0822, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.8300536870956421, "rewards/margins": 8.061254501342773, "rewards/rejected": -7.23193359375, "step": 1880 }, { "epoch": 0.47647559323102134, "grad_norm": 74.50685119628906, "learning_rate": 4.947463380067166e-07, "logits/chosen": -1.147924780845642, "logits/rejected": NaN, "logps/chosen": -147.7234344482422, "logps/rejected": -328.875, "loss": 0.141, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.11084594577550888, "rewards/margins": 8.559961318969727, "rewards/rejected": -8.457422256469727, "step": 1890 }, { "epoch": 0.47899662811584154, "grad_norm": 5.231132507324219, "learning_rate": 4.945957082077298e-07, "logits/chosen": -1.148461937904358, "logits/rejected": -1.272680640220642, "logps/chosen": -154.1640625, "logps/rejected": -361.875, "loss": 0.0956, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.3473754823207855, "rewards/margins": 9.430273056030273, "rewards/rejected": -9.0859375, "step": 1900 }, { "epoch": 0.4815176630006618, "grad_norm": 48.987876892089844, "learning_rate": 4.944429730236617e-07, "logits/chosen": -1.1704925298690796, "logits/rejected": NaN, "logps/chosen": -165.85000610351562, "logps/rejected": -349.40625, "loss": 0.1807, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.41611480712890625, "rewards/margins": 9.469531059265137, "rewards/rejected": -9.053906440734863, "step": 1910 }, { "epoch": 0.484038697885482, "grad_norm": 34.76906967163086, "learning_rate": 4.942881337691971e-07, "logits/chosen": -1.148290991783142, "logits/rejected": NaN, "logps/chosen": -157.28750610351562, "logps/rejected": -345.76251220703125, "loss": 0.154, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.38721925020217896, "rewards/margins": 9.164648056030273, "rewards/rejected": -8.778124809265137, "step": 1920 }, { "epoch": 0.48655973277030223, "grad_norm": 8.718462944030762, "learning_rate": 4.941311917771324e-07, "logits/chosen": -1.0844848155975342, "logits/rejected": NaN, "logps/chosen": -181.11093139648438, "logps/rejected": -363.2124938964844, "loss": 0.0949, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0011627196799963713, "rewards/margins": 9.568750381469727, "rewards/rejected": -9.569921493530273, "step": 1930 }, { "epoch": 0.48908076765512243, "grad_norm": 60.81895446777344, "learning_rate": 4.939721483983639e-07, "logits/chosen": -0.9905761480331421, "logits/rejected": -1.2114746570587158, "logps/chosen": -166.484375, "logps/rejected": -353.51251220703125, "loss": 0.1022, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.374871850013733, "rewards/margins": 8.926953315734863, "rewards/rejected": -10.297656059265137, "step": 1940 }, { "epoch": 0.4916018025399426, "grad_norm": 23.947477340698242, "learning_rate": 4.938110050018747e-07, "logits/chosen": -1.0089294910430908, "logits/rejected": -1.2086975574493408, "logps/chosen": -172.68594360351562, "logps/rejected": -366.20001220703125, "loss": 0.1269, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.9927246570587158, "rewards/margins": 8.818359375, "rewards/rejected": -10.810155868530273, "step": 1950 }, { "epoch": 0.4941228374247629, "grad_norm": 35.51654052734375, "learning_rate": 4.936477629747253e-07, "logits/chosen": -1.097631812095642, "logits/rejected": NaN, "logps/chosen": -192.91561889648438, "logps/rejected": -392.2875061035156, "loss": 0.1463, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.8665955066680908, "rewards/margins": 8.555273056030273, "rewards/rejected": -10.421483993530273, "step": 1960 }, { "epoch": 0.49664387230958307, "grad_norm": 81.0794677734375, "learning_rate": 4.934824237220395e-07, "logits/chosen": -1.082489013671875, "logits/rejected": NaN, "logps/chosen": -172.50546264648438, "logps/rejected": -338.5249938964844, "loss": 0.105, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.13001708686351776, "rewards/margins": 8.348437309265137, "rewards/rejected": -8.4794921875, "step": 1970 }, { "epoch": 0.4991649071944033, "grad_norm": 11.260257720947266, "learning_rate": 4.933149886669936e-07, "logits/chosen": -1.1701171398162842, "logits/rejected": NaN, "logps/chosen": -158.59219360351562, "logps/rejected": -330.3374938964844, "loss": 0.1162, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.7947418093681335, "rewards/margins": 7.731835842132568, "rewards/rejected": -6.937304496765137, "step": 1980 }, { "epoch": 0.5016859420792236, "grad_norm": 7.895501136779785, "learning_rate": 4.931454592508037e-07, "logits/chosen": -1.041479468345642, "logits/rejected": NaN, "logps/chosen": -138.38436889648438, "logps/rejected": -315.6625061035156, "loss": 0.0789, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.27969780564308167, "rewards/margins": 8.47265625, "rewards/rejected": -8.19140625, "step": 1990 }, { "epoch": 0.5042069769640437, "grad_norm": 65.24656677246094, "learning_rate": 4.929738369327133e-07, "logits/chosen": -1.0851562023162842, "logits/rejected": -1.2528808116912842, "logps/chosen": -158.18124389648438, "logps/rejected": -356.63751220703125, "loss": 0.0916, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.4310851991176605, "rewards/margins": 8.785351753234863, "rewards/rejected": -9.213671684265137, "step": 2000 }, { "epoch": 0.506728011848864, "grad_norm": 25.298076629638672, "learning_rate": 4.928001231899809e-07, "logits/chosen": -1.1340820789337158, "logits/rejected": NaN, "logps/chosen": -170.9765625, "logps/rejected": -350.2562561035156, "loss": 0.1459, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.6165221929550171, "rewards/margins": 9.159570693969727, "rewards/rejected": -9.775781631469727, "step": 2010 }, { "epoch": 0.5092490467336842, "grad_norm": 64.31370544433594, "learning_rate": 4.926243195178669e-07, "logits/chosen": -1.1616942882537842, "logits/rejected": -1.2569458484649658, "logps/chosen": -166.25, "logps/rejected": -334.61248779296875, "loss": 0.1978, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.35383301973342896, "rewards/margins": 8.579882621765137, "rewards/rejected": -8.930859565734863, "step": 2020 }, { "epoch": 0.5117700816185043, "grad_norm": 54.36347579956055, "learning_rate": 4.924464274296214e-07, "logits/chosen": -1.1165771484375, "logits/rejected": NaN, "logps/chosen": -166.328125, "logps/rejected": -336.8062438964844, "loss": 0.1994, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.35926514863967896, "rewards/margins": 8.010644912719727, "rewards/rejected": -8.371679306030273, "step": 2030 }, { "epoch": 0.5142911165033246, "grad_norm": 15.46080207824707, "learning_rate": 4.922664484564704e-07, "logits/chosen": -1.1998779773712158, "logits/rejected": -1.3221435546875, "logps/chosen": -158.28125, "logps/rejected": -361.7749938964844, "loss": 0.0873, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.5888274908065796, "rewards/margins": 9.322851181030273, "rewards/rejected": -8.739843368530273, "step": 2040 }, { "epoch": 0.5168121513881448, "grad_norm": 44.14103698730469, "learning_rate": 4.920843841476032e-07, "logits/chosen": -1.189569115638733, "logits/rejected": -1.25543212890625, "logps/chosen": -156.83438110351562, "logps/rejected": -348.4624938964844, "loss": 0.0891, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05631103366613388, "rewards/margins": 9.009570121765137, "rewards/rejected": -8.953125, "step": 2050 }, { "epoch": 0.5193331862729651, "grad_norm": 56.55488967895508, "learning_rate": 4.91900236070159e-07, "logits/chosen": -0.977160632610321, "logits/rejected": NaN, "logps/chosen": -161.67343139648438, "logps/rejected": -352.9437561035156, "loss": 0.1058, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5609496831893921, "rewards/margins": 9.3330078125, "rewards/rejected": -9.892578125, "step": 2060 }, { "epoch": 0.5218542211577852, "grad_norm": 53.710968017578125, "learning_rate": 4.917140058092128e-07, "logits/chosen": -1.1936523914337158, "logits/rejected": NaN, "logps/chosen": -177.7859344482422, "logps/rejected": -359.0249938964844, "loss": 0.1192, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.5446106195449829, "rewards/margins": 9.752734184265137, "rewards/rejected": -10.30078125, "step": 2070 }, { "epoch": 0.5243752560426055, "grad_norm": 6.296230792999268, "learning_rate": 4.915256949677628e-07, "logits/chosen": -1.100549340248108, "logits/rejected": NaN, "logps/chosen": -162.22265625, "logps/rejected": -354.63751220703125, "loss": 0.1456, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.6857849359512329, "rewards/margins": 9.808008193969727, "rewards/rejected": -10.494140625, "step": 2080 }, { "epoch": 0.5268962909274257, "grad_norm": 2.8434290885925293, "learning_rate": 4.913353051667155e-07, "logits/chosen": -1.051245093345642, "logits/rejected": NaN, "logps/chosen": -181.24374389648438, "logps/rejected": -357.75, "loss": 0.1215, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.011260986328125, "rewards/margins": 9.109375, "rewards/rejected": -10.118359565734863, "step": 2090 }, { "epoch": 0.5294173258122459, "grad_norm": 51.87717056274414, "learning_rate": 4.911428380448727e-07, "logits/chosen": -1.0196136236190796, "logits/rejected": NaN, "logps/chosen": -178.50625610351562, "logps/rejected": -343.0375061035156, "loss": 0.0992, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.1700042486190796, "rewards/margins": 9.272656440734863, "rewards/rejected": -10.436718940734863, "step": 2100 }, { "epoch": 0.5319383606970661, "grad_norm": 39.03517150878906, "learning_rate": 4.909482952589169e-07, "logits/chosen": -1.0492675304412842, "logits/rejected": NaN, "logps/chosen": -175.8718719482422, "logps/rejected": -372.29998779296875, "loss": 0.1706, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8878173828125, "rewards/margins": 9.0791015625, "rewards/rejected": -9.967187881469727, "step": 2110 }, { "epoch": 0.5344593955818864, "grad_norm": 36.857444763183594, "learning_rate": 4.907516784833968e-07, "logits/chosen": -1.138452172279358, "logits/rejected": NaN, "logps/chosen": -170.056640625, "logps/rejected": -371.57501220703125, "loss": 0.0709, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.14375457167625427, "rewards/margins": 9.03515625, "rewards/rejected": -9.182031631469727, "step": 2120 }, { "epoch": 0.5369804304667066, "grad_norm": 20.172332763671875, "learning_rate": 4.905529894107136e-07, "logits/chosen": -1.126074194908142, "logits/rejected": -1.2149749994277954, "logps/chosen": -176.2312469482422, "logps/rejected": -342.76251220703125, "loss": 0.1075, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.13487854599952698, "rewards/margins": 9.310254096984863, "rewards/rejected": -9.444140434265137, "step": 2130 }, { "epoch": 0.5395014653515268, "grad_norm": 62.058815002441406, "learning_rate": 4.903522297511058e-07, "logits/chosen": -1.2261962890625, "logits/rejected": -1.2561767101287842, "logps/chosen": -160.3937530517578, "logps/rejected": -367.25, "loss": 0.0803, "rewards/accuracies": 0.971875011920929, "rewards/chosen": 0.35161131620407104, "rewards/margins": 9.594335556030273, "rewards/rejected": -9.242968559265137, "step": 2140 }, { "epoch": 0.542022500236347, "grad_norm": 26.38711166381836, "learning_rate": 4.901494012326346e-07, "logits/chosen": -1.157617211341858, "logits/rejected": -1.235571265220642, "logps/chosen": -168.08749389648438, "logps/rejected": -350.875, "loss": 0.101, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.18499144911766052, "rewards/margins": 9.465527534484863, "rewards/rejected": -9.281641006469727, "step": 2150 }, { "epoch": 0.5445435351211673, "grad_norm": 42.21955490112305, "learning_rate": 4.899445056011695e-07, "logits/chosen": -1.073950171470642, "logits/rejected": -1.179785132408142, "logps/chosen": -182.5390625, "logps/rejected": -364.71875, "loss": 0.0901, "rewards/accuracies": 0.953125, "rewards/chosen": -0.41880494356155396, "rewards/margins": 10.140039443969727, "rewards/rejected": -10.560937881469727, "step": 2160 }, { "epoch": 0.5470645700059874, "grad_norm": 33.1414794921875, "learning_rate": 4.897375446203727e-07, "logits/chosen": NaN, "logits/rejected": -1.0929443836212158, "logps/chosen": -164.3781280517578, "logps/rejected": -379.1499938964844, "loss": 0.1607, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.436126708984375, "rewards/margins": 9.450586318969727, "rewards/rejected": -10.887890815734863, "step": 2170 }, { "epoch": 0.5495856048908077, "grad_norm": 6.710944175720215, "learning_rate": 4.89528520071684e-07, "logits/chosen": -1.209814429283142, "logits/rejected": -1.280889868736267, "logps/chosen": -160.17343139648438, "logps/rejected": -365.39373779296875, "loss": 0.0848, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.679840087890625, "rewards/margins": 9.635156631469727, "rewards/rejected": -10.310937881469727, "step": 2180 }, { "epoch": 0.5521066397756279, "grad_norm": 8.57651424407959, "learning_rate": 4.893174337543058e-07, "logits/chosen": -1.109106421470642, "logits/rejected": -1.180078148841858, "logps/chosen": -157.9250030517578, "logps/rejected": -362.9937438964844, "loss": 0.1092, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0389525890350342, "rewards/margins": 9.461718559265137, "rewards/rejected": -10.499218940734863, "step": 2190 }, { "epoch": 0.5546276746604482, "grad_norm": 24.483020782470703, "learning_rate": 4.891042874851873e-07, "logits/chosen": -1.21746826171875, "logits/rejected": NaN, "logps/chosen": -177.68124389648438, "logps/rejected": -363.9750061035156, "loss": 0.0547, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.44205015897750854, "rewards/margins": 9.950780868530273, "rewards/rejected": -10.391406059265137, "step": 2200 }, { "epoch": 0.5571487095452683, "grad_norm": 70.53258514404297, "learning_rate": 4.888890830990091e-07, "logits/chosen": -1.0935852527618408, "logits/rejected": -1.194726586341858, "logps/chosen": -178.85000610351562, "logps/rejected": -365.7124938964844, "loss": 0.1365, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.27095335721969604, "rewards/margins": 10.329687118530273, "rewards/rejected": -10.598437309265137, "step": 2210 }, { "epoch": 0.5596697444300885, "grad_norm": 35.11616897583008, "learning_rate": 4.88671822448167e-07, "logits/chosen": -1.1938660144805908, "logits/rejected": -1.2064940929412842, "logps/chosen": -167.87655639648438, "logps/rejected": -345.61248779296875, "loss": 0.1319, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.44330137968063354, "rewards/margins": 9.998827934265137, "rewards/rejected": -9.554296493530273, "step": 2220 }, { "epoch": 0.5621907793149088, "grad_norm": 12.91801929473877, "learning_rate": 4.884525074027566e-07, "logits/chosen": -1.1938598155975342, "logits/rejected": NaN, "logps/chosen": -152.640625, "logps/rejected": -357.2562561035156, "loss": 0.0869, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.9036712646484375, "rewards/margins": 8.7646484375, "rewards/rejected": -7.856054782867432, "step": 2230 }, { "epoch": 0.5647118141997289, "grad_norm": 25.854982376098633, "learning_rate": 4.882311398505568e-07, "logits/chosen": -1.189367651939392, "logits/rejected": NaN, "logps/chosen": -171.734375, "logps/rejected": -325.0625, "loss": 0.1297, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.3280273377895355, "rewards/margins": 8.994336128234863, "rewards/rejected": -9.322656631469727, "step": 2240 }, { "epoch": 0.5672328490845492, "grad_norm": 51.82754898071289, "learning_rate": 4.880077216970139e-07, "logits/chosen": -1.16705322265625, "logits/rejected": -1.2383911609649658, "logps/chosen": -156.3234405517578, "logps/rejected": -373.6000061035156, "loss": 0.0705, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.33837890625, "rewards/margins": 10.006640434265137, "rewards/rejected": -10.342968940734863, "step": 2250 }, { "epoch": 0.5697538839693694, "grad_norm": 42.65985870361328, "learning_rate": 4.877822548652244e-07, "logits/chosen": -1.112420678138733, "logits/rejected": -1.255395531654358, "logps/chosen": -183.875, "logps/rejected": -393.57501220703125, "loss": 0.1164, "rewards/accuracies": 0.953125, "rewards/chosen": -0.48215943574905396, "rewards/margins": 10.734766006469727, "rewards/rejected": -11.212499618530273, "step": 2260 }, { "epoch": 0.5722749188541897, "grad_norm": 35.932735443115234, "learning_rate": 4.875547412959198e-07, "logits/chosen": -1.0635284185409546, "logits/rejected": -1.175024390220642, "logps/chosen": -178.203125, "logps/rejected": -406.6875, "loss": 0.0879, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.9829956293106079, "rewards/margins": 11.654296875, "rewards/rejected": -12.634765625, "step": 2270 }, { "epoch": 0.5747959537390098, "grad_norm": 63.61937713623047, "learning_rate": 4.873251829474485e-07, "logits/chosen": -1.1728515625, "logits/rejected": NaN, "logps/chosen": -195.15469360351562, "logps/rejected": -381.38751220703125, "loss": 0.1612, "rewards/accuracies": 0.9375, "rewards/chosen": -0.89678955078125, "rewards/margins": 10.889941215515137, "rewards/rejected": -11.781641006469727, "step": 2280 }, { "epoch": 0.5773169886238301, "grad_norm": 4.922110557556152, "learning_rate": 4.870935817957599e-07, "logits/chosen": -1.12310791015625, "logits/rejected": NaN, "logps/chosen": -199.3156280517578, "logps/rejected": -342.125, "loss": 0.1361, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.7691589593887329, "rewards/margins": 11.031396865844727, "rewards/rejected": -10.260156631469727, "step": 2290 }, { "epoch": 0.5798380235086503, "grad_norm": 25.650833129882812, "learning_rate": 4.868599398343871e-07, "logits/chosen": -1.115930199623108, "logits/rejected": -1.175537109375, "logps/chosen": -141.83749389648438, "logps/rejected": -332.1499938964844, "loss": 0.0917, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10904846340417862, "rewards/margins": 9.793164253234863, "rewards/rejected": -9.684374809265137, "step": 2300 }, { "epoch": 0.5823590583934706, "grad_norm": 70.23519897460938, "learning_rate": 4.866242590744294e-07, "logits/chosen": -1.140527367591858, "logits/rejected": -1.2777831554412842, "logps/chosen": -169.7468719482422, "logps/rejected": -404.0375061035156, "loss": 0.1036, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.008319091983139515, "rewards/margins": 10.530077934265137, "rewards/rejected": -10.523828506469727, "step": 2310 }, { "epoch": 0.5848800932782907, "grad_norm": 50.69040298461914, "learning_rate": 4.863865415445356e-07, "logits/chosen": -1.123388648033142, "logits/rejected": -1.17657470703125, "logps/chosen": -160.2531280517578, "logps/rejected": -357.625, "loss": 0.1548, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.07565917819738388, "rewards/margins": 10.223437309265137, "rewards/rejected": -10.293749809265137, "step": 2320 }, { "epoch": 0.587401128163111, "grad_norm": 27.327226638793945, "learning_rate": 4.861467892908859e-07, "logits/chosen": -1.131494164466858, "logits/rejected": NaN, "logps/chosen": -159.68124389648438, "logps/rejected": -352.67498779296875, "loss": 0.1454, "rewards/accuracies": 0.934374988079071, "rewards/chosen": 0.16353759169578552, "rewards/margins": 10.327343940734863, "rewards/rejected": -10.161913871765137, "step": 2330 }, { "epoch": 0.5899221630479312, "grad_norm": 57.25568389892578, "learning_rate": 4.85905004377175e-07, "logits/chosen": -1.1108871698379517, "logits/rejected": NaN, "logps/chosen": -184.875, "logps/rejected": -387.7875061035156, "loss": 0.0777, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.20819091796875, "rewards/margins": 10.596094131469727, "rewards/rejected": -10.807812690734863, "step": 2340 }, { "epoch": 0.5924431979327514, "grad_norm": 55.484397888183594, "learning_rate": 4.856611888845937e-07, "logits/chosen": -1.169470191001892, "logits/rejected": -1.2133057117462158, "logps/chosen": -193.9562530517578, "logps/rejected": -397.2749938964844, "loss": 0.096, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.2906860411167145, "rewards/margins": 10.153905868530273, "rewards/rejected": -10.445703506469727, "step": 2350 }, { "epoch": 0.5949642328175716, "grad_norm": 50.575531005859375, "learning_rate": 4.854153449118112e-07, "logits/chosen": -1.123693823814392, "logits/rejected": -1.156494140625, "logps/chosen": -166.29061889648438, "logps/rejected": -338.6000061035156, "loss": 0.1051, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.13056334853172302, "rewards/margins": 9.580370903015137, "rewards/rejected": -9.4443359375, "step": 2360 }, { "epoch": 0.5974852677023919, "grad_norm": 10.975263595581055, "learning_rate": 4.851674745749571e-07, "logits/chosen": -1.0478394031524658, "logits/rejected": NaN, "logps/chosen": -156.97109985351562, "logps/rejected": -330.6625061035156, "loss": 0.1304, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.04766845703125, "rewards/margins": 9.923632621765137, "rewards/rejected": -9.876953125, "step": 2370 }, { "epoch": 0.6000063025872121, "grad_norm": 47.366092681884766, "learning_rate": 4.849175800076034e-07, "logits/chosen": -1.1410095691680908, "logits/rejected": NaN, "logps/chosen": -188.81875610351562, "logps/rejected": -386.4375, "loss": 0.0862, "rewards/accuracies": 0.953125, "rewards/chosen": -0.40714341402053833, "rewards/margins": 10.071093559265137, "rewards/rejected": -10.476953506469727, "step": 2380 }, { "epoch": 0.6025273374720322, "grad_norm": 31.29020118713379, "learning_rate": 4.846656633607458e-07, "logits/chosen": -0.9641281366348267, "logits/rejected": -1.1159378290176392, "logps/chosen": -168.25936889648438, "logps/rejected": -398.60626220703125, "loss": 0.1153, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7259155511856079, "rewards/margins": 9.9423828125, "rewards/rejected": -10.666015625, "step": 2390 }, { "epoch": 0.6050483723568525, "grad_norm": 23.906402587890625, "learning_rate": 4.844117268027848e-07, "logits/chosen": -1.0350220203399658, "logits/rejected": NaN, "logps/chosen": -179.63436889648438, "logps/rejected": -355.0375061035156, "loss": 0.1081, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.294165015220642, "rewards/margins": 9.905858993530273, "rewards/rejected": -11.200390815734863, "step": 2400 }, { "epoch": 0.6075694072416727, "grad_norm": 40.88800811767578, "learning_rate": 4.841557725195083e-07, "logits/chosen": -1.068090796470642, "logits/rejected": NaN, "logps/chosen": -174.4812469482422, "logps/rejected": -372.1000061035156, "loss": 0.1191, "rewards/accuracies": 0.953125, "rewards/chosen": -0.8603851199150085, "rewards/margins": 10.120702743530273, "rewards/rejected": -10.979687690734863, "step": 2410 }, { "epoch": 0.6100904421264929, "grad_norm": 42.18296813964844, "learning_rate": 4.838978027140713e-07, "logits/chosen": -0.9261474609375, "logits/rejected": -1.2021362781524658, "logps/chosen": -156.74063110351562, "logps/rejected": -350.9125061035156, "loss": 0.125, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.618542492389679, "rewards/margins": 9.337109565734863, "rewards/rejected": -9.955078125, "step": 2420 }, { "epoch": 0.6126114770113131, "grad_norm": 6.31051778793335, "learning_rate": 4.836378196069781e-07, "logits/chosen": -1.008764624595642, "logits/rejected": -1.1471068859100342, "logps/chosen": -156.8703155517578, "logps/rejected": -323.26251220703125, "loss": 0.1408, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.678619384765625, "rewards/margins": 8.6962890625, "rewards/rejected": -9.371874809265137, "step": 2430 }, { "epoch": 0.6151325118961334, "grad_norm": 36.19623565673828, "learning_rate": 4.833758254360625e-07, "logits/chosen": -1.1063721179962158, "logits/rejected": NaN, "logps/chosen": -172.2843780517578, "logps/rejected": -343.4624938964844, "loss": 0.1003, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.725573718547821, "rewards/margins": 9.340234756469727, "rewards/rejected": -10.066797256469727, "step": 2440 }, { "epoch": 0.6176535467809536, "grad_norm": 9.959573745727539, "learning_rate": 4.831118224564688e-07, "logits/chosen": -1.0214722156524658, "logits/rejected": NaN, "logps/chosen": -187.11874389648438, "logps/rejected": -380.6499938964844, "loss": 0.1227, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.8923614621162415, "rewards/margins": 9.869531631469727, "rewards/rejected": -10.756640434265137, "step": 2450 }, { "epoch": 0.6201745816657738, "grad_norm": 26.491514205932617, "learning_rate": 4.828458129406322e-07, "logits/chosen": -1.0314605236053467, "logits/rejected": NaN, "logps/chosen": -171.2218780517578, "logps/rejected": -351.5625, "loss": 0.0755, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.747418224811554, "rewards/margins": 9.834375381469727, "rewards/rejected": -10.584765434265137, "step": 2460 }, { "epoch": 0.622695616550594, "grad_norm": 122.34345245361328, "learning_rate": 4.825777991782599e-07, "logits/chosen": -1.02764892578125, "logits/rejected": -1.125524878501892, "logps/chosen": -174.19686889648438, "logps/rejected": -376.95001220703125, "loss": 0.1039, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.8089019656181335, "rewards/margins": 10.124608993530273, "rewards/rejected": -10.930078506469727, "step": 2470 }, { "epoch": 0.6252166514354143, "grad_norm": 37.06415557861328, "learning_rate": 4.823077834763102e-07, "logits/chosen": -0.9547653198242188, "logits/rejected": NaN, "logps/chosen": -159.2937469482422, "logps/rejected": -364.29998779296875, "loss": 0.146, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.5744964480400085, "rewards/margins": 10.406445503234863, "rewards/rejected": -10.977343559265137, "step": 2480 }, { "epoch": 0.6277376863202344, "grad_norm": 39.44899368286133, "learning_rate": 4.820357681589738e-07, "logits/chosen": -1.0011444091796875, "logits/rejected": -1.1051146984100342, "logps/chosen": -165.35000610351562, "logps/rejected": -357.75, "loss": 0.1473, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.0424247980117798, "rewards/margins": 10.018603324890137, "rewards/rejected": -11.0625, "step": 2490 }, { "epoch": 0.6302587212050547, "grad_norm": 61.26754379272461, "learning_rate": 4.817617555676531e-07, "logits/chosen": -0.8036254644393921, "logits/rejected": -1.0375503301620483, "logps/chosen": -161.3640594482422, "logps/rejected": -363.1625061035156, "loss": 0.1086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.243402123451233, "rewards/margins": 10.059961318969727, "rewards/rejected": -11.299219131469727, "step": 2500 }, { "epoch": 0.6327797560898749, "grad_norm": 38.478275299072266, "learning_rate": 4.814857480609423e-07, "logits/chosen": -1.01806640625, "logits/rejected": -1.19903564453125, "logps/chosen": -182.1999969482422, "logps/rejected": -372.6625061035156, "loss": 0.1792, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.9863159656524658, "rewards/margins": 9.485644340515137, "rewards/rejected": -11.474218368530273, "step": 2510 }, { "epoch": 0.6353007909746952, "grad_norm": 75.65766906738281, "learning_rate": 4.812077480146071e-07, "logits/chosen": -0.910137951374054, "logits/rejected": -1.101318359375, "logps/chosen": -183.9375, "logps/rejected": -367.6000061035156, "loss": 0.1726, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.424450635910034, "rewards/margins": 8.590624809265137, "rewards/rejected": -11.014452934265137, "step": 2520 }, { "epoch": 0.6378218258595153, "grad_norm": 46.309452056884766, "learning_rate": 4.809277578215642e-07, "logits/chosen": -0.9774169921875, "logits/rejected": NaN, "logps/chosen": -176.30313110351562, "logps/rejected": -356.54998779296875, "loss": 0.0868, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.5592467784881592, "rewards/margins": 9.064453125, "rewards/rejected": -10.622265815734863, "step": 2530 }, { "epoch": 0.6403428607443356, "grad_norm": 21.755626678466797, "learning_rate": 4.806457798918605e-07, "logits/chosen": -1.09869384765625, "logits/rejected": NaN, "logps/chosen": -182.55624389648438, "logps/rejected": -382.1875, "loss": 0.1329, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.999005138874054, "rewards/margins": 9.275976181030273, "rewards/rejected": -10.275390625, "step": 2540 }, { "epoch": 0.6428638956291558, "grad_norm": 29.04755210876465, "learning_rate": 4.80361816652653e-07, "logits/chosen": -1.1414062976837158, "logits/rejected": -1.2694580554962158, "logps/chosen": -178.49374389648438, "logps/rejected": -354.625, "loss": 0.1272, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -1.211938500404358, "rewards/margins": 9.155858993530273, "rewards/rejected": -10.364453315734863, "step": 2550 }, { "epoch": 0.6453849305139759, "grad_norm": 15.099742889404297, "learning_rate": 4.800758705481872e-07, "logits/chosen": -1.106530785560608, "logits/rejected": -1.2060058116912842, "logps/chosen": -181.80313110351562, "logps/rejected": -351.9750061035156, "loss": 0.1159, "rewards/accuracies": 0.9375, "rewards/chosen": -0.68414306640625, "rewards/margins": 8.308202743530273, "rewards/rejected": -8.993359565734863, "step": 2560 }, { "epoch": 0.6479059653987962, "grad_norm": 36.96305847167969, "learning_rate": 4.797879440397764e-07, "logits/chosen": -1.0145385265350342, "logits/rejected": NaN, "logps/chosen": -154.421875, "logps/rejected": -324.07501220703125, "loss": 0.1413, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6538635492324829, "rewards/margins": 7.828711032867432, "rewards/rejected": -8.482421875, "step": 2570 }, { "epoch": 0.6504270002836164, "grad_norm": 21.71329116821289, "learning_rate": 4.794980396057802e-07, "logits/chosen": -1.045068383216858, "logits/rejected": NaN, "logps/chosen": -180.1531219482422, "logps/rejected": -348.61248779296875, "loss": 0.1292, "rewards/accuracies": 0.953125, "rewards/chosen": -0.6067970395088196, "rewards/margins": 7.794140815734863, "rewards/rejected": -8.406445503234863, "step": 2580 }, { "epoch": 0.6529480351684367, "grad_norm": 66.87492370605469, "learning_rate": 4.792061597415838e-07, "logits/chosen": -1.13153076171875, "logits/rejected": NaN, "logps/chosen": -170.42813110351562, "logps/rejected": -345.88751220703125, "loss": 0.1106, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.6267852783203125, "rewards/margins": 8.701952934265137, "rewards/rejected": -9.328906059265137, "step": 2590 }, { "epoch": 0.6554690700532568, "grad_norm": 40.47690963745117, "learning_rate": 4.78912306959576e-07, "logits/chosen": -1.009979248046875, "logits/rejected": -1.122290015220642, "logps/chosen": -167.02499389648438, "logps/rejected": -365.70001220703125, "loss": 0.1177, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.49114990234375, "rewards/margins": 9.353124618530273, "rewards/rejected": -9.841015815734863, "step": 2600 }, { "epoch": 0.6579901049380771, "grad_norm": 14.012775421142578, "learning_rate": 4.786164837891277e-07, "logits/chosen": -1.0084960460662842, "logits/rejected": -1.198632836341858, "logps/chosen": -166.04531860351562, "logps/rejected": -382.8999938964844, "loss": 0.1179, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.890209972858429, "rewards/margins": 9.641016006469727, "rewards/rejected": -10.534765243530273, "step": 2610 }, { "epoch": 0.6605111398228973, "grad_norm": 24.993738174438477, "learning_rate": 4.7831869277657e-07, "logits/chosen": -1.0231444835662842, "logits/rejected": NaN, "logps/chosen": -157.21719360351562, "logps/rejected": -361.79998779296875, "loss": 0.1022, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7171295285224915, "rewards/margins": 9.939062118530273, "rewards/rejected": -10.656641006469727, "step": 2620 }, { "epoch": 0.6630321747077175, "grad_norm": 21.615503311157227, "learning_rate": 4.780189364851726e-07, "logits/chosen": -1.057957410812378, "logits/rejected": NaN, "logps/chosen": -170.77188110351562, "logps/rejected": -376.9437561035156, "loss": 0.0782, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.5716431140899658, "rewards/margins": 9.749608993530273, "rewards/rejected": -11.3203125, "step": 2630 }, { "epoch": 0.6655532095925377, "grad_norm": 50.3950080871582, "learning_rate": 4.777172174951216e-07, "logits/chosen": -1.071173071861267, "logits/rejected": NaN, "logps/chosen": -185.4343719482422, "logps/rejected": -399.8500061035156, "loss": 0.0867, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.7712645530700684, "rewards/margins": 9.438281059265137, "rewards/rejected": -12.207422256469727, "step": 2640 }, { "epoch": 0.668074244477358, "grad_norm": 47.26682662963867, "learning_rate": 4.77413538403497e-07, "logits/chosen": -1.1140930652618408, "logits/rejected": -1.218505859375, "logps/chosen": -169.8562469482422, "logps/rejected": -383.29998779296875, "loss": 0.0753, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.972564697265625, "rewards/margins": 10.592187881469727, "rewards/rejected": -11.564062118530273, "step": 2650 }, { "epoch": 0.6705952793621782, "grad_norm": 56.71453094482422, "learning_rate": 4.771079018242509e-07, "logits/chosen": -1.1331908702850342, "logits/rejected": -1.1501038074493408, "logps/chosen": -190.0187530517578, "logps/rejected": -390.82501220703125, "loss": 0.1518, "rewards/accuracies": 0.96875, "rewards/chosen": -0.85272216796875, "rewards/margins": 10.324170112609863, "rewards/rejected": -11.169336318969727, "step": 2660 }, { "epoch": 0.6731163142469984, "grad_norm": 71.81742095947266, "learning_rate": 4.7680031038818445e-07, "logits/chosen": -0.991625964641571, "logits/rejected": -1.1465942859649658, "logps/chosen": -183.671875, "logps/rejected": -374.36248779296875, "loss": 0.1293, "rewards/accuracies": 0.9375, "rewards/chosen": -1.596527099609375, "rewards/margins": 9.287500381469727, "rewards/rejected": -10.880468368530273, "step": 2670 }, { "epoch": 0.6756373491318186, "grad_norm": 38.87578201293945, "learning_rate": 4.7649076674292564e-07, "logits/chosen": -1.1806762218475342, "logits/rejected": NaN, "logps/chosen": -162.6867218017578, "logps/rejected": -349.85626220703125, "loss": 0.0965, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.395089715719223, "rewards/margins": 9.583593368530273, "rewards/rejected": -9.978906631469727, "step": 2680 }, { "epoch": 0.6781583840166389, "grad_norm": 48.65221405029297, "learning_rate": 4.761792735529061e-07, "logits/chosen": -1.1571838855743408, "logits/rejected": NaN, "logps/chosen": -151.14608764648438, "logps/rejected": -352.5562438964844, "loss": 0.1264, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.22144469618797302, "rewards/margins": 8.943750381469727, "rewards/rejected": -8.72265625, "step": 2690 }, { "epoch": 0.680679418901459, "grad_norm": 51.25635528564453, "learning_rate": 4.7586583349933864e-07, "logits/chosen": -1.1124694347381592, "logits/rejected": -1.23919677734375, "logps/chosen": -184.7859344482422, "logps/rejected": -365.14373779296875, "loss": 0.1434, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.46543580293655396, "rewards/margins": 8.931836128234863, "rewards/rejected": -9.398046493530273, "step": 2700 }, { "epoch": 0.6832004537862792, "grad_norm": 72.31169128417969, "learning_rate": 4.755504492801937e-07, "logits/chosen": -1.158532738685608, "logits/rejected": NaN, "logps/chosen": -159.33749389648438, "logps/rejected": -343.5874938964844, "loss": 0.1412, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.44873046875, "rewards/margins": 9.490625381469727, "rewards/rejected": -9.944921493530273, "step": 2710 }, { "epoch": 0.6857214886710995, "grad_norm": 52.035865783691406, "learning_rate": 4.7523312361017654e-07, "logits/chosen": -1.16119384765625, "logits/rejected": NaN, "logps/chosen": -163.90469360351562, "logps/rejected": -340.54376220703125, "loss": 0.116, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.513012707233429, "rewards/margins": 9.056055068969727, "rewards/rejected": -9.575780868530273, "step": 2720 }, { "epoch": 0.6882425235559197, "grad_norm": 44.41587448120117, "learning_rate": 4.7491385922070347e-07, "logits/chosen": NaN, "logits/rejected": -1.2440185546875, "logps/chosen": -172.27108764648438, "logps/rejected": -378.3500061035156, "loss": 0.0982, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.0006347656017169356, "rewards/margins": 9.794921875, "rewards/rejected": -9.801172256469727, "step": 2730 }, { "epoch": 0.6907635584407399, "grad_norm": 22.4324893951416, "learning_rate": 4.7459265885987865e-07, "logits/chosen": -1.129461646080017, "logits/rejected": NaN, "logps/chosen": -170.5593719482422, "logps/rejected": -347.0874938964844, "loss": 0.1668, "rewards/accuracies": 0.9375, "rewards/chosen": 0.0157470703125, "rewards/margins": 9.361865043640137, "rewards/rejected": -9.344922065734863, "step": 2740 }, { "epoch": 0.6932845933255601, "grad_norm": 32.34984588623047, "learning_rate": 4.7426952529247047e-07, "logits/chosen": -1.243994116783142, "logits/rejected": NaN, "logps/chosen": -167.28750610351562, "logps/rejected": -351.86248779296875, "loss": 0.1175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.2801147401332855, "rewards/margins": 8.803125381469727, "rewards/rejected": -9.076952934265137, "step": 2750 }, { "epoch": 0.6958056282103804, "grad_norm": 51.00815963745117, "learning_rate": 4.739444612998872e-07, "logits/chosen": -1.2051270008087158, "logits/rejected": NaN, "logps/chosen": -159.3718719482422, "logps/rejected": -324.45001220703125, "loss": 0.0817, "rewards/accuracies": 0.96875, "rewards/chosen": -0.02908935584127903, "rewards/margins": 8.913281440734863, "rewards/rejected": -8.948827743530273, "step": 2760 }, { "epoch": 0.6983266630952005, "grad_norm": 13.842464447021484, "learning_rate": 4.7361746968015396e-07, "logits/chosen": -1.2155272960662842, "logits/rejected": -1.3493835926055908, "logps/chosen": -187.0968780517578, "logps/rejected": -379.5375061035156, "loss": 0.1133, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.25117796659469604, "rewards/margins": 9.676366806030273, "rewards/rejected": -9.926562309265137, "step": 2770 }, { "epoch": 0.7008476979800208, "grad_norm": 17.352542877197266, "learning_rate": 4.732885532478879e-07, "logits/chosen": -1.16937255859375, "logits/rejected": NaN, "logps/chosen": -171.9968719482422, "logps/rejected": -335.32501220703125, "loss": 0.1669, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.3734374940395355, "rewards/margins": 9.385937690734863, "rewards/rejected": -9.762109756469727, "step": 2780 }, { "epoch": 0.703368732864841, "grad_norm": 31.495655059814453, "learning_rate": 4.729577148342742e-07, "logits/chosen": -1.220849633216858, "logits/rejected": -1.313940405845642, "logps/chosen": -173.6218719482422, "logps/rejected": -362.17498779296875, "loss": 0.1305, "rewards/accuracies": 0.9375, "rewards/chosen": -0.608734130859375, "rewards/margins": 9.650976181030273, "rewards/rejected": -10.257031440734863, "step": 2790 }, { "epoch": 0.7058897677496613, "grad_norm": 23.545095443725586, "learning_rate": 4.7262495728704156e-07, "logits/chosen": -1.121942162513733, "logits/rejected": NaN, "logps/chosen": -157.9796905517578, "logps/rejected": -366.8500061035156, "loss": 0.1128, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.0003540039178915322, "rewards/margins": 9.650976181030273, "rewards/rejected": -9.648046493530273, "step": 2800 }, { "epoch": 0.7084108026344814, "grad_norm": 5.404031276702881, "learning_rate": 4.7229028347043826e-07, "logits/chosen": -1.132849097251892, "logits/rejected": -1.2069213390350342, "logps/chosen": -162.09335327148438, "logps/rejected": -357.2250061035156, "loss": 0.0878, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.035980224609375, "rewards/margins": 10.238672256469727, "rewards/rejected": -10.275781631469727, "step": 2810 }, { "epoch": 0.7109318375193017, "grad_norm": 22.959991455078125, "learning_rate": 4.719536962652067e-07, "logits/chosen": -1.0430206060409546, "logits/rejected": NaN, "logps/chosen": -156.03125, "logps/rejected": -371.71875, "loss": 0.1369, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.4612487852573395, "rewards/margins": 9.723828315734863, "rewards/rejected": -10.183202743530273, "step": 2820 }, { "epoch": 0.7134528724041219, "grad_norm": 33.791542053222656, "learning_rate": 4.7161519856855915e-07, "logits/chosen": -1.0109374523162842, "logits/rejected": NaN, "logps/chosen": -178.4406280517578, "logps/rejected": -349.625, "loss": 0.1599, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8753296136856079, "rewards/margins": 9.182812690734863, "rewards/rejected": -10.061718940734863, "step": 2830 }, { "epoch": 0.7159739072889421, "grad_norm": 3.4620347023010254, "learning_rate": 4.7127479329415266e-07, "logits/chosen": -1.0955718755722046, "logits/rejected": NaN, "logps/chosen": -180.7062530517578, "logps/rejected": -361.0625, "loss": 0.067, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8418213129043579, "rewards/margins": 9.619140625, "rewards/rejected": -10.460546493530273, "step": 2840 }, { "epoch": 0.7184949421737623, "grad_norm": 6.781332492828369, "learning_rate": 4.709324833720639e-07, "logits/chosen": -1.1355469226837158, "logits/rejected": -1.2831542491912842, "logps/chosen": -188.8874969482422, "logps/rejected": -356.82501220703125, "loss": 0.1057, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4379119873046875, "rewards/margins": 9.899999618530273, "rewards/rejected": -10.337499618530273, "step": 2850 }, { "epoch": 0.7210159770585826, "grad_norm": 7.445805072784424, "learning_rate": 4.7058827174876406e-07, "logits/chosen": -1.1583251953125, "logits/rejected": NaN, "logps/chosen": -145.6328125, "logps/rejected": -336.6875, "loss": 0.0898, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.38640135526657104, "rewards/margins": 9.4990234375, "rewards/rejected": -9.115234375, "step": 2860 }, { "epoch": 0.7235370119434028, "grad_norm": 42.1134147644043, "learning_rate": 4.7024216138709333e-07, "logits/chosen": -1.1540206670761108, "logits/rejected": NaN, "logps/chosen": -149.43124389648438, "logps/rejected": -343.78125, "loss": 0.0569, "rewards/accuracies": 0.971875011920929, "rewards/chosen": 0.556549072265625, "rewards/margins": 9.412500381469727, "rewards/rejected": -8.856836318969727, "step": 2870 }, { "epoch": 0.726058046828223, "grad_norm": 71.23540496826172, "learning_rate": 4.6989415526623566e-07, "logits/chosen": NaN, "logits/rejected": -1.201635718345642, "logps/chosen": -156.3640594482422, "logps/rejected": -363.1187438964844, "loss": 0.1553, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.35232239961624146, "rewards/margins": 9.641992568969727, "rewards/rejected": -9.991015434265137, "step": 2880 }, { "epoch": 0.7285790817130432, "grad_norm": 52.25419998168945, "learning_rate": 4.69544256381693e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -159.27188110351562, "logps/rejected": -351.73748779296875, "loss": 0.197, "rewards/accuracies": 0.953125, "rewards/chosen": -0.98419189453125, "rewards/margins": 9.7109375, "rewards/rejected": -10.694140434265137, "step": 2890 }, { "epoch": 0.7311001165978634, "grad_norm": 2.8718042373657227, "learning_rate": 4.691924677452592e-07, "logits/chosen": -1.071380615234375, "logits/rejected": NaN, "logps/chosen": -187.515625, "logps/rejected": -389.4624938964844, "loss": 0.0964, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.4369170665740967, "rewards/margins": 10.377148628234863, "rewards/rejected": -11.81640625, "step": 2900 }, { "epoch": 0.7336211514826836, "grad_norm": 68.36690521240234, "learning_rate": 4.688387923849947e-07, "logits/chosen": -0.9525512456893921, "logits/rejected": NaN, "logps/chosen": -191.8249969482422, "logps/rejected": -377.2749938964844, "loss": 0.1597, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.3919799327850342, "rewards/margins": 9.7705078125, "rewards/rejected": -11.165234565734863, "step": 2910 }, { "epoch": 0.7361421863675038, "grad_norm": 27.340354919433594, "learning_rate": 4.684832333451998e-07, "logits/chosen": -1.121392846107483, "logits/rejected": -1.1581299304962158, "logps/chosen": -170.3546905517578, "logps/rejected": -380.6499938964844, "loss": 0.1052, "rewards/accuracies": 0.953125, "rewards/chosen": -0.31104737520217896, "rewards/margins": 9.759765625, "rewards/rejected": -10.067968368530273, "step": 2920 }, { "epoch": 0.7386632212523241, "grad_norm": 15.053115844726562, "learning_rate": 4.68125793686389e-07, "logits/chosen": -1.20819091796875, "logits/rejected": NaN, "logps/chosen": -175.14688110351562, "logps/rejected": -354.57501220703125, "loss": 0.0894, "rewards/accuracies": 0.940625011920929, "rewards/chosen": 0.5164123773574829, "rewards/margins": 9.205273628234863, "rewards/rejected": -8.690820693969727, "step": 2930 }, { "epoch": 0.7411842561371443, "grad_norm": 62.93861770629883, "learning_rate": 4.677664764852644e-07, "logits/chosen": -1.1517455577850342, "logits/rejected": NaN, "logps/chosen": -157.52499389648438, "logps/rejected": -341.0687561035156, "loss": 0.1371, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.46763914823532104, "rewards/margins": 9.241796493530273, "rewards/rejected": -8.776562690734863, "step": 2940 }, { "epoch": 0.7437052910219645, "grad_norm": 47.2610969543457, "learning_rate": 4.6740528483468926e-07, "logits/chosen": -1.175531029701233, "logits/rejected": NaN, "logps/chosen": -166.87344360351562, "logps/rejected": -350.66876220703125, "loss": 0.1358, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.17811278998851776, "rewards/margins": 9.807519912719727, "rewards/rejected": -9.984375, "step": 2950 }, { "epoch": 0.7462263259067847, "grad_norm": 69.66057586669922, "learning_rate": 4.670422218436613e-07, "logits/chosen": -1.125695824623108, "logits/rejected": NaN, "logps/chosen": -171.0812530517578, "logps/rejected": -354.7124938964844, "loss": 0.1807, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1480529308319092, "rewards/margins": 9.001367568969727, "rewards/rejected": -10.150781631469727, "step": 2960 }, { "epoch": 0.748747360791605, "grad_norm": 6.245145797729492, "learning_rate": 4.6667729063728616e-07, "logits/chosen": -1.1269729137420654, "logits/rejected": -1.1989867687225342, "logps/chosen": -164.29531860351562, "logps/rejected": -356.9375, "loss": 0.1546, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.778350830078125, "rewards/margins": 9.005468368530273, "rewards/rejected": -9.783594131469727, "step": 2970 }, { "epoch": 0.7512683956764252, "grad_norm": 21.513164520263672, "learning_rate": 4.663104943567502e-07, "logits/chosen": -1.191247582435608, "logits/rejected": NaN, "logps/chosen": -146.9609375, "logps/rejected": -344.3500061035156, "loss": 0.1262, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4271225035190582, "rewards/margins": 8.695703506469727, "rewards/rejected": -9.118359565734863, "step": 2980 }, { "epoch": 0.7537894305612454, "grad_norm": 40.27738571166992, "learning_rate": 4.659418361592936e-07, "logits/chosen": -1.1365966796875, "logits/rejected": -1.183630347251892, "logps/chosen": -169.03750610351562, "logps/rejected": -357.1937561035156, "loss": 0.2436, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.9814056158065796, "rewards/margins": 8.381444931030273, "rewards/rejected": -9.364453315734863, "step": 2990 }, { "epoch": 0.7563104654460656, "grad_norm": 31.710542678833008, "learning_rate": 4.655713192181835e-07, "logits/chosen": -1.2083740234375, "logits/rejected": -1.2632567882537842, "logps/chosen": -184.2312469482422, "logps/rejected": -351.7124938964844, "loss": 0.1309, "rewards/accuracies": 0.953125, "rewards/chosen": -1.0959599018096924, "rewards/margins": 8.731640815734863, "rewards/rejected": -9.824999809265137, "step": 3000 }, { "epoch": 0.7588315003308859, "grad_norm": 17.581069946289062, "learning_rate": 4.651989467226859e-07, "logits/chosen": -1.179663062095642, "logits/rejected": NaN, "logps/chosen": -169.50155639648438, "logps/rejected": -348.96875, "loss": 0.0613, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.8971923589706421, "rewards/margins": 8.7041015625, "rewards/rejected": -9.602343559265137, "step": 3010 }, { "epoch": 0.761352535215706, "grad_norm": 39.081539154052734, "learning_rate": 4.648247218780391e-07, "logits/chosen": -1.236364722251892, "logits/rejected": NaN, "logps/chosen": -161.05313110351562, "logps/rejected": -336.8125, "loss": 0.1619, "rewards/accuracies": 0.953125, "rewards/chosen": -0.4837280213832855, "rewards/margins": 8.605273246765137, "rewards/rejected": -9.090624809265137, "step": 3020 }, { "epoch": 0.7638735701005263, "grad_norm": 56.11845779418945, "learning_rate": 4.644486479054256e-07, "logits/chosen": -1.264440894126892, "logits/rejected": NaN, "logps/chosen": -176.5703125, "logps/rejected": -371.625, "loss": 0.0628, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.0862579345703125, "rewards/margins": 9.984766006469727, "rewards/rejected": -9.901952743530273, "step": 3030 }, { "epoch": 0.7663946049853465, "grad_norm": 36.80084228515625, "learning_rate": 4.640707280419444e-07, "logits/chosen": -1.239538550376892, "logits/rejected": -1.3049499988555908, "logps/chosen": -159.71719360351562, "logps/rejected": -346.67498779296875, "loss": 0.1664, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.06094970554113388, "rewards/margins": 9.89990234375, "rewards/rejected": -9.962890625, "step": 3040 }, { "epoch": 0.7689156398701668, "grad_norm": 72.6322021484375, "learning_rate": 4.636909655405832e-07, "logits/chosen": -1.217041015625, "logits/rejected": NaN, "logps/chosen": -161.29843139648438, "logps/rejected": -360.875, "loss": 0.1528, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.4689086973667145, "rewards/margins": 9.300585746765137, "rewards/rejected": -9.774999618530273, "step": 3050 }, { "epoch": 0.7714366747549869, "grad_norm": 29.712482452392578, "learning_rate": 4.633093636701904e-07, "logits/chosen": -1.104516625404358, "logits/rejected": NaN, "logps/chosen": -150.10311889648438, "logps/rejected": -338.6625061035156, "loss": 0.1024, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.014376831240952015, "rewards/margins": 9.6357421875, "rewards/rejected": -9.623437881469727, "step": 3060 }, { "epoch": 0.7739577096398071, "grad_norm": 18.42807960510254, "learning_rate": 4.629259257154472e-07, "logits/chosen": -1.1743590831756592, "logits/rejected": -1.247314453125, "logps/chosen": -146.0390625, "logps/rejected": -328.42498779296875, "loss": 0.1133, "rewards/accuracies": 0.953125, "rewards/chosen": 0.5333892703056335, "rewards/margins": 8.572851181030273, "rewards/rejected": -8.038671493530273, "step": 3070 }, { "epoch": 0.7764787445246274, "grad_norm": 20.868642807006836, "learning_rate": 4.625406549768389e-07, "logits/chosen": -1.202490210533142, "logits/rejected": NaN, "logps/chosen": -161.4367218017578, "logps/rejected": -361.6312561035156, "loss": 0.0678, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.6272643804550171, "rewards/margins": 8.673437118530273, "rewards/rejected": -8.042187690734863, "step": 3080 }, { "epoch": 0.7789997794094475, "grad_norm": 20.915069580078125, "learning_rate": 4.621535547706267e-07, "logits/chosen": -1.117218017578125, "logits/rejected": NaN, "logps/chosen": -161.4499969482422, "logps/rejected": -341.4125061035156, "loss": 0.0842, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.13579407334327698, "rewards/margins": 9.614209175109863, "rewards/rejected": -9.746874809265137, "step": 3090 }, { "epoch": 0.7815208142942678, "grad_norm": 60.12716293334961, "learning_rate": 4.6176462842881914e-07, "logits/chosen": -1.1165893077850342, "logits/rejected": -1.1640090942382812, "logps/chosen": -185.72500610351562, "logps/rejected": -406.9375, "loss": 0.1164, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.896105945110321, "rewards/margins": 10.113866806030273, "rewards/rejected": -11.014843940734863, "step": 3100 }, { "epoch": 0.784041849179088, "grad_norm": 74.80657958984375, "learning_rate": 4.6137387929914355e-07, "logits/chosen": -1.090917944908142, "logits/rejected": -1.1743347644805908, "logps/chosen": -172.15625, "logps/rejected": -378.9125061035156, "loss": 0.081, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.7052001953125, "rewards/margins": 10.761327743530273, "rewards/rejected": -11.47265625, "step": 3110 }, { "epoch": 0.7865628840639083, "grad_norm": 18.4414119720459, "learning_rate": 4.60981310745017e-07, "logits/chosen": -1.14752197265625, "logits/rejected": -1.2168090343475342, "logps/chosen": -173.859375, "logps/rejected": -371.45001220703125, "loss": 0.1095, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.21656493842601776, "rewards/margins": 10.518359184265137, "rewards/rejected": -10.736719131469727, "step": 3120 }, { "epoch": 0.7890839189487284, "grad_norm": 27.281539916992188, "learning_rate": 4.6058692614551755e-07, "logits/chosen": -1.1750977039337158, "logits/rejected": NaN, "logps/chosen": -181.0281219482422, "logps/rejected": -371.79998779296875, "loss": 0.1107, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.21417236328125, "rewards/margins": 10.007421493530273, "rewards/rejected": -9.796875, "step": 3130 }, { "epoch": 0.7916049538335487, "grad_norm": 24.56909942626953, "learning_rate": 4.6019072889535495e-07, "logits/chosen": -1.124420166015625, "logits/rejected": NaN, "logps/chosen": -174.48281860351562, "logps/rejected": -380.0874938964844, "loss": 0.1132, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.13621826469898224, "rewards/margins": 9.794336318969727, "rewards/rejected": -9.651171684265137, "step": 3140 }, { "epoch": 0.7941259887183689, "grad_norm": 16.69878578186035, "learning_rate": 4.5979272240484156e-07, "logits/chosen": -1.1886475086212158, "logits/rejected": NaN, "logps/chosen": -161.4328155517578, "logps/rejected": -348.2124938964844, "loss": 0.0862, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": 0.19298401474952698, "rewards/margins": 9.667187690734863, "rewards/rejected": -9.474218368530273, "step": 3150 }, { "epoch": 0.7966470236031891, "grad_norm": 21.253299713134766, "learning_rate": 4.593929100998632e-07, "logits/chosen": -1.153540015220642, "logits/rejected": -1.2482421398162842, "logps/chosen": -151.5078125, "logps/rejected": -357.98748779296875, "loss": 0.1358, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": 0.02947387658059597, "rewards/margins": 9.797460556030273, "rewards/rejected": -9.768359184265137, "step": 3160 }, { "epoch": 0.7991680584880093, "grad_norm": 10.994426727294922, "learning_rate": 4.5899129542184914e-07, "logits/chosen": -1.157568335533142, "logits/rejected": -1.2133300304412842, "logps/chosen": -168.328125, "logps/rejected": -386.2124938964844, "loss": 0.1047, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4851318299770355, "rewards/margins": 10.282910346984863, "rewards/rejected": -10.766406059265137, "step": 3170 }, { "epoch": 0.8016890933728296, "grad_norm": 24.658891677856445, "learning_rate": 4.5858788182774296e-07, "logits/chosen": -1.1103484630584717, "logits/rejected": NaN, "logps/chosen": -160.93905639648438, "logps/rejected": -352.0874938964844, "loss": 0.1165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.605480968952179, "rewards/margins": 9.900781631469727, "rewards/rejected": -10.507031440734863, "step": 3180 }, { "epoch": 0.8042101282576498, "grad_norm": 29.145156860351562, "learning_rate": 4.581826727899725e-07, "logits/chosen": -1.167394995689392, "logits/rejected": NaN, "logps/chosen": -178.27188110351562, "logps/rejected": -368.54998779296875, "loss": 0.1349, "rewards/accuracies": 0.96875, "rewards/chosen": -0.47876280546188354, "rewards/margins": 10.230859756469727, "rewards/rejected": -10.707422256469727, "step": 3190 }, { "epoch": 0.80673116314247, "grad_norm": 13.656593322753906, "learning_rate": 4.577756717964203e-07, "logits/chosen": -1.006250023841858, "logits/rejected": -1.12603759765625, "logps/chosen": -166.57656860351562, "logps/rejected": -389.29376220703125, "loss": 0.1086, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.8930877447128296, "rewards/margins": 10.187891006469727, "rewards/rejected": -11.080469131469727, "step": 3200 }, { "epoch": 0.8092521980272902, "grad_norm": 50.37730407714844, "learning_rate": 4.57366882350393e-07, "logits/chosen": -1.0048186779022217, "logits/rejected": -1.1439087390899658, "logps/chosen": -161.296875, "logps/rejected": -374.5625, "loss": 0.0891, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -1.18218994140625, "rewards/margins": 10.554296493530273, "rewards/rejected": -11.733983993530273, "step": 3210 }, { "epoch": 0.8117732329121105, "grad_norm": 1.671088695526123, "learning_rate": 4.569563079705919e-07, "logits/chosen": -1.100805640220642, "logits/rejected": NaN, "logps/chosen": -197.8406219482422, "logps/rejected": -397.42498779296875, "loss": 0.0877, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.915014624595642, "rewards/margins": 10.767187118530273, "rewards/rejected": -12.688281059265137, "step": 3220 }, { "epoch": 0.8142942677969306, "grad_norm": 7.9098591804504395, "learning_rate": 4.5654395219108224e-07, "logits/chosen": -1.17889404296875, "logits/rejected": NaN, "logps/chosen": -202.1374969482422, "logps/rejected": -391.45001220703125, "loss": 0.2048, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.300567626953125, "rewards/margins": 11.093358993530273, "rewards/rejected": -12.394922256469727, "step": 3230 }, { "epoch": 0.8168153026817508, "grad_norm": 18.829072952270508, "learning_rate": 4.5612981856126264e-07, "logits/chosen": -1.1366088390350342, "logits/rejected": -1.1718413829803467, "logps/chosen": -166.68905639648438, "logps/rejected": -375.3125, "loss": 0.1494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.40760499238967896, "rewards/margins": 10.385156631469727, "rewards/rejected": -9.974609375, "step": 3240 }, { "epoch": 0.8193363375665711, "grad_norm": 14.356622695922852, "learning_rate": 4.55713910645835e-07, "logits/chosen": -1.2176024913787842, "logits/rejected": NaN, "logps/chosen": -152.7421875, "logps/rejected": -359.7749938964844, "loss": 0.0474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.800280749797821, "rewards/margins": 10.255273818969727, "rewards/rejected": -9.4541015625, "step": 3250 }, { "epoch": 0.8218573724513913, "grad_norm": 60.6351432800293, "learning_rate": 4.552962320247734e-07, "logits/chosen": -1.2867310047149658, "logits/rejected": NaN, "logps/chosen": -188.109375, "logps/rejected": -395.7250061035156, "loss": 0.0696, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.3631179928779602, "rewards/margins": 11.067968368530273, "rewards/rejected": -11.428125381469727, "step": 3260 }, { "epoch": 0.8243784073362115, "grad_norm": 43.393280029296875, "learning_rate": 4.5487678629329373e-07, "logits/chosen": -1.0770645141601562, "logits/rejected": NaN, "logps/chosen": -186.3359375, "logps/rejected": -392.8999938964844, "loss": 0.1648, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.514013648033142, "rewards/margins": 10.564745903015137, "rewards/rejected": -12.077343940734863, "step": 3270 }, { "epoch": 0.8268994422210317, "grad_norm": 61.13141632080078, "learning_rate": 4.544555770618222e-07, "logits/chosen": -1.079901099205017, "logits/rejected": NaN, "logps/chosen": -179.515625, "logps/rejected": -388.9750061035156, "loss": 0.1175, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.5921752452850342, "rewards/margins": 9.610937118530273, "rewards/rejected": -11.201562881469727, "step": 3280 }, { "epoch": 0.829420477105852, "grad_norm": 11.599440574645996, "learning_rate": 4.540326079559647e-07, "logits/chosen": -1.16888427734375, "logits/rejected": -1.2025146484375, "logps/chosen": -176.3874969482422, "logps/rejected": -351.0249938964844, "loss": 0.054, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6295257806777954, "rewards/margins": 10.145703315734863, "rewards/rejected": -10.77734375, "step": 3290 }, { "epoch": 0.8319415119906721, "grad_norm": 18.263927459716797, "learning_rate": 4.5360788261647544e-07, "logits/chosen": -1.157812476158142, "logits/rejected": NaN, "logps/chosen": -165.07656860351562, "logps/rejected": -360.26873779296875, "loss": 0.1195, "rewards/accuracies": 0.953125, "rewards/chosen": -0.26597291231155396, "rewards/margins": 10.313672065734863, "rewards/rejected": -10.580078125, "step": 3300 }, { "epoch": 0.8344625468754924, "grad_norm": 59.108219146728516, "learning_rate": 4.531814046992255e-07, "logits/chosen": -1.0770142078399658, "logits/rejected": NaN, "logps/chosen": -164.35311889648438, "logps/rejected": -361.4624938964844, "loss": 0.0813, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.0789794921875, "rewards/margins": 10.3466796875, "rewards/rejected": -10.425000190734863, "step": 3310 }, { "epoch": 0.8369835817603126, "grad_norm": 6.978919506072998, "learning_rate": 4.5275317787517166e-07, "logits/chosen": -1.1707274913787842, "logits/rejected": -1.234521508216858, "logps/chosen": -190.2062530517578, "logps/rejected": -386.8999938964844, "loss": 0.1263, "rewards/accuracies": 0.953125, "rewards/chosen": -0.91717529296875, "rewards/margins": 10.071093559265137, "rewards/rejected": -10.992578506469727, "step": 3320 }, { "epoch": 0.8395046166451329, "grad_norm": 83.6766357421875, "learning_rate": 4.5232320583032437e-07, "logits/chosen": -1.032983422279358, "logits/rejected": NaN, "logps/chosen": -155.0124969482422, "logps/rejected": -361.7749938964844, "loss": 0.1079, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.7118499875068665, "rewards/margins": 10.724218368530273, "rewards/rejected": -11.439062118530273, "step": 3330 }, { "epoch": 0.842025651529953, "grad_norm": 25.58526039123535, "learning_rate": 4.518914922657164e-07, "logits/chosen": -1.046624779701233, "logits/rejected": NaN, "logps/chosen": -171.0656280517578, "logps/rejected": -384.36248779296875, "loss": 0.0967, "rewards/accuracies": 0.953125, "rewards/chosen": -0.7386840581893921, "rewards/margins": 11.245702743530273, "rewards/rejected": -11.982421875, "step": 3340 }, { "epoch": 0.8445466864147733, "grad_norm": 1.7357717752456665, "learning_rate": 4.5145804089737093e-07, "logits/chosen": -1.092797875404358, "logits/rejected": -1.1073486804962158, "logps/chosen": -164.49063110351562, "logps/rejected": -357.9750061035156, "loss": 0.1253, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.1672484874725342, "rewards/margins": 10.969531059265137, "rewards/rejected": -12.138280868530273, "step": 3350 }, { "epoch": 0.8470677212995935, "grad_norm": 52.012176513671875, "learning_rate": 4.510228554562693e-07, "logits/chosen": -1.070459008216858, "logits/rejected": -1.0793335437774658, "logps/chosen": -168.1796875, "logps/rejected": -378.26251220703125, "loss": 0.1387, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.796429455280304, "rewards/margins": 10.983007431030273, "rewards/rejected": -11.788671493530273, "step": 3360 }, { "epoch": 0.8495887561844137, "grad_norm": 90.80325317382812, "learning_rate": 4.505859396883192e-07, "logits/chosen": -1.015832543373108, "logits/rejected": -1.1730422973632812, "logps/chosen": -160.1171875, "logps/rejected": -367.42498779296875, "loss": 0.1657, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -2.086726427078247, "rewards/margins": 10.274218559265137, "rewards/rejected": -12.361719131469727, "step": 3370 }, { "epoch": 0.8521097910692339, "grad_norm": 19.289592742919922, "learning_rate": 4.501472973543222e-07, "logits/chosen": -1.042724609375, "logits/rejected": -1.1186339855194092, "logps/chosen": -182.0656280517578, "logps/rejected": -391.625, "loss": 0.1119, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.770355224609375, "rewards/margins": 10.538866996765137, "rewards/rejected": -12.3125, "step": 3380 }, { "epoch": 0.8546308259540542, "grad_norm": 31.64702606201172, "learning_rate": 4.497069322299417e-07, "logits/chosen": -1.0700500011444092, "logits/rejected": NaN, "logps/chosen": -180.1437530517578, "logps/rejected": -380.0874938964844, "loss": 0.1437, "rewards/accuracies": 0.953125, "rewards/chosen": -1.7483642101287842, "rewards/margins": 10.579297065734863, "rewards/rejected": -12.327343940734863, "step": 3390 }, { "epoch": 0.8571518608388744, "grad_norm": 57.1508903503418, "learning_rate": 4.4926484810567e-07, "logits/chosen": -1.0974609851837158, "logits/rejected": -1.1549072265625, "logps/chosen": -198.85311889648438, "logps/rejected": -390.3500061035156, "loss": 0.1529, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.2325682640075684, "rewards/margins": 9.199804306030273, "rewards/rejected": -11.439844131469727, "step": 3400 }, { "epoch": 0.8596728957236945, "grad_norm": 56.43617630004883, "learning_rate": 4.4882104878679584e-07, "logits/chosen": -1.0511963367462158, "logits/rejected": NaN, "logps/chosen": -176.9031219482422, "logps/rejected": -321.5375061035156, "loss": 0.1493, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.3497650623321533, "rewards/margins": 8.378710746765137, "rewards/rejected": -10.730859756469727, "step": 3410 }, { "epoch": 0.8621939306085148, "grad_norm": 53.345829010009766, "learning_rate": 4.4837553809337194e-07, "logits/chosen": -0.9500274658203125, "logits/rejected": NaN, "logps/chosen": -171.5968780517578, "logps/rejected": -350.375, "loss": 0.1676, "rewards/accuracies": 0.9375, "rewards/chosen": -1.7271728515625, "rewards/margins": 8.792773246765137, "rewards/rejected": -10.523046493530273, "step": 3420 }, { "epoch": 0.864714965493335, "grad_norm": 56.3369026184082, "learning_rate": 4.479283198601816e-07, "logits/chosen": -1.1729247570037842, "logits/rejected": NaN, "logps/chosen": -166.54531860351562, "logps/rejected": -367.36248779296875, "loss": 0.1187, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.955920398235321, "rewards/margins": 9.428515434265137, "rewards/rejected": -10.387499809265137, "step": 3430 }, { "epoch": 0.8672360003781552, "grad_norm": 8.500638008117676, "learning_rate": 4.474793979367061e-07, "logits/chosen": -1.129125952720642, "logits/rejected": NaN, "logps/chosen": -147.55313110351562, "logps/rejected": -344.7749938964844, "loss": 0.1032, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.19537353515625, "rewards/margins": 9.437891006469727, "rewards/rejected": -9.62890625, "step": 3440 }, { "epoch": 0.8697570352629754, "grad_norm": 5.672114849090576, "learning_rate": 4.470287761870916e-07, "logits/chosen": -1.1424071788787842, "logits/rejected": -1.1978638172149658, "logps/chosen": -167.5437469482422, "logps/rejected": -380.98748779296875, "loss": 0.1352, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.3550170958042145, "rewards/margins": 9.89453125, "rewards/rejected": -10.25, "step": 3450 }, { "epoch": 0.8722780701477957, "grad_norm": 36.25192642211914, "learning_rate": 4.465764584901152e-07, "logits/chosen": -0.846148669719696, "logits/rejected": NaN, "logps/chosen": -160.9968719482422, "logps/rejected": -352.67498779296875, "loss": 0.1067, "rewards/accuracies": 0.953125, "rewards/chosen": -1.6075439453125, "rewards/margins": 9.757031440734863, "rewards/rejected": -11.362109184265137, "step": 3460 }, { "epoch": 0.8747991050326159, "grad_norm": 19.19021224975586, "learning_rate": 4.461224487391526e-07, "logits/chosen": -0.91790771484375, "logits/rejected": NaN, "logps/chosen": -182.3874969482422, "logps/rejected": -377.3374938964844, "loss": 0.1787, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.269311547279358, "rewards/margins": 10.130859375, "rewards/rejected": -11.399999618530273, "step": 3470 }, { "epoch": 0.8773201399174361, "grad_norm": 78.8823013305664, "learning_rate": 4.456667508421438e-07, "logits/chosen": -1.0114867687225342, "logits/rejected": NaN, "logps/chosen": -178.22500610351562, "logps/rejected": -369.11248779296875, "loss": 0.1339, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.1865723133087158, "rewards/margins": 8.898241996765137, "rewards/rejected": -10.0859375, "step": 3480 }, { "epoch": 0.8798411748022563, "grad_norm": 52.49982833862305, "learning_rate": 4.4520936872155967e-07, "logits/chosen": -1.0533874034881592, "logits/rejected": NaN, "logps/chosen": -168.6531219482422, "logps/rejected": -368.0249938964844, "loss": 0.1192, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.39655762910842896, "rewards/margins": 9.777539253234863, "rewards/rejected": -10.174609184265137, "step": 3490 }, { "epoch": 0.8823622096870766, "grad_norm": 44.04365539550781, "learning_rate": 4.447503063143683e-07, "logits/chosen": -1.0109374523162842, "logits/rejected": NaN, "logps/chosen": -160.43124389648438, "logps/rejected": -378.7749938964844, "loss": 0.1557, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.434326171875, "rewards/margins": 9.932421684265137, "rewards/rejected": -10.36328125, "step": 3500 }, { "epoch": 0.8848832445718967, "grad_norm": 32.24228286743164, "learning_rate": 4.4428956757200096e-07, "logits/chosen": -1.063073754310608, "logits/rejected": NaN, "logps/chosen": -152.60311889648438, "logps/rejected": -350.9750061035156, "loss": 0.0949, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.6404708623886108, "rewards/margins": 10.235742568969727, "rewards/rejected": -10.871484756469727, "step": 3510 }, { "epoch": 0.887404279456717, "grad_norm": 37.73710632324219, "learning_rate": 4.4382715646031834e-07, "logits/chosen": -1.019293189048767, "logits/rejected": NaN, "logps/chosen": -172.54061889648438, "logps/rejected": -346.04998779296875, "loss": 0.1407, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.70458984375, "rewards/margins": 9.946484565734863, "rewards/rejected": -10.65234375, "step": 3520 }, { "epoch": 0.8899253143415372, "grad_norm": 45.52389144897461, "learning_rate": 4.4336307695957605e-07, "logits/chosen": -0.9697204828262329, "logits/rejected": NaN, "logps/chosen": -177.44375610351562, "logps/rejected": -375.73748779296875, "loss": 0.1288, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.5701233148574829, "rewards/margins": 10.115234375, "rewards/rejected": -10.688281059265137, "step": 3530 }, { "epoch": 0.8924463492263575, "grad_norm": 6.707828521728516, "learning_rate": 4.428973330643906e-07, "logits/chosen": -1.0758178234100342, "logits/rejected": NaN, "logps/chosen": -160.09375, "logps/rejected": -368.5249938964844, "loss": 0.0817, "rewards/accuracies": 0.96875, "rewards/chosen": 0.175445556640625, "rewards/margins": 10.035741806030273, "rewards/rejected": -9.861328125, "step": 3540 }, { "epoch": 0.8949673841111776, "grad_norm": 1.2877650260925293, "learning_rate": 4.4242992878370493e-07, "logits/chosen": -1.0353209972381592, "logits/rejected": NaN, "logps/chosen": -167.8468780517578, "logps/rejected": -374.625, "loss": 0.0956, "rewards/accuracies": 0.971875011920929, "rewards/chosen": 0.07937316596508026, "rewards/margins": 10.598828315734863, "rewards/rejected": -10.520312309265137, "step": 3550 }, { "epoch": 0.8974884189959978, "grad_norm": 17.253376007080078, "learning_rate": 4.4196086814075405e-07, "logits/chosen": -1.0886108875274658, "logits/rejected": NaN, "logps/chosen": -168.5343780517578, "logps/rejected": -364.42498779296875, "loss": 0.1227, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.15085449814796448, "rewards/margins": 10.610937118530273, "rewards/rejected": -10.767187118530273, "step": 3560 }, { "epoch": 0.9000094538808181, "grad_norm": 55.92461395263672, "learning_rate": 4.4149015517303035e-07, "logits/chosen": -1.0092589855194092, "logits/rejected": -1.014520287513733, "logps/chosen": -163.49374389648438, "logps/rejected": -364.3374938964844, "loss": 0.1688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.0191650390625, "rewards/margins": 10.278905868530273, "rewards/rejected": -10.297656059265137, "step": 3570 }, { "epoch": 0.9025304887656382, "grad_norm": 21.99595069885254, "learning_rate": 4.410177939322484e-07, "logits/chosen": -1.0551025867462158, "logits/rejected": -1.1238524913787842, "logps/chosen": -167.94686889648438, "logps/rejected": -370.6000061035156, "loss": 0.0653, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.01094970665872097, "rewards/margins": 10.125, "rewards/rejected": -10.1328125, "step": 3580 }, { "epoch": 0.9050515236504585, "grad_norm": 45.42665100097656, "learning_rate": 4.4054378848431086e-07, "logits/chosen": -0.966204822063446, "logits/rejected": -1.0410645008087158, "logps/chosen": -169.67343139648438, "logps/rejected": -368.8374938964844, "loss": 0.0869, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.37445068359375, "rewards/margins": 9.761327743530273, "rewards/rejected": -10.136327743530273, "step": 3590 }, { "epoch": 0.9075725585352787, "grad_norm": 27.84415626525879, "learning_rate": 4.40068142909273e-07, "logits/chosen": -0.9756103754043579, "logits/rejected": NaN, "logps/chosen": -179.49063110351562, "logps/rejected": -363.125, "loss": 0.1207, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.0178344249725342, "rewards/margins": 9.869140625, "rewards/rejected": -10.892969131469727, "step": 3600 }, { "epoch": 0.910093593420099, "grad_norm": 0.7303586006164551, "learning_rate": 4.395908613013076e-07, "logits/chosen": -1.0614013671875, "logits/rejected": NaN, "logps/chosen": -188.234375, "logps/rejected": -392.17498779296875, "loss": 0.1835, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.937853991985321, "rewards/margins": 10.530858993530273, "rewards/rejected": -11.464062690734863, "step": 3610 }, { "epoch": 0.9126146283049191, "grad_norm": 15.696537971496582, "learning_rate": 4.391119477686698e-07, "logits/chosen": -0.817340075969696, "logits/rejected": -0.89813232421875, "logps/chosen": -162.4421844482422, "logps/rejected": -363.0375061035156, "loss": 0.1003, "rewards/accuracies": 0.953125, "rewards/chosen": -1.1145751476287842, "rewards/margins": 10.147851943969727, "rewards/rejected": -11.257031440734863, "step": 3620 }, { "epoch": 0.9151356631897394, "grad_norm": 51.999542236328125, "learning_rate": 4.386314064336617e-07, "logits/chosen": -0.8851287961006165, "logits/rejected": NaN, "logps/chosen": -163.875, "logps/rejected": -379.07501220703125, "loss": 0.1044, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.640362560749054, "rewards/margins": 10.839452743530273, "rewards/rejected": -11.481640815734863, "step": 3630 }, { "epoch": 0.9176566980745596, "grad_norm": 37.51946258544922, "learning_rate": 4.38149241432597e-07, "logits/chosen": -0.9081176519393921, "logits/rejected": NaN, "logps/chosen": -162.04843139648438, "logps/rejected": -374.0375061035156, "loss": 0.0684, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.06914062798023224, "rewards/margins": 11.979687690734863, "rewards/rejected": -11.913281440734863, "step": 3640 }, { "epoch": 0.9201777329593799, "grad_norm": 8.335555076599121, "learning_rate": 4.3766545691576507e-07, "logits/chosen": -1.010168433189392, "logits/rejected": NaN, "logps/chosen": -144.328125, "logps/rejected": -370.3999938964844, "loss": 0.0857, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.03264465183019638, "rewards/margins": 11.324999809265137, "rewards/rejected": -11.360156059265137, "step": 3650 }, { "epoch": 0.9226987678442, "grad_norm": 61.38607406616211, "learning_rate": 4.3718005704739567e-07, "logits/chosen": -0.865203857421875, "logits/rejected": -0.978558361530304, "logps/chosen": -164.9015655517578, "logps/rejected": -399.48748779296875, "loss": 0.0753, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.453576683998108, "rewards/margins": 11.546483993530273, "rewards/rejected": -12.998437881469727, "step": 3660 }, { "epoch": 0.9252198027290203, "grad_norm": 59.07481002807617, "learning_rate": 4.366930460056227e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -187.38125610351562, "logps/rejected": -421.1625061035156, "loss": 0.0981, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.081286668777466, "rewards/margins": 12.447265625, "rewards/rejected": -14.530468940734863, "step": 3670 }, { "epoch": 0.9277408376138405, "grad_norm": 46.77458953857422, "learning_rate": 4.362044279824487e-07, "logits/chosen": -0.83843994140625, "logits/rejected": NaN, "logps/chosen": -179.765625, "logps/rejected": -405.0, "loss": 0.1047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.4435181617736816, "rewards/margins": 11.6484375, "rewards/rejected": -14.088281631469727, "step": 3680 }, { "epoch": 0.9302618724986607, "grad_norm": 25.149253845214844, "learning_rate": 4.357142071837081e-07, "logits/chosen": -0.9413512945175171, "logits/rejected": NaN, "logps/chosen": -170.33438110351562, "logps/rejected": -378.0, "loss": 0.1366, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.4821914434432983, "rewards/margins": 11.664843559265137, "rewards/rejected": -13.148828506469727, "step": 3690 }, { "epoch": 0.9327829073834809, "grad_norm": 69.09249877929688, "learning_rate": 4.3522238782903157e-07, "logits/chosen": -1.020758032798767, "logits/rejected": NaN, "logps/chosen": -177.3796844482422, "logps/rejected": -390.88751220703125, "loss": 0.1083, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5137695074081421, "rewards/margins": 11.770703315734863, "rewards/rejected": -12.285937309265137, "step": 3700 }, { "epoch": 0.9353039422683012, "grad_norm": 27.99321746826172, "learning_rate": 4.347289741518097e-07, "logits/chosen": -0.9540923833847046, "logits/rejected": -0.979327380657196, "logps/chosen": -171.1750030517578, "logps/rejected": -385.29998779296875, "loss": 0.1387, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.07280273735523224, "rewards/margins": 11.5458984375, "rewards/rejected": -11.620702743530273, "step": 3710 }, { "epoch": 0.9378249771531214, "grad_norm": 10.560809135437012, "learning_rate": 4.342339703991561e-07, "logits/chosen": -0.9766448736190796, "logits/rejected": NaN, "logps/chosen": -170.9343719482422, "logps/rejected": -394.1499938964844, "loss": 0.1261, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.019500732421875, "rewards/margins": 11.651171684265137, "rewards/rejected": -11.679296493530273, "step": 3720 }, { "epoch": 0.9403460120379415, "grad_norm": 25.8317813873291, "learning_rate": 4.337373808318713e-07, "logits/chosen": -1.001983642578125, "logits/rejected": -1.052636742591858, "logps/chosen": -190.94375610351562, "logps/rejected": -389.6312561035156, "loss": 0.2092, "rewards/accuracies": 0.9375, "rewards/chosen": -1.182255506515503, "rewards/margins": 11.231640815734863, "rewards/rejected": -12.404687881469727, "step": 3730 }, { "epoch": 0.9428670469227618, "grad_norm": 51.11595153808594, "learning_rate": 4.33239209724406e-07, "logits/chosen": -0.950854480266571, "logits/rejected": -1.1151244640350342, "logps/chosen": -189.00155639648438, "logps/rejected": -398.3999938964844, "loss": 0.1389, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.658203125, "rewards/margins": 11.22265625, "rewards/rejected": -12.885937690734863, "step": 3740 }, { "epoch": 0.945388081807582, "grad_norm": 39.52537155151367, "learning_rate": 4.327394613648239e-07, "logits/chosen": -0.972918689250946, "logits/rejected": NaN, "logps/chosen": -181.52499389648438, "logps/rejected": -388.04998779296875, "loss": 0.0835, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.1360565423965454, "rewards/margins": 11.808984756469727, "rewards/rejected": -12.942187309265137, "step": 3750 }, { "epoch": 0.9479091166924022, "grad_norm": 32.618534088134766, "learning_rate": 4.322381400547653e-07, "logits/chosen": -1.044830322265625, "logits/rejected": NaN, "logps/chosen": -171.61874389648438, "logps/rejected": -378.3999938964844, "loss": 0.1047, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.069464087486267, "rewards/margins": 11.179296493530273, "rewards/rejected": -12.248827934265137, "step": 3760 }, { "epoch": 0.9504301515772224, "grad_norm": 40.17506790161133, "learning_rate": 4.317352501094099e-07, "logits/chosen": -0.998059093952179, "logits/rejected": -1.0156066417694092, "logps/chosen": -181.1218719482422, "logps/rejected": -372.11248779296875, "loss": 0.0722, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.6667999029159546, "rewards/margins": 11.063281059265137, "rewards/rejected": -11.721484184265137, "step": 3770 }, { "epoch": 0.9529511864620427, "grad_norm": 88.13125610351562, "learning_rate": 4.3123079585743933e-07, "logits/chosen": -0.976550281047821, "logits/rejected": -1.047338843345642, "logps/chosen": -163.4734344482422, "logps/rejected": -396.8125, "loss": 0.1799, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.3643127381801605, "rewards/margins": 11.5634765625, "rewards/rejected": -11.930468559265137, "step": 3780 }, { "epoch": 0.9554722213468629, "grad_norm": 23.75210189819336, "learning_rate": 4.3072478164100035e-07, "logits/chosen": -0.988391101360321, "logits/rejected": NaN, "logps/chosen": -151.97811889648438, "logps/rejected": -404.07501220703125, "loss": 0.1331, "rewards/accuracies": 0.965624988079071, "rewards/chosen": 0.025115966796875, "rewards/margins": 11.219141006469727, "rewards/rejected": -11.193554878234863, "step": 3790 }, { "epoch": 0.9579932562316831, "grad_norm": 35.96145248413086, "learning_rate": 4.3021721181566726e-07, "logits/chosen": -0.979156494140625, "logits/rejected": -1.0947449207305908, "logps/chosen": -180.1593780517578, "logps/rejected": -372.8812561035156, "loss": 0.1512, "rewards/accuracies": 0.9375, "rewards/chosen": -0.14527587592601776, "rewards/margins": 10.525195121765137, "rewards/rejected": -10.670702934265137, "step": 3800 }, { "epoch": 0.9605142911165033, "grad_norm": 36.0563850402832, "learning_rate": 4.297080907504046e-07, "logits/chosen": -1.0145171880722046, "logits/rejected": -1.113500952720642, "logps/chosen": -170.60311889648438, "logps/rejected": -355.76251220703125, "loss": 0.1121, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.5273987054824829, "rewards/margins": 10.003515243530273, "rewards/rejected": -10.528124809265137, "step": 3810 }, { "epoch": 0.9630353260013236, "grad_norm": 31.05841064453125, "learning_rate": 4.2919742282752914e-07, "logits/chosen": -1.0819823741912842, "logits/rejected": NaN, "logps/chosen": -150.140625, "logps/rejected": -365.5249938964844, "loss": 0.1339, "rewards/accuracies": 0.953125, "rewards/chosen": -0.01550903357565403, "rewards/margins": 9.757031440734863, "rewards/rejected": -9.772656440734863, "step": 3820 }, { "epoch": 0.9655563608861437, "grad_norm": 31.215999603271484, "learning_rate": 4.2868521244267234e-07, "logits/chosen": -0.997692883014679, "logits/rejected": -1.09552001953125, "logps/chosen": -150.625, "logps/rejected": -366.4312438964844, "loss": 0.1377, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11275939643383026, "rewards/margins": 9.579150199890137, "rewards/rejected": -9.694531440734863, "step": 3830 }, { "epoch": 0.968077395770964, "grad_norm": 50.384071350097656, "learning_rate": 4.2817146400474293e-07, "logits/chosen": -0.950024425983429, "logits/rejected": -1.085662841796875, "logps/chosen": -169.3156280517578, "logps/rejected": -377.58123779296875, "loss": 0.1113, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.4775634706020355, "rewards/margins": 10.173047065734863, "rewards/rejected": -10.648046493530273, "step": 3840 }, { "epoch": 0.9705984306557842, "grad_norm": 7.307712078094482, "learning_rate": 4.276561819358883e-07, "logits/chosen": -1.033624291419983, "logits/rejected": NaN, "logps/chosen": -163.9734344482422, "logps/rejected": -376.45001220703125, "loss": 0.1375, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.6813720464706421, "rewards/margins": 9.715234756469727, "rewards/rejected": -10.391698837280273, "step": 3850 }, { "epoch": 0.9731194655406045, "grad_norm": 34.68162536621094, "learning_rate": 4.271393706714569e-07, "logits/chosen": -1.031701683998108, "logits/rejected": -1.001196265220642, "logps/chosen": -183.74374389648438, "logps/rejected": -390.73748779296875, "loss": 0.0606, "rewards/accuracies": 0.96875, "rewards/chosen": -0.6088821291923523, "rewards/margins": 11.064062118530273, "rewards/rejected": -11.671483993530273, "step": 3860 }, { "epoch": 0.9756405004254246, "grad_norm": 38.80615997314453, "learning_rate": 4.266210346599597e-07, "logits/chosen": -0.9942687749862671, "logits/rejected": -1.03326416015625, "logps/chosen": -180.44686889648438, "logps/rejected": -362.7749938964844, "loss": 0.1379, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.431640625, "rewards/margins": 10.529492378234863, "rewards/rejected": -11.962109565734863, "step": 3870 }, { "epoch": 0.9781615353102449, "grad_norm": 46.40849685668945, "learning_rate": 4.261011783630325e-07, "logits/chosen": -0.9635864496231079, "logits/rejected": NaN, "logps/chosen": -180.83438110351562, "logps/rejected": -376.5, "loss": 0.108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.090423583984375, "rewards/margins": 10.874218940734863, "rewards/rejected": -11.969922065734863, "step": 3880 }, { "epoch": 0.9806825701950651, "grad_norm": 100.51557159423828, "learning_rate": 4.255798062553966e-07, "logits/chosen": -1.0066254138946533, "logits/rejected": NaN, "logps/chosen": -175.5124969482422, "logps/rejected": -377.7875061035156, "loss": 0.1421, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.9200103282928467, "rewards/margins": 10.061718940734863, "rewards/rejected": -11.982030868530273, "step": 3890 }, { "epoch": 0.9832036050798852, "grad_norm": 8.742952346801758, "learning_rate": 4.250569228248213e-07, "logits/chosen": -0.8941650390625, "logits/rejected": NaN, "logps/chosen": -194.7843780517578, "logps/rejected": -396.29376220703125, "loss": 0.0932, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.704858422279358, "rewards/margins": 10.353320121765137, "rewards/rejected": -12.060937881469727, "step": 3900 }, { "epoch": 0.9857246399647055, "grad_norm": 32.93906784057617, "learning_rate": 4.245325325720844e-07, "logits/chosen": -0.986480712890625, "logits/rejected": -1.003778100013733, "logps/chosen": -188.8796844482422, "logps/rejected": -394.9125061035156, "loss": 0.0921, "rewards/accuracies": 0.953125, "rewards/chosen": -0.9792038202285767, "rewards/margins": 10.959375381469727, "rewards/rejected": -11.942578315734863, "step": 3910 }, { "epoch": 0.9882456748495257, "grad_norm": 31.22580337524414, "learning_rate": 4.2400664001093407e-07, "logits/chosen": -0.8323608636856079, "logits/rejected": NaN, "logps/chosen": -152.0671844482422, "logps/rejected": -362.5375061035156, "loss": 0.0424, "rewards/accuracies": 0.984375, "rewards/chosen": -0.936352550983429, "rewards/margins": 10.887890815734863, "rewards/rejected": -11.824609756469727, "step": 3920 }, { "epoch": 0.990766709734346, "grad_norm": 12.507460594177246, "learning_rate": 4.234792496680497e-07, "logits/chosen": -0.909869372844696, "logits/rejected": NaN, "logps/chosen": -157.8328094482422, "logps/rejected": -387.63751220703125, "loss": 0.0818, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.38767242431640625, "rewards/margins": 12.4921875, "rewards/rejected": -12.878125190734863, "step": 3930 }, { "epoch": 0.9932877446191661, "grad_norm": 11.319442749023438, "learning_rate": 4.2295036608300305e-07, "logits/chosen": -0.8025146722793579, "logits/rejected": NaN, "logps/chosen": -154.6999969482422, "logps/rejected": -380.375, "loss": 0.1872, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12092514336109161, "rewards/margins": 11.358007431030273, "rewards/rejected": -11.235547065734863, "step": 3940 }, { "epoch": 0.9958087795039864, "grad_norm": 5.107358932495117, "learning_rate": 4.224199938082191e-07, "logits/chosen": -0.980236828327179, "logits/rejected": NaN, "logps/chosen": -160.15625, "logps/rejected": -373.5625, "loss": 0.1371, "rewards/accuracies": 0.953125, "rewards/chosen": -0.09317626804113388, "rewards/margins": 11.287890434265137, "rewards/rejected": -11.389843940734863, "step": 3950 }, { "epoch": 0.9983298143888066, "grad_norm": 36.80963134765625, "learning_rate": 4.218881374089369e-07, "logits/chosen": -0.943707287311554, "logits/rejected": -1.1158447265625, "logps/chosen": -162.8601531982422, "logps/rejected": -378.2124938964844, "loss": 0.1493, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.3005691468715668, "rewards/margins": 10.454687118530273, "rewards/rejected": -10.160547256469727, "step": 3960 }, { "epoch": 1.0010084139539281, "grad_norm": 4.910854816436768, "learning_rate": 4.2135480146317016e-07, "logits/chosen": -1.0979818105697632, "logits/rejected": NaN, "logps/chosen": -168.8463592529297, "logps/rejected": -340.6190490722656, "loss": 0.1277, "rewards/accuracies": 0.9553571343421936, "rewards/chosen": 0.644805908203125, "rewards/margins": 9.254650115966797, "rewards/rejected": -8.615513801574707, "step": 3970 }, { "epoch": 1.0035294488387483, "grad_norm": 3.0653560161590576, "learning_rate": 4.2081999056166807e-07, "logits/chosen": -1.129632592201233, "logits/rejected": NaN, "logps/chosen": -179.5437469482422, "logps/rejected": -387.3374938964844, "loss": 0.0311, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.44250792264938354, "rewards/margins": 11.218164443969727, "rewards/rejected": -10.775390625, "step": 3980 }, { "epoch": 1.0060504837235684, "grad_norm": 1.7009403705596924, "learning_rate": 4.202837093078756e-07, "logits/chosen": -0.997119128704071, "logits/rejected": -1.080163598060608, "logps/chosen": -180.296875, "logps/rejected": -393.7875061035156, "loss": 0.0277, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.5342499017715454, "rewards/margins": 12.102343559265137, "rewards/rejected": -12.635546684265137, "step": 3990 }, { "epoch": 1.0085715186083888, "grad_norm": 2.2212038040161133, "learning_rate": 4.1974596231789416e-07, "logits/chosen": -0.7734924554824829, "logits/rejected": NaN, "logps/chosen": -161.74063110351562, "logps/rejected": -393.54998779296875, "loss": 0.0552, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8840881586074829, "rewards/margins": 13.051172256469727, "rewards/rejected": -13.937108993530273, "step": 4000 }, { "epoch": 1.011092553493209, "grad_norm": 20.321584701538086, "learning_rate": 4.192067542204413e-07, "logits/chosen": -0.847900390625, "logits/rejected": -0.8488098382949829, "logps/chosen": -195.48046875, "logps/rejected": -411.45001220703125, "loss": 0.0091, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.341363549232483, "rewards/margins": 13.375781059265137, "rewards/rejected": -14.71484375, "step": 4010 }, { "epoch": 1.0136135883780293, "grad_norm": 1.7012672424316406, "learning_rate": 4.186660896568116e-07, "logits/chosen": -0.9663940668106079, "logits/rejected": -1.019677758216858, "logps/chosen": -185.3273468017578, "logps/rejected": -432.21875, "loss": 0.0252, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.439172387123108, "rewards/margins": 14.435546875, "rewards/rejected": -15.858593940734863, "step": 4020 }, { "epoch": 1.0161346232628494, "grad_norm": 43.201881408691406, "learning_rate": 4.1812397328083584e-07, "logits/chosen": -0.7998992800712585, "logits/rejected": -0.885083019733429, "logps/chosen": -162.61874389648438, "logps/rejected": -397.125, "loss": 0.0601, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.860687255859375, "rewards/margins": 13.939844131469727, "rewards/rejected": -14.801562309265137, "step": 4030 }, { "epoch": 1.0186556581476696, "grad_norm": 3.4739487171173096, "learning_rate": 4.1758040975884195e-07, "logits/chosen": -0.7442077398300171, "logits/rejected": -0.942645251750946, "logps/chosen": -161.1843719482422, "logps/rejected": -382.32501220703125, "loss": 0.0437, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.8776000738143921, "rewards/margins": 13.224218368530273, "rewards/rejected": -14.102734565734863, "step": 4040 }, { "epoch": 1.02117669303249, "grad_norm": 31.309629440307617, "learning_rate": 4.1703540376961406e-07, "logits/chosen": -0.8846985101699829, "logits/rejected": -0.90301513671875, "logps/chosen": -188.9499969482422, "logps/rejected": -407.67498779296875, "loss": 0.0426, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.0725219249725342, "rewards/margins": 13.767969131469727, "rewards/rejected": -14.838281631469727, "step": 4050 }, { "epoch": 1.02369772791731, "grad_norm": 29.20194435119629, "learning_rate": 4.164889600043525e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -159.2375030517578, "logps/rejected": -398.9125061035156, "loss": 0.0172, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.49177247285842896, "rewards/margins": 13.801562309265137, "rewards/rejected": -14.294530868530273, "step": 4060 }, { "epoch": 1.0262187628021302, "grad_norm": 0.537968635559082, "learning_rate": 4.1594108316663347e-07, "logits/chosen": -0.782763659954071, "logits/rejected": NaN, "logps/chosen": -170.390625, "logps/rejected": -392.0249938964844, "loss": 0.0229, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.628063976764679, "rewards/margins": 14.064453125, "rewards/rejected": -14.696484565734863, "step": 4070 }, { "epoch": 1.0287397976869506, "grad_norm": 3.314084768295288, "learning_rate": 4.153917779723686e-07, "logits/chosen": -0.8406006097793579, "logits/rejected": NaN, "logps/chosen": -178.953125, "logps/rejected": -402.26251220703125, "loss": 0.0368, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7112671136856079, "rewards/margins": 14.256250381469727, "rewards/rejected": -14.964062690734863, "step": 4080 }, { "epoch": 1.0312608325717707, "grad_norm": 1.0263309478759766, "learning_rate": 4.14841049149764e-07, "logits/chosen": -0.7445617914199829, "logits/rejected": NaN, "logps/chosen": -161.36328125, "logps/rejected": -394.625, "loss": 0.031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5408355593681335, "rewards/margins": 14.001562118530273, "rewards/rejected": -14.542187690734863, "step": 4090 }, { "epoch": 1.0337818674565908, "grad_norm": 0.8060359358787537, "learning_rate": 4.142889014392802e-07, "logits/chosen": -0.862518310546875, "logits/rejected": NaN, "logps/chosen": -189.22811889648438, "logps/rejected": -427.98748779296875, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1475830078125, "rewards/margins": 15.468358993530273, "rewards/rejected": -16.614843368530273, "step": 4100 }, { "epoch": 1.0363029023414112, "grad_norm": 43.990848541259766, "learning_rate": 4.137353395935905e-07, "logits/chosen": -0.6894165277481079, "logits/rejected": NaN, "logps/chosen": -172.14688110351562, "logps/rejected": -419.9375, "loss": 0.0307, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.06915283203125, "rewards/margins": 16.03515625, "rewards/rejected": -17.103124618530273, "step": 4110 }, { "epoch": 1.0388239372262313, "grad_norm": 52.504356384277344, "learning_rate": 4.13180368377541e-07, "logits/chosen": -0.8577026128768921, "logits/rejected": NaN, "logps/chosen": -170.2156219482422, "logps/rejected": -432.5625, "loss": 0.049, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1389954090118408, "rewards/margins": 15.968358993530273, "rewards/rejected": -17.096094131469727, "step": 4120 }, { "epoch": 1.0413449721110515, "grad_norm": 0.5120437145233154, "learning_rate": 4.126239925681088e-07, "logits/chosen": -0.7004455327987671, "logits/rejected": -0.843487560749054, "logps/chosen": -158.8359375, "logps/rejected": -416.36248779296875, "loss": 0.0095, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7388855218887329, "rewards/margins": 16.246484756469727, "rewards/rejected": -16.989843368530273, "step": 4130 }, { "epoch": 1.0438660069958718, "grad_norm": 0.8208231925964355, "learning_rate": 4.120662169543612e-07, "logits/chosen": -0.8147293329238892, "logits/rejected": -0.8252319097518921, "logps/chosen": -177.97500610351562, "logps/rejected": -418.2749938964844, "loss": 0.0787, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.2500183582305908, "rewards/margins": 14.790624618530273, "rewards/rejected": -16.044530868530273, "step": 4140 }, { "epoch": 1.046387041880692, "grad_norm": 40.62232971191406, "learning_rate": 4.1150704633741456e-07, "logits/chosen": -0.7378395199775696, "logits/rejected": NaN, "logps/chosen": -174.828125, "logps/rejected": -406.45001220703125, "loss": 0.0214, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.522216796875, "rewards/margins": 14.909375190734863, "rewards/rejected": -16.435155868530273, "step": 4150 }, { "epoch": 1.0489080767655123, "grad_norm": 64.33751678466797, "learning_rate": 4.1094648553039315e-07, "logits/chosen": -0.7418152093887329, "logits/rejected": -0.8817383050918579, "logps/chosen": -171.6843719482422, "logps/rejected": -431.88751220703125, "loss": 0.0363, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.972973644733429, "rewards/margins": 15.439844131469727, "rewards/rejected": -16.407032012939453, "step": 4160 }, { "epoch": 1.0514291116503325, "grad_norm": 0.17918507754802704, "learning_rate": 4.103845393583868e-07, "logits/chosen": -0.901379406452179, "logits/rejected": -0.9739013910293579, "logps/chosen": -150.8781280517578, "logps/rejected": -405.75, "loss": 0.028, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.09697723388671875, "rewards/margins": 15.589062690734863, "rewards/rejected": -15.490625381469727, "step": 4170 }, { "epoch": 1.0539501465351526, "grad_norm": 19.528215408325195, "learning_rate": 4.0982121265841073e-07, "logits/chosen": -0.947418212890625, "logits/rejected": NaN, "logps/chosen": -168.77499389648438, "logps/rejected": -421.01251220703125, "loss": 0.0423, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.04995117336511612, "rewards/margins": 15.001953125, "rewards/rejected": -14.956640243530273, "step": 4180 }, { "epoch": 1.056471181419973, "grad_norm": 0.44857892394065857, "learning_rate": 4.092565102793628e-07, "logits/chosen": -0.890704333782196, "logits/rejected": -0.9279235601425171, "logps/chosen": -158.16561889648438, "logps/rejected": -426.3500061035156, "loss": 0.0396, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.571270763874054, "rewards/margins": 14.227343559265137, "rewards/rejected": -14.8046875, "step": 4190 }, { "epoch": 1.0589922163047931, "grad_norm": 9.257643699645996, "learning_rate": 4.0869043708198224e-07, "logits/chosen": -0.815661609172821, "logits/rejected": NaN, "logps/chosen": -195.515625, "logps/rejected": -456.88751220703125, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.36669921875, "rewards/margins": 15.433984756469727, "rewards/rejected": -16.803905487060547, "step": 4200 }, { "epoch": 1.0615132511896133, "grad_norm": 2.966176748275757, "learning_rate": 4.0812299793880785e-07, "logits/chosen": -0.939404308795929, "logits/rejected": -0.895977795124054, "logps/chosen": -172.94375610351562, "logps/rejected": -444.0874938964844, "loss": 0.0451, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1003296375274658, "rewards/margins": 16.6201171875, "rewards/rejected": -17.7265625, "step": 4210 }, { "epoch": 1.0640342860744336, "grad_norm": 54.08489990234375, "learning_rate": 4.075541977341358e-07, "logits/chosen": -0.878094494342804, "logits/rejected": NaN, "logps/chosen": -161.0460968017578, "logps/rejected": -432.54998779296875, "loss": 0.0323, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.982434093952179, "rewards/margins": 15.187108993530273, "rewards/rejected": -16.170312881469727, "step": 4220 }, { "epoch": 1.0665553209592538, "grad_norm": 5.797738075256348, "learning_rate": 4.0698404136397805e-07, "logits/chosen": -0.853137195110321, "logits/rejected": NaN, "logps/chosen": -168.69375610351562, "logps/rejected": -423.7749938964844, "loss": 0.037, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.3080260753631592, "rewards/margins": 15.174219131469727, "rewards/rejected": -16.479686737060547, "step": 4230 }, { "epoch": 1.069076355844074, "grad_norm": 0.09000521898269653, "learning_rate": 4.0641253373601957e-07, "logits/chosen": -0.712415337562561, "logits/rejected": -0.8466033935546875, "logps/chosen": -170.86563110351562, "logps/rejected": -434.3374938964844, "loss": 0.0977, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3107178211212158, "rewards/margins": 15.857030868530273, "rewards/rejected": -17.165624618530273, "step": 4240 }, { "epoch": 1.0715973907288943, "grad_norm": 48.33607864379883, "learning_rate": 4.0583967976957654e-07, "logits/chosen": -0.795581042766571, "logits/rejected": -0.854931652545929, "logps/chosen": -163.1218719482422, "logps/rejected": -420.8999938964844, "loss": 0.0419, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.514294445514679, "rewards/margins": 15.6484375, "rewards/rejected": -16.161718368530273, "step": 4250 }, { "epoch": 1.0741184256137144, "grad_norm": 32.8408203125, "learning_rate": 4.0526548439555407e-07, "logits/chosen": -0.852764904499054, "logits/rejected": NaN, "logps/chosen": -175.2140655517578, "logps/rejected": -419.6499938964844, "loss": 0.0555, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2991516590118408, "rewards/margins": 15.190234184265137, "rewards/rejected": -16.489843368530273, "step": 4260 }, { "epoch": 1.0766394604985345, "grad_norm": 31.646310806274414, "learning_rate": 4.046899525564034e-07, "logits/chosen": -0.8620849847793579, "logits/rejected": NaN, "logps/chosen": -195.1953125, "logps/rejected": -426.3374938964844, "loss": 0.0725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6074050664901733, "rewards/margins": 14.827343940734863, "rewards/rejected": -16.44140625, "step": 4270 }, { "epoch": 1.079160495383355, "grad_norm": 1.301358938217163, "learning_rate": 4.0411308920607953e-07, "logits/chosen": -0.8445831537246704, "logits/rejected": NaN, "logps/chosen": -174.75, "logps/rejected": -434.79998779296875, "loss": 0.0451, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.144525170326233, "rewards/margins": 15.845703125, "rewards/rejected": -16.996875762939453, "step": 4280 }, { "epoch": 1.081681530268175, "grad_norm": 32.33489990234375, "learning_rate": 4.0353489930999876e-07, "logits/chosen": -0.848114013671875, "logits/rejected": NaN, "logps/chosen": -181.1750030517578, "logps/rejected": -425.2124938964844, "loss": 0.0686, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.2070190906524658, "rewards/margins": 15.977734565734863, "rewards/rejected": -17.189844131469727, "step": 4290 }, { "epoch": 1.0842025651529954, "grad_norm": 1.4178301095962524, "learning_rate": 4.029553878449956e-07, "logits/chosen": -0.656402587890625, "logits/rejected": -0.8496643304824829, "logps/chosen": -171.8000030517578, "logps/rejected": -435.98748779296875, "loss": 0.008, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.603613257408142, "rewards/margins": 15.882031440734863, "rewards/rejected": -17.483592987060547, "step": 4300 }, { "epoch": 1.0867236000378155, "grad_norm": 56.40255355834961, "learning_rate": 4.0237455979928024e-07, "logits/chosen": -0.8558715581893921, "logits/rejected": -0.8710571527481079, "logps/chosen": -193.5749969482422, "logps/rejected": -437.6499938964844, "loss": 0.0694, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.474645972251892, "rewards/margins": 15.274511337280273, "rewards/rejected": -16.748046875, "step": 4310 }, { "epoch": 1.0892446349226357, "grad_norm": 32.58179473876953, "learning_rate": 4.0179242017239544e-07, "logits/chosen": -0.8743911981582642, "logits/rejected": -0.91058349609375, "logps/chosen": -190.2843780517578, "logps/rejected": -417.76251220703125, "loss": 0.0936, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.326574683189392, "rewards/margins": 15.385156631469727, "rewards/rejected": -16.715625762939453, "step": 4320 }, { "epoch": 1.091765669807456, "grad_norm": 32.152198791503906, "learning_rate": 4.012089739751735e-07, "logits/chosen": -0.700396716594696, "logits/rejected": NaN, "logps/chosen": -161.0828094482422, "logps/rejected": -425.1625061035156, "loss": 0.0523, "rewards/accuracies": 0.984375, "rewards/chosen": -1.390252709388733, "rewards/margins": 15.692187309265137, "rewards/rejected": -17.085155487060547, "step": 4330 }, { "epoch": 1.0942867046922762, "grad_norm": 47.866764068603516, "learning_rate": 4.006242262296933e-07, "logits/chosen": -0.8344451785087585, "logits/rejected": NaN, "logps/chosen": -153.9093780517578, "logps/rejected": -408.95001220703125, "loss": 0.028, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.5624755620956421, "rewards/margins": 15.794921875, "rewards/rejected": -16.355859756469727, "step": 4340 }, { "epoch": 1.0968077395770963, "grad_norm": 18.778791427612305, "learning_rate": 4.0003818196923677e-07, "logits/chosen": -0.939849853515625, "logits/rejected": -0.8819946050643921, "logps/chosen": -153.609375, "logps/rejected": -384.1000061035156, "loss": 0.1003, "rewards/accuracies": 0.96875, "rewards/chosen": -0.18875733017921448, "rewards/margins": 15.050390243530273, "rewards/rejected": -15.243749618530273, "step": 4350 }, { "epoch": 1.0993287744619167, "grad_norm": 1.5609654188156128, "learning_rate": 3.994508462382459e-07, "logits/chosen": -0.9391311407089233, "logits/rejected": -1.0138733386993408, "logps/chosen": -141.8625030517578, "logps/rejected": -414.0625, "loss": 0.0272, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.06588134914636612, "rewards/margins": 14.853124618530273, "rewards/rejected": -14.78515625, "step": 4360 }, { "epoch": 1.1018498093467368, "grad_norm": 2.3153107166290283, "learning_rate": 3.98862224092279e-07, "logits/chosen": -0.68609619140625, "logits/rejected": -0.8112426996231079, "logps/chosen": -184.35311889648438, "logps/rejected": -415.4125061035156, "loss": 0.0311, "rewards/accuracies": 0.984375, "rewards/chosen": -1.579833984375, "rewards/margins": 15.617968559265137, "rewards/rejected": -17.198436737060547, "step": 4370 }, { "epoch": 1.104370844231557, "grad_norm": 10.20581340789795, "learning_rate": 3.982723205979675e-07, "logits/chosen": -0.7482544183731079, "logits/rejected": NaN, "logps/chosen": -196.9812469482422, "logps/rejected": -440.4125061035156, "loss": 0.0401, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1445679664611816, "rewards/margins": 15.895312309265137, "rewards/rejected": -18.043750762939453, "step": 4380 }, { "epoch": 1.1068918791163773, "grad_norm": 0.22120849788188934, "learning_rate": 3.976811408329721e-07, "logits/chosen": -0.6115478277206421, "logits/rejected": -0.8300720453262329, "logps/chosen": -175.85311889648438, "logps/rejected": -445.26251220703125, "loss": 0.0426, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.273181200027466, "rewards/margins": 15.760937690734863, "rewards/rejected": -18.032032012939453, "step": 4390 }, { "epoch": 1.1094129140011975, "grad_norm": 1.0646626949310303, "learning_rate": 3.9708868988593916e-07, "logits/chosen": -0.6912567019462585, "logits/rejected": NaN, "logps/chosen": -173.22030639648438, "logps/rejected": -439.5249938964844, "loss": 0.0222, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.839196801185608, "rewards/margins": 16.298437118530273, "rewards/rejected": -18.142969131469727, "step": 4400 }, { "epoch": 1.1119339488860178, "grad_norm": 0.8677557110786438, "learning_rate": 3.9649497285645673e-07, "logits/chosen": -0.7456115484237671, "logits/rejected": NaN, "logps/chosen": -184.63827514648438, "logps/rejected": -458.48748779296875, "loss": 0.0091, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4429137706756592, "rewards/margins": 16.710155487060547, "rewards/rejected": -18.154687881469727, "step": 4410 }, { "epoch": 1.114454983770838, "grad_norm": 20.164892196655273, "learning_rate": 3.958999948550111e-07, "logits/chosen": -0.828930675983429, "logits/rejected": NaN, "logps/chosen": -189.4093780517578, "logps/rejected": -428.29998779296875, "loss": 0.064, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.934741199016571, "rewards/margins": 16.392187118530273, "rewards/rejected": -17.3203125, "step": 4420 }, { "epoch": 1.116976018655658, "grad_norm": 10.153223037719727, "learning_rate": 3.9530376100294236e-07, "logits/chosen": -0.867602527141571, "logits/rejected": NaN, "logps/chosen": -178.19375610351562, "logps/rejected": -452.9375, "loss": 0.0245, "rewards/accuracies": 0.984375, "rewards/chosen": -0.6298462152481079, "rewards/margins": 15.197265625, "rewards/rejected": -15.828906059265137, "step": 4430 }, { "epoch": 1.1194970535404785, "grad_norm": 6.944910526275635, "learning_rate": 3.9470627643240054e-07, "logits/chosen": -0.93878173828125, "logits/rejected": NaN, "logps/chosen": -188.4265594482422, "logps/rejected": -427.91876220703125, "loss": 0.0321, "rewards/accuracies": 0.984375, "rewards/chosen": -0.7821151614189148, "rewards/margins": 15.071484565734863, "rewards/rejected": -15.850390434265137, "step": 4440 }, { "epoch": 1.1220180884252986, "grad_norm": 1.323344111442566, "learning_rate": 3.941075462863011e-07, "logits/chosen": -0.7843353152275085, "logits/rejected": -0.882397472858429, "logps/chosen": -187.2312469482422, "logps/rejected": -440.6499938964844, "loss": 0.0468, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.3226501941680908, "rewards/margins": 15.931249618530273, "rewards/rejected": -17.248437881469727, "step": 4450 }, { "epoch": 1.1245391233101187, "grad_norm": 30.944652557373047, "learning_rate": 3.935075757182813e-07, "logits/chosen": -0.696331799030304, "logits/rejected": -0.816741943359375, "logps/chosen": -173.47811889648438, "logps/rejected": -435.63751220703125, "loss": 0.045, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.3406920433044434, "rewards/margins": 15.443359375, "rewards/rejected": -17.78125, "step": 4460 }, { "epoch": 1.127060158194939, "grad_norm": 28.859397888183594, "learning_rate": 3.9290636989265536e-07, "logits/chosen": -0.89483642578125, "logits/rejected": NaN, "logps/chosen": -186.8125, "logps/rejected": -435.9750061035156, "loss": 0.0461, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.4681639671325684, "rewards/margins": 15.239453315734863, "rewards/rejected": -17.711719512939453, "step": 4470 }, { "epoch": 1.1295811930797592, "grad_norm": 0.29573288559913635, "learning_rate": 3.923039339843699e-07, "logits/chosen": -0.819714367389679, "logits/rejected": NaN, "logps/chosen": -173.52499389648438, "logps/rejected": -414.0249938964844, "loss": 0.0126, "rewards/accuracies": 0.984375, "rewards/chosen": -1.065582275390625, "rewards/margins": 15.750781059265137, "rewards/rejected": -16.807031631469727, "step": 4480 }, { "epoch": 1.1321022279645794, "grad_norm": 0.5903184413909912, "learning_rate": 3.9170027317895993e-07, "logits/chosen": -0.857037365436554, "logits/rejected": -0.913128674030304, "logps/chosen": -185.90780639648438, "logps/rejected": -448.01251220703125, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.043371558189392, "rewards/margins": 16.575780868530273, "rewards/rejected": -17.625782012939453, "step": 4490 }, { "epoch": 1.1346232628493997, "grad_norm": 33.92226791381836, "learning_rate": 3.910953926725037e-07, "logits/chosen": -0.8277953863143921, "logits/rejected": NaN, "logps/chosen": -198.13436889648438, "logps/rejected": -439.25, "loss": 0.0257, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.754132091999054, "rewards/margins": 16.370311737060547, "rewards/rejected": -17.135156631469727, "step": 4500 }, { "epoch": 1.1371442977342199, "grad_norm": 4.057684421539307, "learning_rate": 3.904892976715783e-07, "logits/chosen": -0.884753406047821, "logits/rejected": -0.910186767578125, "logps/chosen": -187.16250610351562, "logps/rejected": -439.7250061035156, "loss": 0.0088, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.75286865234375, "rewards/margins": 17.134374618530273, "rewards/rejected": -17.880468368530273, "step": 4510 }, { "epoch": 1.1396653326190402, "grad_norm": 29.96042251586914, "learning_rate": 3.898819933932146e-07, "logits/chosen": -0.8850555419921875, "logits/rejected": NaN, "logps/chosen": -197.0906219482422, "logps/rejected": -457.8999938964844, "loss": 0.0227, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2597687244415283, "rewards/margins": 16.990625381469727, "rewards/rejected": -18.24609375, "step": 4520 }, { "epoch": 1.1421863675038604, "grad_norm": 62.65980529785156, "learning_rate": 3.8927348506485253e-07, "logits/chosen": -0.8273956179618835, "logits/rejected": NaN, "logps/chosen": -191.3312530517578, "logps/rejected": -446.92498779296875, "loss": 0.0891, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.6036865711212158, "rewards/margins": 16.846485137939453, "rewards/rejected": -18.442188262939453, "step": 4530 }, { "epoch": 1.1447074023886805, "grad_norm": 0.040741514414548874, "learning_rate": 3.8866377792429593e-07, "logits/chosen": -0.768585205078125, "logits/rejected": NaN, "logps/chosen": -179.86874389648438, "logps/rejected": -442.45001220703125, "loss": 0.0438, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9556763172149658, "rewards/margins": 16.858592987060547, "rewards/rejected": -18.814062118530273, "step": 4540 }, { "epoch": 1.1472284372735007, "grad_norm": 22.012849807739258, "learning_rate": 3.880528772196677e-07, "logits/chosen": -0.7656310796737671, "logits/rejected": NaN, "logps/chosen": -189.9734344482422, "logps/rejected": -462.7124938964844, "loss": 0.0596, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.3709319829940796, "rewards/margins": 17.796875, "rewards/rejected": -19.169530868530273, "step": 4550 }, { "epoch": 1.149749472158321, "grad_norm": 7.90686559677124, "learning_rate": 3.8744078820936445e-07, "logits/chosen": -0.7236602902412415, "logits/rejected": -0.8540130853652954, "logps/chosen": -193.41561889648438, "logps/rejected": -457.86248779296875, "loss": 0.0633, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6199829578399658, "rewards/margins": 17.340234756469727, "rewards/rejected": -18.953907012939453, "step": 4560 }, { "epoch": 1.1522705070431412, "grad_norm": 2.5892670154571533, "learning_rate": 3.8682751616201106e-07, "logits/chosen": -0.6721771359443665, "logits/rejected": -0.78631591796875, "logps/chosen": -152.9562530517578, "logps/rejected": -418.20001220703125, "loss": 0.0387, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2701553106307983, "rewards/margins": 17.060155868530273, "rewards/rejected": -18.329687118530273, "step": 4570 }, { "epoch": 1.1547915419279615, "grad_norm": 0.5847570300102234, "learning_rate": 3.862130663564158e-07, "logits/chosen": -0.6242614984512329, "logits/rejected": -0.8474655151367188, "logps/chosen": -163.8781280517578, "logps/rejected": -433.5625, "loss": 0.0223, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.342626929283142, "rewards/margins": 17.76171875, "rewards/rejected": -19.102344512939453, "step": 4580 }, { "epoch": 1.1573125768127817, "grad_norm": 2.9834423065185547, "learning_rate": 3.855974440815244e-07, "logits/chosen": -0.6628662347793579, "logits/rejected": NaN, "logps/chosen": -173.671875, "logps/rejected": -478.7749938964844, "loss": 0.0331, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0401673316955566, "rewards/margins": 17.744531631469727, "rewards/rejected": -19.785938262939453, "step": 4590 }, { "epoch": 1.1598336116976018, "grad_norm": 38.29576873779297, "learning_rate": 3.8498065463637505e-07, "logits/chosen": -0.7745605707168579, "logits/rejected": -0.848828136920929, "logps/chosen": -179.37655639648438, "logps/rejected": -473.1000061035156, "loss": 0.0226, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.765924096107483, "rewards/margins": 18.313282012939453, "rewards/rejected": -20.069530487060547, "step": 4600 }, { "epoch": 1.1623546465824222, "grad_norm": 0.06788729876279831, "learning_rate": 3.843627033300521e-07, "logits/chosen": -0.594830334186554, "logits/rejected": -0.766552746295929, "logps/chosen": -177.9875030517578, "logps/rejected": -450.42498779296875, "loss": 0.0299, "rewards/accuracies": 0.984375, "rewards/chosen": -2.1217575073242188, "rewards/margins": 18.549219131469727, "rewards/rejected": -20.674219131469727, "step": 4610 }, { "epoch": 1.1648756814672423, "grad_norm": 16.247982025146484, "learning_rate": 3.83743595481641e-07, "logits/chosen": -0.6792846918106079, "logits/rejected": -0.661816418170929, "logps/chosen": -183.421875, "logps/rejected": -453.7875061035156, "loss": 0.053, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0847411155700684, "rewards/margins": 18.391407012939453, "rewards/rejected": -20.470312118530273, "step": 4620 }, { "epoch": 1.1673967163520624, "grad_norm": 32.13294219970703, "learning_rate": 3.831233364201825e-07, "logits/chosen": -0.6574462652206421, "logits/rejected": NaN, "logps/chosen": -182.8468780517578, "logps/rejected": -459.25, "loss": 0.0287, "rewards/accuracies": 0.984375, "rewards/chosen": -2.2820982933044434, "rewards/margins": 18.239063262939453, "rewards/rejected": -20.537500381469727, "step": 4630 }, { "epoch": 1.1699177512368828, "grad_norm": 0.13954737782478333, "learning_rate": 3.8250193148462583e-07, "logits/chosen": -0.6697998046875, "logits/rejected": NaN, "logps/chosen": -196.30313110351562, "logps/rejected": -462.4375, "loss": 0.0319, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4587769508361816, "rewards/margins": 18.317188262939453, "rewards/rejected": -20.76953125, "step": 4640 }, { "epoch": 1.172438786121703, "grad_norm": 0.488079696893692, "learning_rate": 3.8187938602378413e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -167.5500030517578, "logps/rejected": -431.0874938964844, "loss": 0.0334, "rewards/accuracies": 0.984375, "rewards/chosen": -1.77490234375, "rewards/margins": 17.375782012939453, "rewards/rejected": -19.145313262939453, "step": 4650 }, { "epoch": 1.174959821006523, "grad_norm": 1.3813257217407227, "learning_rate": 3.812557053962875e-07, "logits/chosen": -0.5510803461074829, "logits/rejected": -0.61663818359375, "logps/chosen": -174.2062530517578, "logps/rejected": -429.9125061035156, "loss": 0.023, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5551025867462158, "rewards/margins": 17.265233993530273, "rewards/rejected": -18.825000762939453, "step": 4660 }, { "epoch": 1.1774808558913434, "grad_norm": 2.1922290325164795, "learning_rate": 3.8063089497053713e-07, "logits/chosen": -0.5169891119003296, "logits/rejected": -0.7167602777481079, "logps/chosen": -162.9421844482422, "logps/rejected": -457.6875, "loss": 0.0309, "rewards/accuracies": 0.984375, "rewards/chosen": -2.115273952484131, "rewards/margins": 17.363672256469727, "rewards/rejected": -19.471874237060547, "step": 4670 }, { "epoch": 1.1800018907761636, "grad_norm": 24.671855926513672, "learning_rate": 3.80004960124659e-07, "logits/chosen": -0.5870300531387329, "logits/rejected": -0.598742663860321, "logps/chosen": -176.8249969482422, "logps/rejected": -459.0249938964844, "loss": 0.0511, "rewards/accuracies": 0.96875, "rewards/chosen": -1.6081879138946533, "rewards/margins": 19.241405487060547, "rewards/rejected": -20.860157012939453, "step": 4680 }, { "epoch": 1.182522925660984, "grad_norm": 110.37706756591797, "learning_rate": 3.7937790624645776e-07, "logits/chosen": -0.6319427490234375, "logits/rejected": NaN, "logps/chosen": -180.87655639648438, "logps/rejected": -448.375, "loss": 0.1214, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.6691573858261108, "rewards/margins": 18.247655868530273, "rewards/rejected": -19.915624618530273, "step": 4690 }, { "epoch": 1.185043960545804, "grad_norm": 96.3826904296875, "learning_rate": 3.7874973873337026e-07, "logits/chosen": -0.5775116086006165, "logits/rejected": NaN, "logps/chosen": -181.0437469482422, "logps/rejected": -494.4125061035156, "loss": 0.072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8772156238555908, "rewards/margins": 19.025781631469727, "rewards/rejected": -20.897655487060547, "step": 4700 }, { "epoch": 1.1875649954306242, "grad_norm": 5.518882751464844, "learning_rate": 3.78120462992419e-07, "logits/chosen": -0.5583282709121704, "logits/rejected": NaN, "logps/chosen": -193.6125030517578, "logps/rejected": -473.38751220703125, "loss": 0.0479, "rewards/accuracies": 0.984375, "rewards/chosen": -2.6495118141174316, "rewards/margins": 18.890233993530273, "rewards/rejected": -21.541406631469727, "step": 4710 }, { "epoch": 1.1900860303154446, "grad_norm": 14.727373123168945, "learning_rate": 3.774900844401657e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -193.33438110351562, "logps/rejected": -489.38751220703125, "loss": 0.0069, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.304858446121216, "rewards/margins": 18.685155868530273, "rewards/rejected": -20.989843368530273, "step": 4720 }, { "epoch": 1.1926070652002647, "grad_norm": 77.03333282470703, "learning_rate": 3.768586085026648e-07, "logits/chosen": -0.7048279047012329, "logits/rejected": NaN, "logps/chosen": -167.1437530517578, "logps/rejected": -482.7250061035156, "loss": 0.0241, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1109740734100342, "rewards/margins": 19.665624618530273, "rewards/rejected": -20.78125, "step": 4730 }, { "epoch": 1.1951281000850849, "grad_norm": 13.447142601013184, "learning_rate": 3.7622604061541646e-07, "logits/chosen": -0.5366241335868835, "logits/rejected": NaN, "logps/chosen": -164.94375610351562, "logps/rejected": -448.17498779296875, "loss": 0.0506, "rewards/accuracies": 0.984375, "rewards/chosen": -1.029571533203125, "rewards/margins": 18.068750381469727, "rewards/rejected": -19.092187881469727, "step": 4740 }, { "epoch": 1.1976491349699052, "grad_norm": 33.20822525024414, "learning_rate": 3.755923862233199e-07, "logits/chosen": -0.6381179690361023, "logits/rejected": -0.7450500726699829, "logps/chosen": -156.8562469482422, "logps/rejected": -415.9375, "loss": 0.0199, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.113897681236267, "rewards/margins": 16.806249618530273, "rewards/rejected": -17.928125381469727, "step": 4750 }, { "epoch": 1.2001701698547254, "grad_norm": 3.509812593460083, "learning_rate": 3.7495765078062653e-07, "logits/chosen": -0.4502319395542145, "logits/rejected": NaN, "logps/chosen": -203.92813110351562, "logps/rejected": -444.95001220703125, "loss": 0.0852, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.660290479660034, "rewards/margins": 16.199609756469727, "rewards/rejected": -18.858592987060547, "step": 4760 }, { "epoch": 1.2026912047395455, "grad_norm": 13.525970458984375, "learning_rate": 3.7432183975089326e-07, "logits/chosen": -0.563854992389679, "logits/rejected": NaN, "logps/chosen": -195.125, "logps/rejected": -458.98748779296875, "loss": 0.0599, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.8112730979919434, "rewards/margins": 16.762500762939453, "rewards/rejected": -20.579687118530273, "step": 4770 }, { "epoch": 1.2052122396243659, "grad_norm": 16.99437141418457, "learning_rate": 3.7368495860693493e-07, "logits/chosen": -0.49774169921875, "logits/rejected": NaN, "logps/chosen": -206.0625, "logps/rejected": -481.0, "loss": 0.0181, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.202416896820068, "rewards/margins": 16.602344512939453, "rewards/rejected": -20.80078125, "step": 4780 }, { "epoch": 1.207733274509186, "grad_norm": 10.724235534667969, "learning_rate": 3.730470128307778e-07, "logits/chosen": -0.456369012594223, "logits/rejected": NaN, "logps/chosen": -192.3000030517578, "logps/rejected": -461.1000061035156, "loss": 0.0762, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9044189453125, "rewards/margins": 16.600000381469727, "rewards/rejected": -19.51171875, "step": 4790 }, { "epoch": 1.2102543093940064, "grad_norm": 0.8202361464500427, "learning_rate": 3.7240800791361176e-07, "logits/chosen": -0.6577895879745483, "logits/rejected": -0.6787017583847046, "logps/chosen": -180.24374389648438, "logps/rejected": -450.0625, "loss": 0.0824, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.425341844558716, "rewards/margins": 17.694921493530273, "rewards/rejected": -20.120311737060547, "step": 4800 }, { "epoch": 1.2127753442788265, "grad_norm": 72.00366973876953, "learning_rate": 3.717679493557437e-07, "logits/chosen": -0.4876708984375, "logits/rejected": -0.6789795160293579, "logps/chosen": -185.2062530517578, "logps/rejected": -474.25, "loss": 0.025, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.491442918777466, "rewards/margins": 17.414844512939453, "rewards/rejected": -19.908594131469727, "step": 4810 }, { "epoch": 1.2152963791636466, "grad_norm": 0.38998374342918396, "learning_rate": 3.7112684266654954e-07, "logits/chosen": -0.48649293184280396, "logits/rejected": NaN, "logps/chosen": -176.35000610351562, "logps/rejected": -443.3125, "loss": 0.0338, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9800293445587158, "rewards/margins": 17.452342987060547, "rewards/rejected": -19.430469512939453, "step": 4820 }, { "epoch": 1.2178174140484668, "grad_norm": 91.42658233642578, "learning_rate": 3.7048469336442735e-07, "logits/chosen": -0.5245330929756165, "logits/rejected": -0.5959838628768921, "logps/chosen": -172.74374389648438, "logps/rejected": -463.7875061035156, "loss": 0.0553, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.5302245616912842, "rewards/margins": 17.704687118530273, "rewards/rejected": -19.236719131469727, "step": 4830 }, { "epoch": 1.2203384489332871, "grad_norm": 1.3661686182022095, "learning_rate": 3.698415069767494e-07, "logits/chosen": -0.7337982058525085, "logits/rejected": NaN, "logps/chosen": -174.52499389648438, "logps/rejected": -437.75, "loss": 0.02, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.308508276939392, "rewards/margins": 17.560155868530273, "rewards/rejected": -18.875, "step": 4840 }, { "epoch": 1.2228594838181073, "grad_norm": 74.0618896484375, "learning_rate": 3.69197289039815e-07, "logits/chosen": -0.6135619878768921, "logits/rejected": NaN, "logps/chosen": -191.1750030517578, "logps/rejected": -459.2124938964844, "loss": 0.1245, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.421600341796875, "rewards/margins": 16.59375, "rewards/rejected": -18.006250381469727, "step": 4850 }, { "epoch": 1.2253805187029276, "grad_norm": 5.496711730957031, "learning_rate": 3.6855204509880243e-07, "logits/chosen": -0.5252532958984375, "logits/rejected": NaN, "logps/chosen": -164.81094360351562, "logps/rejected": -432.42498779296875, "loss": 0.034, "rewards/accuracies": 0.984375, "rewards/chosen": -1.73382568359375, "rewards/margins": 16.966014862060547, "rewards/rejected": -18.69921875, "step": 4860 }, { "epoch": 1.2279015535877478, "grad_norm": 32.78495788574219, "learning_rate": 3.6790578070772166e-07, "logits/chosen": -0.729217529296875, "logits/rejected": NaN, "logps/chosen": -184.28750610351562, "logps/rejected": -448.9624938964844, "loss": 0.0459, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5094726085662842, "rewards/margins": 16.883594512939453, "rewards/rejected": -18.389062881469727, "step": 4870 }, { "epoch": 1.230422588472568, "grad_norm": 0.15759225189685822, "learning_rate": 3.672585014293661e-07, "logits/chosen": -0.5635436773300171, "logits/rejected": NaN, "logps/chosen": -176.74374389648438, "logps/rejected": -467.36248779296875, "loss": 0.0404, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.0954681634902954, "rewards/margins": 17.58203125, "rewards/rejected": -18.6875, "step": 4880 }, { "epoch": 1.2329436233573883, "grad_norm": 1.506486177444458, "learning_rate": 3.666102128352649e-07, "logits/chosen": -0.6444259881973267, "logits/rejected": NaN, "logps/chosen": -168.1796875, "logps/rejected": -406.20001220703125, "loss": 0.0541, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.4007202088832855, "rewards/margins": 16.621875762939453, "rewards/rejected": -17.013280868530273, "step": 4890 }, { "epoch": 1.2354646582422084, "grad_norm": 23.36858367919922, "learning_rate": 3.6596092050563513e-07, "logits/chosen": -0.5533996820449829, "logits/rejected": NaN, "logps/chosen": -186.7937469482422, "logps/rejected": -472.6499938964844, "loss": 0.0465, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1389038562774658, "rewards/margins": 17.483983993530273, "rewards/rejected": -18.625782012939453, "step": 4900 }, { "epoch": 1.2379856931270286, "grad_norm": 38.460933685302734, "learning_rate": 3.653106300293336e-07, "logits/chosen": -0.630859375, "logits/rejected": NaN, "logps/chosen": -191.02188110351562, "logps/rejected": -454.79998779296875, "loss": 0.0528, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1395447254180908, "rewards/margins": 17.396093368530273, "rewards/rejected": -18.524999618530273, "step": 4910 }, { "epoch": 1.240506728011849, "grad_norm": 4.799333095550537, "learning_rate": 3.6465934700380873e-07, "logits/chosen": -0.6342529058456421, "logits/rejected": NaN, "logps/chosen": -203.140625, "logps/rejected": -449.38751220703125, "loss": 0.0365, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.287846326828003, "rewards/margins": 16.837108612060547, "rewards/rejected": -19.125, "step": 4920 }, { "epoch": 1.243027762896669, "grad_norm": 0.01622737944126129, "learning_rate": 3.640070770350524e-07, "logits/chosen": -0.434793084859848, "logits/rejected": NaN, "logps/chosen": -190.74374389648438, "logps/rejected": -454.82501220703125, "loss": 0.0561, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.1303391456604004, "rewards/margins": 18.042186737060547, "rewards/rejected": -20.176563262939453, "step": 4930 }, { "epoch": 1.2455487977814892, "grad_norm": 3.086185932159424, "learning_rate": 3.633538257375519e-07, "logits/chosen": -0.8136962652206421, "logits/rejected": NaN, "logps/chosen": -201.3874969482422, "logps/rejected": -466.57501220703125, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.217419385910034, "rewards/margins": 17.982812881469727, "rewards/rejected": -20.203125, "step": 4940 }, { "epoch": 1.2480698326663096, "grad_norm": 3.8737025260925293, "learning_rate": 3.626995987342412e-07, "logits/chosen": -0.634899914264679, "logits/rejected": NaN, "logps/chosen": -187.2468719482422, "logps/rejected": -463.875, "loss": 0.0805, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.354229688644409, "rewards/margins": 17.305078506469727, "rewards/rejected": -19.659374237060547, "step": 4950 }, { "epoch": 1.2505908675511297, "grad_norm": 1.117434024810791, "learning_rate": 3.620444016564528e-07, "logits/chosen": -0.5168914794921875, "logits/rejected": NaN, "logps/chosen": -175.875, "logps/rejected": -452.11248779296875, "loss": 0.0708, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.087716579437256, "rewards/margins": 17.024219512939453, "rewards/rejected": -20.110157012939453, "step": 4960 }, { "epoch": 1.25311190243595, "grad_norm": 0.07455004006624222, "learning_rate": 3.6138824014386945e-07, "logits/chosen": -0.6730133295059204, "logits/rejected": -0.80548095703125, "logps/chosen": -199.50625610351562, "logps/rejected": -470.125, "loss": 0.0453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5207152366638184, "rewards/margins": 18.236719131469727, "rewards/rejected": -20.751562118530273, "step": 4970 }, { "epoch": 1.2556329373207702, "grad_norm": 17.843994140625, "learning_rate": 3.6073111984447497e-07, "logits/chosen": -0.608929455280304, "logits/rejected": NaN, "logps/chosen": -186.4718780517578, "logps/rejected": -444.76251220703125, "loss": 0.0422, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.762872338294983, "rewards/margins": 16.639842987060547, "rewards/rejected": -18.395313262939453, "step": 4980 }, { "epoch": 1.2581539722055903, "grad_norm": 5.764465808868408, "learning_rate": 3.600730464145064e-07, "logits/chosen": -0.676239013671875, "logits/rejected": -0.8214966058731079, "logps/chosen": -181.99374389648438, "logps/rejected": -446.98748779296875, "loss": 0.0432, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.820867896080017, "rewards/margins": 16.55859375, "rewards/rejected": -18.381250381469727, "step": 4990 }, { "epoch": 1.2606750070904107, "grad_norm": 37.280582427978516, "learning_rate": 3.594140255184048e-07, "logits/chosen": -0.542956531047821, "logits/rejected": -0.6413513422012329, "logps/chosen": -182.5, "logps/rejected": -437.51251220703125, "loss": 0.0619, "rewards/accuracies": 0.96875, "rewards/chosen": -3.02410888671875, "rewards/margins": 16.636327743530273, "rewards/rejected": -19.666406631469727, "step": 5000 }, { "epoch": 1.2631960419752308, "grad_norm": 41.70035171508789, "learning_rate": 3.5875406282876676e-07, "logits/chosen": -0.5741058588027954, "logits/rejected": -0.846264660358429, "logps/chosen": -213.265625, "logps/rejected": -460.6000061035156, "loss": 0.0424, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.3206543922424316, "rewards/margins": 16.348047256469727, "rewards/rejected": -19.66796875, "step": 5010 }, { "epoch": 1.265717076860051, "grad_norm": 4.336178302764893, "learning_rate": 3.5809316402629533e-07, "logits/chosen": -0.568096935749054, "logits/rejected": -0.676647961139679, "logps/chosen": -187.35311889648438, "logps/rejected": -440.07501220703125, "loss": 0.0588, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.4081788063049316, "rewards/margins": 16.447656631469727, "rewards/rejected": -19.858592987060547, "step": 5020 }, { "epoch": 1.2682381117448713, "grad_norm": 19.287961959838867, "learning_rate": 3.5743133479975137e-07, "logits/chosen": -0.646801769733429, "logits/rejected": NaN, "logps/chosen": -200.3125, "logps/rejected": -469.6875, "loss": 0.0322, "rewards/accuracies": 0.984375, "rewards/chosen": -2.770672559738159, "rewards/margins": 17.32421875, "rewards/rejected": -20.099218368530273, "step": 5030 }, { "epoch": 1.2707591466296915, "grad_norm": 0.24015294015407562, "learning_rate": 3.567685808459044e-07, "logits/chosen": -0.6792877316474915, "logits/rejected": -0.6730712652206421, "logps/chosen": -176.83749389648438, "logps/rejected": -444.5874938964844, "loss": 0.0729, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8268859386444092, "rewards/margins": 17.83203125, "rewards/rejected": -19.66015625, "step": 5040 }, { "epoch": 1.2732801815145116, "grad_norm": 0.9365430474281311, "learning_rate": 3.5610490786948353e-07, "logits/chosen": -0.6522156000137329, "logits/rejected": -0.867340087890625, "logps/chosen": -177.8874969482422, "logps/rejected": -462.5375061035156, "loss": 0.0155, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9126724004745483, "rewards/margins": 17.822656631469727, "rewards/rejected": -19.729686737060547, "step": 5050 }, { "epoch": 1.275801216399332, "grad_norm": 1.9426658153533936, "learning_rate": 3.5544032158312883e-07, "logits/chosen": -0.7311080694198608, "logits/rejected": NaN, "logps/chosen": -199.4968719482422, "logps/rejected": -473.1499938964844, "loss": 0.0564, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.63739013671875, "rewards/margins": 17.164844512939453, "rewards/rejected": -19.807031631469727, "step": 5060 }, { "epoch": 1.2783222512841521, "grad_norm": 0.5340166687965393, "learning_rate": 3.5477482770734137e-07, "logits/chosen": -0.605181872844696, "logits/rejected": -0.609301745891571, "logps/chosen": -170.1984405517578, "logps/rejected": -435.6499938964844, "loss": 0.0492, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5174560546875, "rewards/margins": 16.803125381469727, "rewards/rejected": -19.326562881469727, "step": 5070 }, { "epoch": 1.2808432861689725, "grad_norm": 14.329954147338867, "learning_rate": 3.5410843197043454e-07, "logits/chosen": -0.7229949831962585, "logits/rejected": NaN, "logps/chosen": -188.44375610351562, "logps/rejected": -461.3500061035156, "loss": 0.0188, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.803106665611267, "rewards/margins": 17.576562881469727, "rewards/rejected": -19.37109375, "step": 5080 }, { "epoch": 1.2833643210537926, "grad_norm": 0.08909779787063599, "learning_rate": 3.534411401084848e-07, "logits/chosen": -0.7681472897529602, "logits/rejected": -0.8647216558456421, "logps/chosen": -184.26718139648438, "logps/rejected": -445.88751220703125, "loss": 0.0518, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2644469738006592, "rewards/margins": 18.335155487060547, "rewards/rejected": -19.60546875, "step": 5090 }, { "epoch": 1.2858853559386128, "grad_norm": 7.347795009613037, "learning_rate": 3.5277295786528183e-07, "logits/chosen": -0.7448791265487671, "logits/rejected": NaN, "logps/chosen": -171.9968719482422, "logps/rejected": -433.1625061035156, "loss": 0.0118, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.73138427734375, "rewards/margins": 17.712499618530273, "rewards/rejected": -19.44140625, "step": 5100 }, { "epoch": 1.288406390823433, "grad_norm": 7.123908042907715, "learning_rate": 3.521038909922794e-07, "logits/chosen": -0.575817883014679, "logits/rejected": -0.7146362066268921, "logps/chosen": -172.55313110351562, "logps/rejected": -472.8125, "loss": 0.0388, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.6087768077850342, "rewards/margins": 18.64453125, "rewards/rejected": -20.256250381469727, "step": 5110 }, { "epoch": 1.2909274257082533, "grad_norm": 15.770383834838867, "learning_rate": 3.5143394524854613e-07, "logits/chosen": -0.716168224811554, "logits/rejected": NaN, "logps/chosen": -212.140625, "logps/rejected": -470.3500061035156, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8871185779571533, "rewards/margins": 18.524999618530273, "rewards/rejected": -20.410938262939453, "step": 5120 }, { "epoch": 1.2934484605930734, "grad_norm": 5.0542731285095215, "learning_rate": 3.5076312640071515e-07, "logits/chosen": -0.564953625202179, "logits/rejected": -0.866833508014679, "logps/chosen": -180.3625030517578, "logps/rejected": -479.29998779296875, "loss": 0.0039, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.47869873046875, "rewards/margins": 18.438282012939453, "rewards/rejected": -20.921092987060547, "step": 5130 }, { "epoch": 1.2959694954778938, "grad_norm": 6.033849716186523, "learning_rate": 3.5009144022293533e-07, "logits/chosen": -0.8160613775253296, "logits/rejected": -0.826733410358429, "logps/chosen": -197.6281280517578, "logps/rejected": -497.92498779296875, "loss": 0.0209, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.093005418777466, "rewards/margins": 19.922657012939453, "rewards/rejected": -22.017969131469727, "step": 5140 }, { "epoch": 1.298490530362714, "grad_norm": 0.2906882166862488, "learning_rate": 3.4941889249682095e-07, "logits/chosen": -0.790295422077179, "logits/rejected": -0.8916870355606079, "logps/chosen": -189.234375, "logps/rejected": -490.0, "loss": 0.0294, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.8627227544784546, "rewards/margins": 19.714061737060547, "rewards/rejected": -21.571094512939453, "step": 5150 }, { "epoch": 1.301011565247534, "grad_norm": 62.98758316040039, "learning_rate": 3.487454890114023e-07, "logits/chosen": -0.7264038324356079, "logits/rejected": NaN, "logps/chosen": -203.9499969482422, "logps/rejected": -494.7749938964844, "loss": 0.0054, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.5980896949768066, "rewards/margins": 19.564062118530273, "rewards/rejected": -22.164844512939453, "step": 5160 }, { "epoch": 1.3035326001323544, "grad_norm": 0.056722406297922134, "learning_rate": 3.480712355630757e-07, "logits/chosen": -0.530194103717804, "logits/rejected": -0.8155883550643921, "logps/chosen": -199.60000610351562, "logps/rejected": -510.67498779296875, "loss": 0.0869, "rewards/accuracies": 0.984375, "rewards/chosen": -3.550701856613159, "rewards/margins": 19.362499237060547, "rewards/rejected": -22.91015625, "step": 5170 }, { "epoch": 1.3060536350171745, "grad_norm": 39.53110122680664, "learning_rate": 3.4739613795555345e-07, "logits/chosen": -0.568408191204071, "logits/rejected": NaN, "logps/chosen": -214.88125610351562, "logps/rejected": -492.0, "loss": 0.0464, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -4.383715629577637, "rewards/margins": 18.739063262939453, "rewards/rejected": -23.12109375, "step": 5180 }, { "epoch": 1.308574669901995, "grad_norm": 1.081410527229309, "learning_rate": 3.4672020199981414e-07, "logits/chosen": -0.547320544719696, "logits/rejected": -0.740161120891571, "logps/chosen": -197.47500610351562, "logps/rejected": -466.5249938964844, "loss": 0.0604, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.8207764625549316, "rewards/margins": 19.251562118530273, "rewards/rejected": -22.071094512939453, "step": 5190 }, { "epoch": 1.311095704786815, "grad_norm": 51.64591598510742, "learning_rate": 3.4604343351405276e-07, "logits/chosen": -0.6024414300918579, "logits/rejected": -0.8381103277206421, "logps/chosen": -191.88436889648438, "logps/rejected": -480.9750061035156, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.549877882003784, "rewards/margins": 18.856250762939453, "rewards/rejected": -21.397655487060547, "step": 5200 }, { "epoch": 1.3136167396716352, "grad_norm": 6.819519996643066, "learning_rate": 3.4536583832363e-07, "logits/chosen": -0.5940658450126648, "logits/rejected": -0.7266845703125, "logps/chosen": -169.4421844482422, "logps/rejected": -464.04998779296875, "loss": 0.037, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.2802672386169434, "rewards/margins": 17.58203125, "rewards/rejected": -19.849218368530273, "step": 5210 }, { "epoch": 1.3161377745564553, "grad_norm": 0.693911075592041, "learning_rate": 3.4468742226102285e-07, "logits/chosen": -0.7145630121231079, "logits/rejected": NaN, "logps/chosen": -186.4187469482422, "logps/rejected": -450.88751220703125, "loss": 0.0161, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.951287865638733, "rewards/margins": 18.79296875, "rewards/rejected": -20.74609375, "step": 5220 }, { "epoch": 1.3186588094412757, "grad_norm": 15.80001163482666, "learning_rate": 3.44008191165774e-07, "logits/chosen": -0.6357879638671875, "logits/rejected": NaN, "logps/chosen": -195.2843780517578, "logps/rejected": -491.4375, "loss": 0.0333, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.2079100608825684, "rewards/margins": 18.950000762939453, "rewards/rejected": -21.155467987060547, "step": 5230 }, { "epoch": 1.3211798443260958, "grad_norm": 10.180394172668457, "learning_rate": 3.4332815088444126e-07, "logits/chosen": -0.6378875970840454, "logits/rejected": -0.872546374797821, "logps/chosen": -189.6906280517578, "logps/rejected": -480.75, "loss": 0.0197, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.4325013160705566, "rewards/margins": 18.842967987060547, "rewards/rejected": -21.280467987060547, "step": 5240 }, { "epoch": 1.3237008792109162, "grad_norm": 9.390606880187988, "learning_rate": 3.4264730727054813e-07, "logits/chosen": -0.6574798822402954, "logits/rejected": -0.8277038335800171, "logps/chosen": -187.72811889648438, "logps/rejected": -480.82501220703125, "loss": 0.0621, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5950133800506592, "rewards/margins": 19.850000381469727, "rewards/rejected": -21.438282012939453, "step": 5250 }, { "epoch": 1.3262219140957363, "grad_norm": 66.66351318359375, "learning_rate": 3.4196566618453236e-07, "logits/chosen": -0.6452835202217102, "logits/rejected": -0.7001953125, "logps/chosen": -178.8874969482422, "logps/rejected": -478.79998779296875, "loss": 0.0523, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2240478992462158, "rewards/margins": 18.651561737060547, "rewards/rejected": -19.860157012939453, "step": 5260 }, { "epoch": 1.3287429489805564, "grad_norm": 4.744858264923096, "learning_rate": 3.4128323349369657e-07, "logits/chosen": -0.715441882610321, "logits/rejected": NaN, "logps/chosen": -172.3937530517578, "logps/rejected": -477.7875061035156, "loss": 0.0128, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.8378921747207642, "rewards/margins": 19.028125762939453, "rewards/rejected": -19.873437881469727, "step": 5270 }, { "epoch": 1.3312639838653768, "grad_norm": 0.056269947439432144, "learning_rate": 3.4060001507215675e-07, "logits/chosen": -0.6790100336074829, "logits/rejected": NaN, "logps/chosen": -157.85000610351562, "logps/rejected": -463.54998779296875, "loss": 0.0372, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.9040740728378296, "rewards/margins": 19.041015625, "rewards/rejected": -19.942188262939453, "step": 5280 }, { "epoch": 1.333785018750197, "grad_norm": 4.893336296081543, "learning_rate": 3.399160168007924e-07, "logits/chosen": -0.67840576171875, "logits/rejected": NaN, "logps/chosen": -173.2546844482422, "logps/rejected": -447.9624938964844, "loss": 0.0364, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.22723388671875, "rewards/margins": 19.236719131469727, "rewards/rejected": -20.467187881469727, "step": 5290 }, { "epoch": 1.3363060536350173, "grad_norm": 12.221407890319824, "learning_rate": 3.392312445671957e-07, "logits/chosen": -0.6845947504043579, "logits/rejected": -0.7198028564453125, "logps/chosen": -201.47811889648438, "logps/rejected": -506.8125, "loss": 0.067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.668798804283142, "rewards/margins": 19.248046875, "rewards/rejected": -20.919530868530273, "step": 5300 }, { "epoch": 1.3388270885198374, "grad_norm": 54.841068267822266, "learning_rate": 3.385457042656206e-07, "logits/chosen": -0.806713879108429, "logits/rejected": -0.753521740436554, "logps/chosen": -187.0, "logps/rejected": -478.26251220703125, "loss": 0.0409, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.792749047279358, "rewards/margins": 18.739063262939453, "rewards/rejected": -20.528905868530273, "step": 5310 }, { "epoch": 1.3413481234046576, "grad_norm": 8.162787437438965, "learning_rate": 3.378594017969324e-07, "logits/chosen": -0.658764660358429, "logits/rejected": NaN, "logps/chosen": -162.74374389648438, "logps/rejected": -434.3999938964844, "loss": 0.0192, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.9190658330917358, "rewards/margins": 17.8828125, "rewards/rejected": -18.802343368530273, "step": 5320 }, { "epoch": 1.3438691582894777, "grad_norm": 43.64297103881836, "learning_rate": 3.3717234306855686e-07, "logits/chosen": -0.7374328374862671, "logits/rejected": NaN, "logps/chosen": -180.49844360351562, "logps/rejected": -468.36248779296875, "loss": 0.0399, "rewards/accuracies": 0.984375, "rewards/chosen": -1.0295288562774658, "rewards/margins": 18.47265625, "rewards/rejected": -19.509374618530273, "step": 5330 }, { "epoch": 1.346390193174298, "grad_norm": 54.865142822265625, "learning_rate": 3.364845339944292e-07, "logits/chosen": NaN, "logits/rejected": -0.860064685344696, "logps/chosen": -172.82968139648438, "logps/rejected": -468.1499938964844, "loss": 0.0383, "rewards/accuracies": 0.984375, "rewards/chosen": -1.222253441810608, "rewards/margins": 18.608592987060547, "rewards/rejected": -19.834375381469727, "step": 5340 }, { "epoch": 1.3489112280591182, "grad_norm": 4.584923267364502, "learning_rate": 3.357959804949435e-07, "logits/chosen": NaN, "logits/rejected": -0.7727416753768921, "logps/chosen": -178.0625, "logps/rejected": -435.95001220703125, "loss": 0.0356, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.168426513671875, "rewards/margins": 18.414844512939453, "rewards/rejected": -19.569530487060547, "step": 5350 }, { "epoch": 1.3514322629439386, "grad_norm": 46.42981719970703, "learning_rate": 3.3510668849690155e-07, "logits/chosen": -0.7303711175918579, "logits/rejected": NaN, "logps/chosen": -158.2843780517578, "logps/rejected": -432.6000061035156, "loss": 0.0572, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0106384754180908, "rewards/margins": 18.412500381469727, "rewards/rejected": -19.420312881469727, "step": 5360 }, { "epoch": 1.3539532978287587, "grad_norm": 0.15078957378864288, "learning_rate": 3.3441666393346167e-07, "logits/chosen": -0.747753918170929, "logits/rejected": NaN, "logps/chosen": -175.546875, "logps/rejected": -450.2124938964844, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.679327368736267, "rewards/margins": 18.8515625, "rewards/rejected": -20.524219512939453, "step": 5370 }, { "epoch": 1.3564743327135789, "grad_norm": 0.11633408814668655, "learning_rate": 3.33725912744088e-07, "logits/chosen": -0.6293060183525085, "logits/rejected": NaN, "logps/chosen": -192.5421905517578, "logps/rejected": -483.1000061035156, "loss": 0.0563, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9471435546875, "rewards/margins": 18.93359375, "rewards/rejected": -21.873437881469727, "step": 5380 }, { "epoch": 1.358995367598399, "grad_norm": 0.7999312877655029, "learning_rate": 3.330344408744992e-07, "logits/chosen": -0.556903064250946, "logits/rejected": NaN, "logps/chosen": -187.64999389648438, "logps/rejected": -451.57501220703125, "loss": 0.0145, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.137451171875, "rewards/margins": 18.057811737060547, "rewards/rejected": -21.19921875, "step": 5390 }, { "epoch": 1.3615164024832194, "grad_norm": 45.47043991088867, "learning_rate": 3.3234225427661697e-07, "logits/chosen": -0.5834075808525085, "logits/rejected": NaN, "logps/chosen": -187.41250610351562, "logps/rejected": -440.17498779296875, "loss": 0.0676, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.6042113304138184, "rewards/margins": 17.75390625, "rewards/rejected": -20.354686737060547, "step": 5400 }, { "epoch": 1.3640374373680395, "grad_norm": 8.535640716552734, "learning_rate": 3.316493589085155e-07, "logits/chosen": -0.640612781047821, "logits/rejected": -0.804309070110321, "logps/chosen": -188.67813110351562, "logps/rejected": -452.8999938964844, "loss": 0.0533, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.816357374191284, "rewards/margins": 18.309961318969727, "rewards/rejected": -21.1328125, "step": 5410 }, { "epoch": 1.3665584722528599, "grad_norm": 48.65213394165039, "learning_rate": 3.3095576073436964e-07, "logits/chosen": -0.579742431640625, "logits/rejected": NaN, "logps/chosen": -189.421875, "logps/rejected": -472.17498779296875, "loss": 0.0224, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7956299781799316, "rewards/margins": 18.146875381469727, "rewards/rejected": -20.946094512939453, "step": 5420 }, { "epoch": 1.36907950713768, "grad_norm": 0.09824220091104507, "learning_rate": 3.3026146572440366e-07, "logits/chosen": -0.638171374797821, "logits/rejected": NaN, "logps/chosen": -177.16561889648438, "logps/rejected": -449.2749938964844, "loss": 0.0642, "rewards/accuracies": 0.984375, "rewards/chosen": -2.4397215843200684, "rewards/margins": 18.557811737060547, "rewards/rejected": -20.99609375, "step": 5430 }, { "epoch": 1.3716005420225001, "grad_norm": 0.136211559176445, "learning_rate": 3.295664798548401e-07, "logits/chosen": -0.6215301752090454, "logits/rejected": NaN, "logps/chosen": -186.18124389648438, "logps/rejected": -470.07501220703125, "loss": 0.055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.51519775390625, "rewards/margins": 18.856250762939453, "rewards/rejected": -21.375, "step": 5440 }, { "epoch": 1.3741215769073205, "grad_norm": 11.681754112243652, "learning_rate": 3.288708091078479e-07, "logits/chosen": -0.7405914068222046, "logits/rejected": -0.8556457757949829, "logps/chosen": -186.05313110351562, "logps/rejected": -468.5375061035156, "loss": 0.0505, "rewards/accuracies": 0.984375, "rewards/chosen": -2.10784912109375, "rewards/margins": 18.296875, "rewards/rejected": -20.3984375, "step": 5450 }, { "epoch": 1.3766426117921406, "grad_norm": 30.3269100189209, "learning_rate": 3.281744594714914e-07, "logits/chosen": -0.688793957233429, "logits/rejected": NaN, "logps/chosen": -204.38827514648438, "logps/rejected": -461.42498779296875, "loss": 0.0766, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.8259398937225342, "rewards/margins": 18.052734375, "rewards/rejected": -19.875782012939453, "step": 5460 }, { "epoch": 1.379163646676961, "grad_norm": 4.838999271392822, "learning_rate": 3.274774369396783e-07, "logits/chosen": -0.670397937297821, "logits/rejected": -0.85906982421875, "logps/chosen": -177.13436889648438, "logps/rejected": -451.82501220703125, "loss": 0.0323, "rewards/accuracies": 0.984375, "rewards/chosen": -1.41070556640625, "rewards/margins": 17.125, "rewards/rejected": -18.53515625, "step": 5470 }, { "epoch": 1.3816846815617811, "grad_norm": 100.23827362060547, "learning_rate": 3.267797475121087e-07, "logits/chosen": -0.736981213092804, "logits/rejected": NaN, "logps/chosen": -184.27188110351562, "logps/rejected": -451.07501220703125, "loss": 0.1024, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.4908874034881592, "rewards/margins": 17.778125762939453, "rewards/rejected": -19.271093368530273, "step": 5480 }, { "epoch": 1.3842057164466013, "grad_norm": 51.60257339477539, "learning_rate": 3.260813971942226e-07, "logits/chosen": -0.739727795124054, "logits/rejected": NaN, "logps/chosen": -191.9250030517578, "logps/rejected": -491.82501220703125, "loss": 0.0996, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.8755767345428467, "rewards/margins": 18.693944931030273, "rewards/rejected": -20.569530487060547, "step": 5490 }, { "epoch": 1.3867267513314214, "grad_norm": 3.7584786415100098, "learning_rate": 3.2538239199714917e-07, "logits/chosen": -0.6392883062362671, "logits/rejected": -0.7017181515693665, "logps/chosen": -176.21249389648438, "logps/rejected": -446.86248779296875, "loss": 0.0706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0320067405700684, "rewards/margins": 18.237890243530273, "rewards/rejected": -20.262500762939453, "step": 5500 }, { "epoch": 1.3892477862162418, "grad_norm": 2.013878107070923, "learning_rate": 3.246827379376542e-07, "logits/chosen": -0.8095794916152954, "logits/rejected": NaN, "logps/chosen": -184.2062530517578, "logps/rejected": -484.0375061035156, "loss": 0.0137, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.634613037109375, "rewards/margins": 17.814062118530273, "rewards/rejected": -18.448436737060547, "step": 5510 }, { "epoch": 1.391768821101062, "grad_norm": 21.074371337890625, "learning_rate": 3.239824410380888e-07, "logits/chosen": -0.644927978515625, "logits/rejected": -0.8379486203193665, "logps/chosen": -163.0703125, "logps/rejected": -430.67498779296875, "loss": 0.0213, "rewards/accuracies": 0.984375, "rewards/chosen": -0.44141846895217896, "rewards/margins": 17.966014862060547, "rewards/rejected": -18.407812118530273, "step": 5520 }, { "epoch": 1.3942898559858823, "grad_norm": 3.520463705062866, "learning_rate": 3.232815073263372e-07, "logits/chosen": -0.7175628542900085, "logits/rejected": NaN, "logps/chosen": -163.11874389648438, "logps/rejected": -458.36248779296875, "loss": 0.0336, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.131860375404358, "rewards/margins": 17.977344512939453, "rewards/rejected": -19.099609375, "step": 5530 }, { "epoch": 1.3968108908707024, "grad_norm": 0.1468203067779541, "learning_rate": 3.225799428357652e-07, "logits/chosen": -0.716381847858429, "logits/rejected": NaN, "logps/chosen": -168.5187530517578, "logps/rejected": -450.0625, "loss": 0.052, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6399109363555908, "rewards/margins": 17.870311737060547, "rewards/rejected": -19.5078125, "step": 5540 }, { "epoch": 1.3993319257555226, "grad_norm": 52.94057083129883, "learning_rate": 3.2187775360516827e-07, "logits/chosen": -0.757458508014679, "logits/rejected": NaN, "logps/chosen": -204.91250610351562, "logps/rejected": -469.70001220703125, "loss": 0.062, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.0960021018981934, "rewards/margins": 18.253124237060547, "rewards/rejected": -20.349218368530273, "step": 5550 }, { "epoch": 1.401852960640343, "grad_norm": 0.5873245596885681, "learning_rate": 3.2117494567871914e-07, "logits/chosen": -0.768817126750946, "logits/rejected": NaN, "logps/chosen": -186.91250610351562, "logps/rejected": -453.9375, "loss": 0.0247, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.10205078125, "rewards/margins": 17.833593368530273, "rewards/rejected": -19.94140625, "step": 5560 }, { "epoch": 1.404373995525163, "grad_norm": 0.6917257308959961, "learning_rate": 3.20471525105916e-07, "logits/chosen": -0.7955581545829773, "logits/rejected": NaN, "logps/chosen": -194.35311889648438, "logps/rejected": -443.25, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2620346546173096, "rewards/margins": 18.544530868530273, "rewards/rejected": -19.80078125, "step": 5570 }, { "epoch": 1.4068950304099834, "grad_norm": 0.08345922827720642, "learning_rate": 3.197674979415308e-07, "logits/chosen": -0.8746185302734375, "logits/rejected": NaN, "logps/chosen": -188.98281860351562, "logps/rejected": -461.1875, "loss": 0.0259, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.066503882408142, "rewards/margins": 17.45703125, "rewards/rejected": -18.524999618530273, "step": 5580 }, { "epoch": 1.4094160652948036, "grad_norm": 0.05125381425023079, "learning_rate": 3.190628702455565e-07, "logits/chosen": -0.7813812494277954, "logits/rejected": -0.8575592041015625, "logps/chosen": -182.578125, "logps/rejected": -449.61248779296875, "loss": 0.0346, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.400299072265625, "rewards/margins": 17.686717987060547, "rewards/rejected": -19.08984375, "step": 5590 }, { "epoch": 1.4119371001796237, "grad_norm": 0.042266443371772766, "learning_rate": 3.183576480831551e-07, "logits/chosen": -0.5748321413993835, "logits/rejected": NaN, "logps/chosen": -179.4875030517578, "logps/rejected": -463.92498779296875, "loss": 0.0646, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.970342993736267, "rewards/margins": 17.603906631469727, "rewards/rejected": -19.575780868530273, "step": 5600 }, { "epoch": 1.4144581350644438, "grad_norm": 8.200540542602539, "learning_rate": 3.17651837524606e-07, "logits/chosen": -0.611492931842804, "logits/rejected": NaN, "logps/chosen": -182.75936889648438, "logps/rejected": -471.9750061035156, "loss": 0.0576, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0624022483825684, "rewards/margins": 18.772655487060547, "rewards/rejected": -20.829687118530273, "step": 5610 }, { "epoch": 1.4169791699492642, "grad_norm": 0.026907438412308693, "learning_rate": 3.1694544464525274e-07, "logits/chosen": -0.6356788873672485, "logits/rejected": NaN, "logps/chosen": -178.5812530517578, "logps/rejected": -466.54998779296875, "loss": 0.0346, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.2338805198669434, "rewards/margins": 18.436717987060547, "rewards/rejected": -20.671875, "step": 5620 }, { "epoch": 1.4195002048340843, "grad_norm": 8.511577606201172, "learning_rate": 3.162384755254517e-07, "logits/chosen": -0.614971935749054, "logits/rejected": NaN, "logps/chosen": -184.24374389648438, "logps/rejected": -467.2749938964844, "loss": 0.025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.923681616783142, "rewards/margins": 19.262500762939453, "rewards/rejected": -21.189062118530273, "step": 5630 }, { "epoch": 1.4220212397189047, "grad_norm": 0.014999392442405224, "learning_rate": 3.155309362505191e-07, "logits/chosen": -0.6339660882949829, "logits/rejected": -0.758056640625, "logps/chosen": -168.6687469482422, "logps/rejected": -442.1000061035156, "loss": 0.0366, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9180572032928467, "rewards/margins": 18.878124237060547, "rewards/rejected": -20.796875, "step": 5640 }, { "epoch": 1.4245422746037248, "grad_norm": 0.3347621560096741, "learning_rate": 3.1482283291067886e-07, "logits/chosen": -0.629559338092804, "logits/rejected": -0.7935577630996704, "logps/chosen": -184.8937530517578, "logps/rejected": -452.2250061035156, "loss": 0.0149, "rewards/accuracies": 0.984375, "rewards/chosen": -2.284423828125, "rewards/margins": 18.400781631469727, "rewards/rejected": -20.686717987060547, "step": 5650 }, { "epoch": 1.427063309488545, "grad_norm": 83.31278991699219, "learning_rate": 3.141141716010101e-07, "logits/chosen": -0.6112915277481079, "logits/rejected": NaN, "logps/chosen": -175.20938110351562, "logps/rejected": -463.25, "loss": 0.061, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.649571180343628, "rewards/margins": 19.163280487060547, "rewards/rejected": -20.8046875, "step": 5660 }, { "epoch": 1.4295843443733653, "grad_norm": 0.09786761552095413, "learning_rate": 3.134049584213949e-07, "logits/chosen": -0.685528576374054, "logits/rejected": NaN, "logps/chosen": -179.82656860351562, "logps/rejected": -501.3500061035156, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5926086902618408, "rewards/margins": 19.038280487060547, "rewards/rejected": -20.634374618530273, "step": 5670 }, { "epoch": 1.4321053792581855, "grad_norm": 51.414207458496094, "learning_rate": 3.1269519947646534e-07, "logits/chosen": -0.7231048345565796, "logits/rejected": NaN, "logps/chosen": -168.68905639648438, "logps/rejected": -470.29998779296875, "loss": 0.035, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.1681060791015625, "rewards/margins": 19.107812881469727, "rewards/rejected": -20.283594131469727, "step": 5680 }, { "epoch": 1.4346264141430056, "grad_norm": 0.023043908178806305, "learning_rate": 3.119849008755515e-07, "logits/chosen": -0.735308825969696, "logits/rejected": NaN, "logps/chosen": -191.7375030517578, "logps/rejected": -483.9624938964844, "loss": 0.0375, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.094171166419983, "rewards/margins": 20.107030868530273, "rewards/rejected": -21.198436737060547, "step": 5690 }, { "epoch": 1.437147449027826, "grad_norm": 4.010029315948486, "learning_rate": 3.112740687326286e-07, "logits/chosen": -0.5741821527481079, "logits/rejected": NaN, "logps/chosen": -170.4031219482422, "logps/rejected": -444.2875061035156, "loss": 0.0283, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.45367431640625, "rewards/margins": 18.207813262939453, "rewards/rejected": -19.65234375, "step": 5700 }, { "epoch": 1.4396684839126461, "grad_norm": 16.32939910888672, "learning_rate": 3.105627091662641e-07, "logits/chosen": -0.5653244256973267, "logits/rejected": -0.7067199945449829, "logps/chosen": -162.4656219482422, "logps/rejected": -462.26251220703125, "loss": 0.0538, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0947508811950684, "rewards/margins": 18.944530487060547, "rewards/rejected": -21.040624618530273, "step": 5710 }, { "epoch": 1.4421895187974663, "grad_norm": 24.02434730529785, "learning_rate": 3.098508282995657e-07, "logits/chosen": -0.5919780731201172, "logits/rejected": NaN, "logps/chosen": -189.6843719482422, "logps/rejected": -485.6875, "loss": 0.0238, "rewards/accuracies": 0.984375, "rewards/chosen": -1.733673095703125, "rewards/margins": 19.244140625, "rewards/rejected": -20.979686737060547, "step": 5720 }, { "epoch": 1.4447105536822866, "grad_norm": 3.0828983783721924, "learning_rate": 3.091384322601279e-07, "logits/chosen": -0.5786922574043274, "logits/rejected": -0.806353747844696, "logps/chosen": -199.27499389648438, "logps/rejected": -488.5874938964844, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.869329810142517, "rewards/margins": 18.319530487060547, "rewards/rejected": -20.200000762939453, "step": 5730 }, { "epoch": 1.4472315885671068, "grad_norm": 27.968093872070312, "learning_rate": 3.0842552717998e-07, "logits/chosen": -0.49460142850875854, "logits/rejected": NaN, "logps/chosen": -176.88125610351562, "logps/rejected": -481.625, "loss": 0.0448, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.896270751953125, "rewards/margins": 18.928125381469727, "rewards/rejected": -20.819530487060547, "step": 5740 }, { "epoch": 1.4497526234519271, "grad_norm": 0.8847183585166931, "learning_rate": 3.077121191955324e-07, "logits/chosen": -0.563830554485321, "logits/rejected": NaN, "logps/chosen": -189.97811889648438, "logps/rejected": -470.3999938964844, "loss": 0.0637, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0958893299102783, "rewards/margins": 18.637500762939453, "rewards/rejected": -20.728906631469727, "step": 5750 }, { "epoch": 1.4522736583367473, "grad_norm": 2.53399658203125, "learning_rate": 3.0699821444752484e-07, "logits/chosen": -0.6556243896484375, "logits/rejected": -0.8245605230331421, "logps/chosen": -191.5812530517578, "logps/rejected": -470.0625, "loss": 0.0318, "rewards/accuracies": 0.984375, "rewards/chosen": -2.0931763648986816, "rewards/margins": 18.502344131469727, "rewards/rejected": -20.59765625, "step": 5760 }, { "epoch": 1.4547946932215674, "grad_norm": 52.675601959228516, "learning_rate": 3.062838190809727e-07, "logits/chosen": -0.6445373296737671, "logits/rejected": NaN, "logps/chosen": -202.0031280517578, "logps/rejected": -477.67498779296875, "loss": 0.0795, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.971105933189392, "rewards/margins": 18.785547256469727, "rewards/rejected": -20.762500762939453, "step": 5770 }, { "epoch": 1.4573157281063875, "grad_norm": 0.35835254192352295, "learning_rate": 3.055689392451144e-07, "logits/chosen": -0.41037291288375854, "logits/rejected": NaN, "logps/chosen": -189.69686889648438, "logps/rejected": -469.36248779296875, "loss": 0.0092, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.052606225013733, "rewards/margins": 19.864063262939453, "rewards/rejected": -20.91015625, "step": 5780 }, { "epoch": 1.459836762991208, "grad_norm": 13.983587265014648, "learning_rate": 3.0485358109335875e-07, "logits/chosen": -0.662188708782196, "logits/rejected": NaN, "logps/chosen": -183.50936889648438, "logps/rejected": -459.4375, "loss": 0.0347, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.132128953933716, "rewards/margins": 18.952342987060547, "rewards/rejected": -21.084375381469727, "step": 5790 }, { "epoch": 1.462357797876028, "grad_norm": 0.7630747556686401, "learning_rate": 3.041377507832313e-07, "logits/chosen": -0.7520607113838196, "logits/rejected": -0.8930755853652954, "logps/chosen": -198.8312530517578, "logps/rejected": -505.875, "loss": 0.0424, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.343328833580017, "rewards/margins": 20.614063262939453, "rewards/rejected": -21.957813262939453, "step": 5800 }, { "epoch": 1.4648788327608484, "grad_norm": 0.42671069502830505, "learning_rate": 3.034214544763223e-07, "logits/chosen": -0.698455810546875, "logits/rejected": -0.819079577922821, "logps/chosen": -174.39999389648438, "logps/rejected": -458.5874938964844, "loss": 0.0292, "rewards/accuracies": 0.984375, "rewards/chosen": -0.787127673625946, "rewards/margins": 19.611328125, "rewards/rejected": -20.399219512939453, "step": 5810 }, { "epoch": 1.4673998676456685, "grad_norm": 170.15286254882812, "learning_rate": 3.0270469833823246e-07, "logits/chosen": -0.641461193561554, "logits/rejected": NaN, "logps/chosen": -157.0812530517578, "logps/rejected": -430.95001220703125, "loss": 0.0424, "rewards/accuracies": 0.984375, "rewards/chosen": -1.4701416492462158, "rewards/margins": 19.125391006469727, "rewards/rejected": -20.598438262939453, "step": 5820 }, { "epoch": 1.4699209025304887, "grad_norm": 2.709918260574341, "learning_rate": 3.019874885385211e-07, "logits/chosen": -0.7296905517578125, "logits/rejected": NaN, "logps/chosen": -188.5656280517578, "logps/rejected": -473.92498779296875, "loss": 0.0538, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5839111804962158, "rewards/margins": 19.225391387939453, "rewards/rejected": -20.810155868530273, "step": 5830 }, { "epoch": 1.472441937415309, "grad_norm": 0.7835653424263, "learning_rate": 3.012698312506523e-07, "logits/chosen": -0.7591491937637329, "logits/rejected": NaN, "logps/chosen": -189.4015655517578, "logps/rejected": -493.1625061035156, "loss": 0.0699, "rewards/accuracies": 0.984375, "rewards/chosen": -0.9632781744003296, "rewards/margins": 19.246875762939453, "rewards/rejected": -20.214061737060547, "step": 5840 }, { "epoch": 1.4749629723001292, "grad_norm": 82.73615264892578, "learning_rate": 3.0055173265194184e-07, "logits/chosen": -0.643890380859375, "logits/rejected": -0.7097107172012329, "logps/chosen": -157.96875, "logps/rejected": -410.63751220703125, "loss": 0.1139, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0188019275665283, "rewards/margins": 17.049219131469727, "rewards/rejected": -18.062891006469727, "step": 5850 }, { "epoch": 1.4774840071849495, "grad_norm": 3.155005931854248, "learning_rate": 2.998331989235042e-07, "logits/chosen": -0.6026641726493835, "logits/rejected": -0.7734233736991882, "logps/chosen": -170.5625, "logps/rejected": -461.3125, "loss": 0.0578, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.907238781452179, "rewards/margins": 18.430469512939453, "rewards/rejected": -19.333593368530273, "step": 5860 }, { "epoch": 1.4800050420697697, "grad_norm": 2.321043014526367, "learning_rate": 2.991142362501994e-07, "logits/chosen": -0.847851574420929, "logits/rejected": NaN, "logps/chosen": -175.3359375, "logps/rejected": -461.79998779296875, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.714569091796875, "rewards/margins": 17.739452362060547, "rewards/rejected": -18.4453125, "step": 5870 }, { "epoch": 1.4825260769545898, "grad_norm": 0.20635591447353363, "learning_rate": 2.9839485082057945e-07, "logits/chosen": -0.7409011721611023, "logits/rejected": NaN, "logps/chosen": -167.72811889648438, "logps/rejected": -451.3500061035156, "loss": 0.0298, "rewards/accuracies": 0.984375, "rewards/chosen": -0.850390613079071, "rewards/margins": 18.15625, "rewards/rejected": -19.021093368530273, "step": 5880 }, { "epoch": 1.48504711183941, "grad_norm": 0.1218407079577446, "learning_rate": 2.976750488268355e-07, "logits/chosen": -0.7181396484375, "logits/rejected": NaN, "logps/chosen": -172.3625030517578, "logps/rejected": -472.82501220703125, "loss": 0.1076, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.748089611530304, "rewards/margins": 18.299219131469727, "rewards/rejected": -19.056249618530273, "step": 5890 }, { "epoch": 1.4875681467242303, "grad_norm": 0.008219333365559578, "learning_rate": 2.96954836464744e-07, "logits/chosen": -0.7714568972587585, "logits/rejected": NaN, "logps/chosen": -162.14999389648438, "logps/rejected": -477.82501220703125, "loss": 0.0072, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7599426507949829, "rewards/margins": 19.932422637939453, "rewards/rejected": -20.685155868530273, "step": 5900 }, { "epoch": 1.4900891816090505, "grad_norm": 0.24571184813976288, "learning_rate": 2.9623421993361407e-07, "logits/chosen": -0.7967529296875, "logits/rejected": NaN, "logps/chosen": -178.1593780517578, "logps/rejected": -481.1000061035156, "loss": 0.0081, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0583312511444092, "rewards/margins": 19.78515625, "rewards/rejected": -20.833593368530273, "step": 5910 }, { "epoch": 1.4926102164938708, "grad_norm": 4.207306861877441, "learning_rate": 2.955132054362335e-07, "logits/chosen": -0.6488403081893921, "logits/rejected": NaN, "logps/chosen": -188.671875, "logps/rejected": -459.79998779296875, "loss": 0.0263, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.4127686023712158, "rewards/margins": 18.094532012939453, "rewards/rejected": -19.5234375, "step": 5920 }, { "epoch": 1.495131251378691, "grad_norm": 52.68553161621094, "learning_rate": 2.9479179917881593e-07, "logits/chosen": -0.684918224811554, "logits/rejected": NaN, "logps/chosen": -184.63436889648438, "logps/rejected": -488.3374938964844, "loss": 0.0441, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.547082543373108, "rewards/margins": 19.548437118530273, "rewards/rejected": -21.09765625, "step": 5930 }, { "epoch": 1.497652286263511, "grad_norm": 0.8104216456413269, "learning_rate": 2.9407000737094655e-07, "logits/chosen": -0.751416027545929, "logits/rejected": NaN, "logps/chosen": -187.5710906982422, "logps/rejected": -455.2124938964844, "loss": 0.067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.2852599620819092, "rewards/margins": 19.024219512939453, "rewards/rejected": -20.30859375, "step": 5940 }, { "epoch": 1.5001733211483312, "grad_norm": 0.2608201205730438, "learning_rate": 2.9334783622552983e-07, "logits/chosen": -0.717791736125946, "logits/rejected": -0.8227500915527344, "logps/chosen": -176.58438110351562, "logps/rejected": -475.0249938964844, "loss": 0.0221, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.183624267578125, "rewards/margins": 19.016407012939453, "rewards/rejected": -20.205469131469727, "step": 5950 }, { "epoch": 1.5026943560331516, "grad_norm": 0.07467667758464813, "learning_rate": 2.9262529195873506e-07, "logits/chosen": -0.7648879885673523, "logits/rejected": NaN, "logps/chosen": -183.0593719482422, "logps/rejected": -467.76251220703125, "loss": 0.0934, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8779723644256592, "rewards/margins": 19.253124237060547, "rewards/rejected": -21.137500762939453, "step": 5960 }, { "epoch": 1.505215390917972, "grad_norm": 0.012220881879329681, "learning_rate": 2.9190238078994326e-07, "logits/chosen": -0.666455090045929, "logits/rejected": NaN, "logps/chosen": -190.72811889648438, "logps/rejected": -442.3374938964844, "loss": 0.0626, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9583740234375, "rewards/margins": 18.516407012939453, "rewards/rejected": -20.48046875, "step": 5970 }, { "epoch": 1.507736425802792, "grad_norm": 58.802242279052734, "learning_rate": 2.911791089416938e-07, "logits/chosen": -0.7513839602470398, "logits/rejected": -0.8649047613143921, "logps/chosen": -176.03750610351562, "logps/rejected": -456.7749938964844, "loss": 0.0807, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.0760436058044434, "rewards/margins": 18.671875, "rewards/rejected": -20.745311737060547, "step": 5980 }, { "epoch": 1.5102574606876122, "grad_norm": 0.16032403707504272, "learning_rate": 2.904554826396304e-07, "logits/chosen": -0.675311267375946, "logits/rejected": -0.7901763916015625, "logps/chosen": -181.0968780517578, "logps/rejected": -471.25, "loss": 0.0489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.277636766433716, "rewards/margins": 18.046092987060547, "rewards/rejected": -20.328907012939453, "step": 5990 }, { "epoch": 1.5127784955724324, "grad_norm": 8.040952682495117, "learning_rate": 2.89731508112448e-07, "logits/chosen": -0.7001556158065796, "logits/rejected": NaN, "logps/chosen": -181.62344360351562, "logps/rejected": -448.38751220703125, "loss": 0.0264, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.566857933998108, "rewards/margins": 17.814062118530273, "rewards/rejected": -19.382030487060547, "step": 6000 }, { "epoch": 1.5152995304572527, "grad_norm": 0.3535599410533905, "learning_rate": 2.890071915918387e-07, "logits/chosen": -0.7514801025390625, "logits/rejected": NaN, "logps/chosen": -178.88671875, "logps/rejected": -453.75, "loss": 0.0276, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.9452056884765625, "rewards/margins": 18.060937881469727, "rewards/rejected": -18.998046875, "step": 6010 }, { "epoch": 1.5178205653420729, "grad_norm": 0.08801557868719101, "learning_rate": 2.8828253931243846e-07, "logits/chosen": -0.6727844476699829, "logits/rejected": NaN, "logps/chosen": -168.33749389648438, "logps/rejected": -435.375, "loss": 0.0274, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5053741931915283, "rewards/margins": 17.995311737060547, "rewards/rejected": -19.512500762939453, "step": 6020 }, { "epoch": 1.5203416002268932, "grad_norm": 3.0909578800201416, "learning_rate": 2.8755755751177333e-07, "logits/chosen": -0.582049548625946, "logits/rejected": NaN, "logps/chosen": -185.9875030517578, "logps/rejected": -469.42498779296875, "loss": 0.0657, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.0116944313049316, "rewards/margins": 18.745311737060547, "rewards/rejected": -20.746875762939453, "step": 6030 }, { "epoch": 1.5228626351117134, "grad_norm": 0.011192746460437775, "learning_rate": 2.8683225243020576e-07, "logits/chosen": -0.6610183715820312, "logits/rejected": NaN, "logps/chosen": -188.328125, "logps/rejected": -466.75, "loss": 0.0217, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.7321808338165283, "rewards/margins": 18.3203125, "rewards/rejected": -20.053905487060547, "step": 6040 }, { "epoch": 1.5253836699965335, "grad_norm": 0.6133855581283569, "learning_rate": 2.861066303108808e-07, "logits/chosen": -0.719982922077179, "logits/rejected": NaN, "logps/chosen": -181.41561889648438, "logps/rejected": -474.375, "loss": 0.0174, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0392518043518066, "rewards/margins": 18.616405487060547, "rewards/rejected": -20.659374237060547, "step": 6050 }, { "epoch": 1.5279047048813537, "grad_norm": 44.381919860839844, "learning_rate": 2.8538069739967257e-07, "logits/chosen": -0.621020495891571, "logits/rejected": NaN, "logps/chosen": -189.328125, "logps/rejected": -470.01251220703125, "loss": 0.0566, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.5277099609375, "rewards/margins": 19.309375762939453, "rewards/rejected": -21.844532012939453, "step": 6060 }, { "epoch": 1.530425739766174, "grad_norm": 1.4036754369735718, "learning_rate": 2.8465445994513024e-07, "logits/chosen": -0.495025634765625, "logits/rejected": -0.6875442266464233, "logps/chosen": -191.33749389648438, "logps/rejected": -456.17498779296875, "loss": 0.0829, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.093994140625, "rewards/margins": 18.05859375, "rewards/rejected": -21.159374237060547, "step": 6070 }, { "epoch": 1.5329467746509944, "grad_norm": 88.67369079589844, "learning_rate": 2.8392792419842447e-07, "logits/chosen": -0.6396576166152954, "logits/rejected": -0.858673095703125, "logps/chosen": -207.7624969482422, "logps/rejected": -469.82501220703125, "loss": 0.0528, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0166258811950684, "rewards/margins": 18.125, "rewards/rejected": -21.145313262939453, "step": 6080 }, { "epoch": 1.5354678095358145, "grad_norm": 20.665836334228516, "learning_rate": 2.832010964132934e-07, "logits/chosen": -0.569183349609375, "logits/rejected": -0.752911388874054, "logps/chosen": -182.3312530517578, "logps/rejected": -468.7875061035156, "loss": 0.0564, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.2418150901794434, "rewards/margins": 18.728515625, "rewards/rejected": -20.974218368530273, "step": 6090 }, { "epoch": 1.5379888444206347, "grad_norm": 1.7503215074539185, "learning_rate": 2.82473982845989e-07, "logits/chosen": -0.49421387910842896, "logits/rejected": NaN, "logps/chosen": -176.89688110351562, "logps/rejected": -453.4750061035156, "loss": 0.0327, "rewards/accuracies": 0.984375, "rewards/chosen": -2.2040343284606934, "rewards/margins": 18.246875762939453, "rewards/rejected": -20.446094512939453, "step": 6100 }, { "epoch": 1.5405098793054548, "grad_norm": 1.8968802690505981, "learning_rate": 2.8174658975522305e-07, "logits/chosen": -0.59796142578125, "logits/rejected": NaN, "logps/chosen": -190.1906280517578, "logps/rejected": -439.23748779296875, "loss": 0.0686, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0533690452575684, "rewards/margins": 18.075780868530273, "rewards/rejected": -20.124217987060547, "step": 6110 }, { "epoch": 1.5430309141902752, "grad_norm": 0.416347473859787, "learning_rate": 2.810189234021135e-07, "logits/chosen": -0.5477234125137329, "logits/rejected": NaN, "logps/chosen": -201.83749389648438, "logps/rejected": -475.5249938964844, "loss": 0.0355, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9830079078674316, "rewards/margins": 17.330860137939453, "rewards/rejected": -20.31640625, "step": 6120 }, { "epoch": 1.5455519490750953, "grad_norm": 0.3686789870262146, "learning_rate": 2.802909900501304e-07, "logits/chosen": -0.6567016839981079, "logits/rejected": NaN, "logps/chosen": -176.3781280517578, "logps/rejected": -449.9125061035156, "loss": 0.0247, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.505566358566284, "rewards/margins": 16.622264862060547, "rewards/rejected": -19.121875762939453, "step": 6130 }, { "epoch": 1.5480729839599157, "grad_norm": 66.93819427490234, "learning_rate": 2.7956279596504197e-07, "logits/chosen": -0.673065185546875, "logits/rejected": NaN, "logps/chosen": -192.14999389648438, "logps/rejected": -473.45001220703125, "loss": 0.0903, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.91009521484375, "rewards/margins": 17.532812118530273, "rewards/rejected": -20.434375762939453, "step": 6140 }, { "epoch": 1.5505940188447358, "grad_norm": 17.657838821411133, "learning_rate": 2.7883434741486065e-07, "logits/chosen": -0.5307372808456421, "logits/rejected": -0.679608166217804, "logps/chosen": -176.8468780517578, "logps/rejected": -469.5375061035156, "loss": 0.0597, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.2745361328125, "rewards/margins": 18.021093368530273, "rewards/rejected": -20.296875, "step": 6150 }, { "epoch": 1.553115053729556, "grad_norm": 0.16777978837490082, "learning_rate": 2.7810565066978944e-07, "logits/chosen": -0.6759368777275085, "logits/rejected": NaN, "logps/chosen": -185.96249389648438, "logps/rejected": -458.5625, "loss": 0.0247, "rewards/accuracies": 0.984375, "rewards/chosen": -2.2616515159606934, "rewards/margins": 17.666406631469727, "rewards/rejected": -19.921092987060547, "step": 6160 }, { "epoch": 1.555636088614376, "grad_norm": 65.296142578125, "learning_rate": 2.7737671200216745e-07, "logits/chosen": -0.7854217290878296, "logits/rejected": -0.9262145757675171, "logps/chosen": -188.0906219482422, "logps/rejected": -455.75, "loss": 0.1595, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.9907715320587158, "rewards/margins": 17.985157012939453, "rewards/rejected": -19.974218368530273, "step": 6170 }, { "epoch": 1.5581571234991964, "grad_norm": 70.2752456665039, "learning_rate": 2.766475376864163e-07, "logits/chosen": -0.6931701898574829, "logits/rejected": -0.808361828327179, "logps/chosen": -178.50625610351562, "logps/rejected": -455.79998779296875, "loss": 0.0537, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0672850608825684, "rewards/margins": 17.951562881469727, "rewards/rejected": -20.013280868530273, "step": 6180 }, { "epoch": 1.5606781583840168, "grad_norm": 6.826298713684082, "learning_rate": 2.75918133998986e-07, "logits/chosen": -0.73663330078125, "logits/rejected": -0.950671374797821, "logps/chosen": -179.1750030517578, "logps/rejected": -497.5, "loss": 0.0266, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7587769031524658, "rewards/margins": 18.385156631469727, "rewards/rejected": -20.146875381469727, "step": 6190 }, { "epoch": 1.563199193268837, "grad_norm": 41.797691345214844, "learning_rate": 2.751885072183009e-07, "logits/chosen": -0.853594958782196, "logits/rejected": -0.900164783000946, "logps/chosen": -177.14688110351562, "logps/rejected": -431.3500061035156, "loss": 0.0361, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5211913585662842, "rewards/margins": 18.095312118530273, "rewards/rejected": -19.623437881469727, "step": 6200 }, { "epoch": 1.565720228153657, "grad_norm": 1.8181588649749756, "learning_rate": 2.744586636247056e-07, "logits/chosen": -0.638354480266571, "logits/rejected": -0.8666321039199829, "logps/chosen": -159.64218139648438, "logps/rejected": -477.375, "loss": 0.0223, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.913647472858429, "rewards/margins": 18.747655868530273, "rewards/rejected": -19.664844512939453, "step": 6210 }, { "epoch": 1.5682412630384772, "grad_norm": 0.01200844720005989, "learning_rate": 2.7372860950041085e-07, "logits/chosen": -0.7231353521347046, "logits/rejected": -0.813427746295929, "logps/chosen": -185.9812469482422, "logps/rejected": -460.17498779296875, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.361700415611267, "rewards/margins": 17.887500762939453, "rewards/rejected": -19.24609375, "step": 6220 }, { "epoch": 1.5707622979232974, "grad_norm": 1.388627290725708, "learning_rate": 2.7299835112943984e-07, "logits/chosen": -0.811053454875946, "logits/rejected": NaN, "logps/chosen": -177.69686889648438, "logps/rejected": -458.875, "loss": 0.0754, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8188598155975342, "rewards/margins": 18.542186737060547, "rewards/rejected": -20.362499237060547, "step": 6230 }, { "epoch": 1.5732833328081177, "grad_norm": 6.00600528717041, "learning_rate": 2.7226789479757355e-07, "logits/chosen": -0.705090343952179, "logits/rejected": -0.9579101800918579, "logps/chosen": -158.59530639648438, "logps/rejected": -474.1000061035156, "loss": 0.0562, "rewards/accuracies": 0.984375, "rewards/chosen": -0.677014172077179, "rewards/margins": 18.135936737060547, "rewards/rejected": -18.822656631469727, "step": 6240 }, { "epoch": 1.575804367692938, "grad_norm": 2.14218807220459, "learning_rate": 2.7153724679229707e-07, "logits/chosen": -0.8020919561386108, "logits/rejected": NaN, "logps/chosen": -172.1171875, "logps/rejected": -459.98748779296875, "loss": 0.0107, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.940631091594696, "rewards/margins": 17.967967987060547, "rewards/rejected": -18.913280487060547, "step": 6250 }, { "epoch": 1.5783254025777582, "grad_norm": 37.78008270263672, "learning_rate": 2.7080641340274536e-07, "logits/chosen": -0.7461029291152954, "logits/rejected": NaN, "logps/chosen": -158.48281860351562, "logps/rejected": -419.26251220703125, "loss": 0.0366, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.032780408859253, "rewards/margins": 17.6875, "rewards/rejected": -18.725000381469727, "step": 6260 }, { "epoch": 1.5808464374625784, "grad_norm": 47.56843948364258, "learning_rate": 2.70075400919649e-07, "logits/chosen": -0.7171657681465149, "logits/rejected": -0.809344470500946, "logps/chosen": -185.5124969482422, "logps/rejected": -501.1499938964844, "loss": 0.0141, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.534185767173767, "rewards/margins": 19.935155868530273, "rewards/rejected": -21.471874237060547, "step": 6270 }, { "epoch": 1.5833674723473985, "grad_norm": 84.2939453125, "learning_rate": 2.6934421563528037e-07, "logits/chosen": -0.689166247844696, "logits/rejected": -0.876208484172821, "logps/chosen": -165.6875, "logps/rejected": -459.07501220703125, "loss": 0.0736, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.7718994617462158, "rewards/margins": 18.842187881469727, "rewards/rejected": -20.6171875, "step": 6280 }, { "epoch": 1.5858885072322189, "grad_norm": 16.786128997802734, "learning_rate": 2.6861286384339884e-07, "logits/chosen": -0.676196277141571, "logits/rejected": NaN, "logps/chosen": -170.4250030517578, "logps/rejected": -455.07501220703125, "loss": 0.0425, "rewards/accuracies": 0.984375, "rewards/chosen": -1.46588134765625, "rewards/margins": 18.87109375, "rewards/rejected": -20.342187881469727, "step": 6290 }, { "epoch": 1.588409542117039, "grad_norm": 1.9659384489059448, "learning_rate": 2.6788135183919743e-07, "logits/chosen": NaN, "logits/rejected": -0.8734039068222046, "logps/chosen": -190.22500610351562, "logps/rejected": -486.375, "loss": 0.0914, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.3184082508087158, "rewards/margins": 19.87890625, "rewards/rejected": -21.196874618530273, "step": 6300 }, { "epoch": 1.5909305770018594, "grad_norm": 0.9574471712112427, "learning_rate": 2.671496859192479e-07, "logits/chosen": -0.7553955316543579, "logits/rejected": -0.781140148639679, "logps/chosen": -186.4562530517578, "logps/rejected": -477.54998779296875, "loss": 0.0843, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8253662586212158, "rewards/margins": 19.424999237060547, "rewards/rejected": -21.253124237060547, "step": 6310 }, { "epoch": 1.5934516118866795, "grad_norm": 0.002664657309651375, "learning_rate": 2.6641787238144703e-07, "logits/chosen": -0.775103747844696, "logits/rejected": NaN, "logps/chosen": -180.2937469482422, "logps/rejected": -470.38751220703125, "loss": 0.1059, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9110596179962158, "rewards/margins": 18.891407012939453, "rewards/rejected": -20.801563262939453, "step": 6320 }, { "epoch": 1.5959726467714996, "grad_norm": 9.260642051696777, "learning_rate": 2.656859175249622e-07, "logits/chosen": -0.812908947467804, "logits/rejected": NaN, "logps/chosen": -185.6843719482422, "logps/rejected": -450.1000061035156, "loss": 0.0305, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.498968482017517, "rewards/margins": 17.683202743530273, "rewards/rejected": -19.18359375, "step": 6330 }, { "epoch": 1.5984936816563198, "grad_norm": 31.012985229492188, "learning_rate": 2.6495382765017726e-07, "logits/chosen": -0.5672820806503296, "logits/rejected": NaN, "logps/chosen": -161.6062469482422, "logps/rejected": -449.7124938964844, "loss": 0.0165, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.5901336669921875, "rewards/margins": 18.303905487060547, "rewards/rejected": -19.883594512939453, "step": 6340 }, { "epoch": 1.6010147165411401, "grad_norm": 4.527191638946533, "learning_rate": 2.6422160905863816e-07, "logits/chosen": -0.725848376750946, "logits/rejected": NaN, "logps/chosen": -182.70938110351562, "logps/rejected": -457.5625, "loss": 0.0331, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.45343017578125, "rewards/margins": 17.230859756469727, "rewards/rejected": -18.688282012939453, "step": 6350 }, { "epoch": 1.6035357514259605, "grad_norm": 15.601044654846191, "learning_rate": 2.634892680529988e-07, "logits/chosen": -0.6796966791152954, "logits/rejected": -0.8301563262939453, "logps/chosen": -168.71249389648438, "logps/rejected": -444.4375, "loss": 0.0447, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.6978088617324829, "rewards/margins": 18.0, "rewards/rejected": -18.700780868530273, "step": 6360 }, { "epoch": 1.6060567863107806, "grad_norm": 0.18399210274219513, "learning_rate": 2.627568109369668e-07, "logits/chosen": -0.7406005859375, "logits/rejected": -0.7659759521484375, "logps/chosen": -178.9140625, "logps/rejected": -470.82501220703125, "loss": 0.013, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.66302490234375, "rewards/margins": 19.61328125, "rewards/rejected": -20.271093368530273, "step": 6370 }, { "epoch": 1.6085778211956008, "grad_norm": 1.3337942361831665, "learning_rate": 2.6202424401524914e-07, "logits/chosen": -0.669384777545929, "logits/rejected": NaN, "logps/chosen": -159.52499389648438, "logps/rejected": -452.13751220703125, "loss": 0.0457, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1685912609100342, "rewards/margins": 18.718358993530273, "rewards/rejected": -19.893749237060547, "step": 6380 }, { "epoch": 1.611098856080421, "grad_norm": 2.778611183166504, "learning_rate": 2.6129157359349806e-07, "logits/chosen": -0.7060546875, "logits/rejected": -0.7804778814315796, "logps/chosen": -184.5500030517578, "logps/rejected": -462.54998779296875, "loss": 0.0193, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3743102550506592, "rewards/margins": 19.071094512939453, "rewards/rejected": -20.439062118530273, "step": 6390 }, { "epoch": 1.6136198909652413, "grad_norm": 6.017164707183838, "learning_rate": 2.605588059782567e-07, "logits/chosen": -0.742993175983429, "logits/rejected": -0.8419555425643921, "logps/chosen": -198.8874969482422, "logps/rejected": -469.26251220703125, "loss": 0.0462, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1143341064453125, "rewards/margins": 17.717187881469727, "rewards/rejected": -19.838281631469727, "step": 6400 }, { "epoch": 1.6161409258500614, "grad_norm": 4.22816276550293, "learning_rate": 2.5982594747690483e-07, "logits/chosen": -0.8320159912109375, "logits/rejected": NaN, "logps/chosen": -179.6875, "logps/rejected": -464.9624938964844, "loss": 0.0328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.304223656654358, "rewards/margins": 18.757030487060547, "rewards/rejected": -20.073436737060547, "step": 6410 }, { "epoch": 1.6186619607348818, "grad_norm": 0.8588093519210815, "learning_rate": 2.590930043976044e-07, "logits/chosen": -0.7221771478652954, "logits/rejected": NaN, "logps/chosen": -176.2312469482422, "logps/rejected": -479.9624938964844, "loss": 0.0768, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.2497879266738892, "rewards/margins": 19.174999237060547, "rewards/rejected": -20.430469512939453, "step": 6420 }, { "epoch": 1.621182995619702, "grad_norm": 0.06687968224287033, "learning_rate": 2.583599830492453e-07, "logits/chosen": -0.587677001953125, "logits/rejected": -0.796582043170929, "logps/chosen": -174.14218139648438, "logps/rejected": -464.1000061035156, "loss": 0.02, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.480065941810608, "rewards/margins": 18.73046875, "rewards/rejected": -20.217187881469727, "step": 6430 }, { "epoch": 1.623704030504522, "grad_norm": 0.4646073281764984, "learning_rate": 2.576268897413916e-07, "logits/chosen": -0.6786133050918579, "logits/rejected": NaN, "logps/chosen": -186.046875, "logps/rejected": -465.625, "loss": 0.0945, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9458404779434204, "rewards/margins": 18.587499618530273, "rewards/rejected": -20.535938262939453, "step": 6440 }, { "epoch": 1.6262250653893422, "grad_norm": 15.430932998657227, "learning_rate": 2.5689373078422603e-07, "logits/chosen": -0.6741363406181335, "logits/rejected": NaN, "logps/chosen": -184.8937530517578, "logps/rejected": -484.8999938964844, "loss": 0.0241, "rewards/accuracies": 0.984375, "rewards/chosen": -1.7551758289337158, "rewards/margins": 20.016407012939453, "rewards/rejected": -21.770313262939453, "step": 6450 }, { "epoch": 1.6287461002741626, "grad_norm": 0.302950382232666, "learning_rate": 2.5616051248849707e-07, "logits/chosen": -0.6902496218681335, "logits/rejected": NaN, "logps/chosen": -174.96875, "logps/rejected": -466.7124938964844, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.798162817955017, "rewards/margins": 19.590625762939453, "rewards/rejected": -21.389842987060547, "step": 6460 }, { "epoch": 1.631267135158983, "grad_norm": 0.06844989210367203, "learning_rate": 2.5542724116546365e-07, "logits/chosen": -0.670214831829071, "logits/rejected": NaN, "logps/chosen": -208.05624389648438, "logps/rejected": -484.79998779296875, "loss": 0.081, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.679670810699463, "rewards/margins": 19.006250381469727, "rewards/rejected": -21.696094512939453, "step": 6470 }, { "epoch": 1.633788170043803, "grad_norm": 57.29985046386719, "learning_rate": 2.5469392312684123e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.28750610351562, "logps/rejected": -478.2749938964844, "loss": 0.0407, "rewards/accuracies": 0.984375, "rewards/chosen": -1.671289086341858, "rewards/margins": 19.548437118530273, "rewards/rejected": -21.220312118530273, "step": 6480 }, { "epoch": 1.6363092049286232, "grad_norm": 12.487762451171875, "learning_rate": 2.539605646847473e-07, "logits/chosen": -0.814282238483429, "logits/rejected": -0.8914794921875, "logps/chosen": -205.9812469482422, "logps/rejected": -475.7749938964844, "loss": 0.1269, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.858483910560608, "rewards/margins": 18.947656631469727, "rewards/rejected": -20.796092987060547, "step": 6490 }, { "epoch": 1.6388302398134433, "grad_norm": 7.739950656890869, "learning_rate": 2.532271721516472e-07, "logits/chosen": -0.803668200969696, "logits/rejected": -0.807110607624054, "logps/chosen": -190.39999389648438, "logps/rejected": -468.8500061035156, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.786328136920929, "rewards/margins": 17.427343368530273, "rewards/rejected": -18.21875, "step": 6500 }, { "epoch": 1.6413512746982635, "grad_norm": 4.8918914794921875, "learning_rate": 2.524937518402997e-07, "logits/chosen": -0.6752563714981079, "logits/rejected": NaN, "logps/chosen": -187.8718719482422, "logps/rejected": -453.9375, "loss": 0.0344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9451659917831421, "rewards/margins": 16.935546875, "rewards/rejected": -17.87890625, "step": 6510 }, { "epoch": 1.6438723095830838, "grad_norm": 25.61776351928711, "learning_rate": 2.5176031006370253e-07, "logits/chosen": -0.6820937991142273, "logits/rejected": NaN, "logps/chosen": -175.85000610351562, "logps/rejected": -444.8500061035156, "loss": 0.0433, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4692504405975342, "rewards/margins": 16.716796875, "rewards/rejected": -18.19921875, "step": 6520 }, { "epoch": 1.6463933444679042, "grad_norm": 0.07864467799663544, "learning_rate": 2.510268531350384e-07, "logits/chosen": -0.5272857546806335, "logits/rejected": NaN, "logps/chosen": -181.99063110351562, "logps/rejected": -441.3500061035156, "loss": 0.0302, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1986327171325684, "rewards/margins": 17.64453125, "rewards/rejected": -19.84375, "step": 6530 }, { "epoch": 1.6489143793527243, "grad_norm": 12.877532005310059, "learning_rate": 2.502933873676204e-07, "logits/chosen": -0.671679675579071, "logits/rejected": -0.751416027545929, "logps/chosen": -173.16561889648438, "logps/rejected": -459.0, "loss": 0.0639, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.3019042015075684, "rewards/margins": 17.896093368530273, "rewards/rejected": -20.189844131469727, "step": 6540 }, { "epoch": 1.6514354142375445, "grad_norm": 0.4123245179653168, "learning_rate": 2.4955991907483763e-07, "logits/chosen": -0.6989105343818665, "logits/rejected": -0.6989715695381165, "logps/chosen": -192.94686889648438, "logps/rejected": -455.9624938964844, "loss": 0.0359, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.562277317047119, "rewards/margins": 18.185546875, "rewards/rejected": -20.75, "step": 6550 }, { "epoch": 1.6539564491223646, "grad_norm": 1.3467117547988892, "learning_rate": 2.4882645457010096e-07, "logits/chosen": -0.7137451171875, "logits/rejected": NaN, "logps/chosen": -173.1281280517578, "logps/rejected": -452.5375061035156, "loss": 0.0679, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1317138671875, "rewards/margins": 18.28515625, "rewards/rejected": -20.424219131469727, "step": 6560 }, { "epoch": 1.656477484007185, "grad_norm": 72.8359603881836, "learning_rate": 2.480930001667887e-07, "logits/chosen": -0.653515636920929, "logits/rejected": NaN, "logps/chosen": -194.89999389648438, "logps/rejected": -450.8999938964844, "loss": 0.131, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.808084011077881, "rewards/margins": 17.088281631469727, "rewards/rejected": -19.892969131469727, "step": 6570 }, { "epoch": 1.6589985188920051, "grad_norm": 66.76577758789062, "learning_rate": 2.473595621781919e-07, "logits/chosen": -0.8263183832168579, "logits/rejected": -1.0068480968475342, "logps/chosen": -183.47500610351562, "logps/rejected": -467.54998779296875, "loss": 0.0191, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.989227294921875, "rewards/margins": 17.641407012939453, "rewards/rejected": -19.621875762939453, "step": 6580 }, { "epoch": 1.6615195537768255, "grad_norm": 113.94587707519531, "learning_rate": 2.4662614691746096e-07, "logits/chosen": -0.7021118402481079, "logits/rejected": -0.731109619140625, "logps/chosen": -190.90625, "logps/rejected": -460.29998779296875, "loss": 0.0722, "rewards/accuracies": 0.984375, "rewards/chosen": -2.644458055496216, "rewards/margins": 17.772655487060547, "rewards/rejected": -20.421875, "step": 6590 }, { "epoch": 1.6640405886616456, "grad_norm": 36.315025329589844, "learning_rate": 2.4589276069754994e-07, "logits/chosen": -0.6609527468681335, "logits/rejected": -0.714251697063446, "logps/chosen": -189.75, "logps/rejected": -466.29998779296875, "loss": 0.0616, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.2477965354919434, "rewards/margins": 18.181249618530273, "rewards/rejected": -20.4296875, "step": 6600 }, { "epoch": 1.6665616235464658, "grad_norm": 6.579713344573975, "learning_rate": 2.451594098311635e-07, "logits/chosen": -0.6878417730331421, "logits/rejected": NaN, "logps/chosen": -168.58438110351562, "logps/rejected": -419.1875, "loss": 0.05, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.488012671470642, "rewards/margins": 18.344532012939453, "rewards/rejected": -19.836719512939453, "step": 6610 }, { "epoch": 1.669082658431286, "grad_norm": 41.050350189208984, "learning_rate": 2.4442610063070143e-07, "logits/chosen": -0.633715808391571, "logits/rejected": NaN, "logps/chosen": -148.6281280517578, "logps/rejected": -430.17498779296875, "loss": 0.0247, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.6467559337615967, "rewards/margins": 17.702342987060547, "rewards/rejected": -19.350000381469727, "step": 6620 }, { "epoch": 1.6716036933161063, "grad_norm": 73.03778076171875, "learning_rate": 2.4369283940820557e-07, "logits/chosen": -0.8274383544921875, "logits/rejected": NaN, "logps/chosen": -188.484375, "logps/rejected": -474.11248779296875, "loss": 0.0728, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.5973389148712158, "rewards/margins": 18.411718368530273, "rewards/rejected": -20.000782012939453, "step": 6630 }, { "epoch": 1.6741247282009266, "grad_norm": 0.18467800319194794, "learning_rate": 2.429596324753042e-07, "logits/chosen": -0.7457641363143921, "logits/rejected": NaN, "logps/chosen": -210.06875610351562, "logps/rejected": -475.70001220703125, "loss": 0.0213, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.2911743223667145, "rewards/margins": 20.637500762939453, "rewards/rejected": -20.927343368530273, "step": 6640 }, { "epoch": 1.6766457630857468, "grad_norm": 0.6635369658470154, "learning_rate": 2.422264861431584e-07, "logits/chosen": -0.7395904660224915, "logits/rejected": -0.834240734577179, "logps/chosen": -180.0734405517578, "logps/rejected": -449.7250061035156, "loss": 0.0316, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.300207495689392, "rewards/margins": 18.912500381469727, "rewards/rejected": -20.212499618530273, "step": 6650 }, { "epoch": 1.679166797970567, "grad_norm": 4.234791278839111, "learning_rate": 2.41493406722408e-07, "logits/chosen": NaN, "logits/rejected": -0.6779571771621704, "logps/chosen": -149.5906219482422, "logps/rejected": -490.86248779296875, "loss": 0.0707, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.2848907709121704, "rewards/margins": 19.846094131469727, "rewards/rejected": -21.132030487060547, "step": 6660 }, { "epoch": 1.681687832855387, "grad_norm": 0.04590437561273575, "learning_rate": 2.407604005231163e-07, "logits/chosen": -0.797375500202179, "logits/rejected": NaN, "logps/chosen": -168.2312469482422, "logps/rejected": -462.92498779296875, "loss": 0.019, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5738525390625, "rewards/margins": 19.364063262939453, "rewards/rejected": -19.940624237060547, "step": 6670 }, { "epoch": 1.6842088677402074, "grad_norm": 6.1979475021362305, "learning_rate": 2.4002747385471686e-07, "logits/chosen": -0.7247253656387329, "logits/rejected": -0.8516174554824829, "logps/chosen": -150.1906280517578, "logps/rejected": -440.79998779296875, "loss": 0.0375, "rewards/accuracies": 0.984375, "rewards/chosen": -0.6256622076034546, "rewards/margins": 17.967967987060547, "rewards/rejected": -18.604686737060547, "step": 6680 }, { "epoch": 1.6867299026250275, "grad_norm": 36.865966796875, "learning_rate": 2.392946330259583e-07, "logits/chosen": -0.768963634967804, "logits/rejected": -0.7250732183456421, "logps/chosen": -178.0359344482422, "logps/rejected": -461.76251220703125, "loss": 0.0297, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.6326507329940796, "rewards/margins": 19.693750381469727, "rewards/rejected": -20.33203125, "step": 6690 }, { "epoch": 1.689250937509848, "grad_norm": 53.49083709716797, "learning_rate": 2.385618843448507e-07, "logits/chosen": -0.7243286371231079, "logits/rejected": NaN, "logps/chosen": -181.52499389648438, "logps/rejected": -449.5625, "loss": 0.0516, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.54534912109375, "rewards/margins": 19.345312118530273, "rewards/rejected": -20.896093368530273, "step": 6700 }, { "epoch": 1.691771972394668, "grad_norm": 7.706853866577148, "learning_rate": 2.378292341186107e-07, "logits/chosen": -0.687640368938446, "logits/rejected": -0.7944885492324829, "logps/chosen": -192.99374389648438, "logps/rejected": -470.0625, "loss": 0.0783, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4153199195861816, "rewards/margins": 20.166406631469727, "rewards/rejected": -22.578125, "step": 6710 }, { "epoch": 1.6942930072794882, "grad_norm": 12.34585952758789, "learning_rate": 2.370966886536074e-07, "logits/chosen": -0.6819747686386108, "logits/rejected": NaN, "logps/chosen": -176.6125030517578, "logps/rejected": -477.07501220703125, "loss": 0.0183, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.9883087277412415, "rewards/margins": 20.591796875, "rewards/rejected": -21.583593368530273, "step": 6720 }, { "epoch": 1.6968140421643083, "grad_norm": 5.592272758483887, "learning_rate": 2.3636425425530857e-07, "logits/chosen": -0.6762679815292358, "logits/rejected": NaN, "logps/chosen": -192.3874969482422, "logps/rejected": -489.92498779296875, "loss": 0.0115, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.0374755859375, "rewards/margins": 20.36328125, "rewards/rejected": -21.399999618530273, "step": 6730 }, { "epoch": 1.6993350770491287, "grad_norm": 39.47167205810547, "learning_rate": 2.3563193722822555e-07, "logits/chosen": -0.6667038202285767, "logits/rejected": NaN, "logps/chosen": -182.6062469482422, "logps/rejected": -472.5625, "loss": 0.0726, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.618676781654358, "rewards/margins": 20.526561737060547, "rewards/rejected": -22.14453125, "step": 6740 }, { "epoch": 1.701856111933949, "grad_norm": 1.1513264179229736, "learning_rate": 2.3489974387585964e-07, "logits/chosen": -0.6696411371231079, "logits/rejected": -0.7214721441268921, "logps/chosen": -196.88436889648438, "logps/rejected": -481.61248779296875, "loss": 0.1204, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2811126708984375, "rewards/margins": 19.505468368530273, "rewards/rejected": -21.782812118530273, "step": 6750 }, { "epoch": 1.7043771468187692, "grad_norm": 1.6919903755187988, "learning_rate": 2.3416768050064739e-07, "logits/chosen": -0.7163360714912415, "logits/rejected": -0.726428210735321, "logps/chosen": -196.40625, "logps/rejected": -481.0249938964844, "loss": 0.0564, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9108612537384033, "rewards/margins": 20.208593368530273, "rewards/rejected": -22.117969512939453, "step": 6760 }, { "epoch": 1.7068981817035893, "grad_norm": 4.6897292137146, "learning_rate": 2.334357534039069e-07, "logits/chosen": -0.560101330280304, "logits/rejected": NaN, "logps/chosen": -175.53750610351562, "logps/rejected": -459.51251220703125, "loss": 0.0778, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.0721678733825684, "rewards/margins": 19.320703506469727, "rewards/rejected": -21.397655487060547, "step": 6770 }, { "epoch": 1.7094192165884095, "grad_norm": 16.368515014648438, "learning_rate": 2.3270396888578283e-07, "logits/chosen": -0.4869705140590668, "logits/rejected": NaN, "logps/chosen": -167.2843780517578, "logps/rejected": -459.2250061035156, "loss": 0.0396, "rewards/accuracies": 0.984375, "rewards/chosen": -2.4362196922302246, "rewards/margins": 18.588672637939453, "rewards/rejected": -21.017969131469727, "step": 6780 }, { "epoch": 1.7119402514732298, "grad_norm": 2.939669609069824, "learning_rate": 2.3197233324519274e-07, "logits/chosen": -0.7528915405273438, "logits/rejected": NaN, "logps/chosen": -204.60311889648438, "logps/rejected": -485.48748779296875, "loss": 0.0151, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.604846239089966, "rewards/margins": 19.262500762939453, "rewards/rejected": -21.875, "step": 6790 }, { "epoch": 1.71446128635805, "grad_norm": 69.9095458984375, "learning_rate": 2.312408527797729e-07, "logits/chosen": -0.737384021282196, "logits/rejected": NaN, "logps/chosen": -192.33438110351562, "logps/rejected": -484.625, "loss": 0.0639, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6840331554412842, "rewards/margins": 19.669530868530273, "rewards/rejected": -21.346874237060547, "step": 6800 }, { "epoch": 1.7169823212428703, "grad_norm": 48.1747932434082, "learning_rate": 2.305095337858236e-07, "logits/chosen": -0.650317370891571, "logits/rejected": NaN, "logps/chosen": -172.99374389648438, "logps/rejected": -465.9750061035156, "loss": 0.013, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.988360583782196, "rewards/margins": 19.787500381469727, "rewards/rejected": -20.774999618530273, "step": 6810 }, { "epoch": 1.7195033561276905, "grad_norm": 9.841669082641602, "learning_rate": 2.2977838255825545e-07, "logits/chosen": -0.8398803472518921, "logits/rejected": NaN, "logps/chosen": -178.4812469482422, "logps/rejected": -477.57501220703125, "loss": 0.0232, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.890716552734375, "rewards/margins": 19.670312881469727, "rewards/rejected": -20.557811737060547, "step": 6820 }, { "epoch": 1.7220243910125106, "grad_norm": 39.982032775878906, "learning_rate": 2.2904740539053477e-07, "logits/chosen": -0.7770019769668579, "logits/rejected": NaN, "logps/chosen": -162.27969360351562, "logps/rejected": -414.73748779296875, "loss": 0.0499, "rewards/accuracies": 0.984375, "rewards/chosen": -0.568188488483429, "rewards/margins": 18.178905487060547, "rewards/rejected": -18.752344131469727, "step": 6830 }, { "epoch": 1.7245454258973307, "grad_norm": 3.293665647506714, "learning_rate": 2.2831660857462998e-07, "logits/chosen": -0.7047179937362671, "logits/rejected": -0.79547119140625, "logps/chosen": -181.9890594482422, "logps/rejected": -464.9375, "loss": 0.0263, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0764862298965454, "rewards/margins": 18.867969512939453, "rewards/rejected": -19.946874618530273, "step": 6840 }, { "epoch": 1.727066460782151, "grad_norm": 24.696273803710938, "learning_rate": 2.275859984009568e-07, "logits/chosen": -0.7184997797012329, "logits/rejected": NaN, "logps/chosen": -182.734375, "logps/rejected": -464.25, "loss": 0.0451, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.883978247642517, "rewards/margins": 18.73046875, "rewards/rejected": -20.616405487060547, "step": 6850 }, { "epoch": 1.7295874956669715, "grad_norm": 66.87100219726562, "learning_rate": 2.2685558115832445e-07, "logits/chosen": -0.718371570110321, "logits/rejected": NaN, "logps/chosen": -177.0, "logps/rejected": -468.9750061035156, "loss": 0.0916, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.8262939453125, "rewards/margins": 19.073436737060547, "rewards/rejected": -20.898828506469727, "step": 6860 }, { "epoch": 1.7321085305517916, "grad_norm": 0.29539117217063904, "learning_rate": 2.2612536313388172e-07, "logits/chosen": -0.7327026128768921, "logits/rejected": NaN, "logps/chosen": -175.84375, "logps/rejected": -479.1499938964844, "loss": 0.0309, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.011309862136841, "rewards/margins": 18.930469512939453, "rewards/rejected": -20.942188262939453, "step": 6870 }, { "epoch": 1.7346295654366117, "grad_norm": 1.858283519744873, "learning_rate": 2.253953506130622e-07, "logits/chosen": -0.6799408197402954, "logits/rejected": -0.83111572265625, "logps/chosen": -161.5749969482422, "logps/rejected": -452.7749938964844, "loss": 0.0137, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.5684814453125, "rewards/margins": 18.64453125, "rewards/rejected": -20.212499618530273, "step": 6880 }, { "epoch": 1.7371506003214319, "grad_norm": 20.143537521362305, "learning_rate": 2.2466554987953107e-07, "logits/chosen": -0.744403064250946, "logits/rejected": -0.840533435344696, "logps/chosen": -183.6687469482422, "logps/rejected": -462.7875061035156, "loss": 0.0274, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9594604969024658, "rewards/margins": 18.991405487060547, "rewards/rejected": -20.952342987060547, "step": 6890 }, { "epoch": 1.739671635206252, "grad_norm": 0.25678664445877075, "learning_rate": 2.2393596721512994e-07, "logits/chosen": -0.73486328125, "logits/rejected": NaN, "logps/chosen": -205.86874389648438, "logps/rejected": -449.88751220703125, "loss": 0.0473, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.492181420326233, "rewards/margins": 19.129980087280273, "rewards/rejected": -20.618749618530273, "step": 6900 }, { "epoch": 1.7421926700910724, "grad_norm": 0.2366468608379364, "learning_rate": 2.23206608899824e-07, "logits/chosen": -0.7992095947265625, "logits/rejected": NaN, "logps/chosen": -188.125, "logps/rejected": -466.4624938964844, "loss": 0.0988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.28324818611145, "rewards/margins": 19.007421493530273, "rewards/rejected": -21.29296875, "step": 6910 }, { "epoch": 1.7447137049758927, "grad_norm": 0.03655132278800011, "learning_rate": 2.2247748121164686e-07, "logits/chosen": -0.7703582644462585, "logits/rejected": NaN, "logps/chosen": -169.3585968017578, "logps/rejected": -458.8999938964844, "loss": 0.0586, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.732702612876892, "rewards/margins": 18.995311737060547, "rewards/rejected": -20.725000381469727, "step": 6920 }, { "epoch": 1.7472347398607129, "grad_norm": 59.9242057800293, "learning_rate": 2.2174859042664706e-07, "logits/chosen": -0.818835437297821, "logits/rejected": NaN, "logps/chosen": -185.0515594482422, "logps/rejected": -455.0, "loss": 0.0566, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.0396971702575684, "rewards/margins": 18.055469512939453, "rewards/rejected": -20.087499618530273, "step": 6930 }, { "epoch": 1.749755774745533, "grad_norm": 8.386969566345215, "learning_rate": 2.210199428188343e-07, "logits/chosen": -0.760943591594696, "logits/rejected": NaN, "logps/chosen": -186.78750610351562, "logps/rejected": -445.92498779296875, "loss": 0.0197, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8042724132537842, "rewards/margins": 17.608983993530273, "rewards/rejected": -19.423437118530273, "step": 6940 }, { "epoch": 1.7522768096303531, "grad_norm": 3.1718451976776123, "learning_rate": 2.2029154466012466e-07, "logits/chosen": -0.698822021484375, "logits/rejected": NaN, "logps/chosen": -169.6437530517578, "logps/rejected": -481.88751220703125, "loss": 0.0612, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8034026622772217, "rewards/margins": 19.09375, "rewards/rejected": -20.899999618530273, "step": 6950 }, { "epoch": 1.7547978445151735, "grad_norm": 3.8260059356689453, "learning_rate": 2.1956340222028732e-07, "logits/chosen": -0.76580810546875, "logits/rejected": -0.8892577886581421, "logps/chosen": -181.00625610351562, "logps/rejected": -478.73748779296875, "loss": 0.032, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1592819690704346, "rewards/margins": 19.716405868530273, "rewards/rejected": -20.877344131469727, "step": 6960 }, { "epoch": 1.7573188793999936, "grad_norm": 3.2014732360839844, "learning_rate": 2.1883552176689016e-07, "logits/chosen": -0.842041015625, "logits/rejected": NaN, "logps/chosen": -195.8859405517578, "logps/rejected": -499.07501220703125, "loss": 0.0195, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.927001953125, "rewards/margins": 21.14453125, "rewards/rejected": -23.079687118530273, "step": 6970 }, { "epoch": 1.759839914284814, "grad_norm": 22.38792610168457, "learning_rate": 2.181079095652463e-07, "logits/chosen": -0.7971221804618835, "logits/rejected": -0.793078601360321, "logps/chosen": -194.00936889648438, "logps/rejected": -458.29998779296875, "loss": 0.0322, "rewards/accuracies": 0.984375, "rewards/chosen": -2.45623779296875, "rewards/margins": 19.543750762939453, "rewards/rejected": -22.013280868530273, "step": 6980 }, { "epoch": 1.7623609491696342, "grad_norm": 19.611053466796875, "learning_rate": 2.1738057187835952e-07, "logits/chosen": -0.7796691656112671, "logits/rejected": NaN, "logps/chosen": -176.7921905517578, "logps/rejected": -463.8500061035156, "loss": 0.0531, "rewards/accuracies": 0.984375, "rewards/chosen": -1.4391387701034546, "rewards/margins": 20.099218368530273, "rewards/rejected": -21.546092987060547, "step": 6990 }, { "epoch": 1.7648819840544543, "grad_norm": 95.13735961914062, "learning_rate": 2.1665351496687068e-07, "logits/chosen": -0.6321914792060852, "logits/rejected": NaN, "logps/chosen": -174.1687469482422, "logps/rejected": -493.32501220703125, "loss": 0.0766, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.332275390625, "rewards/margins": 20.799999237060547, "rewards/rejected": -22.141407012939453, "step": 7000 }, { "epoch": 1.7674030189392744, "grad_norm": 20.681346893310547, "learning_rate": 2.159267450890042e-07, "logits/chosen": -0.635729968547821, "logits/rejected": -0.6843765377998352, "logps/chosen": -191.1765594482422, "logps/rejected": -477.61248779296875, "loss": 0.0877, "rewards/accuracies": 0.96875, "rewards/chosen": -1.8444030284881592, "rewards/margins": 19.438282012939453, "rewards/rejected": -21.27734375, "step": 7010 }, { "epoch": 1.7699240538240948, "grad_norm": 72.04264831542969, "learning_rate": 2.1520026850051342e-07, "logits/chosen": -0.813995361328125, "logits/rejected": -0.8036773800849915, "logps/chosen": -196.5671844482422, "logps/rejected": -492.70001220703125, "loss": 0.0704, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9802124500274658, "rewards/margins": 19.235937118530273, "rewards/rejected": -21.220312118530273, "step": 7020 }, { "epoch": 1.7724450887089152, "grad_norm": 10.39279842376709, "learning_rate": 2.1447409145462742e-07, "logits/chosen": -0.702105700969696, "logits/rejected": NaN, "logps/chosen": -182.9968719482422, "logps/rejected": -467.5375061035156, "loss": 0.0526, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.223895311355591, "rewards/margins": 19.650781631469727, "rewards/rejected": -21.879688262939453, "step": 7030 }, { "epoch": 1.7749661235937353, "grad_norm": 0.19950856268405914, "learning_rate": 2.1374822020199668e-07, "logits/chosen": -0.7407897710800171, "logits/rejected": NaN, "logps/chosen": -202.71249389648438, "logps/rejected": -476.3500061035156, "loss": 0.0167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.534832715988159, "rewards/margins": 19.232812881469727, "rewards/rejected": -21.764062881469727, "step": 7040 }, { "epoch": 1.7774871584785554, "grad_norm": 3.4304370880126953, "learning_rate": 2.130226609906399e-07, "logits/chosen": -0.666821300983429, "logits/rejected": NaN, "logps/chosen": -173.82968139648438, "logps/rejected": -473.625, "loss": 0.0617, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.794714331626892, "rewards/margins": 20.267187118530273, "rewards/rejected": -22.063282012939453, "step": 7050 }, { "epoch": 1.7800081933633756, "grad_norm": 0.5486415028572083, "learning_rate": 2.1229742006588953e-07, "logits/chosen": -0.8071228265762329, "logits/rejected": NaN, "logps/chosen": -186.10000610351562, "logps/rejected": -473.86248779296875, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.1982848644256592, "rewards/margins": 20.344532012939453, "rewards/rejected": -21.544530868530273, "step": 7060 }, { "epoch": 1.782529228248196, "grad_norm": 70.84906768798828, "learning_rate": 2.115725036703383e-07, "logits/chosen": -0.781158447265625, "logits/rejected": NaN, "logps/chosen": -191.06640625, "logps/rejected": -486.70001220703125, "loss": 0.0416, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.780993640422821, "rewards/margins": 19.37890625, "rewards/rejected": -20.158594131469727, "step": 7070 }, { "epoch": 1.785050263133016, "grad_norm": 0.5038511753082275, "learning_rate": 2.1084791804378592e-07, "logits/chosen": -0.6648041009902954, "logits/rejected": NaN, "logps/chosen": -188.84375, "logps/rejected": -462.2749938964844, "loss": 0.1073, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.0742003917694092, "rewards/margins": 19.797657012939453, "rewards/rejected": -20.878124237060547, "step": 7080 }, { "epoch": 1.7875712980178364, "grad_norm": 2.3009352684020996, "learning_rate": 2.101236694231845e-07, "logits/chosen": -0.6827484369277954, "logits/rejected": -0.819915771484375, "logps/chosen": -170.1843719482422, "logps/rejected": -461.13751220703125, "loss": 0.0586, "rewards/accuracies": 0.984375, "rewards/chosen": -1.344763159751892, "rewards/margins": 19.045312881469727, "rewards/rejected": -20.396875381469727, "step": 7090 }, { "epoch": 1.7900923329026566, "grad_norm": 51.67293167114258, "learning_rate": 2.0939976404258567e-07, "logits/chosen": -0.714642345905304, "logits/rejected": NaN, "logps/chosen": -176.9484405517578, "logps/rejected": -455.7124938964844, "loss": 0.0367, "rewards/accuracies": 0.984375, "rewards/chosen": -1.17315673828125, "rewards/margins": 18.661327362060547, "rewards/rejected": -19.832813262939453, "step": 7100 }, { "epoch": 1.7926133677874767, "grad_norm": 2.7875092029571533, "learning_rate": 2.086762081330863e-07, "logits/chosen": -0.6247283816337585, "logits/rejected": NaN, "logps/chosen": -192.30313110351562, "logps/rejected": -487.8500061035156, "loss": 0.0137, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.8659088611602783, "rewards/margins": 19.479686737060547, "rewards/rejected": -21.345312118530273, "step": 7110 }, { "epoch": 1.7951344026722968, "grad_norm": 69.04942321777344, "learning_rate": 2.079530079227755e-07, "logits/chosen": -0.5758941769599915, "logits/rejected": NaN, "logps/chosen": -176.6640625, "logps/rejected": -474.8999938964844, "loss": 0.0401, "rewards/accuracies": 0.984375, "rewards/chosen": -1.656280517578125, "rewards/margins": 19.767969131469727, "rewards/rejected": -21.422657012939453, "step": 7120 }, { "epoch": 1.7976554375571172, "grad_norm": 11.499399185180664, "learning_rate": 2.072301696366803e-07, "logits/chosen": -0.7105926275253296, "logits/rejected": -0.744598388671875, "logps/chosen": -191.8000030517578, "logps/rejected": -462.2124938964844, "loss": 0.0789, "rewards/accuracies": 0.96875, "rewards/chosen": -2.134777784347534, "rewards/margins": 19.352344512939453, "rewards/rejected": -21.487499237060547, "step": 7130 }, { "epoch": 1.8001764724419376, "grad_norm": 25.87421417236328, "learning_rate": 2.0650769949671257e-07, "logits/chosen": -0.656707763671875, "logits/rejected": NaN, "logps/chosen": -194.92813110351562, "logps/rejected": -477.70001220703125, "loss": 0.04, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6577332019805908, "rewards/margins": 19.882030487060547, "rewards/rejected": -21.540624618530273, "step": 7140 }, { "epoch": 1.8026975073267577, "grad_norm": 51.94773483276367, "learning_rate": 2.057856037216155e-07, "logits/chosen": -0.674530029296875, "logits/rejected": NaN, "logps/chosen": -197.4406280517578, "logps/rejected": -480.0625, "loss": 0.0563, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.828375220298767, "rewards/margins": 19.7109375, "rewards/rejected": -21.538280487060547, "step": 7150 }, { "epoch": 1.8052185422115778, "grad_norm": 48.938568115234375, "learning_rate": 2.0506388852690958e-07, "logits/chosen": -0.5410003662109375, "logits/rejected": NaN, "logps/chosen": -193.8484344482422, "logps/rejected": -476.57501220703125, "loss": 0.0699, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.9700438976287842, "rewards/margins": 19.77734375, "rewards/rejected": -21.74609375, "step": 7160 }, { "epoch": 1.807739577096398, "grad_norm": 1.8946012258529663, "learning_rate": 2.043425601248397e-07, "logits/chosen": -0.6873718500137329, "logits/rejected": -0.74725341796875, "logps/chosen": -188.1374969482422, "logps/rejected": -511.70001220703125, "loss": 0.0366, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.2983031272888184, "rewards/margins": 19.899219512939453, "rewards/rejected": -22.1953125, "step": 7170 }, { "epoch": 1.8102606119812181, "grad_norm": 70.67001342773438, "learning_rate": 2.03621624724321e-07, "logits/chosen": -0.7924743890762329, "logits/rejected": -0.831402599811554, "logps/chosen": -201.3312530517578, "logps/rejected": -469.59375, "loss": 0.0813, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8412902355194092, "rewards/margins": 18.733983993530273, "rewards/rejected": -20.564062118530273, "step": 7180 }, { "epoch": 1.8127816468660385, "grad_norm": 52.07129669189453, "learning_rate": 2.0290108853088634e-07, "logits/chosen": -0.662585437297821, "logits/rejected": -0.672943115234375, "logps/chosen": -180.61563110351562, "logps/rejected": -494.9750061035156, "loss": 0.0473, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.432116746902466, "rewards/margins": 18.439062118530273, "rewards/rejected": -20.871875762939453, "step": 7190 }, { "epoch": 1.8153026817508588, "grad_norm": 18.42094612121582, "learning_rate": 2.0218095774663197e-07, "logits/chosen": -0.6338256597518921, "logits/rejected": -0.8898071050643921, "logps/chosen": -186.32186889648438, "logps/rejected": -482.375, "loss": 0.0489, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.444653272628784, "rewards/margins": 18.08984375, "rewards/rejected": -20.533594131469727, "step": 7200 }, { "epoch": 1.817823716635679, "grad_norm": 7.40588903427124, "learning_rate": 2.0146123857016453e-07, "logits/chosen": -0.6038604974746704, "logits/rejected": -0.6962677240371704, "logps/chosen": -192.10000610351562, "logps/rejected": -475.2124938964844, "loss": 0.0574, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.281079053878784, "rewards/margins": 18.835155487060547, "rewards/rejected": -21.119531631469727, "step": 7210 }, { "epoch": 1.8203447515204991, "grad_norm": 1.0274378061294556, "learning_rate": 2.0074193719654803e-07, "logits/chosen": -0.645184338092804, "logits/rejected": -0.697399914264679, "logps/chosen": -188.28750610351562, "logps/rejected": -467.6000061035156, "loss": 0.0935, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.572094678878784, "rewards/margins": 18.223438262939453, "rewards/rejected": -20.786718368530273, "step": 7220 }, { "epoch": 1.8228657864053193, "grad_norm": 65.84089660644531, "learning_rate": 2.0002305981724983e-07, "logits/chosen": -0.6537262201309204, "logits/rejected": NaN, "logps/chosen": -194.6453094482422, "logps/rejected": -498.92498779296875, "loss": 0.0306, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.047982692718506, "rewards/margins": 18.482030868530273, "rewards/rejected": -20.539844512939453, "step": 7230 }, { "epoch": 1.8253868212901396, "grad_norm": 0.810290515422821, "learning_rate": 1.99304612620088e-07, "logits/chosen": -0.6396545171737671, "logits/rejected": NaN, "logps/chosen": -183.6843719482422, "logps/rejected": -477.8500061035156, "loss": 0.0153, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2914917469024658, "rewards/margins": 19.078907012939453, "rewards/rejected": -20.364843368530273, "step": 7240 }, { "epoch": 1.8279078561749598, "grad_norm": 15.033353805541992, "learning_rate": 1.9858660178917743e-07, "logits/chosen": -0.724499523639679, "logits/rejected": NaN, "logps/chosen": -190.99374389648438, "logps/rejected": -481.1875, "loss": 0.0566, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.928655982017517, "rewards/margins": 18.564062118530273, "rewards/rejected": -20.48828125, "step": 7250 }, { "epoch": 1.8304288910597801, "grad_norm": 2.845219612121582, "learning_rate": 1.9786903350487737e-07, "logits/chosen": -0.5604141354560852, "logits/rejected": -0.6288207769393921, "logps/chosen": -181.52499389648438, "logps/rejected": -473.3999938964844, "loss": 0.0148, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.654840111732483, "rewards/margins": 18.728124618530273, "rewards/rejected": -20.385156631469727, "step": 7260 }, { "epoch": 1.8329499259446003, "grad_norm": 0.15034154057502747, "learning_rate": 1.9715191394373745e-07, "logits/chosen": -0.5615875124931335, "logits/rejected": NaN, "logps/chosen": -178.10311889648438, "logps/rejected": -445.5249938964844, "loss": 0.0309, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0207276344299316, "rewards/margins": 18.728906631469727, "rewards/rejected": -20.754688262939453, "step": 7270 }, { "epoch": 1.8354709608294204, "grad_norm": 21.85536003112793, "learning_rate": 1.964352492784449e-07, "logits/chosen": -0.6807052493095398, "logits/rejected": -0.756457507610321, "logps/chosen": -192.9968719482422, "logps/rejected": -486.26251220703125, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.842413306236267, "rewards/margins": 19.897655487060547, "rewards/rejected": -21.735937118530273, "step": 7280 }, { "epoch": 1.8379919957142405, "grad_norm": 0.3627617061138153, "learning_rate": 1.957190456777717e-07, "logits/chosen": -0.7985168695449829, "logits/rejected": -0.760974109172821, "logps/chosen": -208.09375, "logps/rejected": -491.32501220703125, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.3120849132537842, "rewards/margins": 20.079687118530273, "rewards/rejected": -21.385156631469727, "step": 7290 }, { "epoch": 1.840513030599061, "grad_norm": 35.124237060546875, "learning_rate": 1.9500330930652073e-07, "logits/chosen": -0.6303657293319702, "logits/rejected": -0.7206573486328125, "logps/chosen": -167.97811889648438, "logps/rejected": -467.45001220703125, "loss": 0.0528, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.615380883216858, "rewards/margins": 19.237499237060547, "rewards/rejected": -20.8515625, "step": 7300 }, { "epoch": 1.8430340654838813, "grad_norm": 0.14710010588169098, "learning_rate": 1.9428804632547348e-07, "logits/chosen": -0.6661132574081421, "logits/rejected": NaN, "logps/chosen": -173.44375610351562, "logps/rejected": -477.0375061035156, "loss": 0.0786, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.575708031654358, "rewards/margins": 19.700780868530273, "rewards/rejected": -21.282812118530273, "step": 7310 }, { "epoch": 1.8455551003687014, "grad_norm": 1.7275532484054565, "learning_rate": 1.9357326289133635e-07, "logits/chosen": -0.6885444521903992, "logits/rejected": NaN, "logps/chosen": -176.84219360351562, "logps/rejected": -458.45001220703125, "loss": 0.0516, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.545770287513733, "rewards/margins": 18.810937881469727, "rewards/rejected": -20.353906631469727, "step": 7320 }, { "epoch": 1.8480761352535215, "grad_norm": 2.420135736465454, "learning_rate": 1.9285896515668841e-07, "logits/chosen": NaN, "logits/rejected": -0.7598632574081421, "logps/chosen": -167.79061889648438, "logps/rejected": -482.7124938964844, "loss": 0.0713, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2104918956756592, "rewards/margins": 18.934375762939453, "rewards/rejected": -20.153905868530273, "step": 7330 }, { "epoch": 1.8505971701383417, "grad_norm": 76.22583770751953, "learning_rate": 1.9214515926992775e-07, "logits/chosen": -0.619793713092804, "logits/rejected": -0.772936999797821, "logps/chosen": -172.49063110351562, "logps/rejected": -490.0, "loss": 0.0787, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5539429187774658, "rewards/margins": 19.603124618530273, "rewards/rejected": -21.153125762939453, "step": 7340 }, { "epoch": 1.853118205023162, "grad_norm": 7.726154327392578, "learning_rate": 1.9143185137521863e-07, "logits/chosen": -0.6979736089706421, "logits/rejected": -0.8121582269668579, "logps/chosen": -194.484375, "logps/rejected": -456.7749938964844, "loss": 0.0225, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.4523239135742188, "rewards/margins": 18.924999237060547, "rewards/rejected": -20.378124237060547, "step": 7350 }, { "epoch": 1.8556392399079822, "grad_norm": 0.47721678018569946, "learning_rate": 1.9071904761243935e-07, "logits/chosen": -0.6947296261787415, "logits/rejected": -0.710235595703125, "logps/chosen": -175.14688110351562, "logps/rejected": -460.26251220703125, "loss": 0.0604, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.3774292469024658, "rewards/margins": 18.991405487060547, "rewards/rejected": -20.366405487060547, "step": 7360 }, { "epoch": 1.8581602747928025, "grad_norm": 0.2997173070907593, "learning_rate": 1.9000675411712827e-07, "logits/chosen": -0.5560890436172485, "logits/rejected": -0.6580810546875, "logps/chosen": -168.22500610351562, "logps/rejected": -455.70001220703125, "loss": 0.0236, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.790563941001892, "rewards/margins": 18.965625762939453, "rewards/rejected": -20.758594512939453, "step": 7370 }, { "epoch": 1.8606813096776227, "grad_norm": 61.388545989990234, "learning_rate": 1.8929497702043194e-07, "logits/chosen": -0.639147937297821, "logits/rejected": NaN, "logps/chosen": -189.6875, "logps/rejected": -495.25, "loss": 0.0454, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.968725562095642, "rewards/margins": 19.250782012939453, "rewards/rejected": -21.224218368530273, "step": 7380 }, { "epoch": 1.8632023445624428, "grad_norm": 41.08460998535156, "learning_rate": 1.8858372244905162e-07, "logits/chosen": -0.7700225710868835, "logits/rejected": NaN, "logps/chosen": -190.80624389648438, "logps/rejected": -458.4750061035156, "loss": 0.0372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9184112548828125, "rewards/margins": 17.93359375, "rewards/rejected": -19.849218368530273, "step": 7390 }, { "epoch": 1.865723379447263, "grad_norm": 0.9268859028816223, "learning_rate": 1.878729965251913e-07, "logits/chosen": -0.7063537836074829, "logits/rejected": NaN, "logps/chosen": -177.6437530517578, "logps/rejected": -448.79998779296875, "loss": 0.0146, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.726354956626892, "rewards/margins": 18.910938262939453, "rewards/rejected": -20.633983612060547, "step": 7400 }, { "epoch": 1.8682444143320833, "grad_norm": 0.21246209740638733, "learning_rate": 1.871628053665043e-07, "logits/chosen": -0.566998302936554, "logits/rejected": NaN, "logps/chosen": -170.1125030517578, "logps/rejected": -464.11248779296875, "loss": 0.0276, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.370288133621216, "rewards/margins": 19.689844131469727, "rewards/rejected": -22.047657012939453, "step": 7410 }, { "epoch": 1.8707654492169037, "grad_norm": 0.06450022011995316, "learning_rate": 1.864531550860407e-07, "logits/chosen": -0.5721801519393921, "logits/rejected": -0.705584704875946, "logps/chosen": -191.2468719482422, "logps/rejected": -509.875, "loss": 0.0252, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.153002977371216, "rewards/margins": 20.930469512939453, "rewards/rejected": -23.068750381469727, "step": 7420 }, { "epoch": 1.8732864841017238, "grad_norm": 0.2902052700519562, "learning_rate": 1.8574405179219548e-07, "logits/chosen": -0.7152618169784546, "logits/rejected": -0.7979065179824829, "logps/chosen": -174.08749389648438, "logps/rejected": -480.8500061035156, "loss": 0.0136, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.608679175376892, "rewards/margins": 20.546875, "rewards/rejected": -22.15234375, "step": 7430 }, { "epoch": 1.875807518986544, "grad_norm": 0.06781815737485886, "learning_rate": 1.8503550158865476e-07, "logits/chosen": -0.5836547613143921, "logits/rejected": -0.733654797077179, "logps/chosen": -188.81094360351562, "logps/rejected": -486.0, "loss": 0.0479, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.6231200695037842, "rewards/margins": 19.719532012939453, "rewards/rejected": -21.332813262939453, "step": 7440 }, { "epoch": 1.878328553871364, "grad_norm": 0.30700716376304626, "learning_rate": 1.8432751057434438e-07, "logits/chosen": -0.691332995891571, "logits/rejected": NaN, "logps/chosen": -178.7375030517578, "logps/rejected": -468.7875061035156, "loss": 0.0561, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.21923828125, "rewards/margins": 19.881250381469727, "rewards/rejected": -21.1015625, "step": 7450 }, { "epoch": 1.8808495887561845, "grad_norm": 0.014285302720963955, "learning_rate": 1.8362008484337637e-07, "logits/chosen": -0.5404113531112671, "logits/rejected": NaN, "logps/chosen": -179.0031280517578, "logps/rejected": -492.45001220703125, "loss": 0.0212, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.013671875, "rewards/margins": 19.6015625, "rewards/rejected": -21.622655868530273, "step": 7460 }, { "epoch": 1.8833706236410046, "grad_norm": 9.284937858581543, "learning_rate": 1.8291323048499762e-07, "logits/chosen": -0.5774810910224915, "logits/rejected": -0.7581787109375, "logps/chosen": -182.97500610351562, "logps/rejected": -471.25, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.35357666015625, "rewards/margins": 20.333593368530273, "rewards/rejected": -22.684375762939453, "step": 7470 }, { "epoch": 1.885891658525825, "grad_norm": 17.006526947021484, "learning_rate": 1.8220695358353643e-07, "logits/chosen": -0.5866638422012329, "logits/rejected": -0.6140075922012329, "logps/chosen": -192.9765625, "logps/rejected": -496.48748779296875, "loss": 0.0664, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3496766090393066, "rewards/margins": 20.529687881469727, "rewards/rejected": -22.8828125, "step": 7480 }, { "epoch": 1.888412693410645, "grad_norm": 0.007912328466773033, "learning_rate": 1.815012602183506e-07, "logits/chosen": -0.600231945514679, "logits/rejected": NaN, "logps/chosen": -167.7578125, "logps/rejected": -472.20001220703125, "loss": 0.0708, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -1.9096558094024658, "rewards/margins": 19.241796493530273, "rewards/rejected": -21.149219512939453, "step": 7490 }, { "epoch": 1.8909337282954652, "grad_norm": 1.6755987405776978, "learning_rate": 1.8079615646377535e-07, "logits/chosen": -0.704907238483429, "logits/rejected": NaN, "logps/chosen": -183.94686889648438, "logps/rejected": -473.38751220703125, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3169922828674316, "rewards/margins": 19.6171875, "rewards/rejected": -21.935155868530273, "step": 7500 }, { "epoch": 1.8934547631802854, "grad_norm": 91.46187591552734, "learning_rate": 1.800916483890705e-07, "logits/chosen": -0.6366943120956421, "logits/rejected": -0.626574695110321, "logps/chosen": -177.64999389648438, "logps/rejected": -494.17498779296875, "loss": 0.1646, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.9006836414337158, "rewards/margins": 20.233983993530273, "rewards/rejected": -22.127344131469727, "step": 7510 }, { "epoch": 1.8959757980651057, "grad_norm": 0.008524438366293907, "learning_rate": 1.793877420583686e-07, "logits/chosen": -0.7119385004043579, "logits/rejected": NaN, "logps/chosen": -171.9484405517578, "logps/rejected": -487.57501220703125, "loss": 0.0541, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.3837982416152954, "rewards/margins": 20.760936737060547, "rewards/rejected": -22.15234375, "step": 7520 }, { "epoch": 1.898496832949926, "grad_norm": 1.585130214691162, "learning_rate": 1.786844435306225e-07, "logits/chosen": -0.695056140422821, "logits/rejected": NaN, "logps/chosen": -178.93905639648438, "logps/rejected": -428.1625061035156, "loss": 0.0183, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4152542352676392, "rewards/margins": 19.518749237060547, "rewards/rejected": -20.928905487060547, "step": 7530 }, { "epoch": 1.9010178678347462, "grad_norm": 4.065629005432129, "learning_rate": 1.7798175885955364e-07, "logits/chosen": -0.6947906613349915, "logits/rejected": NaN, "logps/chosen": -180.8359375, "logps/rejected": -453.7749938964844, "loss": 0.047, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.5058746337890625, "rewards/margins": 19.884765625, "rewards/rejected": -21.392187118530273, "step": 7540 }, { "epoch": 1.9035389027195664, "grad_norm": 0.6680436134338379, "learning_rate": 1.7727969409359922e-07, "logits/chosen": -0.6851593255996704, "logits/rejected": NaN, "logps/chosen": -152.3156280517578, "logps/rejected": -467.0, "loss": 0.0328, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.098443627357483, "rewards/margins": 20.483592987060547, "rewards/rejected": -21.585155487060547, "step": 7550 }, { "epoch": 1.9060599376043865, "grad_norm": 0.18277904391288757, "learning_rate": 1.7657825527586066e-07, "logits/chosen": -0.770739734172821, "logits/rejected": NaN, "logps/chosen": -180.5812530517578, "logps/rejected": -480.3999938964844, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4186187982559204, "rewards/margins": 20.110157012939453, "rewards/rejected": -21.520313262939453, "step": 7560 }, { "epoch": 1.9085809724892067, "grad_norm": 0.12441498041152954, "learning_rate": 1.7587744844405172e-07, "logits/chosen": -0.725860595703125, "logits/rejected": NaN, "logps/chosen": -179.2156219482422, "logps/rejected": -472.0, "loss": 0.0881, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8803313970565796, "rewards/margins": 19.865625381469727, "rewards/rejected": -21.755468368530273, "step": 7570 }, { "epoch": 1.911102007374027, "grad_norm": 22.828123092651367, "learning_rate": 1.7517727963044592e-07, "logits/chosen": -0.671002209186554, "logits/rejected": NaN, "logps/chosen": -176.50155639648438, "logps/rejected": -456.7875061035156, "loss": 0.0117, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.7687561511993408, "rewards/margins": 20.39453125, "rewards/rejected": -22.16015625, "step": 7580 }, { "epoch": 1.9136230422588474, "grad_norm": 3.295154571533203, "learning_rate": 1.7447775486182518e-07, "logits/chosen": -0.7874206304550171, "logits/rejected": NaN, "logps/chosen": -172.859375, "logps/rejected": -471.29998779296875, "loss": 0.0414, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.119683861732483, "rewards/margins": 20.096485137939453, "rewards/rejected": -21.216405868530273, "step": 7590 }, { "epoch": 1.9161440771436675, "grad_norm": 9.848821640014648, "learning_rate": 1.7377888015942748e-07, "logits/chosen": -0.8193267583847046, "logits/rejected": NaN, "logps/chosen": -173.5203094482422, "logps/rejected": -451.5874938964844, "loss": 0.048, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.0111510753631592, "rewards/margins": 19.619922637939453, "rewards/rejected": -20.627344131469727, "step": 7600 }, { "epoch": 1.9186651120284877, "grad_norm": 1.3826103210449219, "learning_rate": 1.7308066153889578e-07, "logits/chosen": -0.715545654296875, "logits/rejected": -0.7178894281387329, "logps/chosen": -165.5703125, "logps/rejected": -474.04998779296875, "loss": 0.0677, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8017517328262329, "rewards/margins": 19.48828125, "rewards/rejected": -20.299219131469727, "step": 7610 }, { "epoch": 1.9211861469133078, "grad_norm": 1.5975853204727173, "learning_rate": 1.7238310501022517e-07, "logits/chosen": -0.7082626223564148, "logits/rejected": -0.8083251714706421, "logps/chosen": -165.1640625, "logps/rejected": -453.2250061035156, "loss": 0.0478, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.0098083019256592, "rewards/margins": 19.0224609375, "rewards/rejected": -20.037109375, "step": 7620 }, { "epoch": 1.9237071817981282, "grad_norm": 0.5424899458885193, "learning_rate": 1.71686216577712e-07, "logits/chosen": -0.764819324016571, "logits/rejected": NaN, "logps/chosen": -154.7390594482422, "logps/rejected": -454.0562438964844, "loss": 0.0494, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.19486084580421448, "rewards/margins": 19.0546875, "rewards/rejected": -19.25, "step": 7630 }, { "epoch": 1.9262282166829483, "grad_norm": 0.15385442972183228, "learning_rate": 1.70990002239902e-07, "logits/chosen": -0.7367004156112671, "logits/rejected": -0.750720202922821, "logps/chosen": -182.3166046142578, "logps/rejected": -464.42498779296875, "loss": 0.0353, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.5466674566268921, "rewards/margins": 20.047657012939453, "rewards/rejected": -20.592578887939453, "step": 7640 }, { "epoch": 1.9287492515677687, "grad_norm": 34.09640884399414, "learning_rate": 1.7029446798953828e-07, "logits/chosen": -0.656341552734375, "logits/rejected": -0.88226318359375, "logps/chosen": -166.1593780517578, "logps/rejected": -494.1499938964844, "loss": 0.0307, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.1880981922149658, "rewards/margins": 20.294530868530273, "rewards/rejected": -21.489063262939453, "step": 7650 }, { "epoch": 1.9312702864525888, "grad_norm": 2.2534139156341553, "learning_rate": 1.6959961981351025e-07, "logits/chosen": -0.6523941159248352, "logits/rejected": NaN, "logps/chosen": -165.13125610351562, "logps/rejected": -463.57501220703125, "loss": 0.0237, "rewards/accuracies": 0.984375, "rewards/chosen": -1.314660668373108, "rewards/margins": 20.239063262939453, "rewards/rejected": -21.55078125, "step": 7660 }, { "epoch": 1.933791321337409, "grad_norm": 2.889364004135132, "learning_rate": 1.6890546369280167e-07, "logits/chosen": -0.6918319463729858, "logits/rejected": NaN, "logps/chosen": -197.18124389648438, "logps/rejected": -485.26251220703125, "loss": 0.0907, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.697845458984375, "rewards/margins": 19.506250381469727, "rewards/rejected": -21.206249237060547, "step": 7670 }, { "epoch": 1.936312356222229, "grad_norm": 1.1023322343826294, "learning_rate": 1.6821200560243963e-07, "logits/chosen": -0.6774047613143921, "logits/rejected": -0.648266613483429, "logps/chosen": -165.8156280517578, "logps/rejected": -474.79998779296875, "loss": 0.0412, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8671905994415283, "rewards/margins": 19.482030868530273, "rewards/rejected": -21.342187881469727, "step": 7680 }, { "epoch": 1.9388333911070494, "grad_norm": 11.59310531616211, "learning_rate": 1.6751925151144259e-07, "logits/chosen": -0.7368103265762329, "logits/rejected": -0.7714599370956421, "logps/chosen": -180.1984405517578, "logps/rejected": -473.54998779296875, "loss": 0.0509, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.8629639148712158, "rewards/margins": 19.21484375, "rewards/rejected": -21.0859375, "step": 7690 }, { "epoch": 1.9413544259918698, "grad_norm": 15.703198432922363, "learning_rate": 1.6682720738276918e-07, "logits/chosen": -0.8040405511856079, "logits/rejected": NaN, "logps/chosen": -190.8562469482422, "logps/rejected": -481.36248779296875, "loss": 0.087, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.9440063238143921, "rewards/margins": 20.28515625, "rewards/rejected": -21.23046875, "step": 7700 }, { "epoch": 1.94387546087669, "grad_norm": 0.46048372983932495, "learning_rate": 1.6613587917326738e-07, "logits/chosen": -0.7071990966796875, "logits/rejected": NaN, "logps/chosen": -190.4375, "logps/rejected": -474.25, "loss": 0.0603, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1200928688049316, "rewards/margins": 19.165624618530273, "rewards/rejected": -21.284374237060547, "step": 7710 }, { "epoch": 1.94639649576151, "grad_norm": 45.26466369628906, "learning_rate": 1.6544527283362237e-07, "logits/chosen": -0.6915038824081421, "logits/rejected": -0.72296142578125, "logps/chosen": -170.4765625, "logps/rejected": -469.1499938964844, "loss": 0.0532, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.8714172840118408, "rewards/margins": 20.087499618530273, "rewards/rejected": -21.9609375, "step": 7720 }, { "epoch": 1.9489175306463302, "grad_norm": 31.897815704345703, "learning_rate": 1.6475539430830604e-07, "logits/chosen": -0.7085312008857727, "logits/rejected": NaN, "logps/chosen": -188.421875, "logps/rejected": -499.2749938964844, "loss": 0.0836, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.3899903297424316, "rewards/margins": 20.053125381469727, "rewards/rejected": -22.442968368530273, "step": 7730 }, { "epoch": 1.9514385655311506, "grad_norm": 0.8322807550430298, "learning_rate": 1.640662495355253e-07, "logits/chosen": -0.706573486328125, "logits/rejected": -0.736010730266571, "logps/chosen": -189.1710968017578, "logps/rejected": -477.5375061035156, "loss": 0.0189, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6896605491638184, "rewards/margins": 20.00390625, "rewards/rejected": -22.70703125, "step": 7740 }, { "epoch": 1.9539596004159707, "grad_norm": 98.13661193847656, "learning_rate": 1.6337784444717142e-07, "logits/chosen": -0.663983166217804, "logits/rejected": NaN, "logps/chosen": -183.60311889648438, "logps/rejected": -462.8999938964844, "loss": 0.0669, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2816162109375, "rewards/margins": 18.323827743530273, "rewards/rejected": -21.60546875, "step": 7750 }, { "epoch": 1.956480635300791, "grad_norm": 15.614567756652832, "learning_rate": 1.626901849687687e-07, "logits/chosen": -0.658111572265625, "logits/rejected": -0.807403564453125, "logps/chosen": -187.6374969482422, "logps/rejected": -478.2250061035156, "loss": 0.0742, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1369872093200684, "rewards/margins": 19.396875381469727, "rewards/rejected": -22.528125762939453, "step": 7760 }, { "epoch": 1.9590016701856112, "grad_norm": 0.2808733582496643, "learning_rate": 1.6200327701942328e-07, "logits/chosen": -0.7643798589706421, "logits/rejected": -0.7845824956893921, "logps/chosen": -191.4968719482422, "logps/rejected": -488.375, "loss": 0.0629, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.6181213855743408, "rewards/margins": 20.001562118530273, "rewards/rejected": -21.616405487060547, "step": 7770 }, { "epoch": 1.9615227050704314, "grad_norm": 1.534867286682129, "learning_rate": 1.6131712651177288e-07, "logits/chosen": -0.6949402093887329, "logits/rejected": -0.690582275390625, "logps/chosen": -171.49844360351562, "logps/rejected": -459.29998779296875, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.931341528892517, "rewards/margins": 19.6328125, "rewards/rejected": -21.557031631469727, "step": 7780 }, { "epoch": 1.9640437399552515, "grad_norm": 3.118785858154297, "learning_rate": 1.6063173935193503e-07, "logits/chosen": -0.7106841802597046, "logits/rejected": NaN, "logps/chosen": -168.7531280517578, "logps/rejected": -467.1000061035156, "loss": 0.0478, "rewards/accuracies": 0.984375, "rewards/chosen": -1.744591474533081, "rewards/margins": 19.336719512939453, "rewards/rejected": -21.08203125, "step": 7790 }, { "epoch": 1.9665647748400719, "grad_norm": 70.76205444335938, "learning_rate": 1.5994712143945693e-07, "logits/chosen": -0.5911804437637329, "logits/rejected": NaN, "logps/chosen": -178.234375, "logps/rejected": -466.82501220703125, "loss": 0.0464, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.5986754894256592, "rewards/margins": 19.553125381469727, "rewards/rejected": -21.149999618530273, "step": 7800 }, { "epoch": 1.9690858097248922, "grad_norm": 0.4477299749851227, "learning_rate": 1.592632786672642e-07, "logits/chosen": -0.6996825933456421, "logits/rejected": -0.767718493938446, "logps/chosen": -168.984375, "logps/rejected": -484.07501220703125, "loss": 0.054, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.862432837486267, "rewards/margins": 19.627344131469727, "rewards/rejected": -21.498437881469727, "step": 7810 }, { "epoch": 1.9716068446097124, "grad_norm": 2.0040738582611084, "learning_rate": 1.5858021692161054e-07, "logits/chosen": -0.6574767827987671, "logits/rejected": NaN, "logps/chosen": -194.3000030517578, "logps/rejected": -450.6000061035156, "loss": 0.0706, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.0360107421875, "rewards/margins": 19.083593368530273, "rewards/rejected": -21.114063262939453, "step": 7820 }, { "epoch": 1.9741278794945325, "grad_norm": 0.01335596852004528, "learning_rate": 1.578979420820268e-07, "logits/chosen": -0.6592162847518921, "logits/rejected": NaN, "logps/chosen": -197.9656219482422, "logps/rejected": -464.8374938964844, "loss": 0.0133, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.4591002464294434, "rewards/margins": 19.51171875, "rewards/rejected": -21.97265625, "step": 7830 }, { "epoch": 1.9766489143793526, "grad_norm": 20.088245391845703, "learning_rate": 1.572164600212703e-07, "logits/chosen": -0.703601062297821, "logits/rejected": NaN, "logps/chosen": -188.83749389648438, "logps/rejected": -489.625, "loss": 0.008, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.1286377906799316, "rewards/margins": 19.869531631469727, "rewards/rejected": -22.003124237060547, "step": 7840 }, { "epoch": 1.9791699492641728, "grad_norm": 41.42198181152344, "learning_rate": 1.5653577660527474e-07, "logits/chosen": -0.781292736530304, "logits/rejected": NaN, "logps/chosen": -194.4093780517578, "logps/rejected": -478.125, "loss": 0.0427, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6494781970977783, "rewards/margins": 20.44140625, "rewards/rejected": -22.087499618530273, "step": 7850 }, { "epoch": 1.9816909841489931, "grad_norm": 2.804954767227173, "learning_rate": 1.5585589769309904e-07, "logits/chosen": -0.8326476812362671, "logits/rejected": NaN, "logps/chosen": -184.875, "logps/rejected": -496.4375, "loss": 0.007, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.2515747547149658, "rewards/margins": 20.635156631469727, "rewards/rejected": -21.882030487060547, "step": 7860 }, { "epoch": 1.9842120190338135, "grad_norm": 3.4820339679718018, "learning_rate": 1.5517682913687764e-07, "logits/chosen": -0.6216278076171875, "logits/rejected": NaN, "logps/chosen": -184.16250610351562, "logps/rejected": -479.20001220703125, "loss": 0.0563, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.8856353759765625, "rewards/margins": 20.357812881469727, "rewards/rejected": -22.236719131469727, "step": 7870 }, { "epoch": 1.9867330539186336, "grad_norm": 19.71022605895996, "learning_rate": 1.544985767817693e-07, "logits/chosen": -0.7120879888534546, "logits/rejected": NaN, "logps/chosen": -196.53750610351562, "logps/rejected": -477.1000061035156, "loss": 0.1387, "rewards/accuracies": 0.96875, "rewards/chosen": -2.2622923851013184, "rewards/margins": 20.076562881469727, "rewards/rejected": -22.338281631469727, "step": 7880 }, { "epoch": 1.9892540888034538, "grad_norm": 0.021564556285738945, "learning_rate": 1.5382114646590776e-07, "logits/chosen": -0.7777099609375, "logits/rejected": -0.8798156976699829, "logps/chosen": -199.0656280517578, "logps/rejected": -497.61248779296875, "loss": 0.0086, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.956658959388733, "rewards/margins": 20.783594131469727, "rewards/rejected": -22.7421875, "step": 7890 }, { "epoch": 1.991775123688274, "grad_norm": 0.02744057960808277, "learning_rate": 1.5314454402035055e-07, "logits/chosen": -0.8630920648574829, "logits/rejected": NaN, "logps/chosen": -185.57968139648438, "logps/rejected": -480.29998779296875, "loss": 0.0216, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.6900856494903564, "rewards/margins": 21.256250381469727, "rewards/rejected": -22.9453125, "step": 7900 }, { "epoch": 1.9942961585730943, "grad_norm": 15.757294654846191, "learning_rate": 1.5246877526902925e-07, "logits/chosen": -0.80950927734375, "logits/rejected": NaN, "logps/chosen": -174.13125610351562, "logps/rejected": -462.6000061035156, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.781396508216858, "rewards/margins": 20.399219512939453, "rewards/rejected": -22.182031631469727, "step": 7910 }, { "epoch": 1.9968171934579144, "grad_norm": 0.18647083640098572, "learning_rate": 1.5179384602869963e-07, "logits/chosen": -0.8525635004043579, "logits/rejected": NaN, "logps/chosen": -185.92813110351562, "logps/rejected": -471.17498779296875, "loss": 0.0345, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1864807605743408, "rewards/margins": 20.210546493530273, "rewards/rejected": -21.389842987060547, "step": 7920 }, { "epoch": 1.9993382283427348, "grad_norm": 0.40762361884117126, "learning_rate": 1.5111976210889093e-07, "logits/chosen": -0.785656750202179, "logits/rejected": NaN, "logps/chosen": -190.26406860351562, "logps/rejected": -493.3374938964844, "loss": 0.0412, "rewards/accuracies": 0.984375, "rewards/chosen": -1.2083160877227783, "rewards/margins": 20.9453125, "rewards/rejected": -22.157032012939453, "step": 7930 }, { "epoch": 2.0020168279078563, "grad_norm": 0.0007798176375217736, "learning_rate": 1.5044652931185647e-07, "logits/chosen": -0.8207310438156128, "logits/rejected": NaN, "logps/chosen": -189.55654907226562, "logps/rejected": -480.2738037109375, "loss": 0.0249, "rewards/accuracies": 0.9821428656578064, "rewards/chosen": -1.319225549697876, "rewards/margins": 20.701637268066406, "rewards/rejected": -22.0230655670166, "step": 7940 }, { "epoch": 2.0045378627926764, "grad_norm": 0.21533571183681488, "learning_rate": 1.4977415343252313e-07, "logits/chosen": -0.7747879028320312, "logits/rejected": -0.729931652545929, "logps/chosen": -174.8562469482422, "logps/rejected": -468.8500061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.335205078125, "rewards/margins": 21.3671875, "rewards/rejected": -22.689844131469727, "step": 7950 }, { "epoch": 2.0070588976774966, "grad_norm": 3.7052395343780518, "learning_rate": 1.4910264025844217e-07, "logits/chosen": -0.753125011920929, "logits/rejected": -0.766796886920929, "logps/chosen": -176.9187469482422, "logps/rejected": -484.4750061035156, "loss": 0.0168, "rewards/accuracies": 0.984375, "rewards/chosen": -1.4602234363555908, "rewards/margins": 20.552343368530273, "rewards/rejected": -22.014062881469727, "step": 7960 }, { "epoch": 2.0095799325623167, "grad_norm": 0.9079403281211853, "learning_rate": 1.4843199556973868e-07, "logits/chosen": -0.8482300043106079, "logits/rejected": -0.830060601234436, "logps/chosen": -194.91561889648438, "logps/rejected": -492.4750061035156, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.097253441810608, "rewards/margins": 21.444530487060547, "rewards/rejected": -22.551563262939453, "step": 7970 }, { "epoch": 2.012100967447137, "grad_norm": 1.3060338497161865, "learning_rate": 1.4776222513906216e-07, "logits/chosen": -0.7681945562362671, "logits/rejected": NaN, "logps/chosen": -189.7843780517578, "logps/rejected": -480.7749938964844, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.470977783203125, "rewards/margins": 20.692968368530273, "rewards/rejected": -22.16796875, "step": 7980 }, { "epoch": 2.0146220023319574, "grad_norm": 22.071378707885742, "learning_rate": 1.4709333473153717e-07, "logits/chosen": -0.7845550775527954, "logits/rejected": NaN, "logps/chosen": -189.171875, "logps/rejected": -498.5, "loss": 0.0068, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.397467017173767, "rewards/margins": 21.315624237060547, "rewards/rejected": -22.710155487060547, "step": 7990 }, { "epoch": 2.0171430372167776, "grad_norm": 0.0007188312010839581, "learning_rate": 1.4642533010471304e-07, "logits/chosen": -0.720611572265625, "logits/rejected": -0.9003967046737671, "logps/chosen": -193.22811889648438, "logps/rejected": -514.8875122070312, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.566253662109375, "rewards/margins": 21.8203125, "rewards/rejected": -23.382030487060547, "step": 8000 }, { "epoch": 2.0196640721015977, "grad_norm": 0.3879481852054596, "learning_rate": 1.4575821700851485e-07, "logits/chosen": -0.787628173828125, "logits/rejected": -0.7338622808456421, "logps/chosen": -185.4875030517578, "logps/rejected": -496.5249938964844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.423150658607483, "rewards/margins": 22.127344131469727, "rewards/rejected": -23.546875, "step": 8010 }, { "epoch": 2.022185106986418, "grad_norm": 4.094455242156982, "learning_rate": 1.4509200118519347e-07, "logits/chosen": -0.703021228313446, "logits/rejected": NaN, "logps/chosen": -180.96875, "logps/rejected": -503.625, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.690313696861267, "rewards/margins": 22.779687881469727, "rewards/rejected": -24.462499618530273, "step": 8020 }, { "epoch": 2.024706141871238, "grad_norm": 0.043080516159534454, "learning_rate": 1.444266883692768e-07, "logits/chosen": -0.690258800983429, "logits/rejected": NaN, "logps/chosen": -189.18124389648438, "logps/rejected": -484.36248779296875, "loss": 0.0155, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.990264892578125, "rewards/margins": 21.278125762939453, "rewards/rejected": -23.2578125, "step": 8030 }, { "epoch": 2.0272271767560586, "grad_norm": 0.07462955266237259, "learning_rate": 1.4376228428751963e-07, "logits/chosen": -0.746978759765625, "logits/rejected": -0.81854248046875, "logps/chosen": -175.14999389648438, "logps/rejected": -539.25, "loss": 0.0032, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.3858153820037842, "rewards/margins": 22.475000381469727, "rewards/rejected": -23.857812881469727, "step": 8040 }, { "epoch": 2.0297482116408787, "grad_norm": 0.14849430322647095, "learning_rate": 1.4309879465885478e-07, "logits/chosen": -0.7994934320449829, "logits/rejected": -0.716998279094696, "logps/chosen": -178.67813110351562, "logps/rejected": -493.8500061035156, "loss": 0.0089, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.586090087890625, "rewards/margins": 21.871875762939453, "rewards/rejected": -23.453907012939453, "step": 8050 }, { "epoch": 2.032269246525699, "grad_norm": 0.04362007975578308, "learning_rate": 1.4243622519434407e-07, "logits/chosen": -0.5079193115234375, "logits/rejected": NaN, "logps/chosen": -155.60311889648438, "logps/rejected": -478.13751220703125, "loss": 0.0068, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.660559058189392, "rewards/margins": 20.98046875, "rewards/rejected": -22.650781631469727, "step": 8060 }, { "epoch": 2.034790281410519, "grad_norm": 0.7638606429100037, "learning_rate": 1.4177458159712863e-07, "logits/chosen": -0.6184173822402954, "logits/rejected": NaN, "logps/chosen": -191.11563110351562, "logps/rejected": -481.6875, "loss": 0.0088, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.5751953125, "rewards/margins": 20.755468368530273, "rewards/rejected": -22.314062118530273, "step": 8070 }, { "epoch": 2.037311316295339, "grad_norm": 0.16579696536064148, "learning_rate": 1.411138695623802e-07, "logits/chosen": -0.733410656452179, "logits/rejected": -0.825976550579071, "logps/chosen": -172.1750030517578, "logps/rejected": -477.0249938964844, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -1.2306182384490967, "rewards/margins": 21.978124618530273, "rewards/rejected": -23.212499618530273, "step": 8080 }, { "epoch": 2.0398323511801593, "grad_norm": 0.028141072019934654, "learning_rate": 1.4045409477725185e-07, "logits/chosen": -0.740863025188446, "logits/rejected": -0.7755676507949829, "logps/chosen": -193.33438110351562, "logps/rejected": -493.9125061035156, "loss": 0.0092, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9249267578125, "rewards/margins": 21.936717987060547, "rewards/rejected": -23.866405487060547, "step": 8090 }, { "epoch": 2.04235338606498, "grad_norm": 0.053659792989492416, "learning_rate": 1.3979526292082938e-07, "logits/chosen": -0.71807861328125, "logits/rejected": NaN, "logps/chosen": -195.36563110351562, "logps/rejected": -516.2000122070312, "loss": 0.0054, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.07763671875, "rewards/margins": 21.35546875, "rewards/rejected": -23.424999237060547, "step": 8100 }, { "epoch": 2.0448744209498, "grad_norm": 0.08490239828824997, "learning_rate": 1.391373796640822e-07, "logits/chosen": -0.7188720703125, "logits/rejected": NaN, "logps/chosen": -159.62734985351562, "logps/rejected": -470.7250061035156, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.460546851158142, "rewards/margins": 21.6953125, "rewards/rejected": -23.159374237060547, "step": 8110 }, { "epoch": 2.04739545583462, "grad_norm": 0.3199053704738617, "learning_rate": 1.3848045066981433e-07, "logits/chosen": -0.6535888910293579, "logits/rejected": NaN, "logps/chosen": -183.63436889648438, "logps/rejected": -481.8999938964844, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6223022937774658, "rewards/margins": 21.611719131469727, "rewards/rejected": -23.22265625, "step": 8120 }, { "epoch": 2.0499164907194403, "grad_norm": 1.2857191904913634e-05, "learning_rate": 1.3782448159261617e-07, "logits/chosen": -0.6824096441268921, "logits/rejected": NaN, "logps/chosen": -179.8953094482422, "logps/rejected": -501.7124938964844, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.0194361209869385, "rewards/margins": 22.142969131469727, "rewards/rejected": -24.171875, "step": 8130 }, { "epoch": 2.0524375256042604, "grad_norm": 0.5615473985671997, "learning_rate": 1.3716947807881524e-07, "logits/chosen": -0.5925232172012329, "logits/rejected": NaN, "logps/chosen": -189.44375610351562, "logps/rejected": -507.45001220703125, "loss": 0.0062, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.508828639984131, "rewards/margins": 23.025781631469727, "rewards/rejected": -25.538280487060547, "step": 8140 }, { "epoch": 2.054958560489081, "grad_norm": 0.004191859625279903, "learning_rate": 1.3651544576642808e-07, "logits/chosen": -0.55712890625, "logits/rejected": NaN, "logps/chosen": -166.9499969482422, "logps/rejected": -501.25, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4795868396759033, "rewards/margins": 24.540624618530273, "rewards/rejected": -26.026561737060547, "step": 8150 }, { "epoch": 2.057479595373901, "grad_norm": 0.07632782310247421, "learning_rate": 1.358623902851112e-07, "logits/chosen": -0.6258575320243835, "logits/rejected": -0.7196380496025085, "logps/chosen": -176.0578155517578, "logps/rejected": -521.9249877929688, "loss": 0.0126, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4186768531799316, "rewards/margins": 22.360937118530273, "rewards/rejected": -24.768749237060547, "step": 8160 }, { "epoch": 2.0600006302587213, "grad_norm": 0.1790694147348404, "learning_rate": 1.3521031725611342e-07, "logits/chosen": -0.6497741937637329, "logits/rejected": -0.781567394733429, "logps/chosen": -187.96249389648438, "logps/rejected": -505.25, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.741711378097534, "rewards/margins": 21.752344131469727, "rewards/rejected": -24.489843368530273, "step": 8170 }, { "epoch": 2.0625216651435414, "grad_norm": 0.080039843916893, "learning_rate": 1.345592322922266e-07, "logits/chosen": -0.7708465456962585, "logits/rejected": NaN, "logps/chosen": -188.97811889648438, "logps/rejected": -495.92498779296875, "loss": 0.0042, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.875274658203125, "rewards/margins": 21.779687881469727, "rewards/rejected": -23.6484375, "step": 8180 }, { "epoch": 2.0650427000283615, "grad_norm": 0.005268774926662445, "learning_rate": 1.3390914099773773e-07, "logits/chosen": NaN, "logits/rejected": -0.7193603515625, "logps/chosen": -202.1437530517578, "logps/rejected": -515.6749877929688, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.032794237136841, "rewards/margins": 22.036718368530273, "rewards/rejected": -24.071094512939453, "step": 8190 }, { "epoch": 2.0675637349131817, "grad_norm": 0.02497304044663906, "learning_rate": 1.3326004896838096e-07, "logits/chosen": -0.6950439214706421, "logits/rejected": -0.6516205072402954, "logps/chosen": -186.04061889648438, "logps/rejected": -474.8500061035156, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.50634765625, "rewards/margins": 21.44921875, "rewards/rejected": -23.956249237060547, "step": 8200 }, { "epoch": 2.0700847697980023, "grad_norm": 32.05925750732422, "learning_rate": 1.3261196179128885e-07, "logits/chosen": -0.6463470458984375, "logits/rejected": -0.6456268429756165, "logps/chosen": -191.2312469482422, "logps/rejected": -524.2249755859375, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3506407737731934, "rewards/margins": 23.018749237060547, "rewards/rejected": -25.374217987060547, "step": 8210 }, { "epoch": 2.0726058046828224, "grad_norm": 0.17158713936805725, "learning_rate": 1.3196488504494477e-07, "logits/chosen": -0.6011260747909546, "logits/rejected": NaN, "logps/chosen": -174.35781860351562, "logps/rejected": -495.1499938964844, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.832006812095642, "rewards/margins": 22.092187881469727, "rewards/rejected": -23.915624618530273, "step": 8220 }, { "epoch": 2.0751268395676425, "grad_norm": 0.0007873151334933937, "learning_rate": 1.3131882429913449e-07, "logits/chosen": -0.645092785358429, "logits/rejected": -0.7343505620956421, "logps/chosen": -184.8234405517578, "logps/rejected": -498.7124938964844, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.006884813308716, "rewards/margins": 22.411718368530273, "rewards/rejected": -24.421875, "step": 8230 }, { "epoch": 2.0776478744524627, "grad_norm": 0.34729138016700745, "learning_rate": 1.3067378511489865e-07, "logits/chosen": -0.6878936886787415, "logits/rejected": -0.682360827922821, "logps/chosen": -173.42813110351562, "logps/rejected": -489.2875061035156, "loss": 0.0121, "rewards/accuracies": 0.984375, "rewards/chosen": -2.143084764480591, "rewards/margins": 22.290624618530273, "rewards/rejected": -24.439844131469727, "step": 8240 }, { "epoch": 2.080168909337283, "grad_norm": 0.16304218769073486, "learning_rate": 1.3002977304448477e-07, "logits/chosen": -0.5750366449356079, "logits/rejected": NaN, "logps/chosen": -172.3835906982422, "logps/rejected": -503.0, "loss": 0.008, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9543640613555908, "rewards/margins": 22.196094512939453, "rewards/rejected": -24.138280868530273, "step": 8250 }, { "epoch": 2.082689944222103, "grad_norm": 5.615628719329834, "learning_rate": 1.2938679363129896e-07, "logits/chosen": -0.5113891363143921, "logits/rejected": -0.5301758050918579, "logps/chosen": -181.0656280517578, "logps/rejected": -485.0, "loss": 0.0045, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.882580518722534, "rewards/margins": 21.908594131469727, "rewards/rejected": -24.791406631469727, "step": 8260 }, { "epoch": 2.0852109791069235, "grad_norm": 0.03226521983742714, "learning_rate": 1.287448524098591e-07, "logits/chosen": -0.6430816650390625, "logits/rejected": -0.7104827761650085, "logps/chosen": -188.4562530517578, "logps/rejected": -504.20001220703125, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.1846251487731934, "rewards/margins": 22.814844131469727, "rewards/rejected": -25.004688262939453, "step": 8270 }, { "epoch": 2.0877320139917437, "grad_norm": 0.015295448713004589, "learning_rate": 1.2810395490574637e-07, "logits/chosen": -0.6052764654159546, "logits/rejected": NaN, "logps/chosen": -205.33438110351562, "logps/rejected": -518.5999755859375, "loss": 0.0189, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.967419385910034, "rewards/margins": 22.094532012939453, "rewards/rejected": -25.046092987060547, "step": 8280 }, { "epoch": 2.090253048876564, "grad_norm": 0.08312857151031494, "learning_rate": 1.2746410663555817e-07, "logits/chosen": -0.6122649908065796, "logits/rejected": -0.6173034906387329, "logps/chosen": -204.64999389648438, "logps/rejected": -505.95001220703125, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.8070068359375, "rewards/margins": 23.782812118530273, "rewards/rejected": -26.587499618530273, "step": 8290 }, { "epoch": 2.092774083761384, "grad_norm": 0.15910691022872925, "learning_rate": 1.268253131068604e-07, "logits/chosen": -0.6361755132675171, "logits/rejected": NaN, "logps/chosen": -190.2375030517578, "logps/rejected": -486.9750061035156, "loss": 0.0057, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.653088331222534, "rewards/margins": 23.197656631469727, "rewards/rejected": -25.857812881469727, "step": 8300 }, { "epoch": 2.095295118646204, "grad_norm": 0.8201706409454346, "learning_rate": 1.261875798181404e-07, "logits/chosen": -0.6924377679824829, "logits/rejected": -0.8551269769668579, "logps/chosen": -198.8468780517578, "logps/rejected": -519.6875, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7086853981018066, "rewards/margins": 23.264842987060547, "rewards/rejected": -25.978124618530273, "step": 8310 }, { "epoch": 2.0978161535310247, "grad_norm": 1.0445002317428589, "learning_rate": 1.2555091225875912e-07, "logits/chosen": -0.6440185308456421, "logits/rejected": -0.652935802936554, "logps/chosen": -192.4093780517578, "logps/rejected": -505.17498779296875, "loss": 0.0084, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.291943311691284, "rewards/margins": 23.1015625, "rewards/rejected": -26.387500762939453, "step": 8320 }, { "epoch": 2.100337188415845, "grad_norm": 0.16155143082141876, "learning_rate": 1.2491531590890413e-07, "logits/chosen": -0.686474621295929, "logits/rejected": NaN, "logps/chosen": -209.7062530517578, "logps/rejected": -521.5, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -3.0187315940856934, "rewards/margins": 22.885156631469727, "rewards/rejected": -25.896093368530273, "step": 8330 }, { "epoch": 2.102858223300665, "grad_norm": 0.008164100348949432, "learning_rate": 1.2428079623954274e-07, "logits/chosen": -0.4913391172885895, "logits/rejected": -0.642871081829071, "logps/chosen": -186.5500030517578, "logps/rejected": -500.3500061035156, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0483031272888184, "rewards/margins": 21.978906631469727, "rewards/rejected": -25.0390625, "step": 8340 }, { "epoch": 2.105379258185485, "grad_norm": 0.35126253962516785, "learning_rate": 1.236473587123743e-07, "logits/chosen": -0.4706222414970398, "logits/rejected": -0.628125011920929, "logps/chosen": -183.38436889648438, "logps/rejected": -479.82501220703125, "loss": 0.0103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1562561988830566, "rewards/margins": 20.982812881469727, "rewards/rejected": -24.137500762939453, "step": 8350 }, { "epoch": 2.1079002930703052, "grad_norm": 0.3342233896255493, "learning_rate": 1.2301500877978353e-07, "logits/chosen": -0.5683227777481079, "logits/rejected": NaN, "logps/chosen": -184.6765594482422, "logps/rejected": -476.73748779296875, "loss": 0.0089, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.422900438308716, "rewards/margins": 21.844532012939453, "rewards/rejected": -24.25390625, "step": 8360 }, { "epoch": 2.1104213279551254, "grad_norm": 0.2802039086818695, "learning_rate": 1.2238375188479374e-07, "logits/chosen": -0.5149596929550171, "logits/rejected": NaN, "logps/chosen": -192.80624389648438, "logps/rejected": -513.1500244140625, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.006274461746216, "rewards/margins": 21.357030868530273, "rewards/rejected": -24.364843368530273, "step": 8370 }, { "epoch": 2.112942362839946, "grad_norm": 0.2585996091365814, "learning_rate": 1.217535934610196e-07, "logits/chosen": -0.5754058957099915, "logits/rejected": -0.739270031452179, "logps/chosen": -185.0656280517578, "logps/rejected": -522.7249755859375, "loss": 0.0102, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.022360324859619, "rewards/margins": 23.142969131469727, "rewards/rejected": -25.160938262939453, "step": 8380 }, { "epoch": 2.115463397724766, "grad_norm": 0.04932140186429024, "learning_rate": 1.2112453893262077e-07, "logits/chosen": -0.7499634027481079, "logits/rejected": NaN, "logps/chosen": -208.7468719482422, "logps/rejected": -523.4000244140625, "loss": 0.0048, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.924444556236267, "rewards/margins": 22.244531631469727, "rewards/rejected": -24.174999237060547, "step": 8390 }, { "epoch": 2.1179844326095862, "grad_norm": 0.16603635251522064, "learning_rate": 1.204965937142548e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -202.96875, "logps/rejected": -506.48748779296875, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7004120349884033, "rewards/margins": 22.16015625, "rewards/rejected": -24.857812881469727, "step": 8400 }, { "epoch": 2.1205054674944064, "grad_norm": 0.08768913149833679, "learning_rate": 1.1986976321103073e-07, "logits/chosen": -0.5782226324081421, "logits/rejected": -0.632336437702179, "logps/chosen": -174.10000610351562, "logps/rejected": -503.0, "loss": 0.0217, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3991668224334717, "rewards/margins": 23.78515625, "rewards/rejected": -26.178125381469727, "step": 8410 }, { "epoch": 2.1230265023792265, "grad_norm": 0.449298232793808, "learning_rate": 1.1924405281846285e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.5187530517578, "logps/rejected": -505.1625061035156, "loss": 0.0221, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3435044288635254, "rewards/margins": 21.919530868530273, "rewards/rejected": -24.255468368530273, "step": 8420 }, { "epoch": 2.1255475372640467, "grad_norm": 0.0019782735034823418, "learning_rate": 1.1861946792242372e-07, "logits/chosen": -0.5041595697402954, "logits/rejected": NaN, "logps/chosen": -183.2624969482422, "logps/rejected": -502.2749938964844, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.344558000564575, "rewards/margins": 22.135156631469727, "rewards/rejected": -24.471874237060547, "step": 8430 }, { "epoch": 2.1280685721488672, "grad_norm": 0.03849278390407562, "learning_rate": 1.1799601389909795e-07, "logits/chosen": -0.5089035034179688, "logits/rejected": -0.5583740472793579, "logps/chosen": -174.8125, "logps/rejected": -499.42498779296875, "loss": 0.0068, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.824261426925659, "rewards/margins": 22.614843368530273, "rewards/rejected": -25.440624237060547, "step": 8440 }, { "epoch": 2.1305896070336874, "grad_norm": 0.015464773401618004, "learning_rate": 1.1737369611493639e-07, "logits/chosen": -0.565869152545929, "logits/rejected": NaN, "logps/chosen": -182.609375, "logps/rejected": -522.4000244140625, "loss": 0.0063, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1770997047424316, "rewards/margins": 23.234375, "rewards/rejected": -25.407032012939453, "step": 8450 }, { "epoch": 2.1331106419185075, "grad_norm": 2.951711893081665, "learning_rate": 1.1675251992660931e-07, "logits/chosen": -0.607006847858429, "logits/rejected": NaN, "logps/chosen": -203.60000610351562, "logps/rejected": -540.4249877929688, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.4244751930236816, "rewards/margins": 23.210155487060547, "rewards/rejected": -25.637500762939453, "step": 8460 }, { "epoch": 2.1356316768033277, "grad_norm": 0.41513341665267944, "learning_rate": 1.161324906809607e-07, "logits/chosen": -0.49367064237594604, "logits/rejected": -0.752392590045929, "logps/chosen": -193.0546875, "logps/rejected": -521.5999755859375, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.170825242996216, "rewards/margins": 22.682031631469727, "rewards/rejected": -24.862499237060547, "step": 8470 }, { "epoch": 2.138152711688148, "grad_norm": 16.19542121887207, "learning_rate": 1.155136137149619e-07, "logits/chosen": -0.6715606451034546, "logits/rejected": NaN, "logps/chosen": -199.24374389648438, "logps/rejected": -534.5999755859375, "loss": 0.0039, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.073413133621216, "rewards/margins": 22.947656631469727, "rewards/rejected": -25.0234375, "step": 8480 }, { "epoch": 2.1406737465729684, "grad_norm": 2.7683212757110596, "learning_rate": 1.1489589435566627e-07, "logits/chosen": -0.6733551025390625, "logits/rejected": NaN, "logps/chosen": -198.2375030517578, "logps/rejected": -527.5250244140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -2.3383636474609375, "rewards/margins": 23.575000762939453, "rewards/rejected": -25.924999237060547, "step": 8490 }, { "epoch": 2.1431947814577885, "grad_norm": 0.032765839248895645, "learning_rate": 1.1427933792016248e-07, "logits/chosen": -0.7175918817520142, "logits/rejected": NaN, "logps/chosen": -186.203125, "logps/rejected": -504.8125, "loss": 0.0044, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.398974657058716, "rewards/margins": 21.917186737060547, "rewards/rejected": -24.307811737060547, "step": 8500 }, { "epoch": 2.1457158163426087, "grad_norm": 33.504615783691406, "learning_rate": 1.1366394971552962e-07, "logits/chosen": -0.633044421672821, "logits/rejected": NaN, "logps/chosen": -178.86874389648438, "logps/rejected": -551.4124755859375, "loss": 0.0063, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.9199645519256592, "rewards/margins": 23.69140625, "rewards/rejected": -25.600780487060547, "step": 8510 }, { "epoch": 2.148236851227429, "grad_norm": 0.14874233305454254, "learning_rate": 1.1304973503879076e-07, "logits/chosen": -0.5862762331962585, "logits/rejected": -0.6952270269393921, "logps/chosen": -186.8000030517578, "logps/rejected": -490.3500061035156, "loss": 0.0071, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.405068874359131, "rewards/margins": 23.357030868530273, "rewards/rejected": -25.7734375, "step": 8520 }, { "epoch": 2.150757886112249, "grad_norm": 0.26555824279785156, "learning_rate": 1.1243669917686797e-07, "logits/chosen": -0.5498809814453125, "logits/rejected": NaN, "logps/chosen": -184.75625610351562, "logps/rejected": -535.2999877929688, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1405272483825684, "rewards/margins": 22.5, "rewards/rejected": -25.637500762939453, "step": 8530 }, { "epoch": 2.153278920997069, "grad_norm": 0.030856024473905563, "learning_rate": 1.1182484740653636e-07, "logits/chosen": -0.667370617389679, "logits/rejected": NaN, "logps/chosen": -184.1125030517578, "logps/rejected": -509.5, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.382617235183716, "rewards/margins": 23.020313262939453, "rewards/rejected": -25.392969131469727, "step": 8540 }, { "epoch": 2.1557999558818897, "grad_norm": 0.6778072714805603, "learning_rate": 1.1121418499437881e-07, "logits/chosen": -0.5462452173233032, "logits/rejected": NaN, "logps/chosen": -192.3937530517578, "logps/rejected": -518.8375244140625, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.813633680343628, "rewards/margins": 22.840625762939453, "rewards/rejected": -25.65625, "step": 8550 }, { "epoch": 2.15832099076671, "grad_norm": 0.25990569591522217, "learning_rate": 1.1060471719674092e-07, "logits/chosen": -0.5995849370956421, "logits/rejected": -0.7560058832168579, "logps/chosen": -183.5656280517578, "logps/rejected": -510.4750061035156, "loss": 0.0074, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.5783050060272217, "rewards/margins": 23.501562118530273, "rewards/rejected": -26.0703125, "step": 8560 }, { "epoch": 2.16084202565153, "grad_norm": 1.2656493186950684, "learning_rate": 1.099964492596852e-07, "logits/chosen": -0.5207961797714233, "logits/rejected": -0.682629406452179, "logps/chosen": -185.6531219482422, "logps/rejected": -537.5750122070312, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8835082054138184, "rewards/margins": 23.459375381469727, "rewards/rejected": -26.337499618530273, "step": 8570 }, { "epoch": 2.16336306053635, "grad_norm": 0.004162727389484644, "learning_rate": 1.0938938641894635e-07, "logits/chosen": -0.628826916217804, "logits/rejected": NaN, "logps/chosen": -186.015625, "logps/rejected": -519.7249755859375, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.1372008323669434, "rewards/margins": 23.143749237060547, "rewards/rejected": -26.282812118530273, "step": 8580 }, { "epoch": 2.16588409542117, "grad_norm": 0.022089149802923203, "learning_rate": 1.087835338998862e-07, "logits/chosen": -0.5686920285224915, "logits/rejected": -0.5601135492324829, "logps/chosen": -203.9499969482422, "logps/rejected": -519.5750122070312, "loss": 0.009, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.859570264816284, "rewards/margins": 22.122655868530273, "rewards/rejected": -24.96484375, "step": 8590 }, { "epoch": 2.168405130305991, "grad_norm": 0.04163559526205063, "learning_rate": 1.0817889691744844e-07, "logits/chosen": -0.6544860601425171, "logits/rejected": -0.704925537109375, "logps/chosen": -186.19686889648438, "logps/rejected": -535.9749755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.358691453933716, "rewards/margins": 23.528905868530273, "rewards/rejected": -25.87890625, "step": 8600 }, { "epoch": 2.170926165190811, "grad_norm": 0.009220617823302746, "learning_rate": 1.0757548067611388e-07, "logits/chosen": -0.508715808391571, "logits/rejected": -0.3232177793979645, "logps/chosen": -189.5968780517578, "logps/rejected": -502.1499938964844, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": -2.10797119140625, "rewards/margins": 22.631250381469727, "rewards/rejected": -24.740625381469727, "step": 8610 }, { "epoch": 2.173447200075631, "grad_norm": 0.23059673607349396, "learning_rate": 1.0697329036985567e-07, "logits/chosen": -0.7089172601699829, "logits/rejected": NaN, "logps/chosen": -187.8125, "logps/rejected": -491.79998779296875, "loss": 0.033, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.217944383621216, "rewards/margins": 22.678125381469727, "rewards/rejected": -24.897655487060547, "step": 8620 }, { "epoch": 2.175968234960451, "grad_norm": 0.05619926005601883, "learning_rate": 1.0637233118209482e-07, "logits/chosen": -0.5822693109512329, "logits/rejected": -0.692108154296875, "logps/chosen": -182.64999389648438, "logps/rejected": -508.6499938964844, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7896454334259033, "rewards/margins": 22.895313262939453, "rewards/rejected": -25.682811737060547, "step": 8630 }, { "epoch": 2.1784892698452714, "grad_norm": 0.03413026034832001, "learning_rate": 1.0577260828565492e-07, "logits/chosen": -0.7267395257949829, "logits/rejected": NaN, "logps/chosen": -201.24374389648438, "logps/rejected": -490.6000061035156, "loss": 0.0175, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.214038133621216, "rewards/margins": 22.61328125, "rewards/rejected": -24.8359375, "step": 8640 }, { "epoch": 2.1810103047300915, "grad_norm": 0.07549922168254852, "learning_rate": 1.0517412684271856e-07, "logits/chosen": -0.7540832757949829, "logits/rejected": NaN, "logps/chosen": -197.3515625, "logps/rejected": -515.5499877929688, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9284789562225342, "rewards/margins": 22.720312118530273, "rewards/rejected": -24.650781631469727, "step": 8650 }, { "epoch": 2.183531339614912, "grad_norm": 0.06380689144134521, "learning_rate": 1.0457689200478185e-07, "logits/chosen": -0.6685150265693665, "logits/rejected": -0.785778820514679, "logps/chosen": -186.93124389648438, "logps/rejected": -507.57501220703125, "loss": 0.0205, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.010266065597534, "rewards/margins": 23.117969512939453, "rewards/rejected": -25.126562118530273, "step": 8660 }, { "epoch": 2.186052374499732, "grad_norm": 10.604216575622559, "learning_rate": 1.0398090891261105e-07, "logits/chosen": -0.6311233639717102, "logits/rejected": NaN, "logps/chosen": -213.5906219482422, "logps/rejected": -498.17498779296875, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2112548351287842, "rewards/margins": 22.981250762939453, "rewards/rejected": -24.19921875, "step": 8670 }, { "epoch": 2.1885734093845524, "grad_norm": 0.07694726437330246, "learning_rate": 1.0338618269619762e-07, "logits/chosen": -0.676373302936554, "logits/rejected": NaN, "logps/chosen": -183.3390655517578, "logps/rejected": -537.2874755859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -2.114856004714966, "rewards/margins": 22.721094131469727, "rewards/rejected": -24.83203125, "step": 8680 }, { "epoch": 2.1910944442693725, "grad_norm": 0.4385008215904236, "learning_rate": 1.0279271847471426e-07, "logits/chosen": -0.7219024896621704, "logits/rejected": NaN, "logps/chosen": -203.6531219482422, "logps/rejected": -526.6124877929688, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.71539306640625, "rewards/margins": 23.446094512939453, "rewards/rejected": -26.153125762939453, "step": 8690 }, { "epoch": 2.1936154791541926, "grad_norm": 1.8098787069320679, "learning_rate": 1.0220052135647129e-07, "logits/chosen": -0.6419128179550171, "logits/rejected": NaN, "logps/chosen": -187.78750610351562, "logps/rejected": -524.0999755859375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -2.495990037918091, "rewards/margins": 22.775781631469727, "rewards/rejected": -25.274999618530273, "step": 8700 }, { "epoch": 2.196136514039013, "grad_norm": 0.2733747959136963, "learning_rate": 1.0160959643887187e-07, "logits/chosen": -0.5561767816543579, "logits/rejected": -0.7377258539199829, "logps/chosen": -189.55624389648438, "logps/rejected": -505.29998779296875, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.7879912853240967, "rewards/margins": 23.16796875, "rewards/rejected": -25.956249237060547, "step": 8710 }, { "epoch": 2.1986575489238334, "grad_norm": 55.20503234863281, "learning_rate": 1.010199488083687e-07, "logits/chosen": -0.6082916259765625, "logits/rejected": -0.7617431879043579, "logps/chosen": -191.0625, "logps/rejected": -515.875, "loss": 0.0263, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.380593776702881, "rewards/margins": 23.111719131469727, "rewards/rejected": -25.482812881469727, "step": 8720 }, { "epoch": 2.2011785838086535, "grad_norm": 1.033888339996338, "learning_rate": 1.0043158354042027e-07, "logits/chosen": -0.5296951532363892, "logits/rejected": NaN, "logps/chosen": -184.08438110351562, "logps/rejected": -512.3875122070312, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.231781005859375, "rewards/margins": 23.122655868530273, "rewards/rejected": -25.346874237060547, "step": 8730 }, { "epoch": 2.2036996186934736, "grad_norm": 0.01579919457435608, "learning_rate": 9.984450569944672e-08, "logits/chosen": -0.618011474609375, "logits/rejected": NaN, "logps/chosen": -173.08749389648438, "logps/rejected": -495.75, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.9207947254180908, "rewards/margins": 24.477344512939453, "rewards/rejected": -26.409374237060547, "step": 8740 }, { "epoch": 2.2062206535782938, "grad_norm": 0.10896661132574081, "learning_rate": 9.925872033878662e-08, "logits/chosen": -0.548962414264679, "logits/rejected": -0.6350463628768921, "logps/chosen": -169.9031219482422, "logps/rejected": -494.04998779296875, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.085662841796875, "rewards/margins": 23.21875, "rewards/rejected": -25.303125381469727, "step": 8750 }, { "epoch": 2.208741688463114, "grad_norm": 0.034469153732061386, "learning_rate": 9.867423250065332e-08, "logits/chosen": -0.6638580560684204, "logits/rejected": NaN, "logps/chosen": -182.9093780517578, "logps/rejected": -500.17498779296875, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.859472632408142, "rewards/margins": 23.27734375, "rewards/rejected": -25.146093368530273, "step": 8760 }, { "epoch": 2.2112627233479345, "grad_norm": 0.01249816082417965, "learning_rate": 9.809104721609182e-08, "logits/chosen": -0.572009265422821, "logits/rejected": NaN, "logps/chosen": -170.8718719482422, "logps/rejected": -501.875, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.341552734375, "rewards/margins": 22.319530487060547, "rewards/rejected": -24.664844512939453, "step": 8770 }, { "epoch": 2.2137837582327546, "grad_norm": 1.2210211753845215, "learning_rate": 9.75091695049349e-08, "logits/chosen": -0.5610198974609375, "logits/rejected": NaN, "logps/chosen": -177.60000610351562, "logps/rejected": -504.1499938964844, "loss": 0.0112, "rewards/accuracies": 0.984375, "rewards/chosen": -1.839324951171875, "rewards/margins": 23.157032012939453, "rewards/rejected": -24.993749618530273, "step": 8780 }, { "epoch": 2.216304793117575, "grad_norm": 0.004287341143935919, "learning_rate": 9.692860437576061e-08, "logits/chosen": -0.664581298828125, "logits/rejected": NaN, "logps/chosen": -183.04922485351562, "logps/rejected": -511.54998779296875, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.349679470062256, "rewards/margins": 22.176563262939453, "rewards/rejected": -24.522655487060547, "step": 8790 }, { "epoch": 2.218825828002395, "grad_norm": 0.10356592386960983, "learning_rate": 9.634935682584846e-08, "logits/chosen": -0.6716705560684204, "logits/rejected": NaN, "logps/chosen": -176.23281860351562, "logps/rejected": -505.5874938964844, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.502752661705017, "rewards/margins": 22.940624237060547, "rewards/rejected": -24.44140625, "step": 8800 }, { "epoch": 2.221346862887215, "grad_norm": 0.05902937799692154, "learning_rate": 9.577143184113711e-08, "logits/chosen": -0.706036388874054, "logits/rejected": -0.575122058391571, "logps/chosen": -198.6218719482422, "logps/rejected": -512.5750122070312, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3118042945861816, "rewards/margins": 22.766407012939453, "rewards/rejected": -25.078125, "step": 8810 }, { "epoch": 2.2238678977720356, "grad_norm": 0.016209879890084267, "learning_rate": 9.519483439618075e-08, "logits/chosen": -0.6571624875068665, "logits/rejected": NaN, "logps/chosen": -198.33438110351562, "logps/rejected": -509.98748779296875, "loss": 0.01, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1527771949768066, "rewards/margins": 22.4609375, "rewards/rejected": -24.61328125, "step": 8820 }, { "epoch": 2.226388932656856, "grad_norm": 2.1198272705078125, "learning_rate": 9.461956945410676e-08, "logits/chosen": -0.7927703857421875, "logits/rejected": -0.6424010992050171, "logps/chosen": -193.265625, "logps/rejected": -494.88751220703125, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.7798340320587158, "rewards/margins": 23.357030868530273, "rewards/rejected": -25.13671875, "step": 8830 }, { "epoch": 2.228909967541676, "grad_norm": 0.0001365284260828048, "learning_rate": 9.404564196657298e-08, "logits/chosen": -0.680499255657196, "logits/rejected": -0.67327880859375, "logps/chosen": -197.5500030517578, "logps/rejected": -503.6499938964844, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7637786865234375, "rewards/margins": 23.039844512939453, "rewards/rejected": -24.795312881469727, "step": 8840 }, { "epoch": 2.231431002426496, "grad_norm": 5.577780723571777, "learning_rate": 9.347305687372475e-08, "logits/chosen": -0.720446765422821, "logits/rejected": NaN, "logps/chosen": -190.1531219482422, "logps/rejected": -521.8875122070312, "loss": 0.0071, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6292541027069092, "rewards/margins": 22.396875381469727, "rewards/rejected": -24.029687881469727, "step": 8850 }, { "epoch": 2.233952037311316, "grad_norm": 6.6063313484191895, "learning_rate": 9.290181910415263e-08, "logits/chosen": -0.6093170046806335, "logits/rejected": NaN, "logps/chosen": -192.66250610351562, "logps/rejected": -506.42498779296875, "loss": 0.0038, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7713959217071533, "rewards/margins": 22.517187118530273, "rewards/rejected": -24.295312881469727, "step": 8860 }, { "epoch": 2.2364730721961363, "grad_norm": 0.020197290927171707, "learning_rate": 9.233193357485014e-08, "logits/chosen": -0.528124988079071, "logits/rejected": NaN, "logps/chosen": -187.2109375, "logps/rejected": -500.25, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.128521680831909, "rewards/margins": 22.759374618530273, "rewards/rejected": -24.889842987060547, "step": 8870 }, { "epoch": 2.238994107080957, "grad_norm": 0.13371047377586365, "learning_rate": 9.176340519117106e-08, "logits/chosen": -0.6137065887451172, "logits/rejected": NaN, "logps/chosen": -202.7421875, "logps/rejected": -501.32501220703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -0.7882728576660156, "rewards/margins": 23.3125, "rewards/rejected": -24.1015625, "step": 8880 }, { "epoch": 2.241515141965777, "grad_norm": 0.05830936133861542, "learning_rate": 9.11962388467874e-08, "logits/chosen": NaN, "logits/rejected": -0.671429455280304, "logps/chosen": -181.75, "logps/rejected": -512.2000122070312, "loss": 0.0086, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.131091356277466, "rewards/margins": 22.143749237060547, "rewards/rejected": -24.276561737060547, "step": 8890 }, { "epoch": 2.244036176850597, "grad_norm": 0.7780137658119202, "learning_rate": 9.063043942364717e-08, "logits/chosen": -0.660369873046875, "logits/rejected": -0.678942859172821, "logps/chosen": -197.00936889648438, "logps/rejected": -505.20001220703125, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2073731422424316, "rewards/margins": 21.897655487060547, "rewards/rejected": -24.119531631469727, "step": 8900 }, { "epoch": 2.2465572117354173, "grad_norm": 1.135434627532959, "learning_rate": 9.006601179193283e-08, "logits/chosen": -0.542193591594696, "logits/rejected": NaN, "logps/chosen": -180.0281219482422, "logps/rejected": -474.1000061035156, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7533447742462158, "rewards/margins": 22.120311737060547, "rewards/rejected": -23.872655868530273, "step": 8910 }, { "epoch": 2.2490782466202375, "grad_norm": 0.0330791138112545, "learning_rate": 8.950296081001846e-08, "logits/chosen": -0.564404308795929, "logits/rejected": NaN, "logps/chosen": -186.1843719482422, "logps/rejected": -504.67498779296875, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.043069362640381, "rewards/margins": 22.571094512939453, "rewards/rejected": -24.618749618530273, "step": 8920 }, { "epoch": 2.251599281505058, "grad_norm": 0.10023608058691025, "learning_rate": 8.894129132442898e-08, "logits/chosen": -0.647290050983429, "logits/rejected": NaN, "logps/chosen": -182.22811889648438, "logps/rejected": -483.11248779296875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.8515503406524658, "rewards/margins": 21.87109375, "rewards/rejected": -23.723438262939453, "step": 8930 }, { "epoch": 2.254120316389878, "grad_norm": 0.9761331677436829, "learning_rate": 8.838100816979751e-08, "logits/chosen": -0.5711914300918579, "logits/rejected": NaN, "logps/chosen": -182.1593780517578, "logps/rejected": -513.375, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9016845226287842, "rewards/margins": 22.97265625, "rewards/rejected": -24.875, "step": 8940 }, { "epoch": 2.2566413512746983, "grad_norm": 0.7104736566543579, "learning_rate": 8.782211616882451e-08, "logits/chosen": -0.7064193487167358, "logits/rejected": -0.7240356206893921, "logps/chosen": -203.99374389648438, "logps/rejected": -507.82501220703125, "loss": 0.0271, "rewards/accuracies": 0.984375, "rewards/chosen": -1.7844970226287842, "rewards/margins": 22.178125381469727, "rewards/rejected": -23.96484375, "step": 8950 }, { "epoch": 2.2591623861595185, "grad_norm": 0.11162414401769638, "learning_rate": 8.726462013223568e-08, "logits/chosen": -0.6090179681777954, "logits/rejected": NaN, "logps/chosen": -192.02188110351562, "logps/rejected": -499.0625, "loss": 0.0133, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.339123487472534, "rewards/margins": 21.965625762939453, "rewards/rejected": -24.293750762939453, "step": 8960 }, { "epoch": 2.2616834210443386, "grad_norm": 18.811843872070312, "learning_rate": 8.67085248587408e-08, "logits/chosen": -0.724151611328125, "logits/rejected": NaN, "logps/chosen": -204.63436889648438, "logps/rejected": -511.29998779296875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -2.4880127906799316, "rewards/margins": 22.55859375, "rewards/rejected": -25.056249618530273, "step": 8970 }, { "epoch": 2.2642044559291588, "grad_norm": 0.25828278064727783, "learning_rate": 8.615383513499271e-08, "logits/chosen": -0.6452270746231079, "logits/rejected": -0.7155914306640625, "logps/chosen": -183.9656219482422, "logps/rejected": -512.75, "loss": 0.0062, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9232330322265625, "rewards/margins": 22.575000762939453, "rewards/rejected": -24.510936737060547, "step": 8980 }, { "epoch": 2.266725490813979, "grad_norm": 0.23100516200065613, "learning_rate": 8.56005557355455e-08, "logits/chosen": -0.700836181640625, "logits/rejected": -0.7312591671943665, "logps/chosen": -179.83749389648438, "logps/rejected": -510.25, "loss": 0.0192, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8591216802597046, "rewards/margins": 23.002344131469727, "rewards/rejected": -24.868749618530273, "step": 8990 }, { "epoch": 2.2692465256987995, "grad_norm": 0.3243653178215027, "learning_rate": 8.50486914228138e-08, "logits/chosen": -0.7167205810546875, "logits/rejected": -0.590344250202179, "logps/chosen": -183.9718780517578, "logps/rejected": -503.17498779296875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.4675781726837158, "rewards/margins": 22.69921875, "rewards/rejected": -24.171875, "step": 9000 }, { "epoch": 2.2717675605836196, "grad_norm": 0.1270705908536911, "learning_rate": 8.449824694703192e-08, "logits/chosen": -0.574169933795929, "logits/rejected": NaN, "logps/chosen": -176.27108764648438, "logps/rejected": -507.8125, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.8175475597381592, "rewards/margins": 23.132030487060547, "rewards/rejected": -24.958593368530273, "step": 9010 }, { "epoch": 2.2742885954684398, "grad_norm": 0.11125287413597107, "learning_rate": 8.39492270462126e-08, "logits/chosen": -0.5930755734443665, "logits/rejected": NaN, "logps/chosen": -187.375, "logps/rejected": -498.17498779296875, "loss": 0.0152, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8370177745819092, "rewards/margins": 23.028125762939453, "rewards/rejected": -24.875782012939453, "step": 9020 }, { "epoch": 2.27680963035326, "grad_norm": 0.03120291233062744, "learning_rate": 8.340163644610634e-08, "logits/chosen": -0.5126724243164062, "logits/rejected": NaN, "logps/chosen": -182.2062530517578, "logps/rejected": -492.17498779296875, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.794342041015625, "rewards/margins": 23.456249237060547, "rewards/rejected": -25.251562118530273, "step": 9030 }, { "epoch": 2.2793306652380805, "grad_norm": 1.7625787258148193, "learning_rate": 8.285547986016081e-08, "logits/chosen": -0.33406829833984375, "logits/rejected": NaN, "logps/chosen": -173.7703094482422, "logps/rejected": -496.54998779296875, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7865722179412842, "rewards/margins": 22.692968368530273, "rewards/rejected": -24.475000381469727, "step": 9040 }, { "epoch": 2.2818517001229006, "grad_norm": 0.041448939591646194, "learning_rate": 8.231076198948044e-08, "logits/chosen": -0.665087878704071, "logits/rejected": NaN, "logps/chosen": -189.36874389648438, "logps/rejected": -546.8250122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.704620361328125, "rewards/margins": 22.796875, "rewards/rejected": -25.493749618530273, "step": 9050 }, { "epoch": 2.2843727350077208, "grad_norm": 0.5553317666053772, "learning_rate": 8.176748752278543e-08, "logits/chosen": -0.6299270391464233, "logits/rejected": NaN, "logps/chosen": -190.9718780517578, "logps/rejected": -531.3875122070312, "loss": 0.0122, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.2134201526641846, "rewards/margins": 23.452342987060547, "rewards/rejected": -25.6640625, "step": 9060 }, { "epoch": 2.286893769892541, "grad_norm": 0.09391612559556961, "learning_rate": 8.122566113637203e-08, "logits/chosen": -0.5599304437637329, "logits/rejected": NaN, "logps/chosen": -187.94375610351562, "logps/rejected": -496.125, "loss": 0.0118, "rewards/accuracies": 0.984375, "rewards/chosen": -2.6195311546325684, "rewards/margins": 22.7421875, "rewards/rejected": -25.359375, "step": 9070 }, { "epoch": 2.289414804777361, "grad_norm": 0.19259902834892273, "learning_rate": 8.068528749407169e-08, "logits/chosen": -0.5530120730400085, "logits/rejected": -0.753265380859375, "logps/chosen": -187.42813110351562, "logps/rejected": -520.0250244140625, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.7335236072540283, "rewards/margins": 23.114843368530273, "rewards/rejected": -25.854686737060547, "step": 9080 }, { "epoch": 2.291935839662181, "grad_norm": 0.2742903232574463, "learning_rate": 8.014637124721149e-08, "logits/chosen": -0.5449981689453125, "logits/rejected": -0.5865997076034546, "logps/chosen": -195.77499389648438, "logps/rejected": -516.5750122070312, "loss": 0.0308, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7935729026794434, "rewards/margins": 23.119531631469727, "rewards/rejected": -25.90625, "step": 9090 }, { "epoch": 2.2944568745470013, "grad_norm": 0.5204123854637146, "learning_rate": 7.960891703457362e-08, "logits/chosen": -0.714007556438446, "logits/rejected": -0.6059325933456421, "logps/chosen": -197.61563110351562, "logps/rejected": -516.1749877929688, "loss": 0.0032, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7585632801055908, "rewards/margins": 24.014062881469727, "rewards/rejected": -25.768749237060547, "step": 9100 }, { "epoch": 2.296977909431822, "grad_norm": 0.07295586168766022, "learning_rate": 7.907292948235555e-08, "logits/chosen": -0.5487915277481079, "logits/rejected": -0.732189953327179, "logps/chosen": -194.02188110351562, "logps/rejected": -516.5750122070312, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4027037620544434, "rewards/margins": 22.305469512939453, "rewards/rejected": -24.711719512939453, "step": 9110 }, { "epoch": 2.299498944316642, "grad_norm": 0.2045769989490509, "learning_rate": 7.853841320413065e-08, "logits/chosen": -0.5667877197265625, "logits/rejected": NaN, "logps/chosen": -169.17813110351562, "logps/rejected": -500.0249938964844, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.823095679283142, "rewards/margins": 23.303905487060547, "rewards/rejected": -25.126562118530273, "step": 9120 }, { "epoch": 2.302019979201462, "grad_norm": 0.00011334159353282303, "learning_rate": 7.800537280080785e-08, "logits/chosen": -0.617382824420929, "logits/rejected": NaN, "logps/chosen": -212.2937469482422, "logps/rejected": -525.2999877929688, "loss": 0.0246, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2810959815979004, "rewards/margins": 22.694530487060547, "rewards/rejected": -24.978124618530273, "step": 9130 }, { "epoch": 2.3045410140862823, "grad_norm": 0.0010294439271092415, "learning_rate": 7.747381286059232e-08, "logits/chosen": -0.5419853329658508, "logits/rejected": NaN, "logps/chosen": -186.33438110351562, "logps/rejected": -493.875, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.6976866722106934, "rewards/margins": 22.521875381469727, "rewards/rejected": -25.23046875, "step": 9140 }, { "epoch": 2.3070620489711025, "grad_norm": 3.7995963096618652, "learning_rate": 7.694373795894621e-08, "logits/chosen": -0.5619567632675171, "logits/rejected": NaN, "logps/chosen": -180.08749389648438, "logps/rejected": -488.0249938964844, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4690704345703125, "rewards/margins": 23.006250381469727, "rewards/rejected": -25.481250762939453, "step": 9150 }, { "epoch": 2.309583083855923, "grad_norm": 0.003494368167594075, "learning_rate": 7.641515265854882e-08, "logits/chosen": -0.5179382562637329, "logits/rejected": NaN, "logps/chosen": -188.7781219482422, "logps/rejected": -515.875, "loss": 0.0071, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7904937267303467, "rewards/margins": 23.56640625, "rewards/rejected": -26.353124618530273, "step": 9160 }, { "epoch": 2.312104118740743, "grad_norm": 1.0984117984771729, "learning_rate": 7.588806150925755e-08, "logits/chosen": -0.5441681146621704, "logits/rejected": NaN, "logps/chosen": -203.04061889648438, "logps/rejected": -523.6375122070312, "loss": 0.0081, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.190502882003784, "rewards/margins": 23.460155487060547, "rewards/rejected": -25.653125762939453, "step": 9170 }, { "epoch": 2.3146251536255633, "grad_norm": 0.0009259980288334191, "learning_rate": 7.536246904806878e-08, "logits/chosen": -0.6914413571357727, "logits/rejected": NaN, "logps/chosen": -198.5, "logps/rejected": -535.2999877929688, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.721881151199341, "rewards/margins": 23.245311737060547, "rewards/rejected": -25.961719512939453, "step": 9180 }, { "epoch": 2.3171461885103835, "grad_norm": 0.0012849702034145594, "learning_rate": 7.483837979907886e-08, "logits/chosen": -0.5592986941337585, "logits/rejected": -0.577288806438446, "logps/chosen": -180.2687530517578, "logps/rejected": -505.86248779296875, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7396011352539062, "rewards/margins": 23.48828125, "rewards/rejected": -26.232812881469727, "step": 9190 }, { "epoch": 2.3196672233952036, "grad_norm": 1.3943421840667725, "learning_rate": 7.431579827344486e-08, "logits/chosen": -0.6082671880722046, "logits/rejected": -0.7104126214981079, "logps/chosen": -182.07968139648438, "logps/rejected": -513.2249755859375, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.190478563308716, "rewards/margins": 23.057031631469727, "rewards/rejected": -25.248437881469727, "step": 9200 }, { "epoch": 2.3221882582800237, "grad_norm": 2.043091297149658, "learning_rate": 7.379472896934619e-08, "logits/chosen": -0.4491821229457855, "logits/rejected": NaN, "logps/chosen": -176.29061889648438, "logps/rejected": -502.6000061035156, "loss": 0.0286, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3286499977111816, "rewards/margins": 22.8203125, "rewards/rejected": -25.134374618530273, "step": 9210 }, { "epoch": 2.3247092931648443, "grad_norm": 0.009395342320203781, "learning_rate": 7.327517637194535e-08, "logits/chosen": NaN, "logits/rejected": -0.70745849609375, "logps/chosen": -190.9250030517578, "logps/rejected": -538.0750122070312, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.6370606422424316, "rewards/margins": 23.185937881469727, "rewards/rejected": -25.831249237060547, "step": 9220 }, { "epoch": 2.3272303280496645, "grad_norm": 1.8139564990997314, "learning_rate": 7.275714495334997e-08, "logits/chosen": -0.6340252161026001, "logits/rejected": NaN, "logps/chosen": -174.4968719482422, "logps/rejected": -501.4624938964844, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.6552491188049316, "rewards/margins": 22.52734375, "rewards/rejected": -25.184375762939453, "step": 9230 }, { "epoch": 2.3297513629344846, "grad_norm": 0.011351453140377998, "learning_rate": 7.224063917257369e-08, "logits/chosen": -0.5819427371025085, "logits/rejected": -0.600238025188446, "logps/chosen": -202.43905639648438, "logps/rejected": -528.9500122070312, "loss": 0.0028, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.610034227371216, "rewards/margins": 22.8359375, "rewards/rejected": -25.453907012939453, "step": 9240 }, { "epoch": 2.3322723978193047, "grad_norm": 0.016216302290558815, "learning_rate": 7.172566347549808e-08, "logits/chosen": -0.571789562702179, "logits/rejected": NaN, "logps/chosen": -179.0500030517578, "logps/rejected": -537.5750122070312, "loss": 0.0049, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.239349365234375, "rewards/margins": 22.971874237060547, "rewards/rejected": -25.203125, "step": 9250 }, { "epoch": 2.334793432704125, "grad_norm": 0.0023681053426116705, "learning_rate": 7.12122222948345e-08, "logits/chosen": -0.5828277468681335, "logits/rejected": -0.595843493938446, "logps/chosen": -187.9250030517578, "logps/rejected": -497.2250061035156, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.597912549972534, "rewards/margins": 23.193750381469727, "rewards/rejected": -25.792186737060547, "step": 9260 }, { "epoch": 2.3373144675889455, "grad_norm": 0.002917754463851452, "learning_rate": 7.070032005008567e-08, "logits/chosen": -0.5151001214981079, "logits/rejected": NaN, "logps/chosen": -179.63125610351562, "logps/rejected": -527.375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -2.075030565261841, "rewards/margins": 23.696094512939453, "rewards/rejected": -25.770313262939453, "step": 9270 }, { "epoch": 2.3398355024737656, "grad_norm": 0.008216632530093193, "learning_rate": 7.018996114750766e-08, "logits/chosen": -0.6895080804824829, "logits/rejected": NaN, "logps/chosen": -220.76718139648438, "logps/rejected": -503.07501220703125, "loss": 0.0045, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7183929681777954, "rewards/margins": 23.053125381469727, "rewards/rejected": -24.767969131469727, "step": 9280 }, { "epoch": 2.3423565373585857, "grad_norm": 5.763053894042969, "learning_rate": 6.968114998007232e-08, "logits/chosen": -0.602343738079071, "logits/rejected": -0.66259765625, "logps/chosen": -188.12344360351562, "logps/rejected": -521.7750244140625, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.663903832435608, "rewards/margins": 23.198436737060547, "rewards/rejected": -24.861719131469727, "step": 9290 }, { "epoch": 2.344877572243406, "grad_norm": 0.04956959933042526, "learning_rate": 6.917389092742893e-08, "logits/chosen": -0.6860504150390625, "logits/rejected": NaN, "logps/chosen": -187.02188110351562, "logps/rejected": -524.3375244140625, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8054535388946533, "rewards/margins": 23.092967987060547, "rewards/rejected": -24.905467987060547, "step": 9300 }, { "epoch": 2.347398607128226, "grad_norm": 0.1562097668647766, "learning_rate": 6.866818835586687e-08, "logits/chosen": -0.633105456829071, "logits/rejected": -0.729400634765625, "logps/chosen": -178.5703125, "logps/rejected": -502.04998779296875, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9486877918243408, "rewards/margins": 23.409374237060547, "rewards/rejected": -25.356250762939453, "step": 9310 }, { "epoch": 2.349919642013046, "grad_norm": 0.0035679463762789965, "learning_rate": 6.816404661827785e-08, "logits/chosen": -0.583508312702179, "logits/rejected": NaN, "logps/chosen": -189.4656219482422, "logps/rejected": -513.1500244140625, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.213531494140625, "rewards/margins": 23.40234375, "rewards/rejected": -25.6171875, "step": 9320 }, { "epoch": 2.3524406768978667, "grad_norm": 0.0404689759016037, "learning_rate": 6.766147005411879e-08, "logits/chosen": -0.5824432373046875, "logits/rejected": NaN, "logps/chosen": -186.8937530517578, "logps/rejected": -502.79998779296875, "loss": 0.0109, "rewards/accuracies": 0.984375, "rewards/chosen": -2.384045362472534, "rewards/margins": 22.985937118530273, "rewards/rejected": -25.372655868530273, "step": 9330 }, { "epoch": 2.354961711782687, "grad_norm": 1.1833690404891968, "learning_rate": 6.716046298937384e-08, "logits/chosen": -0.678753674030304, "logits/rejected": NaN, "logps/chosen": -186.4187469482422, "logps/rejected": -530.2249755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.418949842453003, "rewards/margins": 24.107812881469727, "rewards/rejected": -25.529687881469727, "step": 9340 }, { "epoch": 2.357482746667507, "grad_norm": 0.0053831059485673904, "learning_rate": 6.666102973651782e-08, "logits/chosen": -0.5808990597724915, "logits/rejected": NaN, "logps/chosen": -184.3359375, "logps/rejected": -509.8999938964844, "loss": 0.0074, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0275511741638184, "rewards/margins": 23.571874618530273, "rewards/rejected": -25.604686737060547, "step": 9350 }, { "epoch": 2.360003781552327, "grad_norm": 0.022121619433164597, "learning_rate": 6.616317459447851e-08, "logits/chosen": -0.5590850710868835, "logits/rejected": NaN, "logps/chosen": -173.2624969482422, "logps/rejected": -500.42498779296875, "loss": 0.0116, "rewards/accuracies": 0.984375, "rewards/chosen": -2.087658643722534, "rewards/margins": 23.045312881469727, "rewards/rejected": -25.131250381469727, "step": 9360 }, { "epoch": 2.3625248164371473, "grad_norm": 0.001288356026634574, "learning_rate": 6.566690184860028e-08, "logits/chosen": -0.5484253168106079, "logits/rejected": NaN, "logps/chosen": -192.1374969482422, "logps/rejected": -516.2249755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -2.6998748779296875, "rewards/margins": 24.121875762939453, "rewards/rejected": -26.815624237060547, "step": 9370 }, { "epoch": 2.365045851321968, "grad_norm": 0.00407898984849453, "learning_rate": 6.517221577060644e-08, "logits/chosen": -0.4794921875, "logits/rejected": NaN, "logps/chosen": -177.7062530517578, "logps/rejected": -542.4500122070312, "loss": 0.0174, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1601715087890625, "rewards/margins": 24.332813262939453, "rewards/rejected": -27.490625381469727, "step": 9380 }, { "epoch": 2.367566886206788, "grad_norm": 0.04257339984178543, "learning_rate": 6.46791206185631e-08, "logits/chosen": -0.5895843505859375, "logits/rejected": NaN, "logps/chosen": -202.44375610351562, "logps/rejected": -526.7625122070312, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.632946729660034, "rewards/margins": 24.439062118530273, "rewards/rejected": -27.060155868530273, "step": 9390 }, { "epoch": 2.370087921091608, "grad_norm": 0.06470364332199097, "learning_rate": 6.418762063684239e-08, "logits/chosen": -0.530133068561554, "logits/rejected": NaN, "logps/chosen": -188.29061889648438, "logps/rejected": -526.5499877929688, "loss": 0.013, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7549071311950684, "rewards/margins": 23.928905487060547, "rewards/rejected": -26.693750381469727, "step": 9400 }, { "epoch": 2.3726089559764283, "grad_norm": 0.006570747122168541, "learning_rate": 6.36977200560856e-08, "logits/chosen": -0.5714279413223267, "logits/rejected": -0.5997070074081421, "logps/chosen": -199.890625, "logps/rejected": -537.1500244140625, "loss": 0.008, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.62274169921875, "rewards/margins": 24.157032012939453, "rewards/rejected": -26.787500381469727, "step": 9410 }, { "epoch": 2.3751299908612484, "grad_norm": 0.001034194603562355, "learning_rate": 6.320942309316704e-08, "logits/chosen": -0.47511976957321167, "logits/rejected": NaN, "logps/chosen": -170.69375610351562, "logps/rejected": -487.17498779296875, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.5653929710388184, "rewards/margins": 22.994531631469727, "rewards/rejected": -25.560155868530273, "step": 9420 }, { "epoch": 2.3776510257460686, "grad_norm": 0.028816763311624527, "learning_rate": 6.272273395115794e-08, "logits/chosen": -0.645434558391571, "logits/rejected": NaN, "logps/chosen": -197.00936889648438, "logps/rejected": -543.9249877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.11199951171875, "rewards/margins": 23.655467987060547, "rewards/rejected": -25.768749237060547, "step": 9430 }, { "epoch": 2.380172060630889, "grad_norm": 0.001248158048838377, "learning_rate": 6.223765681928977e-08, "logits/chosen": -0.556079089641571, "logits/rejected": NaN, "logps/chosen": -186.52499389648438, "logps/rejected": -548.625, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.9823486804962158, "rewards/margins": 24.129688262939453, "rewards/rejected": -26.109375, "step": 9440 }, { "epoch": 2.3826930955157093, "grad_norm": 0.2244897335767746, "learning_rate": 6.175419587291853e-08, "logits/chosen": -0.695361316204071, "logits/rejected": -0.718151867389679, "logps/chosen": -203.86563110351562, "logps/rejected": -524.4749755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.8167724609375, "rewards/margins": 22.266407012939453, "rewards/rejected": -25.087499618530273, "step": 9450 }, { "epoch": 2.3852141304005294, "grad_norm": 30.898473739624023, "learning_rate": 6.127235527348862e-08, "logits/chosen": -0.6272033452987671, "logits/rejected": NaN, "logps/chosen": -193.59375, "logps/rejected": -510.57501220703125, "loss": 0.0133, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.842858910560608, "rewards/margins": 23.876562118530273, "rewards/rejected": -25.714061737060547, "step": 9460 }, { "epoch": 2.3877351652853496, "grad_norm": 20.018821716308594, "learning_rate": 6.079213916849737e-08, "logits/chosen": -0.6664932370185852, "logits/rejected": NaN, "logps/chosen": -201.6437530517578, "logps/rejected": -486.875, "loss": 0.0115, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.902783155441284, "rewards/margins": 22.385936737060547, "rewards/rejected": -25.297657012939453, "step": 9470 }, { "epoch": 2.3902562001701697, "grad_norm": 0.34252485632896423, "learning_rate": 6.031355169145882e-08, "logits/chosen": -0.66387939453125, "logits/rejected": -0.6833130121231079, "logps/chosen": -191.8359375, "logps/rejected": -512.7999877929688, "loss": 0.008, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.445269823074341, "rewards/margins": 22.603124618530273, "rewards/rejected": -25.042186737060547, "step": 9480 }, { "epoch": 2.3927772350549903, "grad_norm": 0.05983949080109596, "learning_rate": 5.983659696186868e-08, "logits/chosen": -0.548907458782196, "logits/rejected": -0.70849609375, "logps/chosen": -180.2429656982422, "logps/rejected": -528.5, "loss": 0.0062, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1622557640075684, "rewards/margins": 23.868749618530273, "rewards/rejected": -26.034374237060547, "step": 9490 }, { "epoch": 2.3952982699398104, "grad_norm": 0.47095534205436707, "learning_rate": 5.9361279085168274e-08, "logits/chosen": -0.66558837890625, "logits/rejected": -0.73175048828125, "logps/chosen": -175.90780639648438, "logps/rejected": -543.5, "loss": 0.0098, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9923279285430908, "rewards/margins": 23.014842987060547, "rewards/rejected": -25.00390625, "step": 9500 }, { "epoch": 2.3978193048246306, "grad_norm": 0.1909724324941635, "learning_rate": 5.888760215270988e-08, "logits/chosen": -0.5846801996231079, "logits/rejected": -0.6305481195449829, "logps/chosen": -163.73672485351562, "logps/rejected": -497.26251220703125, "loss": 0.0123, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.707940697669983, "rewards/margins": 22.706249237060547, "rewards/rejected": -24.413280487060547, "step": 9510 }, { "epoch": 2.4003403397094507, "grad_norm": 27.151918411254883, "learning_rate": 5.8415570241720916e-08, "logits/chosen": -0.5949646234512329, "logits/rejected": NaN, "logps/chosen": -187.8859405517578, "logps/rejected": -526.8250122070312, "loss": 0.0191, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6605148315429688, "rewards/margins": 22.90234375, "rewards/rejected": -24.5625, "step": 9520 }, { "epoch": 2.402861374594271, "grad_norm": 4.941977500915527, "learning_rate": 5.7945187415269076e-08, "logits/chosen": -0.623199462890625, "logits/rejected": NaN, "logps/chosen": -192.078125, "logps/rejected": -503.2250061035156, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.744354248046875, "rewards/margins": 22.302343368530273, "rewards/rejected": -24.05078125, "step": 9530 }, { "epoch": 2.405382409479091, "grad_norm": 1.7722201347351074, "learning_rate": 5.747645772222767e-08, "logits/chosen": -0.681994616985321, "logits/rejected": -0.5683959722518921, "logps/chosen": -158.33438110351562, "logps/rejected": -506.3500061035156, "loss": 0.0143, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.5272308588027954, "rewards/margins": 23.16796875, "rewards/rejected": -24.693750381469727, "step": 9540 }, { "epoch": 2.4079034443639116, "grad_norm": 153.9136199951172, "learning_rate": 5.700938519724016e-08, "logits/chosen": -0.639782726764679, "logits/rejected": -0.670745849609375, "logps/chosen": -163.328125, "logps/rejected": -514.4124755859375, "loss": 0.0593, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.798150658607483, "rewards/margins": 22.332813262939453, "rewards/rejected": -24.133594512939453, "step": 9550 }, { "epoch": 2.4104244792487317, "grad_norm": 0.9085888266563416, "learning_rate": 5.6543973860685796e-08, "logits/chosen": -0.511090099811554, "logits/rejected": NaN, "logps/chosen": -188.203125, "logps/rejected": -488.20001220703125, "loss": 0.0206, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9146347045898438, "rewards/margins": 22.337499618530273, "rewards/rejected": -24.245311737060547, "step": 9560 }, { "epoch": 2.412945514133552, "grad_norm": 0.010079721920192242, "learning_rate": 5.608022771864515e-08, "logits/chosen": -0.609161376953125, "logits/rejected": NaN, "logps/chosen": -180.04061889648438, "logps/rejected": -496.25, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.242297410964966, "rewards/margins": 22.598438262939453, "rewards/rejected": -24.832813262939453, "step": 9570 }, { "epoch": 2.415466549018372, "grad_norm": 0.012342572212219238, "learning_rate": 5.56181507628653e-08, "logits/chosen": -0.6390182375907898, "logits/rejected": NaN, "logps/chosen": -186.64999389648438, "logps/rejected": -520.2125244140625, "loss": 0.0045, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.2534728050231934, "rewards/margins": 22.458593368530273, "rewards/rejected": -24.704687118530273, "step": 9580 }, { "epoch": 2.417987583903192, "grad_norm": 0.24862723052501678, "learning_rate": 5.5157746970725614e-08, "logits/chosen": -0.5091552734375, "logits/rejected": NaN, "logps/chosen": -184.75, "logps/rejected": -487.25, "loss": 0.0083, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.8585937023162842, "rewards/margins": 22.98828125, "rewards/rejected": -24.849218368530273, "step": 9590 }, { "epoch": 2.4205086187880127, "grad_norm": 0.07158707827329636, "learning_rate": 5.469902030520346e-08, "logits/chosen": -0.6253722906112671, "logits/rejected": NaN, "logps/chosen": -185.203125, "logps/rejected": -488.2250061035156, "loss": 0.0258, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.274972438812256, "rewards/margins": 22.887500762939453, "rewards/rejected": -25.165624618530273, "step": 9600 }, { "epoch": 2.423029653672833, "grad_norm": 2.6753950119018555, "learning_rate": 5.424197471484041e-08, "logits/chosen": -0.6289215087890625, "logits/rejected": -0.6407989263534546, "logps/chosen": -192.03750610351562, "logps/rejected": -526.1500244140625, "loss": 0.0118, "rewards/accuracies": 0.984375, "rewards/chosen": -1.907690405845642, "rewards/margins": 22.275781631469727, "rewards/rejected": -24.193750381469727, "step": 9610 }, { "epoch": 2.425550688557653, "grad_norm": 0.002962864935398102, "learning_rate": 5.378661413370761e-08, "logits/chosen": -0.548736572265625, "logits/rejected": -0.6564880609512329, "logps/chosen": -175.1374969482422, "logps/rejected": -495.48748779296875, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.448071241378784, "rewards/margins": 22.405467987060547, "rewards/rejected": -24.854686737060547, "step": 9620 }, { "epoch": 2.428071723442473, "grad_norm": 1.0465989112854004, "learning_rate": 5.333294248137268e-08, "logits/chosen": -0.656109631061554, "logits/rejected": -0.755969226360321, "logps/chosen": -193.25936889648438, "logps/rejected": -527.8499755859375, "loss": 0.0213, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.5808959007263184, "rewards/margins": 22.290624618530273, "rewards/rejected": -24.865625381469727, "step": 9630 }, { "epoch": 2.4305927583272933, "grad_norm": 0.3041989207267761, "learning_rate": 5.288096366286526e-08, "logits/chosen": -0.614501953125, "logits/rejected": -0.654571533203125, "logps/chosen": -183.4656219482422, "logps/rejected": -498.07501220703125, "loss": 0.0111, "rewards/accuracies": 0.984375, "rewards/chosen": -1.937475562095642, "rewards/margins": 22.541406631469727, "rewards/rejected": -24.482812881469727, "step": 9640 }, { "epoch": 2.4331137932121134, "grad_norm": 0.0156667772680521, "learning_rate": 5.243068156864405e-08, "logits/chosen": -0.578326404094696, "logits/rejected": NaN, "logps/chosen": -181.609375, "logps/rejected": -526.7249755859375, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.984136939048767, "rewards/margins": 22.637500762939453, "rewards/rejected": -24.624217987060547, "step": 9650 }, { "epoch": 2.4356348280969335, "grad_norm": 0.08630943298339844, "learning_rate": 5.1982100074562776e-08, "logits/chosen": -0.532177746295929, "logits/rejected": -0.58807373046875, "logps/chosen": -165.53125, "logps/rejected": -499.07501220703125, "loss": 0.0109, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9498504400253296, "rewards/margins": 22.81640625, "rewards/rejected": -24.776561737060547, "step": 9660 }, { "epoch": 2.438155862981754, "grad_norm": 0.02869526483118534, "learning_rate": 5.153522304183702e-08, "logits/chosen": -0.5354095697402954, "logits/rejected": -0.60858154296875, "logps/chosen": -178.5187530517578, "logps/rejected": -511.2250061035156, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.455310106277466, "rewards/margins": 22.579687118530273, "rewards/rejected": -25.034374237060547, "step": 9670 }, { "epoch": 2.4406768978665743, "grad_norm": 0.02604285627603531, "learning_rate": 5.10900543170113e-08, "logits/chosen": -0.585467517375946, "logits/rejected": NaN, "logps/chosen": -191.1328125, "logps/rejected": -511.5249938964844, "loss": 0.014, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8704131841659546, "rewards/margins": 23.584375381469727, "rewards/rejected": -25.442968368530273, "step": 9680 }, { "epoch": 2.4431979327513944, "grad_norm": 0.3385732173919678, "learning_rate": 5.064659773192542e-08, "logits/chosen": -0.5982605218887329, "logits/rejected": NaN, "logps/chosen": -176.13436889648438, "logps/rejected": -516.25, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.0861175060272217, "rewards/margins": 23.65625, "rewards/rejected": -25.740625381469727, "step": 9690 }, { "epoch": 2.4457189676362145, "grad_norm": 2.4273455142974854, "learning_rate": 5.020485710368177e-08, "logits/chosen": -0.5940307378768921, "logits/rejected": NaN, "logps/chosen": -189.5, "logps/rejected": -506.5249938964844, "loss": 0.0144, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2389283180236816, "rewards/margins": 22.904687881469727, "rewards/rejected": -25.139062881469727, "step": 9700 }, { "epoch": 2.448240002521035, "grad_norm": 0.012730150483548641, "learning_rate": 4.9764836234612665e-08, "logits/chosen": -0.68890380859375, "logits/rejected": NaN, "logps/chosen": -184.41250610351562, "logps/rejected": -510.75, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6711204051971436, "rewards/margins": 23.350780487060547, "rewards/rejected": -25.028125762939453, "step": 9710 }, { "epoch": 2.4507610374058553, "grad_norm": 0.009715922176837921, "learning_rate": 4.932653891224719e-08, "logits/chosen": -0.4089643359184265, "logits/rejected": -0.6844848394393921, "logps/chosen": -168.03280639648438, "logps/rejected": -496.0874938964844, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.335552930831909, "rewards/margins": 21.842967987060547, "rewards/rejected": -24.180469512939453, "step": 9720 }, { "epoch": 2.4532820722906754, "grad_norm": 0.01195364911109209, "learning_rate": 4.8889968909278824e-08, "logits/chosen": -0.6956329345703125, "logits/rejected": NaN, "logps/chosen": -200.3359375, "logps/rejected": -516.7625122070312, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4527039527893066, "rewards/margins": 22.7734375, "rewards/rejected": -25.228124618530273, "step": 9730 }, { "epoch": 2.4558031071754955, "grad_norm": 0.7000452876091003, "learning_rate": 4.845512998353296e-08, "logits/chosen": -0.7273666262626648, "logits/rejected": NaN, "logps/chosen": -191.6218719482422, "logps/rejected": -517.2750244140625, "loss": 0.0255, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.8313477039337158, "rewards/margins": 22.817968368530273, "rewards/rejected": -24.657032012939453, "step": 9740 }, { "epoch": 2.4583241420603157, "grad_norm": 0.0005341186188161373, "learning_rate": 4.802202587793469e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -169.8562469482422, "logps/rejected": -524.875, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.684057593345642, "rewards/margins": 23.332813262939453, "rewards/rejected": -25.026561737060547, "step": 9750 }, { "epoch": 2.460845176945136, "grad_norm": 0.01546004880219698, "learning_rate": 4.7590660320476236e-08, "logits/chosen": -0.6833130121231079, "logits/rejected": NaN, "logps/chosen": -204.8312530517578, "logps/rejected": -495.38751220703125, "loss": 0.0113, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.0525755882263184, "rewards/margins": 21.974218368530273, "rewards/rejected": -24.017969131469727, "step": 9760 }, { "epoch": 2.463366211829956, "grad_norm": 39.99349594116211, "learning_rate": 4.716103702418528e-08, "logits/chosen": -0.6118835210800171, "logits/rejected": NaN, "logps/chosen": -172.22500610351562, "logps/rejected": -498.92498779296875, "loss": 0.0032, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.195080518722534, "rewards/margins": 23.303125381469727, "rewards/rejected": -25.5078125, "step": 9770 }, { "epoch": 2.4658872467147765, "grad_norm": 0.18656328320503235, "learning_rate": 4.673315968709257e-08, "logits/chosen": -0.6329025030136108, "logits/rejected": NaN, "logps/chosen": -181.02499389648438, "logps/rejected": -506.76251220703125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.3791260719299316, "rewards/margins": 23.688282012939453, "rewards/rejected": -26.064844131469727, "step": 9780 }, { "epoch": 2.4684082815995967, "grad_norm": 0.03424971550703049, "learning_rate": 4.630703199220054e-08, "logits/chosen": -0.660449206829071, "logits/rejected": -0.7666991949081421, "logps/chosen": -193.49063110351562, "logps/rejected": -506.2250061035156, "loss": 0.0196, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1398072242736816, "rewards/margins": 22.82421875, "rewards/rejected": -24.971094131469727, "step": 9790 }, { "epoch": 2.470929316484417, "grad_norm": 0.22832272946834564, "learning_rate": 4.588265760745125e-08, "logits/chosen": -0.4319000244140625, "logits/rejected": NaN, "logps/chosen": -159.78125, "logps/rejected": -482.04998779296875, "loss": 0.0069, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.249847412109375, "rewards/margins": 22.839061737060547, "rewards/rejected": -25.084375381469727, "step": 9800 }, { "epoch": 2.473450351369237, "grad_norm": 0.17879614233970642, "learning_rate": 4.546004018569488e-08, "logits/chosen": -0.7122802734375, "logits/rejected": -0.7381523251533508, "logps/chosen": -189.4031219482422, "logps/rejected": -526.3375244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.117236375808716, "rewards/margins": 23.619531631469727, "rewards/rejected": -25.744531631469727, "step": 9810 }, { "epoch": 2.475971386254057, "grad_norm": 0.006770871579647064, "learning_rate": 4.503918336465859e-08, "logits/chosen": -0.582226574420929, "logits/rejected": NaN, "logps/chosen": -179.2937469482422, "logps/rejected": -512.2000122070312, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.12890625, "rewards/margins": 23.11328125, "rewards/rejected": -25.246875762939453, "step": 9820 }, { "epoch": 2.4784924211388777, "grad_norm": 112.03388214111328, "learning_rate": 4.462009076691472e-08, "logits/chosen": -0.6258697509765625, "logits/rejected": NaN, "logps/chosen": -183.75, "logps/rejected": -509.0874938964844, "loss": 0.0193, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.172802686691284, "rewards/margins": 23.353906631469727, "rewards/rejected": -25.542186737060547, "step": 9830 }, { "epoch": 2.481013456023698, "grad_norm": 0.022206999361515045, "learning_rate": 4.420276599984993e-08, "logits/chosen": -0.596478283405304, "logits/rejected": -0.5864898562431335, "logps/chosen": -192.4093780517578, "logps/rejected": -537.5999755859375, "loss": 0.0229, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.042315721511841, "rewards/margins": 24.086719512939453, "rewards/rejected": -26.134374618530273, "step": 9840 }, { "epoch": 2.483534490908518, "grad_norm": 4.006641864776611, "learning_rate": 4.3787212655634234e-08, "logits/chosen": -0.6448173522949219, "logits/rejected": NaN, "logps/chosen": -187.6218719482422, "logps/rejected": -513.9749755859375, "loss": 0.0068, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.444897413253784, "rewards/margins": 23.366405487060547, "rewards/rejected": -25.814062118530273, "step": 9850 }, { "epoch": 2.486055525793338, "grad_norm": 0.0026610679924488068, "learning_rate": 4.337343431118973e-08, "logits/chosen": -0.5865371823310852, "logits/rejected": -0.6873108148574829, "logps/chosen": -182.5500030517578, "logps/rejected": -516.1375122070312, "loss": 0.0039, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.067797899246216, "rewards/margins": 23.1796875, "rewards/rejected": -25.235157012939453, "step": 9860 }, { "epoch": 2.4885765606781582, "grad_norm": 7.922913551330566, "learning_rate": 4.296143452816009e-08, "logits/chosen": -0.64544677734375, "logits/rejected": NaN, "logps/chosen": -205.0906219482422, "logps/rejected": -541.8250122070312, "loss": 0.0066, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0446319580078125, "rewards/margins": 23.534374237060547, "rewards/rejected": -25.5703125, "step": 9870 }, { "epoch": 2.4910975955629784, "grad_norm": 0.0018488122150301933, "learning_rate": 4.255121685287974e-08, "logits/chosen": -0.6066528558731079, "logits/rejected": NaN, "logps/chosen": -194.63125610351562, "logps/rejected": -519.7750244140625, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.662145972251892, "rewards/margins": 23.559375762939453, "rewards/rejected": -25.225000381469727, "step": 9880 }, { "epoch": 2.493618630447799, "grad_norm": 1.909725546836853, "learning_rate": 4.214278481634362e-08, "logits/chosen": -0.5079590082168579, "logits/rejected": -0.744952380657196, "logps/chosen": -177.10000610351562, "logps/rejected": -521.7625122070312, "loss": 0.003, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.307830810546875, "rewards/margins": 22.732030868530273, "rewards/rejected": -25.04296875, "step": 9890 }, { "epoch": 2.496139665332619, "grad_norm": 0.03362171724438667, "learning_rate": 4.173614193417629e-08, "logits/chosen": -0.517498791217804, "logits/rejected": NaN, "logps/chosen": -183.3125, "logps/rejected": -510.38751220703125, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4841856956481934, "rewards/margins": 23.122655868530273, "rewards/rejected": -25.600000381469727, "step": 9900 }, { "epoch": 2.4986607002174392, "grad_norm": 21.49625587463379, "learning_rate": 4.133129170660227e-08, "logits/chosen": -0.6445342898368835, "logits/rejected": -0.5988219976425171, "logps/chosen": -193.28125, "logps/rejected": -535.7125244140625, "loss": 0.0112, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.391247510910034, "rewards/margins": 23.508594512939453, "rewards/rejected": -25.905467987060547, "step": 9910 }, { "epoch": 2.5011817351022594, "grad_norm": 0.2957840859889984, "learning_rate": 4.0928237618415294e-08, "logits/chosen": -0.6313842535018921, "logits/rejected": NaN, "logps/chosen": -186.38436889648438, "logps/rejected": -506.32501220703125, "loss": 0.0121, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.093493700027466, "rewards/margins": 23.227344512939453, "rewards/rejected": -25.32421875, "step": 9920 }, { "epoch": 2.50370276998708, "grad_norm": 1.4838266372680664, "learning_rate": 4.052698313894892e-08, "logits/chosen": -0.6427764892578125, "logits/rejected": NaN, "logps/chosen": -203.41561889648438, "logps/rejected": -513.9249877929688, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.640698194503784, "rewards/margins": 22.139842987060547, "rewards/rejected": -24.784374237060547, "step": 9930 }, { "epoch": 2.5062238048719, "grad_norm": 0.3431829810142517, "learning_rate": 4.0127531722046195e-08, "logits/chosen": -0.6220901608467102, "logits/rejected": NaN, "logps/chosen": -191.484375, "logps/rejected": -522.75, "loss": 0.023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.417315721511841, "rewards/margins": 23.264062881469727, "rewards/rejected": -25.6796875, "step": 9940 }, { "epoch": 2.5087448397567202, "grad_norm": 0.0007416014559566975, "learning_rate": 3.972988680603001e-08, "logits/chosen": -0.6319580078125, "logits/rejected": -0.6706482172012329, "logps/chosen": -183.3125, "logps/rejected": -510.04998779296875, "loss": 0.0105, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.0761351585388184, "rewards/margins": 22.235937118530273, "rewards/rejected": -24.313282012939453, "step": 9950 }, { "epoch": 2.5112658746415404, "grad_norm": 0.7528015375137329, "learning_rate": 3.933405181367391e-08, "logits/chosen": -0.5141540765762329, "logits/rejected": NaN, "logps/chosen": -172.958984375, "logps/rejected": -540.5250244140625, "loss": 0.0315, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.193866014480591, "rewards/margins": 22.709375381469727, "rewards/rejected": -24.908594131469727, "step": 9960 }, { "epoch": 2.5137869095263605, "grad_norm": 1.4657273292541504, "learning_rate": 3.894003015217206e-08, "logits/chosen": -0.7128387689590454, "logits/rejected": NaN, "logps/chosen": -190.1687469482422, "logps/rejected": -514.2249755859375, "loss": 0.006, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5181884765625, "rewards/margins": 22.8984375, "rewards/rejected": -24.420312881469727, "step": 9970 }, { "epoch": 2.5163079444111807, "grad_norm": 0.03356481343507767, "learning_rate": 3.854782521311018e-08, "logits/chosen": -0.6091552972793579, "logits/rejected": -0.722149670124054, "logps/chosen": -192.89688110351562, "logps/rejected": -512.5750122070312, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9321320056915283, "rewards/margins": 23.11328125, "rewards/rejected": -25.041406631469727, "step": 9980 }, { "epoch": 2.518828979296001, "grad_norm": 0.015421370975673199, "learning_rate": 3.815744037243651e-08, "logits/chosen": -0.7298644781112671, "logits/rejected": -0.6954376101493835, "logps/chosen": -168.171875, "logps/rejected": -507.98748779296875, "loss": 0.0083, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.601934790611267, "rewards/margins": 22.3203125, "rewards/rejected": -23.913280487060547, "step": 9990 }, { "epoch": 2.5213500141808214, "grad_norm": 0.06496239453554153, "learning_rate": 3.776887899043246e-08, "logits/chosen": -0.628857433795929, "logits/rejected": -0.66693115234375, "logps/chosen": -197.33749389648438, "logps/rejected": -538.9749755859375, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.075244188308716, "rewards/margins": 23.21875, "rewards/rejected": -25.294530868530273, "step": 10000 }, { "epoch": 2.5238710490656415, "grad_norm": 0.0616004504263401, "learning_rate": 3.7382144411683857e-08, "logits/chosen": -0.68682861328125, "logits/rejected": NaN, "logps/chosen": -185.04061889648438, "logps/rejected": -481.8999938964844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.758087158203125, "rewards/margins": 22.700780868530273, "rewards/rejected": -24.462499618530273, "step": 10010 }, { "epoch": 2.5263920839504617, "grad_norm": 1.089551568031311, "learning_rate": 3.699723996505205e-08, "logits/chosen": -0.5983139276504517, "logits/rejected": NaN, "logps/chosen": -175.08438110351562, "logps/rejected": -503.375, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.4794727563858032, "rewards/margins": 22.44140625, "rewards/rejected": -23.921875, "step": 10020 }, { "epoch": 2.528913118835282, "grad_norm": 0.6410160064697266, "learning_rate": 3.661416896364547e-08, "logits/chosen": -0.716839611530304, "logits/rejected": NaN, "logps/chosen": -186.44375610351562, "logps/rejected": -523.125, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.349700927734375, "rewards/margins": 22.742969512939453, "rewards/rejected": -24.100780487060547, "step": 10030 }, { "epoch": 2.531434153720102, "grad_norm": 0.006452443543821573, "learning_rate": 3.623293470479075e-08, "logits/chosen": -0.5586684942245483, "logits/rejected": NaN, "logps/chosen": -170.71249389648438, "logps/rejected": -490.01251220703125, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.7635986804962158, "rewards/margins": 22.775781631469727, "rewards/rejected": -24.537500381469727, "step": 10040 }, { "epoch": 2.5339551886049225, "grad_norm": 2.511284351348877, "learning_rate": 3.58535404700048e-08, "logits/chosen": -0.6672729253768921, "logits/rejected": NaN, "logps/chosen": -186.3515625, "logps/rejected": -499.7124938964844, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.5800796747207642, "rewards/margins": 22.986719131469727, "rewards/rejected": -24.560937881469727, "step": 10050 }, { "epoch": 2.5364762234897427, "grad_norm": 0.005085676908493042, "learning_rate": 3.5475989524966085e-08, "logits/chosen": -0.6732513308525085, "logits/rejected": NaN, "logps/chosen": -174.3046875, "logps/rejected": -524.5750122070312, "loss": 0.0106, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.6556793451309204, "rewards/margins": 24.19921875, "rewards/rejected": -25.859375, "step": 10060 }, { "epoch": 2.538997258374563, "grad_norm": 1.4087114334106445, "learning_rate": 3.5100285119486926e-08, "logits/chosen": -0.622161865234375, "logits/rejected": NaN, "logps/chosen": -187.0343780517578, "logps/rejected": -515.7249755859375, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.70648193359375, "rewards/margins": 23.232030868530273, "rewards/rejected": -24.944530487060547, "step": 10070 }, { "epoch": 2.541518293259383, "grad_norm": 1.3052477836608887, "learning_rate": 3.472643048748525e-08, "logits/chosen": -0.7644287347793579, "logits/rejected": NaN, "logps/chosen": -216.7687530517578, "logps/rejected": -538.7249755859375, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.092425584793091, "rewards/margins": 23.200000762939453, "rewards/rejected": -25.289844512939453, "step": 10080 }, { "epoch": 2.544039328144203, "grad_norm": 0.03035840392112732, "learning_rate": 3.43544288469568e-08, "logits/chosen": -0.6636962890625, "logits/rejected": NaN, "logps/chosen": -210.8468780517578, "logps/rejected": -498.6000061035156, "loss": 0.0115, "rewards/accuracies": 0.984375, "rewards/chosen": -2.1844940185546875, "rewards/margins": 22.778905868530273, "rewards/rejected": -24.957813262939453, "step": 10090 }, { "epoch": 2.546560363029023, "grad_norm": 0.02495010569691658, "learning_rate": 3.398428339994763e-08, "logits/chosen": -0.6489959955215454, "logits/rejected": -0.7570556402206421, "logps/chosen": -183.90625, "logps/rejected": -522.0999755859375, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0023865699768066, "rewards/margins": 22.930469512939453, "rewards/rejected": -24.939844131469727, "step": 10100 }, { "epoch": 2.5490813979138434, "grad_norm": 0.00019575905753299594, "learning_rate": 3.3615997332526345e-08, "logits/chosen": -0.728558361530304, "logits/rejected": NaN, "logps/chosen": -195.0343780517578, "logps/rejected": -554.9000244140625, "loss": 0.0074, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.806249976158142, "rewards/margins": 23.407032012939453, "rewards/rejected": -25.213281631469727, "step": 10110 }, { "epoch": 2.551602432798664, "grad_norm": 11.081186294555664, "learning_rate": 3.32495738147566e-08, "logits/chosen": -0.613446056842804, "logits/rejected": NaN, "logps/chosen": -189.7531280517578, "logps/rejected": -528.9249877929688, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0674500465393066, "rewards/margins": 22.98046875, "rewards/rejected": -25.051563262939453, "step": 10120 }, { "epoch": 2.554123467683484, "grad_norm": 6.472830772399902, "learning_rate": 3.288501600067017e-08, "logits/chosen": -0.5703796148300171, "logits/rejected": -0.6316894292831421, "logps/chosen": -179.203125, "logps/rejected": -506.9125061035156, "loss": 0.0221, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8700683116912842, "rewards/margins": 23.31640625, "rewards/rejected": -25.19921875, "step": 10130 }, { "epoch": 2.5566445025683042, "grad_norm": 0.0006528699304908514, "learning_rate": 3.2522327028239456e-08, "logits/chosen": -0.6433624029159546, "logits/rejected": -0.7221313714981079, "logps/chosen": -200.1906280517578, "logps/rejected": -513.375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.493365526199341, "rewards/margins": 23.056249618530273, "rewards/rejected": -25.551563262939453, "step": 10140 }, { "epoch": 2.5591655374531244, "grad_norm": 0.0666164830327034, "learning_rate": 3.2161510019350524e-08, "logits/chosen": -0.7743469476699829, "logits/rejected": -0.793475329875946, "logps/chosen": -189.43124389648438, "logps/rejected": -516.5, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.864172339439392, "rewards/margins": 23.383594512939453, "rewards/rejected": -25.245311737060547, "step": 10150 }, { "epoch": 2.561686572337945, "grad_norm": 0.07946520298719406, "learning_rate": 3.180256807977638e-08, "logits/chosen": -0.6202850341796875, "logits/rejected": -0.6843841671943665, "logps/chosen": -184.17031860351562, "logps/rejected": -486.92498779296875, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8031494617462158, "rewards/margins": 22.995311737060547, "rewards/rejected": -24.8046875, "step": 10160 }, { "epoch": 2.564207607222765, "grad_norm": 0.3877386152744293, "learning_rate": 3.144550429915027e-08, "logits/chosen": -0.6302825808525085, "logits/rejected": NaN, "logps/chosen": -194.9718780517578, "logps/rejected": -531.4375, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0993895530700684, "rewards/margins": 23.349218368530273, "rewards/rejected": -25.457813262939453, "step": 10170 }, { "epoch": 2.5667286421075852, "grad_norm": 98.48660278320312, "learning_rate": 3.10903217509387e-08, "logits/chosen": -0.6485489010810852, "logits/rejected": -0.7340347170829773, "logps/chosen": -202.94686889648438, "logps/rejected": -511.20001220703125, "loss": 0.0255, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.364819288253784, "rewards/margins": 22.354686737060547, "rewards/rejected": -24.720312118530273, "step": 10180 }, { "epoch": 2.5692496769924054, "grad_norm": 0.4552406370639801, "learning_rate": 3.0737023492415606e-08, "logits/chosen": -0.6245056390762329, "logits/rejected": NaN, "logps/chosen": -170.7624969482422, "logps/rejected": -511.0249938964844, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9499695301055908, "rewards/margins": 22.661718368530273, "rewards/rejected": -24.622655868530273, "step": 10190 }, { "epoch": 2.5717707118772255, "grad_norm": 0.3325541913509369, "learning_rate": 3.0385612564635346e-08, "logits/chosen": -0.590710461139679, "logits/rejected": NaN, "logps/chosen": -175.1484375, "logps/rejected": -518.375, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.5335845947265625, "rewards/margins": 23.87109375, "rewards/rejected": -25.407032012939453, "step": 10200 }, { "epoch": 2.5742917467620456, "grad_norm": 24.480588912963867, "learning_rate": 3.003609199240711e-08, "logits/chosen": -0.576403796672821, "logits/rejected": NaN, "logps/chosen": -181.04061889648438, "logps/rejected": -513.2000122070312, "loss": 0.0059, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.6594269275665283, "rewards/margins": 23.956249237060547, "rewards/rejected": -25.618749618530273, "step": 10210 }, { "epoch": 2.576812781646866, "grad_norm": 0.17172731459140778, "learning_rate": 2.9688464784268563e-08, "logits/chosen": -0.6367126703262329, "logits/rejected": NaN, "logps/chosen": -195.9375, "logps/rejected": -528.2000122070312, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.324023485183716, "rewards/margins": 23.111719131469727, "rewards/rejected": -25.4296875, "step": 10220 }, { "epoch": 2.5793338165316864, "grad_norm": 0.4188025891780853, "learning_rate": 2.9342733932459923e-08, "logits/chosen": -0.597393810749054, "logits/rejected": -0.701263427734375, "logps/chosen": -177.2468719482422, "logps/rejected": -513.6500244140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -2.029296875, "rewards/margins": 23.227344512939453, "rewards/rejected": -25.246875762939453, "step": 10230 }, { "epoch": 2.5818548514165065, "grad_norm": 0.2516571581363678, "learning_rate": 2.8998902412898514e-08, "logits/chosen": -0.5778564214706421, "logits/rejected": NaN, "logps/chosen": -177.5749969482422, "logps/rejected": -518.4500122070312, "loss": 0.0242, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.68853759765625, "rewards/margins": 23.895313262939453, "rewards/rejected": -25.572656631469727, "step": 10240 }, { "epoch": 2.5843758863013266, "grad_norm": 0.01217047218233347, "learning_rate": 2.8656973185152754e-08, "logits/chosen": -0.670178234577179, "logits/rejected": NaN, "logps/chosen": -185.3249969482422, "logps/rejected": -498.70001220703125, "loss": 0.0224, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.175280809402466, "rewards/margins": 22.78125, "rewards/rejected": -24.962499618530273, "step": 10250 }, { "epoch": 2.586896921186147, "grad_norm": 0.07357902079820633, "learning_rate": 2.831694919241695e-08, "logits/chosen": -0.5917602777481079, "logits/rejected": -0.6695877313613892, "logps/chosen": -179.8312530517578, "logps/rejected": -511.4750061035156, "loss": 0.0292, "rewards/accuracies": 0.984375, "rewards/chosen": -2.118335008621216, "rewards/margins": 23.149219512939453, "rewards/rejected": -25.278125762939453, "step": 10260 }, { "epoch": 2.5894179560709674, "grad_norm": 3.638575315475464, "learning_rate": 2.7978833361485933e-08, "logits/chosen": -0.602740466594696, "logits/rejected": -0.740478515625, "logps/chosen": -174.94375610351562, "logps/rejected": -504.7749938964844, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.910211205482483, "rewards/margins": 23.036718368530273, "rewards/rejected": -24.946094512939453, "step": 10270 }, { "epoch": 2.5919389909557875, "grad_norm": 0.01984625868499279, "learning_rate": 2.7642628602729758e-08, "logits/chosen": -0.53411865234375, "logits/rejected": NaN, "logps/chosen": -173.5343780517578, "logps/rejected": -524.6749877929688, "loss": 0.0168, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.674676537513733, "rewards/margins": 23.6484375, "rewards/rejected": -25.307811737060547, "step": 10280 }, { "epoch": 2.5944600258406076, "grad_norm": 0.026013599708676338, "learning_rate": 2.7308337810068665e-08, "logits/chosen": -0.6943694949150085, "logits/rejected": -0.7312988042831421, "logps/chosen": -192.27499389648438, "logps/rejected": -512.7625122070312, "loss": 0.0237, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8552734851837158, "rewards/margins": 23.182811737060547, "rewards/rejected": -25.033594131469727, "step": 10290 }, { "epoch": 2.596981060725428, "grad_norm": 0.006101132370531559, "learning_rate": 2.6975963860948247e-08, "logits/chosen": -0.5199218988418579, "logits/rejected": -0.6098693609237671, "logps/chosen": -202.29061889648438, "logps/rejected": -532.9500122070312, "loss": 0.0133, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.3497557640075684, "rewards/margins": 22.650781631469727, "rewards/rejected": -25.005859375, "step": 10300 }, { "epoch": 2.599502095610248, "grad_norm": 45.63267517089844, "learning_rate": 2.664550961631476e-08, "logits/chosen": -0.613568127155304, "logits/rejected": NaN, "logps/chosen": -182.50936889648438, "logps/rejected": -499.2250061035156, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.8498138189315796, "rewards/margins": 22.671092987060547, "rewards/rejected": -24.528125762939453, "step": 10310 }, { "epoch": 2.602023130495068, "grad_norm": 0.19368556141853333, "learning_rate": 2.6316977920590234e-08, "logits/chosen": -0.784008800983429, "logits/rejected": NaN, "logps/chosen": -196.7937469482422, "logps/rejected": -527.0250244140625, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.0732970237731934, "rewards/margins": 24.005468368530273, "rewards/rejected": -26.079687118530273, "step": 10320 }, { "epoch": 2.604544165379888, "grad_norm": 2.8183982372283936, "learning_rate": 2.599037160164827e-08, "logits/chosen": -0.4788970947265625, "logits/rejected": -0.681640625, "logps/chosen": -175.29061889648438, "logps/rejected": -521.0, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.883337378501892, "rewards/margins": 23.943750381469727, "rewards/rejected": -25.8203125, "step": 10330 }, { "epoch": 2.607065200264709, "grad_norm": 0.004563579801470041, "learning_rate": 2.5665693470789423e-08, "logits/chosen": -0.64593505859375, "logits/rejected": NaN, "logps/chosen": -185.875, "logps/rejected": -503.6625061035156, "loss": 0.0346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1563477516174316, "rewards/margins": 22.146875381469727, "rewards/rejected": -24.314062118530273, "step": 10340 }, { "epoch": 2.609586235149529, "grad_norm": 0.046407051384449005, "learning_rate": 2.534294632271733e-08, "logits/chosen": NaN, "logits/rejected": -0.661059558391571, "logps/chosen": -189.71249389648438, "logps/rejected": -509.38751220703125, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.964727759361267, "rewards/margins": 22.358592987060547, "rewards/rejected": -24.321874618530273, "step": 10350 }, { "epoch": 2.612107270034349, "grad_norm": 0.05637986212968826, "learning_rate": 2.5022132935514333e-08, "logits/chosen": -0.505816638469696, "logits/rejected": -0.7021819949150085, "logps/chosen": -169.74844360351562, "logps/rejected": -495.4750061035156, "loss": 0.0177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.4689452648162842, "rewards/margins": 22.791406631469727, "rewards/rejected": -24.262500762939453, "step": 10360 }, { "epoch": 2.614628304919169, "grad_norm": 0.2901539206504822, "learning_rate": 2.470325607061774e-08, "logits/chosen": -0.6204620599746704, "logits/rejected": -0.657275378704071, "logps/chosen": -200.8046875, "logps/rejected": -532.7000122070312, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.2115721702575684, "rewards/margins": 23.096094131469727, "rewards/rejected": -25.301563262939453, "step": 10370 }, { "epoch": 2.61714933980399, "grad_norm": 0.45390257239341736, "learning_rate": 2.4386318472796125e-08, "logits/chosen": -0.765795886516571, "logits/rejected": NaN, "logps/chosen": -171.33749389648438, "logps/rejected": -501.8374938964844, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.3085159063339233, "rewards/margins": 22.693750381469727, "rewards/rejected": -23.994531631469727, "step": 10380 }, { "epoch": 2.61967037468881, "grad_norm": 0.44859716296195984, "learning_rate": 2.4071322870125475e-08, "logits/chosen": -0.7022339105606079, "logits/rejected": -0.7218551635742188, "logps/chosen": -186.625, "logps/rejected": -510.0874938964844, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.580590844154358, "rewards/margins": 23.146875381469727, "rewards/rejected": -24.725780487060547, "step": 10390 }, { "epoch": 2.62219140957363, "grad_norm": 0.17178454995155334, "learning_rate": 2.3758271973965848e-08, "logits/chosen": -0.5673828125, "logits/rejected": NaN, "logps/chosen": -182.7859344482422, "logps/rejected": -502.01251220703125, "loss": 0.0185, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.7186546325683594, "rewards/margins": 22.262500762939453, "rewards/rejected": -23.982030868530273, "step": 10400 }, { "epoch": 2.62471244445845, "grad_norm": 0.07989232242107391, "learning_rate": 2.344716847893813e-08, "logits/chosen": -0.46070557832717896, "logits/rejected": -0.7214034795761108, "logps/chosen": -186.5187530517578, "logps/rejected": -514.4249877929688, "loss": 0.03, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.197924852371216, "rewards/margins": 21.90625, "rewards/rejected": -24.106250762939453, "step": 10410 }, { "epoch": 2.6272334793432703, "grad_norm": 0.010318616405129433, "learning_rate": 2.313801506290064e-08, "logits/chosen": -0.6169387698173523, "logits/rejected": NaN, "logps/chosen": -181.109375, "logps/rejected": -477.3500061035156, "loss": 0.0092, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.700402855873108, "rewards/margins": 22.71484375, "rewards/rejected": -24.413280487060547, "step": 10420 }, { "epoch": 2.6297545142280905, "grad_norm": 22.80099105834961, "learning_rate": 2.283081438692619e-08, "logits/chosen": -0.6281951665878296, "logits/rejected": -0.7438293695449829, "logps/chosen": -177.7843780517578, "logps/rejected": -498.42498779296875, "loss": 0.0143, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.656793236732483, "rewards/margins": 22.911718368530273, "rewards/rejected": -24.578907012939453, "step": 10430 }, { "epoch": 2.6322755491129106, "grad_norm": 0.36309000849723816, "learning_rate": 2.252556909527911e-08, "logits/chosen": -0.612841784954071, "logits/rejected": NaN, "logps/chosen": -181.63436889648438, "logps/rejected": -501.0249938964844, "loss": 0.0326, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.23797607421875, "rewards/margins": 22.634374618530273, "rewards/rejected": -23.871875762939453, "step": 10440 }, { "epoch": 2.634796583997731, "grad_norm": 4.133944988250732, "learning_rate": 2.222228181539268e-08, "logits/chosen": -0.6419593691825867, "logits/rejected": NaN, "logps/chosen": -187.6750030517578, "logps/rejected": -511.625, "loss": 0.0058, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.889129638671875, "rewards/margins": 22.7265625, "rewards/rejected": -24.626562118530273, "step": 10450 }, { "epoch": 2.6373176188825513, "grad_norm": 0.06691758334636688, "learning_rate": 2.1920955157846228e-08, "logits/chosen": -0.601489245891571, "logits/rejected": -0.6326858401298523, "logps/chosen": -176.5500030517578, "logps/rejected": -509.63751220703125, "loss": 0.0058, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.733618140220642, "rewards/margins": 23.592187881469727, "rewards/rejected": -25.322656631469727, "step": 10460 }, { "epoch": 2.6398386537673715, "grad_norm": 0.6210386157035828, "learning_rate": 2.1621591716342926e-08, "logits/chosen": -0.678271472454071, "logits/rejected": NaN, "logps/chosen": -180.6984405517578, "logps/rejected": -504.5375061035156, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.8482177257537842, "rewards/margins": 23.02734375, "rewards/rejected": -24.876562118530273, "step": 10470 }, { "epoch": 2.6423596886521916, "grad_norm": 0.401109516620636, "learning_rate": 2.1324194067687235e-08, "logits/chosen": -0.6202972531318665, "logits/rejected": NaN, "logps/chosen": -187.97811889648438, "logps/rejected": -522.9249877929688, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.7019164562225342, "rewards/margins": 23.796875, "rewards/rejected": -25.504688262939453, "step": 10480 }, { "epoch": 2.644880723537012, "grad_norm": 0.8880982995033264, "learning_rate": 2.1028764771762906e-08, "logits/chosen": -0.5582946538925171, "logits/rejected": NaN, "logps/chosen": -189.9890594482422, "logps/rejected": -511.8125, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.349951148033142, "rewards/margins": 23.608592987060547, "rewards/rejected": -24.959375381469727, "step": 10490 }, { "epoch": 2.6474017584218323, "grad_norm": 1.29482102394104, "learning_rate": 2.073530637151086e-08, "logits/chosen": -0.5321044921875, "logits/rejected": NaN, "logps/chosen": -185.7218780517578, "logps/rejected": -494.32501220703125, "loss": 0.0101, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7534668445587158, "rewards/margins": 22.91796875, "rewards/rejected": -24.660938262939453, "step": 10500 }, { "epoch": 2.6499227933066525, "grad_norm": 0.00416861055418849, "learning_rate": 2.0443821392907208e-08, "logits/chosen": -0.602325439453125, "logits/rejected": NaN, "logps/chosen": -171.515625, "logps/rejected": -516.3875122070312, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.547064185142517, "rewards/margins": 23.91796875, "rewards/rejected": -25.465625762939453, "step": 10510 }, { "epoch": 2.6524438281914726, "grad_norm": 0.06830200552940369, "learning_rate": 2.0154312344941833e-08, "logits/chosen": -0.627758800983429, "logits/rejected": NaN, "logps/chosen": -184.734375, "logps/rejected": -490.0625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.960412621498108, "rewards/margins": 23.336719512939453, "rewards/rejected": -25.294530868530273, "step": 10520 }, { "epoch": 2.6549648630762928, "grad_norm": 1.5767103433609009, "learning_rate": 1.9866781719596355e-08, "logits/chosen": -0.5752288699150085, "logits/rejected": NaN, "logps/chosen": -187.86563110351562, "logps/rejected": -516.3624877929688, "loss": 0.0071, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1022095680236816, "rewards/margins": 23.7578125, "rewards/rejected": -25.857030868530273, "step": 10530 }, { "epoch": 2.657485897961113, "grad_norm": 0.8631983995437622, "learning_rate": 1.9581231991823045e-08, "logits/chosen": -0.5971618890762329, "logits/rejected": -0.6310058832168579, "logps/chosen": -200.0656280517578, "logps/rejected": -521.6500244140625, "loss": 0.0284, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.938720703125, "rewards/margins": 22.603124618530273, "rewards/rejected": -25.542186737060547, "step": 10540 }, { "epoch": 2.660006932845933, "grad_norm": 0.02119021862745285, "learning_rate": 1.92976656195232e-08, "logits/chosen": -0.611401379108429, "logits/rejected": NaN, "logps/chosen": -176.7156219482422, "logps/rejected": -521.4249877929688, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6690247058868408, "rewards/margins": 23.467967987060547, "rewards/rejected": -25.133594512939453, "step": 10550 }, { "epoch": 2.6625279677307536, "grad_norm": 0.007240401115268469, "learning_rate": 1.9016085043526446e-08, "logits/chosen": -0.5849059820175171, "logits/rejected": NaN, "logps/chosen": -182.43594360351562, "logps/rejected": -519.75, "loss": 0.0089, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.262225389480591, "rewards/margins": 22.749217987060547, "rewards/rejected": -25.017187118530273, "step": 10560 }, { "epoch": 2.6650490026155738, "grad_norm": 0.6946568489074707, "learning_rate": 1.8736492687569163e-08, "logits/chosen": -0.6962035894393921, "logits/rejected": -0.8087707757949829, "logps/chosen": -187.92813110351562, "logps/rejected": -528.2750244140625, "loss": 0.0082, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.338824510574341, "rewards/margins": 22.257030487060547, "rewards/rejected": -24.604686737060547, "step": 10570 }, { "epoch": 2.667570037500394, "grad_norm": 0.9413801431655884, "learning_rate": 1.8458890958273994e-08, "logits/chosen": -0.5606139898300171, "logits/rejected": NaN, "logps/chosen": -187.55624389648438, "logps/rejected": -520.5999755859375, "loss": 0.0038, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.014688014984131, "rewards/margins": 23.178125381469727, "rewards/rejected": -25.208593368530273, "step": 10580 }, { "epoch": 2.670091072385214, "grad_norm": 0.0013297642581164837, "learning_rate": 1.818328224512916e-08, "logits/chosen": -0.5798584222793579, "logits/rejected": NaN, "logps/chosen": -182.4656219482422, "logps/rejected": -512.1500244140625, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.788507103919983, "rewards/margins": 24.084375381469727, "rewards/rejected": -25.8671875, "step": 10590 }, { "epoch": 2.6726121072700346, "grad_norm": 0.005578004755079746, "learning_rate": 1.790966892046758e-08, "logits/chosen": -0.5863800048828125, "logits/rejected": NaN, "logps/chosen": -187.1062469482422, "logps/rejected": -526.0, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9410400390625, "rewards/margins": 23.385936737060547, "rewards/rejected": -25.31640625, "step": 10600 }, { "epoch": 2.6751331421548548, "grad_norm": 0.7929794788360596, "learning_rate": 1.7638053339446818e-08, "logits/chosen": -0.7598907351493835, "logits/rejected": -0.744793713092804, "logps/chosen": -187.8234405517578, "logps/rejected": -513.0499877929688, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.131976366043091, "rewards/margins": 23.235157012939453, "rewards/rejected": -25.348438262939453, "step": 10610 }, { "epoch": 2.677654177039675, "grad_norm": 0.010788300074636936, "learning_rate": 1.736843784002848e-08, "logits/chosen": -0.5150955319404602, "logits/rejected": -0.612011730670929, "logps/chosen": -182.55624389648438, "logps/rejected": -514.125, "loss": 0.0226, "rewards/accuracies": 0.984375, "rewards/chosen": -2.036334276199341, "rewards/margins": 23.306249618530273, "rewards/rejected": -25.346094131469727, "step": 10620 }, { "epoch": 2.680175211924495, "grad_norm": 3.035761594772339, "learning_rate": 1.7100824742958375e-08, "logits/chosen": -0.6308616399765015, "logits/rejected": -0.604504406452179, "logps/chosen": -185.83749389648438, "logps/rejected": -549.3125, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9239623546600342, "rewards/margins": 24.215625762939453, "rewards/rejected": -26.1328125, "step": 10630 }, { "epoch": 2.682696246809315, "grad_norm": 0.034774474799633026, "learning_rate": 1.683521635174631e-08, "logits/chosen": -0.6006530523300171, "logits/rejected": NaN, "logps/chosen": -181.53750610351562, "logps/rejected": -515.7374877929688, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8397705554962158, "rewards/margins": 22.795312881469727, "rewards/rejected": -24.635936737060547, "step": 10640 }, { "epoch": 2.6852172816941353, "grad_norm": 0.06426440924406052, "learning_rate": 1.657161495264639e-08, "logits/chosen": -0.696978747844696, "logits/rejected": -0.721771240234375, "logps/chosen": -186.0539093017578, "logps/rejected": -505.9750061035156, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.0764403343200684, "rewards/margins": 23.698436737060547, "rewards/rejected": -25.780467987060547, "step": 10650 }, { "epoch": 2.6877383165789555, "grad_norm": 0.004620195832103491, "learning_rate": 1.6310022814637364e-08, "logits/chosen": -0.504168689250946, "logits/rejected": -0.631439208984375, "logps/chosen": -163.90625, "logps/rejected": -501.7124938964844, "loss": 0.003, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.7212616205215454, "rewards/margins": 22.719532012939453, "rewards/rejected": -24.444530487060547, "step": 10660 }, { "epoch": 2.690259351463776, "grad_norm": 0.019032970070838928, "learning_rate": 1.605044218940299e-08, "logits/chosen": -0.6350494623184204, "logits/rejected": NaN, "logps/chosen": -193.5437469482422, "logps/rejected": -521.0499877929688, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3607239723205566, "rewards/margins": 22.567188262939453, "rewards/rejected": -24.9375, "step": 10670 }, { "epoch": 2.692780386348596, "grad_norm": 0.00788954272866249, "learning_rate": 1.579287531131268e-08, "logits/chosen": -0.5842208862304688, "logits/rejected": NaN, "logps/chosen": -202.29061889648438, "logps/rejected": -537.6500244140625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.848516821861267, "rewards/margins": 23.32421875, "rewards/rejected": -25.176563262939453, "step": 10680 }, { "epoch": 2.6953014212334163, "grad_norm": 0.09523136168718338, "learning_rate": 1.553732439740227e-08, "logits/chosen": -0.6182586550712585, "logits/rejected": NaN, "logps/chosen": -202.0437469482422, "logps/rejected": -509.2749938964844, "loss": 0.0202, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.997997999191284, "rewards/margins": 22.7578125, "rewards/rejected": -25.753124237060547, "step": 10690 }, { "epoch": 2.6978224561182365, "grad_norm": 0.012195896357297897, "learning_rate": 1.5283791647355133e-08, "logits/chosen": -0.6173232793807983, "logits/rejected": -0.661376953125, "logps/chosen": -193.4656219482422, "logps/rejected": -518.875, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.114245653152466, "rewards/margins": 23.146093368530273, "rewards/rejected": -25.26171875, "step": 10700 }, { "epoch": 2.7003434910030566, "grad_norm": 0.047548070549964905, "learning_rate": 1.503227924348288e-08, "logits/chosen": -0.5211029052734375, "logits/rejected": NaN, "logps/chosen": -178.9031219482422, "logps/rejected": -500.82501220703125, "loss": 0.0068, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.065023899078369, "rewards/margins": 22.576562881469727, "rewards/rejected": -24.6484375, "step": 10710 }, { "epoch": 2.702864525887877, "grad_norm": 0.7713449597358704, "learning_rate": 1.4782789350706759e-08, "logits/chosen": -0.5887542963027954, "logits/rejected": -0.5023437738418579, "logps/chosen": -188.5500030517578, "logps/rejected": -508.29998779296875, "loss": 0.0114, "rewards/accuracies": 0.984375, "rewards/chosen": -2.497637987136841, "rewards/margins": 22.677343368530273, "rewards/rejected": -25.186717987060547, "step": 10720 }, { "epoch": 2.7053855607726973, "grad_norm": 0.020407138392329216, "learning_rate": 1.4535324116539238e-08, "logits/chosen": -0.46443480253219604, "logits/rejected": -0.538745105266571, "logps/chosen": -175.16561889648438, "logps/rejected": -498.20001220703125, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.871124267578125, "rewards/margins": 22.114063262939453, "rewards/rejected": -24.979686737060547, "step": 10730 }, { "epoch": 2.7079065956575175, "grad_norm": 1.113742709159851, "learning_rate": 1.4289885671065011e-08, "logits/chosen": -0.5189574956893921, "logits/rejected": NaN, "logps/chosen": -174.7062530517578, "logps/rejected": -497.54998779296875, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.438854932785034, "rewards/margins": 22.598438262939453, "rewards/rejected": -25.042186737060547, "step": 10740 }, { "epoch": 2.7104276305423376, "grad_norm": 0.00042262044735252857, "learning_rate": 1.404647612692328e-08, "logits/chosen": -0.5724731683731079, "logits/rejected": NaN, "logps/chosen": -185.36563110351562, "logps/rejected": -530.0250244140625, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.874359130859375, "rewards/margins": 23.954687118530273, "rewards/rejected": -25.8359375, "step": 10750 }, { "epoch": 2.7129486654271577, "grad_norm": 93.40037536621094, "learning_rate": 1.3805097579288938e-08, "logits/chosen": -0.524615466594696, "logits/rejected": -0.6293609738349915, "logps/chosen": -189.50625610351562, "logps/rejected": -527.125, "loss": 0.0624, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.3453125953674316, "rewards/margins": 23.051563262939453, "rewards/rejected": -25.403905868530273, "step": 10760 }, { "epoch": 2.715469700311978, "grad_norm": 0.32481613755226135, "learning_rate": 1.3565752105855088e-08, "logits/chosen": -0.42139893770217896, "logits/rejected": NaN, "logps/chosen": -170.28125, "logps/rejected": -498.82501220703125, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3935303688049316, "rewards/margins": 22.782032012939453, "rewards/rejected": -25.169530868530273, "step": 10770 }, { "epoch": 2.717990735196798, "grad_norm": 1.135568380355835, "learning_rate": 1.332844176681483e-08, "logits/chosen": -0.47706907987594604, "logits/rejected": -0.5418030023574829, "logps/chosen": -169.24374389648438, "logps/rejected": -482.92498779296875, "loss": 0.0095, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.244525194168091, "rewards/margins": 22.944530487060547, "rewards/rejected": -25.189062118530273, "step": 10780 }, { "epoch": 2.7205117700816186, "grad_norm": 1.3182650804519653, "learning_rate": 1.3093168604843524e-08, "logits/chosen": -0.5200134515762329, "logits/rejected": -0.663952648639679, "logps/chosen": -183.59375, "logps/rejected": -521.8250122070312, "loss": 0.0174, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.534008741378784, "rewards/margins": 23.211719512939453, "rewards/rejected": -25.737499237060547, "step": 10790 }, { "epoch": 2.7230328049664387, "grad_norm": 0.010590100660920143, "learning_rate": 1.2859934645081477e-08, "logits/chosen": -0.6326751708984375, "logits/rejected": -0.632061779499054, "logps/chosen": -205.2687530517578, "logps/rejected": -536.3125, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.0528197288513184, "rewards/margins": 22.879688262939453, "rewards/rejected": -25.932811737060547, "step": 10800 }, { "epoch": 2.725553839851259, "grad_norm": 0.10699805617332458, "learning_rate": 1.2628741895116174e-08, "logits/chosen": -0.6144164800643921, "logits/rejected": NaN, "logps/chosen": -182.234375, "logps/rejected": -512.7750244140625, "loss": 0.0139, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.8279602527618408, "rewards/margins": 23.333593368530273, "rewards/rejected": -25.1640625, "step": 10810 }, { "epoch": 2.728074874736079, "grad_norm": 36.319522857666016, "learning_rate": 1.2399592344965293e-08, "logits/chosen": -0.574572741985321, "logits/rejected": NaN, "logps/chosen": -197.71875, "logps/rejected": -502.79998779296875, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.5087523460388184, "rewards/margins": 23.19921875, "rewards/rejected": -25.7109375, "step": 10820 }, { "epoch": 2.7305959096208996, "grad_norm": 0.1626068502664566, "learning_rate": 1.2172487967059276e-08, "logits/chosen": -0.623809814453125, "logits/rejected": NaN, "logps/chosen": -195.45938110351562, "logps/rejected": -520.5999755859375, "loss": 0.0069, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3736541271209717, "rewards/margins": 23.227344512939453, "rewards/rejected": -25.591405868530273, "step": 10830 }, { "epoch": 2.7331169445057197, "grad_norm": 0.07587932050228119, "learning_rate": 1.1947430716224727e-08, "logits/chosen": -0.4790710508823395, "logits/rejected": -0.5732177495956421, "logps/chosen": -170.28750610351562, "logps/rejected": -501.25, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8634886741638184, "rewards/margins": 22.587499618530273, "rewards/rejected": -25.447656631469727, "step": 10840 }, { "epoch": 2.73563797939054, "grad_norm": 0.2931421399116516, "learning_rate": 1.1724422529667182e-08, "logits/chosen": -0.4326171875, "logits/rejected": NaN, "logps/chosen": -171.1999969482422, "logps/rejected": -507.95001220703125, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6337342262268066, "rewards/margins": 23.015625, "rewards/rejected": -25.650781631469727, "step": 10850 }, { "epoch": 2.73815901427536, "grad_norm": 0.030632754787802696, "learning_rate": 1.1503465326954703e-08, "logits/chosen": -0.6555694341659546, "logits/rejected": -0.605480968952179, "logps/chosen": -191.5968780517578, "logps/rejected": -513.6749877929688, "loss": 0.0125, "rewards/accuracies": 0.984375, "rewards/chosen": -2.3972535133361816, "rewards/margins": 23.90625, "rewards/rejected": -26.29296875, "step": 10860 }, { "epoch": 2.74068004916018, "grad_norm": 0.0046407137997448444, "learning_rate": 1.1284561010001304e-08, "logits/chosen": -0.6458709836006165, "logits/rejected": -0.7799926996231079, "logps/chosen": -208.265625, "logps/rejected": -540.5250244140625, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.718841552734375, "rewards/margins": 23.189844131469727, "rewards/rejected": -25.893749237060547, "step": 10870 }, { "epoch": 2.7432010840450003, "grad_norm": 0.33725881576538086, "learning_rate": 1.1067711463050495e-08, "logits/chosen": -0.6037689447402954, "logits/rejected": NaN, "logps/chosen": -197.0203094482422, "logps/rejected": -530.9249877929688, "loss": 0.0338, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.300891160964966, "rewards/margins": 23.357030868530273, "rewards/rejected": -25.6484375, "step": 10880 }, { "epoch": 2.7457221189298204, "grad_norm": 0.009941743686795235, "learning_rate": 1.0852918552659185e-08, "logits/chosen": -0.4281249940395355, "logits/rejected": NaN, "logps/chosen": -161.7624969482422, "logps/rejected": -494.67498779296875, "loss": 0.0087, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.823840379714966, "rewards/margins": 22.65625, "rewards/rejected": -25.481250762939453, "step": 10890 }, { "epoch": 2.748243153814641, "grad_norm": 0.014433473348617554, "learning_rate": 1.0640184127681472e-08, "logits/chosen": -0.5462585687637329, "logits/rejected": NaN, "logps/chosen": -187.9640655517578, "logps/rejected": -515.2874755859375, "loss": 0.0115, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.310803174972534, "rewards/margins": 22.665624618530273, "rewards/rejected": -24.974218368530273, "step": 10900 }, { "epoch": 2.750764188699461, "grad_norm": 0.0016704428708180785, "learning_rate": 1.0429510019252936e-08, "logits/chosen": -0.5711914300918579, "logits/rejected": NaN, "logps/chosen": -180.7375030517578, "logps/rejected": -510.1499938964844, "loss": 0.0187, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.658184766769409, "rewards/margins": 23.256250381469727, "rewards/rejected": -25.920312881469727, "step": 10910 }, { "epoch": 2.7532852235842813, "grad_norm": 0.001147833769209683, "learning_rate": 1.0220898040774611e-08, "logits/chosen": -0.6013320684432983, "logits/rejected": NaN, "logps/chosen": -213.89688110351562, "logps/rejected": -535.9249877929688, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.5356507301330566, "rewards/margins": 23.639062881469727, "rewards/rejected": -26.172657012939453, "step": 10920 }, { "epoch": 2.7558062584691014, "grad_norm": 0.014165423810482025, "learning_rate": 1.0014349987897575e-08, "logits/chosen": -0.5018249750137329, "logits/rejected": NaN, "logps/chosen": -203.08749389648438, "logps/rejected": -533.1749877929688, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6001923084259033, "rewards/margins": 22.744531631469727, "rewards/rejected": -25.353124618530273, "step": 10930 }, { "epoch": 2.758327293353922, "grad_norm": 0.15819789469242096, "learning_rate": 9.809867638507468e-09, "logits/chosen": -0.44491881132125854, "logits/rejected": -0.5956924557685852, "logps/chosen": -176.19375610351562, "logps/rejected": -499.4750061035156, "loss": 0.0131, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.155658006668091, "rewards/margins": 23.044530868530273, "rewards/rejected": -26.194530487060547, "step": 10940 }, { "epoch": 2.760848328238742, "grad_norm": 0.03380697965621948, "learning_rate": 9.607452752709105e-09, "logits/chosen": -0.678070068359375, "logits/rejected": -0.81622314453125, "logps/chosen": -194.4031219482422, "logps/rejected": -546.6500244140625, "loss": 0.0038, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.65533447265625, "rewards/margins": 23.390625, "rewards/rejected": -25.0625, "step": 10950 }, { "epoch": 2.7633693631235623, "grad_norm": 0.3556543290615082, "learning_rate": 9.407107072811393e-09, "logits/chosen": -0.6246582269668579, "logits/rejected": -0.597277820110321, "logps/chosen": -181.8125, "logps/rejected": -523.3250122070312, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4027771949768066, "rewards/margins": 23.178125381469727, "rewards/rejected": -25.575000762939453, "step": 10960 }, { "epoch": 2.7658903980083824, "grad_norm": 8.485246658325195, "learning_rate": 9.208832323312293e-09, "logits/chosen": -0.5010010004043579, "logits/rejected": NaN, "logps/chosen": -183.71249389648438, "logps/rejected": -503.1499938964844, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7240326404571533, "rewards/margins": 22.520313262939453, "rewards/rejected": -25.252344131469727, "step": 10970 }, { "epoch": 2.7684114328932026, "grad_norm": 0.009081006981432438, "learning_rate": 9.012630210884053e-09, "logits/chosen": -0.5685364007949829, "logits/rejected": NaN, "logps/chosen": -181.3937530517578, "logps/rejected": -490.45001220703125, "loss": 0.0133, "rewards/accuracies": 0.984375, "rewards/chosen": -2.0523743629455566, "rewards/margins": 22.934375762939453, "rewards/rejected": -24.984375, "step": 10980 }, { "epoch": 2.7709324677780227, "grad_norm": 0.01585337147116661, "learning_rate": 8.818502424358442e-09, "logits/chosen": -0.591326892375946, "logits/rejected": NaN, "logps/chosen": -188.1671905517578, "logps/rejected": -516.5125122070312, "loss": 0.0042, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.583569288253784, "rewards/margins": 23.040624618530273, "rewards/rejected": -25.6328125, "step": 10990 }, { "epoch": 2.773453502662843, "grad_norm": 0.20236138999462128, "learning_rate": 8.62645063471218e-09, "logits/chosen": -0.5772857666015625, "logits/rejected": NaN, "logps/chosen": -198.6125030517578, "logps/rejected": -517.9500122070312, "loss": 0.0071, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.803796410560608, "rewards/margins": 23.864063262939453, "rewards/rejected": -25.674219131469727, "step": 11000 }, { "epoch": 2.7759745375476634, "grad_norm": 0.001202670857310295, "learning_rate": 8.43647649505269e-09, "logits/chosen": -0.5064666867256165, "logits/rejected": -0.6378631591796875, "logps/chosen": -189.9375, "logps/rejected": -523.9249877929688, "loss": 0.0257, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.175830125808716, "rewards/margins": 23.760156631469727, "rewards/rejected": -25.939062118530273, "step": 11010 }, { "epoch": 2.7784955724324836, "grad_norm": 0.034793779253959656, "learning_rate": 8.248581640603741e-09, "logits/chosen": -0.588214099407196, "logits/rejected": -0.65203857421875, "logps/chosen": -188.1593780517578, "logps/rejected": -505.125, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.251800537109375, "rewards/margins": 22.231250762939453, "rewards/rejected": -24.478124618530273, "step": 11020 }, { "epoch": 2.7810166073173037, "grad_norm": 0.00020020821830257773, "learning_rate": 8.062767688691463e-09, "logits/chosen": -0.5116668939590454, "logits/rejected": NaN, "logps/chosen": -191.5226593017578, "logps/rejected": -520.7000122070312, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.29229736328125, "rewards/margins": 22.709375381469727, "rewards/rejected": -24.997655868530273, "step": 11030 }, { "epoch": 2.783537642202124, "grad_norm": 0.0002920420665759593, "learning_rate": 7.879036238730319e-09, "logits/chosen": -0.6101531982421875, "logits/rejected": NaN, "logps/chosen": -177.203125, "logps/rejected": -521.0875244140625, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.5814330577850342, "rewards/margins": 24.137500762939453, "rewards/rejected": -25.732030868530273, "step": 11040 }, { "epoch": 2.7860586770869444, "grad_norm": 0.003845860715955496, "learning_rate": 7.697388872209498e-09, "logits/chosen": -0.5588318109512329, "logits/rejected": NaN, "logps/chosen": -179.82186889648438, "logps/rejected": -508.32501220703125, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.199572801589966, "rewards/margins": 23.264842987060547, "rewards/rejected": -25.465625762939453, "step": 11050 }, { "epoch": 2.7885797119717646, "grad_norm": 0.03040510229766369, "learning_rate": 7.517827152679096e-09, "logits/chosen": -0.5302063226699829, "logits/rejected": NaN, "logps/chosen": -178.05624389648438, "logps/rejected": -512.5, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0514464378356934, "rewards/margins": 22.909374237060547, "rewards/rejected": -24.950780868530273, "step": 11060 }, { "epoch": 2.7911007468565847, "grad_norm": 10.3204984664917, "learning_rate": 7.34035262573679e-09, "logits/chosen": -0.5398223996162415, "logits/rejected": NaN, "logps/chosen": -176.234375, "logps/rejected": -527.4249877929688, "loss": 0.0116, "rewards/accuracies": 0.984375, "rewards/chosen": -2.17474365234375, "rewards/margins": 23.286718368530273, "rewards/rejected": -25.462499618530273, "step": 11070 }, { "epoch": 2.793621781741405, "grad_norm": 1.2039408683776855, "learning_rate": 7.164966819014628e-09, "logits/chosen": -0.5317276120185852, "logits/rejected": NaN, "logps/chosen": -195.44686889648438, "logps/rejected": -517.3250122070312, "loss": 0.0945, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1417298316955566, "rewards/margins": 22.28125, "rewards/rejected": -25.423437118530273, "step": 11080 }, { "epoch": 2.796142816626225, "grad_norm": 0.00830569677054882, "learning_rate": 6.991671242165625e-09, "logits/chosen": -0.52044677734375, "logits/rejected": NaN, "logps/chosen": -180.57968139648438, "logps/rejected": -513.2000122070312, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.1470580101013184, "rewards/margins": 22.737499237060547, "rewards/rejected": -24.892969131469727, "step": 11090 }, { "epoch": 2.798663851511045, "grad_norm": 0.26190564036369324, "learning_rate": 6.820467386850964e-09, "logits/chosen": -0.5323173403739929, "logits/rejected": -0.5982300043106079, "logps/chosen": -192.4562530517578, "logps/rejected": -527.2249755859375, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.925280809402466, "rewards/margins": 22.846094131469727, "rewards/rejected": -25.78125, "step": 11100 }, { "epoch": 2.8011848863958653, "grad_norm": 0.0023386459797620773, "learning_rate": 6.651356726727064e-09, "logits/chosen": -0.5098327398300171, "logits/rejected": -0.6766265630722046, "logps/chosen": -186.015625, "logps/rejected": -516.4249877929688, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6297028064727783, "rewards/margins": 23.060937881469727, "rewards/rejected": -25.682811737060547, "step": 11110 }, { "epoch": 2.803705921280686, "grad_norm": 0.004022575914859772, "learning_rate": 6.4843407174330065e-09, "logits/chosen": -0.569293200969696, "logits/rejected": NaN, "logps/chosen": -187.74374389648438, "logps/rejected": -508.8374938964844, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.253775119781494, "rewards/margins": 23.295312881469727, "rewards/rejected": -25.546092987060547, "step": 11120 }, { "epoch": 2.806226956165506, "grad_norm": 2.0627245903015137, "learning_rate": 6.319420796577879e-09, "logits/chosen": -0.5465148687362671, "logits/rejected": NaN, "logps/chosen": -188.6750030517578, "logps/rejected": -479.17498779296875, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.928436279296875, "rewards/margins": 22.287500381469727, "rewards/rejected": -25.215625762939453, "step": 11130 }, { "epoch": 2.808747991050326, "grad_norm": 0.17918051779270172, "learning_rate": 6.156598383728451e-09, "logits/chosen": -0.49794769287109375, "logits/rejected": -0.583361804485321, "logps/chosen": -199.1062469482422, "logps/rejected": -532.2125244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.6426634788513184, "rewards/margins": 23.307031631469727, "rewards/rejected": -25.953125, "step": 11140 }, { "epoch": 2.8112690259351463, "grad_norm": 2.466387987136841, "learning_rate": 5.995874880396962e-09, "logits/chosen": -0.40708619356155396, "logits/rejected": NaN, "logps/chosen": -185.72500610351562, "logps/rejected": -511.4375, "loss": 0.0217, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.9419007301330566, "rewards/margins": 22.653125762939453, "rewards/rejected": -25.59765625, "step": 11150 }, { "epoch": 2.813790060819967, "grad_norm": 0.011646196246147156, "learning_rate": 5.83725167002902e-09, "logits/chosen": -0.633074939250946, "logits/rejected": NaN, "logps/chosen": -190.19375610351562, "logps/rejected": -518.0, "loss": 0.0249, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9786133766174316, "rewards/margins": 22.913280487060547, "rewards/rejected": -25.895313262939453, "step": 11160 }, { "epoch": 2.816311095704787, "grad_norm": 0.006578652188181877, "learning_rate": 5.680730117991833e-09, "logits/chosen": -0.556439220905304, "logits/rejected": NaN, "logps/chosen": -182.8406219482422, "logps/rejected": -525.8499755859375, "loss": 0.0069, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.250018358230591, "rewards/margins": 23.58203125, "rewards/rejected": -25.857812881469727, "step": 11170 }, { "epoch": 2.818832130589607, "grad_norm": 0.11757870018482208, "learning_rate": 5.5263115715621925e-09, "logits/chosen": -0.5669769048690796, "logits/rejected": NaN, "logps/chosen": -212.1999969482422, "logps/rejected": -517.1875, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.881927490234375, "rewards/margins": 23.088281631469727, "rewards/rejected": -25.966405868530273, "step": 11180 }, { "epoch": 2.8213531654744273, "grad_norm": 4.0108819007873535, "learning_rate": 5.373997359915172e-09, "logits/chosen": -0.5775344967842102, "logits/rejected": NaN, "logps/chosen": -200.38436889648438, "logps/rejected": -521.9000244140625, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.4747862815856934, "rewards/margins": 23.15625, "rewards/rejected": -25.621875762939453, "step": 11190 }, { "epoch": 2.8238742003592474, "grad_norm": 6.865295886993408, "learning_rate": 5.223788794112449e-09, "logits/chosen": -0.635754406452179, "logits/rejected": NaN, "logps/chosen": -183.14688110351562, "logps/rejected": -494.3500061035156, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.704720973968506, "rewards/margins": 22.185937881469727, "rewards/rejected": -24.895313262939453, "step": 11200 }, { "epoch": 2.8263952352440675, "grad_norm": 27.493497848510742, "learning_rate": 5.075687167091169e-09, "logits/chosen": -0.561901867389679, "logits/rejected": NaN, "logps/chosen": -182.5593719482422, "logps/rejected": -499.4750061035156, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.150524854660034, "rewards/margins": 23.388280868530273, "rewards/rejected": -25.534374237060547, "step": 11210 }, { "epoch": 2.8289162701288877, "grad_norm": 0.014758600853383541, "learning_rate": 4.9296937536527635e-09, "logits/chosen": -0.4463996887207031, "logits/rejected": NaN, "logps/chosen": -166.34375, "logps/rejected": -485.4125061035156, "loss": 0.0157, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4957518577575684, "rewards/margins": 22.127344131469727, "rewards/rejected": -24.616405487060547, "step": 11220 }, { "epoch": 2.8314373050137083, "grad_norm": 0.06804516166448593, "learning_rate": 4.785809810451958e-09, "logits/chosen": -0.6615234613418579, "logits/rejected": NaN, "logps/chosen": -225.80313110351562, "logps/rejected": -514.4500122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.47039794921875, "rewards/margins": 23.446094512939453, "rewards/rejected": -24.921875, "step": 11230 }, { "epoch": 2.8339583398985284, "grad_norm": 0.0011388554703444242, "learning_rate": 4.644036575985999e-09, "logits/chosen": -0.6394103765487671, "logits/rejected": -0.6281372308731079, "logps/chosen": -180.7375030517578, "logps/rejected": -532.7125244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.27203369140625, "rewards/margins": 23.789844512939453, "rewards/rejected": -26.067188262939453, "step": 11240 }, { "epoch": 2.8364793747833486, "grad_norm": 5.495447635650635, "learning_rate": 4.504375270583921e-09, "logits/chosen": -0.6085449457168579, "logits/rejected": -0.6452575922012329, "logps/chosen": -180.86563110351562, "logps/rejected": -546.0, "loss": 0.0028, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.786901831626892, "rewards/margins": 23.610157012939453, "rewards/rejected": -25.397655487060547, "step": 11250 }, { "epoch": 2.8390004096681687, "grad_norm": 0.23765547573566437, "learning_rate": 4.366827096396131e-09, "logits/chosen": -0.661938488483429, "logits/rejected": NaN, "logps/chosen": -204.9140625, "logps/rejected": -537.0750122070312, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.528454542160034, "rewards/margins": 22.921875, "rewards/rejected": -25.44921875, "step": 11260 }, { "epoch": 2.8415214445529893, "grad_norm": 0.06473126262426376, "learning_rate": 4.231393237384057e-09, "logits/chosen": -0.569378674030304, "logits/rejected": -0.7176154851913452, "logps/chosen": -171.1281280517578, "logps/rejected": -513.3250122070312, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.197643995285034, "rewards/margins": 23.370311737060547, "rewards/rejected": -25.568750381469727, "step": 11270 }, { "epoch": 2.8440424794378094, "grad_norm": 0.008800173178315163, "learning_rate": 4.098074859309825e-09, "logits/chosen": -0.552966296672821, "logits/rejected": NaN, "logps/chosen": -188.30233764648438, "logps/rejected": -509.4750061035156, "loss": 0.0029, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.2547240257263184, "rewards/margins": 23.503124237060547, "rewards/rejected": -25.760936737060547, "step": 11280 }, { "epoch": 2.8465635143226296, "grad_norm": 14.837903022766113, "learning_rate": 3.9668731097264315e-09, "logits/chosen": -0.563427746295929, "logits/rejected": NaN, "logps/chosen": -174.578125, "logps/rejected": -506.9125061035156, "loss": 0.0071, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.0876007080078125, "rewards/margins": 23.244531631469727, "rewards/rejected": -25.329687118530273, "step": 11290 }, { "epoch": 2.8490845492074497, "grad_norm": 0.003233085386455059, "learning_rate": 3.837789117967643e-09, "logits/chosen": -0.7621399164199829, "logits/rejected": NaN, "logps/chosen": -208.4812469482422, "logps/rejected": -524.5499877929688, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6551146507263184, "rewards/margins": 23.544530868530273, "rewards/rejected": -26.19921875, "step": 11300 }, { "epoch": 2.85160558409227, "grad_norm": 0.1335436850786209, "learning_rate": 3.7108239951385014e-09, "logits/chosen": -0.5518890619277954, "logits/rejected": NaN, "logps/chosen": -183.68124389648438, "logps/rejected": -504.45001220703125, "loss": 0.0079, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.805755615234375, "rewards/margins": 23.374217987060547, "rewards/rejected": -25.174999237060547, "step": 11310 }, { "epoch": 2.85412661897709, "grad_norm": 0.4535803496837616, "learning_rate": 3.585978834105524e-09, "logits/chosen": -0.56915283203125, "logits/rejected": NaN, "logps/chosen": -187.4812469482422, "logps/rejected": -503.375, "loss": 0.0086, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.641552686691284, "rewards/margins": 23.3203125, "rewards/rejected": -25.967187881469727, "step": 11320 }, { "epoch": 2.85664765386191, "grad_norm": 0.3353064954280853, "learning_rate": 3.463254709487551e-09, "logits/chosen": -0.5579208135604858, "logits/rejected": NaN, "logps/chosen": -191.6999969482422, "logps/rejected": -534.5499877929688, "loss": 0.0109, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.554486036300659, "rewards/margins": 23.0703125, "rewards/rejected": -25.619531631469727, "step": 11330 }, { "epoch": 2.8591686887467307, "grad_norm": 0.353141188621521, "learning_rate": 3.342652677646246e-09, "logits/chosen": -0.5489776730537415, "logits/rejected": NaN, "logps/chosen": -174.47421264648438, "logps/rejected": -497.5249938964844, "loss": 0.012, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.1373047828674316, "rewards/margins": 23.196874618530273, "rewards/rejected": -25.337499618530273, "step": 11340 }, { "epoch": 2.861689723631551, "grad_norm": 0.1515815407037735, "learning_rate": 3.2241737766771637e-09, "logits/chosen": -0.619738757610321, "logits/rejected": NaN, "logps/chosen": -193.828125, "logps/rejected": -515.8499755859375, "loss": 0.0064, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.586657762527466, "rewards/margins": 23.431249618530273, "rewards/rejected": -26.021875381469727, "step": 11350 }, { "epoch": 2.864210758516371, "grad_norm": 0.06697161495685577, "learning_rate": 3.1078190264008376e-09, "logits/chosen": -0.487396240234375, "logits/rejected": NaN, "logps/chosen": -173.4968719482422, "logps/rejected": -498.4375, "loss": 0.0078, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.023974657058716, "rewards/margins": 23.360937118530273, "rewards/rejected": -25.387500762939453, "step": 11360 }, { "epoch": 2.866731793401191, "grad_norm": 0.010781998746097088, "learning_rate": 2.9935894283538154e-09, "logits/chosen": -0.4952239990234375, "logits/rejected": -0.53955078125, "logps/chosen": -181.27188110351562, "logps/rejected": -519.2249755859375, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4141297340393066, "rewards/margins": 23.395313262939453, "rewards/rejected": -25.807031631469727, "step": 11370 }, { "epoch": 2.8692528282860112, "grad_norm": 0.12105485051870346, "learning_rate": 2.8814859657802227e-09, "logits/chosen": -0.5911773443222046, "logits/rejected": -0.6278076171875, "logps/chosen": -193.1750030517578, "logps/rejected": -510.3999938964844, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.8079345226287842, "rewards/margins": 23.828125, "rewards/rejected": -25.629688262939453, "step": 11380 }, { "epoch": 2.871773863170832, "grad_norm": 0.0028187879361212254, "learning_rate": 2.77150960362324e-09, "logits/chosen": -0.6187225580215454, "logits/rejected": -0.7689208984375, "logps/chosen": -181.578125, "logps/rejected": -512.3250122070312, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0689697265625, "rewards/margins": 23.749217987060547, "rewards/rejected": -25.813282012939453, "step": 11390 }, { "epoch": 2.874294898055652, "grad_norm": 0.006821407005190849, "learning_rate": 2.6636612885167775e-09, "logits/chosen": NaN, "logits/rejected": -0.5026825070381165, "logps/chosen": -193.84530639648438, "logps/rejected": -508.95001220703125, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.564038038253784, "rewards/margins": 23.68359375, "rewards/rejected": -26.248437881469727, "step": 11400 }, { "epoch": 2.876815932940472, "grad_norm": 0.01892421394586563, "learning_rate": 2.5579419487773424e-09, "logits/chosen": -0.5743957757949829, "logits/rejected": -0.6407928466796875, "logps/chosen": -187.3625030517578, "logps/rejected": -522.5499877929688, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9545608758926392, "rewards/margins": 24.5390625, "rewards/rejected": -26.489063262939453, "step": 11410 }, { "epoch": 2.8793369678252922, "grad_norm": 0.019449761137366295, "learning_rate": 2.4543524943960448e-09, "logits/chosen": -0.5170532464981079, "logits/rejected": NaN, "logps/chosen": -180.30313110351562, "logps/rejected": -520.4749755859375, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0782470703125, "rewards/margins": 23.923437118530273, "rewards/rejected": -26.006250381469727, "step": 11420 }, { "epoch": 2.8818580027101124, "grad_norm": 0.037210751324892044, "learning_rate": 2.352893817030799e-09, "logits/chosen": -0.6040191650390625, "logits/rejected": NaN, "logps/chosen": -189.8406219482422, "logps/rejected": -524.6749877929688, "loss": 0.0322, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.509143114089966, "rewards/margins": 22.389842987060547, "rewards/rejected": -24.8984375, "step": 11430 }, { "epoch": 2.8843790375949325, "grad_norm": 0.03474215790629387, "learning_rate": 2.253566789998523e-09, "logits/chosen": -0.5817291140556335, "logits/rejected": NaN, "logps/chosen": -186.4875030517578, "logps/rejected": -512.875, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.784661889076233, "rewards/margins": 23.47265625, "rewards/rejected": -25.253124237060547, "step": 11440 }, { "epoch": 2.8869000724797527, "grad_norm": 0.012929543852806091, "learning_rate": 2.156372268267842e-09, "logits/chosen": -0.6210479736328125, "logits/rejected": -0.6966797113418579, "logps/chosen": -191.08438110351562, "logps/rejected": -527.2750244140625, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3511931896209717, "rewards/margins": 23.266407012939453, "rewards/rejected": -25.612499237060547, "step": 11450 }, { "epoch": 2.8894211073645732, "grad_norm": 0.0009751119650900364, "learning_rate": 2.061311088451506e-09, "logits/chosen": -0.59698486328125, "logits/rejected": -0.6089752316474915, "logps/chosen": -176.4656219482422, "logps/rejected": -502.95001220703125, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.327557325363159, "rewards/margins": 23.161718368530273, "rewards/rejected": -25.495311737060547, "step": 11460 }, { "epoch": 2.8919421422493934, "grad_norm": 0.046667251735925674, "learning_rate": 1.9683840687993448e-09, "logits/chosen": -0.5225860476493835, "logits/rejected": NaN, "logps/chosen": -177.69375610351562, "logps/rejected": -486.5, "loss": 0.0417, "rewards/accuracies": 0.984375, "rewards/chosen": -2.467358350753784, "rewards/margins": 22.70703125, "rewards/rejected": -25.172657012939453, "step": 11470 }, { "epoch": 2.8944631771342135, "grad_norm": 0.012255662120878696, "learning_rate": 1.8775920091911034e-09, "logits/chosen": -0.562207043170929, "logits/rejected": NaN, "logps/chosen": -188.8625030517578, "logps/rejected": -536.5750122070312, "loss": 0.011, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.348828077316284, "rewards/margins": 23.010936737060547, "rewards/rejected": -25.362499237060547, "step": 11480 }, { "epoch": 2.8969842120190337, "grad_norm": 0.003325071418657899, "learning_rate": 1.7889356911296448e-09, "logits/chosen": -0.46673583984375, "logits/rejected": NaN, "logps/chosen": -179.8000030517578, "logps/rejected": -519.3499755859375, "loss": 0.0069, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.948144555091858, "rewards/margins": 23.278905868530273, "rewards/rejected": -25.21875, "step": 11490 }, { "epoch": 2.8995052469038542, "grad_norm": 0.013228733092546463, "learning_rate": 1.702415877734259e-09, "logits/chosen": -0.5218475461006165, "logits/rejected": -0.6741393804550171, "logps/chosen": -183.50625610351562, "logps/rejected": -508.2749938964844, "loss": 0.0144, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.3779358863830566, "rewards/margins": 23.48046875, "rewards/rejected": -25.854686737060547, "step": 11500 }, { "epoch": 2.9020262817886744, "grad_norm": 0.040093716233968735, "learning_rate": 1.6180333137339186e-09, "logits/chosen": -0.619067370891571, "logits/rejected": NaN, "logps/chosen": -191.72500610351562, "logps/rejected": -509.79998779296875, "loss": 0.013, "rewards/accuracies": 0.984375, "rewards/chosen": -2.891796827316284, "rewards/margins": 22.58203125, "rewards/rejected": -25.46875, "step": 11510 }, { "epoch": 2.9045473166734945, "grad_norm": 0.01464893203228712, "learning_rate": 1.5357887254610623e-09, "logits/chosen": -0.4354919493198395, "logits/rejected": -0.6741714477539062, "logps/chosen": -176.4812469482422, "logps/rejected": -512.5374755859375, "loss": 0.0132, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.273449659347534, "rewards/margins": 22.803905487060547, "rewards/rejected": -25.0703125, "step": 11520 }, { "epoch": 2.9070683515583147, "grad_norm": 0.001219803700223565, "learning_rate": 1.4556828208452388e-09, "logits/chosen": -0.6785491704940796, "logits/rejected": NaN, "logps/chosen": -200.21875, "logps/rejected": -511.04998779296875, "loss": 0.0227, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6597900390625, "rewards/margins": 22.928905487060547, "rewards/rejected": -25.592967987060547, "step": 11530 }, { "epoch": 2.909589386443135, "grad_norm": 0.004240900278091431, "learning_rate": 1.3777162894070272e-09, "logits/chosen": -0.621081531047821, "logits/rejected": NaN, "logps/chosen": -182.1062469482422, "logps/rejected": -527.5625, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9327392578125, "rewards/margins": 24.142187118530273, "rewards/rejected": -26.075000762939453, "step": 11540 }, { "epoch": 2.912110421327955, "grad_norm": 0.039362505078315735, "learning_rate": 1.3018898022521263e-09, "logits/chosen": -0.4564270079135895, "logits/rejected": -0.6351989507675171, "logps/chosen": -176.6531219482422, "logps/rejected": -505.04998779296875, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.5458436012268066, "rewards/margins": 22.345312118530273, "rewards/rejected": -24.892187118530273, "step": 11550 }, { "epoch": 2.914631456212775, "grad_norm": 0.05185255780816078, "learning_rate": 1.2282040120655534e-09, "logits/chosen": -0.4644103944301605, "logits/rejected": -0.5674194097518921, "logps/chosen": -195.3125, "logps/rejected": -496.8125, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8215575218200684, "rewards/margins": 22.624217987060547, "rewards/rejected": -25.4375, "step": 11560 }, { "epoch": 2.9171524910975957, "grad_norm": 0.04047249257564545, "learning_rate": 1.1566595531060374e-09, "logits/chosen": -0.599139392375946, "logits/rejected": NaN, "logps/chosen": -183.05624389648438, "logps/rejected": -484.6875, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.718579053878784, "rewards/margins": 22.317968368530273, "rewards/rejected": -25.045312881469727, "step": 11570 }, { "epoch": 2.919673525982416, "grad_norm": 1.9203213453292847, "learning_rate": 1.087257041200551e-09, "logits/chosen": -0.5541900396347046, "logits/rejected": NaN, "logps/chosen": -172.328125, "logps/rejected": -515.75, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9318726062774658, "rewards/margins": 23.635936737060547, "rewards/rejected": -25.56640625, "step": 11580 }, { "epoch": 2.922194560867236, "grad_norm": 0.6373929977416992, "learning_rate": 1.019997073739065e-09, "logits/chosen": -0.6421111822128296, "logits/rejected": NaN, "logps/chosen": -197.02499389648438, "logps/rejected": -527.5875244140625, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.207629442214966, "rewards/margins": 23.204687118530273, "rewards/rejected": -25.412500381469727, "step": 11590 }, { "epoch": 2.924715595752056, "grad_norm": 2.3887343406677246, "learning_rate": 9.548802296692749e-10, "logits/chosen": -0.4904327392578125, "logits/rejected": NaN, "logps/chosen": -187.21875, "logps/rejected": -512.5499877929688, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.9633057117462158, "rewards/margins": 23.025781631469727, "rewards/rejected": -24.999217987060547, "step": 11600 }, { "epoch": 2.9272366306368767, "grad_norm": 0.002411498222500086, "learning_rate": 8.919070694917708e-10, "logits/chosen": -0.5903106927871704, "logits/rejected": NaN, "logps/chosen": -183.59375, "logps/rejected": -499.1499938964844, "loss": 0.0063, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.442657470703125, "rewards/margins": 22.854686737060547, "rewards/rejected": -25.296092987060547, "step": 11610 }, { "epoch": 2.929757665521697, "grad_norm": 0.21687322854995728, "learning_rate": 8.310781352550977e-10, "logits/chosen": -0.5360077023506165, "logits/rejected": NaN, "logps/chosen": -171.265625, "logps/rejected": -513.0750122070312, "loss": 0.0091, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -1.6723816394805908, "rewards/margins": 23.475780487060547, "rewards/rejected": -25.154687881469727, "step": 11620 }, { "epoch": 2.932278700406517, "grad_norm": 0.0026091921608895063, "learning_rate": 7.723939505511478e-10, "logits/chosen": -0.6072021722793579, "logits/rejected": -0.6943420171737671, "logps/chosen": -168.69375610351562, "logps/rejected": -527.2249755859375, "loss": 0.0119, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8299851417541504, "rewards/margins": 23.744531631469727, "rewards/rejected": -26.578125, "step": 11630 }, { "epoch": 2.934799735291337, "grad_norm": 0.018609557300806046, "learning_rate": 7.15855020510664e-10, "logits/chosen": -0.5442962646484375, "logits/rejected": -0.635693371295929, "logps/chosen": -168.16250610351562, "logps/rejected": -506.6000061035156, "loss": 0.0143, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.0801024436950684, "rewards/margins": 23.264062881469727, "rewards/rejected": -25.342187881469727, "step": 11640 }, { "epoch": 2.9373207701761572, "grad_norm": 0.06253838539123535, "learning_rate": 6.614618317988263e-10, "logits/chosen": -0.5096222162246704, "logits/rejected": -0.557147204875946, "logps/chosen": -180.015625, "logps/rejected": -517.25, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2388854026794434, "rewards/margins": 23.858592987060547, "rewards/rejected": -26.0859375, "step": 11650 }, { "epoch": 2.9398418050609774, "grad_norm": 18.320070266723633, "learning_rate": 6.092148526111451e-10, "logits/chosen": -0.4997054934501648, "logits/rejected": NaN, "logps/chosen": -171.75936889648438, "logps/rejected": -544.2999877929688, "loss": 0.0079, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.432781934738159, "rewards/margins": 23.026561737060547, "rewards/rejected": -25.452342987060547, "step": 11660 }, { "epoch": 2.9423628399457975, "grad_norm": 0.025767100974917412, "learning_rate": 5.591145326693525e-10, "logits/chosen": -0.5679260492324829, "logits/rejected": -0.620404064655304, "logps/chosen": -188.0656280517578, "logps/rejected": -502.70001220703125, "loss": 0.014, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.574084520339966, "rewards/margins": 23.493749618530273, "rewards/rejected": -26.064844131469727, "step": 11670 }, { "epoch": 2.944883874830618, "grad_norm": 1.6805107593536377, "learning_rate": 5.111613032176277e-10, "logits/chosen": -0.49570924043655396, "logits/rejected": NaN, "logps/chosen": -180.375, "logps/rejected": -539.3125, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.195812940597534, "rewards/margins": 23.280467987060547, "rewards/rejected": -25.4765625, "step": 11680 }, { "epoch": 2.9474049097154382, "grad_norm": 0.09722450375556946, "learning_rate": 4.6535557701873896e-10, "logits/chosen": -0.6372619867324829, "logits/rejected": NaN, "logps/chosen": -180.51406860351562, "logps/rejected": -498.70001220703125, "loss": 0.0217, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.157003879547119, "rewards/margins": 23.232030868530273, "rewards/rejected": -25.387500762939453, "step": 11690 }, { "epoch": 2.9499259446002584, "grad_norm": 3.3431973457336426, "learning_rate": 4.216977483506856e-10, "logits/chosen": -0.6600707769393921, "logits/rejected": NaN, "logps/chosen": -220.75625610351562, "logps/rejected": -535.8250122070312, "loss": 0.0469, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.378753662109375, "rewards/margins": 23.456249237060547, "rewards/rejected": -25.831249237060547, "step": 11700 }, { "epoch": 2.9524469794850785, "grad_norm": 0.35802513360977173, "learning_rate": 3.8018819300308925e-10, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -170.4171905517578, "logps/rejected": -506.875, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.398364305496216, "rewards/margins": 23.270313262939453, "rewards/rejected": -25.664844512939453, "step": 11710 }, { "epoch": 2.954968014369899, "grad_norm": 0.9873806834220886, "learning_rate": 3.408272682741409e-10, "logits/chosen": -0.600146472454071, "logits/rejected": NaN, "logps/chosen": -182.83749389648438, "logps/rejected": -506.25, "loss": 0.0194, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.978796362876892, "rewards/margins": 23.364063262939453, "rewards/rejected": -25.344532012939453, "step": 11720 }, { "epoch": 2.9574890492547192, "grad_norm": 0.014487285166978836, "learning_rate": 3.036153129674368e-10, "logits/chosen": -0.548107922077179, "logits/rejected": -0.651275634765625, "logps/chosen": -193.64999389648438, "logps/rejected": -490.0249938964844, "loss": 0.006, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.8207764625549316, "rewards/margins": 22.640625, "rewards/rejected": -25.46875, "step": 11730 }, { "epoch": 2.9600100841395394, "grad_norm": 0.16425976157188416, "learning_rate": 2.685526473890365e-10, "logits/chosen": -0.5684814453125, "logits/rejected": NaN, "logps/chosen": -189.27188110351562, "logps/rejected": -507.63751220703125, "loss": 0.0112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.409027099609375, "rewards/margins": 23.396093368530273, "rewards/rejected": -25.795312881469727, "step": 11740 }, { "epoch": 2.9625311190243595, "grad_norm": 0.3813161253929138, "learning_rate": 2.3563957334482575e-10, "logits/chosen": -0.557482898235321, "logits/rejected": NaN, "logps/chosen": -188.80624389648438, "logps/rejected": -530.4625244140625, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1075682640075684, "rewards/margins": 23.164844512939453, "rewards/rejected": -25.275781631469727, "step": 11750 }, { "epoch": 2.9650521539091796, "grad_norm": 0.015139306895434856, "learning_rate": 2.0487637413776903e-10, "logits/chosen": -0.577178955078125, "logits/rejected": -0.7109611630439758, "logps/chosen": -183.56405639648438, "logps/rejected": -530.8125, "loss": 0.0112, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.6968626976013184, "rewards/margins": 23.536718368530273, "rewards/rejected": -26.231250762939453, "step": 11760 }, { "epoch": 2.967573188794, "grad_norm": 0.008983856067061424, "learning_rate": 1.762633145655501e-10, "logits/chosen": -0.5163558721542358, "logits/rejected": NaN, "logps/chosen": -169.10000610351562, "logps/rejected": -540.2999877929688, "loss": 0.0494, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8533751964569092, "rewards/margins": 23.720312118530273, "rewards/rejected": -25.581249237060547, "step": 11770 }, { "epoch": 2.97009422367882, "grad_norm": 0.6495814323425293, "learning_rate": 1.4980064091835166e-10, "logits/chosen": -0.6654937863349915, "logits/rejected": -0.743695080280304, "logps/chosen": -184.5, "logps/rejected": -517.7249755859375, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2094216346740723, "rewards/margins": 23.12109375, "rewards/rejected": -25.331249237060547, "step": 11780 }, { "epoch": 2.9726152585636405, "grad_norm": 0.001566762919537723, "learning_rate": 1.2548858097655157e-10, "logits/chosen": -0.5447632074356079, "logits/rejected": NaN, "logps/chosen": -181.2687530517578, "logps/rejected": -518.5999755859375, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.292724609375, "rewards/margins": 23.377344131469727, "rewards/rejected": -25.665624618530273, "step": 11790 }, { "epoch": 2.9751362934484606, "grad_norm": 70.58358764648438, "learning_rate": 1.0332734400897437e-10, "logits/chosen": -0.582012951374054, "logits/rejected": NaN, "logps/chosen": -173.6999969482422, "logps/rejected": -508.54998779296875, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.977899193763733, "rewards/margins": 24.021093368530273, "rewards/rejected": -25.990625381469727, "step": 11800 }, { "epoch": 2.977657328333281, "grad_norm": 0.02307814173400402, "learning_rate": 8.331712077094821e-11, "logits/chosen": -0.6499572992324829, "logits/rejected": -0.6706878542900085, "logps/chosen": -209.13125610351562, "logps/rejected": -525.5, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.872204542160034, "rewards/margins": 22.524219512939453, "rewards/rejected": -25.393749237060547, "step": 11810 }, { "epoch": 2.980178363218101, "grad_norm": 0.029035158455371857, "learning_rate": 6.545808350272297e-11, "logits/chosen": -0.5842529535293579, "logits/rejected": -0.7164306640625, "logps/chosen": -189.203125, "logps/rejected": -518.7249755859375, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.45147705078125, "rewards/margins": 23.453125, "rewards/rejected": -25.912500381469727, "step": 11820 }, { "epoch": 2.9826993981029215, "grad_norm": 0.03365403786301613, "learning_rate": 4.9750385927971315e-11, "logits/chosen": -0.530230700969696, "logits/rejected": -0.7279022336006165, "logps/chosen": -211.71249389648438, "logps/rejected": -549.0, "loss": 0.0155, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9494996070861816, "rewards/margins": 23.158594131469727, "rewards/rejected": -26.098438262939453, "step": 11830 }, { "epoch": 2.9852204329877416, "grad_norm": 0.18058663606643677, "learning_rate": 3.619416325251201e-11, "logits/chosen": -0.6911224126815796, "logits/rejected": NaN, "logps/chosen": -199.13125610351562, "logps/rejected": -529.2999877929688, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.116955518722534, "rewards/margins": 23.560155868530273, "rewards/rejected": -25.672657012939453, "step": 11840 }, { "epoch": 2.987741467872562, "grad_norm": 0.0009694344480521977, "learning_rate": 2.4789532162977632e-11, "logits/chosen": -0.5097717046737671, "logits/rejected": -0.6451660394668579, "logps/chosen": -176.5515594482422, "logps/rejected": -506.375, "loss": 0.0151, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.3196778297424316, "rewards/margins": 23.283594131469727, "rewards/rejected": -25.615625381469727, "step": 11850 }, { "epoch": 2.990262502757382, "grad_norm": 0.8926187753677368, "learning_rate": 1.5536590826065177e-11, "logits/chosen": -0.526409924030304, "logits/rejected": NaN, "logps/chosen": -181.078125, "logps/rejected": -501.1000061035156, "loss": 0.0092, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.451306104660034, "rewards/margins": 23.240625381469727, "rewards/rejected": -25.692968368530273, "step": 11860 }, { "epoch": 2.992783537642202, "grad_norm": 0.02170671336352825, "learning_rate": 8.435418887509094e-12, "logits/chosen": -0.5674682855606079, "logits/rejected": NaN, "logps/chosen": -189.2234344482422, "logps/rejected": -525.6500244140625, "loss": 0.0101, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.7766356468200684, "rewards/margins": 22.827342987060547, "rewards/rejected": -25.594532012939453, "step": 11870 }, { "epoch": 2.995304572527022, "grad_norm": 21.275997161865234, "learning_rate": 3.486077471415161e-12, "logits/chosen": -0.554400622844696, "logits/rejected": -0.5768340826034546, "logps/chosen": -185.00625610351562, "logps/rejected": -515.5625, "loss": 0.0122, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1358399391174316, "rewards/margins": 23.147655487060547, "rewards/rejected": -25.2734375, "step": 11880 }, { "epoch": 2.9978256074118423, "grad_norm": 0.07857047766447067, "learning_rate": 6.886091798441462e-13, "logits/chosen": -0.537109375, "logits/rejected": NaN, "logps/chosen": -183.9875030517578, "logps/rejected": -550.625, "loss": 0.0088, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.9035736322402954, "rewards/margins": 23.746875762939453, "rewards/rejected": -25.651561737060547, "step": 11890 } ], "logging_steps": 10, "max_steps": 11898, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }