{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9998424353196986, "eval_steps": 500, "global_step": 11898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025210348848202185, "grad_norm": 440.4095153808594, "learning_rate": 3.781512605042016e-09, "logits/chosen": -0.577069103717804, "logits/rejected": NaN, "logps/chosen": -132.00936889648438, "logps/rejected": -500.0, "loss": 0.6941, "rewards/accuracies": 0.296875, "rewards/chosen": 0.0025633841287344694, "rewards/margins": 0.000540164124686271, "rewards/rejected": 0.0020439147483557463, "step": 10 }, { "epoch": 0.005042069769640437, "grad_norm": 443.8802490234375, "learning_rate": 7.983193277310924e-09, "logits/chosen": -0.6251770257949829, "logits/rejected": NaN, "logps/chosen": -126.79374694824219, "logps/rejected": -527.0, "loss": 0.708, "rewards/accuracies": 0.3343749940395355, "rewards/chosen": -0.009954547509551048, "rewards/margins": -0.01906433142721653, "rewards/rejected": 0.00906524620950222, "step": 20 }, { "epoch": 0.007563104654460656, "grad_norm": 455.8319091796875, "learning_rate": 1.2184873949579832e-08, "logits/chosen": -0.5696380734443665, "logits/rejected": NaN, "logps/chosen": -131.93905639648438, "logps/rejected": -514.5875244140625, "loss": 0.7107, "rewards/accuracies": 0.359375, "rewards/chosen": -0.004595947451889515, "rewards/margins": -0.016945267096161842, "rewards/rejected": 0.012355041690170765, "step": 30 }, { "epoch": 0.010084139539280874, "grad_norm": 419.24725341796875, "learning_rate": 1.638655462184874e-08, "logits/chosen": -0.6208831667900085, "logits/rejected": -0.7034057378768921, "logps/chosen": -138.68124389648438, "logps/rejected": -495.4750061035156, "loss": 0.6965, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.004216575529426336, "rewards/margins": 0.008298492059111595, "rewards/rejected": -0.01253509521484375, "step": 40 }, { "epoch": 0.012605174424101093, "grad_norm": 435.0349426269531, "learning_rate": 2.0588235294117647e-08, "logits/chosen": -0.551074206829071, "logits/rejected": -0.4991699159145355, "logps/chosen": -137.7578125, "logps/rejected": -512.7374877929688, "loss": 0.6874, "rewards/accuracies": 0.46562498807907104, "rewards/chosen": -0.008432770147919655, "rewards/margins": 0.0336339958012104, "rewards/rejected": -0.04209594801068306, "step": 50 }, { "epoch": 0.015126209308921312, "grad_norm": 372.1994323730469, "learning_rate": 2.478991596638655e-08, "logits/chosen": -0.564129650592804, "logits/rejected": NaN, "logps/chosen": -131.20468139648438, "logps/rejected": -510.45001220703125, "loss": 0.6557, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.016366008669137955, "rewards/margins": 0.09404639899730682, "rewards/rejected": -0.1104477196931839, "step": 60 }, { "epoch": 0.01764724419374153, "grad_norm": 429.7486267089844, "learning_rate": 2.8991596638655463e-08, "logits/chosen": -0.71234130859375, "logits/rejected": NaN, "logps/chosen": -146.85000610351562, "logps/rejected": -509.8999938964844, "loss": 0.6369, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.020509671419858932, "rewards/margins": 0.13446465134620667, "rewards/rejected": -0.15505218505859375, "step": 70 }, { "epoch": 0.020168279078561748, "grad_norm": 317.4393310546875, "learning_rate": 3.319327731092437e-08, "logits/chosen": -0.6867309808731079, "logits/rejected": NaN, "logps/chosen": -133.890625, "logps/rejected": -508.6000061035156, "loss": 0.5794, "rewards/accuracies": 0.778124988079071, "rewards/chosen": -0.06360282748937607, "rewards/margins": 0.2713913023471832, "rewards/rejected": -0.3348846435546875, "step": 80 }, { "epoch": 0.022689313963381967, "grad_norm": 319.6298522949219, "learning_rate": 3.7394957983193276e-08, "logits/chosen": -0.5997711420059204, "logits/rejected": -0.724536120891571, "logps/chosen": -146.1687469482422, "logps/rejected": -528.2249755859375, "loss": 0.5049, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": -0.081792451441288, "rewards/margins": 0.4673599302768707, "rewards/rejected": -0.5494140386581421, "step": 90 }, { "epoch": 0.025210348848202186, "grad_norm": 252.6404266357422, "learning_rate": 4.159663865546218e-08, "logits/chosen": -0.5720275640487671, "logits/rejected": -0.643786609172821, "logps/chosen": -132.890625, "logps/rejected": -523.9124755859375, "loss": 0.4762, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.10703583061695099, "rewards/margins": 0.5750762820243835, "rewards/rejected": -0.6818603277206421, "step": 100 }, { "epoch": 0.027731383733022405, "grad_norm": 172.73245239257812, "learning_rate": 4.5798319327731086e-08, "logits/chosen": -0.5433899164199829, "logits/rejected": -0.5498291254043579, "logps/chosen": -148.9499969482422, "logps/rejected": -524.3499755859375, "loss": 0.3481, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.13220195472240448, "rewards/margins": 1.080407738685608, "rewards/rejected": -1.2128417491912842, "step": 110 }, { "epoch": 0.030252418617842624, "grad_norm": 156.39102172851562, "learning_rate": 5e-08, "logits/chosen": -0.5068206787109375, "logits/rejected": NaN, "logps/chosen": -142.796875, "logps/rejected": -549.875, "loss": 0.3064, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.21010513603687286, "rewards/margins": 1.3999512195587158, "rewards/rejected": -1.6103026866912842, "step": 120 }, { "epoch": 0.03277345350266284, "grad_norm": 149.56472778320312, "learning_rate": 5.420168067226891e-08, "logits/chosen": -0.5582367181777954, "logits/rejected": -0.584747314453125, "logps/chosen": -129.6796875, "logps/rejected": -537.2750244140625, "loss": 0.2549, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.2083911895751953, "rewards/margins": 1.713623046875, "rewards/rejected": -1.921630859375, "step": 130 }, { "epoch": 0.03529448838748306, "grad_norm": 129.69622802734375, "learning_rate": 5.8403361344537814e-08, "logits/chosen": -0.4997390806674957, "logits/rejected": NaN, "logps/chosen": -136.14688110351562, "logps/rejected": -543.4500122070312, "loss": 0.2281, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.16068115830421448, "rewards/margins": 1.9688599109649658, "rewards/rejected": -2.1297850608825684, "step": 140 }, { "epoch": 0.03781552327230328, "grad_norm": 100.1859359741211, "learning_rate": 6.260504201680673e-08, "logits/chosen": -0.510577380657196, "logits/rejected": NaN, "logps/chosen": -127.3968734741211, "logps/rejected": -565.8499755859375, "loss": 0.1715, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.1392166167497635, "rewards/margins": 2.439697265625, "rewards/rejected": -2.5789551734924316, "step": 150 }, { "epoch": 0.040336558157123496, "grad_norm": 83.32747650146484, "learning_rate": 6.680672268907563e-08, "logits/chosen": -0.4240051209926605, "logits/rejected": NaN, "logps/chosen": -143.70156860351562, "logps/rejected": -531.8250122070312, "loss": 0.1581, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.10658035427331924, "rewards/margins": 2.8875975608825684, "rewards/rejected": -2.995068311691284, "step": 160 }, { "epoch": 0.04285759304194372, "grad_norm": 67.98423767089844, "learning_rate": 7.100840336134454e-08, "logits/chosen": -0.541229248046875, "logits/rejected": -0.49873656034469604, "logps/chosen": -138.765625, "logps/rejected": -539.7000122070312, "loss": 0.1166, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -0.09358825534582138, "rewards/margins": 3.6114258766174316, "rewards/rejected": -3.706249952316284, "step": 170 }, { "epoch": 0.045378627926763934, "grad_norm": 45.3515739440918, "learning_rate": 7.521008403361344e-08, "logits/chosen": -0.5728424191474915, "logits/rejected": -0.562542736530304, "logps/chosen": -147.10000610351562, "logps/rejected": -577.4249877929688, "loss": 0.1048, "rewards/accuracies": 0.96875, "rewards/chosen": -0.25597381591796875, "rewards/margins": 4.108007907867432, "rewards/rejected": -4.362597465515137, "step": 180 }, { "epoch": 0.04789966281158416, "grad_norm": 82.60939025878906, "learning_rate": 7.941176470588235e-08, "logits/chosen": -0.5129119753837585, "logits/rejected": NaN, "logps/chosen": -151.75936889648438, "logps/rejected": -580.75, "loss": 0.1199, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.2112220823764801, "rewards/margins": 4.51318359375, "rewards/rejected": -4.723340034484863, "step": 190 }, { "epoch": 0.05042069769640437, "grad_norm": 30.832969665527344, "learning_rate": 8.361344537815125e-08, "logits/chosen": -0.5011871457099915, "logits/rejected": -0.43841248750686646, "logps/chosen": -136.84375, "logps/rejected": -560.1749877929688, "loss": 0.0811, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.2356243133544922, "rewards/margins": 4.741991996765137, "rewards/rejected": -4.980664253234863, "step": 200 }, { "epoch": 0.052941732581224595, "grad_norm": 106.49891662597656, "learning_rate": 8.781512605042016e-08, "logits/chosen": -0.40560150146484375, "logits/rejected": NaN, "logps/chosen": -144.66250610351562, "logps/rejected": -548.2125244140625, "loss": 0.1187, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.29690855741500854, "rewards/margins": 4.688330173492432, "rewards/rejected": -4.983691215515137, "step": 210 }, { "epoch": 0.05546276746604481, "grad_norm": 60.09769821166992, "learning_rate": 9.201680672268907e-08, "logits/chosen": -0.4920410215854645, "logits/rejected": -0.4406982362270355, "logps/chosen": -141.0, "logps/rejected": -556.3250122070312, "loss": 0.1035, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.23477134108543396, "rewards/margins": 4.777636528015137, "rewards/rejected": -5.009961128234863, "step": 220 }, { "epoch": 0.05798380235086503, "grad_norm": 99.6902084350586, "learning_rate": 9.621848739495798e-08, "logits/chosen": -0.5321990847587585, "logits/rejected": NaN, "logps/chosen": -137.79296875, "logps/rejected": -543.5499877929688, "loss": 0.1022, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.14935608208179474, "rewards/margins": 5.039648532867432, "rewards/rejected": -5.1865234375, "step": 230 }, { "epoch": 0.06050483723568525, "grad_norm": 68.26801300048828, "learning_rate": 1.004201680672269e-07, "logits/chosen": -0.5098114013671875, "logits/rejected": NaN, "logps/chosen": -136.5515594482422, "logps/rejected": -580.7000122070312, "loss": 0.0738, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.18746796250343323, "rewards/margins": 5.334277153015137, "rewards/rejected": -5.523046970367432, "step": 240 }, { "epoch": 0.06302587212050546, "grad_norm": 104.2403793334961, "learning_rate": 1.046218487394958e-07, "logits/chosen": -0.41889649629592896, "logits/rejected": NaN, "logps/chosen": -135.4210968017578, "logps/rejected": -562.25, "loss": 0.0724, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.23416443169116974, "rewards/margins": 5.51953125, "rewards/rejected": -5.7548828125, "step": 250 }, { "epoch": 0.06554690700532569, "grad_norm": 27.94379997253418, "learning_rate": 1.088235294117647e-07, "logits/chosen": -0.45817261934280396, "logits/rejected": -0.496002197265625, "logps/chosen": -125.0582046508789, "logps/rejected": -582.7000122070312, "loss": 0.0835, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.24934692680835724, "rewards/margins": 5.881249904632568, "rewards/rejected": -6.132226467132568, "step": 260 }, { "epoch": 0.06806794189014591, "grad_norm": 103.5284423828125, "learning_rate": 1.1302521008403361e-07, "logits/chosen": -0.32607728242874146, "logits/rejected": -0.42371827363967896, "logps/chosen": -133.71444702148438, "logps/rejected": -574.0999755859375, "loss": 0.0804, "rewards/accuracies": 0.953125, "rewards/chosen": -0.20223388075828552, "rewards/margins": 5.983984470367432, "rewards/rejected": -6.186718940734863, "step": 270 }, { "epoch": 0.07058897677496612, "grad_norm": 92.21675872802734, "learning_rate": 1.1722689075630252e-07, "logits/chosen": -0.5078521966934204, "logits/rejected": NaN, "logps/chosen": -134.66250610351562, "logps/rejected": -606.5499877929688, "loss": 0.114, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.197132870554924, "rewards/margins": 6.277636528015137, "rewards/rejected": -6.472070217132568, "step": 280 }, { "epoch": 0.07311001165978634, "grad_norm": 86.92982482910156, "learning_rate": 1.2142857142857143e-07, "logits/chosen": -0.489654541015625, "logits/rejected": -0.49092406034469604, "logps/chosen": -161.5046844482422, "logps/rejected": -588.9249877929688, "loss": 0.0852, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -0.3724265992641449, "rewards/margins": 6.262890815734863, "rewards/rejected": -6.637304782867432, "step": 290 }, { "epoch": 0.07563104654460656, "grad_norm": 29.039987564086914, "learning_rate": 1.2563025210084034e-07, "logits/chosen": -0.42262572050094604, "logits/rejected": NaN, "logps/chosen": -143.72225952148438, "logps/rejected": -619.2999877929688, "loss": 0.0404, "rewards/accuracies": 0.984375, "rewards/chosen": -0.05202636867761612, "rewards/margins": 6.761132717132568, "rewards/rejected": -6.810351371765137, "step": 300 }, { "epoch": 0.07815208142942678, "grad_norm": 73.7264175415039, "learning_rate": 1.2983193277310924e-07, "logits/chosen": -0.39212340116500854, "logits/rejected": NaN, "logps/chosen": -147.45156860351562, "logps/rejected": -605.0, "loss": 0.068, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2724365293979645, "rewards/margins": 6.6640625, "rewards/rejected": -6.936132907867432, "step": 310 }, { "epoch": 0.08067311631424699, "grad_norm": 89.97295379638672, "learning_rate": 1.3403361344537815e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -143.62344360351562, "logps/rejected": -582.9749755859375, "loss": 0.0902, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.3591275215148926, "rewards/margins": 6.760937690734863, "rewards/rejected": -7.117968559265137, "step": 320 }, { "epoch": 0.08319415119906722, "grad_norm": 134.4097137451172, "learning_rate": 1.3823529411764705e-07, "logits/chosen": -0.325225830078125, "logits/rejected": -0.31226807832717896, "logps/chosen": -137.3515625, "logps/rejected": -587.2999877929688, "loss": 0.0596, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.332815557718277, "rewards/margins": 6.865624904632568, "rewards/rejected": -7.199023246765137, "step": 330 }, { "epoch": 0.08571518608388744, "grad_norm": 59.434120178222656, "learning_rate": 1.4243697478991596e-07, "logits/chosen": -0.3578124940395355, "logits/rejected": -0.345663458108902, "logps/chosen": -151.04452514648438, "logps/rejected": -595.2249755859375, "loss": 0.1036, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.3701736330986023, "rewards/margins": 6.863671779632568, "rewards/rejected": -7.234570503234863, "step": 340 }, { "epoch": 0.08823622096870766, "grad_norm": 78.33367156982422, "learning_rate": 1.4663865546218486e-07, "logits/chosen": -0.3355819582939148, "logits/rejected": -0.4521728456020355, "logps/chosen": -129.328125, "logps/rejected": -586.25, "loss": 0.0379, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.41057586669921875, "rewards/margins": 7.12890625, "rewards/rejected": -7.538281440734863, "step": 350 }, { "epoch": 0.09075725585352787, "grad_norm": 8.3495454788208, "learning_rate": 1.5084033613445377e-07, "logits/chosen": -0.3832763731479645, "logits/rejected": NaN, "logps/chosen": -126.2421875, "logps/rejected": -588.0999755859375, "loss": 0.041, "rewards/accuracies": 0.984375, "rewards/chosen": -0.19840697944164276, "rewards/margins": 7.523046970367432, "rewards/rejected": -7.724023342132568, "step": 360 }, { "epoch": 0.09327829073834809, "grad_norm": 39.65727996826172, "learning_rate": 1.5504201680672267e-07, "logits/chosen": -0.35246580839157104, "logits/rejected": NaN, "logps/chosen": -133.8937530517578, "logps/rejected": -582.4500122070312, "loss": 0.0467, "rewards/accuracies": 0.984375, "rewards/chosen": -0.43196791410446167, "rewards/margins": 7.970703125, "rewards/rejected": -8.404296875, "step": 370 }, { "epoch": 0.09579932562316831, "grad_norm": 7.947347164154053, "learning_rate": 1.5924369747899158e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -142.2375030517578, "logps/rejected": -591.9000244140625, "loss": 0.0435, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.6537407040596008, "rewards/margins": 7.981493949890137, "rewards/rejected": -8.634180068969727, "step": 380 }, { "epoch": 0.09832036050798854, "grad_norm": 14.516721725463867, "learning_rate": 1.634453781512605e-07, "logits/chosen": -0.3058929443359375, "logits/rejected": NaN, "logps/chosen": -138.54531860351562, "logps/rejected": -605.5499877929688, "loss": 0.0596, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.48939210176467896, "rewards/margins": 8.316210746765137, "rewards/rejected": -8.8037109375, "step": 390 }, { "epoch": 0.10084139539280874, "grad_norm": 15.254312515258789, "learning_rate": 1.676470588235294e-07, "logits/chosen": -0.3211425840854645, "logits/rejected": -0.3604888916015625, "logps/chosen": -147.2156219482422, "logps/rejected": -623.75, "loss": 0.0458, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.712432861328125, "rewards/margins": 8.205273628234863, "rewards/rejected": -8.918359756469727, "step": 400 }, { "epoch": 0.10336243027762897, "grad_norm": 40.877899169921875, "learning_rate": 1.7184873949579832e-07, "logits/chosen": -0.3630828857421875, "logits/rejected": NaN, "logps/chosen": -140.2312469482422, "logps/rejected": -629.5750122070312, "loss": 0.0337, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.534045398235321, "rewards/margins": 9.034375190734863, "rewards/rejected": -9.566015243530273, "step": 410 }, { "epoch": 0.10588346516244919, "grad_norm": 91.06171417236328, "learning_rate": 1.7605042016806722e-07, "logits/chosen": -0.271994024515152, "logits/rejected": -0.3463989198207855, "logps/chosen": -146.0046844482422, "logps/rejected": -641.9000244140625, "loss": 0.0693, "rewards/accuracies": 0.96875, "rewards/chosen": -0.8040019869804382, "rewards/margins": 9.616406440734863, "rewards/rejected": -10.418749809265137, "step": 420 }, { "epoch": 0.1084045000472694, "grad_norm": 63.959781646728516, "learning_rate": 1.8025210084033613e-07, "logits/chosen": -0.3647705018520355, "logits/rejected": -0.36964112520217896, "logps/chosen": -158.95938110351562, "logps/rejected": -616.8499755859375, "loss": 0.0597, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.8494507074356079, "rewards/margins": 9.371874809265137, "rewards/rejected": -10.219531059265137, "step": 430 }, { "epoch": 0.11092553493208962, "grad_norm": 127.61373901367188, "learning_rate": 1.8445378151260503e-07, "logits/chosen": -0.3619628846645355, "logits/rejected": NaN, "logps/chosen": -135.265625, "logps/rejected": -600.1375122070312, "loss": 0.0723, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.43186646699905396, "rewards/margins": 8.658398628234863, "rewards/rejected": -9.088086128234863, "step": 440 }, { "epoch": 0.11344656981690984, "grad_norm": 239.58192443847656, "learning_rate": 1.8865546218487394e-07, "logits/chosen": -0.318746954202652, "logits/rejected": NaN, "logps/chosen": -141.28750610351562, "logps/rejected": -623.875, "loss": 0.0753, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.5964935421943665, "rewards/margins": 8.805273056030273, "rewards/rejected": -9.399999618530273, "step": 450 }, { "epoch": 0.11596760470173006, "grad_norm": 31.077861785888672, "learning_rate": 1.9285714285714284e-07, "logits/chosen": -0.2322540283203125, "logits/rejected": NaN, "logps/chosen": -175.7375030517578, "logps/rejected": -603.3250122070312, "loss": 0.0526, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -0.6093841791152954, "rewards/margins": 8.379687309265137, "rewards/rejected": -8.98828125, "step": 460 }, { "epoch": 0.11848863958655027, "grad_norm": 34.034854888916016, "learning_rate": 1.9705882352941175e-07, "logits/chosen": -0.27532655000686646, "logits/rejected": -0.3618118166923523, "logps/chosen": -145.8406219482422, "logps/rejected": -632.0250244140625, "loss": 0.0284, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5204010009765625, "rewards/margins": 9.147656440734863, "rewards/rejected": -9.667577743530273, "step": 470 }, { "epoch": 0.1210096744713705, "grad_norm": 150.0583953857422, "learning_rate": 2.0126050420168068e-07, "logits/chosen": -0.22016295790672302, "logits/rejected": NaN, "logps/chosen": -143.79531860351562, "logps/rejected": -609.2249755859375, "loss": 0.0601, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.8002033233642578, "rewards/margins": 9.013280868530273, "rewards/rejected": -9.810742378234863, "step": 480 }, { "epoch": 0.12353070935619072, "grad_norm": 154.522216796875, "learning_rate": 2.0546218487394956e-07, "logits/chosen": -0.256622314453125, "logits/rejected": NaN, "logps/chosen": -152.4968719482422, "logps/rejected": -623.9249877929688, "loss": 0.0671, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.874847412109375, "rewards/margins": 9.343358993530273, "rewards/rejected": -10.216796875, "step": 490 }, { "epoch": 0.12605174424101093, "grad_norm": 12.099916458129883, "learning_rate": 2.096638655462185e-07, "logits/chosen": -0.27784425020217896, "logits/rejected": -0.2543441653251648, "logps/chosen": -136.35781860351562, "logps/rejected": -601.2750244140625, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.627209484577179, "rewards/margins": 9.126562118530273, "rewards/rejected": -9.754687309265137, "step": 500 }, { "epoch": 0.12857277912583115, "grad_norm": 37.19308853149414, "learning_rate": 2.1386554621848737e-07, "logits/chosen": -0.21010132133960724, "logits/rejected": -0.360281378030777, "logps/chosen": -129.21249389648438, "logps/rejected": -612.875, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.6629524230957031, "rewards/margins": 9.358983993530273, "rewards/rejected": -10.020703315734863, "step": 510 }, { "epoch": 0.13109381401065137, "grad_norm": 147.76068115234375, "learning_rate": 2.180672268907563e-07, "logits/chosen": -0.21889495849609375, "logits/rejected": NaN, "logps/chosen": -152.0703125, "logps/rejected": -616.2999877929688, "loss": 0.0517, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9470916986465454, "rewards/margins": 9.478124618530273, "rewards/rejected": -10.426172256469727, "step": 520 }, { "epoch": 0.1336148488954716, "grad_norm": 46.334564208984375, "learning_rate": 2.2226890756302518e-07, "logits/chosen": -0.15239258110523224, "logits/rejected": NaN, "logps/chosen": -127.072265625, "logps/rejected": -599.625, "loss": 0.0501, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.8330215215682983, "rewards/margins": 9.802929878234863, "rewards/rejected": -10.636133193969727, "step": 530 }, { "epoch": 0.13613588378029182, "grad_norm": 261.5362548828125, "learning_rate": 2.264705882352941e-07, "logits/chosen": -0.2505554258823395, "logits/rejected": NaN, "logps/chosen": -149.4343719482422, "logps/rejected": -611.4249877929688, "loss": 0.0638, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.0054733753204346, "rewards/margins": 10.029882431030273, "rewards/rejected": -11.037890434265137, "step": 540 }, { "epoch": 0.13865691866511204, "grad_norm": 142.20545959472656, "learning_rate": 2.3067226890756302e-07, "logits/chosen": -0.21903076767921448, "logits/rejected": NaN, "logps/chosen": -154.61093139648438, "logps/rejected": -621.0, "loss": 0.0619, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.852313220500946, "rewards/margins": 9.61328125, "rewards/rejected": -10.46875, "step": 550 }, { "epoch": 0.14117795354993223, "grad_norm": 167.96498107910156, "learning_rate": 2.3487394957983192e-07, "logits/chosen": -0.17975768446922302, "logits/rejected": NaN, "logps/chosen": -141.39688110351562, "logps/rejected": -604.7000122070312, "loss": 0.0729, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -0.5346344113349915, "rewards/margins": 9.677343368530273, "rewards/rejected": -10.2119140625, "step": 560 }, { "epoch": 0.14369898843475246, "grad_norm": 91.47438049316406, "learning_rate": 2.3907563025210085e-07, "logits/chosen": -0.2486572265625, "logits/rejected": -0.11507568508386612, "logps/chosen": -143.02969360351562, "logps/rejected": -619.125, "loss": 0.0477, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6945648193359375, "rewards/margins": 9.453320503234863, "rewards/rejected": -10.143359184265137, "step": 570 }, { "epoch": 0.14622002331957268, "grad_norm": 34.233123779296875, "learning_rate": 2.4327731092436973e-07, "logits/chosen": -0.11099014431238174, "logits/rejected": -0.06396560370922089, "logps/chosen": -137.02188110351562, "logps/rejected": -633.6500244140625, "loss": 0.0434, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.9053863286972046, "rewards/margins": 10.24853515625, "rewards/rejected": -11.150390625, "step": 580 }, { "epoch": 0.1487410582043929, "grad_norm": 7.282068729400635, "learning_rate": 2.4747899159663866e-07, "logits/chosen": -0.257620245218277, "logits/rejected": NaN, "logps/chosen": -145.9562530517578, "logps/rejected": -615.5, "loss": 0.0374, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.782806396484375, "rewards/margins": 10.684374809265137, "rewards/rejected": -11.466405868530273, "step": 590 }, { "epoch": 0.15126209308921312, "grad_norm": 129.00172424316406, "learning_rate": 2.5168067226890754e-07, "logits/chosen": -0.2649803161621094, "logits/rejected": NaN, "logps/chosen": -143.96875, "logps/rejected": -657.9500122070312, "loss": 0.0447, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.1699615716934204, "rewards/margins": 11.480859756469727, "rewards/rejected": -12.647656440734863, "step": 600 }, { "epoch": 0.15378312797403335, "grad_norm": 26.55073356628418, "learning_rate": 2.558823529411764e-07, "logits/chosen": -0.180766299366951, "logits/rejected": NaN, "logps/chosen": -153.96875, "logps/rejected": -627.0, "loss": 0.0527, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.2839233875274658, "rewards/margins": 10.841796875, "rewards/rejected": -12.129687309265137, "step": 610 }, { "epoch": 0.15630416285885357, "grad_norm": 161.7655029296875, "learning_rate": 2.600840336134454e-07, "logits/chosen": -0.16300582885742188, "logits/rejected": NaN, "logps/chosen": -150.69686889648438, "logps/rejected": -621.0250244140625, "loss": 0.0834, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.1056334972381592, "rewards/margins": 10.753710746765137, "rewards/rejected": -11.861132621765137, "step": 620 }, { "epoch": 0.1588251977436738, "grad_norm": 26.444154739379883, "learning_rate": 2.642857142857143e-07, "logits/chosen": -0.25060731172561646, "logits/rejected": NaN, "logps/chosen": -147.88125610351562, "logps/rejected": -651.9249877929688, "loss": 0.0319, "rewards/accuracies": 0.984375, "rewards/chosen": -0.9451080560684204, "rewards/margins": 11.11328125, "rewards/rejected": -12.059374809265137, "step": 630 }, { "epoch": 0.16134623262849399, "grad_norm": 12.158480644226074, "learning_rate": 2.6848739495798316e-07, "logits/chosen": -0.2347976714372635, "logits/rejected": -0.15274658799171448, "logps/chosen": -138.35311889648438, "logps/rejected": -638.5999755859375, "loss": 0.0376, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -0.9307907223701477, "rewards/margins": 10.91796875, "rewards/rejected": -11.849218368530273, "step": 640 }, { "epoch": 0.1638672675133142, "grad_norm": 20.361804962158203, "learning_rate": 2.7268907563025204e-07, "logits/chosen": -0.21390685439109802, "logits/rejected": NaN, "logps/chosen": -145.8671875, "logps/rejected": -661.0499877929688, "loss": 0.0178, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0421158075332642, "rewards/margins": 11.625391006469727, "rewards/rejected": -12.671483993530273, "step": 650 }, { "epoch": 0.16638830239813443, "grad_norm": 111.68109893798828, "learning_rate": 2.76890756302521e-07, "logits/chosen": -0.21732711791992188, "logits/rejected": NaN, "logps/chosen": -147.84375, "logps/rejected": -664.5, "loss": 0.0392, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.9790900945663452, "rewards/margins": 11.541015625, "rewards/rejected": -12.517187118530273, "step": 660 }, { "epoch": 0.16890933728295465, "grad_norm": 17.39920997619629, "learning_rate": 2.810924369747899e-07, "logits/chosen": -0.12575379014015198, "logits/rejected": -0.14319458603858948, "logps/chosen": -153.4406280517578, "logps/rejected": -628.0250244140625, "loss": 0.0337, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0883651971817017, "rewards/margins": 10.819531440734863, "rewards/rejected": -11.912500381469727, "step": 670 }, { "epoch": 0.17143037216777487, "grad_norm": 57.879798889160156, "learning_rate": 2.852941176470588e-07, "logits/chosen": -0.2189628630876541, "logits/rejected": -0.23386840522289276, "logps/chosen": -136.9921875, "logps/rejected": -662.4749755859375, "loss": 0.0172, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.025854468345642, "rewards/margins": 12.080859184265137, "rewards/rejected": -13.106640815734863, "step": 680 }, { "epoch": 0.1739514070525951, "grad_norm": 9.353719711303711, "learning_rate": 2.8949579831932776e-07, "logits/chosen": -0.20029297471046448, "logits/rejected": NaN, "logps/chosen": -158.8312530517578, "logps/rejected": -660.6749877929688, "loss": 0.0573, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.638146996498108, "rewards/margins": 12.424609184265137, "rewards/rejected": -14.061327934265137, "step": 690 }, { "epoch": 0.17647244193741532, "grad_norm": 27.733154296875, "learning_rate": 2.9369747899159664e-07, "logits/chosen": -0.15936049818992615, "logits/rejected": NaN, "logps/chosen": -148.6515655517578, "logps/rejected": -622.9249877929688, "loss": 0.0528, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3777587413787842, "rewards/margins": 11.417577743530273, "rewards/rejected": -12.801953315734863, "step": 700 }, { "epoch": 0.17899347682223551, "grad_norm": 104.19239807128906, "learning_rate": 2.978991596638655e-07, "logits/chosen": -0.21711425483226776, "logits/rejected": NaN, "logps/chosen": -143.47811889648438, "logps/rejected": -647.0999755859375, "loss": 0.1018, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.0642181634902954, "rewards/margins": 11.556933403015137, "rewards/rejected": -12.6142578125, "step": 710 }, { "epoch": 0.18151451170705574, "grad_norm": 41.1180534362793, "learning_rate": 3.021008403361344e-07, "logits/chosen": -0.3298599123954773, "logits/rejected": NaN, "logps/chosen": -151.18594360351562, "logps/rejected": -643.875, "loss": 0.0468, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -0.972613513469696, "rewards/margins": 11.009374618530273, "rewards/rejected": -11.980077743530273, "step": 720 }, { "epoch": 0.18403554659187596, "grad_norm": 10.826868057250977, "learning_rate": 3.063025210084034e-07, "logits/chosen": -0.2567993104457855, "logits/rejected": NaN, "logps/chosen": -140.6531219482422, "logps/rejected": -650.0250244140625, "loss": 0.0264, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.019963026046753, "rewards/margins": 11.306249618530273, "rewards/rejected": -12.331640243530273, "step": 730 }, { "epoch": 0.18655658147669618, "grad_norm": 29.72112274169922, "learning_rate": 3.1050420168067226e-07, "logits/chosen": -0.24164123833179474, "logits/rejected": -0.24046020209789276, "logps/chosen": -148.8625030517578, "logps/rejected": -676.4500122070312, "loss": 0.0177, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.5790176391601562, "rewards/margins": 12.710156440734863, "rewards/rejected": -14.289843559265137, "step": 740 }, { "epoch": 0.1890776163615164, "grad_norm": 34.85043716430664, "learning_rate": 3.1470588235294114e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -142.83438110351562, "logps/rejected": -662.0750122070312, "loss": 0.0404, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.541131615638733, "rewards/margins": 12.637499809265137, "rewards/rejected": -14.1796875, "step": 750 }, { "epoch": 0.19159865124633663, "grad_norm": 78.05842590332031, "learning_rate": 3.1890756302521007e-07, "logits/chosen": -0.1314697265625, "logits/rejected": NaN, "logps/chosen": -152.41561889648438, "logps/rejected": -652.8499755859375, "loss": 0.0258, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1221115589141846, "rewards/margins": 12.901562690734863, "rewards/rejected": -14.01953125, "step": 760 }, { "epoch": 0.19411968613115685, "grad_norm": 26.553483963012695, "learning_rate": 3.23109243697479e-07, "logits/chosen": -0.09644164890050888, "logits/rejected": NaN, "logps/chosen": -151.79061889648438, "logps/rejected": -641.0, "loss": 0.042, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1772797107696533, "rewards/margins": 13.462109565734863, "rewards/rejected": -14.64453125, "step": 770 }, { "epoch": 0.19664072101597707, "grad_norm": 63.085487365722656, "learning_rate": 3.273109243697479e-07, "logits/chosen": -0.09723053127527237, "logits/rejected": NaN, "logps/chosen": -164.484375, "logps/rejected": -676.7750244140625, "loss": 0.0664, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -1.7198760509490967, "rewards/margins": 13.627734184265137, "rewards/rejected": -15.342577934265137, "step": 780 }, { "epoch": 0.19916175590079727, "grad_norm": 73.2733154296875, "learning_rate": 3.3151260504201676e-07, "logits/chosen": -0.06730346381664276, "logits/rejected": NaN, "logps/chosen": -157.6593780517578, "logps/rejected": -705.25, "loss": 0.0418, "rewards/accuracies": 0.984375, "rewards/chosen": -1.4564666748046875, "rewards/margins": 14.506250381469727, "rewards/rejected": -15.966405868530273, "step": 790 }, { "epoch": 0.2016827907856175, "grad_norm": 156.40457153320312, "learning_rate": 3.357142857142857e-07, "logits/chosen": -0.1163356751203537, "logits/rejected": NaN, "logps/chosen": -136.40469360351562, "logps/rejected": -645.9500122070312, "loss": 0.0646, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -0.9577881097793579, "rewards/margins": 13.38671875, "rewards/rejected": -14.3515625, "step": 800 }, { "epoch": 0.2042038256704377, "grad_norm": 1.094292402267456, "learning_rate": 3.399159663865546e-07, "logits/chosen": -0.19172057509422302, "logits/rejected": NaN, "logps/chosen": -140.5500030517578, "logps/rejected": -636.9249877929688, "loss": 0.0212, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.0465118885040283, "rewards/margins": 12.727343559265137, "rewards/rejected": -13.775781631469727, "step": 810 }, { "epoch": 0.20672486055525793, "grad_norm": 45.88282775878906, "learning_rate": 3.441176470588235e-07, "logits/chosen": 0.01890411414206028, "logits/rejected": NaN, "logps/chosen": -159.8546905517578, "logps/rejected": -662.5250244140625, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.0829299688339233, "rewards/margins": 12.585156440734863, "rewards/rejected": -13.670702934265137, "step": 820 }, { "epoch": 0.20924589544007816, "grad_norm": 7.57520866394043, "learning_rate": 3.483193277310924e-07, "logits/chosen": -0.0719757080078125, "logits/rejected": NaN, "logps/chosen": -158.0695343017578, "logps/rejected": -668.5499877929688, "loss": 0.0223, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.6362335681915283, "rewards/margins": 13.396093368530273, "rewards/rejected": -15.033594131469727, "step": 830 }, { "epoch": 0.21176693032489838, "grad_norm": 39.72539520263672, "learning_rate": 3.5252100840336136e-07, "logits/chosen": -0.03143615648150444, "logits/rejected": NaN, "logps/chosen": -156.8125, "logps/rejected": -665.4500122070312, "loss": 0.0275, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.699467420578003, "rewards/margins": 13.196680068969727, "rewards/rejected": -14.8984375, "step": 840 }, { "epoch": 0.2142879652097186, "grad_norm": 98.35755920410156, "learning_rate": 3.5672268907563024e-07, "logits/chosen": -0.2149612456560135, "logits/rejected": -0.11498565971851349, "logps/chosen": -139.17813110351562, "logps/rejected": -645.7750244140625, "loss": 0.053, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.134879469871521, "rewards/margins": 13.194140434265137, "rewards/rejected": -14.328906059265137, "step": 850 }, { "epoch": 0.2168090000945388, "grad_norm": 17.604656219482422, "learning_rate": 3.609243697478991e-07, "logits/chosen": -0.07838897407054901, "logits/rejected": NaN, "logps/chosen": -137.8484344482422, "logps/rejected": -654.7999877929688, "loss": 0.0111, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.2402340173721313, "rewards/margins": 13.36328125, "rewards/rejected": -14.602343559265137, "step": 860 }, { "epoch": 0.21933003497935902, "grad_norm": 3.9229958057403564, "learning_rate": 3.6512605042016805e-07, "logits/chosen": -0.03133545070886612, "logits/rejected": 0.13977356255054474, "logps/chosen": -133.5046844482422, "logps/rejected": -655.4000244140625, "loss": 0.049, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.663018822669983, "rewards/margins": 13.384374618530273, "rewards/rejected": -15.049219131469727, "step": 870 }, { "epoch": 0.22185106986417924, "grad_norm": 55.13627243041992, "learning_rate": 3.69327731092437e-07, "logits/chosen": -0.204803466796875, "logits/rejected": NaN, "logps/chosen": -139.0734405517578, "logps/rejected": -699.9749755859375, "loss": 0.0204, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.403350830078125, "rewards/margins": 14.721094131469727, "rewards/rejected": -16.130468368530273, "step": 880 }, { "epoch": 0.22437210474899946, "grad_norm": 5.272310733795166, "learning_rate": 3.7352941176470586e-07, "logits/chosen": -0.12968139350414276, "logits/rejected": NaN, "logps/chosen": -159.52499389648438, "logps/rejected": -645.0499877929688, "loss": 0.0471, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.655371069908142, "rewards/margins": 13.823437690734863, "rewards/rejected": -15.482812881469727, "step": 890 }, { "epoch": 0.22689313963381968, "grad_norm": 9.145122528076172, "learning_rate": 3.7773109243697474e-07, "logits/chosen": -0.21729126572608948, "logits/rejected": NaN, "logps/chosen": -152.1984405517578, "logps/rejected": -662.3250122070312, "loss": 0.0321, "rewards/accuracies": 0.984375, "rewards/chosen": -1.6608703136444092, "rewards/margins": 14.476953506469727, "rewards/rejected": -16.135936737060547, "step": 900 }, { "epoch": 0.2294141745186399, "grad_norm": 26.28160858154297, "learning_rate": 3.8193277310924367e-07, "logits/chosen": -0.102020263671875, "logits/rejected": NaN, "logps/chosen": -192.0265655517578, "logps/rejected": -654.7999877929688, "loss": 0.0537, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0395140647888184, "rewards/margins": 14.080859184265137, "rewards/rejected": -16.122655868530273, "step": 910 }, { "epoch": 0.23193520940346013, "grad_norm": 107.3590316772461, "learning_rate": 3.861344537815126e-07, "logits/chosen": -0.14012756943702698, "logits/rejected": NaN, "logps/chosen": -149.4812469482422, "logps/rejected": -669.7750244140625, "loss": 0.0456, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.3984194993972778, "rewards/margins": 14.499218940734863, "rewards/rejected": -15.889062881469727, "step": 920 }, { "epoch": 0.23445624428828035, "grad_norm": 23.720867156982422, "learning_rate": 3.903361344537815e-07, "logits/chosen": -0.2259521484375, "logits/rejected": NaN, "logps/chosen": -162.6453094482422, "logps/rejected": -654.4000244140625, "loss": 0.0304, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3534362316131592, "rewards/margins": 13.361328125, "rewards/rejected": -14.722265243530273, "step": 930 }, { "epoch": 0.23697727917310055, "grad_norm": 3.2642321586608887, "learning_rate": 3.945378151260504e-07, "logits/chosen": -0.19977417588233948, "logits/rejected": NaN, "logps/chosen": -141.359375, "logps/rejected": -671.75, "loss": 0.0533, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.5034576654434204, "rewards/margins": 14.128515243530273, "rewards/rejected": -15.631640434265137, "step": 940 }, { "epoch": 0.23949831405792077, "grad_norm": 61.138343811035156, "learning_rate": 3.9873949579831934e-07, "logits/chosen": -0.00986700039356947, "logits/rejected": NaN, "logps/chosen": -148.18905639648438, "logps/rejected": -665.2999877929688, "loss": 0.0644, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.6996314525604248, "rewards/margins": 13.725390434265137, "rewards/rejected": -15.426562309265137, "step": 950 }, { "epoch": 0.242019348942741, "grad_norm": 14.840069770812988, "learning_rate": 4.029411764705882e-07, "logits/chosen": -0.20095977187156677, "logits/rejected": NaN, "logps/chosen": -140.5812530517578, "logps/rejected": -654.625, "loss": 0.0346, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.284490942955017, "rewards/margins": 13.808984756469727, "rewards/rejected": -15.099218368530273, "step": 960 }, { "epoch": 0.24454038382756121, "grad_norm": 75.08119201660156, "learning_rate": 4.071428571428571e-07, "logits/chosen": -0.24829712510108948, "logits/rejected": NaN, "logps/chosen": -152.00936889648438, "logps/rejected": -682.5250244140625, "loss": 0.0354, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5782196521759033, "rewards/margins": 14.588671684265137, "rewards/rejected": -16.172657012939453, "step": 970 }, { "epoch": 0.24706141871238144, "grad_norm": 45.17262649536133, "learning_rate": 4.1134453781512603e-07, "logits/chosen": -0.2459152191877365, "logits/rejected": NaN, "logps/chosen": -156.59219360351562, "logps/rejected": -677.2249755859375, "loss": 0.0415, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.675347924232483, "rewards/margins": 14.583984375, "rewards/rejected": -16.26171875, "step": 980 }, { "epoch": 0.24958245359720166, "grad_norm": 89.21812438964844, "learning_rate": 4.1554621848739496e-07, "logits/chosen": -0.180805966258049, "logits/rejected": -0.20294189453125, "logps/chosen": -143.0359344482422, "logps/rejected": -682.7000122070312, "loss": 0.032, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.359087347984314, "rewards/margins": 15.517187118530273, "rewards/rejected": -16.874217987060547, "step": 990 }, { "epoch": 0.25210348848202185, "grad_norm": 128.68922424316406, "learning_rate": 4.1974789915966384e-07, "logits/chosen": -0.22924499213695526, "logits/rejected": -0.3039306700229645, "logps/chosen": -145.2921905517578, "logps/rejected": -722.2249755859375, "loss": 0.0351, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.4789642095565796, "rewards/margins": 16.240625381469727, "rewards/rejected": -17.715234756469727, "step": 1000 }, { "epoch": 0.2546245233668421, "grad_norm": 70.48312377929688, "learning_rate": 4.239495798319327e-07, "logits/chosen": -0.17773742973804474, "logits/rejected": NaN, "logps/chosen": -173.58749389648438, "logps/rejected": -703.2999877929688, "loss": 0.0339, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0451292991638184, "rewards/margins": 15.705273628234863, "rewards/rejected": -17.748437881469727, "step": 1010 }, { "epoch": 0.2571455582516623, "grad_norm": 0.6016368865966797, "learning_rate": 4.2815126050420165e-07, "logits/chosen": -0.2291107177734375, "logits/rejected": NaN, "logps/chosen": -150.81405639648438, "logps/rejected": -678.5499877929688, "loss": 0.0434, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.407934546470642, "rewards/margins": 16.046875, "rewards/rejected": -17.450780868530273, "step": 1020 }, { "epoch": 0.25966659313648255, "grad_norm": 65.30123138427734, "learning_rate": 4.323529411764706e-07, "logits/chosen": -0.22453002631664276, "logits/rejected": NaN, "logps/chosen": -154.4562530517578, "logps/rejected": -675.2750244140625, "loss": 0.043, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -0.9184325933456421, "rewards/margins": 14.559765815734863, "rewards/rejected": -15.479687690734863, "step": 1030 }, { "epoch": 0.26218762802130274, "grad_norm": 7.09271764755249, "learning_rate": 4.3655462184873946e-07, "logits/chosen": NaN, "logits/rejected": 0.01162109337747097, "logps/chosen": -154.27188110351562, "logps/rejected": -684.0250244140625, "loss": 0.055, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.9839751720428467, "rewards/margins": 15.699609756469727, "rewards/rejected": -17.6875, "step": 1040 }, { "epoch": 0.26470866290612294, "grad_norm": 17.634531021118164, "learning_rate": 4.407563025210084e-07, "logits/chosen": -0.217387393116951, "logits/rejected": NaN, "logps/chosen": -121.3765640258789, "logps/rejected": -659.3125, "loss": 0.0108, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.7791488766670227, "rewards/margins": 15.673047065734863, "rewards/rejected": -16.450389862060547, "step": 1050 }, { "epoch": 0.2672296977909432, "grad_norm": 6.283233642578125, "learning_rate": 4.4495798319327727e-07, "logits/chosen": -0.132823184132576, "logits/rejected": NaN, "logps/chosen": -127.9671859741211, "logps/rejected": -673.0999755859375, "loss": 0.0434, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.0257995128631592, "rewards/margins": 15.92578125, "rewards/rejected": -16.961328506469727, "step": 1060 }, { "epoch": 0.2697507326757634, "grad_norm": 0.28115129470825195, "learning_rate": 4.491596638655462e-07, "logits/chosen": -0.27166444063186646, "logits/rejected": NaN, "logps/chosen": -147.0, "logps/rejected": -722.7000122070312, "loss": 0.0125, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4796035289764404, "rewards/margins": 17.301563262939453, "rewards/rejected": -18.783594131469727, "step": 1070 }, { "epoch": 0.27227176756058363, "grad_norm": 7.69546365737915, "learning_rate": 4.533613445378151e-07, "logits/chosen": -0.13826294243335724, "logits/rejected": 0.021759033203125, "logps/chosen": -158.5031280517578, "logps/rejected": -709.9000244140625, "loss": 0.0747, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.104818820953369, "rewards/margins": 17.4765625, "rewards/rejected": -19.583593368530273, "step": 1080 }, { "epoch": 0.2747928024454038, "grad_norm": 0.08461493253707886, "learning_rate": 4.57563025210084e-07, "logits/chosen": -0.17571716010570526, "logits/rejected": NaN, "logps/chosen": -148.58438110351562, "logps/rejected": -681.875, "loss": 0.0746, "rewards/accuracies": 0.96875, "rewards/chosen": -1.9655883312225342, "rewards/margins": 16.105077743530273, "rewards/rejected": -18.069921493530273, "step": 1090 }, { "epoch": 0.2773138373302241, "grad_norm": 0.3307419717311859, "learning_rate": 4.6176470588235295e-07, "logits/chosen": -0.16093139350414276, "logits/rejected": NaN, "logps/chosen": -152.8156280517578, "logps/rejected": -705.5499877929688, "loss": 0.0488, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.5885406732559204, "rewards/margins": 16.139062881469727, "rewards/rejected": -17.732030868530273, "step": 1100 }, { "epoch": 0.2798348722150443, "grad_norm": 58.64488220214844, "learning_rate": 4.659663865546218e-07, "logits/chosen": -0.15778693556785583, "logits/rejected": -0.024840544909238815, "logps/chosen": -155.35000610351562, "logps/rejected": -648.1500244140625, "loss": 0.0919, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.47723388671875, "rewards/margins": 15.092577934265137, "rewards/rejected": -16.571874618530273, "step": 1110 }, { "epoch": 0.28235590709986447, "grad_norm": 1.0814224481582642, "learning_rate": 4.7016806722689076e-07, "logits/chosen": -0.20961609482765198, "logits/rejected": -0.11320953071117401, "logps/chosen": -144.4328155517578, "logps/rejected": -686.4000244140625, "loss": 0.0163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4766509532928467, "rewards/margins": 16.342578887939453, "rewards/rejected": -17.811717987060547, "step": 1120 }, { "epoch": 0.2848769419846847, "grad_norm": 102.6927719116211, "learning_rate": 4.7436974789915963e-07, "logits/chosen": -0.352386474609375, "logits/rejected": NaN, "logps/chosen": -161.2937469482422, "logps/rejected": -722.1500244140625, "loss": 0.0405, "rewards/accuracies": 0.984375, "rewards/chosen": -1.7180572748184204, "rewards/margins": 16.192968368530273, "rewards/rejected": -17.912500381469727, "step": 1130 }, { "epoch": 0.2873979768695049, "grad_norm": 3.4906530380249023, "learning_rate": 4.785714285714286e-07, "logits/chosen": -0.40484619140625, "logits/rejected": NaN, "logps/chosen": -144.22811889648438, "logps/rejected": -669.125, "loss": 0.036, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1700347661972046, "rewards/margins": 15.039453506469727, "rewards/rejected": -16.216405868530273, "step": 1140 }, { "epoch": 0.28991901175432516, "grad_norm": 68.7790298461914, "learning_rate": 4.827731092436974e-07, "logits/chosen": -0.24266357719898224, "logits/rejected": -0.15537413954734802, "logps/chosen": -151.1999969482422, "logps/rejected": -649.625, "loss": 0.0306, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.4614531993865967, "rewards/margins": 13.966796875, "rewards/rejected": -15.4296875, "step": 1150 }, { "epoch": 0.29244004663914536, "grad_norm": 116.71124267578125, "learning_rate": 4.869747899159664e-07, "logits/chosen": -0.3101333677768707, "logits/rejected": NaN, "logps/chosen": -192.0671844482422, "logps/rejected": -689.4500122070312, "loss": 0.0721, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.8016784191131592, "rewards/margins": 15.58984375, "rewards/rejected": -17.397655487060547, "step": 1160 }, { "epoch": 0.2949610815239656, "grad_norm": 202.02764892578125, "learning_rate": 4.911764705882352e-07, "logits/chosen": -0.29569703340530396, "logits/rejected": NaN, "logps/chosen": -178.30624389648438, "logps/rejected": -674.375, "loss": 0.0631, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.033721923828125, "rewards/margins": 14.323827743530273, "rewards/rejected": -16.357812881469727, "step": 1170 }, { "epoch": 0.2974821164087858, "grad_norm": 173.7059783935547, "learning_rate": 4.953781512605042e-07, "logits/chosen": -0.2562408447265625, "logits/rejected": NaN, "logps/chosen": -140.72225952148438, "logps/rejected": -679.0499877929688, "loss": 0.0493, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.1406456232070923, "rewards/margins": 15.240234375, "rewards/rejected": -16.37890625, "step": 1180 }, { "epoch": 0.30000315129360605, "grad_norm": 9.604238510131836, "learning_rate": 4.995798319327731e-07, "logits/chosen": -0.2718765139579773, "logits/rejected": NaN, "logps/chosen": -158.1750030517578, "logps/rejected": -702.125, "loss": 0.0682, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.565240502357483, "rewards/margins": 14.87109375, "rewards/rejected": -16.424610137939453, "step": 1190 }, { "epoch": 0.30252418617842625, "grad_norm": 10.292096138000488, "learning_rate": 4.999991284791131e-07, "logits/chosen": -0.2040252685546875, "logits/rejected": -0.04181518405675888, "logps/chosen": -144.1875, "logps/rejected": -698.9000244140625, "loss": 0.0262, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.4058220386505127, "rewards/margins": 14.449609756469727, "rewards/rejected": -15.856249809265137, "step": 1200 }, { "epoch": 0.30504522106324644, "grad_norm": 8.388823509216309, "learning_rate": 4.999961158221198e-07, "logits/chosen": -0.315652459859848, "logits/rejected": NaN, "logps/chosen": -154.0500030517578, "logps/rejected": -668.0750122070312, "loss": 0.0386, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.084283471107483, "rewards/margins": 13.185937881469727, "rewards/rejected": -14.261327743530273, "step": 1210 }, { "epoch": 0.3075662559480667, "grad_norm": 116.46009826660156, "learning_rate": 4.999909512954288e-07, "logits/chosen": -0.13143157958984375, "logits/rejected": NaN, "logps/chosen": -143.203125, "logps/rejected": -665.625, "loss": 0.0389, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.7484772205352783, "rewards/margins": 15.1953125, "rewards/rejected": -16.946874618530273, "step": 1220 }, { "epoch": 0.3100872908328869, "grad_norm": 0.3287848234176636, "learning_rate": 4.999836349434941e-07, "logits/chosen": -0.170379638671875, "logits/rejected": NaN, "logps/chosen": -154.57656860351562, "logps/rejected": -724.6500244140625, "loss": 0.0604, "rewards/accuracies": 0.96875, "rewards/chosen": -1.897332787513733, "rewards/margins": 17.006641387939453, "rewards/rejected": -18.911718368530273, "step": 1230 }, { "epoch": 0.31260832571770714, "grad_norm": 39.44196701049805, "learning_rate": 4.999741668292923e-07, "logits/chosen": -0.08110733330249786, "logits/rejected": NaN, "logps/chosen": -128.61563110351562, "logps/rejected": -690.75, "loss": 0.0146, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.5122848749160767, "rewards/margins": 16.572656631469727, "rewards/rejected": -18.080469131469727, "step": 1240 }, { "epoch": 0.31512936060252733, "grad_norm": 258.909423828125, "learning_rate": 4.999625470343211e-07, "logits/chosen": -0.09961853176355362, "logits/rejected": NaN, "logps/chosen": -158.9406280517578, "logps/rejected": -676.5750122070312, "loss": 0.0841, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.2067718505859375, "rewards/margins": 16.459375381469727, "rewards/rejected": -18.662500381469727, "step": 1250 }, { "epoch": 0.3176503954873476, "grad_norm": 3.5725820064544678, "learning_rate": 4.999487756585992e-07, "logits/chosen": -0.23354491591453552, "logits/rejected": NaN, "logps/chosen": -148.94686889648438, "logps/rejected": -673.5250244140625, "loss": 0.0597, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.583215355873108, "rewards/margins": 15.163671493530273, "rewards/rejected": -16.7578125, "step": 1260 }, { "epoch": 0.3201714303721678, "grad_norm": 209.47427368164062, "learning_rate": 4.999328528206654e-07, "logits/chosen": -0.16692200303077698, "logits/rejected": -0.10303497314453125, "logps/chosen": -157.4968719482422, "logps/rejected": -676.7000122070312, "loss": 0.0653, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.6222412586212158, "rewards/margins": 14.34765625, "rewards/rejected": -15.973047256469727, "step": 1270 }, { "epoch": 0.32269246525698797, "grad_norm": 18.309415817260742, "learning_rate": 4.99914778657577e-07, "logits/chosen": -0.20359496772289276, "logits/rejected": -0.10723724216222763, "logps/chosen": -139.3015594482422, "logps/rejected": -667.8250122070312, "loss": 0.0301, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.3667999505996704, "rewards/margins": 15.403905868530273, "rewards/rejected": -16.76953125, "step": 1280 }, { "epoch": 0.3252135001418082, "grad_norm": 9.32630443572998, "learning_rate": 4.998945533249097e-07, "logits/chosen": -0.11687926948070526, "logits/rejected": 0.048187255859375, "logps/chosen": -140.52499389648438, "logps/rejected": -678.75, "loss": 0.0621, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.7837555408477783, "rewards/margins": 14.629687309265137, "rewards/rejected": -16.41015625, "step": 1290 }, { "epoch": 0.3277345350266284, "grad_norm": 79.15485382080078, "learning_rate": 4.998721769967553e-07, "logits/chosen": -0.14106139540672302, "logits/rejected": -0.18908843398094177, "logps/chosen": -153.4578094482422, "logps/rejected": -668.2999877929688, "loss": 0.0455, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.7294890880584717, "rewards/margins": 14.367968559265137, "rewards/rejected": -16.095703125, "step": 1300 }, { "epoch": 0.33025556991144867, "grad_norm": 98.85838317871094, "learning_rate": 4.998476498657204e-07, "logits/chosen": -0.1795295774936676, "logits/rejected": -0.15382690727710724, "logps/chosen": -159.9812469482422, "logps/rejected": -701.0999755859375, "loss": 0.0453, "rewards/accuracies": 0.984375, "rewards/chosen": -1.888696312904358, "rewards/margins": 14.964062690734863, "rewards/rejected": -16.857030868530273, "step": 1310 }, { "epoch": 0.33277660479626886, "grad_norm": 249.66189575195312, "learning_rate": 4.998209721429251e-07, "logits/chosen": -0.10121841728687286, "logits/rejected": NaN, "logps/chosen": -152.2578125, "logps/rejected": -677.5499877929688, "loss": 0.0655, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7858489751815796, "rewards/margins": 16.123437881469727, "rewards/rejected": -17.91796875, "step": 1320 }, { "epoch": 0.3352976396810891, "grad_norm": 0.31916019320487976, "learning_rate": 4.99792144058001e-07, "logits/chosen": -0.20395508408546448, "logits/rejected": -0.01833343505859375, "logps/chosen": -145.2003936767578, "logps/rejected": -706.7249755859375, "loss": 0.1009, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.7696654796600342, "rewards/margins": 17.262889862060547, "rewards/rejected": -19.040624618530273, "step": 1330 }, { "epoch": 0.3378186745659093, "grad_norm": 205.91934204101562, "learning_rate": 4.997611658590889e-07, "logits/chosen": -0.22812728583812714, "logits/rejected": NaN, "logps/chosen": -173.0124969482422, "logps/rejected": -711.7999877929688, "loss": 0.0591, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.3512940406799316, "rewards/margins": 16.846094131469727, "rewards/rejected": -19.194530487060547, "step": 1340 }, { "epoch": 0.3403397094507295, "grad_norm": 25.222749710083008, "learning_rate": 4.997280378128374e-07, "logits/chosen": -0.19909057021141052, "logits/rejected": NaN, "logps/chosen": -141.3468780517578, "logps/rejected": -692.5, "loss": 0.0401, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7017700672149658, "rewards/margins": 16.875, "rewards/rejected": -18.569141387939453, "step": 1350 }, { "epoch": 0.34286074433554975, "grad_norm": 17.87675666809082, "learning_rate": 4.996927602043996e-07, "logits/chosen": -0.109161376953125, "logits/rejected": NaN, "logps/chosen": -152.21875, "logps/rejected": -700.2000122070312, "loss": 0.0264, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1607666015625, "rewards/margins": 16.373437881469727, "rewards/rejected": -18.532032012939453, "step": 1360 }, { "epoch": 0.34538177922036994, "grad_norm": 0.07087402790784836, "learning_rate": 4.996553333374317e-07, "logits/chosen": -0.1655426025390625, "logits/rejected": NaN, "logps/chosen": -156.6359405517578, "logps/rejected": -698.4500122070312, "loss": 0.031, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8177391290664673, "rewards/margins": 17.153514862060547, "rewards/rejected": -18.985157012939453, "step": 1370 }, { "epoch": 0.3479028141051902, "grad_norm": 5.534618377685547, "learning_rate": 4.996157575340895e-07, "logits/chosen": -0.1394607573747635, "logits/rejected": -0.02096252515912056, "logps/chosen": -149.35311889648438, "logps/rejected": -713.3250122070312, "loss": 0.0376, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.688543677330017, "rewards/margins": 16.490234375, "rewards/rejected": -18.170312881469727, "step": 1380 }, { "epoch": 0.3504238489900104, "grad_norm": 110.22240447998047, "learning_rate": 4.995740331350264e-07, "logits/chosen": -0.2533416748046875, "logits/rejected": NaN, "logps/chosen": -158.4765625, "logps/rejected": -667.7999877929688, "loss": 0.2509, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2084479331970215, "rewards/margins": 15.970312118530273, "rewards/rejected": -18.178905487060547, "step": 1390 }, { "epoch": 0.35294488387483064, "grad_norm": 1.5085846185684204, "learning_rate": 4.995301604993895e-07, "logits/chosen": -0.1863609254360199, "logits/rejected": NaN, "logps/chosen": -142.1593780517578, "logps/rejected": -709.0750122070312, "loss": 0.0254, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.8307082653045654, "rewards/margins": 16.404296875, "rewards/rejected": -18.241796493530273, "step": 1400 }, { "epoch": 0.35546591875965083, "grad_norm": 3.4462571144104004, "learning_rate": 4.994841400048178e-07, "logits/chosen": -0.22318725287914276, "logits/rejected": NaN, "logps/chosen": -159.2062530517578, "logps/rejected": -682.9000244140625, "loss": 0.0921, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -1.6819045543670654, "rewards/margins": 15.373437881469727, "rewards/rejected": -17.058202743530273, "step": 1410 }, { "epoch": 0.35798695364447103, "grad_norm": 21.146312713623047, "learning_rate": 4.994359720474378e-07, "logits/chosen": -0.03800506517291069, "logits/rejected": NaN, "logps/chosen": -151.71249389648438, "logps/rejected": -710.0499877929688, "loss": 0.1153, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.800921678543091, "rewards/margins": 18.053905487060547, "rewards/rejected": -20.848438262939453, "step": 1420 }, { "epoch": 0.3605079885292913, "grad_norm": 3.735551357269287, "learning_rate": 4.993856570418606e-07, "logits/chosen": 0.08062133938074112, "logits/rejected": NaN, "logps/chosen": -196.0906219482422, "logps/rejected": -724.5750122070312, "loss": 0.0431, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.146960496902466, "rewards/margins": 17.836328506469727, "rewards/rejected": -20.982030868530273, "step": 1430 }, { "epoch": 0.3630290234141115, "grad_norm": 12.435103416442871, "learning_rate": 4.993331954211781e-07, "logits/chosen": 0.03031310997903347, "logits/rejected": 0.06370238959789276, "logps/chosen": -163.3828125, "logps/rejected": -699.7249755859375, "loss": 0.0517, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.664318799972534, "rewards/margins": 17.264062881469727, "rewards/rejected": -19.924999237060547, "step": 1440 }, { "epoch": 0.3655500582989317, "grad_norm": 46.92791748046875, "learning_rate": 4.992785876369598e-07, "logits/chosen": -0.07246093451976776, "logits/rejected": 0.04775543138384819, "logps/chosen": -162.91561889648438, "logps/rejected": -725.75, "loss": 0.1249, "rewards/accuracies": 0.96875, "rewards/chosen": -2.113879442214966, "rewards/margins": 17.975391387939453, "rewards/rejected": -20.095312118530273, "step": 1450 }, { "epoch": 0.3680710931837519, "grad_norm": 1.3051823377609253, "learning_rate": 4.992218341592484e-07, "logits/chosen": -0.19097900390625, "logits/rejected": 0.01913147047162056, "logps/chosen": -169.84375, "logps/rejected": -734.0, "loss": 0.0648, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9871277809143066, "rewards/margins": 18.7421875, "rewards/rejected": -21.743749618530273, "step": 1460 }, { "epoch": 0.37059212806857217, "grad_norm": 18.901264190673828, "learning_rate": 4.991629354765556e-07, "logits/chosen": -0.03374633938074112, "logits/rejected": NaN, "logps/chosen": -177.69375610351562, "logps/rejected": -724.2249755859375, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.9853179454803467, "rewards/margins": 17.438282012939453, "rewards/rejected": -19.423437118530273, "step": 1470 }, { "epoch": 0.37311316295339236, "grad_norm": 6.783510208129883, "learning_rate": 4.991018920958586e-07, "logits/chosen": 0.008166504092514515, "logits/rejected": NaN, "logps/chosen": -157.6953125, "logps/rejected": -713.8499755859375, "loss": 0.0468, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.256103515625, "rewards/margins": 18.553125381469727, "rewards/rejected": -20.8046875, "step": 1480 }, { "epoch": 0.3756341978382126, "grad_norm": 85.04174041748047, "learning_rate": 4.990387045425952e-07, "logits/chosen": -0.0019210815662518144, "logits/rejected": NaN, "logps/chosen": -168.4875030517578, "logps/rejected": -747.75, "loss": 0.0244, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8998260498046875, "rewards/margins": 18.576171875, "rewards/rejected": -21.483983993530273, "step": 1490 }, { "epoch": 0.3781552327230328, "grad_norm": 162.41421508789062, "learning_rate": 4.989733733606593e-07, "logits/chosen": 0.0838470458984375, "logits/rejected": NaN, "logps/chosen": -178.1062469482422, "logps/rejected": -728.125, "loss": 0.0806, "rewards/accuracies": 0.96875, "rewards/chosen": -3.102832078933716, "rewards/margins": 17.641796112060547, "rewards/rejected": -20.739063262939453, "step": 1500 }, { "epoch": 0.380676267607853, "grad_norm": 1.560049057006836, "learning_rate": 4.989058991123964e-07, "logits/chosen": -0.02738647535443306, "logits/rejected": NaN, "logps/chosen": -159.3000030517578, "logps/rejected": -729.7249755859375, "loss": 0.0103, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.1772704124450684, "rewards/margins": 19.381250381469727, "rewards/rejected": -21.560937881469727, "step": 1510 }, { "epoch": 0.38319730249267325, "grad_norm": 0.1662023961544037, "learning_rate": 4.988362823785987e-07, "logits/chosen": -0.07960204780101776, "logits/rejected": NaN, "logps/chosen": -167.99063110351562, "logps/rejected": -727.3499755859375, "loss": 0.0835, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.382397413253784, "rewards/margins": 18.989063262939453, "rewards/rejected": -21.380468368530273, "step": 1520 }, { "epoch": 0.38571833737749345, "grad_norm": 0.9224507808685303, "learning_rate": 4.987645237585002e-07, "logits/chosen": -0.17086181044578552, "logits/rejected": NaN, "logps/chosen": -171.43515014648438, "logps/rejected": -715.0250244140625, "loss": 0.0644, "rewards/accuracies": 0.96875, "rewards/chosen": -2.2828307151794434, "rewards/margins": 18.049999237060547, "rewards/rejected": -20.331249237060547, "step": 1530 }, { "epoch": 0.3882393722623137, "grad_norm": 27.963619232177734, "learning_rate": 4.98690623869771e-07, "logits/chosen": -0.10349120944738388, "logits/rejected": 0.04844360426068306, "logps/chosen": -142.97811889648438, "logps/rejected": -731.4500122070312, "loss": 0.0073, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.065417528152466, "rewards/margins": 18.639842987060547, "rewards/rejected": -20.706249237060547, "step": 1540 }, { "epoch": 0.3907604071471339, "grad_norm": 10.456473350524902, "learning_rate": 4.98614583348513e-07, "logits/chosen": 0.04182128980755806, "logits/rejected": NaN, "logps/chosen": -157.61563110351562, "logps/rejected": -724.0250244140625, "loss": 0.0602, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.473715305328369, "rewards/margins": 18.240039825439453, "rewards/rejected": -20.702342987060547, "step": 1550 }, { "epoch": 0.39328144203195414, "grad_norm": 12.37748908996582, "learning_rate": 4.985364028492533e-07, "logits/chosen": -0.0668792724609375, "logits/rejected": NaN, "logps/chosen": -147.83633422851562, "logps/rejected": -695.3250122070312, "loss": 0.0697, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.9364013671875, "rewards/margins": 17.268749237060547, "rewards/rejected": -19.205469131469727, "step": 1560 }, { "epoch": 0.39580247691677434, "grad_norm": 7.272609233856201, "learning_rate": 4.984560830449397e-07, "logits/chosen": -0.26489943265914917, "logits/rejected": -0.04807128757238388, "logps/chosen": -159.22500610351562, "logps/rejected": -718.8499755859375, "loss": 0.0304, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.467157006263733, "rewards/margins": 17.595312118530273, "rewards/rejected": -19.064844131469727, "step": 1570 }, { "epoch": 0.39832351180159453, "grad_norm": 1.9215657711029053, "learning_rate": 4.983736246269334e-07, "logits/chosen": -0.13592681288719177, "logits/rejected": -0.09389343112707138, "logps/chosen": -158.9304656982422, "logps/rejected": -719.0, "loss": 0.0762, "rewards/accuracies": 0.984375, "rewards/chosen": -2.0018622875213623, "rewards/margins": 16.19921875, "rewards/rejected": -18.197656631469727, "step": 1580 }, { "epoch": 0.4008445466864148, "grad_norm": 117.11297607421875, "learning_rate": 4.982890283050051e-07, "logits/chosen": -0.16480103135108948, "logits/rejected": -0.10840453952550888, "logps/chosen": -163.15625, "logps/rejected": -699.875, "loss": 0.0313, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.8355224132537842, "rewards/margins": 15.626172065734863, "rewards/rejected": -17.462499618530273, "step": 1590 }, { "epoch": 0.403365581571235, "grad_norm": 36.481754302978516, "learning_rate": 4.982022948073268e-07, "logits/chosen": -0.12348632514476776, "logits/rejected": NaN, "logps/chosen": -165.3874969482422, "logps/rejected": -673.375, "loss": 0.0495, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.694689989089966, "rewards/margins": 16.36328125, "rewards/rejected": -19.05078125, "step": 1600 }, { "epoch": 0.4058866164560552, "grad_norm": 193.294921875, "learning_rate": 4.98113424880467e-07, "logits/chosen": -0.193766787648201, "logits/rejected": NaN, "logps/chosen": -156.140625, "logps/rejected": -713.625, "loss": 0.1126, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.008044481277466, "rewards/margins": 16.758007049560547, "rewards/rejected": -18.760936737060547, "step": 1610 }, { "epoch": 0.4084076513408754, "grad_norm": 11.4768705368042, "learning_rate": 4.980224192893833e-07, "logits/chosen": -0.18689727783203125, "logits/rejected": NaN, "logps/chosen": -159.5359344482422, "logps/rejected": -717.4749755859375, "loss": 0.0584, "rewards/accuracies": 0.984375, "rewards/chosen": -1.7647918462753296, "rewards/margins": 16.71875, "rewards/rejected": -18.473438262939453, "step": 1620 }, { "epoch": 0.41092868622569567, "grad_norm": 0.6125283241271973, "learning_rate": 4.97929278817417e-07, "logits/chosen": -0.15419921278953552, "logits/rejected": NaN, "logps/chosen": -138.82186889648438, "logps/rejected": -715.7999877929688, "loss": 0.0552, "rewards/accuracies": 0.96875, "rewards/chosen": -1.9030612707138062, "rewards/margins": 17.188282012939453, "rewards/rejected": -19.091405868530273, "step": 1630 }, { "epoch": 0.41344972111051587, "grad_norm": 38.0545768737793, "learning_rate": 4.97834004266285e-07, "logits/chosen": -0.15358886122703552, "logits/rejected": -0.04590759426355362, "logps/chosen": -175.171875, "logps/rejected": -746.875, "loss": 0.0454, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.151629686355591, "rewards/margins": 19.389842987060547, "rewards/rejected": -22.540624618530273, "step": 1640 }, { "epoch": 0.41597075599533606, "grad_norm": 101.85356140136719, "learning_rate": 4.977365964560737e-07, "logits/chosen": -0.08653564751148224, "logits/rejected": NaN, "logps/chosen": -164.2218780517578, "logps/rejected": -762.0499877929688, "loss": 0.0418, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.777600049972534, "rewards/margins": 20.989843368530273, "rewards/rejected": -23.76171875, "step": 1650 }, { "epoch": 0.4184917908801563, "grad_norm": 156.95828247070312, "learning_rate": 4.976370562252316e-07, "logits/chosen": -0.01792144775390625, "logits/rejected": -0.02015838585793972, "logps/chosen": -154.6765594482422, "logps/rejected": -749.9000244140625, "loss": 0.0999, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.5851683616638184, "rewards/margins": 19.177343368530273, "rewards/rejected": -21.766407012939453, "step": 1660 }, { "epoch": 0.4210128257649765, "grad_norm": 46.38771438598633, "learning_rate": 4.975353844305629e-07, "logits/chosen": -0.09960556030273438, "logits/rejected": NaN, "logps/chosen": -155.9484405517578, "logps/rejected": -717.9249877929688, "loss": 0.0366, "rewards/accuracies": 0.984375, "rewards/chosen": -2.019757032394409, "rewards/margins": 16.985937118530273, "rewards/rejected": -19.007030487060547, "step": 1670 }, { "epoch": 0.42353386064979676, "grad_norm": 1.8958660364151, "learning_rate": 4.974315819472186e-07, "logits/chosen": -0.06000461429357529, "logits/rejected": -0.11537323147058487, "logps/chosen": -146.48281860351562, "logps/rejected": -751.4000244140625, "loss": 0.0731, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.1915526390075684, "rewards/margins": 17.464061737060547, "rewards/rejected": -19.662891387939453, "step": 1680 }, { "epoch": 0.42605489553461695, "grad_norm": 215.57308959960938, "learning_rate": 4.973256496686904e-07, "logits/chosen": -0.09638061374425888, "logits/rejected": NaN, "logps/chosen": -162.0625, "logps/rejected": -738.0499877929688, "loss": 0.1179, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4776244163513184, "rewards/margins": 18.414648056030273, "rewards/rejected": -20.888280868530273, "step": 1690 }, { "epoch": 0.4285759304194372, "grad_norm": 3.957876682281494, "learning_rate": 4.972175885068024e-07, "logits/chosen": 0.035980224609375, "logits/rejected": 0.15756836533546448, "logps/chosen": -156.95468139648438, "logps/rejected": -724.9500122070312, "loss": 0.0289, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.501574754714966, "rewards/margins": 18.721874237060547, "rewards/rejected": -21.224218368530273, "step": 1700 }, { "epoch": 0.4310969653042574, "grad_norm": 89.84231567382812, "learning_rate": 4.971073993917031e-07, "logits/chosen": -0.1254890412092209, "logits/rejected": 0.1340286284685135, "logps/chosen": -175.73281860351562, "logps/rejected": -728.7750244140625, "loss": 0.0301, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9551758766174316, "rewards/margins": 19.071094512939453, "rewards/rejected": -22.035938262939453, "step": 1710 }, { "epoch": 0.4336180001890776, "grad_norm": 0.011667543090879917, "learning_rate": 4.969950832718577e-07, "logits/chosen": -0.05851440504193306, "logits/rejected": NaN, "logps/chosen": -162.40078735351562, "logps/rejected": -726.5250244140625, "loss": 0.0373, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.138021945953369, "rewards/margins": 18.149219512939453, "rewards/rejected": -20.288280487060547, "step": 1720 }, { "epoch": 0.43613903507389784, "grad_norm": 0.45146989822387695, "learning_rate": 4.968806411140398e-07, "logits/chosen": -0.11880187690258026, "logits/rejected": NaN, "logps/chosen": -168.6484375, "logps/rejected": -750.7249755859375, "loss": 0.0592, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.790087938308716, "rewards/margins": 20.334766387939453, "rewards/rejected": -23.1171875, "step": 1730 }, { "epoch": 0.43866006995871804, "grad_norm": 0.26689037680625916, "learning_rate": 4.967640739033233e-07, "logits/chosen": -0.16000060737133026, "logits/rejected": NaN, "logps/chosen": -168.0437469482422, "logps/rejected": -751.4749755859375, "loss": 0.0329, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.614788770675659, "rewards/margins": 19.498437881469727, "rewards/rejected": -22.114063262939453, "step": 1740 }, { "epoch": 0.4411811048435383, "grad_norm": 24.23302459716797, "learning_rate": 4.966453826430735e-07, "logits/chosen": -0.280282586812973, "logits/rejected": NaN, "logps/chosen": -160.2312469482422, "logps/rejected": -720.0499877929688, "loss": 0.0392, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.470813035964966, "rewards/margins": 19.128515243530273, "rewards/rejected": -21.603906631469727, "step": 1750 }, { "epoch": 0.4437021397283585, "grad_norm": 0.27737119793891907, "learning_rate": 4.965245683549387e-07, "logits/chosen": -0.2629989683628082, "logits/rejected": NaN, "logps/chosen": -165.2468719482422, "logps/rejected": -709.7249755859375, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6182098388671875, "rewards/margins": 18.076562881469727, "rewards/rejected": -20.688282012939453, "step": 1760 }, { "epoch": 0.44622317461317873, "grad_norm": 6.871726036071777, "learning_rate": 4.964016320788416e-07, "logits/chosen": -0.18518981337547302, "logits/rejected": NaN, "logps/chosen": -147.3937530517578, "logps/rejected": -722.75, "loss": 0.0397, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.6825318336486816, "rewards/margins": 19.80859375, "rewards/rejected": -22.4921875, "step": 1770 }, { "epoch": 0.4487442094979989, "grad_norm": 0.027423450723290443, "learning_rate": 4.962765748729701e-07, "logits/chosen": -0.24336853623390198, "logits/rejected": NaN, "logps/chosen": -185.14688110351562, "logps/rejected": -758.0250244140625, "loss": 0.0248, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.352374315261841, "rewards/margins": 20.252344131469727, "rewards/rejected": -23.606250762939453, "step": 1780 }, { "epoch": 0.4512652443828191, "grad_norm": 175.6820526123047, "learning_rate": 4.96149397813768e-07, "logits/chosen": -0.26005250215530396, "logits/rejected": -0.08037491142749786, "logps/chosen": -169.4718780517578, "logps/rejected": -707.4500122070312, "loss": 0.1766, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.0113768577575684, "rewards/margins": 18.789844512939453, "rewards/rejected": -21.794530868530273, "step": 1790 }, { "epoch": 0.45378627926763937, "grad_norm": 118.3797378540039, "learning_rate": 4.960201019959262e-07, "logits/chosen": -0.23851928114891052, "logits/rejected": NaN, "logps/chosen": -160.34219360351562, "logps/rejected": -692.875, "loss": 0.0751, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.252856492996216, "rewards/margins": 16.950000762939453, "rewards/rejected": -19.205469131469727, "step": 1800 }, { "epoch": 0.45630731415245956, "grad_norm": 6.438039302825928, "learning_rate": 4.958886885323729e-07, "logits/chosen": -0.15141144394874573, "logits/rejected": NaN, "logps/chosen": -153.9921875, "logps/rejected": -724.3250122070312, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -2.34161376953125, "rewards/margins": 18.424219131469727, "rewards/rejected": -20.770313262939453, "step": 1810 }, { "epoch": 0.4588283490372798, "grad_norm": 268.5538330078125, "learning_rate": 4.957551585542645e-07, "logits/chosen": -0.2631118893623352, "logits/rejected": -0.02345123328268528, "logps/chosen": -164.8718719482422, "logps/rejected": -774.2000122070312, "loss": 0.085, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.023468017578125, "rewards/margins": 21.53125, "rewards/rejected": -24.548437118530273, "step": 1820 }, { "epoch": 0.4613493839221, "grad_norm": 11.275500297546387, "learning_rate": 4.956195132109752e-07, "logits/chosen": -0.25829964876174927, "logits/rejected": NaN, "logps/chosen": -168.8249969482422, "logps/rejected": -712.0750122070312, "loss": 0.1575, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.789111375808716, "rewards/margins": 19.178125381469727, "rewards/rejected": -21.955469131469727, "step": 1830 }, { "epoch": 0.46387041880692026, "grad_norm": 8.381622314453125, "learning_rate": 4.954817536700875e-07, "logits/chosen": -0.269500732421875, "logits/rejected": NaN, "logps/chosen": -170.7624969482422, "logps/rejected": -721.375, "loss": 0.0487, "rewards/accuracies": 0.984375, "rewards/chosen": -2.299431562423706, "rewards/margins": 16.728124618530273, "rewards/rejected": -19.02734375, "step": 1840 }, { "epoch": 0.46639145369174045, "grad_norm": 26.830862045288086, "learning_rate": 4.953418811173823e-07, "logits/chosen": -0.30759182572364807, "logits/rejected": -0.13571396470069885, "logps/chosen": -160.0968780517578, "logps/rejected": -692.8250122070312, "loss": 0.0233, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.319244384765625, "rewards/margins": 16.255859375, "rewards/rejected": -18.577342987060547, "step": 1850 }, { "epoch": 0.4689124885765607, "grad_norm": 1.3447877168655396, "learning_rate": 4.951998967568282e-07, "logits/chosen": -0.2263641357421875, "logits/rejected": -0.061818696558475494, "logps/chosen": -164.89999389648438, "logps/rejected": -743.7750244140625, "loss": 0.0324, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.025115966796875, "rewards/margins": 19.542186737060547, "rewards/rejected": -22.559375762939453, "step": 1860 }, { "epoch": 0.4714335234613809, "grad_norm": 147.43569946289062, "learning_rate": 4.950558018105715e-07, "logits/chosen": 0.03471221774816513, "logits/rejected": NaN, "logps/chosen": -154.24844360351562, "logps/rejected": -720.2000122070312, "loss": 0.1181, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.8750853538513184, "rewards/margins": 19.921483993530273, "rewards/rejected": -22.788671493530273, "step": 1870 }, { "epoch": 0.4739545583462011, "grad_norm": 0.37306687235832214, "learning_rate": 4.949095975189258e-07, "logits/chosen": 0.0014892577892169356, "logits/rejected": 0.0879364013671875, "logps/chosen": -156.10311889648438, "logps/rejected": -739.0750122070312, "loss": 0.0496, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.830322265625, "rewards/margins": 20.83203125, "rewards/rejected": -23.657812118530273, "step": 1880 }, { "epoch": 0.47647559323102134, "grad_norm": 219.85206604003906, "learning_rate": 4.947612851403611e-07, "logits/chosen": 0.06925658881664276, "logits/rejected": 0.340911865234375, "logps/chosen": -168.5968780517578, "logps/rejected": -761.1749877929688, "loss": 0.073, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.269287109375, "rewards/margins": 21.760156631469727, "rewards/rejected": -26.030467987060547, "step": 1890 }, { "epoch": 0.47899662811584154, "grad_norm": 8.691323280334473, "learning_rate": 4.946108659514926e-07, "logits/chosen": -0.09223175048828125, "logits/rejected": -0.010797500610351562, "logps/chosen": -165.44686889648438, "logps/rejected": -762.2000122070312, "loss": 0.0899, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.7356200218200684, "rewards/margins": 20.838281631469727, "rewards/rejected": -23.5625, "step": 1900 }, { "epoch": 0.4815176630006618, "grad_norm": 20.514183044433594, "learning_rate": 4.944583412470706e-07, "logits/chosen": -0.28156280517578125, "logits/rejected": NaN, "logps/chosen": -175.5906219482422, "logps/rejected": -732.6500244140625, "loss": 0.1192, "rewards/accuracies": 0.96875, "rewards/chosen": -2.642181396484375, "rewards/margins": 19.438282012939453, "rewards/rejected": -22.07421875, "step": 1910 }, { "epoch": 0.484038697885482, "grad_norm": 5.058376789093018, "learning_rate": 4.943037123399686e-07, "logits/chosen": -0.1258392333984375, "logits/rejected": NaN, "logps/chosen": -153.921875, "logps/rejected": -748.7000122070312, "loss": 0.0534, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.5762481689453125, "rewards/margins": 20.157812118530273, "rewards/rejected": -22.725000381469727, "step": 1920 }, { "epoch": 0.48655973277030223, "grad_norm": 1.9676674604415894, "learning_rate": 4.941469805611723e-07, "logits/chosen": -0.22601318359375, "logits/rejected": NaN, "logps/chosen": -185.0906219482422, "logps/rejected": -734.7750244140625, "loss": 0.037, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9828248023986816, "rewards/margins": 18.524999618530273, "rewards/rejected": -21.515625, "step": 1930 }, { "epoch": 0.48908076765512243, "grad_norm": 29.435022354125977, "learning_rate": 4.93988147259768e-07, "logits/chosen": -0.12573547661304474, "logits/rejected": NaN, "logps/chosen": -150.1640625, "logps/rejected": -715.0499877929688, "loss": 0.0152, "rewards/accuracies": 0.984375, "rewards/chosen": -2.5247802734375, "rewards/margins": 18.344532012939453, "rewards/rejected": -20.867969512939453, "step": 1940 }, { "epoch": 0.4916018025399426, "grad_norm": 0.03506378084421158, "learning_rate": 4.938272138029315e-07, "logits/chosen": -0.11028747260570526, "logits/rejected": -0.06119384616613388, "logps/chosen": -167.34530639648438, "logps/rejected": -730.3499755859375, "loss": 0.0402, "rewards/accuracies": 0.984375, "rewards/chosen": -2.949658155441284, "rewards/margins": 19.164844512939453, "rewards/rejected": -22.109375, "step": 1950 }, { "epoch": 0.4941228374247629, "grad_norm": 3.678199529647827, "learning_rate": 4.936641815759155e-07, "logits/chosen": -0.12366332858800888, "logits/rejected": NaN, "logps/chosen": -193.8640594482422, "logps/rejected": -792.5999755859375, "loss": 0.0644, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9748778343200684, "rewards/margins": 21.315624237060547, "rewards/rejected": -25.282812118530273, "step": 1960 }, { "epoch": 0.49664387230958307, "grad_norm": 316.0157470703125, "learning_rate": 4.934990519820381e-07, "logits/chosen": 0.0013916015159338713, "logits/rejected": NaN, "logps/chosen": -171.1296844482422, "logps/rejected": -728.2750244140625, "loss": 0.0946, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8883299827575684, "rewards/margins": 19.977344512939453, "rewards/rejected": -22.861719131469727, "step": 1970 }, { "epoch": 0.4991649071944033, "grad_norm": 68.41110229492188, "learning_rate": 4.933318264426711e-07, "logits/chosen": -0.02680053748190403, "logits/rejected": NaN, "logps/chosen": -161.0343780517578, "logps/rejected": -741.9749755859375, "loss": 0.0451, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.79583740234375, "rewards/margins": 20.372655868530273, "rewards/rejected": -23.173437118530273, "step": 1980 }, { "epoch": 0.5016859420792236, "grad_norm": 16.433439254760742, "learning_rate": 4.93162506397227e-07, "logits/chosen": 0.21392059326171875, "logits/rejected": NaN, "logps/chosen": -152.1906280517578, "logps/rejected": -746.7750244140625, "loss": 0.0445, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.470837354660034, "rewards/margins": 21.13671875, "rewards/rejected": -24.607812881469727, "step": 1990 }, { "epoch": 0.5042069769640437, "grad_norm": 46.46327590942383, "learning_rate": 4.929910933031471e-07, "logits/chosen": -0.02016448974609375, "logits/rejected": 9.460448927711695e-05, "logps/chosen": -152.1296844482422, "logps/rejected": -740.1749877929688, "loss": 0.0579, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.7793946266174316, "rewards/margins": 18.895313262939453, "rewards/rejected": -21.665624618530273, "step": 2000 }, { "epoch": 0.506728011848864, "grad_norm": 0.6687387824058533, "learning_rate": 4.928175886358892e-07, "logits/chosen": -0.063812255859375, "logits/rejected": 0.1118011474609375, "logps/chosen": -167.81094360351562, "logps/rejected": -735.1749877929688, "loss": 0.1609, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.8603272438049316, "rewards/margins": 19.142187118530273, "rewards/rejected": -22.0, "step": 2010 }, { "epoch": 0.5092490467336842, "grad_norm": 134.17254638671875, "learning_rate": 4.926419938889138e-07, "logits/chosen": -0.02549133263528347, "logits/rejected": 0.15563392639160156, "logps/chosen": -159.546875, "logps/rejected": -719.0750122070312, "loss": 0.0501, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.461749315261841, "rewards/margins": 19.513671875, "rewards/rejected": -21.977344512939453, "step": 2020 }, { "epoch": 0.5117700816185043, "grad_norm": 33.98070526123047, "learning_rate": 4.924643105736727e-07, "logits/chosen": 0.146403506398201, "logits/rejected": NaN, "logps/chosen": -173.02188110351562, "logps/rejected": -728.9000244140625, "loss": 0.029, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.262622117996216, "rewards/margins": 19.410938262939453, "rewards/rejected": -22.677343368530273, "step": 2030 }, { "epoch": 0.5142911165033246, "grad_norm": 0.4970325827598572, "learning_rate": 4.922845402195951e-07, "logits/chosen": -0.03651733323931694, "logits/rejected": -0.03164520114660263, "logps/chosen": -158.015625, "logps/rejected": -752.1749877929688, "loss": 0.0321, "rewards/accuracies": 0.984375, "rewards/chosen": -2.7038116455078125, "rewards/margins": 19.075389862060547, "rewards/rejected": -21.782812118530273, "step": 2040 }, { "epoch": 0.5168121513881448, "grad_norm": 137.6778564453125, "learning_rate": 4.921026843740743e-07, "logits/chosen": 0.13727417588233948, "logits/rejected": 0.259359747171402, "logps/chosen": -171.77499389648438, "logps/rejected": -761.0250244140625, "loss": 0.04, "rewards/accuracies": 0.984375, "rewards/chosen": -3.7468504905700684, "rewards/margins": 20.745702743530273, "rewards/rejected": -24.489843368530273, "step": 2050 }, { "epoch": 0.5193331862729651, "grad_norm": 174.01023864746094, "learning_rate": 4.919187446024552e-07, "logits/chosen": 0.2943359315395355, "logits/rejected": NaN, "logps/chosen": -161.546875, "logps/rejected": -760.2000122070312, "loss": 0.0555, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.172802686691284, "rewards/margins": 21.45703125, "rewards/rejected": -24.62109375, "step": 2060 }, { "epoch": 0.5218542211577852, "grad_norm": 5.076344966888428, "learning_rate": 4.917327224880199e-07, "logits/chosen": 0.01833038404583931, "logits/rejected": 0.18987885117530823, "logps/chosen": -166.3078155517578, "logps/rejected": -766.75, "loss": 0.0422, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.926892042160034, "rewards/margins": 22.118749618530273, "rewards/rejected": -25.049999237060547, "step": 2070 }, { "epoch": 0.5243752560426055, "grad_norm": 1.3473880290985107, "learning_rate": 4.915446196319748e-07, "logits/chosen": 0.09606628119945526, "logits/rejected": NaN, "logps/chosen": -160.7609405517578, "logps/rejected": -754.625, "loss": 0.0711, "rewards/accuracies": 0.96875, "rewards/chosen": -3.338000535964966, "rewards/margins": 21.774219512939453, "rewards/rejected": -25.109375, "step": 2080 }, { "epoch": 0.5268962909274257, "grad_norm": 16.795961380004883, "learning_rate": 4.913544376534364e-07, "logits/chosen": 0.09474487602710724, "logits/rejected": NaN, "logps/chosen": -158.7937469482422, "logps/rejected": -744.3250122070312, "loss": 0.0327, "rewards/accuracies": 0.984375, "rewards/chosen": -2.922436475753784, "rewards/margins": 21.557811737060547, "rewards/rejected": -24.48046875, "step": 2090 }, { "epoch": 0.5294173258122459, "grad_norm": 4.726897239685059, "learning_rate": 4.911621781894175e-07, "logits/chosen": 0.16737213730812073, "logits/rejected": 0.34566038846969604, "logps/chosen": -166.1843719482422, "logps/rejected": -733.25, "loss": 0.0493, "rewards/accuracies": 0.984375, "rewards/chosen": -3.219674587249756, "rewards/margins": 21.255468368530273, "rewards/rejected": -24.48046875, "step": 2100 }, { "epoch": 0.5319383606970661, "grad_norm": 71.31873321533203, "learning_rate": 4.909678428948129e-07, "logits/chosen": 0.13542480766773224, "logits/rejected": NaN, "logps/chosen": -179.8312530517578, "logps/rejected": -782.1500244140625, "loss": 0.0628, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.025097608566284, "rewards/margins": 21.633594512939453, "rewards/rejected": -24.66015625, "step": 2110 }, { "epoch": 0.5344593955818864, "grad_norm": 98.24397277832031, "learning_rate": 4.907714334423857e-07, "logits/chosen": 0.02763977088034153, "logits/rejected": NaN, "logps/chosen": -167.2624969482422, "logps/rejected": -789.4249877929688, "loss": 0.0356, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.0910277366638184, "rewards/margins": 22.12890625, "rewards/rejected": -25.213281631469727, "step": 2120 }, { "epoch": 0.5369804304667066, "grad_norm": 0.05151905119419098, "learning_rate": 4.905729515227522e-07, "logits/chosen": -0.07982178032398224, "logits/rejected": NaN, "logps/chosen": -163.2531280517578, "logps/rejected": -751.9749755859375, "loss": 0.0142, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.779467821121216, "rewards/margins": 21.450000762939453, "rewards/rejected": -24.231250762939453, "step": 2130 }, { "epoch": 0.5395014653515268, "grad_norm": 0.1516437530517578, "learning_rate": 4.90372398844368e-07, "logits/chosen": -0.08057098090648651, "logits/rejected": NaN, "logps/chosen": -161.7109375, "logps/rejected": -771.7999877929688, "loss": 0.0481, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.6490111351013184, "rewards/margins": 21.821874618530273, "rewards/rejected": -24.473438262939453, "step": 2140 }, { "epoch": 0.542022500236347, "grad_norm": 0.010342867113649845, "learning_rate": 4.901697771335129e-07, "logits/chosen": -0.14234618842601776, "logits/rejected": 0.07589416205883026, "logps/chosen": -165.00936889648438, "logps/rejected": -762.4000244140625, "loss": 0.0725, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.5725159645080566, "rewards/margins": 22.12109375, "rewards/rejected": -24.69921875, "step": 2150 }, { "epoch": 0.5445435351211673, "grad_norm": 6.896526336669922, "learning_rate": 4.899650881342759e-07, "logits/chosen": -0.17307129502296448, "logits/rejected": NaN, "logps/chosen": -167.6953125, "logps/rejected": -754.1749877929688, "loss": 0.18, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4516754150390625, "rewards/margins": 21.400781631469727, "rewards/rejected": -23.861719131469727, "step": 2160 }, { "epoch": 0.5470645700059874, "grad_norm": 223.86227416992188, "learning_rate": 4.897583336085406e-07, "logits/chosen": -0.052935026586055756, "logits/rejected": NaN, "logps/chosen": -144.57968139648438, "logps/rejected": -752.5750122070312, "loss": 0.149, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.1187500953674316, "rewards/margins": 21.883203506469727, "rewards/rejected": -24.008594512939453, "step": 2170 }, { "epoch": 0.5495856048908077, "grad_norm": 39.056251525878906, "learning_rate": 4.895495153359699e-07, "logits/chosen": -0.07647399604320526, "logits/rejected": 0.07535400241613388, "logps/chosen": -156.90469360351562, "logps/rejected": -793.7000122070312, "loss": 0.1286, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.259488582611084, "rewards/margins": 23.499414443969727, "rewards/rejected": -26.77734375, "step": 2180 }, { "epoch": 0.5521066397756279, "grad_norm": 0.008407285436987877, "learning_rate": 4.893386351139906e-07, "logits/chosen": 0.12558594346046448, "logits/rejected": NaN, "logps/chosen": -154.41561889648438, "logps/rejected": -779.5999755859375, "loss": 0.0855, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.4640870094299316, "rewards/margins": 23.107812881469727, "rewards/rejected": -26.571094512939453, "step": 2190 }, { "epoch": 0.5546276746604482, "grad_norm": 364.4292907714844, "learning_rate": 4.891256947577779e-07, "logits/chosen": -0.062102317810058594, "logits/rejected": NaN, "logps/chosen": -177.8093719482422, "logps/rejected": -761.1500244140625, "loss": 0.066, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.20947265625, "rewards/margins": 21.582422256469727, "rewards/rejected": -24.802343368530273, "step": 2200 }, { "epoch": 0.5571487095452683, "grad_norm": 1.9797602891921997, "learning_rate": 4.889106961002401e-07, "logits/chosen": 0.03363342210650444, "logits/rejected": 0.258056640625, "logps/chosen": -163.7937469482422, "logps/rejected": -775.9749755859375, "loss": 0.0888, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.737478733062744, "rewards/margins": 23.102344512939453, "rewards/rejected": -25.83984375, "step": 2210 }, { "epoch": 0.5596697444300885, "grad_norm": 11.266992568969727, "learning_rate": 4.886936409920021e-07, "logits/chosen": 0.01772155798971653, "logits/rejected": 0.289987176656723, "logps/chosen": -168.9753875732422, "logps/rejected": -771.0750122070312, "loss": 0.1231, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.1752257347106934, "rewards/margins": 22.971874237060547, "rewards/rejected": -26.146093368530273, "step": 2220 }, { "epoch": 0.5621907793149088, "grad_norm": 71.43973541259766, "learning_rate": 4.8847453130139e-07, "logits/chosen": 0.029024887830018997, "logits/rejected": NaN, "logps/chosen": -175.16250610351562, "logps/rejected": -791.375, "loss": 0.0496, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.359637498855591, "rewards/margins": 22.777734756469727, "rewards/rejected": -26.142969131469727, "step": 2230 }, { "epoch": 0.5647118141997289, "grad_norm": 162.9497528076172, "learning_rate": 4.882533689144156e-07, "logits/chosen": -0.06598816066980362, "logits/rejected": NaN, "logps/chosen": -160.28125, "logps/rejected": -742.2249755859375, "loss": 0.049, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.106201171875, "rewards/margins": 22.042186737060547, "rewards/rejected": -25.137500762939453, "step": 2240 }, { "epoch": 0.5672328490845492, "grad_norm": 101.59967803955078, "learning_rate": 4.880301557347586e-07, "logits/chosen": -0.16319313645362854, "logits/rejected": 0.01937561109662056, "logps/chosen": -145.1960906982422, "logps/rejected": -778.0250244140625, "loss": 0.1196, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4509034156799316, "rewards/margins": 21.89453125, "rewards/rejected": -24.348438262939453, "step": 2250 }, { "epoch": 0.5697538839693694, "grad_norm": 259.0118713378906, "learning_rate": 4.878048936837518e-07, "logits/chosen": -0.1205596923828125, "logits/rejected": NaN, "logps/chosen": -166.39688110351562, "logps/rejected": -813.9000244140625, "loss": 0.1263, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8977112770080566, "rewards/margins": 24.412109375, "rewards/rejected": -27.318750381469727, "step": 2260 }, { "epoch": 0.5722749188541897, "grad_norm": 0.05970088765025139, "learning_rate": 4.875775847003635e-07, "logits/chosen": -0.13802489638328552, "logits/rejected": NaN, "logps/chosen": -174.2937469482422, "logps/rejected": -791.4500122070312, "loss": 0.0358, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.172283887863159, "rewards/margins": 22.681249618530273, "rewards/rejected": -25.85546875, "step": 2270 }, { "epoch": 0.5747959537390098, "grad_norm": 86.7126235961914, "learning_rate": 4.873482307411814e-07, "logits/chosen": -0.22110596299171448, "logits/rejected": NaN, "logps/chosen": -189.10000610351562, "logps/rejected": -716.2999877929688, "loss": 0.1957, "rewards/accuracies": 0.9375, "rewards/chosen": -3.0399537086486816, "rewards/margins": 18.889062881469727, "rewards/rejected": -21.931249618530273, "step": 2280 }, { "epoch": 0.5773169886238301, "grad_norm": 1590.2569580078125, "learning_rate": 4.871168337803956e-07, "logits/chosen": 0.02028656005859375, "logits/rejected": NaN, "logps/chosen": -166.2624969482422, "logps/rejected": -708.25, "loss": 0.0843, "rewards/accuracies": 0.96875, "rewards/chosen": -3.0229218006134033, "rewards/margins": 18.607030868530273, "rewards/rejected": -21.641407012939453, "step": 2290 }, { "epoch": 0.5798380235086503, "grad_norm": 113.45452880859375, "learning_rate": 4.868833958097815e-07, "logits/chosen": 0.0071353912353515625, "logits/rejected": NaN, "logps/chosen": -148.38125610351562, "logps/rejected": -704.2000122070312, "loss": 0.0592, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8743622303009033, "rewards/margins": 18.719141006469727, "rewards/rejected": -21.581249237060547, "step": 2300 }, { "epoch": 0.5823590583934706, "grad_norm": 80.87008666992188, "learning_rate": 4.866479188386825e-07, "logits/chosen": NaN, "logits/rejected": -0.08701705932617188, "logps/chosen": -174.38125610351562, "logps/rejected": -803.25, "loss": 0.0363, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.08502197265625, "rewards/margins": 20.979686737060547, "rewards/rejected": -24.079687118530273, "step": 2310 }, { "epoch": 0.5848800932782907, "grad_norm": 62.92067337036133, "learning_rate": 4.864104048939933e-07, "logits/chosen": -0.12831267714500427, "logits/rejected": 0.07422943413257599, "logps/chosen": -155.953125, "logps/rejected": -741.5499877929688, "loss": 0.0805, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.6608824729919434, "rewards/margins": 20.818750381469727, "rewards/rejected": -23.486719131469727, "step": 2320 }, { "epoch": 0.587401128163111, "grad_norm": 4.005105495452881, "learning_rate": 4.861708560201418e-07, "logits/chosen": -0.2908378541469574, "logits/rejected": NaN, "logps/chosen": -149.67031860351562, "logps/rejected": -727.7000122070312, "loss": 0.0679, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.0427613258361816, "rewards/margins": 20.249217987060547, "rewards/rejected": -22.293750762939453, "step": 2330 }, { "epoch": 0.5899221630479312, "grad_norm": 27.312328338623047, "learning_rate": 4.859292742790719e-07, "logits/chosen": -0.2913223206996918, "logits/rejected": NaN, "logps/chosen": -169.27188110351562, "logps/rejected": -786.5750122070312, "loss": 0.0807, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.5822386741638184, "rewards/margins": 21.970312118530273, "rewards/rejected": -24.5546875, "step": 2340 }, { "epoch": 0.5924431979327514, "grad_norm": 144.65675354003906, "learning_rate": 4.856856617502257e-07, "logits/chosen": -0.2508483827114105, "logits/rejected": NaN, "logps/chosen": -203.58438110351562, "logps/rejected": -817.2999877929688, "loss": 0.0682, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.927481174468994, "rewards/margins": 22.966405868530273, "rewards/rejected": -26.8984375, "step": 2350 }, { "epoch": 0.5949642328175716, "grad_norm": 34.01659393310547, "learning_rate": 4.854400205305255e-07, "logits/chosen": -0.09994812309741974, "logits/rejected": 0.24384155869483948, "logps/chosen": -180.2624969482422, "logps/rejected": -756.3250122070312, "loss": 0.0895, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.811279296875, "rewards/margins": 22.587499618530273, "rewards/rejected": -26.392187118530273, "step": 2360 }, { "epoch": 0.5974852677023919, "grad_norm": 3.814255714416504, "learning_rate": 4.851923527343556e-07, "logits/chosen": 0.02339477464556694, "logits/rejected": NaN, "logps/chosen": -166.84375, "logps/rejected": -749.9000244140625, "loss": 0.1553, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9041168689727783, "rewards/margins": 23.353124618530273, "rewards/rejected": -26.256250381469727, "step": 2370 }, { "epoch": 0.6000063025872121, "grad_norm": 194.227294921875, "learning_rate": 4.849426604935445e-07, "logits/chosen": -0.2653656005859375, "logits/rejected": NaN, "logps/chosen": -162.3984375, "logps/rejected": -781.75, "loss": 0.0582, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.539746046066284, "rewards/margins": 22.307811737060547, "rewards/rejected": -24.842187881469727, "step": 2380 }, { "epoch": 0.6025273374720322, "grad_norm": 0.015395522117614746, "learning_rate": 4.846909459573461e-07, "logits/chosen": 0.0334320068359375, "logits/rejected": NaN, "logps/chosen": -161.4562530517578, "logps/rejected": -813.25, "loss": 0.0344, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0741639137268066, "rewards/margins": 23.427343368530273, "rewards/rejected": -26.49609375, "step": 2390 }, { "epoch": 0.6050483723568525, "grad_norm": 247.4481658935547, "learning_rate": 4.844372112924218e-07, "logits/chosen": -0.02262268029153347, "logits/rejected": NaN, "logps/chosen": -168.44375610351562, "logps/rejected": -767.5250244140625, "loss": 0.0576, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.6278076171875, "rewards/margins": 23.178905487060547, "rewards/rejected": -26.793750762939453, "step": 2400 }, { "epoch": 0.6075694072416727, "grad_norm": 126.4100112915039, "learning_rate": 4.841814586828212e-07, "logits/chosen": -0.06553039699792862, "logits/rejected": NaN, "logps/chosen": -162.18124389648438, "logps/rejected": -774.9000244140625, "loss": 0.1512, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9208130836486816, "rewards/margins": 22.874217987060547, "rewards/rejected": -25.79296875, "step": 2410 }, { "epoch": 0.6100904421264929, "grad_norm": 10.238855361938477, "learning_rate": 4.839236903299637e-07, "logits/chosen": 0.09916992485523224, "logits/rejected": 0.10272216796875, "logps/chosen": -140.1046905517578, "logps/rejected": -755.3250122070312, "loss": 0.0876, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.3372223377227783, "rewards/margins": 22.083593368530273, "rewards/rejected": -24.418750762939453, "step": 2420 }, { "epoch": 0.6126114770113131, "grad_norm": 6.175973415374756, "learning_rate": 4.836639084526194e-07, "logits/chosen": -0.0039306641556322575, "logits/rejected": NaN, "logps/chosen": -146.93594360351562, "logps/rejected": -712.4500122070312, "loss": 0.042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.668505907058716, "rewards/margins": 20.431249618530273, "rewards/rejected": -23.100780487060547, "step": 2430 }, { "epoch": 0.6151325118961334, "grad_norm": 2.399904251098633, "learning_rate": 4.8340211528689e-07, "logits/chosen": -0.07123718410730362, "logits/rejected": NaN, "logps/chosen": -168.16561889648438, "logps/rejected": -746.3499755859375, "loss": 0.0902, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.147308349609375, "rewards/margins": 21.825780868530273, "rewards/rejected": -24.9765625, "step": 2440 }, { "epoch": 0.6176535467809536, "grad_norm": 0.10203604400157928, "learning_rate": 4.831383130861897e-07, "logits/chosen": -0.03415822982788086, "logits/rejected": 0.11562500149011612, "logps/chosen": -189.1531219482422, "logps/rejected": -803.3499755859375, "loss": 0.1232, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.916699171066284, "rewards/margins": 22.446874618530273, "rewards/rejected": -26.37109375, "step": 2450 }, { "epoch": 0.6201745816657738, "grad_norm": 87.49981689453125, "learning_rate": 4.828725041212255e-07, "logits/chosen": -0.09313354641199112, "logits/rejected": NaN, "logps/chosen": -153.296875, "logps/rejected": -728.0, "loss": 0.0845, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.4823150634765625, "rewards/margins": 20.28515625, "rewards/rejected": -22.776561737060547, "step": 2460 }, { "epoch": 0.622695616550594, "grad_norm": 255.51995849609375, "learning_rate": 4.82604690679978e-07, "logits/chosen": -0.22461700439453125, "logits/rejected": 0.02373352088034153, "logps/chosen": -161.08749389648438, "logps/rejected": -749.0, "loss": 0.0643, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0784912109375, "rewards/margins": 20.026172637939453, "rewards/rejected": -22.116405487060547, "step": 2470 }, { "epoch": 0.6252166514354143, "grad_norm": 3.3934412002563477, "learning_rate": 4.823348750676816e-07, "logits/chosen": -0.0695343017578125, "logits/rejected": 0.04907836765050888, "logps/chosen": -144.3671875, "logps/rejected": -766.5499877929688, "loss": 0.0714, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.321826219558716, "rewards/margins": 22.345312118530273, "rewards/rejected": -24.66796875, "step": 2480 }, { "epoch": 0.6277376863202344, "grad_norm": 152.36874389648438, "learning_rate": 4.820630596068047e-07, "logits/chosen": -0.16159972548484802, "logits/rejected": NaN, "logps/chosen": -175.2062530517578, "logps/rejected": -746.1749877929688, "loss": 0.0909, "rewards/accuracies": 0.96875, "rewards/chosen": -2.998974561691284, "rewards/margins": 21.321094512939453, "rewards/rejected": -24.317968368530273, "step": 2490 }, { "epoch": 0.6302587212050547, "grad_norm": 46.400421142578125, "learning_rate": 4.817892466370292e-07, "logits/chosen": 0.04945068433880806, "logits/rejected": 0.20886535942554474, "logps/chosen": -162.7062530517578, "logps/rejected": -746.1749877929688, "loss": 0.0539, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.147570848464966, "rewards/margins": 20.793750762939453, "rewards/rejected": -23.922657012939453, "step": 2500 }, { "epoch": 0.6327797560898749, "grad_norm": 2.2360360622406006, "learning_rate": 4.815134385152311e-07, "logits/chosen": -0.14120635390281677, "logits/rejected": NaN, "logps/chosen": -156.109375, "logps/rejected": -751.0499877929688, "loss": 0.1087, "rewards/accuracies": 0.984375, "rewards/chosen": -2.566943407058716, "rewards/margins": 20.81640625, "rewards/rejected": -23.385156631469727, "step": 2510 }, { "epoch": 0.6353007909746952, "grad_norm": 0.31254690885543823, "learning_rate": 4.812356376154599e-07, "logits/chosen": 0.12133713066577911, "logits/rejected": 0.2718009948730469, "logps/chosen": -175.94375610351562, "logps/rejected": -760.0, "loss": 0.0153, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5268797874450684, "rewards/margins": 21.208593368530273, "rewards/rejected": -24.7421875, "step": 2520 }, { "epoch": 0.6378218258595153, "grad_norm": 154.28758239746094, "learning_rate": 4.809558463289181e-07, "logits/chosen": 0.13915404677391052, "logits/rejected": 0.40690916776657104, "logps/chosen": -181.6531219482422, "logps/rejected": -782.125, "loss": 0.1807, "rewards/accuracies": 0.96875, "rewards/chosen": -4.763525485992432, "rewards/margins": 22.430469512939453, "rewards/rejected": -27.1953125, "step": 2530 }, { "epoch": 0.6403428607443356, "grad_norm": 7.909911632537842, "learning_rate": 4.806740670639407e-07, "logits/chosen": -0.05220641940832138, "logits/rejected": 0.18059387803077698, "logps/chosen": -161.74374389648438, "logps/rejected": -791.2000122070312, "loss": 0.0666, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.0971922874450684, "rewards/margins": 22.514842987060547, "rewards/rejected": -25.607812881469727, "step": 2540 }, { "epoch": 0.6428638956291558, "grad_norm": 51.19940185546875, "learning_rate": 4.803903022459743e-07, "logits/chosen": 0.01683502271771431, "logits/rejected": NaN, "logps/chosen": -153.6062469482422, "logps/rejected": -788.2750244140625, "loss": 0.1112, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.701385498046875, "rewards/margins": 25.252344131469727, "rewards/rejected": -27.954687118530273, "step": 2550 }, { "epoch": 0.6453849305139759, "grad_norm": 26.290342330932617, "learning_rate": 4.801045543175567e-07, "logits/chosen": -0.014514160342514515, "logits/rejected": NaN, "logps/chosen": -168.55624389648438, "logps/rejected": -782.875, "loss": 0.1059, "rewards/accuracies": 0.96875, "rewards/chosen": -2.6099853515625, "rewards/margins": 24.310937881469727, "rewards/rejected": -26.9296875, "step": 2560 }, { "epoch": 0.6479059653987962, "grad_norm": 18.076988220214844, "learning_rate": 4.798168257382952e-07, "logits/chosen": 0.22845153510570526, "logits/rejected": NaN, "logps/chosen": -159.671875, "logps/rejected": -744.1500244140625, "loss": 0.0627, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.219738721847534, "rewards/margins": 22.067968368530273, "rewards/rejected": -25.2890625, "step": 2570 }, { "epoch": 0.6504270002836164, "grad_norm": 316.5787658691406, "learning_rate": 4.79527118984846e-07, "logits/chosen": 0.05199890211224556, "logits/rejected": NaN, "logps/chosen": -182.8640594482422, "logps/rejected": -740.4749755859375, "loss": 0.1103, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.045703172683716, "rewards/margins": 19.6171875, "rewards/rejected": -22.664844512939453, "step": 2580 }, { "epoch": 0.6529480351684367, "grad_norm": 15.797821044921875, "learning_rate": 4.792354365508926e-07, "logits/chosen": 0.02642211876809597, "logits/rejected": NaN, "logps/chosen": -167.3718719482422, "logps/rejected": -775.9500122070312, "loss": 0.107, "rewards/accuracies": 0.96875, "rewards/chosen": -2.7794387340545654, "rewards/margins": 22.991405487060547, "rewards/rejected": -25.767187118530273, "step": 2590 }, { "epoch": 0.6554690700532568, "grad_norm": 322.9416809082031, "learning_rate": 4.789417809471242e-07, "logits/chosen": 0.12126617133617401, "logits/rejected": 0.38610535860061646, "logps/chosen": -179.484375, "logps/rejected": -783.0, "loss": 0.0421, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9222779273986816, "rewards/margins": 23.002344131469727, "rewards/rejected": -25.926563262939453, "step": 2600 }, { "epoch": 0.6579901049380771, "grad_norm": 67.42932891845703, "learning_rate": 4.786461547012147e-07, "logits/chosen": 0.11585693061351776, "logits/rejected": NaN, "logps/chosen": -148.9499969482422, "logps/rejected": -802.7750244140625, "loss": 0.0666, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.7706360816955566, "rewards/margins": 23.564062118530273, "rewards/rejected": -26.3359375, "step": 2610 }, { "epoch": 0.6605111398228973, "grad_norm": 3.984429359436035, "learning_rate": 4.783485603578002e-07, "logits/chosen": -0.0070648193359375, "logits/rejected": 0.122467041015625, "logps/chosen": -139.85000610351562, "logps/rejected": -729.9500122070312, "loss": 0.1797, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.2463531494140625, "rewards/margins": 19.159765243530273, "rewards/rejected": -21.410938262939453, "step": 2620 }, { "epoch": 0.6630321747077175, "grad_norm": 0.5628440976142883, "learning_rate": 4.780490004784574e-07, "logits/chosen": -0.09403228759765625, "logits/rejected": NaN, "logps/chosen": -156.01522827148438, "logps/rejected": -729.0750122070312, "loss": 0.1033, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.204174757003784, "rewards/margins": 18.682031631469727, "rewards/rejected": -20.892187118530273, "step": 2630 }, { "epoch": 0.6655532095925377, "grad_norm": 9.05879020690918, "learning_rate": 4.777474776416816e-07, "logits/chosen": -0.14089736342430115, "logits/rejected": NaN, "logps/chosen": -148.16561889648438, "logps/rejected": -772.9749755859375, "loss": 0.0861, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.284912109375, "rewards/margins": 21.623241424560547, "rewards/rejected": -23.909374237060547, "step": 2640 }, { "epoch": 0.668074244477358, "grad_norm": 149.5705108642578, "learning_rate": 4.774439944428647e-07, "logits/chosen": -0.023677825927734375, "logits/rejected": NaN, "logps/chosen": -153.5749969482422, "logps/rejected": -778.4249877929688, "loss": 0.0342, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4297852516174316, "rewards/margins": 23.241405487060547, "rewards/rejected": -25.673437118530273, "step": 2650 }, { "epoch": 0.6705952793621782, "grad_norm": 72.21862030029297, "learning_rate": 4.771385534942726e-07, "logits/chosen": 0.006884765811264515, "logits/rejected": NaN, "logps/chosen": -174.4562530517578, "logps/rejected": -808.3250122070312, "loss": 0.1449, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.376727342605591, "rewards/margins": 24.236719131469727, "rewards/rejected": -27.622655868530273, "step": 2660 }, { "epoch": 0.6731163142469984, "grad_norm": 1.3136367797851562, "learning_rate": 4.768311574250224e-07, "logits/chosen": 0.17354126274585724, "logits/rejected": 0.342233270406723, "logps/chosen": -171.27499389648438, "logps/rejected": -795.2000122070312, "loss": 0.0549, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.456652879714966, "rewards/margins": 24.223438262939453, "rewards/rejected": -27.671875, "step": 2670 }, { "epoch": 0.6756373491318186, "grad_norm": 58.9135627746582, "learning_rate": 4.7652180888106046e-07, "logits/chosen": 0.06976928561925888, "logits/rejected": NaN, "logps/chosen": -153.81405639648438, "logps/rejected": -778.9000244140625, "loss": 0.0949, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.964498996734619, "rewards/margins": 24.539844512939453, "rewards/rejected": -27.510936737060547, "step": 2680 }, { "epoch": 0.6781583840166389, "grad_norm": 63.57442092895508, "learning_rate": 4.7621051052513914e-07, "logits/chosen": 0.1649627685546875, "logits/rejected": NaN, "logps/chosen": -167.6203155517578, "logps/rejected": -771.125, "loss": 0.0917, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.9297242164611816, "rewards/margins": 22.321094512939453, "rewards/rejected": -25.2578125, "step": 2690 }, { "epoch": 0.680679418901459, "grad_norm": 151.97769165039062, "learning_rate": 4.7589726503679416e-07, "logits/chosen": 0.0764007568359375, "logits/rejected": 0.27435189485549927, "logps/chosen": -181.9406280517578, "logps/rejected": -745.4749755859375, "loss": 0.1759, "rewards/accuracies": 0.953125, "rewards/chosen": -2.605969190597534, "rewards/margins": 19.7890625, "rewards/rejected": -22.395313262939453, "step": 2700 }, { "epoch": 0.6832004537862792, "grad_norm": 184.77317810058594, "learning_rate": 4.7558207511232116e-07, "logits/chosen": 0.02455291710793972, "logits/rejected": NaN, "logps/chosen": -158.63436889648438, "logps/rejected": -729.6749877929688, "loss": 0.1162, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.872567653656006, "rewards/margins": 21.19921875, "rewards/rejected": -24.064062118530273, "step": 2710 }, { "epoch": 0.6857214886710995, "grad_norm": 0.5396440625190735, "learning_rate": 4.7526494346475297e-07, "logits/chosen": 0.11398925632238388, "logits/rejected": NaN, "logps/chosen": -168.39688110351562, "logps/rejected": -758.4500122070312, "loss": 0.0873, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.505859375, "rewards/margins": 22.564844131469727, "rewards/rejected": -26.05859375, "step": 2720 }, { "epoch": 0.6882425235559197, "grad_norm": 166.29287719726562, "learning_rate": 4.74945872823836e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -173.4343719482422, "logps/rejected": -806.4500122070312, "loss": 0.0517, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5068726539611816, "rewards/margins": 23.296875, "rewards/rejected": -26.796875, "step": 2730 }, { "epoch": 0.6907635584407399, "grad_norm": 87.88737487792969, "learning_rate": 4.746248659360066e-07, "logits/chosen": 0.04841766506433487, "logits/rejected": NaN, "logps/chosen": -165.078125, "logps/rejected": -748.4500122070312, "loss": 0.0644, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.9992918968200684, "rewards/margins": 21.15625, "rewards/rejected": -24.16015625, "step": 2740 }, { "epoch": 0.6932845933255601, "grad_norm": 38.208740234375, "learning_rate": 4.743019255643677e-07, "logits/chosen": -0.07803726196289062, "logits/rejected": NaN, "logps/chosen": -169.12344360351562, "logps/rejected": -735.0, "loss": 0.1161, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.929400682449341, "rewards/margins": 19.748828887939453, "rewards/rejected": -22.674219131469727, "step": 2750 }, { "epoch": 0.6958056282103804, "grad_norm": 8.350687980651855, "learning_rate": 4.739770544886648e-07, "logits/chosen": -0.07791747897863388, "logits/rejected": NaN, "logps/chosen": -150.0656280517578, "logps/rejected": -719.4000244140625, "loss": 0.043, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.563559055328369, "rewards/margins": 20.485937118530273, "rewards/rejected": -23.038280487060547, "step": 2760 }, { "epoch": 0.6983266630952005, "grad_norm": 2.743867874145508, "learning_rate": 4.736502555052624e-07, "logits/chosen": -0.05046691745519638, "logits/rejected": 0.07678528130054474, "logps/chosen": -179.99374389648438, "logps/rejected": -810.5499877929688, "loss": 0.0277, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.1817870140075684, "rewards/margins": 23.323436737060547, "rewards/rejected": -26.500782012939453, "step": 2770 }, { "epoch": 0.7008476979800208, "grad_norm": 26.9610538482666, "learning_rate": 4.7332153142711944e-07, "logits/chosen": 0.15575560927391052, "logits/rejected": NaN, "logps/chosen": -169.7062530517578, "logps/rejected": -765.7750244140625, "loss": 0.0184, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.4808197021484375, "rewards/margins": 23.842967987060547, "rewards/rejected": -27.328907012939453, "step": 2780 }, { "epoch": 0.703368732864841, "grad_norm": 218.36158752441406, "learning_rate": 4.729908850837654e-07, "logits/chosen": 0.08559875190258026, "logits/rejected": 0.327493280172348, "logps/chosen": -169.54061889648438, "logps/rejected": -821.4500122070312, "loss": 0.1163, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.9695067405700684, "rewards/margins": 26.557811737060547, "rewards/rejected": -30.52734375, "step": 2790 }, { "epoch": 0.7058897677496613, "grad_norm": 0.07881426066160202, "learning_rate": 4.72658319321276e-07, "logits/chosen": 0.04155578464269638, "logits/rejected": NaN, "logps/chosen": -152.49844360351562, "logps/rejected": -808.5999755859375, "loss": 0.0388, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.673828125, "rewards/margins": 25.207813262939453, "rewards/rejected": -27.896093368530273, "step": 2800 }, { "epoch": 0.7084108026344814, "grad_norm": 15.520272254943848, "learning_rate": 4.7232383700224827e-07, "logits/chosen": 0.09868164360523224, "logits/rejected": NaN, "logps/chosen": -161.2265625, "logps/rejected": -793.3499755859375, "loss": 0.0726, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0637450218200684, "rewards/margins": 25.203125, "rewards/rejected": -28.274219512939453, "step": 2810 }, { "epoch": 0.7109318375193017, "grad_norm": 0.006915172562003136, "learning_rate": 4.7198744100577657e-07, "logits/chosen": 0.26458740234375, "logits/rejected": NaN, "logps/chosen": -165.92343139648438, "logps/rejected": -792.7999877929688, "loss": 0.0233, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.418261766433716, "rewards/margins": 23.813282012939453, "rewards/rejected": -27.224218368530273, "step": 2820 }, { "epoch": 0.7134528724041219, "grad_norm": 54.25067901611328, "learning_rate": 4.716491342274272e-07, "logits/chosen": 0.2671875059604645, "logits/rejected": NaN, "logps/chosen": -186.7531280517578, "logps/rejected": -766.0750122070312, "loss": 0.0989, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.8971190452575684, "rewards/margins": 22.992969512939453, "rewards/rejected": -26.881250381469727, "step": 2830 }, { "epoch": 0.7159739072889421, "grad_norm": 63.91415023803711, "learning_rate": 4.7130891957921383e-07, "logits/chosen": 0.10384826362133026, "logits/rejected": NaN, "logps/chosen": -175.6593780517578, "logps/rejected": -757.7750244140625, "loss": 0.1582, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.052905321121216, "rewards/margins": 21.454687118530273, "rewards/rejected": -24.5078125, "step": 2840 }, { "epoch": 0.7184949421737623, "grad_norm": 155.74996948242188, "learning_rate": 4.709667999895724e-07, "logits/chosen": 0.08009643852710724, "logits/rejected": 0.2073516845703125, "logps/chosen": -177.32186889648438, "logps/rejected": -760.7249755859375, "loss": 0.0604, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0224366188049316, "rewards/margins": 22.112499237060547, "rewards/rejected": -25.126562118530273, "step": 2850 }, { "epoch": 0.7210159770585826, "grad_norm": 11.226325035095215, "learning_rate": 4.7062277840333574e-07, "logits/chosen": 0.01398315466940403, "logits/rejected": NaN, "logps/chosen": -153.96875, "logps/rejected": -749.9249877929688, "loss": 0.0642, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.32891845703125, "rewards/margins": 21.557811737060547, "rewards/rejected": -24.87890625, "step": 2860 }, { "epoch": 0.7235370119434028, "grad_norm": 163.14610290527344, "learning_rate": 4.702768577817083e-07, "logits/chosen": -0.03070678748190403, "logits/rejected": NaN, "logps/chosen": -154.2234344482422, "logps/rejected": -749.875, "loss": 0.0445, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.348437547683716, "rewards/margins": 21.241405487060547, "rewards/rejected": -23.596094131469727, "step": 2870 }, { "epoch": 0.726058046828223, "grad_norm": 233.50999450683594, "learning_rate": 4.6992904110224084e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -161.0203094482422, "logps/rejected": -761.0999755859375, "loss": 0.0567, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.797869920730591, "rewards/margins": 21.427343368530273, "rewards/rejected": -24.233592987060547, "step": 2880 }, { "epoch": 0.7285790817130432, "grad_norm": 91.89845275878906, "learning_rate": 4.6957933135880454e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -168.28125, "logps/rejected": -760.9500122070312, "loss": 0.0699, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.251074314117432, "rewards/margins": 22.8515625, "rewards/rejected": -27.099218368530273, "step": 2890 }, { "epoch": 0.7311001165978634, "grad_norm": 0.40941712260246277, "learning_rate": 4.692277315615654e-07, "logits/chosen": 0.12331084907054901, "logits/rejected": NaN, "logps/chosen": -187.59375, "logps/rejected": -819.2999877929688, "loss": 0.0647, "rewards/accuracies": 0.984375, "rewards/chosen": -4.338330268859863, "rewards/margins": 25.0625, "rewards/rejected": -29.392187118530273, "step": 2900 }, { "epoch": 0.7336211514826836, "grad_norm": 144.1935272216797, "learning_rate": 4.6887424473695814e-07, "logits/chosen": 0.09344635158777237, "logits/rejected": NaN, "logps/chosen": -182.1046905517578, "logps/rejected": -780.7249755859375, "loss": 0.1687, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -3.5861573219299316, "rewards/margins": 23.337499618530273, "rewards/rejected": -26.923437118530273, "step": 2910 }, { "epoch": 0.7361421863675038, "grad_norm": 2.717528820037842, "learning_rate": 4.6851887392766043e-07, "logits/chosen": -0.0047149658203125, "logits/rejected": NaN, "logps/chosen": -165.5968780517578, "logps/rejected": -802.375, "loss": 0.0782, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.896228075027466, "rewards/margins": 23.606250762939453, "rewards/rejected": -26.5, "step": 2920 }, { "epoch": 0.7386632212523241, "grad_norm": 14.419089317321777, "learning_rate": 4.681616221925665e-07, "logits/chosen": -0.02470092847943306, "logits/rejected": NaN, "logps/chosen": -179.4734344482422, "logps/rejected": -798.9749755859375, "loss": 0.0417, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.3457274436950684, "rewards/margins": 23.963281631469727, "rewards/rejected": -27.303905487060547, "step": 2930 }, { "epoch": 0.7411842561371443, "grad_norm": 156.2595672607422, "learning_rate": 4.678024926067609e-07, "logits/chosen": 0.06611023098230362, "logits/rejected": NaN, "logps/chosen": -156.88125610351562, "logps/rejected": -785.0499877929688, "loss": 0.0685, "rewards/accuracies": 0.984375, "rewards/chosen": -3.271252393722534, "rewards/margins": 24.3515625, "rewards/rejected": -27.64453125, "step": 2940 }, { "epoch": 0.7437052910219645, "grad_norm": 81.36058044433594, "learning_rate": 4.6744148826149194e-07, "logits/chosen": 0.06357116997241974, "logits/rejected": NaN, "logps/chosen": -166.06094360351562, "logps/rejected": -786.8499755859375, "loss": 0.0814, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.65771484375, "rewards/margins": 24.007030487060547, "rewards/rejected": -27.674999237060547, "step": 2950 }, { "epoch": 0.7462263259067847, "grad_norm": 27.23236083984375, "learning_rate": 4.670786122641451e-07, "logits/chosen": 0.06828002631664276, "logits/rejected": NaN, "logps/chosen": -166.04531860351562, "logps/rejected": -733.2999877929688, "loss": 0.1237, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.236889600753784, "rewards/margins": 19.796875, "rewards/rejected": -23.0234375, "step": 2960 }, { "epoch": 0.748747360791605, "grad_norm": 0.4971924126148224, "learning_rate": 4.667138677382164e-07, "logits/chosen": 0.10382995754480362, "logits/rejected": NaN, "logps/chosen": -157.6765594482422, "logps/rejected": -723.6749877929688, "loss": 0.1314, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.557055711746216, "rewards/margins": 18.608007431030273, "rewards/rejected": -21.155467987060547, "step": 2970 }, { "epoch": 0.7512683956764252, "grad_norm": 48.24627685546875, "learning_rate": 4.663472578232853e-07, "logits/chosen": 0.06282653659582138, "logits/rejected": NaN, "logps/chosen": -135.671875, "logps/rejected": -724.75, "loss": 0.0304, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.2314085960388184, "rewards/margins": 19.913280487060547, "rewards/rejected": -22.139842987060547, "step": 2980 }, { "epoch": 0.7537894305612454, "grad_norm": 46.61393737792969, "learning_rate": 4.6597878567498804e-07, "logits/chosen": 0.23824767768383026, "logits/rejected": 0.47975462675094604, "logps/chosen": -176.4718780517578, "logps/rejected": -771.2000122070312, "loss": 0.046, "rewards/accuracies": 0.984375, "rewards/chosen": -3.879199266433716, "rewards/margins": 21.743358612060547, "rewards/rejected": -25.619140625, "step": 2990 }, { "epoch": 0.7563104654460656, "grad_norm": 14.089741706848145, "learning_rate": 4.6560845446499007e-07, "logits/chosen": 0.08040771633386612, "logits/rejected": NaN, "logps/chosen": -178.40625, "logps/rejected": -799.5750122070312, "loss": 0.093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5696959495544434, "rewards/margins": 25.891796112060547, "rewards/rejected": -29.457813262939453, "step": 3000 }, { "epoch": 0.7588315003308859, "grad_norm": 30.064411163330078, "learning_rate": 4.652362673809589e-07, "logits/chosen": 0.30839234590530396, "logits/rejected": NaN, "logps/chosen": -186.03750610351562, "logps/rejected": -844.4749755859375, "loss": 0.0432, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.971484184265137, "rewards/margins": 28.689062118530273, "rewards/rejected": -33.64531326293945, "step": 3010 }, { "epoch": 0.761352535215706, "grad_norm": 56.49455261230469, "learning_rate": 4.6486222762653664e-07, "logits/chosen": 0.2698730528354645, "logits/rejected": NaN, "logps/chosen": -179.60781860351562, "logps/rejected": -806.75, "loss": 0.259, "rewards/accuracies": 0.96875, "rewards/chosen": -4.457348823547363, "rewards/margins": 26.405467987060547, "rewards/rejected": -30.859375, "step": 3020 }, { "epoch": 0.7638735701005263, "grad_norm": 28.634960174560547, "learning_rate": 4.644863384213128e-07, "logits/chosen": -0.01008758507668972, "logits/rejected": NaN, "logps/chosen": -187.5593719482422, "logps/rejected": -807.5, "loss": 0.1137, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.728832960128784, "rewards/margins": 24.103124618530273, "rewards/rejected": -27.823436737060547, "step": 3030 }, { "epoch": 0.7663946049853465, "grad_norm": 246.79608154296875, "learning_rate": 4.641086030007958e-07, "logits/chosen": 0.1654617339372635, "logits/rejected": NaN, "logps/chosen": -161.4875030517578, "logps/rejected": -785.6749877929688, "loss": 0.0739, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.034899950027466, "rewards/margins": 24.522655487060547, "rewards/rejected": -27.5546875, "step": 3040 }, { "epoch": 0.7689156398701668, "grad_norm": 203.7487030029297, "learning_rate": 4.6372902461638587e-07, "logits/chosen": 0.23123855888843536, "logits/rejected": 0.4457840025424957, "logps/chosen": -150.25, "logps/rejected": -793.0750122070312, "loss": 0.1047, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.752197265625, "rewards/margins": 23.264062881469727, "rewards/rejected": -26.017969131469727, "step": 3050 }, { "epoch": 0.7714366747549869, "grad_norm": 76.03661346435547, "learning_rate": 4.6334760653534666e-07, "logits/chosen": 0.49936217069625854, "logits/rejected": NaN, "logps/chosen": -149.6085968017578, "logps/rejected": -779.3250122070312, "loss": 0.0592, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.151623487472534, "rewards/margins": 24.246875762939453, "rewards/rejected": -27.396875381469727, "step": 3060 }, { "epoch": 0.7739577096398071, "grad_norm": 163.19842529296875, "learning_rate": 4.629643520407772e-07, "logits/chosen": 0.22030028700828552, "logits/rejected": 0.44069671630859375, "logps/chosen": -160.33358764648438, "logps/rejected": -748.875, "loss": 0.1443, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.824877977371216, "rewards/margins": 21.877344131469727, "rewards/rejected": -24.700000762939453, "step": 3070 }, { "epoch": 0.7764787445246274, "grad_norm": 92.91260528564453, "learning_rate": 4.625792644315837e-07, "logits/chosen": 0.13764342665672302, "logits/rejected": NaN, "logps/chosen": -174.9875030517578, "logps/rejected": -789.2750244140625, "loss": 0.0267, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.032519578933716, "rewards/margins": 22.618749618530273, "rewards/rejected": -25.637500762939453, "step": 3080 }, { "epoch": 0.7789997794094475, "grad_norm": 2.3325679302215576, "learning_rate": 4.621923470224511e-07, "logits/chosen": 0.1300918608903885, "logits/rejected": 0.26889342069625854, "logps/chosen": -159.7843780517578, "logps/rejected": -730.875, "loss": 0.0635, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.589977979660034, "rewards/margins": 21.06640625, "rewards/rejected": -23.655467987060547, "step": 3090 }, { "epoch": 0.7815208142942678, "grad_norm": 241.27296447753906, "learning_rate": 4.618036031438145e-07, "logits/chosen": 0.16794128715991974, "logits/rejected": 0.4120025634765625, "logps/chosen": -180.4875030517578, "logps/rejected": -820.2750244140625, "loss": 0.0754, "rewards/accuracies": 0.984375, "rewards/chosen": -3.655200242996216, "rewards/margins": 23.063282012939453, "rewards/rejected": -26.720312118530273, "step": 3100 }, { "epoch": 0.784041849179088, "grad_norm": 0.11340184509754181, "learning_rate": 4.6141303614183064e-07, "logits/chosen": 0.21788939833641052, "logits/rejected": 0.559460461139679, "logps/chosen": -174.7843780517578, "logps/rejected": -828.3250122070312, "loss": 0.0373, "rewards/accuracies": 0.96875, "rewards/chosen": -3.7233519554138184, "rewards/margins": 26.43359375, "rewards/rejected": -30.146875381469727, "step": 3110 }, { "epoch": 0.7865628840639083, "grad_norm": 190.56446838378906, "learning_rate": 4.610206493783488e-07, "logits/chosen": 0.1855209320783615, "logits/rejected": NaN, "logps/chosen": -174.0593719482422, "logps/rejected": -813.0750122070312, "loss": 0.1153, "rewards/accuracies": 0.984375, "rewards/chosen": -3.208203077316284, "rewards/margins": 26.454687118530273, "rewards/rejected": -29.6484375, "step": 3120 }, { "epoch": 0.7890839189487284, "grad_norm": 0.9708433747291565, "learning_rate": 4.6062644623088243e-07, "logits/chosen": -0.01124420203268528, "logits/rejected": NaN, "logps/chosen": -180.078125, "logps/rejected": -792.7000122070312, "loss": 0.0781, "rewards/accuracies": 0.984375, "rewards/chosen": -3.2111778259277344, "rewards/margins": 23.903905868530273, "rewards/rejected": -27.1171875, "step": 3130 }, { "epoch": 0.7916049538335487, "grad_norm": 57.568321228027344, "learning_rate": 4.602304300925793e-07, "logits/chosen": 0.08931274712085724, "logits/rejected": NaN, "logps/chosen": -168.71875, "logps/rejected": -785.5, "loss": 0.1497, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.766894578933716, "rewards/margins": 22.413280487060547, "rewards/rejected": -25.186717987060547, "step": 3140 }, { "epoch": 0.7941259887183689, "grad_norm": 200.3190155029297, "learning_rate": 4.598326043721928e-07, "logits/chosen": 0.1144176498055458, "logits/rejected": NaN, "logps/chosen": -166.84375, "logps/rejected": -745.0499877929688, "loss": 0.0363, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9968292713165283, "rewards/margins": 21.6328125, "rewards/rejected": -24.622655868530273, "step": 3150 }, { "epoch": 0.7966470236031891, "grad_norm": 0.28851839900016785, "learning_rate": 4.594329724940529e-07, "logits/chosen": 0.265756219625473, "logits/rejected": NaN, "logps/chosen": -159.0625, "logps/rejected": -787.4500122070312, "loss": 0.0741, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.466198682785034, "rewards/margins": 24.001171112060547, "rewards/rejected": -27.4609375, "step": 3160 }, { "epoch": 0.7991680584880093, "grad_norm": 76.99720001220703, "learning_rate": 4.5903153789803573e-07, "logits/chosen": 0.17586669325828552, "logits/rejected": 0.412954717874527, "logps/chosen": -169.2937469482422, "logps/rejected": -825.6500244140625, "loss": 0.0859, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.436929225921631, "rewards/margins": 25.028905868530273, "rewards/rejected": -28.479686737060547, "step": 3170 }, { "epoch": 0.8016890933728296, "grad_norm": 0.24017824232578278, "learning_rate": 4.5862830403953475e-07, "logits/chosen": 0.2032173126935959, "logits/rejected": NaN, "logps/chosen": -161.47030639648438, "logps/rejected": -780.4249877929688, "loss": 0.0612, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0895142555236816, "rewards/margins": 24.5625, "rewards/rejected": -27.662500381469727, "step": 3180 }, { "epoch": 0.8042101282576498, "grad_norm": 227.49537658691406, "learning_rate": 4.5822327438943076e-07, "logits/chosen": 0.03833312913775444, "logits/rejected": NaN, "logps/chosen": -171.1796875, "logps/rejected": -762.0499877929688, "loss": 0.0631, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.0060181617736816, "rewards/margins": 22.481250762939453, "rewards/rejected": -25.482030868530273, "step": 3190 }, { "epoch": 0.80673116314247, "grad_norm": 0.366372674703598, "learning_rate": 4.578164524340622e-07, "logits/chosen": 0.08175964653491974, "logits/rejected": 0.28181761503219604, "logps/chosen": -165.79531860351562, "logps/rejected": -784.9249877929688, "loss": 0.1956, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.695452928543091, "rewards/margins": 21.517578125, "rewards/rejected": -24.217187881469727, "step": 3200 }, { "epoch": 0.8092521980272902, "grad_norm": 2.3993303775787354, "learning_rate": 4.5740784167519465e-07, "logits/chosen": 0.05980835109949112, "logits/rejected": 0.08739013969898224, "logps/chosen": -149.0234375, "logps/rejected": -753.9749755859375, "loss": 0.0356, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.5335631370544434, "rewards/margins": 21.067577362060547, "rewards/rejected": -23.598438262939453, "step": 3210 }, { "epoch": 0.8117732329121105, "grad_norm": 89.7872543334961, "learning_rate": 4.569974456299913e-07, "logits/chosen": -0.09253539890050888, "logits/rejected": NaN, "logps/chosen": -171.85311889648438, "logps/rejected": -769.875, "loss": 0.0469, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.51513671875, "rewards/margins": 21.6328125, "rewards/rejected": -25.14453125, "step": 3220 }, { "epoch": 0.8142942677969306, "grad_norm": 0.6585999727249146, "learning_rate": 4.565852678309823e-07, "logits/chosen": -0.21192626655101776, "logits/rejected": NaN, "logps/chosen": -178.8156280517578, "logps/rejected": -790.1500244140625, "loss": 0.0965, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.38079833984375, "rewards/margins": 23.785938262939453, "rewards/rejected": -27.172657012939453, "step": 3230 }, { "epoch": 0.8168153026817508, "grad_norm": 24.303728103637695, "learning_rate": 4.561713118260343e-07, "logits/chosen": -0.13788756728172302, "logits/rejected": NaN, "logps/chosen": -166.24374389648438, "logps/rejected": -751.875, "loss": 0.0784, "rewards/accuracies": 0.984375, "rewards/chosen": -2.3538146018981934, "rewards/margins": 20.5859375, "rewards/rejected": -22.9453125, "step": 3240 }, { "epoch": 0.8193363375665711, "grad_norm": 77.9454574584961, "learning_rate": 4.5575558117832035e-07, "logits/chosen": -0.023935699835419655, "logits/rejected": NaN, "logps/chosen": -166.80313110351562, "logps/rejected": -784.375, "loss": 0.0152, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.2194762229919434, "rewards/margins": 23.337499618530273, "rewards/rejected": -26.560937881469727, "step": 3250 }, { "epoch": 0.8218573724513913, "grad_norm": 102.30642700195312, "learning_rate": 4.5533807946628875e-07, "logits/chosen": -0.13592529296875, "logits/rejected": NaN, "logps/chosen": -194.85311889648438, "logps/rejected": -843.3499755859375, "loss": 0.0541, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.361718654632568, "rewards/margins": 26.057031631469727, "rewards/rejected": -30.428125381469727, "step": 3260 }, { "epoch": 0.8243784073362115, "grad_norm": 194.23785400390625, "learning_rate": 4.5491881028363245e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -177.94375610351562, "logps/rejected": -853.1500244140625, "loss": 0.1312, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.089501857757568, "rewards/margins": 28.16796875, "rewards/rejected": -32.271095275878906, "step": 3270 }, { "epoch": 0.8268994422210317, "grad_norm": 167.15924072265625, "learning_rate": 4.5449777723925804e-07, "logits/chosen": 0.038421630859375, "logits/rejected": NaN, "logps/chosen": -168.78750610351562, "logps/rejected": -824.5250244140625, "loss": 0.1438, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.3435301780700684, "rewards/margins": 26.422657012939453, "rewards/rejected": -29.7734375, "step": 3280 }, { "epoch": 0.829420477105852, "grad_norm": 0.13043469190597534, "learning_rate": 4.5407498395725487e-07, "logits/chosen": 0.03316802904009819, "logits/rejected": 0.45424652099609375, "logps/chosen": -184.82656860351562, "logps/rejected": -820.75, "loss": 0.0712, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.256762504577637, "rewards/margins": 27.063282012939453, "rewards/rejected": -31.302343368530273, "step": 3290 }, { "epoch": 0.8319415119906721, "grad_norm": 0.019670505076646805, "learning_rate": 4.5365043407686377e-07, "logits/chosen": -0.04816436767578125, "logits/rejected": NaN, "logps/chosen": -170.23593139648438, "logps/rejected": -812.2999877929688, "loss": 0.1342, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9949707984924316, "rewards/margins": 25.999217987060547, "rewards/rejected": -29.98828125, "step": 3300 }, { "epoch": 0.8344625468754924, "grad_norm": 101.5582275390625, "learning_rate": 4.532241312524455e-07, "logits/chosen": 0.04168701171875, "logits/rejected": 0.1060638427734375, "logps/chosen": -160.4812469482422, "logps/rejected": -786.7999877929688, "loss": 0.0352, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1637024879455566, "rewards/margins": 24.12890625, "rewards/rejected": -27.292186737060547, "step": 3310 }, { "epoch": 0.8369835817603126, "grad_norm": 120.95923614501953, "learning_rate": 4.5279607915344975e-07, "logits/chosen": -0.13288268446922302, "logits/rejected": NaN, "logps/chosen": -190.125, "logps/rejected": -807.5499877929688, "loss": 0.1317, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.440185546875, "rewards/margins": 24.060155868530273, "rewards/rejected": -27.49609375, "step": 3320 }, { "epoch": 0.8395046166451329, "grad_norm": 23.875253677368164, "learning_rate": 4.5236628146438293e-07, "logits/chosen": -0.15766295790672302, "logits/rejected": NaN, "logps/chosen": -151.61483764648438, "logps/rejected": -748.5, "loss": 0.0438, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.3003478050231934, "rewards/margins": 22.939453125, "rewards/rejected": -25.243749618530273, "step": 3330 }, { "epoch": 0.842025651529953, "grad_norm": 0.22043640911579132, "learning_rate": 4.519347418847771e-07, "logits/chosen": -0.17631149291992188, "logits/rejected": NaN, "logps/chosen": -168.875, "logps/rejected": -811.4249877929688, "loss": 0.0581, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.170214891433716, "rewards/margins": 25.538280487060547, "rewards/rejected": -28.715625762939453, "step": 3340 }, { "epoch": 0.8445466864147733, "grad_norm": 189.6812744140625, "learning_rate": 4.515014641291578e-07, "logits/chosen": -0.18549804389476776, "logits/rejected": 0.24316100776195526, "logps/chosen": -184.83749389648438, "logps/rejected": -763.9749755859375, "loss": 0.1344, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.431121826171875, "rewards/margins": 23.27734375, "rewards/rejected": -27.704687118530273, "step": 3350 }, { "epoch": 0.8470677212995935, "grad_norm": 87.4990463256836, "learning_rate": 4.51066451927012e-07, "logits/chosen": -0.13362732529640198, "logits/rejected": NaN, "logps/chosen": -167.78280639648438, "logps/rejected": -763.7000122070312, "loss": 0.0402, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8543992042541504, "rewards/margins": 22.460155487060547, "rewards/rejected": -25.306249618530273, "step": 3360 }, { "epoch": 0.8495887561844137, "grad_norm": 303.5073547363281, "learning_rate": 4.506297090227561e-07, "logits/chosen": -0.0531463623046875, "logits/rejected": 0.05371551588177681, "logps/chosen": -150.6062469482422, "logps/rejected": -725.9000244140625, "loss": 0.2576, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.4186644554138184, "rewards/margins": 20.242578506469727, "rewards/rejected": -22.655467987060547, "step": 3370 }, { "epoch": 0.8521097910692339, "grad_norm": 0.05834820121526718, "learning_rate": 4.50191239175704e-07, "logits/chosen": -0.18189087510108948, "logits/rejected": NaN, "logps/chosen": -162.17813110351562, "logps/rejected": -758.6500244140625, "loss": 0.1824, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.418926954269409, "rewards/margins": 21.329296112060547, "rewards/rejected": -23.744922637939453, "step": 3380 }, { "epoch": 0.8546308259540542, "grad_norm": 198.28887939453125, "learning_rate": 4.497510461600341e-07, "logits/chosen": -0.18048706650733948, "logits/rejected": NaN, "logps/chosen": -160.71249389648438, "logps/rejected": -779.25, "loss": 0.1721, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0776610374450684, "rewards/margins": 23.221288681030273, "rewards/rejected": -26.296875, "step": 3390 }, { "epoch": 0.8571518608388744, "grad_norm": 147.4434814453125, "learning_rate": 4.493091337647574e-07, "logits/chosen": -0.17929382622241974, "logits/rejected": NaN, "logps/chosen": -186.82186889648438, "logps/rejected": -807.0250244140625, "loss": 0.1125, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.476855278015137, "rewards/margins": 24.260156631469727, "rewards/rejected": -28.746875762939453, "step": 3400 }, { "epoch": 0.8596728957236945, "grad_norm": 303.27032470703125, "learning_rate": 4.4886550579368445e-07, "logits/chosen": 0.01414337195456028, "logits/rejected": NaN, "logps/chosen": -166.22811889648438, "logps/rejected": -754.375, "loss": 0.1372, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.022936820983887, "rewards/margins": 24.953125, "rewards/rejected": -28.975000381469727, "step": 3410 }, { "epoch": 0.8621939306085148, "grad_norm": 0.7643707394599915, "learning_rate": 4.48420166065393e-07, "logits/chosen": -0.02941131591796875, "logits/rejected": NaN, "logps/chosen": -157.68911743164062, "logps/rejected": -775.4249877929688, "loss": 0.1308, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -2.866119384765625, "rewards/margins": 25.0546875, "rewards/rejected": -27.928905487060547, "step": 3420 }, { "epoch": 0.864714965493335, "grad_norm": 57.520355224609375, "learning_rate": 4.4797311841319494e-07, "logits/chosen": -0.1488296538591385, "logits/rejected": NaN, "logps/chosen": -170.9375, "logps/rejected": -806.1749877929688, "loss": 0.1082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.704144239425659, "rewards/margins": 24.964061737060547, "rewards/rejected": -28.673437118530273, "step": 3430 }, { "epoch": 0.8672360003781552, "grad_norm": 0.13432997465133667, "learning_rate": 4.4752436668510315e-07, "logits/chosen": 0.05154876783490181, "logits/rejected": 0.38385772705078125, "logps/chosen": -165.6796875, "logps/rejected": -787.0250244140625, "loss": 0.0622, "rewards/accuracies": 0.984375, "rewards/chosen": -4.180737495422363, "rewards/margins": 24.482030868530273, "rewards/rejected": -28.666406631469727, "step": 3440 }, { "epoch": 0.8697570352629754, "grad_norm": 28.056047439575195, "learning_rate": 4.4707391474379864e-07, "logits/chosen": -0.07776336371898651, "logits/rejected": 0.20037230849266052, "logps/chosen": -184.4140625, "logps/rejected": -807.7999877929688, "loss": 0.0625, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -4.61187744140625, "rewards/margins": 23.123437881469727, "rewards/rejected": -27.739063262939453, "step": 3450 }, { "epoch": 0.8722780701477957, "grad_norm": 247.884765625, "learning_rate": 4.4662176646659716e-07, "logits/chosen": 0.2542739808559418, "logits/rejected": 0.4799400269985199, "logps/chosen": -188.390625, "logps/rejected": -788.7999877929688, "loss": 0.2135, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.933203220367432, "rewards/margins": 23.666601181030273, "rewards/rejected": -29.600780487060547, "step": 3460 }, { "epoch": 0.8747991050326159, "grad_norm": 0.003803750267252326, "learning_rate": 4.4616792574541596e-07, "logits/chosen": 0.03205566480755806, "logits/rejected": NaN, "logps/chosen": -172.1414031982422, "logps/rejected": -814.7750244140625, "loss": 0.1753, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.055951118469238, "rewards/margins": 25.676563262939453, "rewards/rejected": -29.735157012939453, "step": 3470 }, { "epoch": 0.8773201399174361, "grad_norm": 0.417082279920578, "learning_rate": 4.4571239648674007e-07, "logits/chosen": -0.09753875434398651, "logits/rejected": NaN, "logps/chosen": -165.25155639648438, "logps/rejected": -753.375, "loss": 0.0754, "rewards/accuracies": 0.984375, "rewards/chosen": -2.791491746902466, "rewards/margins": 21.584375381469727, "rewards/rejected": -24.38671875, "step": 3480 }, { "epoch": 0.8798411748022563, "grad_norm": 228.15257263183594, "learning_rate": 4.4525518261158886e-07, "logits/chosen": -0.12496642768383026, "logits/rejected": NaN, "logps/chosen": -170.44375610351562, "logps/rejected": -785.875, "loss": 0.0692, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.326184034347534, "rewards/margins": 23.293750762939453, "rewards/rejected": -26.616405487060547, "step": 3490 }, { "epoch": 0.8823622096870766, "grad_norm": 4.8524017333984375, "learning_rate": 4.4479628805548213e-07, "logits/chosen": 0.00971069373190403, "logits/rejected": NaN, "logps/chosen": -176.61563110351562, "logps/rejected": -807.375, "loss": 0.2019, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.987377882003784, "rewards/margins": 23.899999618530273, "rewards/rejected": -27.892969131469727, "step": 3500 }, { "epoch": 0.8848832445718967, "grad_norm": 8.82070255279541, "learning_rate": 4.443357167684065e-07, "logits/chosen": -0.1063385009765625, "logits/rejected": NaN, "logps/chosen": -164.38125610351562, "logps/rejected": -757.3250122070312, "loss": 0.1209, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1664795875549316, "rewards/margins": 22.696094512939453, "rewards/rejected": -25.860937118530273, "step": 3510 }, { "epoch": 0.887404279456717, "grad_norm": 18.90492057800293, "learning_rate": 4.4387347271478115e-07, "logits/chosen": -0.20375671982765198, "logits/rejected": NaN, "logps/chosen": -163.46719360351562, "logps/rejected": -719.0499877929688, "loss": 0.0902, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.546722412109375, "rewards/margins": 19.819530487060547, "rewards/rejected": -22.365625381469727, "step": 3520 }, { "epoch": 0.8899253143415372, "grad_norm": 134.02085876464844, "learning_rate": 4.4340955987342375e-07, "logits/chosen": -0.17965392768383026, "logits/rejected": NaN, "logps/chosen": -167.1531219482422, "logps/rejected": -745.7000122070312, "loss": 0.145, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.653521776199341, "rewards/margins": 19.7890625, "rewards/rejected": -22.443750381469727, "step": 3530 }, { "epoch": 0.8924463492263575, "grad_norm": 0.024332895874977112, "learning_rate": 4.4294398223751626e-07, "logits/chosen": -0.16735228896141052, "logits/rejected": NaN, "logps/chosen": -152.81094360351562, "logps/rejected": -762.6500244140625, "loss": 0.0289, "rewards/accuracies": 0.984375, "rewards/chosen": -2.1810364723205566, "rewards/margins": 21.825000762939453, "rewards/rejected": -23.998437881469727, "step": 3540 }, { "epoch": 0.8949673841111776, "grad_norm": 0.22893188893795013, "learning_rate": 4.424767438145707e-07, "logits/chosen": 0.08214416354894638, "logits/rejected": NaN, "logps/chosen": -180.421875, "logps/rejected": -817.9500122070312, "loss": 0.0891, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6898086071014404, "rewards/margins": 25.599218368530273, "rewards/rejected": -29.292186737060547, "step": 3550 }, { "epoch": 0.8974884189959978, "grad_norm": 49.47077178955078, "learning_rate": 4.4200784862639415e-07, "logits/chosen": -0.06610260158777237, "logits/rejected": NaN, "logps/chosen": -191.109375, "logps/rejected": -798.7999877929688, "loss": 0.129, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.435320854187012, "rewards/margins": 23.994531631469727, "rewards/rejected": -28.430469512939453, "step": 3560 }, { "epoch": 0.9000094538808181, "grad_norm": 19.688636779785156, "learning_rate": 4.4153730070905485e-07, "logits/chosen": -0.02271118201315403, "logits/rejected": 0.3879905641078949, "logps/chosen": -171.96249389648438, "logps/rejected": -782.7000122070312, "loss": 0.126, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.9275450706481934, "rewards/margins": 22.907421112060547, "rewards/rejected": -26.84375, "step": 3570 }, { "epoch": 0.9025304887656382, "grad_norm": 0.15736792981624603, "learning_rate": 4.41065104112847e-07, "logits/chosen": -0.12029876559972763, "logits/rejected": NaN, "logps/chosen": -169.24374389648438, "logps/rejected": -776.375, "loss": 0.04, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1900634765625, "rewards/margins": 22.513280868530273, "rewards/rejected": -25.701562881469727, "step": 3580 }, { "epoch": 0.9050515236504585, "grad_norm": 54.832523345947266, "learning_rate": 4.4059126290225577e-07, "logits/chosen": -0.02287902869284153, "logits/rejected": NaN, "logps/chosen": -172.3937530517578, "logps/rejected": -783.125, "loss": 0.023, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.892468214035034, "rewards/margins": 22.998437881469727, "rewards/rejected": -25.8984375, "step": 3590 }, { "epoch": 0.9075725585352787, "grad_norm": 0.0831882581114769, "learning_rate": 4.401157811559228e-07, "logits/chosen": 0.08169250190258026, "logits/rejected": NaN, "logps/chosen": -193.2734375, "logps/rejected": -795.8499755859375, "loss": 0.1347, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.204992771148682, "rewards/margins": 24.403125762939453, "rewards/rejected": -28.609375, "step": 3600 }, { "epoch": 0.910093593420099, "grad_norm": 44.05506134033203, "learning_rate": 4.3963866296661087e-07, "logits/chosen": -0.06558837741613388, "logits/rejected": NaN, "logps/chosen": -182.4093780517578, "logps/rejected": -823.125, "loss": 0.1957, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.6355347633361816, "rewards/margins": 25.649219512939453, "rewards/rejected": -29.278905868530273, "step": 3610 }, { "epoch": 0.9126146283049191, "grad_norm": 145.77874755859375, "learning_rate": 4.3915991244116813e-07, "logits/chosen": 0.07108459621667862, "logits/rejected": 0.49142152070999146, "logps/chosen": -163.7062530517578, "logps/rejected": -768.7750244140625, "loss": 0.1147, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.2437987327575684, "rewards/margins": 23.572265625, "rewards/rejected": -26.807031631469727, "step": 3620 }, { "epoch": 0.9151356631897394, "grad_norm": 48.47760772705078, "learning_rate": 4.386795337004939e-07, "logits/chosen": 0.16656494140625, "logits/rejected": NaN, "logps/chosen": -185.8312530517578, "logps/rejected": -798.5, "loss": 0.1979, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -4.761645317077637, "rewards/margins": 23.952342987060547, "rewards/rejected": -28.713281631469727, "step": 3630 }, { "epoch": 0.9176566980745596, "grad_norm": 29.035720825195312, "learning_rate": 4.3819753087950214e-07, "logits/chosen": 0.293344110250473, "logits/rejected": NaN, "logps/chosen": -205.82186889648438, "logps/rejected": -795.25, "loss": 0.2134, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.575097560882568, "rewards/margins": 22.833593368530273, "rewards/rejected": -28.421092987060547, "step": 3640 }, { "epoch": 0.9201777329593799, "grad_norm": 244.97378540039062, "learning_rate": 4.377139081270863e-07, "logits/chosen": 0.164448544383049, "logits/rejected": NaN, "logps/chosen": -165.1999969482422, "logps/rejected": -774.5, "loss": 0.0853, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.878466844558716, "rewards/margins": 22.946483612060547, "rewards/rejected": -26.825780868530273, "step": 3650 }, { "epoch": 0.9226987678442, "grad_norm": 2.5043134689331055, "learning_rate": 4.372286696060838e-07, "logits/chosen": 0.3017120361328125, "logits/rejected": NaN, "logps/chosen": -167.2609405517578, "logps/rejected": -801.25, "loss": 0.1095, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.7817139625549316, "rewards/margins": 24.422657012939453, "rewards/rejected": -28.192968368530273, "step": 3660 }, { "epoch": 0.9252198027290203, "grad_norm": 136.25830078125, "learning_rate": 4.367418194932397e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -165.53125, "logps/rejected": -849.6500244140625, "loss": 0.0544, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.80560302734375, "rewards/margins": 27.100000381469727, "rewards/rejected": -30.904687881469727, "step": 3670 }, { "epoch": 0.9277408376138405, "grad_norm": 93.02103424072266, "learning_rate": 4.362533619791711e-07, "logits/chosen": 0.12004394829273224, "logits/rejected": NaN, "logps/chosen": -164.4187469482422, "logps/rejected": -819.7000122070312, "loss": 0.1626, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.88555908203125, "rewards/margins": 26.717187881469727, "rewards/rejected": -30.612499237060547, "step": 3680 }, { "epoch": 0.9302618724986607, "grad_norm": 193.71022033691406, "learning_rate": 4.3576330126833113e-07, "logits/chosen": -0.01665801927447319, "logits/rejected": NaN, "logps/chosen": -167.31405639648438, "logps/rejected": -788.875, "loss": 0.0939, "rewards/accuracies": 0.96875, "rewards/chosen": -3.509936571121216, "rewards/margins": 25.72265625, "rewards/rejected": -29.228906631469727, "step": 3690 }, { "epoch": 0.9327829073834809, "grad_norm": 152.2213897705078, "learning_rate": 4.352716415789724e-07, "logits/chosen": -0.15761108696460724, "logits/rejected": NaN, "logps/chosen": -166.16250610351562, "logps/rejected": -783.5250244140625, "loss": 0.17, "rewards/accuracies": 0.96875, "rewards/chosen": -3.027374267578125, "rewards/margins": 23.313282012939453, "rewards/rejected": -26.339061737060547, "step": 3700 }, { "epoch": 0.9353039422683012, "grad_norm": 0.937143087387085, "learning_rate": 4.3477838714311104e-07, "logits/chosen": -0.1270301789045334, "logits/rejected": 0.309844970703125, "logps/chosen": -155.79061889648438, "logps/rejected": -779.7750244140625, "loss": 0.085, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.8750853538513184, "rewards/margins": 22.87890625, "rewards/rejected": -25.757030487060547, "step": 3710 }, { "epoch": 0.9378249771531214, "grad_norm": 221.4364471435547, "learning_rate": 4.342835422064902e-07, "logits/chosen": -0.11585541069507599, "logits/rejected": NaN, "logps/chosen": -175.44686889648438, "logps/rejected": -793.375, "loss": 0.0491, "rewards/accuracies": 0.984375, "rewards/chosen": -3.406780958175659, "rewards/margins": 22.118749618530273, "rewards/rejected": -25.523828506469727, "step": 3720 }, { "epoch": 0.9403460120379415, "grad_norm": 83.56561279296875, "learning_rate": 4.337871110285432e-07, "logits/chosen": -0.12515564262866974, "logits/rejected": 0.15092162787914276, "logps/chosen": -181.8000030517578, "logps/rejected": -776.2000122070312, "loss": 0.155, "rewards/accuracies": 0.953125, "rewards/chosen": -3.958544969558716, "rewards/margins": 21.714452743530273, "rewards/rejected": -25.674999237060547, "step": 3730 }, { "epoch": 0.9428670469227618, "grad_norm": 90.9571762084961, "learning_rate": 4.3328909788235734e-07, "logits/chosen": -0.046141814440488815, "logits/rejected": NaN, "logps/chosen": -184.265625, "logps/rejected": -785.6749877929688, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8492431640625, "rewards/margins": 22.220312118530273, "rewards/rejected": -26.069530487060547, "step": 3740 }, { "epoch": 0.945388081807582, "grad_norm": 180.38621520996094, "learning_rate": 4.327895070546369e-07, "logits/chosen": -0.0221099853515625, "logits/rejected": 0.2549079954624176, "logps/chosen": -183.6906280517578, "logps/rejected": -800.2000122070312, "loss": 0.0401, "rewards/accuracies": 0.984375, "rewards/chosen": -3.7647461891174316, "rewards/margins": 24.067188262939453, "rewards/rejected": -27.829687118530273, "step": 3750 }, { "epoch": 0.9479091166924022, "grad_norm": 0.40417489409446716, "learning_rate": 4.3228834284566603e-07, "logits/chosen": -0.03294067457318306, "logits/rejected": NaN, "logps/chosen": -177.6218719482422, "logps/rejected": -800.4749755859375, "loss": 0.0431, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.991839647293091, "rewards/margins": 24.715625762939453, "rewards/rejected": -28.698436737060547, "step": 3760 }, { "epoch": 0.9504301515772224, "grad_norm": 260.8210754394531, "learning_rate": 4.3178560956927203e-07, "logits/chosen": 0.00822296179831028, "logits/rejected": NaN, "logps/chosen": -193.68124389648438, "logps/rejected": -797.7999877929688, "loss": 0.1067, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.832958936691284, "rewards/margins": 25.244531631469727, "rewards/rejected": -29.065624237060547, "step": 3770 }, { "epoch": 0.9529511864620427, "grad_norm": 212.8404083251953, "learning_rate": 4.3128131155278823e-07, "logits/chosen": 0.10973358154296875, "logits/rejected": 0.276846319437027, "logps/chosen": -165.0968780517578, "logps/rejected": -830.625, "loss": 0.1031, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.250445604324341, "rewards/margins": 26.317968368530273, "rewards/rejected": -29.55078125, "step": 3780 }, { "epoch": 0.9554722213468629, "grad_norm": 0.007663200609385967, "learning_rate": 4.3077545313701655e-07, "logits/chosen": 0.1707405149936676, "logits/rejected": NaN, "logps/chosen": -157.0281219482422, "logps/rejected": -858.8250122070312, "loss": 0.0563, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.380664110183716, "rewards/margins": 27.874608993530273, "rewards/rejected": -31.248437881469727, "step": 3790 }, { "epoch": 0.9579932562316831, "grad_norm": 38.79426574707031, "learning_rate": 4.3026803867619014e-07, "logits/chosen": 0.2537856996059418, "logits/rejected": NaN, "logps/chosen": -197.69686889648438, "logps/rejected": -863.6500244140625, "loss": 0.1872, "rewards/accuracies": 0.96875, "rewards/chosen": -5.405566215515137, "rewards/margins": 29.246875762939453, "rewards/rejected": -34.67499923706055, "step": 3800 }, { "epoch": 0.9605142911165033, "grad_norm": 18.640867233276367, "learning_rate": 4.297590725379362e-07, "logits/chosen": 0.1638946533203125, "logits/rejected": 0.5495773553848267, "logps/chosen": -187.3406219482422, "logps/rejected": -848.6500244140625, "loss": 0.2465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.835302829742432, "rewards/margins": 29.175390243530273, "rewards/rejected": -34.006248474121094, "step": 3810 }, { "epoch": 0.9630353260013236, "grad_norm": 0.36507317423820496, "learning_rate": 4.292485591032379e-07, "logits/chosen": -0.02750549279153347, "logits/rejected": NaN, "logps/chosen": -173.15469360351562, "logps/rejected": -815.0999755859375, "loss": 0.1582, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.603076219558716, "rewards/margins": 26.565624237060547, "rewards/rejected": -30.153905868530273, "step": 3820 }, { "epoch": 0.9655563608861437, "grad_norm": 15.062064170837402, "learning_rate": 4.287365027663972e-07, "logits/chosen": 0.19913025200366974, "logits/rejected": 0.4940185546875, "logps/chosen": -161.0625, "logps/rejected": -822.2999877929688, "loss": 0.1819, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.5314574241638184, "rewards/margins": 26.350780487060547, "rewards/rejected": -29.879688262939453, "step": 3830 }, { "epoch": 0.968077395770964, "grad_norm": 149.0153045654297, "learning_rate": 4.2822290793499654e-07, "logits/chosen": 0.27763062715530396, "logits/rejected": 0.40827637910842896, "logps/chosen": -178.00625610351562, "logps/rejected": -844.2999877929688, "loss": 0.0829, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.284765720367432, "rewards/margins": 27.110937118530273, "rewards/rejected": -31.384374618530273, "step": 3840 }, { "epoch": 0.9705984306557842, "grad_norm": 0.10489475727081299, "learning_rate": 4.2770777902986125e-07, "logits/chosen": 0.15451660752296448, "logits/rejected": NaN, "logps/chosen": -175.1906280517578, "logps/rejected": -842.8499755859375, "loss": 0.1193, "rewards/accuracies": 0.96875, "rewards/chosen": -4.088702201843262, "rewards/margins": 27.164453506469727, "rewards/rejected": -31.25, "step": 3850 }, { "epoch": 0.9731194655406045, "grad_norm": 0.0433906689286232, "learning_rate": 4.271911204850214e-07, "logits/chosen": 0.038425445556640625, "logits/rejected": NaN, "logps/chosen": -177.16561889648438, "logps/rejected": -842.2000122070312, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.581738233566284, "rewards/margins": 27.703125, "rewards/rejected": -31.286718368530273, "step": 3860 }, { "epoch": 0.9756405004254246, "grad_norm": 201.478515625, "learning_rate": 4.2667293674767345e-07, "logits/chosen": 0.137165829539299, "logits/rejected": 0.6140381097793579, "logps/chosen": -158.13436889648438, "logps/rejected": -816.625, "loss": 0.0721, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.608935594558716, "rewards/margins": 27.828125, "rewards/rejected": -31.446874618530273, "step": 3870 }, { "epoch": 0.9781615353102449, "grad_norm": 144.99913024902344, "learning_rate": 4.261532322781424e-07, "logits/chosen": 0.218536376953125, "logits/rejected": NaN, "logps/chosen": -180.453125, "logps/rejected": -815.2750244140625, "loss": 0.2858, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -4.072100639343262, "rewards/margins": 27.000782012939453, "rewards/rejected": -31.065624237060547, "step": 3880 }, { "epoch": 0.9806825701950651, "grad_norm": 1.2665526866912842, "learning_rate": 4.256320115498427e-07, "logits/chosen": 0.10284729301929474, "logits/rejected": 0.5178176760673523, "logps/chosen": -175.52188110351562, "logps/rejected": -806.9000244140625, "loss": 0.09, "rewards/accuracies": 0.984375, "rewards/chosen": -4.277194023132324, "rewards/margins": 25.2890625, "rewards/rejected": -29.556249618530273, "step": 3890 }, { "epoch": 0.9832036050798852, "grad_norm": 15.207539558410645, "learning_rate": 4.251092790492407e-07, "logits/chosen": 0.163981631398201, "logits/rejected": NaN, "logps/chosen": -196.63125610351562, "logps/rejected": -811.1749877929688, "loss": 0.0783, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.076605319976807, "rewards/margins": 24.635156631469727, "rewards/rejected": -28.728124618530273, "step": 3900 }, { "epoch": 0.9857246399647055, "grad_norm": 165.52328491210938, "learning_rate": 4.24585039275815e-07, "logits/chosen": 0.09450531005859375, "logits/rejected": 0.5258575677871704, "logps/chosen": -178.5906219482422, "logps/rejected": -793.5999755859375, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.347369432449341, "rewards/margins": 23.265625, "rewards/rejected": -26.614843368530273, "step": 3910 }, { "epoch": 0.9882456748495257, "grad_norm": 66.81855773925781, "learning_rate": 4.2405929674201853e-07, "logits/chosen": 0.258139044046402, "logits/rejected": NaN, "logps/chosen": -154.39999389648438, "logps/rejected": -746.75, "loss": 0.0648, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9149718284606934, "rewards/margins": 22.609375, "rewards/rejected": -25.520313262939453, "step": 3920 }, { "epoch": 0.990766709734346, "grad_norm": 33.075557708740234, "learning_rate": 4.2353205597323924e-07, "logits/chosen": 0.23624840378761292, "logits/rejected": NaN, "logps/chosen": -163.7421875, "logps/rejected": -829.1749877929688, "loss": 0.034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8348145484924316, "rewards/margins": 27.014062881469727, "rewards/rejected": -30.846874237060547, "step": 3930 }, { "epoch": 0.9932877446191661, "grad_norm": 0.050511281937360764, "learning_rate": 4.230033215077615e-07, "logits/chosen": 0.36613768339157104, "logits/rejected": NaN, "logps/chosen": -175.99063110351562, "logps/rejected": -842.3499755859375, "loss": 0.0611, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.204247951507568, "rewards/margins": 27.87109375, "rewards/rejected": -32.072654724121094, "step": 3940 }, { "epoch": 0.9958087795039864, "grad_norm": 0.7001527547836304, "learning_rate": 4.2247309789672663e-07, "logits/chosen": 0.16365966200828552, "logits/rejected": NaN, "logps/chosen": -181.0343780517578, "logps/rejected": -833.4000244140625, "loss": 0.0736, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.459069728851318, "rewards/margins": 27.760156631469727, "rewards/rejected": -32.21875, "step": 3950 }, { "epoch": 0.9983298143888066, "grad_norm": 106.69965362548828, "learning_rate": 4.2194138970409406e-07, "logits/chosen": 0.2620697021484375, "logits/rejected": 0.515704333782196, "logps/chosen": -179.8171844482422, "logps/rejected": -858.5250244140625, "loss": 0.1218, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.89501953125, "rewards/margins": 29.200000762939453, "rewards/rejected": -33.09375, "step": 3960 }, { "epoch": 1.0010084139539281, "grad_norm": 0.000635700358543545, "learning_rate": 4.214082015066019e-07, "logits/chosen": 0.133543461561203, "logits/rejected": NaN, "logps/chosen": -178.40475463867188, "logps/rejected": -805.1904907226562, "loss": 0.1003, "rewards/accuracies": 0.976190447807312, "rewards/chosen": -3.6435372829437256, "rewards/margins": 27.350446701049805, "rewards/rejected": -30.997024536132812, "step": 3970 }, { "epoch": 1.0035294488387483, "grad_norm": 0.23930050432682037, "learning_rate": 4.208735378937275e-07, "logits/chosen": 0.03572387620806694, "logits/rejected": 0.3430236876010895, "logps/chosen": -175.19686889648438, "logps/rejected": -896.4000244140625, "loss": 0.0047, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.4676513671875, "rewards/margins": 31.973438262939453, "rewards/rejected": -35.44218826293945, "step": 3980 }, { "epoch": 1.0060504837235684, "grad_norm": 0.2289968729019165, "learning_rate": 4.203374034676481e-07, "logits/chosen": 0.21469421684741974, "logits/rejected": 0.6227477788925171, "logps/chosen": -181.609375, "logps/rejected": -871.5750122070312, "loss": 0.0148, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.974212646484375, "rewards/margins": 30.5625, "rewards/rejected": -34.54375076293945, "step": 3990 }, { "epoch": 1.0085715186083888, "grad_norm": 0.003554339287802577, "learning_rate": 4.1979980284320103e-07, "logits/chosen": 0.5256263613700867, "logits/rejected": NaN, "logps/chosen": -166.4718780517578, "logps/rejected": -851.8499755859375, "loss": 0.0316, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.674151659011841, "rewards/margins": 31.405467987060547, "rewards/rejected": -35.08124923706055, "step": 4000 }, { "epoch": 1.011092553493209, "grad_norm": 7.418168544769287, "learning_rate": 4.1926074064784436e-07, "logits/chosen": 0.5824691653251648, "logits/rejected": NaN, "logps/chosen": -200.50625610351562, "logps/rejected": -869.2750244140625, "loss": 0.004, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.156396389007568, "rewards/margins": 30.1015625, "rewards/rejected": -35.2578125, "step": 4010 }, { "epoch": 1.0136135883780293, "grad_norm": 3.5604536533355713, "learning_rate": 4.187202215216163e-07, "logits/chosen": 0.42318421602249146, "logits/rejected": 0.8826843500137329, "logps/chosen": -167.36874389648438, "logps/rejected": -923.7750244140625, "loss": 0.019, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.045007228851318, "rewards/margins": 34.59453201293945, "rewards/rejected": -38.625, "step": 4020 }, { "epoch": 1.0161346232628494, "grad_norm": 179.42161560058594, "learning_rate": 4.181782501170964e-07, "logits/chosen": 0.6177978515625, "logits/rejected": 1.1247466802597046, "logps/chosen": -169.03125, "logps/rejected": -863.875, "loss": 0.0375, "rewards/accuracies": 0.984375, "rewards/chosen": -4.639794826507568, "rewards/margins": 31.241405487060547, "rewards/rejected": -35.884376525878906, "step": 4030 }, { "epoch": 1.0186556581476696, "grad_norm": 131.1550750732422, "learning_rate": 4.1763483109936423e-07, "logits/chosen": 0.6712417602539062, "logits/rejected": NaN, "logps/chosen": -179.86874389648438, "logps/rejected": -839.1749877929688, "loss": 0.0629, "rewards/accuracies": 0.984375, "rewards/chosen": -4.646960258483887, "rewards/margins": 30.529687881469727, "rewards/rejected": -35.1875, "step": 4040 }, { "epoch": 1.02117669303249, "grad_norm": 120.60309600830078, "learning_rate": 4.1708996914596027e-07, "logits/chosen": 0.5758606195449829, "logits/rejected": 1.1029174327850342, "logps/chosen": -179.2312469482422, "logps/rejected": -891.0, "loss": 0.0316, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.954174995422363, "rewards/margins": 32.576560974121094, "rewards/rejected": -37.53593826293945, "step": 4050 }, { "epoch": 1.02369772791731, "grad_norm": 109.05376434326172, "learning_rate": 4.1654366894684505e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -178.66561889648438, "logps/rejected": -895.3499755859375, "loss": 0.0488, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.2899169921875, "rewards/margins": 33.13945388793945, "rewards/rejected": -38.43281173706055, "step": 4060 }, { "epoch": 1.0262187628021302, "grad_norm": 81.1658706665039, "learning_rate": 4.1599593520435897e-07, "logits/chosen": 0.4514106810092926, "logits/rejected": NaN, "logps/chosen": -171.6437530517578, "logps/rejected": -832.9749755859375, "loss": 0.0708, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.7774415016174316, "rewards/margins": 29.700000762939453, "rewards/rejected": -33.4765625, "step": 4070 }, { "epoch": 1.0287397976869506, "grad_norm": 0.20542721450328827, "learning_rate": 4.15446772633182e-07, "logits/chosen": 0.34791868925094604, "logits/rejected": NaN, "logps/chosen": -178.29061889648438, "logps/rejected": -871.8250122070312, "loss": 0.0149, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.585131883621216, "rewards/margins": 32.057029724121094, "rewards/rejected": -35.6484375, "step": 4080 }, { "epoch": 1.0312608325717707, "grad_norm": 23.866361618041992, "learning_rate": 4.148961859602925e-07, "logits/chosen": 0.48788756132125854, "logits/rejected": NaN, "logps/chosen": -182.1359405517578, "logps/rejected": -835.6500244140625, "loss": 0.0492, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.285754203796387, "rewards/margins": 29.356250762939453, "rewards/rejected": -33.65156173706055, "step": 4090 }, { "epoch": 1.0337818674565908, "grad_norm": 0.044678959995508194, "learning_rate": 4.1434417992492743e-07, "logits/chosen": 0.2315673828125, "logits/rejected": NaN, "logps/chosen": -187.5749969482422, "logps/rejected": -862.5999755859375, "loss": 0.0175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.777099609375, "rewards/margins": 30.840625762939453, "rewards/rejected": -34.610939025878906, "step": 4100 }, { "epoch": 1.0363029023414112, "grad_norm": 84.70132446289062, "learning_rate": 4.137907592785409e-07, "logits/chosen": 0.4444465637207031, "logits/rejected": NaN, "logps/chosen": -161.86093139648438, "logps/rejected": -846.25, "loss": 0.031, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4414260387420654, "rewards/margins": 30.72265625, "rewards/rejected": -34.173439025878906, "step": 4110 }, { "epoch": 1.0388239372262313, "grad_norm": 0.12329636514186859, "learning_rate": 4.132359287847632e-07, "logits/chosen": 0.3983825743198395, "logits/rejected": NaN, "logps/chosen": -179.1843719482422, "logps/rejected": -873.2750244140625, "loss": 0.0028, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9240050315856934, "rewards/margins": 31.814062118530273, "rewards/rejected": -35.72968673706055, "step": 4120 }, { "epoch": 1.0413449721110515, "grad_norm": 1.4486453533172607, "learning_rate": 4.1267969321936037e-07, "logits/chosen": 0.6985321044921875, "logits/rejected": NaN, "logps/chosen": -170.734375, "logps/rejected": -879.375, "loss": 0.0203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.422924995422363, "rewards/margins": 32.89531326293945, "rewards/rejected": -37.3203125, "step": 4130 }, { "epoch": 1.0438660069958718, "grad_norm": 0.591741681098938, "learning_rate": 4.1212205737019253e-07, "logits/chosen": 0.645739734172821, "logits/rejected": NaN, "logps/chosen": -196.3249969482422, "logps/rejected": -880.4500122070312, "loss": 0.042, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.76214599609375, "rewards/margins": 32.80859375, "rewards/rejected": -37.568748474121094, "step": 4140 }, { "epoch": 1.046387041880692, "grad_norm": 2.2050483226776123, "learning_rate": 4.115630260371731e-07, "logits/chosen": 0.8250648379325867, "logits/rejected": NaN, "logps/chosen": -191.38125610351562, "logps/rejected": -889.4000244140625, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.573681831359863, "rewards/margins": 34.02265548706055, "rewards/rejected": -39.59687423706055, "step": 4150 }, { "epoch": 1.0489080767655123, "grad_norm": 0.1204008162021637, "learning_rate": 4.110026040322271e-07, "logits/chosen": 0.9512237310409546, "logits/rejected": 1.4651305675506592, "logps/chosen": -195.2687530517578, "logps/rejected": -963.2249755859375, "loss": 0.047, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -6.373730659484863, "rewards/margins": 37.59062576293945, "rewards/rejected": -43.96875, "step": 4160 }, { "epoch": 1.0514291116503325, "grad_norm": 2.777338981628418, "learning_rate": 4.104407961792501e-07, "logits/chosen": 0.7632400393486023, "logits/rejected": NaN, "logps/chosen": -174.2687530517578, "logps/rejected": -933.75, "loss": 0.0725, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.1597900390625, "rewards/margins": 37.3203125, "rewards/rejected": -42.478126525878906, "step": 4170 }, { "epoch": 1.0539501465351526, "grad_norm": 154.81118774414062, "learning_rate": 4.0987760731406633e-07, "logits/chosen": 0.5916091799736023, "logits/rejected": NaN, "logps/chosen": -194.1125030517578, "logps/rejected": -904.0999755859375, "loss": 0.0849, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.793847560882568, "rewards/margins": 33.38984298706055, "rewards/rejected": -38.17109298706055, "step": 4180 }, { "epoch": 1.056471181419973, "grad_norm": 37.762664794921875, "learning_rate": 4.0931304228438737e-07, "logits/chosen": 0.584643542766571, "logits/rejected": 0.954852283000946, "logps/chosen": -172.4484405517578, "logps/rejected": -873.1500244140625, "loss": 0.0243, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.597558498382568, "rewards/margins": 30.53515625, "rewards/rejected": -35.13593673706055, "step": 4190 }, { "epoch": 1.0589922163047931, "grad_norm": 8.251177787780762, "learning_rate": 4.0874710594977025e-07, "logits/chosen": 0.42817384004592896, "logits/rejected": NaN, "logps/chosen": -181.8625030517578, "logps/rejected": -878.0250244140625, "loss": 0.0742, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.6479249000549316, "rewards/margins": 30.127344131469727, "rewards/rejected": -33.775001525878906, "step": 4200 }, { "epoch": 1.0615132511896133, "grad_norm": 0.06474381685256958, "learning_rate": 4.081798031815755e-07, "logits/chosen": 0.3363639712333679, "logits/rejected": 0.9563964605331421, "logps/chosen": -172.7578125, "logps/rejected": -896.0, "loss": 0.0515, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.830639600753784, "rewards/margins": 32.484764099121094, "rewards/rejected": -36.328125, "step": 4210 }, { "epoch": 1.0640342860744336, "grad_norm": 0.0015355496434494853, "learning_rate": 4.0761113886292575e-07, "logits/chosen": 0.5524566769599915, "logits/rejected": NaN, "logps/chosen": -168.3937530517578, "logps/rejected": -888.5999755859375, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.342333793640137, "rewards/margins": 31.547657012939453, "rewards/rejected": -35.90156173706055, "step": 4220 }, { "epoch": 1.0665553209592538, "grad_norm": 0.7100661396980286, "learning_rate": 4.07041117888663e-07, "logits/chosen": 0.5736068487167358, "logits/rejected": NaN, "logps/chosen": -173.6593780517578, "logps/rejected": -858.1500244140625, "loss": 0.2201, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.446642875671387, "rewards/margins": 30.215625762939453, "rewards/rejected": -34.6640625, "step": 4230 }, { "epoch": 1.069076355844074, "grad_norm": 8.995537757873535, "learning_rate": 4.0646974516530683e-07, "logits/chosen": 0.652374267578125, "logits/rejected": NaN, "logps/chosen": -172.8468780517578, "logps/rejected": -864.75, "loss": 0.0315, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.29595947265625, "rewards/margins": 30.799219131469727, "rewards/rejected": -35.0859375, "step": 4240 }, { "epoch": 1.0715973907288943, "grad_norm": 5.3248491287231445, "learning_rate": 4.058970256110125e-07, "logits/chosen": 0.44820863008499146, "logits/rejected": 0.889361560344696, "logps/chosen": -164.58749389648438, "logps/rejected": -841.9249877929688, "loss": 0.0117, "rewards/accuracies": 0.984375, "rewards/chosen": -3.516317844390869, "rewards/margins": 29.732812881469727, "rewards/rejected": -33.2578125, "step": 4250 }, { "epoch": 1.0741184256137144, "grad_norm": 0.03202497959136963, "learning_rate": 4.0532296415552783e-07, "logits/chosen": 0.30464476346969604, "logits/rejected": NaN, "logps/chosen": -175.7859344482422, "logps/rejected": -841.9000244140625, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.6205811500549316, "rewards/margins": 29.690624237060547, "rewards/rejected": -33.29999923706055, "step": 4260 }, { "epoch": 1.0766394604985345, "grad_norm": 158.07711791992188, "learning_rate": 4.0474756574015145e-07, "logits/chosen": 0.31335145235061646, "logits/rejected": 0.6828651428222656, "logps/chosen": -194.81875610351562, "logps/rejected": -860.5999755859375, "loss": 0.0899, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.6396484375, "rewards/margins": 29.962499618530273, "rewards/rejected": -34.60468673706055, "step": 4270 }, { "epoch": 1.079160495383355, "grad_norm": 136.0279541015625, "learning_rate": 4.0417083531769e-07, "logits/chosen": 0.407461553812027, "logits/rejected": NaN, "logps/chosen": -172.7531280517578, "logps/rejected": -861.7750244140625, "loss": 0.0334, "rewards/accuracies": 0.984375, "rewards/chosen": -4.113512992858887, "rewards/margins": 30.202342987060547, "rewards/rejected": -34.30781173706055, "step": 4280 }, { "epoch": 1.081681530268175, "grad_norm": 49.89803695678711, "learning_rate": 4.035927778524154e-07, "logits/chosen": 0.4685684144496918, "logits/rejected": NaN, "logps/chosen": -187.7218780517578, "logps/rejected": -857.5250244140625, "loss": 0.0785, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.885693311691284, "rewards/margins": 30.971094131469727, "rewards/rejected": -34.864845275878906, "step": 4290 }, { "epoch": 1.0842025651529954, "grad_norm": 0.16288617253303528, "learning_rate": 4.0301339832002233e-07, "logits/chosen": 0.750640869140625, "logits/rejected": 1.0901672840118408, "logps/chosen": -178.57186889648438, "logps/rejected": -885.0499877929688, "loss": 0.0478, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.565783500671387, "rewards/margins": 31.588281631469727, "rewards/rejected": -36.1484375, "step": 4300 }, { "epoch": 1.0867236000378155, "grad_norm": 48.73146438598633, "learning_rate": 4.024327017075855e-07, "logits/chosen": 0.32609254121780396, "logits/rejected": NaN, "logps/chosen": -188.80624389648438, "logps/rejected": -848.2750244140625, "loss": 0.031, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6078858375549316, "rewards/margins": 29.435155868530273, "rewards/rejected": -33.04999923706055, "step": 4310 }, { "epoch": 1.0892446349226357, "grad_norm": 3.395510673522949, "learning_rate": 4.018506930135161e-07, "logits/chosen": 0.3918808102607727, "logits/rejected": NaN, "logps/chosen": -181.5625, "logps/rejected": -809.5, "loss": 0.0545, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.762377977371216, "rewards/margins": 26.276561737060547, "rewards/rejected": -30.0546875, "step": 4320 }, { "epoch": 1.091765669807456, "grad_norm": 99.9836196899414, "learning_rate": 4.012673772475196e-07, "logits/chosen": 0.5601745843887329, "logits/rejected": NaN, "logps/chosen": -158.79843139648438, "logps/rejected": -825.25, "loss": 0.0138, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.722756862640381, "rewards/margins": 27.65625, "rewards/rejected": -31.378124237060547, "step": 4330 }, { "epoch": 1.0942867046922762, "grad_norm": 274.24713134765625, "learning_rate": 4.00682759430552e-07, "logits/chosen": 0.6545044183731079, "logits/rejected": 1.0317329168319702, "logps/chosen": -164.8718719482422, "logps/rejected": -857.125, "loss": 0.045, "rewards/accuracies": 0.984375, "rewards/chosen": -4.632910251617432, "rewards/margins": 30.782032012939453, "rewards/rejected": -35.404685974121094, "step": 4340 }, { "epoch": 1.0968077395770963, "grad_norm": 21.135324478149414, "learning_rate": 4.00096844594777e-07, "logits/chosen": 0.6404052972793579, "logits/rejected": NaN, "logps/chosen": -179.2624969482422, "logps/rejected": -862.625, "loss": 0.0107, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.690405368804932, "rewards/margins": 31.893749237060547, "rewards/rejected": -37.59062576293945, "step": 4350 }, { "epoch": 1.0993287744619167, "grad_norm": 0.005358333699405193, "learning_rate": 3.9950963778352254e-07, "logits/chosen": 0.7091064453125, "logits/rejected": 1.1030426025390625, "logps/chosen": -168.9031219482422, "logps/rejected": -893.1500244140625, "loss": 0.0487, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.071008205413818, "rewards/margins": 32.037498474121094, "rewards/rejected": -37.09375, "step": 4360 }, { "epoch": 1.1018498093467368, "grad_norm": 0.14934541285037994, "learning_rate": 3.9892114405123723e-07, "logits/chosen": 0.760119616985321, "logits/rejected": 1.3250114917755127, "logps/chosen": -197.83438110351562, "logps/rejected": -865.4000244140625, "loss": 0.0755, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.365332126617432, "rewards/margins": 31.62890625, "rewards/rejected": -36.9765625, "step": 4370 }, { "epoch": 1.104370844231557, "grad_norm": 0.08405868709087372, "learning_rate": 3.9833136846344707e-07, "logits/chosen": 0.5381714105606079, "logits/rejected": NaN, "logps/chosen": -174.41250610351562, "logps/rejected": -880.6749877929688, "loss": 0.0076, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.090087890625, "rewards/margins": 32.29296875, "rewards/rejected": -36.376564025878906, "step": 4380 }, { "epoch": 1.1068918791163773, "grad_norm": 0.012920841574668884, "learning_rate": 3.977403160967119e-07, "logits/chosen": 0.5537201166152954, "logits/rejected": 0.9020477533340454, "logps/chosen": -165.2421875, "logps/rejected": -870.9000244140625, "loss": 0.1165, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.027014255523682, "rewards/margins": 30.501562118530273, "rewards/rejected": -34.536720275878906, "step": 4390 }, { "epoch": 1.1094129140011975, "grad_norm": 1.6359243392944336, "learning_rate": 3.971479920385814e-07, "logits/chosen": 0.3800998628139496, "logits/rejected": NaN, "logps/chosen": -149.86874389648438, "logps/rejected": -826.4249877929688, "loss": 0.0063, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.81439208984375, "rewards/margins": 28.087499618530273, "rewards/rejected": -30.900781631469727, "step": 4400 }, { "epoch": 1.1119339488860178, "grad_norm": 20.377290725708008, "learning_rate": 3.965544013875516e-07, "logits/chosen": 0.2144012451171875, "logits/rejected": NaN, "logps/chosen": -158.55624389648438, "logps/rejected": -830.5250244140625, "loss": 0.0184, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.869061231613159, "rewards/margins": 26.267187118530273, "rewards/rejected": -29.129688262939453, "step": 4410 }, { "epoch": 1.114454983770838, "grad_norm": 0.01240631751716137, "learning_rate": 3.959595492530207e-07, "logits/chosen": 0.09806976467370987, "logits/rejected": NaN, "logps/chosen": -176.6906280517578, "logps/rejected": -813.2000122070312, "loss": 0.0387, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9581236839294434, "rewards/margins": 27.100780487060547, "rewards/rejected": -30.0703125, "step": 4420 }, { "epoch": 1.116976018655658, "grad_norm": 0.00013521447544917464, "learning_rate": 3.953634407552456e-07, "logits/chosen": 0.24146728217601776, "logits/rejected": NaN, "logps/chosen": -177.921875, "logps/rejected": -856.4749755859375, "loss": 0.0943, "rewards/accuracies": 0.984375, "rewards/chosen": -3.7553343772888184, "rewards/margins": 27.310937881469727, "rewards/rejected": -31.0703125, "step": 4430 }, { "epoch": 1.1194970535404785, "grad_norm": 1.20767343044281, "learning_rate": 3.9476608102529707e-07, "logits/chosen": 0.17420348525047302, "logits/rejected": 0.7728241086006165, "logps/chosen": -189.5734405517578, "logps/rejected": -843.9500122070312, "loss": 0.1173, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.190771579742432, "rewards/margins": 27.860157012939453, "rewards/rejected": -32.040626525878906, "step": 4440 }, { "epoch": 1.1220180884252986, "grad_norm": 0.03709186986088753, "learning_rate": 3.9416747520501627e-07, "logits/chosen": 0.2887634336948395, "logits/rejected": 0.6867706179618835, "logps/chosen": -187.90625, "logps/rejected": -851.875, "loss": 0.0689, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.903124809265137, "rewards/margins": 27.560937881469727, "rewards/rejected": -32.484375, "step": 4450 }, { "epoch": 1.1245391233101187, "grad_norm": 5.675292015075684, "learning_rate": 3.935676284469702e-07, "logits/chosen": 0.4564208984375, "logits/rejected": NaN, "logps/chosen": -174.3468780517578, "logps/rejected": -849.0250244140625, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.359692573547363, "rewards/margins": 28.741405487060547, "rewards/rejected": -33.110939025878906, "step": 4460 }, { "epoch": 1.127060158194939, "grad_norm": 43.83598327636719, "learning_rate": 3.9296654591440734e-07, "logits/chosen": 0.1453399658203125, "logits/rejected": 0.6831512451171875, "logps/chosen": -185.5031280517578, "logps/rejected": -817.2000122070312, "loss": 0.061, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.266992092132568, "rewards/margins": 25.837499618530273, "rewards/rejected": -30.103124618530273, "step": 4470 }, { "epoch": 1.1295811930797592, "grad_norm": 17.282285690307617, "learning_rate": 3.923642327812132e-07, "logits/chosen": 0.193572998046875, "logits/rejected": 0.472970575094223, "logps/chosen": -158.40469360351562, "logps/rejected": -782.4249877929688, "loss": 0.0304, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.907910108566284, "rewards/margins": 25.530467987060547, "rewards/rejected": -28.431249618530273, "step": 4480 }, { "epoch": 1.1321022279645794, "grad_norm": 0.9761391282081604, "learning_rate": 3.917606942318657e-07, "logits/chosen": 0.06183014065027237, "logits/rejected": 0.4333633482456207, "logps/chosen": -176.140625, "logps/rejected": -825.125, "loss": 0.024, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.1516051292419434, "rewards/margins": 25.594921112060547, "rewards/rejected": -28.744531631469727, "step": 4490 }, { "epoch": 1.1346232628493997, "grad_norm": 0.2286805361509323, "learning_rate": 3.9115593546139106e-07, "logits/chosen": 0.07072601467370987, "logits/rejected": NaN, "logps/chosen": -174.97500610351562, "logps/rejected": -821.875, "loss": 0.1624, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4635009765625, "rewards/margins": 26.494531631469727, "rewards/rejected": -29.953907012939453, "step": 4500 }, { "epoch": 1.1371442977342199, "grad_norm": 3.604448080062866, "learning_rate": 3.905499616753183e-07, "logits/chosen": 0.11128692328929901, "logits/rejected": 0.5848175287246704, "logps/chosen": -177.1843719482422, "logps/rejected": -848.1500244140625, "loss": 0.0091, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.554888963699341, "rewards/margins": 29.814062118530273, "rewards/rejected": -33.349998474121094, "step": 4510 }, { "epoch": 1.1396653326190402, "grad_norm": 52.757347106933594, "learning_rate": 3.8994277808963507e-07, "logits/chosen": 0.18312835693359375, "logits/rejected": NaN, "logps/chosen": -194.4640655517578, "logps/rejected": -888.4000244140625, "loss": 0.0874, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.346423149108887, "rewards/margins": 31.485937118530273, "rewards/rejected": -35.821876525878906, "step": 4520 }, { "epoch": 1.1421863675038604, "grad_norm": 338.3120422363281, "learning_rate": 3.893343899307423e-07, "logits/chosen": 0.12305297702550888, "logits/rejected": NaN, "logps/chosen": -177.92813110351562, "logps/rejected": -846.4000244140625, "loss": 0.1322, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.986767530441284, "rewards/margins": 28.698436737060547, "rewards/rejected": -32.685935974121094, "step": 4530 }, { "epoch": 1.1447074023886805, "grad_norm": 0.21592830121517181, "learning_rate": 3.887248024354095e-07, "logits/chosen": 0.24382933974266052, "logits/rejected": NaN, "logps/chosen": -172.046875, "logps/rejected": -842.75, "loss": 0.0906, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.702954053878784, "rewards/margins": 28.807811737060547, "rewards/rejected": -32.506248474121094, "step": 4540 }, { "epoch": 1.1472284372735007, "grad_norm": 0.005632956512272358, "learning_rate": 3.881140208507298e-07, "logits/chosen": 0.27783507108688354, "logits/rejected": NaN, "logps/chosen": -169.7531280517578, "logps/rejected": -863.4500122070312, "loss": 0.0321, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.3394622802734375, "rewards/margins": 30.65234375, "rewards/rejected": -33.99687576293945, "step": 4550 }, { "epoch": 1.149749472158321, "grad_norm": 0.01375786866992712, "learning_rate": 3.8750205043407423e-07, "logits/chosen": 0.45721739530563354, "logits/rejected": NaN, "logps/chosen": -205.21249389648438, "logps/rejected": -866.5250244140625, "loss": 0.0971, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.812548637390137, "rewards/margins": 29.733203887939453, "rewards/rejected": -34.544532775878906, "step": 4560 }, { "epoch": 1.1522705070431412, "grad_norm": 203.30992126464844, "learning_rate": 3.8688889645304723e-07, "logits/chosen": 0.6460632085800171, "logits/rejected": 0.9874297976493835, "logps/chosen": -167.0828094482422, "logps/rejected": -848.9000244140625, "loss": 0.0529, "rewards/accuracies": 0.984375, "rewards/chosen": -4.7674560546875, "rewards/margins": 31.393749237060547, "rewards/rejected": -36.15625, "step": 4570 }, { "epoch": 1.1547915419279615, "grad_norm": 0.8318575024604797, "learning_rate": 3.8627456418544046e-07, "logits/chosen": 0.5642425417900085, "logits/rejected": NaN, "logps/chosen": -167.765625, "logps/rejected": -871.7750244140625, "loss": 0.0079, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.551806449890137, "rewards/margins": 33.252342224121094, "rewards/rejected": -37.814064025878906, "step": 4580 }, { "epoch": 1.1573125768127817, "grad_norm": 191.28915405273438, "learning_rate": 3.8565905891918813e-07, "logits/chosen": 0.4628639221191406, "logits/rejected": NaN, "logps/chosen": -192.88125610351562, "logps/rejected": -931.2999877929688, "loss": 0.1728, "rewards/accuracies": 0.96875, "rewards/chosen": -5.476733207702637, "rewards/margins": 34.27031326293945, "rewards/rejected": -39.75, "step": 4590 }, { "epoch": 1.1598336116976018, "grad_norm": 69.2548828125, "learning_rate": 3.850423859523212e-07, "logits/chosen": 0.2845214903354645, "logits/rejected": 0.673175036907196, "logps/chosen": -188.39999389648438, "logps/rejected": -917.4000244140625, "loss": 0.0263, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.175671577453613, "rewards/margins": 34.079689025878906, "rewards/rejected": -39.24687576293945, "step": 4600 }, { "epoch": 1.1623546465824222, "grad_norm": 105.04236602783203, "learning_rate": 3.8442455059292146e-07, "logits/chosen": 0.4285522401332855, "logits/rejected": 0.9464996457099915, "logps/chosen": -176.65625, "logps/rejected": -936.0499877929688, "loss": 0.0239, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.033349514007568, "rewards/margins": 37.73906326293945, "rewards/rejected": -42.79218673706055, "step": 4610 }, { "epoch": 1.1648756814672423, "grad_norm": 18.229995727539062, "learning_rate": 3.8380555815907636e-07, "logits/chosen": 0.37768250703811646, "logits/rejected": 1.0025298595428467, "logps/chosen": -186.5046844482422, "logps/rejected": -927.5, "loss": 0.1204, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -5.259442329406738, "rewards/margins": 36.23125076293945, "rewards/rejected": -41.485939025878906, "step": 4620 }, { "epoch": 1.1673967163520624, "grad_norm": 0.036758583039045334, "learning_rate": 3.831854139788329e-07, "logits/chosen": 0.299887090921402, "logits/rejected": NaN, "logps/chosen": -184.2375030517578, "logps/rejected": -909.7000122070312, "loss": 0.0325, "rewards/accuracies": 0.984375, "rewards/chosen": -4.980413913726807, "rewards/margins": 35.09375, "rewards/rejected": -40.05156326293945, "step": 4630 }, { "epoch": 1.1699177512368828, "grad_norm": 0.021119408309459686, "learning_rate": 3.825641233901518e-07, "logits/chosen": 0.25094908475875854, "logits/rejected": 0.8589752316474915, "logps/chosen": -179.359375, "logps/rejected": -907.5499877929688, "loss": 0.0042, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.59912109375, "rewards/margins": 34.325782775878906, "rewards/rejected": -38.931251525878906, "step": 4640 }, { "epoch": 1.172438786121703, "grad_norm": 0.04108046740293503, "learning_rate": 3.819416917408619e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -158.2109375, "logps/rejected": -817.7750244140625, "loss": 0.0369, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1478209495544434, "rewards/margins": 30.204687118530273, "rewards/rejected": -33.35468673706055, "step": 4650 }, { "epoch": 1.174959821006523, "grad_norm": 0.0054328469559550285, "learning_rate": 3.8131812438861353e-07, "logits/chosen": 0.24417877197265625, "logits/rejected": NaN, "logps/chosen": -175.58749389648438, "logps/rejected": -814.2249755859375, "loss": 0.0979, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.6534790992736816, "rewards/margins": 28.919139862060547, "rewards/rejected": -32.560157775878906, "step": 4660 }, { "epoch": 1.1774808558913434, "grad_norm": 138.3196563720703, "learning_rate": 3.8069342670083304e-07, "logits/chosen": 0.22496643662452698, "logits/rejected": 0.6182388067245483, "logps/chosen": -147.99374389648438, "logps/rejected": -821.2249755859375, "loss": 0.0259, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.920401096343994, "rewards/margins": 27.932031631469727, "rewards/rejected": -30.848438262939453, "step": 4670 }, { "epoch": 1.1800018907761636, "grad_norm": 0.6996208429336548, "learning_rate": 3.8006760405467574e-07, "logits/chosen": 0.29189378023147583, "logits/rejected": 0.8366363644599915, "logps/chosen": -177.421875, "logps/rejected": -839.0499877929688, "loss": 0.0711, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.988659620285034, "rewards/margins": 28.918750762939453, "rewards/rejected": -32.89921951293945, "step": 4680 }, { "epoch": 1.182522925660984, "grad_norm": 254.84886169433594, "learning_rate": 3.7944066183698076e-07, "logits/chosen": 0.384347528219223, "logits/rejected": NaN, "logps/chosen": -180.3390655517578, "logps/rejected": -855.1500244140625, "loss": 0.0791, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.800219535827637, "rewards/margins": 30.696874618530273, "rewards/rejected": -35.49687576293945, "step": 4690 }, { "epoch": 1.185043960545804, "grad_norm": 0.00024089829821605235, "learning_rate": 3.788126054442237e-07, "logits/chosen": 0.6186493039131165, "logits/rejected": 0.9716247320175171, "logps/chosen": -191.03750610351562, "logps/rejected": -940.7249755859375, "loss": 0.0509, "rewards/accuracies": 0.984375, "rewards/chosen": -5.574072360992432, "rewards/margins": 33.76171875, "rewards/rejected": -39.3515625, "step": 4700 }, { "epoch": 1.1875649954306242, "grad_norm": 2.277721432619728e-05, "learning_rate": 3.781834402824705e-07, "logits/chosen": 0.5634704828262329, "logits/rejected": NaN, "logps/chosen": -188.5906219482422, "logps/rejected": -919.9000244140625, "loss": 0.14, "rewards/accuracies": 0.984375, "rewards/chosen": -5.454980373382568, "rewards/margins": 34.517189025878906, "rewards/rejected": -39.97968673706055, "step": 4710 }, { "epoch": 1.1900860303154446, "grad_norm": 1.1226269006729126, "learning_rate": 3.775531717673313e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -198.21875, "logps/rejected": -884.2000122070312, "loss": 0.011, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.65130615234375, "rewards/margins": 30.890625, "rewards/rejected": -35.548439025878906, "step": 4720 }, { "epoch": 1.1926070652002647, "grad_norm": 31.514911651611328, "learning_rate": 3.769218053239132e-07, "logits/chosen": 0.3224731385707855, "logits/rejected": NaN, "logps/chosen": -162.078125, "logps/rejected": -924.7000122070312, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -4.0849609375, "rewards/margins": 34.396873474121094, "rewards/rejected": -38.48125076293945, "step": 4730 }, { "epoch": 1.1951281000850849, "grad_norm": 9.162474632263184, "learning_rate": 3.7628934638677377e-07, "logits/chosen": 0.47099608182907104, "logits/rejected": NaN, "logps/chosen": -169.515625, "logps/rejected": -882.9500122070312, "loss": 0.0074, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.18524169921875, "rewards/margins": 33.13359451293945, "rewards/rejected": -37.310935974121094, "step": 4740 }, { "epoch": 1.1976491349699052, "grad_norm": 146.970458984375, "learning_rate": 3.756558003998746e-07, "logits/chosen": 0.4527831971645355, "logits/rejected": NaN, "logps/chosen": -162.578125, "logps/rejected": -854.4749755859375, "loss": 0.0821, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.1435546875, "rewards/margins": 32.605079650878906, "rewards/rejected": -36.765625, "step": 4750 }, { "epoch": 1.2001701698547254, "grad_norm": 0.0043554832227528095, "learning_rate": 3.750211728165341e-07, "logits/chosen": 0.4542747437953949, "logits/rejected": NaN, "logps/chosen": -201.77188110351562, "logps/rejected": -853.8499755859375, "loss": 0.103, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.684277534484863, "rewards/margins": 30.431249618530273, "rewards/rejected": -35.12187576293945, "step": 4760 }, { "epoch": 1.2026912047395455, "grad_norm": 114.33245849609375, "learning_rate": 3.743854690993805e-07, "logits/chosen": 0.27254945039749146, "logits/rejected": NaN, "logps/chosen": -193.8625030517578, "logps/rejected": -876.375, "loss": 0.0961, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -5.123779296875, "rewards/margins": 30.948436737060547, "rewards/rejected": -36.080467224121094, "step": 4770 }, { "epoch": 1.2052122396243659, "grad_norm": 0.03330286219716072, "learning_rate": 3.737486947203051e-07, "logits/chosen": 0.19099731743335724, "logits/rejected": NaN, "logps/chosen": -177.74374389648438, "logps/rejected": -881.0999755859375, "loss": 0.007, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.459399223327637, "rewards/margins": 30.350000381469727, "rewards/rejected": -34.803123474121094, "step": 4780 }, { "epoch": 1.207733274509186, "grad_norm": 1.390887975692749, "learning_rate": 3.7311085516041514e-07, "logits/chosen": 0.17753753066062927, "logits/rejected": NaN, "logps/chosen": -162.4031219482422, "logps/rejected": -850.2249755859375, "loss": 0.0817, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.360485792160034, "rewards/margins": 30.579296112060547, "rewards/rejected": -33.931251525878906, "step": 4790 }, { "epoch": 1.2102543093940064, "grad_norm": 0.0019379057921469212, "learning_rate": 3.7247195590998623e-07, "logits/chosen": 0.01615600660443306, "logits/rejected": 0.577984631061554, "logps/chosen": -168.5703125, "logps/rejected": -851.4000244140625, "loss": 0.0549, "rewards/accuracies": 0.984375, "rewards/chosen": -3.794750928878784, "rewards/margins": 30.265625, "rewards/rejected": -34.0625, "step": 4800 }, { "epoch": 1.2127753442788265, "grad_norm": 344.7655029296875, "learning_rate": 3.718320024684157e-07, "logits/chosen": 0.23419037461280823, "logits/rejected": 0.6228790283203125, "logps/chosen": -181.40625, "logps/rejected": -858.5499877929688, "loss": 0.0836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.160962104797363, "rewards/margins": 28.70703125, "rewards/rejected": -32.85468673706055, "step": 4810 }, { "epoch": 1.2152963791636466, "grad_norm": 0.8431145548820496, "learning_rate": 3.711910003441748e-07, "logits/chosen": 0.333151251077652, "logits/rejected": NaN, "logps/chosen": -166.75155639648438, "logps/rejected": -844.9500122070312, "loss": 0.0337, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.4964599609375, "rewards/margins": 30.169530868530273, "rewards/rejected": -34.671875, "step": 4820 }, { "epoch": 1.2178174140484668, "grad_norm": 0.7152418494224548, "learning_rate": 3.705489550547614e-07, "logits/chosen": 0.33393555879592896, "logits/rejected": 0.7812652587890625, "logps/chosen": -177.5500030517578, "logps/rejected": -864.375, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.037103176116943, "rewards/margins": 29.579687118530273, "rewards/rejected": -33.61406326293945, "step": 4830 }, { "epoch": 1.2203384489332871, "grad_norm": 4.152603626251221, "learning_rate": 3.6990587212665235e-07, "logits/chosen": 0.09858093410730362, "logits/rejected": NaN, "logps/chosen": -163.703125, "logps/rejected": -834.7000122070312, "loss": 0.0158, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4376769065856934, "rewards/margins": 29.934375762939453, "rewards/rejected": -33.3828125, "step": 4840 }, { "epoch": 1.2228594838181073, "grad_norm": 126.59352111816406, "learning_rate": 3.692617570952568e-07, "logits/chosen": 0.23367920517921448, "logits/rejected": NaN, "logps/chosen": -191.99063110351562, "logps/rejected": -851.3499755859375, "loss": 0.0698, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.9572510719299316, "rewards/margins": 28.415624618530273, "rewards/rejected": -32.37187576293945, "step": 4850 }, { "epoch": 1.2253805187029276, "grad_norm": 163.4409637451172, "learning_rate": 3.686166155048669e-07, "logits/chosen": NaN, "logits/rejected": 0.745104193687439, "logps/chosen": -167.16250610351562, "logps/rejected": -855.1500244140625, "loss": 0.0282, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.364453315734863, "rewards/margins": 30.278125762939453, "rewards/rejected": -34.646873474121094, "step": 4860 }, { "epoch": 1.2279015535877478, "grad_norm": 0.00462773023173213, "learning_rate": 3.6797045290861185e-07, "logits/chosen": 0.16415099799633026, "logits/rejected": 0.5941573977470398, "logps/chosen": -177.640625, "logps/rejected": -852.5750122070312, "loss": 0.1466, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.1746826171875, "rewards/margins": 28.924219131469727, "rewards/rejected": -33.09375, "step": 4870 }, { "epoch": 1.230422588472568, "grad_norm": 0.3612661361694336, "learning_rate": 3.673232748684086e-07, "logits/chosen": 0.28151625394821167, "logits/rejected": NaN, "logps/chosen": -176.5749969482422, "logps/rejected": -876.5499877929688, "loss": 0.0621, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.689013719558716, "rewards/margins": 30.416406631469727, "rewards/rejected": -34.09687423706055, "step": 4880 }, { "epoch": 1.2329436233573883, "grad_norm": 0.0016590887680649757, "learning_rate": 3.666750869549152e-07, "logits/chosen": 0.12615355849266052, "logits/rejected": NaN, "logps/chosen": -159.75, "logps/rejected": -814.8250122070312, "loss": 0.0067, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.927607774734497, "rewards/margins": 29.693750381469727, "rewards/rejected": -32.618751525878906, "step": 4890 }, { "epoch": 1.2354646582422084, "grad_norm": 193.84292602539062, "learning_rate": 3.6602589474748194e-07, "logits/chosen": 0.2367095947265625, "logits/rejected": NaN, "logps/chosen": -179.5812530517578, "logps/rejected": -910.125, "loss": 0.0301, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.831225633621216, "rewards/margins": 32.11406326293945, "rewards/rejected": -35.939064025878906, "step": 4900 }, { "epoch": 1.2379856931270286, "grad_norm": 0.07781888544559479, "learning_rate": 3.6537570383410377e-07, "logits/chosen": 0.23301391303539276, "logits/rejected": NaN, "logps/chosen": -183.39999389648438, "logps/rejected": -898.6500244140625, "loss": 0.0261, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.30694580078125, "rewards/margins": 32.451560974121094, "rewards/rejected": -36.75468826293945, "step": 4910 }, { "epoch": 1.240506728011849, "grad_norm": 14.091386795043945, "learning_rate": 3.647245198113722e-07, "logits/chosen": 0.21778106689453125, "logits/rejected": NaN, "logps/chosen": -192.41250610351562, "logps/rejected": -882.1500244140625, "loss": 0.0232, "rewards/accuracies": 0.984375, "rewards/chosen": -4.571240425109863, "rewards/margins": 32.328125, "rewards/rejected": -36.889060974121094, "step": 4920 }, { "epoch": 1.243027762896669, "grad_norm": 0.00042195318383164704, "learning_rate": 3.640723482844269e-07, "logits/chosen": 0.458334356546402, "logits/rejected": NaN, "logps/chosen": -179.45468139648438, "logps/rejected": -914.5999755859375, "loss": 0.038, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.740771293640137, "rewards/margins": 35.49140548706055, "rewards/rejected": -40.22343826293945, "step": 4930 }, { "epoch": 1.2455487977814892, "grad_norm": 143.20164489746094, "learning_rate": 3.6341919486690783e-07, "logits/chosen": 0.15742187201976776, "logits/rejected": NaN, "logps/chosen": -190.5968780517578, "logps/rejected": -946.6500244140625, "loss": 0.1137, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.230175971984863, "rewards/margins": 36.896873474121094, "rewards/rejected": -42.12187576293945, "step": 4940 }, { "epoch": 1.2480698326663096, "grad_norm": 0.7540143132209778, "learning_rate": 3.627650651809064e-07, "logits/chosen": 0.30862730741500854, "logits/rejected": NaN, "logps/chosen": -175.91250610351562, "logps/rejected": -909.875, "loss": 0.1313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.434844970703125, "rewards/margins": 34.567970275878906, "rewards/rejected": -38.984375, "step": 4950 }, { "epoch": 1.2505908675511297, "grad_norm": 0.0001756981946527958, "learning_rate": 3.6210996485691743e-07, "logits/chosen": 0.295187383890152, "logits/rejected": NaN, "logps/chosen": -162.75936889648438, "logps/rejected": -859.6500244140625, "loss": 0.048, "rewards/accuracies": 0.984375, "rewards/chosen": -4.026269435882568, "rewards/margins": 31.495311737060547, "rewards/rejected": -35.537498474121094, "step": 4960 }, { "epoch": 1.25311190243595, "grad_norm": 0.020706798881292343, "learning_rate": 3.6145389953379085e-07, "logits/chosen": 0.20810547471046448, "logits/rejected": 0.7175461053848267, "logps/chosen": -180.8625030517578, "logps/rejected": -919.75, "loss": 0.0118, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.363037109375, "rewards/margins": 33.716407775878906, "rewards/rejected": -38.0546875, "step": 4970 }, { "epoch": 1.2556329373207702, "grad_norm": 198.37271118164062, "learning_rate": 3.6079687485868255e-07, "logits/chosen": 0.2887733578681946, "logits/rejected": 0.5821762084960938, "logps/chosen": -185.89218139648438, "logps/rejected": -869.2999877929688, "loss": 0.0306, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.702340602874756, "rewards/margins": 30.149999618530273, "rewards/rejected": -34.87187576293945, "step": 4980 }, { "epoch": 1.2581539722055903, "grad_norm": 0.0026216499973088503, "learning_rate": 3.601388964870066e-07, "logits/chosen": 0.064544677734375, "logits/rejected": NaN, "logps/chosen": -182.31405639648438, "logps/rejected": -852.3250122070312, "loss": 0.0201, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.345385551452637, "rewards/margins": 29.65234375, "rewards/rejected": -33.990623474121094, "step": 4990 }, { "epoch": 1.2606750070904107, "grad_norm": 1.4756479263305664, "learning_rate": 3.5947997008238564e-07, "logits/chosen": 0.11924438178539276, "logits/rejected": 0.6408447027206421, "logps/chosen": -170.71875, "logps/rejected": -833.0250244140625, "loss": 0.0146, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9439454078674316, "rewards/margins": 29.676563262939453, "rewards/rejected": -33.618751525878906, "step": 5000 }, { "epoch": 1.2631960419752308, "grad_norm": 0.00643693283200264, "learning_rate": 3.58820101316603e-07, "logits/chosen": 0.08644409477710724, "logits/rejected": 0.3919174075126648, "logps/chosen": -199.8171844482422, "logps/rejected": -870.75, "loss": 0.0107, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.600988864898682, "rewards/margins": 30.818750381469727, "rewards/rejected": -35.41718673706055, "step": 5010 }, { "epoch": 1.265717076860051, "grad_norm": 7.474320888519287, "learning_rate": 3.5815929586955323e-07, "logits/chosen": 0.23587265610694885, "logits/rejected": NaN, "logps/chosen": -162.6999969482422, "logps/rejected": -853.375, "loss": 0.0445, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.165032863616943, "rewards/margins": 31.857812881469727, "rewards/rejected": -36.03125, "step": 5020 }, { "epoch": 1.2682381117448713, "grad_norm": 4.330943584442139, "learning_rate": 3.574975594291936e-07, "logits/chosen": 0.09031371772289276, "logits/rejected": NaN, "logps/chosen": -176.74063110351562, "logps/rejected": -928.7999877929688, "loss": 0.0109, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.835284471511841, "rewards/margins": 36.032814025878906, "rewards/rejected": -39.86406326293945, "step": 5030 }, { "epoch": 1.2707591466296915, "grad_norm": 0.1962009072303772, "learning_rate": 3.568348976914951e-07, "logits/chosen": 0.27650755643844604, "logits/rejected": 0.9673919677734375, "logps/chosen": -174.0234375, "logps/rejected": -898.5999755859375, "loss": 0.0146, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.881433010101318, "rewards/margins": 34.728126525878906, "rewards/rejected": -39.615623474121094, "step": 5040 }, { "epoch": 1.2732801815145116, "grad_norm": 0.0823327898979187, "learning_rate": 3.56171316360393e-07, "logits/chosen": 0.3717407286167145, "logits/rejected": 0.6989806890487671, "logps/chosen": -176.4015655517578, "logps/rejected": -936.5, "loss": 0.0367, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.2762451171875, "rewards/margins": 36.092185974121094, "rewards/rejected": -41.373435974121094, "step": 5050 }, { "epoch": 1.275801216399332, "grad_norm": 1.129317283630371, "learning_rate": 3.555068211477384e-07, "logits/chosen": 0.17686156928539276, "logits/rejected": 0.7334854006767273, "logps/chosen": -196.21249389648438, "logps/rejected": -912.6500244140625, "loss": 0.1075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.834435939788818, "rewards/margins": 33.55546951293945, "rewards/rejected": -38.396873474121094, "step": 5060 }, { "epoch": 1.2783222512841521, "grad_norm": 0.0002974680974148214, "learning_rate": 3.548414177732486e-07, "logits/chosen": 0.36207884550094604, "logits/rejected": 0.9834869503974915, "logps/chosen": -155.0578155517578, "logps/rejected": -845.0250244140625, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.158282279968262, "rewards/margins": 31.221874237060547, "rewards/rejected": -35.381248474121094, "step": 5070 }, { "epoch": 1.2808432861689725, "grad_norm": 0.0027426069136708975, "learning_rate": 3.541751119644581e-07, "logits/chosen": 0.16865691542625427, "logits/rejected": NaN, "logps/chosen": -191.421875, "logps/rejected": -876.9749755859375, "loss": 0.0076, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.6650390625, "rewards/margins": 31.256250381469727, "rewards/rejected": -35.91875076293945, "step": 5080 }, { "epoch": 1.2833643210537926, "grad_norm": 0.017341647297143936, "learning_rate": 3.5350790945666895e-07, "logits/chosen": 0.20340880751609802, "logits/rejected": 0.6509460210800171, "logps/chosen": -163.90625, "logps/rejected": -879.75, "loss": 0.083, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.859417676925659, "rewards/margins": 33.185157775878906, "rewards/rejected": -37.046875, "step": 5090 }, { "epoch": 1.2858853559386128, "grad_norm": 66.12599182128906, "learning_rate": 3.528398159929019e-07, "logits/chosen": 0.13723449409008026, "logits/rejected": 0.656414806842804, "logps/chosen": -172.5500030517578, "logps/rejected": -861.0499877929688, "loss": 0.0559, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.197869777679443, "rewards/margins": 31.603124618530273, "rewards/rejected": -35.821876525878906, "step": 5100 }, { "epoch": 1.288406390823433, "grad_norm": 0.4163817763328552, "learning_rate": 3.5217083732384666e-07, "logits/chosen": 0.34574586153030396, "logits/rejected": NaN, "logps/chosen": -161.83438110351562, "logps/rejected": -879.2999877929688, "loss": 0.0328, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6670165061950684, "rewards/margins": 31.917186737060547, "rewards/rejected": -35.587501525878906, "step": 5110 }, { "epoch": 1.2909274257082533, "grad_norm": 0.0018832071218639612, "learning_rate": 3.515009792078123e-07, "logits/chosen": 0.15555724501609802, "logits/rejected": NaN, "logps/chosen": -189.7687530517578, "logps/rejected": -892.0499877929688, "loss": 0.1209, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.115698337554932, "rewards/margins": 32.998435974121094, "rewards/rejected": -37.109375, "step": 5120 }, { "epoch": 1.2934484605930734, "grad_norm": 0.0904546007514, "learning_rate": 3.5083024741067807e-07, "logits/chosen": 0.29151612520217896, "logits/rejected": NaN, "logps/chosen": -180.140625, "logps/rejected": -889.2999877929688, "loss": 0.0404, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.335205078125, "rewards/margins": 32.8125, "rewards/rejected": -37.14531326293945, "step": 5130 }, { "epoch": 1.2959694954778938, "grad_norm": 0.10055790841579437, "learning_rate": 3.5015864770584327e-07, "logits/chosen": 0.04179077222943306, "logits/rejected": 0.5701965093612671, "logps/chosen": -179.25, "logps/rejected": -949.0, "loss": 0.0663, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.368639945983887, "rewards/margins": 36.567970275878906, "rewards/rejected": -40.928123474121094, "step": 5140 }, { "epoch": 1.298490530362714, "grad_norm": 0.8132165670394897, "learning_rate": 3.49486185874178e-07, "logits/chosen": 0.02796630933880806, "logits/rejected": 0.44788819551467896, "logps/chosen": -171.9421844482422, "logps/rejected": -908.875, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -3.5085206031799316, "rewards/margins": 34.00859451293945, "rewards/rejected": -37.51093673706055, "step": 5150 }, { "epoch": 1.301011565247534, "grad_norm": 0.008875137194991112, "learning_rate": 3.488128677039731e-07, "logits/chosen": 0.02620544470846653, "logits/rejected": NaN, "logps/chosen": -192.203125, "logps/rejected": -925.125, "loss": 0.0558, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8206253051757812, "rewards/margins": 35.3359375, "rewards/rejected": -39.157814025878906, "step": 5160 }, { "epoch": 1.3035326001323544, "grad_norm": 10.89201545715332, "learning_rate": 3.4813869899089046e-07, "logits/chosen": 0.13696900010108948, "logits/rejected": NaN, "logps/chosen": -173.8249969482422, "logps/rejected": -915.6749877929688, "loss": 0.0088, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.082226753234863, "rewards/margins": 34.16015625, "rewards/rejected": -38.248435974121094, "step": 5170 }, { "epoch": 1.3060536350171745, "grad_norm": 0.08180950582027435, "learning_rate": 3.474636855379133e-07, "logits/chosen": 0.14324036240577698, "logits/rejected": NaN, "logps/chosen": -185.6140594482422, "logps/rejected": -895.2249755859375, "loss": 0.0741, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.360644340515137, "rewards/margins": 33.654685974121094, "rewards/rejected": -38.00312423706055, "step": 5180 }, { "epoch": 1.308574669901995, "grad_norm": 0.20844028890132904, "learning_rate": 3.467878331552956e-07, "logits/chosen": 0.13364258408546448, "logits/rejected": NaN, "logps/chosen": -180.14688110351562, "logps/rejected": -868.1500244140625, "loss": 0.0259, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.166918754577637, "rewards/margins": 32.907814025878906, "rewards/rejected": -37.078125, "step": 5190 }, { "epoch": 1.311095704786815, "grad_norm": 11.176012992858887, "learning_rate": 3.46111147660513e-07, "logits/chosen": 0.183918759226799, "logits/rejected": NaN, "logps/chosen": -186.7624969482422, "logps/rejected": -911.625, "loss": 0.0837, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.6236724853515625, "rewards/margins": 34.26874923706055, "rewards/rejected": -38.90156173706055, "step": 5200 }, { "epoch": 1.3136167396716352, "grad_norm": 284.7677307128906, "learning_rate": 3.45433634878212e-07, "logits/chosen": 0.3096862733364105, "logits/rejected": NaN, "logps/chosen": -182.52188110351562, "logps/rejected": -910.9000244140625, "loss": 0.0834, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.978131294250488, "rewards/margins": 34.41718673706055, "rewards/rejected": -39.400001525878906, "step": 5210 }, { "epoch": 1.3161377745564553, "grad_norm": 1.5785551071166992, "learning_rate": 3.447553006401601e-07, "logits/chosen": 0.16678467392921448, "logits/rejected": NaN, "logps/chosen": -181.86093139648438, "logps/rejected": -885.9000244140625, "loss": 0.0408, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.629156589508057, "rewards/margins": 34.099220275878906, "rewards/rejected": -38.7265625, "step": 5220 }, { "epoch": 1.3186588094412757, "grad_norm": 0.07508391886949539, "learning_rate": 3.440761507851956e-07, "logits/chosen": 0.03738708421587944, "logits/rejected": NaN, "logps/chosen": -173.72811889648438, "logps/rejected": -863.1500244140625, "loss": 0.1218, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1644530296325684, "rewards/margins": 30.779687881469727, "rewards/rejected": -33.959373474121094, "step": 5230 }, { "epoch": 1.3211798443260958, "grad_norm": 0.20797905325889587, "learning_rate": 3.433961911591773e-07, "logits/chosen": 0.03125915676355362, "logits/rejected": NaN, "logps/chosen": -165.94375610351562, "logps/rejected": -847.9500122070312, "loss": 0.0417, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.06201171875, "rewards/margins": 29.497655868530273, "rewards/rejected": -32.56562423706055, "step": 5240 }, { "epoch": 1.3237008792109162, "grad_norm": 0.26786068081855774, "learning_rate": 3.427154276149341e-07, "logits/chosen": 0.10884399712085724, "logits/rejected": NaN, "logps/chosen": -179.5968780517578, "logps/rejected": -867.1500244140625, "loss": 0.0073, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.272625684738159, "rewards/margins": 31.114063262939453, "rewards/rejected": -34.375, "step": 5250 }, { "epoch": 1.3262219140957363, "grad_norm": 123.56752014160156, "learning_rate": 3.4203386601221484e-07, "logits/chosen": 0.16787414252758026, "logits/rejected": NaN, "logps/chosen": -177.8937530517578, "logps/rejected": -877.9000244140625, "loss": 0.0465, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.897265672683716, "rewards/margins": 31.207813262939453, "rewards/rejected": -35.099998474121094, "step": 5260 }, { "epoch": 1.3287429489805564, "grad_norm": 29.827436447143555, "learning_rate": 3.413515122176379e-07, "logits/chosen": 0.09747924655675888, "logits/rejected": NaN, "logps/chosen": -179.9187469482422, "logps/rejected": -891.125, "loss": 0.0355, "rewards/accuracies": 0.984375, "rewards/chosen": -4.253393650054932, "rewards/margins": 31.938282012939453, "rewards/rejected": -36.19218826293945, "step": 5270 }, { "epoch": 1.3312639838653768, "grad_norm": 14.57050609588623, "learning_rate": 3.4066837210464014e-07, "logits/chosen": -0.01068878173828125, "logits/rejected": NaN, "logps/chosen": -158.1750030517578, "logps/rejected": -876.7999877929688, "loss": 0.0138, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.075878858566284, "rewards/margins": 32.243751525878906, "rewards/rejected": -35.314064025878906, "step": 5280 }, { "epoch": 1.333785018750197, "grad_norm": 28.000255584716797, "learning_rate": 3.3998445155342703e-07, "logits/chosen": -0.025803375989198685, "logits/rejected": NaN, "logps/chosen": -168.109375, "logps/rejected": -854.75, "loss": 0.0351, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4499754905700684, "rewards/margins": 32.517967224121094, "rewards/rejected": -35.97148513793945, "step": 5290 }, { "epoch": 1.3363060536350173, "grad_norm": 5.079505443572998, "learning_rate": 3.392997564509216e-07, "logits/chosen": 0.0025268555618822575, "logits/rejected": 0.4230712950229645, "logps/chosen": -193.1687469482422, "logps/rejected": -910.8250122070312, "loss": 0.0952, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.7455506324768066, "rewards/margins": 32.64921951293945, "rewards/rejected": -36.404685974121094, "step": 5300 }, { "epoch": 1.3388270885198374, "grad_norm": 122.31507110595703, "learning_rate": 3.3861429269071407e-07, "logits/chosen": -0.1624755859375, "logits/rejected": 0.370217889547348, "logps/chosen": -173.08438110351562, "logps/rejected": -877.5499877929688, "loss": 0.0291, "rewards/accuracies": 0.984375, "rewards/chosen": -3.501879930496216, "rewards/margins": 31.450000762939453, "rewards/rejected": -34.94843673706055, "step": 5310 }, { "epoch": 1.3413481234046576, "grad_norm": 0.21213938295841217, "learning_rate": 3.379280661730109e-07, "logits/chosen": 0.05019378662109375, "logits/rejected": NaN, "logps/chosen": -163.78750610351562, "logps/rejected": -810.625, "loss": 0.0127, "rewards/accuracies": 0.984375, "rewards/chosen": -3.5860228538513184, "rewards/margins": 28.657032012939453, "rewards/rejected": -32.24531173706055, "step": 5320 }, { "epoch": 1.3438691582894777, "grad_norm": 62.899227142333984, "learning_rate": 3.372410828045839e-07, "logits/chosen": 0.03264160081744194, "logits/rejected": NaN, "logps/chosen": -167.02499389648438, "logps/rejected": -882.0, "loss": 0.0333, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.484570264816284, "rewards/margins": 32.009376525878906, "rewards/rejected": -35.509376525878906, "step": 5330 }, { "epoch": 1.346390193174298, "grad_norm": 23.755189895629883, "learning_rate": 3.3655334849872e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -160.9796905517578, "logps/rejected": -851.5499877929688, "loss": 0.0521, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3324921131134033, "rewards/margins": 29.628124237060547, "rewards/rejected": -32.96562576293945, "step": 5340 }, { "epoch": 1.3489112280591182, "grad_norm": 0.16136090457439423, "learning_rate": 3.358648691751693e-07, "logits/chosen": NaN, "logits/rejected": 0.4700675904750824, "logps/chosen": -186.6765594482422, "logps/rejected": -833.4500122070312, "loss": 0.0217, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.385912895202637, "rewards/margins": 30.114063262939453, "rewards/rejected": -34.5, "step": 5350 }, { "epoch": 1.3514322629439386, "grad_norm": 0.001089265220798552, "learning_rate": 3.351756507600954e-07, "logits/chosen": 0.18600769340991974, "logits/rejected": NaN, "logps/chosen": -161.0906219482422, "logps/rejected": -841.1749877929688, "loss": 0.0341, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.21673583984375, "rewards/margins": 31.125, "rewards/rejected": -35.34687423706055, "step": 5360 }, { "epoch": 1.3539532978287587, "grad_norm": 40.4871940612793, "learning_rate": 3.344856991860231e-07, "logits/chosen": 0.09300079196691513, "logits/rejected": NaN, "logps/chosen": -168.5890655517578, "logps/rejected": -832.9749755859375, "loss": 0.0192, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.523266553878784, "rewards/margins": 29.544530868530273, "rewards/rejected": -33.088279724121094, "step": 5370 }, { "epoch": 1.3564743327135789, "grad_norm": 157.53854370117188, "learning_rate": 3.337950203917882e-07, "logits/chosen": -0.0010833740234375, "logits/rejected": NaN, "logps/chosen": -172.90625, "logps/rejected": -832.0250244140625, "loss": 0.0344, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.291674852371216, "rewards/margins": 27.724218368530273, "rewards/rejected": -31.013280868530273, "step": 5380 }, { "epoch": 1.358995367598399, "grad_norm": 0.7523600459098816, "learning_rate": 3.3310362032248634e-07, "logits/chosen": 0.18096923828125, "logits/rejected": NaN, "logps/chosen": -161.8625030517578, "logps/rejected": -818.0499877929688, "loss": 0.0353, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0033020973205566, "rewards/margins": 29.249217987060547, "rewards/rejected": -32.267189025878906, "step": 5390 }, { "epoch": 1.3615164024832194, "grad_norm": 22.393611907958984, "learning_rate": 3.324115049294212e-07, "logits/chosen": 0.24275055527687073, "logits/rejected": NaN, "logps/chosen": -183.85311889648438, "logps/rejected": -819.9500122070312, "loss": 0.0092, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.190869331359863, "rewards/margins": 29.16015625, "rewards/rejected": -33.34843826293945, "step": 5400 }, { "epoch": 1.3640374373680395, "grad_norm": 0.6763583421707153, "learning_rate": 3.3171868017005413e-07, "logits/chosen": 0.16892090439796448, "logits/rejected": NaN, "logps/chosen": -166.8093719482422, "logps/rejected": -838.0, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.798449754714966, "rewards/margins": 30.61328125, "rewards/rejected": -34.404685974121094, "step": 5410 }, { "epoch": 1.3665584722528599, "grad_norm": 134.9268341064453, "learning_rate": 3.31025152007952e-07, "logits/chosen": 0.14953307807445526, "logits/rejected": NaN, "logps/chosen": -182.1687469482422, "logps/rejected": -852.2000122070312, "loss": 0.0734, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7627196311950684, "rewards/margins": 30.689844131469727, "rewards/rejected": -34.454689025878906, "step": 5420 }, { "epoch": 1.36907950713768, "grad_norm": 0.00362749514169991, "learning_rate": 3.3033092641273665e-07, "logits/chosen": 0.22620849311351776, "logits/rejected": NaN, "logps/chosen": -173.3046875, "logps/rejected": -847.2999877929688, "loss": 0.0197, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.314013481140137, "rewards/margins": 31.609375, "rewards/rejected": -35.928123474121094, "step": 5430 }, { "epoch": 1.3716005420225001, "grad_norm": 0.0008110233466140926, "learning_rate": 3.296360093600329e-07, "logits/chosen": 0.22292175889015198, "logits/rejected": NaN, "logps/chosen": -184.44375610351562, "logps/rejected": -868.2000122070312, "loss": 0.0209, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.357324123382568, "rewards/margins": 30.874217987060547, "rewards/rejected": -35.21875, "step": 5440 }, { "epoch": 1.3741215769073205, "grad_norm": 0.017791559919714928, "learning_rate": 3.2894040683141755e-07, "logits/chosen": 0.10930176079273224, "logits/rejected": 0.45832520723342896, "logps/chosen": -184.953125, "logps/rejected": -868.5, "loss": 0.0198, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.138449192047119, "rewards/margins": 31.196874618530273, "rewards/rejected": -35.31718826293945, "step": 5450 }, { "epoch": 1.3766426117921406, "grad_norm": 123.06787109375, "learning_rate": 3.2824412481436766e-07, "logits/chosen": 0.21821288764476776, "logits/rejected": NaN, "logps/chosen": -190.63516235351562, "logps/rejected": -870.375, "loss": 0.0507, "rewards/accuracies": 0.984375, "rewards/chosen": -4.124267578125, "rewards/margins": 30.989843368530273, "rewards/rejected": -35.11249923706055, "step": 5460 }, { "epoch": 1.379163646676961, "grad_norm": 23.583995819091797, "learning_rate": 3.275471693022088e-07, "logits/chosen": 0.23296508193016052, "logits/rejected": 0.49132537841796875, "logps/chosen": -176.33438110351562, "logps/rejected": -858.5, "loss": 0.0272, "rewards/accuracies": 0.984375, "rewards/chosen": -4.176110744476318, "rewards/margins": 30.098438262939453, "rewards/rejected": -34.28593826293945, "step": 5470 }, { "epoch": 1.3816846815617811, "grad_norm": 221.44186401367188, "learning_rate": 3.2684954629406436e-07, "logits/chosen": 0.3171935975551605, "logits/rejected": 0.6527328491210938, "logps/chosen": -177.2265625, "logps/rejected": -868.0250244140625, "loss": 0.0403, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9609007835388184, "rewards/margins": 31.592187881469727, "rewards/rejected": -35.5625, "step": 5480 }, { "epoch": 1.3842057164466013, "grad_norm": 8.655308723449707, "learning_rate": 3.2615126179480263e-07, "logits/chosen": 0.169148251414299, "logits/rejected": NaN, "logps/chosen": -188.5625, "logps/rejected": -878.7000122070312, "loss": 0.1039, "rewards/accuracies": 0.96875, "rewards/chosen": -4.371142387390137, "rewards/margins": 29.642187118530273, "rewards/rejected": -34.017967224121094, "step": 5490 }, { "epoch": 1.3867267513314214, "grad_norm": 0.2005978673696518, "learning_rate": 3.254523218149861e-07, "logits/chosen": 0.3176635801792145, "logits/rejected": 0.8510090112686157, "logps/chosen": -172.0968780517578, "logps/rejected": -848.5, "loss": 0.0285, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.373095512390137, "rewards/margins": 29.98046875, "rewards/rejected": -34.35546875, "step": 5500 }, { "epoch": 1.3892477862162418, "grad_norm": 115.61199951171875, "learning_rate": 3.247527323708192e-07, "logits/chosen": 0.23228302597999573, "logits/rejected": NaN, "logps/chosen": -186.7468719482422, "logps/rejected": -901.5999755859375, "loss": 0.0169, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.924145460128784, "rewards/margins": 29.996875762939453, "rewards/rejected": -33.94062423706055, "step": 5510 }, { "epoch": 1.391768821101062, "grad_norm": 118.89640045166016, "learning_rate": 3.2405249948409686e-07, "logits/chosen": 0.4159301817417145, "logits/rejected": NaN, "logps/chosen": -161.1828155517578, "logps/rejected": -829.2000122070312, "loss": 0.0279, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.986886501312256, "rewards/margins": 29.384374618530273, "rewards/rejected": -33.368751525878906, "step": 5520 }, { "epoch": 1.3942898559858823, "grad_norm": 0.01832985319197178, "learning_rate": 3.2335162918215256e-07, "logits/chosen": 0.39665526151657104, "logits/rejected": NaN, "logps/chosen": -180.58438110351562, "logps/rejected": -905.7999877929688, "loss": 0.0449, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.904772758483887, "rewards/margins": 33.65156173706055, "rewards/rejected": -38.55781173706055, "step": 5530 }, { "epoch": 1.3968108908707024, "grad_norm": 0.00011498563981149346, "learning_rate": 3.2265012749780595e-07, "logits/chosen": 0.44072723388671875, "logits/rejected": NaN, "logps/chosen": -169.47811889648438, "logps/rejected": -903.9500122070312, "loss": 0.1382, "rewards/accuracies": 0.984375, "rewards/chosen": -4.145337104797363, "rewards/margins": 35.34453201293945, "rewards/rejected": -39.49687576293945, "step": 5540 }, { "epoch": 1.3993319257555226, "grad_norm": 13.224052429199219, "learning_rate": 3.219480004693121e-07, "logits/chosen": 0.2711044251918793, "logits/rejected": NaN, "logps/chosen": -209.51718139648438, "logps/rejected": -942.3250122070312, "loss": 0.202, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -5.240405082702637, "rewards/margins": 35.736717224121094, "rewards/rejected": -40.95703125, "step": 5550 }, { "epoch": 1.401852960640343, "grad_norm": 0.09170304983854294, "learning_rate": 3.212452541403082e-07, "logits/chosen": 0.23853150010108948, "logits/rejected": NaN, "logps/chosen": -190.03750610351562, "logps/rejected": -901.375, "loss": 0.0531, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.452807426452637, "rewards/margins": 33.88984298706055, "rewards/rejected": -38.32500076293945, "step": 5560 }, { "epoch": 1.404373995525163, "grad_norm": 4.789816379547119, "learning_rate": 3.205418945597624e-07, "logits/chosen": 0.20622864365577698, "logits/rejected": 0.7585471868515015, "logps/chosen": -175.99374389648438, "logps/rejected": -883.7999877929688, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.0293731689453125, "rewards/margins": 33.33124923706055, "rewards/rejected": -37.34843826293945, "step": 5570 }, { "epoch": 1.4068950304099834, "grad_norm": 0.005710373632609844, "learning_rate": 3.198379277819214e-07, "logits/chosen": 0.15541687607765198, "logits/rejected": 0.7833083868026733, "logps/chosen": -206.33749389648438, "logps/rejected": -910.8499755859375, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.53564453125, "rewards/margins": 33.537498474121094, "rewards/rejected": -38.079689025878906, "step": 5580 }, { "epoch": 1.4094160652948036, "grad_norm": 0.00019012155826203525, "learning_rate": 3.191333598662585e-07, "logits/chosen": 0.3066039979457855, "logits/rejected": NaN, "logps/chosen": -185.93905639648438, "logps/rejected": -897.2000122070312, "loss": 0.0955, "rewards/accuracies": 0.984375, "rewards/chosen": -4.856036186218262, "rewards/margins": 34.064064025878906, "rewards/rejected": -38.90937423706055, "step": 5590 }, { "epoch": 1.4119371001796237, "grad_norm": 4.7835211753845215, "learning_rate": 3.184281968774214e-07, "logits/chosen": 0.5444244146347046, "logits/rejected": NaN, "logps/chosen": -180.05313110351562, "logps/rejected": -899.8499755859375, "loss": 0.0222, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.881268501281738, "rewards/margins": 33.592185974121094, "rewards/rejected": -38.46562576293945, "step": 5600 }, { "epoch": 1.4144581350644438, "grad_norm": 0.16454458236694336, "learning_rate": 3.1772244488517965e-07, "logits/chosen": 0.435446172952652, "logits/rejected": NaN, "logps/chosen": -177.25, "logps/rejected": -923.0499877929688, "loss": 0.0463, "rewards/accuracies": 0.984375, "rewards/chosen": -4.910912990570068, "rewards/margins": 35.27031326293945, "rewards/rejected": -40.19062423706055, "step": 5610 }, { "epoch": 1.4169791699492642, "grad_norm": 71.71595764160156, "learning_rate": 3.170161099643731e-07, "logits/chosen": 0.583343505859375, "logits/rejected": 1.0093597173690796, "logps/chosen": -180.8203125, "logps/rejected": -913.0999755859375, "loss": 0.0262, "rewards/accuracies": 0.984375, "rewards/chosen": -4.870471000671387, "rewards/margins": 34.75, "rewards/rejected": -39.631248474121094, "step": 5620 }, { "epoch": 1.4195002048340843, "grad_norm": 0.012614699080586433, "learning_rate": 3.163091981948591e-07, "logits/chosen": 0.5020126104354858, "logits/rejected": 0.912798285484314, "logps/chosen": -186.0437469482422, "logps/rejected": -919.7999877929688, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.694451808929443, "rewards/margins": 35.19843673706055, "rewards/rejected": -39.89531326293945, "step": 5630 }, { "epoch": 1.4220212397189047, "grad_norm": 0.023576028645038605, "learning_rate": 3.1560171566146017e-07, "logits/chosen": 0.45921021699905396, "logits/rejected": NaN, "logps/chosen": -164.0124969482422, "logps/rejected": -870.2000122070312, "loss": 0.0902, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.366223335266113, "rewards/margins": 34.568748474121094, "rewards/rejected": -38.9375, "step": 5640 }, { "epoch": 1.4245422746037248, "grad_norm": 0.001523735118098557, "learning_rate": 3.148936684539118e-07, "logits/chosen": 0.5183387994766235, "logits/rejected": NaN, "logps/chosen": -188.99374389648438, "logps/rejected": -879.5499877929688, "loss": 0.0529, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.077038764953613, "rewards/margins": 33.66718673706055, "rewards/rejected": -38.743751525878906, "step": 5650 }, { "epoch": 1.427063309488545, "grad_norm": 0.6794477105140686, "learning_rate": 3.1418506266681e-07, "logits/chosen": 0.5622192621231079, "logits/rejected": NaN, "logps/chosen": -181.82186889648438, "logps/rejected": -905.7999877929688, "loss": 0.026, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.9195556640625, "rewards/margins": 35.078125, "rewards/rejected": -39.98906326293945, "step": 5660 }, { "epoch": 1.4295843443733653, "grad_norm": 0.004153388552367687, "learning_rate": 3.1347590439955893e-07, "logits/chosen": 0.49744874238967896, "logits/rejected": NaN, "logps/chosen": -205.640625, "logps/rejected": -955.4500122070312, "loss": 0.0054, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.161755561828613, "rewards/margins": 35.662498474121094, "rewards/rejected": -40.829689025878906, "step": 5670 }, { "epoch": 1.4321053792581855, "grad_norm": 0.061656780540943146, "learning_rate": 3.127661997563181e-07, "logits/chosen": 0.4654415249824524, "logits/rejected": 0.9173675775527954, "logps/chosen": -175.9031219482422, "logps/rejected": -918.8499755859375, "loss": 0.017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.513879299163818, "rewards/margins": 34.74687576293945, "rewards/rejected": -39.26874923706055, "step": 5680 }, { "epoch": 1.4346264141430056, "grad_norm": 0.502906322479248, "learning_rate": 3.1205595484595006e-07, "logits/chosen": 0.3243041932582855, "logits/rejected": NaN, "logps/chosen": -185.0812530517578, "logps/rejected": -921.6500244140625, "loss": 0.0119, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.424023628234863, "rewards/margins": 34.03593826293945, "rewards/rejected": -38.459373474121094, "step": 5690 }, { "epoch": 1.437147449027826, "grad_norm": 0.0007040489581413567, "learning_rate": 3.1134517578196786e-07, "logits/chosen": 0.492767333984375, "logits/rejected": NaN, "logps/chosen": -176.3359375, "logps/rejected": -839.9749755859375, "loss": 0.0384, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.468865871429443, "rewards/margins": 30.583593368530273, "rewards/rejected": -35.05156326293945, "step": 5700 }, { "epoch": 1.4396684839126461, "grad_norm": 0.1586219221353531, "learning_rate": 3.106338686824822e-07, "logits/chosen": 0.3844154477119446, "logits/rejected": NaN, "logps/chosen": -162.5625, "logps/rejected": -869.8499755859375, "loss": 0.0173, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.116247653961182, "rewards/margins": 32.283592224121094, "rewards/rejected": -36.40312576293945, "step": 5710 }, { "epoch": 1.4421895187974663, "grad_norm": 1.3687118291854858, "learning_rate": 3.0992203967014907e-07, "logits/chosen": 0.19659423828125, "logits/rejected": NaN, "logps/chosen": -185.08438110351562, "logps/rejected": -877.0999755859375, "loss": 0.0433, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5679078102111816, "rewards/margins": 30.996875762939453, "rewards/rejected": -34.560935974121094, "step": 5720 }, { "epoch": 1.4447105536822866, "grad_norm": 0.007584299426525831, "learning_rate": 3.0920969487211667e-07, "logits/chosen": 0.10819397121667862, "logits/rejected": 0.5118011236190796, "logps/chosen": -180.14688110351562, "logps/rejected": -858.5250244140625, "loss": 0.0598, "rewards/accuracies": 0.984375, "rewards/chosen": -3.211261034011841, "rewards/margins": 29.426563262939453, "rewards/rejected": -32.64374923706055, "step": 5730 }, { "epoch": 1.4472315885671068, "grad_norm": 1.4735720469616354e-05, "learning_rate": 3.0849684041997314e-07, "logits/chosen": 0.28924864530563354, "logits/rejected": NaN, "logps/chosen": -169.46875, "logps/rejected": -890.3499755859375, "loss": 0.0844, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.171777248382568, "rewards/margins": 31.764842987060547, "rewards/rejected": -35.9453125, "step": 5740 }, { "epoch": 1.4497526234519271, "grad_norm": 12.054537773132324, "learning_rate": 3.0778348244969323e-07, "logits/chosen": 0.25129395723342896, "logits/rejected": NaN, "logps/chosen": -186.0703125, "logps/rejected": -891.7999877929688, "loss": 0.0569, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.087005615234375, "rewards/margins": 33.13203048706055, "rewards/rejected": -37.20624923706055, "step": 5750 }, { "epoch": 1.4522736583367473, "grad_norm": 0.00018885507597588003, "learning_rate": 3.070696271015861e-07, "logits/chosen": 0.2258460968732834, "logits/rejected": 0.6811538934707642, "logps/chosen": -185.8390655517578, "logps/rejected": -899.75, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.935986280441284, "rewards/margins": 33.626564025878906, "rewards/rejected": -37.56562423706055, "step": 5760 }, { "epoch": 1.4547946932215674, "grad_norm": 88.6113052368164, "learning_rate": 3.0635528052024195e-07, "logits/chosen": 0.19734802842140198, "logits/rejected": NaN, "logps/chosen": -192.3562469482422, "logps/rejected": -909.375, "loss": 0.0128, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.435412406921387, "rewards/margins": 33.88750076293945, "rewards/rejected": -38.326560974121094, "step": 5770 }, { "epoch": 1.4573157281063875, "grad_norm": 248.97315979003906, "learning_rate": 3.056404488544794e-07, "logits/chosen": 0.5312668085098267, "logits/rejected": 0.833630383014679, "logps/chosen": -165.6875, "logps/rejected": -890.9500122070312, "loss": 0.0889, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.299072265625, "rewards/margins": 33.271095275878906, "rewards/rejected": -37.5546875, "step": 5780 }, { "epoch": 1.459836762991208, "grad_norm": 0.07541459053754807, "learning_rate": 3.049251382572922e-07, "logits/chosen": 0.263803094625473, "logits/rejected": NaN, "logps/chosen": -178.70938110351562, "logps/rejected": -862.6500244140625, "loss": 0.0222, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.043286323547363, "rewards/margins": 32.329689025878906, "rewards/rejected": -36.3671875, "step": 5790 }, { "epoch": 1.462357797876028, "grad_norm": 69.46263885498047, "learning_rate": 3.042093548857971e-07, "logits/chosen": 0.21806029975414276, "logits/rejected": 0.6313842535018921, "logps/chosen": -191.8625030517578, "logps/rejected": -948.2000122070312, "loss": 0.0084, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.3673095703125, "rewards/margins": 35.650001525878906, "rewards/rejected": -40.001564025878906, "step": 5800 }, { "epoch": 1.4648788327608484, "grad_norm": 0.05922152101993561, "learning_rate": 3.034931049011799e-07, "logits/chosen": 0.33344727754592896, "logits/rejected": 0.8459151983261108, "logps/chosen": -179.56875610351562, "logps/rejected": -897.4500122070312, "loss": 0.1434, "rewards/accuracies": 0.984375, "rewards/chosen": -4.518750190734863, "rewards/margins": 34.69218826293945, "rewards/rejected": -39.2109375, "step": 5810 }, { "epoch": 1.4673998676456685, "grad_norm": 6.264918804168701, "learning_rate": 3.027763944686429e-07, "logits/chosen": 0.322286993265152, "logits/rejected": NaN, "logps/chosen": -167.640625, "logps/rejected": -851.9000244140625, "loss": 0.0217, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.033807277679443, "rewards/margins": 32.657814025878906, "rewards/rejected": -36.704689025878906, "step": 5820 }, { "epoch": 1.4699209025304887, "grad_norm": 1.2613695859909058, "learning_rate": 3.0205922975735185e-07, "logits/chosen": 0.164256289601326, "logits/rejected": NaN, "logps/chosen": -190.5281219482422, "logps/rejected": -884.8499755859375, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.091992378234863, "rewards/margins": 31.939062118530273, "rewards/rejected": -36.040626525878906, "step": 5830 }, { "epoch": 1.472441937415309, "grad_norm": 0.6864455938339233, "learning_rate": 3.0134161694038266e-07, "logits/chosen": 0.12196769565343857, "logits/rejected": NaN, "logps/chosen": -182.7624969482422, "logps/rejected": -907.7249755859375, "loss": 0.0473, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6845855712890625, "rewards/margins": 32.0859375, "rewards/rejected": -35.76874923706055, "step": 5840 }, { "epoch": 1.4749629723001292, "grad_norm": 0.01639617048203945, "learning_rate": 3.0062356219466856e-07, "logits/chosen": 0.42818909883499146, "logits/rejected": NaN, "logps/chosen": -181.0593719482422, "logps/rejected": -807.7249755859375, "loss": 0.1598, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.899645805358887, "rewards/margins": 28.586719512939453, "rewards/rejected": -33.462501525878906, "step": 5850 }, { "epoch": 1.4774840071849495, "grad_norm": 201.4630889892578, "learning_rate": 2.999050717009463e-07, "logits/chosen": 0.34768447279930115, "logits/rejected": 0.705065906047821, "logps/chosen": -174.03750610351562, "logps/rejected": -871.0, "loss": 0.0576, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.354052543640137, "rewards/margins": 30.571874618530273, "rewards/rejected": -34.92499923706055, "step": 5860 }, { "epoch": 1.4800050420697697, "grad_norm": 129.50820922851562, "learning_rate": 2.991861516437039e-07, "logits/chosen": 0.008242798037827015, "logits/rejected": NaN, "logps/chosen": -189.50936889648438, "logps/rejected": -852.1749877929688, "loss": 0.1065, "rewards/accuracies": 0.984375, "rewards/chosen": -4.389624118804932, "rewards/margins": 28.447656631469727, "rewards/rejected": -32.826560974121094, "step": 5870 }, { "epoch": 1.4825260769545898, "grad_norm": 0.00023811578284949064, "learning_rate": 2.984668082111265e-07, "logits/chosen": 0.1350662261247635, "logits/rejected": 0.5300140380859375, "logps/chosen": -173.27499389648438, "logps/rejected": -832.5999755859375, "loss": 0.0053, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7204651832580566, "rewards/margins": 27.919530868530273, "rewards/rejected": -31.6328125, "step": 5880 }, { "epoch": 1.48504711183941, "grad_norm": 0.016716480255126953, "learning_rate": 2.977470475950436e-07, "logits/chosen": 0.10697784274816513, "logits/rejected": NaN, "logps/chosen": -175.484375, "logps/rejected": -848.0499877929688, "loss": 0.1036, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6146607398986816, "rewards/margins": 28.005468368530273, "rewards/rejected": -31.622655868530273, "step": 5890 }, { "epoch": 1.4875681467242303, "grad_norm": 0.05676331743597984, "learning_rate": 2.9702687599087587e-07, "logits/chosen": 0.04328308254480362, "logits/rejected": NaN, "logps/chosen": -151.2843780517578, "logps/rejected": -888.2000122070312, "loss": 0.0076, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.0963134765625, "rewards/margins": 31.837499618530273, "rewards/rejected": -34.9375, "step": 5900 }, { "epoch": 1.4900891816090505, "grad_norm": 181.16551208496094, "learning_rate": 2.963062995975814e-07, "logits/chosen": 0.03299865871667862, "logits/rejected": NaN, "logps/chosen": -185.82186889648438, "logps/rejected": -876.2000122070312, "loss": 0.0139, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.246533393859863, "rewards/margins": 29.987499237060547, "rewards/rejected": -34.22968673706055, "step": 5910 }, { "epoch": 1.4926102164938708, "grad_norm": 0.5272963047027588, "learning_rate": 2.955853246176024e-07, "logits/chosen": 0.0657196044921875, "logits/rejected": NaN, "logps/chosen": -186.38125610351562, "logps/rejected": -813.3499755859375, "loss": 0.0127, "rewards/accuracies": 0.984375, "rewards/chosen": -3.386523485183716, "rewards/margins": 26.931249618530273, "rewards/rejected": -30.315624237060547, "step": 5920 }, { "epoch": 1.495131251378691, "grad_norm": 0.2583748400211334, "learning_rate": 2.9486395725681233e-07, "logits/chosen": 0.044586181640625, "logits/rejected": NaN, "logps/chosen": -185.6125030517578, "logps/rejected": -866.75, "loss": 0.0326, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.675549268722534, "rewards/margins": 28.901561737060547, "rewards/rejected": -32.571876525878906, "step": 5930 }, { "epoch": 1.497652286263511, "grad_norm": 25.118654251098633, "learning_rate": 2.941422037244618e-07, "logits/chosen": 0.01375427283346653, "logits/rejected": NaN, "logps/chosen": -176.10311889648438, "logps/rejected": -827.4500122070312, "loss": 0.1017, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.426776170730591, "rewards/margins": 28.279687881469727, "rewards/rejected": -31.700000762939453, "step": 5940 }, { "epoch": 1.5001733211483312, "grad_norm": 0.006320135667920113, "learning_rate": 2.9342007023312563e-07, "logits/chosen": 0.08563232421875, "logits/rejected": NaN, "logps/chosen": -175.7546844482422, "logps/rejected": -864.5, "loss": 0.0723, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.9697508811950684, "rewards/margins": 29.525781631469727, "rewards/rejected": -33.48906326293945, "step": 5950 }, { "epoch": 1.5026943560331516, "grad_norm": 0.675847053527832, "learning_rate": 2.9269756299864906e-07, "logits/chosen": 0.05838165432214737, "logits/rejected": NaN, "logps/chosen": -172.89999389648438, "logps/rejected": -847.3250122070312, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.141523838043213, "rewards/margins": 29.274219512939453, "rewards/rejected": -33.41093826293945, "step": 5960 }, { "epoch": 1.505215390917972, "grad_norm": 0.0013099682983011007, "learning_rate": 2.9197468824009445e-07, "logits/chosen": 0.15484008193016052, "logits/rejected": NaN, "logps/chosen": -176.0968780517578, "logps/rejected": -812.875, "loss": 0.0963, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.91162109375, "rewards/margins": 28.626562118530273, "rewards/rejected": -32.52421951293945, "step": 5970 }, { "epoch": 1.507736425802792, "grad_norm": 32.84103012084961, "learning_rate": 2.912514521796876e-07, "logits/chosen": 0.17289122939109802, "logits/rejected": 0.5233154296875, "logps/chosen": -178.2062530517578, "logps/rejected": -847.4500122070312, "loss": 0.0042, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.321093559265137, "rewards/margins": 29.676563262939453, "rewards/rejected": -33.99687576293945, "step": 5980 }, { "epoch": 1.5102574606876122, "grad_norm": 16.875097274780273, "learning_rate": 2.905278610427643e-07, "logits/chosen": 0.3131866455078125, "logits/rejected": 0.6871551275253296, "logps/chosen": -190.296875, "logps/rejected": -873.9000244140625, "loss": 0.0589, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.357824802398682, "rewards/margins": 30.714061737060547, "rewards/rejected": -35.053123474121094, "step": 5990 }, { "epoch": 1.5127784955724324, "grad_norm": 12.09361743927002, "learning_rate": 2.898039210577166e-07, "logits/chosen": 0.34908753633499146, "logits/rejected": NaN, "logps/chosen": -184.44375610351562, "logps/rejected": -855.8499755859375, "loss": 0.0381, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.162072658538818, "rewards/margins": 30.632030487060547, "rewards/rejected": -34.787498474121094, "step": 6000 }, { "epoch": 1.5152995304572527, "grad_norm": 1.7319144010543823, "learning_rate": 2.890796384559395e-07, "logits/chosen": 0.4012710452079773, "logits/rejected": NaN, "logps/chosen": -188.6906280517578, "logps/rejected": -862.0999755859375, "loss": 0.0687, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.642333984375, "rewards/margins": 30.965625762939453, "rewards/rejected": -35.59687423706055, "step": 6010 }, { "epoch": 1.5178205653420729, "grad_norm": 0.03293292969465256, "learning_rate": 2.88355019471777e-07, "logits/chosen": 0.5435638427734375, "logits/rejected": 1.025054931640625, "logps/chosen": -185.4812469482422, "logps/rejected": -882.1500244140625, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.758008003234863, "rewards/margins": 33.1328125, "rewards/rejected": -38.876564025878906, "step": 6020 }, { "epoch": 1.5203416002268932, "grad_norm": 21.781160354614258, "learning_rate": 2.876300703424683e-07, "logits/chosen": 0.5136001706123352, "logits/rejected": NaN, "logps/chosen": -202.5, "logps/rejected": -912.0750122070312, "loss": 0.2034, "rewards/accuracies": 0.96875, "rewards/chosen": -6.126416206359863, "rewards/margins": 33.22734451293945, "rewards/rejected": -39.359375, "step": 6030 }, { "epoch": 1.5228626351117134, "grad_norm": 0.12210700660943985, "learning_rate": 2.8690479730809504e-07, "logits/chosen": 0.3735519349575043, "logits/rejected": NaN, "logps/chosen": -188.3562469482422, "logps/rejected": -895.6749877929688, "loss": 0.0456, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.496411323547363, "rewards/margins": 33.364845275878906, "rewards/rejected": -37.87187576293945, "step": 6040 }, { "epoch": 1.5253836699965335, "grad_norm": 0.009595613926649094, "learning_rate": 2.861792066115261e-07, "logits/chosen": 0.45885926485061646, "logits/rejected": NaN, "logps/chosen": -184.1687469482422, "logps/rejected": -913.4500122070312, "loss": 0.023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.659814357757568, "rewards/margins": 34.4296875, "rewards/rejected": -39.09375, "step": 6050 }, { "epoch": 1.5279047048813537, "grad_norm": 109.49754333496094, "learning_rate": 2.8545330449836525e-07, "logits/chosen": 0.43326109647750854, "logits/rejected": NaN, "logps/chosen": -198.6953125, "logps/rejected": -908.9500122070312, "loss": 0.099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.3087158203125, "rewards/margins": 34.446876525878906, "rewards/rejected": -39.765625, "step": 6060 }, { "epoch": 1.530425739766174, "grad_norm": 0.0014539376134052873, "learning_rate": 2.847270972168965e-07, "logits/chosen": 0.6994270086288452, "logits/rejected": 1.275177001953125, "logps/chosen": -194.4562530517578, "logps/rejected": -910.5750122070312, "loss": 0.0203, "rewards/accuracies": 0.984375, "rewards/chosen": -6.027685642242432, "rewards/margins": 34.700782775878906, "rewards/rejected": -40.732810974121094, "step": 6070 }, { "epoch": 1.5329467746509944, "grad_norm": 288.06097412109375, "learning_rate": 2.8400059101803077e-07, "logits/chosen": 0.49375152587890625, "logits/rejected": NaN, "logps/chosen": -195.5812530517578, "logps/rejected": -933.3499755859375, "loss": 0.1516, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -5.952588081359863, "rewards/margins": 35.4921875, "rewards/rejected": -41.44843673706055, "step": 6080 }, { "epoch": 1.5354678095358145, "grad_norm": 0.06282804906368256, "learning_rate": 2.8327379215525194e-07, "logits/chosen": 0.501007080078125, "logits/rejected": 0.9951232671737671, "logps/chosen": -184.17813110351562, "logps/rejected": -912.4249877929688, "loss": 0.0485, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.4116363525390625, "rewards/margins": 34.140625, "rewards/rejected": -39.54999923706055, "step": 6090 }, { "epoch": 1.5379888444206347, "grad_norm": 96.99334716796875, "learning_rate": 2.825467068845629e-07, "logits/chosen": 0.5646225214004517, "logits/rejected": 1.0760681629180908, "logps/chosen": -172.20938110351562, "logps/rejected": -898.6500244140625, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.930322170257568, "rewards/margins": 34.015625, "rewards/rejected": -38.9453125, "step": 6100 }, { "epoch": 1.5405098793054548, "grad_norm": 0.7479487061500549, "learning_rate": 2.8181934146443215e-07, "logits/chosen": 0.398397833108902, "logits/rejected": NaN, "logps/chosen": -164.67813110351562, "logps/rejected": -874.9749755859375, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.114773750305176, "rewards/margins": 34.118751525878906, "rewards/rejected": -38.220314025878906, "step": 6110 }, { "epoch": 1.5430309141902752, "grad_norm": 0.004042624495923519, "learning_rate": 2.8109170215573925e-07, "logits/chosen": 0.4764648377895355, "logits/rejected": NaN, "logps/chosen": -192.3125, "logps/rejected": -894.7999877929688, "loss": 0.0309, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.963208198547363, "rewards/margins": 31.631250381469727, "rewards/rejected": -36.610939025878906, "step": 6120 }, { "epoch": 1.5455519490750953, "grad_norm": 0.011180522851645947, "learning_rate": 2.803637952217214e-07, "logits/chosen": 0.3928115963935852, "logits/rejected": 0.751385509967804, "logps/chosen": -170.68124389648438, "logps/rejected": -848.7750244140625, "loss": 0.0241, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.363818168640137, "rewards/margins": 30.258594512939453, "rewards/rejected": -34.62968826293945, "step": 6130 }, { "epoch": 1.5480729839599157, "grad_norm": 0.08293312042951584, "learning_rate": 2.796356269279195e-07, "logits/chosen": 0.3579162657260895, "logits/rejected": NaN, "logps/chosen": -184.5859375, "logps/rejected": -898.8499755859375, "loss": 0.0815, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.30157470703125, "rewards/margins": 32.451560974121094, "rewards/rejected": -36.73749923706055, "step": 6140 }, { "epoch": 1.5505940188447358, "grad_norm": 10.715765953063965, "learning_rate": 2.7890720354212413e-07, "logits/chosen": 0.503765881061554, "logits/rejected": NaN, "logps/chosen": -177.375, "logps/rejected": -899.2000122070312, "loss": 0.0114, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.565673828125, "rewards/margins": 33.484375, "rewards/rejected": -38.05937576293945, "step": 6150 }, { "epoch": 1.553115053729556, "grad_norm": 0.0193801112473011, "learning_rate": 2.7817853133432145e-07, "logits/chosen": 0.44508057832717896, "logits/rejected": NaN, "logps/chosen": -183.0749969482422, "logps/rejected": -905.875, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.484551906585693, "rewards/margins": 34.87968826293945, "rewards/rejected": -39.359375, "step": 6160 }, { "epoch": 1.555636088614376, "grad_norm": 80.15372467041016, "learning_rate": 2.774496165766394e-07, "logits/chosen": 0.38025665283203125, "logits/rejected": NaN, "logps/chosen": -183.33438110351562, "logps/rejected": -920.2999877929688, "loss": 0.0719, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.006396293640137, "rewards/margins": 35.970314025878906, "rewards/rejected": -40.959373474121094, "step": 6170 }, { "epoch": 1.5581571234991964, "grad_norm": 0.002175786765292287, "learning_rate": 2.76720465543294e-07, "logits/chosen": 0.6227203607559204, "logits/rejected": 1.0214812755584717, "logps/chosen": -199.72030639648438, "logps/rejected": -926.3499755859375, "loss": 0.0571, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.197412014007568, "rewards/margins": 35.1171875, "rewards/rejected": -41.30781173706055, "step": 6180 }, { "epoch": 1.5606781583840168, "grad_norm": 0.006167754530906677, "learning_rate": 2.759910845105347e-07, "logits/chosen": 0.558306872844696, "logits/rejected": 0.6933380365371704, "logps/chosen": -196.78750610351562, "logps/rejected": -990.9000244140625, "loss": 0.1439, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -6.7412109375, "rewards/margins": 36.83906173706055, "rewards/rejected": -43.57343673706055, "step": 6190 }, { "epoch": 1.563199193268837, "grad_norm": 119.33955383300781, "learning_rate": 2.7526147975659085e-07, "logits/chosen": 0.45132142305374146, "logits/rejected": NaN, "logps/chosen": -190.4718780517578, "logps/rejected": -910.8499755859375, "loss": 0.0658, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -6.051098823547363, "rewards/margins": 36.028907775878906, "rewards/rejected": -42.087501525878906, "step": 6200 }, { "epoch": 1.565720228153657, "grad_norm": 0.17723068594932556, "learning_rate": 2.7453165756161745e-07, "logits/chosen": 0.5469115972518921, "logits/rejected": 0.7574554681777954, "logps/chosen": -164.06875610351562, "logps/rejected": -935.4749755859375, "loss": 0.0221, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.140356540679932, "rewards/margins": 35.791404724121094, "rewards/rejected": -39.950782775878906, "step": 6210 }, { "epoch": 1.5682412630384772, "grad_norm": 0.03730883449316025, "learning_rate": 2.738016242076411e-07, "logits/chosen": 0.37793272733688354, "logits/rejected": 0.8537226915359497, "logps/chosen": -182.1999969482422, "logps/rejected": -925.6500244140625, "loss": 0.0679, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.8392333984375, "rewards/margins": 35.67656326293945, "rewards/rejected": -40.506248474121094, "step": 6220 }, { "epoch": 1.5707622979232974, "grad_norm": 0.03400114178657532, "learning_rate": 2.7307138597850616e-07, "logits/chosen": 0.35713499784469604, "logits/rejected": NaN, "logps/chosen": -180.4031219482422, "logps/rejected": -903.0750122070312, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.547900199890137, "rewards/margins": 34.359375, "rewards/rejected": -38.915626525878906, "step": 6230 }, { "epoch": 1.5732833328081177, "grad_norm": 0.302316814661026, "learning_rate": 2.723409491598202e-07, "logits/chosen": 0.5575591921806335, "logits/rejected": 0.7347686886787415, "logps/chosen": -158.89999389648438, "logps/rejected": -931.8250122070312, "loss": 0.0774, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.000781059265137, "rewards/margins": 35.735939025878906, "rewards/rejected": -39.740623474121094, "step": 6240 }, { "epoch": 1.575804367692938, "grad_norm": 0.8600789904594421, "learning_rate": 2.716103200389005e-07, "logits/chosen": 0.4734329283237457, "logits/rejected": NaN, "logps/chosen": -188.57186889648438, "logps/rejected": -929.0499877929688, "loss": 0.0641, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.96142578125, "rewards/margins": 35.076560974121094, "rewards/rejected": -40.0390625, "step": 6250 }, { "epoch": 1.5783254025777582, "grad_norm": 169.1731719970703, "learning_rate": 2.7087950490471933e-07, "logits/chosen": 0.5609542727470398, "logits/rejected": NaN, "logps/chosen": -178.77188110351562, "logps/rejected": -884.0999755859375, "loss": 0.1779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.286913871765137, "rewards/margins": 35.32343673706055, "rewards/rejected": -40.603126525878906, "step": 6260 }, { "epoch": 1.5808464374625784, "grad_norm": 241.3647918701172, "learning_rate": 2.701485100478503e-07, "logits/chosen": 0.534881591796875, "logits/rejected": 1.1018707752227783, "logps/chosen": -200.20938110351562, "logps/rejected": -1021.2000122070312, "loss": 0.1125, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.70068359375, "rewards/margins": 40.482810974121094, "rewards/rejected": -46.201560974121094, "step": 6270 }, { "epoch": 1.5833674723473985, "grad_norm": 161.32952880859375, "learning_rate": 2.6941734176041375e-07, "logits/chosen": 0.601306140422821, "logits/rejected": 1.0089843273162842, "logps/chosen": -182.2062530517578, "logps/rejected": -929.2999877929688, "loss": 0.0155, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.365063667297363, "rewards/margins": 36.375, "rewards/rejected": -41.75312423706055, "step": 6280 }, { "epoch": 1.5858885072322189, "grad_norm": 0.002476937836036086, "learning_rate": 2.6868600633602313e-07, "logits/chosen": 0.682598888874054, "logits/rejected": NaN, "logps/chosen": -177.05313110351562, "logps/rejected": -931.5, "loss": 0.0092, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.748193264007568, "rewards/margins": 38.076560974121094, "rewards/rejected": -42.81718826293945, "step": 6290 }, { "epoch": 1.588409542117039, "grad_norm": 309.4949645996094, "learning_rate": 2.6795451006973026e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -192.3781280517578, "logps/rejected": -921.8499755859375, "loss": 0.0335, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.471337795257568, "rewards/margins": 34.423439025878906, "rewards/rejected": -38.90703201293945, "step": 6300 }, { "epoch": 1.5909305770018594, "grad_norm": 0.397205114364624, "learning_rate": 2.672228592579715e-07, "logits/chosen": 0.37930601835250854, "logits/rejected": 1.046417236328125, "logps/chosen": -181.0437469482422, "logps/rejected": -927.2999877929688, "loss": 0.0817, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.356835842132568, "rewards/margins": 35.904685974121094, "rewards/rejected": -40.259376525878906, "step": 6310 }, { "epoch": 1.5934516118866795, "grad_norm": 0.0034078657627105713, "learning_rate": 2.6649106019851385e-07, "logits/chosen": 0.5010501742362976, "logits/rejected": NaN, "logps/chosen": -183.4375, "logps/rejected": -932.3250122070312, "loss": 0.0945, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.055346488952637, "rewards/margins": 35.85546875, "rewards/rejected": -40.90156173706055, "step": 6320 }, { "epoch": 1.5959726467714996, "grad_norm": 0.0008646405767649412, "learning_rate": 2.657591191903998e-07, "logits/chosen": 0.540631115436554, "logits/rejected": NaN, "logps/chosen": -187.35000610351562, "logps/rejected": -892.6500244140625, "loss": 0.0038, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.110742092132568, "rewards/margins": 33.4765625, "rewards/rejected": -38.571876525878906, "step": 6330 }, { "epoch": 1.5984936816563198, "grad_norm": 266.9444885253906, "learning_rate": 2.6502704253389413e-07, "logits/chosen": 0.8308807611465454, "logits/rejected": NaN, "logps/chosen": -174.1531219482422, "logps/rejected": -937.4500122070312, "loss": 0.0858, "rewards/accuracies": 0.984375, "rewards/chosen": -5.5238189697265625, "rewards/margins": 36.764060974121094, "rewards/rejected": -42.296875, "step": 6340 }, { "epoch": 1.6010147165411401, "grad_norm": 0.15970396995544434, "learning_rate": 2.642948365304288e-07, "logits/chosen": 0.6377227902412415, "logits/rejected": NaN, "logps/chosen": -189.390625, "logps/rejected": -909.1500244140625, "loss": 0.0073, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.927343845367432, "rewards/margins": 34.15312576293945, "rewards/rejected": -39.0859375, "step": 6350 }, { "epoch": 1.6035357514259605, "grad_norm": 2.244065999984741, "learning_rate": 2.6356250748254964e-07, "logits/chosen": 0.5720916986465454, "logits/rejected": 1.089208960533142, "logps/chosen": -181.0859375, "logps/rejected": -887.5499877929688, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.553271293640137, "rewards/margins": 33.392189025878906, "rewards/rejected": -37.9609375, "step": 6360 }, { "epoch": 1.6060567863107806, "grad_norm": 0.0008298756438307464, "learning_rate": 2.6283006169386106e-07, "logits/chosen": 0.3985854983329773, "logits/rejected": 1.050512671470642, "logps/chosen": -186.49844360351562, "logps/rejected": -939.5499877929688, "loss": 0.0286, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.767382621765137, "rewards/margins": 36.064842224121094, "rewards/rejected": -40.821876525878906, "step": 6370 }, { "epoch": 1.6085778211956008, "grad_norm": 15.102959632873535, "learning_rate": 2.6209750546897276e-07, "logits/chosen": 0.49334716796875, "logits/rejected": NaN, "logps/chosen": -171.04061889648438, "logps/rejected": -892.9249877929688, "loss": 0.0615, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.445947170257568, "rewards/margins": 34.525001525878906, "rewards/rejected": -38.967185974121094, "step": 6380 }, { "epoch": 1.611098856080421, "grad_norm": 0.0026525785215198994, "learning_rate": 2.61364845113445e-07, "logits/chosen": 0.3122711181640625, "logits/rejected": NaN, "logps/chosen": -170.9406280517578, "logps/rejected": -898.5999755859375, "loss": 0.047, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9026122093200684, "rewards/margins": 34.428123474121094, "rewards/rejected": -38.32500076293945, "step": 6390 }, { "epoch": 1.6136198909652413, "grad_norm": 223.4058380126953, "learning_rate": 2.6063208693373394e-07, "logits/chosen": 0.3462890684604645, "logits/rejected": NaN, "logps/chosen": -200.234375, "logps/rejected": -888.4500122070312, "loss": 0.2582, "rewards/accuracies": 0.96875, "rewards/chosen": -4.770983695983887, "rewards/margins": 32.203125, "rewards/rejected": -36.990623474121094, "step": 6400 }, { "epoch": 1.6161409258500614, "grad_norm": 0.43258753418922424, "learning_rate": 2.598992372371383e-07, "logits/chosen": 0.19548340141773224, "logits/rejected": NaN, "logps/chosen": -171.6531219482422, "logps/rejected": -902.0250244140625, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.150988578796387, "rewards/margins": 33.845314025878906, "rewards/rejected": -37.99687576293945, "step": 6410 }, { "epoch": 1.6186619607348818, "grad_norm": 166.27064514160156, "learning_rate": 2.591663023317442e-07, "logits/chosen": 0.3000946044921875, "logits/rejected": NaN, "logps/chosen": -178.6640625, "logps/rejected": -904.5, "loss": 0.1769, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.788134813308716, "rewards/margins": 33.05859375, "rewards/rejected": -36.83984375, "step": 6420 }, { "epoch": 1.621182995619702, "grad_norm": 0.0021307547576725483, "learning_rate": 2.584332885263714e-07, "logits/chosen": 0.43063658475875854, "logits/rejected": NaN, "logps/chosen": -167.34375, "logps/rejected": -886.0999755859375, "loss": 0.027, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.632275342941284, "rewards/margins": 33.03125, "rewards/rejected": -36.66718673706055, "step": 6430 }, { "epoch": 1.623704030504522, "grad_norm": 0.018448958173394203, "learning_rate": 2.577002021305186e-07, "logits/chosen": 0.3148437440395355, "logits/rejected": NaN, "logps/chosen": -184.33438110351562, "logps/rejected": -877.6500244140625, "loss": 0.0149, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.501675605773926, "rewards/margins": 31.857030868530273, "rewards/rejected": -36.353126525878906, "step": 6440 }, { "epoch": 1.6262250653893422, "grad_norm": 0.09852965921163559, "learning_rate": 2.569670494543094e-07, "logits/chosen": 0.39621657133102417, "logits/rejected": NaN, "logps/chosen": -182.83438110351562, "logps/rejected": -905.9000244140625, "loss": 0.0432, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.563916206359863, "rewards/margins": 33.45624923706055, "rewards/rejected": -38.021873474121094, "step": 6450 }, { "epoch": 1.6287461002741626, "grad_norm": 1.4581339359283447, "learning_rate": 2.562338368084382e-07, "logits/chosen": 0.44031065702438354, "logits/rejected": 0.9739410281181335, "logps/chosen": -166.15625, "logps/rejected": -900.6500244140625, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.189099311828613, "rewards/margins": 34.842185974121094, "rewards/rejected": -39.015625, "step": 6460 }, { "epoch": 1.631267135158983, "grad_norm": 0.014941921457648277, "learning_rate": 2.555005705041152e-07, "logits/chosen": 0.41224366426467896, "logits/rejected": NaN, "logps/chosen": -193.63125610351562, "logps/rejected": -906.1500244140625, "loss": 0.1078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.150146484375, "rewards/margins": 33.30937576293945, "rewards/rejected": -38.470314025878906, "step": 6470 }, { "epoch": 1.633788170043803, "grad_norm": 0.030016258358955383, "learning_rate": 2.5476725685301257e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -198.7624969482422, "logps/rejected": -883.25, "loss": 0.1009, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.87078857421875, "rewards/margins": 31.270313262939453, "rewards/rejected": -36.154685974121094, "step": 6480 }, { "epoch": 1.6363092049286232, "grad_norm": 0.10206536948680878, "learning_rate": 2.5403390216721015e-07, "logits/chosen": 0.12936706840991974, "logits/rejected": NaN, "logps/chosen": -195.8484344482422, "logps/rejected": -892.0750122070312, "loss": 0.0272, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.388769626617432, "rewards/margins": 31.762500762939453, "rewards/rejected": -36.150001525878906, "step": 6490 }, { "epoch": 1.6388302398134433, "grad_norm": 36.599517822265625, "learning_rate": 2.533005127591409e-07, "logits/chosen": 0.12916870415210724, "logits/rejected": 0.6555099487304688, "logps/chosen": -187.44375610351562, "logps/rejected": -896.3499755859375, "loss": 0.0067, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.5064697265625, "rewards/margins": 31.134374618530273, "rewards/rejected": -35.6640625, "step": 6500 }, { "epoch": 1.6413512746982635, "grad_norm": 5.423956394195557, "learning_rate": 2.5256709494153677e-07, "logits/chosen": 0.2626098692417145, "logits/rejected": NaN, "logps/chosen": -180.8640594482422, "logps/rejected": -882.9500122070312, "loss": 0.0479, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.067822456359863, "rewards/margins": 31.585155487060547, "rewards/rejected": -35.65156173706055, "step": 6510 }, { "epoch": 1.6438723095830838, "grad_norm": 188.35255432128906, "learning_rate": 2.518336550273739e-07, "logits/chosen": 0.29042357206344604, "logits/rejected": NaN, "logps/chosen": -169.70938110351562, "logps/rejected": -874.25, "loss": 0.0565, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.134325981140137, "rewards/margins": 32.12109375, "rewards/rejected": -36.25468826293945, "step": 6520 }, { "epoch": 1.6463933444679042, "grad_norm": 6.0794940509367734e-05, "learning_rate": 2.5110019932981917e-07, "logits/chosen": 0.4166580140590668, "logits/rejected": NaN, "logps/chosen": -157.4992218017578, "logps/rejected": -873.0499877929688, "loss": 0.0104, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8951172828674316, "rewards/margins": 33.6796875, "rewards/rejected": -37.55937576293945, "step": 6530 }, { "epoch": 1.6489143793527243, "grad_norm": 143.5383758544922, "learning_rate": 2.503667341621749e-07, "logits/chosen": 0.33862534165382385, "logits/rejected": NaN, "logps/chosen": -169.94375610351562, "logps/rejected": -883.5999755859375, "loss": 0.1296, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.135705471038818, "rewards/margins": 33.10546875, "rewards/rejected": -37.24687576293945, "step": 6540 }, { "epoch": 1.6514354142375445, "grad_norm": 165.1653594970703, "learning_rate": 2.4963326583782504e-07, "logits/chosen": 0.38291627168655396, "logits/rejected": 1.0603348016738892, "logps/chosen": -187.42813110351562, "logps/rejected": -894.9249877929688, "loss": 0.0769, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.755663871765137, "rewards/margins": 33.756248474121094, "rewards/rejected": -38.52031326293945, "step": 6550 }, { "epoch": 1.6539564491223646, "grad_norm": 0.07881901413202286, "learning_rate": 2.488998006701808e-07, "logits/chosen": 0.316140741109848, "logits/rejected": 0.855725109577179, "logps/chosen": -161.30313110351562, "logps/rejected": -874.1749877929688, "loss": 0.0707, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.801129102706909, "rewards/margins": 33.017189025878906, "rewards/rejected": -36.829689025878906, "step": 6560 }, { "epoch": 1.656477484007185, "grad_norm": 1.2102409601211548, "learning_rate": 2.4816634497262607e-07, "logits/chosen": 0.26480406522750854, "logits/rejected": NaN, "logps/chosen": -183.61563110351562, "logps/rejected": -870.625, "loss": 0.0846, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.5769715309143066, "rewards/margins": 32.291404724121094, "rewards/rejected": -35.881248474121094, "step": 6570 }, { "epoch": 1.6589985188920051, "grad_norm": 119.04061126708984, "learning_rate": 2.474329050584633e-07, "logits/chosen": 0.23767700791358948, "logits/rejected": 0.5340903997421265, "logps/chosen": -181.77499389648438, "logps/rejected": -890.2999877929688, "loss": 0.0245, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.895263671875, "rewards/margins": 31.767187118530273, "rewards/rejected": -35.6796875, "step": 6580 }, { "epoch": 1.6615195537768255, "grad_norm": 0.7248159646987915, "learning_rate": 2.4669948724085906e-07, "logits/chosen": 0.2500244081020355, "logits/rejected": 0.886309802532196, "logps/chosen": -180.25936889648438, "logps/rejected": -863.7750244140625, "loss": 0.0818, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.391992092132568, "rewards/margins": 30.694530487060547, "rewards/rejected": -35.08671951293945, "step": 6590 }, { "epoch": 1.6640405886616456, "grad_norm": 141.44235229492188, "learning_rate": 2.459660978327898e-07, "logits/chosen": 0.4110275208950043, "logits/rejected": 1.0671905279159546, "logps/chosen": -174.4968719482422, "logps/rejected": -877.25, "loss": 0.0557, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.6516051292419434, "rewards/margins": 32.834373474121094, "rewards/rejected": -36.498435974121094, "step": 6600 }, { "epoch": 1.6665616235464658, "grad_norm": 312.61865234375, "learning_rate": 2.452327431469875e-07, "logits/chosen": 0.403085321187973, "logits/rejected": NaN, "logps/chosen": -162.0437469482422, "logps/rejected": -830.2249755859375, "loss": 0.1308, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.5623841285705566, "rewards/margins": 32.302345275878906, "rewards/rejected": -35.87812423706055, "step": 6610 }, { "epoch": 1.669082658431286, "grad_norm": 0.03852054476737976, "learning_rate": 2.444994294958848e-07, "logits/chosen": 0.58514404296875, "logits/rejected": NaN, "logps/chosen": -158.17813110351562, "logps/rejected": -842.0999755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.7623658180236816, "rewards/margins": 31.509374618530273, "rewards/rejected": -35.265625, "step": 6620 }, { "epoch": 1.6716036933161063, "grad_norm": 94.05976104736328, "learning_rate": 2.437661631915618e-07, "logits/chosen": 0.30764466524124146, "logits/rejected": NaN, "logps/chosen": -184.453125, "logps/rejected": -916.4749755859375, "loss": 0.0708, "rewards/accuracies": 0.984375, "rewards/chosen": -4.692431449890137, "rewards/margins": 34.13593673706055, "rewards/rejected": -38.814064025878906, "step": 6630 }, { "epoch": 1.6741247282009266, "grad_norm": 43.61452102661133, "learning_rate": 2.430329505456906e-07, "logits/chosen": 0.36845701932907104, "logits/rejected": NaN, "logps/chosen": -202.43124389648438, "logps/rejected": -941.1500244140625, "loss": 0.0254, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.946063280105591, "rewards/margins": 37.60625076293945, "rewards/rejected": -41.5625, "step": 6640 }, { "epoch": 1.6766457630857468, "grad_norm": 0.025136679410934448, "learning_rate": 2.422997978694815e-07, "logits/chosen": 0.4115707278251648, "logits/rejected": 0.9736877679824829, "logps/chosen": -173.671875, "logps/rejected": -909.2750244140625, "loss": 0.0466, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.076708793640137, "rewards/margins": 37.193748474121094, "rewards/rejected": -41.25312423706055, "step": 6650 }, { "epoch": 1.679166797970567, "grad_norm": 0.08584797382354736, "learning_rate": 2.4156671147362863e-07, "logits/chosen": NaN, "logits/rejected": 1.0963470935821533, "logps/chosen": -169.6750030517578, "logps/rejected": -966.75, "loss": 0.0768, "rewards/accuracies": 0.984375, "rewards/chosen": -4.97503662109375, "rewards/margins": 37.642189025878906, "rewards/rejected": -42.60625076293945, "step": 6660 }, { "epoch": 1.681687832855387, "grad_norm": 0.4959455132484436, "learning_rate": 2.408336976682558e-07, "logits/chosen": 0.44006651639938354, "logits/rejected": NaN, "logps/chosen": -195.0968780517578, "logps/rejected": -931.6500244140625, "loss": 0.0047, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.136315822601318, "rewards/margins": 36.373435974121094, "rewards/rejected": -41.51093673706055, "step": 6670 }, { "epoch": 1.6842088677402074, "grad_norm": 0.049879539757966995, "learning_rate": 2.4010076276286175e-07, "logits/chosen": 0.4572555422782898, "logits/rejected": 0.9236389398574829, "logps/chosen": -165.1492156982422, "logps/rejected": -892.9249877929688, "loss": 0.0669, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.330340385437012, "rewards/margins": 34.646873474121094, "rewards/rejected": -38.98906326293945, "step": 6680 }, { "epoch": 1.6867299026250275, "grad_norm": 124.24576568603516, "learning_rate": 2.3936791306626604e-07, "logits/chosen": 0.352447509765625, "logits/rejected": NaN, "logps/chosen": -195.0187530517578, "logps/rejected": -904.4249877929688, "loss": 0.0185, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.05303955078125, "rewards/margins": 34.295310974121094, "rewards/rejected": -39.357032775878906, "step": 6690 }, { "epoch": 1.689250937509848, "grad_norm": 0.10267318040132523, "learning_rate": 2.3863515488655505e-07, "logits/chosen": 0.22924843430519104, "logits/rejected": NaN, "logps/chosen": -195.2468719482422, "logps/rejected": -858.5499877929688, "loss": 0.1424, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -4.689233303070068, "rewards/margins": 32.5078125, "rewards/rejected": -37.189064025878906, "step": 6700 }, { "epoch": 1.691771972394668, "grad_norm": 0.005581216886639595, "learning_rate": 2.3790249453102724e-07, "logits/chosen": 0.17882537841796875, "logits/rejected": 0.678375244140625, "logps/chosen": -183.3796844482422, "logps/rejected": -876.6500244140625, "loss": 0.0515, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.167504787445068, "rewards/margins": 33.353515625, "rewards/rejected": -37.525779724121094, "step": 6710 }, { "epoch": 1.6942930072794882, "grad_norm": 74.74478149414062, "learning_rate": 2.3716993830613897e-07, "logits/chosen": 0.32161253690719604, "logits/rejected": NaN, "logps/chosen": -183.55624389648438, "logps/rejected": -914.5499877929688, "loss": 0.0645, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.127685546875, "rewards/margins": 34.932029724121094, "rewards/rejected": -39.06171798706055, "step": 6720 }, { "epoch": 1.6968140421643083, "grad_norm": 0.03582422435283661, "learning_rate": 2.364374925174504e-07, "logits/chosen": 0.23087462782859802, "logits/rejected": NaN, "logps/chosen": -196.3000030517578, "logps/rejected": -918.75, "loss": 0.0207, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.117846488952637, "rewards/margins": 33.79218673706055, "rewards/rejected": -37.915626525878906, "step": 6730 }, { "epoch": 1.6993350770491287, "grad_norm": 3.3346102237701416, "learning_rate": 2.3570516346957118e-07, "logits/chosen": 0.26418304443359375, "logits/rejected": NaN, "logps/chosen": -192.36874389648438, "logps/rejected": -891.9000244140625, "loss": 0.0978, "rewards/accuracies": 0.984375, "rewards/chosen": -4.76416015625, "rewards/margins": 32.803123474121094, "rewards/rejected": -37.57343673706055, "step": 6740 }, { "epoch": 1.701856111933949, "grad_norm": 57.588871002197266, "learning_rate": 2.3497295746610592e-07, "logits/chosen": 0.09332046657800674, "logits/rejected": NaN, "logps/chosen": -182.33438110351562, "logps/rejected": -842.3250122070312, "loss": 0.1382, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.391491651535034, "rewards/margins": 29.461719512939453, "rewards/rejected": -32.853126525878906, "step": 6750 }, { "epoch": 1.7043771468187692, "grad_norm": 0.011700620874762535, "learning_rate": 2.342408808096001e-07, "logits/chosen": 0.0007873534923419356, "logits/rejected": 0.608386218547821, "logps/chosen": -173.5265655517578, "logps/rejected": -841.1500244140625, "loss": 0.0581, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.1909079551696777, "rewards/margins": 29.399999618530273, "rewards/rejected": -32.595314025878906, "step": 6760 }, { "epoch": 1.7068981817035893, "grad_norm": 0.014525247737765312, "learning_rate": 2.3350893980148615e-07, "logits/chosen": 0.1634368896484375, "logits/rejected": NaN, "logps/chosen": -156.8390655517578, "logps/rejected": -806.8499755859375, "loss": 0.0867, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.4801559448242188, "rewards/margins": 28.85546875, "rewards/rejected": -31.342187881469727, "step": 6770 }, { "epoch": 1.7094192165884095, "grad_norm": 50.145896911621094, "learning_rate": 2.3277714074202847e-07, "logits/chosen": 0.410745233297348, "logits/rejected": NaN, "logps/chosen": -156.234375, "logps/rejected": -804.0, "loss": 0.0226, "rewards/accuracies": 0.984375, "rewards/chosen": -3.513772487640381, "rewards/margins": 27.549999237060547, "rewards/rejected": -31.073436737060547, "step": 6780 }, { "epoch": 1.7119402514732298, "grad_norm": 0.02211678959429264, "learning_rate": 2.3204548993026985e-07, "logits/chosen": 0.029128264635801315, "logits/rejected": NaN, "logps/chosen": -178.65469360351562, "logps/rejected": -863.7249755859375, "loss": 0.0176, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9761109352111816, "rewards/margins": 30.471874237060547, "rewards/rejected": -34.45781326293945, "step": 6790 }, { "epoch": 1.71446128635805, "grad_norm": 0.07683268934488297, "learning_rate": 2.313139936639769e-07, "logits/chosen": 0.22546692192554474, "logits/rejected": 0.5498901605606079, "logps/chosen": -171.19375610351562, "logps/rejected": -892.3250122070312, "loss": 0.0506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.143795967102051, "rewards/margins": 31.961719512939453, "rewards/rejected": -36.09687423706055, "step": 6800 }, { "epoch": 1.7169823212428703, "grad_norm": 76.63377380371094, "learning_rate": 2.3058265823958626e-07, "logits/chosen": 0.23795051872730255, "logits/rejected": 0.6871703863143921, "logps/chosen": -169.21249389648438, "logps/rejected": -875.9500122070312, "loss": 0.0855, "rewards/accuracies": 0.984375, "rewards/chosen": -3.407214403152466, "rewards/margins": 32.35663986206055, "rewards/rejected": -35.767189025878906, "step": 6810 }, { "epoch": 1.7195033561276905, "grad_norm": 0.015600884333252907, "learning_rate": 2.2985148995214975e-07, "logits/chosen": -0.0003906250058207661, "logits/rejected": NaN, "logps/chosen": -175.6374969482422, "logps/rejected": -893.9000244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.4068846702575684, "rewards/margins": 33.05937576293945, "rewards/rejected": -36.462501525878906, "step": 6820 }, { "epoch": 1.7220243910125106, "grad_norm": 16.352577209472656, "learning_rate": 2.2912049509528062e-07, "logits/chosen": 0.20193633437156677, "logits/rejected": NaN, "logps/chosen": -165.203125, "logps/rejected": -802.9000244140625, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3262877464294434, "rewards/margins": 29.9140625, "rewards/rejected": -33.251564025878906, "step": 6830 }, { "epoch": 1.7245454258973307, "grad_norm": 13.489235877990723, "learning_rate": 2.2838967996109948e-07, "logits/chosen": 0.2530517578125, "logits/rejected": 0.675213634967804, "logps/chosen": -176.02734375, "logps/rejected": -885.0, "loss": 0.1021, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.5744872093200684, "rewards/margins": 32.87187576293945, "rewards/rejected": -36.443748474121094, "step": 6840 }, { "epoch": 1.727066460782151, "grad_norm": 0.05179956555366516, "learning_rate": 2.276590508401798e-07, "logits/chosen": 0.300161749124527, "logits/rejected": NaN, "logps/chosen": -179.61563110351562, "logps/rejected": -869.7249755859375, "loss": 0.0854, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.265088081359863, "rewards/margins": 31.297657012939453, "rewards/rejected": -35.540626525878906, "step": 6850 }, { "epoch": 1.7295874956669715, "grad_norm": 104.88671112060547, "learning_rate": 2.2692861402149392e-07, "logits/chosen": 0.261474609375, "logits/rejected": NaN, "logps/chosen": -169.19686889648438, "logps/rejected": -875.2999877929688, "loss": 0.0828, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9559569358825684, "rewards/margins": 32.33906173706055, "rewards/rejected": -36.298439025878906, "step": 6860 }, { "epoch": 1.7321085305517916, "grad_norm": 3.2704391479492188, "learning_rate": 2.2619837579235883e-07, "logits/chosen": 0.18296508491039276, "logits/rejected": NaN, "logps/chosen": -178.1875, "logps/rejected": -877.7750244140625, "loss": 0.0508, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.113406181335449, "rewards/margins": 30.832813262939453, "rewards/rejected": -34.94843673706055, "step": 6870 }, { "epoch": 1.7346295654366117, "grad_norm": 4.63594388961792, "learning_rate": 2.2546834243838255e-07, "logits/chosen": 0.31621092557907104, "logits/rejected": 0.7062774896621704, "logps/chosen": -158.30859375, "logps/rejected": -857.5750122070312, "loss": 0.0225, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.6293883323669434, "rewards/margins": 31.50390625, "rewards/rejected": -35.1171875, "step": 6880 }, { "epoch": 1.7371506003214319, "grad_norm": 0.26748231053352356, "learning_rate": 2.2473852024340918e-07, "logits/chosen": 0.3127487301826477, "logits/rejected": NaN, "logps/chosen": -182.546875, "logps/rejected": -876.1500244140625, "loss": 0.0843, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.416369438171387, "rewards/margins": 32.61249923706055, "rewards/rejected": -37.02812576293945, "step": 6890 }, { "epoch": 1.739671635206252, "grad_norm": 0.01755344867706299, "learning_rate": 2.240089154894652e-07, "logits/chosen": 0.2948242127895355, "logits/rejected": NaN, "logps/chosen": -202.33438110351562, "logps/rejected": -875.0499877929688, "loss": 0.0753, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.160058498382568, "rewards/margins": 34.048439025878906, "rewards/rejected": -38.220314025878906, "step": 6900 }, { "epoch": 1.7421926700910724, "grad_norm": 71.02070617675781, "learning_rate": 2.2327953445670598e-07, "logits/chosen": 0.23780517280101776, "logits/rejected": NaN, "logps/chosen": -180.6248016357422, "logps/rejected": -883.9000244140625, "loss": 0.0293, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.477124214172363, "rewards/margins": 33.240623474121094, "rewards/rejected": -37.7109375, "step": 6910 }, { "epoch": 1.7447137049758927, "grad_norm": 0.18584544956684113, "learning_rate": 2.2255038342336057e-07, "logits/chosen": 0.26716309785842896, "logits/rejected": 0.7108062505722046, "logps/chosen": -177.1640625, "logps/rejected": -856.7000122070312, "loss": 0.0599, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.325488090515137, "rewards/margins": 31.235937118530273, "rewards/rejected": -35.587501525878906, "step": 6920 }, { "epoch": 1.7472347398607129, "grad_norm": 165.7333221435547, "learning_rate": 2.218214686656786e-07, "logits/chosen": 0.224700927734375, "logits/rejected": NaN, "logps/chosen": -165.9015655517578, "logps/rejected": -837.4000244140625, "loss": 0.1737, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.044189453125, "rewards/margins": 29.201562881469727, "rewards/rejected": -33.2578125, "step": 6930 }, { "epoch": 1.749755774745533, "grad_norm": 0.48710766434669495, "learning_rate": 2.2109279645787584e-07, "logits/chosen": 0.2537170350551605, "logits/rejected": NaN, "logps/chosen": -183.52188110351562, "logps/rejected": -839.9249877929688, "loss": 0.0475, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.9690794944763184, "rewards/margins": 30.028905868530273, "rewards/rejected": -33.99687576293945, "step": 6940 }, { "epoch": 1.7522768096303531, "grad_norm": 0.1448928415775299, "learning_rate": 2.2036437307208047e-07, "logits/chosen": 0.33891600370407104, "logits/rejected": NaN, "logps/chosen": -170.4265594482422, "logps/rejected": -907.1500244140625, "loss": 0.041, "rewards/accuracies": 0.984375, "rewards/chosen": -3.739184617996216, "rewards/margins": 33.76953125, "rewards/rejected": -37.50468826293945, "step": 6950 }, { "epoch": 1.7547978445151735, "grad_norm": 0.029630180448293686, "learning_rate": 2.196362047782786e-07, "logits/chosen": 0.17631836235523224, "logits/rejected": NaN, "logps/chosen": -181.1999969482422, "logps/rejected": -878.7000122070312, "loss": 0.0987, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.6750245094299316, "rewards/margins": 32.10546875, "rewards/rejected": -35.77812576293945, "step": 6960 }, { "epoch": 1.7573188793999936, "grad_norm": 0.007563443388789892, "learning_rate": 2.1890829784426072e-07, "logits/chosen": 0.11156310886144638, "logits/rejected": 0.8063141107559204, "logps/chosen": -173.0187530517578, "logps/rejected": -935.1500244140625, "loss": 0.0047, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.233203172683716, "rewards/margins": 35.95624923706055, "rewards/rejected": -39.19843673706055, "step": 6970 }, { "epoch": 1.759839914284814, "grad_norm": 167.17193603515625, "learning_rate": 2.1818065853556783e-07, "logits/chosen": 0.18614807724952698, "logits/rejected": 0.80224609375, "logps/chosen": -178.61874389648438, "logps/rejected": -865.25, "loss": 0.2061, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.970349073410034, "rewards/margins": 32.71875, "rewards/rejected": -36.68437576293945, "step": 6980 }, { "epoch": 1.7623609491696342, "grad_norm": 8.354236602783203, "learning_rate": 2.1745329311543708e-07, "logits/chosen": 0.21922150254249573, "logits/rejected": NaN, "logps/chosen": -181.5437469482422, "logps/rejected": -876.0499877929688, "loss": 0.0423, "rewards/accuracies": 0.984375, "rewards/chosen": -4.2470703125, "rewards/margins": 33.178123474121094, "rewards/rejected": -37.42499923706055, "step": 6990 }, { "epoch": 1.7648819840544543, "grad_norm": 0.029319506138563156, "learning_rate": 2.1672620784474812e-07, "logits/chosen": 0.4144653379917145, "logits/rejected": NaN, "logps/chosen": -172.7937469482422, "logps/rejected": -943.5499877929688, "loss": 0.053, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9745726585388184, "rewards/margins": 37.615623474121094, "rewards/rejected": -41.592185974121094, "step": 7000 }, { "epoch": 1.7674030189392744, "grad_norm": 279.075439453125, "learning_rate": 2.159994089819692e-07, "logits/chosen": 0.45476073026657104, "logits/rejected": 0.9902099370956421, "logps/chosen": -183.32656860351562, "logps/rejected": -921.0750122070312, "loss": 0.16, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.950787544250488, "rewards/margins": 35.77656173706055, "rewards/rejected": -40.74531173706055, "step": 7010 }, { "epoch": 1.7699240538240948, "grad_norm": 0.6907711029052734, "learning_rate": 2.1527290278310355e-07, "logits/chosen": 0.3418990969657898, "logits/rejected": NaN, "logps/chosen": -203.34375, "logps/rejected": -936.5499877929688, "loss": 0.0863, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.336572170257568, "rewards/margins": 35.760154724121094, "rewards/rejected": -41.107810974121094, "step": 7020 }, { "epoch": 1.7724450887089152, "grad_norm": 2.3322830200195312, "learning_rate": 2.1454669550163483e-07, "logits/chosen": 0.5193420648574829, "logits/rejected": NaN, "logps/chosen": -190.7843780517578, "logps/rejected": -955.5, "loss": 0.025, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.891503810882568, "rewards/margins": 39.04999923706055, "rewards/rejected": -44.912498474121094, "step": 7030 }, { "epoch": 1.7749661235937353, "grad_norm": 0.001005189842544496, "learning_rate": 2.1382079338847386e-07, "logits/chosen": 0.49747008085250854, "logits/rejected": 1.056970238685608, "logps/chosen": -204.484375, "logps/rejected": -957.2000122070312, "loss": 0.0495, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.181054592132568, "rewards/margins": 38.40546798706055, "rewards/rejected": -44.59375, "step": 7040 }, { "epoch": 1.7774871584785554, "grad_norm": 2.690774917602539, "learning_rate": 2.1309520269190499e-07, "logits/chosen": 0.6744659543037415, "logits/rejected": 1.261012315750122, "logps/chosen": -194.9968719482422, "logps/rejected": -994.0, "loss": 0.1293, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -6.272558689117432, "rewards/margins": 41.654685974121094, "rewards/rejected": -47.91875076293945, "step": 7050 }, { "epoch": 1.7800081933633756, "grad_norm": 53.73955154418945, "learning_rate": 2.1236992965753164e-07, "logits/chosen": 0.40281373262405396, "logits/rejected": NaN, "logps/chosen": -208.1437530517578, "logps/rejected": -984.7750244140625, "loss": 0.0594, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -6.256445407867432, "rewards/margins": 39.90234375, "rewards/rejected": -46.16718673706055, "step": 7060 }, { "epoch": 1.782529228248196, "grad_norm": 225.3563690185547, "learning_rate": 2.1164498052822305e-07, "logits/chosen": 0.29291000962257385, "logits/rejected": NaN, "logps/chosen": -204.30624389648438, "logps/rejected": -939.3499755859375, "loss": 0.0641, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.133130073547363, "rewards/margins": 35.778907775878906, "rewards/rejected": -40.90937423706055, "step": 7070 }, { "epoch": 1.785050263133016, "grad_norm": 90.87431335449219, "learning_rate": 2.1092036154406046e-07, "logits/chosen": 0.32719725370407104, "logits/rejected": NaN, "logps/chosen": -183.27499389648438, "logps/rejected": -909.7750244140625, "loss": 0.057, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.741674900054932, "rewards/margins": 35.42890548706055, "rewards/rejected": -40.162498474121094, "step": 7080 }, { "epoch": 1.7875712980178364, "grad_norm": 0.025178154930472374, "learning_rate": 2.1019607894228337e-07, "logits/chosen": 0.2861167788505554, "logits/rejected": NaN, "logps/chosen": -185.5031280517578, "logps/rejected": -900.0250244140625, "loss": 0.0666, "rewards/accuracies": 0.984375, "rewards/chosen": -4.492993354797363, "rewards/margins": 33.986717224121094, "rewards/rejected": -38.4765625, "step": 7090 }, { "epoch": 1.7900923329026566, "grad_norm": 3.0095712645561434e-05, "learning_rate": 2.0947213895723575e-07, "logits/chosen": 0.17290648818016052, "logits/rejected": NaN, "logps/chosen": -176.2937469482422, "logps/rejected": -883.0499877929688, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.864001512527466, "rewards/margins": 34.048439025878906, "rewards/rejected": -37.896873474121094, "step": 7100 }, { "epoch": 1.7926133677874767, "grad_norm": 67.23310089111328, "learning_rate": 2.0874854782031238e-07, "logits/chosen": 0.24807891249656677, "logits/rejected": NaN, "logps/chosen": -179.27188110351562, "logps/rejected": -926.0499877929688, "loss": 0.125, "rewards/accuracies": 0.984375, "rewards/chosen": -4.048474311828613, "rewards/margins": 34.91093826293945, "rewards/rejected": -38.9453125, "step": 7110 }, { "epoch": 1.7951344026722968, "grad_norm": 0.18998971581459045, "learning_rate": 2.0802531175990555e-07, "logits/chosen": 0.23861083388328552, "logits/rejected": NaN, "logps/chosen": -169.1796875, "logps/rejected": -876.2750244140625, "loss": 0.0724, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3438963890075684, "rewards/margins": 32.814064025878906, "rewards/rejected": -36.165626525878906, "step": 7120 }, { "epoch": 1.7976554375571172, "grad_norm": 130.56607055664062, "learning_rate": 2.0730243700135097e-07, "logits/chosen": 0.12704773247241974, "logits/rejected": 0.696014404296875, "logps/chosen": -182.4812469482422, "logps/rejected": -857.9749755859375, "loss": 0.0726, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.002587795257568, "rewards/margins": 31.493749618530273, "rewards/rejected": -35.515625, "step": 7130 }, { "epoch": 1.8001764724419376, "grad_norm": 0.06843842566013336, "learning_rate": 2.0657992976687438e-07, "logits/chosen": 0.1341201812028885, "logits/rejected": NaN, "logps/chosen": -188.1281280517578, "logps/rejected": -885.0999755859375, "loss": 0.0543, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8919434547424316, "rewards/margins": 32.349998474121094, "rewards/rejected": -36.25078201293945, "step": 7140 }, { "epoch": 1.8026975073267577, "grad_norm": 133.45614624023438, "learning_rate": 2.0585779627553814e-07, "logits/chosen": 0.26668089628219604, "logits/rejected": NaN, "logps/chosen": -186.7624969482422, "logps/rejected": -900.0, "loss": 0.1281, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.088793754577637, "rewards/margins": 34.185935974121094, "rewards/rejected": -38.279685974121094, "step": 7150 }, { "epoch": 1.8052185422115778, "grad_norm": 0.19773226976394653, "learning_rate": 2.051360427431877e-07, "logits/chosen": 0.1980545073747635, "logits/rejected": NaN, "logps/chosen": -187.2023468017578, "logps/rejected": -917.2000122070312, "loss": 0.07, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -4.253698825836182, "rewards/margins": 35.83906173706055, "rewards/rejected": -40.08281326293945, "step": 7160 }, { "epoch": 1.807739577096398, "grad_norm": 0.0013193109771236777, "learning_rate": 2.044146753823976e-07, "logits/chosen": 0.12807922065258026, "logits/rejected": NaN, "logps/chosen": -179.0703125, "logps/rejected": -956.2999877929688, "loss": 0.042, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.882974147796631, "rewards/margins": 36.888282775878906, "rewards/rejected": -40.76250076293945, "step": 7170 }, { "epoch": 1.8102606119812181, "grad_norm": 0.291951060295105, "learning_rate": 2.036937004024186e-07, "logits/chosen": 0.08298339694738388, "logits/rejected": NaN, "logps/chosen": -190.86563110351562, "logps/rejected": -895.7000122070312, "loss": 0.0627, "rewards/accuracies": 0.984375, "rewards/chosen": -4.645947456359863, "rewards/margins": 33.532814025878906, "rewards/rejected": -38.16796875, "step": 7180 }, { "epoch": 1.8127816468660385, "grad_norm": 0.20392246544361115, "learning_rate": 2.0297312400912408e-07, "logits/chosen": 0.16611938178539276, "logits/rejected": 0.646680474281311, "logps/chosen": -157.8718719482422, "logps/rejected": -918.5999755859375, "loss": 0.0785, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4946656227111816, "rewards/margins": 34.08281326293945, "rewards/rejected": -37.587501525878906, "step": 7190 }, { "epoch": 1.8153026817508588, "grad_norm": 0.00021016861137468368, "learning_rate": 2.022529524049564e-07, "logits/chosen": 0.24300841987133026, "logits/rejected": 0.4677276611328125, "logps/chosen": -161.6062469482422, "logps/rejected": -912.5, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.996194362640381, "rewards/margins": 33.560157775878906, "rewards/rejected": -37.564064025878906, "step": 7200 }, { "epoch": 1.817823716635679, "grad_norm": 0.017788277938961983, "learning_rate": 2.0153319178887356e-07, "logits/chosen": 0.338104248046875, "logits/rejected": 0.7548156976699829, "logps/chosen": -201.36563110351562, "logps/rejected": -910.4249877929688, "loss": 0.0223, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.235925197601318, "rewards/margins": 33.639060974121094, "rewards/rejected": -38.875, "step": 7210 }, { "epoch": 1.8203447515204991, "grad_norm": 0.04637831449508667, "learning_rate": 2.008138483562961e-07, "logits/chosen": 0.22354888916015625, "logits/rejected": 0.6998351812362671, "logps/chosen": -188.53750610351562, "logps/rejected": -891.5999755859375, "loss": 0.186, "rewards/accuracies": 0.984375, "rewards/chosen": -4.972363471984863, "rewards/margins": 32.807029724121094, "rewards/rejected": -37.765625, "step": 7220 }, { "epoch": 1.8228657864053193, "grad_norm": 8.011188507080078, "learning_rate": 2.0009492829905366e-07, "logits/chosen": 0.10363464057445526, "logits/rejected": NaN, "logps/chosen": -182.70468139648438, "logps/rejected": -892.5999755859375, "loss": 0.048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7957763671875, "rewards/margins": 31.8203125, "rewards/rejected": -35.631248474121094, "step": 7230 }, { "epoch": 1.8253868212901396, "grad_norm": 0.006351878400892019, "learning_rate": 1.993764378053315e-07, "logits/chosen": 0.11809692531824112, "logits/rejected": NaN, "logps/chosen": -180.09140014648438, "logps/rejected": -902.0, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5862793922424316, "rewards/margins": 33.685157775878906, "rewards/rejected": -37.271873474121094, "step": 7240 }, { "epoch": 1.8279078561749598, "grad_norm": 0.14728973805904388, "learning_rate": 1.9865838305961724e-07, "logits/chosen": NaN, "logits/rejected": 0.5173889398574829, "logps/chosen": -189.5500030517578, "logps/rejected": -930.0750122070312, "loss": 0.1018, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.704858303070068, "rewards/margins": 34.91875076293945, "rewards/rejected": -39.623435974121094, "step": 7250 }, { "epoch": 1.8304288910597801, "grad_norm": 0.002152829896658659, "learning_rate": 1.9794077024264812e-07, "logits/chosen": 0.2810775637626648, "logits/rejected": NaN, "logps/chosen": -186.89688110351562, "logps/rejected": -883.75, "loss": 0.0153, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.094763278961182, "rewards/margins": 32.73749923706055, "rewards/rejected": -36.82500076293945, "step": 7260 }, { "epoch": 1.8329499259446003, "grad_norm": 31.91798973083496, "learning_rate": 1.9722360553135715e-07, "logits/chosen": 0.25386351346969604, "logits/rejected": NaN, "logps/chosen": -161.5671844482422, "logps/rejected": -856.6749877929688, "loss": 0.0223, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.59320068359375, "rewards/margins": 33.26874923706055, "rewards/rejected": -36.857810974121094, "step": 7270 }, { "epoch": 1.8354709608294204, "grad_norm": 0.006988551467657089, "learning_rate": 1.9650689509882015e-07, "logits/chosen": 0.15821532905101776, "logits/rejected": NaN, "logps/chosen": -191.64999389648438, "logps/rejected": -900.5499877929688, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.102807521820068, "rewards/margins": 33.514060974121094, "rewards/rejected": -37.61796951293945, "step": 7280 }, { "epoch": 1.8379919957142405, "grad_norm": 0.0003114449791610241, "learning_rate": 1.9579064511420285e-07, "logits/chosen": 0.039878081530332565, "logits/rejected": 0.6669036746025085, "logps/chosen": -200.60311889648438, "logps/rejected": -907.4500122070312, "loss": 0.0094, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.600195407867432, "rewards/margins": 33.41093826293945, "rewards/rejected": -37.998435974121094, "step": 7290 }, { "epoch": 1.840513030599061, "grad_norm": 121.3001480102539, "learning_rate": 1.9507486174270775e-07, "logits/chosen": 0.35335540771484375, "logits/rejected": 0.799511730670929, "logps/chosen": -178.1374969482422, "logps/rejected": -913.5499877929688, "loss": 0.0186, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.746435642242432, "rewards/margins": 35.439064025878906, "rewards/rejected": -40.204689025878906, "step": 7300 }, { "epoch": 1.8430340654838813, "grad_norm": 10.164154052734375, "learning_rate": 1.9435955114552072e-07, "logits/chosen": 0.3717513978481293, "logits/rejected": NaN, "logps/chosen": -185.33749389648438, "logps/rejected": -937.9000244140625, "loss": 0.0382, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.820166110992432, "rewards/margins": 36.618751525878906, "rewards/rejected": -41.4296875, "step": 7310 }, { "epoch": 1.8455551003687014, "grad_norm": 0.003712801029905677, "learning_rate": 1.93644719479758e-07, "logits/chosen": 0.31903839111328125, "logits/rejected": NaN, "logps/chosen": -175.3562469482422, "logps/rejected": -898.4249877929688, "loss": 0.0393, "rewards/accuracies": 0.984375, "rewards/chosen": -4.803595066070557, "rewards/margins": 34.20234298706055, "rewards/rejected": -38.998435974121094, "step": 7320 }, { "epoch": 1.8480761352535215, "grad_norm": 1.1196213960647583, "learning_rate": 1.9293037289841385e-07, "logits/chosen": 0.258209228515625, "logits/rejected": NaN, "logps/chosen": -161.3468780517578, "logps/rejected": -898.3499755859375, "loss": 0.1083, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.791095018386841, "rewards/margins": 33.314842224121094, "rewards/rejected": -37.126564025878906, "step": 7330 }, { "epoch": 1.8505971701383417, "grad_norm": 62.0869255065918, "learning_rate": 1.9221651755030677e-07, "logits/chosen": 0.3366851806640625, "logits/rejected": 0.79254150390625, "logps/chosen": -173.375, "logps/rejected": -926.5999755859375, "loss": 0.1833, "rewards/accuracies": 0.984375, "rewards/chosen": -4.150263786315918, "rewards/margins": 34.735939025878906, "rewards/rejected": -38.881248474121094, "step": 7340 }, { "epoch": 1.853118205023162, "grad_norm": 0.038350749760866165, "learning_rate": 1.9150315958002692e-07, "logits/chosen": 0.169667050242424, "logits/rejected": 0.7784683108329773, "logps/chosen": -184.515625, "logps/rejected": -881.0999755859375, "loss": 0.0271, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.347155570983887, "rewards/margins": 33.467185974121094, "rewards/rejected": -37.810935974121094, "step": 7350 }, { "epoch": 1.8556392399079822, "grad_norm": 0.021166939288377762, "learning_rate": 1.907903051278833e-07, "logits/chosen": 0.20313414931297302, "logits/rejected": 0.7408965826034546, "logps/chosen": -173.13436889648438, "logps/rejected": -851.125, "loss": 0.1286, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.159277439117432, "rewards/margins": 30.669530868530273, "rewards/rejected": -34.81562423706055, "step": 7360 }, { "epoch": 1.8581602747928025, "grad_norm": 6.83241081237793, "learning_rate": 1.9007796032985097e-07, "logits/chosen": 0.3832954466342926, "logits/rejected": NaN, "logps/chosen": -156.25, "logps/rejected": -854.0999755859375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -4.06829833984375, "rewards/margins": 31.434375762939453, "rewards/rejected": -35.48125076293945, "step": 7370 }, { "epoch": 1.8606813096776227, "grad_norm": 147.59832763671875, "learning_rate": 1.8936613131751782e-07, "logits/chosen": 0.25216978788375854, "logits/rejected": NaN, "logps/chosen": -182.8125, "logps/rejected": -903.25, "loss": 0.0325, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.219702243804932, "rewards/margins": 31.767187118530273, "rewards/rejected": -35.98125076293945, "step": 7380 }, { "epoch": 1.8632023445624428, "grad_norm": 39.01150131225586, "learning_rate": 1.8865482421803212e-07, "logits/chosen": 0.12253723293542862, "logits/rejected": NaN, "logps/chosen": -196.66561889648438, "logps/rejected": -844.0, "loss": 0.05, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.333984375, "rewards/margins": 29.728124618530273, "rewards/rejected": -34.04999923706055, "step": 7390 }, { "epoch": 1.865723379447263, "grad_norm": 0.13574552536010742, "learning_rate": 1.8794404515404994e-07, "logits/chosen": 0.22584915161132812, "logits/rejected": NaN, "logps/chosen": -164.03750610351562, "logps/rejected": -820.375, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.483325242996216, "rewards/margins": 30.0, "rewards/rejected": -33.485939025878906, "step": 7400 }, { "epoch": 1.8682444143320833, "grad_norm": 0.09506259858608246, "learning_rate": 1.8723380024368197e-07, "logits/chosen": 0.3407959043979645, "logits/rejected": NaN, "logps/chosen": -153.25936889648438, "logps/rejected": -843.3499755859375, "loss": 0.0091, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.911267042160034, "rewards/margins": 31.896875381469727, "rewards/rejected": -34.806251525878906, "step": 7410 }, { "epoch": 1.8707654492169037, "grad_norm": 2.6575557399155514e-07, "learning_rate": 1.8652409560044107e-07, "logits/chosen": 0.3381027281284332, "logits/rejected": 0.7966064214706421, "logps/chosen": -164.4093780517578, "logps/rejected": -940.0250244140625, "loss": 0.0388, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.594677686691284, "rewards/margins": 35.22343826293945, "rewards/rejected": -38.795310974121094, "step": 7420 }, { "epoch": 1.8732864841017238, "grad_norm": 0.0004194665525574237, "learning_rate": 1.8581493733318992e-07, "logits/chosen": 0.20295238494873047, "logits/rejected": 0.7178985476493835, "logps/chosen": -170.83749389648438, "logps/rejected": -892.5499877929688, "loss": 0.0582, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9356446266174316, "rewards/margins": 33.48125076293945, "rewards/rejected": -37.412498474121094, "step": 7430 }, { "epoch": 1.875807518986544, "grad_norm": 4.565107519738376e-05, "learning_rate": 1.851063315460882e-07, "logits/chosen": 0.3568786680698395, "logits/rejected": 0.8616943359375, "logps/chosen": -177.1437530517578, "logps/rejected": -886.0499877929688, "loss": 0.0519, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.837292432785034, "rewards/margins": 32.369529724121094, "rewards/rejected": -36.20624923706055, "step": 7440 }, { "epoch": 1.878328553871364, "grad_norm": 78.50321960449219, "learning_rate": 1.8439828433853988e-07, "logits/chosen": 0.2665115296840668, "logits/rejected": 0.8746078610420227, "logps/chosen": -174.81875610351562, "logps/rejected": -894.0999755859375, "loss": 0.1358, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.9890990257263184, "rewards/margins": 34.0, "rewards/rejected": -37.98749923706055, "step": 7450 }, { "epoch": 1.8808495887561845, "grad_norm": 2.4803476333618164, "learning_rate": 1.8369080180514086e-07, "logits/chosen": 0.5407379269599915, "logits/rejected": 0.8890014886856079, "logps/chosen": -180.7781219482422, "logps/rejected": -949.25, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.821728706359863, "rewards/margins": 36.584373474121094, "rewards/rejected": -41.40937423706055, "step": 7460 }, { "epoch": 1.8833706236410046, "grad_norm": 0.14448761940002441, "learning_rate": 1.8298389003562686e-07, "logits/chosen": 0.5015827417373657, "logits/rejected": 1.0417953729629517, "logps/chosen": -186.1062469482422, "logps/rejected": -936.0999755859375, "loss": 0.0755, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.915258884429932, "rewards/margins": 38.33515548706055, "rewards/rejected": -43.27031326293945, "step": 7470 }, { "epoch": 1.885891658525825, "grad_norm": 0.0028649175073951483, "learning_rate": 1.822775551148204e-07, "logits/chosen": 0.6367095708847046, "logits/rejected": NaN, "logps/chosen": -190.1531219482422, "logps/rejected": -974.0499877929688, "loss": 0.06, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.167138576507568, "rewards/margins": 39.842185974121094, "rewards/rejected": -45.029685974121094, "step": 7480 }, { "epoch": 1.888412693410645, "grad_norm": 0.1559479534626007, "learning_rate": 1.815718031225787e-07, "logits/chosen": 0.600695788860321, "logits/rejected": NaN, "logps/chosen": -161.58984375, "logps/rejected": -900.5999755859375, "loss": 0.096, "rewards/accuracies": 0.96875, "rewards/chosen": -4.450268745422363, "rewards/margins": 35.756248474121094, "rewards/rejected": -40.20781326293945, "step": 7490 }, { "epoch": 1.8909337282954652, "grad_norm": 1.8829548358917236, "learning_rate": 1.808666401337414e-07, "logits/chosen": 0.41615599393844604, "logits/rejected": NaN, "logps/chosen": -187.75625610351562, "logps/rejected": -915.5, "loss": 0.0142, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.455346584320068, "rewards/margins": 36.423439025878906, "rewards/rejected": -40.889060974121094, "step": 7500 }, { "epoch": 1.8934547631802854, "grad_norm": 256.41436767578125, "learning_rate": 1.801620722180786e-07, "logits/chosen": 0.49209898710250854, "logits/rejected": NaN, "logps/chosen": -186.3874969482422, "logps/rejected": -923.4000244140625, "loss": 0.154, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.631060600280762, "rewards/margins": 34.75, "rewards/rejected": -39.360939025878906, "step": 7510 }, { "epoch": 1.8959757980651057, "grad_norm": 0.00205794139765203, "learning_rate": 1.7945810544023765e-07, "logits/chosen": 0.34843748807907104, "logits/rejected": NaN, "logps/chosen": -177.3937530517578, "logps/rejected": -921.5, "loss": 0.1717, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.253949165344238, "rewards/margins": 35.006248474121094, "rewards/rejected": -39.26093673706055, "step": 7520 }, { "epoch": 1.898496832949926, "grad_norm": 0.06003750115633011, "learning_rate": 1.787547458596918e-07, "logits/chosen": 0.31030577421188354, "logits/rejected": NaN, "logps/chosen": -173.77499389648438, "logps/rejected": -838.5999755859375, "loss": 0.0265, "rewards/accuracies": 0.984375, "rewards/chosen": -3.633056640625, "rewards/margins": 33.24531173706055, "rewards/rejected": -36.890625, "step": 7530 }, { "epoch": 1.9010178678347462, "grad_norm": 0.11015287786722183, "learning_rate": 1.7805199953068793e-07, "logits/chosen": 0.32524412870407104, "logits/rejected": NaN, "logps/chosen": -175.7468719482422, "logps/rejected": -881.4000244140625, "loss": 0.074, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8838868141174316, "rewards/margins": 34.24687576293945, "rewards/rejected": -38.131248474121094, "step": 7540 }, { "epoch": 1.9035389027195664, "grad_norm": 0.016452224925160408, "learning_rate": 1.7734987250219408e-07, "logits/chosen": 0.37641602754592896, "logits/rejected": NaN, "logps/chosen": -159.93124389648438, "logps/rejected": -887.4249877929688, "loss": 0.0447, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.1470947265625, "rewards/margins": 33.64765548706055, "rewards/rejected": -37.775779724121094, "step": 7550 }, { "epoch": 1.9060599376043865, "grad_norm": 0.026211723685264587, "learning_rate": 1.7664837081784755e-07, "logits/chosen": 0.16391296684741974, "logits/rejected": NaN, "logps/chosen": -167.52499389648438, "logps/rejected": -896.6749877929688, "loss": 0.0088, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.7594847679138184, "rewards/margins": 34.23906326293945, "rewards/rejected": -37.98906326293945, "step": 7560 }, { "epoch": 1.9085809724892067, "grad_norm": 151.83084106445312, "learning_rate": 1.7594750051590307e-07, "logits/chosen": 0.2714675962924957, "logits/rejected": NaN, "logps/chosen": -176.24063110351562, "logps/rejected": -892.75, "loss": 0.0958, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.474462985992432, "rewards/margins": 33.553123474121094, "rewards/rejected": -38.0234375, "step": 7570 }, { "epoch": 1.911102007374027, "grad_norm": 0.18508270382881165, "learning_rate": 1.752472676291808e-07, "logits/chosen": 0.23946456611156464, "logits/rejected": 1.0342559814453125, "logps/chosen": -182.4968719482422, "logps/rejected": -882.375, "loss": 0.0777, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.459631443023682, "rewards/margins": 33.931251525878906, "rewards/rejected": -38.38750076293945, "step": 7580 }, { "epoch": 1.9136230422588474, "grad_norm": 62.80792999267578, "learning_rate": 1.7454767818501394e-07, "logits/chosen": 0.04486694186925888, "logits/rejected": NaN, "logps/chosen": -175.56875610351562, "logps/rejected": -901.2249755859375, "loss": 0.0294, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.629467725753784, "rewards/margins": 34.98906326293945, "rewards/rejected": -38.610939025878906, "step": 7590 }, { "epoch": 1.9161440771436675, "grad_norm": 119.74451446533203, "learning_rate": 1.738487382051973e-07, "logits/chosen": 0.04913330078125, "logits/rejected": NaN, "logps/chosen": -167.49374389648438, "logps/rejected": -857.25, "loss": 0.154, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6549315452575684, "rewards/margins": 32.142967224121094, "rewards/rejected": -35.779685974121094, "step": 7600 }, { "epoch": 1.9186651120284877, "grad_norm": 84.35933685302734, "learning_rate": 1.7315045370593562e-07, "logits/chosen": 0.23141174018383026, "logits/rejected": 0.7218353152275085, "logps/chosen": -172.68124389648438, "logps/rejected": -894.3499755859375, "loss": 0.1557, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -4.152075290679932, "rewards/margins": 32.54375076293945, "rewards/rejected": -36.71406173706055, "step": 7610 }, { "epoch": 1.9211861469133078, "grad_norm": 0.00010659245890565217, "learning_rate": 1.7245283069779115e-07, "logits/chosen": 0.23632964491844177, "logits/rejected": NaN, "logps/chosen": -168.11093139648438, "logps/rejected": -858.1500244140625, "loss": 0.0857, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.843212842941284, "rewards/margins": 30.7421875, "rewards/rejected": -34.5859375, "step": 7620 }, { "epoch": 1.9237071817981282, "grad_norm": 0.1530640721321106, "learning_rate": 1.7175587518563242e-07, "logits/chosen": 0.10049743950366974, "logits/rejected": 0.5892913937568665, "logps/chosen": -160.80313110351562, "logps/rejected": -839.0750122070312, "loss": 0.027, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3436646461486816, "rewards/margins": 29.551563262939453, "rewards/rejected": -32.87812423706055, "step": 7630 }, { "epoch": 1.9262282166829483, "grad_norm": 0.6493778824806213, "learning_rate": 1.7105959316858243e-07, "logits/chosen": 0.09023437649011612, "logits/rejected": NaN, "logps/chosen": -171.875, "logps/rejected": -848.6500244140625, "loss": 0.013, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4498658180236816, "rewards/margins": 30.493749618530273, "rewards/rejected": -33.947654724121094, "step": 7640 }, { "epoch": 1.9287492515677687, "grad_norm": 99.34797668457031, "learning_rate": 1.7036399063996708e-07, "logits/chosen": 0.14205321669578552, "logits/rejected": 0.347940057516098, "logps/chosen": -155.4656219482422, "logps/rejected": -899.25, "loss": 0.0777, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.5834717750549316, "rewards/margins": 32.58984375, "rewards/rejected": -36.161720275878906, "step": 7650 }, { "epoch": 1.9312702864525888, "grad_norm": 0.005990031640976667, "learning_rate": 1.696690735872634e-07, "logits/chosen": 0.237030029296875, "logits/rejected": NaN, "logps/chosen": -165.8718719482422, "logps/rejected": -872.0, "loss": 0.0129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8414063453674316, "rewards/margins": 32.6484375, "rewards/rejected": -36.48906326293945, "step": 7660 }, { "epoch": 1.933791321337409, "grad_norm": 6.590335845947266, "learning_rate": 1.6897484799204794e-07, "logits/chosen": 0.18180008232593536, "logits/rejected": NaN, "logps/chosen": -191.7156219482422, "logps/rejected": -897.4000244140625, "loss": 0.0468, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.897900342941284, "rewards/margins": 32.828125, "rewards/rejected": -36.728126525878906, "step": 7670 }, { "epoch": 1.936312356222229, "grad_norm": 0.0005229563685134053, "learning_rate": 1.6828131982994587e-07, "logits/chosen": 0.15768127143383026, "logits/rejected": NaN, "logps/chosen": -153.2218780517578, "logps/rejected": -870.9500122070312, "loss": 0.0259, "rewards/accuracies": 0.984375, "rewards/chosen": -3.5161025524139404, "rewards/margins": 32.423439025878906, "rewards/rejected": -35.943748474121094, "step": 7680 }, { "epoch": 1.9388333911070494, "grad_norm": 0.010653932578861713, "learning_rate": 1.6758849507057878e-07, "logits/chosen": 0.08043670654296875, "logits/rejected": NaN, "logps/chosen": -179.7624969482422, "logps/rejected": -859.2750244140625, "loss": 0.1893, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.993725538253784, "rewards/margins": 30.884374618530273, "rewards/rejected": -34.87968826293945, "step": 7690 }, { "epoch": 1.9413544259918698, "grad_norm": 0.000639441714156419, "learning_rate": 1.668963796775137e-07, "logits/chosen": -0.03118896484375, "logits/rejected": NaN, "logps/chosen": -179.2375030517578, "logps/rejected": -892.5750122070312, "loss": 0.0069, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.6128907203674316, "rewards/margins": 33.26874923706055, "rewards/rejected": -36.87968826293945, "step": 7700 }, { "epoch": 1.94387546087669, "grad_norm": 0.5026647448539734, "learning_rate": 1.6620497960821172e-07, "logits/chosen": -0.011210632510483265, "logits/rejected": NaN, "logps/chosen": -178.99374389648438, "logps/rejected": -864.5999755859375, "loss": 0.0112, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.141152858734131, "rewards/margins": 32.146873474121094, "rewards/rejected": -35.279685974121094, "step": 7710 }, { "epoch": 1.94639649576151, "grad_norm": 135.45767211914062, "learning_rate": 1.6551430081397694e-07, "logits/chosen": 0.05743713304400444, "logits/rejected": NaN, "logps/chosen": -151.8984375, "logps/rejected": -845.6749877929688, "loss": 0.1187, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.295902967453003, "rewards/margins": 31.292186737060547, "rewards/rejected": -34.5703125, "step": 7720 }, { "epoch": 1.9489175306463302, "grad_norm": 0.40907326340675354, "learning_rate": 1.6482434923990465e-07, "logits/chosen": 0.07261963188648224, "logits/rejected": NaN, "logps/chosen": -172.1374969482422, "logps/rejected": -896.7000122070312, "loss": 0.2928, "rewards/accuracies": 0.984375, "rewards/chosen": -4.230542182922363, "rewards/margins": 32.453125, "rewards/rejected": -36.6796875, "step": 7730 }, { "epoch": 1.9514385655311506, "grad_norm": 0.0006441762088797987, "learning_rate": 1.641351308248306e-07, "logits/chosen": 0.0334014892578125, "logits/rejected": NaN, "logps/chosen": -167.8153839111328, "logps/rejected": -857.125, "loss": 0.1581, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.6994080543518066, "rewards/margins": 31.310937881469727, "rewards/rejected": -35.017189025878906, "step": 7740 }, { "epoch": 1.9539596004159707, "grad_norm": 0.02129516191780567, "learning_rate": 1.6344665150128003e-07, "logits/chosen": 0.10340919345617294, "logits/rejected": NaN, "logps/chosen": -159.9656219482422, "logps/rejected": -830.125, "loss": 0.0157, "rewards/accuracies": 0.984375, "rewards/chosen": -3.638409376144409, "rewards/margins": 30.440624237060547, "rewards/rejected": -34.08124923706055, "step": 7750 }, { "epoch": 1.956480635300791, "grad_norm": 6.315375328063965, "learning_rate": 1.6275891719541608e-07, "logits/chosen": 0.14374999701976776, "logits/rejected": 0.5421737432479858, "logps/chosen": -173.7421875, "logps/rejected": -872.4500122070312, "loss": 0.0271, "rewards/accuracies": 0.984375, "rewards/chosen": -4.201416015625, "rewards/margins": 31.7421875, "rewards/rejected": -35.931251525878906, "step": 7760 }, { "epoch": 1.9590016701856112, "grad_norm": 0.12938864529132843, "learning_rate": 1.620719338269892e-07, "logits/chosen": -0.02769622765481472, "logits/rejected": NaN, "logps/chosen": -184.67813110351562, "logps/rejected": -870.2000122070312, "loss": 0.0679, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4069581031799316, "rewards/margins": 31.003124237060547, "rewards/rejected": -34.396873474121094, "step": 7770 }, { "epoch": 1.9615227050704314, "grad_norm": 0.014672829769551754, "learning_rate": 1.613857073092859e-07, "logits/chosen": 0.08830566704273224, "logits/rejected": 0.7356292605400085, "logps/chosen": -160.30624389648438, "logps/rejected": -832.625, "loss": 0.0773, "rewards/accuracies": 0.984375, "rewards/chosen": -3.2857909202575684, "rewards/margins": 29.818750381469727, "rewards/rejected": -33.109375, "step": 7780 }, { "epoch": 1.9640437399552515, "grad_norm": 1.8060870170593262, "learning_rate": 1.607002435490784e-07, "logits/chosen": 0.16122741997241974, "logits/rejected": NaN, "logps/chosen": -168.49063110351562, "logps/rejected": -847.125, "loss": 0.1123, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.977465867996216, "rewards/margins": 29.617969512939453, "rewards/rejected": -33.60468673706055, "step": 7790 }, { "epoch": 1.9665647748400719, "grad_norm": 0.15415945649147034, "learning_rate": 1.6001554844657305e-07, "logits/chosen": 0.2672103941440582, "logits/rejected": NaN, "logps/chosen": -160.328125, "logps/rejected": -856.5250244140625, "loss": 0.0158, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.232196092605591, "rewards/margins": 31.831249237060547, "rewards/rejected": -35.0859375, "step": 7800 }, { "epoch": 1.9690858097248922, "grad_norm": 43.49271774291992, "learning_rate": 1.5933162789535984e-07, "logits/chosen": 0.26219767332077026, "logits/rejected": 0.750598132610321, "logps/chosen": -174.5281219482422, "logps/rejected": -909.4500122070312, "loss": 0.0357, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.505078315734863, "rewards/margins": 33.80156326293945, "rewards/rejected": -38.30937576293945, "step": 7810 }, { "epoch": 1.9716068446097124, "grad_norm": 1.3401269912719727, "learning_rate": 1.586484877823621e-07, "logits/chosen": 0.2572647035121918, "logits/rejected": NaN, "logps/chosen": -191.10000610351562, "logps/rejected": -850.375, "loss": 0.0591, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.241552829742432, "rewards/margins": 31.944530487060547, "rewards/rejected": -36.171875, "step": 7820 }, { "epoch": 1.9741278794945325, "grad_norm": 0.03180485963821411, "learning_rate": 1.5796613398778513e-07, "logits/chosen": 0.2545059323310852, "logits/rejected": NaN, "logps/chosen": -194.4656219482422, "logps/rejected": -893.25, "loss": 0.0888, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.726294040679932, "rewards/margins": 34.29218673706055, "rewards/rejected": -39.015625, "step": 7830 }, { "epoch": 1.9766489143793526, "grad_norm": 4.0275648643728346e-05, "learning_rate": 1.5728457238506592e-07, "logits/chosen": 0.24028626084327698, "logits/rejected": NaN, "logps/chosen": -181.828125, "logps/rejected": -897.2999877929688, "loss": 0.0367, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.088165283203125, "rewards/margins": 33.29375076293945, "rewards/rejected": -37.37421798706055, "step": 7840 }, { "epoch": 1.9791699492641728, "grad_norm": 0.2382972240447998, "learning_rate": 1.566038088408227e-07, "logits/chosen": 0.21912232041358948, "logits/rejected": NaN, "logps/chosen": -179.921875, "logps/rejected": -896.7000122070312, "loss": 0.0317, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9686522483825684, "rewards/margins": 34.453125, "rewards/rejected": -38.435935974121094, "step": 7850 }, { "epoch": 1.9816909841489931, "grad_norm": 0.10453198105096817, "learning_rate": 1.559238492148044e-07, "logits/chosen": 0.17775268852710724, "logits/rejected": 0.6379135251045227, "logps/chosen": -185.74374389648438, "logps/rejected": -919.375, "loss": 0.0379, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.2415771484375, "rewards/margins": 34.203125, "rewards/rejected": -38.46562576293945, "step": 7860 }, { "epoch": 1.9842120190338135, "grad_norm": 0.10902858525514603, "learning_rate": 1.552446993598399e-07, "logits/chosen": 0.38718587160110474, "logits/rejected": 0.9358550906181335, "logps/chosen": -174.04061889648438, "logps/rejected": -876.1500244140625, "loss": 0.0405, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.810742139816284, "rewards/margins": 32.76640701293945, "rewards/rejected": -36.57734298706055, "step": 7870 }, { "epoch": 1.9867330539186336, "grad_norm": 0.0038667183835059404, "learning_rate": 1.5456636512178794e-07, "logits/chosen": 0.28562623262405396, "logits/rejected": NaN, "logps/chosen": -188.2703094482422, "logps/rejected": -900.0999755859375, "loss": 0.2297, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.566552639007568, "rewards/margins": 34.634376525878906, "rewards/rejected": -39.19843673706055, "step": 7880 }, { "epoch": 1.9892540888034538, "grad_norm": 2.7628960609436035, "learning_rate": 1.5388885233948697e-07, "logits/chosen": 0.08388672024011612, "logits/rejected": 0.5630577206611633, "logps/chosen": -182.39688110351562, "logps/rejected": -911.7000122070312, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.17620849609375, "rewards/margins": 34.70624923706055, "rewards/rejected": -38.889060974121094, "step": 7890 }, { "epoch": 1.991775123688274, "grad_norm": 0.0016876989975571632, "learning_rate": 1.5321216684470446e-07, "logits/chosen": 0.03908080980181694, "logits/rejected": NaN, "logps/chosen": -173.0546875, "logps/rejected": -906.8499755859375, "loss": 0.1175, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8508880138397217, "rewards/margins": 35.892189025878906, "rewards/rejected": -39.732810974121094, "step": 7900 }, { "epoch": 1.9942961585730943, "grad_norm": 56.12511444091797, "learning_rate": 1.525363144620868e-07, "logits/chosen": 0.13910523056983948, "logits/rejected": NaN, "logps/chosen": -165.34375, "logps/rejected": -887.2000122070312, "loss": 0.0467, "rewards/accuracies": 0.984375, "rewards/chosen": -3.9519591331481934, "rewards/margins": 35.07890701293945, "rewards/rejected": -39.0234375, "step": 7910 }, { "epoch": 1.9968171934579144, "grad_norm": 8.173598289489746, "learning_rate": 1.518613010091095e-07, "logits/chosen": 0.05412750318646431, "logits/rejected": NaN, "logps/chosen": -181.43124389648438, "logps/rejected": -878.75, "loss": 0.0511, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.245507717132568, "rewards/margins": 33.127342224121094, "rewards/rejected": -37.384376525878906, "step": 7920 }, { "epoch": 1.9993382283427348, "grad_norm": 1.1945041418075562, "learning_rate": 1.5118713229602692e-07, "logits/chosen": 0.21494445204734802, "logits/rejected": NaN, "logps/chosen": -192.703125, "logps/rejected": -934.2000122070312, "loss": 0.0312, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.713671684265137, "rewards/margins": 36.135154724121094, "rewards/rejected": -40.860939025878906, "step": 7930 }, { "epoch": 2.0020168279078563, "grad_norm": 0.034581031650304794, "learning_rate": 1.5051381412582204e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -195.88987731933594, "logps/rejected": -933.3095092773438, "loss": 0.0583, "rewards/accuracies": 0.9910714030265808, "rewards/chosen": -4.909400463104248, "rewards/margins": 37.04166793823242, "rewards/rejected": -41.95089340209961, "step": 7940 }, { "epoch": 2.0045378627926764, "grad_norm": 144.4689483642578, "learning_rate": 1.4984135229415668e-07, "logits/chosen": 0.3311401307582855, "logits/rejected": NaN, "logps/chosen": -177.1999969482422, "logps/rejected": -929.5, "loss": 0.0372, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.238671779632568, "rewards/margins": 37.484375, "rewards/rejected": -42.728126525878906, "step": 7950 }, { "epoch": 2.0070588976774966, "grad_norm": 0.01795245334506035, "learning_rate": 1.491697525893219e-07, "logits/chosen": 0.353707879781723, "logits/rejected": NaN, "logps/chosen": -189.8125, "logps/rejected": -947.125, "loss": 0.0262, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.220995903015137, "rewards/margins": 37.64921951293945, "rewards/rejected": -42.87187576293945, "step": 7960 }, { "epoch": 2.0095799325623167, "grad_norm": 0.0021353703923523426, "learning_rate": 1.4849902079218767e-07, "logits/chosen": 0.12977752089500427, "logits/rejected": 0.802111804485321, "logps/chosen": -186.38436889648438, "logps/rejected": -971.9500122070312, "loss": 0.0121, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.741943359375, "rewards/margins": 39.59375, "rewards/rejected": -44.337501525878906, "step": 7970 }, { "epoch": 2.012100967447137, "grad_norm": 1.8951023817062378, "learning_rate": 1.4782916267615337e-07, "logits/chosen": 0.18678435683250427, "logits/rejected": NaN, "logps/chosen": -201.1687469482422, "logps/rejected": -943.3499755859375, "loss": 0.015, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.334619045257568, "rewards/margins": 37.47968673706055, "rewards/rejected": -42.826560974121094, "step": 7980 }, { "epoch": 2.0146220023319574, "grad_norm": 0.030085410922765732, "learning_rate": 1.4716018400709806e-07, "logits/chosen": 0.21689605712890625, "logits/rejected": NaN, "logps/chosen": -202.1906280517578, "logps/rejected": -956.25, "loss": 0.0582, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.495348930358887, "rewards/margins": 37.54999923706055, "rewards/rejected": -43.040626525878906, "step": 7990 }, { "epoch": 2.0171430372167776, "grad_norm": 0.00011016646749339998, "learning_rate": 1.4649209054333105e-07, "logits/chosen": 0.23560181260108948, "logits/rejected": 0.5966636538505554, "logps/chosen": -207.3312530517578, "logps/rejected": -975.5999755859375, "loss": 0.0078, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.1937408447265625, "rewards/margins": 38.59062576293945, "rewards/rejected": -43.78437423706055, "step": 8000 }, { "epoch": 2.0196640721015977, "grad_norm": 5.2616376876831055, "learning_rate": 1.4582488803554194e-07, "logits/chosen": 0.15915831923484802, "logits/rejected": 0.836016833782196, "logps/chosen": -185.33438110351562, "logps/rejected": -940.9000244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.509765625, "rewards/margins": 38.03437423706055, "rewards/rejected": -42.548439025878906, "step": 8010 }, { "epoch": 2.022185106986418, "grad_norm": 0.033615607768297195, "learning_rate": 1.4515858222675136e-07, "logits/chosen": 0.21024474501609802, "logits/rejected": NaN, "logps/chosen": -173.7937469482422, "logps/rejected": -962.8499755859375, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.6734619140625, "rewards/margins": 39.701560974121094, "rewards/rejected": -44.376564025878906, "step": 8020 }, { "epoch": 2.024706141871238, "grad_norm": 0.0008385602850466967, "learning_rate": 1.4449317885226153e-07, "logits/chosen": 0.27131348848342896, "logits/rejected": NaN, "logps/chosen": -189.69375610351562, "logps/rejected": -921.7000122070312, "loss": 0.0295, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.676800727844238, "rewards/margins": 37.748435974121094, "rewards/rejected": -42.439064025878906, "step": 8030 }, { "epoch": 2.0272271767560586, "grad_norm": 0.003849217901006341, "learning_rate": 1.43828683639607e-07, "logits/chosen": 0.23341675102710724, "logits/rejected": 0.7318938970565796, "logps/chosen": -175.6984405517578, "logps/rejected": -1025.75, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.673608303070068, "rewards/margins": 41.5234375, "rewards/rejected": -46.201560974121094, "step": 8040 }, { "epoch": 2.0297482116408787, "grad_norm": 0.18424677848815918, "learning_rate": 1.43165102308505e-07, "logits/chosen": 0.2866165041923523, "logits/rejected": 1.0343414545059204, "logps/chosen": -203.2312469482422, "logps/rejected": -979.75, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -6.016308784484863, "rewards/margins": 39.548439025878906, "rewards/rejected": -45.56562423706055, "step": 8050 }, { "epoch": 2.032269246525699, "grad_norm": 1.1038318872451782, "learning_rate": 1.4250244057080634e-07, "logits/chosen": 0.5042136907577515, "logits/rejected": NaN, "logps/chosen": -162.3249969482422, "logps/rejected": -962.75, "loss": 0.0337, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.961669921875, "rewards/margins": 40.5390625, "rewards/rejected": -45.48125076293945, "step": 8060 }, { "epoch": 2.034790281410519, "grad_norm": 7.91022102930583e-05, "learning_rate": 1.4184070413044677e-07, "logits/chosen": 0.36625367403030396, "logits/rejected": NaN, "logps/chosen": -190.7156219482422, "logps/rejected": -937.0499877929688, "loss": 0.011, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.942151069641113, "rewards/margins": 38.256248474121094, "rewards/rejected": -43.209373474121094, "step": 8070 }, { "epoch": 2.037311316295339, "grad_norm": 0.04693185165524483, "learning_rate": 1.4117989868339706e-07, "logits/chosen": 0.22976303100585938, "logits/rejected": NaN, "logps/chosen": -182.8625030517578, "logps/rejected": -939.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.847094535827637, "rewards/margins": 39.38593673706055, "rewards/rejected": -44.23749923706055, "step": 8080 }, { "epoch": 2.0398323511801593, "grad_norm": 0.0002555667015258223, "learning_rate": 1.4052002991761434e-07, "logits/chosen": 0.13442687690258026, "logits/rejected": 0.8015899658203125, "logps/chosen": -190.2765655517578, "logps/rejected": -943.0999755859375, "loss": 0.0086, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.910723686218262, "rewards/margins": 38.1640625, "rewards/rejected": -43.0546875, "step": 8090 }, { "epoch": 2.04235338606498, "grad_norm": 0.12331169843673706, "learning_rate": 1.3986110351299342e-07, "logits/chosen": 0.18000487983226776, "logits/rejected": NaN, "logps/chosen": -191.6875, "logps/rejected": -971.5999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.624804496765137, "rewards/margins": 39.014060974121094, "rewards/rejected": -43.646873474121094, "step": 8100 }, { "epoch": 2.0448744209498, "grad_norm": 0.005603868048638105, "learning_rate": 1.3920312514131742e-07, "logits/chosen": 0.2978576719760895, "logits/rejected": NaN, "logps/chosen": -166.33438110351562, "logps/rejected": -915.3499755859375, "loss": 0.0049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.564917087554932, "rewards/margins": 37.717185974121094, "rewards/rejected": -42.290626525878906, "step": 8110 }, { "epoch": 2.04739545583462, "grad_norm": 0.5351747274398804, "learning_rate": 1.3854610046620926e-07, "logits/chosen": 0.31724852323532104, "logits/rejected": NaN, "logps/chosen": -177.546875, "logps/rejected": -924.9000244140625, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.861462593078613, "rewards/margins": 37.482810974121094, "rewards/rejected": -42.34375, "step": 8120 }, { "epoch": 2.0499164907194403, "grad_norm": 3.0422907002503052e-06, "learning_rate": 1.3789003514308257e-07, "logits/chosen": 0.332192987203598, "logits/rejected": 0.8176819086074829, "logps/chosen": -180.3156280517578, "logps/rejected": -952.2999877929688, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.167187690734863, "rewards/margins": 38.795310974121094, "rewards/rejected": -43.9765625, "step": 8130 }, { "epoch": 2.0524375256042604, "grad_norm": 0.000558209721930325, "learning_rate": 1.372349348190937e-07, "logits/chosen": 0.341583251953125, "logits/rejected": NaN, "logps/chosen": -198.5656280517578, "logps/rejected": -960.4000244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.48681640625, "rewards/margins": 39.0390625, "rewards/rejected": -44.51874923706055, "step": 8140 }, { "epoch": 2.054958560489081, "grad_norm": 0.0016069002449512482, "learning_rate": 1.3658080513309217e-07, "logits/chosen": 0.49768370389938354, "logits/rejected": 0.8949981927871704, "logps/chosen": -168.5187530517578, "logps/rejected": -965.3499755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.425122261047363, "rewards/margins": 40.467185974121094, "rewards/rejected": -45.8828125, "step": 8150 }, { "epoch": 2.057479595373901, "grad_norm": 0.001727136317640543, "learning_rate": 1.3592765171557307e-07, "logits/chosen": 0.41114503145217896, "logits/rejected": NaN, "logps/chosen": -174.3828125, "logps/rejected": -966.3499755859375, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.120434761047363, "rewards/margins": 39.048439025878906, "rewards/rejected": -44.170310974121094, "step": 8160 }, { "epoch": 2.0600006302587213, "grad_norm": 0.0002446337603032589, "learning_rate": 1.3527548018862775e-07, "logits/chosen": 0.23077774047851562, "logits/rejected": NaN, "logps/chosen": -185.8937530517578, "logps/rejected": -936.6500244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.124243259429932, "rewards/margins": 37.623435974121094, "rewards/rejected": -42.74531173706055, "step": 8170 }, { "epoch": 2.0625216651435414, "grad_norm": 0.002001918386667967, "learning_rate": 1.346242961658962e-07, "logits/chosen": 0.08269958198070526, "logits/rejected": NaN, "logps/chosen": -185.4656219482422, "logps/rejected": -901.8499755859375, "loss": 0.0172, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.6182861328125, "rewards/margins": 34.509376525878906, "rewards/rejected": -39.12187576293945, "step": 8180 }, { "epoch": 2.0650427000283615, "grad_norm": 0.0002117256517522037, "learning_rate": 1.3397410525251806e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -191.1906280517578, "logps/rejected": -916.7000122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.315954685211182, "rewards/margins": 35.029685974121094, "rewards/rejected": -39.34375, "step": 8190 }, { "epoch": 2.0675637349131817, "grad_norm": 0.004602792207151651, "learning_rate": 1.3332491304508472e-07, "logits/chosen": 0.16581574082374573, "logits/rejected": 0.815747082233429, "logps/chosen": -185.74063110351562, "logps/rejected": -865.4500122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.421105861663818, "rewards/margins": 33.7421875, "rewards/rejected": -38.17499923706055, "step": 8200 }, { "epoch": 2.0700847697980023, "grad_norm": 0.4139191508293152, "learning_rate": 1.3267672513159133e-07, "logits/chosen": 0.2683563232421875, "logits/rejected": 0.8873519897460938, "logps/chosen": -197.50936889648438, "logps/rejected": -944.9500122070312, "loss": 0.0188, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.663232326507568, "rewards/margins": 36.37578201293945, "rewards/rejected": -41.037498474121094, "step": 8210 }, { "epoch": 2.0726058046828224, "grad_norm": 0.00204647914506495, "learning_rate": 1.3202954709138818e-07, "logits/chosen": 0.2787914276123047, "logits/rejected": NaN, "logps/chosen": -160.74063110351562, "logps/rejected": -902.4000244140625, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8236145973205566, "rewards/margins": 35.603126525878906, "rewards/rejected": -39.41718673706055, "step": 8220 }, { "epoch": 2.0751268395676425, "grad_norm": 0.7673875093460083, "learning_rate": 1.31383384495133e-07, "logits/chosen": 0.23453369736671448, "logits/rejected": 0.7910858392715454, "logps/chosen": -181.79531860351562, "logps/rejected": -920.25, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.535876274108887, "rewards/margins": 36.68828201293945, "rewards/rejected": -41.2265625, "step": 8230 }, { "epoch": 2.0776478744524627, "grad_norm": 0.02122541330754757, "learning_rate": 1.3073824290474323e-07, "logits/chosen": 0.250326544046402, "logits/rejected": NaN, "logps/chosen": -168.75625610351562, "logps/rejected": -908.0250244140625, "loss": 0.0288, "rewards/accuracies": 0.984375, "rewards/chosen": -4.48956298828125, "rewards/margins": 36.474998474121094, "rewards/rejected": -40.967185974121094, "step": 8240 }, { "epoch": 2.080168909337283, "grad_norm": 0.00020550383487716317, "learning_rate": 1.3009412787334762e-07, "logits/chosen": 0.3440490663051605, "logits/rejected": NaN, "logps/chosen": -170.2468719482422, "logps/rejected": -913.4500122070312, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9704957008361816, "rewards/margins": 36.5546875, "rewards/rejected": -40.55156326293945, "step": 8250 }, { "epoch": 2.082689944222103, "grad_norm": 0.2147044539451599, "learning_rate": 1.2945104494523875e-07, "logits/chosen": 0.4516448974609375, "logits/rejected": NaN, "logps/chosen": -174.875, "logps/rejected": -888.2999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.816741943359375, "rewards/margins": 34.79999923706055, "rewards/rejected": -39.615623474121094, "step": 8260 }, { "epoch": 2.0852109791069235, "grad_norm": 0.012966674752533436, "learning_rate": 1.2880899965582526e-07, "logits/chosen": 0.3217620849609375, "logits/rejected": 0.8363128900527954, "logps/chosen": -178.8874969482422, "logps/rejected": -916.0250244140625, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.439856052398682, "rewards/margins": 36.42265701293945, "rewards/rejected": -40.87812423706055, "step": 8270 }, { "epoch": 2.0877320139917437, "grad_norm": 0.00010526629921514541, "learning_rate": 1.281679975315843e-07, "logits/chosen": 0.2986045777797699, "logits/rejected": NaN, "logps/chosen": -197.3328094482422, "logps/rejected": -949.0499877929688, "loss": 0.0234, "rewards/accuracies": 0.984375, "rewards/chosen": -4.922411918640137, "rewards/margins": 37.87968826293945, "rewards/rejected": -42.80937576293945, "step": 8280 }, { "epoch": 2.090253048876564, "grad_norm": 0.7723494172096252, "learning_rate": 1.2752804409001372e-07, "logits/chosen": 0.3203063905239105, "logits/rejected": NaN, "logps/chosen": -194.8874969482422, "logps/rejected": -945.9000244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.063867092132568, "rewards/margins": 39.779685974121094, "rewards/rejected": -44.826560974121094, "step": 8290 }, { "epoch": 2.092774083761384, "grad_norm": 1.3351274901651777e-05, "learning_rate": 1.2688914483958487e-07, "logits/chosen": 0.34184569120407104, "logits/rejected": 1.179203748703003, "logps/chosen": -188.8249969482422, "logps/rejected": -931.7999877929688, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.605853080749512, "rewards/margins": 38.86406326293945, "rewards/rejected": -44.46406173706055, "step": 8300 }, { "epoch": 2.095295118646204, "grad_norm": 0.03036135621368885, "learning_rate": 1.262513052796948e-07, "logits/chosen": 0.18686524033546448, "logits/rejected": NaN, "logps/chosen": -195.93124389648438, "logps/rejected": -946.7249755859375, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.527441501617432, "rewards/margins": 38.30937576293945, "rewards/rejected": -43.829689025878906, "step": 8310 }, { "epoch": 2.0978161535310247, "grad_norm": 0.00213005137629807, "learning_rate": 1.256145309006195e-07, "logits/chosen": 0.31111449003219604, "logits/rejected": NaN, "logps/chosen": -186.296875, "logps/rejected": -938.5750122070312, "loss": 0.0029, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.662549018859863, "rewards/margins": 39.08281326293945, "rewards/rejected": -44.734375, "step": 8320 }, { "epoch": 2.100337188415845, "grad_norm": 0.00019047695968765765, "learning_rate": 1.2497882718346594e-07, "logits/chosen": 0.2465255707502365, "logits/rejected": NaN, "logps/chosen": -201.890625, "logps/rejected": -948.1500244140625, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -5.728613376617432, "rewards/margins": 37.993751525878906, "rewards/rejected": -43.71875, "step": 8330 }, { "epoch": 2.102858223300665, "grad_norm": 0.05980094149708748, "learning_rate": 1.2434419960012536e-07, "logits/chosen": 0.5119430422782898, "logits/rejected": 1.0720551013946533, "logps/chosen": -173.9523468017578, "logps/rejected": -950.5499877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.122363090515137, "rewards/margins": 39.63750076293945, "rewards/rejected": -44.75, "step": 8340 }, { "epoch": 2.105379258185485, "grad_norm": 0.0027656294405460358, "learning_rate": 1.2371065361322626e-07, "logits/chosen": 0.5268402099609375, "logits/rejected": 1.1068298816680908, "logps/chosen": -187.2468719482422, "logps/rejected": -915.4500122070312, "loss": 0.0106, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.526415824890137, "rewards/margins": 37.529685974121094, "rewards/rejected": -43.029685974121094, "step": 8350 }, { "epoch": 2.1079002930703052, "grad_norm": 0.029048455879092216, "learning_rate": 1.2307819467608688e-07, "logits/chosen": 0.4337005615234375, "logits/rejected": NaN, "logps/chosen": -180.421875, "logps/rejected": -924.9000244140625, "loss": 0.0068, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.263769626617432, "rewards/margins": 38.96484375, "rewards/rejected": -44.220314025878906, "step": 8360 }, { "epoch": 2.1104213279551254, "grad_norm": 0.0010737302945926785, "learning_rate": 1.2244682823266867e-07, "logits/chosen": 0.5243927240371704, "logits/rejected": 1.0102355480194092, "logps/chosen": -199.49374389648438, "logps/rejected": -979.0999755859375, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.748266696929932, "rewards/margins": 39.525001525878906, "rewards/rejected": -45.279685974121094, "step": 8370 }, { "epoch": 2.112942362839946, "grad_norm": 0.09518476575613022, "learning_rate": 1.2181655971752947e-07, "logits/chosen": 0.46284180879592896, "logits/rejected": NaN, "logps/chosen": -172.2531280517578, "logps/rejected": -985.75, "loss": 0.0113, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.838433742523193, "rewards/margins": 41.01171875, "rewards/rejected": -45.83906173706055, "step": 8380 }, { "epoch": 2.115463397724766, "grad_norm": 0.004680037032812834, "learning_rate": 1.2118739455577637e-07, "logits/chosen": 0.162261962890625, "logits/rejected": NaN, "logps/chosen": -185.7890625, "logps/rejected": -957.8499755859375, "loss": 0.0255, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.473876953125, "rewards/margins": 38.2578125, "rewards/rejected": -42.73749923706055, "step": 8390 }, { "epoch": 2.1179844326095862, "grad_norm": 0.007956395857036114, "learning_rate": 1.205593381630193e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -186.6531219482422, "logps/rejected": -939.1500244140625, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.444580078125, "rewards/margins": 38.529685974121094, "rewards/rejected": -42.974998474121094, "step": 8400 }, { "epoch": 2.1205054674944064, "grad_norm": 0.17255857586860657, "learning_rate": 1.1993239594532423e-07, "logits/chosen": 0.3725524842739105, "logits/rejected": 0.9725433588027954, "logps/chosen": -173.8874969482422, "logps/rejected": -939.0, "loss": 0.0287, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.624951362609863, "rewards/margins": 38.381248474121094, "rewards/rejected": -43.01250076293945, "step": 8410 }, { "epoch": 2.1230265023792265, "grad_norm": 0.0032815837766975164, "learning_rate": 1.1930657329916704e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -169.02499389648438, "logps/rejected": -923.2999877929688, "loss": 0.0133, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9112305641174316, "rewards/margins": 37.279685974121094, "rewards/rejected": -41.20624923706055, "step": 8420 }, { "epoch": 2.1255475372640467, "grad_norm": 0.0012539627496153116, "learning_rate": 1.1868187561138637e-07, "logits/chosen": 0.3478347659111023, "logits/rejected": NaN, "logps/chosen": -170.11874389648438, "logps/rejected": -908.4500122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.823779344558716, "rewards/margins": 36.264060974121094, "rewards/rejected": -40.079689025878906, "step": 8430 }, { "epoch": 2.1280685721488672, "grad_norm": 0.004931879695504904, "learning_rate": 1.180583082591381e-07, "logits/chosen": NaN, "logits/rejected": 0.9827117919921875, "logps/chosen": -165.64999389648438, "logps/rejected": -921.5999755859375, "loss": 0.0486, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.477270603179932, "rewards/margins": 37.015625, "rewards/rejected": -41.48906326293945, "step": 8440 }, { "epoch": 2.1305896070336874, "grad_norm": 0.0009592586429789662, "learning_rate": 1.1743587660984814e-07, "logits/chosen": 0.2225189208984375, "logits/rejected": NaN, "logps/chosen": -170.609375, "logps/rejected": -932.0, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.8666014671325684, "rewards/margins": 36.619529724121094, "rewards/rejected": -40.4765625, "step": 8450 }, { "epoch": 2.1331106419185075, "grad_norm": 0.00030546728521585464, "learning_rate": 1.1681458602116714e-07, "logits/chosen": 0.21745148301124573, "logits/rejected": NaN, "logps/chosen": -189.55624389648438, "logps/rejected": -938.0499877929688, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -3.802044630050659, "rewards/margins": 35.533592224121094, "rewards/rejected": -39.345314025878906, "step": 8460 }, { "epoch": 2.1356316768033277, "grad_norm": 0.022328168153762817, "learning_rate": 1.161944418409237e-07, "logits/chosen": 0.3280029296875, "logits/rejected": 0.6452789306640625, "logps/chosen": -171.40625, "logps/rejected": -897.8499755859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9460082054138184, "rewards/margins": 33.15156173706055, "rewards/rejected": -37.092185974121094, "step": 8470 }, { "epoch": 2.138152711688148, "grad_norm": 0.4196684956550598, "learning_rate": 1.1557544940707853e-07, "logits/chosen": 0.15756531059741974, "logits/rejected": NaN, "logps/chosen": -189.84375, "logps/rejected": -932.9500122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.291308403015137, "rewards/margins": 35.2890625, "rewards/rejected": -39.595314025878906, "step": 8480 }, { "epoch": 2.1406737465729684, "grad_norm": 0.00027302815578877926, "learning_rate": 1.1495761404767882e-07, "logits/chosen": 0.1400909423828125, "logits/rejected": NaN, "logps/chosen": -196.0343780517578, "logps/rejected": -939.0499877929688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -4.787524223327637, "rewards/margins": 36.22187423706055, "rewards/rejected": -41.00312423706055, "step": 8490 }, { "epoch": 2.1431947814577885, "grad_norm": 0.015516266226768494, "learning_rate": 1.1434094108081186e-07, "logits/chosen": 0.17211762070655823, "logits/rejected": 0.695904552936554, "logps/chosen": -188.0398406982422, "logps/rejected": -884.0, "loss": 0.0857, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.79608154296875, "rewards/margins": 32.87031173706055, "rewards/rejected": -37.646873474121094, "step": 8500 }, { "epoch": 2.1457158163426087, "grad_norm": 0.00019120647630188614, "learning_rate": 1.1372543581455949e-07, "logits/chosen": 0.14965057373046875, "logits/rejected": NaN, "logps/chosen": -177.875, "logps/rejected": -950.3499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9785752296447754, "rewards/margins": 35.712501525878906, "rewards/rejected": -39.689064025878906, "step": 8510 }, { "epoch": 2.148236851227429, "grad_norm": 0.011825147084891796, "learning_rate": 1.131111035469528e-07, "logits/chosen": 0.2042396515607834, "logits/rejected": NaN, "logps/chosen": -159.2218780517578, "logps/rejected": -875.4500122070312, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.085669040679932, "rewards/margins": 34.22343826293945, "rewards/rejected": -38.3046875, "step": 8520 }, { "epoch": 2.150757886112249, "grad_norm": 0.0031893388368189335, "learning_rate": 1.1249794956592576e-07, "logits/chosen": 0.236602783203125, "logits/rejected": NaN, "logps/chosen": -174.9656219482422, "logps/rejected": -918.8499755859375, "loss": 0.0107, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.220385551452637, "rewards/margins": 33.9609375, "rewards/rejected": -38.20000076293945, "step": 8530 }, { "epoch": 2.153278920997069, "grad_norm": 0.04355587437748909, "learning_rate": 1.1188597914927028e-07, "logits/chosen": 0.12315063178539276, "logits/rejected": NaN, "logps/chosen": -166.4093780517578, "logps/rejected": -893.7000122070312, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.7870850563049316, "rewards/margins": 33.99609375, "rewards/rejected": -37.787498474121094, "step": 8540 }, { "epoch": 2.1557999558818897, "grad_norm": 0.010151010937988758, "learning_rate": 1.1127519756459047e-07, "logits/chosen": 0.17731627821922302, "logits/rejected": NaN, "logps/chosen": -178.8156280517578, "logps/rejected": -895.1500244140625, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.922778367996216, "rewards/margins": 33.78125, "rewards/rejected": -37.6796875, "step": 8550 }, { "epoch": 2.15832099076671, "grad_norm": 0.010332955047488213, "learning_rate": 1.1066561006925779e-07, "logits/chosen": 0.15197142958641052, "logits/rejected": NaN, "logps/chosen": -178.6875, "logps/rejected": -896.3499755859375, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.167578220367432, "rewards/margins": 35.09375, "rewards/rejected": -39.271873474121094, "step": 8560 }, { "epoch": 2.16084202565153, "grad_norm": 0.7306037545204163, "learning_rate": 1.1005722191036492e-07, "logits/chosen": 0.3028411865234375, "logits/rejected": 0.7414520382881165, "logps/chosen": -170.3992156982422, "logps/rejected": -934.5499877929688, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.320068359375, "rewards/margins": 35.678123474121094, "rewards/rejected": -40.0, "step": 8570 }, { "epoch": 2.16336306053635, "grad_norm": 0.0002608724171295762, "learning_rate": 1.0945003832468169e-07, "logits/chosen": 0.1360015869140625, "logits/rejected": 0.6764281988143921, "logps/chosen": -175.71249389648438, "logps/rejected": -926.5499877929688, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.218847751617432, "rewards/margins": 36.470314025878906, "rewards/rejected": -40.69218826293945, "step": 8580 }, { "epoch": 2.16588409542117, "grad_norm": 0.610596239566803, "learning_rate": 1.0884406453860886e-07, "logits/chosen": 0.2071884125471115, "logits/rejected": 0.8358367681503296, "logps/chosen": -171.4031219482422, "logps/rejected": -889.9000244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.79937744140625, "rewards/margins": 33.33281326293945, "rewards/rejected": -37.139060974121094, "step": 8590 }, { "epoch": 2.168405130305991, "grad_norm": 0.3795524835586548, "learning_rate": 1.0823930576813425e-07, "logits/chosen": 0.15550383925437927, "logits/rejected": 0.6229339838027954, "logps/chosen": -177.49063110351562, "logps/rejected": -936.5499877929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.9339842796325684, "rewards/margins": 35.98906326293945, "rewards/rejected": -39.939064025878906, "step": 8600 }, { "epoch": 2.170926165190811, "grad_norm": 0.004137884825468063, "learning_rate": 1.0763576721878686e-07, "logits/chosen": 0.24252624809741974, "logits/rejected": NaN, "logps/chosen": -185.75, "logps/rejected": -859.6500244140625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": -3.702392578125, "rewards/margins": 32.203125, "rewards/rejected": -35.90937423706055, "step": 8610 }, { "epoch": 2.173447200075631, "grad_norm": 0.0047319140285253525, "learning_rate": 1.0703345408559261e-07, "logits/chosen": 0.11767120659351349, "logits/rejected": NaN, "logps/chosen": -182.89999389648438, "logps/rejected": -868.0999755859375, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8382568359375, "rewards/margins": 33.505470275878906, "rewards/rejected": -37.342185974121094, "step": 8620 }, { "epoch": 2.175968234960451, "grad_norm": 0.003823092905804515, "learning_rate": 1.0643237155302975e-07, "logits/chosen": 0.23224487900733948, "logits/rejected": NaN, "logps/chosen": -181.06875610351562, "logps/rejected": -909.5, "loss": 0.0185, "rewards/accuracies": 0.984375, "rewards/chosen": -4.499585151672363, "rewards/margins": 35.53593826293945, "rewards/rejected": -40.04999923706055, "step": 8630 }, { "epoch": 2.1784892698452714, "grad_norm": 2.203221321105957, "learning_rate": 1.0583252479498372e-07, "logits/chosen": 0.02857666090130806, "logits/rejected": NaN, "logps/chosen": -176.77499389648438, "logps/rejected": -874.0999755859375, "loss": 0.0353, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8621582984924316, "rewards/margins": 34.264060974121094, "rewards/rejected": -38.1171875, "step": 8640 }, { "epoch": 2.1810103047300915, "grad_norm": 0.0024506214540451765, "learning_rate": 1.0523391897470299e-07, "logits/chosen": 0.13394013047218323, "logits/rejected": NaN, "logps/chosen": -203.20156860351562, "logps/rejected": -921.6500244140625, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": -4.526196479797363, "rewards/margins": 35.3203125, "rewards/rejected": -39.84843826293945, "step": 8650 }, { "epoch": 2.183531339614912, "grad_norm": 0.019111698493361473, "learning_rate": 1.0463655924475442e-07, "logits/chosen": 0.32924193143844604, "logits/rejected": 0.7140105962753296, "logps/chosen": -184.66250610351562, "logps/rejected": -928.9500122070312, "loss": 0.0479, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.583447456359863, "rewards/margins": 36.75468826293945, "rewards/rejected": -41.3359375, "step": 8660 }, { "epoch": 2.186052374499732, "grad_norm": 0.06881701946258545, "learning_rate": 1.0404045074697929e-07, "logits/chosen": 0.3058944642543793, "logits/rejected": 0.7876907587051392, "logps/chosen": -173.2624969482422, "logps/rejected": -912.0999755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.034448146820068, "rewards/margins": 35.728126525878906, "rewards/rejected": -39.75468826293945, "step": 8670 }, { "epoch": 2.1885734093845524, "grad_norm": 0.009399820119142532, "learning_rate": 1.034455986124485e-07, "logits/chosen": 0.087646484375, "logits/rejected": NaN, "logps/chosen": -179.2859344482422, "logps/rejected": -928.5499877929688, "loss": 0.0115, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.921826124191284, "rewards/margins": 34.55781173706055, "rewards/rejected": -38.482810974121094, "step": 8680 }, { "epoch": 2.1910944442693725, "grad_norm": 0.011912876740098, "learning_rate": 1.028520079614186e-07, "logits/chosen": 0.061065673828125, "logits/rejected": NaN, "logps/chosen": -181.1531219482422, "logps/rejected": -922.625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9137206077575684, "rewards/margins": 35.474998474121094, "rewards/rejected": -39.389060974121094, "step": 8690 }, { "epoch": 2.1936154791541926, "grad_norm": 0.4029819369316101, "learning_rate": 1.0225968390328816e-07, "logits/chosen": 0.04274291917681694, "logits/rejected": NaN, "logps/chosen": -169.1125030517578, "logps/rejected": -896.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.4525146484375, "rewards/margins": 34.04296875, "rewards/rejected": -37.50312423706055, "step": 8700 }, { "epoch": 2.196136514039013, "grad_norm": 0.04643462225794792, "learning_rate": 1.016686315365529e-07, "logits/chosen": 0.2211456298828125, "logits/rejected": NaN, "logps/chosen": -175.625, "logps/rejected": -892.1500244140625, "loss": 0.0098, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8020050525665283, "rewards/margins": 35.171875, "rewards/rejected": -38.99531173706055, "step": 8710 }, { "epoch": 2.1986575489238334, "grad_norm": 7.988294964889064e-05, "learning_rate": 1.0107885594876281e-07, "logits/chosen": 0.11687622219324112, "logits/rejected": NaN, "logps/chosen": -172.53750610351562, "logps/rejected": -888.0250244140625, "loss": 0.021, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.6047637462615967, "rewards/margins": 34.235939025878906, "rewards/rejected": -37.83124923706055, "step": 8720 }, { "epoch": 2.2011785838086535, "grad_norm": 0.010618013329803944, "learning_rate": 1.0049036221647741e-07, "logits/chosen": 0.2628173828125, "logits/rejected": NaN, "logps/chosen": -172.7703094482422, "logps/rejected": -904.5250244140625, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.645495653152466, "rewards/margins": 35.389060974121094, "rewards/rejected": -39.032814025878906, "step": 8730 }, { "epoch": 2.2036996186934736, "grad_norm": 0.00017698536976240575, "learning_rate": 9.990315540522296e-08, "logits/chosen": 0.27116698026657104, "logits/rejected": 0.9712417721748352, "logps/chosen": -157.69375610351562, "logps/rejected": -882.2999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.4784178733825684, "rewards/margins": 35.275001525878906, "rewards/rejected": -38.751564025878906, "step": 8740 }, { "epoch": 2.2062206535782938, "grad_norm": 4.0647602872923017e-05, "learning_rate": 9.931724056944801e-08, "logits/chosen": 0.2889343202114105, "logits/rejected": 0.882276177406311, "logps/chosen": -163.3273468017578, "logps/rejected": -881.8499755859375, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.525891065597534, "rewards/margins": 34.984375, "rewards/rejected": -38.506248474121094, "step": 8750 }, { "epoch": 2.208741688463114, "grad_norm": 1.5981154319888446e-06, "learning_rate": 9.873262275248037e-08, "logits/chosen": 0.05462036281824112, "logits/rejected": NaN, "logps/chosen": -167.0812530517578, "logps/rejected": -888.8499755859375, "loss": 0.0163, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.604199171066284, "rewards/margins": 34.990623474121094, "rewards/rejected": -38.60468673706055, "step": 8760 }, { "epoch": 2.2112627233479345, "grad_norm": 2.5704634026624262e-05, "learning_rate": 9.814930698648388e-08, "logits/chosen": 0.3345092833042145, "logits/rejected": NaN, "logps/chosen": -157.45938110351562, "logps/rejected": -913.5, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.756664991378784, "rewards/margins": 36.4609375, "rewards/rejected": -40.204689025878906, "step": 8770 }, { "epoch": 2.2137837582327546, "grad_norm": 0.010813537985086441, "learning_rate": 9.756729829241455e-08, "logits/chosen": 0.2804122865200043, "logits/rejected": NaN, "logps/chosen": -165.9578094482422, "logps/rejected": -897.1500244140625, "loss": 0.0111, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4125123023986816, "rewards/margins": 35.35468673706055, "rewards/rejected": -38.759376525878906, "step": 8780 }, { "epoch": 2.216304793117575, "grad_norm": 1.5535975762759335e-05, "learning_rate": 9.698660167997766e-08, "logits/chosen": 0.18426819145679474, "logits/rejected": NaN, "logps/chosen": -169.90625, "logps/rejected": -901.5, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.804126024246216, "rewards/margins": 34.609375, "rewards/rejected": -38.412498474121094, "step": 8790 }, { "epoch": 2.218825828002395, "grad_norm": 0.023024646565318108, "learning_rate": 9.64072221475846e-08, "logits/chosen": 0.0844573974609375, "logits/rejected": NaN, "logps/chosen": -162.97500610351562, "logps/rejected": -892.7999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.2935547828674316, "rewards/margins": 34.853904724121094, "rewards/rejected": -38.142967224121094, "step": 8800 }, { "epoch": 2.221346862887215, "grad_norm": 0.007788578514009714, "learning_rate": 9.582916468231003e-08, "logits/chosen": 0.11102447658777237, "logits/rejected": 0.8673782348632812, "logps/chosen": -184.1281280517578, "logps/rejected": -909.5250244140625, "loss": 0.0096, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7047119140625, "rewards/margins": 35.834373474121094, "rewards/rejected": -39.53125, "step": 8810 }, { "epoch": 2.2238678977720356, "grad_norm": 0.007624502293765545, "learning_rate": 9.525243425984858e-08, "logits/chosen": 0.10608367621898651, "logits/rejected": NaN, "logps/chosen": -175.60311889648438, "logps/rejected": -898.9000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.523388624191284, "rewards/margins": 35.15625, "rewards/rejected": -38.685935974121094, "step": 8820 }, { "epoch": 2.226388932656856, "grad_norm": 0.986233651638031, "learning_rate": 9.467703584447214e-08, "logits/chosen": 0.03681640699505806, "logits/rejected": 0.970262885093689, "logps/chosen": -179.41250610351562, "logps/rejected": -906.875, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.195465087890625, "rewards/margins": 35.92499923706055, "rewards/rejected": -40.10468673706055, "step": 8830 }, { "epoch": 2.228909967541676, "grad_norm": 0.02572605200111866, "learning_rate": 9.410297438898751e-08, "logits/chosen": 0.09101410210132599, "logits/rejected": NaN, "logps/chosen": -196.6906280517578, "logps/rejected": -893.2000122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.8343749046325684, "rewards/margins": 34.571876525878906, "rewards/rejected": -38.40625, "step": 8840 }, { "epoch": 2.231431002426496, "grad_norm": 0.016055447980761528, "learning_rate": 9.353025483469309e-08, "logits/chosen": -0.0016906738746911287, "logits/rejected": NaN, "logps/chosen": -176.3562469482422, "logps/rejected": -926.75, "loss": 0.0109, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.772448778152466, "rewards/margins": 35.342185974121094, "rewards/rejected": -39.115623474121094, "step": 8850 }, { "epoch": 2.233952037311316, "grad_norm": 0.04804972559213638, "learning_rate": 9.295888211133704e-08, "logits/chosen": 0.15255126357078552, "logits/rejected": NaN, "logps/chosen": -185.9031219482422, "logps/rejected": -915.4749755859375, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9581542015075684, "rewards/margins": 36.076560974121094, "rewards/rejected": -40.021873474121094, "step": 8860 }, { "epoch": 2.2364730721961363, "grad_norm": 0.0177646242082119, "learning_rate": 9.238886113707422e-08, "logits/chosen": 0.445272833108902, "logits/rejected": NaN, "logps/chosen": -169.46249389648438, "logps/rejected": -940.6500244140625, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.348462104797363, "rewards/margins": 38.37031173706055, "rewards/rejected": -42.712501525878906, "step": 8870 }, { "epoch": 2.238994107080957, "grad_norm": 0.03369855135679245, "learning_rate": 9.182019681842448e-08, "logits/chosen": 0.401885986328125, "logits/rejected": NaN, "logps/chosen": -200.6218719482422, "logps/rejected": -937.3499755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.928210496902466, "rewards/margins": 38.040626525878906, "rewards/rejected": -41.95624923706055, "step": 8880 }, { "epoch": 2.241515141965777, "grad_norm": 0.0003995455044787377, "learning_rate": 9.125289405022981e-08, "logits/chosen": NaN, "logits/rejected": 1.0214446783065796, "logps/chosen": -172.265625, "logps/rejected": -964.2000122070312, "loss": 0.0083, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.5806884765625, "rewards/margins": 39.25312423706055, "rewards/rejected": -43.82500076293945, "step": 8890 }, { "epoch": 2.244036176850597, "grad_norm": 0.6609624028205872, "learning_rate": 9.068695771561261e-08, "logits/chosen": 0.2645629942417145, "logits/rejected": 0.9762420654296875, "logps/chosen": -195.6531219482422, "logps/rejected": -941.75, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.052832126617432, "rewards/margins": 37.029685974121094, "rewards/rejected": -42.06562423706055, "step": 8900 }, { "epoch": 2.2465572117354173, "grad_norm": 0.015623444691300392, "learning_rate": 9.012239268593363e-08, "logits/chosen": 0.3916488587856293, "logits/rejected": NaN, "logps/chosen": -183.0, "logps/rejected": -911.5499877929688, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.270776271820068, "rewards/margins": 38.05781173706055, "rewards/rejected": -42.328125, "step": 8910 }, { "epoch": 2.2490782466202375, "grad_norm": 0.00015187650569714606, "learning_rate": 8.955920382074991e-08, "logits/chosen": 0.4386327862739563, "logits/rejected": 0.9196746945381165, "logps/chosen": -192.1531219482422, "logps/rejected": -964.0499877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.818041801452637, "rewards/margins": 39.87187576293945, "rewards/rejected": -44.69062423706055, "step": 8920 }, { "epoch": 2.251599281505058, "grad_norm": 0.0022624994162470102, "learning_rate": 8.899739596777292e-08, "logits/chosen": 0.44580382108688354, "logits/rejected": NaN, "logps/chosen": -182.60311889648438, "logps/rejected": -918.4500122070312, "loss": 0.0034, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.933215141296387, "rewards/margins": 37.275001525878906, "rewards/rejected": -42.2109375, "step": 8930 }, { "epoch": 2.254120316389878, "grad_norm": 1.9400588274002075, "learning_rate": 8.84369739628269e-08, "logits/chosen": 0.345947265625, "logits/rejected": NaN, "logps/chosen": -184.46249389648438, "logps/rejected": -948.9000244140625, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.992944240570068, "rewards/margins": 37.61249923706055, "rewards/rejected": -42.607810974121094, "step": 8940 }, { "epoch": 2.2566413512746983, "grad_norm": 35.809600830078125, "learning_rate": 8.78779426298075e-08, "logits/chosen": 0.19473418593406677, "logits/rejected": NaN, "logps/chosen": -209.0, "logps/rejected": -937.0999755859375, "loss": 0.0271, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.056372165679932, "rewards/margins": 37.265625, "rewards/rejected": -42.321876525878906, "step": 8950 }, { "epoch": 2.2591623861595185, "grad_norm": 30.925039291381836, "learning_rate": 8.732030678063973e-08, "logits/chosen": 0.35123902559280396, "logits/rejected": NaN, "logps/chosen": -192.6906280517578, "logps/rejected": -904.7999877929688, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.734118461608887, "rewards/margins": 34.9453125, "rewards/rejected": -39.681251525878906, "step": 8960 }, { "epoch": 2.2616834210443386, "grad_norm": 0.0020356574095785618, "learning_rate": 8.676407121523682e-08, "logits/chosen": 0.14501647651195526, "logits/rejected": NaN, "logps/chosen": -177.640625, "logps/rejected": -958.0999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.054931640625, "rewards/margins": 39.75, "rewards/rejected": -43.7890625, "step": 8970 }, { "epoch": 2.2642044559291588, "grad_norm": 2.908935070037842, "learning_rate": 8.620924072145916e-08, "logits/chosen": 0.23104706406593323, "logits/rejected": 0.860546886920929, "logps/chosen": -175.43515014648438, "logps/rejected": -939.9000244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.293798923492432, "rewards/margins": 37.32343673706055, "rewards/rejected": -41.59687423706055, "step": 8980 }, { "epoch": 2.266725490813979, "grad_norm": 0.005058923736214638, "learning_rate": 8.56558200750725e-08, "logits/chosen": 0.18536376953125, "logits/rejected": NaN, "logps/chosen": -174.4031219482422, "logps/rejected": -936.9000244140625, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.735827684402466, "rewards/margins": 37.993751525878906, "rewards/rejected": -41.7265625, "step": 8990 }, { "epoch": 2.2692465256987995, "grad_norm": 0.006509655620902777, "learning_rate": 8.510381403970748e-08, "logits/chosen": 0.21973876655101776, "logits/rejected": 1.0168182849884033, "logps/chosen": -173.70156860351562, "logps/rejected": -932.3499755859375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.049647331237793, "rewards/margins": 37.712501525878906, "rewards/rejected": -41.74531173706055, "step": 9000 }, { "epoch": 2.2717675605836196, "grad_norm": 0.05116549879312515, "learning_rate": 8.4553227366818e-08, "logits/chosen": 0.2963195741176605, "logits/rejected": NaN, "logps/chosen": -163.7859344482422, "logps/rejected": -927.0499877929688, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3849639892578125, "rewards/margins": 37.57421875, "rewards/rejected": -40.954689025878906, "step": 9010 }, { "epoch": 2.2742885954684398, "grad_norm": 0.003382830647751689, "learning_rate": 8.400406479564098e-08, "logits/chosen": 0.30292052030563354, "logits/rejected": NaN, "logps/chosen": -183.5343780517578, "logps/rejected": -924.1500244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.643420219421387, "rewards/margins": 36.985939025878906, "rewards/rejected": -41.607810974121094, "step": 9020 }, { "epoch": 2.27680963035326, "grad_norm": 0.17142857611179352, "learning_rate": 8.345633105315497e-08, "logits/chosen": 0.4185546934604645, "logits/rejected": NaN, "logps/chosen": -175.0187530517578, "logps/rejected": -917.0499877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.076318264007568, "rewards/margins": 37.900001525878906, "rewards/rejected": -41.970314025878906, "step": 9030 }, { "epoch": 2.2793306652380805, "grad_norm": 8.509245872497559, "learning_rate": 8.291003085403969e-08, "logits/chosen": 0.604779064655304, "logits/rejected": 0.986236572265625, "logps/chosen": -158.5234375, "logps/rejected": -896.7000122070312, "loss": 0.0315, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.675183057785034, "rewards/margins": 35.681251525878906, "rewards/rejected": -39.368751525878906, "step": 9040 }, { "epoch": 2.2818517001229006, "grad_norm": 5.7703318816493265e-06, "learning_rate": 8.236516890063572e-08, "logits/chosen": 0.3044677674770355, "logits/rejected": 0.824169933795929, "logps/chosen": -170.4499969482422, "logps/rejected": -995.3499755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.728906154632568, "rewards/margins": 39.212501525878906, "rewards/rejected": -43.95000076293945, "step": 9050 }, { "epoch": 2.2843727350077208, "grad_norm": 0.644170343875885, "learning_rate": 8.182174988290361e-08, "logits/chosen": 0.29655152559280396, "logits/rejected": NaN, "logps/chosen": -190.40625, "logps/rejected": -975.7000122070312, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.881543159484863, "rewards/margins": 39.170310974121094, "rewards/rejected": -44.0546875, "step": 9060 }, { "epoch": 2.286893769892541, "grad_norm": 0.030749812722206116, "learning_rate": 8.127977847838365e-08, "logits/chosen": 0.3984130918979645, "logits/rejected": NaN, "logps/chosen": -173.7062530517578, "logps/rejected": -920.5, "loss": 0.0811, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.337328910827637, "rewards/margins": 38.1171875, "rewards/rejected": -42.45624923706055, "step": 9070 }, { "epoch": 2.289414804777361, "grad_norm": 0.15078449249267578, "learning_rate": 8.073925935215567e-08, "logits/chosen": 0.354605108499527, "logits/rejected": NaN, "logps/chosen": -175.25936889648438, "logps/rejected": -950.2999877929688, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.368719577789307, "rewards/margins": 38.696876525878906, "rewards/rejected": -43.046875, "step": 9080 }, { "epoch": 2.291935839662181, "grad_norm": 0.1754370927810669, "learning_rate": 8.020019715679896e-08, "logits/chosen": 0.3769592344760895, "logits/rejected": 1.0253143310546875, "logps/chosen": -191.02499389648438, "logps/rejected": -937.75, "loss": 0.0184, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.706787109375, "rewards/margins": 37.131248474121094, "rewards/rejected": -41.834373474121094, "step": 9090 }, { "epoch": 2.2944568745470013, "grad_norm": 0.00023682565370108932, "learning_rate": 7.9662596532352e-08, "logits/chosen": 0.15894775092601776, "logits/rejected": 0.989147961139679, "logps/chosen": -184.828125, "logps/rejected": -927.9000244140625, "loss": 0.0032, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.133886814117432, "rewards/margins": 36.841407775878906, "rewards/rejected": -40.96953201293945, "step": 9100 }, { "epoch": 2.296977909431822, "grad_norm": 0.16501523554325104, "learning_rate": 7.912646210627252e-08, "logits/chosen": 0.3115478456020355, "logits/rejected": NaN, "logps/chosen": -190.89688110351562, "logps/rejected": -931.0, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.416586399078369, "rewards/margins": 36.82500076293945, "rewards/rejected": -41.23906326293945, "step": 9110 }, { "epoch": 2.299498944316642, "grad_norm": 0.03360718861222267, "learning_rate": 7.859179849339814e-08, "logits/chosen": 0.31773680448532104, "logits/rejected": NaN, "logps/chosen": -163.0343780517578, "logps/rejected": -912.0750122070312, "loss": 0.055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.221117973327637, "rewards/margins": 36.790626525878906, "rewards/rejected": -41.01250076293945, "step": 9120 }, { "epoch": 2.302019979201462, "grad_norm": 8.076288213487715e-05, "learning_rate": 7.80586102959059e-08, "logits/chosen": 0.2569518983364105, "logits/rejected": NaN, "logps/chosen": -182.00625610351562, "logps/rejected": -930.7000122070312, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9328980445861816, "rewards/margins": 36.890625, "rewards/rejected": -40.818748474121094, "step": 9130 }, { "epoch": 2.3045410140862823, "grad_norm": 0.0006922541651874781, "learning_rate": 7.752690210327337e-08, "logits/chosen": 0.410980224609375, "logits/rejected": NaN, "logps/chosen": -165.7624969482422, "logps/rejected": -877.4000244140625, "loss": 0.0067, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8898072242736816, "rewards/margins": 34.919532775878906, "rewards/rejected": -38.83124923706055, "step": 9140 }, { "epoch": 2.3070620489711025, "grad_norm": 0.0013041390338912606, "learning_rate": 7.699667849223842e-08, "logits/chosen": 0.44231873750686646, "logits/rejected": NaN, "logps/chosen": -168.8156280517578, "logps/rejected": -884.7249755859375, "loss": 0.016, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.184057712554932, "rewards/margins": 35.56718826293945, "rewards/rejected": -39.73906326293945, "step": 9150 }, { "epoch": 2.309583083855923, "grad_norm": 0.004136791452765465, "learning_rate": 7.646794402676071e-08, "logits/chosen": 0.481008917093277, "logits/rejected": NaN, "logps/chosen": -179.8937530517578, "logps/rejected": -929.8499755859375, "loss": 0.0219, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.839209079742432, "rewards/margins": 36.775001525878906, "rewards/rejected": -41.58281326293945, "step": 9160 }, { "epoch": 2.312104118740743, "grad_norm": 0.00041664214222691953, "learning_rate": 7.594070325798149e-08, "logits/chosen": 0.31203001737594604, "logits/rejected": NaN, "logps/chosen": -178.25, "logps/rejected": -931.0999755859375, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.227929592132568, "rewards/margins": 36.8515625, "rewards/rejected": -41.07109451293945, "step": 9170 }, { "epoch": 2.3146251536255633, "grad_norm": 0.05029057711362839, "learning_rate": 7.541496072418498e-08, "logits/chosen": 0.2128036469221115, "logits/rejected": NaN, "logps/chosen": -187.4734344482422, "logps/rejected": -951.0999755859375, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.574347019195557, "rewards/margins": 37.31328201293945, "rewards/rejected": -41.90156173706055, "step": 9180 }, { "epoch": 2.3171461885103835, "grad_norm": 4.361311221146025e-05, "learning_rate": 7.489072095075933e-08, "logits/chosen": 0.31392669677734375, "logits/rejected": NaN, "logps/chosen": -177.6828155517578, "logps/rejected": -913.7999877929688, "loss": 0.0089, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.713507175445557, "rewards/margins": 36.1015625, "rewards/rejected": -40.80937576293945, "step": 9190 }, { "epoch": 2.3196672233952036, "grad_norm": 0.01025487668812275, "learning_rate": 7.436798845015727e-08, "logits/chosen": 0.235107421875, "logits/rejected": NaN, "logps/chosen": -175.16250610351562, "logps/rejected": -901.8499755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.150805473327637, "rewards/margins": 35.08281326293945, "rewards/rejected": -39.25312423706055, "step": 9200 }, { "epoch": 2.3221882582800237, "grad_norm": 0.00035426628892309964, "learning_rate": 7.384676772185767e-08, "logits/chosen": 0.38451844453811646, "logits/rejected": 0.7309478521347046, "logps/chosen": -162.6374969482422, "logps/rejected": -885.9249877929688, "loss": 0.0431, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.70849609375, "rewards/margins": 34.9296875, "rewards/rejected": -38.64531326293945, "step": 9210 }, { "epoch": 2.3247092931648443, "grad_norm": 0.013341221958398819, "learning_rate": 7.332706325232649e-08, "logits/chosen": NaN, "logits/rejected": 0.8751739263534546, "logps/chosen": -179.8468780517578, "logps/rejected": -947.25, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.097607612609863, "rewards/margins": 37.006248474121094, "rewards/rejected": -41.10625076293945, "step": 9220 }, { "epoch": 2.3272303280496645, "grad_norm": 146.5841064453125, "learning_rate": 7.28088795149786e-08, "logits/chosen": 0.261697381734848, "logits/rejected": NaN, "logps/chosen": -165.1875, "logps/rejected": -897.9249877929688, "loss": 0.0799, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.070752143859863, "rewards/margins": 35.73749923706055, "rewards/rejected": -39.810935974121094, "step": 9230 }, { "epoch": 2.3297513629344846, "grad_norm": 0.02105412818491459, "learning_rate": 7.229222097013878e-08, "logits/chosen": 0.2992309629917145, "logits/rejected": NaN, "logps/chosen": -190.13436889648438, "logps/rejected": -924.8250122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.392675876617432, "rewards/margins": 35.673439025878906, "rewards/rejected": -40.064064025878906, "step": 9240 }, { "epoch": 2.3322723978193047, "grad_norm": 1.1712759733200073, "learning_rate": 7.177709206500346e-08, "logits/chosen": 0.2328536957502365, "logits/rejected": NaN, "logps/chosen": -163.3874969482422, "logps/rejected": -919.5750122070312, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.720019578933716, "rewards/margins": 34.974998474121094, "rewards/rejected": -38.68437576293945, "step": 9250 }, { "epoch": 2.334793432704125, "grad_norm": 0.0012338722590357065, "learning_rate": 7.126349723360284e-08, "logits/chosen": 0.2965347170829773, "logits/rejected": 0.9630386233329773, "logps/chosen": -184.6687469482422, "logps/rejected": -890.6500244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.935961723327637, "rewards/margins": 34.337501525878906, "rewards/rejected": -39.26874923706055, "step": 9260 }, { "epoch": 2.3373144675889455, "grad_norm": 0.02073715440928936, "learning_rate": 7.075144089676207e-08, "logits/chosen": 0.38186341524124146, "logits/rejected": NaN, "logps/chosen": -176.8468780517578, "logps/rejected": -941.75, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.317626953125, "rewards/margins": 37.09843826293945, "rewards/rejected": -41.44843673706055, "step": 9270 }, { "epoch": 2.3398355024737656, "grad_norm": 0.007958456873893738, "learning_rate": 7.024092746206383e-08, "logits/chosen": 0.12557068467140198, "logits/rejected": NaN, "logps/chosen": -201.21249389648438, "logps/rejected": -902.5499877929688, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.233935356140137, "rewards/margins": 35.252342224121094, "rewards/rejected": -39.49531173706055, "step": 9280 }, { "epoch": 2.3423565373585857, "grad_norm": 0.1649903804063797, "learning_rate": 6.973196132380979e-08, "logits/chosen": 0.30316162109375, "logits/rejected": NaN, "logps/chosen": -187.98202514648438, "logps/rejected": -944.4249877929688, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.206964015960693, "rewards/margins": 37.724998474121094, "rewards/rejected": -41.93281173706055, "step": 9290 }, { "epoch": 2.344877572243406, "grad_norm": 0.05101102963089943, "learning_rate": 6.922454686298346e-08, "logits/chosen": 0.24309387803077698, "logits/rejected": NaN, "logps/chosen": -165.703125, "logps/rejected": -937.2000122070312, "loss": 0.0444, "rewards/accuracies": 0.984375, "rewards/chosen": -3.778045654296875, "rewards/margins": 36.8828125, "rewards/rejected": -40.66875076293945, "step": 9300 }, { "epoch": 2.347398607128226, "grad_norm": 0.0024397526867687702, "learning_rate": 6.87186884472118e-08, "logits/chosen": 0.2616027891635895, "logits/rejected": 0.7850448489189148, "logps/chosen": -176.7937469482422, "logps/rejected": -911.4749755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.40618896484375, "rewards/margins": 36.046875, "rewards/rejected": -40.446876525878906, "step": 9310 }, { "epoch": 2.349919642013046, "grad_norm": 0.00015059942961670458, "learning_rate": 6.821439043072793e-08, "logits/chosen": 0.28759536147117615, "logits/rejected": NaN, "logps/chosen": -167.5906219482422, "logps/rejected": -925.4749755859375, "loss": 0.0029, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.5039641857147217, "rewards/margins": 37.51874923706055, "rewards/rejected": -41.015625, "step": 9320 }, { "epoch": 2.3524406768978667, "grad_norm": 0.16008149087429047, "learning_rate": 6.7711657154334e-08, "logits/chosen": 0.282766729593277, "logits/rejected": 1.0760986804962158, "logps/chosen": -197.65625, "logps/rejected": -924.7999877929688, "loss": 0.026, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -4.988659858703613, "rewards/margins": 36.713279724121094, "rewards/rejected": -41.70000076293945, "step": 9330 }, { "epoch": 2.354961711782687, "grad_norm": 0.030424844473600388, "learning_rate": 6.721049294536313e-08, "logits/chosen": 0.21492615342140198, "logits/rejected": NaN, "logps/chosen": -163.8718719482422, "logps/rejected": -962.8499755859375, "loss": 0.0041, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.4547486305236816, "rewards/margins": 39.4453125, "rewards/rejected": -42.90156173706055, "step": 9340 }, { "epoch": 2.357482746667507, "grad_norm": 4.935023784637451, "learning_rate": 6.671090211764266e-08, "logits/chosen": 0.33104246854782104, "logits/rejected": NaN, "logps/chosen": -182.6203155517578, "logps/rejected": -931.3499755859375, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.303235054016113, "rewards/margins": 37.9375, "rewards/rejected": -42.248435974121094, "step": 9350 }, { "epoch": 2.360003781552327, "grad_norm": 0.0413714200258255, "learning_rate": 6.62128889714568e-08, "logits/chosen": 0.44854736328125, "logits/rejected": NaN, "logps/chosen": -170.859375, "logps/rejected": -910.9500122070312, "loss": 0.0109, "rewards/accuracies": 0.984375, "rewards/chosen": -4.281152248382568, "rewards/margins": 37.06562423706055, "rewards/rejected": -41.3515625, "step": 9360 }, { "epoch": 2.3625248164371473, "grad_norm": 0.01733812317252159, "learning_rate": 6.571645779350984e-08, "logits/chosen": 0.4319869875907898, "logits/rejected": NaN, "logps/chosen": -167.7312469482422, "logps/rejected": -949.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.473486423492432, "rewards/margins": 39.556251525878906, "rewards/rejected": -44.015625, "step": 9370 }, { "epoch": 2.365045851321968, "grad_norm": 0.02040502056479454, "learning_rate": 6.522161285688899e-08, "logits/chosen": 0.44688111543655396, "logits/rejected": NaN, "logps/chosen": -175.00625610351562, "logps/rejected": -966.5250244140625, "loss": 0.06, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.495459079742432, "rewards/margins": 39.23749923706055, "rewards/rejected": -43.72968673706055, "step": 9380 }, { "epoch": 2.367566886206788, "grad_norm": 0.004684232175350189, "learning_rate": 6.472835842102758e-08, "logits/chosen": 0.27312010526657104, "logits/rejected": NaN, "logps/chosen": -180.86563110351562, "logps/rejected": -949.0499877929688, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.181872367858887, "rewards/margins": 39.209373474121094, "rewards/rejected": -43.404685974121094, "step": 9390 }, { "epoch": 2.370087921091608, "grad_norm": 7.352055399678648e-05, "learning_rate": 6.42366987316689e-08, "logits/chosen": 0.42982786893844604, "logits/rejected": 0.9242447018623352, "logps/chosen": -186.1906280517578, "logps/rejected": -974.25, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.263574123382568, "rewards/margins": 40.295310974121094, "rewards/rejected": -45.54218673706055, "step": 9400 }, { "epoch": 2.3726089559764283, "grad_norm": 0.0007447283715009689, "learning_rate": 6.374663802082886e-08, "logits/chosen": 0.3003524839878082, "logits/rejected": NaN, "logps/chosen": -189.4921875, "logps/rejected": -954.4000244140625, "loss": 0.0067, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.675341606140137, "rewards/margins": 38.15937423706055, "rewards/rejected": -42.826560974121094, "step": 9410 }, { "epoch": 2.3751299908612484, "grad_norm": 0.00033585389610379934, "learning_rate": 6.325818050676032e-08, "logits/chosen": 0.44846802949905396, "logits/rejected": NaN, "logps/chosen": -157.4093780517578, "logps/rejected": -904.6500244140625, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.096948146820068, "rewards/margins": 38.178123474121094, "rewards/rejected": -42.26250076293945, "step": 9420 }, { "epoch": 2.3776510257460686, "grad_norm": 0.01278859656304121, "learning_rate": 6.277133039391616e-08, "logits/chosen": 0.24196776747703552, "logits/rejected": 0.6980346441268921, "logps/chosen": -181.21875, "logps/rejected": -965.0999755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.935717821121216, "rewards/margins": 38.173439025878906, "rewards/rejected": -42.109375, "step": 9430 }, { "epoch": 2.380172060630889, "grad_norm": 28.424144744873047, "learning_rate": 6.228609187291365e-08, "logits/chosen": 0.29674071073532104, "logits/rejected": NaN, "logps/chosen": -176.0, "logps/rejected": -952.0999755859375, "loss": 0.0079, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9669432640075684, "rewards/margins": 36.625, "rewards/rejected": -40.58906173706055, "step": 9440 }, { "epoch": 2.3826930955157093, "grad_norm": 55.680458068847656, "learning_rate": 6.180246912049788e-08, "logits/chosen": 0.15998229384422302, "logits/rejected": 0.7671295404434204, "logps/chosen": -191.5749969482422, "logps/rejected": -932.25, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.596093654632568, "rewards/margins": 36.290626525878906, "rewards/rejected": -40.884376525878906, "step": 9450 }, { "epoch": 2.3852141304005294, "grad_norm": 163.96632385253906, "learning_rate": 6.132046629950605e-08, "logits/chosen": 0.20424652099609375, "logits/rejected": NaN, "logps/chosen": -181.3125, "logps/rejected": -915.1500244140625, "loss": 0.0723, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8846192359924316, "rewards/margins": 36.3984375, "rewards/rejected": -40.29218673706055, "step": 9460 }, { "epoch": 2.3877351652853496, "grad_norm": 0.0003758469538297504, "learning_rate": 6.084008755883183e-08, "logits/chosen": 0.25812989473342896, "logits/rejected": NaN, "logps/chosen": -193.10000610351562, "logps/rejected": -890.0999755859375, "loss": 0.015, "rewards/accuracies": 0.984375, "rewards/chosen": -4.642214298248291, "rewards/margins": 35.459373474121094, "rewards/rejected": -40.09687423706055, "step": 9470 }, { "epoch": 2.3902562001701697, "grad_norm": 0.12620648741722107, "learning_rate": 6.036133703338919e-08, "logits/chosen": 0.23021240532398224, "logits/rejected": NaN, "logps/chosen": -181.8054656982422, "logps/rejected": -904.2000122070312, "loss": 0.0392, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.199511528015137, "rewards/margins": 34.631248474121094, "rewards/rejected": -38.842185974121094, "step": 9480 }, { "epoch": 2.3927772350549903, "grad_norm": 0.0013787291245535016, "learning_rate": 5.988421884407715e-08, "logits/chosen": 0.34544676542282104, "logits/rejected": 0.7585815191268921, "logps/chosen": -175.19375610351562, "logps/rejected": -952.3250122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.7722411155700684, "rewards/margins": 38.203125, "rewards/rejected": -41.95624923706055, "step": 9490 }, { "epoch": 2.3952982699398104, "grad_norm": 1.8901970179285854e-05, "learning_rate": 5.9408737097744186e-08, "logits/chosen": 0.17846374213695526, "logits/rejected": NaN, "logps/chosen": -179.32656860351562, "logps/rejected": -953.2249755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.995501756668091, "rewards/margins": 36.732810974121094, "rewards/rejected": -40.720314025878906, "step": 9500 }, { "epoch": 2.3978193048246306, "grad_norm": 0.19533436000347137, "learning_rate": 5.893489588715303e-08, "logits/chosen": 0.336141973733902, "logits/rejected": 0.9755691289901733, "logps/chosen": -159.0281219482422, "logps/rejected": -926.9249877929688, "loss": 0.0222, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.932324171066284, "rewards/margins": 37.86249923706055, "rewards/rejected": -41.80156326293945, "step": 9510 }, { "epoch": 2.4003403397094507, "grad_norm": 1.43635892868042, "learning_rate": 5.846269929094516e-08, "logits/chosen": 0.29017335176467896, "logits/rejected": NaN, "logps/chosen": -174.74374389648438, "logps/rejected": -949.7000122070312, "loss": 0.0118, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.1309814453125, "rewards/margins": 37.287498474121094, "rewards/rejected": -41.44062423706055, "step": 9520 }, { "epoch": 2.402861374594271, "grad_norm": 7.495351019315422e-05, "learning_rate": 5.799215137360583e-08, "logits/chosen": 0.2871856689453125, "logits/rejected": NaN, "logps/chosen": -196.05313110351562, "logps/rejected": -941.5, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.469171047210693, "rewards/margins": 38.19062423706055, "rewards/rejected": -42.67499923706055, "step": 9530 }, { "epoch": 2.405382409479091, "grad_norm": 0.013077608309686184, "learning_rate": 5.7523256185429356e-08, "logits/chosen": 0.28207093477249146, "logits/rejected": NaN, "logps/chosen": -164.0500030517578, "logps/rejected": -923.0250244140625, "loss": 0.0168, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.249291896820068, "rewards/margins": 36.439064025878906, "rewards/rejected": -40.69062423706055, "step": 9540 }, { "epoch": 2.4079034443639116, "grad_norm": 0.0004856160085182637, "learning_rate": 5.7056017762483674e-08, "logits/chosen": 0.2739624083042145, "logits/rejected": 0.9291778802871704, "logps/chosen": -158.38436889648438, "logps/rejected": -945.4500122070312, "loss": 0.0149, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.319067478179932, "rewards/margins": 37.66093826293945, "rewards/rejected": -41.982810974121094, "step": 9550 }, { "epoch": 2.4104244792487317, "grad_norm": 4.135718822479248, "learning_rate": 5.6590440126576237e-08, "logits/chosen": 0.385934442281723, "logits/rejected": NaN, "logps/chosen": -176.8874969482422, "logps/rejected": -900.25, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.303857326507568, "rewards/margins": 35.748435974121094, "rewards/rejected": -40.046875, "step": 9560 }, { "epoch": 2.412945514133552, "grad_norm": 0.082797572016716, "learning_rate": 5.612652728521877e-08, "logits/chosen": 0.3556060791015625, "logits/rejected": NaN, "logps/chosen": -171.24063110351562, "logps/rejected": -923.4000244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.310986518859863, "rewards/margins": 37.5625, "rewards/rejected": -41.87187576293945, "step": 9570 }, { "epoch": 2.415466549018372, "grad_norm": 0.01004081778228283, "learning_rate": 5.566428323159345e-08, "logits/chosen": 0.23374709486961365, "logits/rejected": NaN, "logps/chosen": -183.29061889648438, "logps/rejected": -949.125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.731030464172363, "rewards/margins": 37.25312423706055, "rewards/rejected": -41.98749923706055, "step": 9580 }, { "epoch": 2.417987583903192, "grad_norm": 0.005286132451146841, "learning_rate": 5.520371194451787e-08, "logits/chosen": 0.39759141206741333, "logits/rejected": NaN, "logps/chosen": -174.3156280517578, "logps/rejected": -915.0750122070312, "loss": 0.0158, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.334179878234863, "rewards/margins": 37.498435974121094, "rewards/rejected": -41.84843826293945, "step": 9590 }, { "epoch": 2.4205086187880127, "grad_norm": 0.1591946929693222, "learning_rate": 5.4744817388411136e-08, "logits/chosen": 0.2820667326450348, "logits/rejected": NaN, "logps/chosen": -188.42813110351562, "logps/rejected": -904.2000122070312, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.274950981140137, "rewards/margins": 36.001564025878906, "rewards/rejected": -41.265625, "step": 9600 }, { "epoch": 2.423029653672833, "grad_norm": 0.06570759415626526, "learning_rate": 5.4287603513259924e-08, "logits/chosen": 0.3121833801269531, "logits/rejected": NaN, "logps/chosen": -177.78750610351562, "logps/rejected": -956.5, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.508056640625, "rewards/margins": 37.828125, "rewards/rejected": -42.337501525878906, "step": 9610 }, { "epoch": 2.425550688557653, "grad_norm": 2.634457588195801, "learning_rate": 5.3832074254584056e-08, "logits/chosen": 0.45370179414749146, "logits/rejected": 0.9185012578964233, "logps/chosen": -180.93124389648438, "logps/rejected": -939.2249755859375, "loss": 0.009, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.416699409484863, "rewards/margins": 38.52734375, "rewards/rejected": -43.939064025878906, "step": 9620 }, { "epoch": 2.428071723442473, "grad_norm": 0.0601741187274456, "learning_rate": 5.337823353340285e-08, "logits/chosen": 0.3696609437465668, "logits/rejected": 0.8266052007675171, "logps/chosen": -175.2781219482422, "logps/rejected": -985.7999877929688, "loss": 0.0116, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.796777248382568, "rewards/margins": 40.29375076293945, "rewards/rejected": -45.09687423706055, "step": 9630 }, { "epoch": 2.4305927583272933, "grad_norm": 45.90312957763672, "learning_rate": 5.292608525620138e-08, "logits/chosen": 0.371286004781723, "logits/rejected": NaN, "logps/chosen": -191.25, "logps/rejected": -941.375, "loss": 0.0271, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.282568454742432, "rewards/margins": 38.47187423706055, "rewards/rejected": -43.759376525878906, "step": 9640 }, { "epoch": 2.4331137932121134, "grad_norm": 0.25030517578125, "learning_rate": 5.247563331489688e-08, "logits/chosen": 0.3659515380859375, "logits/rejected": NaN, "logps/chosen": -172.5070343017578, "logps/rejected": -946.375, "loss": 0.0184, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.419470310211182, "rewards/margins": 36.962501525878906, "rewards/rejected": -41.37812423706055, "step": 9650 }, { "epoch": 2.4356348280969335, "grad_norm": 0.07569602876901627, "learning_rate": 5.20268815868051e-08, "logits/chosen": 0.3668777346611023, "logits/rejected": NaN, "logps/chosen": -164.1125030517578, "logps/rejected": -906.9000244140625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5660126209259033, "rewards/margins": 37.10859298706055, "rewards/rejected": -40.681251525878906, "step": 9660 }, { "epoch": 2.438155862981754, "grad_norm": 0.000265712063992396, "learning_rate": 5.157983393460696e-08, "logits/chosen": 0.3784378170967102, "logits/rejected": 1.0350021123886108, "logps/chosen": -191.53280639648438, "logps/rejected": -943.75, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.313940525054932, "rewards/margins": 37.053123474121094, "rewards/rejected": -42.35468673706055, "step": 9670 }, { "epoch": 2.4406768978665743, "grad_norm": 0.01203934196382761, "learning_rate": 5.113449420631558e-08, "logits/chosen": 0.3473251461982727, "logits/rejected": NaN, "logps/chosen": -180.15625, "logps/rejected": -942.7999877929688, "loss": 0.0231, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.765368700027466, "rewards/margins": 38.55781173706055, "rewards/rejected": -42.318748474121094, "step": 9680 }, { "epoch": 2.4431979327513944, "grad_norm": 0.0005524472799152136, "learning_rate": 5.06908662352426e-08, "logits/chosen": 0.2598815858364105, "logits/rejected": NaN, "logps/chosen": -178.82968139648438, "logps/rejected": -949.4500122070312, "loss": 0.015, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.580786228179932, "rewards/margins": 38.390625, "rewards/rejected": -42.9453125, "step": 9690 }, { "epoch": 2.4457189676362145, "grad_norm": 0.0328015573322773, "learning_rate": 5.0248953839965884e-08, "logits/chosen": 0.2330070436000824, "logits/rejected": NaN, "logps/chosen": -182.99063110351562, "logps/rejected": -943.1500244140625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.3687744140625, "rewards/margins": 38.6484375, "rewards/rejected": -43.03203201293945, "step": 9700 }, { "epoch": 2.448240002521035, "grad_norm": 0.0737701877951622, "learning_rate": 4.980876082429597e-08, "logits/chosen": 0.15483704209327698, "logits/rejected": 0.9031051397323608, "logps/chosen": -185.5281219482422, "logps/rejected": -936.5, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.389843940734863, "rewards/margins": 37.46171951293945, "rewards/rejected": -41.84843826293945, "step": 9710 }, { "epoch": 2.4507610374058553, "grad_norm": 0.043324220925569534, "learning_rate": 4.937029097724385e-08, "logits/chosen": 0.48939818143844604, "logits/rejected": NaN, "logps/chosen": -181.55313110351562, "logps/rejected": -889.8250122070312, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.660009860992432, "rewards/margins": 34.334373474121094, "rewards/rejected": -39.01093673706055, "step": 9720 }, { "epoch": 2.4532820722906754, "grad_norm": 0.021868793293833733, "learning_rate": 4.893354807298805e-08, "logits/chosen": 0.19950027763843536, "logits/rejected": NaN, "logps/chosen": -178.89688110351562, "logps/rejected": -960.5499877929688, "loss": 0.011, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.328515529632568, "rewards/margins": 39.52812576293945, "rewards/rejected": -43.87968826293945, "step": 9730 }, { "epoch": 2.4558031071754955, "grad_norm": 0.014247738756239414, "learning_rate": 4.849853587084218e-08, "logits/chosen": 0.09941864013671875, "logits/rejected": NaN, "logps/chosen": -176.5968780517578, "logps/rejected": -940.5499877929688, "loss": 0.0203, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.651824951171875, "rewards/margins": 38.07500076293945, "rewards/rejected": -41.72968673706055, "step": 9740 }, { "epoch": 2.4583241420603157, "grad_norm": 0.21384702622890472, "learning_rate": 4.806525811522288e-08, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -176.85311889648438, "logps/rejected": -959.9500122070312, "loss": 0.0236, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.900439262390137, "rewards/margins": 38.032814025878906, "rewards/rejected": -42.93281173706055, "step": 9750 }, { "epoch": 2.460845176945136, "grad_norm": 0.184492290019989, "learning_rate": 4.763371853561709e-08, "logits/chosen": 0.15521851181983948, "logits/rejected": NaN, "logps/chosen": -194.14999389648438, "logps/rejected": -895.75, "loss": 0.0042, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.1741943359375, "rewards/margins": 35.89374923706055, "rewards/rejected": -40.0546875, "step": 9760 }, { "epoch": 2.463366211829956, "grad_norm": 105.94002532958984, "learning_rate": 4.720392084655031e-08, "logits/chosen": 0.32419127225875854, "logits/rejected": NaN, "logps/chosen": -163.1453094482422, "logps/rejected": -923.7999877929688, "loss": 0.0226, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.223059177398682, "rewards/margins": 38.32343673706055, "rewards/rejected": -42.55156326293945, "step": 9770 }, { "epoch": 2.4658872467147765, "grad_norm": 79.15545654296875, "learning_rate": 4.677586874755443e-08, "logits/chosen": 0.29498594999313354, "logits/rejected": 0.9323959350585938, "logps/chosen": -177.078125, "logps/rejected": -940.0499877929688, "loss": 0.0092, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.805743217468262, "rewards/margins": 38.390625, "rewards/rejected": -43.204689025878906, "step": 9780 }, { "epoch": 2.4684082815995967, "grad_norm": 0.12477759271860123, "learning_rate": 4.634956592313624e-08, "logits/chosen": 0.2040863037109375, "logits/rejected": 0.75653076171875, "logps/chosen": -187.265625, "logps/rejected": -918.4500122070312, "loss": 0.0104, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.693212985992432, "rewards/margins": 36.732810974121094, "rewards/rejected": -41.431251525878906, "step": 9790 }, { "epoch": 2.470929316484417, "grad_norm": 5.2001828407810535e-06, "learning_rate": 4.592501604274512e-08, "logits/chosen": 0.4724174439907074, "logits/rejected": 1.0791015625, "logps/chosen": -165.1125030517578, "logps/rejected": -909.4249877929688, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.634051322937012, "rewards/margins": 37.6953125, "rewards/rejected": -42.306251525878906, "step": 9800 }, { "epoch": 2.473450351369237, "grad_norm": 0.09914711862802505, "learning_rate": 4.550222276074198e-08, "logits/chosen": 0.12412261962890625, "logits/rejected": NaN, "logps/chosen": -180.0500030517578, "logps/rejected": -949.6500244140625, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.400403022766113, "rewards/margins": 38.153907775878906, "rewards/rejected": -42.55781173706055, "step": 9810 }, { "epoch": 2.475971386254057, "grad_norm": 0.008237622678279877, "learning_rate": 4.5081189716367596e-08, "logits/chosen": 0.30174559354782104, "logits/rejected": NaN, "logps/chosen": -164.3468780517578, "logps/rejected": -932.5499877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.7926268577575684, "rewards/margins": 38.09843826293945, "rewards/rejected": -41.88750076293945, "step": 9820 }, { "epoch": 2.4784924211388777, "grad_norm": 183.7530517578125, "learning_rate": 4.466192053371123e-08, "logits/chosen": 0.2579101622104645, "logits/rejected": NaN, "logps/chosen": -180.1687469482422, "logps/rejected": -909.7000122070312, "loss": 0.1284, "rewards/accuracies": 0.984375, "rewards/chosen": -4.300195217132568, "rewards/margins": 36.12578201293945, "rewards/rejected": -40.42499923706055, "step": 9830 }, { "epoch": 2.481013456023698, "grad_norm": 0.0035131394397467375, "learning_rate": 4.424441882167962e-08, "logits/chosen": 0.24027404189109802, "logits/rejected": NaN, "logps/chosen": -172.1218719482422, "logps/rejected": -970.2750244140625, "loss": 0.0383, "rewards/accuracies": 0.984375, "rewards/chosen": -4.022875785827637, "rewards/margins": 39.451560974121094, "rewards/rejected": -43.45000076293945, "step": 9840 }, { "epoch": 2.483534490908518, "grad_norm": 0.4334668219089508, "learning_rate": 4.382868817396562e-08, "logits/chosen": 0.2967468202114105, "logits/rejected": NaN, "logps/chosen": -176.39999389648438, "logps/rejected": -945.2000122070312, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.418920993804932, "rewards/margins": 38.04375076293945, "rewards/rejected": -42.470314025878906, "step": 9850 }, { "epoch": 2.486055525793338, "grad_norm": 0.0009708735742606223, "learning_rate": 4.3414732169017724e-08, "logits/chosen": 0.33478546142578125, "logits/rejected": 0.8664795160293579, "logps/chosen": -179.2843780517578, "logps/rejected": -936.7999877929688, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.241869926452637, "rewards/margins": 37.740623474121094, "rewards/rejected": -41.970314025878906, "step": 9860 }, { "epoch": 2.4885765606781582, "grad_norm": 0.0036616893485188484, "learning_rate": 4.300255437000869e-08, "logits/chosen": 0.2796874940395355, "logits/rejected": NaN, "logps/chosen": -181.0968780517578, "logps/rejected": -982.8499755859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.112963676452637, "rewards/margins": 39.38593673706055, "rewards/rejected": -43.484375, "step": 9870 }, { "epoch": 2.4910975955629784, "grad_norm": 120.64323425292969, "learning_rate": 4.259215832480531e-08, "logits/chosen": 0.2659362852573395, "logits/rejected": NaN, "logps/chosen": -181.5500030517578, "logps/rejected": -963.9500122070312, "loss": 0.0385, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.136328220367432, "rewards/margins": 39.96406173706055, "rewards/rejected": -44.109375, "step": 9880 }, { "epoch": 2.493618630447799, "grad_norm": 0.0005003733094781637, "learning_rate": 4.218354756593781e-08, "logits/chosen": 0.5251709222793579, "logits/rejected": 0.8514205813407898, "logps/chosen": -175.6999969482422, "logps/rejected": -952.7249755859375, "loss": 0.037, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.080517768859863, "rewards/margins": 37.790626525878906, "rewards/rejected": -42.868751525878906, "step": 9890 }, { "epoch": 2.496139665332619, "grad_norm": 9.535076969768852e-06, "learning_rate": 4.177672561056922e-08, "logits/chosen": 0.4294982850551605, "logits/rejected": NaN, "logps/chosen": -177.0187530517578, "logps/rejected": -934.25, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.147924900054932, "rewards/margins": 38.564064025878906, "rewards/rejected": -42.712501525878906, "step": 9900 }, { "epoch": 2.4986607002174392, "grad_norm": 2.194819927215576, "learning_rate": 4.1371695960465304e-08, "logits/chosen": 0.29500502347946167, "logits/rejected": NaN, "logps/chosen": -195.3718719482422, "logps/rejected": -968.9249877929688, "loss": 0.0082, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.935956001281738, "rewards/margins": 38.860939025878906, "rewards/rejected": -43.790626525878906, "step": 9910 }, { "epoch": 2.5011817351022594, "grad_norm": 0.0022303611040115356, "learning_rate": 4.096846210196428e-08, "logits/chosen": 0.3044174313545227, "logits/rejected": NaN, "logps/chosen": -180.05624389648438, "logps/rejected": -941.3499755859375, "loss": 0.0123, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.940258979797363, "rewards/margins": 38.1875, "rewards/rejected": -43.1171875, "step": 9920 }, { "epoch": 2.50370276998708, "grad_norm": 0.6928690671920776, "learning_rate": 4.0567027505947093e-08, "logits/chosen": 0.2946929931640625, "logits/rejected": NaN, "logps/chosen": -189.77499389648438, "logps/rejected": -927.2999877929688, "loss": 0.0744, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.949975490570068, "rewards/margins": 36.048439025878906, "rewards/rejected": -40.998435974121094, "step": 9930 }, { "epoch": 2.5062238048719, "grad_norm": 0.012021396309137344, "learning_rate": 4.016739562780713e-08, "logits/chosen": 0.357382208108902, "logits/rejected": NaN, "logps/chosen": -208.3828125, "logps/rejected": -961.5499877929688, "loss": 0.0106, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.161145210266113, "rewards/margins": 38.798439025878906, "rewards/rejected": -43.9609375, "step": 9940 }, { "epoch": 2.5087448397567202, "grad_norm": 0.010567042976617813, "learning_rate": 3.976956990742072e-08, "logits/chosen": 0.281961053609848, "logits/rejected": NaN, "logps/chosen": -179.1218719482422, "logps/rejected": -941.875, "loss": 0.0036, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.430938720703125, "rewards/margins": 38.029685974121094, "rewards/rejected": -42.44843673706055, "step": 9950 }, { "epoch": 2.5112658746415404, "grad_norm": 4.088534114998765e-05, "learning_rate": 3.937355376911758e-08, "logits/chosen": 0.5276581048965454, "logits/rejected": 0.8477783203125, "logps/chosen": -174.2937469482422, "logps/rejected": -983.625, "loss": 0.013, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.692736625671387, "rewards/margins": 38.900001525878906, "rewards/rejected": -43.599998474121094, "step": 9960 }, { "epoch": 2.5137869095263605, "grad_norm": 0.00529529107734561, "learning_rate": 3.897935062165111e-08, "logits/chosen": 0.23216858506202698, "logits/rejected": NaN, "logps/chosen": -197.015625, "logps/rejected": -946.5, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.194531440734863, "rewards/margins": 37.10625076293945, "rewards/rejected": -42.296875, "step": 9970 }, { "epoch": 2.5163079444111807, "grad_norm": 0.00072746432852, "learning_rate": 3.858696385816937e-08, "logits/chosen": 0.33177489042282104, "logits/rejected": NaN, "logps/chosen": -177.0749969482422, "logps/rejected": -959.2750244140625, "loss": 0.0287, "rewards/accuracies": 0.984375, "rewards/chosen": -4.558886528015137, "rewards/margins": 39.603126525878906, "rewards/rejected": -44.162498474121094, "step": 9980 }, { "epoch": 2.518828979296001, "grad_norm": 0.00798420887440443, "learning_rate": 3.819639685618545e-08, "logits/chosen": 0.178141787648201, "logits/rejected": NaN, "logps/chosen": -171.52969360351562, "logps/rejected": -914.1500244140625, "loss": 0.0587, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.409228324890137, "rewards/margins": 35.56718826293945, "rewards/rejected": -39.985939025878906, "step": 9990 }, { "epoch": 2.5213500141808214, "grad_norm": 0.005533737130463123, "learning_rate": 3.780765297754887e-08, "logits/chosen": 0.40092164278030396, "logits/rejected": 1.015045166015625, "logps/chosen": -199.7531280517578, "logps/rejected": -979.3499755859375, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.098340034484863, "rewards/margins": 38.079689025878906, "rewards/rejected": -43.1796875, "step": 10000 }, { "epoch": 2.5238710490656415, "grad_norm": 0.0012829650659114122, "learning_rate": 3.7420735568416295e-08, "logits/chosen": 0.2853759825229645, "logits/rejected": NaN, "logps/chosen": -189.8781280517578, "logps/rejected": -901.4000244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.256335258483887, "rewards/margins": 36.7265625, "rewards/rejected": -40.99687576293945, "step": 10010 }, { "epoch": 2.5263920839504617, "grad_norm": 9.308197149948683e-06, "learning_rate": 3.703564795922276e-08, "logits/chosen": 0.3661506772041321, "logits/rejected": NaN, "logps/chosen": -186.1750030517578, "logps/rejected": -913.0499877929688, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -4.644055366516113, "rewards/margins": 35.25468826293945, "rewards/rejected": -39.900001525878906, "step": 10020 }, { "epoch": 2.528913118835282, "grad_norm": 6.542544364929199, "learning_rate": 3.665239346465335e-08, "logits/chosen": 0.21446838974952698, "logits/rejected": 0.7662353515625, "logps/chosen": -192.8093719482422, "logps/rejected": -965.4500122070312, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.634472846984863, "rewards/margins": 38.19062423706055, "rewards/rejected": -42.821876525878906, "step": 10030 }, { "epoch": 2.531434153720102, "grad_norm": 1.2078587133146357e-05, "learning_rate": 3.627097538361415e-08, "logits/chosen": 0.45177918672561646, "logits/rejected": 1.005651831626892, "logps/chosen": -184.6281280517578, "logps/rejected": -913.7000122070312, "loss": 0.0177, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.717871189117432, "rewards/margins": 36.946876525878906, "rewards/rejected": -41.6640625, "step": 10040 }, { "epoch": 2.5339551886049225, "grad_norm": 0.10679259151220322, "learning_rate": 3.589139699920424e-08, "logits/chosen": 0.270669549703598, "logits/rejected": NaN, "logps/chosen": -196.0437469482422, "logps/rejected": -938.25, "loss": 0.003, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.968554496765137, "rewards/margins": 37.8984375, "rewards/rejected": -42.876564025878906, "step": 10050 }, { "epoch": 2.5364762234897427, "grad_norm": 72.3330307006836, "learning_rate": 3.5513661578687236e-08, "logits/chosen": 0.31103819608688354, "logits/rejected": 0.7081100344657898, "logps/chosen": -184.46875, "logps/rejected": -967.0999755859375, "loss": 0.0137, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -5.064611911773682, "rewards/margins": 38.708595275878906, "rewards/rejected": -43.76093673706055, "step": 10060 }, { "epoch": 2.538997258374563, "grad_norm": 0.003053137566894293, "learning_rate": 3.513777237346335e-08, "logits/chosen": 0.2874511778354645, "logits/rejected": NaN, "logps/chosen": -187.25, "logps/rejected": -940.75, "loss": 0.0116, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.561669826507568, "rewards/margins": 37.032814025878906, "rewards/rejected": -41.60468673706055, "step": 10070 }, { "epoch": 2.541518293259383, "grad_norm": 0.0008214399567805231, "learning_rate": 3.476373261904117e-08, "logits/chosen": 0.08816833794116974, "logits/rejected": NaN, "logps/chosen": -214.39999389648438, "logps/rejected": -961.2000122070312, "loss": 0.0182, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.942162990570068, "rewards/margins": 36.78125, "rewards/rejected": -41.728126525878906, "step": 10080 }, { "epoch": 2.544039328144203, "grad_norm": 0.006694540847092867, "learning_rate": 3.439154553500992e-08, "logits/chosen": 0.2040359526872635, "logits/rejected": NaN, "logps/chosen": -197.1906280517578, "logps/rejected": -916.2999877929688, "loss": 0.0117, "rewards/accuracies": 0.984375, "rewards/chosen": -4.466454982757568, "rewards/margins": 36.5859375, "rewards/rejected": -41.0625, "step": 10090 }, { "epoch": 2.546560363029023, "grad_norm": 0.057096052914857864, "learning_rate": 3.402121432501193e-08, "logits/chosen": 0.24578246474266052, "logits/rejected": 0.795819103717804, "logps/chosen": -177.50625610351562, "logps/rejected": -930.0999755859375, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.2308349609375, "rewards/margins": 36.134376525878906, "rewards/rejected": -40.3671875, "step": 10100 }, { "epoch": 2.5490813979138434, "grad_norm": 3.050635814666748, "learning_rate": 3.3652742176714625e-08, "logits/chosen": 0.15234223008155823, "logits/rejected": NaN, "logps/chosen": -182.11874389648438, "logps/rejected": -972.2999877929688, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.939282178878784, "rewards/margins": 37.359375, "rewards/rejected": -41.31562423706055, "step": 10110 }, { "epoch": 2.551602432798664, "grad_norm": 0.004248359706252813, "learning_rate": 3.328613226178359e-08, "logits/chosen": 0.21459349989891052, "logits/rejected": NaN, "logps/chosen": -174.328125, "logps/rejected": -931.5250244140625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9927000999450684, "rewards/margins": 35.8359375, "rewards/rejected": -39.837501525878906, "step": 10120 }, { "epoch": 2.554123467683484, "grad_norm": 1.7066938877105713, "learning_rate": 3.2921387735854845e-08, "logits/chosen": 0.2506423890590668, "logits/rejected": NaN, "logps/chosen": -180.1515655517578, "logps/rejected": -930.4000244140625, "loss": 0.0164, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.823901414871216, "rewards/margins": 38.166404724121094, "rewards/rejected": -41.95781326293945, "step": 10130 }, { "epoch": 2.5566445025683042, "grad_norm": 9.082412725547329e-05, "learning_rate": 3.255851173850804e-08, "logits/chosen": 0.1978500336408615, "logits/rejected": 0.8368171453475952, "logps/chosen": -165.1593780517578, "logps/rejected": -929.7999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.349285840988159, "rewards/margins": 37.6328125, "rewards/rejected": -40.967185974121094, "step": 10140 }, { "epoch": 2.5591655374531244, "grad_norm": 0.007557215169072151, "learning_rate": 3.219750739323906e-08, "logits/chosen": 0.11488647758960724, "logits/rejected": 0.7171920537948608, "logps/chosen": -173.54061889648438, "logps/rejected": -927.2000122070312, "loss": 0.0339, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.7828369140625, "rewards/margins": 36.795310974121094, "rewards/rejected": -40.58124923706055, "step": 10150 }, { "epoch": 2.561686572337945, "grad_norm": 0.01292756199836731, "learning_rate": 3.183837780743345e-08, "logits/chosen": 0.3789993226528168, "logits/rejected": NaN, "logps/chosen": -179.5500030517578, "logps/rejected": -885.0499877929688, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8778929710388184, "rewards/margins": 35.595314025878906, "rewards/rejected": -39.462501525878906, "step": 10160 }, { "epoch": 2.564207607222765, "grad_norm": 5.922001219005324e-05, "learning_rate": 3.1481126072339575e-08, "logits/chosen": 0.26081085205078125, "logits/rejected": NaN, "logps/chosen": -183.546875, "logps/rejected": -961.7750244140625, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.022732734680176, "rewards/margins": 38.451560974121094, "rewards/rejected": -42.47968673706055, "step": 10170 }, { "epoch": 2.5667286421075852, "grad_norm": 0.00035689485957846045, "learning_rate": 3.1125755263041894e-08, "logits/chosen": 0.26795655488967896, "logits/rejected": NaN, "logps/chosen": -186.44375610351562, "logps/rejected": -923.2000122070312, "loss": 0.0078, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.098828315734863, "rewards/margins": 36.7421875, "rewards/rejected": -40.84687423706055, "step": 10180 }, { "epoch": 2.5692496769924054, "grad_norm": 0.14266346395015717, "learning_rate": 3.077226843843467e-08, "logits/chosen": 0.277841180562973, "logits/rejected": 0.820880115032196, "logps/chosen": -178.88436889648438, "logps/rejected": -915.5499877929688, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.478137016296387, "rewards/margins": 35.076560974121094, "rewards/rejected": -39.560935974121094, "step": 10190 }, { "epoch": 2.5717707118772255, "grad_norm": 0.0007406485383398831, "learning_rate": 3.042066864119544e-08, "logits/chosen": 0.3263793885707855, "logits/rejected": NaN, "logps/chosen": -174.36874389648438, "logps/rejected": -937.0499877929688, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.9058594703674316, "rewards/margins": 37.65625, "rewards/rejected": -41.560935974121094, "step": 10200 }, { "epoch": 2.5742917467620456, "grad_norm": 173.31092834472656, "learning_rate": 3.0070958897759155e-08, "logits/chosen": 0.3054443299770355, "logits/rejected": NaN, "logps/chosen": -176.7062530517578, "logps/rejected": -940.9000244140625, "loss": 0.0158, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.153491020202637, "rewards/margins": 38.118751525878906, "rewards/rejected": -42.28125, "step": 10210 }, { "epoch": 2.576812781646866, "grad_norm": 3.197899422957562e-05, "learning_rate": 2.9723142218291726e-08, "logits/chosen": 0.2561187744140625, "logits/rejected": NaN, "logps/chosen": -178.81875610351562, "logps/rejected": -924.4500122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7543091773986816, "rewards/margins": 35.91718673706055, "rewards/rejected": -39.6640625, "step": 10220 }, { "epoch": 2.5793338165316864, "grad_norm": 0.11181757599115372, "learning_rate": 2.9377221596664252e-08, "logits/chosen": 0.31110841035842896, "logits/rejected": 0.9173629879951477, "logps/chosen": -174.3249969482422, "logps/rejected": -922.0499877929688, "loss": 0.0104, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.273348808288574, "rewards/margins": 36.415626525878906, "rewards/rejected": -40.662498474121094, "step": 10230 }, { "epoch": 2.5818548514165065, "grad_norm": 0.43740227818489075, "learning_rate": 2.903320001042761e-08, "logits/chosen": 0.27217406034469604, "logits/rejected": NaN, "logps/chosen": -163.5656280517578, "logps/rejected": -935.0999755859375, "loss": 0.0576, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.821768283843994, "rewards/margins": 37.70000076293945, "rewards/rejected": -41.50390625, "step": 10240 }, { "epoch": 2.5843758863013266, "grad_norm": 0.00010748999193310738, "learning_rate": 2.8691080420786133e-08, "logits/chosen": 0.30871278047561646, "logits/rejected": NaN, "logps/chosen": -171.0281219482422, "logps/rejected": -905.4000244140625, "loss": 0.0522, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.935412645339966, "rewards/margins": 36.48749923706055, "rewards/rejected": -40.4140625, "step": 10250 }, { "epoch": 2.586896921186147, "grad_norm": 7.566466592834331e-06, "learning_rate": 2.8350865772572812e-08, "logits/chosen": 0.2697692811489105, "logits/rejected": NaN, "logps/chosen": -174.8562469482422, "logps/rejected": -922.0499877929688, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.005279541015625, "rewards/margins": 36.859375, "rewards/rejected": -40.87187576293945, "step": 10260 }, { "epoch": 2.5894179560709674, "grad_norm": 0.0008949653129093349, "learning_rate": 2.8012558994223416e-08, "logits/chosen": 0.29017943143844604, "logits/rejected": 0.7354675531387329, "logps/chosen": -166.2156219482422, "logps/rejected": -887.3499755859375, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.586560010910034, "rewards/margins": 34.243751525878906, "rewards/rejected": -37.82500076293945, "step": 10270 }, { "epoch": 2.5919389909557875, "grad_norm": 6.198712071636692e-05, "learning_rate": 2.7676162997751718e-08, "logits/chosen": 0.3956771790981293, "logits/rejected": NaN, "logps/chosen": -190.01406860351562, "logps/rejected": -919.5250244140625, "loss": 0.032, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.236914157867432, "rewards/margins": 34.6171875, "rewards/rejected": -38.857810974121094, "step": 10280 }, { "epoch": 2.5944600258406076, "grad_norm": 0.0006612225552089512, "learning_rate": 2.7341680678724028e-08, "logits/chosen": 0.09900512546300888, "logits/rejected": 0.8384186029434204, "logps/chosen": -159.5593719482422, "logps/rejected": -922.7000122070312, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.4183716773986816, "rewards/margins": 37.30937576293945, "rewards/rejected": -40.71562576293945, "step": 10290 }, { "epoch": 2.596981060725428, "grad_norm": 0.015336347743868828, "learning_rate": 2.7009114916234534e-08, "logits/chosen": 0.330740362405777, "logits/rejected": 0.8606246709823608, "logps/chosen": -196.9328155517578, "logps/rejected": -933.5750122070312, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": -4.353448390960693, "rewards/margins": 35.548439025878906, "rewards/rejected": -39.912498474121094, "step": 10300 }, { "epoch": 2.599502095610248, "grad_norm": 25.633609771728516, "learning_rate": 2.667846857288053e-08, "logits/chosen": 0.2768005430698395, "logits/rejected": NaN, "logps/chosen": -172.9406280517578, "logps/rejected": -894.7999877929688, "loss": 0.0031, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.929443359375, "rewards/margins": 35.52812576293945, "rewards/rejected": -39.47187423706055, "step": 10310 }, { "epoch": 2.602023130495068, "grad_norm": 0.003824861254543066, "learning_rate": 2.634974449473759e-08, "logits/chosen": 0.065216064453125, "logits/rejected": 0.6028808355331421, "logps/chosen": -179.8125, "logps/rejected": -962.4500122070312, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.01434326171875, "rewards/margins": 38.80156326293945, "rewards/rejected": -42.82343673706055, "step": 10320 }, { "epoch": 2.604544165379888, "grad_norm": 0.0009421991417184472, "learning_rate": 2.602294551133519e-08, "logits/chosen": 0.412118524312973, "logits/rejected": 0.8791229128837585, "logps/chosen": -173.6687469482422, "logps/rejected": -956.25, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.216210842132568, "rewards/margins": 38.673439025878906, "rewards/rejected": -42.8984375, "step": 10330 }, { "epoch": 2.607065200264709, "grad_norm": 0.002771044382825494, "learning_rate": 2.569807443563232e-08, "logits/chosen": 0.22050170600414276, "logits/rejected": NaN, "logps/chosen": -173.359375, "logps/rejected": -900.0999755859375, "loss": 0.0088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8273072242736816, "rewards/margins": 35.37421798706055, "rewards/rejected": -39.204689025878906, "step": 10340 }, { "epoch": 2.609586235149529, "grad_norm": 0.0005943687283433974, "learning_rate": 2.5375134063993416e-08, "logits/chosen": 0.2651214599609375, "logits/rejected": 0.8464401364326477, "logps/chosen": -177.8984375, "logps/rejected": -911.0750122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.065673828125, "rewards/margins": 35.396873474121094, "rewards/rejected": -39.46406173706055, "step": 10350 }, { "epoch": 2.612107270034349, "grad_norm": 0.024271707981824875, "learning_rate": 2.5054127176164008e-08, "logits/chosen": 0.399392694234848, "logits/rejected": 0.7767699956893921, "logps/chosen": -165.734375, "logps/rejected": -908.2000122070312, "loss": 0.0323, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.838818311691284, "rewards/margins": 36.365623474121094, "rewards/rejected": -40.1953125, "step": 10360 }, { "epoch": 2.614628304919169, "grad_norm": 8.782892227172852, "learning_rate": 2.473505653524699e-08, "logits/chosen": 0.24695511162281036, "logits/rejected": 0.8702995181083679, "logps/chosen": -200.2062530517578, "logps/rejected": -942.9000244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.626928806304932, "rewards/margins": 35.6796875, "rewards/rejected": -40.296875, "step": 10370 }, { "epoch": 2.61714933980399, "grad_norm": 0.002604049863293767, "learning_rate": 2.4417924887678825e-08, "logits/chosen": 0.09686279296875, "logits/rejected": NaN, "logps/chosen": -174.984375, "logps/rejected": -899.5999755859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.456713914871216, "rewards/margins": 35.810935974121094, "rewards/rejected": -39.271873474121094, "step": 10380 }, { "epoch": 2.61967037468881, "grad_norm": 0.00016281355055980384, "learning_rate": 2.4102734963205834e-08, "logits/chosen": 0.14175109565258026, "logits/rejected": NaN, "logps/chosen": -170.24063110351562, "logps/rejected": -926.0, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8734374046325684, "rewards/margins": 37.02812576293945, "rewards/rejected": -40.896873474121094, "step": 10390 }, { "epoch": 2.62219140957363, "grad_norm": 9.959085582522675e-05, "learning_rate": 2.378948947486084e-08, "logits/chosen": 0.3086914122104645, "logits/rejected": 0.9489593505859375, "logps/chosen": -176.29843139648438, "logps/rejected": -914.1749877929688, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.931985378265381, "rewards/margins": 35.765625, "rewards/rejected": -39.7109375, "step": 10400 }, { "epoch": 2.62471244445845, "grad_norm": 0.009016551077365875, "learning_rate": 2.3478191118939504e-08, "logits/chosen": 0.46402281522750854, "logits/rejected": 0.859051525592804, "logps/chosen": -192.0, "logps/rejected": -924.9500122070312, "loss": 0.0391, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.493371486663818, "rewards/margins": 35.39374923706055, "rewards/rejected": -39.90312576293945, "step": 10410 }, { "epoch": 2.6272334793432703, "grad_norm": 79.52387237548828, "learning_rate": 2.3168842574977558e-08, "logits/chosen": 0.20639649033546448, "logits/rejected": NaN, "logps/chosen": -177.2687530517578, "logps/rejected": -883.7999877929688, "loss": 0.0329, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.235302925109863, "rewards/margins": 35.842185974121094, "rewards/rejected": -40.07343673706055, "step": 10420 }, { "epoch": 2.6297545142280905, "grad_norm": 0.0006741678225807846, "learning_rate": 2.2861446505727412e-08, "logits/chosen": 0.302947998046875, "logits/rejected": 0.7790771722793579, "logps/chosen": -175.8000030517578, "logps/rejected": -909.2999877929688, "loss": 0.0114, "rewards/accuracies": 0.984375, "rewards/chosen": -3.758056640625, "rewards/margins": 36.58515548706055, "rewards/rejected": -40.33124923706055, "step": 10430 }, { "epoch": 2.6322755491129106, "grad_norm": 0.008349223993718624, "learning_rate": 2.255600555713519e-08, "logits/chosen": 0.21286925673484802, "logits/rejected": NaN, "logps/chosen": -180.8171844482422, "logps/rejected": -911.375, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.913464307785034, "rewards/margins": 36.23749923706055, "rewards/rejected": -40.150001525878906, "step": 10440 }, { "epoch": 2.634796583997731, "grad_norm": 0.002291887067258358, "learning_rate": 2.225252235831837e-08, "logits/chosen": 0.22842101752758026, "logits/rejected": NaN, "logps/chosen": -174.49063110351562, "logps/rejected": -918.2000122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.838000535964966, "rewards/margins": 36.69218826293945, "rewards/rejected": -40.51093673706055, "step": 10450 }, { "epoch": 2.6373176188825513, "grad_norm": 0.00019312271615490317, "learning_rate": 2.1950999521542695e-08, "logits/chosen": 0.23821410536766052, "logits/rejected": 0.9559692144393921, "logps/chosen": -162.015625, "logps/rejected": -942.2999877929688, "loss": 0.0069, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.518902540206909, "rewards/margins": 39.03437423706055, "rewards/rejected": -42.546875, "step": 10460 }, { "epoch": 2.6398386537673715, "grad_norm": 0.6867592930793762, "learning_rate": 2.165143964219987e-08, "logits/chosen": 0.23706817626953125, "logits/rejected": NaN, "logps/chosen": -184.95468139648438, "logps/rejected": -929.9000244140625, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.826464653015137, "rewards/margins": 37.01093673706055, "rewards/rejected": -41.818748474121094, "step": 10470 }, { "epoch": 2.6423596886521916, "grad_norm": 0.005006849300116301, "learning_rate": 2.1353845298785255e-08, "logits/chosen": 0.20872803032398224, "logits/rejected": NaN, "logps/chosen": -175.14999389648438, "logps/rejected": -923.0999755859375, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.6313538551330566, "rewards/margins": 36.3125, "rewards/rejected": -39.93437576293945, "step": 10480 }, { "epoch": 2.644880723537012, "grad_norm": 0.035266440361738205, "learning_rate": 2.1058219052875747e-08, "logits/chosen": 0.23966674506664276, "logits/rejected": NaN, "logps/chosen": -173.02499389648438, "logps/rejected": -920.4249877929688, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.8713135719299316, "rewards/margins": 36.587501525878906, "rewards/rejected": -40.454689025878906, "step": 10490 }, { "epoch": 2.6474017584218323, "grad_norm": 0.623710036277771, "learning_rate": 2.0764563449107457e-08, "logits/chosen": 0.3844451904296875, "logits/rejected": NaN, "logps/chosen": -180.7453155517578, "logps/rejected": -888.4749755859375, "loss": 0.0273, "rewards/accuracies": 0.984375, "rewards/chosen": -3.958984375, "rewards/margins": 35.661720275878906, "rewards/rejected": -39.607032775878906, "step": 10500 }, { "epoch": 2.6499227933066525, "grad_norm": 0.002897808328270912, "learning_rate": 2.0472881015153988e-08, "logits/chosen": 0.19672545790672302, "logits/rejected": 0.9147583246231079, "logps/chosen": -158.4875030517578, "logps/rejected": -937.5499877929688, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.7312989234924316, "rewards/margins": 37.857810974121094, "rewards/rejected": -41.5859375, "step": 10510 }, { "epoch": 2.6524438281914726, "grad_norm": 3.5973856449127197, "learning_rate": 2.0183174261704794e-08, "logits/chosen": 0.2649169862270355, "logits/rejected": NaN, "logps/chosen": -163.1906280517578, "logps/rejected": -910.0499877929688, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -3.95703125, "rewards/margins": 37.946876525878906, "rewards/rejected": -41.904685974121094, "step": 10520 }, { "epoch": 2.6549648630762928, "grad_norm": 0.5990272760391235, "learning_rate": 1.989544568244328e-08, "logits/chosen": 0.2666580080986023, "logits/rejected": NaN, "logps/chosen": -180.4031219482422, "logps/rejected": -919.7999877929688, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.120153903961182, "rewards/margins": 36.18281173706055, "rewards/rejected": -40.30937576293945, "step": 10530 }, { "epoch": 2.657485897961113, "grad_norm": 2.8077635765075684, "learning_rate": 1.9609697754025683e-08, "logits/chosen": 0.2585510313510895, "logits/rejected": NaN, "logps/chosen": -190.6062469482422, "logps/rejected": -953.7249755859375, "loss": 0.0486, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.387182712554932, "rewards/margins": 38.04375076293945, "rewards/rejected": -42.428123474121094, "step": 10540 }, { "epoch": 2.660006932845933, "grad_norm": 0.01236701663583517, "learning_rate": 1.932593293605933e-08, "logits/chosen": 0.15333251655101776, "logits/rejected": NaN, "logps/chosen": -162.76718139648438, "logps/rejected": -943.5, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.5400390625, "rewards/margins": 38.384376525878906, "rewards/rejected": -41.923439025878906, "step": 10550 }, { "epoch": 2.6625279677307536, "grad_norm": 0.011022365652024746, "learning_rate": 1.904415367108192e-08, "logits/chosen": 0.26316529512405396, "logits/rejected": NaN, "logps/chosen": -179.3156280517578, "logps/rejected": -936.0, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.862719774246216, "rewards/margins": 37.2421875, "rewards/rejected": -41.09843826293945, "step": 10560 }, { "epoch": 2.6650490026155738, "grad_norm": 0.04358352720737457, "learning_rate": 1.8764362384540128e-08, "logits/chosen": 0.12364502251148224, "logits/rejected": NaN, "logps/chosen": -188.0625, "logps/rejected": -936.7999877929688, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.540514945983887, "rewards/margins": 36.03125, "rewards/rejected": -40.55937576293945, "step": 10570 }, { "epoch": 2.667570037500394, "grad_norm": 0.0008201187592931092, "learning_rate": 1.8486561484768905e-08, "logits/chosen": 0.34739989042282104, "logits/rejected": NaN, "logps/chosen": -183.3000030517578, "logps/rejected": -960.5499877929688, "loss": 0.005, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.565039157867432, "rewards/margins": 38.950782775878906, "rewards/rejected": -43.51250076293945, "step": 10580 }, { "epoch": 2.670091072385214, "grad_norm": 0.0003634513996075839, "learning_rate": 1.8210753362970814e-08, "logits/chosen": 0.248748779296875, "logits/rejected": NaN, "logps/chosen": -183.0031280517578, "logps/rejected": -936.25, "loss": 0.0035, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.383984565734863, "rewards/margins": 38.08906173706055, "rewards/rejected": -42.485939025878906, "step": 10590 }, { "epoch": 2.6726121072700346, "grad_norm": 0.012345212511718273, "learning_rate": 1.7936940393195322e-08, "logits/chosen": 0.249501034617424, "logits/rejected": NaN, "logps/chosen": -176.99063110351562, "logps/rejected": -944.7999877929688, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.9407591819763184, "rewards/margins": 37.712501525878906, "rewards/rejected": -41.6484375, "step": 10600 }, { "epoch": 2.6751331421548548, "grad_norm": 0.04295622557401657, "learning_rate": 1.766512493231831e-08, "logits/chosen": 0.18638916313648224, "logits/rejected": NaN, "logps/chosen": -184.5749969482422, "logps/rejected": -946.5, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.560272216796875, "rewards/margins": 38.951560974121094, "rewards/rejected": -43.515625, "step": 10610 }, { "epoch": 2.677654177039675, "grad_norm": 3.01535851576773e-06, "learning_rate": 1.739530932002195e-08, "logits/chosen": 0.24904480576515198, "logits/rejected": 0.9528884887695312, "logps/chosen": -179.05624389648438, "logps/rejected": -932.375, "loss": 0.0259, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.1666259765625, "rewards/margins": 37.287498474121094, "rewards/rejected": -41.462501525878906, "step": 10620 }, { "epoch": 2.680175211924495, "grad_norm": 0.4987049996852875, "learning_rate": 1.7127495878774545e-08, "logits/chosen": 0.1774444580078125, "logits/rejected": NaN, "logps/chosen": -177.27499389648438, "logps/rejected": -996.5750122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.095025539398193, "rewards/margins": 40.23125076293945, "rewards/rejected": -44.29999923706055, "step": 10630 }, { "epoch": 2.682696246809315, "grad_norm": 7.035883027128875e-05, "learning_rate": 1.6861686913810376e-08, "logits/chosen": 0.27610474824905396, "logits/rejected": NaN, "logps/chosen": -167.5187530517578, "logps/rejected": -934.9500122070312, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7759461402893066, "rewards/margins": 37.665626525878906, "rewards/rejected": -41.43281173706055, "step": 10640 }, { "epoch": 2.6852172816941353, "grad_norm": 9.796793892746791e-05, "learning_rate": 1.6597884713109968e-08, "logits/chosen": 0.23512573540210724, "logits/rejected": NaN, "logps/chosen": -184.2312469482422, "logps/rejected": -929.2999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.201098442077637, "rewards/margins": 37.967185974121094, "rewards/rejected": -42.171875, "step": 10650 }, { "epoch": 2.6877383165789555, "grad_norm": 0.005762585438787937, "learning_rate": 1.633609154738058e-08, "logits/chosen": 0.3332153260707855, "logits/rejected": NaN, "logps/chosen": -163.0968780517578, "logps/rejected": -927.5999755859375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.399075508117676, "rewards/margins": 37.8828125, "rewards/rejected": -42.264060974121094, "step": 10660 }, { "epoch": 2.690259351463776, "grad_norm": 0.005141604691743851, "learning_rate": 1.607630967003623e-08, "logits/chosen": 0.192158505320549, "logits/rejected": 0.8667449951171875, "logps/chosen": -180.38125610351562, "logps/rejected": -953.7000122070312, "loss": 0.0107, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.965747117996216, "rewards/margins": 38.834373474121094, "rewards/rejected": -42.782814025878906, "step": 10670 }, { "epoch": 2.692780386348596, "grad_norm": 0.0019262601854279637, "learning_rate": 1.5818541317178747e-08, "logits/chosen": 0.16916504502296448, "logits/rejected": 0.6532699465751648, "logps/chosen": -200.390625, "logps/rejected": -965.2999877929688, "loss": 0.0082, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.288549900054932, "rewards/margins": 37.87031173706055, "rewards/rejected": -42.15156173706055, "step": 10680 }, { "epoch": 2.6953014212334163, "grad_norm": 0.000733332650270313, "learning_rate": 1.5562788707578128e-08, "logits/chosen": 0.22591857612133026, "logits/rejected": NaN, "logps/chosen": -197.85781860351562, "logps/rejected": -925.9000244140625, "loss": 0.0053, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -5.089013576507568, "rewards/margins": 36.795310974121094, "rewards/rejected": -41.88593673706055, "step": 10690 }, { "epoch": 2.6978224561182365, "grad_norm": 0.018652208149433136, "learning_rate": 1.530905404265387e-08, "logits/chosen": 0.16546782851219177, "logits/rejected": NaN, "logps/chosen": -194.27499389648438, "logps/rejected": -931.0250244140625, "loss": 0.016, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.474749565124512, "rewards/margins": 36.904685974121094, "rewards/rejected": -41.388282775878906, "step": 10700 }, { "epoch": 2.7003434910030566, "grad_norm": 0.07825402170419693, "learning_rate": 1.5057339506455573e-08, "logits/chosen": 0.3581985533237457, "logits/rejected": NaN, "logps/chosen": -183.53125, "logps/rejected": -941.4500122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.765704154968262, "rewards/margins": 38.65937423706055, "rewards/rejected": -43.454689025878906, "step": 10710 }, { "epoch": 2.702864525887877, "grad_norm": 4.376104354858398, "learning_rate": 1.4807647265644429e-08, "logits/chosen": 0.4084533751010895, "logits/rejected": NaN, "logps/chosen": -185.9812469482422, "logps/rejected": -935.0, "loss": 0.0113, "rewards/accuracies": 0.984375, "rewards/chosen": -4.4334716796875, "rewards/margins": 38.265625, "rewards/rejected": -42.6953125, "step": 10720 }, { "epoch": 2.7053855607726973, "grad_norm": 0.11823631823062897, "learning_rate": 1.4559979469474487e-08, "logits/chosen": 0.608074963092804, "logits/rejected": 1.1670761108398438, "logps/chosen": -176.66250610351562, "logps/rejected": -926.5499877929688, "loss": 0.0397, "rewards/accuracies": 0.984375, "rewards/chosen": -4.4096832275390625, "rewards/margins": 38.157814025878906, "rewards/rejected": -42.560935974121094, "step": 10730 }, { "epoch": 2.7079065956575175, "grad_norm": 0.0006661415100097656, "learning_rate": 1.4314338249774239e-08, "logits/chosen": 0.35097962617874146, "logits/rejected": NaN, "logps/chosen": -171.6531219482422, "logps/rejected": -912.2999877929688, "loss": 0.0047, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8213744163513184, "rewards/margins": 37.349998474121094, "rewards/rejected": -41.1640625, "step": 10740 }, { "epoch": 2.7104276305423376, "grad_norm": 0.0004550785815808922, "learning_rate": 1.4070725720928072e-08, "logits/chosen": 0.19205626845359802, "logits/rejected": NaN, "logps/chosen": -170.2156219482422, "logps/rejected": -957.6500244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.564105272293091, "rewards/margins": 39.765625, "rewards/rejected": -43.314064025878906, "step": 10750 }, { "epoch": 2.7129486654271577, "grad_norm": 67.2163314819336, "learning_rate": 1.382914397985821e-08, "logits/chosen": 0.28379517793655396, "logits/rejected": NaN, "logps/chosen": -176.6531219482422, "logps/rejected": -952.5, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.170397758483887, "rewards/margins": 38.40156173706055, "rewards/rejected": -42.5859375, "step": 10760 }, { "epoch": 2.715469700311978, "grad_norm": 0.7094961404800415, "learning_rate": 1.358959510600674e-08, "logits/chosen": 0.462380975484848, "logits/rejected": NaN, "logps/chosen": -170.72811889648438, "logps/rejected": -927.7999877929688, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.652112007141113, "rewards/margins": 38.11406326293945, "rewards/rejected": -42.78593826293945, "step": 10770 }, { "epoch": 2.717990735196798, "grad_norm": 0.9466407299041748, "learning_rate": 1.335208116131753e-08, "logits/chosen": 0.4462219178676605, "logits/rejected": 1.0952484607696533, "logps/chosen": -168.11563110351562, "logps/rejected": -904.2000122070312, "loss": 0.0093, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.234570503234863, "rewards/margins": 37.83906173706055, "rewards/rejected": -42.08906173706055, "step": 10780 }, { "epoch": 2.7205117700816186, "grad_norm": 0.03179473802447319, "learning_rate": 1.3116604190218538e-08, "logits/chosen": 0.3194778561592102, "logits/rejected": 0.9041702151298523, "logps/chosen": -175.9718780517578, "logps/rejected": -946.375, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.513964653015137, "rewards/margins": 38.111717224121094, "rewards/rejected": -42.618751525878906, "step": 10790 }, { "epoch": 2.7230328049664387, "grad_norm": 2.512996070436202e-05, "learning_rate": 1.2883166219604391e-08, "logits/chosen": 0.23339538276195526, "logits/rejected": 0.9362152218818665, "logps/chosen": -196.30313110351562, "logps/rejected": -969.3499755859375, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.658032417297363, "rewards/margins": 38.8125, "rewards/rejected": -43.474998474121094, "step": 10800 }, { "epoch": 2.725553839851259, "grad_norm": 0.0003590815467759967, "learning_rate": 1.2651769258818556e-08, "logits/chosen": 0.2654785215854645, "logits/rejected": NaN, "logps/chosen": -169.0421905517578, "logps/rejected": -935.1500244140625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -3.794140577316284, "rewards/margins": 38.639060974121094, "rewards/rejected": -42.431251525878906, "step": 10810 }, { "epoch": 2.728074874736079, "grad_norm": 0.009653220884501934, "learning_rate": 1.2422415299636529e-08, "logits/chosen": 0.21637573838233948, "logits/rejected": NaN, "logps/chosen": -169.984375, "logps/rejected": -915.5, "loss": 0.0065, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.3376221656799316, "rewards/margins": 37.94062423706055, "rewards/rejected": -41.2734375, "step": 10820 }, { "epoch": 2.7305959096208996, "grad_norm": 0.3737920820713043, "learning_rate": 1.2195106316248233e-08, "logits/chosen": 0.20157165825366974, "logits/rejected": 0.75579833984375, "logps/chosen": -179.36563110351562, "logps/rejected": -957.9500122070312, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.8370819091796875, "rewards/margins": 38.665626525878906, "rewards/rejected": -43.517189025878906, "step": 10830 }, { "epoch": 2.7331169445057197, "grad_norm": 0.0008435755735263228, "learning_rate": 1.196984426524142e-08, "logits/chosen": 0.4718994200229645, "logits/rejected": 0.9897521734237671, "logps/chosen": -166.50936889648438, "logps/rejected": -927.0499877929688, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.543829441070557, "rewards/margins": 37.829689025878906, "rewards/rejected": -42.392189025878906, "step": 10840 }, { "epoch": 2.73563797939054, "grad_norm": 20.021787643432617, "learning_rate": 1.1746631085584463e-08, "logits/chosen": 0.4589378237724304, "logits/rejected": NaN, "logps/chosen": -166.14999389648438, "logps/rejected": -935.5999755859375, "loss": 0.0059, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.458581447601318, "rewards/margins": 38.42656326293945, "rewards/rejected": -42.884376525878906, "step": 10850 }, { "epoch": 2.73815901427536, "grad_norm": 0.00811950583010912, "learning_rate": 1.152546869860993e-08, "logits/chosen": 0.18424224853515625, "logits/rejected": NaN, "logps/chosen": -188.94375610351562, "logps/rejected": -947.2999877929688, "loss": 0.0181, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -5.010229587554932, "rewards/margins": 38.579689025878906, "rewards/rejected": -43.595314025878906, "step": 10860 }, { "epoch": 2.74068004916018, "grad_norm": 0.0022036819718778133, "learning_rate": 1.130635900799795e-08, "logits/chosen": 0.21379394829273224, "logits/rejected": NaN, "logps/chosen": -201.8937530517578, "logps/rejected": -959.0, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.6072998046875, "rewards/margins": 37.421875, "rewards/rejected": -42.048439025878906, "step": 10870 }, { "epoch": 2.7432010840450003, "grad_norm": 0.0003073798434343189, "learning_rate": 1.10893038997599e-08, "logits/chosen": 0.24405518174171448, "logits/rejected": NaN, "logps/chosen": -175.49374389648438, "logps/rejected": -951.5750122070312, "loss": 0.0228, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.01513671875, "rewards/margins": 38.220314025878906, "rewards/rejected": -42.243751525878906, "step": 10880 }, { "epoch": 2.7457221189298204, "grad_norm": 0.000541714602150023, "learning_rate": 1.0874305242221993e-08, "logits/chosen": 0.47424620389938354, "logits/rejected": NaN, "logps/chosen": -161.36563110351562, "logps/rejected": -922.5750122070312, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.443652153015137, "rewards/margins": 38.26171875, "rewards/rejected": -42.685935974121094, "step": 10890 }, { "epoch": 2.748243153814641, "grad_norm": 4.116281161259394e-06, "learning_rate": 1.0661364886009356e-08, "logits/chosen": 0.2861175537109375, "logits/rejected": NaN, "logps/chosen": -170.7687530517578, "logps/rejected": -923.6749877929688, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.101904392242432, "rewards/margins": 37.03125, "rewards/rejected": -41.139060974121094, "step": 10900 }, { "epoch": 2.750764188699461, "grad_norm": 1.6133475583046675e-05, "learning_rate": 1.045048466403009e-08, "logits/chosen": 0.3542938232421875, "logits/rejected": NaN, "logps/chosen": -191.04061889648438, "logps/rejected": -933.25, "loss": 0.0308, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.835497856140137, "rewards/margins": 37.59375, "rewards/rejected": -42.423439025878906, "step": 10910 }, { "epoch": 2.7532852235842813, "grad_norm": 0.03503101319074631, "learning_rate": 1.0241666391459457e-08, "logits/chosen": 0.18352356553077698, "logits/rejected": NaN, "logps/chosen": -200.1218719482422, "logps/rejected": -958.0, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.21728515625, "rewards/margins": 38.603126525878906, "rewards/rejected": -42.81718826293945, "step": 10920 }, { "epoch": 2.7558062584691014, "grad_norm": 0.06765129417181015, "learning_rate": 1.0034911865724193e-08, "logits/chosen": 0.3201843202114105, "logits/rejected": NaN, "logps/chosen": -197.203125, "logps/rejected": -959.9000244140625, "loss": 0.0189, "rewards/accuracies": 0.984375, "rewards/chosen": -5.00223445892334, "rewards/margins": 38.0859375, "rewards/rejected": -43.087501525878906, "step": 10930 }, { "epoch": 2.758327293353922, "grad_norm": 0.05482494458556175, "learning_rate": 9.830222866487187e-09, "logits/chosen": 0.48095703125, "logits/rejected": 1.1112762689590454, "logps/chosen": -182.8562469482422, "logps/rejected": -925.9000244140625, "loss": 0.0448, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.991406440734863, "rewards/margins": 37.6640625, "rewards/rejected": -42.63750076293945, "step": 10940 }, { "epoch": 2.760848328238742, "grad_norm": 0.0005284110084176064, "learning_rate": 9.627601155631965e-09, "logits/chosen": 0.10930786281824112, "logits/rejected": 0.4914444088935852, "logps/chosen": -178.95938110351562, "logps/rejected": -961.7000122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.268212795257568, "rewards/margins": 37.314064025878906, "rewards/rejected": -41.5859375, "step": 10950 }, { "epoch": 2.7633693631235623, "grad_norm": 0.19993969798088074, "learning_rate": 9.42704847724779e-09, "logits/chosen": 0.188140869140625, "logits/rejected": 0.8907715082168579, "logps/chosen": -176.140625, "logps/rejected": -935.7999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.494604587554932, "rewards/margins": 37.0390625, "rewards/rejected": -41.537498474121094, "step": 10960 }, { "epoch": 2.7658903980083824, "grad_norm": 86.00687408447266, "learning_rate": 9.22856655761431e-09, "logits/chosen": 0.29352569580078125, "logits/rejected": 0.834197998046875, "logps/chosen": -171.0593719482422, "logps/rejected": -929.4000244140625, "loss": 0.0061, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.297265529632568, "rewards/margins": 37.94218826293945, "rewards/rejected": -42.251564025878906, "step": 10970 }, { "epoch": 2.7684114328932026, "grad_norm": 0.009349415078759193, "learning_rate": 9.032157105187094e-09, "logits/chosen": 0.2713211178779602, "logits/rejected": NaN, "logps/chosen": -170.421875, "logps/rejected": -884.9000244140625, "loss": 0.0099, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.986499071121216, "rewards/margins": 35.701560974121094, "rewards/rejected": -39.6796875, "step": 10980 }, { "epoch": 2.7709324677780227, "grad_norm": 0.3167308568954468, "learning_rate": 8.83782181058254e-09, "logits/chosen": 0.21469268202781677, "logits/rejected": NaN, "logps/chosen": -187.4718780517578, "logps/rejected": -925.9749755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.308447360992432, "rewards/margins": 36.875, "rewards/rejected": -41.18281173706055, "step": 10990 }, { "epoch": 2.773453502662843, "grad_norm": 2.3843218514230102e-05, "learning_rate": 8.645562346563551e-09, "logits/chosen": 0.27138423919677734, "logits/rejected": NaN, "logps/chosen": -188.8249969482422, "logps/rejected": -934.2750244140625, "loss": 0.0276, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.164233207702637, "rewards/margins": 37.4296875, "rewards/rejected": -41.587501525878906, "step": 11000 }, { "epoch": 2.7759745375476634, "grad_norm": 0.000498538778629154, "learning_rate": 8.455380368025122e-09, "logits/chosen": 0.33666688203811646, "logits/rejected": 0.979901134967804, "logps/chosen": -181.5593719482422, "logps/rejected": -954.8499755859375, "loss": 0.1534, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.368798732757568, "rewards/margins": 38.63593673706055, "rewards/rejected": -42.993751525878906, "step": 11010 }, { "epoch": 2.7784955724324836, "grad_norm": 0.0716213509440422, "learning_rate": 8.267277511980058e-09, "logits/chosen": 0.277587890625, "logits/rejected": 0.8464210629463196, "logps/chosen": -180.390625, "logps/rejected": -903.5999755859375, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.375536918640137, "rewards/margins": 35.18437576293945, "rewards/rejected": -39.53437423706055, "step": 11020 }, { "epoch": 2.7810166073173037, "grad_norm": 68.89803314208984, "learning_rate": 8.081255397544812e-09, "logits/chosen": 0.3707412779331207, "logits/rejected": NaN, "logps/chosen": -188.86093139648438, "logps/rejected": -942.4000244140625, "loss": 0.0316, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.437323093414307, "rewards/margins": 37.556251525878906, "rewards/rejected": -42.01093673706055, "step": 11030 }, { "epoch": 2.783537642202124, "grad_norm": 0.0006181775825098157, "learning_rate": 7.89731562592566e-09, "logits/chosen": 0.2501159608364105, "logits/rejected": NaN, "logps/chosen": -170.921875, "logps/rejected": -943.4500122070312, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.694140672683716, "rewards/margins": 38.90312576293945, "rewards/rejected": -42.592185974121094, "step": 11040 }, { "epoch": 2.7860586770869444, "grad_norm": 1.0477881914994214e-05, "learning_rate": 7.715459780404943e-09, "logits/chosen": 0.2763725221157074, "logits/rejected": NaN, "logps/chosen": -178.6750030517578, "logps/rejected": -918.8499755859375, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.394921779632568, "rewards/margins": 36.90937423706055, "rewards/rejected": -41.295310974121094, "step": 11050 }, { "epoch": 2.7885797119717646, "grad_norm": 0.0005480454419739544, "learning_rate": 7.53568942632729e-09, "logits/chosen": 0.350637823343277, "logits/rejected": NaN, "logps/chosen": -168.54061889648438, "logps/rejected": -936.0, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.20263671875, "rewards/margins": 37.8359375, "rewards/rejected": -42.02812576293945, "step": 11060 }, { "epoch": 2.7911007468565847, "grad_norm": 0.007429076824337244, "learning_rate": 7.3580061110861954e-09, "logits/chosen": 0.1868637055158615, "logits/rejected": NaN, "logps/chosen": -170.4499969482422, "logps/rejected": -958.5, "loss": 0.0108, "rewards/accuracies": 0.984375, "rewards/chosen": -3.927197217941284, "rewards/margins": 39.31718826293945, "rewards/rejected": -43.25312423706055, "step": 11070 }, { "epoch": 2.793621781741405, "grad_norm": 0.06133945658802986, "learning_rate": 7.18241136411088e-09, "logits/chosen": 0.3176124691963196, "logits/rejected": 1.0732910633087158, "logps/chosen": -179.67813110351562, "logps/rejected": -945.25, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.827670097351074, "rewards/margins": 37.65156173706055, "rewards/rejected": -42.498435974121094, "step": 11080 }, { "epoch": 2.796142816626225, "grad_norm": 0.0007015736191533506, "learning_rate": 7.008906696852806e-09, "logits/chosen": 0.3617042601108551, "logits/rejected": NaN, "logps/chosen": -176.49374389648438, "logps/rejected": -922.1500244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.891345262527466, "rewards/margins": 36.779685974121094, "rewards/rejected": -40.671875, "step": 11090 }, { "epoch": 2.798663851511045, "grad_norm": 2.5694267606013454e-05, "learning_rate": 6.837493602772998e-09, "logits/chosen": 0.2897705137729645, "logits/rejected": 0.8652591705322266, "logps/chosen": -188.2843780517578, "logps/rejected": -948.6500244140625, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.813867092132568, "rewards/margins": 37.39374923706055, "rewards/rejected": -42.20781326293945, "step": 11100 }, { "epoch": 2.8011848863958653, "grad_norm": 0.015201364643871784, "learning_rate": 6.668173557328877e-09, "logits/chosen": 0.34977418184280396, "logits/rejected": NaN, "logps/chosen": -166.80624389648438, "logps/rejected": -939.2000122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.142920017242432, "rewards/margins": 38.068748474121094, "rewards/rejected": -42.193748474121094, "step": 11110 }, { "epoch": 2.803705921280686, "grad_norm": 0.06571114808320999, "learning_rate": 6.500948017961832e-09, "logits/chosen": 0.30016326904296875, "logits/rejected": NaN, "logps/chosen": -179.22811889648438, "logps/rejected": -926.4500122070312, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.186474800109863, "rewards/margins": 37.642189025878906, "rewards/rejected": -41.842185974121094, "step": 11120 }, { "epoch": 2.806226956165506, "grad_norm": 1.5595659017562866, "learning_rate": 6.335818424084538e-09, "logits/chosen": 0.34124755859375, "logits/rejected": NaN, "logps/chosen": -182.71249389648438, "logps/rejected": -887.7750244140625, "loss": 0.0078, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.033648490905762, "rewards/margins": 37.40156173706055, "rewards/rejected": -41.443748474121094, "step": 11130 }, { "epoch": 2.808747991050326, "grad_norm": 0.2382611632347107, "learning_rate": 6.172786197068486e-09, "logits/chosen": 0.3333190977573395, "logits/rejected": NaN, "logps/chosen": -172.9296875, "logps/rejected": -952.6500244140625, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.033239841461182, "rewards/margins": 38.071876525878906, "rewards/rejected": -42.107810974121094, "step": 11140 }, { "epoch": 2.8112690259351463, "grad_norm": 79.93607330322266, "learning_rate": 6.011852740231943e-09, "logits/chosen": 0.44568175077438354, "logits/rejected": NaN, "logps/chosen": -181.9093780517578, "logps/rejected": -929.4500122070312, "loss": 0.042, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.162329196929932, "rewards/margins": 37.626564025878906, "rewards/rejected": -41.790626525878906, "step": 11150 }, { "epoch": 2.813790060819967, "grad_norm": 0.0016335331602022052, "learning_rate": 5.853019438827767e-09, "logits/chosen": 0.17489013075828552, "logits/rejected": NaN, "logps/chosen": -186.36563110351562, "logps/rejected": -951.5499877929688, "loss": 0.0091, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.725024223327637, "rewards/margins": 38.482810974121094, "rewards/rejected": -43.19843673706055, "step": 11160 }, { "epoch": 2.816311095704787, "grad_norm": 1.070627831722959e-06, "learning_rate": 5.69628766003144e-09, "logits/chosen": 0.225779727101326, "logits/rejected": NaN, "logps/chosen": -172.9562530517578, "logps/rejected": -957.5750122070312, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.353628635406494, "rewards/margins": 38.6875, "rewards/rejected": -43.046875, "step": 11170 }, { "epoch": 2.818832130589607, "grad_norm": 0.12195402383804321, "learning_rate": 5.54165875292939e-09, "logits/chosen": 0.3173271119594574, "logits/rejected": NaN, "logps/chosen": -204.1593780517578, "logps/rejected": -945.4749755859375, "loss": 0.0025, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.560943603515625, "rewards/margins": 38.5625, "rewards/rejected": -43.12031173706055, "step": 11180 }, { "epoch": 2.8213531654744273, "grad_norm": 0.00045466262963600457, "learning_rate": 5.389134048507382e-09, "logits/chosen": 0.16789093613624573, "logits/rejected": 0.7066818475723267, "logps/chosen": -180.7375030517578, "logps/rejected": -939.6500244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.058752536773682, "rewards/margins": 37.60468673706055, "rewards/rejected": -41.654685974121094, "step": 11190 }, { "epoch": 2.8238742003592474, "grad_norm": 0.001113788690418005, "learning_rate": 5.2387148596389205e-09, "logits/chosen": 0.21293945610523224, "logits/rejected": NaN, "logps/chosen": -175.35000610351562, "logps/rejected": -893.7000122070312, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8382568359375, "rewards/margins": 36.48749923706055, "rewards/rejected": -40.328125, "step": 11200 }, { "epoch": 2.8263952352440675, "grad_norm": 5.571145534515381, "learning_rate": 5.090402481074119e-09, "logits/chosen": 0.2969512939453125, "logits/rejected": NaN, "logps/chosen": -173.68905639648438, "logps/rejected": -915.6500244140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.378900051116943, "rewards/margins": 36.79999923706055, "rewards/rejected": -41.1796875, "step": 11210 }, { "epoch": 2.8289162701288877, "grad_norm": 0.012452131137251854, "learning_rate": 4.944198189428455e-09, "logits/chosen": 0.42456817626953125, "logits/rejected": 1.126519799232483, "logps/chosen": -161.9796905517578, "logps/rejected": -893.7249755859375, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.044287204742432, "rewards/margins": 36.6484375, "rewards/rejected": -40.689064025878906, "step": 11220 }, { "epoch": 2.8314373050137083, "grad_norm": 0.0014645534101873636, "learning_rate": 4.800103243171816e-09, "logits/chosen": 0.2033843994140625, "logits/rejected": NaN, "logps/chosen": -215.78125, "logps/rejected": -923.8499755859375, "loss": 0.0188, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.858935594558716, "rewards/margins": 37.15312576293945, "rewards/rejected": -41.0234375, "step": 11230 }, { "epoch": 2.8339583398985284, "grad_norm": 0.044304098933935165, "learning_rate": 4.6581188826176896e-09, "logits/chosen": 0.1995895355939865, "logits/rejected": NaN, "logps/chosen": -171.28750610351562, "logps/rejected": -968.1500244140625, "loss": 0.0038, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.955883741378784, "rewards/margins": 39.548439025878906, "rewards/rejected": -43.5, "step": 11240 }, { "epoch": 2.8364793747833486, "grad_norm": 2.927586317062378, "learning_rate": 4.5182463299124344e-09, "logits/chosen": 0.190745547413826, "logits/rejected": 0.7750183343887329, "logps/chosen": -176.9343719482422, "logps/rejected": -962.0999755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.852038621902466, "rewards/margins": 37.87968826293945, "rewards/rejected": -41.72187423706055, "step": 11250 }, { "epoch": 2.8390004096681687, "grad_norm": 0.09549788385629654, "learning_rate": 4.380486789024751e-09, "logits/chosen": 0.14723816514015198, "logits/rejected": 0.7126358151435852, "logps/chosen": -191.9968719482422, "logps/rejected": -959.0499877929688, "loss": 0.0075, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.461096286773682, "rewards/margins": 37.90937423706055, "rewards/rejected": -42.389060974121094, "step": 11260 }, { "epoch": 2.8415214445529893, "grad_norm": 0.090050607919693, "learning_rate": 4.244841445735447e-09, "logits/chosen": 0.27522581815719604, "logits/rejected": NaN, "logps/chosen": -179.5593719482422, "logps/rejected": -937.7000122070312, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.524243354797363, "rewards/margins": 38.28125, "rewards/rejected": -42.806251525878906, "step": 11270 }, { "epoch": 2.8440424794378094, "grad_norm": 1.1452112197875977, "learning_rate": 4.1113114676270265e-09, "logits/chosen": 0.3450965881347656, "logits/rejected": NaN, "logps/chosen": -178.6328125, "logps/rejected": -922.9500122070312, "loss": 0.0054, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.094689846038818, "rewards/margins": 37.234375, "rewards/rejected": -41.3515625, "step": 11280 }, { "epoch": 2.8465635143226296, "grad_norm": 0.3386777937412262, "learning_rate": 3.9798980040738336e-09, "logits/chosen": 0.28044891357421875, "logits/rejected": NaN, "logps/chosen": -167.4187469482422, "logps/rejected": -933.4500122070312, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.226342678070068, "rewards/margins": 38.23125076293945, "rewards/rejected": -42.478126525878906, "step": 11290 }, { "epoch": 2.8490845492074497, "grad_norm": 1.272935390472412, "learning_rate": 3.85060218623201e-09, "logits/chosen": 0.08534546196460724, "logits/rejected": NaN, "logps/chosen": -169.6281280517578, "logps/rejected": -963.5499877929688, "loss": 0.0068, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.831817626953125, "rewards/margins": 40.185935974121094, "rewards/rejected": -44.03593826293945, "step": 11300 }, { "epoch": 2.85160558409227, "grad_norm": 0.19967198371887207, "learning_rate": 3.7234251270298887e-09, "logits/chosen": 0.24124297499656677, "logits/rejected": NaN, "logps/chosen": -177.91250610351562, "logps/rejected": -914.7000122070312, "loss": 0.0126, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.017431735992432, "rewards/margins": 37.12812423706055, "rewards/rejected": -41.15937423706055, "step": 11310 }, { "epoch": 2.85412661897709, "grad_norm": 2.8995299339294434, "learning_rate": 3.5983679211583073e-09, "logits/chosen": 0.28778839111328125, "logits/rejected": NaN, "logps/chosen": -189.68124389648438, "logps/rejected": -927.7750244140625, "loss": 0.0057, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.448046684265137, "rewards/margins": 37.997657775878906, "rewards/rejected": -42.4375, "step": 11320 }, { "epoch": 2.85664765386191, "grad_norm": 0.00028238262166269124, "learning_rate": 3.4754316450612275e-09, "logits/chosen": 0.26008909940719604, "logits/rejected": NaN, "logps/chosen": -192.7453155517578, "logps/rejected": -957.8499755859375, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.833093166351318, "rewards/margins": 37.450782775878906, "rewards/rejected": -42.303123474121094, "step": 11330 }, { "epoch": 2.8591686887467307, "grad_norm": 0.0031679249368608, "learning_rate": 3.3546173569264925e-09, "logits/chosen": 0.3226028382778168, "logits/rejected": NaN, "logps/chosen": -168.63125610351562, "logps/rejected": -920.0, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.074450492858887, "rewards/margins": 38.2109375, "rewards/rejected": -42.2890625, "step": 11340 }, { "epoch": 2.861689723631551, "grad_norm": 9.016798139782622e-05, "learning_rate": 3.2359260966766667e-09, "logits/chosen": 0.24735412001609802, "logits/rejected": NaN, "logps/chosen": -190.05624389648438, "logps/rejected": -963.7999877929688, "loss": 0.0825, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.952929496765137, "rewards/margins": 39.46562576293945, "rewards/rejected": -44.41718673706055, "step": 11350 }, { "epoch": 2.864210758516371, "grad_norm": 0.007405490148812532, "learning_rate": 3.119358885960155e-09, "logits/chosen": 0.36410218477249146, "logits/rejected": NaN, "logps/chosen": -169.85311889648438, "logps/rejected": -906.5499877929688, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.7270750999450684, "rewards/margins": 37.4375, "rewards/rejected": -41.17499923706055, "step": 11360 }, { "epoch": 2.866731793401191, "grad_norm": 0.0026119202375411987, "learning_rate": 3.0049167281423204e-09, "logits/chosen": 0.3984939455986023, "logits/rejected": NaN, "logps/chosen": -169.0234375, "logps/rejected": -936.8499755859375, "loss": 0.0085, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.8287415504455566, "rewards/margins": 38.201560974121094, "rewards/rejected": -42.029685974121094, "step": 11370 }, { "epoch": 2.8692528282860112, "grad_norm": 0.0171663835644722, "learning_rate": 2.892600608296908e-09, "logits/chosen": 0.2695358395576477, "logits/rejected": NaN, "logps/chosen": -189.6593780517578, "logps/rejected": -926.25, "loss": 0.0191, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.200079441070557, "rewards/margins": 38.021873474121094, "rewards/rejected": -42.22968673706055, "step": 11380 }, { "epoch": 2.871773863170832, "grad_norm": 6.8408653532969765e-06, "learning_rate": 2.7824114931975783e-09, "logits/chosen": 0.18880920112133026, "logits/rejected": 0.7545700073242188, "logps/chosen": -178.3156280517578, "logps/rejected": -944.8499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.506799221038818, "rewards/margins": 38.670310974121094, "rewards/rejected": -43.18437576293945, "step": 11390 }, { "epoch": 2.874294898055652, "grad_norm": 0.0014730860712006688, "learning_rate": 2.6743503313095274e-09, "logits/chosen": NaN, "logits/rejected": 1.0351272821426392, "logps/chosen": -173.55313110351562, "logps/rejected": -923.75, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.9132080078125, "rewards/margins": 37.64374923706055, "rewards/rejected": -41.556251525878906, "step": 11400 }, { "epoch": 2.876815932940472, "grad_norm": 0.03852550685405731, "learning_rate": 2.5684180527813513e-09, "logits/chosen": 0.270089715719223, "logits/rejected": 0.9680541753768921, "logps/chosen": -177.6437530517578, "logps/rejected": -957.5499877929688, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.255883693695068, "rewards/margins": 39.6015625, "rewards/rejected": -43.86249923706055, "step": 11410 }, { "epoch": 2.8793369678252922, "grad_norm": 0.0754161924123764, "learning_rate": 2.464615569437112e-09, "logits/chosen": 0.25773924589157104, "logits/rejected": NaN, "logps/chosen": -167.0812530517578, "logps/rejected": -946.25, "loss": 0.0046, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.8943114280700684, "rewards/margins": 38.6171875, "rewards/rejected": -42.51093673706055, "step": 11420 }, { "epoch": 2.8818580027101124, "grad_norm": 0.00021470423962455243, "learning_rate": 2.3629437747682836e-09, "logits/chosen": 0.32725220918655396, "logits/rejected": NaN, "logps/chosen": -196.7781219482422, "logps/rejected": -939.4500122070312, "loss": 0.1478, "rewards/accuracies": 0.984375, "rewards/chosen": -4.908617973327637, "rewards/margins": 36.65156173706055, "rewards/rejected": -41.568748474121094, "step": 11430 }, { "epoch": 2.8843790375949325, "grad_norm": 8.412905299337581e-05, "learning_rate": 2.2634035439263454e-09, "logits/chosen": 0.22711563110351562, "logits/rejected": NaN, "logps/chosen": -178.27499389648438, "logps/rejected": -924.75, "loss": 0.0031, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.9640870094299316, "rewards/margins": 37.31718826293945, "rewards/rejected": -41.27656173706055, "step": 11440 }, { "epoch": 2.8869000724797527, "grad_norm": 0.05228603631258011, "learning_rate": 2.165995733715009e-09, "logits/chosen": 0.17130127549171448, "logits/rejected": 0.8265731930732727, "logps/chosen": -181.515625, "logps/rejected": -946.4000244140625, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.142285346984863, "rewards/margins": 37.84687423706055, "rewards/rejected": -41.98749923706055, "step": 11450 }, { "epoch": 2.8894211073645732, "grad_norm": 0.004331837873905897, "learning_rate": 2.0707211825829463e-09, "logits/chosen": 0.235667422413826, "logits/rejected": 0.9276153445243835, "logps/chosen": -167.41561889648438, "logps/rejected": -913.0250244140625, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.275231838226318, "rewards/margins": 36.692970275878906, "rewards/rejected": -40.98125076293945, "step": 11460 }, { "epoch": 2.8919421422493934, "grad_norm": 44.12592697143555, "learning_rate": 1.9775807106165996e-09, "logits/chosen": 0.292449951171875, "logits/rejected": NaN, "logps/chosen": -170.6218719482422, "logps/rejected": -892.0999755859375, "loss": 0.0243, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.01531982421875, "rewards/margins": 36.544532775878906, "rewards/rejected": -40.545310974121094, "step": 11470 }, { "epoch": 2.8944631771342135, "grad_norm": 2.7737979888916016, "learning_rate": 1.8865751195330517e-09, "logits/chosen": 0.331146240234375, "logits/rejected": NaN, "logps/chosen": -190.4875030517578, "logps/rejected": -955.4000244140625, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.598028659820557, "rewards/margins": 37.275001525878906, "rewards/rejected": -41.881248474121094, "step": 11480 }, { "epoch": 2.8969842120190337, "grad_norm": 0.1144828125834465, "learning_rate": 1.7977051926731935e-09, "logits/chosen": 0.3930419981479645, "logits/rejected": NaN, "logps/chosen": -175.8406219482422, "logps/rejected": -946.0499877929688, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.605233669281006, "rewards/margins": 38.13750076293945, "rewards/rejected": -42.73125076293945, "step": 11490 }, { "epoch": 2.8995052469038542, "grad_norm": 0.0035964958369731903, "learning_rate": 1.7109716949949004e-09, "logits/chosen": 0.24399414658546448, "logits/rejected": NaN, "logps/chosen": -161.93124389648438, "logps/rejected": -923.7999877929688, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.995190382003784, "rewards/margins": 37.94843673706055, "rewards/rejected": -41.94218826293945, "step": 11500 }, { "epoch": 2.9020262817886744, "grad_norm": 0.0008834443287923932, "learning_rate": 1.6263753730664797e-09, "logits/chosen": 0.265615850687027, "logits/rejected": NaN, "logps/chosen": -185.9031219482422, "logps/rejected": -924.8250122070312, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.548338413238525, "rewards/margins": 36.935935974121094, "rewards/rejected": -41.48125076293945, "step": 11510 }, { "epoch": 2.9045473166734945, "grad_norm": 9.057550050783902e-05, "learning_rate": 1.5439169550603703e-09, "logits/chosen": 0.42925262451171875, "logits/rejected": 0.8472946286201477, "logps/chosen": -167.75936889648438, "logps/rejected": -928.9000244140625, "loss": 0.0077, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.079003810882568, "rewards/margins": 37.646095275878906, "rewards/rejected": -41.73125076293945, "step": 11520 }, { "epoch": 2.9070683515583147, "grad_norm": 0.0016482868231832981, "learning_rate": 1.4635971507465927e-09, "logits/chosen": 0.1558837890625, "logits/rejected": NaN, "logps/chosen": -179.07186889648438, "logps/rejected": -935.3499755859375, "loss": 0.004, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.201586723327637, "rewards/margins": 38.3515625, "rewards/rejected": -42.5703125, "step": 11530 }, { "epoch": 2.909589386443135, "grad_norm": 0.04818696901202202, "learning_rate": 1.3854166514869758e-09, "logits/chosen": 0.2636169493198395, "logits/rejected": NaN, "logps/chosen": -182.14999389648438, "logps/rejected": -954.9500122070312, "loss": 0.0066, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.508398532867432, "rewards/margins": 38.407814025878906, "rewards/rejected": -42.91718673706055, "step": 11540 }, { "epoch": 2.912110421327955, "grad_norm": 0.38054731488227844, "learning_rate": 1.3093761302289396e-09, "logits/chosen": 0.42222899198532104, "logits/rejected": NaN, "logps/chosen": -161.15625, "logps/rejected": -904.125, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.889086961746216, "rewards/margins": 36.47343826293945, "rewards/rejected": -40.360939025878906, "step": 11550 }, { "epoch": 2.914631456212775, "grad_norm": 26.24993896484375, "learning_rate": 1.2354762414998333e-09, "logits/chosen": 0.38120728731155396, "logits/rejected": 1.0461212396621704, "logps/chosen": -182.0281219482422, "logps/rejected": -917.6500244140625, "loss": 0.0295, "rewards/accuracies": 0.984375, "rewards/chosen": -4.347363471984863, "rewards/margins": 37.650001525878906, "rewards/rejected": -42.00312423706055, "step": 11560 }, { "epoch": 2.9171524910975957, "grad_norm": 0.00038803336792625487, "learning_rate": 1.1637176214012168e-09, "logits/chosen": 0.318533331155777, "logits/rejected": NaN, "logps/chosen": -179.953125, "logps/rejected": -901.6500244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.474218845367432, "rewards/margins": 36.892189025878906, "rewards/rejected": -41.360939025878906, "step": 11570 }, { "epoch": 2.919673525982416, "grad_norm": 0.009810560382902622, "learning_rate": 1.0941008876035606e-09, "logits/chosen": 0.224314883351326, "logits/rejected": NaN, "logps/chosen": -156.45468139648438, "logps/rejected": -938.0750122070312, "loss": 0.0065, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.4954590797424316, "rewards/margins": 38.506248474121094, "rewards/rejected": -42.0078125, "step": 11580 }, { "epoch": 2.922194560867236, "grad_norm": 0.023445861414074898, "learning_rate": 1.0266266393406665e-09, "logits/chosen": 0.15113982558250427, "logits/rejected": NaN, "logps/chosen": -177.890625, "logps/rejected": -945.625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -4.136743068695068, "rewards/margins": 37.59843826293945, "rewards/rejected": -41.74531173706055, "step": 11590 }, { "epoch": 2.924715595752056, "grad_norm": 0.10623300075531006, "learning_rate": 9.612954574047537e-10, "logits/chosen": 0.30876463651657104, "logits/rejected": NaN, "logps/chosen": -171.79843139648438, "logps/rejected": -916.5, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.6328125, "rewards/margins": 36.842185974121094, "rewards/rejected": -40.485939025878906, "step": 11600 }, { "epoch": 2.9272366306368767, "grad_norm": 0.17828954756259918, "learning_rate": 8.981079041413541e-10, "logits/chosen": 0.25176697969436646, "logits/rejected": NaN, "logps/chosen": -166.9656219482422, "logps/rejected": -918.2000122070312, "loss": 0.0541, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7630248069763184, "rewards/margins": 38.29999923706055, "rewards/rejected": -42.079689025878906, "step": 11610 }, { "epoch": 2.929757665521697, "grad_norm": 3.4039878755720565e-06, "learning_rate": 8.370645234443974e-10, "logits/chosen": 0.2955780029296875, "logits/rejected": NaN, "logps/chosen": -159.60311889648438, "logps/rejected": -920.8499755859375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -3.796661376953125, "rewards/margins": 36.985939025878906, "rewards/rejected": -40.764060974121094, "step": 11620 }, { "epoch": 2.932278700406517, "grad_norm": 0.000858031737152487, "learning_rate": 7.781658407516322e-10, "logits/chosen": 0.3621582090854645, "logits/rejected": 0.8892791867256165, "logps/chosen": -160.0421905517578, "logps/rejected": -952.4249877929688, "loss": 0.0267, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.401403903961182, "rewards/margins": 38.08671951293945, "rewards/rejected": -42.4765625, "step": 11630 }, { "epoch": 2.934799735291337, "grad_norm": 0.0010292973602190614, "learning_rate": 7.214123630401581e-10, "logits/chosen": 0.37868958711624146, "logits/rejected": 1.0005309581756592, "logps/chosen": -166.3781280517578, "logps/rejected": -925.25, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.127734184265137, "rewards/margins": 37.829689025878906, "rewards/rejected": -41.984375, "step": 11640 }, { "epoch": 2.9373207701761572, "grad_norm": 0.0135868014767766, "learning_rate": 6.668045788218723e-10, "logits/chosen": 0.3152526915073395, "logits/rejected": 0.8572753667831421, "logps/chosen": -185.890625, "logps/rejected": -924.4000244140625, "loss": 0.0086, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.317288398742676, "rewards/margins": 36.537498474121094, "rewards/rejected": -40.86406326293945, "step": 11650 }, { "epoch": 2.9398418050609774, "grad_norm": 0.02727876417338848, "learning_rate": 6.143429581394466e-10, "logits/chosen": 0.42097777128219604, "logits/rejected": NaN, "logps/chosen": -167.21719360351562, "logps/rejected": -953.8499755859375, "loss": 0.0139, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.881518602371216, "rewards/margins": 37.357810974121094, "rewards/rejected": -41.22343826293945, "step": 11660 }, { "epoch": 2.9423628399457975, "grad_norm": 0.00421537272632122, "learning_rate": 5.640279525621627e-10, "logits/chosen": 0.31309205293655396, "logits/rejected": 0.9638031125068665, "logps/chosen": -169.99063110351562, "logps/rejected": -922.8250122070312, "loss": 0.0386, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.047991752624512, "rewards/margins": 38.11249923706055, "rewards/rejected": -42.16875076293945, "step": 11670 }, { "epoch": 2.944883874830618, "grad_norm": 0.028680147603154182, "learning_rate": 5.158599951821385e-10, "logits/chosen": 0.3892150819301605, "logits/rejected": NaN, "logps/chosen": -188.2843780517578, "logps/rejected": -964.5499877929688, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.731665134429932, "rewards/margins": 37.962501525878906, "rewards/rejected": -42.6953125, "step": 11680 }, { "epoch": 2.9474049097154382, "grad_norm": 0.25985395908355713, "learning_rate": 4.698395006104417e-10, "logits/chosen": 0.22679710388183594, "logits/rejected": NaN, "logps/chosen": -163.078125, "logps/rejected": -922.7000122070312, "loss": 0.0621, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.7359070777893066, "rewards/margins": 38.861717224121094, "rewards/rejected": -42.584373474121094, "step": 11690 }, { "epoch": 2.9499259446002584, "grad_norm": 5.131104469299316, "learning_rate": 4.2596686497367583e-10, "logits/chosen": 0.0718231201171875, "logits/rejected": NaN, "logps/chosen": -214.3874969482422, "logps/rejected": -939.7249755859375, "loss": 0.0033, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.4822678565979, "rewards/margins": 36.342185974121094, "rewards/rejected": -40.81562423706055, "step": 11700 }, { "epoch": 2.9524469794850785, "grad_norm": 0.00595678249374032, "learning_rate": 3.8424246591048373e-10, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -169.4890594482422, "logps/rejected": -931.9749755859375, "loss": 0.0072, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.99542236328125, "rewards/margins": 38.568748474121094, "rewards/rejected": -42.5546875, "step": 11710 }, { "epoch": 2.954968014369899, "grad_norm": 0.028395524248480797, "learning_rate": 3.44666662568327e-10, "logits/chosen": 0.24979552626609802, "logits/rejected": NaN, "logps/chosen": -184.2890625, "logps/rejected": -922.6500244140625, "loss": 0.0044, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.273022651672363, "rewards/margins": 37.73749923706055, "rewards/rejected": -42.02656173706055, "step": 11720 }, { "epoch": 2.9574890492547192, "grad_norm": 0.07159549742937088, "learning_rate": 3.0723979560037806e-10, "logits/chosen": 0.2939910888671875, "logits/rejected": NaN, "logps/chosen": -190.63125610351562, "logps/rejected": -898.9500122070312, "loss": 0.0027, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.676904201507568, "rewards/margins": 36.45624923706055, "rewards/rejected": -41.134376525878906, "step": 11730 }, { "epoch": 2.9600100841395394, "grad_norm": 0.49191224575042725, "learning_rate": 2.719621871626332e-10, "logits/chosen": 0.22511596977710724, "logits/rejected": NaN, "logps/chosen": -200.5828094482422, "logps/rejected": -918.9500122070312, "loss": 0.0026, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.806189060211182, "rewards/margins": 36.76093673706055, "rewards/rejected": -41.57500076293945, "step": 11740 }, { "epoch": 2.9625311190243595, "grad_norm": 0.015475540421903133, "learning_rate": 2.388341409110539e-10, "logits/chosen": 0.22547607123851776, "logits/rejected": NaN, "logps/chosen": -185.58749389648438, "logps/rejected": -928.0750122070312, "loss": 0.0055, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.7335267066955566, "rewards/margins": 36.30390548706055, "rewards/rejected": -40.014060974121094, "step": 11750 }, { "epoch": 2.9650521539091796, "grad_norm": 0.001703117974102497, "learning_rate": 2.0785594199901334e-10, "logits/chosen": 0.3143554627895355, "logits/rejected": NaN, "logps/chosen": -182.9343719482422, "logps/rejected": -964.3499755859375, "loss": 0.0023, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.980322360992432, "rewards/margins": 38.446876525878906, "rewards/rejected": -43.439064025878906, "step": 11760 }, { "epoch": 2.967573188794, "grad_norm": 0.00015123074990697205, "learning_rate": 1.7902785707488156e-10, "logits/chosen": 0.34695130586624146, "logits/rejected": NaN, "logps/chosen": -161.6296844482422, "logps/rejected": -964.2000122070312, "loss": 0.0284, "rewards/accuracies": 0.984375, "rewards/chosen": -3.7932496070861816, "rewards/margins": 38.490623474121094, "rewards/rejected": -42.267189025878906, "step": 11770 }, { "epoch": 2.97009422367882, "grad_norm": 0.008683509193360806, "learning_rate": 1.5235013427961073e-10, "logits/chosen": 0.07832030951976776, "logits/rejected": 0.7002456784248352, "logps/chosen": -171.02499389648438, "logps/rejected": -942.6500244140625, "loss": 0.0442, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.177868843078613, "rewards/margins": 38.30156326293945, "rewards/rejected": -42.4921875, "step": 11780 }, { "epoch": 2.9726152585636405, "grad_norm": 1.2240608157298993e-05, "learning_rate": 1.2782300324470917e-10, "logits/chosen": 0.27564698457717896, "logits/rejected": NaN, "logps/chosen": -182.38125610351562, "logps/rejected": -931.9500122070312, "loss": 0.0223, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.872973680496216, "rewards/margins": 37.446876525878906, "rewards/rejected": -41.314064025878906, "step": 11790 }, { "epoch": 2.9751362934484606, "grad_norm": 8.410179361817427e-07, "learning_rate": 1.0544667509024274e-10, "logits/chosen": 0.257272332906723, "logits/rejected": NaN, "logps/chosen": -164.0593719482422, "logps/rejected": -945.5250244140625, "loss": 0.0022, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.8779664039611816, "rewards/margins": 39.751564025878906, "rewards/rejected": -43.610939025878906, "step": 11800 }, { "epoch": 2.977657328333281, "grad_norm": 0.034203220158815384, "learning_rate": 8.522134242294754e-11, "logits/chosen": 0.20909729599952698, "logits/rejected": 0.7791748046875, "logps/chosen": -188.6062469482422, "logps/rejected": -933.2249755859375, "loss": 0.0024, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -4.598095893859863, "rewards/margins": 36.259376525878906, "rewards/rejected": -40.83124923706055, "step": 11810 }, { "epoch": 2.980178363218101, "grad_norm": 0.006785076577216387, "learning_rate": 6.714717933464786e-11, "logits/chosen": 0.3248397707939148, "logits/rejected": 0.7961082458496094, "logps/chosen": -179.88436889648438, "logps/rejected": -948.9000244140625, "loss": 0.0045, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.345751762390137, "rewards/margins": 38.662498474121094, "rewards/rejected": -43.006248474121094, "step": 11820 }, { "epoch": 2.9826993981029215, "grad_norm": 0.0010458655888214707, "learning_rate": 5.122434140075738e-11, "logits/chosen": 0.2675323486328125, "logits/rejected": NaN, "logps/chosen": -185.40625, "logps/rejected": -986.0499877929688, "loss": 0.0084, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.479986667633057, "rewards/margins": 39.90312576293945, "rewards/rejected": -44.375, "step": 11830 }, { "epoch": 2.9852204329877416, "grad_norm": 0.26493164896965027, "learning_rate": 3.745296567886358e-11, "logits/chosen": 0.11383819580078125, "logits/rejected": NaN, "logps/chosen": -187.86874389648438, "logps/rejected": -940.9500122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.03009033203125, "rewards/margins": 37.209373474121094, "rewards/rejected": -41.23906326293945, "step": 11840 }, { "epoch": 2.987741467872562, "grad_norm": 0.0009305024286732078, "learning_rate": 2.5833170707645347e-11, "logits/chosen": 0.3660644590854645, "logits/rejected": NaN, "logps/chosen": -172.88516235351562, "logps/rejected": -921.5499877929688, "loss": 0.0186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.44769287109375, "rewards/margins": 37.349998474121094, "rewards/rejected": -41.80937576293945, "step": 11850 }, { "epoch": 2.990262502757382, "grad_norm": 0.44064122438430786, "learning_rate": 1.6365056505818208e-11, "logits/chosen": 0.36039429903030396, "logits/rejected": 0.9656478762626648, "logps/chosen": -175.71875, "logps/rejected": -932.7000122070312, "loss": 0.0043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.112744331359863, "rewards/margins": 38.860939025878906, "rewards/rejected": -42.9765625, "step": 11860 }, { "epoch": 2.992783537642202, "grad_norm": 0.002422350225970149, "learning_rate": 9.04870457124618e-12, "logits/chosen": 0.3118423521518707, "logits/rejected": 0.769439697265625, "logps/chosen": -193.60000610351562, "logps/rejected": -945.125, "loss": 0.0083, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -5.2315497398376465, "rewards/margins": 36.743751525878906, "rewards/rejected": -41.959373474121094, "step": 11870 }, { "epoch": 2.995304572527022, "grad_norm": 0.06348605453968048, "learning_rate": 3.884177880192352e-12, "logits/chosen": 0.22486266493797302, "logits/rejected": 0.9637786746025085, "logps/chosen": -175.51718139648438, "logps/rejected": -927.0999755859375, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.680468797683716, "rewards/margins": 37.599998474121094, "rewards/rejected": -41.265625, "step": 11880 }, { "epoch": 2.9978256074118423, "grad_norm": 2.0363388061523438, "learning_rate": 8.715208869580771e-13, "logits/chosen": 0.2766479551792145, "logits/rejected": NaN, "logps/chosen": -170.67813110351562, "logps/rejected": -970.5999755859375, "loss": 0.0197, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -4.145947456359863, "rewards/margins": 38.08906173706055, "rewards/rejected": -42.2421875, "step": 11890 } ], "logging_steps": 10, "max_steps": 11898, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }