{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9998424353196986, "eval_steps": 500, "global_step": 11898, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0025210348848202185, "grad_norm": 144.7147216796875, "learning_rate": 4.201680672268907e-09, "logits/chosen": -1.244873046875, "logits/rejected": -1.282617211341858, "logps/chosen": -287.54376220703125, "logps/rejected": -257.1000061035156, "loss": 0.7034, "rewards/accuracies": 0.20937499403953552, "rewards/chosen": -0.011601448059082031, "rewards/margins": -0.019524574279785156, "rewards/rejected": 0.007927512750029564, "step": 10 }, { "epoch": 0.005042069769640437, "grad_norm": 90.3583984375, "learning_rate": 8.403361344537815e-09, "logits/chosen": NaN, "logits/rejected": -1.2949707508087158, "logps/chosen": -279.6000061035156, "logps/rejected": -271.91876220703125, "loss": 0.6987, "rewards/accuracies": 0.3531250059604645, "rewards/chosen": 0.0007650375482626259, "rewards/margins": -0.00536766042932868, "rewards/rejected": 0.0061325072310864925, "step": 20 }, { "epoch": 0.007563104654460656, "grad_norm": 100.33983612060547, "learning_rate": 1.2605042016806723e-08, "logits/chosen": -1.305029273033142, "logits/rejected": -1.313232421875, "logps/chosen": -283.7906188964844, "logps/rejected": -255.85000610351562, "loss": 0.6953, "rewards/accuracies": 0.32499998807907104, "rewards/chosen": 0.003363323165103793, "rewards/margins": -0.0035388946998864412, "rewards/rejected": 0.006898784544318914, "step": 30 }, { "epoch": 0.010084139539280874, "grad_norm": 97.06249237060547, "learning_rate": 1.680672268907563e-08, "logits/chosen": -1.3307616710662842, "logits/rejected": -1.3382079601287842, "logps/chosen": -290.76873779296875, "logps/rejected": -244.93124389648438, "loss": 0.6978, "rewards/accuracies": 0.34062498807907104, "rewards/chosen": 0.0071510314010083675, "rewards/margins": 0.00169200892560184, "rewards/rejected": 0.0054794312454760075, "step": 40 }, { "epoch": 0.012605174424101093, "grad_norm": 90.40794372558594, "learning_rate": 2.1008403361344538e-08, "logits/chosen": -1.2814452648162842, "logits/rejected": -1.3154296875, "logps/chosen": -283.1499938964844, "logps/rejected": -257.88592529296875, "loss": 0.6925, "rewards/accuracies": 0.33125001192092896, "rewards/chosen": 0.0049995421431958675, "rewards/margins": 0.009321212768554688, "rewards/rejected": -0.004360771272331476, "step": 50 }, { "epoch": 0.015126209308921312, "grad_norm": 118.56312561035156, "learning_rate": 2.5210084033613446e-08, "logits/chosen": -1.2708008289337158, "logits/rejected": -1.3049805164337158, "logps/chosen": -279.1875, "logps/rejected": -260.3125, "loss": 0.6926, "rewards/accuracies": 0.34375, "rewards/chosen": 0.009726142510771751, "rewards/margins": 0.004282951354980469, "rewards/rejected": 0.005441856570541859, "step": 60 }, { "epoch": 0.01764724419374153, "grad_norm": 89.31433868408203, "learning_rate": 2.941176470588235e-08, "logits/chosen": -1.2632324695587158, "logits/rejected": -1.30419921875, "logps/chosen": -292.8812561035156, "logps/rejected": -268.70001220703125, "loss": 0.6935, "rewards/accuracies": 0.28437501192092896, "rewards/chosen": 0.012535477057099342, "rewards/margins": 0.0027713775634765625, "rewards/rejected": 0.009746169671416283, "step": 70 }, { "epoch": 0.020168279078561748, "grad_norm": 107.96665954589844, "learning_rate": 3.361344537815126e-08, "logits/chosen": -1.313720703125, "logits/rejected": -1.3276855945587158, "logps/chosen": -295.0874938964844, "logps/rejected": -256.734375, "loss": 0.6941, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": 0.01777505874633789, "rewards/margins": -0.0024406432639807463, "rewards/rejected": 0.020194053649902344, "step": 80 }, { "epoch": 0.022689313963381967, "grad_norm": 84.0765380859375, "learning_rate": 3.7815126050420164e-08, "logits/chosen": -1.294189453125, "logits/rejected": -1.278295874595642, "logps/chosen": -321.5874938964844, "logps/rejected": -261.8656311035156, "loss": 0.6892, "rewards/accuracies": 0.328125, "rewards/chosen": 0.03657836839556694, "rewards/margins": 0.01087188720703125, "rewards/rejected": 0.025716591626405716, "step": 90 }, { "epoch": 0.025210348848202186, "grad_norm": 102.03905487060547, "learning_rate": 4.2016806722689076e-08, "logits/chosen": -1.2692382335662842, "logits/rejected": -1.3091552257537842, "logps/chosen": -291.4156188964844, "logps/rejected": -268.7562561035156, "loss": 0.6936, "rewards/accuracies": 0.34687501192092896, "rewards/chosen": 0.04565773159265518, "rewards/margins": -0.0016721725696697831, "rewards/rejected": 0.04732322692871094, "step": 100 }, { "epoch": 0.027731383733022405, "grad_norm": 90.58088684082031, "learning_rate": 4.621848739495798e-08, "logits/chosen": -1.277929663658142, "logits/rejected": -1.276123046875, "logps/chosen": -299.125, "logps/rejected": -264.0, "loss": 0.6902, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.0779266357421875, "rewards/margins": 0.013805675320327282, "rewards/rejected": 0.06411953270435333, "step": 110 }, { "epoch": 0.030252418617842624, "grad_norm": 87.24566650390625, "learning_rate": 5.042016806722689e-08, "logits/chosen": -1.280419945716858, "logits/rejected": -1.312744140625, "logps/chosen": -298.75, "logps/rejected": -271.76873779296875, "loss": 0.6794, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.12237701565027237, "rewards/margins": 0.030515288934111595, "rewards/rejected": 0.09185028076171875, "step": 120 }, { "epoch": 0.03277345350266284, "grad_norm": 75.01036071777344, "learning_rate": 5.46218487394958e-08, "logits/chosen": -1.2948729991912842, "logits/rejected": -1.3290526866912842, "logps/chosen": -276.0625, "logps/rejected": -255.8562469482422, "loss": 0.692, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": 0.15773162245750427, "rewards/margins": 0.005900573916733265, "rewards/rejected": 0.1518508940935135, "step": 130 }, { "epoch": 0.03529448838748306, "grad_norm": 97.48009490966797, "learning_rate": 5.88235294117647e-08, "logits/chosen": -1.262451171875, "logits/rejected": -1.261206030845642, "logps/chosen": -281.99688720703125, "logps/rejected": -259.36248779296875, "loss": 0.6887, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": 0.21947021782398224, "rewards/margins": 0.013390731997787952, "rewards/rejected": 0.20619507133960724, "step": 140 }, { "epoch": 0.03781552327230328, "grad_norm": 72.7547836303711, "learning_rate": 6.302521008403361e-08, "logits/chosen": -1.2507812976837158, "logits/rejected": -1.2765624523162842, "logps/chosen": -271.92498779296875, "logps/rejected": -284.1656188964844, "loss": 0.6835, "rewards/accuracies": 0.453125, "rewards/chosen": 0.2730773985385895, "rewards/margins": 0.02921295166015625, "rewards/rejected": 0.243865966796875, "step": 150 }, { "epoch": 0.040336558157123496, "grad_norm": 83.03253173828125, "learning_rate": 6.722689075630252e-08, "logits/chosen": -1.237402319908142, "logits/rejected": -1.281835913658142, "logps/chosen": -276.99688720703125, "logps/rejected": -250.66250610351562, "loss": 0.6773, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.30729371309280396, "rewards/margins": 0.03227996826171875, "rewards/rejected": 0.2750183045864105, "step": 160 }, { "epoch": 0.04285759304194372, "grad_norm": 92.10990142822266, "learning_rate": 7.142857142857142e-08, "logits/chosen": -1.273535132408142, "logits/rejected": -1.3026854991912842, "logps/chosen": -293.109375, "logps/rejected": -249.16250610351562, "loss": 0.6798, "rewards/accuracies": 0.4468750059604645, "rewards/chosen": 0.37623292207717896, "rewards/margins": 0.03588562086224556, "rewards/rejected": 0.3403076231479645, "step": 170 }, { "epoch": 0.045378627926763934, "grad_norm": 99.00688934326172, "learning_rate": 7.563025210084033e-08, "logits/chosen": -1.336669921875, "logits/rejected": -1.320703148841858, "logps/chosen": -293.23126220703125, "logps/rejected": -271.29376220703125, "loss": 0.6813, "rewards/accuracies": 0.4593749940395355, "rewards/chosen": 0.43438720703125, "rewards/margins": 0.04239501804113388, "rewards/rejected": 0.391998291015625, "step": 180 }, { "epoch": 0.04789966281158416, "grad_norm": 78.80610656738281, "learning_rate": 7.983193277310923e-08, "logits/chosen": -1.259521484375, "logits/rejected": -1.280517578125, "logps/chosen": -286.9375, "logps/rejected": -270.3999938964844, "loss": 0.6701, "rewards/accuracies": 0.5, "rewards/chosen": 0.5096435546875, "rewards/margins": 0.06426696479320526, "rewards/rejected": 0.4456543028354645, "step": 190 }, { "epoch": 0.05042069769640437, "grad_norm": 79.90438842773438, "learning_rate": 8.403361344537815e-08, "logits/chosen": -1.298828125, "logits/rejected": -1.296905517578125, "logps/chosen": -262.9437561035156, "logps/rejected": -240.984375, "loss": 0.6611, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.5528808832168579, "rewards/margins": 0.07709045708179474, "rewards/rejected": 0.4760375916957855, "step": 200 }, { "epoch": 0.052941732581224595, "grad_norm": 80.46623992919922, "learning_rate": 8.823529411764706e-08, "logits/chosen": -1.301489233970642, "logits/rejected": -1.3430664539337158, "logps/chosen": -271.40625, "logps/rejected": -233.71249389648438, "loss": 0.6695, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.56744384765625, "rewards/margins": 0.06519164890050888, "rewards/rejected": 0.502685546875, "step": 210 }, { "epoch": 0.05546276746604481, "grad_norm": 93.72669982910156, "learning_rate": 9.243697478991596e-08, "logits/chosen": -1.339013695716858, "logits/rejected": -1.3197753429412842, "logps/chosen": -284.09375, "logps/rejected": -246.71875, "loss": 0.6632, "rewards/accuracies": 0.534375011920929, "rewards/chosen": 0.5970458984375, "rewards/margins": 0.07635955512523651, "rewards/rejected": 0.520922839641571, "step": 220 }, { "epoch": 0.05798380235086503, "grad_norm": 84.46882629394531, "learning_rate": 9.663865546218488e-08, "logits/chosen": -1.284082055091858, "logits/rejected": -1.331884741783142, "logps/chosen": -286.3062438964844, "logps/rejected": -239.81875610351562, "loss": 0.675, "rewards/accuracies": 0.5, "rewards/chosen": 0.6225341558456421, "rewards/margins": 0.055267333984375, "rewards/rejected": 0.5671020746231079, "step": 230 }, { "epoch": 0.06050483723568525, "grad_norm": 86.55229949951172, "learning_rate": 1.0084033613445378e-07, "logits/chosen": -1.298730492591858, "logits/rejected": -1.3097655773162842, "logps/chosen": -268.171875, "logps/rejected": -257.51251220703125, "loss": 0.6622, "rewards/accuracies": 0.528124988079071, "rewards/chosen": 0.620678722858429, "rewards/margins": 0.08840332180261612, "rewards/rejected": 0.5323852300643921, "step": 240 }, { "epoch": 0.06302587212050546, "grad_norm": 83.27806854248047, "learning_rate": 1.0504201680672269e-07, "logits/chosen": -1.290429711341858, "logits/rejected": -1.319921851158142, "logps/chosen": -277.7250061035156, "logps/rejected": -250.27188110351562, "loss": 0.6593, "rewards/accuracies": 0.559374988079071, "rewards/chosen": 0.630175769329071, "rewards/margins": 0.09326171875, "rewards/rejected": 0.53704833984375, "step": 250 }, { "epoch": 0.06554690700532569, "grad_norm": 79.56951141357422, "learning_rate": 1.092436974789916e-07, "logits/chosen": -1.2774658203125, "logits/rejected": -1.353417992591858, "logps/chosen": -281.546875, "logps/rejected": -254.2062530517578, "loss": 0.6541, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 0.6353759765625, "rewards/margins": 0.10556793212890625, "rewards/rejected": 0.5298095941543579, "step": 260 }, { "epoch": 0.06806794189014591, "grad_norm": 76.68507385253906, "learning_rate": 1.134453781512605e-07, "logits/chosen": -1.2549316883087158, "logits/rejected": -1.302832007408142, "logps/chosen": -248.0625, "logps/rejected": -250.64999389648438, "loss": 0.6652, "rewards/accuracies": 0.546875, "rewards/chosen": 0.633593738079071, "rewards/margins": 0.08696289360523224, "rewards/rejected": 0.546875, "step": 270 }, { "epoch": 0.07058897677496612, "grad_norm": 75.71800994873047, "learning_rate": 1.176470588235294e-07, "logits/chosen": -1.3131835460662842, "logits/rejected": -1.277856469154358, "logps/chosen": -288.16876220703125, "logps/rejected": -275.109375, "loss": 0.6514, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.575854480266571, "rewards/margins": 0.11370544135570526, "rewards/rejected": 0.4620361328125, "step": 280 }, { "epoch": 0.07311001165978634, "grad_norm": 83.98974609375, "learning_rate": 1.2184873949579832e-07, "logits/chosen": -1.3290283679962158, "logits/rejected": -1.357177734375, "logps/chosen": -315.25, "logps/rejected": -264.66876220703125, "loss": 0.6568, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.5980224609375, "rewards/margins": 0.11147765815258026, "rewards/rejected": 0.4864868223667145, "step": 290 }, { "epoch": 0.07563104654460656, "grad_norm": 78.99283599853516, "learning_rate": 1.2605042016806723e-07, "logits/chosen": -1.321191430091858, "logits/rejected": -1.306396484375, "logps/chosen": -313.17498779296875, "logps/rejected": -297.0874938964844, "loss": 0.6544, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.594555675983429, "rewards/margins": 0.121393583714962, "rewards/rejected": 0.47313231229782104, "step": 300 }, { "epoch": 0.07815208142942678, "grad_norm": 76.23942565917969, "learning_rate": 1.3025210084033613e-07, "logits/chosen": -1.273828148841858, "logits/rejected": -1.316625952720642, "logps/chosen": -291.34375, "logps/rejected": -279.51251220703125, "loss": 0.6524, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": 0.513500988483429, "rewards/margins": 0.13037414848804474, "rewards/rejected": 0.3830398619174957, "step": 310 }, { "epoch": 0.08067311631424699, "grad_norm": 85.54627990722656, "learning_rate": 1.3445378151260504e-07, "logits/chosen": NaN, "logits/rejected": -1.305908203125, "logps/chosen": -273.25, "logps/rejected": -257.0062561035156, "loss": 0.6564, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.4950103759765625, "rewards/margins": 0.10860347747802734, "rewards/rejected": 0.3865097165107727, "step": 320 }, { "epoch": 0.08319415119906722, "grad_norm": 79.61593627929688, "learning_rate": 1.3865546218487394e-07, "logits/chosen": -1.2723388671875, "logits/rejected": -1.293066382408142, "logps/chosen": -268.15313720703125, "logps/rejected": -250.85000610351562, "loss": 0.641, "rewards/accuracies": 0.578125, "rewards/chosen": 0.4624877870082855, "rewards/margins": 0.1453094482421875, "rewards/rejected": 0.3171554505825043, "step": 330 }, { "epoch": 0.08571518608388744, "grad_norm": 87.76576232910156, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -1.2678954601287842, "logits/rejected": -1.346899390220642, "logps/chosen": -310.12969970703125, "logps/rejected": -265.8218688964844, "loss": 0.6213, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.42416077852249146, "rewards/margins": 0.19175872206687927, "rewards/rejected": 0.2326004058122635, "step": 340 }, { "epoch": 0.08823622096870766, "grad_norm": 94.80514526367188, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -1.2676270008087158, "logits/rejected": -1.311425805091858, "logps/chosen": -270.75, "logps/rejected": -248.9375, "loss": 0.6295, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.36808472871780396, "rewards/margins": 0.174671933054924, "rewards/rejected": 0.19347229599952698, "step": 350 }, { "epoch": 0.09075725585352787, "grad_norm": 77.4742431640625, "learning_rate": 1.5126050420168066e-07, "logits/chosen": -1.245568871498108, "logits/rejected": -1.2759521007537842, "logps/chosen": -265.8125, "logps/rejected": -257.7562561035156, "loss": 0.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.266213983297348, "rewards/margins": 0.18226012587547302, "rewards/rejected": 0.08396835625171661, "step": 360 }, { "epoch": 0.09327829073834809, "grad_norm": 73.68795013427734, "learning_rate": 1.554621848739496e-07, "logits/chosen": -1.2880859375, "logits/rejected": -1.327734351158142, "logps/chosen": -273.13751220703125, "logps/rejected": -243.6125030517578, "loss": 0.6406, "rewards/accuracies": 0.625, "rewards/chosen": 0.2410484254360199, "rewards/margins": 0.16602936387062073, "rewards/rejected": 0.07509537041187286, "step": 370 }, { "epoch": 0.09579932562316831, "grad_norm": 85.77863311767578, "learning_rate": 1.5966386554621847e-07, "logits/chosen": NaN, "logits/rejected": -1.3193359375, "logps/chosen": -289.1656188964844, "logps/rejected": -253.24374389648438, "loss": 0.6078, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.19881896674633026, "rewards/margins": 0.25654298067092896, "rewards/rejected": -0.057816315442323685, "step": 380 }, { "epoch": 0.09832036050798854, "grad_norm": 69.80066680908203, "learning_rate": 1.638655462184874e-07, "logits/chosen": -1.274169921875, "logits/rejected": -1.2903320789337158, "logps/chosen": -294.56561279296875, "logps/rejected": -272.609375, "loss": 0.6149, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.19277039170265198, "rewards/margins": 0.2486732453107834, "rewards/rejected": -0.055816650390625, "step": 390 }, { "epoch": 0.10084139539280874, "grad_norm": 73.84134674072266, "learning_rate": 1.680672268907563e-07, "logits/chosen": -1.2617676258087158, "logits/rejected": -1.277001976966858, "logps/chosen": -289.4125061035156, "logps/rejected": -285.171875, "loss": 0.6195, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2807784974575043, "rewards/margins": 0.23712310194969177, "rewards/rejected": 0.04382782056927681, "step": 400 }, { "epoch": 0.10336243027762897, "grad_norm": 90.36155700683594, "learning_rate": 1.722689075630252e-07, "logits/chosen": -1.302832007408142, "logits/rejected": -1.361230492591858, "logps/chosen": -284.65625, "logps/rejected": -287.73748779296875, "loss": 0.6147, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": 0.2798774838447571, "rewards/margins": 0.2889160215854645, "rewards/rejected": -0.009383773431181908, "step": 410 }, { "epoch": 0.10588346516244919, "grad_norm": 116.59656524658203, "learning_rate": 1.764705882352941e-07, "logits/chosen": -1.2851073741912842, "logits/rejected": -1.30224609375, "logps/chosen": -279.1875, "logps/rejected": -269.29998779296875, "loss": 0.5927, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10270003974437714, "rewards/margins": 0.3286422789096832, "rewards/rejected": -0.22588805854320526, "step": 420 }, { "epoch": 0.1084045000472694, "grad_norm": 74.44758605957031, "learning_rate": 1.8067226890756302e-07, "logits/chosen": -1.3424072265625, "logits/rejected": -1.3671875, "logps/chosen": -297.32501220703125, "logps/rejected": -257.3062438964844, "loss": 0.5745, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09719619899988174, "rewards/margins": 0.38787537813186646, "rewards/rejected": -0.290719598531723, "step": 430 }, { "epoch": 0.11092553493208962, "grad_norm": 73.85236358642578, "learning_rate": 1.8487394957983192e-07, "logits/chosen": -1.315185546875, "logits/rejected": -1.316796898841858, "logps/chosen": -278.0375061035156, "logps/rejected": -264.6937561035156, "loss": 0.6022, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.14359435439109802, "rewards/margins": 0.3448944091796875, "rewards/rejected": -0.20127257704734802, "step": 440 }, { "epoch": 0.11344656981690984, "grad_norm": 84.43276977539062, "learning_rate": 1.8907563025210083e-07, "logits/chosen": -1.2849853038787842, "logits/rejected": -1.307861328125, "logps/chosen": -276.375, "logps/rejected": -273.6312561035156, "loss": 0.5859, "rewards/accuracies": 0.671875, "rewards/chosen": 0.3189895749092102, "rewards/margins": 0.39819639921188354, "rewards/rejected": -0.079370878636837, "step": 450 }, { "epoch": 0.11596760470173006, "grad_norm": 90.26596069335938, "learning_rate": 1.9327731092436976e-07, "logits/chosen": -1.2935791015625, "logits/rejected": -1.325781226158142, "logps/chosen": -317.5679626464844, "logps/rejected": -264.5874938964844, "loss": 0.6054, "rewards/accuracies": 0.65625, "rewards/chosen": 0.3327392637729645, "rewards/margins": 0.360220342874527, "rewards/rejected": -0.02756500244140625, "step": 460 }, { "epoch": 0.11848863958655027, "grad_norm": 73.2596435546875, "learning_rate": 1.9747899159663864e-07, "logits/chosen": -1.2628905773162842, "logits/rejected": -1.300537109375, "logps/chosen": -299.64373779296875, "logps/rejected": -285.57501220703125, "loss": 0.5807, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.02835845947265625, "rewards/margins": 0.433358758687973, "rewards/rejected": -0.4616455137729645, "step": 470 }, { "epoch": 0.1210096744713705, "grad_norm": 84.38887023925781, "learning_rate": 2.0168067226890757e-07, "logits/chosen": -1.215356469154358, "logits/rejected": -1.287939429283142, "logps/chosen": -274.875, "logps/rejected": -266.01251220703125, "loss": 0.5676, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.106053926050663, "rewards/margins": 0.49801331758499146, "rewards/rejected": -0.6037139892578125, "step": 480 }, { "epoch": 0.12353070935619072, "grad_norm": 89.26333618164062, "learning_rate": 2.0588235294117645e-07, "logits/chosen": -1.2712891101837158, "logits/rejected": -1.285058617591858, "logps/chosen": -298.1312561035156, "logps/rejected": -268.21875, "loss": 0.5684, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.3065994381904602, "rewards/margins": 0.5032730102539062, "rewards/rejected": -0.19690552353858948, "step": 490 }, { "epoch": 0.12605174424101093, "grad_norm": 86.56298065185547, "learning_rate": 2.1008403361344538e-07, "logits/chosen": -1.278344750404358, "logits/rejected": -1.300927758216858, "logps/chosen": -279.90625, "logps/rejected": -256.640625, "loss": 0.5294, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.02459106408059597, "rewards/margins": 0.6083038449287415, "rewards/rejected": -0.583740234375, "step": 500 }, { "epoch": 0.12857277912583115, "grad_norm": 78.020263671875, "learning_rate": 2.1428571428571426e-07, "logits/chosen": -1.270605444908142, "logits/rejected": -1.335058569908142, "logps/chosen": -265.26873779296875, "logps/rejected": -261.3812561035156, "loss": 0.5688, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5009399652481079, "rewards/margins": 0.5434356927871704, "rewards/rejected": -1.0441162586212158, "step": 510 }, { "epoch": 0.13109381401065137, "grad_norm": 68.09835052490234, "learning_rate": 2.184873949579832e-07, "logits/chosen": -1.298486351966858, "logits/rejected": -1.310644507408142, "logps/chosen": -286.26873779296875, "logps/rejected": -265.04376220703125, "loss": 0.5984, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -0.0276031494140625, "rewards/margins": 0.4960693418979645, "rewards/rejected": -0.5234416723251343, "step": 520 }, { "epoch": 0.1336148488954716, "grad_norm": 94.15994262695312, "learning_rate": 2.226890756302521e-07, "logits/chosen": -1.2277343273162842, "logits/rejected": -1.3334472179412842, "logps/chosen": -266.94061279296875, "logps/rejected": -250.85000610351562, "loss": 0.5546, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.03800354152917862, "rewards/margins": 0.599743664264679, "rewards/rejected": -0.561663806438446, "step": 530 }, { "epoch": 0.13613588378029182, "grad_norm": 89.99702453613281, "learning_rate": 2.26890756302521e-07, "logits/chosen": -1.329443335533142, "logits/rejected": -1.3420898914337158, "logps/chosen": -295.42498779296875, "logps/rejected": -259.51251220703125, "loss": 0.5859, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07108917087316513, "rewards/margins": 0.5303710699081421, "rewards/rejected": -0.6014175415039062, "step": 540 }, { "epoch": 0.13865691866511204, "grad_norm": 105.61231994628906, "learning_rate": 2.3109243697478993e-07, "logits/chosen": -1.3101074695587158, "logits/rejected": -1.319091796875, "logps/chosen": -294.79998779296875, "logps/rejected": -272.45623779296875, "loss": 0.5617, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03331298753619194, "rewards/margins": 0.595355212688446, "rewards/rejected": -0.6286590695381165, "step": 550 }, { "epoch": 0.14117795354993223, "grad_norm": 72.71495819091797, "learning_rate": 2.352941176470588e-07, "logits/chosen": -1.247802734375, "logits/rejected": -1.289453148841858, "logps/chosen": -254.91250610351562, "logps/rejected": -248.2375030517578, "loss": 0.5539, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.1997833251953125, "rewards/margins": 0.6131622195243835, "rewards/rejected": -0.4132888913154602, "step": 560 }, { "epoch": 0.14369898843475246, "grad_norm": 83.22612762451172, "learning_rate": 2.394957983193277e-07, "logits/chosen": -1.2476074695587158, "logits/rejected": -1.243749976158142, "logps/chosen": -267.5249938964844, "logps/rejected": -274.3374938964844, "loss": 0.5608, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.33015137910842896, "rewards/margins": 0.5673843622207642, "rewards/rejected": -0.8973663449287415, "step": 570 }, { "epoch": 0.14622002331957268, "grad_norm": 68.80695343017578, "learning_rate": 2.4369747899159664e-07, "logits/chosen": -1.2406494617462158, "logits/rejected": -1.292321801185608, "logps/chosen": -266.38751220703125, "logps/rejected": -275.38751220703125, "loss": 0.5623, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.2216869294643402, "rewards/margins": 0.6139160394668579, "rewards/rejected": -0.835650622844696, "step": 580 }, { "epoch": 0.1487410582043929, "grad_norm": 72.53914642333984, "learning_rate": 2.478991596638655e-07, "logits/chosen": -1.29248046875, "logits/rejected": -1.28662109375, "logps/chosen": -295.5249938964844, "logps/rejected": -256.65313720703125, "loss": 0.5886, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.18014831840991974, "rewards/margins": 0.593457043170929, "rewards/rejected": -0.773388683795929, "step": 590 }, { "epoch": 0.15126209308921312, "grad_norm": 90.98090362548828, "learning_rate": 2.5210084033613445e-07, "logits/chosen": -1.281396508216858, "logits/rejected": -1.3123047351837158, "logps/chosen": -299.625, "logps/rejected": -278.0093688964844, "loss": 0.5836, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.263723760843277, "rewards/margins": 0.5306030511856079, "rewards/rejected": -0.7945464849472046, "step": 600 }, { "epoch": 0.15378312797403335, "grad_norm": 72.77758026123047, "learning_rate": 2.5630252100840333e-07, "logits/chosen": -1.2614014148712158, "logits/rejected": -1.3181641101837158, "logps/chosen": -275.26251220703125, "logps/rejected": -263.3031311035156, "loss": 0.5572, "rewards/accuracies": 0.671875, "rewards/chosen": -0.4960670471191406, "rewards/margins": 0.58111572265625, "rewards/rejected": -1.077172875404358, "step": 610 }, { "epoch": 0.15630416285885357, "grad_norm": 106.92414855957031, "learning_rate": 2.6050420168067226e-07, "logits/chosen": -1.290771484375, "logits/rejected": -1.342529296875, "logps/chosen": -283.70001220703125, "logps/rejected": -257.7593688964844, "loss": 0.5548, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.2074592560529709, "rewards/margins": 0.618426501750946, "rewards/rejected": -0.8258941769599915, "step": 620 }, { "epoch": 0.1588251977436738, "grad_norm": 79.6308364868164, "learning_rate": 2.6470588235294114e-07, "logits/chosen": -1.259765625, "logits/rejected": NaN, "logps/chosen": -298.61248779296875, "logps/rejected": -279.70623779296875, "loss": 0.5645, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.35757750272750854, "rewards/margins": 0.611968994140625, "rewards/rejected": -0.9693847894668579, "step": 630 }, { "epoch": 0.16134623262849399, "grad_norm": 101.44197082519531, "learning_rate": 2.689075630252101e-07, "logits/chosen": -1.28271484375, "logits/rejected": -1.2960937023162842, "logps/chosen": -279.96875, "logps/rejected": -270.5093688964844, "loss": 0.5506, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.2633987367153168, "rewards/margins": 0.649487316608429, "rewards/rejected": -0.9127746820449829, "step": 640 }, { "epoch": 0.1638672675133142, "grad_norm": 76.77125549316406, "learning_rate": 2.7310924369747895e-07, "logits/chosen": -1.3201172351837158, "logits/rejected": -1.388916015625, "logps/chosen": -290.2124938964844, "logps/rejected": -272.3843688964844, "loss": 0.5504, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12568894028663635, "rewards/margins": 0.6548095941543579, "rewards/rejected": -0.5296264886856079, "step": 650 }, { "epoch": 0.16638830239813443, "grad_norm": 97.28453826904297, "learning_rate": 2.773109243697479e-07, "logits/chosen": -1.2907226085662842, "logits/rejected": -1.312597632408142, "logps/chosen": -280.70623779296875, "logps/rejected": -284.04998779296875, "loss": 0.5809, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07011871039867401, "rewards/margins": 0.598468005657196, "rewards/rejected": -0.5285194516181946, "step": 660 }, { "epoch": 0.16890933728295465, "grad_norm": 91.88009643554688, "learning_rate": 2.815126050420168e-07, "logits/chosen": -1.2091796398162842, "logits/rejected": NaN, "logps/chosen": -298.46875, "logps/rejected": -269.84375, "loss": 0.5684, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.16863402724266052, "rewards/margins": 0.6591461300849915, "rewards/rejected": -0.8283462524414062, "step": 670 }, { "epoch": 0.17143037216777487, "grad_norm": 74.19715118408203, "learning_rate": 2.857142857142857e-07, "logits/chosen": -1.261328101158142, "logits/rejected": -1.2793457508087158, "logps/chosen": -289.64373779296875, "logps/rejected": -281.83123779296875, "loss": 0.5778, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.475006103515625, "rewards/margins": 0.564221203327179, "rewards/rejected": -1.0394470691680908, "step": 680 }, { "epoch": 0.1739514070525951, "grad_norm": 75.1677017211914, "learning_rate": 2.899159663865546e-07, "logits/chosen": -1.225683569908142, "logits/rejected": -1.315527319908142, "logps/chosen": -306.6343688964844, "logps/rejected": -273.3687438964844, "loss": 0.5072, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.19892120361328125, "rewards/margins": 0.7209564447402954, "rewards/rejected": -0.919659435749054, "step": 690 }, { "epoch": 0.17647244193741532, "grad_norm": 97.35240936279297, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.290869116783142, "logits/rejected": -1.293237328529358, "logps/chosen": -264.83123779296875, "logps/rejected": -257.33123779296875, "loss": 0.6068, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.08776245266199112, "rewards/margins": 0.542065441608429, "rewards/rejected": -0.6296356320381165, "step": 700 }, { "epoch": 0.17899347682223551, "grad_norm": 73.25398254394531, "learning_rate": 2.9831932773109244e-07, "logits/chosen": -1.2614257335662842, "logits/rejected": -1.300537109375, "logps/chosen": -289.8531188964844, "logps/rejected": -278.890625, "loss": 0.5682, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28628236055374146, "rewards/margins": 0.6059295535087585, "rewards/rejected": -0.8918091058731079, "step": 710 }, { "epoch": 0.18151451170705574, "grad_norm": 82.12859344482422, "learning_rate": 3.025210084033613e-07, "logits/chosen": -1.349365234375, "logits/rejected": -1.3427734375, "logps/chosen": -290.1937561035156, "logps/rejected": -276.265625, "loss": 0.5392, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.135517880320549, "rewards/margins": 0.6761139035224915, "rewards/rejected": -0.5405243039131165, "step": 720 }, { "epoch": 0.18403554659187596, "grad_norm": 65.37681579589844, "learning_rate": 3.0672268907563024e-07, "logits/chosen": -1.2498047351837158, "logits/rejected": -1.31591796875, "logps/chosen": -278.0843811035156, "logps/rejected": -281.8125, "loss": 0.5853, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.43583983182907104, "rewards/margins": 0.5022827386856079, "rewards/rejected": -0.06654968112707138, "step": 730 }, { "epoch": 0.18655658147669618, "grad_norm": 101.68387603759766, "learning_rate": 3.109243697478992e-07, "logits/chosen": -1.2600586414337158, "logits/rejected": -1.3029296398162842, "logps/chosen": -294.7250061035156, "logps/rejected": -284.4437561035156, "loss": 0.5604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.02090301550924778, "rewards/margins": 0.6147094964981079, "rewards/rejected": -0.5933437347412109, "step": 740 }, { "epoch": 0.1890776163615164, "grad_norm": 67.09628295898438, "learning_rate": 3.1512605042016805e-07, "logits/chosen": -1.283447265625, "logits/rejected": -1.2777099609375, "logps/chosen": -276.29376220703125, "logps/rejected": -269.3812561035156, "loss": 0.5529, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.22530308365821838, "rewards/margins": 0.774169921875, "rewards/rejected": -0.9990142583847046, "step": 750 }, { "epoch": 0.19159865124633663, "grad_norm": 80.1001205444336, "learning_rate": 3.1932773109243693e-07, "logits/chosen": -1.291162133216858, "logits/rejected": -1.2841796875, "logps/chosen": -289.6000061035156, "logps/rejected": -261.45623779296875, "loss": 0.5307, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.24888916313648224, "rewards/margins": 0.6986938714981079, "rewards/rejected": -0.44995421171188354, "step": 760 }, { "epoch": 0.19411968613115685, "grad_norm": 56.933860778808594, "learning_rate": 3.2352941176470586e-07, "logits/chosen": -1.3205077648162842, "logits/rejected": -1.314550757408142, "logps/chosen": -293.04998779296875, "logps/rejected": -254.9718780517578, "loss": 0.5417, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2917343080043793, "rewards/margins": 0.7984863519668579, "rewards/rejected": -1.0905120372772217, "step": 770 }, { "epoch": 0.19664072101597707, "grad_norm": 95.23239135742188, "learning_rate": 3.277310924369748e-07, "logits/chosen": -1.2874023914337158, "logits/rejected": -1.283959984779358, "logps/chosen": -288.79376220703125, "logps/rejected": -278.95623779296875, "loss": 0.6097, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.217681884765625, "rewards/margins": 0.5911788940429688, "rewards/rejected": -0.8089507818222046, "step": 780 }, { "epoch": 0.19916175590079727, "grad_norm": 92.63106536865234, "learning_rate": 3.319327731092437e-07, "logits/chosen": -1.2746093273162842, "logits/rejected": -1.323339819908142, "logps/chosen": -303.8687438964844, "logps/rejected": -289.54376220703125, "loss": 0.5632, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.18847045302391052, "rewards/margins": 0.6622680425643921, "rewards/rejected": -0.8506530523300171, "step": 790 }, { "epoch": 0.2016827907856175, "grad_norm": 80.16415405273438, "learning_rate": 3.361344537815126e-07, "logits/chosen": -1.289941430091858, "logits/rejected": -1.2956054210662842, "logps/chosen": -277.3187561035156, "logps/rejected": -269.70001220703125, "loss": 0.5811, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3729003965854645, "rewards/margins": 0.63128662109375, "rewards/rejected": -1.0042724609375, "step": 800 }, { "epoch": 0.2042038256704377, "grad_norm": 61.04458999633789, "learning_rate": 3.403361344537815e-07, "logits/chosen": -1.283203125, "logits/rejected": -1.317968726158142, "logps/chosen": -294.71875, "logps/rejected": -259.11248779296875, "loss": 0.5079, "rewards/accuracies": 0.734375, "rewards/chosen": -0.07389144599437714, "rewards/margins": 0.750195324420929, "rewards/rejected": -0.824145495891571, "step": 810 }, { "epoch": 0.20672486055525793, "grad_norm": 73.20579528808594, "learning_rate": 3.445378151260504e-07, "logits/chosen": -1.2428710460662842, "logits/rejected": -1.3032715320587158, "logps/chosen": -293.8374938964844, "logps/rejected": -287.2562561035156, "loss": 0.5371, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.15455932915210724, "rewards/margins": 0.719097912311554, "rewards/rejected": -0.5641471743583679, "step": 820 }, { "epoch": 0.20924589544007816, "grad_norm": 119.09114074707031, "learning_rate": 3.487394957983193e-07, "logits/chosen": -1.2250244617462158, "logits/rejected": -1.2574951648712158, "logps/chosen": -293.98126220703125, "logps/rejected": -274.2875061035156, "loss": 0.5384, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.280783087015152, "rewards/margins": 0.8061981201171875, "rewards/rejected": -1.0869262218475342, "step": 830 }, { "epoch": 0.21176693032489838, "grad_norm": 78.48435974121094, "learning_rate": 3.529411764705882e-07, "logits/chosen": -1.2794921398162842, "logits/rejected": -1.2690918445587158, "logps/chosen": -274.99688720703125, "logps/rejected": -266.2250061035156, "loss": 0.5478, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.04948578029870987, "rewards/margins": 0.789715588092804, "rewards/rejected": -0.8386299014091492, "step": 840 }, { "epoch": 0.2142879652097186, "grad_norm": 80.56698608398438, "learning_rate": 3.5714285714285716e-07, "logits/chosen": -1.275170922279358, "logits/rejected": -1.351904273033142, "logps/chosen": -287.8187561035156, "logps/rejected": -246.4187469482422, "loss": 0.5376, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": 0.43180543184280396, "rewards/margins": 0.683911144733429, "rewards/rejected": -0.2513900697231293, "step": 850 }, { "epoch": 0.2168090000945388, "grad_norm": 83.84162139892578, "learning_rate": 3.6134453781512604e-07, "logits/chosen": -1.247167944908142, "logits/rejected": -1.2339355945587158, "logps/chosen": -279.45001220703125, "logps/rejected": -255.4562530517578, "loss": 0.5869, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.07878265529870987, "rewards/margins": 0.677410900592804, "rewards/rejected": -0.5982635617256165, "step": 860 }, { "epoch": 0.21933003497935902, "grad_norm": 71.6579818725586, "learning_rate": 3.655462184873949e-07, "logits/chosen": -1.259765625, "logits/rejected": -1.245581030845642, "logps/chosen": -287.58123779296875, "logps/rejected": -255.28750610351562, "loss": 0.5896, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.03903808444738388, "rewards/margins": 0.5729767084121704, "rewards/rejected": -0.5338500738143921, "step": 870 }, { "epoch": 0.22185106986417924, "grad_norm": 75.61174774169922, "learning_rate": 3.6974789915966385e-07, "logits/chosen": -1.290307641029358, "logits/rejected": -1.34326171875, "logps/chosen": -319.64373779296875, "logps/rejected": -292.51251220703125, "loss": 0.5762, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.2825942933559418, "rewards/margins": 0.6470428705215454, "rewards/rejected": -0.9297851324081421, "step": 880 }, { "epoch": 0.22437210474899946, "grad_norm": 74.02259826660156, "learning_rate": 3.739495798319328e-07, "logits/chosen": -1.2446715831756592, "logits/rejected": -1.341577172279358, "logps/chosen": -306.1875, "logps/rejected": -257.70623779296875, "loss": 0.5545, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5517059564590454, "rewards/margins": 0.668322741985321, "rewards/rejected": -1.219885230064392, "step": 890 }, { "epoch": 0.22689313963381968, "grad_norm": 79.9630126953125, "learning_rate": 3.7815126050420166e-07, "logits/chosen": -1.255957007408142, "logits/rejected": -1.2941405773162842, "logps/chosen": -301.47186279296875, "logps/rejected": -265.76251220703125, "loss": 0.5639, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -1.028466820716858, "rewards/margins": 0.7416824102401733, "rewards/rejected": -1.7698242664337158, "step": 900 }, { "epoch": 0.2294141745186399, "grad_norm": 77.22490692138672, "learning_rate": 3.8235294117647053e-07, "logits/chosen": -1.249658226966858, "logits/rejected": -1.2742187976837158, "logps/chosen": -279.21875, "logps/rejected": -259.8125, "loss": 0.585, "rewards/accuracies": 0.6468750238418579, "rewards/chosen": -1.0482285022735596, "rewards/margins": 0.6260131597518921, "rewards/rejected": -1.674279808998108, "step": 910 }, { "epoch": 0.23193520940346013, "grad_norm": 68.81954956054688, "learning_rate": 3.865546218487395e-07, "logits/chosen": -1.274072289466858, "logits/rejected": -1.309667944908142, "logps/chosen": -276.625, "logps/rejected": -259.3656311035156, "loss": 0.5303, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.28308868408203125, "rewards/margins": 0.7613174319267273, "rewards/rejected": -1.044244408607483, "step": 920 }, { "epoch": 0.23445624428828035, "grad_norm": 68.08296203613281, "learning_rate": 3.907563025210084e-07, "logits/chosen": -1.2756836414337158, "logits/rejected": -1.3571288585662842, "logps/chosen": -301.5, "logps/rejected": -261.21875, "loss": 0.5026, "rewards/accuracies": 0.721875011920929, "rewards/chosen": 0.04568786546587944, "rewards/margins": 0.817272961139679, "rewards/rejected": -0.7716888189315796, "step": 930 }, { "epoch": 0.23697727917310055, "grad_norm": 102.95132446289062, "learning_rate": 3.949579831932773e-07, "logits/chosen": -1.247900366783142, "logits/rejected": -1.289941430091858, "logps/chosen": -299.3374938964844, "logps/rejected": -267.14373779296875, "loss": 0.5384, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.35980987548828125, "rewards/margins": 0.7607055902481079, "rewards/rejected": -1.120202660560608, "step": 940 }, { "epoch": 0.23949831405792077, "grad_norm": 96.6849365234375, "learning_rate": 3.991596638655462e-07, "logits/chosen": -1.2032959461212158, "logits/rejected": -1.236962914466858, "logps/chosen": -278.64373779296875, "logps/rejected": -262.89373779296875, "loss": 0.5637, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.323556512594223, "rewards/margins": 0.7710052728652954, "rewards/rejected": -1.095068335533142, "step": 950 }, { "epoch": 0.242019348942741, "grad_norm": 61.88032150268555, "learning_rate": 4.0336134453781514e-07, "logits/chosen": -1.254980444908142, "logits/rejected": -1.2530028820037842, "logps/chosen": -294.7437438964844, "logps/rejected": -257.9437561035156, "loss": 0.5456, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.07322387397289276, "rewards/margins": 0.7105438113212585, "rewards/rejected": -0.7835647463798523, "step": 960 }, { "epoch": 0.24454038382756121, "grad_norm": 76.08362579345703, "learning_rate": 4.07563025210084e-07, "logits/chosen": -1.26416015625, "logits/rejected": -1.3228271007537842, "logps/chosen": -286.6273498535156, "logps/rejected": -273.36248779296875, "loss": 0.5411, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.09781646728515625, "rewards/margins": 0.781726062297821, "rewards/rejected": -0.6835899353027344, "step": 970 }, { "epoch": 0.24706141871238144, "grad_norm": 79.66098022460938, "learning_rate": 4.117647058823529e-07, "logits/chosen": -1.285864233970642, "logits/rejected": -1.3110840320587158, "logps/chosen": -290.21875, "logps/rejected": -267.29376220703125, "loss": 0.5379, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.03505401685833931, "rewards/margins": 0.8145843744277954, "rewards/rejected": -0.8496490716934204, "step": 980 }, { "epoch": 0.24958245359720166, "grad_norm": 88.81765747070312, "learning_rate": 4.159663865546218e-07, "logits/chosen": -1.2433593273162842, "logits/rejected": -1.279296875, "logps/chosen": -270.4750061035156, "logps/rejected": -264.48748779296875, "loss": 0.5104, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.21263274550437927, "rewards/margins": 0.974438488483429, "rewards/rejected": -1.1868622303009033, "step": 990 }, { "epoch": 0.25210348848202185, "grad_norm": 94.12862396240234, "learning_rate": 4.2016806722689076e-07, "logits/chosen": -1.2711670398712158, "logits/rejected": -1.3352539539337158, "logps/chosen": -300.98126220703125, "logps/rejected": -301.36248779296875, "loss": 0.5281, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.40046995878219604, "rewards/margins": 0.9420257806777954, "rewards/rejected": -1.3425171375274658, "step": 1000 }, { "epoch": 0.2546245233668421, "grad_norm": 76.39344024658203, "learning_rate": 4.2436974789915964e-07, "logits/chosen": -1.215795874595642, "logits/rejected": -1.247656226158142, "logps/chosen": -274.0, "logps/rejected": -274.6499938964844, "loss": 0.5548, "rewards/accuracies": 0.684374988079071, "rewards/chosen": 0.12611083686351776, "rewards/margins": 0.807659924030304, "rewards/rejected": -0.6822448968887329, "step": 1010 }, { "epoch": 0.2571455582516623, "grad_norm": 62.967628479003906, "learning_rate": 4.285714285714285e-07, "logits/chosen": -1.2568359375, "logits/rejected": -1.3361084461212158, "logps/chosen": -298.0718688964844, "logps/rejected": -263.27813720703125, "loss": 0.5717, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.04311523586511612, "rewards/margins": 0.7250305414199829, "rewards/rejected": -0.682232677936554, "step": 1020 }, { "epoch": 0.25966659313648255, "grad_norm": 72.51446533203125, "learning_rate": 4.327731092436975e-07, "logits/chosen": -1.292822241783142, "logits/rejected": -1.268945336341858, "logps/chosen": -294.0249938964844, "logps/rejected": -285.58123779296875, "loss": 0.5328, "rewards/accuracies": 0.703125, "rewards/chosen": 0.2802795469760895, "rewards/margins": 0.7537078857421875, "rewards/rejected": -0.47330933809280396, "step": 1030 }, { "epoch": 0.26218762802130274, "grad_norm": 110.47750091552734, "learning_rate": 4.369747899159664e-07, "logits/chosen": NaN, "logits/rejected": NaN, "logps/chosen": -270.46875, "logps/rejected": -256.91876220703125, "loss": 0.5586, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.0010284424060955644, "rewards/margins": 0.735827624797821, "rewards/rejected": -0.737213134765625, "step": 1040 }, { "epoch": 0.26470866290612294, "grad_norm": 99.0147476196289, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -1.225927710533142, "logits/rejected": -1.264062523841858, "logps/chosen": -284.73126220703125, "logps/rejected": -259.83123779296875, "loss": 0.5227, "rewards/accuracies": 0.71875, "rewards/chosen": -0.999798595905304, "rewards/margins": 0.890423595905304, "rewards/rejected": -1.889714002609253, "step": 1050 }, { "epoch": 0.2672296977909432, "grad_norm": 82.93985748291016, "learning_rate": 4.453781512605042e-07, "logits/chosen": -1.226342797279358, "logits/rejected": -1.2687499523162842, "logps/chosen": -262.21875, "logps/rejected": -264.4437561035156, "loss": 0.5703, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.681231677532196, "rewards/margins": 0.7460876703262329, "rewards/rejected": -1.4273468255996704, "step": 1060 }, { "epoch": 0.2697507326757634, "grad_norm": 88.91585540771484, "learning_rate": 4.495798319327731e-07, "logits/chosen": -1.2632324695587158, "logits/rejected": -1.2573730945587158, "logps/chosen": -303.3687438964844, "logps/rejected": -292.6625061035156, "loss": 0.5432, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.758679211139679, "rewards/margins": 0.8320068120956421, "rewards/rejected": -1.590661644935608, "step": 1070 }, { "epoch": 0.27227176756058363, "grad_norm": 76.33614349365234, "learning_rate": 4.53781512605042e-07, "logits/chosen": -1.189550757408142, "logits/rejected": NaN, "logps/chosen": -317.40313720703125, "logps/rejected": -271.9312438964844, "loss": 0.582, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5956023931503296, "rewards/margins": 0.6631225347518921, "rewards/rejected": -1.2587738037109375, "step": 1080 }, { "epoch": 0.2747928024454038, "grad_norm": 83.96426391601562, "learning_rate": 4.579831932773109e-07, "logits/chosen": -1.24169921875, "logits/rejected": -1.2775390148162842, "logps/chosen": -276.65625, "logps/rejected": -270.703125, "loss": 0.5503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7082275152206421, "rewards/margins": 0.714599609375, "rewards/rejected": -1.4234130382537842, "step": 1090 }, { "epoch": 0.2773138373302241, "grad_norm": 65.58870697021484, "learning_rate": 4.6218487394957986e-07, "logits/chosen": -1.245947241783142, "logits/rejected": -1.291894555091858, "logps/chosen": -319.10626220703125, "logps/rejected": -290.3062438964844, "loss": 0.5692, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.990826427936554, "rewards/margins": 0.6439148187637329, "rewards/rejected": -1.6346435546875, "step": 1100 }, { "epoch": 0.2798348722150443, "grad_norm": 69.3920669555664, "learning_rate": 4.6638655462184874e-07, "logits/chosen": -1.219946265220642, "logits/rejected": -1.2772216796875, "logps/chosen": -292.7906188964844, "logps/rejected": -255.46875, "loss": 0.5544, "rewards/accuracies": 0.6875, "rewards/chosen": -0.80059814453125, "rewards/margins": 0.734448254108429, "rewards/rejected": -1.534936547279358, "step": 1110 }, { "epoch": 0.28235590709986447, "grad_norm": 61.433929443359375, "learning_rate": 4.705882352941176e-07, "logits/chosen": -1.2393310070037842, "logits/rejected": -1.2877686023712158, "logps/chosen": -286.4937438964844, "logps/rejected": -263.48126220703125, "loss": 0.5409, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4710235595703125, "rewards/margins": 0.8074401617050171, "rewards/rejected": -1.278076171875, "step": 1120 }, { "epoch": 0.2848769419846847, "grad_norm": 77.4461898803711, "learning_rate": 4.747899159663865e-07, "logits/chosen": -1.238745093345642, "logits/rejected": -1.228674292564392, "logps/chosen": -318.1499938964844, "logps/rejected": -300.8687438964844, "loss": 0.5577, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3881072998046875, "rewards/margins": 0.819897472858429, "rewards/rejected": -1.2079254388809204, "step": 1130 }, { "epoch": 0.2873979768695049, "grad_norm": 60.04735565185547, "learning_rate": 4.789915966386554e-07, "logits/chosen": -1.271484375, "logits/rejected": -1.2472107410430908, "logps/chosen": -286.38751220703125, "logps/rejected": -267.48748779296875, "loss": 0.5506, "rewards/accuracies": 0.6875, "rewards/chosen": -0.02253875695168972, "rewards/margins": 0.913525402545929, "rewards/rejected": -0.9367294311523438, "step": 1140 }, { "epoch": 0.28991901175432516, "grad_norm": 73.21792602539062, "learning_rate": 4.831932773109244e-07, "logits/chosen": -1.256591796875, "logits/rejected": -1.2744140625, "logps/chosen": -247.07186889648438, "logps/rejected": -249.0281219482422, "loss": 0.5595, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.13423919677734375, "rewards/margins": 0.7934204339981079, "rewards/rejected": -0.9277893304824829, "step": 1150 }, { "epoch": 0.29244004663914536, "grad_norm": 87.73804473876953, "learning_rate": 4.873949579831933e-07, "logits/chosen": -1.2108154296875, "logits/rejected": -1.281152367591858, "logps/chosen": -293.52813720703125, "logps/rejected": -270.58123779296875, "loss": 0.5719, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6134201288223267, "rewards/margins": 0.809191882610321, "rewards/rejected": -1.4230468273162842, "step": 1160 }, { "epoch": 0.2949610815239656, "grad_norm": 72.05789947509766, "learning_rate": 4.915966386554621e-07, "logits/chosen": -1.179834008216858, "logits/rejected": -1.2007324695587158, "logps/chosen": -309.3062438964844, "logps/rejected": -277.39373779296875, "loss": 0.5744, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -1.015649437904358, "rewards/margins": 0.764697253704071, "rewards/rejected": -1.779718041419983, "step": 1170 }, { "epoch": 0.2974821164087858, "grad_norm": 72.16474914550781, "learning_rate": 4.95798319327731e-07, "logits/chosen": -1.264892578125, "logits/rejected": -1.2868163585662842, "logps/chosen": -291.6312561035156, "logps/rejected": -277.23748779296875, "loss": 0.5899, "rewards/accuracies": 0.671875, "rewards/chosen": -0.49605637788772583, "rewards/margins": 0.6390106081962585, "rewards/rejected": -1.1349884271621704, "step": 1180 }, { "epoch": 0.30000315129360605, "grad_norm": 89.52666473388672, "learning_rate": 5e-07, "logits/chosen": -1.2664062976837158, "logits/rejected": -1.28076171875, "logps/chosen": -311.8187561035156, "logps/rejected": -287.5625, "loss": 0.4778, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": 0.05623779445886612, "rewards/margins": 1.046759009361267, "rewards/rejected": -0.9906204342842102, "step": 1190 }, { "epoch": 0.30252418617842625, "grad_norm": 80.31555938720703, "learning_rate": 4.999989240484344e-07, "logits/chosen": -1.2333495616912842, "logits/rejected": -1.224755883216858, "logps/chosen": -294.96875, "logps/rejected": -292.45623779296875, "loss": 0.5438, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5280731320381165, "rewards/margins": 0.948699951171875, "rewards/rejected": -1.477178931236267, "step": 1200 }, { "epoch": 0.30504522106324644, "grad_norm": 76.8759765625, "learning_rate": 4.999956962029988e-07, "logits/chosen": -1.238378882408142, "logits/rejected": -1.2504394054412842, "logps/chosen": -295.5874938964844, "logps/rejected": -296.97186279296875, "loss": 0.5663, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.9184936285018921, "rewards/margins": 0.7978515625, "rewards/rejected": -1.716162085533142, "step": 1210 }, { "epoch": 0.3075662559480667, "grad_norm": 73.19841003417969, "learning_rate": 4.999903164914773e-07, "logits/chosen": -1.2347412109375, "logits/rejected": -1.262109398841858, "logps/chosen": -265.07501220703125, "logps/rejected": -253.00936889648438, "loss": 0.4706, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.09563598781824112, "rewards/margins": 1.0638549327850342, "rewards/rejected": -1.1590576171875, "step": 1220 }, { "epoch": 0.3100872908328869, "grad_norm": 76.41851806640625, "learning_rate": 4.999827849601764e-07, "logits/chosen": -1.217748999595642, "logits/rejected": -1.246728539466858, "logps/chosen": -298.8812561035156, "logps/rejected": -296.73126220703125, "loss": 0.5161, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.3525436520576477, "rewards/margins": 0.919293224811554, "rewards/rejected": -1.2719390392303467, "step": 1230 }, { "epoch": 0.31260832571770714, "grad_norm": 72.04425048828125, "learning_rate": 4.999731016739247e-07, "logits/chosen": -1.201635718345642, "logits/rejected": -1.237548828125, "logps/chosen": -263.76873779296875, "logps/rejected": -271.63751220703125, "loss": 0.5377, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.5170654058456421, "rewards/margins": 0.9412124752998352, "rewards/rejected": -1.4580810070037842, "step": 1240 }, { "epoch": 0.31512936060252733, "grad_norm": 81.7342300415039, "learning_rate": 4.99961266716072e-07, "logits/chosen": -1.230615258216858, "logits/rejected": -1.2393066883087158, "logps/chosen": -270.1812438964844, "logps/rejected": -244.5124969482422, "loss": 0.5097, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.07908324897289276, "rewards/margins": 1.05462646484375, "rewards/rejected": -1.1331360340118408, "step": 1250 }, { "epoch": 0.3176503954873476, "grad_norm": 77.84716796875, "learning_rate": 4.999472801884891e-07, "logits/chosen": -1.26123046875, "logits/rejected": -1.281103491783142, "logps/chosen": -287.46875, "logps/rejected": -265.29376220703125, "loss": 0.5473, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27579039335250854, "rewards/margins": 1.008734107017517, "rewards/rejected": -1.283843994140625, "step": 1260 }, { "epoch": 0.3201714303721678, "grad_norm": 79.17774200439453, "learning_rate": 4.999311422115667e-07, "logits/chosen": -1.2472655773162842, "logits/rejected": -1.268945336341858, "logps/chosen": -291.84063720703125, "logps/rejected": -275.8374938964844, "loss": 0.5277, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.46207427978515625, "rewards/margins": 1.02911376953125, "rewards/rejected": -1.4900543689727783, "step": 1270 }, { "epoch": 0.32269246525698797, "grad_norm": 73.71128845214844, "learning_rate": 4.99912852924214e-07, "logits/chosen": -1.22265625, "logits/rejected": -1.2743408679962158, "logps/chosen": -275.86248779296875, "logps/rejected": -257.4312438964844, "loss": 0.5621, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.861004650592804, "rewards/margins": 0.884442150592804, "rewards/rejected": -1.7460525035858154, "step": 1280 }, { "epoch": 0.3252135001418082, "grad_norm": 72.18524169921875, "learning_rate": 4.998924124838582e-07, "logits/chosen": -1.2352294921875, "logits/rejected": -1.1897461414337158, "logps/chosen": -286.125, "logps/rejected": -284.8687438964844, "loss": 0.5683, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.728625476360321, "rewards/margins": 0.8816589117050171, "rewards/rejected": -1.610467553138733, "step": 1290 }, { "epoch": 0.3277345350266284, "grad_norm": 87.51490783691406, "learning_rate": 4.99869821066443e-07, "logits/chosen": -1.231103539466858, "logits/rejected": -1.284033179283142, "logps/chosen": -290.26873779296875, "logps/rejected": -262.1000061035156, "loss": 0.5934, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.03660278394818306, "rewards/margins": 0.6874328851699829, "rewards/rejected": -0.7236099243164062, "step": 1300 }, { "epoch": 0.33025556991144867, "grad_norm": 90.1872329711914, "learning_rate": 4.998450788664262e-07, "logits/chosen": -1.2363770008087158, "logits/rejected": -1.287353515625, "logps/chosen": -289.62188720703125, "logps/rejected": -279.73126220703125, "loss": 0.5287, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.158843994140625, "rewards/margins": 0.8626343011856079, "rewards/rejected": -0.7040694952011108, "step": 1310 }, { "epoch": 0.33277660479626886, "grad_norm": 114.13217163085938, "learning_rate": 4.998181860967792e-07, "logits/chosen": -1.1602051258087158, "logits/rejected": -1.217187523841858, "logps/chosen": -284.8125, "logps/rejected": -261.71563720703125, "loss": 0.5191, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2588607668876648, "rewards/margins": 0.930255115032196, "rewards/rejected": -1.189550757408142, "step": 1320 }, { "epoch": 0.3352976396810891, "grad_norm": 71.58808898925781, "learning_rate": 4.997891429889845e-07, "logits/chosen": -1.2212402820587158, "logits/rejected": -1.2065918445587158, "logps/chosen": -291.2562561035156, "logps/rejected": -277.01251220703125, "loss": 0.572, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5848525762557983, "rewards/margins": 0.8794952630996704, "rewards/rejected": -1.464563012123108, "step": 1330 }, { "epoch": 0.3378186745659093, "grad_norm": 63.032630920410156, "learning_rate": 4.997579497930341e-07, "logits/chosen": -1.193566918373108, "logits/rejected": -1.198278784751892, "logps/chosen": -313.0531311035156, "logps/rejected": -278.10626220703125, "loss": 0.528, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.6841018795967102, "rewards/margins": 1.0489501953125, "rewards/rejected": -1.733422875404358, "step": 1340 }, { "epoch": 0.3403397094507295, "grad_norm": 78.56951904296875, "learning_rate": 4.997246067774266e-07, "logits/chosen": -1.218603491783142, "logits/rejected": -1.226709008216858, "logps/chosen": -285.6812438964844, "logps/rejected": -266.7875061035156, "loss": 0.5302, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.16157226264476776, "rewards/margins": 1.0615661144256592, "rewards/rejected": -1.223168969154358, "step": 1350 }, { "epoch": 0.34286074433554975, "grad_norm": 76.92902374267578, "learning_rate": 4.99689114229166e-07, "logits/chosen": -1.2064940929412842, "logits/rejected": -1.243896484375, "logps/chosen": -282.04376220703125, "logps/rejected": -265.60626220703125, "loss": 0.4936, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.0020385743118822575, "rewards/margins": 1.1804535388946533, "rewards/rejected": -1.181549072265625, "step": 1360 }, { "epoch": 0.34538177922036994, "grad_norm": 56.34629440307617, "learning_rate": 4.996514724537585e-07, "logits/chosen": -1.178930640220642, "logits/rejected": -1.2376220226287842, "logps/chosen": -306.2124938964844, "logps/rejected": -279.7749938964844, "loss": 0.5027, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7499847412109375, "rewards/margins": 1.110345482826233, "rewards/rejected": -1.86083984375, "step": 1370 }, { "epoch": 0.3479028141051902, "grad_norm": 60.09955596923828, "learning_rate": 4.996116817752096e-07, "logits/chosen": -1.16754150390625, "logits/rejected": -1.201391577720642, "logps/chosen": -301.88751220703125, "logps/rejected": -297.83123779296875, "loss": 0.5366, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8794525265693665, "rewards/margins": 1.064611792564392, "rewards/rejected": -1.9426391124725342, "step": 1380 }, { "epoch": 0.3504238489900104, "grad_norm": 63.94362258911133, "learning_rate": 4.995697425360223e-07, "logits/chosen": -1.216455101966858, "logits/rejected": -1.1755859851837158, "logps/chosen": -284.35626220703125, "logps/rejected": -255.9187469482422, "loss": 0.5926, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7036636471748352, "rewards/margins": 0.81256103515625, "rewards/rejected": -1.5166473388671875, "step": 1390 }, { "epoch": 0.35294488387483064, "grad_norm": 109.69884490966797, "learning_rate": 4.995256550971933e-07, "logits/chosen": -1.262841820716858, "logits/rejected": -1.2425537109375, "logps/chosen": -299.10626220703125, "logps/rejected": -281.3687438964844, "loss": 0.5305, "rewards/accuracies": 0.71875, "rewards/chosen": -0.377114862203598, "rewards/margins": 0.9460830688476562, "rewards/rejected": -1.323400855064392, "step": 1400 }, { "epoch": 0.35546591875965083, "grad_norm": 80.09408569335938, "learning_rate": 4.9947941983821e-07, "logits/chosen": -1.225195288658142, "logits/rejected": -1.2828369140625, "logps/chosen": -275.29998779296875, "logps/rejected": -276.2562561035156, "loss": 0.5974, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.24356994032859802, "rewards/margins": 0.79180908203125, "rewards/rejected": -1.0354797840118408, "step": 1410 }, { "epoch": 0.35798695364447103, "grad_norm": 60.25597381591797, "learning_rate": 4.994310371570477e-07, "logits/chosen": -1.2249755859375, "logits/rejected": -1.2291991710662842, "logps/chosen": -277.26873779296875, "logps/rejected": -263.29376220703125, "loss": 0.5328, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": 0.07745361328125, "rewards/margins": 0.9789673089981079, "rewards/rejected": -0.901257336139679, "step": 1420 }, { "epoch": 0.3605079885292913, "grad_norm": 60.347469329833984, "learning_rate": 4.993805074701659e-07, "logits/chosen": -1.164282202720642, "logits/rejected": -1.2513916492462158, "logps/chosen": -299.3656311035156, "logps/rejected": -283.31561279296875, "loss": 0.5581, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.4916442930698395, "rewards/margins": 0.9429168701171875, "rewards/rejected": -1.4341919422149658, "step": 1430 }, { "epoch": 0.3630290234141115, "grad_norm": 81.96858978271484, "learning_rate": 4.993278312125045e-07, "logits/chosen": -1.2378418445587158, "logits/rejected": -1.256103515625, "logps/chosen": -295.25, "logps/rejected": -272.72186279296875, "loss": 0.569, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0679504871368408, "rewards/margins": 0.987762451171875, "rewards/rejected": -2.055981397628784, "step": 1440 }, { "epoch": 0.3655500582989317, "grad_norm": 76.29911804199219, "learning_rate": 4.992730088374802e-07, "logits/chosen": -1.215185523033142, "logits/rejected": -1.2369873523712158, "logps/chosen": -307.68438720703125, "logps/rejected": -293.7250061035156, "loss": 0.5266, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.8915435671806335, "rewards/margins": 1.090203881263733, "rewards/rejected": -1.9816772937774658, "step": 1450 }, { "epoch": 0.3680710931837519, "grad_norm": 70.62122344970703, "learning_rate": 4.992160408169828e-07, "logits/chosen": -1.286474585533142, "logits/rejected": -1.270532250404358, "logps/chosen": -287.875, "logps/rejected": -276.76251220703125, "loss": 0.5476, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.19958190619945526, "rewards/margins": 0.9004150629043579, "rewards/rejected": -1.1003234386444092, "step": 1460 }, { "epoch": 0.37059212806857217, "grad_norm": 74.41858673095703, "learning_rate": 4.991569276413711e-07, "logits/chosen": -1.216650366783142, "logits/rejected": -1.264550805091858, "logps/chosen": -312.46875, "logps/rejected": -291.67498779296875, "loss": 0.5291, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3129440248012543, "rewards/margins": 1.09747314453125, "rewards/rejected": -1.4101440906524658, "step": 1470 }, { "epoch": 0.37311316295339236, "grad_norm": 81.14917755126953, "learning_rate": 4.990956698194681e-07, "logits/chosen": -1.228118896484375, "logits/rejected": -1.2777588367462158, "logps/chosen": -272.15625, "logps/rejected": -272.5062561035156, "loss": 0.5202, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.8779250979423523, "rewards/margins": 1.0378296375274658, "rewards/rejected": -1.9163818359375, "step": 1480 }, { "epoch": 0.3756341978382126, "grad_norm": 87.38745880126953, "learning_rate": 4.990322678785578e-07, "logits/chosen": -1.238916039466858, "logits/rejected": -1.2570068836212158, "logps/chosen": -297.91876220703125, "logps/rejected": -295.28125, "loss": 0.4919, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.1091797351837158, "rewards/margins": 1.247705101966858, "rewards/rejected": -2.355029344558716, "step": 1490 }, { "epoch": 0.3781552327230328, "grad_norm": 74.2760009765625, "learning_rate": 4.989667223643792e-07, "logits/chosen": -1.2134277820587158, "logits/rejected": -1.200659155845642, "logps/chosen": -301.6499938964844, "logps/rejected": -299.79998779296875, "loss": 0.5682, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -1.2309494018554688, "rewards/margins": 1.127984642982483, "rewards/rejected": -2.3590087890625, "step": 1500 }, { "epoch": 0.380676267607853, "grad_norm": 70.29530334472656, "learning_rate": 4.988990338411229e-07, "logits/chosen": -1.2376220226287842, "logits/rejected": -1.272760033607483, "logps/chosen": -320.3062438964844, "logps/rejected": -278.5874938964844, "loss": 0.5067, "rewards/accuracies": 0.71875, "rewards/chosen": -1.02947998046875, "rewards/margins": 1.2171509265899658, "rewards/rejected": -2.24700927734375, "step": 1510 }, { "epoch": 0.38319730249267325, "grad_norm": 44.722530364990234, "learning_rate": 4.988292028914254e-07, "logits/chosen": -1.245141625404358, "logits/rejected": -1.253173828125, "logps/chosen": -284.7718811035156, "logps/rejected": -275.0375061035156, "loss": 0.5143, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5649795532226562, "rewards/margins": 1.0688445568084717, "rewards/rejected": -1.633752465248108, "step": 1520 }, { "epoch": 0.38571833737749345, "grad_norm": 86.90478515625, "learning_rate": 4.987572301163644e-07, "logits/chosen": -1.238378882408142, "logits/rejected": -1.230688452720642, "logps/chosen": -303.03125, "logps/rejected": -274.4312438964844, "loss": 0.5151, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.41166990995407104, "rewards/margins": 1.1161895990371704, "rewards/rejected": -1.527093529701233, "step": 1530 }, { "epoch": 0.3882393722623137, "grad_norm": 78.6090087890625, "learning_rate": 4.986831161354537e-07, "logits/chosen": -1.23388671875, "logits/rejected": -1.258203148841858, "logps/chosen": -283.61248779296875, "logps/rejected": -279.29376220703125, "loss": 0.5023, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.6829506158828735, "rewards/margins": 1.1151854991912842, "rewards/rejected": -1.79736328125, "step": 1540 }, { "epoch": 0.3907604071471339, "grad_norm": 111.22064971923828, "learning_rate": 4.986068615866377e-07, "logits/chosen": -1.1684691905975342, "logits/rejected": -1.2122070789337158, "logps/chosen": -292.04998779296875, "logps/rejected": -285.5687561035156, "loss": 0.5618, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.413818359375, "rewards/margins": 1.0403320789337158, "rewards/rejected": -2.4546875953674316, "step": 1550 }, { "epoch": 0.39328144203195414, "grad_norm": 106.2069320678711, "learning_rate": 4.985284671262863e-07, "logits/chosen": -1.1885986328125, "logits/rejected": -1.222631812095642, "logps/chosen": -300.015625, "logps/rejected": -273.89373779296875, "loss": 0.5136, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3954284191131592, "rewards/margins": 1.0596771240234375, "rewards/rejected": -2.454785108566284, "step": 1560 }, { "epoch": 0.39580247691677434, "grad_norm": 56.032257080078125, "learning_rate": 4.984479334291882e-07, "logits/chosen": -1.2393615245819092, "logits/rejected": -1.23095703125, "logps/chosen": -289.7250061035156, "logps/rejected": -294.17498779296875, "loss": 0.5326, "rewards/accuracies": 0.703125, "rewards/chosen": -0.7645019292831421, "rewards/margins": 1.0052673816680908, "rewards/rejected": -1.7699096202850342, "step": 1570 }, { "epoch": 0.39832351180159453, "grad_norm": 79.32884216308594, "learning_rate": 4.983652611885465e-07, "logits/chosen": -1.1991698741912842, "logits/rejected": -1.258935570716858, "logps/chosen": -261.4125061035156, "logps/rejected": -287.96875, "loss": 0.4822, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.258187860250473, "rewards/margins": 1.1524276733398438, "rewards/rejected": -1.4100220203399658, "step": 1580 }, { "epoch": 0.4008445466864148, "grad_norm": 103.19246673583984, "learning_rate": 4.982804511159718e-07, "logits/chosen": -1.2248046398162842, "logits/rejected": -1.2166839838027954, "logps/chosen": -308.65936279296875, "logps/rejected": -285.4375, "loss": 0.5429, "rewards/accuracies": 0.703125, "rewards/chosen": -0.41632384061813354, "rewards/margins": 0.9964599609375, "rewards/rejected": -1.4127594232559204, "step": 1590 }, { "epoch": 0.403365581571235, "grad_norm": 67.15472412109375, "learning_rate": 4.981935039414763e-07, "logits/chosen": -1.2517821788787842, "logits/rejected": -1.2402832508087158, "logps/chosen": -296.7562561035156, "logps/rejected": -255.85000610351562, "loss": 0.5903, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.444192498922348, "rewards/margins": 1.0130493640899658, "rewards/rejected": -1.456445336341858, "step": 1600 }, { "epoch": 0.4058866164560552, "grad_norm": 90.23637390136719, "learning_rate": 4.981044204134676e-07, "logits/chosen": -1.244873046875, "logits/rejected": -1.239648461341858, "logps/chosen": -288.7250061035156, "logps/rejected": -286.16876220703125, "loss": 0.5393, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.34358978271484375, "rewards/margins": 1.0264098644256592, "rewards/rejected": -1.37060546875, "step": 1610 }, { "epoch": 0.4084076513408754, "grad_norm": 74.42823791503906, "learning_rate": 4.980132012987421e-07, "logits/chosen": NaN, "logits/rejected": -1.278662085533142, "logps/chosen": -295.98126220703125, "logps/rejected": -286.5874938964844, "loss": 0.5761, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.25356751680374146, "rewards/margins": 0.9522186517715454, "rewards/rejected": -1.2055633068084717, "step": 1620 }, { "epoch": 0.41092868622569567, "grad_norm": 73.99746704101562, "learning_rate": 4.979198473824788e-07, "logits/chosen": -1.266332983970642, "logits/rejected": -1.230126976966858, "logps/chosen": -276.3187561035156, "logps/rejected": -283.76873779296875, "loss": 0.5857, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.18967895209789276, "rewards/margins": 0.88836669921875, "rewards/rejected": -1.0779266357421875, "step": 1630 }, { "epoch": 0.41344972111051587, "grad_norm": 46.616146087646484, "learning_rate": 4.97824359468232e-07, "logits/chosen": -1.288915991783142, "logits/rejected": -1.2846190929412842, "logps/chosen": -292.8687438964844, "logps/rejected": -275.16876220703125, "loss": 0.518, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2936088442802429, "rewards/margins": 1.007684350013733, "rewards/rejected": -1.30059814453125, "step": 1640 }, { "epoch": 0.41597075599533606, "grad_norm": 87.4283218383789, "learning_rate": 4.977267383779244e-07, "logits/chosen": -1.268652319908142, "logits/rejected": -1.2604491710662842, "logps/chosen": -313.86248779296875, "logps/rejected": -284.7875061035156, "loss": 0.6047, "rewards/accuracies": 0.659375011920929, "rewards/chosen": -0.5855468511581421, "rewards/margins": 0.8833038210868835, "rewards/rejected": -1.4690628051757812, "step": 1650 }, { "epoch": 0.4184917908801563, "grad_norm": 105.78712463378906, "learning_rate": 4.976269849518408e-07, "logits/chosen": -1.226586937904358, "logits/rejected": -1.279638648033142, "logps/chosen": -289.37811279296875, "logps/rejected": -290.04998779296875, "loss": 0.579, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.7635498046875, "rewards/margins": 0.906237781047821, "rewards/rejected": -1.6697266101837158, "step": 1660 }, { "epoch": 0.4210128257649765, "grad_norm": 64.9921646118164, "learning_rate": 4.9752510004862e-07, "logits/chosen": -1.2200195789337158, "logits/rejected": -1.2061767578125, "logps/chosen": -281.3812561035156, "logps/rejected": -284.1812438964844, "loss": 0.5522, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7096710205078125, "rewards/margins": 1.0792877674102783, "rewards/rejected": -1.788580298423767, "step": 1670 }, { "epoch": 0.42353386064979676, "grad_norm": 68.02913665771484, "learning_rate": 4.974210845452476e-07, "logits/chosen": -1.2034912109375, "logits/rejected": -1.201416015625, "logps/chosen": -300.07501220703125, "logps/rejected": -315.1937561035156, "loss": 0.5504, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.8024185299873352, "rewards/margins": 1.2061767578125, "rewards/rejected": -2.0079345703125, "step": 1680 }, { "epoch": 0.42605489553461695, "grad_norm": 64.83089447021484, "learning_rate": 4.97314939337049e-07, "logits/chosen": -1.249243140220642, "logits/rejected": -1.2323729991912842, "logps/chosen": -312.64373779296875, "logps/rejected": -290.66876220703125, "loss": 0.5003, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8062804937362671, "rewards/margins": 1.150183081626892, "rewards/rejected": -1.9544677734375, "step": 1690 }, { "epoch": 0.4285759304194372, "grad_norm": 52.674869537353516, "learning_rate": 4.972066653376808e-07, "logits/chosen": -1.138159155845642, "logits/rejected": -1.2062499523162842, "logps/chosen": -281.0062561035156, "logps/rejected": -274.3374938964844, "loss": 0.4777, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5297576785087585, "rewards/margins": 1.428741455078125, "rewards/rejected": -1.957452416419983, "step": 1700 }, { "epoch": 0.4310969653042574, "grad_norm": 79.29434967041016, "learning_rate": 4.970962634791238e-07, "logits/chosen": -1.2255127429962158, "logits/rejected": -1.250732421875, "logps/chosen": -300.34686279296875, "logps/rejected": -268.4125061035156, "loss": 0.5453, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.44197386503219604, "rewards/margins": 1.0496704578399658, "rewards/rejected": -1.4912230968475342, "step": 1710 }, { "epoch": 0.4336180001890776, "grad_norm": 81.51036071777344, "learning_rate": 4.969837347116744e-07, "logits/chosen": -1.215917944908142, "logits/rejected": -1.224023461341858, "logps/chosen": -304.67498779296875, "logps/rejected": -292.2250061035156, "loss": 0.5661, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.29062652587890625, "rewards/margins": 0.96728515625, "rewards/rejected": -1.257940649986267, "step": 1720 }, { "epoch": 0.43613903507389784, "grad_norm": 101.07459259033203, "learning_rate": 4.968690800039365e-07, "logits/chosen": -1.227880835533142, "logits/rejected": NaN, "logps/chosen": -295.9437561035156, "logps/rejected": -275.73748779296875, "loss": 0.5391, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3242965638637543, "rewards/margins": 1.051721215248108, "rewards/rejected": -1.3746246099472046, "step": 1730 }, { "epoch": 0.43866006995871804, "grad_norm": 56.40563201904297, "learning_rate": 4.967523003428134e-07, "logits/chosen": -1.218603491783142, "logits/rejected": -1.171240210533142, "logps/chosen": -299.00311279296875, "logps/rejected": -297.45001220703125, "loss": 0.5762, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2166931629180908, "rewards/margins": 0.8678528070449829, "rewards/rejected": -2.084667921066284, "step": 1740 }, { "epoch": 0.4411811048435383, "grad_norm": 66.51710510253906, "learning_rate": 4.966333967334992e-07, "logits/chosen": -1.186132788658142, "logits/rejected": -1.247412085533142, "logps/chosen": -304.3687438964844, "logps/rejected": -270.8187561035156, "loss": 0.5463, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0644409656524658, "rewards/margins": 1.007904052734375, "rewards/rejected": -2.071972608566284, "step": 1750 }, { "epoch": 0.4437021397283585, "grad_norm": 62.56184005737305, "learning_rate": 4.965123701994703e-07, "logits/chosen": -1.208642601966858, "logits/rejected": -1.2099120616912842, "logps/chosen": -276.78125, "logps/rejected": -277.39373779296875, "loss": 0.5387, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0719420909881592, "rewards/margins": 1.0028259754180908, "rewards/rejected": -2.0741944313049316, "step": 1760 }, { "epoch": 0.44622317461317873, "grad_norm": 58.99638366699219, "learning_rate": 4.963892217824761e-07, "logits/chosen": -1.20947265625, "logits/rejected": -1.2544434070587158, "logps/chosen": -276.07501220703125, "logps/rejected": -262.7875061035156, "loss": 0.5892, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6793349981307983, "rewards/margins": 1.070892333984375, "rewards/rejected": -1.750097632408142, "step": 1770 }, { "epoch": 0.4487442094979989, "grad_norm": 63.794219970703125, "learning_rate": 4.962639525425303e-07, "logits/chosen": -1.280517578125, "logits/rejected": -1.318701148033142, "logps/chosen": -290.01873779296875, "logps/rejected": -275.1000061035156, "loss": 0.5096, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.1992340087890625, "rewards/margins": 1.1635620594024658, "rewards/rejected": -1.3624451160430908, "step": 1780 }, { "epoch": 0.4512652443828191, "grad_norm": 55.53218078613281, "learning_rate": 4.961365635579021e-07, "logits/chosen": -1.268701195716858, "logits/rejected": -1.291357398033142, "logps/chosen": -277.58123779296875, "logps/rejected": -247.2624969482422, "loss": 0.5004, "rewards/accuracies": 0.734375, "rewards/chosen": -0.1181182861328125, "rewards/margins": 1.1982421875, "rewards/rejected": -1.316192626953125, "step": 1790 }, { "epoch": 0.45378627926763937, "grad_norm": 97.4455337524414, "learning_rate": 4.960070559251066e-07, "logits/chosen": -1.2337646484375, "logits/rejected": -1.273535132408142, "logps/chosen": -307.07501220703125, "logps/rejected": -273.15936279296875, "loss": 0.5169, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.477792352437973, "rewards/margins": 1.31329345703125, "rewards/rejected": -1.7910888195037842, "step": 1800 }, { "epoch": 0.45630731415245956, "grad_norm": 64.75396728515625, "learning_rate": 4.958754307588952e-07, "logits/chosen": -1.198339819908142, "logits/rejected": -1.2329590320587158, "logps/chosen": -289.68438720703125, "logps/rejected": -287.51251220703125, "loss": 0.5812, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0152862071990967, "rewards/margins": 1.1847991943359375, "rewards/rejected": -2.2004973888397217, "step": 1810 }, { "epoch": 0.4588283490372798, "grad_norm": 89.44642639160156, "learning_rate": 4.957416891922463e-07, "logits/chosen": -1.2565796375274658, "logits/rejected": -1.2080566883087158, "logps/chosen": -303.11248779296875, "logps/rejected": -298.90625, "loss": 0.5223, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.950427234172821, "rewards/margins": 1.1783263683319092, "rewards/rejected": -2.129077196121216, "step": 1820 }, { "epoch": 0.4613493839221, "grad_norm": 64.8090591430664, "learning_rate": 4.956058323763555e-07, "logits/chosen": -1.2447998523712158, "logits/rejected": -1.2228515148162842, "logps/chosen": -297.65625, "logps/rejected": -268.0874938964844, "loss": 0.5152, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.6448333859443665, "rewards/margins": 1.299646019935608, "rewards/rejected": -1.944091796875, "step": 1830 }, { "epoch": 0.46387041880692026, "grad_norm": 140.39163208007812, "learning_rate": 4.954678614806258e-07, "logits/chosen": -1.247900366783142, "logits/rejected": -1.224389672279358, "logps/chosen": -298.84375, "logps/rejected": -293.09375, "loss": 0.5528, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.3558692932128906, "rewards/margins": 1.054406762123108, "rewards/rejected": -1.4098694324493408, "step": 1840 }, { "epoch": 0.46639145369174045, "grad_norm": 56.89118194580078, "learning_rate": 4.953277776926571e-07, "logits/chosen": -1.257055640220642, "logits/rejected": -1.268774390220642, "logps/chosen": -290.3687438964844, "logps/rejected": -271.0843811035156, "loss": 0.5025, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.17799682915210724, "rewards/margins": 1.232853651046753, "rewards/rejected": -1.4112396240234375, "step": 1850 }, { "epoch": 0.4689124885765607, "grad_norm": 65.44432830810547, "learning_rate": 4.951855822182363e-07, "logits/chosen": -1.2215576171875, "logits/rejected": -1.235107421875, "logps/chosen": -307.48126220703125, "logps/rejected": -281.35626220703125, "loss": 0.5334, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.6034393310546875, "rewards/margins": 1.090521216392517, "rewards/rejected": -1.6938965320587158, "step": 1860 }, { "epoch": 0.4714335234613809, "grad_norm": 85.89576721191406, "learning_rate": 4.95041276281327e-07, "logits/chosen": -1.171777367591858, "logits/rejected": -1.188232421875, "logps/chosen": -285.703125, "logps/rejected": -254.36874389648438, "loss": 0.5769, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.7682098150253296, "rewards/margins": 1.019464135169983, "rewards/rejected": -1.7874023914337158, "step": 1870 }, { "epoch": 0.4739545583462011, "grad_norm": 83.72393798828125, "learning_rate": 4.948948611240588e-07, "logits/chosen": -1.2034423351287842, "logits/rejected": -1.2545654773712158, "logps/chosen": -285.41876220703125, "logps/rejected": -269.15936279296875, "loss": 0.6519, "rewards/accuracies": 0.671875, "rewards/chosen": -0.9195785522460938, "rewards/margins": 0.881665050983429, "rewards/rejected": -1.8018066883087158, "step": 1880 }, { "epoch": 0.47647559323102134, "grad_norm": 70.85868072509766, "learning_rate": 4.947463380067166e-07, "logits/chosen": -1.190771460533142, "logits/rejected": -1.188134789466858, "logps/chosen": -270.0531311035156, "logps/rejected": -266.96875, "loss": 0.5712, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.18328857421875, "rewards/margins": 0.9207916259765625, "rewards/rejected": -2.1041016578674316, "step": 1890 }, { "epoch": 0.47899662811584154, "grad_norm": 44.25507354736328, "learning_rate": 4.945957082077298e-07, "logits/chosen": -1.2071533203125, "logits/rejected": -1.2270019054412842, "logps/chosen": -290.39373779296875, "logps/rejected": -291.3999938964844, "loss": 0.5092, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0802001953125, "rewards/margins": 1.0626647472381592, "rewards/rejected": -2.142773389816284, "step": 1900 }, { "epoch": 0.4815176630006618, "grad_norm": 75.29605865478516, "learning_rate": 4.944429730236617e-07, "logits/chosen": -1.2137939929962158, "logits/rejected": -1.208673119544983, "logps/chosen": -310.04998779296875, "logps/rejected": -284.546875, "loss": 0.5739, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.103143334388733, "rewards/margins": 1.0565338134765625, "rewards/rejected": -2.1590819358825684, "step": 1910 }, { "epoch": 0.484038697885482, "grad_norm": 92.45514678955078, "learning_rate": 4.942881337691971e-07, "logits/chosen": -1.1910889148712158, "logits/rejected": -1.1811034679412842, "logps/chosen": -286.01251220703125, "logps/rejected": -278.5249938964844, "loss": 0.5611, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7924102544784546, "rewards/margins": 1.12811279296875, "rewards/rejected": -1.920019507408142, "step": 1920 }, { "epoch": 0.48655973277030223, "grad_norm": 45.24339294433594, "learning_rate": 4.941311917771324e-07, "logits/chosen": -1.1636962890625, "logits/rejected": -1.1869628429412842, "logps/chosen": -313.07501220703125, "logps/rejected": -286.3531188964844, "loss": 0.5601, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8625229001045227, "rewards/margins": 1.1022552251815796, "rewards/rejected": -1.965612769126892, "step": 1930 }, { "epoch": 0.48908076765512243, "grad_norm": 64.42066192626953, "learning_rate": 4.939721483983639e-07, "logits/chosen": -1.19384765625, "logits/rejected": -1.223486304283142, "logps/chosen": -288.98748779296875, "logps/rejected": -267.63751220703125, "loss": 0.6278, "rewards/accuracies": 0.6656249761581421, "rewards/chosen": -0.7465095520019531, "rewards/margins": 1.087432861328125, "rewards/rejected": -1.833532691001892, "step": 1940 }, { "epoch": 0.4916018025399426, "grad_norm": 73.966552734375, "learning_rate": 4.938110050018747e-07, "logits/chosen": -1.1904785633087158, "logits/rejected": -1.214257836341858, "logps/chosen": -281.84063720703125, "logps/rejected": -277.6812438964844, "loss": 0.62, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7041015625, "rewards/margins": 1.0345275402069092, "rewards/rejected": -1.7391235828399658, "step": 1950 }, { "epoch": 0.4941228374247629, "grad_norm": 94.01651000976562, "learning_rate": 4.936477629747253e-07, "logits/chosen": -1.164306640625, "logits/rejected": -1.1973876953125, "logps/chosen": -317.2875061035156, "logps/rejected": -308.91876220703125, "loss": 0.5443, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.9269149899482727, "rewards/margins": 0.9577087163925171, "rewards/rejected": -1.884362816810608, "step": 1960 }, { "epoch": 0.49664387230958307, "grad_norm": 63.18036651611328, "learning_rate": 4.934824237220395e-07, "logits/chosen": -1.193017601966858, "logits/rejected": -1.193090796470642, "logps/chosen": -307.71875, "logps/rejected": -277.96875, "loss": 0.578, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.0204544067382812, "rewards/margins": 1.105505347251892, "rewards/rejected": -2.1258301734924316, "step": 1970 }, { "epoch": 0.4991649071944033, "grad_norm": 66.69244384765625, "learning_rate": 4.933149886669936e-07, "logits/chosen": -1.252294898033142, "logits/rejected": -1.254492163658142, "logps/chosen": -281.015625, "logps/rejected": -281.17498779296875, "loss": 0.5839, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.6375198364257812, "rewards/margins": 0.9779907464981079, "rewards/rejected": -1.6150085926055908, "step": 1980 }, { "epoch": 0.5016859420792236, "grad_norm": 67.1665267944336, "learning_rate": 4.931454592508037e-07, "logits/chosen": -1.1150634288787842, "logits/rejected": -1.1283690929412842, "logps/chosen": -270.01873779296875, "logps/rejected": -251.30624389648438, "loss": 0.5874, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.4391418397426605, "rewards/margins": 1.0847870111465454, "rewards/rejected": -1.524133324623108, "step": 1990 }, { "epoch": 0.5042069769640437, "grad_norm": 68.26512145996094, "learning_rate": 4.929738369327133e-07, "logits/chosen": -1.143884301185608, "logits/rejected": -1.1930663585662842, "logps/chosen": -267.05938720703125, "logps/rejected": -284.03125, "loss": 0.5463, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.475616455078125, "rewards/margins": 1.23541259765625, "rewards/rejected": -1.7103195190429688, "step": 2000 }, { "epoch": 0.506728011848864, "grad_norm": 82.5437240600586, "learning_rate": 4.928001231899809e-07, "logits/chosen": -1.1070556640625, "logits/rejected": -1.1834716796875, "logps/chosen": -269.8843688964844, "logps/rejected": -276.53125, "loss": 0.4932, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5311965942382812, "rewards/margins": 1.4447052478790283, "rewards/rejected": -1.9757812023162842, "step": 2010 }, { "epoch": 0.5092490467336842, "grad_norm": 84.4751968383789, "learning_rate": 4.926243195178669e-07, "logits/chosen": -1.182763695716858, "logits/rejected": -1.1971557140350342, "logps/chosen": -253.47500610351562, "logps/rejected": -257.90625, "loss": 0.5344, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2667526304721832, "rewards/margins": 1.116084337234497, "rewards/rejected": -1.382940649986267, "step": 2020 }, { "epoch": 0.5117700816185043, "grad_norm": 56.32522201538086, "learning_rate": 4.924464274296214e-07, "logits/chosen": -1.174340844154358, "logits/rejected": -1.2169067859649658, "logps/chosen": -287.35626220703125, "logps/rejected": -264.3812561035156, "loss": 0.5841, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.005999756045639515, "rewards/margins": 0.9309631586074829, "rewards/rejected": -0.9245926141738892, "step": 2030 }, { "epoch": 0.5142911165033246, "grad_norm": 86.52482604980469, "learning_rate": 4.922664484564704e-07, "logits/chosen": -1.199121117591858, "logits/rejected": -1.259497046470642, "logps/chosen": -293.8187561035156, "logps/rejected": -286.4437561035156, "loss": 0.5224, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.08463134616613388, "rewards/margins": 1.15631103515625, "rewards/rejected": -1.241546630859375, "step": 2040 }, { "epoch": 0.5168121513881448, "grad_norm": 91.60623931884766, "learning_rate": 4.920843841476032e-07, "logits/chosen": -1.1902587413787842, "logits/rejected": NaN, "logps/chosen": -292.3031311035156, "logps/rejected": -273.16876220703125, "loss": 0.5612, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.268798828125, "rewards/margins": 1.181677222251892, "rewards/rejected": -1.450585961341858, "step": 2050 }, { "epoch": 0.5193331862729651, "grad_norm": 86.1439208984375, "learning_rate": 4.91900236070159e-07, "logits/chosen": -1.124169945716858, "logits/rejected": -1.155847191810608, "logps/chosen": -288.82501220703125, "logps/rejected": -275.0375061035156, "loss": 0.5803, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7064574956893921, "rewards/margins": 1.1885802745819092, "rewards/rejected": -1.894866943359375, "step": 2060 }, { "epoch": 0.5218542211577852, "grad_norm": 73.70453643798828, "learning_rate": 4.917140058092128e-07, "logits/chosen": -1.213134765625, "logits/rejected": -1.210473656654358, "logps/chosen": -274.95623779296875, "logps/rejected": -275.3062438964844, "loss": 0.5484, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.441751092672348, "rewards/margins": 1.181280493736267, "rewards/rejected": -1.6232421398162842, "step": 2070 }, { "epoch": 0.5243752560426055, "grad_norm": 74.48878479003906, "learning_rate": 4.915256949677628e-07, "logits/chosen": -1.193359375, "logits/rejected": -1.2157714366912842, "logps/chosen": -277.3062438964844, "logps/rejected": -269.32501220703125, "loss": 0.5753, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.715441882610321, "rewards/margins": 0.9764038324356079, "rewards/rejected": -1.6917724609375, "step": 2080 }, { "epoch": 0.5268962909274257, "grad_norm": 81.1043472290039, "learning_rate": 4.913353051667155e-07, "logits/chosen": -1.1866455078125, "logits/rejected": -1.180078148841858, "logps/chosen": -309.45001220703125, "logps/rejected": -277.92498779296875, "loss": 0.5364, "rewards/accuracies": 0.703125, "rewards/chosen": -0.877227783203125, "rewards/margins": 1.07843017578125, "rewards/rejected": -1.9558594226837158, "step": 2090 }, { "epoch": 0.5294173258122459, "grad_norm": 91.78522491455078, "learning_rate": 4.911428380448727e-07, "logits/chosen": -1.1493651866912842, "logits/rejected": -1.1673095226287842, "logps/chosen": -292.4375, "logps/rejected": -261.45623779296875, "loss": 0.5596, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.981091320514679, "rewards/margins": 1.201904296875, "rewards/rejected": -2.1824951171875, "step": 2100 }, { "epoch": 0.5319383606970661, "grad_norm": 72.78822326660156, "learning_rate": 4.909482952589169e-07, "logits/chosen": -1.1352112293243408, "logits/rejected": -1.1470215320587158, "logps/chosen": -292.5687561035156, "logps/rejected": -300.8374938964844, "loss": 0.5316, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.216394066810608, "rewards/margins": 1.23919677734375, "rewards/rejected": -2.455810546875, "step": 2110 }, { "epoch": 0.5344593955818864, "grad_norm": 68.66783142089844, "learning_rate": 4.907516784833968e-07, "logits/chosen": -1.201318383216858, "logits/rejected": -1.208349585533142, "logps/chosen": -319.29998779296875, "logps/rejected": -301.4750061035156, "loss": 0.544, "rewards/accuracies": 0.703125, "rewards/chosen": -0.8149169683456421, "rewards/margins": 1.1826903820037842, "rewards/rejected": -1.9975707530975342, "step": 2120 }, { "epoch": 0.5369804304667066, "grad_norm": 70.89950561523438, "learning_rate": 4.905529894107136e-07, "logits/chosen": -1.2000732421875, "logits/rejected": -1.1967284679412842, "logps/chosen": -317.9750061035156, "logps/rejected": -269.3374938964844, "loss": 0.5125, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.8348633050918579, "rewards/margins": 1.355010986328125, "rewards/rejected": -2.1889891624450684, "step": 2130 }, { "epoch": 0.5395014653515268, "grad_norm": 76.0267105102539, "learning_rate": 4.903522297511058e-07, "logits/chosen": -1.2312500476837158, "logits/rejected": -1.2475097179412842, "logps/chosen": -298.65936279296875, "logps/rejected": -294.0625, "loss": 0.4639, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -0.6670074462890625, "rewards/margins": 1.417443871498108, "rewards/rejected": -2.0840697288513184, "step": 2140 }, { "epoch": 0.542022500236347, "grad_norm": 85.77741241455078, "learning_rate": 4.901494012326346e-07, "logits/chosen": -1.179589867591858, "logits/rejected": -1.2236816883087158, "logps/chosen": -282.1968688964844, "logps/rejected": -281.1312561035156, "loss": 0.5782, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.866058349609375, "rewards/margins": 1.250817894935608, "rewards/rejected": -2.116564989089966, "step": 2150 }, { "epoch": 0.5445435351211673, "grad_norm": 74.22309112548828, "learning_rate": 4.899445056011695e-07, "logits/chosen": -1.203039526939392, "logits/rejected": -1.198950171470642, "logps/chosen": -294.85626220703125, "logps/rejected": -282.2437438964844, "loss": 0.515, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.5551101565361023, "rewards/margins": 1.4315674304962158, "rewards/rejected": -1.987248182296753, "step": 2160 }, { "epoch": 0.5470645700059874, "grad_norm": 83.23567199707031, "learning_rate": 4.897375446203727e-07, "logits/chosen": -1.156347632408142, "logits/rejected": -1.172631859779358, "logps/chosen": -282.48748779296875, "logps/rejected": -289.40313720703125, "loss": 0.5962, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.651623547077179, "rewards/margins": 1.131292700767517, "rewards/rejected": -1.781622290611267, "step": 2170 }, { "epoch": 0.5495856048908077, "grad_norm": 62.51706314086914, "learning_rate": 4.89528520071684e-07, "logits/chosen": -1.22314453125, "logits/rejected": -1.261450171470642, "logps/chosen": -276.70623779296875, "logps/rejected": -281.86248779296875, "loss": 0.5031, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.3576812744140625, "rewards/margins": 1.385986328125, "rewards/rejected": -1.7438232898712158, "step": 2180 }, { "epoch": 0.5521066397756279, "grad_norm": 70.63616180419922, "learning_rate": 4.893174337543058e-07, "logits/chosen": -1.2021973133087158, "logits/rejected": -1.212744116783142, "logps/chosen": -266.765625, "logps/rejected": -272.07501220703125, "loss": 0.5897, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.4884323179721832, "rewards/margins": 1.024169921875, "rewards/rejected": -1.5122253894805908, "step": 2190 }, { "epoch": 0.5546276746604482, "grad_norm": 68.28787994384766, "learning_rate": 4.891042874851873e-07, "logits/chosen": -1.2497680187225342, "logits/rejected": -1.2576904296875, "logps/chosen": -297.79998779296875, "logps/rejected": -272.76873779296875, "loss": 0.5681, "rewards/accuracies": 0.703125, "rewards/chosen": -0.14769592881202698, "rewards/margins": 1.087792992591858, "rewards/rejected": -1.235931396484375, "step": 2200 }, { "epoch": 0.5571487095452683, "grad_norm": 88.721435546875, "learning_rate": 4.888890830990091e-07, "logits/chosen": -1.171240210533142, "logits/rejected": -1.20465087890625, "logps/chosen": -295.2749938964844, "logps/rejected": -279.8374938964844, "loss": 0.5215, "rewards/accuracies": 0.71875, "rewards/chosen": -0.351248174905777, "rewards/margins": 1.323974609375, "rewards/rejected": -1.676110863685608, "step": 2210 }, { "epoch": 0.5596697444300885, "grad_norm": 78.94097137451172, "learning_rate": 4.88671822448167e-07, "logits/chosen": -1.20654296875, "logits/rejected": -1.1768066883087158, "logps/chosen": -306.9375, "logps/rejected": -268.90625, "loss": 0.5687, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.5443404912948608, "rewards/margins": 1.320776343345642, "rewards/rejected": -1.8653564453125, "step": 2220 }, { "epoch": 0.5621907793149088, "grad_norm": 58.73942565917969, "learning_rate": 4.884525074027566e-07, "logits/chosen": -1.165771484375, "logits/rejected": -1.180419921875, "logps/chosen": -296.7749938964844, "logps/rejected": -300.0062561035156, "loss": 0.5771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.616558849811554, "rewards/margins": 1.1680572032928467, "rewards/rejected": -1.7853882312774658, "step": 2230 }, { "epoch": 0.5647118141997289, "grad_norm": 70.50154876708984, "learning_rate": 4.882311398505568e-07, "logits/chosen": -1.226660132408142, "logits/rejected": -1.211279273033142, "logps/chosen": -300.0687561035156, "logps/rejected": -255.1875, "loss": 0.5165, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.6135452389717102, "rewards/margins": 1.326025366783142, "rewards/rejected": -1.9389464855194092, "step": 2240 }, { "epoch": 0.5672328490845492, "grad_norm": 79.97781372070312, "learning_rate": 4.880077216970139e-07, "logits/chosen": -1.245263695716858, "logits/rejected": -1.2196533679962158, "logps/chosen": -289.67498779296875, "logps/rejected": -293.3125, "loss": 0.5038, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7392807006835938, "rewards/margins": 1.505157470703125, "rewards/rejected": -2.2438080310821533, "step": 2250 }, { "epoch": 0.5697538839693694, "grad_norm": 70.58702850341797, "learning_rate": 4.877822548652244e-07, "logits/chosen": -1.199804663658142, "logits/rejected": -1.2206542491912842, "logps/chosen": -311.0687561035156, "logps/rejected": -302.6187438964844, "loss": 0.5111, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.6523681879043579, "rewards/margins": 1.5244140625, "rewards/rejected": -2.17681884765625, "step": 2260 }, { "epoch": 0.5722749188541897, "grad_norm": 73.1260986328125, "learning_rate": 4.875547412959198e-07, "logits/chosen": -1.2357909679412842, "logits/rejected": -1.248437523841858, "logps/chosen": -304.5625, "logps/rejected": -303.49688720703125, "loss": 0.5175, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.703564465045929, "rewards/margins": 1.4135863780975342, "rewards/rejected": -2.1169190406799316, "step": 2270 }, { "epoch": 0.5747959537390098, "grad_norm": 77.04090881347656, "learning_rate": 4.873251829474485e-07, "logits/chosen": -1.2665283679962158, "logits/rejected": -1.200292944908142, "logps/chosen": -313.34375, "logps/rejected": -279.61248779296875, "loss": 0.5583, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.1821548491716385, "rewards/margins": 1.315148949623108, "rewards/rejected": -1.4973633289337158, "step": 2280 }, { "epoch": 0.5773169886238301, "grad_norm": 63.97391128540039, "learning_rate": 4.870935817957599e-07, "logits/chosen": -1.2111327648162842, "logits/rejected": -1.1955077648162842, "logps/chosen": -266.8500061035156, "logps/rejected": -255.46249389648438, "loss": 0.572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.24982604384422302, "rewards/margins": 1.177056908607483, "rewards/rejected": -1.4273192882537842, "step": 2290 }, { "epoch": 0.5798380235086503, "grad_norm": 84.7044448852539, "learning_rate": 4.868599398343871e-07, "logits/chosen": -1.1641356945037842, "logits/rejected": -1.198388695716858, "logps/chosen": -268.82501220703125, "logps/rejected": -250.08749389648438, "loss": 0.5223, "rewards/accuracies": 0.734375, "rewards/chosen": -0.32326966524124146, "rewards/margins": 1.1341736316680908, "rewards/rejected": -1.4576812982559204, "step": 2300 }, { "epoch": 0.5823590583934706, "grad_norm": 93.58554077148438, "learning_rate": 4.866242590744294e-07, "logits/chosen": -1.206445336341858, "logits/rejected": NaN, "logps/chosen": -300.45001220703125, "logps/rejected": -315.16876220703125, "loss": 0.5786, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.4164672791957855, "rewards/margins": 1.204016089439392, "rewards/rejected": -1.620294213294983, "step": 2310 }, { "epoch": 0.5848800932782907, "grad_norm": 90.5208969116211, "learning_rate": 4.863865415445356e-07, "logits/chosen": -1.2510986328125, "logits/rejected": -1.243798851966858, "logps/chosen": -282.48748779296875, "logps/rejected": -271.71875, "loss": 0.507, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.37880247831344604, "rewards/margins": 1.333251953125, "rewards/rejected": -1.71160888671875, "step": 2320 }, { "epoch": 0.587401128163111, "grad_norm": 57.87455749511719, "learning_rate": 4.861467892908859e-07, "logits/chosen": -1.249536156654358, "logits/rejected": -1.220556616783142, "logps/chosen": -293.70623779296875, "logps/rejected": -272.26873779296875, "loss": 0.5443, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7286742925643921, "rewards/margins": 1.315006971359253, "rewards/rejected": -2.044512987136841, "step": 2330 }, { "epoch": 0.5899221630479312, "grad_norm": 56.66450500488281, "learning_rate": 4.85905004377175e-07, "logits/chosen": -1.192236304283142, "logits/rejected": -1.2483398914337158, "logps/chosen": -310.5375061035156, "logps/rejected": -304.9375, "loss": 0.5306, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9629257321357727, "rewards/margins": 1.29730224609375, "rewards/rejected": -2.260237216949463, "step": 2340 }, { "epoch": 0.5924431979327514, "grad_norm": 99.49959564208984, "learning_rate": 4.856611888845937e-07, "logits/chosen": -1.2095215320587158, "logits/rejected": -1.2102539539337158, "logps/chosen": -307.4937438964844, "logps/rejected": -315.46875, "loss": 0.5489, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8058074712753296, "rewards/margins": 1.5135376453399658, "rewards/rejected": -2.3196136951446533, "step": 2350 }, { "epoch": 0.5949642328175716, "grad_norm": 68.80374145507812, "learning_rate": 4.854153449118112e-07, "logits/chosen": -1.1959960460662842, "logits/rejected": -1.1903808116912842, "logps/chosen": -289.48748779296875, "logps/rejected": -258.50311279296875, "loss": 0.579, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.03845367580652237, "rewards/margins": 1.2675507068634033, "rewards/rejected": -1.3045227527618408, "step": 2360 }, { "epoch": 0.5974852677023919, "grad_norm": 95.79170227050781, "learning_rate": 4.851674745749571e-07, "logits/chosen": -1.15673828125, "logits/rejected": -1.094934105873108, "logps/chosen": -293.453125, "logps/rejected": -257.1000061035156, "loss": 0.561, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.86993408203125, "rewards/margins": 1.3911011219024658, "rewards/rejected": -2.261944532394409, "step": 2370 }, { "epoch": 0.6000063025872121, "grad_norm": 75.12720489501953, "learning_rate": 4.849175800076034e-07, "logits/chosen": -1.131585717201233, "logits/rejected": -1.112768530845642, "logps/chosen": -320.23748779296875, "logps/rejected": -314.5625, "loss": 0.502, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.776953101158142, "rewards/margins": 1.454748511314392, "rewards/rejected": -3.2318358421325684, "step": 2380 }, { "epoch": 0.6025273374720322, "grad_norm": 86.88487243652344, "learning_rate": 4.846656633607458e-07, "logits/chosen": -1.111425757408142, "logits/rejected": -1.0926392078399658, "logps/chosen": -314.4750061035156, "logps/rejected": -323.64373779296875, "loss": 0.6269, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -2.241943359375, "rewards/margins": 0.9921509027481079, "rewards/rejected": -3.2347655296325684, "step": 2390 }, { "epoch": 0.6050483723568525, "grad_norm": 69.06053161621094, "learning_rate": 4.844117268027848e-07, "logits/chosen": -1.138671875, "logits/rejected": -1.102941870689392, "logps/chosen": -309.96875, "logps/rejected": -274.91876220703125, "loss": 0.5555, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.883856177330017, "rewards/margins": 1.180932641029358, "rewards/rejected": -3.063671827316284, "step": 2400 }, { "epoch": 0.6075694072416727, "grad_norm": 69.51000213623047, "learning_rate": 4.841557725195083e-07, "logits/chosen": -1.155615210533142, "logits/rejected": -1.1832275390625, "logps/chosen": -296.39373779296875, "logps/rejected": -289.2437438964844, "loss": 0.6145, "rewards/accuracies": 0.6875, "rewards/chosen": -1.435693383216858, "rewards/margins": 1.1839478015899658, "rewards/rejected": -2.620678663253784, "step": 2410 }, { "epoch": 0.6100904421264929, "grad_norm": 65.10906982421875, "learning_rate": 4.838978027140713e-07, "logits/chosen": -1.150396704673767, "logits/rejected": -1.200341820716858, "logps/chosen": -299.8343811035156, "logps/rejected": -277.08123779296875, "loss": 0.533, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0263763666152954, "rewards/margins": 1.4067351818084717, "rewards/rejected": -2.4328370094299316, "step": 2420 }, { "epoch": 0.6126114770113131, "grad_norm": 84.71194458007812, "learning_rate": 4.836378196069781e-07, "logits/chosen": -1.1405670642852783, "logits/rejected": -1.1593017578125, "logps/chosen": -278.7749938964844, "logps/rejected": -252.38125610351562, "loss": 0.6179, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.8090149164199829, "rewards/margins": 1.1510009765625, "rewards/rejected": -1.9599120616912842, "step": 2430 }, { "epoch": 0.6151325118961334, "grad_norm": 65.30789947509766, "learning_rate": 4.833758254360625e-07, "logits/chosen": -1.197998046875, "logits/rejected": -1.17822265625, "logps/chosen": -281.12188720703125, "logps/rejected": -260.6937561035156, "loss": 0.5126, "rewards/accuracies": 0.71875, "rewards/chosen": -0.529400646686554, "rewards/margins": 1.24432373046875, "rewards/rejected": -1.772973656654358, "step": 2440 }, { "epoch": 0.6176535467809536, "grad_norm": 47.99665832519531, "learning_rate": 4.831118224564688e-07, "logits/chosen": -1.1317627429962158, "logits/rejected": -1.1888916492462158, "logps/chosen": -293.66876220703125, "logps/rejected": -297.16876220703125, "loss": 0.4998, "rewards/accuracies": 0.734375, "rewards/chosen": -0.425079345703125, "rewards/margins": 1.356597900390625, "rewards/rejected": -1.7805907726287842, "step": 2450 }, { "epoch": 0.6201745816657738, "grad_norm": 61.13064956665039, "learning_rate": 4.828458129406322e-07, "logits/chosen": -1.1535766124725342, "logits/rejected": -1.12652587890625, "logps/chosen": -286.98126220703125, "logps/rejected": -270.76251220703125, "loss": 0.4807, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7186523675918579, "rewards/margins": 1.408850073814392, "rewards/rejected": -2.1275877952575684, "step": 2460 }, { "epoch": 0.622695616550594, "grad_norm": 79.37671661376953, "learning_rate": 4.825777991782599e-07, "logits/chosen": -1.1739501953125, "logits/rejected": -1.191259741783142, "logps/chosen": -300.09375, "logps/rejected": -290.0062561035156, "loss": 0.5922, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8119446039199829, "rewards/margins": 1.4115784168243408, "rewards/rejected": -2.2237305641174316, "step": 2470 }, { "epoch": 0.6252166514354143, "grad_norm": 78.25971221923828, "learning_rate": 4.823077834763102e-07, "logits/chosen": -1.094567894935608, "logits/rejected": -1.180395483970642, "logps/chosen": -297.1000061035156, "logps/rejected": -280.2124938964844, "loss": 0.5548, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.138482689857483, "rewards/margins": 1.4807860851287842, "rewards/rejected": -2.618457078933716, "step": 2480 }, { "epoch": 0.6277376863202344, "grad_norm": 53.465755462646484, "learning_rate": 4.820357681589738e-07, "logits/chosen": -1.214599609375, "logits/rejected": -1.178979516029358, "logps/chosen": -294.3812561035156, "logps/rejected": -275.8187561035156, "loss": 0.5115, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -1.1684691905975342, "rewards/margins": 1.486230492591858, "rewards/rejected": -2.6543211936950684, "step": 2490 }, { "epoch": 0.6302587212050547, "grad_norm": 85.9769515991211, "learning_rate": 4.817617555676531e-07, "logits/chosen": -1.1224853992462158, "logits/rejected": -1.14208984375, "logps/chosen": -289.125, "logps/rejected": -272.9437561035156, "loss": 0.5296, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.059788465499878, "rewards/margins": 1.2673156261444092, "rewards/rejected": -2.327471971511841, "step": 2500 }, { "epoch": 0.6327797560898749, "grad_norm": 65.66055297851562, "learning_rate": 4.814857480609423e-07, "logits/chosen": -1.2054870128631592, "logits/rejected": -1.235107421875, "logps/chosen": -293.28436279296875, "logps/rejected": -288.66876220703125, "loss": 0.5634, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9910598993301392, "rewards/margins": 1.3326294422149658, "rewards/rejected": -2.324328660964966, "step": 2510 }, { "epoch": 0.6353007909746952, "grad_norm": 83.93264770507812, "learning_rate": 4.812077480146071e-07, "logits/chosen": -1.1200683116912842, "logits/rejected": -1.1492187976837158, "logps/chosen": -292.90625, "logps/rejected": -282.76251220703125, "loss": 0.5442, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0575134754180908, "rewards/margins": 1.38275146484375, "rewards/rejected": -2.4418272972106934, "step": 2520 }, { "epoch": 0.6378218258595153, "grad_norm": 100.24918365478516, "learning_rate": 4.809277578215642e-07, "logits/chosen": -1.1622314453125, "logits/rejected": -1.1765868663787842, "logps/chosen": -282.8062438964844, "logps/rejected": -269.70001220703125, "loss": 0.5422, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.6311782598495483, "rewards/margins": 1.385986328125, "rewards/rejected": -2.017224073410034, "step": 2530 }, { "epoch": 0.6403428607443356, "grad_norm": 67.45328521728516, "learning_rate": 4.806457798918605e-07, "logits/chosen": -1.241113305091858, "logits/rejected": -1.204003930091858, "logps/chosen": -312.625, "logps/rejected": -297.92498779296875, "loss": 0.5716, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.5490020513534546, "rewards/margins": 1.1793212890625, "rewards/rejected": -1.7283859252929688, "step": 2540 }, { "epoch": 0.6428638956291558, "grad_norm": 85.26969146728516, "learning_rate": 4.80361816652653e-07, "logits/chosen": -1.196069359779358, "logits/rejected": -1.23370361328125, "logps/chosen": -307.26251220703125, "logps/rejected": -271.75, "loss": 0.5261, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.793475329875946, "rewards/margins": 1.27081298828125, "rewards/rejected": -2.0640501976013184, "step": 2550 }, { "epoch": 0.6453849305139759, "grad_norm": 61.15713882446289, "learning_rate": 4.800758705481872e-07, "logits/chosen": -1.1762206554412842, "logits/rejected": -1.200537085533142, "logps/chosen": -298.8374938964844, "logps/rejected": -280.8125, "loss": 0.4726, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.532562255859375, "rewards/margins": 1.32684326171875, "rewards/rejected": -1.859521508216858, "step": 2560 }, { "epoch": 0.6479059653987962, "grad_norm": 71.99295806884766, "learning_rate": 4.797879440397764e-07, "logits/chosen": -1.144287109375, "logits/rejected": -1.174462914466858, "logps/chosen": -260.73748779296875, "logps/rejected": -258.9437561035156, "loss": 0.603, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8031395077705383, "rewards/margins": 1.1868774890899658, "rewards/rejected": -1.9891357421875, "step": 2570 }, { "epoch": 0.6504270002836164, "grad_norm": 72.66377258300781, "learning_rate": 4.794980396057802e-07, "logits/chosen": -1.137353539466858, "logits/rejected": -1.152807593345642, "logps/chosen": -289.0062561035156, "logps/rejected": -292.0375061035156, "loss": 0.5609, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.7954727411270142, "rewards/margins": 1.278875708580017, "rewards/rejected": -2.07403564453125, "step": 2580 }, { "epoch": 0.6529480351684367, "grad_norm": 69.01895141601562, "learning_rate": 4.792061597415838e-07, "logits/chosen": -1.2290527820587158, "logits/rejected": -1.218847632408142, "logps/chosen": -284.45623779296875, "logps/rejected": -278.07501220703125, "loss": 0.5333, "rewards/accuracies": 0.71875, "rewards/chosen": -0.824688732624054, "rewards/margins": 1.3822143077850342, "rewards/rejected": -2.2069945335388184, "step": 2590 }, { "epoch": 0.6554690700532568, "grad_norm": 68.94561004638672, "learning_rate": 4.78912306959576e-07, "logits/chosen": -1.154443383216858, "logits/rejected": -1.1708190441131592, "logps/chosen": -316.5687561035156, "logps/rejected": -292.3062438964844, "loss": 0.4711, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.5568755865097046, "rewards/margins": 1.523004174232483, "rewards/rejected": -2.0794615745544434, "step": 2600 }, { "epoch": 0.6579901049380771, "grad_norm": 50.01373291015625, "learning_rate": 4.786164837891277e-07, "logits/chosen": -1.108984351158142, "logits/rejected": -1.1860840320587158, "logps/chosen": -277.65625, "logps/rejected": -298.6937561035156, "loss": 0.5241, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.6713012456893921, "rewards/margins": 1.461004614830017, "rewards/rejected": -2.133190870285034, "step": 2610 }, { "epoch": 0.6605111398228973, "grad_norm": 76.2265396118164, "learning_rate": 4.7831869277657e-07, "logits/chosen": -1.1467773914337158, "logits/rejected": -1.172705054283142, "logps/chosen": -280.00311279296875, "logps/rejected": -277.2437438964844, "loss": 0.5286, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8196624517440796, "rewards/margins": 1.400231957435608, "rewards/rejected": -2.2198243141174316, "step": 2620 }, { "epoch": 0.6630321747077175, "grad_norm": 90.98784637451172, "learning_rate": 4.780189364851726e-07, "logits/chosen": -1.1557128429412842, "logits/rejected": -1.1880614757537842, "logps/chosen": -281.21875, "logps/rejected": -282.53125, "loss": 0.5537, "rewards/accuracies": 0.75, "rewards/chosen": -0.435556024312973, "rewards/margins": 1.472467064857483, "rewards/rejected": -1.908056616783142, "step": 2630 }, { "epoch": 0.6655532095925377, "grad_norm": 110.27445983886719, "learning_rate": 4.777172174951216e-07, "logits/chosen": -1.1761963367462158, "logits/rejected": -1.188818335533142, "logps/chosen": -312.390625, "logps/rejected": -299.3374938964844, "loss": 0.5696, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5568572878837585, "rewards/margins": 1.3341461420059204, "rewards/rejected": -1.8907806873321533, "step": 2640 }, { "epoch": 0.668074244477358, "grad_norm": 68.85711669921875, "learning_rate": 4.77413538403497e-07, "logits/chosen": -1.197778344154358, "logits/rejected": -1.183837890625, "logps/chosen": -303.17498779296875, "logps/rejected": -293.57501220703125, "loss": 0.4299, "rewards/accuracies": 0.7906249761581421, "rewards/chosen": -0.8754485845565796, "rewards/margins": 1.8027832508087158, "rewards/rejected": -2.6781952381134033, "step": 2650 }, { "epoch": 0.6705952793621782, "grad_norm": 90.8910903930664, "learning_rate": 4.771079018242509e-07, "logits/chosen": NaN, "logits/rejected": -1.073815941810608, "logps/chosen": -308.98748779296875, "logps/rejected": -316.8687438964844, "loss": 0.6116, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -2.0481934547424316, "rewards/margins": 1.4583740234375, "rewards/rejected": -3.5071778297424316, "step": 2660 }, { "epoch": 0.6731163142469984, "grad_norm": 88.4048843383789, "learning_rate": 4.7680031038818445e-07, "logits/chosen": -1.0806763172149658, "logits/rejected": -1.0692627429962158, "logps/chosen": -308.1625061035156, "logps/rejected": -303.8062438964844, "loss": 0.5529, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5895018577575684, "rewards/margins": 1.290319800376892, "rewards/rejected": -3.880664110183716, "step": 2670 }, { "epoch": 0.6756373491318186, "grad_norm": 60.40487289428711, "learning_rate": 4.7649076674292564e-07, "logits/chosen": -1.06890869140625, "logits/rejected": -1.095422387123108, "logps/chosen": -294.2437438964844, "logps/rejected": -289.6187438964844, "loss": 0.5113, "rewards/accuracies": 0.75, "rewards/chosen": -2.401989698410034, "rewards/margins": 1.3101806640625, "rewards/rejected": -3.7132811546325684, "step": 2680 }, { "epoch": 0.6781583840166389, "grad_norm": 102.43937683105469, "learning_rate": 4.761792735529061e-07, "logits/chosen": -1.0726318359375, "logits/rejected": -1.087988257408142, "logps/chosen": -291.21563720703125, "logps/rejected": -302.2875061035156, "loss": 0.5948, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.4117188453674316, "rewards/margins": 1.0986328125, "rewards/rejected": -3.509765625, "step": 2690 }, { "epoch": 0.680679418901459, "grad_norm": 190.5872802734375, "learning_rate": 4.7586583349933864e-07, "logits/chosen": -1.0610473155975342, "logits/rejected": -1.0805175304412842, "logps/chosen": -322.51251220703125, "logps/rejected": -307.36248779296875, "loss": 0.617, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -2.056689500808716, "rewards/margins": 1.2536132335662842, "rewards/rejected": -3.310351610183716, "step": 2700 }, { "epoch": 0.6832004537862792, "grad_norm": 94.1674575805664, "learning_rate": 4.755504492801937e-07, "logits/chosen": -1.0991699695587158, "logits/rejected": -1.0919067859649658, "logps/chosen": -307.98748779296875, "logps/rejected": -278.57501220703125, "loss": 0.5943, "rewards/accuracies": 0.703125, "rewards/chosen": -2.4027342796325684, "rewards/margins": 1.0565063953399658, "rewards/rejected": -3.4593749046325684, "step": 2710 }, { "epoch": 0.6857214886710995, "grad_norm": 69.85820770263672, "learning_rate": 4.7523312361017654e-07, "logits/chosen": -1.136254906654358, "logits/rejected": -1.1059448719024658, "logps/chosen": -298.8687438964844, "logps/rejected": -284.8500061035156, "loss": 0.4529, "rewards/accuracies": 0.765625, "rewards/chosen": -2.005786180496216, "rewards/margins": 1.614477515220642, "rewards/rejected": -3.6192383766174316, "step": 2720 }, { "epoch": 0.6882425235559197, "grad_norm": 82.56523132324219, "learning_rate": 4.7491385922070347e-07, "logits/chosen": NaN, "logits/rejected": -1.112707495689392, "logps/chosen": -306.09375, "logps/rejected": -314.23126220703125, "loss": 0.5461, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7273375988006592, "rewards/margins": 1.6251709461212158, "rewards/rejected": -3.3524413108825684, "step": 2730 }, { "epoch": 0.6907635584407399, "grad_norm": 70.45012664794922, "learning_rate": 4.7459265885987865e-07, "logits/chosen": -1.096398949623108, "logits/rejected": -1.0847504138946533, "logps/chosen": -303.64373779296875, "logps/rejected": -281.29376220703125, "loss": 0.5543, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.6130187511444092, "rewards/margins": 1.3455810546875, "rewards/rejected": -2.958935499191284, "step": 2740 }, { "epoch": 0.6932845933255601, "grad_norm": 63.99974060058594, "learning_rate": 4.7426952529247047e-07, "logits/chosen": -1.188989281654358, "logits/rejected": -1.1235840320587158, "logps/chosen": -311.48748779296875, "logps/rejected": -293.61248779296875, "loss": 0.6665, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8017578125, "rewards/margins": 1.1265137195587158, "rewards/rejected": -2.9278321266174316, "step": 2750 }, { "epoch": 0.6958056282103804, "grad_norm": 61.86286163330078, "learning_rate": 4.739444612998872e-07, "logits/chosen": -1.136474609375, "logits/rejected": -1.1675536632537842, "logps/chosen": -294.0375061035156, "logps/rejected": -263.76873779296875, "loss": 0.4585, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -1.348413109779358, "rewards/margins": 1.5325438976287842, "rewards/rejected": -2.8815369606018066, "step": 2760 }, { "epoch": 0.6983266630952005, "grad_norm": 53.799129486083984, "learning_rate": 4.7361746968015396e-07, "logits/chosen": -1.1840667724609375, "logits/rejected": -1.2018311023712158, "logps/chosen": -322.61248779296875, "logps/rejected": -308.8187561035156, "loss": 0.5971, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -1.2908141613006592, "rewards/margins": 1.435052514076233, "rewards/rejected": -2.7259521484375, "step": 2770 }, { "epoch": 0.7008476979800208, "grad_norm": 65.56761169433594, "learning_rate": 4.732885532478879e-07, "logits/chosen": -1.1656982898712158, "logits/rejected": -1.1639893054962158, "logps/chosen": -291.3968811035156, "logps/rejected": -264.57501220703125, "loss": 0.5467, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7727325558662415, "rewards/margins": 1.4450256824493408, "rewards/rejected": -2.216906785964966, "step": 2780 }, { "epoch": 0.703368732864841, "grad_norm": 85.63340759277344, "learning_rate": 4.729577148342742e-07, "logits/chosen": -1.209924340248108, "logits/rejected": -1.228369116783142, "logps/chosen": -303.625, "logps/rejected": -281.60626220703125, "loss": 0.5424, "rewards/accuracies": 0.703125, "rewards/chosen": -0.653521716594696, "rewards/margins": 1.5137939453125, "rewards/rejected": -2.167309522628784, "step": 2790 }, { "epoch": 0.7058897677496613, "grad_norm": 54.452789306640625, "learning_rate": 4.7262495728704156e-07, "logits/chosen": -1.145410180091858, "logits/rejected": -1.1492431163787842, "logps/chosen": -307.11248779296875, "logps/rejected": -294.8500061035156, "loss": 0.5321, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.905059814453125, "rewards/margins": 1.4968140125274658, "rewards/rejected": -2.402099609375, "step": 2800 }, { "epoch": 0.7084108026344814, "grad_norm": 61.9515266418457, "learning_rate": 4.7229028347043826e-07, "logits/chosen": -1.1328125, "logits/rejected": -1.1371581554412842, "logps/chosen": -280.28436279296875, "logps/rejected": -276.81561279296875, "loss": 0.554, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7785491943359375, "rewards/margins": 1.3621094226837158, "rewards/rejected": -2.1416258811950684, "step": 2810 }, { "epoch": 0.7109318375193017, "grad_norm": 59.2089729309082, "learning_rate": 4.719536962652067e-07, "logits/chosen": -1.1936523914337158, "logits/rejected": -1.1558837890625, "logps/chosen": -294.48126220703125, "logps/rejected": -291.6156311035156, "loss": 0.5687, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8442443609237671, "rewards/margins": 1.181768774986267, "rewards/rejected": -2.026538133621216, "step": 2820 }, { "epoch": 0.7134528724041219, "grad_norm": 72.19743347167969, "learning_rate": 4.7161519856855915e-07, "logits/chosen": -1.1260986328125, "logits/rejected": -1.1375243663787842, "logps/chosen": -291.0874938964844, "logps/rejected": -273.3187561035156, "loss": 0.5357, "rewards/accuracies": 0.75, "rewards/chosen": -0.684588611125946, "rewards/margins": 1.328607201576233, "rewards/rejected": -2.0126099586486816, "step": 2830 }, { "epoch": 0.7159739072889421, "grad_norm": 58.94084548950195, "learning_rate": 4.7127479329415266e-07, "logits/chosen": -1.1718933582305908, "logits/rejected": -1.22802734375, "logps/chosen": -294.58123779296875, "logps/rejected": -277.1812438964844, "loss": 0.5834, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6978393793106079, "rewards/margins": 1.2098877429962158, "rewards/rejected": -1.9071533679962158, "step": 2840 }, { "epoch": 0.7184949421737623, "grad_norm": 64.77632904052734, "learning_rate": 4.709324833720639e-07, "logits/chosen": -1.173608422279358, "logits/rejected": -1.18798828125, "logps/chosen": -305.28125, "logps/rejected": -273.5, "loss": 0.5544, "rewards/accuracies": 0.734375, "rewards/chosen": -0.7196075320243835, "rewards/margins": 1.408563256263733, "rewards/rejected": -2.128100633621216, "step": 2850 }, { "epoch": 0.7210159770585826, "grad_norm": 81.05403900146484, "learning_rate": 4.7058827174876406e-07, "logits/chosen": -1.155676245689392, "logits/rejected": -1.110107421875, "logps/chosen": -269.25311279296875, "logps/rejected": -267.3187561035156, "loss": 0.5214, "rewards/accuracies": 0.75, "rewards/chosen": -0.6199615597724915, "rewards/margins": 1.470703125, "rewards/rejected": -2.0902466773986816, "step": 2860 }, { "epoch": 0.7235370119434028, "grad_norm": 108.76309204101562, "learning_rate": 4.7024216138709333e-07, "logits/chosen": -1.146337866783142, "logits/rejected": -1.080541968345642, "logps/chosen": -286.04998779296875, "logps/rejected": -279.23748779296875, "loss": 0.5623, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.6038573980331421, "rewards/margins": 1.650976538658142, "rewards/rejected": -2.253796339035034, "step": 2870 }, { "epoch": 0.726058046828223, "grad_norm": 73.72408294677734, "learning_rate": 4.6989415526623566e-07, "logits/chosen": NaN, "logits/rejected": -1.15380859375, "logps/chosen": -270.7593688964844, "logps/rejected": -282.75, "loss": 0.6657, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.44719237089157104, "rewards/margins": 1.3208129405975342, "rewards/rejected": -1.767968773841858, "step": 2880 }, { "epoch": 0.7285790817130432, "grad_norm": 66.9427719116211, "learning_rate": 4.69544256381693e-07, "logits/chosen": NaN, "logits/rejected": -1.100317358970642, "logps/chosen": -277.33123779296875, "logps/rejected": -260.64373779296875, "loss": 0.5389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.080474853515625, "rewards/margins": 1.3751342296600342, "rewards/rejected": -1.4554367065429688, "step": 2890 }, { "epoch": 0.7311001165978634, "grad_norm": 85.16573333740234, "learning_rate": 4.691924677452592e-07, "logits/chosen": -1.1538574695587158, "logits/rejected": -1.186254858970642, "logps/chosen": -316.6812438964844, "logps/rejected": -291.23126220703125, "loss": 0.5846, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5623626708984375, "rewards/margins": 1.3126099109649658, "rewards/rejected": -1.874786376953125, "step": 2900 }, { "epoch": 0.7336211514826836, "grad_norm": 80.74299621582031, "learning_rate": 4.688387923849947e-07, "logits/chosen": -1.1162598133087158, "logits/rejected": -1.0867096185684204, "logps/chosen": -331.0843811035156, "logps/rejected": -290.32501220703125, "loss": 0.6125, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.106848120689392, "rewards/margins": 1.276983618736267, "rewards/rejected": -2.3839659690856934, "step": 2910 }, { "epoch": 0.7361421863675038, "grad_norm": 72.91273498535156, "learning_rate": 4.684832333451998e-07, "logits/chosen": -1.050482153892517, "logits/rejected": -1.0858886241912842, "logps/chosen": -295.3062438964844, "logps/rejected": -309.61248779296875, "loss": 0.5397, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.507714867591858, "rewards/margins": 1.477258324623108, "rewards/rejected": -2.985400438308716, "step": 2920 }, { "epoch": 0.7386632212523241, "grad_norm": 65.06329345703125, "learning_rate": 4.68125793686389e-07, "logits/chosen": -1.129608154296875, "logits/rejected": -1.1381103992462158, "logps/chosen": -307.91876220703125, "logps/rejected": -297.45001220703125, "loss": 0.5723, "rewards/accuracies": 0.703125, "rewards/chosen": -1.5120971202850342, "rewards/margins": 1.3308227062225342, "rewards/rejected": -2.8423829078674316, "step": 2930 }, { "epoch": 0.7411842561371443, "grad_norm": 99.28950500488281, "learning_rate": 4.677664764852644e-07, "logits/chosen": -1.070556640625, "logits/rejected": -1.10809326171875, "logps/chosen": -283.8374938964844, "logps/rejected": -283.53125, "loss": 0.5436, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.4549328088760376, "rewards/margins": 1.3447662591934204, "rewards/rejected": -2.7989745140075684, "step": 2940 }, { "epoch": 0.7437052910219645, "grad_norm": 64.50562286376953, "learning_rate": 4.6740528483468926e-07, "logits/chosen": -1.1115601062774658, "logits/rejected": -1.1370360851287842, "logps/chosen": -287.9468688964844, "logps/rejected": -279.6625061035156, "loss": 0.5576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3379623889923096, "rewards/margins": 1.367529273033142, "rewards/rejected": -2.7056641578674316, "step": 2950 }, { "epoch": 0.7462263259067847, "grad_norm": 50.57285690307617, "learning_rate": 4.670422218436613e-07, "logits/chosen": -1.146215796470642, "logits/rejected": -1.088281273841858, "logps/chosen": -295.2718811035156, "logps/rejected": -280.7250061035156, "loss": 0.5895, "rewards/accuracies": 0.703125, "rewards/chosen": -1.332891821861267, "rewards/margins": 1.275415062904358, "rewards/rejected": -2.609619140625, "step": 2960 }, { "epoch": 0.748747360791605, "grad_norm": 93.81817626953125, "learning_rate": 4.6667729063728616e-07, "logits/chosen": -1.117163062095642, "logits/rejected": -1.115881323814392, "logps/chosen": -295.3687438964844, "logps/rejected": -283.03125, "loss": 0.517, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -1.1867492198944092, "rewards/margins": 1.4429442882537842, "rewards/rejected": -2.629931688308716, "step": 2970 }, { "epoch": 0.7512683956764252, "grad_norm": 74.55547332763672, "learning_rate": 4.663104943567502e-07, "logits/chosen": -1.1306641101837158, "logits/rejected": -1.1548340320587158, "logps/chosen": -279.8999938964844, "logps/rejected": -277.0562438964844, "loss": 0.595, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0513458251953125, "rewards/margins": 1.223480224609375, "rewards/rejected": -2.2755126953125, "step": 2980 }, { "epoch": 0.7537894305612454, "grad_norm": 66.2352294921875, "learning_rate": 4.659418361592936e-07, "logits/chosen": -1.07928466796875, "logits/rejected": NaN, "logps/chosen": -285.15625, "logps/rejected": -283.14373779296875, "loss": 0.593, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7939056158065796, "rewards/margins": 1.283117651939392, "rewards/rejected": -2.076037645339966, "step": 2990 }, { "epoch": 0.7563104654460656, "grad_norm": 100.5522689819336, "learning_rate": 4.655713192181835e-07, "logits/chosen": -1.186669945716858, "logits/rejected": -1.159265160560608, "logps/chosen": -327.4125061035156, "logps/rejected": -274.95001220703125, "loss": 0.5666, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.5676056146621704, "rewards/margins": 1.240380883216858, "rewards/rejected": -1.8081023693084717, "step": 3000 }, { "epoch": 0.7588315003308859, "grad_norm": 51.96790313720703, "learning_rate": 4.651989467226859e-07, "logits/chosen": -1.166284203529358, "logits/rejected": -1.1627686023712158, "logps/chosen": -301.23748779296875, "logps/rejected": -268.55938720703125, "loss": 0.5571, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.42068177461624146, "rewards/margins": 1.1670043468475342, "rewards/rejected": -1.5872313976287842, "step": 3010 }, { "epoch": 0.761352535215706, "grad_norm": 78.86668395996094, "learning_rate": 4.648247218780391e-07, "logits/chosen": -1.1724426746368408, "logits/rejected": -1.1681029796600342, "logps/chosen": -279.98126220703125, "logps/rejected": -264.46875, "loss": 0.6394, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.5438163876533508, "rewards/margins": 1.0772705078125, "rewards/rejected": -1.621557593345642, "step": 3020 }, { "epoch": 0.7638735701005263, "grad_norm": 48.36394500732422, "learning_rate": 4.644486479054256e-07, "logits/chosen": -1.1013062000274658, "logits/rejected": -1.158178687095642, "logps/chosen": -308.86248779296875, "logps/rejected": -294.0562438964844, "loss": 0.5717, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.8183044195175171, "rewards/margins": 1.3197815418243408, "rewards/rejected": -2.139080762863159, "step": 3030 }, { "epoch": 0.7663946049853465, "grad_norm": 57.000423431396484, "learning_rate": 4.640707280419444e-07, "logits/chosen": -1.135351538658142, "logits/rejected": -1.151525855064392, "logps/chosen": -291.95001220703125, "logps/rejected": -270.45623779296875, "loss": 0.5595, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.9432922601699829, "rewards/margins": 1.415521264076233, "rewards/rejected": -2.3579344749450684, "step": 3040 }, { "epoch": 0.7689156398701668, "grad_norm": 50.99508285522461, "learning_rate": 4.636909655405832e-07, "logits/chosen": -1.132177710533142, "logits/rejected": -1.151269555091858, "logps/chosen": -300.16876220703125, "logps/rejected": -289.1937561035156, "loss": 0.5131, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.838409423828125, "rewards/margins": 1.471771240234375, "rewards/rejected": -2.3096680641174316, "step": 3050 }, { "epoch": 0.7714366747549869, "grad_norm": 125.89732360839844, "learning_rate": 4.633093636701904e-07, "logits/chosen": -1.0341064929962158, "logits/rejected": -1.1121826171875, "logps/chosen": -272.1875, "logps/rejected": -264.9750061035156, "loss": 0.5571, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.6455932855606079, "rewards/margins": 1.581274390220642, "rewards/rejected": -2.2275543212890625, "step": 3060 }, { "epoch": 0.7739577096398071, "grad_norm": 59.311336517333984, "learning_rate": 4.629259257154472e-07, "logits/chosen": -1.117163062095642, "logits/rejected": -1.1147582530975342, "logps/chosen": -267.73748779296875, "logps/rejected": -266.01873779296875, "loss": 0.5544, "rewards/accuracies": 0.734375, "rewards/chosen": -0.3879333436489105, "rewards/margins": 1.393408179283142, "rewards/rejected": -1.7818222045898438, "step": 3070 }, { "epoch": 0.7764787445246274, "grad_norm": 75.37936401367188, "learning_rate": 4.625406549768389e-07, "logits/chosen": -1.1622803211212158, "logits/rejected": -1.1376464366912842, "logps/chosen": -282.03125, "logps/rejected": -297.0375061035156, "loss": 0.6313, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.18592986464500427, "rewards/margins": 1.1645171642303467, "rewards/rejected": -1.3501007556915283, "step": 3080 }, { "epoch": 0.7789997794094475, "grad_norm": 45.26335906982422, "learning_rate": 4.621535547706267e-07, "logits/chosen": -1.115576148033142, "logits/rejected": -1.152734398841858, "logps/chosen": -297.2875061035156, "logps/rejected": -256.875, "loss": 0.6164, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.223388671875, "rewards/margins": 1.0387756824493408, "rewards/rejected": -1.2618286609649658, "step": 3090 }, { "epoch": 0.7815208142942678, "grad_norm": 62.349674224853516, "learning_rate": 4.6176462842881914e-07, "logits/chosen": -1.164794921875, "logits/rejected": -1.13763427734375, "logps/chosen": -290.48748779296875, "logps/rejected": -307.4375, "loss": 0.5724, "rewards/accuracies": 0.6875, "rewards/chosen": -0.137237548828125, "rewards/margins": 1.046820044517517, "rewards/rejected": -1.1840629577636719, "step": 3100 }, { "epoch": 0.784041849179088, "grad_norm": 53.608009338378906, "learning_rate": 4.6137387929914355e-07, "logits/chosen": -1.122167944908142, "logits/rejected": -1.119543433189392, "logps/chosen": -302.51873779296875, "logps/rejected": -283.5062561035156, "loss": 0.4835, "rewards/accuracies": 0.765625, "rewards/chosen": -0.22745056450366974, "rewards/margins": 1.39703369140625, "rewards/rejected": -1.6243774890899658, "step": 3110 }, { "epoch": 0.7865628840639083, "grad_norm": 55.97255325317383, "learning_rate": 4.60981310745017e-07, "logits/chosen": -1.1291015148162842, "logits/rejected": -1.1198852062225342, "logps/chosen": -301.4312438964844, "logps/rejected": -289.1625061035156, "loss": 0.5309, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9412506222724915, "rewards/margins": 1.3587372303009033, "rewards/rejected": -2.30029296875, "step": 3120 }, { "epoch": 0.7890839189487284, "grad_norm": 47.36808776855469, "learning_rate": 4.6058692614551755e-07, "logits/chosen": -1.126550316810608, "logits/rejected": -1.1423828601837158, "logps/chosen": -311.375, "logps/rejected": -301.09375, "loss": 0.5324, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.9432128667831421, "rewards/margins": 1.5833008289337158, "rewards/rejected": -2.525921583175659, "step": 3130 }, { "epoch": 0.7916049538335487, "grad_norm": 46.63396072387695, "learning_rate": 4.6019072889535495e-07, "logits/chosen": -1.0670897960662842, "logits/rejected": -1.073114037513733, "logps/chosen": -321.45001220703125, "logps/rejected": -313.63751220703125, "loss": 0.5413, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.2390563488006592, "rewards/margins": 1.4928436279296875, "rewards/rejected": -2.731982469558716, "step": 3140 }, { "epoch": 0.7941259887183689, "grad_norm": 67.96356964111328, "learning_rate": 4.5979272240484156e-07, "logits/chosen": -1.0963256359100342, "logits/rejected": -1.0971252918243408, "logps/chosen": -311.7437438964844, "logps/rejected": -280.92498779296875, "loss": 0.5821, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9785400629043579, "rewards/margins": 1.6365845203399658, "rewards/rejected": -2.6151123046875, "step": 3150 }, { "epoch": 0.7966470236031891, "grad_norm": 69.48004150390625, "learning_rate": 4.593929100998632e-07, "logits/chosen": -1.096643090248108, "logits/rejected": -1.138452172279358, "logps/chosen": -263.83123779296875, "logps/rejected": -282.66876220703125, "loss": 0.5359, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5240997076034546, "rewards/margins": 1.551782250404358, "rewards/rejected": -2.075640916824341, "step": 3160 }, { "epoch": 0.7991680584880093, "grad_norm": 149.04946899414062, "learning_rate": 4.5899129542184914e-07, "logits/chosen": -1.0813720226287842, "logits/rejected": -1.1157958507537842, "logps/chosen": -286.39373779296875, "logps/rejected": -295.98748779296875, "loss": 0.5777, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3289123475551605, "rewards/margins": 1.510929822921753, "rewards/rejected": -1.838830590248108, "step": 3170 }, { "epoch": 0.8016890933728296, "grad_norm": 60.59016799926758, "learning_rate": 4.5858788182774296e-07, "logits/chosen": -1.1536865234375, "logits/rejected": -1.177758812904358, "logps/chosen": -285.71875, "logps/rejected": -271.8062438964844, "loss": 0.5454, "rewards/accuracies": 0.753125011920929, "rewards/chosen": -0.2949768006801605, "rewards/margins": 1.6162230968475342, "rewards/rejected": -1.911962866783142, "step": 3180 }, { "epoch": 0.8042101282576498, "grad_norm": 81.2911376953125, "learning_rate": 4.581826727899725e-07, "logits/chosen": -1.142236351966858, "logits/rejected": -1.137426733970642, "logps/chosen": -293.79376220703125, "logps/rejected": -280.20623779296875, "loss": 0.5982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3628478944301605, "rewards/margins": 1.4190642833709717, "rewards/rejected": -1.782073974609375, "step": 3190 }, { "epoch": 0.80673116314247, "grad_norm": 71.78868103027344, "learning_rate": 4.577756717964203e-07, "logits/chosen": -1.063012719154358, "logits/rejected": -1.068792700767517, "logps/chosen": -312.0874938964844, "logps/rejected": -301.5562438964844, "loss": 0.5286, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.6008270382881165, "rewards/margins": 1.579858422279358, "rewards/rejected": -2.180126905441284, "step": 3200 }, { "epoch": 0.8092521980272902, "grad_norm": 80.68486022949219, "learning_rate": 4.57366882350393e-07, "logits/chosen": -1.046118140220642, "logits/rejected": -1.0717284679412842, "logps/chosen": -281.98748779296875, "logps/rejected": -282.8187561035156, "loss": 0.5391, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.924053966999054, "rewards/margins": 1.3265869617462158, "rewards/rejected": -2.2493653297424316, "step": 3210 }, { "epoch": 0.8117732329121105, "grad_norm": 61.843082427978516, "learning_rate": 4.569563079705919e-07, "logits/chosen": -1.1148681640625, "logits/rejected": -1.072900414466858, "logps/chosen": -301.4937438964844, "logps/rejected": -293.8999938964844, "loss": 0.5562, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7594634890556335, "rewards/margins": 1.3990600109100342, "rewards/rejected": -2.1579222679138184, "step": 3220 }, { "epoch": 0.8142942677969306, "grad_norm": 62.5531005859375, "learning_rate": 4.5654395219108224e-07, "logits/chosen": -1.13018798828125, "logits/rejected": -1.1165587902069092, "logps/chosen": -317.23126220703125, "logps/rejected": -289.3999938964844, "loss": 0.6072, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.8774398565292358, "rewards/margins": 1.1413085460662842, "rewards/rejected": -2.0191009044647217, "step": 3230 }, { "epoch": 0.8168153026817508, "grad_norm": 83.58039093017578, "learning_rate": 4.5612981856126264e-07, "logits/chosen": -1.123754858970642, "logits/rejected": -1.109716773033142, "logps/chosen": -283.6937561035156, "logps/rejected": -295.3062438964844, "loss": 0.5905, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.789410412311554, "rewards/margins": 1.2579864263534546, "rewards/rejected": -2.0471160411834717, "step": 3240 }, { "epoch": 0.8193363375665711, "grad_norm": 79.18340301513672, "learning_rate": 4.55713910645835e-07, "logits/chosen": -1.1595947742462158, "logits/rejected": -1.1443603038787842, "logps/chosen": -294.20623779296875, "logps/rejected": -283.6000061035156, "loss": 0.5571, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -0.502471923828125, "rewards/margins": 1.348358154296875, "rewards/rejected": -1.8505951166152954, "step": 3250 }, { "epoch": 0.8218573724513913, "grad_norm": 79.3178939819336, "learning_rate": 4.552962320247734e-07, "logits/chosen": -1.1867187023162842, "logits/rejected": -1.1916015148162842, "logps/chosen": -311.015625, "logps/rejected": -301.78125, "loss": 0.6201, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.644396960735321, "rewards/margins": 1.343725562095642, "rewards/rejected": -1.98846435546875, "step": 3260 }, { "epoch": 0.8243784073362115, "grad_norm": 73.20548248291016, "learning_rate": 4.5487678629329373e-07, "logits/chosen": NaN, "logits/rejected": -1.068078637123108, "logps/chosen": -288.9375, "logps/rejected": -294.54376220703125, "loss": 0.5417, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5860260128974915, "rewards/margins": 1.6181671619415283, "rewards/rejected": -2.204010009765625, "step": 3270 }, { "epoch": 0.8268994422210317, "grad_norm": 76.6052474975586, "learning_rate": 4.544555770618222e-07, "logits/chosen": -1.139013648033142, "logits/rejected": -1.1353759765625, "logps/chosen": -273.1499938964844, "logps/rejected": -298.125, "loss": 0.601, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7464264035224915, "rewards/margins": 1.3448486328125, "rewards/rejected": -2.091296434402466, "step": 3280 }, { "epoch": 0.829420477105852, "grad_norm": 48.098236083984375, "learning_rate": 4.540326079559647e-07, "logits/chosen": -1.07611083984375, "logits/rejected": -1.1259765625, "logps/chosen": -310.8999938964844, "logps/rejected": -272.73748779296875, "loss": 0.477, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0783233642578125, "rewards/margins": 1.664270043373108, "rewards/rejected": -2.742236375808716, "step": 3290 }, { "epoch": 0.8319415119906721, "grad_norm": 81.13333129882812, "learning_rate": 4.5360788261647544e-07, "logits/chosen": -1.105444312095642, "logits/rejected": -1.031591773033142, "logps/chosen": -311.0375061035156, "logps/rejected": -284.4937438964844, "loss": 0.5279, "rewards/accuracies": 0.75, "rewards/chosen": -1.309301733970642, "rewards/margins": 1.5679442882537842, "rewards/rejected": -2.8753418922424316, "step": 3300 }, { "epoch": 0.8344625468754924, "grad_norm": 66.38290405273438, "learning_rate": 4.531814046992255e-07, "logits/chosen": -1.042199730873108, "logits/rejected": NaN, "logps/chosen": -295.3374938964844, "logps/rejected": -285.51251220703125, "loss": 0.558, "rewards/accuracies": 0.75, "rewards/chosen": -1.3630584478378296, "rewards/margins": 1.519934058189392, "rewards/rejected": -2.8828186988830566, "step": 3310 }, { "epoch": 0.8369835817603126, "grad_norm": 81.1019287109375, "learning_rate": 4.5275317787517166e-07, "logits/chosen": -1.1407959461212158, "logits/rejected": -1.1571776866912842, "logps/chosen": -314.76251220703125, "logps/rejected": -304.16876220703125, "loss": 0.6103, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.140203833580017, "rewards/margins": 1.455957055091858, "rewards/rejected": -2.595172166824341, "step": 3320 }, { "epoch": 0.8395046166451329, "grad_norm": 46.38509750366211, "learning_rate": 4.5232320583032437e-07, "logits/chosen": -1.0676758289337158, "logits/rejected": -1.0481750965118408, "logps/chosen": -294.16876220703125, "logps/rejected": -275.0, "loss": 0.5718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3431777954101562, "rewards/margins": 1.3079712390899658, "rewards/rejected": -2.651416063308716, "step": 3330 }, { "epoch": 0.842025651529953, "grad_norm": 65.28993225097656, "learning_rate": 4.518914922657164e-07, "logits/chosen": -1.0771973133087158, "logits/rejected": -1.04425048828125, "logps/chosen": -286.7250061035156, "logps/rejected": -290.0062561035156, "loss": 0.5264, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1759170293807983, "rewards/margins": 1.192602515220642, "rewards/rejected": -2.368701219558716, "step": 3340 }, { "epoch": 0.8445466864147733, "grad_norm": 53.799381256103516, "learning_rate": 4.5145804089737093e-07, "logits/chosen": -1.0987670421600342, "logits/rejected": -1.08984375, "logps/chosen": -295.4937438964844, "logps/rejected": -261.08123779296875, "loss": 0.547, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1604430675506592, "rewards/margins": 1.3437378406524658, "rewards/rejected": -2.504638671875, "step": 3350 }, { "epoch": 0.8470677212995935, "grad_norm": 91.82332611083984, "learning_rate": 4.510228554562693e-07, "logits/chosen": -1.1641356945037842, "logits/rejected": -1.0832030773162842, "logps/chosen": -304.3125, "logps/rejected": -287.65625, "loss": 0.5168, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9715011715888977, "rewards/margins": 1.479833960533142, "rewards/rejected": -2.45159912109375, "step": 3360 }, { "epoch": 0.8495887561844137, "grad_norm": 91.4314193725586, "learning_rate": 4.505859396883192e-07, "logits/chosen": -1.1128661632537842, "logits/rejected": -1.1441528797149658, "logps/chosen": -254.0656280517578, "logps/rejected": -269.9937438964844, "loss": 0.5498, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.9849303960800171, "rewards/margins": 1.516363501548767, "rewards/rejected": -2.5011229515075684, "step": 3370 }, { "epoch": 0.8521097910692339, "grad_norm": 76.95771789550781, "learning_rate": 4.501472973543222e-07, "logits/chosen": -1.0636154413223267, "logits/rejected": -1.068426489830017, "logps/chosen": -307.46875, "logps/rejected": -297.5, "loss": 0.61, "rewards/accuracies": 0.671875, "rewards/chosen": -1.4217407703399658, "rewards/margins": 1.369140625, "rewards/rejected": -2.790942430496216, "step": 3380 }, { "epoch": 0.8546308259540542, "grad_norm": 83.77713775634766, "learning_rate": 4.497069322299417e-07, "logits/chosen": -1.122802734375, "logits/rejected": -1.1002686023712158, "logps/chosen": -305.2124938964844, "logps/rejected": -291.82501220703125, "loss": 0.5781, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.9173462390899658, "rewards/margins": 1.332556128501892, "rewards/rejected": -3.2500977516174316, "step": 3390 }, { "epoch": 0.8571518608388744, "grad_norm": 63.612693786621094, "learning_rate": 4.4926484810567e-07, "logits/chosen": -1.072717308998108, "logits/rejected": -1.0795409679412842, "logps/chosen": -331.09375, "logps/rejected": -307.88751220703125, "loss": 0.5327, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -1.918676733970642, "rewards/margins": 1.33758544921875, "rewards/rejected": -3.2566895484924316, "step": 3400 }, { "epoch": 0.8596728957236945, "grad_norm": 57.99867248535156, "learning_rate": 4.4882104878679584e-07, "logits/chosen": -1.105688452720642, "logits/rejected": -1.032354712486267, "logps/chosen": -281.70623779296875, "logps/rejected": -253.40625, "loss": 0.4861, "rewards/accuracies": 0.7718750238418579, "rewards/chosen": -2.032440185546875, "rewards/margins": 1.5134704113006592, "rewards/rejected": -3.5458984375, "step": 3410 }, { "epoch": 0.8621939306085148, "grad_norm": 48.57344436645508, "learning_rate": 4.4837553809337194e-07, "logits/chosen": -1.030920386314392, "logits/rejected": -1.0585143566131592, "logps/chosen": -301.8125, "logps/rejected": -283.3500061035156, "loss": 0.6095, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -2.375805616378784, "rewards/margins": 1.429931640625, "rewards/rejected": -3.8055663108825684, "step": 3420 }, { "epoch": 0.864714965493335, "grad_norm": 71.92064666748047, "learning_rate": 4.479283198601816e-07, "logits/chosen": -1.089746117591858, "logits/rejected": -1.106909155845642, "logps/chosen": -300.7562561035156, "logps/rejected": -300.15625, "loss": 0.5747, "rewards/accuracies": 0.703125, "rewards/chosen": -2.282421827316284, "rewards/margins": 1.3709625005722046, "rewards/rejected": -3.6548829078674316, "step": 3430 }, { "epoch": 0.8672360003781552, "grad_norm": 59.84413146972656, "learning_rate": 4.474793979367061e-07, "logits/chosen": -1.0660400390625, "logits/rejected": -1.075524926185608, "logps/chosen": -292.2250061035156, "logps/rejected": -282.1625061035156, "loss": 0.638, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.9407684803009033, "rewards/margins": 1.238427758216858, "rewards/rejected": -3.1781249046325684, "step": 3440 }, { "epoch": 0.8697570352629754, "grad_norm": 59.574527740478516, "learning_rate": 4.470287761870916e-07, "logits/chosen": -1.0941650867462158, "logits/rejected": -1.115942358970642, "logps/chosen": -299.73126220703125, "logps/rejected": -307.7124938964844, "loss": 0.5402, "rewards/accuracies": 0.71875, "rewards/chosen": -1.517303466796875, "rewards/margins": 1.4228637218475342, "rewards/rejected": -2.940234422683716, "step": 3450 }, { "epoch": 0.8722780701477957, "grad_norm": 65.44307708740234, "learning_rate": 4.465764584901152e-07, "logits/chosen": -1.032434105873108, "logits/rejected": -1.0498778820037842, "logps/chosen": -258.25, "logps/rejected": -264.54998779296875, "loss": 0.5861, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.127105712890625, "rewards/margins": 1.3675415515899658, "rewards/rejected": -2.4952635765075684, "step": 3460 }, { "epoch": 0.8747991050326159, "grad_norm": 73.56661987304688, "learning_rate": 4.461224487391526e-07, "logits/chosen": -1.093042016029358, "logits/rejected": -1.0864746570587158, "logps/chosen": -317.6312561035156, "logps/rejected": -293.34375, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -1.166131615638733, "rewards/margins": 1.443243384361267, "rewards/rejected": -2.6086058616638184, "step": 3470 }, { "epoch": 0.8773201399174361, "grad_norm": 86.67921447753906, "learning_rate": 4.456667508421438e-07, "logits/chosen": -1.0773437023162842, "logits/rejected": -1.0750000476837158, "logps/chosen": -316.9375, "logps/rejected": -292.60626220703125, "loss": 0.6976, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.1835922002792358, "rewards/margins": 1.2973754405975342, "rewards/rejected": -2.480395555496216, "step": 3480 }, { "epoch": 0.8798411748022563, "grad_norm": 84.18367004394531, "learning_rate": 4.4520936872155967e-07, "logits/chosen": -1.1320312023162842, "logits/rejected": -1.151269555091858, "logps/chosen": -279.5249938964844, "logps/rejected": -290.98748779296875, "loss": 0.5921, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6704986691474915, "rewards/margins": 1.3552429676055908, "rewards/rejected": -2.024883985519409, "step": 3490 }, { "epoch": 0.8823622096870766, "grad_norm": 80.883056640625, "learning_rate": 4.447503063143683e-07, "logits/chosen": -1.1244628429412842, "logits/rejected": -1.1005980968475342, "logps/chosen": -300.46563720703125, "logps/rejected": -296.7875061035156, "loss": 0.6703, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.8366973996162415, "rewards/margins": 1.289392113685608, "rewards/rejected": -2.125659227371216, "step": 3500 }, { "epoch": 0.8848832445718967, "grad_norm": 83.55731964111328, "learning_rate": 4.4428956757200096e-07, "logits/chosen": -1.111962914466858, "logits/rejected": -1.09796142578125, "logps/chosen": -284.5562438964844, "logps/rejected": -262.9937438964844, "loss": 0.5635, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.689208984375, "rewards/margins": 1.4481933116912842, "rewards/rejected": -2.137737989425659, "step": 3510 }, { "epoch": 0.887404279456717, "grad_norm": 68.90113830566406, "learning_rate": 4.4382715646031834e-07, "logits/chosen": -1.0959961414337158, "logits/rejected": -1.048187255859375, "logps/chosen": -298.9375, "logps/rejected": -266.875, "loss": 0.4765, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.7831360101699829, "rewards/margins": 1.63409423828125, "rewards/rejected": -2.416159152984619, "step": 3520 }, { "epoch": 0.8899253143415372, "grad_norm": 50.8995475769043, "learning_rate": 4.4336307695957605e-07, "logits/chosen": -1.07177734375, "logits/rejected": -1.093530297279358, "logps/chosen": -288.6875, "logps/rejected": -294.5874938964844, "loss": 0.6003, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.7579803466796875, "rewards/margins": 1.699853539466858, "rewards/rejected": -2.456860303878784, "step": 3530 }, { "epoch": 0.8924463492263575, "grad_norm": 65.32662200927734, "learning_rate": 4.428973330643906e-07, "logits/chosen": -1.082128882408142, "logits/rejected": -1.100927710533142, "logps/chosen": -299.3500061035156, "logps/rejected": -293.04998779296875, "loss": 0.5206, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.817181408405304, "rewards/margins": 1.541632056236267, "rewards/rejected": -2.3595337867736816, "step": 3540 }, { "epoch": 0.8949673841111776, "grad_norm": 75.93994140625, "learning_rate": 4.4242992878370493e-07, "logits/chosen": -1.0684661865234375, "logits/rejected": -1.129541039466858, "logps/chosen": -286.25311279296875, "logps/rejected": -296.6937561035156, "loss": 0.5547, "rewards/accuracies": 0.734375, "rewards/chosen": -0.8977721929550171, "rewards/margins": 1.512475609779358, "rewards/rejected": -2.4103026390075684, "step": 3550 }, { "epoch": 0.8974884189959978, "grad_norm": 88.91752624511719, "learning_rate": 4.4196086814075405e-07, "logits/chosen": -1.0927734375, "logits/rejected": -1.099035620689392, "logps/chosen": -288.8687438964844, "logps/rejected": -283.9437561035156, "loss": 0.603, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.9323943853378296, "rewards/margins": 1.367132544517517, "rewards/rejected": -2.2994384765625, "step": 3560 }, { "epoch": 0.9000094538808181, "grad_norm": 43.13289260864258, "learning_rate": 4.4149015517303035e-07, "logits/chosen": -1.126708984375, "logits/rejected": -1.063146948814392, "logps/chosen": -271.8500061035156, "logps/rejected": -283.4125061035156, "loss": 0.4782, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.3869872987270355, "rewards/margins": 1.6957275867462158, "rewards/rejected": -2.083209276199341, "step": 3570 }, { "epoch": 0.9025304887656382, "grad_norm": 66.90422821044922, "learning_rate": 4.410177939322484e-07, "logits/chosen": -1.1051146984100342, "logits/rejected": -1.123419165611267, "logps/chosen": -312.7124938964844, "logps/rejected": -295.29998779296875, "loss": 0.4763, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.920245349407196, "rewards/margins": 1.6244995594024658, "rewards/rejected": -2.5440430641174316, "step": 3580 }, { "epoch": 0.9050515236504585, "grad_norm": 75.59705352783203, "learning_rate": 4.4054378848431086e-07, "logits/chosen": -1.067163109779358, "logits/rejected": -1.0641663074493408, "logps/chosen": -303.83123779296875, "logps/rejected": -297.3374938964844, "loss": 0.545, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.148584008216858, "rewards/margins": 1.770471215248108, "rewards/rejected": -2.9188232421875, "step": 3590 }, { "epoch": 0.9075725585352787, "grad_norm": 83.44449615478516, "learning_rate": 4.40068142909273e-07, "logits/chosen": -1.1025879383087158, "logits/rejected": -1.0754272937774658, "logps/chosen": -287.3218688964844, "logps/rejected": -281.82501220703125, "loss": 0.5576, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.1833312511444092, "rewards/margins": 1.5989501476287842, "rewards/rejected": -2.783398389816284, "step": 3600 }, { "epoch": 0.910093593420099, "grad_norm": 78.62376403808594, "learning_rate": 4.395908613013076e-07, "logits/chosen": -1.113623023033142, "logits/rejected": -1.074987769126892, "logps/chosen": -318.90625, "logps/rejected": -304.01873779296875, "loss": 0.6044, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.873699963092804, "rewards/margins": 1.5673949718475342, "rewards/rejected": -2.4405517578125, "step": 3610 }, { "epoch": 0.9126146283049191, "grad_norm": 82.23873138427734, "learning_rate": 4.391119477686698e-07, "logits/chosen": -1.0331542491912842, "logits/rejected": -1.044946312904358, "logps/chosen": -271.75, "logps/rejected": -270.8531188964844, "loss": 0.6195, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.747485339641571, "rewards/margins": 1.320947289466858, "rewards/rejected": -2.0690064430236816, "step": 3620 }, { "epoch": 0.9151356631897394, "grad_norm": 75.17716217041016, "learning_rate": 4.386314064336617e-07, "logits/chosen": -1.060644507408142, "logits/rejected": -1.0835692882537842, "logps/chosen": -279.6312561035156, "logps/rejected": -285.07501220703125, "loss": 0.6073, "rewards/accuracies": 0.734375, "rewards/chosen": -0.6068664789199829, "rewards/margins": 1.296630859375, "rewards/rejected": -1.9022216796875, "step": 3630 }, { "epoch": 0.9176566980745596, "grad_norm": 72.4251937866211, "learning_rate": 4.38149241432597e-07, "logits/chosen": -1.0910155773162842, "logits/rejected": -1.138879418373108, "logps/chosen": -295.5687561035156, "logps/rejected": -281.5625, "loss": 0.5357, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.6645095944404602, "rewards/margins": 1.4889037609100342, "rewards/rejected": -2.1529541015625, "step": 3640 }, { "epoch": 0.9201777329593799, "grad_norm": 53.61591720581055, "learning_rate": 4.3766545691576507e-07, "logits/chosen": -1.1067383289337158, "logits/rejected": -1.127832055091858, "logps/chosen": -280.53125, "logps/rejected": -278.9125061035156, "loss": 0.5628, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6921478509902954, "rewards/margins": 1.342340111732483, "rewards/rejected": -2.034716844558716, "step": 3650 }, { "epoch": 0.9226987678442, "grad_norm": 79.49798583984375, "learning_rate": 4.3718005704739567e-07, "logits/chosen": -1.1438171863555908, "logits/rejected": -1.1519043445587158, "logps/chosen": -278.85626220703125, "logps/rejected": -294.26251220703125, "loss": 0.6286, "rewards/accuracies": 0.703125, "rewards/chosen": -1.0359008312225342, "rewards/margins": 1.2102782726287842, "rewards/rejected": -2.246447801589966, "step": 3660 }, { "epoch": 0.9252198027290203, "grad_norm": 44.99201965332031, "learning_rate": 4.366930460056227e-07, "logits/chosen": NaN, "logits/rejected": -1.0991332530975342, "logps/chosen": -291.3687438964844, "logps/rejected": -303.41876220703125, "loss": 0.5093, "rewards/accuracies": 0.7593749761581421, "rewards/chosen": -0.9651123285293579, "rewards/margins": 1.519689917564392, "rewards/rejected": -2.485278367996216, "step": 3670 }, { "epoch": 0.9277408376138405, "grad_norm": 59.285888671875, "learning_rate": 4.362044279824487e-07, "logits/chosen": -1.093774437904358, "logits/rejected": -1.1437256336212158, "logps/chosen": -274.8374938964844, "logps/rejected": -291.0625, "loss": 0.5187, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0367004871368408, "rewards/margins": 1.437585473060608, "rewards/rejected": -2.4761719703674316, "step": 3680 }, { "epoch": 0.9302618724986607, "grad_norm": 75.43614196777344, "learning_rate": 4.357142071837081e-07, "logits/chosen": -1.0698974132537842, "logits/rejected": -1.0759766101837158, "logps/chosen": -298.70623779296875, "logps/rejected": -275.73126220703125, "loss": 0.5363, "rewards/accuracies": 0.703125, "rewards/chosen": -1.1840789318084717, "rewards/margins": 1.457769751548767, "rewards/rejected": -2.6414551734924316, "step": 3690 }, { "epoch": 0.9327829073834809, "grad_norm": 63.65312957763672, "learning_rate": 4.3522238782903157e-07, "logits/chosen": -1.1274902820587158, "logits/rejected": -1.1024658679962158, "logps/chosen": -302.3500061035156, "logps/rejected": -299.7437438964844, "loss": 0.5913, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.0835082530975342, "rewards/margins": 1.494378685951233, "rewards/rejected": -2.578015089035034, "step": 3700 }, { "epoch": 0.9353039422683012, "grad_norm": 58.389888763427734, "learning_rate": 4.347289741518097e-07, "logits/chosen": -1.082788109779358, "logits/rejected": -1.055932641029358, "logps/chosen": -304.51251220703125, "logps/rejected": -298.7437438964844, "loss": 0.4947, "rewards/accuracies": 0.765625, "rewards/chosen": -1.125823974609375, "rewards/margins": 1.711151123046875, "rewards/rejected": -2.836346387863159, "step": 3710 }, { "epoch": 0.9378249771531214, "grad_norm": 58.16753005981445, "learning_rate": 4.342339703991561e-07, "logits/chosen": -1.1070556640625, "logits/rejected": -1.120361328125, "logps/chosen": -318.04998779296875, "logps/rejected": -307.88751220703125, "loss": 0.5755, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.029840111732483, "rewards/margins": 1.768408179283142, "rewards/rejected": -2.798046827316284, "step": 3720 }, { "epoch": 0.9403460120379415, "grad_norm": 65.45536041259766, "learning_rate": 4.337373808318713e-07, "logits/chosen": -1.1427733898162842, "logits/rejected": -1.125146508216858, "logps/chosen": -296.48748779296875, "logps/rejected": -289.7562561035156, "loss": 0.5597, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.7230895757675171, "rewards/margins": 1.6128143072128296, "rewards/rejected": -2.3357300758361816, "step": 3730 }, { "epoch": 0.9428670469227618, "grad_norm": 87.32518768310547, "learning_rate": 4.33239209724406e-07, "logits/chosen": -1.1684081554412842, "logits/rejected": -1.177221655845642, "logps/chosen": -319.82501220703125, "logps/rejected": -300.2749938964844, "loss": 0.587, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.0697815418243408, "rewards/margins": 1.7837402820587158, "rewards/rejected": -2.8533935546875, "step": 3740 }, { "epoch": 0.945388081807582, "grad_norm": 52.16835403442383, "learning_rate": 4.327394613648239e-07, "logits/chosen": -1.135839819908142, "logits/rejected": -1.119384765625, "logps/chosen": -304.609375, "logps/rejected": -283.1000061035156, "loss": 0.4795, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5196136236190796, "rewards/margins": 1.75726318359375, "rewards/rejected": -2.275927782058716, "step": 3750 }, { "epoch": 0.9479091166924022, "grad_norm": 92.80891418457031, "learning_rate": 4.322381400547653e-07, "logits/chosen": -1.1255614757537842, "logits/rejected": -1.0975220203399658, "logps/chosen": -306.82501220703125, "logps/rejected": -282.1187438964844, "loss": 0.6211, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.5638717412948608, "rewards/margins": 1.37689208984375, "rewards/rejected": -1.9398682117462158, "step": 3760 }, { "epoch": 0.9504301515772224, "grad_norm": 63.26457595825195, "learning_rate": 4.317352501094099e-07, "logits/chosen": -1.0829346179962158, "logits/rejected": -1.0719726085662842, "logps/chosen": -301.9375, "logps/rejected": -275.79998779296875, "loss": 0.5633, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.53662109375, "rewards/margins": 1.661810278892517, "rewards/rejected": -2.19915771484375, "step": 3770 }, { "epoch": 0.9529511864620427, "grad_norm": 64.89823150634766, "learning_rate": 4.3123079585743933e-07, "logits/chosen": -1.052557349205017, "logits/rejected": -1.072479248046875, "logps/chosen": -287.65625, "logps/rejected": -299.07501220703125, "loss": 0.5662, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.6813461184501648, "rewards/margins": 1.3924071788787842, "rewards/rejected": -2.0730528831481934, "step": 3780 }, { "epoch": 0.9554722213468629, "grad_norm": 73.54103088378906, "learning_rate": 4.3072478164100035e-07, "logits/chosen": -1.0793945789337158, "logits/rejected": -1.087927222251892, "logps/chosen": -296.8187561035156, "logps/rejected": -315.5687561035156, "loss": 0.6007, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.9742339849472046, "rewards/margins": 1.363793969154358, "rewards/rejected": -2.3383421897888184, "step": 3790 }, { "epoch": 0.9579932562316831, "grad_norm": 50.553653717041016, "learning_rate": 4.3021721181566726e-07, "logits/chosen": -1.109521508216858, "logits/rejected": -1.0997314453125, "logps/chosen": -325.26251220703125, "logps/rejected": -288.6499938964844, "loss": 0.6171, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.8993377685546875, "rewards/margins": 1.397332787513733, "rewards/rejected": -2.2955565452575684, "step": 3800 }, { "epoch": 0.9605142911165033, "grad_norm": 65.36611938476562, "learning_rate": 4.297080907504046e-07, "logits/chosen": -1.0438506603240967, "logits/rejected": -1.0786254405975342, "logps/chosen": -283.39373779296875, "logps/rejected": -275.76873779296875, "loss": 0.5093, "rewards/accuracies": 0.746874988079071, "rewards/chosen": -1.195440649986267, "rewards/margins": 1.481787085533142, "rewards/rejected": -2.678997755050659, "step": 3810 }, { "epoch": 0.9630353260013236, "grad_norm": 67.0266342163086, "learning_rate": 4.2919742282752914e-07, "logits/chosen": -1.1145751476287842, "logits/rejected": -1.133203148841858, "logps/chosen": -264.58905029296875, "logps/rejected": -294.76873779296875, "loss": 0.5937, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -1.2303345203399658, "rewards/margins": 1.3708922863006592, "rewards/rejected": -2.60076904296875, "step": 3820 }, { "epoch": 0.9655563608861437, "grad_norm": 56.374698638916016, "learning_rate": 4.2868521244267234e-07, "logits/chosen": -1.032470703125, "logits/rejected": -1.096923828125, "logps/chosen": -264.90625, "logps/rejected": -296.75, "loss": 0.5216, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -1.167138695716858, "rewards/margins": 1.549462914466858, "rewards/rejected": -2.7169432640075684, "step": 3830 }, { "epoch": 0.968077395770964, "grad_norm": 81.97763061523438, "learning_rate": 4.2817146400474293e-07, "logits/chosen": -1.0275390148162842, "logits/rejected": -1.0859863758087158, "logps/chosen": -287.1781311035156, "logps/rejected": -298.29998779296875, "loss": 0.5933, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.2765655517578125, "rewards/margins": 1.3431823253631592, "rewards/rejected": -2.620379686355591, "step": 3840 }, { "epoch": 0.9705984306557842, "grad_norm": 66.56047821044922, "learning_rate": 4.276561819358883e-07, "logits/chosen": -1.1472899913787842, "logits/rejected": -1.1371643543243408, "logps/chosen": -297.51251220703125, "logps/rejected": -296.53125, "loss": 0.5619, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.728131115436554, "rewards/margins": 1.429907202720642, "rewards/rejected": -2.1571717262268066, "step": 3850 }, { "epoch": 0.9731194655406045, "grad_norm": 90.69190979003906, "learning_rate": 4.271393706714569e-07, "logits/chosen": -1.12060546875, "logits/rejected": -1.113427758216858, "logps/chosen": -312.5249938964844, "logps/rejected": -294.8687438964844, "loss": 0.5835, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.7211105227470398, "rewards/margins": 1.262664794921875, "rewards/rejected": -1.983435034751892, "step": 3860 }, { "epoch": 0.9756405004254246, "grad_norm": 71.51448822021484, "learning_rate": 4.266210346599597e-07, "logits/chosen": -1.151025414466858, "logits/rejected": -1.1381347179412842, "logps/chosen": -288.3812561035156, "logps/rejected": -266.82501220703125, "loss": 0.5589, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6968048214912415, "rewards/margins": 1.412664771080017, "rewards/rejected": -2.109405517578125, "step": 3870 }, { "epoch": 0.9781615353102449, "grad_norm": 66.34625244140625, "learning_rate": 4.261011783630325e-07, "logits/chosen": -1.1195557117462158, "logits/rejected": -1.1390869617462158, "logps/chosen": -296.375, "logps/rejected": -281.1937561035156, "loss": 0.6018, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.8242248296737671, "rewards/margins": 1.478912353515625, "rewards/rejected": -2.3028197288513184, "step": 3880 }, { "epoch": 0.9806825701950651, "grad_norm": 47.40770721435547, "learning_rate": 4.255798062553966e-07, "logits/chosen": -1.1241943836212158, "logits/rejected": -1.1107879877090454, "logps/chosen": -273.63751220703125, "logps/rejected": -283.4750061035156, "loss": 0.5358, "rewards/accuracies": 0.75, "rewards/chosen": -1.1348450183868408, "rewards/margins": 1.4095947742462158, "rewards/rejected": -2.5440917015075684, "step": 3890 }, { "epoch": 0.9832036050798852, "grad_norm": 69.58695983886719, "learning_rate": 4.250569228248213e-07, "logits/chosen": -1.0292479991912842, "logits/rejected": -1.034875512123108, "logps/chosen": -314.4750061035156, "logps/rejected": -306.59063720703125, "loss": 0.6183, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -1.425872802734375, "rewards/margins": 1.4315338134765625, "rewards/rejected": -2.8571624755859375, "step": 3900 }, { "epoch": 0.9857246399647055, "grad_norm": 48.445335388183594, "learning_rate": 4.245325325720844e-07, "logits/chosen": -1.129638671875, "logits/rejected": -1.0939452648162842, "logps/chosen": -300.98748779296875, "logps/rejected": -303.8812561035156, "loss": 0.5403, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.1617858409881592, "rewards/margins": 1.6011841297149658, "rewards/rejected": -2.7610106468200684, "step": 3910 }, { "epoch": 0.9882456748495257, "grad_norm": 88.5267333984375, "learning_rate": 4.2400664001093407e-07, "logits/chosen": -1.047967553138733, "logits/rejected": -1.0731322765350342, "logps/chosen": -275.98748779296875, "logps/rejected": -276.17498779296875, "loss": 0.5492, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.2724609375, "rewards/margins": 1.3170502185821533, "rewards/rejected": -2.5896973609924316, "step": 3920 }, { "epoch": 0.990766709734346, "grad_norm": 49.751216888427734, "learning_rate": 4.234792496680497e-07, "logits/chosen": -1.0155150890350342, "logits/rejected": -1.0476562976837158, "logps/chosen": -289.84686279296875, "logps/rejected": -288.3500061035156, "loss": 0.5058, "rewards/accuracies": 0.78125, "rewards/chosen": -1.2067779302597046, "rewards/margins": 1.655676245689392, "rewards/rejected": -2.862683057785034, "step": 3930 }, { "epoch": 0.9932877446191661, "grad_norm": 60.2679557800293, "learning_rate": 4.2295036608300305e-07, "logits/chosen": -1.029028296470642, "logits/rejected": -1.086694359779358, "logps/chosen": -298.09375, "logps/rejected": -296.5562438964844, "loss": 0.5761, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -1.298675537109375, "rewards/margins": 1.4742920398712158, "rewards/rejected": -2.773193359375, "step": 3940 }, { "epoch": 0.9958087795039864, "grad_norm": 72.0003890991211, "learning_rate": 4.224199938082191e-07, "logits/chosen": -1.1206543445587158, "logits/rejected": -1.135717749595642, "logps/chosen": -299.6499938964844, "logps/rejected": -288.7875061035156, "loss": 0.5698, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -1.208764672279358, "rewards/margins": 1.361181616783142, "rewards/rejected": -2.568603515625, "step": 3950 }, { "epoch": 0.9983298143888066, "grad_norm": 72.28197479248047, "learning_rate": 4.218881374089369e-07, "logits/chosen": -1.087988257408142, "logits/rejected": -1.125512719154358, "logps/chosen": -287.3812561035156, "logps/rejected": -302.92498779296875, "loss": 0.5375, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.873364269733429, "rewards/margins": 1.751013159751892, "rewards/rejected": -2.62274169921875, "step": 3960 }, { "epoch": 1.0010084139539281, "grad_norm": 27.972684860229492, "learning_rate": 4.2135480146317016e-07, "logits/chosen": -1.102655291557312, "logits/rejected": -1.0655343532562256, "logps/chosen": -298.625, "logps/rejected": -284.827392578125, "loss": 0.4539, "rewards/accuracies": 0.8095238208770752, "rewards/chosen": -0.445919930934906, "rewards/margins": 2.1674513816833496, "rewards/rejected": -2.612729072570801, "step": 3970 }, { "epoch": 1.0035294488387483, "grad_norm": 55.124820709228516, "learning_rate": 4.2081999056166807e-07, "logits/chosen": -1.1157211065292358, "logits/rejected": -1.1358826160430908, "logps/chosen": -331.21875, "logps/rejected": -315.48748779296875, "loss": 0.2347, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.14286498725414276, "rewards/margins": 2.959460496902466, "rewards/rejected": -3.1031250953674316, "step": 3980 }, { "epoch": 1.0060504837235684, "grad_norm": 38.26470947265625, "learning_rate": 4.202837093078756e-07, "logits/chosen": -1.104638695716858, "logits/rejected": -1.105981469154358, "logps/chosen": -311.70623779296875, "logps/rejected": -302.4937438964844, "loss": 0.1835, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.2649780213832855, "rewards/margins": 3.245849609375, "rewards/rejected": -3.512011766433716, "step": 3990 }, { "epoch": 1.0085715186083888, "grad_norm": 55.7176399230957, "learning_rate": 4.1974596231789416e-07, "logits/chosen": -0.9954498410224915, "logits/rejected": NaN, "logps/chosen": -292.46875, "logps/rejected": -295.2437438964844, "loss": 0.2882, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.7926574945449829, "rewards/margins": 3.079882860183716, "rewards/rejected": -3.8719725608825684, "step": 4000 }, { "epoch": 1.011092553493209, "grad_norm": 31.28335952758789, "learning_rate": 4.192067542204413e-07, "logits/chosen": -1.061560034751892, "logits/rejected": -1.023706078529358, "logps/chosen": -297.0375061035156, "logps/rejected": -312.42498779296875, "loss": 0.234, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.3645355701446533, "rewards/margins": 3.3698487281799316, "rewards/rejected": -4.734765529632568, "step": 4010 }, { "epoch": 1.0136135883780293, "grad_norm": 39.66532516479492, "learning_rate": 4.186660896568116e-07, "logits/chosen": -1.114648461341858, "logits/rejected": -1.123077392578125, "logps/chosen": -303.0375061035156, "logps/rejected": -319.46875, "loss": 0.2072, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.310919165611267, "rewards/margins": 3.1480469703674316, "rewards/rejected": -4.459179878234863, "step": 4020 }, { "epoch": 1.0161346232628494, "grad_norm": 68.80731964111328, "learning_rate": 4.1812397328083584e-07, "logits/chosen": -1.105810523033142, "logits/rejected": -1.0820404291152954, "logps/chosen": -281.0062561035156, "logps/rejected": -290.17498779296875, "loss": 0.2325, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6036239862442017, "rewards/margins": 3.431103467941284, "rewards/rejected": -4.033593654632568, "step": 4030 }, { "epoch": 1.0186556581476696, "grad_norm": 37.8831672668457, "learning_rate": 4.1758040975884195e-07, "logits/chosen": -1.057647705078125, "logits/rejected": -1.104516625404358, "logps/chosen": -272.9750061035156, "logps/rejected": -280.9375, "loss": 0.2249, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.8665634393692017, "rewards/margins": 3.409106492996216, "rewards/rejected": -4.276953220367432, "step": 4040 }, { "epoch": 1.02117669303249, "grad_norm": 21.118183135986328, "learning_rate": 4.1703540376961406e-07, "logits/chosen": -1.1152832508087158, "logits/rejected": -1.0788085460662842, "logps/chosen": -299.4125061035156, "logps/rejected": -300.39373779296875, "loss": 0.2219, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.9221404790878296, "rewards/margins": 3.247851610183716, "rewards/rejected": -4.170507907867432, "step": 4050 }, { "epoch": 1.02369772791731, "grad_norm": 39.22066116333008, "learning_rate": 4.164889600043525e-07, "logits/chosen": NaN, "logits/rejected": -1.0491211414337158, "logps/chosen": -273.3031311035156, "logps/rejected": -300.625, "loss": 0.2248, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.1497116088867188, "rewards/margins": 3.146044969558716, "rewards/rejected": -4.2958984375, "step": 4060 }, { "epoch": 1.0262187628021302, "grad_norm": 24.629756927490234, "learning_rate": 4.1594108316663347e-07, "logits/chosen": -1.041906714439392, "logits/rejected": NaN, "logps/chosen": -292.03125, "logps/rejected": -285.01873779296875, "loss": 0.2257, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.960662841796875, "rewards/margins": 3.183337450027466, "rewards/rejected": -4.14208984375, "step": 4070 }, { "epoch": 1.0287397976869506, "grad_norm": 43.63905334472656, "learning_rate": 4.153917779723686e-07, "logits/chosen": -1.091064453125, "logits/rejected": -1.0660521984100342, "logps/chosen": -293.48126220703125, "logps/rejected": -295.78125, "loss": 0.2126, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.9144653081893921, "rewards/margins": 3.102587938308716, "rewards/rejected": -4.017382621765137, "step": 4080 }, { "epoch": 1.0312608325717707, "grad_norm": 32.19606018066406, "learning_rate": 4.14841049149764e-07, "logits/chosen": -1.089135766029358, "logits/rejected": -1.0004394054412842, "logps/chosen": -288.5062561035156, "logps/rejected": -289.11248779296875, "loss": 0.2117, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.5238097906112671, "rewards/margins": 3.288330078125, "rewards/rejected": -3.81298828125, "step": 4090 }, { "epoch": 1.0337818674565908, "grad_norm": 30.850324630737305, "learning_rate": 4.142889014392802e-07, "logits/chosen": -1.11529541015625, "logits/rejected": -1.135351538658142, "logps/chosen": -299.3687438964844, "logps/rejected": -302.21875, "loss": 0.2065, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.20770874619483948, "rewards/margins": 3.523681640625, "rewards/rejected": -3.730664014816284, "step": 4100 }, { "epoch": 1.0363029023414112, "grad_norm": 34.62665557861328, "learning_rate": 4.137353395935905e-07, "logits/chosen": -1.090728759765625, "logits/rejected": -1.0654175281524658, "logps/chosen": -281.92498779296875, "logps/rejected": -289.78125, "loss": 0.2114, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.37632447481155396, "rewards/margins": 3.229663133621216, "rewards/rejected": -3.607226610183716, "step": 4110 }, { "epoch": 1.0388239372262313, "grad_norm": 24.406591415405273, "learning_rate": 4.13180368377541e-07, "logits/chosen": -1.145361304283142, "logits/rejected": -1.11834716796875, "logps/chosen": -275.59375, "logps/rejected": -304.63751220703125, "loss": 0.1627, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.637542724609375, "rewards/margins": 3.4898438453674316, "rewards/rejected": -4.128710746765137, "step": 4120 }, { "epoch": 1.0413449721110515, "grad_norm": 21.681640625, "learning_rate": 4.126239925681088e-07, "logits/chosen": -1.078955054283142, "logits/rejected": -1.080316185951233, "logps/chosen": -269.88751220703125, "logps/rejected": -292.29998779296875, "loss": 0.1666, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.8504608273506165, "rewards/margins": 3.740917921066284, "rewards/rejected": -4.592675685882568, "step": 4130 }, { "epoch": 1.0438660069958718, "grad_norm": 34.73222732543945, "learning_rate": 4.120662169543612e-07, "logits/chosen": -1.1584351062774658, "logits/rejected": -1.063623070716858, "logps/chosen": -281.9624938964844, "logps/rejected": -304.09375, "loss": 0.2293, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -1.2438995838165283, "rewards/margins": 3.4716553688049316, "rewards/rejected": -4.715429782867432, "step": 4140 }, { "epoch": 1.046387041880692, "grad_norm": 70.9593276977539, "learning_rate": 4.1150704633741456e-07, "logits/chosen": -1.0972900390625, "logits/rejected": -1.084020972251892, "logps/chosen": -300.6937561035156, "logps/rejected": -295.1875, "loss": 0.2751, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.6405212879180908, "rewards/margins": 3.447216749191284, "rewards/rejected": -5.084863185882568, "step": 4150 }, { "epoch": 1.0489080767655123, "grad_norm": 43.35206604003906, "learning_rate": 4.1094648553039315e-07, "logits/chosen": -0.990277111530304, "logits/rejected": -1.0615966320037842, "logps/chosen": -282.14373779296875, "logps/rejected": -327.26251220703125, "loss": 0.2454, "rewards/accuracies": 0.921875, "rewards/chosen": -2.327831983566284, "rewards/margins": 3.537353515625, "rewards/rejected": -5.863671779632568, "step": 4160 }, { "epoch": 1.0514291116503325, "grad_norm": 45.486671447753906, "learning_rate": 4.103845393583868e-07, "logits/chosen": -1.1049010753631592, "logits/rejected": -1.0955932140350342, "logps/chosen": -301.1499938964844, "logps/rejected": -308.79998779296875, "loss": 0.187, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.193652391433716, "rewards/margins": 3.316357374191284, "rewards/rejected": -5.512304782867432, "step": 4170 }, { "epoch": 1.0539501465351526, "grad_norm": 59.81594467163086, "learning_rate": 4.0982121265841073e-07, "logits/chosen": -1.031347632408142, "logits/rejected": -1.0300414562225342, "logps/chosen": -318.1937561035156, "logps/rejected": -328.17498779296875, "loss": 0.2188, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.973791480064392, "rewards/margins": 3.4609131813049316, "rewards/rejected": -5.435351371765137, "step": 4180 }, { "epoch": 1.056471181419973, "grad_norm": 32.9062385559082, "learning_rate": 4.092565102793628e-07, "logits/chosen": -1.133276343345642, "logits/rejected": -1.0665709972381592, "logps/chosen": -282.04998779296875, "logps/rejected": -321.0062561035156, "loss": 0.2493, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.005548119544983, "rewards/margins": 3.323657274246216, "rewards/rejected": -4.329516410827637, "step": 4190 }, { "epoch": 1.0589922163047931, "grad_norm": 36.85378646850586, "learning_rate": 4.0869043708198224e-07, "logits/chosen": -1.15753173828125, "logits/rejected": -1.1194274425506592, "logps/chosen": -332.3999938964844, "logps/rejected": -332.04998779296875, "loss": 0.2871, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.8577331304550171, "rewards/margins": 3.3770995140075684, "rewards/rejected": -4.233202934265137, "step": 4200 }, { "epoch": 1.0615132511896133, "grad_norm": 34.67184829711914, "learning_rate": 4.0812299793880785e-07, "logits/chosen": -1.1424560546875, "logits/rejected": -1.085229516029358, "logps/chosen": -292.36248779296875, "logps/rejected": -318.36248779296875, "loss": 0.1899, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.0035202503204346, "rewards/margins": 3.4837889671325684, "rewards/rejected": -4.485547065734863, "step": 4210 }, { "epoch": 1.0640342860744336, "grad_norm": 45.7728157043457, "learning_rate": 4.075541977341358e-07, "logits/chosen": -1.108056664466858, "logits/rejected": -1.0416381359100342, "logps/chosen": -322.3187561035156, "logps/rejected": -324.38751220703125, "loss": 0.2596, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.581518530845642, "rewards/margins": 3.370800733566284, "rewards/rejected": -4.953125, "step": 4220 }, { "epoch": 1.0665553209592538, "grad_norm": 34.45916748046875, "learning_rate": 4.0698404136397805e-07, "logits/chosen": -1.1266601085662842, "logits/rejected": -1.0173828601837158, "logps/chosen": -293.25, "logps/rejected": -311.20001220703125, "loss": 0.2186, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5161864757537842, "rewards/margins": 3.4703125953674316, "rewards/rejected": -4.988085746765137, "step": 4230 }, { "epoch": 1.069076355844074, "grad_norm": 36.84361267089844, "learning_rate": 4.0641253373601957e-07, "logits/chosen": -1.054724097251892, "logits/rejected": -1.0573333501815796, "logps/chosen": -282.3187561035156, "logps/rejected": -310.42498779296875, "loss": 0.1894, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.352813720703125, "rewards/margins": 3.495434522628784, "rewards/rejected": -4.846875190734863, "step": 4240 }, { "epoch": 1.0715973907288943, "grad_norm": 34.635414123535156, "learning_rate": 4.0583967976957654e-07, "logits/chosen": -1.0933074951171875, "logits/rejected": -1.059136986732483, "logps/chosen": -305.29998779296875, "logps/rejected": -311.6875, "loss": 0.2144, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2784011363983154, "rewards/margins": 3.570849657058716, "rewards/rejected": -4.85107421875, "step": 4250 }, { "epoch": 1.0741184256137144, "grad_norm": 35.24597930908203, "learning_rate": 4.0526548439555407e-07, "logits/chosen": -1.1156494617462158, "logits/rejected": -1.130895972251892, "logps/chosen": -278.92498779296875, "logps/rejected": -306.1625061035156, "loss": 0.1801, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.1475830078125, "rewards/margins": 3.564257860183716, "rewards/rejected": -4.711523532867432, "step": 4260 }, { "epoch": 1.0766394604985345, "grad_norm": 48.14378356933594, "learning_rate": 4.046899525564034e-07, "logits/chosen": -1.176538109779358, "logits/rejected": -1.14471435546875, "logps/chosen": -308.3500061035156, "logps/rejected": -312.29376220703125, "loss": 0.184, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.820605456829071, "rewards/margins": 3.810253858566284, "rewards/rejected": -4.631933689117432, "step": 4270 }, { "epoch": 1.079160495383355, "grad_norm": 36.98891830444336, "learning_rate": 4.0411308920607953e-07, "logits/chosen": -1.175878882408142, "logits/rejected": -1.155859351158142, "logps/chosen": -312.39373779296875, "logps/rejected": -315.13751220703125, "loss": 0.2341, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.203314185142517, "rewards/margins": 3.703418016433716, "rewards/rejected": -4.906836032867432, "step": 4280 }, { "epoch": 1.081681530268175, "grad_norm": 16.921276092529297, "learning_rate": 4.0353489930999876e-07, "logits/chosen": -1.178369164466858, "logits/rejected": -1.121728539466858, "logps/chosen": -301.1937561035156, "logps/rejected": -306.6187438964844, "loss": 0.233, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4920272827148438, "rewards/margins": 3.525927782058716, "rewards/rejected": -5.0166015625, "step": 4290 }, { "epoch": 1.0842025651529954, "grad_norm": 47.32865905761719, "learning_rate": 4.029553878449956e-07, "logits/chosen": -1.0679442882537842, "logits/rejected": -1.0796630382537842, "logps/chosen": -282.92498779296875, "logps/rejected": -314.5, "loss": 0.1814, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5190032720565796, "rewards/margins": 3.7252440452575684, "rewards/rejected": -5.246679782867432, "step": 4300 }, { "epoch": 1.0867236000378155, "grad_norm": 52.33177185058594, "learning_rate": 4.0237455979928024e-07, "logits/chosen": -1.1435058116912842, "logits/rejected": -1.093946099281311, "logps/chosen": -322.875, "logps/rejected": -320.01873779296875, "loss": 0.2327, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5239578485488892, "rewards/margins": 3.4808106422424316, "rewards/rejected": -5.002831935882568, "step": 4310 }, { "epoch": 1.0892446349226357, "grad_norm": 38.97206115722656, "learning_rate": 4.0179242017239544e-07, "logits/chosen": -1.139623999595642, "logits/rejected": -1.134057641029358, "logps/chosen": -307.42498779296875, "logps/rejected": -295.79998779296875, "loss": 0.2415, "rewards/accuracies": 0.90625, "rewards/chosen": -1.366888403892517, "rewards/margins": 3.210205078125, "rewards/rejected": -4.574658393859863, "step": 4320 }, { "epoch": 1.091765669807456, "grad_norm": 16.978683471679688, "learning_rate": 4.012089739751735e-07, "logits/chosen": -1.093713402748108, "logits/rejected": -1.1429932117462158, "logps/chosen": -282.9375, "logps/rejected": -303.3500061035156, "loss": 0.2093, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.5668976306915283, "rewards/margins": 3.2781004905700684, "rewards/rejected": -4.845898628234863, "step": 4330 }, { "epoch": 1.0942867046922762, "grad_norm": 40.3631477355957, "learning_rate": 4.006242262296933e-07, "logits/chosen": -1.140039086341858, "logits/rejected": -1.1598999500274658, "logps/chosen": -275.3500061035156, "logps/rejected": -299.23126220703125, "loss": 0.1771, "rewards/accuracies": 0.921875, "rewards/chosen": -1.427282691001892, "rewards/margins": 3.5241942405700684, "rewards/rejected": -4.950976371765137, "step": 4340 }, { "epoch": 1.0968077395770963, "grad_norm": 54.68492126464844, "learning_rate": 4.0003818196923677e-07, "logits/chosen": -1.1357421875, "logits/rejected": -1.099816918373108, "logps/chosen": -290.08123779296875, "logps/rejected": -278.10626220703125, "loss": 0.2141, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.343328833580017, "rewards/margins": 3.33837890625, "rewards/rejected": -4.6826171875, "step": 4350 }, { "epoch": 1.0993287744619167, "grad_norm": 29.327800750732422, "learning_rate": 3.994508462382459e-07, "logits/chosen": -1.1252257823944092, "logits/rejected": -1.1992371082305908, "logps/chosen": -275.7437438964844, "logps/rejected": -311.9624938964844, "loss": 0.183, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1376861333847046, "rewards/margins": 3.511059522628784, "rewards/rejected": -4.649218559265137, "step": 4360 }, { "epoch": 1.1018498093467368, "grad_norm": 36.42438507080078, "learning_rate": 3.98862224092279e-07, "logits/chosen": -1.129235863685608, "logits/rejected": -1.137121558189392, "logps/chosen": -277.17498779296875, "logps/rejected": -291.23126220703125, "loss": 0.1848, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.118890404701233, "rewards/margins": 3.633984327316284, "rewards/rejected": -4.752636909484863, "step": 4370 }, { "epoch": 1.104370844231557, "grad_norm": 47.18445587158203, "learning_rate": 3.982723205979675e-07, "logits/chosen": -1.212890625, "logits/rejected": -1.2003905773162842, "logps/chosen": -301.38751220703125, "logps/rejected": -311.1499938964844, "loss": 0.2051, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.223547339439392, "rewards/margins": 3.695605516433716, "rewards/rejected": -4.917187690734863, "step": 4380 }, { "epoch": 1.1068918791163773, "grad_norm": 26.98342514038086, "learning_rate": 3.976811408329721e-07, "logits/chosen": -1.1772339344024658, "logits/rejected": -1.1562378406524658, "logps/chosen": -285.75311279296875, "logps/rejected": -319.1312561035156, "loss": 0.2132, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.302832007408142, "rewards/margins": 3.7227540016174316, "rewards/rejected": -5.026757717132568, "step": 4390 }, { "epoch": 1.1094129140011975, "grad_norm": 18.257844924926758, "learning_rate": 3.9708868988593916e-07, "logits/chosen": -1.110937476158142, "logits/rejected": -1.16046142578125, "logps/chosen": -297.7406311035156, "logps/rejected": -309.64373779296875, "loss": 0.1726, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.378991723060608, "rewards/margins": 3.613525390625, "rewards/rejected": -4.991991996765137, "step": 4400 }, { "epoch": 1.1119339488860178, "grad_norm": 38.7391471862793, "learning_rate": 3.9649497285645673e-07, "logits/chosen": -1.2002990245819092, "logits/rejected": -1.1981627941131592, "logps/chosen": -318.8812561035156, "logps/rejected": -331.1875, "loss": 0.2149, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.6891753673553467, "rewards/margins": 3.5896973609924316, "rewards/rejected": -5.277734279632568, "step": 4410 }, { "epoch": 1.114454983770838, "grad_norm": 28.373977661132812, "learning_rate": 3.958999948550111e-07, "logits/chosen": -1.178857445716858, "logits/rejected": -1.144433617591858, "logps/chosen": -324.8812561035156, "logps/rejected": -305.4125061035156, "loss": 0.2874, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9126068353652954, "rewards/margins": 3.287841796875, "rewards/rejected": -5.19921875, "step": 4420 }, { "epoch": 1.116976018655658, "grad_norm": 28.570444107055664, "learning_rate": 3.9530376100294236e-07, "logits/chosen": -1.1751220226287842, "logits/rejected": -1.147436499595642, "logps/chosen": -301.73748779296875, "logps/rejected": -348.5562438964844, "loss": 0.2238, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.564123511314392, "rewards/margins": 3.5875000953674316, "rewards/rejected": -5.152539253234863, "step": 4430 }, { "epoch": 1.1194970535404785, "grad_norm": 36.46849822998047, "learning_rate": 3.9470627643240054e-07, "logits/chosen": -1.22021484375, "logits/rejected": -1.1673583984375, "logps/chosen": -291.09375, "logps/rejected": -312.8374938964844, "loss": 0.177, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.914196789264679, "rewards/margins": 3.430957078933716, "rewards/rejected": -4.343163967132568, "step": 4440 }, { "epoch": 1.1220180884252986, "grad_norm": 29.296356201171875, "learning_rate": 3.941075462863011e-07, "logits/chosen": -1.1571776866912842, "logits/rejected": -1.1372559070587158, "logps/chosen": -291.10626220703125, "logps/rejected": -315.5625, "loss": 0.1728, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.166351318359375, "rewards/margins": 3.57763671875, "rewards/rejected": -4.7451171875, "step": 4450 }, { "epoch": 1.1245391233101187, "grad_norm": 29.039588928222656, "learning_rate": 3.935075757182813e-07, "logits/chosen": -1.1462234258651733, "logits/rejected": -1.103796362876892, "logps/chosen": -271.5562438964844, "logps/rejected": -312.73126220703125, "loss": 0.2095, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5503631830215454, "rewards/margins": 3.8505859375, "rewards/rejected": -5.400390625, "step": 4460 }, { "epoch": 1.127060158194939, "grad_norm": 32.84434509277344, "learning_rate": 3.9290636989265536e-07, "logits/chosen": -1.152032494544983, "logits/rejected": -1.0812256336212158, "logps/chosen": -298.4624938964844, "logps/rejected": -316.11248779296875, "loss": 0.1636, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.385565161705017, "rewards/margins": 4.075976371765137, "rewards/rejected": -5.461230278015137, "step": 4470 }, { "epoch": 1.1295811930797592, "grad_norm": 43.07621765136719, "learning_rate": 3.923039339843699e-07, "logits/chosen": -1.156396508216858, "logits/rejected": -1.1776825189590454, "logps/chosen": -292.60626220703125, "logps/rejected": -299.4750061035156, "loss": 0.2127, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4523742198944092, "rewards/margins": 3.860107421875, "rewards/rejected": -5.315331935882568, "step": 4480 }, { "epoch": 1.1321022279645794, "grad_norm": 38.23931121826172, "learning_rate": 3.9170027317895993e-07, "logits/chosen": -1.113500952720642, "logits/rejected": -1.069494605064392, "logps/chosen": -313.70001220703125, "logps/rejected": -326.48126220703125, "loss": 0.1679, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.3317139148712158, "rewards/margins": 4.272363185882568, "rewards/rejected": -5.60205078125, "step": 4490 }, { "epoch": 1.1346232628493997, "grad_norm": 57.834754943847656, "learning_rate": 3.910953926725037e-07, "logits/chosen": -1.1160156726837158, "logits/rejected": -1.0078856945037842, "logps/chosen": -330.23126220703125, "logps/rejected": -323.35626220703125, "loss": 0.2029, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.3919098377227783, "rewards/margins": 3.94287109375, "rewards/rejected": -5.335253715515137, "step": 4500 }, { "epoch": 1.1371442977342199, "grad_norm": 44.31748580932617, "learning_rate": 3.904892976715783e-07, "logits/chosen": -1.203222632408142, "logits/rejected": -1.091040015220642, "logps/chosen": -323.2875061035156, "logps/rejected": -319.29376220703125, "loss": 0.2073, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.4412353038787842, "rewards/margins": 3.8412108421325684, "rewards/rejected": -5.28173828125, "step": 4510 }, { "epoch": 1.1396653326190402, "grad_norm": 29.34000587463379, "learning_rate": 3.898819933932146e-07, "logits/chosen": -1.1533203125, "logits/rejected": -1.1740233898162842, "logps/chosen": -334.1812438964844, "logps/rejected": -323.07501220703125, "loss": 0.2344, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.94696044921875, "rewards/margins": 3.8212890625, "rewards/rejected": -4.7689208984375, "step": 4520 }, { "epoch": 1.1421863675038604, "grad_norm": 45.65370559692383, "learning_rate": 3.8927348506485253e-07, "logits/chosen": -1.1566283702850342, "logits/rejected": -1.1243164539337158, "logps/chosen": -300.3812561035156, "logps/rejected": -317.2437438964844, "loss": 0.2256, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.33416748046875, "rewards/margins": 3.622729539871216, "rewards/rejected": -4.956250190734863, "step": 4530 }, { "epoch": 1.1447074023886805, "grad_norm": 51.753509521484375, "learning_rate": 3.8866377792429593e-07, "logits/chosen": -1.1453125476837158, "logits/rejected": -1.13385009765625, "logps/chosen": -289.14373779296875, "logps/rejected": -302.78125, "loss": 0.1984, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.309106469154358, "rewards/margins": 3.377490282058716, "rewards/rejected": -4.688086032867432, "step": 4540 }, { "epoch": 1.1472284372735007, "grad_norm": 51.0242919921875, "learning_rate": 3.880528772196677e-07, "logits/chosen": NaN, "logits/rejected": -1.122686743736267, "logps/chosen": -303.51873779296875, "logps/rejected": -315.7562561035156, "loss": 0.1893, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.9603027105331421, "rewards/margins": 3.603710889816284, "rewards/rejected": -4.563086032867432, "step": 4550 }, { "epoch": 1.149749472158321, "grad_norm": 64.95894622802734, "learning_rate": 3.8744078820936445e-07, "logits/chosen": -1.139245629310608, "logits/rejected": -1.1644775867462158, "logps/chosen": -307.64373779296875, "logps/rejected": -311.42498779296875, "loss": 0.2393, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6932830810546875, "rewards/margins": 3.6019043922424316, "rewards/rejected": -4.295312404632568, "step": 4560 }, { "epoch": 1.1522705070431412, "grad_norm": 39.9935188293457, "learning_rate": 3.8682751616201106e-07, "logits/chosen": -1.162573218345642, "logits/rejected": -1.1469237804412842, "logps/chosen": -265.61248779296875, "logps/rejected": -276.01251220703125, "loss": 0.2416, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.8093169927597046, "rewards/margins": 3.1261229515075684, "rewards/rejected": -3.933300733566284, "step": 4570 }, { "epoch": 1.1547915419279615, "grad_norm": 32.537593841552734, "learning_rate": 3.862130663564158e-07, "logits/chosen": -1.1119506359100342, "logits/rejected": -1.17852783203125, "logps/chosen": -279.26873779296875, "logps/rejected": -284.28125, "loss": 0.2043, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7371917963027954, "rewards/margins": 3.585644483566284, "rewards/rejected": -4.320776462554932, "step": 4580 }, { "epoch": 1.1573125768127817, "grad_norm": 34.01283645629883, "learning_rate": 3.855974440815244e-07, "logits/chosen": -1.1630370616912842, "logits/rejected": -1.180322289466858, "logps/chosen": -286.79376220703125, "logps/rejected": -328.3187561035156, "loss": 0.1801, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.8788086175918579, "rewards/margins": 3.6949219703674316, "rewards/rejected": -4.571972846984863, "step": 4590 }, { "epoch": 1.1598336116976018, "grad_norm": 39.68577575683594, "learning_rate": 3.8498065463637505e-07, "logits/chosen": -1.20361328125, "logits/rejected": -1.1982910633087158, "logps/chosen": -305.0, "logps/rejected": -313.8812561035156, "loss": 0.2171, "rewards/accuracies": 0.921875, "rewards/chosen": -0.7044006586074829, "rewards/margins": 3.478759765625, "rewards/rejected": -4.183569431304932, "step": 4600 }, { "epoch": 1.1623546465824222, "grad_norm": 49.09182357788086, "learning_rate": 3.843627033300521e-07, "logits/chosen": -1.240966796875, "logits/rejected": -1.192773461341858, "logps/chosen": -281.23748779296875, "logps/rejected": -282.0, "loss": 0.1685, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.22492066025733948, "rewards/margins": 3.614208936691284, "rewards/rejected": -3.8397459983825684, "step": 4610 }, { "epoch": 1.1648756814672423, "grad_norm": 31.07063102722168, "learning_rate": 3.83743595481641e-07, "logits/chosen": -1.22607421875, "logits/rejected": -1.106134057044983, "logps/chosen": -278.5687561035156, "logps/rejected": -294.0, "loss": 0.2021, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.831127941608429, "rewards/margins": 3.7333984375, "rewards/rejected": -4.563330173492432, "step": 4620 }, { "epoch": 1.1673967163520624, "grad_norm": 18.663665771484375, "learning_rate": 3.831233364201825e-07, "logits/chosen": -1.205297827720642, "logits/rejected": -1.162261962890625, "logps/chosen": -290.859375, "logps/rejected": -304.9624938964844, "loss": 0.1641, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.909899890422821, "rewards/margins": 3.749798536300659, "rewards/rejected": -4.66015625, "step": 4630 }, { "epoch": 1.1699177512368828, "grad_norm": 33.56493377685547, "learning_rate": 3.8250193148462583e-07, "logits/chosen": -1.1095702648162842, "logits/rejected": -1.103540062904358, "logps/chosen": -304.0, "logps/rejected": -307.58123779296875, "loss": 0.1681, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.443878173828125, "rewards/margins": 3.925585985183716, "rewards/rejected": -5.368456840515137, "step": 4640 }, { "epoch": 1.172438786121703, "grad_norm": 34.2917366027832, "learning_rate": 3.8187938602378413e-07, "logits/chosen": NaN, "logits/rejected": -1.0922362804412842, "logps/chosen": -283.5, "logps/rejected": -292.28125, "loss": 0.1955, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.5723145008087158, "rewards/margins": 3.814746141433716, "rewards/rejected": -5.384375095367432, "step": 4650 }, { "epoch": 1.174959821006523, "grad_norm": 15.947632789611816, "learning_rate": 3.812557053962875e-07, "logits/chosen": -1.072790503501892, "logits/rejected": -1.041162133216858, "logps/chosen": -307.6937561035156, "logps/rejected": -297.15625, "loss": 0.2179, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.70806884765625, "rewards/margins": 3.767627000808716, "rewards/rejected": -5.477734565734863, "step": 4660 }, { "epoch": 1.1774808558913434, "grad_norm": 40.98158264160156, "learning_rate": 3.8063089497053713e-07, "logits/chosen": -1.0684082508087158, "logits/rejected": -1.1115906238555908, "logps/chosen": -282.6499938964844, "logps/rejected": -317.4125061035156, "loss": 0.1868, "rewards/accuracies": 0.921875, "rewards/chosen": -1.52655029296875, "rewards/margins": 3.707080125808716, "rewards/rejected": -5.233300685882568, "step": 4670 }, { "epoch": 1.1800018907761636, "grad_norm": 41.0838623046875, "learning_rate": 3.80004960124659e-07, "logits/chosen": -1.1486327648162842, "logits/rejected": -1.0456054210662842, "logps/chosen": -295.26251220703125, "logps/rejected": -307.0375061035156, "loss": 0.1645, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.7212646007537842, "rewards/margins": 3.996386766433716, "rewards/rejected": -5.716406345367432, "step": 4680 }, { "epoch": 1.182522925660984, "grad_norm": 41.059024810791016, "learning_rate": 3.7937790624645776e-07, "logits/chosen": -1.188085913658142, "logits/rejected": -1.183569312095642, "logps/chosen": -320.6031188964844, "logps/rejected": -301.5, "loss": 0.2758, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.802056908607483, "rewards/margins": 3.351367235183716, "rewards/rejected": -5.156298637390137, "step": 4690 }, { "epoch": 1.185043960545804, "grad_norm": 39.89297866821289, "learning_rate": 3.7874973873337026e-07, "logits/chosen": -1.094244360923767, "logits/rejected": -1.1009094715118408, "logps/chosen": -320.04998779296875, "logps/rejected": -341.0375061035156, "loss": 0.2047, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5172607898712158, "rewards/margins": 3.868945360183716, "rewards/rejected": -5.386328220367432, "step": 4700 }, { "epoch": 1.1875649954306242, "grad_norm": 46.3644905090332, "learning_rate": 3.78120462992419e-07, "logits/chosen": -1.1583251953125, "logits/rejected": -1.1174805164337158, "logps/chosen": -300.0375061035156, "logps/rejected": -312.67498779296875, "loss": 0.2217, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.545629858970642, "rewards/margins": 3.8341307640075684, "rewards/rejected": -5.380078315734863, "step": 4710 }, { "epoch": 1.1900860303154446, "grad_norm": 27.166078567504883, "learning_rate": 3.774900844401657e-07, "logits/chosen": -1.1233947277069092, "logits/rejected": -1.1427733898162842, "logps/chosen": -300.64373779296875, "logps/rejected": -331.9750061035156, "loss": 0.1788, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.1992919445037842, "rewards/margins": 4.047656059265137, "rewards/rejected": -5.247754096984863, "step": 4720 }, { "epoch": 1.1926070652002647, "grad_norm": 34.90217971801758, "learning_rate": 3.768586085026648e-07, "logits/chosen": -1.188079833984375, "logits/rejected": -1.186242699623108, "logps/chosen": -292.1625061035156, "logps/rejected": -333.8062438964844, "loss": 0.2236, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3991210460662842, "rewards/margins": 4.101513862609863, "rewards/rejected": -5.499413967132568, "step": 4730 }, { "epoch": 1.1951281000850849, "grad_norm": 37.92390823364258, "learning_rate": 3.7622604061541646e-07, "logits/chosen": -1.107110619544983, "logits/rejected": -1.143762230873108, "logps/chosen": -290.91876220703125, "logps/rejected": -305.4624938964844, "loss": 0.1878, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.101324439048767, "rewards/margins": 3.680908203125, "rewards/rejected": -4.785742282867432, "step": 4740 }, { "epoch": 1.1976491349699052, "grad_norm": 48.947086334228516, "learning_rate": 3.755923862233199e-07, "logits/chosen": -1.1432373523712158, "logits/rejected": -1.1442382335662842, "logps/chosen": -269.4125061035156, "logps/rejected": -282.82501220703125, "loss": 0.2411, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.0486876964569092, "rewards/margins": 3.489990234375, "rewards/rejected": -4.5390625, "step": 4750 }, { "epoch": 1.2001701698547254, "grad_norm": 39.559715270996094, "learning_rate": 3.7495765078062653e-07, "logits/chosen": -1.140527367591858, "logits/rejected": -1.118432641029358, "logps/chosen": -300.84375, "logps/rejected": -302.29998779296875, "loss": 0.2145, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.0033690929412842, "rewards/margins": 3.4307618141174316, "rewards/rejected": -4.433789253234863, "step": 4760 }, { "epoch": 1.2026912047395455, "grad_norm": 49.91185760498047, "learning_rate": 3.7432183975089326e-07, "logits/chosen": -1.210205078125, "logits/rejected": -1.1429321765899658, "logps/chosen": -302.67498779296875, "logps/rejected": -301.375, "loss": 0.1675, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.054997205734253, "rewards/margins": 3.777148485183716, "rewards/rejected": -4.833691596984863, "step": 4770 }, { "epoch": 1.2052122396243659, "grad_norm": 41.0870475769043, "learning_rate": 3.7368495860693493e-07, "logits/chosen": -1.200048804283142, "logits/rejected": -1.1798095703125, "logps/chosen": -293.3812561035156, "logps/rejected": -326.7749938964844, "loss": 0.1553, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -0.9073547124862671, "rewards/margins": 4.201464653015137, "rewards/rejected": -5.106543064117432, "step": 4780 }, { "epoch": 1.207733274509186, "grad_norm": 60.1196403503418, "learning_rate": 3.730470128307778e-07, "logits/chosen": -1.1578384637832642, "logits/rejected": -1.140051245689392, "logps/chosen": -285.04376220703125, "logps/rejected": -319.7124938964844, "loss": 0.2429, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.4465453624725342, "rewards/margins": 3.917773485183716, "rewards/rejected": -5.362402439117432, "step": 4790 }, { "epoch": 1.2102543093940064, "grad_norm": 41.10110092163086, "learning_rate": 3.7240800791361176e-07, "logits/chosen": -1.1771361827850342, "logits/rejected": -1.1097290515899658, "logps/chosen": -304.59375, "logps/rejected": -302.625, "loss": 0.213, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.100122094154358, "rewards/margins": 4.222363471984863, "rewards/rejected": -5.322070121765137, "step": 4800 }, { "epoch": 1.2127753442788265, "grad_norm": 49.76337432861328, "learning_rate": 3.717679493557437e-07, "logits/chosen": -1.167504906654358, "logits/rejected": -1.1450592279434204, "logps/chosen": -295.32501220703125, "logps/rejected": -322.9750061035156, "loss": 0.2393, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.7194992303848267, "rewards/margins": 4.076074123382568, "rewards/rejected": -4.795702934265137, "step": 4810 }, { "epoch": 1.2152963791636466, "grad_norm": 35.84429168701172, "learning_rate": 3.7112684266654954e-07, "logits/chosen": -1.1913330554962158, "logits/rejected": -1.170629858970642, "logps/chosen": -299.88751220703125, "logps/rejected": -293.16876220703125, "loss": 0.2405, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5069335699081421, "rewards/margins": 3.823779344558716, "rewards/rejected": -4.329345703125, "step": 4820 }, { "epoch": 1.2178174140484668, "grad_norm": 39.85462188720703, "learning_rate": 3.7048469336442735e-07, "logits/chosen": -1.1529052257537842, "logits/rejected": -1.0819122791290283, "logps/chosen": -289.34375, "logps/rejected": -320.7749938964844, "loss": 0.1778, "rewards/accuracies": 0.921875, "rewards/chosen": -0.807940661907196, "rewards/margins": 4.0791015625, "rewards/rejected": -4.886620998382568, "step": 4830 }, { "epoch": 1.2203384489332871, "grad_norm": 50.39110565185547, "learning_rate": 3.698415069767494e-07, "logits/chosen": -1.2173583507537842, "logits/rejected": -1.1823852062225342, "logps/chosen": -303.1000061035156, "logps/rejected": -301.1000061035156, "loss": 0.2131, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1938186883926392, "rewards/margins": 3.8489990234375, "rewards/rejected": -5.044238090515137, "step": 4840 }, { "epoch": 1.2228594838181073, "grad_norm": 54.38093185424805, "learning_rate": 3.69197289039815e-07, "logits/chosen": -1.1825439929962158, "logits/rejected": -1.17578125, "logps/chosen": -293.3374938964844, "logps/rejected": -329.4750061035156, "loss": 0.2147, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0760619640350342, "rewards/margins": 3.711376905441284, "rewards/rejected": -4.787304878234863, "step": 4850 }, { "epoch": 1.2253805187029276, "grad_norm": 36.31102752685547, "learning_rate": 3.6855204509880243e-07, "logits/chosen": -1.126806616783142, "logits/rejected": -1.089715600013733, "logps/chosen": -289.5062561035156, "logps/rejected": -301.83123779296875, "loss": 0.1686, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2572510242462158, "rewards/margins": 4.262890815734863, "rewards/rejected": -5.521288871765137, "step": 4860 }, { "epoch": 1.2279015535877478, "grad_norm": 35.117347717285156, "learning_rate": 3.6790578070772166e-07, "logits/chosen": -1.216943383216858, "logits/rejected": -1.17559814453125, "logps/chosen": -285.9125061035156, "logps/rejected": -316.5625, "loss": 0.1947, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.0204894542694092, "rewards/margins": 4.046972751617432, "rewards/rejected": -5.066113471984863, "step": 4870 }, { "epoch": 1.230422588472568, "grad_norm": 59.419010162353516, "learning_rate": 3.672585014293661e-07, "logits/chosen": -1.1372191905975342, "logits/rejected": -1.0812866687774658, "logps/chosen": -314.4750061035156, "logps/rejected": -336.4375, "loss": 0.1868, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.260986328125, "rewards/margins": 4.205786228179932, "rewards/rejected": -5.466992378234863, "step": 4880 }, { "epoch": 1.2329436233573883, "grad_norm": 44.0726203918457, "learning_rate": 3.666102128352649e-07, "logits/chosen": -1.19561767578125, "logits/rejected": -1.107458472251892, "logps/chosen": -293.45001220703125, "logps/rejected": -290.95001220703125, "loss": 0.1788, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.756848156452179, "rewards/margins": 4.030517578125, "rewards/rejected": -4.79052734375, "step": 4890 }, { "epoch": 1.2354646582422084, "grad_norm": 71.85173034667969, "learning_rate": 3.6596092050563513e-07, "logits/chosen": -1.206689476966858, "logits/rejected": -1.11602783203125, "logps/chosen": -301.375, "logps/rejected": -335.359375, "loss": 0.2441, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9993346929550171, "rewards/margins": 3.8375487327575684, "rewards/rejected": -4.838183403015137, "step": 4900 }, { "epoch": 1.2379856931270286, "grad_norm": 37.4676399230957, "learning_rate": 3.653106300293336e-07, "logits/chosen": -1.1802246570587158, "logits/rejected": -1.162255883216858, "logps/chosen": -308.85626220703125, "logps/rejected": -314.5, "loss": 0.2388, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.76812744140625, "rewards/margins": 3.83447265625, "rewards/rejected": -4.603711128234863, "step": 4910 }, { "epoch": 1.240506728011849, "grad_norm": 31.14033317565918, "learning_rate": 3.6465934700380873e-07, "logits/chosen": -1.173559546470642, "logits/rejected": NaN, "logps/chosen": -314.70001220703125, "logps/rejected": -307.5874938964844, "loss": 0.1561, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.8431457281112671, "rewards/margins": 3.899951219558716, "rewards/rejected": -4.7431640625, "step": 4920 }, { "epoch": 1.243027762896669, "grad_norm": 35.24359893798828, "learning_rate": 3.640070770350524e-07, "logits/chosen": -1.168951392173767, "logits/rejected": -1.149084448814392, "logps/chosen": -298.35626220703125, "logps/rejected": -303.17498779296875, "loss": 0.183, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.4486083984375, "rewards/margins": 3.719006299972534, "rewards/rejected": -5.167870998382568, "step": 4930 }, { "epoch": 1.2455487977814892, "grad_norm": 73.50921630859375, "learning_rate": 3.633538257375519e-07, "logits/chosen": -1.2502930164337158, "logits/rejected": -1.245141625404358, "logps/chosen": -314.8500061035156, "logps/rejected": -315.08123779296875, "loss": 0.2339, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.270166039466858, "rewards/margins": 3.577587842941284, "rewards/rejected": -4.847826957702637, "step": 4940 }, { "epoch": 1.2480698326663096, "grad_norm": 43.814964294433594, "learning_rate": 3.626995987342412e-07, "logits/chosen": -1.182275414466858, "logits/rejected": -1.1380615234375, "logps/chosen": -288.65625, "logps/rejected": -313.6000061035156, "loss": 0.2684, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -0.8729751706123352, "rewards/margins": 3.7605957984924316, "rewards/rejected": -4.633496284484863, "step": 4950 }, { "epoch": 1.2505908675511297, "grad_norm": 54.624507904052734, "learning_rate": 3.620444016564528e-07, "logits/chosen": -1.147241234779358, "logits/rejected": -1.146948218345642, "logps/chosen": -284.2749938964844, "logps/rejected": -300.23126220703125, "loss": 0.228, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9929214715957642, "rewards/margins": 3.8893799781799316, "rewards/rejected": -4.886034965515137, "step": 4960 }, { "epoch": 1.25311190243595, "grad_norm": 52.134803771972656, "learning_rate": 3.6138824014386945e-07, "logits/chosen": -1.160058617591858, "logits/rejected": -1.15692138671875, "logps/chosen": -288.23126220703125, "logps/rejected": -314.20001220703125, "loss": 0.1854, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.8441619873046875, "rewards/margins": 4.047338962554932, "rewards/rejected": -4.894335746765137, "step": 4970 }, { "epoch": 1.2556329373207702, "grad_norm": 33.152957916259766, "learning_rate": 3.6073111984447497e-07, "logits/chosen": -1.1812744140625, "logits/rejected": -1.164697289466858, "logps/chosen": -303.39373779296875, "logps/rejected": -310.0, "loss": 0.2534, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.86151123046875, "rewards/margins": 3.7456297874450684, "rewards/rejected": -4.605175971984863, "step": 4980 }, { "epoch": 1.2581539722055903, "grad_norm": 32.13079071044922, "learning_rate": 3.600730464145064e-07, "logits/chosen": -1.172521948814392, "logits/rejected": -1.151885986328125, "logps/chosen": -304.4468688964844, "logps/rejected": -310.79998779296875, "loss": 0.2877, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.9182235598564148, "rewards/margins": 3.5555176734924316, "rewards/rejected": -4.47412109375, "step": 4990 }, { "epoch": 1.2606750070904107, "grad_norm": 25.800579071044922, "learning_rate": 3.594140255184048e-07, "logits/chosen": -1.1635010242462158, "logits/rejected": -1.1095459461212158, "logps/chosen": -269.71875, "logps/rejected": -281.1875, "loss": 0.2093, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.6491439938545227, "rewards/margins": 3.390625, "rewards/rejected": -4.040600776672363, "step": 5000 }, { "epoch": 1.2631960419752308, "grad_norm": 22.663894653320312, "learning_rate": 3.5875406282876676e-07, "logits/chosen": -1.180810570716858, "logits/rejected": -1.2098388671875, "logps/chosen": -306.36248779296875, "logps/rejected": -308.0874938964844, "loss": 0.2132, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.8019149899482727, "rewards/margins": 3.5633301734924316, "rewards/rejected": -4.363476753234863, "step": 5010 }, { "epoch": 1.265717076860051, "grad_norm": 29.97337532043457, "learning_rate": 3.5809316402629533e-07, "logits/chosen": -1.1132690906524658, "logits/rejected": -1.1020996570587158, "logps/chosen": -282.05938720703125, "logps/rejected": -291.78125, "loss": 0.2372, "rewards/accuracies": 0.890625, "rewards/chosen": -1.367584228515625, "rewards/margins": 3.456860303878784, "rewards/rejected": -4.825781345367432, "step": 5020 }, { "epoch": 1.2682381117448713, "grad_norm": 22.39763832092285, "learning_rate": 3.5743133479975137e-07, "logits/chosen": -1.1588592529296875, "logits/rejected": -1.1710937023162842, "logps/chosen": -303.28125, "logps/rejected": -323.33123779296875, "loss": 0.1932, "rewards/accuracies": 0.921875, "rewards/chosen": -1.229089379310608, "rewards/margins": 3.6304688453674316, "rewards/rejected": -4.860058784484863, "step": 5030 }, { "epoch": 1.2707591466296915, "grad_norm": 54.3851432800293, "learning_rate": 3.567685808459044e-07, "logits/chosen": -1.1513671875, "logits/rejected": -1.0747467279434204, "logps/chosen": -274.8374938964844, "logps/rejected": -301.0062561035156, "loss": 0.1905, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.5656249523162842, "rewards/margins": 3.6270508766174316, "rewards/rejected": -5.195508003234863, "step": 5040 }, { "epoch": 1.2732801815145116, "grad_norm": 41.404754638671875, "learning_rate": 3.5610490786948353e-07, "logits/chosen": -1.1798827648162842, "logits/rejected": -1.154943823814392, "logps/chosen": -297.7875061035156, "logps/rejected": -317.70623779296875, "loss": 0.1926, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4342010021209717, "rewards/margins": 3.7634034156799316, "rewards/rejected": -5.200585842132568, "step": 5050 }, { "epoch": 1.275801216399332, "grad_norm": 33.274147033691406, "learning_rate": 3.5544032158312883e-07, "logits/chosen": -1.1477782726287842, "logits/rejected": -1.0680663585662842, "logps/chosen": -295.86248779296875, "logps/rejected": -328.8500061035156, "loss": 0.1995, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.2218414545059204, "rewards/margins": 3.9695801734924316, "rewards/rejected": -5.192285060882568, "step": 5060 }, { "epoch": 1.2783222512841521, "grad_norm": 47.26895523071289, "learning_rate": 3.5477482770734137e-07, "logits/chosen": -1.1197388172149658, "logits/rejected": -1.0371825695037842, "logps/chosen": -276.1312561035156, "logps/rejected": -292.6000061035156, "loss": 0.1895, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4241516590118408, "rewards/margins": 3.542285203933716, "rewards/rejected": -4.967089653015137, "step": 5070 }, { "epoch": 1.2808432861689725, "grad_norm": 51.5677375793457, "learning_rate": 3.5410843197043454e-07, "logits/chosen": -1.1300780773162842, "logits/rejected": -1.0869140625, "logps/chosen": -307.4125061035156, "logps/rejected": -321.6187438964844, "loss": 0.2179, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.6571533679962158, "rewards/margins": 3.7133545875549316, "rewards/rejected": -5.372460842132568, "step": 5080 }, { "epoch": 1.2833643210537926, "grad_norm": 32.997013092041016, "learning_rate": 3.534411401084848e-07, "logits/chosen": -1.159387230873108, "logits/rejected": -1.157080054283142, "logps/chosen": -300.1312561035156, "logps/rejected": -300.98748779296875, "loss": 0.202, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5150359869003296, "rewards/margins": 3.515869140625, "rewards/rejected": -5.030077934265137, "step": 5090 }, { "epoch": 1.2858853559386128, "grad_norm": 54.570343017578125, "learning_rate": 3.5277295786528183e-07, "logits/chosen": -1.150732398033142, "logits/rejected": -1.17657470703125, "logps/chosen": -275.2124938964844, "logps/rejected": -297.0625, "loss": 0.1461, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.5175079107284546, "rewards/margins": 4.005663871765137, "rewards/rejected": -5.522558689117432, "step": 5100 }, { "epoch": 1.288406390823433, "grad_norm": 35.95811080932617, "learning_rate": 3.521038909922794e-07, "logits/chosen": -1.0597655773162842, "logits/rejected": -1.049890160560608, "logps/chosen": -281.54998779296875, "logps/rejected": -328.9437561035156, "loss": 0.2522, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8772491216659546, "rewards/margins": 3.99951171875, "rewards/rejected": -5.873827934265137, "step": 5110 }, { "epoch": 1.2909274257082533, "grad_norm": 40.40537643432617, "learning_rate": 3.5143394524854613e-07, "logits/chosen": -1.1229889392852783, "logits/rejected": -1.109014868736267, "logps/chosen": -314.7875061035156, "logps/rejected": -323.8125, "loss": 0.2066, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.917639136314392, "rewards/margins": 3.8738770484924316, "rewards/rejected": -5.791015625, "step": 5120 }, { "epoch": 1.2934484605930734, "grad_norm": 20.03754425048828, "learning_rate": 3.5076312640071515e-07, "logits/chosen": -1.137481689453125, "logits/rejected": -1.1807434558868408, "logps/chosen": -308.21875, "logps/rejected": -331.3500061035156, "loss": 0.1823, "rewards/accuracies": 0.9375, "rewards/chosen": -2.027453660964966, "rewards/margins": 4.025097846984863, "rewards/rejected": -6.052636623382568, "step": 5130 }, { "epoch": 1.2959694954778938, "grad_norm": 44.395782470703125, "learning_rate": 3.5009144022293533e-07, "logits/chosen": -1.202172875404358, "logits/rejected": -1.1327636241912842, "logps/chosen": -336.58123779296875, "logps/rejected": -338.54998779296875, "loss": 0.2106, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.85870361328125, "rewards/margins": 4.142480373382568, "rewards/rejected": -6.0029296875, "step": 5140 }, { "epoch": 1.298490530362714, "grad_norm": 68.62153625488281, "learning_rate": 3.4941889249682095e-07, "logits/chosen": -1.1988646984100342, "logits/rejected": -1.1822509765625, "logps/chosen": -312.5249938964844, "logps/rejected": -334.26251220703125, "loss": 0.1575, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -2.0000457763671875, "rewards/margins": 4.140576362609863, "rewards/rejected": -6.140234470367432, "step": 5150 }, { "epoch": 1.301011565247534, "grad_norm": 34.79885482788086, "learning_rate": 3.487454890114023e-07, "logits/chosen": -1.132080078125, "logits/rejected": -1.1778686046600342, "logps/chosen": -294.10626220703125, "logps/rejected": -334.5, "loss": 0.1483, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -1.72509765625, "rewards/margins": 4.214453220367432, "rewards/rejected": -5.939648628234863, "step": 5160 }, { "epoch": 1.3035326001323544, "grad_norm": 42.98704147338867, "learning_rate": 3.480712355630757e-07, "logits/chosen": -1.1443359851837158, "logits/rejected": -1.146215796470642, "logps/chosen": -325.26251220703125, "logps/rejected": -336.8125, "loss": 0.2341, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.732019066810608, "rewards/margins": 3.8372559547424316, "rewards/rejected": -5.572070121765137, "step": 5170 }, { "epoch": 1.3060536350171745, "grad_norm": 29.974592208862305, "learning_rate": 3.4739613795555345e-07, "logits/chosen": -1.1156005859375, "logits/rejected": -1.0713622570037842, "logps/chosen": -295.01873779296875, "logps/rejected": -321.1875, "loss": 0.2239, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9411742687225342, "rewards/margins": 3.8954100608825684, "rewards/rejected": -5.833593845367432, "step": 5180 }, { "epoch": 1.308574669901995, "grad_norm": 24.7445068359375, "learning_rate": 3.4672020199981414e-07, "logits/chosen": -1.097814917564392, "logits/rejected": -1.0927612781524658, "logps/chosen": -301.0093688964844, "logps/rejected": -303.35626220703125, "loss": 0.1724, "rewards/accuracies": 0.90625, "rewards/chosen": -1.75927734375, "rewards/margins": 3.8087401390075684, "rewards/rejected": -5.568163871765137, "step": 5190 }, { "epoch": 1.311095704786815, "grad_norm": 58.359310150146484, "learning_rate": 3.4604343351405276e-07, "logits/chosen": -1.1069214344024658, "logits/rejected": -1.1426513195037842, "logps/chosen": -315.3812561035156, "logps/rejected": -330.57501220703125, "loss": 0.2514, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.066546678543091, "rewards/margins": 3.8038086891174316, "rewards/rejected": -5.871679782867432, "step": 5200 }, { "epoch": 1.3136167396716352, "grad_norm": 49.104549407958984, "learning_rate": 3.4536583832363e-07, "logits/chosen": -1.120825171470642, "logits/rejected": -1.0521240234375, "logps/chosen": -270.88751220703125, "logps/rejected": -318.5249938964844, "loss": 0.2491, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.7755858898162842, "rewards/margins": 3.6437010765075684, "rewards/rejected": -5.419921875, "step": 5210 }, { "epoch": 1.3161377745564553, "grad_norm": 23.951229095458984, "learning_rate": 3.4468742226102285e-07, "logits/chosen": -1.1461303234100342, "logits/rejected": -1.1341063976287842, "logps/chosen": -278.4437561035156, "logps/rejected": -297.8999938964844, "loss": 0.1853, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.2996337413787842, "rewards/margins": 3.734326124191284, "rewards/rejected": -5.033398628234863, "step": 5220 }, { "epoch": 1.3186588094412757, "grad_norm": 29.159059524536133, "learning_rate": 3.44008191165774e-07, "logits/chosen": -1.1116211414337158, "logits/rejected": -1.099633812904358, "logps/chosen": -313.6312561035156, "logps/rejected": -331.13751220703125, "loss": 0.2361, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.46197509765625, "rewards/margins": 3.6006836891174316, "rewards/rejected": -5.063086032867432, "step": 5230 }, { "epoch": 1.3211798443260958, "grad_norm": 37.64070510864258, "learning_rate": 3.4332815088444126e-07, "logits/chosen": -1.108544945716858, "logits/rejected": -1.1564819812774658, "logps/chosen": -308.5375061035156, "logps/rejected": -323.32501220703125, "loss": 0.1673, "rewards/accuracies": 0.9375, "rewards/chosen": -1.8876953125, "rewards/margins": 3.787109375, "rewards/rejected": -5.674023628234863, "step": 5240 }, { "epoch": 1.3237008792109162, "grad_norm": 20.202035903930664, "learning_rate": 3.4264730727054813e-07, "logits/chosen": -1.107452392578125, "logits/rejected": -1.107476830482483, "logps/chosen": -297.9937438964844, "logps/rejected": -323.1000061035156, "loss": 0.2111, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.939056396484375, "rewards/margins": 3.8095703125, "rewards/rejected": -5.746289253234863, "step": 5250 }, { "epoch": 1.3262219140957363, "grad_norm": 47.521690368652344, "learning_rate": 3.4196566618453236e-07, "logits/chosen": -1.03961181640625, "logits/rejected": -1.011254906654358, "logps/chosen": -304.03125, "logps/rejected": -335.6875, "loss": 0.207, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5875823497772217, "rewards/margins": 3.8901000022888184, "rewards/rejected": -5.474413871765137, "step": 5260 }, { "epoch": 1.3287429489805564, "grad_norm": 57.11899185180664, "learning_rate": 3.4128323349369657e-07, "logits/chosen": -1.1620604991912842, "logits/rejected": -1.0976440906524658, "logps/chosen": -332.54998779296875, "logps/rejected": -335.1875, "loss": 0.1966, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4975097179412842, "rewards/margins": 4.015295505523682, "rewards/rejected": -5.511816501617432, "step": 5270 }, { "epoch": 1.3312639838653768, "grad_norm": 71.34944915771484, "learning_rate": 3.4060001507215675e-07, "logits/chosen": -1.1159179210662842, "logits/rejected": -1.084747314453125, "logps/chosen": -307.73748779296875, "logps/rejected": -319.57501220703125, "loss": 0.2152, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.62872314453125, "rewards/margins": 3.6752686500549316, "rewards/rejected": -5.306445121765137, "step": 5280 }, { "epoch": 1.333785018750197, "grad_norm": 29.985002517700195, "learning_rate": 3.399160168007924e-07, "logits/chosen": -1.1326415538787842, "logits/rejected": -1.0793578624725342, "logps/chosen": -302.38751220703125, "logps/rejected": -297.15625, "loss": 0.1693, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.788507103919983, "rewards/margins": 3.4935059547424316, "rewards/rejected": -5.283789157867432, "step": 5290 }, { "epoch": 1.3363060536350173, "grad_norm": 28.097501754760742, "learning_rate": 3.392312445671957e-07, "logits/chosen": -1.118872046470642, "logits/rejected": -1.0843017101287842, "logps/chosen": -320.6625061035156, "logps/rejected": -348.6625061035156, "loss": 0.2394, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.546484351158142, "rewards/margins": 3.837890625, "rewards/rejected": -5.383008003234863, "step": 5300 }, { "epoch": 1.3388270885198374, "grad_norm": 49.60430145263672, "learning_rate": 3.385457042656206e-07, "logits/chosen": -1.1826660633087158, "logits/rejected": -1.0979735851287842, "logps/chosen": -320.4437561035156, "logps/rejected": -331.0562438964844, "loss": 0.2391, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.9879882335662842, "rewards/margins": 3.7488770484924316, "rewards/rejected": -5.738965034484863, "step": 5310 }, { "epoch": 1.3413481234046576, "grad_norm": 53.35076141357422, "learning_rate": 3.378594017969324e-07, "logits/chosen": -1.087255835533142, "logits/rejected": -1.0226256847381592, "logps/chosen": -296.01873779296875, "logps/rejected": -303.0874938964844, "loss": 0.2151, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -2.108654737472534, "rewards/margins": 3.589794874191284, "rewards/rejected": -5.699804782867432, "step": 5320 }, { "epoch": 1.3438691582894777, "grad_norm": 70.57498931884766, "learning_rate": 3.3717234306855686e-07, "logits/chosen": -1.175225853919983, "logits/rejected": -1.122869849205017, "logps/chosen": -318.7718811035156, "logps/rejected": -327.54376220703125, "loss": 0.2535, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4980590343475342, "rewards/margins": 3.7567381858825684, "rewards/rejected": -5.253808498382568, "step": 5330 }, { "epoch": 1.346390193174298, "grad_norm": 49.551551818847656, "learning_rate": 3.364845339944292e-07, "logits/chosen": NaN, "logits/rejected": -1.133337378501892, "logps/chosen": -308.70001220703125, "logps/rejected": -317.11248779296875, "loss": 0.2449, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.0967658758163452, "rewards/margins": 3.7632813453674316, "rewards/rejected": -4.859667778015137, "step": 5340 }, { "epoch": 1.3489112280591182, "grad_norm": 41.36616134643555, "learning_rate": 3.357959804949435e-07, "logits/chosen": NaN, "logits/rejected": -1.105920433998108, "logps/chosen": -273.4437561035156, "logps/rejected": -288.01251220703125, "loss": 0.1594, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -0.784588634967804, "rewards/margins": 3.78125, "rewards/rejected": -4.5654296875, "step": 5350 }, { "epoch": 1.3514322629439386, "grad_norm": 60.294315338134766, "learning_rate": 3.3510668849690155e-07, "logits/chosen": -1.1477782726287842, "logits/rejected": -1.159887671470642, "logps/chosen": -282.60626220703125, "logps/rejected": -291.04998779296875, "loss": 0.2097, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.16192626953125, "rewards/margins": 3.8250489234924316, "rewards/rejected": -4.98779296875, "step": 5360 }, { "epoch": 1.3539532978287587, "grad_norm": 48.605125427246094, "learning_rate": 3.3441666393346167e-07, "logits/chosen": -1.1647217273712158, "logits/rejected": -1.057653784751892, "logps/chosen": -293.56561279296875, "logps/rejected": -302.9375, "loss": 0.1869, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.273126244544983, "rewards/margins": 4.179394721984863, "rewards/rejected": -5.451074123382568, "step": 5370 }, { "epoch": 1.3564743327135789, "grad_norm": 30.89805030822754, "learning_rate": 3.33725912744088e-07, "logits/chosen": -1.164605736732483, "logits/rejected": -1.119970679283142, "logps/chosen": -290.17498779296875, "logps/rejected": -314.85626220703125, "loss": 0.2443, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.1137816905975342, "rewards/margins": 3.905468702316284, "rewards/rejected": -5.019629001617432, "step": 5380 }, { "epoch": 1.358995367598399, "grad_norm": 51.46559524536133, "learning_rate": 3.330344408744992e-07, "logits/chosen": -1.0963866710662842, "logits/rejected": -1.160302758216858, "logps/chosen": -260.2875061035156, "logps/rejected": -294.32501220703125, "loss": 0.2679, "rewards/accuracies": 0.90625, "rewards/chosen": -1.488922119140625, "rewards/margins": 3.6673340797424316, "rewards/rejected": -5.156884670257568, "step": 5390 }, { "epoch": 1.3615164024832194, "grad_norm": 70.4235610961914, "learning_rate": 3.3234225427661697e-07, "logits/chosen": -1.0990784168243408, "logits/rejected": -1.1343505382537842, "logps/chosen": -303.73748779296875, "logps/rejected": -290.33123779296875, "loss": 0.3044, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.653594970703125, "rewards/margins": 3.6046385765075684, "rewards/rejected": -5.256640434265137, "step": 5400 }, { "epoch": 1.3640374373680395, "grad_norm": 63.168914794921875, "learning_rate": 3.316493589085155e-07, "logits/chosen": -1.148919701576233, "logits/rejected": -1.170166015625, "logps/chosen": -284.45001220703125, "logps/rejected": -297.0, "loss": 0.249, "rewards/accuracies": 0.890625, "rewards/chosen": -1.8449890613555908, "rewards/margins": 3.6565918922424316, "rewards/rejected": -5.50146484375, "step": 5410 }, { "epoch": 1.3665584722528599, "grad_norm": 77.1202392578125, "learning_rate": 3.3095576073436964e-07, "logits/chosen": -1.07568359375, "logits/rejected": -1.0973358154296875, "logps/chosen": -298.73126220703125, "logps/rejected": -321.7875061035156, "loss": 0.3114, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.070343017578125, "rewards/margins": 3.6612792015075684, "rewards/rejected": -5.731249809265137, "step": 5420 }, { "epoch": 1.36907950713768, "grad_norm": 43.10295867919922, "learning_rate": 3.3026146572440366e-07, "logits/chosen": -1.143774390220642, "logits/rejected": -1.1075561046600342, "logps/chosen": -306.2875061035156, "logps/rejected": -294.3999938964844, "loss": 0.214, "rewards/accuracies": 0.90625, "rewards/chosen": -1.480169653892517, "rewards/margins": 3.926074266433716, "rewards/rejected": -5.407031059265137, "step": 5430 }, { "epoch": 1.3716005420225001, "grad_norm": 49.544525146484375, "learning_rate": 3.295664798548401e-07, "logits/chosen": -1.1381804943084717, "logits/rejected": -1.108007788658142, "logps/chosen": -302.20001220703125, "logps/rejected": -317.4125061035156, "loss": 0.1675, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.933416724205017, "rewards/margins": 4.133349418640137, "rewards/rejected": -6.068359375, "step": 5440 }, { "epoch": 1.3741215769073205, "grad_norm": 52.61790466308594, "learning_rate": 3.288708091078479e-07, "logits/chosen": -1.137231469154358, "logits/rejected": -1.139685034751892, "logps/chosen": -301.1000061035156, "logps/rejected": -322.0249938964844, "loss": 0.2209, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.884552001953125, "rewards/margins": 3.8568358421325684, "rewards/rejected": -5.739648342132568, "step": 5450 }, { "epoch": 1.3766426117921406, "grad_norm": 32.10395431518555, "learning_rate": 3.281744594714914e-07, "logits/chosen": -1.072509765625, "logits/rejected": -1.027063012123108, "logps/chosen": -343.3999938964844, "logps/rejected": -319.8500061035156, "loss": 0.1719, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.499053955078125, "rewards/margins": 4.1962890625, "rewards/rejected": -5.696875095367432, "step": 5460 }, { "epoch": 1.379163646676961, "grad_norm": 52.091915130615234, "learning_rate": 3.274774369396783e-07, "logits/chosen": -1.142431616783142, "logits/rejected": -1.1350829601287842, "logps/chosen": -294.6312561035156, "logps/rejected": -319.4375, "loss": 0.2173, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5295836925506592, "rewards/margins": 3.746264696121216, "rewards/rejected": -5.27783203125, "step": 5470 }, { "epoch": 1.3816846815617811, "grad_norm": 56.385223388671875, "learning_rate": 3.267797475121087e-07, "logits/chosen": -1.1925048828125, "logits/rejected": -1.196374535560608, "logps/chosen": -277.29376220703125, "logps/rejected": -308.82501220703125, "loss": 0.1981, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.1184570789337158, "rewards/margins": 3.901611328125, "rewards/rejected": -5.021679878234863, "step": 5480 }, { "epoch": 1.3842057164466013, "grad_norm": 32.52472686767578, "learning_rate": 3.260813971942226e-07, "logits/chosen": -1.22039794921875, "logits/rejected": -1.18621826171875, "logps/chosen": -308.4750061035156, "logps/rejected": -336.9937438964844, "loss": 0.2595, "rewards/accuracies": 0.90625, "rewards/chosen": -0.998486340045929, "rewards/margins": 3.873974561691284, "rewards/rejected": -4.875781059265137, "step": 5490 }, { "epoch": 1.3867267513314214, "grad_norm": 23.37797737121582, "learning_rate": 3.2538239199714917e-07, "logits/chosen": -1.09912109375, "logits/rejected": -1.102990746498108, "logps/chosen": -266.57501220703125, "logps/rejected": -297.4125061035156, "loss": 0.2051, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5279052257537842, "rewards/margins": 3.7853760719299316, "rewards/rejected": -5.312890529632568, "step": 5500 }, { "epoch": 1.3892477862162418, "grad_norm": 31.321426391601562, "learning_rate": 3.246827379376542e-07, "logits/chosen": -1.1439940929412842, "logits/rejected": -1.192895531654358, "logps/chosen": -342.1000061035156, "logps/rejected": -360.1875, "loss": 0.1509, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.5791015625, "rewards/margins": 4.104394435882568, "rewards/rejected": -5.683203220367432, "step": 5510 }, { "epoch": 1.391768821101062, "grad_norm": 33.14871597290039, "learning_rate": 3.239824410380888e-07, "logits/chosen": -1.122247338294983, "logits/rejected": -1.107336401939392, "logps/chosen": -293.85626220703125, "logps/rejected": -302.35626220703125, "loss": 0.1789, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7326171398162842, "rewards/margins": 3.7352538108825684, "rewards/rejected": -5.467382907867432, "step": 5520 }, { "epoch": 1.3942898559858823, "grad_norm": 47.060447692871094, "learning_rate": 3.232815073263372e-07, "logits/chosen": -1.175439476966858, "logits/rejected": -1.114465355873108, "logps/chosen": -300.63751220703125, "logps/rejected": -323.3125, "loss": 0.2601, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.6728515625, "rewards/margins": 3.818798780441284, "rewards/rejected": -5.490234375, "step": 5530 }, { "epoch": 1.3968108908707024, "grad_norm": 59.03270721435547, "learning_rate": 3.225799428357652e-07, "logits/chosen": -1.1072266101837158, "logits/rejected": -1.101220726966858, "logps/chosen": -297.4125061035156, "logps/rejected": -321.3062438964844, "loss": 0.2506, "rewards/accuracies": 0.90625, "rewards/chosen": -2.2929320335388184, "rewards/margins": 3.776440382003784, "rewards/rejected": -6.070703029632568, "step": 5540 }, { "epoch": 1.3993319257555226, "grad_norm": 29.274913787841797, "learning_rate": 3.2187775360516827e-07, "logits/chosen": -1.188848853111267, "logits/rejected": -1.0924255847930908, "logps/chosen": -338.9375, "logps/rejected": -330.6000061035156, "loss": 0.2505, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.454846143722534, "rewards/margins": 3.9281249046325684, "rewards/rejected": -6.381445407867432, "step": 5550 }, { "epoch": 1.401852960640343, "grad_norm": 27.96847915649414, "learning_rate": 3.2117494567871914e-07, "logits/chosen": -1.184326171875, "logits/rejected": -1.112554907798767, "logps/chosen": -298.70001220703125, "logps/rejected": -316.46875, "loss": 0.1937, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.985376000404358, "rewards/margins": 3.8896484375, "rewards/rejected": -5.876367092132568, "step": 5560 }, { "epoch": 1.404373995525163, "grad_norm": 45.046634674072266, "learning_rate": 3.20471525105916e-07, "logits/chosen": -1.160980224609375, "logits/rejected": -1.121740698814392, "logps/chosen": -296.8062438964844, "logps/rejected": -306.4125061035156, "loss": 0.1803, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.79638671875, "rewards/margins": 4.122363090515137, "rewards/rejected": -5.918359279632568, "step": 5570 }, { "epoch": 1.4068950304099834, "grad_norm": 58.81211853027344, "learning_rate": 3.197674979415308e-07, "logits/chosen": -1.2167479991912842, "logits/rejected": -1.166601538658142, "logps/chosen": -328.6312561035156, "logps/rejected": -336.8500061035156, "loss": 0.1951, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.554895043373108, "rewards/margins": 4.173193454742432, "rewards/rejected": -5.726758003234863, "step": 5580 }, { "epoch": 1.4094160652948036, "grad_norm": 48.055259704589844, "learning_rate": 3.190628702455565e-07, "logits/chosen": -1.20501708984375, "logits/rejected": -1.149023413658142, "logps/chosen": -309.70623779296875, "logps/rejected": -308.89373779296875, "loss": 0.2834, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.6253143548965454, "rewards/margins": 3.464245557785034, "rewards/rejected": -5.089062690734863, "step": 5590 }, { "epoch": 1.4119371001796237, "grad_norm": 25.23592185974121, "learning_rate": 3.183576480831551e-07, "logits/chosen": -1.10205078125, "logits/rejected": -1.1086914539337158, "logps/chosen": -293.6875, "logps/rejected": -317.8374938964844, "loss": 0.2192, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4288756847381592, "rewards/margins": 3.617626905441284, "rewards/rejected": -5.044140815734863, "step": 5600 }, { "epoch": 1.4144581350644438, "grad_norm": 32.868778228759766, "learning_rate": 3.17651837524606e-07, "logits/chosen": -1.1915283203125, "logits/rejected": -1.1032593250274658, "logps/chosen": -302.6625061035156, "logps/rejected": -319.7875061035156, "loss": 0.2123, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.565393090248108, "rewards/margins": 3.9247803688049316, "rewards/rejected": -5.489062309265137, "step": 5610 }, { "epoch": 1.4169791699492642, "grad_norm": 56.18616485595703, "learning_rate": 3.1694544464525274e-07, "logits/chosen": -1.14697265625, "logits/rejected": -1.1057250499725342, "logps/chosen": -292.6625061035156, "logps/rejected": -316.76251220703125, "loss": 0.1597, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.793402075767517, "rewards/margins": 3.890576124191284, "rewards/rejected": -5.682226657867432, "step": 5620 }, { "epoch": 1.4195002048340843, "grad_norm": 20.569725036621094, "learning_rate": 3.162384755254517e-07, "logits/chosen": -1.1304199695587158, "logits/rejected": -1.0923950672149658, "logps/chosen": -304.5687561035156, "logps/rejected": -316.8500061035156, "loss": 0.1808, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.7995116710662842, "rewards/margins": 4.0400390625, "rewards/rejected": -5.840039253234863, "step": 5630 }, { "epoch": 1.4220212397189047, "grad_norm": 46.42686462402344, "learning_rate": 3.155309362505191e-07, "logits/chosen": -1.080780029296875, "logits/rejected": -1.074194312095642, "logps/chosen": -266.5093688964844, "logps/rejected": -285.66876220703125, "loss": 0.2078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.6823608875274658, "rewards/margins": 3.513378858566284, "rewards/rejected": -5.196484565734863, "step": 5640 }, { "epoch": 1.4245422746037248, "grad_norm": 38.89899444580078, "learning_rate": 3.1482283291067886e-07, "logits/chosen": -1.1553223133087158, "logits/rejected": -1.1325562000274658, "logps/chosen": -284.4312438964844, "logps/rejected": -292.66876220703125, "loss": 0.1991, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.1062500476837158, "rewards/margins": 3.6515870094299316, "rewards/rejected": -4.758105278015137, "step": 5650 }, { "epoch": 1.427063309488545, "grad_norm": 31.15665054321289, "learning_rate": 3.141141716010101e-07, "logits/chosen": -1.1189086437225342, "logits/rejected": -1.129052758216858, "logps/chosen": -307.15625, "logps/rejected": -305.88751220703125, "loss": 0.168, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -0.9198631048202515, "rewards/margins": 4.07763671875, "rewards/rejected": -4.99609375, "step": 5660 }, { "epoch": 1.4295843443733653, "grad_norm": 60.87965393066406, "learning_rate": 3.134049584213949e-07, "logits/chosen": -1.1614501476287842, "logits/rejected": -1.225744605064392, "logps/chosen": -304.1000061035156, "logps/rejected": -345.1875, "loss": 0.2647, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2272918224334717, "rewards/margins": 3.766308546066284, "rewards/rejected": -4.992089748382568, "step": 5670 }, { "epoch": 1.4321053792581855, "grad_norm": 38.806068420410156, "learning_rate": 3.1269519947646534e-07, "logits/chosen": -1.163183569908142, "logits/rejected": -1.1135742664337158, "logps/chosen": -280.0874938964844, "logps/rejected": -322.57501220703125, "loss": 0.2206, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.7157471179962158, "rewards/margins": 3.8421387672424316, "rewards/rejected": -5.557909965515137, "step": 5680 }, { "epoch": 1.4346264141430056, "grad_norm": 22.673965454101562, "learning_rate": 3.119849008755515e-07, "logits/chosen": -1.2099120616912842, "logits/rejected": -1.2332274913787842, "logps/chosen": -338.4437561035156, "logps/rejected": -329.23748779296875, "loss": 0.1861, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4517090320587158, "rewards/margins": 4.054003715515137, "rewards/rejected": -5.509081840515137, "step": 5690 }, { "epoch": 1.437147449027826, "grad_norm": 25.874675750732422, "learning_rate": 3.112740687326286e-07, "logits/chosen": -1.1012604236602783, "logits/rejected": -1.0789794921875, "logps/chosen": -293.23126220703125, "logps/rejected": -304.1937561035156, "loss": 0.2343, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.7524551153182983, "rewards/margins": 3.746533155441284, "rewards/rejected": -5.501562595367432, "step": 5700 }, { "epoch": 1.4396684839126461, "grad_norm": 53.42656326293945, "learning_rate": 3.105627091662641e-07, "logits/chosen": -1.1712768077850342, "logits/rejected": -1.115148901939392, "logps/chosen": -280.15625, "logps/rejected": -308.10626220703125, "loss": 0.1776, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.9433562755584717, "rewards/margins": 3.83447265625, "rewards/rejected": -5.779492378234863, "step": 5710 }, { "epoch": 1.4421895187974663, "grad_norm": 34.40613555908203, "learning_rate": 3.098508282995657e-07, "logits/chosen": -1.130029320716858, "logits/rejected": -1.1058349609375, "logps/chosen": -309.9750061035156, "logps/rejected": -338.5375061035156, "loss": 0.2043, "rewards/accuracies": 0.921875, "rewards/chosen": -2.000903367996216, "rewards/margins": 4.115283012390137, "rewards/rejected": -6.117773532867432, "step": 5720 }, { "epoch": 1.4447105536822866, "grad_norm": 22.414045333862305, "learning_rate": 3.091384322601279e-07, "logits/chosen": -1.1539306640625, "logits/rejected": -1.146875023841858, "logps/chosen": -310.3812561035156, "logps/rejected": -346.7124938964844, "loss": 0.2165, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.7687591314315796, "rewards/margins": 4.116943359375, "rewards/rejected": -5.885546684265137, "step": 5730 }, { "epoch": 1.4472315885671068, "grad_norm": 59.29484176635742, "learning_rate": 3.0842552717998e-07, "logits/chosen": -1.112634301185608, "logits/rejected": -1.109826683998108, "logps/chosen": -274.4125061035156, "logps/rejected": -334.125, "loss": 0.2335, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -2.154559373855591, "rewards/margins": 3.826464891433716, "rewards/rejected": -5.981054782867432, "step": 5740 }, { "epoch": 1.4497526234519271, "grad_norm": 38.43928909301758, "learning_rate": 3.077121191955324e-07, "logits/chosen": -1.1369812488555908, "logits/rejected": -1.123291015625, "logps/chosen": -316.89373779296875, "logps/rejected": -323.0874938964844, "loss": 0.1646, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.9062316417694092, "rewards/margins": 4.0869140625, "rewards/rejected": -5.992578029632568, "step": 5750 }, { "epoch": 1.4522736583367473, "grad_norm": 40.83302307128906, "learning_rate": 3.0699821444752484e-07, "logits/chosen": -1.1530029773712158, "logits/rejected": -1.1438720226287842, "logps/chosen": -311.38751220703125, "logps/rejected": -322.8125, "loss": 0.2641, "rewards/accuracies": 0.890625, "rewards/chosen": -2.0092499256134033, "rewards/margins": 3.8388671875, "rewards/rejected": -5.847754001617432, "step": 5760 }, { "epoch": 1.4547946932215674, "grad_norm": 37.13199234008789, "learning_rate": 3.062838190809727e-07, "logits/chosen": -1.154150366783142, "logits/rejected": -1.152459740638733, "logps/chosen": -329.17498779296875, "logps/rejected": -331.86248779296875, "loss": 0.1963, "rewards/accuracies": 0.921875, "rewards/chosen": -1.7770264148712158, "rewards/margins": 4.0390625, "rewards/rejected": -5.81640625, "step": 5770 }, { "epoch": 1.4573157281063875, "grad_norm": 69.80667114257812, "learning_rate": 3.055689392451144e-07, "logits/chosen": -1.1111571788787842, "logits/rejected": -1.1268310546875, "logps/chosen": -282.5062561035156, "logps/rejected": -317.53125, "loss": 0.2322, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.979040503501892, "rewards/margins": 3.720410108566284, "rewards/rejected": -5.700097560882568, "step": 5780 }, { "epoch": 1.459836762991208, "grad_norm": 52.87074661254883, "learning_rate": 3.0485358109335875e-07, "logits/chosen": -1.127844214439392, "logits/rejected": -1.0900757312774658, "logps/chosen": -289.6000061035156, "logps/rejected": -306.3999938964844, "loss": 0.1803, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.8661377429962158, "rewards/margins": 4.056787014007568, "rewards/rejected": -5.922265529632568, "step": 5790 }, { "epoch": 1.462357797876028, "grad_norm": 54.009159088134766, "learning_rate": 3.041377507832313e-07, "logits/chosen": -1.1489989757537842, "logits/rejected": -1.1700439453125, "logps/chosen": -316.9624938964844, "logps/rejected": -341.65625, "loss": 0.2669, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.86407470703125, "rewards/margins": 3.7484374046325684, "rewards/rejected": -5.61181640625, "step": 5800 }, { "epoch": 1.4648788327608484, "grad_norm": 85.05703735351562, "learning_rate": 3.034214544763223e-07, "logits/chosen": -1.185888648033142, "logits/rejected": -1.1497223377227783, "logps/chosen": -295.5562438964844, "logps/rejected": -303.20623779296875, "loss": 0.3623, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6642272472381592, "rewards/margins": 3.2563233375549316, "rewards/rejected": -4.918359279632568, "step": 5810 }, { "epoch": 1.4673998676456685, "grad_norm": 50.31169509887695, "learning_rate": 3.0270469833823246e-07, "logits/chosen": -1.1232178211212158, "logits/rejected": -1.1271483898162842, "logps/chosen": -278.10626220703125, "logps/rejected": -280.1625061035156, "loss": 0.1864, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.6465972661972046, "rewards/margins": 3.705029249191284, "rewards/rejected": -5.348828315734863, "step": 5820 }, { "epoch": 1.4699209025304887, "grad_norm": 35.56777572631836, "learning_rate": 3.019874885385211e-07, "logits/chosen": -1.0975830554962158, "logits/rejected": -1.1710418462753296, "logps/chosen": -301.0249938964844, "logps/rejected": -318.79998779296875, "loss": 0.19, "rewards/accuracies": 0.921875, "rewards/chosen": -1.66668701171875, "rewards/margins": 3.7491211891174316, "rewards/rejected": -5.416796684265137, "step": 5830 }, { "epoch": 1.472441937415309, "grad_norm": 41.645145416259766, "learning_rate": 3.012698312506523e-07, "logits/chosen": -1.104577660560608, "logits/rejected": -1.1301758289337158, "logps/chosen": -320.40625, "logps/rejected": -345.70623779296875, "loss": 0.174, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4617538452148438, "rewards/margins": 4.060351371765137, "rewards/rejected": -5.521679878234863, "step": 5840 }, { "epoch": 1.4749629723001292, "grad_norm": 76.22579193115234, "learning_rate": 3.0055173265194184e-07, "logits/chosen": -1.119897484779358, "logits/rejected": -1.038330078125, "logps/chosen": -273.0562438964844, "logps/rejected": -283.375, "loss": 0.2618, "rewards/accuracies": 0.875, "rewards/chosen": -1.8042480945587158, "rewards/margins": 3.3912596702575684, "rewards/rejected": -5.197949409484863, "step": 5850 }, { "epoch": 1.4774840071849495, "grad_norm": 41.23089599609375, "learning_rate": 2.998331989235042e-07, "logits/chosen": -0.995147705078125, "logits/rejected": -1.065283179283142, "logps/chosen": -296.29376220703125, "logps/rejected": -320.5562438964844, "loss": 0.2265, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.659918189048767, "rewards/margins": 3.641796827316284, "rewards/rejected": -5.302148342132568, "step": 5860 }, { "epoch": 1.4800050420697697, "grad_norm": 20.371374130249023, "learning_rate": 2.991142362501994e-07, "logits/chosen": -1.1528136730194092, "logits/rejected": -1.080206274986267, "logps/chosen": -327.1625061035156, "logps/rejected": -334.3374938964844, "loss": 0.1988, "rewards/accuracies": 0.90625, "rewards/chosen": -1.268011450767517, "rewards/margins": 4.265576362609863, "rewards/rejected": -5.5341796875, "step": 5870 }, { "epoch": 1.4825260769545898, "grad_norm": 69.50802612304688, "learning_rate": 2.9839485082057945e-07, "logits/chosen": -1.068603515625, "logits/rejected": -1.0320342779159546, "logps/chosen": -291.1499938964844, "logps/rejected": -318.2875061035156, "loss": 0.2308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.6911499500274658, "rewards/margins": 3.997119188308716, "rewards/rejected": -5.686230659484863, "step": 5880 }, { "epoch": 1.48504711183941, "grad_norm": 20.13905906677246, "learning_rate": 2.976750488268355e-07, "logits/chosen": -1.068457007408142, "logits/rejected": -1.037744164466858, "logps/chosen": -308.76251220703125, "logps/rejected": -339.63751220703125, "loss": 0.2132, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.357421875, "rewards/margins": 4.101513862609863, "rewards/rejected": -5.459668159484863, "step": 5890 }, { "epoch": 1.4875681467242303, "grad_norm": 25.19346809387207, "learning_rate": 2.96954836464744e-07, "logits/chosen": -1.145776391029358, "logits/rejected": -1.1566894054412842, "logps/chosen": -330.23126220703125, "logps/rejected": -327.5625, "loss": 0.1967, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.196740746498108, "rewards/margins": 4.27294921875, "rewards/rejected": -5.466894626617432, "step": 5900 }, { "epoch": 1.4900891816090505, "grad_norm": 48.2508659362793, "learning_rate": 2.9623421993361407e-07, "logits/chosen": -1.126550316810608, "logits/rejected": -1.1150023937225342, "logps/chosen": -312.5375061035156, "logps/rejected": -328.57501220703125, "loss": 0.2607, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.260687232017517, "rewards/margins": 3.830078125, "rewards/rejected": -5.091796875, "step": 5910 }, { "epoch": 1.4926102164938708, "grad_norm": 33.797828674316406, "learning_rate": 2.955132054362335e-07, "logits/chosen": -1.12750244140625, "logits/rejected": -1.135461449623108, "logps/chosen": -295.4468688964844, "logps/rejected": -316.7749938964844, "loss": 0.2019, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.3718032836914062, "rewards/margins": 3.5299315452575684, "rewards/rejected": -4.899218559265137, "step": 5920 }, { "epoch": 1.495131251378691, "grad_norm": 36.69331359863281, "learning_rate": 2.9479179917881593e-07, "logits/chosen": -1.1229248046875, "logits/rejected": -1.0802001953125, "logps/chosen": -308.9312438964844, "logps/rejected": -335.57501220703125, "loss": 0.1621, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.5434691905975342, "rewards/margins": 4.004296779632568, "rewards/rejected": -5.548437595367432, "step": 5930 }, { "epoch": 1.497652286263511, "grad_norm": 61.35308837890625, "learning_rate": 2.9407000737094655e-07, "logits/chosen": -1.1416747570037842, "logits/rejected": -1.08526611328125, "logps/chosen": -297.3999938964844, "logps/rejected": -305.2875061035156, "loss": 0.2349, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.389404296875, "rewards/margins": 3.870654344558716, "rewards/rejected": -5.259179592132568, "step": 5940 }, { "epoch": 1.5001733211483312, "grad_norm": 57.0819091796875, "learning_rate": 2.9334783622552983e-07, "logits/chosen": -1.0996582508087158, "logits/rejected": -1.113623023033142, "logps/chosen": -296.64373779296875, "logps/rejected": -327.1875, "loss": 0.1803, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.975231945514679, "rewards/margins": 4.025097846984863, "rewards/rejected": -5.000586032867432, "step": 5950 }, { "epoch": 1.5026943560331516, "grad_norm": 35.29164505004883, "learning_rate": 2.9262529195873506e-07, "logits/chosen": -1.155432105064392, "logits/rejected": -1.187261939048767, "logps/chosen": -313.4125061035156, "logps/rejected": -310.8062438964844, "loss": 0.2001, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.1038116216659546, "rewards/margins": 3.7796874046325684, "rewards/rejected": -4.883203029632568, "step": 5960 }, { "epoch": 1.505215390917972, "grad_norm": 20.90247917175293, "learning_rate": 2.9190238078994326e-07, "logits/chosen": -1.1132323741912842, "logits/rejected": -1.115136742591858, "logps/chosen": -313.07501220703125, "logps/rejected": -297.5687561035156, "loss": 0.1939, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4117408990859985, "rewards/margins": 3.9153809547424316, "rewards/rejected": -5.327929496765137, "step": 5970 }, { "epoch": 1.507736425802792, "grad_norm": 41.171974182128906, "learning_rate": 2.911791089416938e-07, "logits/chosen": -1.0940430164337158, "logits/rejected": -1.1343994140625, "logps/chosen": -294.66876220703125, "logps/rejected": -303.4750061035156, "loss": 0.2523, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.30426025390625, "rewards/margins": 4.020263671875, "rewards/rejected": -5.324926853179932, "step": 5980 }, { "epoch": 1.5102574606876122, "grad_norm": 36.254337310791016, "learning_rate": 2.904554826396304e-07, "logits/chosen": -1.144506812095642, "logits/rejected": -1.1100342273712158, "logps/chosen": -290.5, "logps/rejected": -324.2749938964844, "loss": 0.2526, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.72589111328125, "rewards/margins": 3.7916259765625, "rewards/rejected": -5.515234470367432, "step": 5990 }, { "epoch": 1.5127784955724324, "grad_norm": 38.0024299621582, "learning_rate": 2.89731508112448e-07, "logits/chosen": -1.145452857017517, "logits/rejected": -1.141699194908142, "logps/chosen": -310.36248779296875, "logps/rejected": -310.3999938964844, "loss": 0.2551, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.7955322265625, "rewards/margins": 3.600830078125, "rewards/rejected": -5.396777153015137, "step": 6000 }, { "epoch": 1.5152995304572527, "grad_norm": 13.969449996948242, "learning_rate": 2.890071915918387e-07, "logits/chosen": -1.14007568359375, "logits/rejected": -1.06732177734375, "logps/chosen": -299.98126220703125, "logps/rejected": -323.7562561035156, "loss": 0.1965, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.576837182044983, "rewards/margins": 3.966747999191284, "rewards/rejected": -5.544043064117432, "step": 6010 }, { "epoch": 1.5178205653420729, "grad_norm": 23.30685806274414, "learning_rate": 2.8828253931243846e-07, "logits/chosen": -1.136743187904358, "logits/rejected": -1.092736840248108, "logps/chosen": -297.5625, "logps/rejected": -295.63751220703125, "loss": 0.2065, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.3041808605194092, "rewards/margins": 3.9515624046325684, "rewards/rejected": -5.256249904632568, "step": 6020 }, { "epoch": 1.5203416002268932, "grad_norm": 55.4643669128418, "learning_rate": 2.8755755751177333e-07, "logits/chosen": -1.1088378429412842, "logits/rejected": -1.1104736328125, "logps/chosen": -298.98126220703125, "logps/rejected": -310.26251220703125, "loss": 0.2441, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.9515517950057983, "rewards/margins": 3.7629151344299316, "rewards/rejected": -4.714453220367432, "step": 6030 }, { "epoch": 1.5228626351117134, "grad_norm": 38.759033203125, "learning_rate": 2.8683225243020576e-07, "logits/chosen": -1.1571044921875, "logits/rejected": -1.1136596202850342, "logps/chosen": -310.8812561035156, "logps/rejected": -321.5, "loss": 0.1841, "rewards/accuracies": 0.921875, "rewards/chosen": -0.6566246151924133, "rewards/margins": 4.045752048492432, "rewards/rejected": -4.702441215515137, "step": 6040 }, { "epoch": 1.5253836699965335, "grad_norm": 66.81090545654297, "learning_rate": 2.861066303108808e-07, "logits/chosen": -1.144775390625, "logits/rejected": -1.14501953125, "logps/chosen": -303.71875, "logps/rejected": -322.5562438964844, "loss": 0.2178, "rewards/accuracies": 0.90625, "rewards/chosen": -1.238885521888733, "rewards/margins": 3.8529052734375, "rewards/rejected": -5.09521484375, "step": 6050 }, { "epoch": 1.5279047048813537, "grad_norm": 58.58757400512695, "learning_rate": 2.8538069739967257e-07, "logits/chosen": -1.160614013671875, "logits/rejected": -1.0935790538787842, "logps/chosen": -301.48126220703125, "logps/rejected": -307.95001220703125, "loss": 0.2144, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5325835943222046, "rewards/margins": 3.763232469558716, "rewards/rejected": -5.295312404632568, "step": 6060 }, { "epoch": 1.530425739766174, "grad_norm": 22.95831298828125, "learning_rate": 2.8465445994513024e-07, "logits/chosen": -1.1400635242462158, "logits/rejected": -1.107177734375, "logps/chosen": -298.421875, "logps/rejected": -296.40625, "loss": 0.1795, "rewards/accuracies": 0.9375, "rewards/chosen": -1.3224213123321533, "rewards/margins": 3.811230421066284, "rewards/rejected": -5.132910251617432, "step": 6070 }, { "epoch": 1.5329467746509944, "grad_norm": 44.37894058227539, "learning_rate": 2.8392792419842447e-07, "logits/chosen": -1.174072265625, "logits/rejected": -1.1715087890625, "logps/chosen": -294.20001220703125, "logps/rejected": -311.1625061035156, "loss": 0.1708, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1699402332305908, "rewards/margins": 3.9228148460388184, "rewards/rejected": -5.094140529632568, "step": 6080 }, { "epoch": 1.5354678095358145, "grad_norm": 26.776601791381836, "learning_rate": 2.832010964132934e-07, "logits/chosen": -1.0991790294647217, "logits/rejected": -1.1131622791290283, "logps/chosen": -303.20001220703125, "logps/rejected": -311.32501220703125, "loss": 0.1787, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.362890601158142, "rewards/margins": 3.931884765625, "rewards/rejected": -5.296484470367432, "step": 6090 }, { "epoch": 1.5379888444206347, "grad_norm": 34.41385269165039, "learning_rate": 2.82473982845989e-07, "logits/chosen": -1.1327393054962158, "logits/rejected": -1.085089087486267, "logps/chosen": -297.2749938964844, "logps/rejected": -300.29376220703125, "loss": 0.2211, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2868438959121704, "rewards/margins": 3.897876024246216, "rewards/rejected": -5.1845703125, "step": 6100 }, { "epoch": 1.5405098793054548, "grad_norm": 33.31650161743164, "learning_rate": 2.8174658975522305e-07, "logits/chosen": -1.105615258216858, "logits/rejected": -1.089147925376892, "logps/chosen": -287.28125, "logps/rejected": -295.875, "loss": 0.2103, "rewards/accuracies": 0.90625, "rewards/chosen": -1.403588891029358, "rewards/margins": 3.818066358566284, "rewards/rejected": -5.220507621765137, "step": 6110 }, { "epoch": 1.5430309141902752, "grad_norm": 43.380516052246094, "learning_rate": 2.810189234021135e-07, "logits/chosen": -1.16046142578125, "logits/rejected": -1.108343482017517, "logps/chosen": -301.6656188964844, "logps/rejected": -322.0249938964844, "loss": 0.1868, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.209442138671875, "rewards/margins": 3.7587890625, "rewards/rejected": -4.966210842132568, "step": 6120 }, { "epoch": 1.5455519490750953, "grad_norm": 40.877193450927734, "learning_rate": 2.802909900501304e-07, "logits/chosen": -1.1108887195587158, "logits/rejected": -1.1327393054962158, "logps/chosen": -276.1625061035156, "logps/rejected": -307.6312561035156, "loss": 0.2464, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.244757056236267, "rewards/margins": 3.612719774246216, "rewards/rejected": -4.85546875, "step": 6130 }, { "epoch": 1.5480729839599157, "grad_norm": 55.626399993896484, "learning_rate": 2.7956279596504197e-07, "logits/chosen": -1.153906226158142, "logits/rejected": -1.125640869140625, "logps/chosen": -300.61248779296875, "logps/rejected": -322.625, "loss": 0.2489, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.4402649402618408, "rewards/margins": 3.6713624000549316, "rewards/rejected": -5.11181640625, "step": 6140 }, { "epoch": 1.5505940188447358, "grad_norm": 38.66291809082031, "learning_rate": 2.7883434741486065e-07, "logits/chosen": -1.0975463390350342, "logits/rejected": -1.108941674232483, "logps/chosen": -300.625, "logps/rejected": -321.0249938964844, "loss": 0.2227, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5415527820587158, "rewards/margins": 3.765380859375, "rewards/rejected": -5.306445121765137, "step": 6150 }, { "epoch": 1.553115053729556, "grad_norm": 52.171146392822266, "learning_rate": 2.7810565066978944e-07, "logits/chosen": -1.167871117591858, "logits/rejected": -1.1710693836212158, "logps/chosen": -313.03125, "logps/rejected": -323.46875, "loss": 0.2665, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5628662109375, "rewards/margins": 3.578356981277466, "rewards/rejected": -5.140820503234863, "step": 6160 }, { "epoch": 1.555636088614376, "grad_norm": 52.03303909301758, "learning_rate": 2.7737671200216745e-07, "logits/chosen": -1.181249976158142, "logits/rejected": -1.1878662109375, "logps/chosen": -303.5874938964844, "logps/rejected": -306.3500061035156, "loss": 0.2064, "rewards/accuracies": 0.90625, "rewards/chosen": -1.112512230873108, "rewards/margins": 3.9339842796325684, "rewards/rejected": -5.047949314117432, "step": 6170 }, { "epoch": 1.5581571234991964, "grad_norm": 50.642539978027344, "learning_rate": 2.766475376864163e-07, "logits/chosen": -1.167016625404358, "logits/rejected": -1.126977562904358, "logps/chosen": -303.61248779296875, "logps/rejected": -309.4750061035156, "loss": 0.3047, "rewards/accuracies": 0.890625, "rewards/chosen": -1.3833954334259033, "rewards/margins": 3.9080567359924316, "rewards/rejected": -5.290625095367432, "step": 6180 }, { "epoch": 1.5606781583840168, "grad_norm": 22.918367385864258, "learning_rate": 2.75918133998986e-07, "logits/chosen": -1.1688721179962158, "logits/rejected": -1.167626976966858, "logps/chosen": -313.53125, "logps/rejected": -349.125, "loss": 0.1866, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.345666527748108, "rewards/margins": 3.9029297828674316, "rewards/rejected": -5.250195503234863, "step": 6190 }, { "epoch": 1.563199193268837, "grad_norm": 44.8996467590332, "learning_rate": 2.751885072183009e-07, "logits/chosen": -1.181610107421875, "logits/rejected": -1.16558837890625, "logps/chosen": -295.9375, "logps/rejected": -287.42498779296875, "loss": 0.2211, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.675573706626892, "rewards/margins": 3.5116209983825684, "rewards/rejected": -5.189843654632568, "step": 6200 }, { "epoch": 1.565720228153657, "grad_norm": 45.8497314453125, "learning_rate": 2.744586636247056e-07, "logits/chosen": -1.1245849132537842, "logits/rejected": -1.102929711341858, "logps/chosen": -311.15625, "logps/rejected": -335.0, "loss": 0.2237, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.4038269519805908, "rewards/margins": 3.883105516433716, "rewards/rejected": -5.289648532867432, "step": 6210 }, { "epoch": 1.5682412630384772, "grad_norm": 55.734989166259766, "learning_rate": 2.7372860950041085e-07, "logits/chosen": -1.1724975109100342, "logits/rejected": -1.0930907726287842, "logps/chosen": -308.79376220703125, "logps/rejected": -320.2749938964844, "loss": 0.2928, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.165258765220642, "rewards/margins": 3.848950147628784, "rewards/rejected": -5.014843940734863, "step": 6220 }, { "epoch": 1.5707622979232974, "grad_norm": 27.495655059814453, "learning_rate": 2.7299835112943984e-07, "logits/chosen": -1.1748778820037842, "logits/rejected": -1.119470238685608, "logps/chosen": -295.79376220703125, "logps/rejected": -305.76251220703125, "loss": 0.2145, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.0674560070037842, "rewards/margins": 3.8984131813049316, "rewards/rejected": -4.965918064117432, "step": 6230 }, { "epoch": 1.5732833328081177, "grad_norm": 49.23637008666992, "learning_rate": 2.7226789479757355e-07, "logits/chosen": -1.130926489830017, "logits/rejected": -1.1825683116912842, "logps/chosen": -286.89373779296875, "logps/rejected": -333.01873779296875, "loss": 0.1955, "rewards/accuracies": 0.9375, "rewards/chosen": -0.8009887933731079, "rewards/margins": 3.798046827316284, "rewards/rejected": -4.597460746765137, "step": 6240 }, { "epoch": 1.575804367692938, "grad_norm": 54.19160079956055, "learning_rate": 2.7153724679229707e-07, "logits/chosen": -1.167236328125, "logits/rejected": -1.1602294445037842, "logps/chosen": -282.60626220703125, "logps/rejected": -316.25, "loss": 0.2016, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.9388580322265625, "rewards/margins": 3.560107469558716, "rewards/rejected": -4.497851371765137, "step": 6250 }, { "epoch": 1.5783254025777582, "grad_norm": 58.92972946166992, "learning_rate": 2.7080641340274536e-07, "logits/chosen": -1.1320679187774658, "logits/rejected": -1.1422545909881592, "logps/chosen": -276.9437561035156, "logps/rejected": -276.73126220703125, "loss": 0.2117, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.9053314328193665, "rewards/margins": 3.3994140625, "rewards/rejected": -4.304980278015137, "step": 6260 }, { "epoch": 1.5808464374625784, "grad_norm": 70.27766418457031, "learning_rate": 2.70075400919649e-07, "logits/chosen": -1.146142601966858, "logits/rejected": -1.141119360923767, "logps/chosen": -308.76873779296875, "logps/rejected": -334.01251220703125, "loss": 0.2086, "rewards/accuracies": 0.921875, "rewards/chosen": -1.1092650890350342, "rewards/margins": 3.701171875, "rewards/rejected": -4.8095703125, "step": 6270 }, { "epoch": 1.5833674723473985, "grad_norm": 53.645111083984375, "learning_rate": 2.6934421563528037e-07, "logits/chosen": -1.161352515220642, "logits/rejected": -1.1814453601837158, "logps/chosen": -300.6937561035156, "logps/rejected": -303.8999938964844, "loss": 0.2134, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.4230773448944092, "rewards/margins": 3.53857421875, "rewards/rejected": -4.9619140625, "step": 6280 }, { "epoch": 1.5858885072322189, "grad_norm": 61.04668045043945, "learning_rate": 2.6861286384339884e-07, "logits/chosen": -1.156274437904358, "logits/rejected": -1.114892601966858, "logps/chosen": -275.88751220703125, "logps/rejected": -305.64373779296875, "loss": 0.1491, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -1.345971703529358, "rewards/margins": 4.110595703125, "rewards/rejected": -5.456640720367432, "step": 6290 }, { "epoch": 1.588409542117039, "grad_norm": 41.34684753417969, "learning_rate": 2.6788135183919743e-07, "logits/chosen": NaN, "logits/rejected": -1.123925805091858, "logps/chosen": -324.57501220703125, "logps/rejected": -337.3500061035156, "loss": 0.2148, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.9361572265625, "rewards/margins": 4.215234279632568, "rewards/rejected": -6.151757717132568, "step": 6300 }, { "epoch": 1.5909305770018594, "grad_norm": 29.185256958007812, "learning_rate": 2.671496859192479e-07, "logits/chosen": -1.129492163658142, "logits/rejected": -1.0899779796600342, "logps/chosen": -317.42498779296875, "logps/rejected": -328.57501220703125, "loss": 0.2089, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9986541271209717, "rewards/margins": 4.295092582702637, "rewards/rejected": -6.292187690734863, "step": 6310 }, { "epoch": 1.5934516118866795, "grad_norm": 63.9970588684082, "learning_rate": 2.6641787238144703e-07, "logits/chosen": -1.07855224609375, "logits/rejected": -1.054968237876892, "logps/chosen": -323.3062438964844, "logps/rejected": -328.46875, "loss": 0.1911, "rewards/accuracies": 0.9375, "rewards/chosen": -2.1715362071990967, "rewards/margins": 4.328906059265137, "rewards/rejected": -6.4990234375, "step": 6320 }, { "epoch": 1.5959726467714996, "grad_norm": 39.56163787841797, "learning_rate": 2.656859175249622e-07, "logits/chosen": -1.181298851966858, "logits/rejected": -1.049646019935608, "logps/chosen": -304.78125, "logps/rejected": -321.0687561035156, "loss": 0.1864, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.1056151390075684, "rewards/margins": 4.275586128234863, "rewards/rejected": -6.382616996765137, "step": 6330 }, { "epoch": 1.5984936816563198, "grad_norm": 52.43605041503906, "learning_rate": 2.6495382765017726e-07, "logits/chosen": -1.0822356939315796, "logits/rejected": -1.0875732898712158, "logps/chosen": -318.8125, "logps/rejected": -325.84375, "loss": 0.2078, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -2.670114040374756, "rewards/margins": 4.224707126617432, "rewards/rejected": -6.895117282867432, "step": 6340 }, { "epoch": 1.6010147165411401, "grad_norm": 44.35512161254883, "learning_rate": 2.6422160905863816e-07, "logits/chosen": NaN, "logits/rejected": -1.0967528820037842, "logps/chosen": -307.70001220703125, "logps/rejected": -338.08123779296875, "loss": 0.2578, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -2.820687770843506, "rewards/margins": 3.874316453933716, "rewards/rejected": -6.6953125, "step": 6350 }, { "epoch": 1.6035357514259605, "grad_norm": 38.240203857421875, "learning_rate": 2.634892680529988e-07, "logits/chosen": -1.10693359375, "logits/rejected": -1.1155273914337158, "logps/chosen": -300.4375, "logps/rejected": -321.70001220703125, "loss": 0.1526, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -2.11431884765625, "rewards/margins": 4.200097560882568, "rewards/rejected": -6.314257621765137, "step": 6360 }, { "epoch": 1.6060567863107806, "grad_norm": 29.181943893432617, "learning_rate": 2.627568109369668e-07, "logits/chosen": -1.0560302734375, "logits/rejected": -1.066827416419983, "logps/chosen": -326.5687561035156, "logps/rejected": -333.5249938964844, "loss": 0.2289, "rewards/accuracies": 0.90625, "rewards/chosen": -2.367840528488159, "rewards/margins": 4.1337890625, "rewards/rejected": -6.503125190734863, "step": 6370 }, { "epoch": 1.6085778211956008, "grad_norm": 30.766389846801758, "learning_rate": 2.6202424401524914e-07, "logits/chosen": -1.096716284751892, "logits/rejected": -1.065435767173767, "logps/chosen": -290.5562438964844, "logps/rejected": -314.20001220703125, "loss": 0.2036, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.8820984363555908, "rewards/margins": 4.274706840515137, "rewards/rejected": -6.159375190734863, "step": 6380 }, { "epoch": 1.611098856080421, "grad_norm": 23.376792907714844, "learning_rate": 2.6129157359349806e-07, "logits/chosen": -1.146142601966858, "logits/rejected": -1.0878417491912842, "logps/chosen": -319.375, "logps/rejected": -319.42498779296875, "loss": 0.1467, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.89276123046875, "rewards/margins": 4.2249755859375, "rewards/rejected": -6.118359565734863, "step": 6390 }, { "epoch": 1.6136198909652413, "grad_norm": 26.900299072265625, "learning_rate": 2.605588059782567e-07, "logits/chosen": -1.1492187976837158, "logits/rejected": -1.09906005859375, "logps/chosen": -313.6187438964844, "logps/rejected": -328.20001220703125, "loss": 0.1892, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4521667957305908, "rewards/margins": 4.266747951507568, "rewards/rejected": -5.717675685882568, "step": 6400 }, { "epoch": 1.6161409258500614, "grad_norm": 79.60032653808594, "learning_rate": 2.5982594747690483e-07, "logits/chosen": -1.165283203125, "logits/rejected": -1.1063048839569092, "logps/chosen": -308.58123779296875, "logps/rejected": -323.95001220703125, "loss": 0.2384, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.6304610967636108, "rewards/margins": 3.887451171875, "rewards/rejected": -5.51953125, "step": 6410 }, { "epoch": 1.6186619607348818, "grad_norm": 55.93655014038086, "learning_rate": 2.590930043976044e-07, "logits/chosen": -1.186163306236267, "logits/rejected": -1.1176269054412842, "logps/chosen": -319.48748779296875, "logps/rejected": -334.7875061035156, "loss": 0.3004, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.7587006092071533, "rewards/margins": 4.107324123382568, "rewards/rejected": -5.8671875, "step": 6420 }, { "epoch": 1.621182995619702, "grad_norm": 57.82849884033203, "learning_rate": 2.583599830492453e-07, "logits/chosen": -1.1078612804412842, "logits/rejected": -1.145532250404358, "logps/chosen": -303.41876220703125, "logps/rejected": -319.4437561035156, "loss": 0.2404, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.714318871498108, "rewards/margins": 3.898632764816284, "rewards/rejected": -5.613085746765137, "step": 6430 }, { "epoch": 1.623704030504522, "grad_norm": 40.817840576171875, "learning_rate": 2.576268897413916e-07, "logits/chosen": -1.159692406654358, "logits/rejected": -1.152978539466858, "logps/chosen": -281.42498779296875, "logps/rejected": -312.35626220703125, "loss": 0.2292, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.397894263267517, "rewards/margins": 3.7555909156799316, "rewards/rejected": -5.153027534484863, "step": 6440 }, { "epoch": 1.6262250653893422, "grad_norm": 53.961727142333984, "learning_rate": 2.5689373078422603e-07, "logits/chosen": -1.1435668468475342, "logits/rejected": NaN, "logps/chosen": -331.71875, "logps/rejected": -324.4125061035156, "loss": 0.228, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.54876708984375, "rewards/margins": 4.006982326507568, "rewards/rejected": -5.555468559265137, "step": 6450 }, { "epoch": 1.6287461002741626, "grad_norm": 27.66611671447754, "learning_rate": 2.5616051248849707e-07, "logits/chosen": -1.141503930091858, "logits/rejected": -1.1709167957305908, "logps/chosen": -302.95001220703125, "logps/rejected": -310.3125, "loss": 0.2209, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5449860095977783, "rewards/margins": 4.110595703125, "rewards/rejected": -5.657422065734863, "step": 6460 }, { "epoch": 1.631267135158983, "grad_norm": 36.329322814941406, "learning_rate": 2.5542724116546365e-07, "logits/chosen": -1.1394531726837158, "logits/rejected": -1.1514403820037842, "logps/chosen": -315.26873779296875, "logps/rejected": -322.5625, "loss": 0.2073, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.494232177734375, "rewards/margins": 3.925488233566284, "rewards/rejected": -5.420117378234863, "step": 6470 }, { "epoch": 1.633788170043803, "grad_norm": 34.16090393066406, "learning_rate": 2.5469392312684123e-07, "logits/chosen": NaN, "logits/rejected": -1.1286499500274658, "logps/chosen": -304.8125, "logps/rejected": -320.42498779296875, "loss": 0.1821, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.2781188488006592, "rewards/margins": 4.002392768859863, "rewards/rejected": -5.278027534484863, "step": 6480 }, { "epoch": 1.6363092049286232, "grad_norm": 43.980491638183594, "learning_rate": 2.539605646847473e-07, "logits/chosen": -1.1635253429412842, "logits/rejected": -1.1563720703125, "logps/chosen": -340.35626220703125, "logps/rejected": -325.48748779296875, "loss": 0.2075, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.445581078529358, "rewards/margins": 4.289258003234863, "rewards/rejected": -5.734179496765137, "step": 6490 }, { "epoch": 1.6388302398134433, "grad_norm": 44.270267486572266, "learning_rate": 2.532271721516472e-07, "logits/chosen": -1.1770508289337158, "logits/rejected": -1.088861107826233, "logps/chosen": -339.67498779296875, "logps/rejected": -346.9125061035156, "loss": 0.2181, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.1042418479919434, "rewards/margins": 3.935253858566284, "rewards/rejected": -6.040429592132568, "step": 6500 }, { "epoch": 1.6413512746982635, "grad_norm": 51.83606719970703, "learning_rate": 2.524937518402997e-07, "logits/chosen": -1.099084496498108, "logits/rejected": -1.062255859375, "logps/chosen": -324.21875, "logps/rejected": -335.28125, "loss": 0.2701, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -2.1596922874450684, "rewards/margins": 3.80712890625, "rewards/rejected": -5.967382907867432, "step": 6510 }, { "epoch": 1.6438723095830838, "grad_norm": 35.56692886352539, "learning_rate": 2.5176031006370253e-07, "logits/chosen": -1.087866187095642, "logits/rejected": -1.108789086341858, "logps/chosen": -305.35626220703125, "logps/rejected": -327.5, "loss": 0.2258, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -2.174731492996216, "rewards/margins": 3.7359862327575684, "rewards/rejected": -5.912499904632568, "step": 6520 }, { "epoch": 1.6463933444679042, "grad_norm": 45.89760971069336, "learning_rate": 2.510268531350384e-07, "logits/chosen": -1.107061743736267, "logits/rejected": -1.056298851966858, "logps/chosen": -297.4624938964844, "logps/rejected": -302.9624938964844, "loss": 0.1988, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.023632764816284, "rewards/margins": 3.876147508621216, "rewards/rejected": -5.900586128234863, "step": 6530 }, { "epoch": 1.6489143793527243, "grad_norm": 43.021244049072266, "learning_rate": 2.502933873676204e-07, "logits/chosen": -1.1321594715118408, "logits/rejected": -1.0499999523162842, "logps/chosen": -293.5375061035156, "logps/rejected": -317.54998779296875, "loss": 0.2084, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.8830718994140625, "rewards/margins": 4.081250190734863, "rewards/rejected": -5.96484375, "step": 6540 }, { "epoch": 1.6514354142375445, "grad_norm": 42.96616744995117, "learning_rate": 2.4955991907483763e-07, "logits/chosen": -1.1565430164337158, "logits/rejected": -1.0712158679962158, "logps/chosen": -320.3374938964844, "logps/rejected": -312.1499938964844, "loss": 0.2099, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.6129639148712158, "rewards/margins": 4.214648246765137, "rewards/rejected": -5.828320503234863, "step": 6550 }, { "epoch": 1.6539564491223646, "grad_norm": 43.674652099609375, "learning_rate": 2.4882645457010096e-07, "logits/chosen": -1.1515624523162842, "logits/rejected": -1.103051781654358, "logps/chosen": -310.86248779296875, "logps/rejected": -305.64373779296875, "loss": 0.253, "rewards/accuracies": 0.90625, "rewards/chosen": -1.888769507408142, "rewards/margins": 3.878222703933716, "rewards/rejected": -5.765038967132568, "step": 6560 }, { "epoch": 1.656477484007185, "grad_norm": 29.459733963012695, "learning_rate": 2.480930001667887e-07, "logits/chosen": -1.109655737876892, "logits/rejected": -0.9969848394393921, "logps/chosen": -314.07501220703125, "logps/rejected": -313.45001220703125, "loss": 0.1971, "rewards/accuracies": 0.921875, "rewards/chosen": -2.0628294944763184, "rewards/margins": 4.00244140625, "rewards/rejected": -6.064843654632568, "step": 6570 }, { "epoch": 1.6589985188920051, "grad_norm": 41.641334533691406, "learning_rate": 2.473595621781919e-07, "logits/chosen": -1.125646948814392, "logits/rejected": -1.1973145008087158, "logps/chosen": -303.6937561035156, "logps/rejected": -334.1000061035156, "loss": 0.2295, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -2.050854444503784, "rewards/margins": 3.986132860183716, "rewards/rejected": -6.037695407867432, "step": 6580 }, { "epoch": 1.6615195537768255, "grad_norm": 72.03292846679688, "learning_rate": 2.4662614691746096e-07, "logits/chosen": -1.15771484375, "logits/rejected": -1.0685577392578125, "logps/chosen": -292.7250061035156, "logps/rejected": -310.7250061035156, "loss": 0.2149, "rewards/accuracies": 0.921875, "rewards/chosen": -1.60009765625, "rewards/margins": 3.970703125, "rewards/rejected": -5.571093559265137, "step": 6590 }, { "epoch": 1.6640405886616456, "grad_norm": 63.81726837158203, "learning_rate": 2.4589276069754994e-07, "logits/chosen": -1.092504858970642, "logits/rejected": -1.0665283203125, "logps/chosen": -289.0375061035156, "logps/rejected": -316.7875061035156, "loss": 0.2592, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.723236083984375, "rewards/margins": 3.6299805641174316, "rewards/rejected": -5.355078220367432, "step": 6600 }, { "epoch": 1.6665616235464658, "grad_norm": 34.75861358642578, "learning_rate": 2.451594098311635e-07, "logits/chosen": -1.0768005847930908, "logits/rejected": -1.108007788658142, "logps/chosen": -288.41876220703125, "logps/rejected": -275.2875061035156, "loss": 0.2183, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.7859923839569092, "rewards/margins": 3.5325927734375, "rewards/rejected": -5.3203125, "step": 6610 }, { "epoch": 1.669082658431286, "grad_norm": 38.13240432739258, "learning_rate": 2.4442610063070143e-07, "logits/chosen": -1.0810546875, "logits/rejected": -1.0842163562774658, "logps/chosen": -273.73126220703125, "logps/rejected": -293.4750061035156, "loss": 0.2145, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.8680388927459717, "rewards/margins": 3.718017578125, "rewards/rejected": -5.587500095367432, "step": 6620 }, { "epoch": 1.6716036933161063, "grad_norm": 42.62815856933594, "learning_rate": 2.4369283940820557e-07, "logits/chosen": -1.1541016101837158, "logits/rejected": -1.151544213294983, "logps/chosen": -314.98126220703125, "logps/rejected": -334.1000061035156, "loss": 0.2425, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.681176781654358, "rewards/margins": 4.090722560882568, "rewards/rejected": -5.773828029632568, "step": 6630 }, { "epoch": 1.6741247282009266, "grad_norm": 41.75690841674805, "learning_rate": 2.429596324753042e-07, "logits/chosen": -1.145074486732483, "logits/rejected": -1.09808349609375, "logps/chosen": -322.0062561035156, "logps/rejected": -330.26251220703125, "loss": 0.1859, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.9456055164337158, "rewards/margins": 4.161035060882568, "rewards/rejected": -6.108007907867432, "step": 6640 }, { "epoch": 1.6766457630857468, "grad_norm": 22.159154891967773, "learning_rate": 2.422264861431584e-07, "logits/chosen": -1.1547729969024658, "logits/rejected": -1.1577637195587158, "logps/chosen": -277.33123779296875, "logps/rejected": -302.78125, "loss": 0.1759, "rewards/accuracies": 0.921875, "rewards/chosen": -1.4130675792694092, "rewards/margins": 4.004248142242432, "rewards/rejected": -5.413964748382568, "step": 6650 }, { "epoch": 1.679166797970567, "grad_norm": 53.878028869628906, "learning_rate": 2.41493406722408e-07, "logits/chosen": NaN, "logits/rejected": -1.054284691810608, "logps/chosen": -264.5562438964844, "logps/rejected": -327.29998779296875, "loss": 0.2667, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.1799819469451904, "rewards/margins": 3.6225829124450684, "rewards/rejected": -4.803124904632568, "step": 6660 }, { "epoch": 1.681687832855387, "grad_norm": 15.097171783447266, "learning_rate": 2.407604005231163e-07, "logits/chosen": -1.246667504310608, "logits/rejected": -1.18994140625, "logps/chosen": -295.9312438964844, "logps/rejected": -312.6937561035156, "loss": 0.2059, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -0.8031219244003296, "rewards/margins": 3.8587403297424316, "rewards/rejected": -4.663183689117432, "step": 6670 }, { "epoch": 1.6842088677402074, "grad_norm": 58.274845123291016, "learning_rate": 2.4002747385471686e-07, "logits/chosen": -1.180334448814392, "logits/rejected": -1.1510741710662842, "logps/chosen": -267.84375, "logps/rejected": -305.1499938964844, "loss": 0.246, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.9923461675643921, "rewards/margins": 3.7740235328674316, "rewards/rejected": -4.76513671875, "step": 6680 }, { "epoch": 1.6867299026250275, "grad_norm": 33.82486343383789, "learning_rate": 2.392946330259583e-07, "logits/chosen": -1.151635766029358, "logits/rejected": -1.0900757312774658, "logps/chosen": -299.1499938964844, "logps/rejected": -308.9750061035156, "loss": 0.1602, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.610064685344696, "rewards/margins": 4.367919921875, "rewards/rejected": -4.980175971984863, "step": 6690 }, { "epoch": 1.689250937509848, "grad_norm": 43.90882873535156, "learning_rate": 2.385618843448507e-07, "logits/chosen": -1.201269507408142, "logits/rejected": -1.0997803211212158, "logps/chosen": -284.8187561035156, "logps/rejected": -289.1499938964844, "loss": 0.2367, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.1014130115509033, "rewards/margins": 3.6915040016174316, "rewards/rejected": -4.791325569152832, "step": 6700 }, { "epoch": 1.691771972394668, "grad_norm": 44.88566589355469, "learning_rate": 2.378292341186107e-07, "logits/chosen": -1.1644287109375, "logits/rejected": -1.15838623046875, "logps/chosen": -302.82501220703125, "logps/rejected": -298.9937438964844, "loss": 0.2156, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.352014183998108, "rewards/margins": 3.757251024246216, "rewards/rejected": -5.109570503234863, "step": 6710 }, { "epoch": 1.6942930072794882, "grad_norm": 73.87053680419922, "learning_rate": 2.370966886536074e-07, "logits/chosen": -1.1813232898712158, "logits/rejected": -1.1998291015625, "logps/chosen": -316.10626220703125, "logps/rejected": -330.23748779296875, "loss": 0.2489, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.630670189857483, "rewards/margins": 4.134839057922363, "rewards/rejected": -5.764404296875, "step": 6720 }, { "epoch": 1.6968140421643083, "grad_norm": 42.54814147949219, "learning_rate": 2.3636425425530857e-07, "logits/chosen": -1.15771484375, "logits/rejected": -1.0667235851287842, "logps/chosen": -333.2749938964844, "logps/rejected": -339.6875, "loss": 0.2005, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.386584520339966, "rewards/margins": 3.863940477371216, "rewards/rejected": -6.252050876617432, "step": 6730 }, { "epoch": 1.6993350770491287, "grad_norm": 59.08740997314453, "learning_rate": 2.3563193722822555e-07, "logits/chosen": -1.107031226158142, "logits/rejected": -1.1286437511444092, "logps/chosen": -306.78125, "logps/rejected": -313.9624938964844, "loss": 0.252, "rewards/accuracies": 0.890625, "rewards/chosen": -2.3460936546325684, "rewards/margins": 3.794116258621216, "rewards/rejected": -6.141406059265137, "step": 6740 }, { "epoch": 1.701856111933949, "grad_norm": 55.42203140258789, "learning_rate": 2.3489974387585964e-07, "logits/chosen": -1.159082055091858, "logits/rejected": -1.0864746570587158, "logps/chosen": -309.65625, "logps/rejected": -326.42498779296875, "loss": 0.21, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.3440918922424316, "rewards/margins": 3.704394578933716, "rewards/rejected": -6.046679496765137, "step": 6750 }, { "epoch": 1.7043771468187692, "grad_norm": 34.926353454589844, "learning_rate": 2.3416768050064739e-07, "logits/chosen": -1.160681128501892, "logits/rejected": -1.121313452720642, "logps/chosen": -313.40625, "logps/rejected": -317.76873779296875, "loss": 0.1959, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.807916283607483, "rewards/margins": 4.132616996765137, "rewards/rejected": -5.940625190734863, "step": 6760 }, { "epoch": 1.7068981817035893, "grad_norm": 58.994178771972656, "learning_rate": 2.334357534039069e-07, "logits/chosen": -1.1013672351837158, "logits/rejected": -1.03985595703125, "logps/chosen": -296.45001220703125, "logps/rejected": -308.60626220703125, "loss": 0.2025, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.033160448074341, "rewards/margins": 3.9764161109924316, "rewards/rejected": -6.009765625, "step": 6770 }, { "epoch": 1.7094192165884095, "grad_norm": 52.459075927734375, "learning_rate": 2.3270396888578283e-07, "logits/chosen": -1.0966370105743408, "logits/rejected": -1.0048949718475342, "logps/chosen": -287.21875, "logps/rejected": -310.25, "loss": 0.226, "rewards/accuracies": 0.921875, "rewards/chosen": -1.9747803211212158, "rewards/margins": 3.9398193359375, "rewards/rejected": -5.914453029632568, "step": 6780 }, { "epoch": 1.7119402514732298, "grad_norm": 52.806907653808594, "learning_rate": 2.3197233324519274e-07, "logits/chosen": -1.19219970703125, "logits/rejected": NaN, "logps/chosen": -300.26251220703125, "logps/rejected": -328.5, "loss": 0.2229, "rewards/accuracies": 0.890625, "rewards/chosen": -1.9263885021209717, "rewards/margins": 3.865185499191284, "rewards/rejected": -5.791015625, "step": 6790 }, { "epoch": 1.71446128635805, "grad_norm": 54.634307861328125, "learning_rate": 2.312408527797729e-07, "logits/chosen": -1.1331787109375, "logits/rejected": -1.1707031726837158, "logps/chosen": -316.3500061035156, "logps/rejected": -334.4375, "loss": 0.1708, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.149493455886841, "rewards/margins": 4.013085842132568, "rewards/rejected": -6.160351753234863, "step": 6800 }, { "epoch": 1.7169823212428703, "grad_norm": 36.946712493896484, "learning_rate": 2.305095337858236e-07, "logits/chosen": -1.110748291015625, "logits/rejected": -1.109460473060608, "logps/chosen": -315.21563720703125, "logps/rejected": -321.29998779296875, "loss": 0.2232, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0787720680236816, "rewards/margins": 4.053857326507568, "rewards/rejected": -6.132226467132568, "step": 6810 }, { "epoch": 1.7195033561276905, "grad_norm": 60.76621627807617, "learning_rate": 2.2977838255825545e-07, "logits/chosen": -1.2317078113555908, "logits/rejected": -1.173803687095642, "logps/chosen": -313.70001220703125, "logps/rejected": -329.4750061035156, "loss": 0.1947, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.5935790538787842, "rewards/margins": 4.097851753234863, "rewards/rejected": -5.691992282867432, "step": 6820 }, { "epoch": 1.7220243910125106, "grad_norm": 36.37934875488281, "learning_rate": 2.2904740539053477e-07, "logits/chosen": -1.1745116710662842, "logits/rejected": -1.1514892578125, "logps/chosen": -292.6812438964844, "logps/rejected": -274.5375061035156, "loss": 0.2504, "rewards/accuracies": 0.8656250238418579, "rewards/chosen": -1.1040496826171875, "rewards/margins": 3.497354030609131, "rewards/rejected": -4.602148532867432, "step": 6830 }, { "epoch": 1.7245454258973307, "grad_norm": 44.25069046020508, "learning_rate": 2.2831660857462998e-07, "logits/chosen": -1.1512359380722046, "logits/rejected": -1.1413360834121704, "logps/chosen": -301.2749938964844, "logps/rejected": -314.3812561035156, "loss": 0.2143, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.8554443120956421, "rewards/margins": 3.899218797683716, "rewards/rejected": -4.75341796875, "step": 6840 }, { "epoch": 1.727066460782151, "grad_norm": 61.40363693237305, "learning_rate": 2.275859984009568e-07, "logits/chosen": -1.1895751953125, "logits/rejected": -1.2206542491912842, "logps/chosen": -273.83123779296875, "logps/rejected": -304.70001220703125, "loss": 0.2551, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.072595238685608, "rewards/margins": 3.430371046066284, "rewards/rejected": -4.502050876617432, "step": 6850 }, { "epoch": 1.7295874956669715, "grad_norm": 54.41880416870117, "learning_rate": 2.2685558115832445e-07, "logits/chosen": -1.2162597179412842, "logits/rejected": -1.142572045326233, "logps/chosen": -289.38751220703125, "logps/rejected": -313.3999938964844, "loss": 0.2022, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.388641357421875, "rewards/margins": 3.712695360183716, "rewards/rejected": -5.100781440734863, "step": 6860 }, { "epoch": 1.7321085305517916, "grad_norm": 14.012011528015137, "learning_rate": 2.2612536313388172e-07, "logits/chosen": -1.1832396984100342, "logits/rejected": -1.1500122547149658, "logps/chosen": -291.01873779296875, "logps/rejected": -322.83123779296875, "loss": 0.1987, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.426141381263733, "rewards/margins": 3.859081983566284, "rewards/rejected": -5.286035060882568, "step": 6870 }, { "epoch": 1.7346295654366117, "grad_norm": 24.559688568115234, "learning_rate": 2.253953506130622e-07, "logits/chosen": -1.149999976158142, "logits/rejected": -1.1400635242462158, "logps/chosen": -274.54998779296875, "logps/rejected": -305.1812438964844, "loss": 0.1856, "rewards/accuracies": 0.921875, "rewards/chosen": -1.602081298828125, "rewards/margins": 3.8511719703674316, "rewards/rejected": -5.452343940734863, "step": 6880 }, { "epoch": 1.7371506003214319, "grad_norm": 85.66828918457031, "learning_rate": 2.2466554987953107e-07, "logits/chosen": -1.1237976551055908, "logits/rejected": -1.166357398033142, "logps/chosen": -283.7124938964844, "logps/rejected": -311.38751220703125, "loss": 0.2423, "rewards/accuracies": 0.90625, "rewards/chosen": -1.662532091140747, "rewards/margins": 3.855029344558716, "rewards/rejected": -5.517675876617432, "step": 6890 }, { "epoch": 1.739671635206252, "grad_norm": 81.61643981933594, "learning_rate": 2.2393596721512994e-07, "logits/chosen": -1.1964905261993408, "logits/rejected": -1.189306616783142, "logps/chosen": -296.73748779296875, "logps/rejected": -299.70001220703125, "loss": 0.2583, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.5518035888671875, "rewards/margins": 4.013427734375, "rewards/rejected": -5.566601753234863, "step": 6900 }, { "epoch": 1.7421926700910724, "grad_norm": 64.79661560058594, "learning_rate": 2.23206608899824e-07, "logits/chosen": -1.2158203125, "logits/rejected": -1.1887085437774658, "logps/chosen": -299.1499938964844, "logps/rejected": -308.51873779296875, "loss": 0.2358, "rewards/accuracies": 0.90625, "rewards/chosen": -1.6992003917694092, "rewards/margins": 3.9305663108825684, "rewards/rejected": -5.630468845367432, "step": 6910 }, { "epoch": 1.7447137049758927, "grad_norm": 24.7640380859375, "learning_rate": 2.2247748121164686e-07, "logits/chosen": -1.194635033607483, "logits/rejected": -1.178918480873108, "logps/chosen": -289.7437438964844, "logps/rejected": -308.0874938964844, "loss": 0.2048, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5421264171600342, "rewards/margins": 3.921142578125, "rewards/rejected": -5.463183403015137, "step": 6920 }, { "epoch": 1.7472347398607129, "grad_norm": 44.29692840576172, "learning_rate": 2.2174859042664706e-07, "logits/chosen": -1.230371117591858, "logits/rejected": NaN, "logps/chosen": -315.3125, "logps/rejected": -304.9125061035156, "loss": 0.1659, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -1.4905884265899658, "rewards/margins": 3.768798828125, "rewards/rejected": -5.2587890625, "step": 6930 }, { "epoch": 1.749755774745533, "grad_norm": 47.58283996582031, "learning_rate": 2.210199428188343e-07, "logits/chosen": -1.18408203125, "logits/rejected": -1.138671875, "logps/chosen": -312.3999938964844, "logps/rejected": -308.0062561035156, "loss": 0.2728, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.701696753501892, "rewards/margins": 3.532910108566284, "rewards/rejected": -5.235449314117432, "step": 6940 }, { "epoch": 1.7522768096303531, "grad_norm": 32.884429931640625, "learning_rate": 2.2029154466012466e-07, "logits/chosen": -1.1586425304412842, "logits/rejected": -1.135107398033142, "logps/chosen": -281.6968688964844, "logps/rejected": -332.04998779296875, "loss": 0.1999, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.636499047279358, "rewards/margins": 3.898730516433716, "rewards/rejected": -5.537207126617432, "step": 6950 }, { "epoch": 1.7547978445151735, "grad_norm": 38.59349822998047, "learning_rate": 2.1956340222028732e-07, "logits/chosen": -1.208898901939392, "logits/rejected": -1.2027466297149658, "logps/chosen": -309.78125, "logps/rejected": -323.6875, "loss": 0.2094, "rewards/accuracies": 0.90625, "rewards/chosen": -1.5239684581756592, "rewards/margins": 3.780078172683716, "rewards/rejected": -5.302734375, "step": 6960 }, { "epoch": 1.7573188793999936, "grad_norm": 60.351741790771484, "learning_rate": 2.1883552176689016e-07, "logits/chosen": -1.214746117591858, "logits/rejected": -1.175744652748108, "logps/chosen": -318.6625061035156, "logps/rejected": -330.14373779296875, "loss": 0.1817, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.6495177745819092, "rewards/margins": 4.301953315734863, "rewards/rejected": -5.952538967132568, "step": 6970 }, { "epoch": 1.759839914284814, "grad_norm": 22.56215476989746, "learning_rate": 2.181079095652463e-07, "logits/chosen": -1.1741211414337158, "logits/rejected": -1.136987328529358, "logps/chosen": -295.79376220703125, "logps/rejected": -296.2437438964844, "loss": 0.1906, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.6622803211212158, "rewards/margins": 3.964404344558716, "rewards/rejected": -5.62890625, "step": 6980 }, { "epoch": 1.7623609491696342, "grad_norm": 23.98560333251953, "learning_rate": 2.1738057187835952e-07, "logits/chosen": -1.173437476158142, "logits/rejected": -1.0842773914337158, "logps/chosen": -303.7749938964844, "logps/rejected": -305.2749938964844, "loss": 0.1738, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -1.3464279174804688, "rewards/margins": 4.083984375, "rewards/rejected": -5.429883003234863, "step": 6990 }, { "epoch": 1.7648819840544543, "grad_norm": 36.42485427856445, "learning_rate": 2.1665351496687068e-07, "logits/chosen": -1.077172875404358, "logits/rejected": -1.1149780750274658, "logps/chosen": -291.0249938964844, "logps/rejected": -330.3500061035156, "loss": 0.2611, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.8449188470840454, "rewards/margins": 3.899365186691284, "rewards/rejected": -5.742578029632568, "step": 7000 }, { "epoch": 1.7674030189392744, "grad_norm": 41.8332633972168, "learning_rate": 2.159267450890042e-07, "logits/chosen": -1.1143372058868408, "logits/rejected": NaN, "logps/chosen": -282.1625061035156, "logps/rejected": -318.3125, "loss": 0.2058, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.282708764076233, "rewards/margins": 4.120995998382568, "rewards/rejected": -5.402539253234863, "step": 7010 }, { "epoch": 1.7699240538240948, "grad_norm": 52.472660064697266, "learning_rate": 2.1520026850051342e-07, "logits/chosen": -1.2456543445587158, "logits/rejected": -1.1812255382537842, "logps/chosen": -307.08123779296875, "logps/rejected": -330.25, "loss": 0.2573, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.2497161626815796, "rewards/margins": 3.5826783180236816, "rewards/rejected": -4.832129001617432, "step": 7020 }, { "epoch": 1.7724450887089152, "grad_norm": 40.5081901550293, "learning_rate": 2.1447409145462742e-07, "logits/chosen": -1.233862280845642, "logits/rejected": -1.2039916515350342, "logps/chosen": -282.20001220703125, "logps/rejected": -295.9437561035156, "loss": 0.2211, "rewards/accuracies": 0.90625, "rewards/chosen": -0.7961761355400085, "rewards/margins": 3.987255811691284, "rewards/rejected": -4.781152248382568, "step": 7030 }, { "epoch": 1.7749661235937353, "grad_norm": 42.15639877319336, "learning_rate": 2.1374822020199668e-07, "logits/chosen": -1.2635376453399658, "logits/rejected": -1.2142455577850342, "logps/chosen": -317.4125061035156, "logps/rejected": -310.29998779296875, "loss": 0.1602, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -0.8376251459121704, "rewards/margins": 4.2254638671875, "rewards/rejected": -5.0625, "step": 7040 }, { "epoch": 1.7774871584785554, "grad_norm": 40.584781646728516, "learning_rate": 2.130226609906399e-07, "logits/chosen": -1.2356750965118408, "logits/rejected": -1.185400366783142, "logps/chosen": -303.4437561035156, "logps/rejected": -303.38751220703125, "loss": 0.2227, "rewards/accuracies": 0.921875, "rewards/chosen": -1.0569336414337158, "rewards/margins": 3.962695360183716, "rewards/rejected": -5.021484375, "step": 7050 }, { "epoch": 1.7800081933633756, "grad_norm": 71.22672271728516, "learning_rate": 2.1229742006588953e-07, "logits/chosen": -1.1936523914337158, "logits/rejected": -1.193823218345642, "logps/chosen": -293.42498779296875, "logps/rejected": -318.6000061035156, "loss": 0.1658, "rewards/accuracies": 0.9375, "rewards/chosen": -1.1354248523712158, "rewards/margins": 4.288598537445068, "rewards/rejected": -5.423437595367432, "step": 7060 }, { "epoch": 1.782529228248196, "grad_norm": 61.99361801147461, "learning_rate": 2.115725036703383e-07, "logits/chosen": -1.1898071765899658, "logits/rejected": -1.1717712879180908, "logps/chosen": -302.58123779296875, "logps/rejected": -342.4750061035156, "loss": 0.21, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.127203345298767, "rewards/margins": 4.528173923492432, "rewards/rejected": -5.654199123382568, "step": 7070 }, { "epoch": 1.785050263133016, "grad_norm": 68.46553802490234, "learning_rate": 2.1084791804378592e-07, "logits/chosen": -1.232263207435608, "logits/rejected": -1.1902587413787842, "logps/chosen": -329.109375, "logps/rejected": -313.3062438964844, "loss": 0.2801, "rewards/accuracies": 0.878125011920929, "rewards/chosen": -2.0232481956481934, "rewards/margins": 3.694580078125, "rewards/rejected": -5.715234279632568, "step": 7080 }, { "epoch": 1.7875712980178364, "grad_norm": 20.95526885986328, "learning_rate": 2.101236694231845e-07, "logits/chosen": -1.1466553211212158, "logits/rejected": -1.16998291015625, "logps/chosen": -309.26251220703125, "logps/rejected": -317.2875061035156, "loss": 0.2012, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.7224304676055908, "rewards/margins": 4.093945503234863, "rewards/rejected": -5.818945407867432, "step": 7090 }, { "epoch": 1.7900923329026566, "grad_norm": 32.11468505859375, "learning_rate": 2.0939976404258567e-07, "logits/chosen": -1.24072265625, "logits/rejected": -1.1890380382537842, "logps/chosen": -319.48748779296875, "logps/rejected": -315.75, "loss": 0.2403, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.635186791419983, "rewards/margins": 3.9652342796325684, "rewards/rejected": -5.6015625, "step": 7100 }, { "epoch": 1.7926133677874767, "grad_norm": 45.03824234008789, "learning_rate": 2.086762081330863e-07, "logits/chosen": -1.1531493663787842, "logits/rejected": -1.1338379383087158, "logps/chosen": -310.3374938964844, "logps/rejected": -337.125, "loss": 0.1384, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.635858178138733, "rewards/margins": 4.368029594421387, "rewards/rejected": -6.004687309265137, "step": 7110 }, { "epoch": 1.7951344026722968, "grad_norm": 51.74740982055664, "learning_rate": 2.079530079227755e-07, "logits/chosen": -1.1532471179962158, "logits/rejected": -1.1278960704803467, "logps/chosen": -303.3062438964844, "logps/rejected": -322.4125061035156, "loss": 0.2002, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.971154808998108, "rewards/margins": 4.110644340515137, "rewards/rejected": -6.081445217132568, "step": 7120 }, { "epoch": 1.7976554375571172, "grad_norm": 39.18357467651367, "learning_rate": 2.072301696366803e-07, "logits/chosen": -1.1811370849609375, "logits/rejected": -1.1466064453125, "logps/chosen": -326.42498779296875, "logps/rejected": -306.10626220703125, "loss": 0.236, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -2.0232481956481934, "rewards/margins": 3.9530272483825684, "rewards/rejected": -5.977343559265137, "step": 7130 }, { "epoch": 1.8001764724419376, "grad_norm": 21.702817916870117, "learning_rate": 2.0650769949671257e-07, "logits/chosen": -1.1583983898162842, "logits/rejected": -1.0685821771621704, "logps/chosen": -319.9750061035156, "logps/rejected": -321.7875061035156, "loss": 0.1995, "rewards/accuracies": 0.921875, "rewards/chosen": -1.6598937511444092, "rewards/margins": 4.170605659484863, "rewards/rejected": -5.830078125, "step": 7140 }, { "epoch": 1.8026975073267577, "grad_norm": 70.05231475830078, "learning_rate": 2.057856037216155e-07, "logits/chosen": -1.197509765625, "logits/rejected": -1.1431152820587158, "logps/chosen": -328.5406188964844, "logps/rejected": -326.84375, "loss": 0.2782, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.883947730064392, "rewards/margins": 3.9864501953125, "rewards/rejected": -5.870703220367432, "step": 7150 }, { "epoch": 1.8052185422115778, "grad_norm": 51.588199615478516, "learning_rate": 2.0506388852690958e-07, "logits/chosen": -1.1375305652618408, "logits/rejected": -1.0667235851287842, "logps/chosen": -309.46875, "logps/rejected": -319.26251220703125, "loss": 0.2137, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.923803687095642, "rewards/margins": 3.979199171066284, "rewards/rejected": -5.904296875, "step": 7160 }, { "epoch": 1.807739577096398, "grad_norm": 50.91193389892578, "learning_rate": 2.043425601248397e-07, "logits/chosen": -1.1796386241912842, "logits/rejected": -1.147558569908142, "logps/chosen": -305.07501220703125, "logps/rejected": -349.6000061035156, "loss": 0.2098, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.8881622552871704, "rewards/margins": 4.063036918640137, "rewards/rejected": -5.951171875, "step": 7170 }, { "epoch": 1.8102606119812181, "grad_norm": 30.04096221923828, "learning_rate": 2.03621624724321e-07, "logits/chosen": -1.219384789466858, "logits/rejected": -1.1800323724746704, "logps/chosen": -315.734375, "logps/rejected": -317.91876220703125, "loss": 0.1466, "rewards/accuracies": 0.9375, "rewards/chosen": -1.241387963294983, "rewards/margins": 4.197802543640137, "rewards/rejected": -5.439135551452637, "step": 7180 }, { "epoch": 1.8127816468660385, "grad_norm": 68.7504653930664, "learning_rate": 2.0290108853088634e-07, "logits/chosen": -1.182397484779358, "logits/rejected": -1.1044189929962158, "logps/chosen": -298.3062438964844, "logps/rejected": -339.58123779296875, "loss": 0.2239, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3178939819335938, "rewards/margins": 3.9729981422424316, "rewards/rejected": -5.290625095367432, "step": 7190 }, { "epoch": 1.8153026817508588, "grad_norm": 36.92831039428711, "learning_rate": 2.0218095774663197e-07, "logits/chosen": -1.19891357421875, "logits/rejected": -1.244384765625, "logps/chosen": -290.9375, "logps/rejected": -325.7875061035156, "loss": 0.2379, "rewards/accuracies": 0.921875, "rewards/chosen": -1.2448089122772217, "rewards/margins": 3.6868653297424316, "rewards/rejected": -4.9296875, "step": 7200 }, { "epoch": 1.817823716635679, "grad_norm": 50.52592086791992, "learning_rate": 2.0146123857016453e-07, "logits/chosen": -1.220617651939392, "logits/rejected": -1.1038939952850342, "logps/chosen": -315.75, "logps/rejected": -317.3125, "loss": 0.1802, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.163214087486267, "rewards/margins": 4.184423923492432, "rewards/rejected": -5.347754001617432, "step": 7210 }, { "epoch": 1.8203447515204991, "grad_norm": 29.57152557373047, "learning_rate": 2.0074193719654803e-07, "logits/chosen": -1.17822265625, "logits/rejected": -1.1136901378631592, "logps/chosen": -299.57501220703125, "logps/rejected": -310.375, "loss": 0.1828, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -0.9800170660018921, "rewards/margins": 4.062353610992432, "rewards/rejected": -5.04345703125, "step": 7220 }, { "epoch": 1.8228657864053193, "grad_norm": 53.481231689453125, "learning_rate": 2.0002305981724983e-07, "logits/chosen": -1.1823852062225342, "logits/rejected": NaN, "logps/chosen": -307.79998779296875, "logps/rejected": -343.0375061035156, "loss": 0.2837, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -0.9479309320449829, "rewards/margins": 3.863085985183716, "rewards/rejected": -4.809912204742432, "step": 7230 }, { "epoch": 1.8253868212901396, "grad_norm": 49.503753662109375, "learning_rate": 1.99304612620088e-07, "logits/chosen": -1.165826439857483, "logits/rejected": -1.1661498546600342, "logps/chosen": -316.64373779296875, "logps/rejected": -328.20001220703125, "loss": 0.1966, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.039465308189392, "rewards/margins": 4.062207221984863, "rewards/rejected": -5.102246284484863, "step": 7240 }, { "epoch": 1.8279078561749598, "grad_norm": 37.43131637573242, "learning_rate": 1.9858660178917743e-07, "logits/chosen": NaN, "logits/rejected": -1.1943480968475342, "logps/chosen": -300.20623779296875, "logps/rejected": -325.29376220703125, "loss": 0.1436, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.8471435308456421, "rewards/margins": 4.044335842132568, "rewards/rejected": -4.891406059265137, "step": 7250 }, { "epoch": 1.8304288910597801, "grad_norm": 38.07368087768555, "learning_rate": 1.9786903350487737e-07, "logits/chosen": -1.1146240234375, "logits/rejected": NaN, "logps/chosen": -289.29998779296875, "logps/rejected": -323.75, "loss": 0.2243, "rewards/accuracies": 0.921875, "rewards/chosen": -1.232031226158142, "rewards/margins": 3.966357469558716, "rewards/rejected": -5.198339939117432, "step": 7260 }, { "epoch": 1.8329499259446003, "grad_norm": 37.894126892089844, "learning_rate": 1.9715191394373745e-07, "logits/chosen": -1.1885559558868408, "logits/rejected": -1.108056664466858, "logps/chosen": -310.34375, "logps/rejected": -295.4624938964844, "loss": 0.2932, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.688812255859375, "rewards/margins": 3.6373047828674316, "rewards/rejected": -5.326367378234863, "step": 7270 }, { "epoch": 1.8354709608294204, "grad_norm": 18.592529296875, "learning_rate": 1.964352492784449e-07, "logits/chosen": -1.239660620689392, "logits/rejected": -1.174230933189392, "logps/chosen": -329.13751220703125, "logps/rejected": -322.2562561035156, "loss": 0.2346, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.550012230873108, "rewards/margins": 3.8741211891174316, "rewards/rejected": -5.4228515625, "step": 7280 }, { "epoch": 1.8379919957142405, "grad_norm": 65.15435028076172, "learning_rate": 1.957190456777717e-07, "logits/chosen": -1.218542456626892, "logits/rejected": -1.170739769935608, "logps/chosen": -336.3999938964844, "logps/rejected": -329.73748779296875, "loss": 0.2284, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.6474609375, "rewards/margins": 3.7369141578674316, "rewards/rejected": -5.384375095367432, "step": 7290 }, { "epoch": 1.840513030599061, "grad_norm": 48.60396957397461, "learning_rate": 1.9500330930652073e-07, "logits/chosen": -1.1867187023162842, "logits/rejected": -1.112945556640625, "logps/chosen": -315.9624938964844, "logps/rejected": -312.76873779296875, "loss": 0.2434, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.55596923828125, "rewards/margins": 3.893359422683716, "rewards/rejected": -5.448534965515137, "step": 7300 }, { "epoch": 1.8430340654838813, "grad_norm": 49.56249237060547, "learning_rate": 1.9428804632547348e-07, "logits/chosen": -1.152978539466858, "logits/rejected": -1.1125946044921875, "logps/chosen": -302.4312438964844, "logps/rejected": -322.86248779296875, "loss": 0.2171, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -2.0119261741638184, "rewards/margins": 3.822216749191284, "rewards/rejected": -5.832226753234863, "step": 7310 }, { "epoch": 1.8455551003687014, "grad_norm": 31.021007537841797, "learning_rate": 1.9357326289133635e-07, "logits/chosen": -1.170019507408142, "logits/rejected": -1.130346655845642, "logps/chosen": -294.8999938964844, "logps/rejected": -314.86248779296875, "loss": 0.2278, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -1.97314453125, "rewards/margins": 3.8570313453674316, "rewards/rejected": -5.830664157867432, "step": 7320 }, { "epoch": 1.8480761352535215, "grad_norm": 27.7810001373291, "learning_rate": 1.9285896515668841e-07, "logits/chosen": -1.089074730873108, "logits/rejected": -1.113073706626892, "logps/chosen": -300.5, "logps/rejected": -339.4750061035156, "loss": 0.2816, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -2.132189989089966, "rewards/margins": 3.7269043922424316, "rewards/rejected": -5.856249809265137, "step": 7330 }, { "epoch": 1.8505971701383417, "grad_norm": 63.46874237060547, "learning_rate": 1.9214515926992775e-07, "logits/chosen": -1.1709716320037842, "logits/rejected": -1.149572730064392, "logps/chosen": -297.88751220703125, "logps/rejected": -338.7124938964844, "loss": 0.2044, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.0968995094299316, "rewards/margins": 3.894238233566284, "rewards/rejected": -5.992383003234863, "step": 7340 }, { "epoch": 1.853118205023162, "grad_norm": 48.2232780456543, "learning_rate": 1.9143185137521863e-07, "logits/chosen": -1.1376526355743408, "logits/rejected": -1.178442358970642, "logps/chosen": -320.36248779296875, "logps/rejected": -307.8125, "loss": 0.22, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.9373047351837158, "rewards/margins": 3.641845703125, "rewards/rejected": -5.579297065734863, "step": 7350 }, { "epoch": 1.8556392399079822, "grad_norm": 56.41461181640625, "learning_rate": 1.9071904761243935e-07, "logits/chosen": -1.116979956626892, "logits/rejected": -1.0767822265625, "logps/chosen": -292.09375, "logps/rejected": -312.11248779296875, "loss": 0.2359, "rewards/accuracies": 0.921875, "rewards/chosen": -1.8706543445587158, "rewards/margins": 3.8099608421325684, "rewards/rejected": -5.6826171875, "step": 7360 }, { "epoch": 1.8581602747928025, "grad_norm": 42.00067138671875, "learning_rate": 1.9000675411712827e-07, "logits/chosen": -1.0812804698944092, "logits/rejected": -1.056420922279358, "logps/chosen": -288.96875, "logps/rejected": -306.2124938964844, "loss": 0.1695, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.909002661705017, "rewards/margins": 3.8208985328674316, "rewards/rejected": -5.732226371765137, "step": 7370 }, { "epoch": 1.8606813096776227, "grad_norm": 32.646976470947266, "learning_rate": 1.8929497702043194e-07, "logits/chosen": -1.1477782726287842, "logits/rejected": -1.127862572669983, "logps/chosen": -305.01873779296875, "logps/rejected": -340.20001220703125, "loss": 0.2013, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.578704833984375, "rewards/margins": 3.863208055496216, "rewards/rejected": -5.444238185882568, "step": 7380 }, { "epoch": 1.8632023445624428, "grad_norm": 35.01163864135742, "learning_rate": 1.8858372244905162e-07, "logits/chosen": -1.206610083580017, "logits/rejected": -1.176977515220642, "logps/chosen": -310.64373779296875, "logps/rejected": -320.23748779296875, "loss": 0.2215, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.5214020013809204, "rewards/margins": 3.83935546875, "rewards/rejected": -5.360253810882568, "step": 7390 }, { "epoch": 1.865723379447263, "grad_norm": 60.340877532958984, "learning_rate": 1.878729965251913e-07, "logits/chosen": -1.2282226085662842, "logits/rejected": -1.130517601966858, "logps/chosen": -297.16876220703125, "logps/rejected": -297.3999938964844, "loss": 0.2292, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.558538794517517, "rewards/margins": 3.6513671875, "rewards/rejected": -5.211328029632568, "step": 7400 }, { "epoch": 1.8682444143320833, "grad_norm": 27.210193634033203, "learning_rate": 1.871628053665043e-07, "logits/chosen": -1.1539306640625, "logits/rejected": -1.116418480873108, "logps/chosen": -281.4312438964844, "logps/rejected": -301.7250061035156, "loss": 0.2023, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.800054907798767, "rewards/margins": 3.984326124191284, "rewards/rejected": -5.78125, "step": 7410 }, { "epoch": 1.8707654492169037, "grad_norm": 62.806583404541016, "learning_rate": 1.864531550860407e-07, "logits/chosen": -1.128961205482483, "logits/rejected": -1.1193358898162842, "logps/chosen": -315.125, "logps/rejected": -343.7124938964844, "loss": 0.2706, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.7468230724334717, "rewards/margins": 4.126611232757568, "rewards/rejected": -5.873632907867432, "step": 7420 }, { "epoch": 1.8732864841017238, "grad_norm": 52.17653274536133, "learning_rate": 1.8574405179219548e-07, "logits/chosen": -1.184814453125, "logits/rejected": -1.18505859375, "logps/chosen": -291.1875, "logps/rejected": -314.29998779296875, "loss": 0.237, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -1.6356322765350342, "rewards/margins": 3.832763671875, "rewards/rejected": -5.46728515625, "step": 7430 }, { "epoch": 1.875807518986544, "grad_norm": 56.88821792602539, "learning_rate": 1.8503550158865476e-07, "logits/chosen": -1.171630859375, "logits/rejected": -1.1357085704803467, "logps/chosen": -308.3500061035156, "logps/rejected": -325.875, "loss": 0.2269, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.572625756263733, "rewards/margins": 3.9573731422424316, "rewards/rejected": -5.529394626617432, "step": 7440 }, { "epoch": 1.878328553871364, "grad_norm": 76.11968994140625, "learning_rate": 1.8432751057434438e-07, "logits/chosen": -1.2202575206756592, "logits/rejected": -1.1420433521270752, "logps/chosen": -300.82501220703125, "logps/rejected": -312.4624938964844, "loss": 0.2412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.3674484491348267, "rewards/margins": 3.9940428733825684, "rewards/rejected": -5.360547065734863, "step": 7450 }, { "epoch": 1.8808495887561845, "grad_norm": 48.39967346191406, "learning_rate": 1.8362008484337637e-07, "logits/chosen": -1.162255883216858, "logits/rejected": -1.19366455078125, "logps/chosen": -280.67498779296875, "logps/rejected": -331.71875, "loss": 0.1942, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.4645812511444092, "rewards/margins": 3.917285203933716, "rewards/rejected": -5.380078315734863, "step": 7460 }, { "epoch": 1.8833706236410046, "grad_norm": 56.637794494628906, "learning_rate": 1.8291323048499762e-07, "logits/chosen": -1.2105224132537842, "logits/rejected": -1.1904785633087158, "logps/chosen": -301.11248779296875, "logps/rejected": -297.3374938964844, "loss": 0.2696, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.576446533203125, "rewards/margins": 3.677197217941284, "rewards/rejected": -5.254296779632568, "step": 7470 }, { "epoch": 1.885891658525825, "grad_norm": 29.607648849487305, "learning_rate": 1.8220695358353643e-07, "logits/chosen": -1.1474609375, "logits/rejected": -1.118371605873108, "logps/chosen": -305.5375061035156, "logps/rejected": -324.7562561035156, "loss": 0.2046, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.4046661853790283, "rewards/margins": 4.186132907867432, "rewards/rejected": -5.593359470367432, "step": 7480 }, { "epoch": 1.888412693410645, "grad_norm": 13.99630069732666, "learning_rate": 1.815012602183506e-07, "logits/chosen": -1.153906226158142, "logits/rejected": -1.085168480873108, "logps/chosen": -288.42498779296875, "logps/rejected": -315.51251220703125, "loss": 0.2775, "rewards/accuracies": 0.878125011920929, "rewards/chosen": -1.6456924676895142, "rewards/margins": 3.4505372047424316, "rewards/rejected": -5.094628810882568, "step": 7490 }, { "epoch": 1.8909337282954652, "grad_norm": 46.438785552978516, "learning_rate": 1.8079615646377535e-07, "logits/chosen": -1.2250487804412842, "logits/rejected": -1.2243163585662842, "logps/chosen": -297.53125, "logps/rejected": -304.17498779296875, "loss": 0.2278, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.40008544921875, "rewards/margins": 3.6764159202575684, "rewards/rejected": -5.075976371765137, "step": 7500 }, { "epoch": 1.8934547631802854, "grad_norm": 60.958229064941406, "learning_rate": 1.800916483890705e-07, "logits/chosen": -1.116601586341858, "logits/rejected": -1.04278564453125, "logps/chosen": -300.8812561035156, "logps/rejected": -331.48748779296875, "loss": 0.2387, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.576141357421875, "rewards/margins": 4.094872951507568, "rewards/rejected": -5.670312404632568, "step": 7510 }, { "epoch": 1.8959757980651057, "grad_norm": 28.104612350463867, "learning_rate": 1.793877420583686e-07, "logits/chosen": -1.158361792564392, "logits/rejected": -1.134667992591858, "logps/chosen": -315.29998779296875, "logps/rejected": -324.6875, "loss": 0.2139, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.790228247642517, "rewards/margins": 3.9569334983825684, "rewards/rejected": -5.746289253234863, "step": 7520 }, { "epoch": 1.898496832949926, "grad_norm": 43.907108306884766, "learning_rate": 1.786844435306225e-07, "logits/chosen": -1.1804687976837158, "logits/rejected": -1.0574157238006592, "logps/chosen": -294.1312561035156, "logps/rejected": -276.9125061035156, "loss": 0.1924, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.487634301185608, "rewards/margins": 3.907885789871216, "rewards/rejected": -5.393750190734863, "step": 7530 }, { "epoch": 1.9010178678347462, "grad_norm": 47.292930603027344, "learning_rate": 1.7798175885955364e-07, "logits/chosen": -1.1721069812774658, "logits/rejected": -1.195397973060608, "logps/chosen": -311.875, "logps/rejected": -292.3125, "loss": 0.2577, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -1.620141625404358, "rewards/margins": 3.6788086891174316, "rewards/rejected": -5.298144340515137, "step": 7540 }, { "epoch": 1.9035389027195664, "grad_norm": 66.42214965820312, "learning_rate": 1.7727969409359922e-07, "logits/chosen": -1.2217285633087158, "logits/rejected": -1.190954566001892, "logps/chosen": -283.6937561035156, "logps/rejected": -308.70001220703125, "loss": 0.1982, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.5985686779022217, "rewards/margins": 3.812548875808716, "rewards/rejected": -5.410351753234863, "step": 7550 }, { "epoch": 1.9060599376043865, "grad_norm": 40.26822280883789, "learning_rate": 1.7657825527586066e-07, "logits/chosen": -1.2541015148162842, "logits/rejected": -1.2113158702850342, "logps/chosen": -307.51873779296875, "logps/rejected": -322.45623779296875, "loss": 0.2394, "rewards/accuracies": 0.921875, "rewards/chosen": -1.6601440906524658, "rewards/margins": 3.708544969558716, "rewards/rejected": -5.368359565734863, "step": 7560 }, { "epoch": 1.9085809724892067, "grad_norm": 62.55760192871094, "learning_rate": 1.7587744844405172e-07, "logits/chosen": -1.2342407703399658, "logits/rejected": -1.202978491783142, "logps/chosen": -297.98748779296875, "logps/rejected": -305.90625, "loss": 0.2699, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.6728484630584717, "rewards/margins": 3.495410203933716, "rewards/rejected": -5.1689453125, "step": 7570 }, { "epoch": 1.911102007374027, "grad_norm": 19.37392234802246, "learning_rate": 1.7517727963044592e-07, "logits/chosen": -1.1997802257537842, "logits/rejected": -1.0855712890625, "logps/chosen": -295.1312561035156, "logps/rejected": -291.5687561035156, "loss": 0.2427, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.407446265220642, "rewards/margins": 3.903393507003784, "rewards/rejected": -5.311816215515137, "step": 7580 }, { "epoch": 1.9136230422588474, "grad_norm": 26.273832321166992, "learning_rate": 1.7447775486182518e-07, "logits/chosen": -1.254736304283142, "logits/rejected": -1.1654876470565796, "logps/chosen": -306.0062561035156, "logps/rejected": -315.51873779296875, "loss": 0.1584, "rewards/accuracies": 0.9375, "rewards/chosen": -1.317114233970642, "rewards/margins": 4.092382907867432, "rewards/rejected": -5.406933784484863, "step": 7590 }, { "epoch": 1.9161440771436675, "grad_norm": 39.29946517944336, "learning_rate": 1.7377888015942748e-07, "logits/chosen": -1.203125, "logits/rejected": -1.188256859779358, "logps/chosen": -293.53125, "logps/rejected": -300.0874938964844, "loss": 0.2223, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.52764892578125, "rewards/margins": 3.745898485183716, "rewards/rejected": -5.2744140625, "step": 7600 }, { "epoch": 1.9186651120284877, "grad_norm": 23.348567962646484, "learning_rate": 1.7308066153889578e-07, "logits/chosen": -1.155664086341858, "logits/rejected": -1.10052490234375, "logps/chosen": -293.5687561035156, "logps/rejected": -323.2749938964844, "loss": 0.2016, "rewards/accuracies": 0.90625, "rewards/chosen": -1.3254272937774658, "rewards/margins": 3.885498046875, "rewards/rejected": -5.208105564117432, "step": 7610 }, { "epoch": 1.9211861469133078, "grad_norm": 48.23688888549805, "learning_rate": 1.7238310501022517e-07, "logits/chosen": -1.153076171875, "logits/rejected": -1.1710388660430908, "logps/chosen": -288.125, "logps/rejected": -305.9375, "loss": 0.2616, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.390020728111267, "rewards/margins": 3.6179442405700684, "rewards/rejected": -5.009375095367432, "step": 7620 }, { "epoch": 1.9237071817981282, "grad_norm": 23.3243465423584, "learning_rate": 1.71686216577712e-07, "logits/chosen": -1.1703002452850342, "logits/rejected": -1.1937377452850342, "logps/chosen": -293.9125061035156, "logps/rejected": -311.5718688964844, "loss": 0.1641, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.2073485851287842, "rewards/margins": 3.781494140625, "rewards/rejected": -4.989501953125, "step": 7630 }, { "epoch": 1.9262282166829483, "grad_norm": 70.19459533691406, "learning_rate": 1.70990002239902e-07, "logits/chosen": -1.2073974609375, "logits/rejected": -1.1364624500274658, "logps/chosen": -323.54376220703125, "logps/rejected": -310.78125, "loss": 0.2142, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4185912609100342, "rewards/margins": 3.931445360183716, "rewards/rejected": -5.3515625, "step": 7640 }, { "epoch": 1.9287492515677687, "grad_norm": 57.75076675415039, "learning_rate": 1.7029446798953828e-07, "logits/chosen": -1.165252685546875, "logits/rejected": -1.179132103919983, "logps/chosen": -278.4375, "logps/rejected": -337.29998779296875, "loss": 0.1795, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -1.5640045404434204, "rewards/margins": 4.150390625, "rewards/rejected": -5.712206840515137, "step": 7650 }, { "epoch": 1.9312702864525888, "grad_norm": 26.637720108032227, "learning_rate": 1.6959961981351025e-07, "logits/chosen": -1.166986107826233, "logits/rejected": -1.143103003501892, "logps/chosen": -304.1812438964844, "logps/rejected": -309.17498779296875, "loss": 0.2837, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.929724097251892, "rewards/margins": 3.954785108566284, "rewards/rejected": -5.8876953125, "step": 7660 }, { "epoch": 1.933791321337409, "grad_norm": 69.45800018310547, "learning_rate": 1.6890546369280167e-07, "logits/chosen": -1.156982421875, "logits/rejected": -1.2170898914337158, "logps/chosen": -307.39373779296875, "logps/rejected": -331.65625, "loss": 0.2257, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.6480376720428467, "rewards/margins": 4.034033298492432, "rewards/rejected": -5.680859565734863, "step": 7670 }, { "epoch": 1.936312356222229, "grad_norm": 46.982643127441406, "learning_rate": 1.6821200560243963e-07, "logits/chosen": -1.1876220703125, "logits/rejected": -1.088720679283142, "logps/chosen": -288.5, "logps/rejected": -316.8125, "loss": 0.2756, "rewards/accuracies": 0.890625, "rewards/chosen": -1.824945092201233, "rewards/margins": 3.7763915061950684, "rewards/rejected": -5.601171970367432, "step": 7680 }, { "epoch": 1.9388333911070494, "grad_norm": 35.12260437011719, "learning_rate": 1.6751925151144259e-07, "logits/chosen": -1.2165100574493408, "logits/rejected": -1.153601050376892, "logps/chosen": -285.70001220703125, "logps/rejected": -319.6499938964844, "loss": 0.1952, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.728387475013733, "rewards/margins": 3.882763624191284, "rewards/rejected": -5.612206935882568, "step": 7690 }, { "epoch": 1.9413544259918698, "grad_norm": 35.687042236328125, "learning_rate": 1.6682720738276918e-07, "logits/chosen": -1.19384765625, "logits/rejected": -1.167883276939392, "logps/chosen": -314.6875, "logps/rejected": -322.61248779296875, "loss": 0.2299, "rewards/accuracies": 0.90625, "rewards/chosen": -1.49139404296875, "rewards/margins": 3.896240234375, "rewards/rejected": -5.388281345367432, "step": 7700 }, { "epoch": 1.94387546087669, "grad_norm": 13.64608097076416, "learning_rate": 1.6613587917326738e-07, "logits/chosen": -1.192773461341858, "logits/rejected": -1.1346924304962158, "logps/chosen": -296.86248779296875, "logps/rejected": -321.3999938964844, "loss": 0.1853, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.4157593250274658, "rewards/margins": 4.344433784484863, "rewards/rejected": -5.75927734375, "step": 7710 }, { "epoch": 1.94639649576151, "grad_norm": 64.79629516601562, "learning_rate": 1.6544527283362237e-07, "logits/chosen": -1.1622436046600342, "logits/rejected": -1.1357421875, "logps/chosen": -287.6499938964844, "logps/rejected": -302.5375061035156, "loss": 0.1799, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -1.3306884765625, "rewards/margins": 3.9735350608825684, "rewards/rejected": -5.307226657867432, "step": 7720 }, { "epoch": 1.9489175306463302, "grad_norm": 48.76823425292969, "learning_rate": 1.6475539430830604e-07, "logits/chosen": -1.1828491687774658, "logits/rejected": -1.1461181640625, "logps/chosen": -302.29998779296875, "logps/rejected": -328.8374938964844, "loss": 0.27, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.4570099115371704, "rewards/margins": 3.7859864234924316, "rewards/rejected": -5.241601467132568, "step": 7730 }, { "epoch": 1.9514385655311506, "grad_norm": 47.993526458740234, "learning_rate": 1.640662495355253e-07, "logits/chosen": -1.2107055187225342, "logits/rejected": -1.1277954578399658, "logps/chosen": -314.9375, "logps/rejected": -307.20001220703125, "loss": 0.2817, "rewards/accuracies": 0.90625, "rewards/chosen": -1.59698486328125, "rewards/margins": 3.7835450172424316, "rewards/rejected": -5.379101753234863, "step": 7740 }, { "epoch": 1.9539596004159707, "grad_norm": 64.34617614746094, "learning_rate": 1.6337784444717142e-07, "logits/chosen": -1.1866943836212158, "logits/rejected": -1.1796753406524658, "logps/chosen": -275.7875061035156, "logps/rejected": -296.9750061035156, "loss": 0.2628, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.360137939453125, "rewards/margins": 3.6226563453674316, "rewards/rejected": -4.983300685882568, "step": 7750 }, { "epoch": 1.956480635300791, "grad_norm": 52.332664489746094, "learning_rate": 1.626901849687687e-07, "logits/chosen": -1.182946801185608, "logits/rejected": -1.1652953624725342, "logps/chosen": -289.2093811035156, "logps/rejected": -308.66876220703125, "loss": 0.1713, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.5947754383087158, "rewards/margins": 4.042382717132568, "rewards/rejected": -5.638867378234863, "step": 7760 }, { "epoch": 1.9590016701856112, "grad_norm": 52.05509567260742, "learning_rate": 1.6200327701942328e-07, "logits/chosen": -1.251123070716858, "logits/rejected": -1.162377953529358, "logps/chosen": -316.75, "logps/rejected": -322.75, "loss": 0.2451, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.1040756702423096, "rewards/margins": 3.8963379859924316, "rewards/rejected": -4.997754096984863, "step": 7770 }, { "epoch": 1.9615227050704314, "grad_norm": 22.568510055541992, "learning_rate": 1.6131712651177288e-07, "logits/chosen": -1.1433242559432983, "logits/rejected": -1.0986816883087158, "logps/chosen": -274.8999938964844, "logps/rejected": -298.79998779296875, "loss": 0.1936, "rewards/accuracies": 0.921875, "rewards/chosen": -1.565435767173767, "rewards/margins": 3.958056688308716, "rewards/rejected": -5.525195121765137, "step": 7780 }, { "epoch": 1.9640437399552515, "grad_norm": 37.83517837524414, "learning_rate": 1.6063173935193503e-07, "logits/chosen": -1.076226830482483, "logits/rejected": -1.0985596179962158, "logps/chosen": -274.79376220703125, "logps/rejected": -309.38751220703125, "loss": 0.2418, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -1.1774413585662842, "rewards/margins": 4.007861137390137, "rewards/rejected": -5.183667182922363, "step": 7790 }, { "epoch": 1.9665647748400719, "grad_norm": 33.46553039550781, "learning_rate": 1.5994712143945693e-07, "logits/chosen": -1.147851586341858, "logits/rejected": -1.068884253501892, "logps/chosen": -294.14373779296875, "logps/rejected": -310.8999938964844, "loss": 0.2499, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.3985474109649658, "rewards/margins": 3.975830078125, "rewards/rejected": -5.372656345367432, "step": 7800 }, { "epoch": 1.9690858097248922, "grad_norm": 63.27622985839844, "learning_rate": 1.592632786672642e-07, "logits/chosen": -1.171142578125, "logits/rejected": -1.1185424327850342, "logps/chosen": -312.65625, "logps/rejected": -321.3500061035156, "loss": 0.2516, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.239068627357483, "rewards/margins": 3.9042725563049316, "rewards/rejected": -5.14501953125, "step": 7810 }, { "epoch": 1.9716068446097124, "grad_norm": 53.166255950927734, "learning_rate": 1.5858021692161054e-07, "logits/chosen": -1.1641693115234375, "logits/rejected": -1.1528809070587158, "logps/chosen": -301.13751220703125, "logps/rejected": -289.9312438964844, "loss": 0.2474, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -1.0548827648162842, "rewards/margins": 3.8763670921325684, "rewards/rejected": -4.929931640625, "step": 7820 }, { "epoch": 1.9741278794945325, "grad_norm": 33.32133102416992, "learning_rate": 1.578979420820268e-07, "logits/chosen": -1.178247094154358, "logits/rejected": -1.154272437095642, "logps/chosen": -291.5, "logps/rejected": -300.875, "loss": 0.217, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4634521007537842, "rewards/margins": 3.802197217941284, "rewards/rejected": -5.264062404632568, "step": 7830 }, { "epoch": 1.9766489143793526, "grad_norm": 32.27366638183594, "learning_rate": 1.572164600212703e-07, "logits/chosen": -1.191552758216858, "logits/rejected": -1.1298401355743408, "logps/chosen": -306.3812561035156, "logps/rejected": -330.3374938964844, "loss": 0.1884, "rewards/accuracies": 0.921875, "rewards/chosen": -1.5618469715118408, "rewards/margins": 3.993359327316284, "rewards/rejected": -5.555468559265137, "step": 7840 }, { "epoch": 1.9791699492641728, "grad_norm": 66.00794982910156, "learning_rate": 1.5653577660527474e-07, "logits/chosen": -1.1798095703125, "logits/rejected": -1.1678345203399658, "logps/chosen": -314.6000061035156, "logps/rejected": -314.54376220703125, "loss": 0.2634, "rewards/accuracies": 0.90625, "rewards/chosen": -1.4441254138946533, "rewards/margins": 3.921435594558716, "rewards/rejected": -5.366015434265137, "step": 7850 }, { "epoch": 1.9816909841489931, "grad_norm": 40.01249694824219, "learning_rate": 1.5585589769309904e-07, "logits/chosen": -1.217675805091858, "logits/rejected": -1.2192871570587158, "logps/chosen": -326.34375, "logps/rejected": -334.41876220703125, "loss": 0.2045, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.3330872058868408, "rewards/margins": 4.169384956359863, "rewards/rejected": -5.503710746765137, "step": 7860 }, { "epoch": 1.9842120190338135, "grad_norm": 14.252655982971191, "learning_rate": 1.5517682913687764e-07, "logits/chosen": -1.1039550304412842, "logits/rejected": -1.070104956626892, "logps/chosen": -303.0062561035156, "logps/rejected": -313.33123779296875, "loss": 0.2002, "rewards/accuracies": 0.921875, "rewards/chosen": -1.8024413585662842, "rewards/margins": 3.9036622047424316, "rewards/rejected": -5.705078125, "step": 7870 }, { "epoch": 1.9867330539186336, "grad_norm": 68.55850982666016, "learning_rate": 1.544985767817693e-07, "logits/chosen": -1.151281714439392, "logits/rejected": -1.1851074695587158, "logps/chosen": -304.3187561035156, "logps/rejected": -309.3812561035156, "loss": 0.2472, "rewards/accuracies": 0.878125011920929, "rewards/chosen": -1.7944762706756592, "rewards/margins": 3.6312499046325684, "rewards/rejected": -5.427148342132568, "step": 7880 }, { "epoch": 1.9892540888034538, "grad_norm": 79.71931457519531, "learning_rate": 1.5382114646590776e-07, "logits/chosen": NaN, "logits/rejected": -1.160913109779358, "logps/chosen": -327.76251220703125, "logps/rejected": -325.1875, "loss": 0.2788, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.6047241687774658, "rewards/margins": 3.879687547683716, "rewards/rejected": -5.482714653015137, "step": 7890 }, { "epoch": 1.991775123688274, "grad_norm": 86.6191177368164, "learning_rate": 1.5314454402035055e-07, "logits/chosen": -1.221826195716858, "logits/rejected": -1.097039818763733, "logps/chosen": -306.8687438964844, "logps/rejected": -310.0687561035156, "loss": 0.283, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.570031762123108, "rewards/margins": 4.181323051452637, "rewards/rejected": -5.751074314117432, "step": 7900 }, { "epoch": 1.9942961585730943, "grad_norm": 54.50166702270508, "learning_rate": 1.5246877526902925e-07, "logits/chosen": -1.2642333507537842, "logits/rejected": -1.164770483970642, "logps/chosen": -293.1156311035156, "logps/rejected": -290.8999938964844, "loss": 0.2078, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.9391708374023438, "rewards/margins": 3.8685059547424316, "rewards/rejected": -4.808203220367432, "step": 7910 }, { "epoch": 1.9968171934579144, "grad_norm": 36.27590560913086, "learning_rate": 1.5179384602869963e-07, "logits/chosen": -1.250512719154358, "logits/rejected": -1.2113158702850342, "logps/chosen": -324.53125, "logps/rejected": -305.98126220703125, "loss": 0.1916, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.6489197015762329, "rewards/margins": 3.955273389816284, "rewards/rejected": -4.604784965515137, "step": 7920 }, { "epoch": 1.9993382283427348, "grad_norm": 58.246299743652344, "learning_rate": 1.5111976210889093e-07, "logits/chosen": -1.2183349132537842, "logits/rejected": NaN, "logps/chosen": -306.46875, "logps/rejected": -326.53125, "loss": 0.2107, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.170811414718628, "rewards/margins": 3.90380859375, "rewards/rejected": -5.075293064117432, "step": 7930 }, { "epoch": 2.0020168279078563, "grad_norm": 10.656667709350586, "learning_rate": 1.5044652931185647e-07, "logits/chosen": NaN, "logits/rejected": -1.1141183376312256, "logps/chosen": -300.76190185546875, "logps/rejected": -312.1428527832031, "loss": 0.1289, "rewards/accuracies": 0.9553571343421936, "rewards/chosen": -0.8152959942817688, "rewards/margins": 4.322544574737549, "rewards/rejected": -5.13876485824585, "step": 7940 }, { "epoch": 2.0045378627926764, "grad_norm": 6.418241024017334, "learning_rate": 1.4977415343252313e-07, "logits/chosen": -1.1527984142303467, "logits/rejected": -1.1155884265899658, "logps/chosen": -291.15625, "logps/rejected": -302.0249938964844, "loss": 0.0487, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.9614166021347046, "rewards/margins": 4.958788871765137, "rewards/rejected": -5.920117378234863, "step": 7950 }, { "epoch": 2.0070588976774966, "grad_norm": 18.13500213623047, "learning_rate": 1.4910264025844217e-07, "logits/chosen": -1.182641625404358, "logits/rejected": -1.108789086341858, "logps/chosen": -284.61248779296875, "logps/rejected": -325.0, "loss": 0.0575, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.067224144935608, "rewards/margins": 4.996484279632568, "rewards/rejected": -6.062695503234863, "step": 7960 }, { "epoch": 2.0095799325623167, "grad_norm": 9.839264869689941, "learning_rate": 1.4843199556973868e-07, "logits/chosen": -1.1890137195587158, "logits/rejected": -1.1378052234649658, "logps/chosen": -328.7875061035156, "logps/rejected": -331.54998779296875, "loss": 0.0505, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.9071899652481079, "rewards/margins": 5.5849609375, "rewards/rejected": -6.490624904632568, "step": 7970 }, { "epoch": 2.012100967447137, "grad_norm": 8.16525936126709, "learning_rate": 1.4776222513906216e-07, "logits/chosen": -1.2042968273162842, "logits/rejected": -1.1714050769805908, "logps/chosen": -302.328125, "logps/rejected": -322.51251220703125, "loss": 0.0756, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -1.3151428699493408, "rewards/margins": 5.09765625, "rewards/rejected": -6.412109375, "step": 7980 }, { "epoch": 2.0146220023319574, "grad_norm": 9.761605262756348, "learning_rate": 1.4709333473153717e-07, "logits/chosen": -1.192602515220642, "logits/rejected": -1.120324730873108, "logps/chosen": -299.92498779296875, "logps/rejected": -337.5249938964844, "loss": 0.0611, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.3048675060272217, "rewards/margins": 5.318945407867432, "rewards/rejected": -6.623339653015137, "step": 7990 }, { "epoch": 2.0171430372167776, "grad_norm": 5.7592291831970215, "learning_rate": 1.4642533010471304e-07, "logits/chosen": -1.21588134765625, "logits/rejected": -1.2119872570037842, "logps/chosen": -312.75, "logps/rejected": -350.7250061035156, "loss": 0.0502, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.2261474132537842, "rewards/margins": 5.638671875, "rewards/rejected": -6.861718654632568, "step": 8000 }, { "epoch": 2.0196640721015977, "grad_norm": 12.713602066040039, "learning_rate": 1.4575821700851485e-07, "logits/chosen": -1.238745093345642, "logits/rejected": -1.130151391029358, "logps/chosen": -294.09375, "logps/rejected": -326.15625, "loss": 0.0482, "rewards/accuracies": 0.984375, "rewards/chosen": -1.1590087413787842, "rewards/margins": 5.390429496765137, "rewards/rejected": -6.548632621765137, "step": 8010 }, { "epoch": 2.022185106986418, "grad_norm": 21.444751739501953, "learning_rate": 1.4509200118519347e-07, "logits/chosen": -1.2036864757537842, "logits/rejected": -1.1689269542694092, "logps/chosen": -325.88751220703125, "logps/rejected": -332.45623779296875, "loss": 0.0487, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.475341796875, "rewards/margins": 5.63671875, "rewards/rejected": -7.108202934265137, "step": 8020 }, { "epoch": 2.024706141871238, "grad_norm": 10.78364372253418, "learning_rate": 1.444266883692768e-07, "logits/chosen": -1.2017090320587158, "logits/rejected": -1.1442413330078125, "logps/chosen": -311.42498779296875, "logps/rejected": -320.73126220703125, "loss": 0.0449, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7278320789337158, "rewards/margins": 5.237890720367432, "rewards/rejected": -6.96484375, "step": 8030 }, { "epoch": 2.0272271767560586, "grad_norm": 19.230571746826172, "learning_rate": 1.4376228428751963e-07, "logits/chosen": -1.237158179283142, "logits/rejected": -1.1794312000274658, "logps/chosen": -297.51873779296875, "logps/rejected": -375.57501220703125, "loss": 0.0355, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.4023864269256592, "rewards/margins": 6.161718845367432, "rewards/rejected": -7.562890529632568, "step": 8040 }, { "epoch": 2.0297482116408787, "grad_norm": 11.802494049072266, "learning_rate": 1.4309879465885478e-07, "logits/chosen": -1.171606421470642, "logits/rejected": -1.0933105945587158, "logps/chosen": -288.84375, "logps/rejected": -332.0625, "loss": 0.0446, "rewards/accuracies": 0.984375, "rewards/chosen": -1.374725341796875, "rewards/margins": 5.962695121765137, "rewards/rejected": -7.337304592132568, "step": 8050 }, { "epoch": 2.032269246525699, "grad_norm": 27.448190689086914, "learning_rate": 1.4243622519434407e-07, "logits/chosen": -1.0920288562774658, "logits/rejected": -0.997802734375, "logps/chosen": -264.1312561035156, "logps/rejected": -327.0, "loss": 0.0643, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.884466528892517, "rewards/margins": 5.689257621765137, "rewards/rejected": -7.575390815734863, "step": 8060 }, { "epoch": 2.034790281410519, "grad_norm": 5.394522190093994, "learning_rate": 1.4177458159712863e-07, "logits/chosen": -1.201196312904358, "logits/rejected": -1.0519500970840454, "logps/chosen": -308.6625061035156, "logps/rejected": -336.5874938964844, "loss": 0.0501, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.566705346107483, "rewards/margins": 6.06640625, "rewards/rejected": -7.633593559265137, "step": 8070 }, { "epoch": 2.037311316295339, "grad_norm": 8.081001281738281, "learning_rate": 1.411138695623802e-07, "logits/chosen": -1.247949242591858, "logits/rejected": -1.1958434581756592, "logps/chosen": -293.25, "logps/rejected": -314.3125, "loss": 0.0424, "rewards/accuracies": 0.984375, "rewards/chosen": -1.3922271728515625, "rewards/margins": 5.664843559265137, "rewards/rejected": -7.056836128234863, "step": 8080 }, { "epoch": 2.0398323511801593, "grad_norm": 13.59231948852539, "learning_rate": 1.4045409477725185e-07, "logits/chosen": -1.230902075767517, "logits/rejected": -1.155676245689392, "logps/chosen": -293.3187561035156, "logps/rejected": -328.29376220703125, "loss": 0.0597, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -1.3070068359375, "rewards/margins": 5.885546684265137, "rewards/rejected": -7.192773342132568, "step": 8090 }, { "epoch": 2.04235338606498, "grad_norm": 13.345389366149902, "learning_rate": 1.3979526292082938e-07, "logits/chosen": -1.145593285560608, "logits/rejected": -1.1276366710662842, "logps/chosen": -320.34375, "logps/rejected": -358.61248779296875, "loss": 0.0536, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7216370105743408, "rewards/margins": 5.676660060882568, "rewards/rejected": -7.401953220367432, "step": 8100 }, { "epoch": 2.0448744209498, "grad_norm": 11.767427444458008, "learning_rate": 1.391373796640822e-07, "logits/chosen": -1.1553466320037842, "logits/rejected": -1.1388061046600342, "logps/chosen": -288.45001220703125, "logps/rejected": -320.2875061035156, "loss": 0.0539, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.22845458984375, "rewards/margins": 5.697070121765137, "rewards/rejected": -7.925000190734863, "step": 8110 }, { "epoch": 2.04739545583462, "grad_norm": 29.374244689941406, "learning_rate": 1.3848045066981433e-07, "logits/chosen": -1.238500952720642, "logits/rejected": -1.1590087413787842, "logps/chosen": -300.5687561035156, "logps/rejected": -326.875, "loss": 0.0465, "rewards/accuracies": 0.984375, "rewards/chosen": -2.066088914871216, "rewards/margins": 5.727734565734863, "rewards/rejected": -7.7958984375, "step": 8120 }, { "epoch": 2.0499164907194403, "grad_norm": 20.105798721313477, "learning_rate": 1.3782448159261617e-07, "logits/chosen": -1.1344482898712158, "logits/rejected": NaN, "logps/chosen": -291.51251220703125, "logps/rejected": -338.57501220703125, "loss": 0.0417, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.4490723609924316, "rewards/margins": 5.634961128234863, "rewards/rejected": -8.087109565734863, "step": 8130 }, { "epoch": 2.0524375256042604, "grad_norm": 8.567663192749023, "learning_rate": 1.3716947807881524e-07, "logits/chosen": -1.209387183189392, "logits/rejected": -1.16644287109375, "logps/chosen": -311.2875061035156, "logps/rejected": -332.91876220703125, "loss": 0.0613, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.869696021080017, "rewards/margins": 5.961523532867432, "rewards/rejected": -7.832421779632568, "step": 8140 }, { "epoch": 2.054958560489081, "grad_norm": 15.12371826171875, "learning_rate": 1.3651544576642808e-07, "logits/chosen": -1.185278296470642, "logits/rejected": -1.1482665538787842, "logps/chosen": -278.17498779296875, "logps/rejected": -326.45001220703125, "loss": 0.0555, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1180450916290283, "rewards/margins": 6.062109470367432, "rewards/rejected": -8.181055068969727, "step": 8150 }, { "epoch": 2.057479595373901, "grad_norm": 27.506376266479492, "learning_rate": 1.358623902851112e-07, "logits/chosen": -1.194268822669983, "logits/rejected": -1.099145531654358, "logps/chosen": -290.54376220703125, "logps/rejected": -349.3374938964844, "loss": 0.0596, "rewards/accuracies": 0.96875, "rewards/chosen": -1.6643279790878296, "rewards/margins": 5.855859279632568, "rewards/rejected": -7.521484375, "step": 8160 }, { "epoch": 2.0600006302587213, "grad_norm": 5.201971054077148, "learning_rate": 1.3521031725611342e-07, "logits/chosen": -1.2114379405975342, "logits/rejected": -1.170434594154358, "logps/chosen": -308.4125061035156, "logps/rejected": -331.4937438964844, "loss": 0.0449, "rewards/accuracies": 0.984375, "rewards/chosen": -1.5582275390625, "rewards/margins": 5.56640625, "rewards/rejected": -7.1220703125, "step": 8170 }, { "epoch": 2.0625216651435414, "grad_norm": 11.198143005371094, "learning_rate": 1.345592322922266e-07, "logits/chosen": -1.27203369140625, "logits/rejected": -1.1819946765899658, "logps/chosen": -308.375, "logps/rejected": -337.82501220703125, "loss": 0.048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.663183569908142, "rewards/margins": 5.663866996765137, "rewards/rejected": -7.327538967132568, "step": 8180 }, { "epoch": 2.0650427000283615, "grad_norm": 12.780969619750977, "learning_rate": 1.3390914099773773e-07, "logits/chosen": NaN, "logits/rejected": -1.1007812023162842, "logps/chosen": -309.8374938964844, "logps/rejected": -351.0625, "loss": 0.0436, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.851709008216858, "rewards/margins": 5.76953125, "rewards/rejected": -7.619921684265137, "step": 8190 }, { "epoch": 2.0675637349131817, "grad_norm": 13.616006851196289, "learning_rate": 1.3326004896838096e-07, "logits/chosen": NaN, "logits/rejected": -1.057580590248108, "logps/chosen": -286.6187438964844, "logps/rejected": -313.61248779296875, "loss": 0.0424, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9296753406524658, "rewards/margins": 5.961133003234863, "rewards/rejected": -7.889843940734863, "step": 8200 }, { "epoch": 2.0700847697980023, "grad_norm": 28.805870056152344, "learning_rate": 1.3261196179128885e-07, "logits/chosen": -1.1484496593475342, "logits/rejected": -1.0574462413787842, "logps/chosen": -311.23748779296875, "logps/rejected": -352.04998779296875, "loss": 0.0652, "rewards/accuracies": 0.96875, "rewards/chosen": -2.540820360183716, "rewards/margins": 5.760937690734863, "rewards/rejected": -8.300585746765137, "step": 8210 }, { "epoch": 2.0726058046828224, "grad_norm": 48.506614685058594, "learning_rate": 1.3196488504494477e-07, "logits/chosen": -1.084375023841858, "logits/rejected": -1.068701148033142, "logps/chosen": -315.0375061035156, "logps/rejected": -340.38751220703125, "loss": 0.0941, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.3023314476013184, "rewards/margins": 6.016797065734863, "rewards/rejected": -8.318163871765137, "step": 8220 }, { "epoch": 2.0751268395676425, "grad_norm": 7.5023956298828125, "learning_rate": 1.3131882429913449e-07, "logits/chosen": -1.206689476966858, "logits/rejected": -1.1428344249725342, "logps/chosen": -319.7875061035156, "logps/rejected": -331.70001220703125, "loss": 0.0573, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.846343994140625, "rewards/margins": 5.7177734375, "rewards/rejected": -7.5654296875, "step": 8230 }, { "epoch": 2.0776478744524627, "grad_norm": 11.308616638183594, "learning_rate": 1.3067378511489865e-07, "logits/chosen": -1.1685912609100342, "logits/rejected": -1.1099151372909546, "logps/chosen": -276.96875, "logps/rejected": -319.5562438964844, "loss": 0.0568, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.963647484779358, "rewards/margins": 5.4619140625, "rewards/rejected": -7.428124904632568, "step": 8240 }, { "epoch": 2.080168909337283, "grad_norm": 25.78227424621582, "learning_rate": 1.3002977304448477e-07, "logits/chosen": -1.1837341785430908, "logits/rejected": -1.12335205078125, "logps/chosen": -296.671875, "logps/rejected": -338.3999938964844, "loss": 0.0544, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9570465087890625, "rewards/margins": 5.804491996765137, "rewards/rejected": -7.7626953125, "step": 8250 }, { "epoch": 2.082689944222103, "grad_norm": 8.633604049682617, "learning_rate": 1.2938679363129896e-07, "logits/chosen": -1.0977294445037842, "logits/rejected": -1.002557396888733, "logps/chosen": -279.4750061035156, "logps/rejected": -314.23748779296875, "loss": 0.049, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.847680687904358, "rewards/margins": 5.765234470367432, "rewards/rejected": -7.612109184265137, "step": 8260 }, { "epoch": 2.0852109791069235, "grad_norm": 17.364688873291016, "learning_rate": 1.287448524098591e-07, "logits/chosen": -1.2186279296875, "logits/rejected": -1.14141845703125, "logps/chosen": -299.4437561035156, "logps/rejected": -332.3812561035156, "loss": 0.093, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0953369140625, "rewards/margins": 5.600976467132568, "rewards/rejected": -7.696484565734863, "step": 8270 }, { "epoch": 2.0877320139917437, "grad_norm": 13.196175575256348, "learning_rate": 1.2810395490574637e-07, "logits/chosen": -1.1842772960662842, "logits/rejected": -1.0907989740371704, "logps/chosen": -306.3687438964844, "logps/rejected": -344.0874938964844, "loss": 0.0624, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.4218124151229858, "rewards/margins": 6.040234565734863, "rewards/rejected": -7.460741996765137, "step": 8280 }, { "epoch": 2.090253048876564, "grad_norm": 15.400144577026367, "learning_rate": 1.2746410663555817e-07, "logits/chosen": -1.2322509288787842, "logits/rejected": -1.1100952625274658, "logps/chosen": -301.8062438964844, "logps/rejected": -318.73126220703125, "loss": 0.045, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8669312000274658, "rewards/margins": 5.905077934265137, "rewards/rejected": -7.7734375, "step": 8290 }, { "epoch": 2.092774083761384, "grad_norm": 23.49390983581543, "learning_rate": 1.268253131068604e-07, "logits/chosen": -1.2156860828399658, "logits/rejected": -1.058203101158142, "logps/chosen": -288.01251220703125, "logps/rejected": -310.04998779296875, "loss": 0.0711, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.6891295909881592, "rewards/margins": 6.189257621765137, "rewards/rejected": -7.877148628234863, "step": 8300 }, { "epoch": 2.095295118646204, "grad_norm": 8.779989242553711, "learning_rate": 1.261875798181404e-07, "logits/chosen": -1.279028296470642, "logits/rejected": -1.2348754405975342, "logps/chosen": -328.38751220703125, "logps/rejected": -337.6625061035156, "loss": 0.0579, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.896887183189392, "rewards/margins": 5.675585746765137, "rewards/rejected": -7.574023246765137, "step": 8310 }, { "epoch": 2.0978161535310247, "grad_norm": 17.310787200927734, "learning_rate": 1.2555091225875912e-07, "logits/chosen": -1.1862914562225342, "logits/rejected": -1.0946991443634033, "logps/chosen": -306.98126220703125, "logps/rejected": -322.125, "loss": 0.0748, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.109936475753784, "rewards/margins": 5.740185737609863, "rewards/rejected": -7.849804878234863, "step": 8320 }, { "epoch": 2.100337188415845, "grad_norm": 4.619143009185791, "learning_rate": 1.2491531590890413e-07, "logits/chosen": -1.2049438953399658, "logits/rejected": -1.130090355873108, "logps/chosen": -333.53125, "logps/rejected": -344.13751220703125, "loss": 0.0397, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1664185523986816, "rewards/margins": 6.011914253234863, "rewards/rejected": -8.179491996765137, "step": 8330 }, { "epoch": 2.102858223300665, "grad_norm": 29.662790298461914, "learning_rate": 1.2428079623954274e-07, "logits/chosen": -1.118188500404358, "logits/rejected": -1.0854980945587158, "logps/chosen": -296.1187438964844, "logps/rejected": -332.85626220703125, "loss": 0.07, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3039183616638184, "rewards/margins": 5.928320407867432, "rewards/rejected": -8.230273246765137, "step": 8340 }, { "epoch": 2.105379258185485, "grad_norm": 37.8450813293457, "learning_rate": 1.236473587123743e-07, "logits/chosen": -1.144342064857483, "logits/rejected": -1.068994164466858, "logps/chosen": -281.9375, "logps/rejected": -317.2437438964844, "loss": 0.0764, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2720947265625, "rewards/margins": 5.494360446929932, "rewards/rejected": -7.770312309265137, "step": 8350 }, { "epoch": 2.1079002930703052, "grad_norm": 9.65998649597168, "learning_rate": 1.2301500877978353e-07, "logits/chosen": -1.212011694908142, "logits/rejected": -1.070886254310608, "logps/chosen": -305.9437561035156, "logps/rejected": -311.41876220703125, "loss": 0.0541, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.6458313465118408, "rewards/margins": 5.796875, "rewards/rejected": -7.442773342132568, "step": 8360 }, { "epoch": 2.1104213279551254, "grad_norm": 25.525508880615234, "learning_rate": 1.2238375188479374e-07, "logits/chosen": -1.160986304283142, "logits/rejected": -1.1099731922149658, "logps/chosen": -290.9624938964844, "logps/rejected": -352.48126220703125, "loss": 0.0488, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.065624952316284, "rewards/margins": 6.021288871765137, "rewards/rejected": -8.088086128234863, "step": 8370 }, { "epoch": 2.112942362839946, "grad_norm": 22.408308029174805, "learning_rate": 1.217535934610196e-07, "logits/chosen": -1.1594116687774658, "logits/rejected": -1.130712866783142, "logps/chosen": -333.45623779296875, "logps/rejected": -357.7875061035156, "loss": 0.0449, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.0124573707580566, "rewards/margins": 6.266211032867432, "rewards/rejected": -8.2763671875, "step": 8380 }, { "epoch": 2.115463397724766, "grad_norm": 26.604774475097656, "learning_rate": 1.2112453893262077e-07, "logits/chosen": -1.25897216796875, "logits/rejected": -1.1997497081756592, "logps/chosen": -333.39373779296875, "logps/rejected": -361.04376220703125, "loss": 0.0576, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.855340600013733, "rewards/margins": 5.884961128234863, "rewards/rejected": -7.741796970367432, "step": 8390 }, { "epoch": 2.1179844326095862, "grad_norm": 23.80531883239746, "learning_rate": 1.204965937142548e-07, "logits/chosen": NaN, "logits/rejected": -1.179907202720642, "logps/chosen": -315.93438720703125, "logps/rejected": -343.5062561035156, "loss": 0.0513, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.9385802745819092, "rewards/margins": 6.142968654632568, "rewards/rejected": -8.082616806030273, "step": 8400 }, { "epoch": 2.1205054674944064, "grad_norm": 9.067641258239746, "learning_rate": 1.1986976321103073e-07, "logits/chosen": -1.172882080078125, "logits/rejected": -1.0681641101837158, "logps/chosen": -302.9906311035156, "logps/rejected": -322.8125, "loss": 0.051, "rewards/accuracies": 0.96875, "rewards/chosen": -2.067333936691284, "rewards/margins": 6.111718654632568, "rewards/rejected": -8.177148818969727, "step": 8410 }, { "epoch": 2.1230265023792265, "grad_norm": 21.33924674987793, "learning_rate": 1.1924405281846285e-07, "logits/chosen": NaN, "logits/rejected": -1.105712890625, "logps/chosen": -307.8999938964844, "logps/rejected": -346.7437438964844, "loss": 0.0619, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.160839796066284, "rewards/margins": 6.102734565734863, "rewards/rejected": -8.26214599609375, "step": 8420 }, { "epoch": 2.1255475372640467, "grad_norm": 7.438653469085693, "learning_rate": 1.1861946792242372e-07, "logits/chosen": -1.1189727783203125, "logits/rejected": -1.0370910167694092, "logps/chosen": -291.64373779296875, "logps/rejected": -342.0625, "loss": 0.0616, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.2601685523986816, "rewards/margins": 5.860742092132568, "rewards/rejected": -8.1201171875, "step": 8430 }, { "epoch": 2.1280685721488672, "grad_norm": 24.812488555908203, "learning_rate": 1.1799601389909795e-07, "logits/chosen": -1.121026635169983, "logits/rejected": -1.0014861822128296, "logps/chosen": -296.25311279296875, "logps/rejected": -330.8687438964844, "loss": 0.072, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4051146507263184, "rewards/margins": 5.999413967132568, "rewards/rejected": -8.405077934265137, "step": 8440 }, { "epoch": 2.1305896070336874, "grad_norm": 27.550500869750977, "learning_rate": 1.1737369611493639e-07, "logits/chosen": -1.1107299327850342, "logits/rejected": -1.031671166419983, "logps/chosen": -306.2875061035156, "logps/rejected": -353.4937438964844, "loss": 0.0526, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.4344239234924316, "rewards/margins": 6.117480278015137, "rewards/rejected": -8.553319931030273, "step": 8450 }, { "epoch": 2.1331106419185075, "grad_norm": 24.330074310302734, "learning_rate": 1.1675251992660931e-07, "logits/chosen": -1.1606566905975342, "logits/rejected": -1.1696898937225342, "logps/chosen": -311.9375, "logps/rejected": -375.23748779296875, "loss": 0.0565, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.465075731277466, "rewards/margins": 6.476953029632568, "rewards/rejected": -8.94140625, "step": 8460 }, { "epoch": 2.1356316768033277, "grad_norm": 10.037461280822754, "learning_rate": 1.161324906809607e-07, "logits/chosen": -1.108489990234375, "logits/rejected": -1.1308135986328125, "logps/chosen": -320.35626220703125, "logps/rejected": -356.875, "loss": 0.0472, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.190533399581909, "rewards/margins": 6.083398342132568, "rewards/rejected": -8.2724609375, "step": 8470 }, { "epoch": 2.138152711688148, "grad_norm": 16.36578369140625, "learning_rate": 1.155136137149619e-07, "logits/chosen": -1.19464111328125, "logits/rejected": -1.132653832435608, "logps/chosen": -324.11248779296875, "logps/rejected": -370.5249938964844, "loss": 0.0717, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.456561326980591, "rewards/margins": 5.934960842132568, "rewards/rejected": -8.390233993530273, "step": 8480 }, { "epoch": 2.1406737465729684, "grad_norm": 42.44452667236328, "learning_rate": 1.1489589435566627e-07, "logits/chosen": -1.1802489757537842, "logits/rejected": -1.091644287109375, "logps/chosen": -299.3687438964844, "logps/rejected": -351.01251220703125, "loss": 0.0742, "rewards/accuracies": 0.96875, "rewards/chosen": -2.403979539871216, "rewards/margins": 5.965014457702637, "rewards/rejected": -8.366991996765137, "step": 8490 }, { "epoch": 2.1431947814577885, "grad_norm": 4.1698479652404785, "learning_rate": 1.1427933792016248e-07, "logits/chosen": -1.2140624523162842, "logits/rejected": -1.104516625404358, "logps/chosen": -286.72186279296875, "logps/rejected": -343.64373779296875, "loss": 0.0438, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.132031202316284, "rewards/margins": 5.950585842132568, "rewards/rejected": -8.080469131469727, "step": 8500 }, { "epoch": 2.1457158163426087, "grad_norm": 20.45341682434082, "learning_rate": 1.1366394971552962e-07, "logits/chosen": -1.1300170421600342, "logits/rejected": -1.051171898841858, "logps/chosen": -322.2875061035156, "logps/rejected": -381.2749938964844, "loss": 0.0657, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.202221632003784, "rewards/margins": 6.493359565734863, "rewards/rejected": -8.6953125, "step": 8510 }, { "epoch": 2.148236851227429, "grad_norm": 6.115811824798584, "learning_rate": 1.1304973503879076e-07, "logits/chosen": -1.1869628429412842, "logits/rejected": -1.080224633216858, "logps/chosen": -311.23126220703125, "logps/rejected": -320.11248779296875, "loss": 0.0452, "rewards/accuracies": 0.984375, "rewards/chosen": -2.466625928878784, "rewards/margins": 6.030859470367432, "rewards/rejected": -8.498046875, "step": 8520 }, { "epoch": 2.150757886112249, "grad_norm": 21.175352096557617, "learning_rate": 1.1243669917686797e-07, "logits/chosen": -1.100653052330017, "logits/rejected": -1.1054198741912842, "logps/chosen": -298.17498779296875, "logps/rejected": -365.3999938964844, "loss": 0.0432, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.580127000808716, "rewards/margins": 6.098046779632568, "rewards/rejected": -8.6796875, "step": 8530 }, { "epoch": 2.153278920997069, "grad_norm": 38.15406036376953, "learning_rate": 1.1182484740653636e-07, "logits/chosen": -1.07086181640625, "logits/rejected": -1.073522925376892, "logps/chosen": -314.73748779296875, "logps/rejected": -354.2250061035156, "loss": 0.0448, "rewards/accuracies": 0.984375, "rewards/chosen": -2.737866163253784, "rewards/margins": 6.369531154632568, "rewards/rejected": -9.104296684265137, "step": 8540 }, { "epoch": 2.1557999558818897, "grad_norm": 11.452657699584961, "learning_rate": 1.1121418499437881e-07, "logits/chosen": -1.150183081626892, "logits/rejected": -0.9835265874862671, "logps/chosen": -328.6875, "logps/rejected": -351.6499938964844, "loss": 0.047, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.7571043968200684, "rewards/margins": 5.888574123382568, "rewards/rejected": -8.6484375, "step": 8550 }, { "epoch": 2.15832099076671, "grad_norm": 9.922450065612793, "learning_rate": 1.1060471719674092e-07, "logits/chosen": NaN, "logits/rejected": -1.154449462890625, "logps/chosen": -308.09375, "logps/rejected": -336.16876220703125, "loss": 0.0526, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.2076447010040283, "rewards/margins": 6.534960746765137, "rewards/rejected": -8.737890243530273, "step": 8560 }, { "epoch": 2.16084202565153, "grad_norm": 13.323744773864746, "learning_rate": 1.099964492596852e-07, "logits/chosen": -1.1337707042694092, "logits/rejected": -1.10675048828125, "logps/chosen": -305.2875061035156, "logps/rejected": -357.8500061035156, "loss": 0.0541, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.3842225074768066, "rewards/margins": 6.028515815734863, "rewards/rejected": -8.414843559265137, "step": 8570 }, { "epoch": 2.16336306053635, "grad_norm": 15.773117065429688, "learning_rate": 1.0938938641894635e-07, "logits/chosen": -1.1461181640625, "logits/rejected": -1.1283690929412842, "logps/chosen": -294.8500061035156, "logps/rejected": -343.375, "loss": 0.0526, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.3901610374450684, "rewards/margins": 6.066601753234863, "rewards/rejected": -8.456640243530273, "step": 8580 }, { "epoch": 2.16588409542117, "grad_norm": 15.18516731262207, "learning_rate": 1.087835338998862e-07, "logits/chosen": -1.091589331626892, "logits/rejected": -1.0149962902069092, "logps/chosen": -325.7124938964844, "logps/rejected": -351.4125061035156, "loss": 0.0488, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1823487281799316, "rewards/margins": 6.094336032867432, "rewards/rejected": -8.279687881469727, "step": 8590 }, { "epoch": 2.168405130305991, "grad_norm": 19.302209854125977, "learning_rate": 1.0817889691744844e-07, "logits/chosen": -1.1664917469024658, "logits/rejected": -1.096594214439392, "logps/chosen": -323.1812438964844, "logps/rejected": -364.1625061035156, "loss": 0.0395, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.308727979660034, "rewards/margins": 6.358984470367432, "rewards/rejected": -8.664843559265137, "step": 8600 }, { "epoch": 2.170926165190811, "grad_norm": 22.567893981933594, "learning_rate": 1.0757548067611388e-07, "logits/chosen": -1.056249976158142, "logits/rejected": -0.7930053472518921, "logps/chosen": -323.54376220703125, "logps/rejected": -344.2250061035156, "loss": 0.0473, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.4972596168518066, "rewards/margins": 6.3125, "rewards/rejected": -8.811132431030273, "step": 8610 }, { "epoch": 2.173447200075631, "grad_norm": 14.041077613830566, "learning_rate": 1.0697329036985567e-07, "logits/chosen": -1.1444275379180908, "logits/rejected": -1.07598876953125, "logps/chosen": -322.2250061035156, "logps/rejected": -333.0625, "loss": 0.0687, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.640765428543091, "rewards/margins": 6.111132621765137, "rewards/rejected": -8.752344131469727, "step": 8620 }, { "epoch": 2.175968234960451, "grad_norm": 39.61917495727539, "learning_rate": 1.0637233118209482e-07, "logits/chosen": -1.09375, "logits/rejected": -1.0608398914337158, "logps/chosen": -301.45623779296875, "logps/rejected": -344.54998779296875, "loss": 0.0694, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.80816650390625, "rewards/margins": 6.245312690734863, "rewards/rejected": -9.0537109375, "step": 8630 }, { "epoch": 2.1784892698452714, "grad_norm": 30.946664810180664, "learning_rate": 1.0577260828565492e-07, "logits/chosen": -1.172705054283142, "logits/rejected": -1.0229613780975342, "logps/chosen": -320.51251220703125, "logps/rejected": -333.6625061035156, "loss": 0.0478, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.435986280441284, "rewards/margins": 6.369726657867432, "rewards/rejected": -8.804296493530273, "step": 8640 }, { "epoch": 2.1810103047300915, "grad_norm": 14.99609375, "learning_rate": 1.0517412684271856e-07, "logits/chosen": -1.214208960533142, "logits/rejected": -1.089514136314392, "logps/chosen": -319.4156188964844, "logps/rejected": -360.3374938964844, "loss": 0.0468, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.698712110519409, "rewards/margins": 6.314257621765137, "rewards/rejected": -9.019726753234863, "step": 8650 }, { "epoch": 2.183531339614912, "grad_norm": 73.42444610595703, "learning_rate": 1.0457689200478185e-07, "logits/chosen": -1.110388159751892, "logits/rejected": -1.174536108970642, "logps/chosen": -321.03125, "logps/rejected": -346.79998779296875, "loss": 0.0627, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.540576219558716, "rewards/margins": 6.526757717132568, "rewards/rejected": -9.067968368530273, "step": 8660 }, { "epoch": 2.186052374499732, "grad_norm": 25.020496368408203, "learning_rate": 1.0398090891261105e-07, "logits/chosen": -1.173162817955017, "logits/rejected": -1.091455101966858, "logps/chosen": -289.6812438964844, "logps/rejected": -344.6625061035156, "loss": 0.0694, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8015990257263184, "rewards/margins": 5.858593940734863, "rewards/rejected": -8.662500381469727, "step": 8670 }, { "epoch": 2.1885734093845524, "grad_norm": 14.680599212646484, "learning_rate": 1.0338618269619762e-07, "logits/chosen": -1.21966552734375, "logits/rejected": -1.0860412120819092, "logps/chosen": -307.0562438964844, "logps/rejected": -379.5249938964844, "loss": 0.0474, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.5776000022888184, "rewards/margins": 6.258008003234863, "rewards/rejected": -8.833984375, "step": 8680 }, { "epoch": 2.1910944442693725, "grad_norm": 18.51276397705078, "learning_rate": 1.0279271847471426e-07, "logits/chosen": -1.2114989757537842, "logits/rejected": -1.11944580078125, "logps/chosen": -324.66876220703125, "logps/rejected": -363.8500061035156, "loss": 0.0456, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.37677001953125, "rewards/margins": 6.630468845367432, "rewards/rejected": -10.005468368530273, "step": 8690 }, { "epoch": 2.1936154791541926, "grad_norm": 16.530040740966797, "learning_rate": 1.0220052135647129e-07, "logits/chosen": -1.1586456298828125, "logits/rejected": -1.078637719154358, "logps/chosen": -328.6187438964844, "logps/rejected": -369.6875, "loss": 0.0659, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0513062477111816, "rewards/margins": 6.638867378234863, "rewards/rejected": -9.693359375, "step": 8700 }, { "epoch": 2.196136514039013, "grad_norm": 17.24599266052246, "learning_rate": 1.0160959643887187e-07, "logits/chosen": -1.0912902355194092, "logits/rejected": -1.132360816001892, "logps/chosen": -302.2562561035156, "logps/rejected": -347.42498779296875, "loss": 0.0518, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.469165086746216, "rewards/margins": 6.382616996765137, "rewards/rejected": -9.849609375, "step": 8710 }, { "epoch": 2.1986575489238334, "grad_norm": 53.602500915527344, "learning_rate": 1.010199488083687e-07, "logits/chosen": -1.150793433189392, "logits/rejected": -1.121130347251892, "logps/chosen": -326.2250061035156, "logps/rejected": -357.0625, "loss": 0.0602, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.182080030441284, "rewards/margins": 6.177929878234863, "rewards/rejected": -9.355859756469727, "step": 8720 }, { "epoch": 2.2011785838086535, "grad_norm": 8.69321060180664, "learning_rate": 1.0043158354042027e-07, "logits/chosen": -1.0912597179412842, "logits/rejected": -0.937573254108429, "logps/chosen": -317.20623779296875, "logps/rejected": -353.625, "loss": 0.039, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.010894775390625, "rewards/margins": 6.508398532867432, "rewards/rejected": -9.523828506469727, "step": 8730 }, { "epoch": 2.2036996186934736, "grad_norm": 11.698653221130371, "learning_rate": 9.984450569944672e-08, "logits/chosen": -1.1413695812225342, "logits/rejected": -1.0407836437225342, "logps/chosen": -299.03125, "logps/rejected": -330.4624938964844, "loss": 0.0259, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -2.7392945289611816, "rewards/margins": 6.866796970367432, "rewards/rejected": -9.6015625, "step": 8740 }, { "epoch": 2.2062206535782938, "grad_norm": 26.454465866088867, "learning_rate": 9.925872033878662e-08, "logits/chosen": -1.194921851158142, "logits/rejected": -1.0609207153320312, "logps/chosen": -284.7124938964844, "logps/rejected": -334.17498779296875, "loss": 0.0482, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0052733421325684, "rewards/margins": 6.383496284484863, "rewards/rejected": -9.394922256469727, "step": 8750 }, { "epoch": 2.208741688463114, "grad_norm": 12.642827033996582, "learning_rate": 9.867423250065332e-08, "logits/chosen": -1.101049780845642, "logits/rejected": -1.042272925376892, "logps/chosen": -316.96875, "logps/rejected": -340.95001220703125, "loss": 0.0461, "rewards/accuracies": 0.984375, "rewards/chosen": -3.052014112472534, "rewards/margins": 6.135644435882568, "rewards/rejected": -9.191015243530273, "step": 8760 }, { "epoch": 2.2112627233479345, "grad_norm": 12.33161735534668, "learning_rate": 9.809104721609182e-08, "logits/chosen": -1.243432641029358, "logits/rejected": -1.089147925376892, "logps/chosen": -296.84375, "logps/rejected": -350.13751220703125, "loss": 0.0547, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.00762939453125, "rewards/margins": 6.277734279632568, "rewards/rejected": -9.284375190734863, "step": 8770 }, { "epoch": 2.2137837582327546, "grad_norm": 17.560251235961914, "learning_rate": 9.75091695049349e-08, "logits/chosen": -1.136560082435608, "logits/rejected": -1.00762939453125, "logps/chosen": -302.1312561035156, "logps/rejected": -349.82501220703125, "loss": 0.0404, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.611377000808716, "rewards/margins": 6.48828125, "rewards/rejected": -9.103124618530273, "step": 8780 }, { "epoch": 2.216304793117575, "grad_norm": 43.00522232055664, "learning_rate": 9.692860437576061e-08, "logits/chosen": -1.1465332508087158, "logits/rejected": -1.110925316810608, "logps/chosen": -297.03436279296875, "logps/rejected": -354.36248779296875, "loss": 0.0768, "rewards/accuracies": 0.96875, "rewards/chosen": -2.4622559547424316, "rewards/margins": 6.297656059265137, "rewards/rejected": -8.760156631469727, "step": 8790 }, { "epoch": 2.218825828002395, "grad_norm": 29.944978713989258, "learning_rate": 9.634935682584846e-08, "logits/chosen": -1.186193823814392, "logits/rejected": -1.1214599609375, "logps/chosen": -312.85626220703125, "logps/rejected": -360.07501220703125, "loss": 0.059, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.4377198219299316, "rewards/margins": 6.615038871765137, "rewards/rejected": -9.050390243530273, "step": 8800 }, { "epoch": 2.221346862887215, "grad_norm": 9.026261329650879, "learning_rate": 9.577143184113711e-08, "logits/chosen": -1.2394530773162842, "logits/rejected": -0.97979736328125, "logps/chosen": -335.4156188964844, "logps/rejected": -352.23748779296875, "loss": 0.0358, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.465161085128784, "rewards/margins": 6.535546779632568, "rewards/rejected": -9.005468368530273, "step": 8810 }, { "epoch": 2.2238678977720356, "grad_norm": 7.27154016494751, "learning_rate": 9.519483439618075e-08, "logits/chosen": -1.1887328624725342, "logits/rejected": -1.03375244140625, "logps/chosen": -336.2124938964844, "logps/rejected": -355.7875061035156, "loss": 0.0385, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.6427979469299316, "rewards/margins": 6.421679496765137, "rewards/rejected": -9.065234184265137, "step": 8820 }, { "epoch": 2.226388932656856, "grad_norm": 25.24820899963379, "learning_rate": 9.461956945410676e-08, "logits/chosen": -1.2418029308319092, "logits/rejected": -1.0834472179412842, "logps/chosen": -315.5249938964844, "logps/rejected": -335.1499938964844, "loss": 0.0358, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6830687522888184, "rewards/margins": 6.211133003234863, "rewards/rejected": -8.895312309265137, "step": 8830 }, { "epoch": 2.228909967541676, "grad_norm": 7.522816181182861, "learning_rate": 9.404564196657298e-08, "logits/chosen": -1.1717712879180908, "logits/rejected": -1.060333251953125, "logps/chosen": -331.5874938964844, "logps/rejected": -346.82501220703125, "loss": 0.0516, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8359375, "rewards/margins": 6.302929878234863, "rewards/rejected": -9.135546684265137, "step": 8840 }, { "epoch": 2.231431002426496, "grad_norm": 34.26163864135742, "learning_rate": 9.347305687372475e-08, "logits/chosen": -1.2626953125, "logits/rejected": -1.1672241687774658, "logps/chosen": -321.9312438964844, "logps/rejected": -375.6875, "loss": 0.0754, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.4271607398986816, "rewards/margins": 6.099413871765137, "rewards/rejected": -8.525781631469727, "step": 8850 }, { "epoch": 2.233952037311316, "grad_norm": 31.264087677001953, "learning_rate": 9.290181910415263e-08, "logits/chosen": -1.2138671875, "logits/rejected": -1.154516577720642, "logps/chosen": -339.1875, "logps/rejected": -354.82501220703125, "loss": 0.0674, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.8809571266174316, "rewards/margins": 5.992383003234863, "rewards/rejected": -8.870702743530273, "step": 8860 }, { "epoch": 2.2364730721961363, "grad_norm": 5.758995532989502, "learning_rate": 9.233193357485014e-08, "logits/chosen": -1.1931030750274658, "logits/rejected": -1.066613793373108, "logps/chosen": -323.17498779296875, "logps/rejected": -354.875, "loss": 0.0329, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.0781617164611816, "rewards/margins": 6.630859375, "rewards/rejected": -9.706640243530273, "step": 8870 }, { "epoch": 2.238994107080957, "grad_norm": 18.613447189331055, "learning_rate": 9.176340519117106e-08, "logits/chosen": -1.114111304283142, "logits/rejected": -1.09332275390625, "logps/chosen": -319.75, "logps/rejected": -359.40625, "loss": 0.0513, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.808239698410034, "rewards/margins": 6.661913871765137, "rewards/rejected": -9.473047256469727, "step": 8880 }, { "epoch": 2.241515141965777, "grad_norm": 27.966007232666016, "learning_rate": 9.11962388467874e-08, "logits/chosen": NaN, "logits/rejected": -1.0225493907928467, "logps/chosen": -306.5375061035156, "logps/rejected": -362.3125, "loss": 0.0408, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0164794921875, "rewards/margins": 6.390038967132568, "rewards/rejected": -9.404687881469727, "step": 8890 }, { "epoch": 2.244036176850597, "grad_norm": 11.037649154663086, "learning_rate": 9.063043942364717e-08, "logits/chosen": -1.1480591297149658, "logits/rejected": -1.0681030750274658, "logps/chosen": -308.35626220703125, "logps/rejected": -357.2562561035156, "loss": 0.0474, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8564209938049316, "rewards/margins": 6.332812309265137, "rewards/rejected": -9.19140625, "step": 8900 }, { "epoch": 2.2465572117354173, "grad_norm": 61.01218032836914, "learning_rate": 9.006601179193283e-08, "logits/chosen": -1.119531273841858, "logits/rejected": -0.996899425983429, "logps/chosen": -311.484375, "logps/rejected": -325.38751220703125, "loss": 0.0653, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.7358460426330566, "rewards/margins": 6.0791015625, "rewards/rejected": -8.814844131469727, "step": 8910 }, { "epoch": 2.2490782466202375, "grad_norm": 21.154726028442383, "learning_rate": 8.950296081001846e-08, "logits/chosen": -1.11468505859375, "logits/rejected": -1.112548828125, "logps/chosen": -331.95623779296875, "logps/rejected": -356.8125, "loss": 0.0519, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.967517137527466, "rewards/margins": 6.508008003234863, "rewards/rejected": -9.478124618530273, "step": 8920 }, { "epoch": 2.251599281505058, "grad_norm": 13.69310474395752, "learning_rate": 8.894129132442898e-08, "logits/chosen": -1.1307251453399658, "logits/rejected": -1.028753638267517, "logps/chosen": -309.26873779296875, "logps/rejected": -339.32501220703125, "loss": 0.0515, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8682618141174316, "rewards/margins": 6.331250190734863, "rewards/rejected": -9.195703506469727, "step": 8930 }, { "epoch": 2.254120316389878, "grad_norm": 9.159889221191406, "learning_rate": 8.838100816979751e-08, "logits/chosen": -1.133874535560608, "logits/rejected": -0.9897094964981079, "logps/chosen": -309.54376220703125, "logps/rejected": -360.1937561035156, "loss": 0.046, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.882641553878784, "rewards/margins": 6.360547065734863, "rewards/rejected": -9.245312690734863, "step": 8940 }, { "epoch": 2.2566413512746983, "grad_norm": 8.470044136047363, "learning_rate": 8.782211616882451e-08, "logits/chosen": -1.19439697265625, "logits/rejected": -1.115759253501892, "logps/chosen": -317.1499938964844, "logps/rejected": -356.66876220703125, "loss": 0.0725, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.3541502952575684, "rewards/margins": 6.437109470367432, "rewards/rejected": -8.792577743530273, "step": 8950 }, { "epoch": 2.2591623861595185, "grad_norm": 29.923450469970703, "learning_rate": 8.726462013223568e-08, "logits/chosen": -1.2116210460662842, "logits/rejected": -1.075598120689392, "logps/chosen": -320.91876220703125, "logps/rejected": -351.6000061035156, "loss": 0.1267, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.960766553878784, "rewards/margins": 6.324804782867432, "rewards/rejected": -9.283203125, "step": 8960 }, { "epoch": 2.2616834210443386, "grad_norm": 20.205322265625, "learning_rate": 8.67085248587408e-08, "logits/chosen": -1.205041527748108, "logits/rejected": -1.1007080078125, "logps/chosen": -360.4312438964844, "logps/rejected": -363.23748779296875, "loss": 0.041, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.801135301589966, "rewards/margins": 6.781445503234863, "rewards/rejected": -9.582616806030273, "step": 8970 }, { "epoch": 2.2642044559291588, "grad_norm": 39.44192886352539, "learning_rate": 8.615383513499271e-08, "logits/chosen": -1.1978759765625, "logits/rejected": -1.099829077720642, "logps/chosen": -332.51873779296875, "logps/rejected": -364.6000061035156, "loss": 0.07, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9340453147888184, "rewards/margins": 6.63330078125, "rewards/rejected": -9.565234184265137, "step": 8980 }, { "epoch": 2.266725490813979, "grad_norm": 27.780113220214844, "learning_rate": 8.56005557355455e-08, "logits/chosen": -1.223425269126892, "logits/rejected": -1.102380394935608, "logps/chosen": -299.54376220703125, "logps/rejected": -354.45001220703125, "loss": 0.0375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2032470703125, "rewards/margins": 6.170507907867432, "rewards/rejected": -9.373827934265137, "step": 8990 }, { "epoch": 2.2692465256987995, "grad_norm": 33.395301818847656, "learning_rate": 8.50486914228138e-08, "logits/chosen": -1.200769066810608, "logits/rejected": -0.997790515422821, "logps/chosen": -330.96875, "logps/rejected": -359.1499938964844, "loss": 0.0513, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.3094482421875, "rewards/margins": 6.15234375, "rewards/rejected": -9.4609375, "step": 9000 }, { "epoch": 2.2717675605836196, "grad_norm": 12.158639907836914, "learning_rate": 8.449824694703192e-08, "logits/chosen": -1.0890014171600342, "logits/rejected": -1.094213843345642, "logps/chosen": -318.39373779296875, "logps/rejected": -357.125, "loss": 0.0646, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.5050995349884033, "rewards/margins": 6.290136814117432, "rewards/rejected": -9.798437118530273, "step": 9010 }, { "epoch": 2.2742885954684398, "grad_norm": 19.436952590942383, "learning_rate": 8.39492270462126e-08, "logits/chosen": -1.182153344154358, "logits/rejected": -1.0277221202850342, "logps/chosen": -321.54376220703125, "logps/rejected": -350.5874938964844, "loss": 0.049, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.445483446121216, "rewards/margins": 6.463281154632568, "rewards/rejected": -9.912500381469727, "step": 9020 }, { "epoch": 2.27680963035326, "grad_norm": 7.460870742797852, "learning_rate": 8.340163644610634e-08, "logits/chosen": -1.094354271888733, "logits/rejected": -0.9900146722793579, "logps/chosen": -307.91876220703125, "logps/rejected": -335.13751220703125, "loss": 0.0455, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8498778343200684, "rewards/margins": 6.662304878234863, "rewards/rejected": -9.508984565734863, "step": 9030 }, { "epoch": 2.2793306652380805, "grad_norm": 6.054429531097412, "learning_rate": 8.285547986016081e-08, "logits/chosen": -1.124536156654358, "logits/rejected": -1.0042846202850342, "logps/chosen": -295.70001220703125, "logps/rejected": -347.45001220703125, "loss": 0.0501, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.805468797683716, "rewards/margins": 6.591894626617432, "rewards/rejected": -9.399609565734863, "step": 9040 }, { "epoch": 2.2818517001229006, "grad_norm": 10.674135208129883, "learning_rate": 8.231076198948044e-08, "logits/chosen": -1.124176025390625, "logits/rejected": -1.138848900794983, "logps/chosen": -287.9125061035156, "logps/rejected": -384.2875061035156, "loss": 0.0596, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.860626220703125, "rewards/margins": 6.3427734375, "rewards/rejected": -9.203516006469727, "step": 9050 }, { "epoch": 2.2843727350077208, "grad_norm": 14.529641151428223, "learning_rate": 8.176748752278543e-08, "logits/chosen": -1.1708495616912842, "logits/rejected": -1.1232421398162842, "logps/chosen": -305.9750061035156, "logps/rejected": -369.6499938964844, "loss": 0.0468, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.926654100418091, "rewards/margins": 6.535742282867432, "rewards/rejected": -9.466211318969727, "step": 9060 }, { "epoch": 2.286893769892541, "grad_norm": 67.26555633544922, "learning_rate": 8.122566113637203e-08, "logits/chosen": -1.163580298423767, "logits/rejected": -1.071862816810608, "logps/chosen": -285.9125061035156, "logps/rejected": -331.9624938964844, "loss": 0.054, "rewards/accuracies": 0.96875, "rewards/chosen": -2.461444139480591, "rewards/margins": 6.383984565734863, "rewards/rejected": -8.845312118530273, "step": 9070 }, { "epoch": 2.289414804777361, "grad_norm": 19.25269889831543, "learning_rate": 8.068528749407169e-08, "logits/chosen": -1.198327660560608, "logits/rejected": -1.1657836437225342, "logps/chosen": -316.96875, "logps/rejected": -348.9375, "loss": 0.0616, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.714787244796753, "rewards/margins": 6.205859184265137, "rewards/rejected": -8.924219131469727, "step": 9080 }, { "epoch": 2.291935839662181, "grad_norm": 12.033791542053223, "learning_rate": 8.014637124721149e-08, "logits/chosen": -1.12884521484375, "logits/rejected": -1.029595971107483, "logps/chosen": -309.9937438964844, "logps/rejected": -351.01251220703125, "loss": 0.0369, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0118165016174316, "rewards/margins": 6.296484470367432, "rewards/rejected": -9.310546875, "step": 9090 }, { "epoch": 2.2944568745470013, "grad_norm": 22.721290588378906, "learning_rate": 7.960891703457362e-08, "logits/chosen": -1.1833984851837158, "logits/rejected": -1.060644507408142, "logps/chosen": -336.0375061035156, "logps/rejected": -352.9125061035156, "loss": 0.0504, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.138824462890625, "rewards/margins": 6.34765625, "rewards/rejected": -9.487500190734863, "step": 9100 }, { "epoch": 2.296977909431822, "grad_norm": 29.020238876342773, "learning_rate": 7.907292948235555e-08, "logits/chosen": -1.196441650390625, "logits/rejected": -1.1320068836212158, "logps/chosen": -339.8187561035156, "logps/rejected": -365.3374938964844, "loss": 0.038, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8363280296325684, "rewards/margins": 6.8603515625, "rewards/rejected": -9.697070121765137, "step": 9110 }, { "epoch": 2.299498944316642, "grad_norm": 6.52321195602417, "learning_rate": 7.853841320413065e-08, "logits/chosen": -1.078771948814392, "logits/rejected": -1.09234619140625, "logps/chosen": -298.78125, "logps/rejected": -344.0874938964844, "loss": 0.0568, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.917797803878784, "rewards/margins": 6.548242092132568, "rewards/rejected": -9.466796875, "step": 9120 }, { "epoch": 2.302019979201462, "grad_norm": 2.411801338195801, "learning_rate": 7.800537280080785e-08, "logits/chosen": -1.173828125, "logits/rejected": -1.09161376953125, "logps/chosen": -348.33123779296875, "logps/rejected": -372.1000061035156, "loss": 0.0562, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.947094678878784, "rewards/margins": 6.550000190734863, "rewards/rejected": -9.495312690734863, "step": 9130 }, { "epoch": 2.3045410140862823, "grad_norm": 19.04088020324707, "learning_rate": 7.747381286059232e-08, "logits/chosen": -1.05914306640625, "logits/rejected": -0.9982833862304688, "logps/chosen": -307.64373779296875, "logps/rejected": -334.9125061035156, "loss": 0.0653, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.0763823986053467, "rewards/margins": 6.140527248382568, "rewards/rejected": -9.219141006469727, "step": 9140 }, { "epoch": 2.3070620489711025, "grad_norm": 27.200042724609375, "learning_rate": 7.694373795894621e-08, "logits/chosen": -1.1356079578399658, "logits/rejected": -1.057153344154358, "logps/chosen": -312.2124938964844, "logps/rejected": -328.0874938964844, "loss": 0.0635, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.725433349609375, "rewards/margins": 6.452929496765137, "rewards/rejected": -9.180078506469727, "step": 9150 }, { "epoch": 2.309583083855923, "grad_norm": 12.557526588439941, "learning_rate": 7.641515265854882e-08, "logits/chosen": -1.166192650794983, "logits/rejected": -1.040686011314392, "logps/chosen": -303.3374938964844, "logps/rejected": -348.63751220703125, "loss": 0.0388, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.8875975608825684, "rewards/margins": 6.574609279632568, "rewards/rejected": -9.466601371765137, "step": 9160 }, { "epoch": 2.312104118740743, "grad_norm": 9.453475952148438, "learning_rate": 7.588806150925755e-08, "logits/chosen": -1.136755347251892, "logits/rejected": -1.0892333984375, "logps/chosen": -350.48126220703125, "logps/rejected": -367.7124938964844, "loss": 0.0701, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.347241163253784, "rewards/margins": 6.4130859375, "rewards/rejected": -9.757421493530273, "step": 9170 }, { "epoch": 2.3146251536255633, "grad_norm": 33.222103118896484, "learning_rate": 7.536246904806878e-08, "logits/chosen": -1.1843140125274658, "logits/rejected": -1.15594482421875, "logps/chosen": -323.76251220703125, "logps/rejected": -376.875, "loss": 0.055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.750439405441284, "rewards/margins": 6.929296970367432, "rewards/rejected": -9.6787109375, "step": 9180 }, { "epoch": 2.3171461885103835, "grad_norm": 67.34077453613281, "learning_rate": 7.483837979907886e-08, "logits/chosen": -1.2142333984375, "logits/rejected": -1.0245850086212158, "logps/chosen": -307.46875, "logps/rejected": -341.20001220703125, "loss": 0.0518, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.303955078125, "rewards/margins": 6.383008003234863, "rewards/rejected": -9.687108993530273, "step": 9190 }, { "epoch": 2.3196672233952036, "grad_norm": 11.72309684753418, "learning_rate": 7.431579827344486e-08, "logits/chosen": -1.1179687976837158, "logits/rejected": -1.111181616783142, "logps/chosen": -316.6625061035156, "logps/rejected": -353.76251220703125, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7853331565856934, "rewards/margins": 6.5966796875, "rewards/rejected": -9.383203506469727, "step": 9200 }, { "epoch": 2.3221882582800237, "grad_norm": 34.17678451538086, "learning_rate": 7.379472896934619e-08, "logits/chosen": -1.0644652843475342, "logits/rejected": -1.1063232421875, "logps/chosen": -312.13751220703125, "logps/rejected": -347.4750061035156, "loss": 0.0559, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.909912109375, "rewards/margins": 6.501367092132568, "rewards/rejected": -9.409375190734863, "step": 9210 }, { "epoch": 2.3247092931648443, "grad_norm": 22.319129943847656, "learning_rate": 7.327517637194535e-08, "logits/chosen": NaN, "logits/rejected": -1.126708984375, "logps/chosen": -296.3125, "logps/rejected": -369.5625, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7155518531799316, "rewards/margins": 6.420312404632568, "rewards/rejected": -9.133593559265137, "step": 9220 }, { "epoch": 2.3272303280496645, "grad_norm": 59.209041595458984, "learning_rate": 7.275714495334997e-08, "logits/chosen": -1.2271239757537842, "logits/rejected": -1.120031714439392, "logps/chosen": -300.76873779296875, "logps/rejected": -343.91876220703125, "loss": 0.0862, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.5219664573669434, "rewards/margins": 6.419726371765137, "rewards/rejected": -8.942968368530273, "step": 9230 }, { "epoch": 2.3297513629344846, "grad_norm": 6.592267990112305, "learning_rate": 7.224063917257369e-08, "logits/chosen": -1.124670386314392, "logits/rejected": -1.000756859779358, "logps/chosen": -316.08123779296875, "logps/rejected": -371.2749938964844, "loss": 0.0413, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.987893581390381, "rewards/margins": 6.382421970367432, "rewards/rejected": -9.369531631469727, "step": 9240 }, { "epoch": 2.3322723978193047, "grad_norm": 6.588650703430176, "learning_rate": 7.172566347549808e-08, "logits/chosen": -1.1592895984649658, "logits/rejected": -1.0453979969024658, "logps/chosen": -307.1937561035156, "logps/rejected": -376.92498779296875, "loss": 0.066, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.601367235183716, "rewards/margins": 6.116015434265137, "rewards/rejected": -8.718358993530273, "step": 9250 }, { "epoch": 2.334793432704125, "grad_norm": 6.252317905426025, "learning_rate": 7.12122222948345e-08, "logits/chosen": -1.1423218250274658, "logits/rejected": -1.06195068359375, "logps/chosen": -303.61248779296875, "logps/rejected": -329.9750061035156, "loss": 0.0515, "rewards/accuracies": 0.984375, "rewards/chosen": -2.7229981422424316, "rewards/margins": 6.4677734375, "rewards/rejected": -9.189844131469727, "step": 9260 }, { "epoch": 2.3373144675889455, "grad_norm": 28.286134719848633, "learning_rate": 7.070032005008567e-08, "logits/chosen": -1.1438720226287842, "logits/rejected": NaN, "logps/chosen": -327.5625, "logps/rejected": -363.13751220703125, "loss": 0.0557, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.8480467796325684, "rewards/margins": 6.434179782867432, "rewards/rejected": -9.281641006469727, "step": 9270 }, { "epoch": 2.3398355024737656, "grad_norm": 24.243497848510742, "learning_rate": 7.018996114750766e-08, "logits/chosen": -1.217199683189392, "logits/rejected": -1.0411498546600342, "logps/chosen": -352.23126220703125, "logps/rejected": -357.5874938964844, "loss": 0.0546, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.2514891624450684, "rewards/margins": 6.593163967132568, "rewards/rejected": -9.844531059265137, "step": 9280 }, { "epoch": 2.3423565373585857, "grad_norm": 25.643503189086914, "learning_rate": 6.968114998007232e-08, "logits/chosen": -1.198950171470642, "logits/rejected": -1.0894286632537842, "logps/chosen": -321.5375061035156, "logps/rejected": -366.3999938964844, "loss": 0.0461, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.7505125999450684, "rewards/margins": 6.627539157867432, "rewards/rejected": -9.380078315734863, "step": 9290 }, { "epoch": 2.344877572243406, "grad_norm": 23.797788619995117, "learning_rate": 6.917389092742893e-08, "logits/chosen": -1.2471191883087158, "logits/rejected": -1.09259033203125, "logps/chosen": -331.9937438964844, "logps/rejected": -369.95001220703125, "loss": 0.0695, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.0775389671325684, "rewards/margins": 6.3974609375, "rewards/rejected": -9.473437309265137, "step": 9300 }, { "epoch": 2.347398607128226, "grad_norm": 66.45001983642578, "learning_rate": 6.866818835586687e-08, "logits/chosen": -1.1241180896759033, "logits/rejected": -1.132360816001892, "logps/chosen": -302.0687561035156, "logps/rejected": -342.38751220703125, "loss": 0.0405, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0251221656799316, "rewards/margins": 6.545507907867432, "rewards/rejected": -9.572656631469727, "step": 9310 }, { "epoch": 2.349919642013046, "grad_norm": 17.576152801513672, "learning_rate": 6.816404661827785e-08, "logits/chosen": -1.078521728515625, "logits/rejected": -1.0055725574493408, "logps/chosen": -329.2250061035156, "logps/rejected": -354.4624938964844, "loss": 0.0411, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.387951612472534, "rewards/margins": 6.321093559265137, "rewards/rejected": -9.709765434265137, "step": 9320 }, { "epoch": 2.3524406768978667, "grad_norm": 34.27827072143555, "learning_rate": 6.766147005411879e-08, "logits/chosen": -1.078393578529358, "logits/rejected": NaN, "logps/chosen": -310.9375, "logps/rejected": -347.67498779296875, "loss": 0.0779, "rewards/accuracies": 0.953125, "rewards/chosen": -3.5388426780700684, "rewards/margins": 6.27001953125, "rewards/rejected": -9.808984756469727, "step": 9330 }, { "epoch": 2.354961711782687, "grad_norm": 17.350936889648438, "learning_rate": 6.716046298937384e-08, "logits/chosen": -1.1638672351837158, "logits/rejected": -1.191857933998108, "logps/chosen": -320.0062561035156, "logps/rejected": -378.13751220703125, "loss": 0.0334, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.066455125808716, "rewards/margins": 6.711133003234863, "rewards/rejected": -9.775781631469727, "step": 9340 }, { "epoch": 2.357482746667507, "grad_norm": 35.49551010131836, "learning_rate": 6.666102973651782e-08, "logits/chosen": -1.2030150890350342, "logits/rejected": -1.0501830577850342, "logps/chosen": -332.7250061035156, "logps/rejected": -352.5375061035156, "loss": 0.0459, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.151684522628784, "rewards/margins": 6.623632907867432, "rewards/rejected": -9.774218559265137, "step": 9350 }, { "epoch": 2.360003781552327, "grad_norm": 40.842647552490234, "learning_rate": 6.616317459447851e-08, "logits/chosen": -1.1645386219024658, "logits/rejected": -1.0828826427459717, "logps/chosen": -300.0406188964844, "logps/rejected": -338.8125, "loss": 0.071, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.6125426292419434, "rewards/margins": 6.321679592132568, "rewards/rejected": -8.932812690734863, "step": 9360 }, { "epoch": 2.3625248164371473, "grad_norm": 14.02625846862793, "learning_rate": 6.566690184860028e-08, "logits/chosen": -1.1471679210662842, "logits/rejected": -1.134558081626892, "logps/chosen": -314.38751220703125, "logps/rejected": -344.0625, "loss": 0.0514, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1183838844299316, "rewards/margins": 6.333593845367432, "rewards/rejected": -9.451171875, "step": 9370 }, { "epoch": 2.365045851321968, "grad_norm": 19.2847957611084, "learning_rate": 6.517221577060644e-08, "logits/chosen": -1.1254456043243408, "logits/rejected": -1.0552978515625, "logps/chosen": -308.3187561035156, "logps/rejected": -365.5, "loss": 0.0479, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9517579078674316, "rewards/margins": 6.430859565734863, "rewards/rejected": -9.380468368530273, "step": 9380 }, { "epoch": 2.367566886206788, "grad_norm": 14.257923126220703, "learning_rate": 6.46791206185631e-08, "logits/chosen": -1.1007812023162842, "logits/rejected": -1.085473656654358, "logps/chosen": -322.70623779296875, "logps/rejected": -349.9624938964844, "loss": 0.0551, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.730761766433716, "rewards/margins": 6.459912300109863, "rewards/rejected": -9.19140625, "step": 9390 }, { "epoch": 2.370087921091608, "grad_norm": 5.871348857879639, "learning_rate": 6.418762063684239e-08, "logits/chosen": -1.181860327720642, "logits/rejected": -1.190222144126892, "logps/chosen": -309.71875, "logps/rejected": -350.5625, "loss": 0.0614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.810070753097534, "rewards/margins": 6.241406440734863, "rewards/rejected": -9.053515434265137, "step": 9400 }, { "epoch": 2.3726089559764283, "grad_norm": 18.109651565551758, "learning_rate": 6.36977200560856e-08, "logits/chosen": -1.09820556640625, "logits/rejected": -1.0347900390625, "logps/chosen": -330.88751220703125, "logps/rejected": -367.125, "loss": 0.0863, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9474244117736816, "rewards/margins": 6.869531154632568, "rewards/rejected": -9.816015243530273, "step": 9410 }, { "epoch": 2.3751299908612484, "grad_norm": 25.832246780395508, "learning_rate": 6.320942309316704e-08, "logits/chosen": -1.1589676141738892, "logits/rejected": -1.07421875, "logps/chosen": -284.4937438964844, "logps/rejected": -326.01251220703125, "loss": 0.091, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2571043968200684, "rewards/margins": 6.000195503234863, "rewards/rejected": -9.259374618530273, "step": 9420 }, { "epoch": 2.3776510257460686, "grad_norm": 43.289939880371094, "learning_rate": 6.272273395115794e-08, "logits/chosen": -1.174719214439392, "logits/rejected": -1.1595947742462158, "logps/chosen": -342.1187438964844, "logps/rejected": -381.57501220703125, "loss": 0.0507, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.741284132003784, "rewards/margins": 6.70703125, "rewards/rejected": -9.451952934265137, "step": 9430 }, { "epoch": 2.380172060630889, "grad_norm": 5.902862548828125, "learning_rate": 6.223765681928977e-08, "logits/chosen": -1.118981957435608, "logits/rejected": -1.092565894126892, "logps/chosen": -320.6625061035156, "logps/rejected": -389.26251220703125, "loss": 0.0369, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.941821336746216, "rewards/margins": 6.872265815734863, "rewards/rejected": -9.816797256469727, "step": 9440 }, { "epoch": 2.3826930955157093, "grad_norm": 50.332122802734375, "learning_rate": 6.175419587291853e-08, "logits/chosen": -1.236669898033142, "logits/rejected": -1.1200439929962158, "logps/chosen": -332.0249938964844, "logps/rejected": -367.3999938964844, "loss": 0.0703, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.7341766357421875, "rewards/margins": 6.459570407867432, "rewards/rejected": -9.197265625, "step": 9450 }, { "epoch": 2.3852141304005294, "grad_norm": 21.89457130432129, "learning_rate": 6.127235527348862e-08, "logits/chosen": -1.2223694324493408, "logits/rejected": -1.11663818359375, "logps/chosen": -335.78125, "logps/rejected": -347.61248779296875, "loss": 0.0593, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.610461473464966, "rewards/margins": 6.554296970367432, "rewards/rejected": -9.166796684265137, "step": 9460 }, { "epoch": 2.3877351652853496, "grad_norm": 55.019737243652344, "learning_rate": 6.079213916849737e-08, "logits/chosen": -1.1846435070037842, "logits/rejected": -1.0324188470840454, "logps/chosen": -317.8374938964844, "logps/rejected": -324.79998779296875, "loss": 0.0501, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.8798460960388184, "rewards/margins": 6.211133003234863, "rewards/rejected": -9.086328506469727, "step": 9470 }, { "epoch": 2.3902562001701697, "grad_norm": 13.911991119384766, "learning_rate": 6.031355169145882e-08, "logits/chosen": -1.269677758216858, "logits/rejected": -1.11077880859375, "logps/chosen": -315.8187561035156, "logps/rejected": -347.4125061035156, "loss": 0.0405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.2803986072540283, "rewards/margins": 6.312597751617432, "rewards/rejected": -8.592187881469727, "step": 9480 }, { "epoch": 2.3927772350549903, "grad_norm": 2.236504077911377, "learning_rate": 5.983659696186868e-08, "logits/chosen": -1.07550048828125, "logits/rejected": -1.106024146080017, "logps/chosen": -290.0843811035156, "logps/rejected": -364.3125, "loss": 0.0337, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.88458251953125, "rewards/margins": 6.879296779632568, "rewards/rejected": -9.765039443969727, "step": 9490 }, { "epoch": 2.3952982699398104, "grad_norm": 5.578651428222656, "learning_rate": 5.9361279085168274e-08, "logits/chosen": -1.188720703125, "logits/rejected": -1.0909423828125, "logps/chosen": -308.5687561035156, "logps/rejected": -386.76251220703125, "loss": 0.0365, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.633349657058716, "rewards/margins": 6.662109375, "rewards/rejected": -9.301172256469727, "step": 9500 }, { "epoch": 2.3978193048246306, "grad_norm": 26.881193161010742, "learning_rate": 5.888760215270988e-08, "logits/chosen": -1.137841820716858, "logits/rejected": -1.0743408203125, "logps/chosen": -294.98748779296875, "logps/rejected": -348.79376220703125, "loss": 0.0548, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.935420274734497, "rewards/margins": 6.504101753234863, "rewards/rejected": -9.437891006469727, "step": 9510 }, { "epoch": 2.4003403397094507, "grad_norm": 13.205660820007324, "learning_rate": 5.8415570241720916e-08, "logits/chosen": -1.1109740734100342, "logits/rejected": -1.0967925786972046, "logps/chosen": -330.2250061035156, "logps/rejected": -377.625, "loss": 0.0674, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.0494627952575684, "rewards/margins": 6.604687690734863, "rewards/rejected": -9.655858993530273, "step": 9520 }, { "epoch": 2.402861374594271, "grad_norm": 20.070270538330078, "learning_rate": 5.7945187415269076e-08, "logits/chosen": -1.134033203125, "logits/rejected": -1.137353539466858, "logps/chosen": -299.6312561035156, "logps/rejected": -360.17498779296875, "loss": 0.0416, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.712261915206909, "rewards/margins": 6.659375190734863, "rewards/rejected": -9.369140625, "step": 9530 }, { "epoch": 2.405382409479091, "grad_norm": 17.515649795532227, "learning_rate": 5.747645772222767e-08, "logits/chosen": -1.218408226966858, "logits/rejected": -1.003991723060608, "logps/chosen": -316.70001220703125, "logps/rejected": -351.42498779296875, "loss": 0.0417, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.394238233566284, "rewards/margins": 6.923242092132568, "rewards/rejected": -9.311132431030273, "step": 9540 }, { "epoch": 2.4079034443639116, "grad_norm": 85.17425537109375, "learning_rate": 5.700938519724016e-08, "logits/chosen": -1.1747620105743408, "logits/rejected": -1.0635712146759033, "logps/chosen": -292.5625, "logps/rejected": -363.4125061035156, "loss": 0.0702, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.712146043777466, "rewards/margins": 6.213086128234863, "rewards/rejected": -8.924609184265137, "step": 9550 }, { "epoch": 2.4104244792487317, "grad_norm": 36.34426498413086, "learning_rate": 5.6543973860685796e-08, "logits/chosen": -1.1073486804962158, "logits/rejected": NaN, "logps/chosen": -295.4375, "logps/rejected": -338.26251220703125, "loss": 0.0661, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.8862547874450684, "rewards/margins": 6.222265720367432, "rewards/rejected": -9.106640815734863, "step": 9560 }, { "epoch": 2.412945514133552, "grad_norm": 36.26641082763672, "learning_rate": 5.608022771864515e-08, "logits/chosen": -1.1702239513397217, "logits/rejected": -1.047338843345642, "logps/chosen": -310.48748779296875, "logps/rejected": -341.0375061035156, "loss": 0.0318, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.709491014480591, "rewards/margins": 6.6884765625, "rewards/rejected": -9.393359184265137, "step": 9570 }, { "epoch": 2.415466549018372, "grad_norm": 32.18575668334961, "learning_rate": 5.56181507628653e-08, "logits/chosen": -1.1581542491912842, "logits/rejected": -1.124108910560608, "logps/chosen": -318.3500061035156, "logps/rejected": -375.20623779296875, "loss": 0.0487, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.131308078765869, "rewards/margins": 6.637109279632568, "rewards/rejected": -9.770703315734863, "step": 9580 }, { "epoch": 2.417987583903192, "grad_norm": 7.18175745010376, "learning_rate": 5.5157746970725614e-08, "logits/chosen": -1.1223633289337158, "logits/rejected": -0.990283191204071, "logps/chosen": -303.8687438964844, "logps/rejected": -339.54998779296875, "loss": 0.0388, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.357025146484375, "rewards/margins": 6.460156440734863, "rewards/rejected": -9.817187309265137, "step": 9590 }, { "epoch": 2.4205086187880127, "grad_norm": 8.686003684997559, "learning_rate": 5.469902030520346e-08, "logits/chosen": -1.137231469154358, "logits/rejected": -1.021478295326233, "logps/chosen": -297.88751220703125, "logps/rejected": -333.13751220703125, "loss": 0.0338, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8655028343200684, "rewards/margins": 6.748046875, "rewards/rejected": -9.61328125, "step": 9600 }, { "epoch": 2.423029653672833, "grad_norm": 23.312318801879883, "learning_rate": 5.424197471484041e-08, "logits/chosen": -1.124792456626892, "logits/rejected": -1.041748046875, "logps/chosen": -322.3374938964844, "logps/rejected": -375.23748779296875, "loss": 0.0519, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.4595704078674316, "rewards/margins": 6.5380859375, "rewards/rejected": -9.000391006469727, "step": 9610 }, { "epoch": 2.425550688557653, "grad_norm": 6.126657485961914, "learning_rate": 5.378661413370761e-08, "logits/chosen": -1.0678832530975342, "logits/rejected": -1.0589721202850342, "logps/chosen": -297.79998779296875, "logps/rejected": -340.59375, "loss": 0.066, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.125317335128784, "rewards/margins": 6.318359375, "rewards/rejected": -9.444140434265137, "step": 9620 }, { "epoch": 2.428071723442473, "grad_norm": 44.89402389526367, "learning_rate": 5.333294248137268e-08, "logits/chosen": -1.218359351158142, "logits/rejected": -1.157678246498108, "logps/chosen": -332.0562438964844, "logps/rejected": -371.98748779296875, "loss": 0.0779, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.752542018890381, "rewards/margins": 6.654101371765137, "rewards/rejected": -9.4052734375, "step": 9630 }, { "epoch": 2.4305927583272933, "grad_norm": 26.419673919677734, "learning_rate": 5.288096366286526e-08, "logits/chosen": -1.1844971179962158, "logits/rejected": -1.0656158924102783, "logps/chosen": -319.45623779296875, "logps/rejected": -349.2749938964844, "loss": 0.0525, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8952879905700684, "rewards/margins": 6.634375095367432, "rewards/rejected": -9.530077934265137, "step": 9640 }, { "epoch": 2.4331137932121134, "grad_norm": 43.81555938720703, "learning_rate": 5.243068156864405e-08, "logits/chosen": -1.0962402820587158, "logits/rejected": -1.0572326183319092, "logps/chosen": -314.1937561035156, "logps/rejected": -377.42498779296875, "loss": 0.0641, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0970458984375, "rewards/margins": 6.368554592132568, "rewards/rejected": -9.466015815734863, "step": 9650 }, { "epoch": 2.4356348280969335, "grad_norm": 29.52123260498047, "learning_rate": 5.1982100074562776e-08, "logits/chosen": -1.113500952720642, "logits/rejected": -0.996478259563446, "logps/chosen": -306.35626220703125, "logps/rejected": -349.6499938964844, "loss": 0.0655, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.409008741378784, "rewards/margins": 6.287549018859863, "rewards/rejected": -9.693359375, "step": 9660 }, { "epoch": 2.438155862981754, "grad_norm": 21.85391616821289, "learning_rate": 5.153522304183702e-08, "logits/chosen": -1.110693335533142, "logits/rejected": -1.025354027748108, "logps/chosen": -297.4125061035156, "logps/rejected": -359.5625, "loss": 0.0429, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4086670875549316, "rewards/margins": 6.50390625, "rewards/rejected": -9.913671493530273, "step": 9670 }, { "epoch": 2.4406768978665743, "grad_norm": 56.9681510925293, "learning_rate": 5.10900543170113e-08, "logits/chosen": -1.1635253429412842, "logits/rejected": -1.068750023841858, "logps/chosen": -314.7124938964844, "logps/rejected": -357.13751220703125, "loss": 0.0654, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.184521436691284, "rewards/margins": 6.593945503234863, "rewards/rejected": -9.780077934265137, "step": 9680 }, { "epoch": 2.4431979327513944, "grad_norm": 18.39649772644043, "learning_rate": 5.064659773192542e-08, "logits/chosen": -1.156762719154358, "logits/rejected": -1.145349144935608, "logps/chosen": -326.375, "logps/rejected": -356.5874938964844, "loss": 0.0303, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.018969774246216, "rewards/margins": 6.5927734375, "rewards/rejected": -9.611719131469727, "step": 9690 }, { "epoch": 2.4457189676362145, "grad_norm": 8.979217529296875, "learning_rate": 5.020485710368177e-08, "logits/chosen": -1.1500060558319092, "logits/rejected": -1.086267113685608, "logps/chosen": -323.6625061035156, "logps/rejected": -352.36248779296875, "loss": 0.0548, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.845294237136841, "rewards/margins": 6.4541015625, "rewards/rejected": -9.298828125, "step": 9700 }, { "epoch": 2.448240002521035, "grad_norm": 23.726537704467773, "learning_rate": 4.9764836234612665e-08, "logits/chosen": -1.1351196765899658, "logits/rejected": -1.0086548328399658, "logps/chosen": -327.1000061035156, "logps/rejected": -352.2250061035156, "loss": 0.0448, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0148072242736816, "rewards/margins": 6.356249809265137, "rewards/rejected": -9.368749618530273, "step": 9710 }, { "epoch": 2.4507610374058553, "grad_norm": 6.30889892578125, "learning_rate": 4.932653891224719e-08, "logits/chosen": -1.147802710533142, "logits/rejected": -1.0699462890625, "logps/chosen": -299.45001220703125, "logps/rejected": -350.3125, "loss": 0.0433, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.856768846511841, "rewards/margins": 6.569531440734863, "rewards/rejected": -9.425390243530273, "step": 9720 }, { "epoch": 2.4532820722906754, "grad_norm": 10.74817943572998, "learning_rate": 4.8889968909278824e-08, "logits/chosen": -1.2151610851287842, "logits/rejected": -1.161279320716858, "logps/chosen": -328.82501220703125, "logps/rejected": -359.54998779296875, "loss": 0.0598, "rewards/accuracies": 0.96875, "rewards/chosen": -3.0374083518981934, "rewards/margins": 6.2802734375, "rewards/rejected": -9.314453125, "step": 9730 }, { "epoch": 2.4558031071754955, "grad_norm": 20.406330108642578, "learning_rate": 4.845512998353296e-08, "logits/chosen": -1.159692406654358, "logits/rejected": -1.1065948009490967, "logps/chosen": -322.5874938964844, "logps/rejected": -365.1000061035156, "loss": 0.083, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5016846656799316, "rewards/margins": 6.512890815734863, "rewards/rejected": -9.017187118530273, "step": 9740 }, { "epoch": 2.4583241420603157, "grad_norm": 14.469663619995117, "learning_rate": 4.802202587793469e-08, "logits/chosen": NaN, "logits/rejected": -1.0395019054412842, "logps/chosen": -281.01873779296875, "logps/rejected": -371.25, "loss": 0.044, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.469738721847534, "rewards/margins": 6.765038967132568, "rewards/rejected": -9.234766006469727, "step": 9750 }, { "epoch": 2.460845176945136, "grad_norm": 60.83958435058594, "learning_rate": 4.7590660320476236e-08, "logits/chosen": -1.204382300376892, "logits/rejected": -1.154455542564392, "logps/chosen": -327.3062438964844, "logps/rejected": -346.71875, "loss": 0.0532, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.436450242996216, "rewards/margins": 6.305078029632568, "rewards/rejected": -8.7431640625, "step": 9760 }, { "epoch": 2.463366211829956, "grad_norm": 64.44366455078125, "learning_rate": 4.716103702418528e-08, "logits/chosen": -1.1702392101287842, "logits/rejected": -1.071496605873108, "logps/chosen": -295.7093811035156, "logps/rejected": -337.0625, "loss": 0.0604, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.7420318126678467, "rewards/margins": 6.470312595367432, "rewards/rejected": -9.209765434265137, "step": 9770 }, { "epoch": 2.4658872467147765, "grad_norm": 8.146585464477539, "learning_rate": 4.673315968709257e-08, "logits/chosen": -1.226287841796875, "logits/rejected": -1.0896484851837158, "logps/chosen": -298.45001220703125, "logps/rejected": -336.23748779296875, "loss": 0.0539, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.3854613304138184, "rewards/margins": 6.4619140625, "rewards/rejected": -8.845703125, "step": 9780 }, { "epoch": 2.4684082815995967, "grad_norm": 8.578082084655762, "learning_rate": 4.630703199220054e-08, "logits/chosen": -1.158361792564392, "logits/rejected": -1.140661597251892, "logps/chosen": -329.29998779296875, "logps/rejected": -345.9624938964844, "loss": 0.0689, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.5289580821990967, "rewards/margins": 6.202734470367432, "rewards/rejected": -8.731640815734863, "step": 9790 }, { "epoch": 2.470929316484417, "grad_norm": 24.780441284179688, "learning_rate": 4.588265760745125e-08, "logits/chosen": -1.122656226158142, "logits/rejected": -1.034277319908142, "logps/chosen": -276.54376220703125, "logps/rejected": -323.3374938964844, "loss": 0.0643, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.920092821121216, "rewards/margins": 6.118359565734863, "rewards/rejected": -9.040234565734863, "step": 9800 }, { "epoch": 2.473450351369237, "grad_norm": 10.892682075500488, "learning_rate": 4.546004018569488e-08, "logits/chosen": -1.15179443359375, "logits/rejected": -1.143945336341858, "logps/chosen": -320.2749938964844, "logps/rejected": -362.1875, "loss": 0.0485, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.894238233566284, "rewards/margins": 6.426562309265137, "rewards/rejected": -9.318750381469727, "step": 9810 }, { "epoch": 2.475971386254057, "grad_norm": 5.433866500854492, "learning_rate": 4.503918336465859e-08, "logits/chosen": -1.178442358970642, "logits/rejected": -0.9962524175643921, "logps/chosen": -328.9312438964844, "logps/rejected": -355.7749938964844, "loss": 0.0585, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0894532203674316, "rewards/margins": 6.390454292297363, "rewards/rejected": -9.479296684265137, "step": 9820 }, { "epoch": 2.4784924211388777, "grad_norm": 105.5046615600586, "learning_rate": 4.462009076691472e-08, "logits/chosen": -1.1352112293243408, "logits/rejected": -1.079833984375, "logps/chosen": -314.6312561035156, "logps/rejected": -349.4125061035156, "loss": 0.0829, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.270092725753784, "rewards/margins": 6.248193264007568, "rewards/rejected": -9.517969131469727, "step": 9830 }, { "epoch": 2.481013456023698, "grad_norm": 10.480962753295898, "learning_rate": 4.420276599984993e-08, "logits/chosen": -1.086206078529358, "logits/rejected": -1.0146484375, "logps/chosen": -320.9437561035156, "logps/rejected": -371.88751220703125, "loss": 0.0416, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.675891160964966, "rewards/margins": 6.841796875, "rewards/rejected": -9.517969131469727, "step": 9840 }, { "epoch": 2.483534490908518, "grad_norm": 19.221046447753906, "learning_rate": 4.3787212655634234e-08, "logits/chosen": -1.1977050304412842, "logits/rejected": -1.037451148033142, "logps/chosen": -306.28125, "logps/rejected": -359.1625061035156, "loss": 0.0425, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.0283446311950684, "rewards/margins": 6.506054878234863, "rewards/rejected": -9.528905868530273, "step": 9850 }, { "epoch": 2.486055525793338, "grad_norm": 8.99593734741211, "learning_rate": 4.337343431118973e-08, "logits/chosen": -1.1455199718475342, "logits/rejected": -1.07769775390625, "logps/chosen": -307.5062561035156, "logps/rejected": -359.03125, "loss": 0.0468, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8803467750549316, "rewards/margins": 6.687304496765137, "rewards/rejected": -9.559374809265137, "step": 9860 }, { "epoch": 2.4885765606781582, "grad_norm": 23.91136360168457, "learning_rate": 4.296143452816009e-08, "logits/chosen": -1.186376929283142, "logits/rejected": -1.1511108875274658, "logps/chosen": -332.21875, "logps/rejected": -385.92498779296875, "loss": 0.0367, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.979846239089966, "rewards/margins": 6.645117282867432, "rewards/rejected": -9.623827934265137, "step": 9870 }, { "epoch": 2.4910975955629784, "grad_norm": 10.396635055541992, "learning_rate": 4.255121685287974e-08, "logits/chosen": -1.1538788080215454, "logits/rejected": -1.103967308998108, "logps/chosen": -329.13751220703125, "logps/rejected": -360.1499938964844, "loss": 0.0512, "rewards/accuracies": 0.984375, "rewards/chosen": -2.778430223464966, "rewards/margins": 6.522070407867432, "rewards/rejected": -9.301953315734863, "step": 9880 }, { "epoch": 2.493618630447799, "grad_norm": 34.48313522338867, "learning_rate": 4.214278481634362e-08, "logits/chosen": -1.192022681236267, "logits/rejected": -1.1108276844024658, "logps/chosen": -311.75, "logps/rejected": -371.0625, "loss": 0.0402, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.337881565093994, "rewards/margins": 6.7109375, "rewards/rejected": -10.049219131469727, "step": 9890 }, { "epoch": 2.496139665332619, "grad_norm": 12.752933502197266, "learning_rate": 4.173614193417629e-08, "logits/chosen": -1.0169861316680908, "logits/rejected": -1.079620361328125, "logps/chosen": -310.76251220703125, "logps/rejected": -359.79998779296875, "loss": 0.0604, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.3267884254455566, "rewards/margins": 6.771777153015137, "rewards/rejected": -10.099609375, "step": 9900 }, { "epoch": 2.4986607002174392, "grad_norm": 31.37298011779785, "learning_rate": 4.133129170660227e-08, "logits/chosen": -1.1909668445587158, "logits/rejected": -1.039147973060608, "logps/chosen": -320.0062561035156, "logps/rejected": -372.8374938964844, "loss": 0.0512, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.055462598800659, "rewards/margins": 6.76171875, "rewards/rejected": -9.815625190734863, "step": 9910 }, { "epoch": 2.5011817351022594, "grad_norm": 11.590690612792969, "learning_rate": 4.0928237618415294e-08, "logits/chosen": -1.1640136241912842, "logits/rejected": -1.0757172107696533, "logps/chosen": -334.34375, "logps/rejected": -356.0874938964844, "loss": 0.0348, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.127246141433716, "rewards/margins": 6.898241996765137, "rewards/rejected": -10.026952743530273, "step": 9920 }, { "epoch": 2.50370276998708, "grad_norm": 3.064460039138794, "learning_rate": 4.052698313894892e-08, "logits/chosen": -1.18548583984375, "logits/rejected": -1.00244140625, "logps/chosen": -319.32501220703125, "logps/rejected": -365.73748779296875, "loss": 0.0497, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.109729051589966, "rewards/margins": 6.683789253234863, "rewards/rejected": -9.796875, "step": 9930 }, { "epoch": 2.5062238048719, "grad_norm": 14.619731903076172, "learning_rate": 4.0127531722046195e-08, "logits/chosen": -1.189910888671875, "logits/rejected": -1.1230590343475342, "logps/chosen": -323.98748779296875, "logps/rejected": -377.5874938964844, "loss": 0.0398, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.218334913253784, "rewards/margins": 7.058789253234863, "rewards/rejected": -10.279296875, "step": 9940 }, { "epoch": 2.5087448397567202, "grad_norm": 4.51336669921875, "learning_rate": 3.972988680603001e-08, "logits/chosen": -1.147375464439392, "logits/rejected": -1.0868377685546875, "logps/chosen": -316.14373779296875, "logps/rejected": -358.3125, "loss": 0.0606, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.720410108566284, "rewards/margins": 6.407812595367432, "rewards/rejected": -9.127344131469727, "step": 9950 }, { "epoch": 2.5112658746415404, "grad_norm": 43.95208740234375, "learning_rate": 3.933405181367391e-08, "logits/chosen": -1.144555687904358, "logits/rejected": -1.0635986328125, "logps/chosen": -302.54376220703125, "logps/rejected": -388.0874938964844, "loss": 0.0742, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.041552782058716, "rewards/margins": 6.627148628234863, "rewards/rejected": -9.667577743530273, "step": 9960 }, { "epoch": 2.5137869095263605, "grad_norm": 12.412017822265625, "learning_rate": 3.894003015217206e-08, "logits/chosen": -1.17156982421875, "logits/rejected": -1.103387475013733, "logps/chosen": -322.4312438964844, "logps/rejected": -362.2124938964844, "loss": 0.0719, "rewards/accuracies": 0.96875, "rewards/chosen": -2.747363328933716, "rewards/margins": 6.265625, "rewards/rejected": -9.013671875, "step": 9970 }, { "epoch": 2.5163079444111807, "grad_norm": 14.251262664794922, "learning_rate": 3.854782521311018e-08, "logits/chosen": -1.1359131336212158, "logits/rejected": -1.132318139076233, "logps/chosen": -329.45001220703125, "logps/rejected": -360.3374938964844, "loss": 0.0762, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.1038451194763184, "rewards/margins": 6.655468940734863, "rewards/rejected": -9.762499809265137, "step": 9980 }, { "epoch": 2.518828979296001, "grad_norm": 33.08596420288086, "learning_rate": 3.815744037243651e-08, "logits/chosen": -1.1314818859100342, "logits/rejected": -1.0926392078399658, "logps/chosen": -309.73126220703125, "logps/rejected": -361.7562561035156, "loss": 0.0836, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.118969678878784, "rewards/margins": 6.161523342132568, "rewards/rejected": -9.279687881469727, "step": 9990 }, { "epoch": 2.5213500141808214, "grad_norm": 15.152300834655762, "learning_rate": 3.776887899043246e-08, "logits/chosen": -1.1444488763809204, "logits/rejected": -1.103326439857483, "logps/chosen": -312.76873779296875, "logps/rejected": -382.4125061035156, "loss": 0.0702, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1241211891174316, "rewards/margins": 6.632031440734863, "rewards/rejected": -9.7568359375, "step": 10000 }, { "epoch": 2.5238710490656415, "grad_norm": 11.703275680541992, "learning_rate": 3.7382144411683857e-08, "logits/chosen": -1.2043030261993408, "logits/rejected": -1.166845679283142, "logps/chosen": -316.3687438964844, "logps/rejected": -334.8125, "loss": 0.0465, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0115113258361816, "rewards/margins": 6.575585842132568, "rewards/rejected": -9.583984375, "step": 10010 }, { "epoch": 2.5263920839504617, "grad_norm": 43.19799041748047, "learning_rate": 3.699723996505205e-08, "logits/chosen": -1.1665023565292358, "logits/rejected": -1.110015869140625, "logps/chosen": -317.2749938964844, "logps/rejected": -356.70001220703125, "loss": 0.0522, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.833251953125, "rewards/margins": 6.406445503234863, "rewards/rejected": -9.242968559265137, "step": 10020 }, { "epoch": 2.528913118835282, "grad_norm": 16.9388484954834, "learning_rate": 3.661416896364547e-08, "logits/chosen": -1.1895630359649658, "logits/rejected": -1.1419556140899658, "logps/chosen": -313.3812561035156, "logps/rejected": -374.9624938964844, "loss": 0.0457, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.9688477516174316, "rewards/margins": 6.257421970367432, "rewards/rejected": -9.228515625, "step": 10030 }, { "epoch": 2.531434153720102, "grad_norm": 11.054365158081055, "learning_rate": 3.623293470479075e-08, "logits/chosen": -1.1443603038787842, "logits/rejected": -1.1128661632537842, "logps/chosen": -300.0375061035156, "logps/rejected": -341.32501220703125, "loss": 0.0469, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.071154832839966, "rewards/margins": 6.469140529632568, "rewards/rejected": -9.54296875, "step": 10040 }, { "epoch": 2.5339551886049225, "grad_norm": 30.528623580932617, "learning_rate": 3.58535404700048e-08, "logits/chosen": -1.153588891029358, "logits/rejected": -1.169403076171875, "logps/chosen": -324.4750061035156, "logps/rejected": -348.0, "loss": 0.0497, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.210498094558716, "rewards/margins": 6.308398246765137, "rewards/rejected": -9.520312309265137, "step": 10050 }, { "epoch": 2.5364762234897427, "grad_norm": 33.49970245361328, "learning_rate": 3.5475989524966085e-08, "logits/chosen": -1.219335913658142, "logits/rejected": -1.163299560546875, "logps/chosen": -319.98748779296875, "logps/rejected": -367.3374938964844, "loss": 0.0298, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.139880418777466, "rewards/margins": 6.911523342132568, "rewards/rejected": -10.049219131469727, "step": 10060 }, { "epoch": 2.538997258374563, "grad_norm": 13.296183586120605, "learning_rate": 3.5100285119486926e-08, "logits/chosen": -1.166174292564392, "logits/rejected": -0.998913586139679, "logps/chosen": -313.54998779296875, "logps/rejected": -370.48748779296875, "loss": 0.0405, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9495301246643066, "rewards/margins": 6.721484184265137, "rewards/rejected": -9.670702934265137, "step": 10070 }, { "epoch": 2.541518293259383, "grad_norm": 81.87371063232422, "learning_rate": 3.472643048748525e-08, "logits/chosen": -1.2131226062774658, "logits/rejected": -1.116235375404358, "logps/chosen": -338.3687438964844, "logps/rejected": -383.0375061035156, "loss": 0.0785, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -2.916888475418091, "rewards/margins": 6.690234184265137, "rewards/rejected": -9.607812881469727, "step": 10080 }, { "epoch": 2.544039328144203, "grad_norm": 16.84267807006836, "learning_rate": 3.43544288469568e-08, "logits/chosen": -1.158911108970642, "logits/rejected": -1.0852539539337158, "logps/chosen": -336.7437438964844, "logps/rejected": -347.2875061035156, "loss": 0.0559, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.92132568359375, "rewards/margins": 6.407910346984863, "rewards/rejected": -9.330273628234863, "step": 10090 }, { "epoch": 2.546560363029023, "grad_norm": 36.27756881713867, "learning_rate": 3.398428339994763e-08, "logits/chosen": -1.18414306640625, "logits/rejected": -1.1505248546600342, "logps/chosen": -302.58123779296875, "logps/rejected": -365.1625061035156, "loss": 0.0585, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.7022337913513184, "rewards/margins": 6.570508003234863, "rewards/rejected": -9.26953125, "step": 10100 }, { "epoch": 2.5490813979138434, "grad_norm": 30.258563995361328, "learning_rate": 3.3615997332526345e-08, "logits/chosen": -1.2242310047149658, "logits/rejected": -1.186425805091858, "logps/chosen": -343.4750061035156, "logps/rejected": -400.86248779296875, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.875537157058716, "rewards/margins": 6.746874809265137, "rewards/rejected": -9.627734184265137, "step": 10110 }, { "epoch": 2.551602432798664, "grad_norm": 35.232215881347656, "learning_rate": 3.32495738147566e-08, "logits/chosen": -1.159143090248108, "logits/rejected": -1.0797851085662842, "logps/chosen": -338.61248779296875, "logps/rejected": -373.90625, "loss": 0.0366, "rewards/accuracies": 0.984375, "rewards/chosen": -3.01007080078125, "rewards/margins": 6.5791015625, "rewards/rejected": -9.591015815734863, "step": 10120 }, { "epoch": 2.554123467683484, "grad_norm": 6.726482391357422, "learning_rate": 3.288501600067017e-08, "logits/chosen": -1.147790551185608, "logits/rejected": -1.033911108970642, "logps/chosen": -314.71875, "logps/rejected": -351.0375061035156, "loss": 0.0989, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.8470826148986816, "rewards/margins": 6.75390625, "rewards/rejected": -9.600781440734863, "step": 10130 }, { "epoch": 2.5566445025683042, "grad_norm": 3.90889835357666, "learning_rate": 3.2522327028239456e-08, "logits/chosen": -1.206384301185608, "logits/rejected": -1.1241180896759033, "logps/chosen": -318.73126220703125, "logps/rejected": -356.17498779296875, "loss": 0.0238, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.2820801734924316, "rewards/margins": 6.616601467132568, "rewards/rejected": -9.895312309265137, "step": 10140 }, { "epoch": 2.5591655374531244, "grad_norm": 11.42145824432373, "learning_rate": 3.2161510019350524e-08, "logits/chosen": -1.1995360851287842, "logits/rejected": -1.1785399913787842, "logps/chosen": -326.63751220703125, "logps/rejected": -358.3500061035156, "loss": 0.0321, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8296661376953125, "rewards/margins": 6.55078125, "rewards/rejected": -9.386327743530273, "step": 10150 }, { "epoch": 2.561686572337945, "grad_norm": 20.189697265625, "learning_rate": 3.180256807977638e-08, "logits/chosen": -1.1577575206756592, "logits/rejected": -1.0860474109649658, "logps/chosen": -315.8500061035156, "logps/rejected": -338.70001220703125, "loss": 0.0495, "rewards/accuracies": 0.96875, "rewards/chosen": -3.1072998046875, "rewards/margins": 6.552343845367432, "rewards/rejected": -9.6611328125, "step": 10160 }, { "epoch": 2.564207607222765, "grad_norm": 87.76728057861328, "learning_rate": 3.144550429915027e-08, "logits/chosen": -1.1464354991912842, "logits/rejected": -1.103448510169983, "logps/chosen": -329.38751220703125, "logps/rejected": -378.9624938964844, "loss": 0.0394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.390063524246216, "rewards/margins": 6.573437690734863, "rewards/rejected": -9.958593368530273, "step": 10170 }, { "epoch": 2.5667286421075852, "grad_norm": 31.60454559326172, "learning_rate": 3.10903217509387e-08, "logits/chosen": -1.179846167564392, "logits/rejected": NaN, "logps/chosen": -355.6312561035156, "logps/rejected": -361.6625061035156, "loss": 0.0496, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.893359422683716, "rewards/margins": 6.592968940734863, "rewards/rejected": -9.485937118530273, "step": 10180 }, { "epoch": 2.5692496769924054, "grad_norm": 26.464574813842773, "learning_rate": 3.0737023492415606e-08, "logits/chosen": -1.1988189220428467, "logits/rejected": NaN, "logps/chosen": -325.1812438964844, "logps/rejected": -365.4375, "loss": 0.0378, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.072436571121216, "rewards/margins": 6.5830078125, "rewards/rejected": -9.656641006469727, "step": 10190 }, { "epoch": 2.5717707118772255, "grad_norm": 38.3802490234375, "learning_rate": 3.0385612564635346e-08, "logits/chosen": -1.1513671875, "logits/rejected": -1.091589331626892, "logps/chosen": -334.5, "logps/rejected": -370.57501220703125, "loss": 0.0525, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.2652831077575684, "rewards/margins": 7.131445407867432, "rewards/rejected": -10.394140243530273, "step": 10200 }, { "epoch": 2.5742917467620456, "grad_norm": 51.423492431640625, "learning_rate": 3.003609199240711e-08, "logits/chosen": NaN, "logits/rejected": -1.1724364757537842, "logps/chosen": -316.73126220703125, "logps/rejected": -353.6499938964844, "loss": 0.0499, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.3993163108825684, "rewards/margins": 6.147851467132568, "rewards/rejected": -9.5439453125, "step": 10210 }, { "epoch": 2.576812781646866, "grad_norm": 18.73659896850586, "learning_rate": 2.9688464784268563e-08, "logits/chosen": -1.090063452720642, "logits/rejected": -1.012841820716858, "logps/chosen": -332.8687438964844, "logps/rejected": -375.13751220703125, "loss": 0.0529, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.112597703933716, "rewards/margins": 6.841015815734863, "rewards/rejected": -9.948827743530273, "step": 10220 }, { "epoch": 2.5793338165316864, "grad_norm": 13.378907203674316, "learning_rate": 2.9342733932459923e-08, "logits/chosen": -1.169531226158142, "logits/rejected": -1.110009789466858, "logps/chosen": -306.5249938964844, "logps/rejected": -360.63751220703125, "loss": 0.0491, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4999022483825684, "rewards/margins": 6.477148532867432, "rewards/rejected": -9.979296684265137, "step": 10230 }, { "epoch": 2.5818548514165065, "grad_norm": 26.04082489013672, "learning_rate": 2.8998902412898514e-08, "logits/chosen": -1.1234649419784546, "logits/rejected": -1.0699462890625, "logps/chosen": -310.61248779296875, "logps/rejected": -369.5, "loss": 0.0407, "rewards/accuracies": 0.984375, "rewards/chosen": -3.35650634765625, "rewards/margins": 6.9501953125, "rewards/rejected": -10.307031631469727, "step": 10240 }, { "epoch": 2.5843758863013266, "grad_norm": 12.310823440551758, "learning_rate": 2.8656973185152754e-08, "logits/chosen": -1.1652100086212158, "logits/rejected": -1.130334496498108, "logps/chosen": -315.9312438964844, "logps/rejected": -350.1875, "loss": 0.0565, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.19012451171875, "rewards/margins": 6.421679496765137, "rewards/rejected": -9.615234375, "step": 10250 }, { "epoch": 2.586896921186147, "grad_norm": 19.390209197998047, "learning_rate": 2.831694919241695e-08, "logits/chosen": -1.107080101966858, "logits/rejected": -1.073034644126892, "logps/chosen": -318.32501220703125, "logps/rejected": -353.79998779296875, "loss": 0.0818, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.2092316150665283, "rewards/margins": 6.260351657867432, "rewards/rejected": -9.469531059265137, "step": 10260 }, { "epoch": 2.5894179560709674, "grad_norm": 63.577392578125, "learning_rate": 2.7978833361485933e-08, "logits/chosen": -1.2262694835662842, "logits/rejected": -1.148596167564392, "logps/chosen": -320.5375061035156, "logps/rejected": -353.3500061035156, "loss": 0.0444, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.0331053733825684, "rewards/margins": 6.652148246765137, "rewards/rejected": -9.686718940734863, "step": 10270 }, { "epoch": 2.5919389909557875, "grad_norm": 73.93006134033203, "learning_rate": 2.7642628602729758e-08, "logits/chosen": -1.1412231922149658, "logits/rejected": -1.0458862781524658, "logps/chosen": -324.2875061035156, "logps/rejected": -384.1875, "loss": 0.0736, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.447253465652466, "rewards/margins": 6.890038967132568, "rewards/rejected": -10.337890625, "step": 10280 }, { "epoch": 2.5944600258406076, "grad_norm": 46.664955139160156, "learning_rate": 2.7308337810068665e-08, "logits/chosen": -1.2644164562225342, "logits/rejected": -1.1243407726287842, "logps/chosen": -338.2124938964844, "logps/rejected": -364.8374938964844, "loss": 0.077, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.272927761077881, "rewards/margins": 6.66015625, "rewards/rejected": -9.934374809265137, "step": 10290 }, { "epoch": 2.596981060725428, "grad_norm": 3.487577438354492, "learning_rate": 2.6975963860948247e-08, "logits/chosen": -1.143798828125, "logits/rejected": -1.0145995616912842, "logps/chosen": -346.5062561035156, "logps/rejected": -377.5249938964844, "loss": 0.083, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.0484619140625, "rewards/margins": 6.5068359375, "rewards/rejected": -9.555078506469727, "step": 10300 }, { "epoch": 2.599502095610248, "grad_norm": 23.394622802734375, "learning_rate": 2.664550961631476e-08, "logits/chosen": -1.1463501453399658, "logits/rejected": -1.0346343517303467, "logps/chosen": -313.95001220703125, "logps/rejected": -355.1625061035156, "loss": 0.0514, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.3636717796325684, "rewards/margins": 6.446484565734863, "rewards/rejected": -9.816015243530273, "step": 10310 }, { "epoch": 2.602023130495068, "grad_norm": 12.553984642028809, "learning_rate": 2.6316977920590234e-08, "logits/chosen": -1.226318359375, "logits/rejected": -1.187158226966858, "logps/chosen": -336.13751220703125, "logps/rejected": -367.26251220703125, "loss": 0.0495, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.112927198410034, "rewards/margins": 7.0615234375, "rewards/rejected": -10.178515434265137, "step": 10320 }, { "epoch": 2.604544165379888, "grad_norm": 76.98346710205078, "learning_rate": 2.599037160164827e-08, "logits/chosen": -1.0966796875, "logits/rejected": -1.0803344249725342, "logps/chosen": -320.20623779296875, "logps/rejected": -362.2875061035156, "loss": 0.0383, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.385519504547119, "rewards/margins": 6.6201171875, "rewards/rejected": -10.006640434265137, "step": 10330 }, { "epoch": 2.607065200264709, "grad_norm": 28.021957397460938, "learning_rate": 2.5665693470789423e-08, "logits/chosen": -1.196313500404358, "logits/rejected": -1.0872313976287842, "logps/chosen": -307.57501220703125, "logps/rejected": -356.8062438964844, "loss": 0.053, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.856762647628784, "rewards/margins": 6.643164157867432, "rewards/rejected": -9.501172065734863, "step": 10340 }, { "epoch": 2.609586235149529, "grad_norm": 29.01761245727539, "learning_rate": 2.534294632271733e-08, "logits/chosen": -1.1695435047149658, "logits/rejected": -1.0758788585662842, "logps/chosen": -337.9624938964844, "logps/rejected": -361.7749938964844, "loss": 0.0463, "rewards/accuracies": 0.984375, "rewards/chosen": -2.8138794898986816, "rewards/margins": 6.504492282867432, "rewards/rejected": -9.318359375, "step": 10350 }, { "epoch": 2.612107270034349, "grad_norm": 27.207761764526367, "learning_rate": 2.5022132935514333e-08, "logits/chosen": -1.10498046875, "logits/rejected": -1.078881859779358, "logps/chosen": -298.73126220703125, "logps/rejected": -349.2124938964844, "loss": 0.05, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0404067039489746, "rewards/margins": 6.368554592132568, "rewards/rejected": -9.411230087280273, "step": 10360 }, { "epoch": 2.614628304919169, "grad_norm": 13.011737823486328, "learning_rate": 2.470325607061774e-08, "logits/chosen": -1.168432593345642, "logits/rejected": -1.0833740234375, "logps/chosen": -323.9375, "logps/rejected": -376.0375061035156, "loss": 0.0649, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.160961866378784, "rewards/margins": 6.526171684265137, "rewards/rejected": -9.688672065734863, "step": 10370 }, { "epoch": 2.61714933980399, "grad_norm": 18.515071868896484, "learning_rate": 2.4386318472796125e-08, "logits/chosen": -1.166589379310608, "logits/rejected": -1.118554711341858, "logps/chosen": -326.82501220703125, "logps/rejected": -360.48126220703125, "loss": 0.0548, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.373303174972534, "rewards/margins": 6.196093559265137, "rewards/rejected": -9.569531440734863, "step": 10380 }, { "epoch": 2.61967037468881, "grad_norm": 12.66776180267334, "learning_rate": 2.4071322870125475e-08, "logits/chosen": -1.167730689048767, "logits/rejected": -1.111474633216858, "logps/chosen": -313.63751220703125, "logps/rejected": -364.73748779296875, "loss": 0.0247, "rewards/accuracies": 0.996874988079071, "rewards/chosen": -3.2600951194763184, "rewards/margins": 6.756249904632568, "rewards/rejected": -10.015233993530273, "step": 10390 }, { "epoch": 2.62219140957363, "grad_norm": 8.786449432373047, "learning_rate": 2.3758271973965848e-08, "logits/chosen": -1.1691405773162842, "logits/rejected": -1.035882592201233, "logps/chosen": -325.86248779296875, "logps/rejected": -366.5874938964844, "loss": 0.0512, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.1101410388946533, "rewards/margins": 6.529296875, "rewards/rejected": -9.640625, "step": 10400 }, { "epoch": 2.62471244445845, "grad_norm": 33.089996337890625, "learning_rate": 2.344716847893813e-08, "logits/chosen": -1.0660278797149658, "logits/rejected": -1.1205322742462158, "logps/chosen": -307.7875061035156, "logps/rejected": -370.5687561035156, "loss": 0.0577, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.206298828125, "rewards/margins": 6.607812404632568, "rewards/rejected": -9.816015243530273, "step": 10410 }, { "epoch": 2.6272334793432703, "grad_norm": 15.56445598602295, "learning_rate": 2.313801506290064e-08, "logits/chosen": -1.1512877941131592, "logits/rejected": -1.077978491783142, "logps/chosen": -324.57501220703125, "logps/rejected": -328.9624938964844, "loss": 0.0665, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.110943555831909, "rewards/margins": 6.384863376617432, "rewards/rejected": -9.496874809265137, "step": 10420 }, { "epoch": 2.6297545142280905, "grad_norm": 42.54911804199219, "learning_rate": 2.283081438692619e-08, "logits/chosen": -1.1371886730194092, "logits/rejected": -1.137750267982483, "logps/chosen": -307.8500061035156, "logps/rejected": -346.57501220703125, "loss": 0.0498, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.9408812522888184, "rewards/margins": 6.560742378234863, "rewards/rejected": -9.499804496765137, "step": 10430 }, { "epoch": 2.6322755491129106, "grad_norm": 18.617380142211914, "learning_rate": 2.252556909527911e-08, "logits/chosen": -1.1597900390625, "logits/rejected": -1.0846679210662842, "logps/chosen": -323.3062438964844, "logps/rejected": -361.57501220703125, "loss": 0.0672, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.796948194503784, "rewards/margins": 6.628125190734863, "rewards/rejected": -9.42578125, "step": 10440 }, { "epoch": 2.634796583997731, "grad_norm": 50.39127731323242, "learning_rate": 2.222228181539268e-08, "logits/chosen": -1.1407592296600342, "logits/rejected": -1.1284911632537842, "logps/chosen": -326.5562438964844, "logps/rejected": -362.8374938964844, "loss": 0.038, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.072589159011841, "rewards/margins": 6.466796875, "rewards/rejected": -9.537890434265137, "step": 10450 }, { "epoch": 2.6373176188825513, "grad_norm": 5.374075412750244, "learning_rate": 2.1920955157846228e-08, "logits/chosen": -1.1473388671875, "logits/rejected": -1.0724060535430908, "logps/chosen": -325.5625, "logps/rejected": -357.25, "loss": 0.0349, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.348498582839966, "rewards/margins": 6.5615234375, "rewards/rejected": -9.907031059265137, "step": 10460 }, { "epoch": 2.6398386537673715, "grad_norm": 21.453832626342773, "learning_rate": 2.1621591716342926e-08, "logits/chosen": -1.152258276939392, "logits/rejected": -1.100378394126892, "logps/chosen": -319.2562561035156, "logps/rejected": -351.2124938964844, "loss": 0.0503, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1752991676330566, "rewards/margins": 6.2255859375, "rewards/rejected": -9.402539253234863, "step": 10470 }, { "epoch": 2.6423596886521916, "grad_norm": 37.59913635253906, "learning_rate": 2.1324194067687235e-08, "logits/chosen": -1.152099609375, "logits/rejected": -1.0808899402618408, "logps/chosen": -323.22186279296875, "logps/rejected": -369.36248779296875, "loss": 0.0493, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.341320753097534, "rewards/margins": 6.634375095367432, "rewards/rejected": -9.969335556030273, "step": 10480 }, { "epoch": 2.644880723537012, "grad_norm": 19.285566329956055, "learning_rate": 2.1028764771762906e-08, "logits/chosen": -1.062231421470642, "logits/rejected": NaN, "logps/chosen": -316.8500061035156, "logps/rejected": -361.6875, "loss": 0.0597, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.268933057785034, "rewards/margins": 6.443359375, "rewards/rejected": -9.713671684265137, "step": 10490 }, { "epoch": 2.6474017584218323, "grad_norm": 18.747461318969727, "learning_rate": 2.073530637151086e-08, "logits/chosen": -1.1163330078125, "logits/rejected": -1.0819275379180908, "logps/chosen": -316.48126220703125, "logps/rejected": -341.76251220703125, "loss": 0.0613, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.099609375, "rewards/margins": 6.423047065734863, "rewards/rejected": -9.522656440734863, "step": 10500 }, { "epoch": 2.6499227933066525, "grad_norm": 25.62896728515625, "learning_rate": 2.0443821392907208e-08, "logits/chosen": -1.1439392566680908, "logits/rejected": -1.0073730945587158, "logps/chosen": -298.2875061035156, "logps/rejected": -363.98126220703125, "loss": 0.0514, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.3671202659606934, "rewards/margins": 6.742578029632568, "rewards/rejected": -10.109766006469727, "step": 10510 }, { "epoch": 2.6524438281914726, "grad_norm": 20.20026206970215, "learning_rate": 2.0154312344941833e-08, "logits/chosen": -1.228845238685608, "logits/rejected": -1.0604064464569092, "logps/chosen": -314.32501220703125, "logps/rejected": -335.8500061035156, "loss": 0.0542, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.132769823074341, "rewards/margins": 6.710546970367432, "rewards/rejected": -9.845703125, "step": 10520 }, { "epoch": 2.6549648630762928, "grad_norm": 9.448894500732422, "learning_rate": 1.9866781719596355e-08, "logits/chosen": -1.099523901939392, "logits/rejected": -1.088903784751892, "logps/chosen": -312.1875, "logps/rejected": -362.42498779296875, "loss": 0.0396, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0770263671875, "rewards/margins": 6.994921684265137, "rewards/rejected": -10.071484565734863, "step": 10530 }, { "epoch": 2.657485897961113, "grad_norm": 32.552032470703125, "learning_rate": 1.9581231991823045e-08, "logits/chosen": -1.0786864757537842, "logits/rejected": -1.0769164562225342, "logps/chosen": -311.8812561035156, "logps/rejected": -366.1812438964844, "loss": 0.0687, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.353442430496216, "rewards/margins": 6.366601467132568, "rewards/rejected": -9.720312118530273, "step": 10540 }, { "epoch": 2.660006932845933, "grad_norm": 14.566728591918945, "learning_rate": 1.92976656195232e-08, "logits/chosen": -1.184838891029358, "logits/rejected": -1.0648651123046875, "logps/chosen": -336.51873779296875, "logps/rejected": -373.4125061035156, "loss": 0.0634, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.2521910667419434, "rewards/margins": 6.7919921875, "rewards/rejected": -10.043359756469727, "step": 10550 }, { "epoch": 2.6625279677307536, "grad_norm": 39.01841735839844, "learning_rate": 1.9016085043526446e-08, "logits/chosen": -1.210479736328125, "logits/rejected": -1.082800269126892, "logps/chosen": -334.5, "logps/rejected": -371.7250061035156, "loss": 0.0468, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.4530272483825684, "rewards/margins": 6.431250095367432, "rewards/rejected": -9.881250381469727, "step": 10560 }, { "epoch": 2.6650490026155738, "grad_norm": 8.452898025512695, "learning_rate": 1.8736492687569163e-08, "logits/chosen": -1.1693603992462158, "logits/rejected": -1.219018578529358, "logps/chosen": -338.8374938964844, "logps/rejected": -378.4125061035156, "loss": 0.0582, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.443838596343994, "rewards/margins": 6.309765815734863, "rewards/rejected": -9.75390625, "step": 10570 }, { "epoch": 2.667570037500394, "grad_norm": 38.938663482666016, "learning_rate": 1.8458890958273994e-08, "logits/chosen": -1.1510009765625, "logits/rejected": -1.090490698814392, "logps/chosen": -316.3374938964844, "logps/rejected": -365.26251220703125, "loss": 0.04, "rewards/accuracies": 0.984375, "rewards/chosen": -3.244241237640381, "rewards/margins": 6.348046779632568, "rewards/rejected": -9.593358993530273, "step": 10580 }, { "epoch": 2.670091072385214, "grad_norm": 15.141484260559082, "learning_rate": 1.818328224512916e-08, "logits/chosen": -1.1576049327850342, "logits/rejected": -1.1728026866912842, "logps/chosen": -320.125, "logps/rejected": -353.07501220703125, "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.539276123046875, "rewards/margins": 6.409375190734863, "rewards/rejected": -9.950390815734863, "step": 10590 }, { "epoch": 2.6726121072700346, "grad_norm": 16.160194396972656, "learning_rate": 1.790966892046758e-08, "logits/chosen": -1.137274146080017, "logits/rejected": -0.9729369878768921, "logps/chosen": -313.70001220703125, "logps/rejected": -372.79998779296875, "loss": 0.0601, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.016976833343506, "rewards/margins": 6.833788871765137, "rewards/rejected": -9.848437309265137, "step": 10600 }, { "epoch": 2.6751331421548548, "grad_norm": 40.86832046508789, "learning_rate": 1.7638053339446818e-08, "logits/chosen": -1.179101586341858, "logits/rejected": -1.145532250404358, "logps/chosen": -313.7250061035156, "logps/rejected": -358.32501220703125, "loss": 0.0633, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.262402296066284, "rewards/margins": 6.678515434265137, "rewards/rejected": -9.939844131469727, "step": 10610 }, { "epoch": 2.677654177039675, "grad_norm": 23.61850357055664, "learning_rate": 1.736843784002848e-08, "logits/chosen": -1.1197693347930908, "logits/rejected": -1.0531005859375, "logps/chosen": -300.6812438964844, "logps/rejected": -356.9375, "loss": 0.0474, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.128833055496216, "rewards/margins": 6.652905464172363, "rewards/rejected": -9.782422065734863, "step": 10620 }, { "epoch": 2.680175211924495, "grad_norm": 9.803144454956055, "learning_rate": 1.7100824742958375e-08, "logits/chosen": -1.1581542491912842, "logits/rejected": -1.02667236328125, "logps/chosen": -330.1499938964844, "logps/rejected": -390.7250061035156, "loss": 0.0409, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.4138426780700684, "rewards/margins": 6.759765625, "rewards/rejected": -10.170312881469727, "step": 10630 }, { "epoch": 2.682696246809315, "grad_norm": 4.958293914794922, "learning_rate": 1.683521635174631e-08, "logits/chosen": -1.120001196861267, "logits/rejected": -0.98406982421875, "logps/chosen": -306.0625, "logps/rejected": -363.2562561035156, "loss": 0.0802, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.177197217941284, "rewards/margins": 6.23828125, "rewards/rejected": -9.4130859375, "step": 10640 }, { "epoch": 2.6852172816941353, "grad_norm": 27.18450164794922, "learning_rate": 1.657161495264639e-08, "logits/chosen": -1.151159644126892, "logits/rejected": -1.1621825695037842, "logps/chosen": -300.67498779296875, "logps/rejected": -341.86248779296875, "loss": 0.0531, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.03460693359375, "rewards/margins": 6.3662109375, "rewards/rejected": -9.396484375, "step": 10650 }, { "epoch": 2.6877383165789555, "grad_norm": 9.695204734802246, "learning_rate": 1.6310022814637364e-08, "logits/chosen": -1.13702392578125, "logits/rejected": -1.073846459388733, "logps/chosen": -296.03125, "logps/rejected": -350.1875, "loss": 0.0464, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.7076613903045654, "rewards/margins": 6.625586032867432, "rewards/rejected": -9.333593368530273, "step": 10660 }, { "epoch": 2.690259351463776, "grad_norm": 6.534129619598389, "learning_rate": 1.605044218940299e-08, "logits/chosen": -1.184533715248108, "logits/rejected": -1.124597191810608, "logps/chosen": -311.46875, "logps/rejected": -366.7124938964844, "loss": 0.0319, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.9455809593200684, "rewards/margins": 6.4033203125, "rewards/rejected": -9.347265243530273, "step": 10670 }, { "epoch": 2.692780386348596, "grad_norm": 8.636625289916992, "learning_rate": 1.579287531131268e-08, "logits/chosen": -1.233251929283142, "logits/rejected": -1.165441870689392, "logps/chosen": -310.3500061035156, "logps/rejected": -381.42498779296875, "loss": 0.0326, "rewards/accuracies": 0.984375, "rewards/chosen": -2.940380811691284, "rewards/margins": 6.638867378234863, "rewards/rejected": -9.581640243530273, "step": 10680 }, { "epoch": 2.6953014212334163, "grad_norm": 15.20366382598877, "learning_rate": 1.553732439740227e-08, "logits/chosen": -1.180090308189392, "logits/rejected": -1.1100342273712158, "logps/chosen": -318.73126220703125, "logps/rejected": -352.4624938964844, "loss": 0.0492, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.214587450027466, "rewards/margins": 6.668554782867432, "rewards/rejected": -9.882031440734863, "step": 10690 }, { "epoch": 2.6978224561182365, "grad_norm": 12.776803016662598, "learning_rate": 1.5283791647355133e-08, "logits/chosen": -1.152380347251892, "logits/rejected": -1.084497094154358, "logps/chosen": -318.95001220703125, "logps/rejected": -363.7250061035156, "loss": 0.0544, "rewards/accuracies": 0.984375, "rewards/chosen": -3.245312452316284, "rewards/margins": 6.268750190734863, "rewards/rejected": -9.516016006469727, "step": 10700 }, { "epoch": 2.7003434910030566, "grad_norm": 11.677002906799316, "learning_rate": 1.503227924348288e-08, "logits/chosen": -1.174353003501892, "logits/rejected": -1.10626220703125, "logps/chosen": -320.8500061035156, "logps/rejected": -355.20001220703125, "loss": 0.0653, "rewards/accuracies": 0.96875, "rewards/chosen": -3.3708252906799316, "rewards/margins": 6.367968559265137, "rewards/rejected": -9.73828125, "step": 10710 }, { "epoch": 2.702864525887877, "grad_norm": 11.679018020629883, "learning_rate": 1.4782789350706759e-08, "logits/chosen": -1.1708984375, "logits/rejected": -0.9654906988143921, "logps/chosen": -310.29376220703125, "logps/rejected": -356.9125061035156, "loss": 0.0342, "rewards/accuracies": 0.984375, "rewards/chosen": -3.1154541969299316, "rewards/margins": 6.944531440734863, "rewards/rejected": -10.057031631469727, "step": 10720 }, { "epoch": 2.7053855607726973, "grad_norm": 8.325181007385254, "learning_rate": 1.4535324116539238e-08, "logits/chosen": -1.105200171470642, "logits/rejected": -1.017724633216858, "logps/chosen": -283.5375061035156, "logps/rejected": -345.45001220703125, "loss": 0.078, "rewards/accuracies": 0.9593750238418579, "rewards/chosen": -3.199658155441284, "rewards/margins": 6.387499809265137, "rewards/rejected": -9.585156440734863, "step": 10730 }, { "epoch": 2.7079065956575175, "grad_norm": 15.453259468078613, "learning_rate": 1.4289885671065011e-08, "logits/chosen": -1.168847680091858, "logits/rejected": -0.9189468622207642, "logps/chosen": -311.79376220703125, "logps/rejected": -339.51251220703125, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.95849609375, "rewards/margins": 6.334765434265137, "rewards/rejected": -9.292577743530273, "step": 10740 }, { "epoch": 2.7104276305423376, "grad_norm": 4.290037631988525, "learning_rate": 1.404647612692328e-08, "logits/chosen": -1.0586121082305908, "logits/rejected": -1.096710205078125, "logps/chosen": -323.1875, "logps/rejected": -374.38751220703125, "loss": 0.0351, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.5386962890625, "rewards/margins": 6.713281154632568, "rewards/rejected": -10.253710746765137, "step": 10750 }, { "epoch": 2.7129486654271577, "grad_norm": 47.159427642822266, "learning_rate": 1.3805097579288938e-08, "logits/chosen": -1.122076392173767, "logits/rejected": -1.0566527843475342, "logps/chosen": -327.5562438964844, "logps/rejected": -371.1000061035156, "loss": 0.0699, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2559571266174316, "rewards/margins": 6.468554496765137, "rewards/rejected": -9.725000381469727, "step": 10760 }, { "epoch": 2.715469700311978, "grad_norm": 3.60469913482666, "learning_rate": 1.3565752105855088e-08, "logits/chosen": -1.1114838123321533, "logits/rejected": -1.087731957435608, "logps/chosen": -299.10626220703125, "logps/rejected": -345.3062438964844, "loss": 0.0464, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.2244811058044434, "rewards/margins": 6.491796970367432, "rewards/rejected": -9.717187881469727, "step": 10770 }, { "epoch": 2.717990735196798, "grad_norm": 21.518774032592773, "learning_rate": 1.332844176681483e-08, "logits/chosen": -1.07366943359375, "logits/rejected": -1.0236937999725342, "logps/chosen": -293.8999938964844, "logps/rejected": -327.8374938964844, "loss": 0.0511, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.255139112472534, "rewards/margins": 6.462500095367432, "rewards/rejected": -9.717968940734863, "step": 10780 }, { "epoch": 2.7205117700816186, "grad_norm": 32.8369255065918, "learning_rate": 1.3093168604843524e-08, "logits/chosen": -1.0945556163787842, "logits/rejected": -1.083715796470642, "logps/chosen": -321.21875, "logps/rejected": -366.17498779296875, "loss": 0.0536, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.238452196121216, "rewards/margins": 6.838086128234863, "rewards/rejected": -10.075390815734863, "step": 10790 }, { "epoch": 2.7230328049664387, "grad_norm": 50.40583038330078, "learning_rate": 1.2859934645081477e-08, "logits/chosen": -1.1863524913787842, "logits/rejected": -1.061547875404358, "logps/chosen": -329.57501220703125, "logps/rejected": -377.45001220703125, "loss": 0.0345, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.126269578933716, "rewards/margins": 6.931836128234863, "rewards/rejected": -10.059374809265137, "step": 10800 }, { "epoch": 2.725553839851259, "grad_norm": 7.012859344482422, "learning_rate": 1.2628741895116174e-08, "logits/chosen": -1.117730736732483, "logits/rejected": -1.086175560951233, "logps/chosen": -307.26873779296875, "logps/rejected": -356.2250061035156, "loss": 0.0732, "rewards/accuracies": 0.96875, "rewards/chosen": -3.193988084793091, "rewards/margins": 6.300976753234863, "rewards/rejected": -9.491406440734863, "step": 10810 }, { "epoch": 2.728074874736079, "grad_norm": 8.012761116027832, "learning_rate": 1.2399592344965293e-08, "logits/chosen": -1.132226586341858, "logits/rejected": -0.9681762456893921, "logps/chosen": -318.4125061035156, "logps/rejected": -343.4125061035156, "loss": 0.0429, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9146485328674316, "rewards/margins": 6.807226657867432, "rewards/rejected": -9.723828315734863, "step": 10820 }, { "epoch": 2.7305959096208996, "grad_norm": 26.902252197265625, "learning_rate": 1.2172487967059276e-08, "logits/chosen": -1.152307152748108, "logits/rejected": -1.112207055091858, "logps/chosen": -313.76873779296875, "logps/rejected": -361.76251220703125, "loss": 0.0757, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.9421753883361816, "rewards/margins": 6.5927734375, "rewards/rejected": -9.533984184265137, "step": 10830 }, { "epoch": 2.7331169445057197, "grad_norm": 7.114460468292236, "learning_rate": 1.1947430716224727e-08, "logits/chosen": -1.0782287120819092, "logits/rejected": -1.0315673351287842, "logps/chosen": -307.73748779296875, "logps/rejected": -346.0874938964844, "loss": 0.0502, "rewards/accuracies": 0.984375, "rewards/chosen": -3.268994092941284, "rewards/margins": 6.445898532867432, "rewards/rejected": -9.717187881469727, "step": 10840 }, { "epoch": 2.73563797939054, "grad_norm": 20.74818229675293, "learning_rate": 1.1724422529667182e-08, "logits/chosen": -1.0840942859649658, "logits/rejected": -1.007910132408142, "logps/chosen": -293.20623779296875, "logps/rejected": -352.23748779296875, "loss": 0.0532, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.3934082984924316, "rewards/margins": 6.584374904632568, "rewards/rejected": -9.978124618530273, "step": 10850 }, { "epoch": 2.73815901427536, "grad_norm": 40.46851348876953, "learning_rate": 1.1503465326954703e-08, "logits/chosen": -1.159704566001892, "logits/rejected": -0.997851550579071, "logps/chosen": -313.76873779296875, "logps/rejected": -348.9750061035156, "loss": 0.0844, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.4705443382263184, "rewards/margins": 6.450976371765137, "rewards/rejected": -9.922656059265137, "step": 10860 }, { "epoch": 2.74068004916018, "grad_norm": 5.118013858795166, "learning_rate": 1.1284561010001304e-08, "logits/chosen": -1.2329833507537842, "logits/rejected": -1.178552269935608, "logps/chosen": -343.4375, "logps/rejected": -379.29998779296875, "loss": 0.0801, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.0774383544921875, "rewards/margins": 6.67529296875, "rewards/rejected": -9.753125190734863, "step": 10870 }, { "epoch": 2.7432010840450003, "grad_norm": 13.006446838378906, "learning_rate": 1.1067711463050495e-08, "logits/chosen": -1.137304663658142, "logits/rejected": -1.071325659751892, "logps/chosen": -321.7875061035156, "logps/rejected": -377.36248779296875, "loss": 0.0602, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.237518310546875, "rewards/margins": 6.621679782867432, "rewards/rejected": -9.864062309265137, "step": 10880 }, { "epoch": 2.7457221189298204, "grad_norm": 52.72837448120117, "learning_rate": 1.0852918552659185e-08, "logits/chosen": -1.148718237876892, "logits/rejected": -1.0368530750274658, "logps/chosen": -291.86248779296875, "logps/rejected": -337.51251220703125, "loss": 0.0696, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.13336181640625, "rewards/margins": 6.482226371765137, "rewards/rejected": -9.615625381469727, "step": 10890 }, { "epoch": 2.748243153814641, "grad_norm": 18.06778907775879, "learning_rate": 1.0640184127681472e-08, "logits/chosen": -1.176794409751892, "logits/rejected": -1.083398461341858, "logps/chosen": -315.45001220703125, "logps/rejected": -359.98748779296875, "loss": 0.0426, "rewards/accuracies": 0.984375, "rewards/chosen": -2.621978759765625, "rewards/margins": 6.8486328125, "rewards/rejected": -9.466992378234863, "step": 10900 }, { "epoch": 2.750764188699461, "grad_norm": 40.34992980957031, "learning_rate": 1.0429510019252936e-08, "logits/chosen": -1.1633422374725342, "logits/rejected": -1.1007201671600342, "logps/chosen": -301.2562561035156, "logps/rejected": -345.3374938964844, "loss": 0.0608, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.822552442550659, "rewards/margins": 6.488671779632568, "rewards/rejected": -9.312108993530273, "step": 10910 }, { "epoch": 2.7532852235842813, "grad_norm": 5.839588642120361, "learning_rate": 1.0220898040774611e-08, "logits/chosen": -1.0826934576034546, "logits/rejected": -1.069067358970642, "logps/chosen": -341.7250061035156, "logps/rejected": -378.5375061035156, "loss": 0.0224, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.054443359375, "rewards/margins": 7.171288967132568, "rewards/rejected": -10.229296684265137, "step": 10920 }, { "epoch": 2.7558062584691014, "grad_norm": 22.24617576599121, "learning_rate": 1.0014349987897575e-08, "logits/chosen": -1.0678832530975342, "logits/rejected": -1.052160620689392, "logps/chosen": -307.20623779296875, "logps/rejected": -376.76251220703125, "loss": 0.0593, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.235089063644409, "rewards/margins": 6.379296779632568, "rewards/rejected": -9.615234375, "step": 10930 }, { "epoch": 2.758327293353922, "grad_norm": 4.8370442390441895, "learning_rate": 9.809867638507468e-09, "logits/chosen": -1.155603051185608, "logits/rejected": -1.070288062095642, "logps/chosen": -294.7124938964844, "logps/rejected": -337.70001220703125, "loss": 0.0381, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.126147508621216, "rewards/margins": 6.896093845367432, "rewards/rejected": -10.023046493530273, "step": 10940 }, { "epoch": 2.760848328238742, "grad_norm": 12.15842342376709, "learning_rate": 9.607452752709105e-09, "logits/chosen": -1.1739990711212158, "logits/rejected": -1.1599915027618408, "logps/chosen": -343.6312561035156, "logps/rejected": -389.2124938964844, "loss": 0.0405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8974242210388184, "rewards/margins": 6.529687404632568, "rewards/rejected": -9.421875, "step": 10950 }, { "epoch": 2.7633693631235623, "grad_norm": 36.01392364501953, "learning_rate": 9.407107072811393e-09, "logits/chosen": -1.1204345226287842, "logits/rejected": -1.054931640625, "logps/chosen": -316.04376220703125, "logps/rejected": -364.5, "loss": 0.0616, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.126208543777466, "rewards/margins": 6.627539157867432, "rewards/rejected": -9.755078315734863, "step": 10960 }, { "epoch": 2.7658903980083824, "grad_norm": 18.595779418945312, "learning_rate": 9.208832323312293e-09, "logits/chosen": -1.209130883216858, "logits/rejected": -1.159936547279358, "logps/chosen": -308.2749938964844, "logps/rejected": -352.2875061035156, "loss": 0.0421, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1209716796875, "rewards/margins": 6.792773246765137, "rewards/rejected": -9.911328315734863, "step": 10970 }, { "epoch": 2.7684114328932026, "grad_norm": 51.25161361694336, "learning_rate": 9.012630210884053e-09, "logits/chosen": -1.1571776866912842, "logits/rejected": -1.007531762123108, "logps/chosen": -299.95001220703125, "logps/rejected": -337.2124938964844, "loss": 0.0821, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.107006788253784, "rewards/margins": 6.533984184265137, "rewards/rejected": -9.644922256469727, "step": 10980 }, { "epoch": 2.7709324677780227, "grad_norm": 22.487014770507812, "learning_rate": 8.818502424358442e-09, "logits/chosen": -1.1804687976837158, "logits/rejected": -1.07830810546875, "logps/chosen": -316.34375, "logps/rejected": -361.2250061035156, "loss": 0.0442, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.305590867996216, "rewards/margins": 6.674218654632568, "rewards/rejected": -9.978515625, "step": 10990 }, { "epoch": 2.773453502662843, "grad_norm": 14.883648872375488, "learning_rate": 8.62645063471218e-09, "logits/chosen": -1.1775391101837158, "logits/rejected": -1.16473388671875, "logps/chosen": -335.1625061035156, "logps/rejected": -365.0, "loss": 0.0805, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.346142530441284, "rewards/margins": 6.494921684265137, "rewards/rejected": -9.839648246765137, "step": 11000 }, { "epoch": 2.7759745375476634, "grad_norm": 4.850130081176758, "learning_rate": 8.43647649505269e-09, "logits/chosen": -1.093542456626892, "logits/rejected": -1.0819518566131592, "logps/chosen": -308.9750061035156, "logps/rejected": -364.6000061035156, "loss": 0.0385, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1682372093200684, "rewards/margins": 6.779589653015137, "rewards/rejected": -9.9453125, "step": 11010 }, { "epoch": 2.7784955724324836, "grad_norm": 4.473644733428955, "learning_rate": 8.248581640603741e-09, "logits/chosen": -1.1863892078399658, "logits/rejected": -1.0742676258087158, "logps/chosen": -320.23748779296875, "logps/rejected": -358.23748779296875, "loss": 0.071, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.234020948410034, "rewards/margins": 6.4443359375, "rewards/rejected": -9.680468559265137, "step": 11020 }, { "epoch": 2.7810166073173037, "grad_norm": 4.190011024475098, "learning_rate": 8.062767688691463e-09, "logits/chosen": -1.12969970703125, "logits/rejected": -1.104284644126892, "logps/chosen": -309.92498779296875, "logps/rejected": -369.4624938964844, "loss": 0.0418, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.3687682151794434, "rewards/margins": 6.503222465515137, "rewards/rejected": -9.873046875, "step": 11030 }, { "epoch": 2.783537642202124, "grad_norm": 26.84963607788086, "learning_rate": 7.879036238730319e-09, "logits/chosen": -1.153222680091858, "logits/rejected": -1.0390503406524658, "logps/chosen": -329.21875, "logps/rejected": -362.76873779296875, "loss": 0.0579, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3701539039611816, "rewards/margins": 6.40478515625, "rewards/rejected": -9.776562690734863, "step": 11040 }, { "epoch": 2.7860586770869444, "grad_norm": 9.767061233520508, "learning_rate": 7.697388872209498e-09, "logits/chosen": -1.1641967296600342, "logits/rejected": -1.1090209484100342, "logps/chosen": -301.60626220703125, "logps/rejected": -350.04998779296875, "loss": 0.0605, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3002867698669434, "rewards/margins": 6.400097846984863, "rewards/rejected": -9.69921875, "step": 11050 }, { "epoch": 2.7885797119717646, "grad_norm": 14.56065559387207, "learning_rate": 7.517827152679096e-09, "logits/chosen": -1.053826928138733, "logits/rejected": NaN, "logps/chosen": -304.91876220703125, "logps/rejected": -358.9624938964844, "loss": 0.0569, "rewards/accuracies": 0.984375, "rewards/chosen": -2.956677198410034, "rewards/margins": 6.554883003234863, "rewards/rejected": -9.513671875, "step": 11060 }, { "epoch": 2.7911007468565847, "grad_norm": 25.90925407409668, "learning_rate": 7.34035262573679e-09, "logits/chosen": -1.037420630455017, "logits/rejected": NaN, "logps/chosen": -320.1499938964844, "logps/rejected": -370.57501220703125, "loss": 0.0611, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.003216505050659, "rewards/margins": 6.718847751617432, "rewards/rejected": -9.721484184265137, "step": 11070 }, { "epoch": 2.793621781741405, "grad_norm": 12.51234245300293, "learning_rate": 7.164966819014628e-09, "logits/chosen": -1.05120849609375, "logits/rejected": -0.9507385492324829, "logps/chosen": -318.5, "logps/rejected": -362.82501220703125, "loss": 0.0728, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.9307618141174316, "rewards/margins": 6.7783203125, "rewards/rejected": -9.710156440734863, "step": 11080 }, { "epoch": 2.796142816626225, "grad_norm": 6.220926284790039, "learning_rate": 6.991671242165625e-09, "logits/chosen": -1.0963134765625, "logits/rejected": -0.9919189214706421, "logps/chosen": -319.6875, "logps/rejected": -366.61248779296875, "loss": 0.0661, "rewards/accuracies": 0.96875, "rewards/chosen": -3.267333984375, "rewards/margins": 6.716406345367432, "rewards/rejected": -9.983202934265137, "step": 11090 }, { "epoch": 2.798663851511045, "grad_norm": 10.663161277770996, "learning_rate": 6.820467386850964e-09, "logits/chosen": -1.1235229969024658, "logits/rejected": -1.0130188465118408, "logps/chosen": -310.10626220703125, "logps/rejected": -365.3125, "loss": 0.0469, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.08074951171875, "rewards/margins": 6.567773342132568, "rewards/rejected": -9.647265434265137, "step": 11100 }, { "epoch": 2.8011848863958653, "grad_norm": 5.685007572174072, "learning_rate": 6.651356726727064e-09, "logits/chosen": -1.0920593738555908, "logits/rejected": -1.085046410560608, "logps/chosen": -321.2437438964844, "logps/rejected": -360.26873779296875, "loss": 0.0377, "rewards/accuracies": 0.984375, "rewards/chosen": -3.092785596847534, "rewards/margins": 6.755859375, "rewards/rejected": -9.852343559265137, "step": 11110 }, { "epoch": 2.803705921280686, "grad_norm": 26.812538146972656, "learning_rate": 6.4843407174330065e-09, "logits/chosen": -1.1303924322128296, "logits/rejected": -1.0161926746368408, "logps/chosen": -321.63751220703125, "logps/rejected": -350.48126220703125, "loss": 0.0483, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.475689649581909, "rewards/margins": 6.286328315734863, "rewards/rejected": -9.763671875, "step": 11120 }, { "epoch": 2.806226956165506, "grad_norm": 14.367386817932129, "learning_rate": 6.319420796577879e-09, "logits/chosen": -1.116998314857483, "logits/rejected": -0.963482677936554, "logps/chosen": -314.5562438964844, "logps/rejected": -325.3999938964844, "loss": 0.0349, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.049877882003784, "rewards/margins": 6.791015625, "rewards/rejected": -9.841796875, "step": 11130 }, { "epoch": 2.808747991050326, "grad_norm": 37.2702522277832, "learning_rate": 6.156598383728451e-09, "logits/chosen": -1.148046851158142, "logits/rejected": -1.056848168373108, "logps/chosen": -334.6499938964844, "logps/rejected": -371.125, "loss": 0.0476, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.0511107444763184, "rewards/margins": 6.743554592132568, "rewards/rejected": -9.795702934265137, "step": 11140 }, { "epoch": 2.8112690259351463, "grad_norm": 35.44508361816406, "learning_rate": 5.995874880396962e-09, "logits/chosen": -1.1152832508087158, "logits/rejected": -0.964923083782196, "logps/chosen": -315.0062561035156, "logps/rejected": -358.625, "loss": 0.0626, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.4000244140625, "rewards/margins": 6.755273342132568, "rewards/rejected": -10.154687881469727, "step": 11150 }, { "epoch": 2.813790060819967, "grad_norm": 11.824356079101562, "learning_rate": 5.83725167002902e-09, "logits/chosen": -1.146704077720642, "logits/rejected": -1.1065673828125, "logps/chosen": -307.8999938964844, "logps/rejected": -359.7875061035156, "loss": 0.0314, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.1994690895080566, "rewards/margins": 6.810351371765137, "rewards/rejected": -10.010937690734863, "step": 11160 }, { "epoch": 2.816311095704787, "grad_norm": 8.701419830322266, "learning_rate": 5.680730117991833e-09, "logits/chosen": -1.1065521240234375, "logits/rejected": -1.0941650867462158, "logps/chosen": -317.51251220703125, "logps/rejected": -370.5874938964844, "loss": 0.0426, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.38165283203125, "rewards/margins": 6.740429878234863, "rewards/rejected": -10.119140625, "step": 11170 }, { "epoch": 2.818832130589607, "grad_norm": 7.306780815124512, "learning_rate": 5.5263115715621925e-09, "logits/chosen": -1.107873558998108, "logits/rejected": -1.008874535560608, "logps/chosen": -343.46875, "logps/rejected": -359.0249938964844, "loss": 0.0392, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1722412109375, "rewards/margins": 6.995019435882568, "rewards/rejected": -10.165624618530273, "step": 11180 }, { "epoch": 2.8213531654744273, "grad_norm": 6.022578716278076, "learning_rate": 5.373997359915172e-09, "logits/chosen": -1.1570250988006592, "logits/rejected": -1.118261694908142, "logps/chosen": -327.78125, "logps/rejected": -365.73748779296875, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9304442405700684, "rewards/margins": 7.023633003234863, "rewards/rejected": -9.959765434265137, "step": 11190 }, { "epoch": 2.8238742003592474, "grad_norm": 42.67591857910156, "learning_rate": 5.223788794112449e-09, "logits/chosen": -1.149041771888733, "logits/rejected": -1.1051025390625, "logps/chosen": -324.1937561035156, "logps/rejected": -339.5375061035156, "loss": 0.0517, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.979199171066284, "rewards/margins": 6.374218940734863, "rewards/rejected": -9.354687690734863, "step": 11200 }, { "epoch": 2.8263952352440675, "grad_norm": 41.82542419433594, "learning_rate": 5.075687167091169e-09, "logits/chosen": -1.199365258216858, "logits/rejected": -1.0837891101837158, "logps/chosen": -307.8062438964844, "logps/rejected": -345.5, "loss": 0.0501, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.337634325027466, "rewards/margins": 6.540625095367432, "rewards/rejected": -9.876953125, "step": 11210 }, { "epoch": 2.8289162701288877, "grad_norm": 34.202144622802734, "learning_rate": 4.9296937536527635e-09, "logits/chosen": -1.106774926185608, "logits/rejected": -1.017059326171875, "logps/chosen": -291.04998779296875, "logps/rejected": -336.13751220703125, "loss": 0.0554, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.25830078125, "rewards/margins": 6.3466796875, "rewards/rejected": -9.604687690734863, "step": 11220 }, { "epoch": 2.8314373050137083, "grad_norm": 5.910395622253418, "learning_rate": 4.785809810451958e-09, "logits/chosen": -1.172827124595642, "logits/rejected": -1.027063012123108, "logps/chosen": -322.0062561035156, "logps/rejected": -366.8374938964844, "loss": 0.0392, "rewards/accuracies": 0.984375, "rewards/chosen": -2.98138427734375, "rewards/margins": 6.910937309265137, "rewards/rejected": -9.889452934265137, "step": 11230 }, { "epoch": 2.8339583398985284, "grad_norm": 8.160270690917969, "learning_rate": 4.644036575985999e-09, "logits/chosen": -1.1519683599472046, "logits/rejected": -1.0647369623184204, "logps/chosen": -320.91876220703125, "logps/rejected": -370.6000061035156, "loss": 0.0506, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.365161180496216, "rewards/margins": 6.4072265625, "rewards/rejected": -9.774609565734863, "step": 11240 }, { "epoch": 2.8364793747833486, "grad_norm": 8.186322212219238, "learning_rate": 4.504375270583921e-09, "logits/chosen": -1.1119384765625, "logits/rejected": -1.0484497547149658, "logps/chosen": -327.3374938964844, "logps/rejected": -388.8999938964844, "loss": 0.03, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.942028760910034, "rewards/margins": 6.8125, "rewards/rejected": -9.753515243530273, "step": 11250 }, { "epoch": 2.8390004096681687, "grad_norm": 10.144037246704102, "learning_rate": 4.366827096396131e-09, "logits/chosen": -1.1428711414337158, "logits/rejected": -1.105322241783142, "logps/chosen": -320.90625, "logps/rejected": -377.32501220703125, "loss": 0.0424, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.176776170730591, "rewards/margins": 6.537695407867432, "rewards/rejected": -9.714452743530273, "step": 11260 }, { "epoch": 2.8415214445529893, "grad_norm": 7.901134490966797, "learning_rate": 4.231393237384057e-09, "logits/chosen": -1.158837914466858, "logits/rejected": -1.12847900390625, "logps/chosen": -296.9781188964844, "logps/rejected": -359.51251220703125, "loss": 0.0498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.226635694503784, "rewards/margins": 6.946679592132568, "rewards/rejected": -10.176953315734863, "step": 11270 }, { "epoch": 2.8440424794378094, "grad_norm": 12.296640396118164, "learning_rate": 4.098074859309825e-09, "logits/chosen": -1.1344115734100342, "logits/rejected": -1.0700805187225342, "logps/chosen": -303.98126220703125, "logps/rejected": -351.42498779296875, "loss": 0.0402, "rewards/accuracies": 0.984375, "rewards/chosen": -3.12567138671875, "rewards/margins": 6.556836128234863, "rewards/rejected": -9.681249618530273, "step": 11280 }, { "epoch": 2.8465635143226296, "grad_norm": 35.342552185058594, "learning_rate": 3.9668731097264315e-09, "logits/chosen": -1.194067358970642, "logits/rejected": -1.0934569835662842, "logps/chosen": -304.54376220703125, "logps/rejected": -355.8500061035156, "loss": 0.0369, "rewards/accuracies": 0.984375, "rewards/chosen": -2.982250928878784, "rewards/margins": 6.919921875, "rewards/rejected": -9.901952743530273, "step": 11290 }, { "epoch": 2.8490845492074497, "grad_norm": 17.08281707763672, "learning_rate": 3.837789117967643e-09, "logits/chosen": -1.2255370616912842, "logits/rejected": -1.1570312976837158, "logps/chosen": -335.9750061035156, "logps/rejected": -362.5874938964844, "loss": 0.0454, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.254565477371216, "rewards/margins": 6.574609279632568, "rewards/rejected": -9.8251953125, "step": 11300 }, { "epoch": 2.85160558409227, "grad_norm": 5.149807453155518, "learning_rate": 3.7108239951385014e-09, "logits/chosen": -1.203222632408142, "logits/rejected": -1.130133032798767, "logps/chosen": -315.8500061035156, "logps/rejected": -349.1000061035156, "loss": 0.0512, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.07989501953125, "rewards/margins": 6.508203029632568, "rewards/rejected": -9.590624809265137, "step": 11310 }, { "epoch": 2.85412661897709, "grad_norm": 11.808905601501465, "learning_rate": 3.585978834105524e-09, "logits/chosen": -1.2020752429962158, "logits/rejected": -1.12860107421875, "logps/chosen": -344.4125061035156, "logps/rejected": -343.3374938964844, "loss": 0.0456, "rewards/accuracies": 0.984375, "rewards/chosen": -3.5422699451446533, "rewards/margins": 6.391015529632568, "rewards/rejected": -9.932812690734863, "step": 11320 }, { "epoch": 2.85664765386191, "grad_norm": 51.579833984375, "learning_rate": 3.463254709487551e-09, "logits/chosen": -1.134191870689392, "logits/rejected": -1.115087866783142, "logps/chosen": -338.9375, "logps/rejected": -377.70001220703125, "loss": 0.0509, "rewards/accuracies": 0.96875, "rewards/chosen": -3.247082471847534, "rewards/margins": 6.463281154632568, "rewards/rejected": -9.707616806030273, "step": 11330 }, { "epoch": 2.8591686887467307, "grad_norm": 32.98530578613281, "learning_rate": 3.342652677646246e-09, "logits/chosen": -1.137359619140625, "logits/rejected": -1.135321021080017, "logps/chosen": -298.20623779296875, "logps/rejected": -338.25, "loss": 0.0641, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.1140379905700684, "rewards/margins": 6.370995998382568, "rewards/rejected": -9.483789443969727, "step": 11340 }, { "epoch": 2.861689723631551, "grad_norm": 13.890373229980469, "learning_rate": 3.2241737766771637e-09, "logits/chosen": -1.16510009765625, "logits/rejected": -1.0728271007537842, "logps/chosen": -309.79998779296875, "logps/rejected": -363.6499938964844, "loss": 0.0382, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.141650438308716, "rewards/margins": 6.865820407867432, "rewards/rejected": -10.006250381469727, "step": 11350 }, { "epoch": 2.864210758516371, "grad_norm": 6.062414169311523, "learning_rate": 3.1078190264008376e-09, "logits/chosen": -1.18145751953125, "logits/rejected": -1.05517578125, "logps/chosen": -330.86248779296875, "logps/rejected": -350.13751220703125, "loss": 0.0365, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.515551805496216, "rewards/margins": 6.795117378234863, "rewards/rejected": -10.306640625, "step": 11360 }, { "epoch": 2.866731793401191, "grad_norm": 21.793306350708008, "learning_rate": 2.9935894283538154e-09, "logits/chosen": -1.05755615234375, "logits/rejected": -1.0051085948944092, "logps/chosen": -302.0, "logps/rejected": -359.3374938964844, "loss": 0.0476, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.3633484840393066, "rewards/margins": 6.482226371765137, "rewards/rejected": -9.845312118530273, "step": 11370 }, { "epoch": 2.8692528282860112, "grad_norm": 12.266679763793945, "learning_rate": 2.8814859657802227e-09, "logits/chosen": -1.20147705078125, "logits/rejected": -1.0837676525115967, "logps/chosen": -314.2250061035156, "logps/rejected": -351.25, "loss": 0.0419, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.3489317893981934, "rewards/margins": 6.389062404632568, "rewards/rejected": -9.737500190734863, "step": 11380 }, { "epoch": 2.871773863170832, "grad_norm": 58.831626892089844, "learning_rate": 2.77150960362324e-09, "logits/chosen": -1.199304223060608, "logits/rejected": -1.1606323719024658, "logps/chosen": -331.2749938964844, "logps/rejected": -355.26251220703125, "loss": 0.0426, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.2643675804138184, "rewards/margins": 6.762499809265137, "rewards/rejected": -10.026171684265137, "step": 11390 }, { "epoch": 2.874294898055652, "grad_norm": 24.17465591430664, "learning_rate": 2.6636612885167775e-09, "logits/chosen": -1.146575927734375, "logits/rejected": -0.9815276861190796, "logps/chosen": -324.6875, "logps/rejected": -342.2562561035156, "loss": 0.065, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.903027296066284, "rewards/margins": 6.735156059265137, "rewards/rejected": -9.639452934265137, "step": 11400 }, { "epoch": 2.876815932940472, "grad_norm": 16.471965789794922, "learning_rate": 2.5579419487773424e-09, "logits/chosen": -1.0539124011993408, "logits/rejected": -1.096160888671875, "logps/chosen": -335.2250061035156, "logps/rejected": -360.125, "loss": 0.0752, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.3601441383361816, "rewards/margins": 6.7421875, "rewards/rejected": -10.101171493530273, "step": 11410 }, { "epoch": 2.8793369678252922, "grad_norm": 2.419618844985962, "learning_rate": 2.4543524943960448e-09, "logits/chosen": -1.16253662109375, "logits/rejected": -1.125451683998108, "logps/chosen": -325.9375, "logps/rejected": -362.36248779296875, "loss": 0.0452, "rewards/accuracies": 0.984375, "rewards/chosen": -3.241986036300659, "rewards/margins": 6.643750190734863, "rewards/rejected": -9.885156631469727, "step": 11420 }, { "epoch": 2.8818580027101124, "grad_norm": 17.081071853637695, "learning_rate": 2.352893817030799e-09, "logits/chosen": -1.224328637123108, "logits/rejected": -1.121362328529358, "logps/chosen": -320.3999938964844, "logps/rejected": -373.11248779296875, "loss": 0.073, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.23199462890625, "rewards/margins": 6.426562309265137, "rewards/rejected": -9.658203125, "step": 11430 }, { "epoch": 2.8843790375949325, "grad_norm": 48.259456634521484, "learning_rate": 2.253566789998523e-09, "logits/chosen": -1.1755859851837158, "logits/rejected": -1.0964233875274658, "logps/chosen": -342.7124938964844, "logps/rejected": -360.3999938964844, "loss": 0.0517, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.3258299827575684, "rewards/margins": 6.460741996765137, "rewards/rejected": -9.78515625, "step": 11440 }, { "epoch": 2.8869000724797527, "grad_norm": 3.223163366317749, "learning_rate": 2.156372268267842e-09, "logits/chosen": -1.1101500988006592, "logits/rejected": -1.1073486804962158, "logps/chosen": -323.92498779296875, "logps/rejected": -375.7749938964844, "loss": 0.0528, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.199572801589966, "rewards/margins": 6.766992092132568, "rewards/rejected": -9.963671684265137, "step": 11450 }, { "epoch": 2.8894211073645732, "grad_norm": 36.076942443847656, "learning_rate": 2.061311088451506e-09, "logits/chosen": -1.1073119640350342, "logits/rejected": -1.0601990222930908, "logps/chosen": -308.61248779296875, "logps/rejected": -347.95001220703125, "loss": 0.0507, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.39984130859375, "rewards/margins": 6.541015625, "rewards/rejected": -9.941797256469727, "step": 11460 }, { "epoch": 2.8919421422493934, "grad_norm": 3.8748505115509033, "learning_rate": 1.9683840687993448e-09, "logits/chosen": -1.1381317377090454, "logits/rejected": -1.048803687095642, "logps/chosen": -299.88751220703125, "logps/rejected": -332.2875061035156, "loss": 0.0497, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.089038133621216, "rewards/margins": 6.600781440734863, "rewards/rejected": -9.693359375, "step": 11470 }, { "epoch": 2.8944631771342135, "grad_norm": 24.626474380493164, "learning_rate": 1.8775920091911034e-09, "logits/chosen": -1.156835913658142, "logits/rejected": -1.0944945812225342, "logps/chosen": -335.8125, "logps/rejected": -382.32501220703125, "loss": 0.0483, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.244555711746216, "rewards/margins": 6.644726753234863, "rewards/rejected": -9.890625, "step": 11480 }, { "epoch": 2.8969842120190337, "grad_norm": 7.209171772003174, "learning_rate": 1.7889356911296448e-09, "logits/chosen": -1.0824127197265625, "logits/rejected": -1.033239722251892, "logps/chosen": -335.0874938964844, "logps/rejected": -366.29998779296875, "loss": 0.0627, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.234814405441284, "rewards/margins": 6.557421684265137, "rewards/rejected": -9.793359756469727, "step": 11490 }, { "epoch": 2.8995052469038542, "grad_norm": 27.53736686706543, "learning_rate": 1.702415877734259e-09, "logits/chosen": -1.1435120105743408, "logits/rejected": -1.096521019935608, "logps/chosen": -311.3500061035156, "logps/rejected": -349.125, "loss": 0.0386, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.0798583030700684, "rewards/margins": 6.843554496765137, "rewards/rejected": -9.927734375, "step": 11500 }, { "epoch": 2.9020262817886744, "grad_norm": 8.53842544555664, "learning_rate": 1.6180333137339186e-09, "logits/chosen": -1.103082299232483, "logits/rejected": -1.1004149913787842, "logps/chosen": -321.3500061035156, "logps/rejected": -356.6187438964844, "loss": 0.0433, "rewards/accuracies": 0.96875, "rewards/chosen": -2.960705518722534, "rewards/margins": 6.835156440734863, "rewards/rejected": -9.796093940734863, "step": 11510 }, { "epoch": 2.9045473166734945, "grad_norm": 6.406665802001953, "learning_rate": 1.5357887254610623e-09, "logits/chosen": -1.1190612316131592, "logits/rejected": -1.068109154701233, "logps/chosen": -317.70001220703125, "logps/rejected": -363.09375, "loss": 0.0604, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.0172119140625, "rewards/margins": 6.927929878234863, "rewards/rejected": -9.950390815734863, "step": 11520 }, { "epoch": 2.9070683515583147, "grad_norm": 21.32110023498535, "learning_rate": 1.4556828208452388e-09, "logits/chosen": -1.2355225086212158, "logits/rejected": -1.145727515220642, "logps/chosen": -322.8374938964844, "logps/rejected": -352.8999938964844, "loss": 0.0416, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -2.916674852371216, "rewards/margins": 6.700585842132568, "rewards/rejected": -9.616796493530273, "step": 11530 }, { "epoch": 2.909589386443135, "grad_norm": 8.482006072998047, "learning_rate": 1.3777162894070272e-09, "logits/chosen": -1.1676514148712158, "logits/rejected": -1.1561400890350342, "logps/chosen": -333.25, "logps/rejected": -370.5874938964844, "loss": 0.0573, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.3196778297424316, "rewards/margins": 6.361620903015137, "rewards/rejected": -9.681249618530273, "step": 11540 }, { "epoch": 2.912110421327955, "grad_norm": 30.82038116455078, "learning_rate": 1.3018898022521263e-09, "logits/chosen": -1.105224609375, "logits/rejected": -1.039794921875, "logps/chosen": -303.375, "logps/rejected": -355.2875061035156, "loss": 0.0562, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.277972459793091, "rewards/margins": 6.635351657867432, "rewards/rejected": -9.911718368530273, "step": 11550 }, { "epoch": 2.914631456212775, "grad_norm": 6.3294572830200195, "learning_rate": 1.2282040120655534e-09, "logits/chosen": -1.093475341796875, "logits/rejected": -1.021142601966858, "logps/chosen": -310.42498779296875, "logps/rejected": -342.11248779296875, "loss": 0.045, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.444873094558716, "rewards/margins": 6.439257621765137, "rewards/rejected": -9.880078315734863, "step": 11560 }, { "epoch": 2.9171524910975957, "grad_norm": 19.349315643310547, "learning_rate": 1.1566595531060374e-09, "logits/chosen": -1.14337158203125, "logits/rejected": -1.151037573814392, "logps/chosen": -310.63751220703125, "logps/rejected": -334.29998779296875, "loss": 0.0534, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.9684205055236816, "rewards/margins": 6.611132621765137, "rewards/rejected": -9.581640243530273, "step": 11570 }, { "epoch": 2.919673525982416, "grad_norm": 12.430782318115234, "learning_rate": 1.087257041200551e-09, "logits/chosen": -1.119287133216858, "logits/rejected": -1.060430884361267, "logps/chosen": -319.375, "logps/rejected": -360.17498779296875, "loss": 0.0449, "rewards/accuracies": 0.984375, "rewards/chosen": -3.308154344558716, "rewards/margins": 6.586865425109863, "rewards/rejected": -9.897265434265137, "step": 11580 }, { "epoch": 2.922194560867236, "grad_norm": 18.37933349609375, "learning_rate": 1.019997073739065e-09, "logits/chosen": -1.1892883777618408, "logits/rejected": -1.063391089439392, "logps/chosen": -339.0375061035156, "logps/rejected": -377.34375, "loss": 0.0452, "rewards/accuracies": 0.984375, "rewards/chosen": -3.057421922683716, "rewards/margins": 6.838086128234863, "rewards/rejected": -9.889843940734863, "step": 11590 }, { "epoch": 2.924715595752056, "grad_norm": 43.751304626464844, "learning_rate": 9.548802296692749e-10, "logits/chosen": -1.097900390625, "logits/rejected": -1.026818871498108, "logps/chosen": -312.35626220703125, "logps/rejected": -363.70001220703125, "loss": 0.0495, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.345568895339966, "rewards/margins": 6.401757717132568, "rewards/rejected": -9.747265815734863, "step": 11600 }, { "epoch": 2.9272366306368767, "grad_norm": 6.898996353149414, "learning_rate": 8.919070694917708e-10, "logits/chosen": -1.1406066417694092, "logits/rejected": -1.0116455554962158, "logps/chosen": -313.4125061035156, "logps/rejected": -346.4375, "loss": 0.0294, "rewards/accuracies": 0.9906250238418579, "rewards/chosen": -3.204620361328125, "rewards/margins": 6.840429782867432, "rewards/rejected": -10.045312881469727, "step": 11610 }, { "epoch": 2.929757665521697, "grad_norm": 34.288265228271484, "learning_rate": 8.310781352550977e-10, "logits/chosen": -1.161535620689392, "logits/rejected": -1.116845726966858, "logps/chosen": -324.70623779296875, "logps/rejected": -363.375, "loss": 0.0865, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.22119140625, "rewards/margins": 6.787109375, "rewards/rejected": -10.008984565734863, "step": 11620 }, { "epoch": 2.932278700406517, "grad_norm": 11.433563232421875, "learning_rate": 7.723939505511478e-10, "logits/chosen": -1.154638648033142, "logits/rejected": -1.136083960533142, "logps/chosen": -306.57501220703125, "logps/rejected": -366.2562561035156, "loss": 0.0342, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.341662645339966, "rewards/margins": 7.1279296875, "rewards/rejected": -10.459375381469727, "step": 11630 }, { "epoch": 2.934799735291337, "grad_norm": 28.94364356994629, "learning_rate": 7.15855020510664e-10, "logits/chosen": -1.1469237804412842, "logits/rejected": -1.078161597251892, "logps/chosen": -286.7124938964844, "logps/rejected": -351.3374938964844, "loss": 0.0367, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.1283507347106934, "rewards/margins": 6.691992282867432, "rewards/rejected": -9.822461128234863, "step": 11640 }, { "epoch": 2.9373207701761572, "grad_norm": 10.941765785217285, "learning_rate": 6.614618317988263e-10, "logits/chosen": -1.072302222251892, "logits/rejected": -1.0020020008087158, "logps/chosen": -307.6000061035156, "logps/rejected": -357.13751220703125, "loss": 0.033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -3.409228563308716, "rewards/margins": 6.709765434265137, "rewards/rejected": -10.120702743530273, "step": 11650 }, { "epoch": 2.9398418050609774, "grad_norm": 41.47216796875, "learning_rate": 6.092148526111451e-10, "logits/chosen": -1.1149444580078125, "logits/rejected": -1.0994384288787842, "logps/chosen": -303.2250061035156, "logps/rejected": -388.0, "loss": 0.0552, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.141491651535034, "rewards/margins": 6.625781059265137, "rewards/rejected": -9.762109756469727, "step": 11660 }, { "epoch": 2.9423628399457975, "grad_norm": 7.507928371429443, "learning_rate": 5.591145326693525e-10, "logits/chosen": -1.175994873046875, "logits/rejected": NaN, "logps/chosen": -314.6937561035156, "logps/rejected": -340.17498779296875, "loss": 0.0529, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.896167039871216, "rewards/margins": 6.885937690734863, "rewards/rejected": -9.782031059265137, "step": 11670 }, { "epoch": 2.944883874830618, "grad_norm": 10.430318832397461, "learning_rate": 5.111613032176277e-10, "logits/chosen": -1.146691918373108, "logits/rejected": -1.0836150646209717, "logps/chosen": -311.76873779296875, "logps/rejected": -389.25, "loss": 0.0547, "rewards/accuracies": 0.984375, "rewards/chosen": -3.265576124191284, "rewards/margins": 6.797461032867432, "rewards/rejected": -10.062891006469727, "step": 11680 }, { "epoch": 2.9474049097154382, "grad_norm": 16.366127014160156, "learning_rate": 4.6535557701873896e-10, "logits/chosen": -1.174015760421753, "logits/rejected": -1.119775414466858, "logps/chosen": -324.90625, "logps/rejected": -346.57501220703125, "loss": 0.045, "rewards/accuracies": 0.984375, "rewards/chosen": -3.0557494163513184, "rewards/margins": 6.736523628234863, "rewards/rejected": -9.793164253234863, "step": 11690 }, { "epoch": 2.9499259446002584, "grad_norm": 33.4983024597168, "learning_rate": 4.216977483506856e-10, "logits/chosen": -1.176855444908142, "logits/rejected": -1.0918090343475342, "logps/chosen": -361.5874938964844, "logps/rejected": -373.4624938964844, "loss": 0.0647, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -3.1922364234924316, "rewards/margins": 6.511328220367432, "rewards/rejected": -9.704297065734863, "step": 11700 }, { "epoch": 2.9524469794850785, "grad_norm": 36.88227462768555, "learning_rate": 3.8018819300308925e-10, "logits/chosen": NaN, "logits/rejected": -1.0539124011993408, "logps/chosen": -297.8812561035156, "logps/rejected": -354.9375, "loss": 0.0489, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.350720167160034, "rewards/margins": 6.738085746765137, "rewards/rejected": -10.090624809265137, "step": 11710 }, { "epoch": 2.954968014369899, "grad_norm": 16.585119247436523, "learning_rate": 3.408272682741409e-10, "logits/chosen": -1.1395142078399658, "logits/rejected": -1.1004638671875, "logps/chosen": -320.4125061035156, "logps/rejected": -357.67498779296875, "loss": 0.044, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.1923828125, "rewards/margins": 6.826757907867432, "rewards/rejected": -10.017578125, "step": 11720 }, { "epoch": 2.9574890492547192, "grad_norm": 65.93038940429688, "learning_rate": 3.036153129674368e-10, "logits/chosen": -1.170751929283142, "logits/rejected": -1.065454125404358, "logps/chosen": -307.82501220703125, "logps/rejected": -332.1000061035156, "loss": 0.0543, "rewards/accuracies": 0.984375, "rewards/chosen": -2.9806151390075684, "rewards/margins": 6.692480564117432, "rewards/rejected": -9.671093940734863, "step": 11730 }, { "epoch": 2.9600100841395394, "grad_norm": 10.023194313049316, "learning_rate": 2.685526473890365e-10, "logits/chosen": -1.092157006263733, "logits/rejected": -1.047845482826233, "logps/chosen": -301.8187561035156, "logps/rejected": -351.0249938964844, "loss": 0.0412, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.9297118186950684, "rewards/margins": 7.150976657867432, "rewards/rejected": -10.081250190734863, "step": 11740 }, { "epoch": 2.9625311190243595, "grad_norm": 6.3317365646362305, "learning_rate": 2.3563957334482575e-10, "logits/chosen": -1.0945556163787842, "logits/rejected": -1.085790991783142, "logps/chosen": -300.01251220703125, "logps/rejected": -373.1499938964844, "loss": 0.0629, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.794238328933716, "rewards/margins": 6.842382907867432, "rewards/rejected": -9.637890815734863, "step": 11750 }, { "epoch": 2.9650521539091796, "grad_norm": 8.28089714050293, "learning_rate": 2.0487637413776903e-10, "logits/chosen": -1.1538817882537842, "logits/rejected": -1.142126441001892, "logps/chosen": -295.25, "logps/rejected": -366.6625061035156, "loss": 0.0474, "rewards/accuracies": 0.984375, "rewards/chosen": -3.4425110816955566, "rewards/margins": 6.487890720367432, "rewards/rejected": -9.92578125, "step": 11760 }, { "epoch": 2.967573188794, "grad_norm": 11.035055160522461, "learning_rate": 1.762633145655501e-10, "logits/chosen": -1.125695824623108, "logits/rejected": -1.102941870689392, "logps/chosen": -311.45001220703125, "logps/rejected": -386.98748779296875, "loss": 0.0442, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2675232887268066, "rewards/margins": 6.892187595367432, "rewards/rejected": -10.164453506469727, "step": 11770 }, { "epoch": 2.97009422367882, "grad_norm": 13.995767593383789, "learning_rate": 1.4980064091835166e-10, "logits/chosen": -1.1941039562225342, "logits/rejected": -1.138085961341858, "logps/chosen": -322.6499938964844, "logps/rejected": -358.5249938964844, "loss": 0.0554, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.00164794921875, "rewards/margins": 6.380663871765137, "rewards/rejected": -9.379687309265137, "step": 11780 }, { "epoch": 2.9726152585636405, "grad_norm": 27.741172790527344, "learning_rate": 1.2548858097655157e-10, "logits/chosen": -1.1244628429412842, "logits/rejected": -1.027490258216858, "logps/chosen": -306.54998779296875, "logps/rejected": -369.76251220703125, "loss": 0.0662, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.30889892578125, "rewards/margins": 6.887304782867432, "rewards/rejected": -10.193750381469727, "step": 11790 }, { "epoch": 2.9751362934484606, "grad_norm": 9.173103332519531, "learning_rate": 1.0332734400897437e-10, "logits/chosen": -1.211492896080017, "logits/rejected": -1.057397484779358, "logps/chosen": -300.70001220703125, "logps/rejected": -351.7875061035156, "loss": 0.0317, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -3.187939405441284, "rewards/margins": 6.911718845367432, "rewards/rejected": -10.100390434265137, "step": 11800 }, { "epoch": 2.977657328333281, "grad_norm": 22.094951629638672, "learning_rate": 8.331712077094821e-11, "logits/chosen": -1.1710205078125, "logits/rejected": -1.077600121498108, "logps/chosen": -335.67498779296875, "logps/rejected": -369.125, "loss": 0.033, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.8613648414611816, "rewards/margins": 6.783984184265137, "rewards/rejected": -9.644335746765137, "step": 11810 }, { "epoch": 2.980178363218101, "grad_norm": 17.2425537109375, "learning_rate": 6.545808350272297e-11, "logits/chosen": -1.195837378501892, "logits/rejected": -1.118493676185608, "logps/chosen": -300.3999938964844, "logps/rejected": -356.70001220703125, "loss": 0.0313, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.832348585128784, "rewards/margins": 6.853906154632568, "rewards/rejected": -9.684374809265137, "step": 11820 }, { "epoch": 2.9826993981029215, "grad_norm": 20.294713973999023, "learning_rate": 4.9750385927971315e-11, "logits/chosen": -1.142907738685608, "logits/rejected": -1.128930687904358, "logps/chosen": -328.4437561035156, "logps/rejected": -386.25, "loss": 0.0473, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.2728271484375, "rewards/margins": 6.649609565734863, "rewards/rejected": -9.918359756469727, "step": 11830 }, { "epoch": 2.9852204329877416, "grad_norm": 12.122188568115234, "learning_rate": 3.619416325251201e-11, "logits/chosen": -1.1633621454238892, "logits/rejected": -1.13446044921875, "logps/chosen": -310.07501220703125, "logps/rejected": -371.1625061035156, "loss": 0.0601, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.8426756858825684, "rewards/margins": 6.851366996765137, "rewards/rejected": -9.696484565734863, "step": 11840 }, { "epoch": 2.987741467872562, "grad_norm": 32.661895751953125, "learning_rate": 2.4789532162977632e-11, "logits/chosen": -1.1033751964569092, "logits/rejected": -1.0698974132537842, "logps/chosen": -323.9937438964844, "logps/rejected": -352.88751220703125, "loss": 0.0678, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.232617139816284, "rewards/margins": 6.576757907867432, "rewards/rejected": -9.807421684265137, "step": 11850 }, { "epoch": 2.990262502757382, "grad_norm": 41.79987716674805, "learning_rate": 1.5536590826065177e-11, "logits/chosen": -1.104711890220642, "logits/rejected": -1.12957763671875, "logps/chosen": -299.4312438964844, "logps/rejected": -342.6875, "loss": 0.0549, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -3.066140651702881, "rewards/margins": 6.560351371765137, "rewards/rejected": -9.627344131469727, "step": 11860 }, { "epoch": 2.992783537642202, "grad_norm": 3.907005548477173, "learning_rate": 8.435418887509094e-12, "logits/chosen": -1.119897484779358, "logits/rejected": -1.130041480064392, "logps/chosen": -316.83123779296875, "logps/rejected": -372.1875, "loss": 0.0502, "rewards/accuracies": 0.984375, "rewards/chosen": -3.2793946266174316, "rewards/margins": 6.817578315734863, "rewards/rejected": -10.09765625, "step": 11870 }, { "epoch": 2.995304572527022, "grad_norm": 49.77162170410156, "learning_rate": 3.486077471415161e-12, "logits/chosen": -1.1683471202850342, "logits/rejected": -1.020263671875, "logps/chosen": -318.46875, "logps/rejected": -362.42498779296875, "loss": 0.0627, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -3.028076171875, "rewards/margins": 6.994336128234863, "rewards/rejected": -10.017969131469727, "step": 11880 }, { "epoch": 2.9978256074118423, "grad_norm": 22.62541961669922, "learning_rate": 6.886091798441462e-13, "logits/chosen": -1.0814208984375, "logits/rejected": -1.0946776866912842, "logps/chosen": -336.9937438964844, "logps/rejected": -395.875, "loss": 0.0526, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.205029249191284, "rewards/margins": 6.807714939117432, "rewards/rejected": -10.0146484375, "step": 11890 } ], "logging_steps": 10, "max_steps": 11898, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }